# Example n. 1
# 0
class Spider(object):
    """Crawler that fetches arXiv paper metadata and stores it in MySQL.

    Each page under https://arxiv.org/abs/<id> is parsed for title,
    abstract and subjects, and the tuple is inserted into the
    `rec_arxiv_paper` table.
    """

    def __init__(self):
        # Browser-like headers so arxiv.org serves the regular HTML page.
        self._headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'Sec-Fetch-Mode': 'no-cors',
            'Host': 'arxiv.org'
        }
        self._sess = requests.Session()
        # Seconds to back off after a failed request before retrying.
        self._sleep_time = 5
        self._mysql = MySQL()

    def _get_detail(self, url):
        """Fetch one abstract page and return (arxiv_id, title, abstract, subjects).

        Retries indefinitely on network errors, recreating the session
        and sleeping between attempts.  An IndexError from the xpath
        lookups (page exists but lacks the expected elements, or the id
        does not exist) propagates to the caller.
        """
        while True:
            try:
                content = self._sess.get(url, headers=self._headers).content
            except Exception as e:
                # Network hiccup: drop the session, back off, retry.
                print(e)
                self._sess.close()
                self._sess = requests.Session()
                time.sleep(self._sleep_time)
                continue
            html = etree.HTML(content)
            # [0] raises IndexError when the element is missing; the
            # caller uses that to detect ids past the last paper.
            title = html.xpath(
                '//h1[@class="title mathjax"]/text()')[0].strip()
            abstract = html.xpath(
                '//blockquote[@class="abstract mathjax"]/text()')[0].strip()
            subjects = html.xpath(
                'string(//td[@class="tablecell subjects"])').strip()
            arxiv = url.split('/')[-1]
            print(arxiv)
            return (arxiv, title, abstract, subjects)

    def crawl_arxiv_n(self, begin, stop, yymm='1709'):
        """Crawl arXiv ids <yymm>.<begin> .. <yymm>.<stop> (inclusive) into MySQL.

        Stops early after more than 5 consecutive IndexErrors, which
        signals we have run past the highest existing paper number for
        the month.  *yymm* defaults to '1709' for backward compatibility
        with the previously hard-coded month prefix.
        """
        self._mysql.connect()
        index_error_count = 0
        for i in range(begin, stop + 1):
            try:
                result = self._get_detail(
                    'https://arxiv.org/abs/{}.{:05d}'.format(yymm, i))
                index_error_count = 0
                self._mysql.execute(
                    'INSERT IGNORE INTO `rec_arxiv_paper` \
                    (`arxiv`, `title`, `abstract`, `subjects`) VALUES \
                    (%s, %s, %s, %s)', result)
                time.sleep(self._sleep_time // 5)
                # BUG FIX: the original `if i % 150:` was true on every
                # iteration EXCEPT multiples of 150, recycling the session
                # almost every page.  Recycle once every 150 pages instead.
                if i % 150 == 0:
                    self._sess.close()
                    self._sess = requests.Session()
            except IndexError:
                index_error_count += 1
                if index_error_count > 5:
                    break
        self._mysql.close()