Code example #1
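The spider's constructor reads two custom settings from the Scrapy project configuration and primes the proxy pool before the first request goes out. `get_project_settings` is Scrapy's standard settings accessor; `getProxy` is a helper from the surrounding project and is not shown here.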
    # Assumed imports for this snippet: scrapy, datetime, logging,
    # scrapy.utils.project.get_project_settings, and the project's getProxy helper.
    def __init__(self):
        super().__init__()
        self.startPage = 0   # zero-based index of the current results page
        self.pageSize = 20   # results per page on the ACM search listing
        self.startTime = get_project_settings().get('START_TIME')
        self.proxyUpdateDelay = get_project_settings().get('PROXY_UPDATE_DELAY')
        getProxy().main()    # prime the proxy pool before the first request
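For the constructor to work, settings.py must define the two keys it reads. A minimal sketch with illustrative values (the key names come from the spider; the values are assumptions):

    # settings.py -- assumed entries
    import datetime

    START_TIME = datetime.datetime.now()  # reference time for the proxy-refresh timer
    PROXY_UPDATE_DELAY = 600              # assumed: seconds between proxy pool refreshes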
Code example #2
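`parse` reads the total hit count and the applied subject facet from the search page, collects per-paper fields (title, authors, year, type, DOI URL, abstract, citation count) into a single `AcaspiderItem`, periodically refreshes the proxy pool, and then schedules a request for the next results page.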
    def parse(self, response):
        item = AcaspiderItem()
        print('Crawling page', self.startPage)
        # Total hit count, e.g. "1,234" -> "1234"
        results_num = response.xpath(
            '//span[@class="hitsLength"]/text()').extract()[0].replace(',', '')
        # Subject facet currently applied to the search
        subjects = response.xpath(
            '//ul[@class="rlist--inline facet__list--applied"]/li/span/text()'
        ).extract()[0]
        # One node per search result; use a new name rather than shadowing
        # the response object
        entries = response.xpath(
            '//li[@class="search__item issue-item-container"]')
        for field in ('title', 'authors', 'year', 'typex', 'subjects',
                      'url', 'abstract', 'citation'):
            item[field] = []

        for res in entries:
            try:
                item['title'].append(
                    self.remove_html(
                        res.xpath('.//span[@class="hlFld-Title"]/a/text()').
                        extract()[0]))
            except Exception:  # field missing on this result; pad to keep columns aligned
                item['title'].append(' ')

            try:
                item['authors'].append(
                    self.merge_authors(
                        res.xpath(
                            './/ul[@aria-label="authors"]/li/a/span/text()').
                        extract()))
            except Exception:
                item['authors'].append(' ')

            try:
                item['year'].append(
                    self.remove4year(
                        self.remove_html(
                            res.xpath('.//span[@class="dot-separator"]').
                            extract()[0])))
            except Exception:
                item['year'].append(' ')

            try:
                item['typex'].append(
                    res.xpath('.//span[@class="epub-section__title"]/text()').
                    extract()[0])
            except Exception:
                item['typex'].append(' ')

            try:
                item['url'].append(
                    res.xpath(
                        './/a[@class="issue-item__doi dot-separator"]/text()').
                    extract()[0])
            except Exception:
                item['url'].append(' ')

            try:
                item['abstract'].append(
                    self.remove_html(
                        res.xpath(
                            './/div[contains(@class, "issue-item__abstract")]/p'
                        ).extract()[0]))
            except Exception:
                item['abstract'].append(' ')

            try:
                item['citation'].append(
                    res.xpath(
                        './/span[@class="citation"]/span/text()').extract()[0])
            except Exception:
                item['citation'].append(' ')

            item['subjects'].append(subjects)

        yield item
        logging.warning('$ ACM_Spider has crawled: ' +
                        str((self.startPage + 1) * self.pageSize))

        # Refresh the proxy pool once the configured delay has elapsed;
        # total_seconds() is used because .seconds wraps at one day
        if (datetime.datetime.now() -
                self.startTime).total_seconds() > self.proxyUpdateDelay:
            getProxy().main()
            # Reset the timer so the pool is refreshed once per delay window,
            # not on every page after the first expiry
            self.startTime = datetime.datetime.now()
            print('Crawled:', (self.startPage + 1) * self.pageSize)
            logging.warning('$ ACM_Spider runs getProxy')

        # Follow the next results page while more hits remain; the
        # `startPage < 1` condition caps the crawl at two pages, presumably
        # a limit left in for testing
        if (self.startPage + 1) * self.pageSize < int(results_num) \
                and self.startPage < 1:
            self.startPage += 1
            next_url = (self.start_urls[0] + '&startPage=' + str(self.startPage)
                        + '&pageSize=' + str(self.pageSize))
            yield scrapy.Request(next_url, callback=self.parse)
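`parse` also calls three helper methods that are not shown in the source: `remove_html`, `merge_authors`, and `remove4year`. The following is a minimal sketch of what they plausibly do, inferred only from how they are called; the real implementations may differ.

    # Assumed helper methods on ACMSpider (requires `import re` at module level);
    # behavior inferred from the call sites above.

    def remove_html(self, text):
        # Strip HTML tags and trim surrounding whitespace.
        return re.sub(r'<[^>]+>', '', text).strip()

    def merge_authors(self, authors):
        # Join the extracted author-name strings into one comma-separated field.
        return ', '.join(a.strip() for a in authors)

    def remove4year(self, text):
        # Pull the first four-digit year out of the dot-separated metadata span.
        match = re.search(r'\b(\d{4})\b', text)
        return match.group(1) if match else ' '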