Example #1
 def parseRef(self, response, filename, references=None):
     """
     Get info about the references of one article, following pagination.
     """
     if references is None:
         references = []
         logger.info(f"===Start Scraping references of {filename}")
     else:
         pagenumb = int(response.url[response.url.find("page") + 5:])
         logger.info(f"======== Scraping references of {filename} at page {pagenumb}")
     # debugging hook: uncomment to open a Scrapy shell on this response
     # from scrapy.shell import inspect_response
     # inspect_response(response, self)
     for iessay in response.css("div.essayBox"):
         # number of essays of this reference type
         logger.info("======== Scraping essayBox")
         num = int(iessay.xpath(".//span[@name='pcount']/text()").get())
         ref = iessay.xpath(".//li//text()").getall()
         references.extend(self.cleanref(ref))
         reftype = iessay.xpath('./div[@class="dbTitle"]/text()').get()
         if num > 10:
             # each page lists 10 entries; if a page parameter already
             # exists, request page + 1, otherwise start at page 2
             if response.url.find("page") > 0:
                 pagenumb = int(response.url[response.url.find("page") + 5:])
                 if num > pagenumb * 10:
                     newurl = response.url.replace(f"page={pagenumb}",
                                                   f"page={pagenumb+1}")
                     yield scrapy.Request(newurl,
                                          callback=self.parseRef,
                                          cb_kwargs=dict(filename=filename,
                                                         references=references))
             else:
                 newurl = f"{response.url}&CurDBCode={self.DBCODE[reftype]}&page=2"
                 yield scrapy.Request(newurl,
                                      callback=self.parseRef,
                                      cb_kwargs=dict(filename=filename,
                                                     references=references))

     logger.info(f"{references}")
     logger.info(f"===End Scraping references of {filename}")
     yield None  # Scrapy ignores None in callback output
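
The snippet relies on a `cleanref` helper defined elsewhere on the spider. A minimal sketch of what it might do, assuming CNKI prefixes each reference entry with an index marker such as `[1]` (hypothetical; the real helper may differ):

    import re

    def cleanref(ref_fragments):
        # Hypothetical stand-in for self.cleanref: the <li> text nodes arrive
        # as a flat list of fragments; join them, split on the "[n]" index
        # markers, and return one cleaned string per reference entry.
        text = "".join(ref_fragments)
        entries = re.split(r"\[\d+\]", text)
        return [re.sub(r"\s+", " ", e).strip() for e in entries if e.strip()]

For example, `cleanref(["[1]Doe J. ", "Title A.", "[2]Smith B."])` returns `['Doe J. Title A.', 'Smith B.']`.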
Example #2
 def process_item(self, item, spider):
     logger.info(f"Inserting file {item['filename']} into database")
     collection_name = item['journal_code']
     # check whether the scrape completed correctly
     if "references" in item:
         if len(item["references"]) == item["ref_num"]:
             logger.info(
                 f"{item['filename']} has {len(item['references'])} references in total"
             )
             item['done'] = True
         else:
             logger.info(
                 f"We got {len(item['references'])}/{item['ref_num']} references of {item['filename']}"
             )
             item['done'] = False
     else:
         if item["ref_num"] == 0:
             item['done'] = True
             logger.info(f"{item['filename']} get no references.")
         else:
             logger.info(
                 f"No references of {item['filename']} were scraped.")
             item['done'] = False
     # upsert: update the pending record for this url, or insert a new one
     # self.db[collection_name].insert_one(dict(item))
     self.db[collection_name].update_one(
         {"url": item['url'], "done": False},
         {"$set": dict(item)},
         upsert=True)
     return item
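
`update_one(..., upsert=True)` updates the matching pending record if one exists and inserts a new document otherwise. The `self.db` handle has to be opened elsewhere in the pipeline; a minimal sketch of the enclosing class, assuming pymongo and hypothetical setting names `MONGO_URI` and `MONGO_DATABASE`:

    import pymongo

    class MongoPipeline:
        # Sketch of the enclosing pipeline; MONGO_URI and MONGO_DATABASE
        # are assumed setting names, not taken from the source.

        @classmethod
        def from_crawler(cls, crawler):
            pipeline = cls()
            pipeline.mongo_uri = crawler.settings.get(
                "MONGO_URI", "mongodb://localhost:27017")
            pipeline.mongo_db = crawler.settings.get("MONGO_DATABASE", "cnki")
            return pipeline

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        def close_spider(self, spider):
            self.client.close()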
Example #3
    def parse(self, response, filename):
        '''
        parse article
        url = scrapy.Field()
        title = xpath(//h2[@class='title']/text()).get()
        authors = xpath(//div[@class='author']//a/text())
        abstract = xpath(//div[@class='wxBaseinfo']//span[@id='ChDivSummary']/text())
        journal_name_ch = scrapy.Field()
        journal_name_en = scrapy.Field()
        filename = scrapy.Field()
        keywords = //div[@class='wxBaseinfo']//label[@id='catalog_KEYWORD']/../a/text()
        fenleihao = //div[@class='wxBaseinfo']//label[@id='catalog_ZTCLS']/../text()
        found = //div[@class='wxBaseinfo']//label[@id='catalog_FUND']/../a/text()
        download_num = //div[@class='info']/div[@class='total']/span[1]/b/text()
        pages =  //div[@class='info']/div[@class='total']/span[3]/b/text()
        references = //div[@class='essayBox']//li//a/text()
        next_link 
        '''
        aloader = ArticleLoader(item=ArticleItem(), response=response)
        aloader.add_value("filename", filename)
        aloader.add_value("url", response.url)
        aloader.add_value("journal_name_ch", self.journal)
        aloader.add_value("journal_name_en", self.journal_en)
        aloader.add_value("journal_code", self.journal_code)
        aloader.add_xpath("title", "//h2[@class='title']/text()")
        aloader.add_xpath("authors", "//div[@class='author']//a/text()")
        aloader.add_xpath(
            "abstract",
            "//div[@class='wxBaseinfo']//span[@id='ChDivSummary']/text()")
        aloader.add_xpath(
            "keywords",
            "//div[@class='wxBaseinfo']//label[@id='catalog_KEYWORD']/../a/text()"
        )
        aloader.add_xpath(
            "fenleihao",
            "//div[@class='wxBaseinfo']//label[@id='catalog_ZTCLS']/../text()")
        aloader.add_xpath(
            "found",
            "//div[@class='wxBaseinfo']//label[@id='catalog_FUND']/../a/text()"
        )
        aloader.add_xpath(
            "download_num",
            "//div[@class='info']/div[@class='total']/span[1]/b/text()")
        aloader.add_xpath(
            "pages",
            "//div[@class='info']/div[@class='total']/span[3]/b/text()")
        logger.info(f"Scraping base info. of {filename} of 原子能科学技术")

        # yield a new request for the references of this article
        ref_url = self.REF_BASE_URL_AEST.replace("fxxxx", filename)
        logger.info(f"Scraping ref from {ref_url}")
        yield scrapy.Request(ref_url,
                             callback=self.parseRef,
                             headers=dict(Referer=response.url),
                             cb_kwargs=dict(loader=aloader, filename=filename))
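
`ArticleLoader` and `ArticleItem` are project classes not shown on this page. A plausible sketch, assuming a standard ItemLoader with `TakeFirst` for scalar fields and identity output for list-valued fields (the real processors may differ):

    import scrapy
    from scrapy.loader import ItemLoader
    # on older Scrapy versions: from scrapy.loader.processors import ...
    from itemloaders.processors import Identity, MapCompose, TakeFirst

    class ArticleItem(scrapy.Item):
        url = scrapy.Field()
        title = scrapy.Field()
        authors = scrapy.Field()
        abstract = scrapy.Field()
        journal_name_ch = scrapy.Field()
        journal_name_en = scrapy.Field()
        journal_code = scrapy.Field()
        filename = scrapy.Field()
        keywords = scrapy.Field()
        fenleihao = scrapy.Field()   # classification number (分类号)
        found = scrapy.Field()       # funding info; field name follows the source
        download_num = scrapy.Field()
        pages = scrapy.Field()
        ref_num = scrapy.Field()
        references = scrapy.Field()
        done = scrapy.Field()

    class ArticleLoader(ItemLoader):
        default_input_processor = MapCompose(str.strip)
        default_output_processor = TakeFirst()
        # list-valued fields keep every extracted value
        authors_out = Identity()
        keywords_out = Identity()
        references_out = Identity()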
Example #4
    def parse(self, response):
        for quote in response.css("div.quote"):
            q = QuotesLoader(item=QuotesbotItem(), selector=quote)
            q.add_xpath('text', './span[@class="text"]/text()')
            q.add_xpath('author', './/small[@class="author"]/text()')
            q.add_xpath('tags', './/div[@class="tags"]/a[@class="tag"]/text()')
            yield q.load_item()

        next_page_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if next_page_url is not None:
            # next_page_url looks like "/page/2/", so segment 2 is the page number
            pagenum = int(next_page_url.split("/")[2])
            logger.info(f"Next Request is page {pagenum}!")
            if pagenum < 5:
                yield scrapy.Request(response.urljoin(next_page_url))
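
`QuotesLoader` and `QuotesbotItem` are assumed to follow the same loader pattern as the article examples; a minimal sketch (note that `.extract_first()` is the older spelling of `.get()`):

    import scrapy
    from scrapy.loader import ItemLoader
    from itemloaders.processors import Identity, TakeFirst

    class QuotesbotItem(scrapy.Item):
        text = scrapy.Field()
        author = scrapy.Field()
        tags = scrapy.Field()

    class QuotesLoader(ItemLoader):
        default_output_processor = TakeFirst()
        tags_out = Identity()  # a quote can carry several tags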
Example #5
    def parse(self, response, filename):
        '''
        parse article
        url = scrapy.Field()
        title = xpath(//h2[@class='title']/text()).get()
        authors = xpath(//div[@class='author']//a/text())
        abstract = xpath(//div[@class='wxBaseinfo']//span[@id='ChDivSummary']/text())
        journal_name_ch = scrapy.Field()
        journal_name_en = scrapy.Field()
        filename = scrapy.Field()
        keywords = //div[@class='wxBaseinfo']//label[@id='catalog_KEYWORD']/../a/text()
        fenleihao = //div[@class='wxBaseinfo']//label[@id='catalog_ZTCLS']/../text()
        found = //div[@class='wxBaseinfo']//label[@id='catalog_FUND']/../a/text()
        download_num = //div[@class='info']/div[@class='total']/span[1]/b/text()
        pages =  //div[@class='info']/div[@class='total']/span[3]/b/text()
        references = //div[@class='essayBox']//li//a/text()
        next_link 
        '''
        # debugging snapshot: open a Scrapy shell bound to this response
        from scrapy.shell import inspect_response
        inspect_response(response, self)
        # aloader = ArticleLoader(item=ArticleItem(),response=response)
        # aloader.add_value("filename",filename)
        # aloader.add_value("url",response.url)
        # aloader.add_value("journal_name_ch","原子能科学技术")
        # aloader.add_value("journal_name_en","Atomic Energy Science and Technology")
        # aloader.add_xpath("title","//h2[@class='title']/text()")
        # aloader.add_xpath("authors","//div[@class='author']//a/text()")
        # aloader.add_xpath("abstract","//div[@class='wxBaseinfo']//span[@id='ChDivSummary']/text()")
        # aloader.add_xpath("keywords","//div[@class='wxBaseinfo']//label[@id='catalog_KEYWORD']/../a/text()")
        # aloader.add_xpath("fenleihao","//div[@class='wxBaseinfo']//label[@id='catalog_ZTCLS']/../text()")
        # aloader.add_xpath("found","//div[@class='wxBaseinfo']//label[@id='catalog_FUND']/../a/text()")
        # aloader.add_xpath("download_num","//div[@class='info']/div[@class='total']/span[1]/b/text()")
        # aloader.add_xpath("pages","//div[@class='info']/div[@class='total']/span[3]/b/text()")
        # logger.info(f"Scraping base info. of {filename} of 原子能科学技术")

        # a request for this article's references would be yielded here
        ref_url = self.REF_BASE_URL_AEST.replace("fxxxx", filename)
        logger.info(f"Scraping ref from {ref_url}")
        yield None  # loader and request code are disabled in this debugging version
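
`inspect_response` pauses the crawl and opens a Scrapy shell bound to this exact response, which is handy for trying the commented-out XPath expressions interactively before re-enabling the loader code; exiting the shell resumes the crawl. For example:

    # inside the shell opened by inspect_response:
    >>> response.xpath("//h2[@class='title']/text()").get()
    >>> response.xpath("//div[@class='author']//a/text()").getall()
    >>> view(response)   # open the response in a browser
    # Ctrl-D (Ctrl-Z on Windows) exits the shell and resumes the crawl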
Example #6
    def parseRef(self,
                 response,
                 loader,
                 filename,
                 references=None,
                 total_ref_num=0):
        """
        get info about references
        """
        if references is None:
            references = []
            logger.info(f"===Start Scraping references of {filename}")
            total_ref_num = sum(
                map(int,
                    response.xpath(".//span[@name='pcount']/text()").getall()))
            loader.add_value("ref_num", total_ref_num)
        else:
            pagenumb = int(response.url[response.url.find("page") + 5:])
            logger.info(
                f"======== Scraping references of {filename} at page {pagenumb}"
            )
            # from scrapy.shell import inspect_response
            # inspect_response(response,self)
        for iessay in response.css("div.essayBox"):
            # number of essays of this reference type
            logger.info("======== Scraping essayBox")
            num = int(iessay.xpath(".//span[@name='pcount']/text()").get())
            ref = iessay.xpath(".//li//text()").getall()
            references.extend(self.cleanref(ref))
            reftype = iessay.xpath('./div[@class="dbTitle"]/text()').get()
            if num > 10:
                # if a page parameter already exists, request page + 1
                if response.url.find("page") > 0:
                    pagenumb = int(response.url[response.url.find("page") +
                                                5:])
                    if num > pagenumb * 10:
                        newurl = response.url.replace(f"page={pagenumb}",
                                                      f"page={pagenumb+1}")
                        yield scrapy.Request(newurl,
                                             callback=self.parseRef,
                                             cb_kwargs=dict(
                                                 loader=loader,
                                                 filename=filename,
                                                 references=references,
                                                 total_ref_num=total_ref_num))
                else:
                    newurl = f"{response.url}&CurDBCode={self.DBCODE[reftype]}&page=2"
                    yield scrapy.Request(newurl,
                                         callback=self.parseRef,
                                         cb_kwargs=dict(
                                             loader=loader,
                                             filename=filename,
                                             references=references,
                                             total_ref_num=total_ref_num))

        if len(references) >= total_ref_num:
            references = list(set(references))
            logger.info(
                f"{filename}: total number of references is {total_ref_num}; we now have {len(references)}"
            )
            loader.add_value("references", references)
            logger.info(f"===End Scraping references of {filename}")
            yield loader.load_item()
        else:
            logger.info(
                f"{filename}: total number of references is {total_ref_num}; we now have {len(references)}"
            )
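
Two details worth noting. `list(set(references))` deduplicates but discards the original order; `list(dict.fromkeys(references))` would deduplicate while preserving it. And the pagination branch relies on a `DBCODE` class attribute mapping each essayBox title to CNKI's `CurDBCode` URL parameter; its shape is presumably something like the following (keys and codes here are illustrative placeholders, not taken from the source):

    # Hypothetical shape of the spider's DBCODE attribute: reference-type
    # titles scraped from div.dbTitle mapped to CurDBCode values.
    DBCODE = {
        "中国期刊全文数据库": "CJFQ",              # journal database (placeholder)
        "中国博士学位论文全文数据库": "CDFD",      # doctoral theses (placeholder)
        "中国优秀硕士学位论文全文数据库": "CMFD",  # master's theses (placeholder)
    }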