def parseRef(self, response, filename, references=None):
    """Collect the references of one article (early draft)."""
    if references is None:
        # first visit: initialise the accumulator
        references = []
        logger.info(f"=== Start scraping references of {filename}")
    else:
        pagenumb = int(response.url[response.url.find("page") + 5:])
        logger.info(
            f"======== Scraping references of {filename} at page {pagenumb}")
    # debugging hook: drops the crawl into an interactive shell for this response
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    for iessay in response.css("div.essayBox"):
        logger.info("======== Scraping essayBox")
        # number of references of this type
        num = int(iessay.xpath(".//span[@name='pcount']/text()").get())
        ref = iessay.xpath(".//li//text()").getall()
        references.extend(self.cleanref(ref))
        reftype = iessay.xpath('./div[@class="dbTitle"]/text()').get()
        if num > 10:
            # more than one page of this type exists: request page + 1
            if response.url.find("page") > 0:
                pagenumb = int(response.url[response.url.find("page") + 5:])
                if num > pagenumb * 10:
                    newurl = response.url.replace(f"page={pagenumb}",
                                                  f"page={pagenumb + 1}")
                    yield scrapy.Request(newurl,
                                         callback=self.parseRef,
                                         cb_kwargs=dict(filename=filename,
                                                        references=references))
            else:
                newurl = f"{response.url}&CurDBCode={self.DBCODE[reftype]}&page=2"
                yield scrapy.Request(newurl,
                                     callback=self.parseRef,
                                     cb_kwargs=dict(filename=filename,
                                                    references=references))
    logger.info(f"{references}")
    logger.info(f"=== End scraping references of {filename}")
    yield None
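# parseRef relies on two members defined elsewhere: a DBCODE mapping from the
# essay-box heading to CNKI's database code, and a cleanref helper. Minimal
# sketches of both, assuming each <li> spreads one reference across several
# text nodes (hypothetical implementations; the real ones may differ):
DBCODE = {"中国期刊全文数据库": "CJFQ"}  # assumed heading-to-code mapping

def cleanref(self, ref):
    """Join raw text nodes into one string per reference and drop blanks."""
    joined = "".join(ref)
    return [line.strip() for line in joined.splitlines() if line.strip()]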
def process_item(self, item, spider):
    logger.info(f"Inserting file {item['filename']} into database")
    collection_name = item['journal_code']
    # check whether the scrape completed successfully
    if "references" in item:
        if len(item["references"]) == item["ref_num"]:
            logger.info(
                f"{item['filename']} has {len(item['references'])} references in total")
            item['done'] = True
        else:
            logger.info(
                f"We got {len(item['references'])}/{item['ref_num']} references of {item['filename']}")
            item['done'] = False
    else:
        if item["ref_num"] == 0:
            item['done'] = True
            logger.info(f"{item['filename']} has no references.")
        else:
            logger.info(f"No references of {item['filename']} were scraped.")
            item['done'] = False
    # upsert: update the pending record for this URL, inserting it if absent
    # self.db[collection_name].insert_one(dict(item))
    self.db[collection_name].update_one(
        {"url": item['url'], "done": False},
        {"$set": dict(item)},
        upsert=True)
    return item
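# process_item assumes self.db was opened elsewhere. A minimal sketch of the
# usual Scrapy-to-MongoDB wiring with placeholder connection settings (the
# real URI and database name are not shown in this file):
import pymongo

class MongoPipelineSketch:
    MONGO_URI = "mongodb://localhost:27017"  # assumption: local instance
    MONGO_DB = "cnki"                        # hypothetical database name

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.MONGO_URI)
        self.db = self.client[self.MONGO_DB]

    def close_spider(self, spider):
        self.client.close()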
def parse(self, response, filename):
    '''
    Parse an article page into an ArticleItem.

    Field sources:
        url             = scrapy.Field()
        title           = //h2[@class='title']/text()
        authors         = //div[@class='author']//a/text()
        abstract        = //div[@class='wxBaseinfo']//span[@id='ChDivSummary']/text()
        journal_name_ch = scrapy.Field()
        journal_name_en = scrapy.Field()
        filename        = scrapy.Field()
        keywords        = //div[@class='wxBaseinfo']//label[@id='catalog_KEYWORD']/../a/text()
        fenleihao       = //div[@class='wxBaseinfo']//label[@id='catalog_ZTCLS']/../text()
        found           = //div[@class='wxBaseinfo']//label[@id='catalog_FUND']/../a/text()
        download_num    = //div[@class='info']/div[@class='total']/span[1]/b/text()
        pages           = //div[@class='info']/div[@class='total']/span[3]/b/text()
        references      = //div[@class='essayBox']//li//a/text()
        next_link
    '''
    aloader = ArticleLoader(item=ArticleItem(), response=response)
    aloader.add_value("filename", filename)
    aloader.add_value("url", response.url)
    aloader.add_value("journal_name_ch", self.journal)
    aloader.add_value("journal_name_en", self.journal_en)
    aloader.add_value("journal_code", self.journal_code)
    aloader.add_xpath("title", "//h2[@class='title']/text()")
    aloader.add_xpath("authors", "//div[@class='author']//a/text()")
    aloader.add_xpath(
        "abstract",
        "//div[@class='wxBaseinfo']//span[@id='ChDivSummary']/text()")
    aloader.add_xpath(
        "keywords",
        "//div[@class='wxBaseinfo']//label[@id='catalog_KEYWORD']/../a/text()")
    aloader.add_xpath(
        "fenleihao",
        "//div[@class='wxBaseinfo']//label[@id='catalog_ZTCLS']/../text()")
    aloader.add_xpath(
        "found",
        "//div[@class='wxBaseinfo']//label[@id='catalog_FUND']/../a/text()")
    aloader.add_xpath(
        "download_num",
        "//div[@class='info']/div[@class='total']/span[1]/b/text()")
    aloader.add_xpath(
        "pages",
        "//div[@class='info']/div[@class='total']/span[3]/b/text()")
    logger.info(f"Scraping base info of {filename} from 原子能科学技术")
    # yield a new request for the references of this article
    ref_url = self.REF_BASE_URL_AEST.replace("fxxxx", filename)
    logger.info(f"Scraping refs from {ref_url}")
    yield scrapy.Request(ref_url,
                         callback=self.parseRef,
                         headers=dict(Referer=response.url),
                         cb_kwargs=dict(loader=aloader, filename=filename))
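# ArticleLoader and ArticleItem are defined elsewhere. A plausible sketch of
# the loader, inferred from how it is used above (the actual processors may
# differ): scalar fields collapse to their first match, while multi-valued
# fields such as authors and keywords stay as lists.
from itemloaders.processors import Identity, MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class ArticleLoaderSketch(ItemLoader):
    default_input_processor = MapCompose(str.strip)  # trim stray whitespace
    default_output_processor = TakeFirst()           # most fields are scalar
    authors_out = Identity()     # keep every author
    keywords_out = Identity()    # keep every keyword
    references_out = Identity()  # keep every reference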
def parse(self, response):
    for quote in response.css("div.quote"):
        q = QuotesLoader(item=QuotesbotItem(), selector=quote)
        q.add_xpath('text', './span[@class="text"]/text()')
        q.add_xpath('author', './/small[@class="author"]/text()')
        q.add_xpath('tags', './/div[@class="tags"]/a[@class="tag"]/text()')
        yield q.load_item()

    next_page_url = response.xpath(
        '//li[@class="next"]/a/@href').extract_first()
    if next_page_url is not None:
        # relative links look like "/page/2/", so segment 2 is the number
        pagenum = int(next_page_url.split("/")[2])
        logger.info(f"Next request is page {pagenum}!")
        if pagenum < 5:  # stop after page 4 to keep the crawl bounded
            yield scrapy.Request(response.urljoin(next_page_url))
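# The page-number parsing above assumes relative links of the form "/page/2/",
# so split("/")[2] yields the number; a quick sanity check with a sample href
# (hypothetical value matching quotes.toscrape.com's link format):
sample_href = "/page/2/"
assert sample_href.split("/") == ["", "page", "2", ""]
assert int(sample_href.split("/")[2]) == 2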
def parse(self, response, filename):
    '''
    Debugging draft of parse(); the production version above documents the
    field sources. Drops into an interactive Scrapy shell with `response`
    preloaded so the XPath expressions can be tried by hand; exiting the
    shell resumes the crawl.
    '''
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    # aloader = ArticleLoader(item=ArticleItem(), response=response)
    # aloader.add_value("filename", filename)
    # aloader.add_value("url", response.url)
    # aloader.add_value("journal_name_ch", "原子能科学技术")
    # aloader.add_value("journal_name_en", "Atomic Energy Science and Technology")
    # aloader.add_xpath("title", "//h2[@class='title']/text()")
    # aloader.add_xpath("authors", "//div[@class='author']//a/text()")
    # aloader.add_xpath("abstract", "//div[@class='wxBaseinfo']//span[@id='ChDivSummary']/text()")
    # aloader.add_xpath("keywords", "//div[@class='wxBaseinfo']//label[@id='catalog_KEYWORD']/../a/text()")
    # aloader.add_xpath("fenleihao", "//div[@class='wxBaseinfo']//label[@id='catalog_ZTCLS']/../text()")
    # aloader.add_xpath("found", "//div[@class='wxBaseinfo']//label[@id='catalog_FUND']/../a/text()")
    # aloader.add_xpath("download_num", "//div[@class='info']/div[@class='total']/span[1]/b/text()")
    # aloader.add_xpath("pages", "//div[@class='info']/div[@class='total']/span[3]/b/text()")
    # logger.info(f"Scraping base info of {filename} from 原子能科学技术")
    # build the request URL for this article's references
    ref_url = self.REF_BASE_URL_AEST.replace("fxxxx", filename)
    logger.info(f"Scraping refs from {ref_url}")
    yield None
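# REF_BASE_URL_AEST is a class constant with "fxxxx" standing in for the
# article's filename. Its exact value is not shown in this file; a purely
# hypothetical shape consistent with the .replace("fxxxx", filename) calls:
REF_BASE_URL_AEST = ("https://kns.cnki.net/kcms/detail/frame/list.aspx"
                     "?dbcode=CJFQ&filename=fxxxx&RefType=1")  # hypothetical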
def parseRef(self, response, loader, filename, references=None, total_ref_num=0):
    """Collect the references of one article, following pagination."""
    if references is None:
        # first visit: initialise the accumulator and read the total count
        references = []
        logger.info(f"=== Start scraping references of {filename}")
        total_ref_num = sum(
            map(int,
                response.xpath(".//span[@name='pcount']/text()").getall()))
        loader.add_value("ref_num", total_ref_num)
    else:
        pagenumb = int(response.url[response.url.find("page") + 5:])
        logger.info(
            f"======== Scraping references of {filename} at page {pagenumb}")
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    for iessay in response.css("div.essayBox"):
        logger.info("======== Scraping essayBox")
        # number of references of this type
        num = int(iessay.xpath(".//span[@name='pcount']/text()").get())
        ref = iessay.xpath(".//li//text()").getall()
        references.extend(self.cleanref(ref))
        reftype = iessay.xpath('./div[@class="dbTitle"]/text()').get()
        if num > 10:  # each box lists at most 10 entries per page
            if response.url.find("page") > 0:
                # already paginated: request the next page if one remains
                pagenumb = int(response.url[response.url.find("page") + 5:])
                if num > pagenumb * 10:
                    newurl = response.url.replace(f"page={pagenumb}",
                                                  f"page={pagenumb + 1}")
                    yield scrapy.Request(newurl,
                                         callback=self.parseRef,
                                         cb_kwargs=dict(
                                             loader=loader,
                                             filename=filename,
                                             references=references,
                                             total_ref_num=total_ref_num))
            else:
                # first page of this type: request page 2 for its database code
                newurl = f"{response.url}&CurDBCode={self.DBCODE[reftype]}&page=2"
                yield scrapy.Request(newurl,
                                     callback=self.parseRef,
                                     cb_kwargs=dict(
                                         loader=loader,
                                         filename=filename,
                                         references=references,
                                         total_ref_num=total_ref_num))
    if len(references) >= total_ref_num:
        # all pages visited: deduplicate and emit the finished item
        references = list(set(references))
        logger.info(
            f"{filename}: expected {total_ref_num} references, collected {len(references)}")
        loader.add_value("references", references)
        logger.info(f"=== End scraping references of {filename}")
        yield loader.load_item()
    else:
        logger.info(
            f"{filename}: expected {total_ref_num} references, collected {len(references)} so far")
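# The pagination above mutates the query string directly: "page=N" becomes
# "page=N+1" on later pages, and the first paginated request appends
# CurDBCode plus page=2. A self-contained illustration with a made-up URL:
url = "https://example.com/ref?filename=fxxxx&CurDBCode=CJFQ&page=2"
page = int(url[url.find("page") + 5:])  # "page=" is 5 chars, so this is 2
next_url = url.replace(f"page={page}", f"page={page + 1}")
assert next_url.endswith("page=3")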