def parse2(self, response):
    print('-------parse2 crawl results-------')
    hrefs = response.xpath(
        "//main//ul/li/div[@class='list_con']/div[@class='title']/h2/a/@href"
    ).extract()
    print(hrefs)
    print('---------------------')
    for i in range(2):  # len(hrefs)
        print('Go to(X2):', hrefs[i])
        yield proxyRequest(url=hrefs[i], callback=self.parse3)
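# proxyRequest is used throughout these snippets but never defined in them.
# A minimal sketch of what it might look like, assuming it simply wraps
# scrapy.Request and attaches a proxy through request.meta. The helper name
# matches the calls above, but PROXY_URL and the dont_filter choice are
# assumptions, not the author's code:
import scrapy

PROXY_URL = 'http://127.0.0.1:8888'  # hypothetical proxy endpoint

def proxyRequest(url, callback, meta=None, headers=None):
    meta = dict(meta or {})
    meta['proxy'] = PROXY_URL  # Scrapy's HttpProxyMiddleware reads this key
    return scrapy.Request(url=url, callback=callback, meta=meta,
                          headers=headers, dont_filter=True)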
def parse(self, response):
    print('-------all categories in the homepage header-------')
    hrefs = response.xpath("//ul[@class='cl']/li/a/@href").extract()
    print('%s categories in total' % len(hrefs))
    print(hrefs)
    print('---------------------')
    for i in range(len(hrefs)):
        url = hrefs[i]
        print('Go to:', url)
        yield proxyRequest(url=url, callback=self.parseBookList)
def parse(self, response):
    hrefs = response.xpath("//div[@class='nav_com']/ul/li/a/@href").extract()
    print('-------homepage categories-------')
    print(len(hrefs))
    print('------------------------')
    hrefs = hrefs[3:]  # skip the first three non-category links
    for i in range(3):  # len(hrefs)
        url = self.base_url + hrefs[i]
        print('Go to:', url)
        yield proxyRequest(url=url, callback=self.parse2)
def parseBookIntro(self, response):
    print('-------book intro page, follow "view all chapters"-------')
    hrefs = response.xpath("//div[@class='ft']/a/@href").extract()
    href = hrefs[0]
    if 'http' not in href:
        print('relative href, prepending base_url automatically')
        href = self.base_url + href
    print(href)
    print('---------------------')
    yield proxyRequest(url=href, callback=self.parseBookIndex)
def parseBookList(self, response):
    hrefs = response.xpath("//div[@class='list_l_box']//h4/a/@href").extract()
    print('-------book list page-------')
    print('%s books on this page' % len(hrefs))
    hrefs = list(map(lambda x: self.base_url + x, hrefs))
    print(hrefs)
    print('---------------------')
    for i in range(len(hrefs)):
        url = hrefs[i]
        print('Go to:', url)
        yield proxyRequest(url=url, callback=self.parseBookIntro)
def parse(self, response):
    # article URLs
    hrefs = response.xpath(
        "//div[@class='art_title linkable']/a[@class='ref nowrap']/@href"
    ).extract()
    volume = response.xpath(
        "//div[@class='pager issueBookNavPager']/span[@class='journalNavCenterTd']/div[@class='journalNavTitle']/text()"
    ).extract()[0]
    for i in range(len(hrefs)):
        # pass the volume string along to parse2 via request meta
        yield proxyRequest(url=self.base_url + hrefs[i],
                           meta={'annualVolume': volume},
                           callback=self.parse2)
def parseBookIndex(self, response):
    title = response.xpath("//div[@class='catalog_hd']/h1/text()").extract()[0]
    # title = response.xpath("//div[@class='cont']/h1/text()").extract()[0]
    print('title', title)
    filePath = os.path.join(Config().corpusPath, title + '.txt')
    if os.path.exists(filePath):
        print(filePath)
        print('------book already crawled, skipping------------')
        return  # a bare return ends the generator; returning False had no effect
    print('-------chapter index page-------')
    hrefs = response.xpath("//ul[@class='catalog_list clearfix']/li/a/@href").extract()
    print('%s chapters in total' % len(hrefs))
    hrefs = list(map(lambda x: self.base_url + x, hrefs))
    print(hrefs)
    for i in range(len(hrefs)):
        print('Go to content page:', hrefs[i])
        # time.sleep(1)
        # parseContentWrapper(title) is called here, so it must return the
        # actual callback: a closure that remembers the book title
        yield proxyRequest(url=hrefs[i], callback=self.parseContentWrapper(title))
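# parseContentWrapper is not shown in these snippets. Since it is invoked with
# the title before being handed to Scrapy, it is presumably a factory method
# that returns a closure over the book title. A minimal sketch under that
# assumption; the content XPath and append-to-file handling are illustrative,
# not the author's code:
def parseContentWrapper(self, title):
    def parseContent(response):
        # hypothetical content XPath; adjust to the real page structure
        paragraphs = response.xpath("//div[@class='content']//p/text()").extract()
        filePath = os.path.join(Config().corpusPath, title + '.txt')
        # append, since chapters of one book arrive as separate responses
        with open(filePath, 'a', encoding='utf-8') as f:
            f.write('\n'.join(p.strip() for p in paragraphs) + '\n')
    return parseContent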
def parse2(self, response):
    item = QikanItem()
    pat = re.compile('<[^>]+>', re.S)  # strips HTML tags

    # article title
    item['title'] = ''
    titles = response.xpath(
        "//div[@class='hlFld-Title']//div[@class='publicationContentTitle']//h1"
    ).extract()
    for title in titles:
        item['title'] = item['title'] + pat.sub('', title).strip()
    # item['title'] = response.xpath("//div[@class='hlFld-Title']//div[@class='publicationContentTitle']//h1/text()").extract()[0].strip()
    # titles = response.xpath("//h2[@class='citation__title']/text()").extract()

    # authors
    item['author'] = ''
    authors = response.xpath("//div[@class='header']/a[@class='entryAuthor']").extract()
    if authors:
        for author in authors:
            item['author'] = item['author'] + pat.sub('', author).strip() + ","
    else:
        item['author'] = 'NULL'

    # corresponding author's affiliation, tried at three page locations
    aff1 = response.xpath(
        "//div[@class='hlFld-ContribAuthor']/span[@class='NLM_contrib-group']/div[@class='artice-info-affiliation']/text()"
    ).extract()
    aff2 = response.xpath(
        "//div[@class='hlFld-ContribAuthor']/div[@class='artice-info-affiliation'][1]/text()"
    ).extract()
    aff3 = response.xpath("//div[@class='artice-notes']//corresp//text()").extract()
    if aff1:
        item['authorAffiliation'] = aff1[0]
    elif aff2:
        item['authorAffiliation'] = aff2[0]
    elif aff3:
        item['authorAffiliation'] = aff3[0].replace('Email:', '')
    else:
        item['authorAffiliation'] = 'NULL'
    # the last replace presumably targeted a non-breaking space in the original
    item['authorAffiliation'] = item['authorAffiliation'].replace(
        '\n', '').replace('\r', '').replace('\t', '').replace('\xa0', ' ')
    # print(item['authorAffiliation'])

    # corresponding author email(s), joined with '||'
    item['correspongdingauthorEmail'] = ''
    correspongdingauthorEmails = response.xpath(
        "//a[@class='email']/span[@class='nobrWithWbr']").extract()
    if correspongdingauthorEmails:
        for correspongdingauthorEmail in correspongdingauthorEmails:
            item['correspongdingauthorEmail'] = item[
                'correspongdingauthorEmail'] + pat.sub(
                    '', correspongdingauthorEmail).strip() + '||'
    else:
        item['correspongdingauthorEmail'] = 'NULL'

    # corresponding author name (first listed contributor)
    correspongdingauthors = response.xpath(
        "//div[@class='hlFld-ContribAuthor']/span[@class='contribDegrees'][1]/div[@class='authorLayer']/div[@class='header']/a[@class='entryAuthor']/text()"
    ).extract()
    if correspongdingauthors:
        item['correspongdingauthor'] = correspongdingauthors[0] + '||'
    else:
        item['correspongdingauthor'] = 'NULL'

    # pair each corresponding author with an email: "(name,email),"
    if item['correspongdingauthor'] != 'NULL':
        correspongdingau = item['correspongdingauthor'].split("||")
        correspongdingEm = item['correspongdingauthorEmail'].split("||")
        item['correspongdingauthor'] = ''
        for i in range(len(correspongdingau)):
            if correspongdingau[i] != '':
                item['correspongdingauthor'] += '(' + correspongdingau[
                    i] + ',' + correspongdingEm[i] + '),'
    # print(item['correspongdingauthor'])

    # DOI
    item['DOI'] = response.xpath(
        "//div[@class='widget-body body body-none body-compact-all']/div[@class='doiWidgetContainer']/a[@class='doiWidgetLink']/text()"
    ).extract()[0]
    # print(item['DOI'])

    # keywords (some articles have none)
    item['keyword'] = ''
    keywords = response.xpath(
        "//div[@class='hlFld-KeywordText']/kwd-group/a[@class='attributes']/text()"
    ).extract()
    if keywords:
        for keyword in keywords:
            item['keyword'] = item['keyword'] + keyword + ','
    else:
        item['keyword'] = 'NULL'

    # abstract
    item['abstract'] = ''
    coninfos = response.xpath(
        "//div[@class='hlFld-Abstract']//div[@class='abstractSection abstractInFull']//p"
    ).extract()
    if coninfos:
        for coninfo in coninfos:
            item['abstract'] = item['abstract'] + pat.sub('', coninfo).strip() + '<br>'
    else:
        item['abstract'] = 'NULL'
    item['abstract'] = item['abstract'].replace('\n', '')
    # print(item['abstract'])

    # PDF: download it if a link exists, otherwise post the item right away
    header = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3013.3 Safari/537.36'
    }
    pdfs = response.xpath(
        "//div[@class='rightMobileMenuButton articleToolsButton PDFTool pdf-access redButton smallButton']/a/@href"
    ).extract()
    if pdfs:
        pdf = pdfs[0]
        item['pdf'] = self.base_url + pdf
        # postItemWithPdf(item) is called here, so it must return the real
        # callback: the item travels in a closure until the PDF arrives
        yield proxyRequest(url=self.base_url + pdf,
                           meta={'filename': pdf.split('/')[-1] + '.pdf'},
                           headers=header,
                           callback=postItemWithPdf(item))
    else:
        item['pdf'] = 'NULL'
        postItem(item)
    # print(item['pdf'])

    # volume, issue, year (passed along from parse via request meta)
    item['annualVolume'] = response.meta['annualVolume'].strip()
    # item['annualVolume'] = response.xpath("//div[@class='Article information']/div[1]/text()").extract()[0].strip()

    # page numbers are not available on this page
    item['pageNumber'] = 'NULL'
    # ru2 = re.compile(r'pp (.*)')
    # item['pageNumber'] = ru2.search(pageNumber).group(1)

    # journal title
    item['journalTitle'] = pat.sub(
        '',
        response.xpath(
            "//div[@id='e3c018c7-8573-4acd-93ae-0ff4b1f3baf3']/div[@class='wrapped ']"
        ).extract()[0]).strip()
    # print(item['journalTitle'])

    # some journal tables of contents include an image; not collected here
    item['imageUrlList'] = 'NULL'

    # publish date such as 'July 12, 2018' (what "%B %d, %Y" expects),
    # normalized to '2018-07-12'
    item['publishTime'] = response.xpath(
        "//span[@class='publicationContentEpubDate dates']/text()"
    ).extract()[1].strip()
    temp = time.strptime(item['publishTime'], "%B %d, %Y")
    item['publishTime'] = time.strftime("%Y-%m-%d", temp)
    # print(item['publishTime'])

    yield item
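# postItem and postItemWithPdf are referenced above but not defined in these
# snippets. Because postItemWithPdf(item) is invoked before being handed to
# Scrapy, it presumably returns a closure that saves the PDF body and then
# posts the item. A minimal sketch under those assumptions; API_ENDPOINT,
# Config().pdfPath, and the requests usage are illustrative, not the author's
# code:
import os
import requests

API_ENDPOINT = 'http://localhost:8000/api/items'  # hypothetical

def postItem(item):
    # push the scraped fields to a hypothetical collection endpoint
    requests.post(API_ENDPOINT, json=dict(item))

def postItemWithPdf(item):
    def handlePdf(response):
        # write the downloaded PDF under an assumed configured directory,
        # using the filename passed through request meta above
        filename = response.meta['filename']
        with open(os.path.join(Config().pdfPath, filename), 'wb') as f:
            f.write(response.body)
        postItem(item)
    return handlePdf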