def parse_article(self, response):
    apsitem = LalItem()
    data = response.meta['data']
    apsitem['url'] = response.url
    apsitem['title'] = data['title']
    apsitem['journal'] = data['citation'].split(' <')[0]
    apsitem['doi'] = data['doi']
    # the publication year appears in the citation string as a 4-digit year in parentheses
    year_match = re.match(r'.*\((\d{4})\).*', data['citation'])
    apsitem['year'] = int(year_match.group(1))
    citing_info = data.get('citation_count_text', '')
    apsitem['citing_num'] = int(citing_info.split(' ')[0]) if citing_info else 0
    # strip the outermost tag from the summary markup
    abstract_text = data['summary']
    abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
    apsitem['abstract'] = abstract_match.group(1)
    img_url = response.css('.clear-wrap li img::attr(src)').extract()
    apsitem['abs_img_url'] = ('https://journals.aps.org' + img_url[0]) if img_url else ''
    subject_areas = data.get('subject_areas', [])
    apsitem['keywords'] = [subject['label'] for subject in subject_areas] if subject_areas else []
    # the final author name is prefixed with "and "; drop it
    authors = data['authors'].split(', ')
    authors[-1] = re.sub(r'and ', '', authors[-1])
    apsitem['authors'] = authors
    apsitem['_id'] = get_md5(apsitem['url'])
    apsitem['company'] = self.name
    yield apsitem
def parse_article(self, response):
    osaitem = LalItem()
    data = response.meta['data']
    osaitem['url'] = response.url
    osaitem['title'] = data['title']
    osaitem['authors'] = data['author'].split('; ')
    osaitem['doi'] = data['doi']
    osaitem['journal'] = response.css('.article-journal-name li strong::text').extract_first('')
    if not osaitem['journal']:
        osaitem['journal'] = data['name'].split(',')[0]
    osaitem['year'] = int(data['years'])
    osaitem['keywords'] = []
    osaitem['abs_img_url'] = response.css('img[alt="Fig. 1"]::attr(data-src)').extract_first(default='')
    # citing_num sits in a hidden (display:none) field and would need Selenium to read,
    # which is more trouble than it is worth, so it is skipped here
    # (a rough Selenium sketch follows this method)
    osaitem['citing_num'] = 0
    abstract_text = response.css('#articleBody p').extract()
    abstract_list = []
    if abstract_text:
        # some articles have no abstract, and when one exists its markup varies
        for element in abstract_text:
            if '©' in element:
                break
            else:
                abstract_match = re.match(r'<.+?>(.*)</.+>', element, re.S)
                abstract_list.append(abstract_match.group(1))
    osaitem['abstract'] = ''.join(abstract_list)
    osaitem['_id'] = get_md5(osaitem['url'])
    osaitem['company'] = self.name
    yield osaitem
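# The OSA parser above notes that the citation count is rendered in a display:none
# element, so plain HTML parsing misses it. The function below is only a sketch of
# the Selenium route the author chose to skip; the '#count-citations' selector is a
# hypothetical placeholder, not OSA's real markup.
from selenium import webdriver


def fetch_citing_num(url):
    # assumption: a local chromedriver is available on PATH
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # hidden elements return '' from .text, so read textContent instead
        hidden = driver.find_element_by_css_selector('#count-citations')  # hypothetical selector
        text = hidden.get_attribute('textContent').strip()
        return int(text.split()[0]) if text else 0
    finally:
        driver.quit()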
def parse_article(self, response):
    # parse the article page
    aipitem = LalItem()
    aipitem['url'] = response.url
    # keep sub-tags in the title
    title = response.css('.publicationContentTitle h3').extract()[0]
    title_match = re.match(r'<h3>(.*?)<span.*', title, re.S)
    aipitem['title'] = title_match.group(1).strip()
    aipitem['journal'] = response.css('.publicationContentCitation::text').extract()[0].strip()
    doi_link = response.css('.publicationContentCitation a::text').extract()[0]
    aipitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)
    year_info = response.css('.publicationContentCitation::text').extract()[1].strip()
    year_match = re.match(r'.*\((\d{4})\).*', year_info)
    aipitem['year'] = int(year_match.group(1))
    # strip the surrounding div/named-content tags from the abstract markup
    abstract_text = response.css('div.NLM_paragraph').extract()[0]
    aipitem['abstract'] = re.sub(r'(<|</)(div|named).*?>', '', abstract_text, flags=re.S)
    img_url = response.css('.figure-no-f1 img::attr(src)').extract_first(default='')
    if img_url:
        aipitem['abs_img_url'] = 'https://aip.scitation.org' + img_url
    else:
        # reconstruct the figure URL from journal/volume/issue/doi embedded in the page source
        info_match = re.match(
            r".*journal=(.+?)&volume=(\d+?)&issue=(\d+?)&doi=10.1063/(.+?)\';.*",
            response.text, re.S)
        if info_match:
            jname, vol, issue, doiend = info_match.groups()
            img_url = 'https://aip.scitation.org/na101/home/literatum/publisher/aip/journals/content/{0}/{4}/{0}.{4}.{1}.issue-{2}/{3}/production/images/small/{3}.figures.f1.gif'
            aipitem['abs_img_url'] = img_url.format(jname, vol, issue, doiend, aipitem['year'])
            # this fallback does not work for articles from 2017-2018 (the last two years)
        else:
            aipitem['abs_img_url'] = ''
    aipitem['citing_num'] = response.meta['citation']
    aipitem['keywords'] = response.css('.topicTags a::text').extract()
    author_group = response.css('.contrib-author').extract()
    # corresponding authors are marked with an 'a)' footnote
    commun_author = [author for author in author_group if 'a)' in author]
    authors = []
    for author in author_group:
        match = re.match(r'.*<a href=.*?>(.+?)</a.*', author, re.S)
        name = match.group(1).strip()
        if author in commun_author:
            name = name + '*'
        authors.append(name)
    aipitem['authors'] = authors
    aipitem['_id'] = get_md5(aipitem['url'])
    aipitem['company'] = self.name
    yield aipitem
def parse_article(self, response):
    # parse the article page
    wileyitem = LalItem()
    wileyitem['url'] = response.url
    # some titles contain sub-tags
    try:
        # some articles carry two titles: the first is German, the second is the English one
        title = response.css('.citation__title--second').extract()[0]
    except IndexError:
        title = response.css('.citation__title').extract()[0]
    title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
    title = title_match.group(1)
    wileyitem['title'] = re.sub('\n', ' ', title)
    wileyitem['journal'] = response.css('.article-citation h1 a::text').extract()[0]
    doi_link = response.css('.epub-doi::text').extract()[0]
    wileyitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)
    abstract_text = response.css('.article-section__content p').extract()[0]
    abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
    abstract_text = abstract_match.group(1)
    wileyitem['abstract'] = re.sub('\n', ' ', abstract_text)
    wileyitem['citing_num'] = int(
        response.css('a[href="#citedby-section"]::text').extract_first(default='0'))
    wileyitem['keywords'] = response.css('meta[name="citation_keywords"]::attr(content)').extract()
    wileyitem['year'] = int(response.css('.epub-date::text').extract()[0][-4:])
    author_group = response.css('.accordion-tabbed .accordion-tabbed__tab-mobile').extract()
    commun_author = [author for author in author_group if 'Corresponding Author' in author]
    authors = []
    for author in author_group:
        match = re.match(r'.*<a href=.*?><span>(.+?)<.*', author, re.S)
        name = match.group(1)
        if author in commun_author:
            name = name + '*'
        authors.append(name)
    wileyitem['authors'] = authors
    wileyitem['_id'] = get_md5(wileyitem['url'])
    wileyitem['company'] = self.name
    # follow the table-of-contents page before yielding the item
    toc_url = 'https://onlinelibrary.wiley.com' + response.css(
        'a.volume-issue::attr(href)').extract()[0]
    yield Request(url=toc_url, callback=self.parse_toc, meta={'item': wileyitem})
def parse_article(self, response):
    # parse the article page
    rscitem = LalItem()
    rscitem['url'] = response.url
    # some titles contain sub-tags
    title = response.css(
        '.article__title h2 p, .article__title p, .article__title h2, .article-control h2'
    ).extract()[0]
    title_match = re.match(r'<.+?>(.+)</.*?>', title, re.S)
    rscitem['title'] = title_match.group(1).strip()
    rscitem['journal'] = response.css('.h--heading3.no-heading::text').extract_first(default='')
    rscitem['doi'] = response.css('.list__item-data::text')[1].extract()
    abstract_text = response.css('.capsule__text p').extract_first(default='')
    if abstract_text:
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        rscitem['abstract'] = abstract_match.group(1)
    else:
        rscitem['abstract'] = ''
    img_url = response.css('.capsule__article-image img::attr(src)').extract_first(default='')
    if img_url:
        rscitem['abs_img_url'] = 'https://pubs.rsc.org' + img_url
    else:
        rscitem['abs_img_url'] = ''
    # citing_num would need an extra request for a JS file, and the value RSC reports
    # is not very accurate, so it is left at 0 for now
    rscitem['citing_num'] = 0
    rscitem['keywords'] = []
    year_info = response.css('.article-nav__issue.autopad--h a::text').extract_first(default='')
    if year_info:
        year_match = re.match(r'.*Issue \d+, (\d{4}).*', year_info)
        rscitem['year'] = int(year_match.group(1))
    else:
        rscitem['year'] = None
    author_group = response.css('.article__author-link').extract()
    # corresponding authors are marked with an asterisk
    commun_author = [author for author in author_group if '>*</' in author]
    authors = []
    for author in author_group:
        match = re.match(r'.*<a href=.*?>(.+?)</a.*', author, re.S)
        name = match.group(1)
        if author in commun_author:
            name = name + '*'
        authors.append(name)
    rscitem['authors'] = authors
    rscitem['_id'] = get_md5(rscitem['url'])
    rscitem['company'] = self.name
    yield rscitem
def parse_article(self, response):
    # parse the article page
    iopitem = LalItem()
    iopitem['url'] = response.url
    # keep sub-tags in the title
    title = response.css('.wd-jnl-art-title').extract()[0]
    title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
    iopitem['title'] = title_match.group(1)
    iopitem['journal'] = response.css('.wd-jnl-art-breadcrumb-title a::text').extract()[0]
    doi_link = response.css('.wd-jnl-art-doi a::text').extract()[0]
    iopitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)
    try:
        abstract_text = response.css('.wd-jnl-art-abstract p').extract()[0]
    except IndexError:
        abstract_text = response.css('.wd-jnl-art-abstract').extract()[0]
    abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
    iopitem['abstract'] = abstract_match.group(1)
    img_url = response.css('img[alt="Fig. 1."]::attr(src)').extract_first(default='')
    if img_url:
        iopitem['abs_img_url'] = img_url
    else:
        iopitem['abs_img_url'] = ''
    # the cited-by field is absent when there are no citations
    citing_num = response.css('.wd-jnl-art-cited-by::text').extract_first('')
    iopitem['citing_num'] = int(
        re.match(r'.*?(\d+).*', citing_num).group(1)) if citing_num else 0
    iopitem['keywords'] = []
    year_info = response.css('.wd-jnl-art-article-info-citation p::text').extract()
    if year_info:
        year_match = re.match(r'.*\s(\d{4})\s.*', ' '.join(year_info))
        iopitem['year'] = int(year_match.group(1))
    else:
        iopitem['year'] = None
    iopitem['authors'] = response.css('.mb-0 span[itemprop="name"]::text').extract()
    iopitem['_id'] = get_md5(iopitem['url'])
    iopitem['company'] = self.name
    yield iopitem
def parse(self, response):
    # parse the article page
    acsitem = LalItem()
    acsitem['url'] = response.url
    # keep sub-tags in the title
    title = response.css('.hlFld-Title').extract()[0]
    title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
    acsitem['title'] = title_match.group(1)
    acsitem['journal'] = response.css('#citation cite::text').extract_first(default='')
    acsitem['doi'] = response.css('#doi::text').extract()[0]
    # keep sub-tags in the abstract
    abstract_text = response.css('.articleBody_abstractText').extract_first(default='')
    if abstract_text:
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        acsitem['abstract'] = abstract_match.group(1)
    else:
        acsitem['abstract'] = ''
    abs_img_url = response.css('#absImg img::attr(src)').extract_first(default='')
    if abs_img_url:
        abs_img_url = urljoin('https://pubs.acs.org', abs_img_url)
    acsitem['abs_img_url'] = abs_img_url
    # acsitem['abs_img_url'] = ('https://pubs.acs.org' + response.meta['img_link']) if response.meta['img_link'] else ''
    acsitem['citing_num'] = len(response.css('#citedBy li'))
    acsitem['keywords'] = []
    try:
        acsitem['year'] = int(response.css('.citation_year::text').extract()[0])
    except (IndexError, ValueError):
        acsitem['year'] = int(response.css('#pubDate::text').extract()[0][-4:])
    author_group = response.css('#authors > span').extract()
    # corresponding authors link to the '#cor1' footnote
    commun_author = [author for author in author_group if '#cor1' in author]
    authors = []
    for author in author_group:
        match = re.match(r'.*<a id="authors".*?>(.+?)</a.*', author, re.S)
        if match:
            name = match.group(1)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
    acsitem['authors'] = authors
    acsitem['_id'] = get_md5(acsitem['url'])
    acsitem['company'] = self.name
    yield acsitem
def parse_article(self, response):
    scienceitem = LalItem()
    scienceitem['url'] = response.url
    data = response.meta['data']
    scienceitem['title'] = data['title'][0]
    scienceitem['journal'] = data['source'][0]
    scienceitem['doi'] = data['doi'][0]
    scienceitem['authors'] = data['authors']
    scienceitem['year'] = int(data['pubyear'][0])
    abstract_text = response.css('.section.abstract p').extract()
    if abstract_text:
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text[0], re.S)
        scienceitem['abstract'] = abstract_match.group(1)
    else:
        scienceitem['abstract'] = ''
    scienceitem['citing_num'] = 0
    scienceitem['abs_img_url'] = response.url + '/F1.large.jpg'
    scienceitem['keywords'] = []
    scienceitem['_id'] = get_md5(scienceitem['url'])
    scienceitem['company'] = self.name
    yield scienceitem
def parse_article(self, response):
    natureitem = LalItem()
    natureitem['url'] = response.url
    title = response.css('header .tighten-line-height.small-space-below').extract()[0]
    title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
    natureitem['title'] = title_match.group(1)
    # journal, doi and year sit in different places in the old and new page layouts
    try:
        natureitem['journal'] = response.css(
            '.flex-box-item.none.border-gray-medium i::text').extract()[0]
        doi_tag = response.css('.flex-box-item.none.border-gray-medium li')[1].extract()
        doi_match = re.match(r'.*</abbr>:(.+?)</li>', doi_tag, re.S)
        natureitem['doi'] = doi_match.group(1)
        year_info = response.css('.flex-box-item.none.border-gray-medium li').extract()[0]
        year_match = re.match(r'.*\(.*(\d{4}).*\).*', year_info, re.S)
    except (IndexError, AttributeError):
        natureitem['journal'] = response.css('.scroll-wrapper dd i::text').extract_first(default='')
        doi_tag = response.css('.scroll-wrapper dd').extract()[1]
        doi_match = re.match(r'.*>doi<.*?"(.+?)".*', doi_tag, re.S)
        natureitem['doi'] = doi_match.group(1)
        year_info = response.css('.scroll-wrapper dd').extract()[0]
        year_match = re.match(r'.*\(.*(\d{4}).*\).*', year_info, re.S)
    abstract_text = response.css('.pl20.mq875-pl0.js-collapsible-section p').extract()[0]
    abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
    natureitem['abstract'] = abstract_match.group(1)
    # the first figure path is embedded in the page's JSON blob
    img_match = re.match(
        r'.*?"index" : 1.*?"imagePaths" : \[ "(.*?jpg)" \].*', response.text, re.S)
    if img_match:
        natureitem['abs_img_url'] = ('https:' + img_match.group(1)) if not img_match.group(1).startswith('http') else img_match.group(1)
    else:
        natureitem['abs_img_url'] = ''
    # the citation-count tag is absent when there are no citations, so default to 0
    citing_info = response.css('li[data-test="citation-count"]::text').extract()
    natureitem['citing_num'] = int(citing_info[0].split(' ')[-1]) if citing_info else 0
    natureitem['keywords'] = response.css('.subject-tag-link::text').extract()
    natureitem['year'] = int(year_match.group(1))
    author_group = response.css('li[itemprop="author"]').extract()
    # corresponding authors carry a data-corresp-id attribute
    commun_author = [author for author in author_group if 'data-corresp-id' in author]
    authors = []
    for author in author_group:
        match = re.match(
            r'.*<span itemprop="name".*?>(?:<a data-test="author-name".*?>)?(.+?)(?:</a>)?</span.*',
            author, re.S)
        name = match.group(1)
        if author in commun_author:
            name = name + '*'
        authors.append(name)
    natureitem['authors'] = authors
    natureitem['_id'] = get_md5(natureitem['url'])
    natureitem['company'] = self.name
    yield natureitem
def parse_article(self, response):
    scidiritem = LalItem()
    scidiritem['url'] = response.url
    # the title may contain sub-tags
    title = response.css('.title-text').extract()[0]
    title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
    scidiritem['title'] = title_match.group(1)
    scidiritem['journal'] = response.css('.publication-title-link::text').extract()[0]
    doi_link = response.css('.DoiLink .doi::text').extract()[0]
    scidiritem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)
    abstract_text = response.css('.abstract.author p').extract()
    abstract_text = '\n'.join(abstract_text)
    abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
    scidiritem['abstract'] = abstract_match.group(1)
    img_url = response.css('.abstract.graphical img::attr(src)').extract_first(default='')
    if img_url:
        scidiritem['abs_img_url'] = img_url
    else:
        # fall back to the CDN naming convention; a small number of these URLs are invalid
        scidiritem['abs_img_url'] = ('https://ars.els-cdn.com/content/image/1-s2.0-'
                                     + response.url.split('/')[-1] + '-gr1.jpg')
    citing_info = response.css('.related-content-links .button-text::text').extract()
    if citing_info:
        citing_num = ''.join(citing_info)
        num_match = re.match(r'.*\((\d+)\)', citing_num)
        scidiritem['citing_num'] = int(num_match.group(1))
    else:
        scidiritem['citing_num'] = 0
    scidiritem['keywords'] = response.css('.keywords-section .keyword span::text').extract()
    year = response.css('.publication-volume .size-m::text').extract()
    year = ''.join(year)
    year_match = re.match(r'.*\s(\d{4}),.*', year)
    scidiritem['year'] = int(year_match.group(1))
    author_group = response.css('.AuthorGroups .author').extract()
    # corresponding authors are marked with an envelope icon rendered as an <svg> element
    commun_author = [author for author in author_group if '<svg' in author]
    authors = []
    for author in author_group:
        match = re.match(r'.*"text given-name">(.+?)<.*"text surname">(.+?)<.*', author)
        name = match.group(1) + ' ' + match.group(2)
        if author in commun_author:
            name = name + '*'
        authors.append(name)
    scidiritem['authors'] = authors
    scidiritem['_id'] = get_md5(scidiritem['url'])
    scidiritem['company'] = self.name
    yield scidiritem
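# All of the parse methods above populate the same LalItem fields and build `_id`
# by hashing the article URL with get_md5. Neither is defined in this section; the
# sketch below is only an assumption about their shape, inferred from the fields
# used here, not the project's actual definitions.
import hashlib

import scrapy


class LalItem(scrapy.Item):
    # fields filled in by the publisher-specific parse methods above
    url = scrapy.Field()
    title = scrapy.Field()
    journal = scrapy.Field()
    doi = scrapy.Field()
    year = scrapy.Field()
    citing_num = scrapy.Field()
    abstract = scrapy.Field()
    abs_img_url = scrapy.Field()
    keywords = scrapy.Field()
    authors = scrapy.Field()
    _id = scrapy.Field()
    company = scrapy.Field()


def get_md5(value):
    # hash the article URL into a stable hex digest used as the item's _id
    return hashlib.md5(value.encode('utf-8')).hexdigest()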