Beispiel #1
0
    def parse_article(self, response):
        """Parse an APS article page into a LalItem.

        Most fields come from the JSON blob forwarded in
        ``response.meta['data']``; only the abstract figure URL is taken
        from the page itself.
        """
        apsitem = LalItem()
        data = response.meta['data']
        apsitem['url'] = response.url
        apsitem['title'] = data['title']
        # Citation text looks like "Journal Name <vol ...>"; keep the part
        # before the first " <".
        apsitem['journal'] = data['citation'].split(' <')[0]
        apsitem['doi'] = data['doi']

        # Publication year appears in parentheses inside the citation text.
        year_match = re.match(r'.*\((\d{4})\).*', data['citation'])
        apsitem['year'] = int(year_match.group(1))

        # citation_count_text is like "12 citations"; absent means zero.
        citing_info = data.get('citation_count_text', '')
        apsitem['citing_num'] = int(
            citing_info.split(' ')[0]) if citing_info else 0

        # Strip the single outer HTML tag wrapping the abstract.
        abstract_text = data['summary']
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        apsitem['abstract'] = abstract_match.group(1)

        img_url = response.css('.clear-wrap li img::attr(src)').extract()
        apsitem['abs_img_url'] = ('https://journals.aps.org' +
                                  img_url[0]) if img_url else ''

        # A comprehension over an empty list already yields [], so no
        # explicit empty-list branch is needed.
        subject_areas = data.get('subject_areas', [])
        apsitem['keywords'] = [subject['label'] for subject in subject_areas]

        # The author string is comma separated and the last entry is
        # prefixed with "and " ("..., and Jane Doe").  Strip only a LEADING
        # "and " — the previous unanchored r'and ' substitution also mangled
        # names merely containing the substring ("Roland Smith" -> "Rol Smith").
        authors = data['authors'].split(', ')
        authors[-1] = re.sub(r'^and\s+', '', authors[-1])
        apsitem['authors'] = authors

        apsitem['_id'] = get_md5(apsitem['url'])
        apsitem['company'] = self.name

        yield apsitem
Beispiel #2
0
    def parse_article(self, response):
        """Parse an OSA article page into a LalItem."""
        item = LalItem()
        data = response.meta['data']

        item['url'] = response.url
        item['title'] = data['title']
        item['authors'] = data['author'].split('; ')
        item['doi'] = data['doi']

        # Prefer the journal name shown on the page; fall back to the
        # metadata blob when the page element is missing.
        journal = response.css('.article-journal-name li strong::text').extract_first('')
        item['journal'] = journal if journal else data['name'].split(',')[0]

        item['year'] = int(data['years'])
        item['keywords'] = []
        item['abs_img_url'] = response.css('img[alt="Fig. 1"]::attr(data-src)').extract_first(default='')

        # citing_num is a hidden (display:none) field that would require
        # selenium to read, so it is skipped for now.
        item['citing_num'] = 0

        # Some articles have no abstract at all, and the markup varies when
        # they do: collect paragraph bodies up to the copyright notice.
        fragments = []
        for paragraph in response.css('#articleBody p').extract():
            if '©' in paragraph:
                break
            fragments.append(
                re.match(r'<.+?>(.*)</.+>', paragraph, re.S).group(1))
        item['abstract'] = ''.join(fragments)

        item['_id'] = get_md5(item['url'])
        item['company'] = self.name

        yield item
Beispiel #3
0
    def parse_article(self, response):
        """Parse an AIP (scitation.org) article page into a LalItem."""
        aipitem = LalItem()
        aipitem['url'] = response.url
        # Keep sub-tags in the title; everything before the first <span>.
        title = response.css('.publicationContentTitle h3').extract()[0]
        title_match = re.match(r'<h3>(.*?)<span.*', title, re.S)
        aipitem['title'] = title_match.group(1).strip()

        aipitem['journal'] = response.css(
            '.publicationContentCitation::text').extract()[0].strip()
        doi_link = response.css(
            '.publicationContentCitation a::text').extract()[0]
        aipitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        year_info = response.css(
            '.publicationContentCitation::text').extract()[1].strip()
        year_match = re.match(r'.*\((\d{4})\).*', year_info)
        aipitem['year'] = int(year_match.group(1))

        abstract_text = response.css('div.NLM_paragraph').extract()[0]
        # BUG FIX: re.sub's fourth positional argument is `count`, not
        # `flags`.  Passing re.S (== 16) there capped the substitutions at
        # 16 and left DOTALL off, so div/named-content tags spanning
        # newlines survived.  Pass it as flags=.
        aipitem['abstract'] = re.sub(r'(<|</)(div|named).*?>', '',
                                     abstract_text, flags=re.S)

        img_url = response.css('.figure-no-f1 img::attr(src)').extract_first(
            default='')
        if img_url:
            aipitem['abs_img_url'] = 'https://aip.scitation.org' + img_url
        else:
            # Reconstruct the figure URL from metadata embedded in a script.
            info_match = re.match(
                r".*journal=(.+?)&volume=(\d+?)&issue=(\d+?)&doi=10.1063/(.+?)\';.*",
                response.text, re.S)
            if info_match:
                jname, vol, issue, doiend = info_match.groups()
                img_url = 'https://aip.scitation.org/na101/home/literatum/publisher/aip/journals/content/{0}/{4}/{0}.{4}.{1}.issue-{2}/{3}/production/images/small/{3}.figures.f1.gif'
                aipitem['abs_img_url'] = img_url.format(
                    jname, vol, issue, doiend, aipitem['year'])
                # This fallback does not work for 2017-2018 articles
                # (i.e. the last two years).
            else:
                aipitem['abs_img_url'] = ''

        aipitem['citing_num'] = response.meta['citation']
        aipitem['keywords'] = response.css('.topicTags a::text').extract()
        # Corresponding authors are marked with an "a)" footnote; append '*'.
        author_group = response.css('.contrib-author').extract()
        commun_author = [author for author in author_group if 'a)' in author]
        authors = []
        for author in author_group:
            match = re.match(r'.*<a href=.*?>(.+?)</a.*', author, re.S)
            name = match.group(1).strip()
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        aipitem['authors'] = authors

        aipitem['_id'] = get_md5(aipitem['url'])
        aipitem['company'] = self.name

        yield aipitem
Beispiel #4
0
    def parse_article(self, response):
        """Parse a Wiley article page into a LalItem, then follow the
        issue's table-of-contents page to finish the item in parse_toc.
        """
        wileyitem = LalItem()
        wileyitem['url'] = response.url
        # Some articles carry two titles (the first in German, the second in
        # English); prefer the second when present.  Titles may contain
        # sub-tags, which are kept.
        try:
            title = response.css('.citation__title--second').extract()[0]
        except IndexError:  # narrowed from bare except: only [0] can fail here
            title = response.css('.citation__title').extract()[0]

        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        title = title_match.group(1)
        wileyitem['title'] = re.sub('\n', ' ', title)

        wileyitem['journal'] = response.css(
            '.article-citation h1 a::text').extract()[0]

        doi_link = response.css('.epub-doi::text').extract()[0]
        wileyitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        abstract_text = response.css(
            '.article-section__content p').extract()[0]
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        abstract_text = abstract_match.group(1)
        wileyitem['abstract'] = re.sub('\n', ' ', abstract_text)

        # The cited-by link text is the citation count; absent means zero.
        wileyitem['citing_num'] = int(
            response.css('a[href="#citedby-section"]::text').extract_first(
                default='0'))
        wileyitem['keywords'] = response.css(
            'meta[name="citation_keywords"]::attr(content)').extract()
        # The epub date ends with the four-digit year.
        wileyitem['year'] = int(
            response.css('.epub-date::text').extract()[0][-4:])

        # Corresponding authors are labelled in their markup; mark with '*'.
        author_group = response.css(
            '.accordion-tabbed .accordion-tabbed__tab-mobile').extract()
        commun_author = [
            author for author in author_group
            if 'Corresponding Author' in author
        ]
        authors = []
        for author in author_group:
            match = re.match(r'.*<a href=.*?><span>(.+?)<.*', author, re.S)
            name = match.group(1)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        wileyitem['authors'] = authors

        wileyitem['_id'] = get_md5(wileyitem['url'])
        wileyitem['company'] = self.name

        # The journal/volume info lives on the issue TOC page; finish there.
        toc_url = 'https://onlinelibrary.wiley.com' + response.css(
            'a.volume-issue::attr(href)').extract()[0]
        yield Request(url=toc_url,
                      callback=self.parse_toc,
                      meta={'item': wileyitem})
Beispiel #5
0
    def parse_article(self, response):
        """Parse an RSC (pubs.rsc.org) article page into a LalItem."""
        item = LalItem()
        item['url'] = response.url

        # The title container differs between layouts and may carry sub-tags.
        raw_title = response.css(
            '.article__title h2 p, .article__title p, .article__title h2, .article-control h2'
        ).extract()[0]
        item['title'] = re.match(
            r'<.+?>(.+)</.*?>', raw_title, re.S).group(1).strip()

        item['journal'] = response.css(
            '.h--heading3.no-heading::text').extract_first(default='')
        item['doi'] = response.css('.list__item-data::text')[1].extract()

        # Strip the outer tag from the abstract paragraph, when present.
        raw_abstract = response.css('.capsule__text p').extract_first(
            default='')
        item['abstract'] = (
            re.match(r'<.+?>(.+)</.+>', raw_abstract, re.S).group(1)
            if raw_abstract else '')

        img_src = response.css(
            '.capsule__article-image img::attr(src)').extract_first(default='')
        item['abs_img_url'] = ('https://pubs.rsc.org' + img_src
                               if img_src else '')

        # citing_num would need an extra JS request and RSC's figure is not
        # very accurate, so it is left at zero for now.
        item['citing_num'] = 0

        item['keywords'] = []
        issue_text = response.css(
            '.article-nav__issue.autopad--h a::text').extract_first(default='')
        item['year'] = (
            int(re.match(r'.*Issue \d+, (\d{4}).*', issue_text).group(1))
            if issue_text else None)

        # Corresponding authors carry a literal '*' in their markup.
        author_tags = response.css('.article__author-link').extract()
        starred = [tag for tag in author_tags if '>*</' in tag]
        names = []
        for tag in author_tags:
            name = re.match(r'.*<a href=.*?>(.+?)</a.*', tag, re.S).group(1)
            names.append(name + '*' if tag in starred else name)
        item['authors'] = names

        item['_id'] = get_md5(item['url'])
        item['company'] = self.name

        yield item
Beispiel #6
0
    def parse_article(self, response):
        """Parse an IOP (iopscience) article page into a LalItem."""
        iopitem = LalItem()
        iopitem['url'] = response.url
        # Keep sub-tags in the title.
        title = response.css('.wd-jnl-art-title').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        iopitem['title'] = title_match.group(1)

        iopitem['journal'] = response.css(
            '.wd-jnl-art-breadcrumb-title a::text').extract()[0]

        doi_link = response.css('.wd-jnl-art-doi a::text').extract()[0]
        iopitem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        # Some abstracts are not wrapped in a <p>; fall back to the container.
        try:
            abstract_text = response.css('.wd-jnl-art-abstract p').extract()[0]
        except IndexError:  # narrowed from bare except: only [0] can fail here
            abstract_text = response.css('.wd-jnl-art-abstract').extract()[0]
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        iopitem['abstract'] = abstract_match.group(1)

        img_url = response.css('img[alt="Fig. 1."]::attr(src)').extract_first(
            default='')
        iopitem['abs_img_url'] = img_url if img_url else ''

        citing_num = response.css('.wd-jnl-art-cited-by::text').extract_first(
            '')  # the element is absent when there are zero citations
        # BUG FIX: the previous r'.*(\d+).*' pattern let the greedy '.*'
        # swallow all but the LAST digit ("Cited by 12" -> 2); search for
        # the whole number instead.
        num_match = re.search(r'\d+', citing_num) if citing_num else None
        iopitem['citing_num'] = int(num_match.group(0)) if num_match else 0

        iopitem['keywords'] = []
        year_info = response.css(
            '.wd-jnl-art-article-info-citation p::text').extract()
        if year_info:
            year_match = re.match(r'.*\s(\d{4})\s.*', ' '.join(year_info))
            iopitem['year'] = int(year_match.group(1))
        else:
            iopitem['year'] = None

        iopitem['authors'] = response.css(
            '.mb-0 span[itemprop="name"]::text').extract()
        iopitem['_id'] = get_md5(iopitem['url'])
        iopitem['company'] = self.name

        yield iopitem
Beispiel #7
0
    def parse(self, response):
        """Parse an ACS (pubs.acs.org) article page into a LalItem."""
        acsitem = LalItem()
        acsitem['url'] = response.url
        # Keep sub-tags in the title.
        title = response.css('.hlFld-Title').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        acsitem['title'] = title_match.group(1)

        acsitem['journal'] = response.css('#citation cite::text').extract_first(default='')
        acsitem['doi'] = response.css('#doi::text').extract()[0]
        # Keep sub-tags in the abstract as well.
        abstract_text = response.css('.articleBody_abstractText').extract_first(default='')
        if abstract_text:
            abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
            acsitem['abstract'] = abstract_match.group(1)
        else:
            acsitem['abstract'] = ''

        abs_img_url = response.css('#absImg img::attr(src)').extract_first(default='')
        if abs_img_url:
            abs_img_url = urljoin('https://pubs.acs.org', abs_img_url)
        acsitem['abs_img_url'] = abs_img_url

        acsitem['citing_num'] = len(response.css('#citedBy li'))
        acsitem['keywords'] = []
        # The year lives in .citation_year on most pages; fall back to the
        # last four characters of #pubDate when that element is missing or
        # not a plain integer.
        try:
            acsitem['year'] = int(response.css('.citation_year::text').extract()[0])
        except (IndexError, ValueError):  # narrowed from bare except
            acsitem['year'] = int(response.css('#pubDate::text').extract()[0][-4:])
        author_group = response.css('#authors > span').extract()
        # Corresponding authors link to the #cor1 footnote; mark with '*'.
        commun_author = [author for author in author_group if '#cor1' in author]
        authors = []
        for author in author_group:
            match = re.match(r'.*<a id="authors".*?>(.+?)</a.*', author, re.S)
            if match:
                name = match.group(1)
                if author in commun_author:
                    name = name + '*'
                authors.append(name)
        acsitem['authors'] = authors
        acsitem['_id'] = get_md5(acsitem['url'])
        acsitem['company'] = self.name

        yield acsitem
Beispiel #8
0
    def parse_article(self, response):
        """Parse a Science-family article page into a LalItem."""
        item = LalItem()
        meta = response.meta['data']

        item['url'] = response.url
        item['title'] = meta['title'][0]
        item['journal'] = meta['source'][0]
        item['doi'] = meta['doi'][0]
        item['authors'] = meta['authors']
        item['year'] = int(meta['pubyear'][0])

        # Strip the outer tag from the first abstract paragraph, if any.
        paragraphs = response.css('.section.abstract p').extract()
        if paragraphs:
            item['abstract'] = re.match(
                r'<.+?>(.+)</.+>', paragraphs[0], re.S).group(1)
        else:
            item['abstract'] = ''

        item['citing_num'] = 0
        # The first full-size figure follows a predictable URL pattern.
        item['abs_img_url'] = response.url + '/F1.large.jpg'
        item['keywords'] = []
        item['_id'] = get_md5(item['url'])
        item['company'] = self.name
        yield item
Beispiel #9
0
    def parse_article(self, response):
        """Parse a Nature article page (old or new layout) into a LalItem."""
        natureitem = LalItem()

        natureitem['url'] = response.url
        title = response.css(
            'header .tighten-line-height.small-space-below').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        natureitem['title'] = title_match.group(1)
        # journal/doi/year live in different elements in the new vs the old
        # page layout; try the new layout first and fall back to the old.
        try:
            natureitem['journal'] = response.css(
                '.flex-box-item.none.border-gray-medium i::text').extract()[0]
            doi_tag = response.css(
                '.flex-box-item.none.border-gray-medium li')[1].extract()
            doi_match = re.match(r'.*</abbr>:(.+?)</li>', doi_tag, re.S)
            natureitem['doi'] = doi_match.group(1)
            year_info = response.css(
                '.flex-box-item.none.border-gray-medium li').extract()[0]
            year_match = re.match(r'.*\(.*(\d{4}).*\).*', year_info, re.S)
        except (IndexError, AttributeError):
            # Narrowed from bare except.  IndexError: new-layout elements
            # are absent; AttributeError: the doi regex found no match.
            natureitem['journal'] = response.css(
                '.scroll-wrapper dd i::text').extract_first(default='')
            doi_tag = response.css('.scroll-wrapper dd').extract()[1]
            doi_match = re.match(r'.*>doi<.*?"(.+?)".*', doi_tag, re.S)
            natureitem['doi'] = doi_match.group(1)
            year_info = response.css('.scroll-wrapper dd').extract()[0]
            year_match = re.match(r'.*\(.*(\d{4}).*\).*', year_info, re.S)

        abstract_text = response.css(
            '.pl20.mq875-pl0.js-collapsible-section p').extract()[0]
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        natureitem['abstract'] = abstract_match.group(1)
        # The first figure's path is embedded in a JSON blob in the page.
        img_match = re.match(
            r'.*?"index" : 1.*?"imagePaths" : \[ "(.*?jpg)" \].*',
            response.text, re.S)
        if img_match:
            natureitem['abs_img_url'] = (
                'https:' + img_match.group(1)) if not img_match.group(
                    1).startswith('http') else img_match.group(1)
        else:
            natureitem['abs_img_url'] = ''

        citing_info = response.css('li[data-test="citation-count"]::text'
                                   ).extract()  # element absent when the article has no citations
        natureitem['citing_num'] = int(
            citing_info[0].split(' ')[-1]) if citing_info else 0

        natureitem['keywords'] = response.css(
            '.subject-tag-link::text').extract()
        natureitem['year'] = int(year_match.group(1))

        # Corresponding authors carry a data-corresp-id attribute; mark '*'.
        author_group = response.css('li[itemprop="author"]').extract()
        commun_author = [
            author for author in author_group if 'data-corresp-id' in author
        ]
        authors = []
        for author in author_group:
            match = re.match(
                r'.*<span itemprop="name".*?>(?:<a data-test="author-name".*?>)?(.+?)(?:</a>)?</span.*',
                author, re.S)
            name = match.group(1)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        natureitem['authors'] = authors

        natureitem['_id'] = get_md5(natureitem['url'])
        natureitem['company'] = self.name
        yield natureitem
Beispiel #10
0
    def parse_article(self, response):
        """Parse a ScienceDirect article page into a LalItem."""
        scidiritem = LalItem()
        scidiritem['url'] = response.url
        # The title may contain sub-tags; strip the outer wrapper only.
        title = response.css('.title-text').extract()[0]
        title_match = re.match(r'<.+?>(.+)</.+>', title, re.S)
        scidiritem['title'] = title_match.group(1)

        scidiritem['journal'] = response.css(
            '.publication-title-link::text').extract()[0]

        doi_link = response.css('.DoiLink .doi::text').extract()[0]
        scidiritem['doi'] = re.sub(r'https://doi\.org/', '', doi_link)

        abstract_text = response.css('.abstract.author p').extract()
        abstract_text = '\n'.join(abstract_text)
        abstract_match = re.match(r'<.+?>(.+)</.+>', abstract_text, re.S)
        scidiritem['abstract'] = abstract_match.group(1)

        img_url = response.css(
            '.abstract.graphical img::attr(src)').extract_first(default='')
        if img_url:
            # Reuse the value already extracted instead of re-running the
            # identical CSS query a second time.
            scidiritem['abs_img_url'] = img_url
        else:
            # Guess the graphical-abstract URL from the article id in the
            # URL; a small number of these guessed URLs are invalid.
            scidiritem[
                'abs_img_url'] = 'https://ars.els-cdn.com/content/image/1-s2.0-' + response.url.split(
                    '/')[-1] + '-gr1.jpg'
        # The cited-by button text contains the count in parentheses.
        citing_info = response.css(
            '.related-content-links .button-text::text').extract()
        if citing_info:
            citing_num = ''.join(citing_info)
            num_match = re.match(r'.*\((\d+)\)', citing_num)
            scidiritem['citing_num'] = int(num_match.group(1))
        else:
            scidiritem['citing_num'] = 0

        scidiritem['keywords'] = response.css(
            '.keywords-section .keyword span::text').extract()

        year = response.css('.publication-volume .size-m::text').extract()
        year = ''.join(year)
        year_match = re.match(r'.*\s(\d{4}),.*', year)
        scidiritem['year'] = int(year_match.group(1))

        # Corresponding authors carry an <svg> envelope icon; mark with '*'.
        author_group = response.css('.AuthorGroups .author').extract()
        commun_author = [author for author in author_group if '<svg' in author]
        authors = []
        for author in author_group:
            # NOTE(review): this match has no re.S, so an author tag that
            # spans multiple lines would not match and .group would raise —
            # confirm the markup is single-line before relying on it.
            match = re.match(
                r'.*"text given-name">(.+?)<.*"text surname">(.+?)<.*', author)
            name = match.group(1) + ' ' + match.group(2)
            if author in commun_author:
                name = name + '*'
            authors.append(name)
        scidiritem['authors'] = authors

        scidiritem['_id'] = get_md5(scidiritem['url'])
        scidiritem['company'] = self.name
        yield scidiritem