Example #1
    def parse_node(self, response, node):
        """Parse an arXiv XML exported file into a HEP record."""
        node.remove_namespaces()

        record = HEPLoader(item=HEPRecord(), selector=node)
        record.add_xpath('title', './/title/text()')
        record.add_xpath('abstract', './/abstract/text()')
        record.add_xpath('preprint_date', './/created/text()')
        record.add_xpath('dois', './/doi//text()')
        record.add_xpath('pubinfo_freetext', './/journal-ref//text()')
        record.add_value('source', 'arXiv')

        authors, collabs = self._get_authors_or_collaboration(node)
        record.add_value('authors', authors)
        record.add_value('collaborations', collabs)

        collections = ['HEP', 'Citeable', 'arXiv']
        comments = '; '.join(node.xpath('.//comments//text()').extract())
        if comments:
            pages, notes, doctype = self._parse_comments_info(comments)
            record.add_value('public_notes', notes)
            if pages:
                record.add_value('page_nr', pages)
            if doctype:
                collections.append(doctype)
        record.add_value('collections', collections)

        record.add_value(
            'report_numbers',
            self._get_arxiv_report_numbers(node)
        )

        plain_categories = ' '.join(
            node.xpath('.//categories//text()').extract()
        ).split()
        categories = self._get_categories_object(plain_categories)
        record.add_value('field_categories', categories)
        record.add_value(
            'arxiv_eprints',
            self._get_arxiv_eprint(node, plain_categories)
        )
        record.add_value(
            'external_system_numbers',
            self._get_ext_systems_number(node)
        )

        license = get_license(
            license_url=node.xpath('.//license//text()').extract_first()
        )
        record.add_value('license', license)

        parsed_record = dict(record.load_item())
        validate_schema(data=parsed_record, schema_name='hep')

        return parsed_record
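
The page count, public notes, and extra doctype all come out of the free-text arXiv comments field via self._parse_comments_info. That helper is not shown in these examples, so the sketch below is only an illustration: the regular expression, the keyword-to-doctype mapping, and the choice to keep the raw comments string as the note are all assumptions.

    import re

    def _parse_comments_info(self, comments):
        """Hypothetical sketch: pull page count, notes and doctype from arXiv comments."""
        # Assumption: the raw comments string is kept as the public note.
        notes = comments

        # Assumption: a page count appears as e.g. "12 pages" somewhere in the string.
        pages = None
        match = re.search(r'(\d+)\s*pages?\b', comments, re.IGNORECASE)
        if match:
            pages = int(match.group(1))

        # Assumption: a few keywords map to an extra collection doctype.
        doctype = None
        lowered = comments.lower()
        if 'thesis' in lowered:
            doctype = 'Thesis'
        elif 'lectures' in lowered:
            doctype = 'Lectures'

        return pages, notes, doctype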
Example #2
    def parse_node(self, response, node):
        """Parse an arXiv XML exported file into a HEP record."""
        node.remove_namespaces()

        record = HEPLoader(item=HEPRecord(), selector=node)
        record.add_xpath('title', './/title/text()')
        record.add_xpath('abstract', './/abstract/text()')
        record.add_xpath('preprint_date', './/created/text()')
        record.add_xpath('dois', './/doi//text()')
        record.add_xpath('pubinfo_freetext', './/journal-ref//text()')
        record.add_value('source', 'arXiv')

        authors, collabs = self._get_authors_or_collaboration(node)
        record.add_value('authors', authors)
        record.add_value('collaborations', collabs)

        collections = ['HEP', 'Citeable', 'arXiv']
        comments = '; '.join(node.xpath('.//comments//text()').extract())
        if comments:
            pages, notes, doctype = self._parse_comments_info(comments)
            record.add_value('public_notes', notes)
            if pages:
                record.add_value('page_nr', pages)
            if doctype:
                collections.append(doctype)
        record.add_value('collections', collections)

        record.add_value('report_numbers',
                         self._get_arxiv_report_numbers(node))

        plain_categories = ' '.join(
            node.xpath('.//categories//text()').extract()).split()
        categories = self._get_categories_object(plain_categories)
        record.add_value('field_categories', categories)
        record.add_value('arxiv_eprints',
                         self._get_arxiv_eprint(node, plain_categories))
        record.add_value('external_system_numbers',
                         self._get_ext_systems_number(node))

        license = get_license(
            license_url=node.xpath('.//license//text()').extract_first())
        record.add_value('license', license)

        parsed_record = dict(record.load_item())
        validate_schema(data=parsed_record, schema_name='hep')

        return parsed_record
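
Example #2 is the same parser with a different line-wrapping style. For context, a parse_node(self, response, node) method like this is the callback Scrapy's XMLFeedSpider invokes for every matched element in the harvested feed; the minimal wiring might look like the sketch below, where the spider name and itertag value are assumptions rather than the project's actual settings.

    from scrapy.spiders import XMLFeedSpider

    class ArxivSpiderSketch(XMLFeedSpider):
        """Hypothetical wiring for a parse_node callback like the one above."""
        name = 'arxiv_sketch'
        iterator = 'xml'
        # Assumption: the element to iterate over depends on the arXiv export format.
        itertag = 'record'

        def parse_node(self, response, node):
            # Scrapy calls this once per matched element in the downloaded feed.
            node.remove_namespaces()
            ...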
Example #3
    def process_item(self, item, spider):
        """Convert internal format to INSPIRE data model."""
        self.count += 1
        if 'related_article_doi' in item:
            item['dois'] += item.pop('related_article_doi', [])

        source = item.pop('source', spider.name)
        item['acquisition_source'] = {
            'source': source,
            # NOTE: Keeps method same as source to conform with INSPIRE
            # submissions which add `submissions` to this field.
            'method': source,
            'date': datetime.datetime.now().isoformat(),
            'submission_number': os.environ.get('SCRAPY_JOB', ''),
        }

        item['titles'] = [{
            'title': item.pop('title', ''),
            'subtitle': item.pop('subtitle', ''),
            'source': source,
        }]
        item['abstracts'] = [{
            'value': item.pop('abstract', ''),
            'source': source,
        }]
        item['imprints'] = [{
            'date': item.pop('date_published', ''),
        }]
        item['copyright'] = [{
            'holder': item.pop('copyright_holder', ''),
            'year': item.pop('copyright_year', ''),
            'statement': item.pop('copyright_statement', ''),
            'material': item.pop('copyright_material', ''),
        }]
        if not item.get('publication_info'):
            if has_publication_info(item):
                item['publication_info'] = [{
                    'journal_title': item.pop('journal_title', ''),
                    'journal_volume': item.pop('journal_volume', ''),
                    'year': int(item.pop('journal_year', 0) or 0) or '',
                    'journal_issue': item.pop('journal_issue', ''),
                    'artid': item.pop('journal_artid', ''),
                    'page_start': item.pop('journal_fpage', ''),
                    'page_end': item.pop('journal_lpage', ''),
                    'note': item.pop('journal_doctype', ''),
                    'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
                }]

        # Remove any leftover journal_* fields that were not consumed above
        filter_fields(item, [
            'journal_title',
            'journal_volume',
            'journal_year',
            'journal_issue',
            'journal_fpage',
            'journal_lpage',
            'journal_doctype',
            'journal_artid',
            'pubinfo_freetext',
        ])

        validate_schema(dict(item), 'hep')
        return item
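
The pipeline leans on two small helpers, has_publication_info and filter_fields, whose implementations are not part of these examples. The sketches below are assumptions about their behaviour, inferred from how they are called above.

    def has_publication_info(item):
        """Hypothetical sketch: True if any journal-related field carries a value."""
        keys = (
            'journal_title', 'journal_volume', 'journal_year', 'journal_issue',
            'journal_fpage', 'journal_lpage', 'journal_doctype', 'journal_artid',
            'pubinfo_freetext',
        )
        return any(item.get(key) for key in keys)

    def filter_fields(item, keys):
        """Hypothetical sketch: drop the given keys from the item if still present."""
        for key in keys:
            item.pop(key, None)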
Example #4
    def parse_node(self, response, node):
        """Parse a WSP XML file into a HEP record."""
        node.remove_namespaces()
        article_type = node.xpath('@article-type').extract()
        self.log("Got article_type {0}".format(article_type))
        if (not article_type or
                article_type[0] not in self.allowed_article_types):
            # Filter out non-interesting article types
            return None

        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        if article_type[0] in ['correction', 'addendum']:
            record.add_xpath('related_article_doi',
                             "//related-article[@ext-link-type='doi']/@href")
            record.add_value('journal_doctype', article_type[0])
        record.add_xpath('dois', "//article-id[@pub-id-type='doi']/text()")
        record.add_xpath('page_nr', "//counts/page-count/@count")

        record.add_xpath('abstract', '//abstract[1]')
        record.add_xpath('title', '//article-title/text()')
        record.add_xpath('subtitle', '//subtitle/text()')

        record.add_value('authors', self._get_authors(node))
        record.add_xpath('collaborations', "//contrib/collab/text()")

        free_keywords, classification_numbers = self._get_keywords(node)
        record.add_value('free_keywords', free_keywords)
        record.add_value('classification_numbers', classification_numbers)

        # TODO: Special journal title handling
        # journal, volume = fix_journal_name(journal, self.journal_mappings)
        # volume += get_value_in_tag(self.document, 'volume')
        journal_title = '//abbrev-journal-title/text()|//journal-title/text()'
        record.add_xpath('journal_title', journal_title)
        record.add_xpath('journal_issue', '//issue/text()')
        record.add_xpath('journal_volume', '//volume/text()')
        record.add_xpath('journal_artid', '//elocation-id/text()')

        record.add_xpath('journal_fpage', '//fpage/text()')
        record.add_xpath('journal_lpage', '//lpage/text()')

        published_date = self._get_published_date(node)
        record.add_value('journal_year', int(published_date[:4]))
        record.add_value('date_published', published_date)

        record.add_xpath('copyright_holder', '//copyright-holder/text()')
        record.add_xpath('copyright_year', '//copyright-year/text()')
        record.add_xpath('copyright_statement', '//copyright-statement/text()')
        record.add_value('copyright_material', 'Article')

        license = get_license(
            license_url=node.xpath(
                '//license/license-p/ext-link/@href').extract_first(),
            license_text=node.xpath(
                '//license/license-p/ext-link/text()').extract_first(),
        )
        record.add_value('license', license)

        record.add_value(
            'collections',
            self._get_collections(node, article_type, journal_title))
        parsed_record = dict(record.load_item())
        validate_schema(data=parsed_record, schema_name='hep')

        return parsed_record
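
self._get_published_date(node) supplies both date_published and the year used for journal_year. Its implementation is not shown here; a hedged sketch of how such a helper could read a JATS <pub-date> element follows, with the element names and the zero-padded ISO-style formatting being assumptions.

    def _get_published_date(self, node):
        """Hypothetical sketch: build an ISO-style date from a JATS <pub-date> element."""
        year = node.xpath('//pub-date/year/text()').extract_first()
        month = node.xpath('//pub-date/month/text()').extract_first() or '1'
        day = node.xpath('//pub-date/day/text()').extract_first() or '1'
        if not year:
            return ''
        return '{0}-{1:0>2}-{2:0>2}'.format(year, month, day)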
Example #5
    def parse_node(self, response, node):
        """Parse a WSP XML file into a HEP record."""
        node.remove_namespaces()
        article_type = node.xpath('@article-type').extract()
        self.log("Got article_type {0}".format(article_type))
        if not article_type or article_type[0] not in self.allowed_article_types:
            # Filter out non-interesting article types
            return None

        record = HEPLoader(item=HEPRecord(), selector=node, response=response)
        if article_type[0] in ['correction', 'addendum']:
            record.add_xpath('related_article_doi', "//related-article[@ext-link-type='doi']/@href")
            record.add_value('journal_doctype', article_type[0])
        record.add_xpath('dois', "//article-id[@pub-id-type='doi']/text()")
        record.add_xpath('page_nr', "//counts/page-count/@count")

        record.add_xpath('abstract', '//abstract[1]')
        record.add_xpath('title', '//article-title/text()')
        record.add_xpath('subtitle', '//subtitle/text()')

        record.add_value('authors', self._get_authors(node))
        record.add_xpath('collaborations', "//contrib/collab/text()")

        free_keywords, classification_numbers = self._get_keywords(node)
        record.add_value('free_keywords', free_keywords)
        record.add_value('classification_numbers', classification_numbers)

        # TODO: Special journal title handling
        # journal, volume = fix_journal_name(journal, self.journal_mappings)
        # volume += get_value_in_tag(self.document, 'volume')
        journal_title = '//abbrev-journal-title/text()|//journal-title/text()'
        record.add_xpath('journal_title', journal_title)
        record.add_xpath('journal_issue', '//issue/text()')
        record.add_xpath('journal_volume', '//volume/text()')
        record.add_xpath('journal_artid', '//elocation-id/text()')

        record.add_xpath('journal_fpage', '//fpage/text()')
        record.add_xpath('journal_lpage', '//lpage/text()')

        published_date = self._get_published_date(node)
        record.add_value('journal_year', int(published_date[:4]))
        record.add_value('date_published', published_date)

        record.add_xpath('copyright_holder', '//copyright-holder/text()')
        record.add_xpath('copyright_year', '//copyright-year/text()')
        record.add_xpath('copyright_statement', '//copyright-statement/text()')
        record.add_value('copyright_material', 'Article')

        license = get_license(
            license_url=node.xpath(
                '//license/license-p/ext-link/@href').extract_first(),
            license_text=node.xpath(
                '//license/license-p/ext-link/text()').extract_first(),
        )
        record.add_value('license', license)

        record.add_value('collections', self._get_collections(node, article_type, journal_title))
        parsed_record = dict(record.load_item())
        validate_schema(data=parsed_record, schema_name='hep')

        return parsed_record
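
Both spiders finish by calling validate_schema(data=parsed_record, schema_name='hep'). The real validator is not included in these examples; the sketch below shows one plausible shape built on the jsonschema library, with the schema file location being an assumption.

    import json

    from jsonschema import validate

    def validate_schema(data, schema_name):
        """Hypothetical sketch: validate a parsed record against a bundled JSON schema."""
        # Assumption: schema files live alongside the code as e.g. schemas/hep.json.
        with open('schemas/{0}.json'.format(schema_name)) as schema_file:
            schema = json.load(schema_file)
        # Raises jsonschema.exceptions.ValidationError if the record does not conform.
        validate(data, schema)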