Esempio n. 1
0
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method, or be
    subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in JATS format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the JATS metadata.
    """
    def __init__(self, jats_record, source=None):
        """Set up the XML root and the literature builder for one record.

        Args:
            jats_record (Union[str, scrapy.selector.Selector]): the record in
                JATS format to parse.
            source (Optional[str]): source handed to the builder; when it is
                empty, the publisher name found in the record is used.
        """
        self.root = self.get_root_node(jats_record)
        # ``self.publisher`` reads ``self.root``, so the root has to be set first.
        self.builder = LiteratureBuilder(source or self.publisher)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Keyword arguments forwarded to ``remove_tags`` when the abstract is
    # cleaned up (see the ``abstract`` property).
    # NOTE(review): presumably ``allowed_tags``/``allowed_trees`` are kept in the
    # output while nodes matching ``strip`` are dropped — confirm against
    # ``remove_tags``.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        abstract_nodes = self.root.xpath('./front//abstract[1]')

        if not abstract_nodes:
            return

        abstract = remove_tags(abstract_nodes[0], **self.remove_tags_config_abstract).strip()
        return abstract

    @property
    def article_type(self):
        article_type = self.root.xpath('./@article-type').extract_first()

        return article_type

    @property
    def artid(self):
        artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first()

        return artid

    @property
    def authors(self):
        author_nodes = self.root.xpath('./front//contrib[@contrib-type="author"]')
        authors = [self.get_author(author) for author in author_nodes]

        return authors

    @property
    def collaborations(self):
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        collaborations = set(
            collab.xpath('string(.)').extract_first() for collab in collab_nodes
        )

        return collaborations

    @property
    def copyright(self):
        copyright = {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }

        return copyright

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first()

        return copyright_holder

    @property
    def copyright_statement(self):
        copyright_statement = self.root.xpath('./front//copyright-statement/text()').extract_first()

        return copyright_statement

    @property
    def copyright_year(self):
        """Text of the first ``<copyright-year>``, passed through ``maybe_int``."""
        return maybe_int(
            self.root.xpath('./front//copyright-year/text()').extract_first()
        )

    @property
    def dois(self):
        doi_values = self.root.xpath('./front/article-meta//article-id[@pub-id-type="doi"]/text()').extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        if self.material != 'publication':
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()

        return bool(conference_node)

    @property
    def journal_title(self):
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()

        return journal_title

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first()

        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first()

        return journal_volume

    @property
    def keywords(self):
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(self.get_keywords(group) for group in keyword_groups)

        return keywords

    @property
    def license(self):
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }

        return license

    @property
    def license_statement(self):
        license_statement = self.root.xpath('string(./front/article-meta//license)').extract_first().strip()

        return license_statement

    @property
    def license_url(self):
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()

        return license_url

    @property
    def material(self):
        if self.article_type.startswith('correc'):
            material = 'erratum'
        elif self.article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = self.article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        """``count`` attribute of ``<page-count>``, passed through ``maybe_int``."""
        raw_count = self.root.xpath(
            './front/article-meta//page-count/@count'
        ).extract_first()
        return maybe_int(raw_count)

    @property
    def page_start(self):
        page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first()

        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath('./front/article-meta/lpage/text()').extract_first()

        return page_end

    @property
    def publication_date(self):
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )

        return publication_date

    @property
    def publication_info(self):
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }

        return publication_info

    @property
    def publisher(self):
        publisher = self.root.xpath('./front//publisher-name/text()').extract_first()

        return publisher

    @property
    def subtitle(self):
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()

        return subtitle

    @property
    def title(self):
        title = self.root.xpath('string(./front//article-title)').extract_first()

        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there is
                no match.
        """
        affiliation_node = self.root.xpath("//aff[@id=$id_]", id_=id_)
        if affiliation_node:
            affiliation = remove_tags(
                affiliation_node[0], strip="self::label | self::email"
            ).strip()
            return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            ' and not(starts-with(@publication-format, "online")'
        )
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations."""
        raw_referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = set()
        for raw_referred_id in raw_referred_ids:
            referred_ids.update(set(raw_referred_id.split(' ')))

        affiliations = [
            self.get_affiliation(rid) for rid in referred_ids
            if self.get_affiliation(rid)
        ]

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        emails = author_node.xpath('.//email/text()').extract()
        referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        for referred_id in referred_ids:
            emails.extend(self.get_emails_from_refs(referred_id))

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name."""
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # the author name is unstructured
            author_name = author_node.xpath('string(./string-name)').extract_first()
        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        author_name = ', '.join(el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Three candidates are computed: the ``iso-8601-date`` attribute, the
        ``<year>``/``<month>``/``<day>`` children and the ``<string-date>``
        child. ``get_first`` then picks among them, in that order.

        Returns:
            PartialDate: the parsed date.
        """
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        if iso_string:
            iso_date = PartialDate.loads(iso_string)
        else:
            iso_date = None

        year, month, day = (
            date_node.xpath(query).extract_first()
            for query in ('string(./year)', 'string(./month)', 'string(./day)')
        )
        if year:
            date_from_parts = PartialDate.from_parts(year, month, day)
        else:
            date_from_parts = None

        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            # A free-text date that cannot be parsed is simply not a candidate.
            parsed_date = None

        return get_first([iso_date, date_from_parts, parsed_date])

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group."""
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords)

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node, with ``remove_namespaces()`` already called on it. A
                selector passed in by the caller is modified in place.
        """
        is_raw_string = isinstance(jats_record, six.string_types)
        root = get_node(jats_record) if is_raw_string else jats_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)

        return self.builder.make_author(author_name, raw_affiliations=affiliations, emails=emails)

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()

    def get_reference(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Returns:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                citation_node.xpath(xpath)
                if value:
                    field_handler(value)

            remainder = remove_tags(
                    citation_node,
                    strip='self::person-group'
                          '|self::pub-id'
                          '|self::article-title'
                          '|self::volume'
                          '|self::issue'
                          '|self::year'
                          '|self::label'
                          '|self::publisher-name'
                          '|self::source[../@publication-type!="proc"]'
                          '|self::object-id'
                          '|self::page-range'
                          '|self::issn'
                ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)
Esempio n. 2
0
def hepcrawl_to_hep(crawler_record):
    """Convert a hepcrawl formatted record into a hep formatted record.

    Args:
        crawler_record(dict): dictionary representing the hepcrawl formatted
            record. It must contain an ``acquisition_source`` with the
            ``source``, ``method``, ``datetime`` and ``submission_number``
            keys.

    Returns:
        dict: The hep formatted record.

    Raises:
        KeyError: if ``acquisition_source`` or one of its required keys is
            missing, or if a document lacks its ``key`` or ``url``.
    """

    def _filter_affiliation(affiliations):
        # Keep only the non-empty affiliation values.
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder(
        source=crawler_record['acquisition_source']['source']
    )

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            full_name=author['full_name'],
            # An author without affiliations is valid input, not an error.
            raw_affiliations=_filter_affiliation(author.get('affiliations', [])),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            subtitle=title.get('subtitle'),
            source=title.get('source')
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source')
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories')
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(
            doi=doi.get('value'),
            material=doi.get('material'),
        )

    for private_note in crawler_record.get('private_notes', []):
        builder.add_private_note(
            private_notes=private_note
        )

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source')
        )

    for license_info in crawler_record.get('license', []):
        builder.add_license(
            url=license_info.get('url'),
            license=license_info.get('license'),
            material=license_info.get('material'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(
            collaboration=collaboration.get('value')
        )

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(
            imprint_date=imprint.get('date')
        )

    for copyright_info in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright_info.get('holder'),
            material=copyright_info.get('material'),
            statement=copyright_info.get('statement')
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date')
    )

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method=acquisition_source['method'],
        date=acquisition_source['datetime'],
        source=acquisition_source['source'],
        submission_number=acquisition_source['submission_number'],
    )

    # ``page_nr`` is optional and may be malformed: the page count is then
    # simply left out.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0])
        )
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
        'manual',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()

        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Records with no explicit document type are treated as articles.
    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication info is used; a missing or empty list is
    # treated as "no publication info" instead of raising IndexError.
    _pub_info = (crawler_record.get('publication_info') or [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
        material=_pub_info.get('pubinfo_material'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source')
        )

    for url in crawler_record.get('urls', []):
        builder.add_url(url=url.get('value'))

    for document in crawler_record.get('documents', []):
        builder.add_document(
            description=document.get('description'),
            fulltext=document.get('fulltext'),
            hidden=document.get('hidden'),
            key=document['key'],
            material=document.get('material'),
            original_url=document.get('original_url'),
            url=document['url'],
        )

    return builder.record
Esempio n. 3
0
def crawler2hep(crawler_record):
    """Convert a crawler record into a validated hep record.

    The record is built with ``'hepcrawl'`` as source and as acquisition
    method, and is validated by the builder before being returned.

    Args:
        crawler_record(dict): dictionary representing the crawler record.

    Returns:
        dict: the record held by the builder.
    """
    def _filter_affiliation(affiliations):
        # Keep only the non-empty affiliation values.
        return [
            affiliation.get('value') for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder('hepcrawl')

    for author in crawler_record.get('authors', []):
        builder.add_author(
            builder.make_author(
                author['full_name'],
                # An author without affiliations is valid input, not an error.
                affiliations=_filter_affiliation(author.get('affiliations', [])),
            ))

    for title in crawler_record.get('titles', []):
        builder.add_title(title=title.get('title'), source=title.get('source'))

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(abstract=abstract.get('value'),
                             source=abstract.get('source'))

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'))

    for doi in crawler_record.get('dois', []):
        builder.add_doi(doi=doi.get('value'))

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(public_note=public_note.get('value'),
                                source=public_note.get('source'))

    for license_info in crawler_record.get('license', []):
        builder.add_license(url=license_info.get('url'),
                            license=license_info.get('license'))

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(collaboration=collaboration.get('value'))

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(imprint_date=imprint.get('date'))

    for copyright_info in crawler_record.get('copyright', []):
        builder.add_copyright(holder=copyright_info.get('holder'),
                              material=copyright_info.get('material'),
                              statement=copyright_info.get('statement'))

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date'))

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method='hepcrawl',
        date=acquisition_source.get('date'),
        source=acquisition_source.get('source'),
        submission_number=acquisition_source.get('submission_number'))

    # ``page_nr`` is optional and may be malformed: the page count is then
    # simply left out.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0]))
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
    ]

    special_collections = [
        'cdf-internal-note',
        'cdf-note',
        'cds',
        'd0-internal-note',
        'd0-preliminary-note',
        'h1-internal-note',
        'h1-preliminary-note',
        'halhidden',
        'hephidden',
        'hermes-internal-note',
        'larsoft-internal-note',
        'larsoft-note',
        'zeus-internal-note',
        'zeus-preliminary-note',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()

        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection in special_collections:
            builder.add_special_collection(collection.upper())
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Records with no explicit document type are treated as articles.
    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication info is used; a missing or empty list is
    # treated as "no publication info" instead of raising IndexError.
    _pub_info = (crawler_record.get('publication_info') or [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(report_number=report_number.get('value'),
                                  source=report_number.get('source'))

    builder.validate_record()

    return builder.record
Esempio n. 4
0
class ArxivParser(object):
    """Parser for the arXiv format.

    It can be used directly by invoking the :func:`ArxivParser.parse` method, or be
    subclassed to customize its behavior.

    Args:
        arxiv_record (Union[str, scrapy.selector.Selector]): the record in arXiv format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the arXiv metadata.
    """
    # LaTeX-to-text converter shared by all instances; it is built once, when
    # the class body is executed, and used by ``latex_to_unicode``.
    # NOTE(review): the keyword arguments are passed through to
    # ``LatexNodes2Text`` unchanged; their exact effect is defined by that
    # library and is not visible here.
    _l2t = LatexNodes2Text(
        latex_context=get_arxiv_latex_context_db(),
        math_mode="verbatim",
        strict_latex_spaces="based-on-source",
        keep_comments=True,
        keep_braced_groups=True,
        keep_braced_groups_minlen=2,
    )

    def __init__(self, arxiv_record, source=None):
        """Set up the root selector and the record builder.

        Args:
            arxiv_record (Union[str, scrapy.selector.Selector]): the record
                in arXiv format to parse.
            source (Optional[str]): source handed to the builder; a falsy
                value is replaced by ``'arXiv'``.
        """
        self.root = self.get_root_node(arxiv_record)
        self.builder = LiteratureBuilder(source or 'arXiv')

    def parse(self):
        """Extract an arXiv record into an Inspire HEP record.

        Every extracted piece of metadata is pushed into ``self.builder``,
        in a fixed order.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        builder = self.builder

        builder.add_abstract(abstract=self.abstract, source=self.source)
        builder.add_title(title=self.title, source=self.source)
        for license_info in self.licenses:
            builder.add_license(**license_info)
        for author in self.authors:
            builder.add_author(author)
        builder.add_number_of_pages(self.number_of_pages)
        builder.add_publication_info(**self.publication_info)
        for collaboration in self.collaborations:
            builder.add_collaboration(collaboration)
        for doi in self.dois:
            builder.add_doi(**doi)
        builder.add_preprint_date(self.preprint_date)
        if self.public_note:
            builder.add_public_note(self.public_note, self.source)
        for report_number in self.report_numbers:
            builder.add_report_number(report_number, self.source)
        builder.add_arxiv_eprint(self.arxiv_eprint, self.arxiv_categories)
        builder.add_private_note(self.private_note)
        builder.add_document_type(self.document_type)

        # The INSPIRE categories are derived from the arXiv ones.
        inspire_categories = dedupe_list(
            [classify_field(category) for category in self.arxiv_categories]
        )
        builder.add_inspire_categories(inspire_categories, 'arxiv')

        return builder.record

    def _get_authors_and_collaborations(self, node):
        """Parse authors, affiliations and collaborations from the record node.

        Heuristics are used to detect collaborations. In case those are not
        reliable, a warning is returned for manual checking.

        Args:
            node (Selector): a selector on a record

        Returns:
            tuple: a tuple of (authors, collaborations, warning). ``authors``
                is a list of dicts with the keys ``full_name``, ``surname``,
                ``given_names`` and ``affiliations``; ``collaborations`` is a
                list of collaboration names; ``warning`` is a message for
                the cataloger, or ``''`` when there is nothing to report.
        """
        # 'for the' is deliberately kept out of the general phrases: it is
        # only looked for in names, never in affiliations.
        collab_phrases = [
            'consortium',
            ' collab ',
            'collaboration',
            ' team',
            'group',
            ' on behalf of ',
            ' representing ',
        ]
        inst_phrases = ['institute', 'university', 'department', 'center']

        # Parse every author node exactly once.
        parsed_authors = [
            self._get_author_names_and_affiliations(author)
            for author in node.xpath('.//authors//author')
        ]
        # Pair each author with its successor; the last author is paired
        # with a placeholder so a trailing colon still gives a readable
        # warning. With no authors at all the loop below simply does not run
        # (the previous look-ahead generator raised ``StopIteration``).
        end_of_list = ('end of author-list', '', None)
        successors = parsed_authors[1:] + [end_of_list]

        authors = []
        collaborations = []
        warning_tags = []
        some_affiliation_contains_collaboration = False

        for current, successor in zip(parsed_authors, successors):
            forenames, keyname, affiliations = current
            next_forenames, next_keyname = successor[0], successor[1]

            name_string = " %s %s " % (forenames, keyname)
            lowered_name = name_string.lower()

            # collaborations in affiliation field? Cautious with 'for the' in
            # Inst names
            affiliations_without_collaborations = []
            for aff in affiliations:
                lowered_aff = aff.lower()
                affiliation_contains_collaboration = (
                    any(phrase in lowered_aff for phrase in collab_phrases)
                    and not any(
                        phrase in lowered_aff for phrase in inst_phrases)
                )
                if not affiliation_contains_collaboration:
                    affiliations_without_collaborations.append(aff)
                    continue
                some_affiliation_contains_collaboration = True
                coll = coll_cleanforthe(aff)[0]
                if coll and coll not in collaborations:
                    collaborations.append(coll)

            # Check if name is a collaboration, else append to authors
            collaboration_in_name = ' for the ' in lowered_name or any(
                phrase in lowered_name for phrase in collab_phrases)
            if collaboration_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'full_name': surname + ', ' + given_names,
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaborations:
                    collaborations.append(coll)
            elif name_string.strip() == ':':
                # DANGERZONE : this might not be correct - add a warning for
                # the cataloger
                warning_tags.append(' %s %s ' % (next_forenames, next_keyname))
                if not some_affiliation_contains_collaboration:
                    # everything up to now seems to be collaboration info
                    for author_info in authors:
                        candidate = " %s %s " % (
                            author_info['given_names'], author_info['surname'])
                        coll = coll_cleanforthe(candidate)[0]
                        if coll and coll not in collaborations:
                            collaborations.append(coll)
                    authors = []
            else:
                authors.append({
                    'full_name': keyname + ', ' + forenames,
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': affiliations_without_collaborations,
                })

        if warning_tags:
            warning = 'WARNING: Colon in authors before %s: Check author list for collaboration names!' % ', '.join(
                warning_tags)
        else:
            warning = ''
        return authors, collaborations, warning

    @staticmethod
    def _get_author_names_and_affiliations(author_node):
        """Read the name parts and the affiliations of one author node.

        Args:
            author_node (Selector): a selector on a single ``<author>``.

        Returns:
            tuple: ``(forenames, keyname, affiliations)``; the two names are
                the space-joined text nodes, ``affiliations`` is the list of
                affiliation text nodes.
        """
        def joined_text(query):
            return u' '.join(author_node.xpath(query).extract())

        return (
            joined_text('.//forenames//text()'),
            joined_text('.//keyname//text()'),
            author_node.xpath('.//affiliation//text()').extract(),
        )

    @property
    def preprint_date(self):
        """Text of the first ``<created>`` node, or ``None`` if absent."""
        return self.root.xpath('.//created/text()').extract_first()

    @property
    def abstract(self):
        """Abstract with whitespace normalized and LaTeX converted to text."""
        raw_abstract = self.root.xpath('.//abstract/text()').extract_first()
        return self.latex_to_unicode(self.fix_long_text(raw_abstract))

    @property
    def authors(self):
        """Authors of the record, as built by ``self.builder.make_author``."""
        author_dicts, _, _ = self.authors_and_collaborations

        parsed_authors = []
        for author in author_dicts:
            parsed_authors.append(
                self.builder.make_author(
                    full_name=author["full_name"],
                    raw_affiliations=author["affiliations"],
                )
            )
        return parsed_authors

    @property
    def collaborations(self):
        """Collaboration names detected in the author list."""
        _authors, collaboration_names, _warning = self.authors_and_collaborations
        return collaboration_names

    @property
    def dois(self):
        """DOIs of the record, one dict per DOI.

        Each ``<doi>`` text is split with ``RE_DOIS`` and every resulting
        piece is reported with the material ``'publication'``.
        """
        dois = []
        for raw_doi in self.root.xpath('.//doi/text()').extract():
            for value in re.split(RE_DOIS, raw_doi):
                dois.append({'doi': value, 'material': 'publication'})
        return dois

    @property
    def licenses(self):
        """Licenses of the record, as keyword arguments for ``add_license``."""
        license_dicts = []
        for url in self.root.xpath('.//license/text()').extract():
            license_dicts.append({'url': url, 'material': self.material})
        return license_dicts

    @property
    def material(self):
        """Material attached to the licenses; always ``'preprint'`` here."""
        return 'preprint'

    @property
    def number_of_pages(self):
        """Page count found in the comments via ``RE_PAGES``, else ``None``."""
        comment_texts = self.root.xpath('.//comments/text()').extract()
        match = RE_PAGES.search('; '.join(comment_texts))
        if not match:
            return None
        return maybe_int(match.group(1))

    @property
    def publication_info(self):
        """Keyword arguments for ``builder.add_publication_info``."""
        return dict(
            material='publication',
            pubinfo_freetext=self.pubinfo_freetext,
        )

    @property
    def pubinfo_freetext(self):
        """Text of the first ``<journal-ref>`` node, or ``None`` if absent."""
        return self.root.xpath('.//journal-ref/text()').extract_first()

    @property
    def title(self):
        """Title with whitespace normalized and LaTeX converted to text."""
        raw_title = self.root.xpath('.//title/text()').extract_first()
        return self.latex_to_unicode(self.fix_long_text(raw_title))

    @staticmethod
    def fix_long_text(text):
        """Collapse every run of whitespace in ``text`` into a single space.

        Args:
            text (Optional[str]): the text to normalize; ``None`` is what
                ``extract_first()`` gives when the node is missing.

        Returns:
            str: the normalized text without leading/trailing whitespace,
                or ``''`` when ``text`` is ``None``.
        """
        if text is None:
            # A missing node must not crash the whole record with a TypeError.
            return ''
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def get_root_node(arxiv_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            arxiv_record(Union[str, scrapy.selector.Selector]): the record in arXiv format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node, with its namespaces removed.
        """
        is_raw_string = isinstance(arxiv_record, six.string_types)
        root = get_node(arxiv_record) if is_raw_string else arxiv_record
        root.remove_namespaces()
        return root

    @property
    def public_note(self):
        """All comments joined with ``'; '``, with LaTeX converted to text."""
        comment_texts = self.root.xpath('.//comments/text()').extract()
        return self.latex_to_unicode('; '.join(comment_texts))

    @property
    def private_note(self):
        """Warning produced while parsing the author list (may be empty)."""
        _authors, _collaborations, author_warning = self.authors_and_collaborations
        return author_warning

    @property
    def report_numbers(self):
        """Report numbers; each ``<report-no>`` text is split on ``', '``."""
        raw_values = self.root.xpath('.//report-no/text()').extract()
        return [
            number
            for raw_value in raw_values
            for number in raw_value.split(', ')
        ]

    @property
    def arxiv_eprint(self):
        """Text of the first ``<id>`` node, or ``None`` if absent."""
        return self.root.xpath('.//id/text()').extract_first()

    @property
    def arxiv_categories(self):
        """Normalized, de-duplicated arXiv categories of the record.

        Returns:
            list: the whitespace-separated categories of the first
                ``<categories>`` node, each passed through
                ``normalize_arxiv_category``; empty when the node is missing.
        """
        # The default must be an empty *string*: the previous ``'[]'`` was
        # split into the single bogus category ``'[]'``.
        raw_categories = self.root.xpath('.//categories/text()').extract_first(
            default='')
        normalized_categories = [
            normalize_arxiv_category(arxiv_cat)
            for arxiv_cat in raw_categories.split()
        ]

        return dedupe_list(normalized_categories)

    @property
    def document_type(self):
        """Document type guessed from the comments.

        ``RE_THESIS`` is tried first, then ``RE_CONFERENCE``; anything else
        is an ``'article'``.
        """
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        if RE_THESIS.search(comments):
            return 'thesis'
        if RE_CONFERENCE.search(comments):
            return 'conference paper'
        return 'article'

    @property
    def source(self):
        """Source attached to the extracted metadata; always ``'arXiv'``."""
        return 'arXiv'

    @property
    def authors_and_collaborations(self):
        """Cached result of ``_get_authors_and_collaborations`` on the root.

        The author list is parsed on first access only; the result is kept
        on the instance as ``_authors_and_collaborations``.
        """
        try:
            return self._authors_and_collaborations
        except AttributeError:
            parsed = self._get_authors_and_collaborations(self.root)
            self._authors_and_collaborations = parsed
            return parsed

    @classmethod
    def latex_to_unicode(cls, latex_string):
        """Convert LaTeX markup to plain text, best effort.

        Double spaces in the converted text are replaced by single ones. If
        the conversion fails for any reason, the input is returned unchanged.
        """
        try:
            converted = cls._l2t.latex_to_text(latex_string)
            return converted.replace("  ", " ")
        except Exception:
            return latex_string
Esempio n. 5
0
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method, or be
    subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in JATS format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the JATS metadata.
    """
    def __init__(self, jats_record, source=None):
        """Set up the root selector and the record builder.

        Args:
            jats_record (Union[str, scrapy.selector.Selector]): the record in
                JATS format to parse.
            source (Optional[str]): source handed to the builder; a falsy
                value is replaced by the publisher found in the record.
        """
        self.root = self.get_root_node(jats_record)
        self.builder = LiteratureBuilder(source or self.publisher)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Every extracted piece of metadata is pushed into ``self.builder``,
        in a fixed order.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        builder = self.builder

        builder.add_abstract(self.abstract)
        builder.add_title(self.title, subtitle=self.subtitle)
        builder.add_copyright(**self.copyright)
        builder.add_document_type(self.document_type)
        builder.add_license(**self.license)
        for author in self.authors:
            builder.add_author(author)
        builder.add_number_of_pages(self.number_of_pages)
        builder.add_publication_info(**self.publication_info)
        for collaboration in self.collaborations:
            builder.add_collaboration(collaboration)
        for doi in self.dois:
            builder.add_doi(**doi)
        for keyword in self.keywords:
            builder.add_keyword(**keyword)
        builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            builder.add_reference(reference)

        return builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Keyword arguments passed to ``remove_tags`` when cleaning the abstract
    # (see the ``abstract`` property). Kept as a class attribute, so a
    # subclass can replace it.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        abstract_nodes = self.root.xpath('./front//abstract[1]')

        if not abstract_nodes:
            return

        abstract = remove_tags(abstract_nodes[0], **self.remove_tags_config_abstract).strip()
        return abstract

    @property
    def article_type(self):
        article_type = self.root.xpath('./@article-type').extract_first()

        return article_type

    @property
    def artid(self):
        artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first()

        return artid

    @property
    def authors(self):
        author_nodes = self.root.xpath('./front//contrib[@contrib-type="author"]')
        authors = [self.get_author(author) for author in author_nodes]

        return authors

    @property
    def collaborations(self):
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        collaborations = set(
            collab.xpath('string(.)').extract_first() for collab in collab_nodes
        )

        return collaborations

    @property
    def copyright(self):
        """Keyword arguments for ``builder.add_copyright``."""
        return dict(
            holder=self.copyright_holder,
            material=self.material,
            statement=self.copyright_statement,
            year=self.copyright_year,
        )

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first()

        return copyright_holder

    @property
    def copyright_statement(self):
        copyright_statement = self.root.xpath('./front//copyright-statement/text()').extract_first()

        return copyright_statement

    @property
    def copyright_year(self):
        """First ``<copyright-year>``, passed through ``maybe_int``."""
        raw_year = self.root.xpath('./front//copyright-year/text()').extract_first()
        return maybe_int(raw_year)

    @property
    def dois(self):
        doi_values = self.root.xpath('./front/article-meta//article-id[@pub-id-type="doi"]/text()').extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        if self.material != 'publication':
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()

        return bool(conference_node)

    @property
    def journal_title(self):
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()

        return journal_title

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first()

        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first()

        return journal_volume

    @property
    def keywords(self):
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(self.get_keywords(group) for group in keyword_groups)

        return keywords

    @property
    def license(self):
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }

        return license

    @property
    def license_statement(self):
        license_statement = self.root.xpath('string(./front/article-meta//license)').extract_first().strip()

        return license_statement

    @property
    def license_url(self):
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()

        return license_url

    @property
    def material(self):
        if self.article_type.startswith('correc'):
            material = 'erratum'
        elif self.article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = self.article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        """``count`` attribute of ``<page-count>``, through ``maybe_int``."""
        raw_count = self.root.xpath('./front/article-meta//page-count/@count').extract_first()
        return maybe_int(raw_count)

    @property
    def page_start(self):
        page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first()

        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath('./front/article-meta/lpage/text()').extract_first()

        return page_end

    @property
    def publication_date(self):
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )

        return publication_date

    @property
    def publication_info(self):
        """Keyword arguments for ``builder.add_publication_info``."""
        return dict(
            artid=self.artid,
            journal_title=self.journal_title,
            journal_issue=self.journal_issue,
            journal_volume=self.journal_volume,
            material=self.material,
            page_start=self.page_start,
            page_end=self.page_end,
            year=self.year,
        )

    @property
    def publisher(self):
        publisher = self.root.xpath('./front//publisher-name/text()').extract_first()

        return publisher

    @property
    def subtitle(self):
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()

        return subtitle

    @property
    def title(self):
        title = self.root.xpath('string(./front//article-title)').extract_first()

        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there is
                no match.
        """
        affiliation_node = self.root.xpath('//aff[@id=$id_]', id_=id_)[0]
        affiliation = remove_tags(
            affiliation_node,
            strip='self::label | self::email'
        ).strip()

        return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            ' and not(starts-with(@publication-format, "online")'
        )
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations."""
        raw_referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = set()
        for raw_referred_id in raw_referred_ids:
            referred_ids.update(set(raw_referred_id.split(' ')))

        affiliations = [
            self.get_affiliation(rid) for rid in referred_ids
            if self.get_affiliation(rid)
        ]

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        emails = author_node.xpath('.//email/text()').extract()
        referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        for referred_id in referred_ids:
            emails.extend(self.get_emails_from_refs(referred_id))

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name."""
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # the author name is unstructured
            author_name = author_node.xpath('string(./string-name)').extract_first()
        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        author_name = ', '.join(el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Three candidates are built: from the ``iso-8601-date`` attribute,
        from the ``<year>``/``<month>``/``<day>`` children, and from the
        ``<string-date>`` text. They are handed to ``get_first`` in that
        order.

        Returns:
            PartialDate: the parsed date.
        """
        def first_value(query):
            return date_node.xpath(query).extract_first()

        iso_date = None
        iso_string = first_value('./@iso-8601-date')
        if iso_string:
            iso_date = PartialDate.loads(iso_string)

        year = first_value('string(./year)')
        month = first_value('string(./month)')
        day = first_value('string(./day)')
        date_from_parts = None
        if year:
            date_from_parts = PartialDate.from_parts(year, month, day)

        string_date = first_value('string(./string-date)')
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            # Unparsable free text just means this candidate is unavailable.
            parsed_date = None

        return get_first([iso_date, date_from_parts, parsed_date])

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group."""
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords)

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node, with its namespaces removed.
        """
        is_raw_string = isinstance(jats_record, six.string_types)
        root = get_node(jats_record) if is_raw_string else jats_record
        root.remove_namespaces()
        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)

        return self.builder.make_author(author_name, raw_affiliations=affiliations, emails=emails)

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()


    def get_reference(self, ref_node):
        """Extract the references contained in one ``<ref>`` node.

        One reference is produced per ``<mixed-citation>`` child; a
        ``<ref>`` without any yields nothing.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: a parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            # The raw reference is the whole ``<ref>``, not just the citation.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # (XPath relative to the citation, builder method fed with the
            # first match of that XPath)
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever text is left once the structured elements are stripped
            # is kept as miscellaneous information.
            remainder = remove_tags(
                    citation_node,
                    strip='self::person-group'
                          '|self::pub-id'
                          '|self::article-title'
                          '|self::volume'
                          '|self::issue'
                          '|self::year'
                          '|self::label'
                          '|self::publisher-name'
                          '|self::source[../@publication-type!="proc"]'
                          '|self::object-id'
                          '|self::page-range'
                          '|self::issn'
                ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj


    def attach_fulltext_document(self, file_name, url):
        """Add a document to the record, flagged as fulltext and hidden.

        Args:
            file_name: passed to ``builder.add_document`` as first argument.
            url: passed to ``builder.add_document`` as second argument.
        """
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)