Esempio n. 1
0
def test_split_page_artid_unicode_dash():
    """A range written with a Unicode minus sign yields start and end pages, no artid."""
    expected = ('45', '47', None)

    actual = utils.split_page_artid(u'45−47')

    assert actual == expected
Esempio n. 2
0
def test_split_page_artid_long_page_start():
    """``'B1234'`` is returned both as the start page and as the artid, with no end page."""
    expected = ('B1234', None, 'B1234')

    actual = utils.split_page_artid('B1234')

    assert actual == expected
Esempio n. 3
0
def test_split_page_artid_page_start():
    """A lone number is returned both as the start page and as the artid, with no end page."""
    expected = ('451', None, '451')

    actual = utils.split_page_artid('451')

    assert actual == expected
Esempio n. 4
0
def test_split_page_artid_artid():
    """``'CONF546'`` is returned only as an artid, with neither start nor end page."""
    expected = (None, None, 'CONF546')

    actual = utils.split_page_artid('CONF546')

    assert actual == expected
Esempio n. 5
0
def test_split_page_artid_page_range():
    """A hyphen-separated range yields start and end pages, no artid."""
    expected = ('451', '487', None)

    actual = utils.split_page_artid('451-487')

    assert actual == expected
Esempio n. 6
0
def publication_info(self, key, value):
    """Populate the ``publication_info`` key.

    Args:
        self: not used inside this function.
        key: the field key; a key starting with ``7731`` marks the entry
            as hidden.
        value: mapping from subfield codes to their values.

    Returns:
        dict: the publication info entry assembled from the subfields.
    """
    def _subfield(code, default=None):
        # Collapse the subfield stored under ``code`` to a single element.
        return force_single_element(value.get(code, default))

    def _linked_record(code, endpoint):
        # Turn the record id stored under ``code`` into a record reference.
        return get_record_ref(maybe_int(_subfield(code)), endpoint)

    def _cnum():
        return _subfield('w', '').replace('/', '-').upper()

    def _material():
        allowed = load_schema('elements/material')['enum']
        candidate = _subfield('m', '').lower()
        # Only materials listed in the schema enum are kept.
        return candidate if candidate in allowed else None

    def _parent_isbn():
        raw_isbn = _subfield('z', '')
        return normalize_isbn(raw_isbn) if raw_isbn else None

    def _freetext():
        note = _subfield('x', '')
        # Notes flagged with a leading ``#DONE`` are dropped.
        return None if note.startswith('#DONE') else note

    first_page, last_page, article_id = split_page_artid(value.get('c'))

    parent = _linked_record('0', 'literature')
    journal = _linked_record('1', 'journals')
    conference = _linked_record('2', 'conferences')

    entry = {
        'artid': article_id,
        'cnum': _cnum(),
        'conf_acronym': _subfield('q'),
        'conference_record': conference,
        'hidden': key.startswith('7731') or None,
        'journal_issue': _subfield('n'),
        'journal_record': journal,
        'journal_title': _subfield('p'),
        'journal_volume': _subfield('v'),
        'material': _material(),
        'page_end': last_page,
        'page_start': first_page,
        'parent_isbn': _parent_isbn(),
        'parent_record': parent,
        'parent_report_number': _subfield('r'),
        'pubinfo_freetext': _freetext(),
        'year': maybe_int(_subfield('y')),
    }
    return entry
Esempio n. 7
0
def extract_journal_info(obj, eng):
    """Fill in publication info fields parsed out of ``pubinfo_freetext``.

    For every entry of ``publication_info`` that carries a
    ``pubinfo_freetext`` key, ``extract_journal_reference`` is run on that
    text and any truthy title, volume, page and year it reports overwrite the
    corresponding keys of the entry. The whole list is finally passed through
    ``convert_old_publication_info_to_new``.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    entries = obj.data.get('publication_info')
    if not entries:
        return

    copied_keys = (
        ('title', 'journal_title'),
        ('volume', 'journal_volume'),
    )

    for entry in entries:
        try:
            with local_refextract_kbs_path() as kbs_path:
                extracted = extract_journal_reference(
                    entry['pubinfo_freetext'],
                    override_kbs_files=kbs_path,
                )

            if not extracted:
                continue

            for source_key, target_key in copied_keys:
                if extracted.get(source_key):
                    entry[target_key] = extracted[source_key]

            page = extracted.get('page')
            if page:
                first_page, last_page, article_id = split_page_artid(page)
                page_fields = (
                    ('page_start', first_page),
                    ('page_end', last_page),
                    ('artid', article_id),
                )
                for field, found in page_fields:
                    if found:
                        entry[field] = found

            raw_year = extracted.get('year')
            if raw_year:
                parsed_year = maybe_int(raw_year)
                if parsed_year:
                    entry['year'] = parsed_year
        except KeyError:
            # Entries without ``pubinfo_freetext`` (or any other KeyError
            # raised while processing one) are left as they are.
            pass

    obj.data['publication_info'] = convert_old_publication_info_to_new(
        obj.data['publication_info'])
Esempio n. 8
0
def split_page_range_article_id(obj, formdata):
    """Expand ``page_range_article_id`` into start page, end page and artid.

    Args:
        obj: not used inside this function.
        formdata: mapping that may hold a ``page_range_article_id`` entry.

    Returns:
        The same ``formdata`` object; when ``page_range_article_id`` is
        truthy, its ``start_page``, ``end_page`` and ``artid`` keys are set
        from ``split_page_artid``.
    """
    raw_range = formdata.get('page_range_article_id')
    if not raw_range:
        return formdata

    (formdata['start_page'],
     formdata['end_page'],
     formdata['artid']) = split_page_artid(raw_range)
    return formdata
Esempio n. 9
0
def split_page_range_article_id(obj, formdata):
    """Split the submitted page range / article id into separate form fields.

    Args:
        obj: not used inside this function.
        formdata: mapping that may hold a ``page_range_article_id`` entry.

    Returns:
        The same ``formdata`` object. If ``page_range_article_id`` is truthy,
        ``start_page``, ``end_page`` and ``artid`` are written from the result
        of ``split_page_artid``; otherwise nothing is modified.
    """
    submitted = formdata.get('page_range_article_id')

    if submitted:
        pieces = split_page_artid(submitted)
        formdata['start_page'], formdata['end_page'], formdata['artid'] = pieces

    return formdata
Esempio n. 10
0
def extract_journal_info(obj, eng):
    """Extract the journal information from ``pubinfo_freetext``.

    Each ``publication_info`` entry holding a ``pubinfo_freetext`` key is fed
    to ``extract_journal_reference``; truthy title, volume, page and year
    results replace the matching keys of that entry. Afterwards the list is
    converted with ``convert_old_publication_info_to_new``.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    if not obj.data.get('publication_info'):
        return

    def _apply(entry, extracted):
        # Copy every truthy extracted value onto the publication info entry.
        if extracted.get('title'):
            entry['journal_title'] = extracted['title']

        if extracted.get('volume'):
            entry['journal_volume'] = extracted['volume']

        if extracted.get('page'):
            first_page, last_page, article_id = split_page_artid(extracted['page'])
            if first_page:
                entry['page_start'] = first_page
            if last_page:
                entry['page_end'] = last_page
            if article_id:
                entry['artid'] = article_id

        if extracted.get('year'):
            parsed_year = maybe_int(extracted['year'])
            if parsed_year:
                entry['year'] = parsed_year

    for entry in obj.data['publication_info']:
        try:
            with local_refextract_kbs_path() as kbs_path:
                extracted = extract_journal_reference(
                    entry['pubinfo_freetext'],
                    override_kbs_files=kbs_path,
                )

            if extracted:
                _apply(entry, extracted)
        except KeyError:
            # A missing ``pubinfo_freetext`` (or any other KeyError raised
            # while processing the entry) leaves the entry untouched.
            pass

    obj.data['publication_info'] = convert_old_publication_info_to_new(obj.data['publication_info'])
Esempio n. 11
0
def extract_journal_info(obj, eng):
    """Extract journal, volume etc. from any freetext publication info.

    Falsy entries are dropped. For each remaining entry with a truthy
    ``pubinfo_freetext`` (list/tuple values are joined with ``". "``),
    ``extract_journal_reference`` is run and its volume, title, year and page
    results are written onto the entry. Entries left with no truthy value are
    discarded, and ``obj.data["publication_info"]`` is replaced by the kept
    entries.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used inside this function).

    Returns:
        None
    """
    pubnotes = get_value(obj.data, "publication_info")
    if not pubnotes:
        return

    copied_keys = (
        ("volume", "journal_volume"),
        ("title", "journal_title"),
    )

    kept = []
    for pubnote in pubnotes:
        if not pubnote:
            continue

        extracted = None
        freetext = pubnote.get("pubinfo_freetext")
        if freetext:
            if isinstance(freetext, (list, tuple)):
                freetext = ". ".join(freetext)
            # No ``override_kbs_files`` argument is passed to the extractor.
            extracted = extract_journal_reference(freetext)

        if extracted:
            for source_key, target_key in copied_keys:
                if source_key in extracted:
                    pubnote[target_key] = extracted.get(source_key)

            if "year" in extracted:
                parsed_year = maybe_int(extracted.get('year'))
                if parsed_year is not None:
                    pubnote['year'] = parsed_year

            if "page" in extracted:
                first_page, last_page, article_id = split_page_artid(
                    extracted.get("page"))
                if first_page:
                    pubnote["page_start"] = first_page
                if last_page:
                    pubnote["page_end"] = last_page
                if article_id:
                    pubnote["artid"] = article_id

        # Keep only entries that carry at least one truthy value.
        if any(pubnote.values()):
            kept.append(pubnote)

    obj.data["publication_info"] = kept
Esempio n. 12
0
    def get_reference(self, ref_node):
        """Build one reference per ``<mixed-citation>`` found under ``ref_node``.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference (``builder.obj``), as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`, one for each
                ``./mixed-citation`` child of ``ref_node``.
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            # The raw reference is the whole ``<ref>`` node, not just the
            # current citation.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # Pairs of (xpath relative to the citation, builder method that
            # consumes the first match).
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            # Each xpath is evaluated exactly once; the handler only runs
            # when a non-empty first match exists.
            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever text is left after stripping the listed elements is
            # stored as miscellaneous information.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj
Esempio n. 13
0
    def build_literature(self, data):
        """Assemble a literature record from submitted form data.

        Args:
            data: mapping with the submitted fields; ``document_type`` and
                ``title`` are required, every other key is read with
                ``data.get``.

        Returns:
            The ``record`` attribute of the populated ``LiteratureBuilder``.
        """
        field = data.get
        builder = LiteratureBuilder()

        builder.add_document_type(data["document_type"])

        builder.add_arxiv_eprint(field("arxiv_id"), field("arxiv_categories"))
        builder.add_preprint_date(field("preprint_date"))
        builder.add_doi(field("doi"))

        # Links pointing at arxiv.org are not added as URLs.
        for link in (field("pdf_link"), field("additional_link")):
            if link and "arxiv.org" not in link:
                builder.add_url(link)

        builder.add_title(data["title"], source="submitter")

        # English is passed on as ``None`` rather than as an explicit language.
        language = field("language")
        builder.add_language(None if language == "en" else language)

        builder.add_inspire_categories(field("subjects"))

        for author in field("authors", []):
            # TODO: use `affiliation_record`
            affiliations = [author.get("affiliation")]
            builder.add_author(
                builder.make_author(
                    author.get("full_name"),
                    record=author.get("record"),
                    affiliations=affiliations,
                )
            )

        builder.add_collaboration(field("collaboration"))

        builder.add_accelerator_experiment(
            field("experiment"),
            record=field("experiment_record"),
        )
        # TODO: source=submitter?
        builder.add_abstract(field("abstract"), source="submitter")

        for number in field("report_numbers", []):
            builder.add_report_number(number, source="submitter")

        first_page, last_page, article_id = split_page_artid(field("page_range"))

        # Explicit start/end pages take precedence over the parsed range.
        builder.add_publication_info(
            journal_title=field("journal_title"),
            journal_volume=field("volume"),
            journal_issue=field("issue"),
            journal_record=field("journal_record"),
            conference_record=field("conference_record"),
            artid=article_id,
            page_start=field("start_page") or first_page,
            page_end=field("end_page") or last_page,
            year=field("year"),
            parent_record=field("parent_book_record"),
        )

        builder.add_book_series(field("series_title"))

        builder.add_book(
            date=field("publication_date"),
            publisher=field("publisher"),
            place=field("publication_place"),
        )

        builder.add_thesis(
            defense_date=field("defense_date"),
            degree_type=field("degree_type"),
            institution=field("institution"),
            date=field("submission_date"),
        )

        for supervisor in field("supervisors", []):
            builder.add_author(
                builder.make_author(
                    supervisor.get("full_name"),
                    affiliations=[supervisor.get("affiliation")],
                    roles=["supervisor"],
                )
            )

        builder.add_private_note(field("comments"), source="submitter")
        builder.add_private_note(field("proceedings_info"), source="submitter")

        # The free-text conference info is only kept as a note when no
        # conference record is given.
        if field("conference_record") is None:
            builder.add_private_note(field("conference_info"), source="submitter")

        return builder.record
Esempio n. 14
0
    def get_reference(self, ref_node):
        """Build one reference per ``<mixed-citation>`` found under ``ref_node``.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference (``builder.obj``), as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`, one for each
                ``./mixed-citation`` child of ``ref_node``.
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            # The raw reference is the whole ``<ref>`` node, not just the
            # current citation.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # Pairs of (xpath relative to the citation, builder method that
            # consumes the first match).
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            # Each xpath is evaluated exactly once; the handler only runs
            # when a non-empty first match exists.
            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever text is left after stripping the listed elements is
            # stored as miscellaneous information.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj