def test_populate_inspire_document_type_from_refereed():
    """Expect 'peer reviewed' in ``facet_inspire_doc_type`` for a refereed article."""
    properties = load_schema('hep')['properties']
    doc_type_subschema = properties['document_type']
    refereed_subschema = properties['refereed']

    raw_record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'document_type': ['article'],
        'refereed': True,
    }
    record = InspireRecord(raw_record, model=RecordMetadata)
    # Make sure the fixture itself is schema-valid before running the code under test.
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['refereed'], refereed_subschema) is None

    populate_inspire_document_type(record)

    facets = record['facet_inspire_doc_type']
    assert ['article', 'peer reviewed'] == facets
def test_populate_bookautocomplete_from_authors():
    """Expect the author's full name as ``bookautocomplete`` input for a book record."""
    properties = load_schema('hep')['properties']
    authors_subschema = properties['authors']
    doc_type_subschema = properties['document_type']
    self_subschema = properties['self']

    raw_record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'authors': [{'full_name': 'Rafelski, Johann'}],
        'document_type': ['book'],
        'self': {'$ref': 'http://localhost:5000/api/literature/1519486'},
    }
    record = InspireRecord(raw_record, model=RecordMetadata)
    # Fixture sanity checks: each populated field must satisfy its subschema.
    assert validate(record['authors'], authors_subschema) is None
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['self'], self_subschema) is None

    populate_bookautocomplete(record)

    suggestions = record['bookautocomplete']
    assert {'input': ['Rafelski, Johann']} == suggestions
def test_arxiv_derive_inspire_categories():
    """Expect arXiv category 'nucl-th' to yield the 'Theory-Nucl' INSPIRE category."""
    properties = load_schema('hep')['properties']
    eprints_subschema = properties['arxiv_eprints']
    categories_subschema = properties['inspire_categories']

    # Fixture taken from literature/1458300.
    eprint = {'categories': ['nucl-th'], 'value': '1605.03898'}
    payload = {'arxiv_eprints': [eprint]}
    assert validate(payload['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    assert arxiv_derive_inspire_categories(workflow_obj, engine) is None

    derived = workflow_obj.data['inspire_categories']
    # The derived categories must be schema-valid as well as equal to the expectation.
    assert validate(derived, categories_subschema) is None
    assert [{'source': 'arxiv', 'term': 'Theory-Nucl'}] == derived
def test_extract_journal_info_handles_year_an_empty_string():
    """Expect title, volume, page and artid to be extracted without any 'year' key."""
    pubinfo_subschema = load_schema('hep')['properties']['publication_info']

    freetext = 'The Astrophysical Journal, 838:134 (16pp), 2017 April 1'
    payload = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(payload['publication_info'], pubinfo_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    assert extract_journal_info(workflow_obj, engine) is None

    extracted = workflow_obj.data['publication_info']
    assert validate(extracted, pubinfo_subschema) is None
    assert [
        {
            'artid': '134',
            'journal_title': 'Astrophys. J.',
            'journal_volume': '838',
            'page_start': '134',
            'pubinfo_freetext': freetext,
        },
    ] == extracted
def test_extract_journal_info_handles_the_journal_split():
    """Expect 'Phys. Rev. D' to be kept as the journal title, with volume '96'."""
    pubinfo_subschema = load_schema('hep')['properties']['publication_info']

    freetext = 'Phys. Rev. D 96, 076008. 2017'
    payload = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(payload['publication_info'], pubinfo_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    assert extract_journal_info(workflow_obj, engine) is None

    extracted = workflow_obj.data['publication_info']
    assert validate(extracted, pubinfo_subschema) is None
    assert [
        {
            'artid': '076008',
            'journal_title': 'Phys. Rev. D',
            'journal_volume': '96',
            'pubinfo_freetext': freetext,
        },
    ] == extracted
def test_populate_inspire_document_type_from_publication_type():
    """Expect the publication type to be appended to ``facet_inspire_doc_type``."""
    properties = load_schema('hep')['properties']
    doc_type_subschema = properties['document_type']
    pub_type_subschema = properties['publication_type']

    record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'document_type': ['article'],
        'publication_type': ['introductory'],
    }
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['publication_type'], pub_type_subschema) is None

    # The first positional argument is passed as None here; only the record is supplied.
    populate_inspire_document_type(None, record)

    facets = record['facet_inspire_doc_type']
    assert ['article', 'introductory'] == facets
def test_populate_bookautocomplete_does_nothing_if_record_is_not_a_book():
    """Expect no ``bookautocomplete`` key to be added to a record of type 'article'."""
    properties = load_schema('hep')['properties']
    authors_subschema = properties['authors']
    doc_type_subschema = properties['document_type']
    self_subschema = properties['self']

    record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'authors': [{'full_name': 'Mohayai, Tanaz Angelina'}],
        'document_type': ['article'],
        'self': {'$ref': 'http://localhost:5000/api/literature/1520027'},
    }
    assert validate(record['authors'], authors_subschema) is None
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['self'], self_subschema) is None

    populate_bookautocomplete(None, record)

    assert 'bookautocomplete' not in record
def test_set_refereed_and_fix_document_type_sets_refereed_to_false_if_all_journals_are_not_refereed(mock_replace_refs):
    """Expect ``refereed`` to become False when the mocked journals are all non-refereed."""
    journal_refereed_subschema = load_schema('journals')['properties']['refereed']

    resolved_journals = [{'refereed': False}]
    assert validate(resolved_journals[0]['refereed'], journal_refereed_subschema) is None

    # The journals are served through the mock's return value.
    mock_replace_refs.return_value = resolved_journals

    hep_refereed_subschema = load_schema('hep')['properties']['refereed']

    workflow_obj = MockObj({'document_type': ['article']}, {})
    engine = MockEng()

    assert set_refereed_and_fix_document_type(workflow_obj, engine) is None

    refereed = workflow_obj.data['refereed']
    assert validate(refereed, hep_refereed_subschema) is None
    assert False == refereed  # noqa: E712 -- equality (not identity) is what is being checked
def test_set_refereed_and_fix_document_type_replaces_article_with_conference_paper_if_needed(mock_replace_refs):
    """Expect 'article' to become 'conference paper' when the mocked journal is a proceedings."""
    proceedings_subschema = load_schema('journals')['properties']['proceedings']

    resolved_journals = [{'proceedings': True}]
    assert validate(resolved_journals[0]['proceedings'], proceedings_subschema) is None

    # The journals are served through the mock's return value.
    mock_replace_refs.return_value = resolved_journals

    doc_type_subschema = load_schema('hep')['properties']['document_type']

    workflow_obj = MockObj({'document_type': ['article']}, {})
    engine = MockEng()

    assert set_refereed_and_fix_document_type(workflow_obj, engine) is None

    document_type = workflow_obj.data['document_type']
    assert validate(document_type, doc_type_subschema) is None
    assert ['conference paper'] == document_type
def test_match_references_finds_match_when_repeated_record_with_different_scores(
    mocked_inspire_matcher_match,
    isolated_app
):
    """Expect the single JHEP reference to be linked to literature record 1."""
    cited_publication = {
        'artid': '045',
        'journal_title': 'JHEP',
        'journal_volume': '06',
        'page_start': '045',
        'year': 2007
    }
    unmatched = [{'reference': {'publication_info': cited_publication}}]

    references_subschema = load_schema('hep')['properties']['references']

    assert validate(unmatched, references_subschema) is None
    matched = match_references(unmatched)

    assert len(matched) == 1
    assert matched[0]['record']['$ref'] == 'http://localhost:5000/api/literature/1'
    # The enriched references must still satisfy the schema.
    assert validate(matched, references_subschema) is None
def test_match_reference_on_texkey(isolated_app):
    """Expect a reference carrying only a texkey to be linked to the record with that texkey."""
    texkey = 'Giudice:2007fh'
    cited_record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        'texkeys': [texkey],
        'titles': [{'title': 'The Strongly-Interacting Light Higgs'}],
    }

    # Create the cited record so that there is something to match against.
    TestRecordMetadata.create_from_kwargs(
        index_name='records-hep', json=cited_record_json)

    unmatched = {'reference': {'texkey': texkey}}

    references_subschema = load_schema('hep')['properties']['references']

    assert validate([unmatched], references_subschema) is None
    matched = match_reference(unmatched)

    assert matched['record']['$ref'] == 'http://localhost:5000/api/literature/1'
    assert validate([matched], references_subschema) is None
def test_is_arxiv_paper_for_submission():
    """Expect ``is_arxiv_paper`` to be truthy for a 'submitter' record with an arXiv eprint."""
    properties = load_schema('hep')['properties']
    acquisition_subschema = properties['acquisition_source']
    eprints_subschema = properties['arxiv_eprints']

    payload = {
        'acquisition_source': {'method': 'submitter'},
        'arxiv_eprints': [
            {'categories': ['hep-th'], 'value': '0801.4782'},
        ],
    }
    assert validate(payload['acquisition_source'], acquisition_subschema) is None
    assert validate(payload['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    assert is_arxiv_paper(workflow_obj, engine)
def test_is_arxiv_paper_returns_false_if_method_is_not_hepcrawl_or_arxiv():
    """Expect ``is_arxiv_paper`` to be falsy when the acquisition method is 'batchuploader'."""
    properties = load_schema('hep')['properties']
    acquisition_subschema = properties['acquisition_source']
    eprints_subschema = properties['arxiv_eprints']

    payload = {
        'acquisition_source': {'method': 'batchuploader', 'source': 'arxiv'},
        'arxiv_eprints': [
            {'categories': ['hep-th'], 'value': '0801.4782'},
        ],
    }
    assert validate(payload['acquisition_source'], acquisition_subschema) is None
    assert validate(payload['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    assert not is_arxiv_paper(workflow_obj, engine)
def test_get_conference_record(replace_refs):
    """Expect ``get_conference_record`` to return the record set on ``replace_refs.return_value``."""
    properties = load_schema('hep')['properties']
    control_number_subschema = properties['control_number']
    pubinfo_subschema = properties['publication_info']

    resolved_conference = {'control_number': 972464}
    assert validate(resolved_conference['control_number'], control_number_subschema) is None

    record = {
        'publication_info': [
            {'conference_record': {'$ref': 'http://localhost:5000/api/conferences/972464'}},
        ],
    }
    assert validate(record['publication_info'], pubinfo_subschema) is None

    replace_refs.return_value = resolved_conference

    fetched = get_conference_record(record)

    assert 972464 == fetched['control_number']
def test_formdata_to_model_only_chapter(mock_validate_record):
    """Expect chapter form data to populate ``book_series`` and ``publication_info``."""
    properties = load_schema('hep')['properties']
    book_series_subschema = properties['book_series']
    publication_info_subschema = properties['publication_info']

    workflow_obj = MockObj({}, {})
    parent_book_url = 'http://localhost:5000/api/literature/1373790'
    formdata = {
        'end_page': '1200',
        'parent_book': parent_book_url,
        'series_title': 'Astrophysics',
        'start_page': '150',
        'type_of_doc': 'chapter',
    }

    model = formdata_to_model(workflow_obj, formdata)

    book_series = model['book_series']
    assert validate(book_series, book_series_subschema) is None
    assert [{'title': 'Astrophysics'}] == book_series

    publication_info = model['publication_info']
    assert validate(publication_info, publication_info_subschema) is None
    assert [
        {
            'page_end': '1200',
            'page_start': '150',
            'parent_record': {'$ref': parent_book_url},
        },
    ] == publication_info
def test_fix_submission_number():
    """Expect the submission number of a 'hepcrawl' record to be rewritten to '1'."""
    acquisition_subschema = load_schema('hep')['properties']['acquisition_source']

    payload = {
        'acquisition_source': {
            'method': 'hepcrawl',
            'submission_number': '751e374a017311e896d6fa163ec92c6a',
        },
    }
    assert validate(payload['acquisition_source'], acquisition_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    fix_submission_number(workflow_obj, engine)

    # NOTE(review): '1' is presumably the id exposed by MockObj -- confirm against the mock.
    fixed = workflow_obj.data['acquisition_source']
    assert validate(fixed, acquisition_subschema) is None
    assert {'method': 'hepcrawl', 'submission_number': '1'} == fixed
def test_populate_title_suggest_with_all_inputs():
    """Expect ``title_suggest`` to combine the journal title, short title and title variants."""
    properties = load_schema('journals')['properties']
    journal_title_subschema = properties['journal_title']
    short_title_subschema = properties['short_title']
    title_variants_subschema = properties['title_variants']

    full_title = 'The Journal of High Energy Physics (JHEP)'
    record = {
        '$schema': 'http://localhost:5000/schemas/records/journals.json',
        'journal_title': {'title': full_title},
        'short_title': 'JHEP',
        'title_variants': ['JOURNAL OF HIGH ENERGY PHYSICS'],
    }
    assert validate(record['journal_title'], journal_title_subschema) is None
    assert validate(record['short_title'], short_title_subschema) is None
    assert validate(record['title_variants'], title_variants_subschema) is None

    populate_title_suggest(None, record)

    suggestion = record['title_suggest']
    assert {
        'input': [full_title, 'JHEP', 'JOURNAL OF HIGH ENERGY PHYSICS'],
        'output': 'JHEP',
        'payload': {'full_title': full_title},
    } == suggestion
def test_fix_submission_number_does_nothing_if_method_is_not_hepcrawl():
    """Expect ``acquisition_source`` to stay unchanged when the method is 'submitter'.

    The function used to lack the ``test_`` prefix, so under pytest's default
    discovery rules it was never collected and the assertions below never ran.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['acquisition_source']

    data = {
        'acquisition_source': {
            'method': 'submitter',
            'submission_number': '869215',
        },
    }
    extra_data = {}
    # Fixture sanity check before exercising the code under test.
    assert validate(data['acquisition_source'], subschema) is None

    obj = MockObj(data, extra_data)
    eng = MockEng()

    fix_submission_number(obj, eng)

    # Same content as the input: nothing may have been rewritten.
    expected = {
        'method': 'submitter',
        'submission_number': '869215',
    }
    result = obj.data['acquisition_source']

    assert validate(result, subschema) is None
    assert expected == result
def test_assign_uuid_does_not_touch_existing_uuids(mock_uuid4):
    """Expect an author that already has a uuid to keep it after ``assign_uuid``."""
    # A different uuid from the one in the fixture, so an overwrite would be visible.
    mock_uuid4.return_value = UUID('727238f3-8ed6-40b6-97d2-dc3cd1429131')

    authors_subschema = load_schema('hep')['properties']['authors']

    existing_uuid = 'e14955b0-7e57-41a0-90a8-f4c64eb8f4e9'
    record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'authors': [
            {'full_name': 'Ellis, John Richard', 'uuid': existing_uuid},
        ],
    }
    assert validate(record['authors'], authors_subschema) is None

    assign_uuid(None, record)

    authors = record['authors']
    assert validate(authors, authors_subschema) is None
    assert [
        {'full_name': 'Ellis, John Richard', 'uuid': existing_uuid},
    ] == authors
def test_populate_journal_coverage_writes_partial_if_all_coverages_are_partial(mock_replace_refs):
    """Expect ``extra_data['journal_coverage']`` to be 'partial' for a partially covered journal."""
    harvesting_subschema = load_schema('journals')['properties']['_harvesting_info']

    resolved_journals = [{'_harvesting_info': {'coverage': 'partial'}}]
    assert validate(resolved_journals[0]['_harvesting_info'], harvesting_subschema) is None

    # The journals are served through the mock's return value.
    mock_replace_refs.return_value = resolved_journals

    pubinfo_subschema = load_schema('hep')['properties']['publication_info']

    journal_ref = {'$ref': 'http://localhost:/api/journals/1212337'}
    payload = {'publication_info': [{'journal_record': journal_ref}]}
    assert validate(payload['publication_info'], pubinfo_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    assert populate_journal_coverage(workflow_obj, engine) is None

    coverage = workflow_obj.extra_data['journal_coverage']
    assert 'partial' == coverage
# Example #21
# 0
def test_record_with_non_valid_content_is_cleaned_and_created_properly(
        isolated_app):
    """Expect ``InspireRecord.create`` to turn a schema-invalid payload into a valid record."""
    record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'control_number': 1,
        'document_type': ['article'],
        'titles': [{'title': 'foo'}],
        '_collections': ['Literature'],
        # these two fields make the record not valid
        'documents': [],
        'urls': [{'url': ''}],
        # record/1628455/export/xme -- with some modification
    }

    # The raw payload has to be rejected, otherwise the test proves nothing.
    try:
        validate(record_json)
    except ValidationError:
        rejected = True
    else:
        rejected = False
    assert rejected

    created = InspireRecord.create(record_json)
    # Raises if the created record is still invalid.
    validate(created)
def test_set_refereed_and_fix_document_type_handles_journals_that_publish_mixed_content(mock_replace_refs):
    """Expect ``refereed`` to be True when the mocked journal is both proceedings and refereed."""
    journal_properties = load_schema('journals')['properties']
    proceedings_subschema = journal_properties['proceedings']
    journal_refereed_subschema = journal_properties['refereed']

    resolved_journals = [{'proceedings': True, 'refereed': True}]
    assert validate(resolved_journals[0]['proceedings'], proceedings_subschema) is None
    assert validate(resolved_journals[0]['refereed'], journal_refereed_subschema) is None

    # The journals are served through the mock's return value.
    mock_replace_refs.return_value = resolved_journals

    hep_refereed_subschema = load_schema('hep')['properties']['refereed']

    workflow_obj = MockObj({'document_type': ['article']}, {})
    engine = MockEng()

    assert set_refereed_and_fix_document_type(workflow_obj, engine) is None

    refereed = workflow_obj.data['refereed']
    assert validate(refereed, hep_refereed_subschema) is None
    assert True == refereed  # noqa: E712 -- equality (not identity) is what is being checked
def test_extract_journal_info():
    """Expect title, volume, artid and year to be extracted from the free-text pubinfo."""
    pubinfo_subschema = load_schema('hep')['properties']['publication_info']

    freetext = 'J. Math. Phys. 55, 082102 (2014)'
    payload = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(payload['publication_info'], pubinfo_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    assert extract_journal_info(workflow_obj, engine) is None

    extracted = workflow_obj.data['publication_info']
    assert validate(extracted, pubinfo_subschema) is None
    assert [
        {
            'artid': '082102',
            'journal_title': 'J. Math. Phys.',
            'journal_volume': '55',
            'pubinfo_freetext': freetext,
            'year': 2014,
        }
    ] == extracted
def test_populate_bookautocomplete_from_isbns_values():
    """Expect the ISBN value as ``bookautocomplete`` input for a book record."""
    properties = load_schema('hep')['properties']
    doc_type_subschema = properties['document_type']
    self_subschema = properties['self']
    isbns_subschema = properties['isbns']

    raw_record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'document_type': ['book'],
        'isbns': [{'value': '0201021153'}],
        'self': {'$ref': 'http://localhost:5000/api/literature/1519486'},
    }
    record = InspireRecord(raw_record, model=RecordMetadata)
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['isbns'], isbns_subschema) is None
    assert validate(record['self'], self_subschema) is None

    populate_bookautocomplete(record)

    suggestions = record['bookautocomplete']
    assert {'input': ['0201021153']} == suggestions
def test_prepare_keywords_does_nothing_if_no_keywords_were_predicted():
    """Expect existing keywords to be left untouched when ``extra_data`` is empty."""
    keywords_subschema = load_schema('hep')['properties']['keywords']

    payload = {
        'keywords': [
            {'schema': 'INSPIRE', 'value': 'field theory: conformal'},
        ],
    }
    assert validate(payload['keywords'], keywords_subschema) is None

    workflow_obj = MockObj(payload, {})
    engine = MockEng()

    assert prepare_keywords(workflow_obj, engine) is None

    data_after = workflow_obj.data
    assert validate(data_after['keywords'], keywords_subschema) is None
    assert [
        {'schema': 'INSPIRE', 'value': 'field theory: conformal'},
    ] == data_after['keywords']
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Expect the authors from the 'multiple_author_lists' tarball fixture to land in ``obj.data``."""
    properties = load_schema('hep')['properties']
    eprints_subschema = properties['arxiv_eprints']

    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'))

    # Fixture taken from record/1519995.
    payload = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1703.09986'},
        ],
    }
    # ``validate`` raises on failure, so no assert is needed here.
    validate(payload['arxiv_eprints'], eprints_subschema)

    attached_files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })

    workflow_obj = MockObj(payload, {}, files=attached_files)
    engine = MockEng()

    # ``arxiv_author_list()`` returns the callable that is then applied to the object.
    arxiv_author_list()(workflow_obj, engine)

    authors_subschema = properties['authors']
    expected_authors = [
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312131', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432142', 'schema': 'CERN'},
            ],
            'full_name': 'Sirunyan, Albert M',
        },
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312132', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432143', 'schema': 'CERN'},
            ],
            'full_name': 'Weary, Jake',
        },
    ]
    validate(expected_authors, authors_subschema)

    assert workflow_obj.data.get('authors') == expected_authors
def test_fuzzy_match_returns_true_if_something_matched_with_publication_info(mock_match, enable_fuzzy_matcher):
    """Expect the fuzzy match summary to carry the matched record's ``publication_info``."""
    properties = load_schema('hep')['properties']
    publication_info_subschema = properties['publication_info']
    titles_subschema = properties['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [{'title': 'title'}],
        'publication_info': [
            {
                'artid': '054021',
                'journal_issue': '5',
                'journal_title': 'Phys.Rev.D',
                'journal_volume': '94',
                'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
                'year': 2016
            },
        ],
    }

    assert validate(matched_record['titles'], titles_subschema) is None
    assert validate(matched_record['publication_info'], publication_info_subschema) is None

    # The mocked matcher yields a single hit wrapping the record above.
    mock_match.return_value = iter([{'_source': matched_record}])

    workflow_obj = MockObj({}, {})
    engine = MockEng()

    assert fuzzy_match(workflow_obj, engine)
    assert 'matches' in workflow_obj.extra_data

    fuzzy_matches = get_value(workflow_obj.extra_data, 'matches.fuzzy')

    # Written out again (not reused from ``matched_record``) so the comparison stays independent.
    assert [
        {
            'control_number': 1472986,
            'title': 'title',
            'publication_info': [
                {
                    'artid': '054021',
                    'journal_issue': '5',
                    'journal_title': 'Phys.Rev.D',
                    'journal_volume': '94',
                    'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
                    'year': 2016
                },
            ],
        },
    ] == fuzzy_matches
def test_fuzzy_match_returns_true_if_something_matched_with_more_than_1_public_notes(mock_match, enable_fuzzy_matcher):
    """Expect the fuzzy match summary to list every public note, reduced to its ``value``."""
    properties = load_schema('hep')['properties']
    public_notes_subschema = properties['public_notes']
    titles_subschema = properties['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [{'title': 'title'}],
        'public_notes': [
            {'source': 'arXiv', 'value': '4 pages, 4 figures'},
            {'source': 'arXiv', 'value': 'Some other public note'},
        ],
    }

    assert validate(matched_record['titles'], titles_subschema) is None
    assert validate(matched_record['public_notes'], public_notes_subschema) is None

    # The mocked matcher yields a single hit wrapping the record above.
    mock_match.return_value = iter([{'_source': matched_record}])

    workflow_obj = MockObj({}, {})
    engine = MockEng()

    assert fuzzy_match(workflow_obj, engine)
    assert 'matches' in workflow_obj.extra_data

    fuzzy_matches = get_value(workflow_obj.extra_data, 'matches.fuzzy')

    assert [
        {
            'control_number': 1472986,
            'title': 'title',
            'public_notes': [
                {'value': '4 pages, 4 figures'},
                {'value': 'Some other public note'},
            ],
        },
    ] == fuzzy_matches
def test_arxiv_plot_extract_logs_when_images_are_invalid(mock_process_tarball):
    """Expect an error log entry when the mocked tarball processing raises ``DelegateError``."""
    mock_process_tarball.side_effect = DelegateError

    eprints_subschema = load_schema('hep')['properties']['arxiv_eprints']

    # Synthetic data.
    payload = {
        'arxiv_eprints': [
            {'categories': ['physics.ins-det'], 'value': '1612.00624'},
        ],
    }
    attached_files = MockFiles({
        '1612.00624.tar.gz': AttrDict({
            'file': AttrDict({'uri': 'http://export.arxiv.org/e-print/1612.00624'}),
        }),
    })
    assert validate(payload['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(payload, {}, files=attached_files)
    engine = MockEng()

    assert arxiv_plot_extract(workflow_obj, engine) is None

    logged = workflow_obj.log._error.getvalue()
    assert 'Error extracting plots for 1612.00624. Report and skip.' == logged
def test_arxiv_plot_extract_logs_when_tarball_is_invalid(mock_process_tarball):
    """Expect an info log entry when the mocked tarball processing raises ``InvalidTarball``."""
    mock_process_tarball.side_effect = InvalidTarball

    eprints_subschema = load_schema('hep')['properties']['arxiv_eprints']

    # Synthetic data.
    payload = {
        'arxiv_eprints': [
            {'categories': ['physics.ins-det'], 'value': '1612.00626'},
        ],
    }
    attached_files = MockFiles({
        '1612.00626.tar.gz': AttrDict({
            'file': AttrDict({'uri': 'http://export.arxiv.org/e-print/1612.00626'}),
        }),
    })
    assert validate(payload['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(payload, {}, files=attached_files)
    engine = MockEng()

    assert arxiv_plot_extract(workflow_obj, engine) is None

    logged = workflow_obj.log._info.getvalue()
    assert 'Invalid tarball http://export.arxiv.org/e-print/1612.00626 for arxiv_id 1612.00626' == logged
# Example #31
# 0
def test_references_from_999C50_h_m_o_r_y():
    """Round-trip a 999C5 field with subfields 0/h/m/o/r/y through ``hep`` and ``hep2marc``."""
    references_subschema = load_schema('hep')['properties']['references']

    # Fixture taken from record/1478478.
    snippet = (
        '<datafield tag="999" ind1="C" ind2="5">'
        '  <subfield code="0">701721</subfield>'
        '  <subfield code="h">A. Ferrari, P.R. Sala, A. Fasso, and J. Ranft</subfield>'
        '  <subfield code="m">FLUKA: a multi-particle transport code, CERN-10 , INFN/TC_05/11</subfield>'
        '  <subfield code="o">13</subfield>'
        '  <subfield code="r">SLAC-R-773</subfield>'
        '  <subfield code="y">2005</subfield>'
        '</datafield>'
    )

    # MARC -> hep direction.
    hep_record = hep.do(create_record(snippet))

    assert validate(hep_record['references'], references_subschema) is None
    assert [
        {
            'curated_relation': False,
            'record': {'$ref': 'http://localhost:5000/api/literature/701721'},
            'reference': {
                'authors': [
                    {'full_name': 'Ferrari, A.'},
                    {'full_name': 'Sala, P.R.'},
                    {'full_name': 'Fasso, A.'},
                    {'full_name': 'Ranft, J.'},
                ],
                'label': '13',
                'misc': [
                    'FLUKA: a multi-particle transport code, CERN-10 , INFN/TC_05/11',
                ],
                'publication_info': {'year': 2005},
                'report_numbers': ['SLAC-R-773'],
            },
        },
    ] == hep_record['references']

    # hep -> MARC direction.
    marc_record = hep2marc.do(hep_record)

    assert [
        {
            '0': 701721,
            'h': ['Ferrari, A.', 'Sala, P.R.', 'Fasso, A.', 'Ranft, J.'],
            'm': 'FLUKA: a multi-particle transport code, CERN-10 , INFN/TC_05/11',
            'r': ['SLAC-R-773'],
            'o': '13',
            'y': 2005,
            'z': 0,
        }
    ] == marc_record['999C5']
# Example #32
# 0
def test_schemas_validate(schema_name):
    """Validate the example loaded for ``schema_name`` against the schema of the same name."""
    api.validate(schema_name=schema_name, data=load_example(schema_name))
# Example #33
# 0
 def _get_processed_item(item, spider):
     """Run ``item`` through ``pipeline.process_item``, check it against 'hep' and return it."""
     processed = pipeline.process_item(item, spider)
     # Validation comes first, so an invalid record fails here rather than on the assert.
     validate(processed, 'hep')
     assert processed
     return processed
# Example #34
# 0
def xtest_populate_facet_author_name(mocked_get_linked_records_in_field):
    """Expect ``facet_author_name`` entries of the form '<BAI>_<display name>' per author.

    NOTE(review): the ``xtest_`` prefix presumably keeps pytest from collecting
    this function -- confirm whether it is disabled on purpose.
    """
    author_schema_url = 'http://localhost:5000/records/schemas/authors.json'
    linked_author_records = [
        {
            '$schema': author_schema_url,
            'name': {'value': 'Silk, James Brian'},
            '_collections': ['Authors'],
            'ids': [{'schema': 'INSPIRE BAI', 'value': 'James.Brian.1'}],
            'control_number': 111,
        },
        {
            '$schema': author_schema_url,
            'name': {'value': 'Doe, John', 'preferred_name': 'J Doe'},
            '_collections': ['Authors'],
            'ids': [{'schema': 'INSPIRE BAI', 'value': 'John.Doe.1'}],
            'control_number': 222,
        },
    ]

    # The linked author records are served through the mock as an iterator.
    mocked_get_linked_records_in_field.return_value = iter(linked_author_records)

    authors_subschema = load_schema('hep')['properties']['authors']
    raw_record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'authors': [
            {
                'full_name': 'Silk, James Brian',
                'record': {'$ref': 'https://labs.inspirehep.net/api/literature/111'},
            },
            {
                'full_name': 'Doe, John',
                'record': {'$ref': 'https://labs.inspirehep.net/api/literature/222'},
            },
            # No linked record for this author.
            {'full_name': 'Rohan, George'},
        ],
    }
    record = InspireRecord(raw_record, model=RecordMetadata)
    expected_facets = [
        u'James.Brian.1_James Brian Silk',
        u'John.Doe.1_J Doe',
        u'BAI_George Rohan',
    ]
    assert validate(record['authors'], authors_subschema) is None
    populate_facet_author_name(record)
    assert record['facet_author_name'] == expected_facets
# Example #35
# 0
def test_fuzzy_match_returns_true_if_something_matched_with_4_authors(mock_match, enable_fuzzy_matcher):
    """Check what ``fuzzy_match`` stores when the hit has four authors.

    The mocked matcher yields a single hit with four authors.  The summary
    read back from ``matches.fuzzy`` is expected to list only the first
    three of them while ``authors_count`` stays at 4.
    """
    hep_properties = load_schema('hep')['properties']

    hit_source = {
        'control_number': 4328,
        'titles': [{'title': 'title'}],
        'authors': [
            {'full_name': 'Author 1'},
            {'full_name': 'Author, 2'},
            {'full_name': 'Author, 3'},
            {'full_name': 'Author, 4'},
        ],
        'authors_count': 4,
    }
    assert validate(hit_source['titles'], hep_properties['titles']) is None
    assert validate(hit_source['authors'], hep_properties['authors']) is None

    # The matcher is consumed as an iterator of Elasticsearch-style hits.
    mock_match.return_value = iter([{'_source': hit_source}])

    workflow_obj = MockObj({}, {})
    workflow_eng = MockEng()

    assert fuzzy_match(workflow_obj, workflow_eng)
    assert 'matches' in workflow_obj.extra_data

    summary = [
        {
            'control_number': 4328,
            'title': 'title',
            'authors': [
                {'full_name': 'Author 1'},
                {'full_name': 'Author, 2'},
                {'full_name': 'Author, 3'},
            ],
            'authors_count': 4,
        },
    ]
    stored = get_value(workflow_obj.extra_data, 'matches.fuzzy')

    assert summary == stored
Beispiel #36
0
def test_load_author_advisors():
    """Load one advisor through the ``Author`` schema and compare the output.

    The expected output adds ``_collections`` and ``curated_relation`` and
    carries the advisor name as ``'Id, Occaecat Qui Sint In'``; the
    resulting ``advisors`` list must also validate against the ``authors``
    JSON schema.
    """
    submitted_advisor = {
        'degree_type': 'bachelor',
        'ids': [
            {'schema': 'DESY', 'value': 'DESY-55924820881'},
            {'schema': 'SCOPUS', 'value': '7039712595'},
            {'schema': 'SCOPUS', 'value': '8752067273'},
        ],
        'name': 'occaecat qui sint in id',
        'record': {'$ref': 'http://1js40iZ'},
    }
    advisors_schema = load_schema('authors')['properties']['advisors']

    loaded = Author().load({'advisors': [submitted_advisor]}).data

    wanted = {
        '_collections': ['Authors'],
        'advisors': [
            {
                'curated_relation': False,
                'degree_type': 'bachelor',
                'name': 'Id, Occaecat Qui Sint In',
                'ids': [
                    {'schema': 'DESY', 'value': 'DESY-55924820881'},
                    {'schema': 'SCOPUS', 'value': '7039712595'},
                    {'schema': 'SCOPUS', 'value': '8752067273'},
                ],
                'record': {'$ref': 'http://1js40iZ'},
            },
        ],
    }

    assert validate(loaded['advisors'], advisors_schema) is None
    assert wanted == loaded
Beispiel #37
0
def test_report_numbers_and_document_type_from_multiple_088__a():
    """Convert two CDS ``088`` report numbers to HEP MARC, then to HEP JSON.

    Step one runs ``cds2hep_marc`` on the MARCXML snippet and checks the
    ``037__`` and ``980__`` fields it produces.  Step two feeds that result
    to ``hep`` and checks that ``report_numbers`` and ``document_type``
    validate against the ``hep`` schema and match the expected values.
    """
    schema = load_schema('hep')
    subschema_report_numbers = schema['properties']['report_numbers']
    subschema_document_type = schema['properties']['document_type']

    # The snippet must be closed with ``</record>``; a second opening
    # ``<record>`` tag here would make the XML malformed.
    snippet = ('<record>'
               '  <datafield tag="088" ind1=" " ind2=" ">'
               '    <subfield code="a">ATL-PHYS-CONF-2008-015</subfield>'
               '  </datafield>'
               '  <datafield tag="088" ind1=" " ind2=" ">'
               '    <subfield code="a">ATL-COM-PHYS-2008-052</subfield>'
               '  </datafield>'
               '</record>')  # cds.cern.ch/record/2275456

    expected = {
        '037__': [
            {
                '9': 'CDS',
                'a': 'ATL-PHYS-CONF-2008-015',
            },
            {
                '9': 'CDS',
                'a': 'ATL-COM-PHYS-2008-052',
            },
        ],
        '980__': [
            {
                'a': 'NOTE',
            },
            {
                'a': 'HEP',
            },
            {
                'a': 'CORE',
            },
        ],
    }
    result = cds2hep_marc.do(create_record(snippet))

    assert expected['037__'] == result['037__']
    assert expected['980__'] == result['980__']

    expected = {
        'document_type': [
            'note',
        ],
        # NOTE: ``public_notes`` is listed here but no assertion below
        # reads it.
        'public_notes': [
            {
                'source': 'CDS',
                'value': 'Preliminary results',
            },
        ],
        'report_numbers': [
            {
                'source': 'CDS',
                'value': 'ATL-PHYS-CONF-2008-015',
            },
            {
                'source': 'CDS',
                'value': 'ATL-COM-PHYS-2008-052',
            },
        ],
    }
    result = hep.do(create_record_from_dict(result))

    assert validate(result['report_numbers'], subschema_report_numbers) is None
    assert validate(result['document_type'], subschema_document_type) is None
    assert expected['report_numbers'] == result['report_numbers']
    assert expected['document_type'] == result['document_type']
Beispiel #38
0
def validate_subschema(obj):
    """Assert that the first entry of ``obj`` validates against the hep schema.

    Only the first key of ``obj`` is considered.  Its value is validated
    against the property of the same name in the ``hep`` schema; if the
    schema has no such property, ``None`` is passed as the schema.
    """
    hep_properties = load_schema('hep')['properties']
    # ``keys()`` is not indexable on Python 3, hence the list conversion.
    field = list(obj.keys())[0]
    field_schema = hep_properties.get(field)
    assert validate(obj.get(field), field_schema) is None
Beispiel #39
0
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Run the ``arxiv_author_list`` task on a fixture tarball.

    The workflow object carries one arXiv eprint and a single attached
    file whose ``uri`` points at the fixture
    ``1703.09986.multiple_author_lists.tar.gz``.  After the task runs, the
    object's ``authors`` must equal the two expected authors.
    """
    hep_properties = load_schema('hep')['properties']

    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'),
    )

    payload = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {'categories': ['hep-ex'], 'value': '1703.09986'},
        ],
    }  # record/1519995
    validate(payload['arxiv_eprints'], hep_properties['arxiv_eprints'])

    attached_files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })

    workflow_obj = MockObj(payload, {}, files=attached_files)
    workflow_eng = MockEng()

    # ``arxiv_author_list()`` returns the callable that does the work.
    author_list_task = arxiv_author_list()
    author_list_task(workflow_obj, workflow_eng)

    def yerevan_author(full_name, inspire_id, cern_id):
        """Build one expected author affiliated with Yerevan Phys. Inst."""
        return {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': inspire_id, 'schema': 'INSPIRE ID'},
                {'value': cern_id, 'schema': 'CERN'},
            ],
            'full_name': full_name,
        }

    wanted_authors = [
        yerevan_author('Sirunyan, Albert M', 'INSPIRE-00312131', 'CERN-432142'),
        yerevan_author('Weary, Jake', 'INSPIRE-00312132', 'CERN-432143'),
    ]
    validate(wanted_authors, hep_properties['authors'])

    assert workflow_obj.data.get('authors') == wanted_authors
def test_match_reference_on_texkey_has_lower_priority_than_pub_info(isolated_app):
    """A reference with both a texkey and pub info links to the pub-info record.

    Record 1 only shares the texkey with the reference and record 2 only
    shares the publication info; the matched ``$ref`` must point at
    record 2.
    """
    def jhep_100():
        """Return a fresh copy of the publication info used throughout."""
        return {
            'artid': '100',
            'journal_title': 'JHEP',
            'journal_volume': '100',
            'page_start': '100',
            'year': 2020,
        }

    texkey_record = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        'texkeys': ['MyTexKey:2008fh'],
        'titles': [{'title': 'The Strongly-Interacting Light Higgs'}],
    }
    TestRecordMetadata.create_from_kwargs(
        json=texkey_record, index_name='records-hep')

    pub_info_record = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 2,
        'document_type': ['article'],
        'publication_info': [jhep_100()],
        'titles': [{'title': 'The Strongly-Interacting Light Higgs'}],
    }
    TestRecordMetadata.create_from_kwargs(
        json=pub_info_record, index_name='records-hep')

    citation = {
        'reference': {
            'texkey': 'MyTexKey:2008fh',
            'publication_info': jhep_100(),
        },
    }

    references_schema = load_schema('hep')['properties']['references']

    assert validate([citation], references_schema) is None
    matched = match_reference(citation)

    assert matched['record']['$ref'] == 'http://localhost:5000/api/literature/2'
    assert validate([matched], references_schema) is None
def test_match_references_no_match_when_multiple_match_different_from_previous(isolated_app):
    """No link is set when several records match and none was matched before.

    Two records share the 2017 JHEP erratum publication info.  The single
    reference cites that publication info, and the matched reference must
    end up without a ``record`` key.
    """
    def erratum_2017(with_material):
        """Return fresh 2017 pub info, optionally tagged as an erratum."""
        info = {
            'artid': '074',
            'journal_title': 'JHEP',
            'journal_volume': '05',
        }
        if with_material:
            info['material'] = 'erratum'
        info['page_start'] = '074'
        info['year'] = 2017
        return info

    original_record = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        'publication_info': [
            {
                'artid': '159',
                'journal_title': 'JHEP',
                'journal_volume': '03',
                'page_start': '159',
                'year': 2016,
            },
            erratum_2017(with_material=True),
        ],
    }
    erratum_record = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 2,
        'document_type': ['article'],
        'publication_info': [erratum_2017(with_material=True)],
    }

    for record_json in (original_record, erratum_record):
        TestRecordMetadata.create_from_kwargs(
            json=record_json, index_name='records-hep')

    citations = [
        {'reference': {'publication_info': erratum_2017(with_material=False)}},
    ]

    references_schema = load_schema('hep')['properties']['references']

    assert validate(citations, references_schema) is None

    matched = match_references(citations)

    assert get_value(matched[0], 'record') is None
    assert validate(matched, references_schema) is None
def test_add_institution_sorts_by_rank():
    """Positions added in arbitrary rank order come out sorted by rank.

    Ten positions at the same institution are added, one of them without
    a rank.  The resulting ``positions`` list must validate against the
    ``authors`` schema and match the expected order, with the unranked
    position last.
    """
    positions_schema = load_schema('authors')['properties']['positions']

    builder = AuthorBuilder()
    # ``None`` stands for the call that passes no ``rank`` at all.
    insertion_order = [
        'MASTER', 'PHD', 'VISITOR', 'STAFF', 'SENIOR', 'OTHER',
        'UNDERGRADUATE', None, 'POSTDOC', 'JUNIOR',
    ]
    for rank in insertion_order:
        call_kwargs = {'institution': 'Colgate University'}
        if rank is not None:
            call_kwargs['rank'] = rank
        builder.add_institution(**call_kwargs)

    sorted_ranks = [
        'STAFF', 'SENIOR', 'JUNIOR', 'VISITOR', 'POSTDOC', 'PHD', 'MASTER',
        'UNDERGRADUATE', 'OTHER',
    ]
    wanted = [
        {
            "institution": 'Colgate University',
            "rank": rank,
            "curated_relation": False,
            "current": False,
        }
        for rank in sorted_ranks
    ]
    wanted.append({
        "institution": 'Colgate University',
        "curated_relation": False,
        "current": False,
    })

    positions = builder.obj['positions']

    assert validate(positions, positions_schema) is None
    assert wanted == positions
Beispiel #43
0
def test_authors_from_100__a_0_u_m_and_700__a_0_u_m():
    """Convert CDS ``100``/``700`` author fields to HEP MARC, then to HEP JSON.

    Step one runs ``cds2hep_marc`` on the snippet and checks the ``100__``
    and ``700__`` fields.  Step two feeds that result to ``hep`` and checks
    that ``authors`` validates against the ``hep`` schema and matches the
    expected list.

    NOTE: the e-mail addresses are placeholders.  The input and the
    expectations must use the same address for each author, otherwise the
    equality assertions cannot hold.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['authors']

    joram_email = 'christian.joram@example.org'
    pons_email = 'xavier.pons@example.org'

    snippet = (
        '<record>'
        '  <datafield tag="100" ind1=" " ind2=" ">'
        '    <subfield code="a">Joram, Christian</subfield>'
        '    <subfield code="0">AUTHOR|(INSPIRE)INSPIRE-00093928</subfield>'
        '    <subfield code="0">AUTHOR|(SzGeCERN)403463</subfield>'
        '    <subfield code="0">AUTHOR|(CDS)2068232</subfield>'
        '    <subfield code="u">CERN</subfield>'
        '    <subfield code="m">' + joram_email + '</subfield>'
        '  </datafield>'
        '  <datafield tag="700" ind1=" " ind2=" ">'
        '    <subfield code="a">Pons, Xavier</subfield>'
        '    <subfield code="0">AUTHOR|(CDS)2067681</subfield>'
        '    <subfield code="0">AUTHOR|(SzGeCERN)531402</subfield>'
        '    <subfield code="u">CERN</subfield>'
        '    <subfield code="m">' + pons_email + '</subfield>'
        '  </datafield>'
        '</record>')  # record/2295263

    expected = {
        '100__': [
            {
                'a': 'Joram, Christian',
                'i': ['INSPIRE-00093928'],
                'j': ['CCID-403463'],
                'u': 'CERN',
                'm': joram_email,
            },
        ],
        '700__': [
            {
                'a': 'Pons, Xavier',
                'j': ['CCID-531402'],
                'u': 'CERN',
                'm': pons_email,
            },
        ],
    }
    result = cds2hep_marc.do(create_record(snippet))

    assert expected['100__'] == result['100__']
    assert expected['700__'] == result['700__']

    expected = [
        {
            'full_name': 'Joram, Christian',
            'ids': [
                {
                    'schema': 'INSPIRE ID',
                    'value': 'INSPIRE-00093928',
                },
                {
                    'schema': 'CERN',
                    'value': 'CERN-403463',
                },
            ],
            'affiliations': [{
                'value': 'CERN'
            }],
            'emails': [joram_email],
        },
        {
            'full_name': 'Pons, Xavier',
            'ids': [
                {
                    'schema': 'CERN',
                    'value': 'CERN-531402',
                },
            ],
            'affiliations': [{
                'value': 'CERN'
            }],
            'emails': [pons_email],
        },
    ]
    result = hep.do(create_record_from_dict(result))

    assert validate(result['authors'], subschema) is None
    assert expected == result['authors']
Beispiel #44
0
 def validate(self):
     """Check this record for validity, format compliance included.

     The bare ``validate`` name called below is looked up outside the
     class body, so it is not this method calling itself.  Nothing is
     returned; whatever that callable raises propagates to the caller.
     """
     validate(self)
Beispiel #45
0
def test_report_numbers_and_document_type_and_publicate_notes_from_037__a():
    """Convert one CDS report number to HEP MARC, then to HEP JSON.

    The input datafield uses tag ``088``.  The ``cds2hep_marc`` output is
    checked under ``037__``, ``500__`` and ``980__``, and the ``hep``
    output is checked for ``report_numbers``, ``public_notes`` and
    ``document_type``.
    """
    hep_properties = load_schema('hep')['properties']
    checked_fields = ('report_numbers', 'public_notes', 'document_type')
    subschemas = {name: hep_properties[name] for name in checked_fields}

    snippet = (
        '<datafield tag="088" ind1=" " ind2=" ">'
        '   <subfield code="a">CMS-PAS-SMP-15-001</subfield>'
        '</datafield>'
    )  # cds.cern.ch/record/2202807

    wanted_marc = {
        '037__': [
            {'9': 'CDS', 'a': 'CMS-PAS-SMP-15-001'},
        ],
        '500__': [
            {'9': 'CDS', 'a': 'Preliminary results'},
        ],
        '980__': [
            {'a': 'NOTE'},
            {'a': 'HEP'},
            {'a': 'CORE'},
        ],
    }
    marc = cds2hep_marc.do(create_record(snippet))

    for tag in ('037__', '500__', '980__'):
        assert wanted_marc[tag] == marc[tag]

    wanted_record = {
        'document_type': ['note'],
        'public_notes': [
            {'source': 'CDS', 'value': 'Preliminary results'},
        ],
        'report_numbers': [
            {'source': 'CDS', 'value': 'CMS-PAS-SMP-15-001'},
        ],
    }
    record = hep.do(create_record_from_dict(marc))

    # Schema validation of every field first, value comparison afterwards.
    for name in checked_fields:
        assert validate(record[name], subschemas[name]) is None
    for name in checked_fields:
        assert wanted_record[name] == record[name]
Beispiel #46
0
def test_match_references_matches_when_multiple_match_if_same_as_previous(
        inspire_app):
    """Several candidates match, and one of them was already matched earlier.

    The first reference cites the 2016 publication info that only record 1
    has; the second cites the 2017 erratum info that both records have.
    The second reference must be linked to record 1, and the bookkeeping
    keys returned by ``match_references`` are checked as well.
    """
    def article_2016():
        """Return fresh 2016 publication info."""
        return {
            "artid": "159",
            "journal_title": "JHEP",
            "journal_volume": "03",
            "page_start": "159",
            "year": 2016,
        }

    def erratum_2017(with_material):
        """Return fresh 2017 pub info, optionally tagged as an erratum."""
        info = {
            "artid": "074",
            "journal_title": "JHEP",
            "journal_volume": "05",
        }
        if with_material:
            info["material"] = "erratum"
        info["page_start"] = "074"
        info["year"] = 2017
        return info

    original_record = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "publication_info": [
            article_2016(),
            erratum_2017(with_material=True),
        ],
    }
    erratum_record = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "publication_info": [erratum_2017(with_material=True)],
    }

    for record_json in (original_record, erratum_record):
        create_record("lit", data=record_json)

    citations = [
        {"reference": {"publication_info": article_2016()}},
        {"reference": {"publication_info": erratum_2017(with_material=False)}},
    ]

    references_schema = load_schema("hep")["properties"]["references"]

    assert validate(citations, references_schema) is None

    outcome = match_references(citations)
    linked = outcome["matched_references"]

    assert (linked[1]["record"]["$ref"] ==
            "http://localhost:5000/api/literature/1")
    assert validate(linked, references_schema) is None

    assert outcome["any_link_modified"]
    assert outcome["added_recids"] == [1, 1]
    assert outcome["removed_recids"] == []
Beispiel #47
0
def test_populate_experiment_suggest():
    """Check the ``experiment_suggest`` payload built from an experiment.

    Seven fields of the record are validated against the ``experiments``
    schema first.  After ``populate_experiment_suggest`` runs, the
    ``experiment_suggest`` key must hold the expected ``input`` list.
    """
    experiment_properties = load_schema('experiments')['properties']
    # Validation order used below.
    validated_fields = (
        'legacy_name',
        'long_name',
        'name_variants',
        'collaboration',
        'accelerator',
        'institutions',
        'experiment',
    )
    subschemas = {
        name: experiment_properties[name] for name in validated_fields
    }

    experiment = InspireRecord(
        {
            '$schema': 'http://foo/experiments.json',
            'self': {'$ref': 'https://localhost:5000/api/experiments/bar'},
            'legacy_name': 'foo',
            'long_name': 'foobarbaz',
            'name_variants': ['bar', 'baz'],
            'collaboration': {'value': 'D0'},
            'accelerator': {'value': 'LHC'},
            'experiment': {'short_name': 'SHINE', 'value': 'NA61'},
            'institutions': [{'value': 'ICN'}],
        },
        model=RecordMetadata,
    )
    for name in validated_fields:
        assert validate(experiment[name], subschemas[name]) is None

    populate_experiment_suggest(experiment)

    wanted = {
        'input': [
            'LHC',
            'D0',
            'SHINE',
            'NA61',
            'ICN',
            'foo',
            'foobarbaz',
            'bar',
            'baz',
        ],
    }
    suggest = experiment['experiment_suggest']

    assert wanted == suggest
Beispiel #48
0
def test_match_reference_on_texkey_has_lower_priority_than_pub_info(
        inspire_app):
    """A reference with both a texkey and pub info links to the pub-info record.

    Record 1 only shares the texkey with the reference and record 2 only
    shares the publication info; the matched ``$ref`` must point at
    record 2.  ``match_reference_control_numbers`` must return both
    control numbers, compared here without regard to order.
    """
    def jhep_100():
        """Return a fresh copy of the publication info used throughout."""
        return {
            "artid": "100",
            "journal_title": "JHEP",
            "journal_volume": "100",
            "page_start": "100",
            "year": 2020,
        }

    texkey_record = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "texkeys": ["MyTexKey:2008fh"],
        "titles": [{"title": "The Strongly-Interacting Light Higgs"}],
    }
    create_record("lit", texkey_record)

    pub_info_record = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "publication_info": [jhep_100()],
        "titles": [{"title": "The Strongly-Interacting Light Higgs"}],
    }
    create_record("lit", pub_info_record)

    citation = {
        "reference": {
            "texkey": "MyTexKey:2008fh",
            "publication_info": jhep_100(),
        },
    }

    references_schema = load_schema("hep")["properties"]["references"]

    assert validate([citation], references_schema) is None
    matched = match_reference(citation)

    assert matched["record"]["$ref"] == "http://localhost:5000/api/literature/2"
    assert validate([matched], references_schema) is None

    wanted_control_numbers = [2, 1]
    found_control_numbers = match_reference_control_numbers(matched)

    # Same members and same length, in any order.
    assert set(wanted_control_numbers) == set(found_control_numbers)
    assert len(wanted_control_numbers) == len(found_control_numbers)
Beispiel #49
0
def test_ids_from_035__a_9_with_cern_malformed():
    """Malformed CERN ids in ``035`` come out as ``CERN-<digits>``.

    Five malformed variants are converted with ``hepnames`` and must
    validate against the ``authors`` schema; the round trip through
    ``hepnames2marc`` must give the same cleaned values under ``035``.
    """
    ids_schema = load_schema('authors')['properties']['ids']

    malformed_values = [
        'CERN-CERN-645257',  # record/1030771
        'cern-783683',  # record/1408145
        'CERM-724319',  # record/1244430
        'CNER-727986',  # record/1068077
        'CVERN-765559',  # record/1340631
    ]
    cleaned_values = [
        'CERN-645257',
        'CERN-783683',
        'CERN-724319',
        'CERN-727986',
        'CERN-765559',
    ]

    datafield_template = (
        '  <datafield tag="035" ind1=" " ind2=" ">'
        '    <subfield code="9">CERN</subfield>'
        '    <subfield code="a">{}</subfield>'
        '  </datafield>'
    )
    datafields = ''.join(
        datafield_template.format(value) for value in malformed_values)
    snippet = '<record>' + datafields + '</record>'

    wanted_ids = [
        {'schema': 'CERN', 'value': value} for value in cleaned_values
    ]
    author = hepnames.do(create_record(snippet))

    assert validate(author['ids'], ids_schema) is None
    assert wanted_ids == author['ids']

    wanted_marc = [{'9': 'CERN', 'a': value} for value in cleaned_values]
    marc = hepnames2marc.do(author)

    assert wanted_marc == marc['035']
Beispiel #50
0
def test_match_references_no_match_when_multiple_match_different_from_previous(
    inspire_app, ):
    """No link is set when several records match and none was matched before.

    Two records share the 2017 JHEP erratum publication info.  The single
    reference cites that publication info, and the first matched reference
    must end up without a ``record`` key.
    """
    def erratum_2017(with_material):
        """Return fresh 2017 pub info, optionally tagged as an erratum."""
        info = {
            "artid": "074",
            "journal_title": "JHEP",
            "journal_volume": "05",
        }
        if with_material:
            info["material"] = "erratum"
        info["page_start"] = "074"
        info["year"] = 2017
        return info

    original_record = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "159",
                "journal_title": "JHEP",
                "journal_volume": "03",
                "page_start": "159",
                "year": 2016,
            },
            erratum_2017(with_material=True),
        ],
    }
    erratum_record = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "publication_info": [erratum_2017(with_material=True)],
    }

    for record_json in (original_record, erratum_record):
        create_record("lit", data=record_json)

    citations = [
        {"reference": {"publication_info": erratum_2017(with_material=False)}},
    ]

    references_schema = load_schema("hep")["properties"]["references"]

    assert validate(citations, references_schema) is None

    matched = match_references(citations)

    assert get_value(matched[0], "record") is None
    assert validate(matched, references_schema) is None
Beispiel #51
0
def test_schemas_validate_negative(schema_name):
    """An altered example document must be rejected by ``api.validate``.

    The example for ``schema_name`` is loaded, passed through
    ``change_something``, and validation of the outcome is expected to
    raise ``jsonschema.ValidationError``.
    """
    altered_example = change_something(load_example(schema_name))
    with pytest.raises(jsonschema.ValidationError):
        api.validate(data=altered_example, schema_name=schema_name)
Beispiel #52
0
def test_collaborations_from_multiple_710__g_0_and_710__g():
    """Convert four 710 fields (three carrying $0, one without) to hep and back."""
    collaborations_schema = load_schema('hep')['properties']['collaborations']

    marcxml = (
        '<record>'
        '  <datafield tag="710" ind1=" " ind2=" ">'
        '    <subfield code="g">ANTARES</subfield>'
        '    <subfield code="0">1110619</subfield>'
        '  </datafield>'
        '  <datafield tag="710" ind1=" " ind2=" ">'
        '    <subfield code="g">IceCube</subfield>'
        '    <subfield code="0">1108514</subfield>'
        '  </datafield>'
        '  <datafield tag="710" ind1=" " ind2=" ">'
        '    <subfield code="g">LIGO Scientific</subfield>'
        '  </datafield>'
        '  <datafield tag="710" ind1=" " ind2=" ">'
        '    <subfield code="g">Virgo</subfield>'
        '    <subfield code="0">1110601</subfield>'
        '  </datafield>'
        '</record>'
    )  # record/1422032

    # MARC -> hep: the field without $0 yields an entry with no 'record' key.
    expected_collaborations = [
        {
            'record': {'$ref': 'http://localhost:5000/api/experiments/1110619'},
            'value': 'ANTARES',
        },
        {
            'record': {'$ref': 'http://localhost:5000/api/experiments/1108514'},
            'value': 'IceCube',
        },
        {'value': 'LIGO Scientific'},
        {
            'record': {'$ref': 'http://localhost:5000/api/experiments/1110601'},
            'value': 'Virgo',
        },
    ]
    hep_record = hep.do(create_record(marcxml))

    assert validate(hep_record['collaborations'], collaborations_schema) is None
    assert expected_collaborations == hep_record['collaborations']

    # hep -> MARC: only the $g subfield is expected on the way back.
    expected_710 = [
        {'g': name}
        for name in ('ANTARES', 'IceCube', 'LIGO Scientific', 'Virgo')
    ]
    marc_record = hep2marc.do(hep_record)

    assert expected_710 == marc_record['710']
Beispiel #53
0
def test_addresses_from_371__triple_a_b_d_e_g_and_371__triple_a_b_d_e_g_x():
    """Two 371 fields with three $a each become two address entries."""
    addresses_schema = load_schema('institutions')['properties']['addresses']

    marcxml = (
        '<record>'
        '  <datafield tag="371" ind1=" " ind2=" ">'
        '    <subfield code="a">Université Libre de Bruxelles (ULB)</subfield>'
        '    <subfield code="a">Boulevard du Triomphe, 2</subfield>'
        '    <subfield code="a">B-1050 Bruxelles</subfield>'
        '    <subfield code="b">Brussels</subfield>'
        '    <subfield code="d">Belgium</subfield>'
        '    <subfield code="e">1050</subfield>'
        '    <subfield code="g">BE</subfield>'
        '  </datafield>'
        '  <datafield tag="371" ind1=" " ind2=" ">'
        '    <subfield code="a">Vrije Universiteit VUB</subfield>'
        '    <subfield code="a">Pleinlaan 2</subfield>'
        '    <subfield code="a">B-1050 Brussel</subfield>'
        '    <subfield code="b">Brussels</subfield>'
        '    <subfield code="d">Belgium</subfield>'
        '    <subfield code="e">1050</subfield>'
        '    <subfield code="g">BE</subfield>'
        '    <subfield code="x">secondary</subfield>'
        '  </datafield>'
        '</record>'
    )  # record/902696

    # The $x "secondary" marker of the second field has no counterpart below.
    first_address = {
        'cities': ['Brussels'],
        'country_code': 'BE',
        'postal_address': [
            u'Université Libre de Bruxelles (ULB)',
            'Boulevard du Triomphe, 2',
            'B-1050 Bruxelles',
        ],
        'postal_code': '1050',
    }
    second_address = {
        'cities': ['Brussels'],
        'country_code': 'BE',
        'postal_address': [
            'Vrije Universiteit VUB',
            'Pleinlaan 2',
            'B-1050 Brussel',
        ],
        'postal_code': '1050',
    }
    expected_addresses = [first_address, second_address]

    institution = institutions.do(create_record(marcxml))

    assert validate(institution['addresses'], addresses_schema) is None
    assert expected_addresses == institution['addresses']
Beispiel #54
0
def test_figures_order_from_FFT():
    """Check the order of ``figures`` built from several FFT fields.

    The FFT fields appear in the snippet as FIG11, FIG10, FIG12, while the
    expected figures are FIG10, FIG11, FIG12 and their captions lack the
    leading five-digit number found in ``$d`` (presumably that number drives
    the ordering -- confirm against the ``hep`` rules). No ``documents`` key
    is expected in the result.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['figures']

    # Literals containing ``\mathrm`` are raw strings: ``\m`` is not a valid
    # escape sequence, so a plain literal triggers an invalid-escape warning
    # on recent Python 3 while producing exactly the same value.
    snippet = (
        '<record>'
        '  <datafield tag="FFT" ind1=" " ind2=" ">'
        '    <subfield code="a">/opt/cds-invenio/var/data/files/g151/3037400/content.png;1</subfield>'
        r'    <subfield code="d">00010 Co-simulation results, at $50~\mathrm{ms}$...</subfield>'
        '    <subfield code="f">.png</subfield>'
        '    <subfield code="n">FIG11</subfield>'
        '    <subfield code="r"/>'
        '    <subfield code="s">2017-10-04 07:54:54</subfield>'
        '    <subfield code="t">Main</subfield>'
        '    <subfield code="v">1</subfield>'
        '    <subfield code="z"/>'
        '  </datafield>'
        '  <datafield tag="FFT" ind1=" " ind2=" ">'
        '    <subfield code="a">/opt/cds-invenio/var/data/files/g151/3037399/content.png;1</subfield>'
        r'    <subfield code="d">00009 Co-simulation results, at $50~\mathrm{ms}$...</subfield>'
        '    <subfield code="f">.png</subfield>'
        '    <subfield code="n">FIG10</subfield>'
        '    <subfield code="r"/>'
        '    <subfield code="s">2017-10-04 07:54:54</subfield>'
        '    <subfield code="t">Main</subfield>'
        '    <subfield code="v">1</subfield>'
        '    <subfield code="z"/>'
        '  </datafield>'
        '  <datafield tag="FFT" ind1=" " ind2=" ">'
        '    <subfield code="a">/opt/cds-invenio/var/data/files/g151/3037401/content.png;1</subfield>'
        r'    <subfield code="d">00011 Co-simulation results, at $50~\mathrm{ms}$...</subfield>'
        '    <subfield code="f">.png</subfield>'
        '    <subfield code="n">FIG12</subfield>'
        '    <subfield code="r"/>'
        '    <subfield code="s">2017-10-04 07:54:54</subfield>'
        '    <subfield code="t">Main</subfield>'
        '    <subfield code="v">1</subfield>'
        '    <subfield code="z"/>'
        '  </datafield>'
        '</record>'
    )  # record/1628455

    expected = [
        {
            'key': 'FIG10.png',
            'caption': r'Co-simulation results, at $50~\mathrm{ms}$...',
            'url': 'file:///afs/cern.ch/project/inspire/PROD/var/data/files/g151/3037399/content.png%3B1',
        },
        {
            'key': 'FIG11.png',
            'caption': r'Co-simulation results, at $50~\mathrm{ms}$...',
            'url': 'file:///afs/cern.ch/project/inspire/PROD/var/data/files/g151/3037400/content.png%3B1',
        },
        {
            'key': 'FIG12.png',
            'caption': r'Co-simulation results, at $50~\mathrm{ms}$...',
            'url': 'file:///afs/cern.ch/project/inspire/PROD/var/data/files/g151/3037401/content.png%3B1',
        }
    ]
    result = hep.do(create_record(snippet))
    assert validate(result['figures'], subschema) is None
    assert expected == result['figures']
    assert 'documents' not in result
Beispiel #55
0
def test_references_from_999C50_9_r_u_h_m_o():
    """Round-trip a 999C5 reference carrying $0, $9, $r, $u, $h, $m and $o."""
    references_schema = load_schema('hep')['properties']['references']

    marcxml = (
        '<datafield tag="999" ind1="C" ind2="5">'
        '  <subfield code="0">1511470</subfield>'
        '  <subfield code="9">CURATOR</subfield>'
        '  <subfield code="r">urn:nbn:de:hebis:77-diss-1000009520</subfield>'
        '  <subfield code="u">http://www.diss.fu-berlin.de/diss/receive/FUDISS_thesis_000000094316</subfield>'
        '  <subfield code="h">K. Wiebe</subfield>'
        '  <subfield code="m">Ph.D. thesis, University of Mainz, in preparation</subfield>'
        '  <subfield code="o">51</subfield>'
        '</datafield>'
    )  # record/1504897

    # MARC -> hep
    expected_reference = {
        'authors': [{'full_name': 'Wiebe, K.'}],
        'label': '51',
        'misc': ['Ph.D. thesis, University of Mainz, in preparation'],
        'report_numbers': ['urn:nbn:de:hebis:77-diss-1000009520'],
        'urls': [
            {'value': 'http://www.diss.fu-berlin.de/diss/receive/FUDISS_thesis_000000094316'},
        ],
    }
    expected_references = [
        {
            'curated_relation': False,
            'legacy_curated': True,
            'record': {'$ref': 'http://localhost:5000/api/literature/1511470'},
            'reference': expected_reference,
        },
    ]
    hep_record = hep.do(create_record(marcxml))

    assert validate(hep_record['references'], references_schema) is None
    assert expected_references == hep_record['references']

    # hep -> MARC
    expected_999C5 = [
        {
            '0': 1511470,
            '9': 'CURATOR',
            'h': ['Wiebe, K.'],
            'r': ['urn:nbn:de:hebis:77-diss-1000009520'],
            'm': 'Ph.D. thesis, University of Mainz, in preparation',
            'o': '51',
            'u': ['http://www.diss.fu-berlin.de/diss/receive/FUDISS_thesis_000000094316'],
            'z': 0,
        },
    ]
    marc_record = hep2marc.do(hep_record)

    assert expected_999C5 == marc_record['999C5']
Beispiel #56
0
def test_authors_from_100__a_u_and_multiple_700__a_u_e():
    """Convert a CDS record with one 100 and two 700 fields to hep authors."""
    authors_schema = load_schema('hep')['properties']['authors']

    cds_marcxml = (
        '<record>'
        '  <datafield tag="100" ind1=" " ind2=" ">'
        '    <subfield code="a">Aichinger, Ida</subfield>'
        '    <subfield code="u">Linz U.</subfield>'
        '  </datafield>'
        '  <datafield tag="700" ind1=" " ind2=" ">'
        '    <subfield code="a">Larcher, Gerhard</subfield>'
        '    <subfield code="u">Linz U.</subfield>'
        '    <subfield code="e">dir.</subfield>'
        '  </datafield>'
        '  <datafield tag="700" ind1=" " ind2=" ">'
        '    <subfield code="a">Kersevan, Roberto</subfield>'
        '    <subfield code="u">Linz U.</subfield>'
        '    <subfield code="e">dir.</subfield>'
        '  </datafield>'
        '</record>'
    )  # record/2295265

    # First stage: CDS MARC -> hep MARC; the 700 entries are expected in 701__.
    expected_100 = [
        {'a': 'Aichinger, Ida', 'u': 'Linz U.'},
    ]
    expected_701 = [
        {'a': 'Larcher, Gerhard', 'e': 'dir.', 'u': 'Linz U.'},
        {'a': 'Kersevan, Roberto', 'e': 'dir.', 'u': 'Linz U.'},
    ]
    hep_marc = cds2hep_marc.do(create_record(cds_marcxml))

    assert expected_100 == hep_marc['100__']
    assert expected_701 == hep_marc['701__']

    # Second stage: hep MARC -> hep record.
    expected_authors = [
        {
            'full_name': 'Aichinger, Ida',
            'affiliations': [{'value': 'Linz U.'}],
        },
        {
            'full_name': 'Larcher, Gerhard',
            'inspire_roles': ['supervisor'],
            'affiliations': [{'value': 'Linz U.'}],
        },
        {
            'full_name': 'Kersevan, Roberto',
            'inspire_roles': ['supervisor'],
            'affiliations': [{'value': 'Linz U.'}],
        },
    ]
    hep_record = hep.do(create_record_from_dict(hep_marc))

    assert validate(hep_record['authors'], authors_schema) is None
    assert expected_authors == hep_record['authors']
Beispiel #57
0
def test_validate_raises_if_no_schema_key():
    """Validating an empty dict with no schema name raises ``SchemaKeyNotFound``."""
    record_without_schema = {}

    with pytest.raises(errors.SchemaKeyNotFound):
        api.validate(data=record_without_schema)
Beispiel #58
0
def reporterrors(output):
    """Reports in a friendly way all failed records and corresponding motivation.

    Every ``InspireProdRecords`` row flagged as not valid is converted and
    validated again; failures are grouped by ``(collection, stage, error)``
    and dumped as CSV rows. Records whose collection is ``DELETED`` are
    skipped.

    :param output: path of the CSV file to write.
    """
    def get_collection(marc_record):
        """Return the collection under which a record's errors are grouped."""
        collections = set()
        for field in force_list(marc_record.get('980__')):
            for v in field.values():
                for e in force_list(v):
                    collections.add(e.upper().strip())
        if 'DELETED' in collections:
            return 'DELETED'
        for collection in collections:
            if collection in REAL_COLLECTIONS:
                return collection
        # Fallback when no known collection is present.
        return 'HEP'

    click.echo("Reporting broken records into {0}".format(output))
    errors = {}
    results = InspireProdRecords.query.filter(InspireProdRecords.valid == False) # noqa: ignore=F712
    results_length = results.count()
    with click.progressbar(results.yield_per(100), length=results_length) as bar:
        for obj in bar:
            marc_record = create_record(obj.marcxml, keep_singletons=False)
            collection = get_collection(marc_record)
            if 'DELETED' in collection:
                continue
            recid = int(marc_record['001'])
            try:
                json_record = marcxml2record(obj.marcxml)
            except Exception:
                # Deliberately broad: any conversion failure is reported,
                # keyed by its traceback so identical failures are grouped.
                tb = u''.join(traceback.format_tb(sys.exc_info()[2]))
                errors.setdefault((collection, 'dojson', tb), []).append(recid)
                continue

            try:
                validate(json_record)
            except jsonschema.exceptions.ValidationError as err:
                message_lines = str(err).splitlines()
                # Group by the "Failed validating ..." line; fall back to the
                # first line of the message (or '') when there is none, so a
                # single unusual error cannot abort the whole report.
                exc = next(
                    (
                        row
                        for row in message_lines
                        if row.startswith('Failed validating')
                    ),
                    message_lines[0] if message_lines else '',
                )
                details = u'\n'.join(
                    dropwhile(
                        lambda x: not x.startswith('On instance'),
                        message_lines
                    )
                )
                errors.setdefault(
                    (collection, 'validation', exc), []
                ).append((recid, details))
                continue

    with open(output, "w") as out:
        csv_writer = csv.writer(out)
        # ``items`` (rather than ``iteritems``) works on Python 2 and 3 alike.
        for (collection, stage, error), elements in errors.items():
            if stage == 'dojson':
                # One row per distinct traceback, listing every affected record.
                csv_writer.writerow((
                    collection,
                    stage,
                    error,
                    '\n'.join(
                        'http://inspirehep.net/record/{}'.format(recid)
                        for recid in elements
                    )
                ))
            else:
                # One row per record, since the details differ between records.
                for recid, details in elements:
                    csv_writer.writerow((
                        collection,
                        stage,
                        error,
                        'http://inspirehep.net/record/{}'.format(recid),
                        details
                    ))
    click.echo("Dumped errors into {}".format(output))
Beispiel #59
0
 def _get_record_from_processed_item(item, spider):
     """Run ``item`` through the pipeline and return its validated record.

     :param item: item handed to ``pipeline.process_item``.
     :param spider: spider handed to ``pipeline.process_item``.
     :return: the ``'record'`` entry of the pipeline result.
     """
     crawl_result = pipeline.process_item(item, spider)
     # Check that the pipeline produced something before indexing into it;
     # asserting after the lookup could never fail meaningfully.
     assert crawl_result
     validate(crawl_result['record'], 'hep')
     return crawl_result['record']
Beispiel #60
0
def test_references_from_999C5a_h_o_s_x_y_0():
    """Round-trip a 999C5 reference carrying $a, $h, $o, $s, $x, $y and $0."""
    references_schema = load_schema('hep')['properties']['references']

    marcxml = (
        '<datafield tag="999" ind1="C" ind2="5">'
        '  <subfield code="a">doi:10.1142/S0217751X0804055X</subfield>'
        '  <subfield code="h">G.K. Leontaris</subfield>'
        '  <subfield code="o">15</subfield>'
        '  <subfield code="s">Int.J.Mod.Phys.,A23,2055</subfield>'
        '  <subfield code="x">Int. J. Mod. Phys. A 23 (doi:10.1142/S0217751X0804055X)</subfield>'
        '  <subfield code="y">2008</subfield>'
        '  <subfield code="0">780399</subfield>'
        '</datafield>'
    )  # record/1478478

    # MARC -> hep
    expected_reference = {
        'dois': ['10.1142/S0217751X0804055X'],
        'authors': [{'full_name': u'Leontaris, G.K.'}],
        'label': '15',
        'publication_info': {
            'artid': '2055',
            'journal_title': 'Int.J.Mod.Phys.A',
            'journal_volume': '23',
            'page_start': '2055',
            'year': 2008,
        },
    }
    expected_references = [
        {
            'curated_relation': False,
            'record': {'$ref': 'http://localhost:5000/api/literature/780399'},
            'raw_refs': [
                {
                    'value': 'Int. J. Mod. Phys. A 23 (doi:10.1142/S0217751X0804055X)',
                    'schema': 'text',
                },
            ],
            'reference': expected_reference,
        },
    ]
    hep_record = hep.do(create_record(marcxml))

    assert validate(hep_record['references'], references_schema) is None
    assert expected_references == hep_record['references']

    # hep -> MARC
    expected_999C5 = [
        {
            'a': ['doi:10.1142/S0217751X0804055X'],
            'h': ['Leontaris, G.K.'],
            'o': '15',
            's': 'Int.J.Mod.Phys.,A23,2055',
            'x': ['Int. J. Mod. Phys. A 23 (doi:10.1142/S0217751X0804055X)'],
            'y': 2008,
            'z': 0,
            '0': 780399,
        },
    ]
    marc_record = hep2marc.do(hep_record)

    assert expected_999C5 == marc_record['999C5']