def test_prepare_files_ignores_keys_not_ending_with_pdf():
    """A file whose key lacks a ``.pdf`` suffix must be left untouched."""
    non_pdf_files = MockFiles({
        'foo.bar': AttrDict({
            'obj': AttrDict({
                'file': AttrDict({
                    'uri': '/data/foo.pdf',
                }),
            }),
        }),
    })
    obj = MockObj({}, {}, files=non_pdf_files)
    eng = MockEng()

    assert prepare_files(obj, eng) is None
    # Neither the record nor the log should have been modified.
    assert obj.data == {}
    assert obj.log._info.getvalue() == ''
def test_guess_coreness_when_rejected(j_a_r, g_b_u):
    """The beard 'Rejected' prediction is stored in extra_data."""
    core_score, non_core_score, rejected_score = (
        -3.064134460779941,
        -1.2487082195061714,
        1.375354035683761,
    )
    j_a_r.return_value = {
        'decision': 'Rejected',
        'scores': [core_score, non_core_score, rejected_score],
    }
    g_b_u.return_value = 'https://beard.inspirehep.net/predictor/coreness'

    obj = MockObj({}, {})
    eng = MockEng()

    assert guess_coreness(obj, eng) is None
    assert obj.extra_data['relevance_prediction'] == {
        'max_score': rejected_score,
        'decision': 'Rejected',
        'scores': {
            'CORE': core_score,
            'Non-CORE': non_core_score,
            'Rejected': rejected_score,
        },
        'relevance_score': -11.375354035683761,
    }
def test_is_marked():
    """``is_marked('foo')`` is truthy when ``extra_data['foo']`` is truthy."""
    obj = MockObj({}, {'foo': 'bar'})
    eng = MockEng()

    foo_predicate = is_marked('foo')

    assert foo_predicate(obj, eng)
def test_send_robotupload_does_nothing_when_not_in_production_mode():
    """With PRODUCTION_MODE off, send_robotupload must not contact legacy.

    The requests mock registers no URI, so any outgoing HTTP call would fail
    the test with a NoMockAddress error.
    """
    with requests_mock.Mocker():
        schema = load_schema('hep')
        subschema = schema['properties']['arxiv_eprints']

        config = {
            'LEGACY_ROBOTUPLOAD_URL': 'http://inspirehep.net',
            'PRODUCTION_MODE': False,
        }
        with patch.dict(current_app.config, config):
            data = {
                '$schema': 'http://localhost:5000/schemas/records/hep.json',
                'arxiv_eprints': [
                    {
                        'categories': [
                            'hep-th',
                        ],
                        'value': 'hep-th/9711200',
                    },
                ],
            }
            assert validate(data['arxiv_eprints'], subschema) is None

            obj = MockObj(data, {})
            eng = MockEng()

            _send_robotupload = send_robotupload(mode='insert')

            assert _send_robotupload(obj, eng) is None
def test_prepare_keywords_does_nothing_if_no_keywords_were_predicted():
    """Existing keywords survive prepare_keywords when nothing was predicted."""
    schema = load_schema('hep')
    subschema = schema['properties']['keywords']

    data = {
        'keywords': [
            {
                'schema': 'INSPIRE',
                'value': 'field theory: conformal',
            },
        ],
    }
    assert validate(data['keywords'], subschema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    assert prepare_keywords(obj, eng) is None

    result = obj.data
    assert validate(result['keywords'], subschema) is None
    # The original keywords must come back unchanged.
    assert result['keywords'] == [
        {
            'schema': 'INSPIRE',
            'value': 'field theory: conformal',
        },
    ]
def test_extract_journal_info_handles_year_an_empty_string():
    """extract_journal_info copes with this freetext without emitting a year."""
    schema = load_schema('hep')
    subschema = schema['properties']['publication_info']

    freetext = 'The Astrophysical Journal, 838:134 (16pp), 2017 April 1'
    data = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(data['publication_info'], subschema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    assert extract_journal_info(obj, eng) is None

    result = obj.data['publication_info']
    assert validate(result, subschema) is None
    assert result == [
        {
            'artid': '134',
            'journal_title': 'Astrophys. J.',
            'journal_volume': '838',
            'page_start': '134',
            'pubinfo_freetext': freetext,
        },
    ]
def test_fuzzy_match_returns_true_if_something_matched_without_abstracts(
        mock_match, enable_fuzzy_matcher):
    """A fuzzy hit without abstracts still populates extra_data['matches']."""
    schema = load_schema('hep')
    titles_schema = schema['properties']['titles']

    matched_record = {
        'control_number': 4328,
        'titles': [
            {
                'title': 'title',
            },
        ],
    }
    assert validate(matched_record['titles'], titles_schema) is None
    mock_match.return_value = iter([{'_source': matched_record}])

    obj = MockObj({}, {})
    eng = MockEng()

    assert fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data
    assert get_value(obj.extra_data, 'matches.fuzzy') == [
        {
            'control_number': 4328,
            'title': 'title',
        },
    ]
def test_refextract_valid_refs_from_raw_refs(mock_match):
    """Duplicate raw refs are collapsed into a single valid reference."""
    schema = load_schema('hep')
    subschema = schema['properties']['references']

    raw_ref = {
        'schema': 'text',
        'source': 'arXiv',
        'value': '[37] M. Vallisneri, \u201cUse and abuse of the Fisher information matrix in the assessment of gravitational-wave parameter-estimation prospects,\u201d Phys. Rev. D 77, 042001 (2008) doi:10.1103/PhysRevD.77.042001 [gr-qc/0703086 [GR-QC]].',
    }
    data = {
        'references': [
            # The same raw reference appears twice on purpose: refextract
            # is expected to deduplicate it.
            {'raw_refs': [dict(raw_ref), dict(raw_ref)]},
        ],
    }

    obj = MockObj(data, {})
    eng = MockEng()

    assert refextract(obj, eng) is None
    assert len(obj.data['references']) == 1
    assert validate(obj.data['references'], subschema) is None
def test_refextract_valid_refs_from_text(mock_match, mock_get_document_in_workflow):
    """TODO: Make this an integration test and also test reference matching."""
    # No fulltext document is available, so refextract must fall back to
    # the submitted formdata references.
    mock_get_document_in_workflow.return_value.__enter__.return_value = None
    mock_get_document_in_workflow.return_value.__exit__.return_value = None

    schema = load_schema('hep')
    refs_subschema = schema['properties']['references']
    acquisition_source_subschema = schema['properties']['acquisition_source']

    data = {'acquisition_source': {'source': 'submitter'}}
    assert validate(data['acquisition_source'], acquisition_source_subschema) is None

    reference = 'M.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167'
    extra_data = {
        'formdata': {
            # The same reference is submitted twice; refextract must dedupe.
            'references': reference + '\n' + reference,
        },
    }

    obj = MockObj(data, extra_data)
    eng = MockEng()

    assert refextract(obj, eng) is None
    assert len(obj.data['references']) == 1
    assert validate(obj.data['references'], refs_subschema) is None
def test_set_refereed_and_fix_document_type_replaces_article_with_conference_paper_if_needed(
        mock_replace_refs):
    """An article published in a proceedings journal becomes 'conference paper'."""
    journals_schema = load_schema('journals')
    proceedings_subschema = journals_schema['properties']['proceedings']
    journals = [{'proceedings': True}]
    assert validate(journals[0]['proceedings'], proceedings_subschema) is None
    mock_replace_refs.return_value = journals

    hep_schema = load_schema('hep')
    document_type_subschema = hep_schema['properties']['document_type']
    obj = MockObj({'document_type': ['article']}, {})
    eng = MockEng()

    assert set_refereed_and_fix_document_type(obj, eng) is None

    result = obj.data['document_type']
    assert validate(result, document_type_subschema) is None
    assert result == ['conference paper']
def test_validate_record():
    """A minimal valid hep record passes the validate_record task."""
    schema = load_schema('hep')
    data = {
        '_collections': [
            'Literature',
        ],
        'document_type': [
            'article',
        ],
        'titles': [
            {
                'title': 'Partial Symmetries of Weak Interactions'
            },
        ],
    }
    assert validate(data, schema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    _validate_record = validate_record('hep')

    assert _validate_record(obj, eng) is None
def test_set_refereed_and_fix_document_type_sets_refereed_to_false_if_all_journals_are_not_refereed(
        mock_replace_refs):
    """``refereed`` is set to False when no referenced journal is refereed."""
    journals_schema = load_schema('journals')
    journals_refereed_subschema = journals_schema['properties']['refereed']
    journals = [{'refereed': False}]
    assert validate(journals[0]['refereed'], journals_refereed_subschema) is None
    mock_replace_refs.return_value = journals

    hep_schema = load_schema('hep')
    refereed_subschema = hep_schema['properties']['refereed']
    obj = MockObj({'document_type': ['article']}, {})
    eng = MockEng()

    assert set_refereed_and_fix_document_type(obj, eng) is None

    result = obj.data['refereed']
    assert validate(result, refereed_subschema) is None
    assert result is False
def test_set_refereed_and_fix_document_type_handles_journals_that_publish_mixed_content(
        mock_replace_refs):
    """A journal that is both proceedings and refereed still yields refereed=True."""
    journals_schema = load_schema('journals')
    proceedings_subschema = journals_schema['properties']['proceedings']
    refereed_subschema_journals = journals_schema['properties']['refereed']
    journals = [{'proceedings': True, 'refereed': True}]
    assert validate(journals[0]['proceedings'], proceedings_subschema) is None
    assert validate(journals[0]['refereed'], refereed_subschema_journals) is None
    mock_replace_refs.return_value = journals

    hep_schema = load_schema('hep')
    refereed_subschema = hep_schema['properties']['refereed']
    obj = MockObj({'document_type': ['article']}, {})
    eng = MockEng()

    assert set_refereed_and_fix_document_type(obj, eng) is None

    result = obj.data['refereed']
    assert validate(result, refereed_subschema) is None
    assert result is True
def test_populate_submission_document_without_pdf():
    """A submission URL that serves HTML instead of a PDF adds no documents."""
    with requests_mock.Mocker() as requests_mocker:
        # The "PDF" URL actually returns an HTML fixture.
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1707.02785',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1707.02785.html')),
        )

        schema = load_schema('hep')
        subschema = schema['properties']['acquisition_source']
        data = {
            'acquisition_source': {
                'datetime': '2017-11-30T16:38:43.352370',
                'email': '*****@*****.**',
                'internal_uid': 54252,
                'method': 'submitter',
                'orcid': '0000-0002-2174-4493',
                'source': 'submitter',
                'submission_number': '1'
            }
        }
        assert validate(data['acquisition_source'], subschema) is None

        extra_data = {
            'submission_pdf': 'http://export.arxiv.org/pdf/1707.02785',
        }
        files = MockFiles({})
        obj = MockObj(data, extra_data, files=files)
        eng = MockEng()

        assert populate_submission_document(obj, eng) is None
        assert len(obj.data.get('documents', [])) == 0
def test_extract_journal_info_handles_the_journal_split():
    """extract_journal_info splits the journal title from the volume."""
    schema = load_schema('hep')
    subschema = schema['properties']['publication_info']

    freetext = 'Phys. Rev. D 96, 076008. 2017'
    data = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(data['publication_info'], subschema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    assert extract_journal_info(obj, eng) is None

    result = obj.data['publication_info']
    assert validate(result, subschema) is None
    assert result == [
        {
            'artid': '076008',
            'journal_title': 'Phys. Rev. D',
            'journal_volume': '96',
            'pubinfo_freetext': freetext,
        },
    ]
def test_is_arxiv_paper_returns_false_if_method_is_not_hepcrawl_or_arxiv():
    """A 'batchuploader' acquisition method is not treated as an arXiv paper."""
    schema = load_schema('hep')
    acquisition_source_schema = schema['properties']['acquisition_source']
    arxiv_eprints_schema = schema['properties']['arxiv_eprints']

    data = {
        'acquisition_source': {
            'method': 'batchuploader',
            'source': 'arxiv',
        },
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '0801.4782',
            },
        ],
    }
    assert validate(data['acquisition_source'], acquisition_source_schema) is None
    assert validate(data['arxiv_eprints'], arxiv_eprints_schema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    assert not is_arxiv_paper(obj, eng)
def test_extract_journal_info():
    """extract_journal_info parses pubinfo_freetext into structured fields."""
    schema = load_schema('hep')
    subschema = schema['properties']['publication_info']

    freetext = 'J. Math. Phys. 55, 082102 (2014)'
    data = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(data['publication_info'], subschema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    assert extract_journal_info(obj, eng) is None

    result = obj.data['publication_info']
    assert validate(result, subschema) is None
    assert result == [
        {
            'artid': '082102',
            'journal_title': 'J. Math. Phys.',
            'journal_volume': '55',
            'pubinfo_freetext': freetext,
            'year': 2014,
        }
    ]
def test_is_arxiv_paper_for_submission():
    """A submitter record carrying arxiv_eprints counts as an arXiv paper."""
    schema = load_schema('hep')
    acquisition_source_schema = schema['properties']['acquisition_source']
    arxiv_eprints_schema = schema['properties']['arxiv_eprints']

    data = {
        'acquisition_source': {
            'method': 'submitter',
        },
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '0801.4782',
            },
        ],
    }
    assert validate(data['acquisition_source'], acquisition_source_schema) is None
    assert validate(data['arxiv_eprints'], arxiv_eprints_schema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    assert is_arxiv_paper(obj, eng)
def test_core_is_not_written_in_extradata_if_article_is_non_core(app):
    """set_core_in_extra_data skips the 'core' flag for non-core categories."""
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    categories_config = {
        'ARXIV_CATEGORIES': {
            'core': ['hep-ph'],
            'non-core': ['astro-ph.CO', 'gr-qc']
        }
    }
    with patch.dict(app.config, categories_config):
        obj = MockObj({}, {})
        eng = MockEng()
        # Assign the record after construction, as the original test did.
        obj.data = {
            'arxiv_eprints': [
                {
                    'categories': [
                        'astro-ph.CO',
                    ],
                    'value': '1705.01122',
                },
            ],
        }
        assert validate(obj.data['arxiv_eprints'], subschema) is None

        set_core_in_extra_data(obj, eng)

        assert 'core' not in obj.extra_data
def test_download_documents():
    """download_documents fetches each document and rewrites its URL."""
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1605.03844',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1605.03844.pdf')),
        )

        schema = load_schema('hep')
        subschema = schema['properties']['documents']
        # literature/1458302
        data = {
            'documents': [
                {
                    'key': '1605.03844.pdf',
                    'url': 'http://export.arxiv.org/pdf/1605.03844'
                },
            ],
        }
        assert validate(data['documents'], subschema) is None

        files = MockFiles({})
        obj = MockObj(data, {}, files=files)
        eng = MockEng()

        assert download_documents(obj, eng) is None

        documents = obj.data['documents']
        assert len(documents) == 1
        assert documents[0]['url'] == '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf'
def test_fuzzy_match_returns_true_if_something_matched_with_publication_info(
        mock_match, enable_fuzzy_matcher):
    """A fuzzy hit carries its publication_info into extra_data['matches']."""
    schema = load_schema('hep')
    publication_info_schema = schema['properties']['publication_info']
    titles_schema = schema['properties']['titles']

    # Shared between the matched record and the expected match summary.
    publication_info = [
        {
            'artid': '054021',
            'journal_issue': '5',
            'journal_title': 'Phys.Rev.D',
            'journal_volume': '94',
            'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
            'year': 2016
        },
    ]
    matched_record = {
        'control_number': 1472986,
        'titles': [
            {
                'title': 'title',
            },
        ],
        'publication_info': publication_info,
    }
    assert validate(matched_record['titles'], titles_schema) is None
    assert validate(matched_record['publication_info'], publication_info_schema) is None
    mock_match.return_value = iter([{'_source': matched_record}])

    obj = MockObj({}, {})
    eng = MockEng()

    assert fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data

    expected = [{
        'control_number': 1472986,
        'title': 'title',
        'publication_info': publication_info,
    }]
    assert get_value(obj.extra_data, 'matches.fuzzy') == expected
def test_fix_submission_number():
    """A hepcrawl submission_number is replaced by the legacy task number."""
    schema = load_schema('hep')
    subschema = schema['properties']['acquisition_source']
    data = {
        'acquisition_source': {
            'method': 'hepcrawl',
            'submission_number': '751e374a017311e896d6fa163ec92c6a',
        },
    }
    assert validate(data['acquisition_source'], subschema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    fix_submission_number(obj, eng)

    result = obj.data['acquisition_source']
    assert validate(result, subschema) is None
    assert result == {
        'method': 'hepcrawl',
        'submission_number': '1',
    }
def test_send_robotupload_works_doesnt_fail_when_removing_references_and_no_references(
):
    """send_robotupload must not crash on a record without references."""
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'POST', 'http://inspirehep.net/batchuploader/robotupload/insert',
            text='[INFO] foo bar baz')

        config = {
            'LEGACY_ROBOTUPLOAD_URL': 'http://inspirehep.net',
            'PRODUCTION_MODE': True,
        }
        with patch.dict(current_app.config, config), \
                patch('inspirehep.modules.workflows.tasks.submission.record2marcxml') as mock_record2marcxml:
            data = {
                '$schema': 'http://localhost:5000/schemas/records/hep.json',
            }
            extra_data = {}
            obj = MockObj(data, extra_data)
            eng = MockEng()

            _send_robotupload = send_robotupload(mode='insert', )

            assert _send_robotupload(obj, eng) is None
            # BUG FIX: ``assert mock.called_with(...)`` always passed because
            # ``called_with`` is just an auto-created (truthy) Mock attribute.
            # Use the real assertion helper so the call is actually verified.
            mock_record2marcxml.assert_called_with(data)
def test_fix_submission_number_does_nothing_if_method_is_not_hepcrawl():
    """fix_submission_number leaves non-hepcrawl submissions untouched.

    BUG FIX: the function was missing the ``test_`` prefix, so pytest never
    collected or ran it.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['acquisition_source']
    data = {
        'acquisition_source': {
            'method': 'submitter',
            'submission_number': '869215',
        },
    }
    extra_data = {}
    assert validate(data['acquisition_source'], subschema) is None

    obj = MockObj(data, extra_data)
    eng = MockEng()

    fix_submission_number(obj, eng)

    expected = {
        'method': 'submitter',
        'submission_number': '869215',
    }
    result = obj.data['acquisition_source']
    assert validate(result, subschema) is None
    assert expected == result
def test_send_robotupload_update_article_when_feature_flag_is_enabled():
    """With the legacy-update flag on, updates are pushed via 'replace'."""
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'POST', 'http://inspirehep.net/batchuploader/robotupload/replace',
            text='[INFO] foo bar baz')

        config = {
            'LEGACY_ROBOTUPLOAD_URL': 'http://inspirehep.net',
            'PRODUCTION_MODE': True,
            'FEATURE_FLAG_ENABLE_UPDATE_TO_LEGACY': True
        }
        with patch.dict(current_app.config, config), \
                patch('inspirehep.modules.workflows.tasks.submission.record2marcxml'):
            data = {
                '$schema': 'http://localhost:5000/schemas/records/hep.json',
            }
            obj = MockObj(data, {'is-update': True})
            eng = MockEng()

            assert send_to_legacy(obj, eng) is None

            expected_log = (
                'Robotupload sent!'
                '[INFO] foo bar baz'
                'end of upload'
            )
            assert obj.log._info.getvalue() == expected_log
def test_populate_journal_coverage_writes_partial_if_all_coverages_are_partial(
        mock_replace_refs):
    """Coverage is 'partial' when every referenced journal is partial."""
    journals_schema = load_schema('journals')
    harvesting_info_subschema = journals_schema['properties']['_harvesting_info']
    journals = [{'_harvesting_info': {'coverage': 'partial'}}]
    assert validate(journals[0]['_harvesting_info'], harvesting_info_subschema) is None
    mock_replace_refs.return_value = journals

    hep_schema = load_schema('hep')
    publication_info_subschema = hep_schema['properties']['publication_info']
    data = {
        'publication_info': [
            {
                'journal_record': {
                    '$ref': 'http://localhost:/api/journals/1212337'
                }
            },
        ],
    }
    assert validate(data['publication_info'], publication_info_subschema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    assert populate_journal_coverage(obj, eng) is None
    assert obj.extra_data['journal_coverage'] == 'partial'
def test_is_marked_returns_false_when_value_is_falsy():
    """A falsy marker value must not count as marked."""
    obj = MockObj({}, {'foo': False})
    eng = MockEng()

    foo_predicate = is_marked('foo')

    assert not foo_predicate(obj, eng)
def test_classify_paper_with_no_fulltext(get_document_in_workflow, higgs_ontology):
    """Without a fulltext, classification falls back to title + abstract."""
    get_document_in_workflow.return_value.__enter__.return_value = None
    get_document_in_workflow.return_value.__exit__.return_value = None

    data = {
        'titles': [
            {
                'title': 'Some title',
            },
        ],
        'abstracts': [
            {
                'value': 'Very interesting paper about the Higgs boson.'
            },
        ],
    }
    obj = MockObj(data, {})
    eng = MockEng()

    classify = classify_paper(
        taxonomy=higgs_ontology,
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
        no_cache=True,
    )
    classify(obj, eng)

    results = obj.extra_data['classifier_results']
    assert results['complete_output']['core_keywords'] == [
        {'number': 1, 'keyword': 'Higgs particle'},
    ]
    assert results['fulltext_used'] is False
def test_is_marked_returns_false_when_key_does_not_exist():
    """A missing marker key must not count as marked."""
    obj = MockObj({}, {})
    eng = MockEng()

    foo_predicate = is_marked('foo')

    assert not foo_predicate(obj, eng)
def test_prepare_files_annotates_files_from_arxiv():
    """An attached PDF is registered in ``_fft`` as a non-user arXiv file."""
    schema = load_schema('hep')
    _fft_schema = schema['properties']['_fft']
    arxiv_eprints_schema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': ['hep-th'],
                'value': 'hep-th/9711200',
            },
        ],
    }
    files = MockFiles({
        'foo.pdf': AttrDict({
            'obj': AttrDict({
                'file': AttrDict({
                    'uri': '/data/foo.pdf',
                }),
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], arxiv_eprints_schema) is None

    obj = MockObj(data, {}, files=files)
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    result = obj.data
    assert validate(result['_fft'], _fft_schema) is None
    assert result['_fft'] == [
        {
            'path': '/data/foo.pdf',
            'type': 'arXiv',
            'filename': 'arxiv:foo',
            'format': '.pdf',
        },
    ]
    # The arxiv_eprints themselves must be untouched.
    assert validate(result['arxiv_eprints'], arxiv_eprints_schema) is None
    assert result['arxiv_eprints'] == [
        {
            'categories': [
                'hep-th',
            ],
            'value': 'hep-th/9711200',
        },
    ]
    assert obj.log._info.getvalue() == 'Non-user PDF files added to FFT.'