def test_arxiv_author_list_handles_auto_ignore_comment():
    """Run the author-list task against the ``1703.09986.tar.gz`` fixture.

    The record carries a single ``hep-ex`` eprint and one attached file named
    after that eprint.  The only expectations are that the eprint data is
    schema-valid and that the task returns ``None``.
    """
    hep_schema = load_schema('hep')
    eprints_schema = hep_schema['properties']['arxiv_eprints']

    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    # record/1519995
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached_files = MockFiles({'1703.09986.tar.gz': tarball_entry})

    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()

    assert author_list_task(workflow_obj, engine) is None
def test_arxiv_author_list_with_missing_tarball():
    """The task returns ``None`` and logs a notice when no tarball matches.

    The only attached file is keyed ``jessica.jones.tar.gz``, so there is no
    ``1703.09986.tar.gz`` entry for the eprint under test; the info log must
    contain the "Skipping author list extraction" message.
    """
    hep_schema = load_schema('hep')
    eprints_schema = hep_schema['properties']['arxiv_eprints']

    # record/1519995
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    validate(record['arxiv_eprints'], eprints_schema)

    # Deliberately unrelated file name and URI.
    unrelated_entry = AttrDict({
        'file': AttrDict({'uri': 'alias.investigations'}),
    })
    attached_files = MockFiles({'jessica.jones.tar.gz': unrelated_entry})

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()

    skip_notice = (
        'Skipping author list extraction, no tarball with name "1703.09986.tar.gz" found'
    )

    assert author_list_task(workflow_obj, engine) is None
    assert skip_notice in workflow_obj.log._info.getvalue()
def test_arxiv_author_list_logs_on_error(mock_untar):
    """An ``InvalidTarball`` raised while untarring ends up in the info log.

    ``mock_untar`` gets ``InvalidTarball`` as its ``side_effect``; it is
    presumably injected by a ``mock.patch`` decorator or a fixture that is not
    part of this block -- confirm against the surrounding file.  The task must
    still return ``None`` and the info log must mention the arXiv id.
    """
    mock_untar.side_effect = InvalidTarball

    hep_schema = load_schema('hep')
    eprints_schema = hep_schema['properties']['arxiv_eprints']

    fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1605.07707'))

    # synthetic data
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-th'],
                'value': '1605.07707',
            },
        ],
    }

    tarball_entry = AttrDict({'file': AttrDict({'uri': fixture_path})})
    attached_files = MockFiles({'1605.07707.tar.gz': tarball_entry})

    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()

    assert author_list_task(workflow_obj, engine) is None
    assert '1605.07707' in workflow_obj.log._info.getvalue()
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Authors are extracted from the ``multiple_author_lists`` fixture.

    The fixture tarball is attached under the key ``1703.09986.tar.gz``.
    After the task runs, ``obj.data['authors']`` must equal the two expected
    (schema-valid) author entries.
    """
    hep_schema = load_schema('hep')
    eprints_schema = hep_schema['properties']['arxiv_eprints']

    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'))

    # record/1519995
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    validate(record['arxiv_eprints'], eprints_schema)

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached_files = MockFiles({'1703.09986.tar.gz': tarball_entry})

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()
    author_list_task(workflow_obj, engine)

    authors_schema = hep_schema['properties']['authors']

    def _expected_author(full_name, inspire_id, cern_id):
        # Both expected authors share the affiliation and the id schemas.
        return {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': inspire_id, 'schema': 'INSPIRE ID'},
                {'value': cern_id, 'schema': 'CERN'},
            ],
            'full_name': full_name,
        }

    expected_authors = [
        _expected_author(
            'Sirunyan, Albert M', 'INSPIRE-00312131', 'CERN-432142'),
        _expected_author(
            'Weary, Jake', 'INSPIRE-00312132', 'CERN-432143'),
    ]
    validate(expected_authors, authors_schema)

    assert workflow_obj.data.get('authors') == expected_authors
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Authors are extracted from the ``multiple_author_lists`` fixture.

    The fixture tarball is attached under the key ``1703.09986.tar.gz``.
    After the task runs, ``obj.data['authors']`` must equal the two expected
    (schema-valid) author entries.
    """
    hep_schema = load_schema('hep')
    eprints_schema = hep_schema['properties']['arxiv_eprints']

    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'))

    # record/1519995
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    validate(record['arxiv_eprints'], eprints_schema)

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached_files = MockFiles({'1703.09986.tar.gz': tarball_entry})

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()
    author_list_task(workflow_obj, engine)

    authors_schema = hep_schema['properties']['authors']

    def _expected_author(full_name, inspire_id, cern_id):
        # Both expected authors share the affiliation and the id schemas.
        return {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': inspire_id, 'schema': 'INSPIRE ID'},
                {'value': cern_id, 'schema': 'CERN'},
            ],
            'full_name': full_name,
        }

    expected_authors = [
        _expected_author(
            'Sirunyan, Albert M', 'INSPIRE-00312131', 'CERN-432142'),
        _expected_author(
            'Weary, Jake', 'INSPIRE-00312132', 'CERN-432143'),
    ]
    validate(expected_authors, authors_schema)

    assert workflow_obj.data.get('authors') == expected_authors
def test_arxiv_author_list_does_not_produce_latex():
    """The extracted author name is the plain unicode string.

    Runs the task on the ``1802.03388.tar.gz`` fixture and expects a single
    author whose ``full_name`` is ``u'Åkesson, Torsten Paul Ake'``.
    """
    hep_schema = load_schema('hep')

    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.03388.tar.gz'))

    eprints_schema = hep_schema['properties']['arxiv_eprints']
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1802.03388',
            },
        ],
    }
    validate(record['arxiv_eprints'], eprints_schema)

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached_files = MockFiles({'1802.03388.tar.gz': tarball_entry})

    authors_schema = hep_schema['properties']['authors']
    expected_author = {
        'affiliations': [{'value': 'Lund U.'}],
        'ids': [
            {'value': 'INSPIRE-00061248', 'schema': 'INSPIRE ID'},
        ],
        'full_name': u'Åkesson, Torsten Paul Ake',
    }
    expected_authors = [expected_author]
    validate(expected_authors, authors_schema)

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()

    assert author_list_task(workflow_obj, engine) is None
    assert workflow_obj.data.get('authors') == expected_authors
def test_arxiv_author_list_logs_on_error(mock_os, mock_untar):
    """An ``InvalidTarball`` from untarring is reported in the error log.

    ``mock_untar`` gets ``InvalidTarball`` as its ``side_effect`` and
    ``mock_os.path.abspath`` is pointed at a scratch directory.  Both mocks
    are presumably injected by ``mock.patch`` decorators or fixtures that are
    not part of this block -- confirm against the surrounding file.

    The task must return ``None`` and the error log must consist of exactly
    the "Invalid tarball ..." message for the eprint under test.
    """
    mock_untar.side_effect = InvalidTarball

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    # synthetic data
    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '1605.07707',
            },
        ],
    }
    extra_data = {}
    files = MockFiles({
        '1605.07707.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1605.07707',
            })
        })
    })

    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the scratch directory *before* entering the ``try``: if
    # ``mkdtemp`` itself failed inside the ``try``, the ``finally`` clause
    # would reference an unbound ``temporary_dir`` and hide the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None

        expected = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
        result = obj.log._error.getvalue()

        assert expected == result
    finally:
        # Always remove the scratch directory, even when an assertion fails.
        rmtree(temporary_dir)
def test_arxiv_author_list_only_overrides_authors():
    """Running the task leaves ``arxiv_eprints`` and ``$schema`` in place.

    After the task runs on the ``1703.09986.tar.gz`` fixture, both keys must
    still be present in ``obj.data`` and compare equal to the input values.
    """
    hep_schema = load_schema('hep')
    eprints_schema = hep_schema['properties']['arxiv_eprints']

    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    # record/1519995
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    validate(record['arxiv_eprints'], eprints_schema)

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached_files = MockFiles({'1703.09986.tar.gz': tarball_entry})

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()
    author_list_task(workflow_obj, engine)

    assert 'arxiv_eprints' in workflow_obj.data
    assert workflow_obj.data['arxiv_eprints'] == record['arxiv_eprints']
    assert '$schema' in workflow_obj.data
    assert workflow_obj.data['$schema'] == record['$schema']
def test_arxiv_author_list_only_overrides_authors():
    """Running the task leaves ``arxiv_eprints`` and ``$schema`` in place.

    After the task runs on the ``1703.09986.tar.gz`` fixture, both keys must
    still be present in ``obj.data`` and compare equal to the input values.
    """
    hep_schema = load_schema('hep')
    eprints_schema = hep_schema['properties']['arxiv_eprints']

    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    # record/1519995
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    validate(record['arxiv_eprints'], eprints_schema)

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached_files = MockFiles({'1703.09986.tar.gz': tarball_entry})

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()
    author_list_task(workflow_obj, engine)

    assert 'arxiv_eprints' in workflow_obj.data
    assert workflow_obj.data['arxiv_eprints'] == record['arxiv_eprints']
    assert '$schema' in workflow_obj.data
    assert workflow_obj.data['$schema'] == record['$schema']
def test_arxiv_author_list_handles_auto_ignore_comment(mock_os):
    """Run the author-list task against the ``1703.09986.tar.gz`` fixture.

    ``mock_os.path.abspath`` is pointed at a scratch directory; the mock is
    presumably injected by a ``mock.patch`` decorator or fixture that is not
    part of this block -- confirm against the surrounding file.

    The only expectations are that the eprint data is schema-valid and that
    the task returns ``None``.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    # record/1519995
    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1703.09986',
            },
        ],
    }
    extra_data = {}
    files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            })
        })
    })

    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the scratch directory *before* entering the ``try``: if
    # ``mkdtemp`` itself failed inside the ``try``, the ``finally`` clause
    # would reference an unbound ``temporary_dir`` and hide the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None
    finally:
        # Always remove the scratch directory, even when the assertion fails.
        rmtree(temporary_dir)
def test_arxiv_author_list_logs_on_error(mock_untar):
    """An ``InvalidTarball`` from untarring is reported in the info log.

    ``mock_untar`` gets ``InvalidTarball`` as its ``side_effect``; it is
    presumably injected by a ``mock.patch`` decorator or a fixture that is not
    part of this block -- confirm against the surrounding file.  The task must
    return ``None`` and the info log must consist of exactly the
    "Invalid tarball ..." message for the eprint under test.
    """
    mock_untar.side_effect = InvalidTarball

    hep_schema = load_schema('hep')
    eprints_schema = hep_schema['properties']['arxiv_eprints']

    # synthetic data
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-th'],
                'value': '1605.07707',
            },
        ],
    }

    tarball_entry = AttrDict({
        'file': AttrDict({
            'uri': 'http://export.arxiv.org/e-print/1605.07707',
        }),
    })
    attached_files = MockFiles({'1605.07707.tar.gz': tarball_entry})

    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()

    assert author_list_task(workflow_obj, engine) is None

    expected = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
    logged = workflow_obj.log._info.getvalue()

    assert expected == logged
set_core_in_extra_data, ], mark('auto-approved', False), ), ), ] ENHANCE_RECORD = [ IF( is_arxiv_paper, [ populate_arxiv_document, arxiv_package_download, arxiv_plot_extract, arxiv_derive_inspire_categories, arxiv_author_list("authorlist2marcxml.xsl"), ] ), IF( is_submission, populate_submission_document, ), download_documents, normalize_journal_titles, refextract, count_reference_coreness, extract_journal_info, populate_journal_coverage, classify_paper( only_core_tags=False, spires=True,
class Article(object):
    """Article ingestion workflow for Literature collection.

    ``workflow`` is the ordered list of tasks: matching against existing
    records, arXiv-specific processing, classification and guessing steps,
    the relevance/approval gates, and finally upload or storage together
    with the ticket handling for user submissions.
    """

    name = "HEP"
    data_type = "hep"

    workflow = [
        # The schema must be set so the record is indexed properly in the
        # Holding Pen.
        set_schema,
        # Record signals are emitted to receive metadata enrichment.
        emit_record_signals,
        # Query locally or via the legacy search API to find out whether the
        # article is already ingested, i.e. whether this is an update.
        IF(
            article_exists,
            [
                mark('match-found', True),
            ],
        ),
        IF_ELSE(
            is_submission,
            [
                # Article matching for submissions
                # ================================
                IF(
                    pending_in_holding_pen,
                    [
                        mark('already-in-holding-pen', True),
                    ],
                ),
                # Special RT integration for submissions
                # ======================================
                create_ticket(
                    template="literaturesuggest/tickets/curator_submitted.html",
                    queue="HEP_add_user",
                    context_factory=new_ticket_context,
                    ticket_id_key="ticket_id",
                ),
                reply_ticket(
                    template="literaturesuggest/tickets/user_submitted.html",
                    context_factory=reply_ticket_context,
                    keep_new=True,
                ),
            ],
            [
                # Article matching for non-submissions
                # ====================================
                # Query the holding pen to see whether this article has
                # already been ingested.
                #
                # NOTE on updates:
                #     If the same article has been harvested before and the
                #     ingestion has been completed, the process continues
                #     to allow for updates.
                IF(
                    pending_in_holding_pen,
                    [
                        mark('already-in-holding-pen', True),
                        mark('delete', True),
                    ],
                ),
                IF(
                    is_arxiv_paper,
                    [
                        # FIXME: This filtering step should be removed when
                        #        this workflow includes arXiv CORE harvesting.
                        IF(
                            already_harvested,
                            [
                                mark('already-ingested', True),
                                mark('stop', True),
                            ],
                        ),
                        # FIXME: This filtering step should be removed when
                        #        old previously rejected records are treated
                        #        differently, e.g. good auto-reject heuristics
                        #        or better time based filtering (5 days is
                        #        quite random now).
                        IF(
                            previously_rejected(),
                            [
                                mark('already-ingested', True),
                                mark('stop', True),
                            ],
                        ),
                    ],
                ),
                IF(
                    is_marked('delete'),
                    [
                        update_old_object,
                        delete_self_and_stop_processing,
                    ],
                ),
                IF(
                    is_marked('stop'),
                    [
                        stop_processing,
                    ],
                ),
            ],
        ),
        #
        # Article Processing
        # ==================
        IF(
            is_arxiv_paper,
            [
                arxiv_fulltext_download,
                arxiv_plot_extract,
                arxiv_refextract,
                arxiv_author_list("authorlist2marcxml.xsl"),
            ],
        ),
        extract_journal_info,
        classify_paper(
            taxonomy="HEPont.rdf",
            only_core_tags=False,
            spires=True,
            with_author_keywords=True,
        ),
        filter_core_keywords,
        guess_categories,
        IF(
            is_experimental_paper,
            [
                guess_experiments,
            ],
        ),
        guess_keywords,
        # Predict the action for a generic HEP paper based only on title and
        # abstract ("arxiv_skip_astro_title_abstract.pickle").
        guess_coreness,
        # Check if we shall halt or auto-reject
        # =====================================
        # NOTE: User submissions are always relevant.
        IF_ELSE(
            is_record_relevant,
            [
                halt_record(action="hep_approval"),
            ],
            [
                reject_record("Article automatically rejected"),
                stop_processing,
            ],
        ),
        IF_ELSE(
            is_record_accepted,
            [
                IF(
                    article_exists,
                    [
                        IF_ELSE(
                            is_submission,
                            [
                                reject_record('Article was already found on INSPIRE'),
                                # NOTE(review): the two ticket tasks below come
                                # after ``stop_processing``; if that task halts
                                # the workflow they may never run -- verify.
                                stop_processing,
                                reply_ticket(
                                    template="literaturesuggest/tickets/user_rejected_exists.html",
                                    context_factory=reply_ticket_context,
                                ),
                                close_ticket(ticket_id_key="ticket_id"),
                            ],
                            [
                                halt_record(action="merge_approval"),
                            ],
                        ),
                    ],
                ),
                add_core,
                add_note_entry,
                filter_keywords,
                user_pdf_get,
                IF_ELSE(
                    shall_push_remotely,
                    [
                        IF_ELSE(
                            article_exists,
                            [
                                prepare_update_payload(
                                    extra_data_key="update_payload",
                                ),
                                send_robotupload(
                                    marcxml_processor=hep2marc,
                                    mode="correct",
                                    extra_data_key="update_payload",
                                ),
                            ],
                            [
                                send_robotupload(
                                    marcxml_processor=hep2marc,
                                    mode="insert",
                                ),
                            ],
                        ),
                    ],
                    [
                        store_record,
                    ],
                ),
                IF(
                    is_submission,
                    [
                        IF(
                            curation_ticket_needed,
                            [
                                create_ticket(
                                    template="literaturesuggest/tickets/curation_core.html",
                                    queue="HEP_curation",
                                    context_factory=curation_ticket_context,
                                    ticket_id_key="curation_ticket_id",
                                ),
                            ],
                        ),
                        reply_ticket(
                            template="literaturesuggest/tickets/user_accepted.html",
                            context_factory=reply_ticket_context,
                        ),
                    ],
                ),
            ],
            [
                IF(
                    is_submission,
                    [
                        reply_ticket(context_factory=reply_ticket_context),
                    ],
                ),
            ],
        ),
        close_ticket(ticket_id_key="ticket_id"),
    ]
[stop_processing] ), ] ENHANCE_RECORD = [ # Article Processing # ================== IF( is_arxiv_paper, [ arxiv_fulltext_download, arxiv_plot_extract, arxiv_refextract, arxiv_derive_inspire_categories, arxiv_author_list("authorlist2marcxml.xsl"), ] ), extract_journal_info, classify_paper( taxonomy="HEPont.rdf", only_core_tags=False, spires=True, with_author_keywords=True, ), filter_core_keywords, guess_categories, # TODO: adapt the output of guess_experiment so that it # can be stored in ElasticSearch (see issue #2054). # IF( # is_experimental_paper,