def test_classify_paper_with_fulltext(get_document_in_workflow, tmpdir, higgs_ontology):
    """Core keywords are extracted from the attached fulltext document."""
    fulltext_file = tmpdir.join('fulltext.txt')
    fulltext_file.write('Higgs boson')
    get_document_in_workflow.return_value.__enter__.return_value = binary_type(fulltext_file)
    get_document_in_workflow.return_value.__exit__.return_value = None

    workflow_obj = MockObj({}, {})
    engine = MockEng()
    expected_keywords = [{'number': 1, 'keyword': 'Higgs particle'}]

    classify_paper(
        taxonomy=higgs_ontology,
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
        no_cache=True,
    )(workflow_obj, engine)

    results = workflow_obj.extra_data['classifier_results']
    assert results['complete_output']['core_keywords'] == expected_keywords
    assert results['fulltext_used'] is True
def test_classify_paper_with_no_fulltext(get_document_in_workflow, higgs_ontology):
    """Classification falls back to title + abstract when no document exists."""
    record = {
        'titles': [{'title': 'Some title'}],
        'abstracts': [{'value': 'Very interesting paper about the Higgs boson.'}],
    }
    # Simulate get_document_in_workflow yielding no file at all.
    get_document_in_workflow.return_value.__enter__.return_value = None
    get_document_in_workflow.return_value.__exit__.return_value = None

    workflow_obj = MockObj(record, {})
    engine = MockEng()

    classify_paper(
        taxonomy=higgs_ontology,
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
        no_cache=True,
    )(workflow_obj, engine)

    results = workflow_obj.extra_data['classifier_results']
    assert results['complete_output']['core_keywords'] == [
        {'number': 1, 'keyword': 'Higgs particle'}
    ]
    assert results['fulltext_used'] is False
def test_classify_paper_uses_keywords(get_document_in_workflow):
    """Existing record keywords are fed to the classifier when no fulltext exists."""
    record = {
        'titles': [{'title': 'Some title'}],
        'keywords': [{'value': 'Higgs boson'}],
    }
    # Simulate get_document_in_workflow yielding no file at all.
    get_document_in_workflow.return_value.__enter__.return_value = None
    get_document_in_workflow.return_value.__exit__.return_value = None

    workflow_obj = MockObj(record, {})
    engine = MockEng()

    classify_paper(
        taxonomy="HEPont.rdf",
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
    )(workflow_obj, engine)

    results = workflow_obj.extra_data['classifier_results']
    assert results['complete_output']['core_keywords'] == [
        {'number': 1, 'keyword': 'Higgs particle'}
    ]
    assert results['fulltext_used'] is False
def test_classify_paper_with_no_fulltext(get_document_in_workflow, higgs_ontology):
    """With no document available, the abstract alone yields the core keyword."""
    metadata = {
        'titles': [{'title': 'Some title'}],
        'abstracts': [{'value': 'Very interesting paper about the Higgs boson.'}],
    }
    get_document_in_workflow.return_value.__enter__.return_value = None
    get_document_in_workflow.return_value.__exit__.return_value = None

    obj = MockObj(metadata, {})
    eng = MockEng()

    task = classify_paper(
        taxonomy=higgs_ontology,
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
        no_cache=True,
    )
    task(obj, eng)

    classifier_results = obj.extra_data['classifier_results']
    expected = [{'number': 1, 'keyword': 'Higgs particle'}]
    assert classifier_results['complete_output']['core_keywords'] == expected
    assert classifier_results['fulltext_used'] is False
def test_classify_paper_does_not_raise_on_unprintable_keywords(get_document_in_workflow, higgs_ontology):
    """Regression test: a PDF containing unprintable keywords must not crash."""
    fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.08709.pdf'))
    get_document_in_workflow.return_value.__enter__.return_value = fixture_path
    get_document_in_workflow.return_value.__exit__.return_value = None

    workflow_obj = MockObj({}, {})
    engine = MockEng()

    # The call itself is the assertion: it must complete without raising.
    classify_paper(
        taxonomy=higgs_ontology,
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
        no_cache=True,
    )(workflow_obj, engine)
def test_classify_paper_does_not_raise_on_unprintable_keywords(
        get_document_in_workflow, higgs_ontology):
    """Regression test: a PDF containing unprintable keywords must not crash."""
    pdf_fixture = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.08709.pdf'))
    get_document_in_workflow.return_value.__enter__.return_value = pdf_fixture
    get_document_in_workflow.return_value.__exit__.return_value = None

    obj = MockObj({}, {})
    eng = MockEng()

    task = classify_paper(
        taxonomy=higgs_ontology,
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
        no_cache=True,
    )
    # The call itself is the assertion: it must complete without raising.
    task(obj, eng)
def test_classify_paper_with_fulltext(get_document_in_workflow, tmpdir,
                                      higgs_ontology):
    """Core keywords come from the fulltext when a document is attached."""
    document = tmpdir.join('fulltext.txt')
    document.write('Higgs boson')
    get_document_in_workflow.return_value.__enter__.return_value = binary_type(
        document)
    get_document_in_workflow.return_value.__exit__.return_value = None

    obj = MockObj({}, {})
    eng = MockEng()

    task = classify_paper(
        taxonomy=higgs_ontology,
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
        no_cache=True,
    )
    task(obj, eng)

    classifier_results = obj.extra_data['classifier_results']
    expected = [{'number': 1, 'keyword': 'Higgs particle'}]
    assert classifier_results['complete_output']['core_keywords'] == expected
    assert classifier_results['fulltext_used'] is True
arxiv_author_list("authorlist2marcxml.xsl"), ] ), IF( is_submission, populate_submission_document, ), download_documents, normalize_journal_titles, refextract, count_reference_coreness, extract_journal_info, populate_journal_coverage, classify_paper( only_core_tags=False, spires=True, with_author_keywords=True, ), filter_core_keywords, guess_categories, IF( is_experimental_paper, guess_experiments, ), guess_keywords, guess_coreness, ] NOTIFY_NOT_ACCEPTED = [ IF(
class Article(object):
    """Article ingestion workflow for Literature collection.

    Declarative workflow definition: ``workflow`` is an ordered list of
    tasks and IF/IF_ELSE combinators executed by the workflow engine.
    The order of elements is the behavior — do not reorder.
    """

    name = "HEP"
    data_type = "hep"

    workflow = [
        # Make sure schema is set for proper indexing in Holding Pen
        set_schema,
        # Emit record signals to receive metadata enrichment
        emit_record_signals,
        # Query locally or via legacy search API to see if article
        # is already ingested and this is an update
        IF(article_exists, [
            mark('match-found', True),
        ]),
        IF_ELSE(
            is_submission,
            [
                # Article matching for submissions
                # ================================
                IF(pending_in_holding_pen, [
                    mark('already-in-holding-pen', True),
                ]),
                # Special RT integration for submissions
                # ======================================
                create_ticket(
                    template="literaturesuggest/tickets/curator_submitted.html",
                    queue="HEP_add_user",
                    context_factory=new_ticket_context,
                    ticket_id_key="ticket_id"),
                reply_ticket(
                    template="literaturesuggest/tickets/user_submitted.html",
                    context_factory=reply_ticket_context,
                    keep_new=True),
            ],
            [
                # Article matching for non-submissions
                # ====================================
                # Query holding pen to see if we already have this article
                # ingested
                #
                # NOTE on updates:
                #     If the same article has been harvested before and the
                #     ingestion has been completed, process is continued
                #     to allow for updates.
                IF(pending_in_holding_pen, [
                    mark('already-in-holding-pen', True),
                    mark('delete', True),
                ]),
                IF(
                    is_arxiv_paper,
                    [
                        # FIXME: This filtering step should be removed when
                        # this workflow includes arXiv CORE harvesting
                        IF(already_harvested, [
                            mark('already-ingested', True),
                            mark('stop', True),
                        ]),
                        # FIXME: This filtering step should be removed when:
                        # old previously rejected records are treated
                        # differently e.g. good auto-reject heuristics or
                        # better time based filtering (5 days is quite
                        # random now).
                        IF(previously_rejected(), [
                            mark('already-ingested', True),
                            mark('stop', True),
                        ]),
                    ]),
                # 'delete': replace the old holding-pen object and end here.
                IF(is_marked('delete'),
                   [update_old_object, delete_self_and_stop_processing]),
                IF(is_marked('stop'), [stop_processing]),
            ]),
        #
        # Article Processing
        # ==================
        IF(is_arxiv_paper, [
            arxiv_fulltext_download,
            arxiv_plot_extract,
            arxiv_refextract,
            arxiv_author_list("authorlist2marcxml.xsl"),
        ]),
        extract_journal_info,
        classify_paper(
            taxonomy="HEPont.rdf",
            only_core_tags=False,
            spires=True,
            with_author_keywords=True,
        ),
        filter_core_keywords,
        guess_categories,
        IF(is_experimental_paper, [
            guess_experiments,
        ]),
        guess_keywords,
        # Predict action for a generic HEP paper based only on title
        # and abstract.
        guess_coreness,  # ("arxiv_skip_astro_title_abstract.pickle)
        # Check if we shall halt or auto-reject
        # =====================================
        # NOTE: User submissions are always relevant
        IF_ELSE(is_record_relevant, [
            halt_record(action="hep_approval"),
        ], [reject_record("Article automatically rejected"),
            stop_processing]),
        IF_ELSE(is_record_accepted, [
            IF(article_exists, [
                IF_ELSE(is_submission, [
                    # Submission duplicates an existing record: reject and
                    # notify the submitter before stopping.
                    reject_record('Article was already found on INSPIRE'),
                    stop_processing,
                    reply_ticket(
                        template="literaturesuggest/tickets/user_rejected_exists.html",
                        context_factory=reply_ticket_context),
                    close_ticket(ticket_id_key="ticket_id"),
                ], [
                    halt_record(action="merge_approval"),
                ]),
            ]),
            add_core,
            add_note_entry,
            filter_keywords,
            user_pdf_get,
            IF_ELSE(shall_push_remotely, [
                IF_ELSE(article_exists, [
                    # Existing record: send a "correct" robotupload built
                    # from the prepared update payload.
                    prepare_update_payload(extra_data_key="update_payload"),
                    send_robotupload(marcxml_processor=hep2marc,
                                     mode="correct",
                                     extra_data_key="update_payload"),
                ], [
                    send_robotupload(marcxml_processor=hep2marc,
                                     mode="insert"),
                ])
            ], [store_record]),
            IF(is_submission, [
                IF(curation_ticket_needed, [
                    create_ticket(
                        template="literaturesuggest/tickets/curation_core.html",
                        queue="HEP_curation",
                        context_factory=curation_ticket_context,
                        ticket_id_key="curation_ticket_id")
                ]),
                reply_ticket(
                    template="literaturesuggest/tickets/user_accepted.html",
                    context_factory=reply_ticket_context),
            ]),
        ], [
            IF(is_submission,
               [reply_ticket(context_factory=reply_ticket_context)])
        ]),
        close_ticket(ticket_id_key="ticket_id")
    ]
IF(is_arxiv_paper, [ arxiv_fulltext_download, arxiv_package_download, arxiv_plot_extract, refextract, arxiv_derive_inspire_categories, arxiv_author_list("authorlist2marcxml.xsl"), ]), IF(is_submission, [ submission_fulltext_download, refextract, ]), extract_journal_info, classify_paper( taxonomy="HEPont.rdf", only_core_tags=False, spires=True, with_author_keywords=True, ), filter_core_keywords, guess_categories, IF(is_experimental_paper, [guess_experiments]), guess_keywords, # Predict action for a generic HEP paper based only on title # and abstract. guess_coreness, # ("arxiv_skip_astro_title_abstract.pickle) # Check if we shall halt or auto-reject # ===================================== ] CHECK_IF_SUBMISSION_AND_ASK_FOR_APPROVAL = [ IF_ELSE(is_record_relevant, [
# Article Processing # ================== IF( is_arxiv_paper, [ arxiv_fulltext_download, arxiv_plot_extract, arxiv_refextract, arxiv_derive_inspire_categories, arxiv_author_list("authorlist2marcxml.xsl"), ] ), extract_journal_info, classify_paper( taxonomy="HEPont.rdf", only_core_tags=False, spires=True, with_author_keywords=True, ), filter_core_keywords, guess_categories, # TODO: adapt the output of guess_experiment so that it # can be stored in ElasticSearch (see issue #2054). # IF( # is_experimental_paper, # [guess_experiments] # ), guess_keywords, # Predict action for a generic HEP paper based only on title # and abstract. guess_coreness, # ("arxiv_skip_astro_title_abstract.pickle) # Check if we shall halt or auto-reject