def test_harvesting_arxiv_workflow_accepted(
        mocked, db_only_app, record_oai_arxiv_plots):
    """Test a full harvesting workflow."""
    from invenio_workflows import (
        start,
        WorkflowEngine,
        ObjectStatus,
        workflow_object_class,
    )
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl"
    )
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    record_json = hep.do(record_marc)

    workflow_uuid = None
    with db_only_app.app_context():
        workflow_uuid = start('article', [record_json])
        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]

        assert obj.status == ObjectStatus.HALTED
        assert obj.data_type == "hep"

        # Files should have been attached (tarball + pdf)
        assert obj.files["1407.7587.pdf"]
        assert obj.files["1407.7587.tar.gz"]

        # A publication note should have been extracted
        pub_info = obj.data.get('publication_info')
        assert pub_info
        assert pub_info[0]
        assert pub_info[0].get('year') == "2014"
        assert pub_info[0].get('journal_title') == "J. Math. Phys."

        # This record should not have been touched yet
        assert "approved" not in obj.extra_data

        # Now let's resolve it as accepted and continue
        # FIXME Should be accept, but record validation prevents us.
        obj.remove_action()
        obj.extra_data["approved"] = True
        obj.extra_data["core"] = True
        obj.save()
        db.session.commit()

    with db_only_app.app_context():
        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]
        obj_id = obj.id
        obj.continue_workflow()

        obj = workflow_object_class.get(obj_id)
        # It was accepted
        assert obj.status == ObjectStatus.COMPLETED
def spawn_arXiv_workflow_from_oai_harvest(request, records, name, **kwargs):
    """Receive a list of harvested arXiv records and schedule workflow."""
    from flask import current_app
    from invenio_workflows import start, workflows

    if request.endpoint not in ARXIV_URLS:
        # This is not arXiv
        return

    spider = kwargs.get('spider')
    workflow = kwargs.get('workflow')
    if spider or workflow:
        # Taken care of by inspire-crawler
        return

    workflow = "article"
    if workflow not in workflows:
        current_app.logger.warning(
            "{0} not in available workflows. Skipping workflow {1}.".format(
                workflow, name
            )
        )
        return

    for record in records:
        recxml = six.text_type(record)
        marcxml = convert(recxml, "oaiarXiv2marcxml.xsl")
        record = create_record(marcxml)
        hep_record = hep.do(record)
        start.delay(workflow, data=[hep_record])
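# Hedged sketch (assumption, not from the source): one plausible way to
# register the receiver above with the OAI harvester. It assumes
# invenio-oaiharvester exposes an `oaiharvest_finished` signal that is sent
# with the harvest request as sender and `records` / `name` as keyword
# arguments; verify against the installed version before relying on this.
from invenio_oaiharvester.signals import oaiharvest_finished

oaiharvest_finished.connect(spawn_arXiv_workflow_from_oai_harvest)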
def _author_list(obj, eng):
    from inspirehep.modules.converter.xslt import convert

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    sub_dir = os.path.abspath("{0}_files".format(tarball))
    try:
        file_list = untar(tarball, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [filename for filename in file_list
                      if filename.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authorlist_record = get_json_from_marcxml(authors_xml)[0]
            record.update(authorlist_record)
            obj.update_task_results(
                "authors",
                [{
                    "name": "authors",
                    "results": authorlist_record["authors"]
                }]
            )
            obj.update_task_results(
                "number_of_authors",
                [{
                    "name": "number_of_authors",
                    "results": authorlist_record["number_of_authors"]
                }]
            )
            break

    model.update()
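# Hedged sketch (assumption, not from the source): `stylesheet` is a free
# variable in `_author_list` above, so the function is presumably produced by
# a task factory that closes over it. The factory name and the default
# stylesheet below are hypothetical, shown only to illustrate the closure.
def author_list(stylesheet="authorlist2marcxml.xsl"):
    """Return a workflow task that extracts an author list via `stylesheet`."""
    def _author_list(obj, eng):
        ...  # body as defined above, closing over `stylesheet`
    return _author_list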
def test_harvesting_arxiv_workflow_accepted(mocked, small_app,
                                            record_oai_arxiv_plots):
    """Test a full harvesting workflow."""
    from invenio_workflows import (start, WorkflowEngine, ObjectStatus,
                                   workflow_object_class)
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(record_oai_arxiv_plots,
                                             "oaiarXiv2marcxml.xsl")
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    record_json = hep.do(record_marc)

    workflow_uuid = None
    with small_app.app_context():
        workflow_uuid = start('article', [record_json])
        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]

        assert obj.status == ObjectStatus.HALTED
        assert obj.data_type == "hep"

        # Files should have been attached (tarball + pdf)
        assert obj.files["1407.7587.pdf"]
        assert obj.files["1407.7587.tar.gz"]

        # A publication note should have been extracted
        pub_info = obj.data.get('publication_info')
        assert pub_info
        assert pub_info[0]
        assert pub_info[0].get('year') == "2014"
        assert pub_info[0].get('journal_title') == "J. Math. Phys."

        # This record should not have been touched yet
        assert "approved" not in obj.extra_data

        # Now let's resolve it as accepted and continue
        # FIXME Should be accept, but record validation prevents us.
        obj.remove_action()
        obj.extra_data["approved"] = True
        obj.extra_data["core"] = True
        obj.save()
        db.session.commit()

    with small_app.app_context():
        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]
        obj_id = obj.id
        obj.continue_workflow()

        obj = workflow_object_class.get(obj_id)
        # It was accepted
        assert obj.status == ObjectStatus.COMPLETED
def already_harvested_on_legacy_record():
    """Provide record fixture."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        os.path.join(
            'fixtures',
            'oai_arxiv_record_already_on_legacy.xml'
        )
    )
    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl"
    )
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    json_data = hep.do(record_marc)
    return json_data
def record():
    """Provide record fixture."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        os.path.join(
            'fixtures',
            'oai_arxiv_record_with_plots.xml'
        )
    )
    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl"
    )
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    json_data = hep.do(record_marc)
    if 'preprint_date' in json_data:
        json_data['preprint_date'] = datetime.date.today().isoformat()
    return json_data
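# Hedged sketch (assumption, not from the source): the tests above take a
# `record_oai_arxiv_plots` argument, so the provider functions above are
# presumably exposed as pytest fixtures along these lines (the fixture name
# and wiring here are hypothetical), e.g. in a conftest.py:
import pytest


@pytest.fixture()
def record_oai_arxiv_plots():
    """Provide the OAI arXiv record with plots to the harvesting tests."""
    return record()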
def spawn_arXiv_workflow_from_oai_harvest(request, records, name, **kwargs):
    """Receive a list of harvested arXiv records and schedule workflow."""
    from flask import current_app
    from invenio_workflows import start, workflows

    if request.endpoint != "http://export.arxiv.org/oai2":
        return

    workflow = "arxiv_ingestion"
    if workflow not in workflows:
        current_app.logger.warning(
            "{0} not in available workflows. Skipping workflow {1}.".format(
                workflow, name
            )
        )
        return

    for record in records:
        recxml = six.text_type(record)
        marcxml = convert(recxml, "oaiarXiv2marcxml.xsl")
        record = create_record(marcxml)
        hep_record = hep.do(record)
        start.delay(workflow, data=[hep_record])
def test_harvesting_arxiv_workflow_rejected(
        mocked_api_request_beard_block, mocked_api_request_magpie,
        mocked_api_request_beard, mocked_download, app,
        record_oai_arxiv_plots):
    """Test a full harvesting workflow."""
    import mock

    from invenio_workflows import (
        start,
        WorkflowEngine,
        ObjectStatus,
        workflow_object_class,
    )
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl"
    )
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    record_json = hep.do(record_marc)

    extra_config = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
    }

    workflow_uuid = None
    with app.app_context():
        with mock.patch.dict(app.config, extra_config):
            workflow_uuid = start('article', [record_json])
            eng = WorkflowEngine.from_uuid(workflow_uuid)
            obj = eng.processed_objects[0]

            assert obj.status == ObjectStatus.HALTED
            assert obj.data_type == "hep"

            # Files should have been attached (tarball + pdf, and plots)
            assert obj.files["1407.7587.pdf"]
            assert obj.files["1407.7587.tar.gz"]
            assert len(obj.files) > 2

            # A publication note should have been extracted
            pub_info = obj.data.get('publication_info')
            assert pub_info
            assert pub_info[0]
            assert pub_info[0].get('year') == "2014"
            assert pub_info[0].get('journal_title') == "J. Math. Phys."

            # A prediction should have been made
            prediction = obj.extra_data.get("relevance_prediction")
            assert prediction
            assert prediction['decision'] == "Rejected"
            assert prediction['scores']['Rejected'] == 0.8358207729691823

            experiments_prediction = obj.extra_data.get("experiments_prediction")
            assert experiments_prediction
            assert experiments_prediction['experiments'] == [
                ['CMS', 0.7549515247344971]
            ]

            keywords_prediction = obj.extra_data.get("keywords_prediction")
            assert keywords_prediction
            assert {"label": "galaxy",
                    "score": 0.29424679279327393,
                    "accept": True} in keywords_prediction['keywords']

            # This record should not have been touched yet
            assert "approved" not in obj.extra_data

            # Now let's resolve it as rejected and continue
            # FIXME Should be accept, but record validation prevents us.
            obj.remove_action()
            obj.extra_data["approved"] = False
            # obj.extra_data["core"] = True
            obj.save()
            db.session.commit()

    with app.app_context():
        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]
        obj_id = obj.id
        obj.continue_workflow()

        obj = workflow_object_class.get(obj_id)
        # It was rejected
        assert obj.status == ObjectStatus.COMPLETED
def test_harvesting_arxiv_workflow_rejected(mocked_api_request_beard_block,
                                            mocked_api_request_magpie,
                                            mocked_api_request_beard,
                                            mocked_download, small_app,
                                            record_oai_arxiv_plots):
    """Test a full harvesting workflow."""
    import mock

    from invenio_workflows import (start, WorkflowEngine, ObjectStatus,
                                   workflow_object_class)
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(record_oai_arxiv_plots,
                                             "oaiarXiv2marcxml.xsl")
    record_marc = create_record(record_oai_arxiv_plots_marcxml)
    record_json = hep.do(record_marc)

    extra_config = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
    }

    workflow_uuid = None
    with small_app.app_context():
        with mock.patch.dict(small_app.config, extra_config):
            workflow_uuid = start('article', [record_json])
            eng = WorkflowEngine.from_uuid(workflow_uuid)
            obj = eng.processed_objects[0]

            assert obj.status == ObjectStatus.HALTED
            assert obj.data_type == "hep"

            # Files should have been attached (tarball + pdf, and plots)
            assert obj.files["1407.7587.pdf"]
            assert obj.files["1407.7587.tar.gz"]
            assert len(obj.files) > 2

            # A publication note should have been extracted
            pub_info = obj.data.get('publication_info')
            assert pub_info
            assert pub_info[0]
            assert pub_info[0].get('year') == 2014
            assert pub_info[0].get('journal_title') == "J. Math. Phys."

            # A prediction should have been made
            prediction = obj.extra_data.get("relevance_prediction")
            assert prediction
            assert prediction['decision'] == "Rejected"
            assert prediction['scores']['Rejected'] == 0.8358207729691823

            experiments_prediction = obj.extra_data.get("experiments_prediction")
            assert experiments_prediction
            assert experiments_prediction['experiments'] == [[
                'CMS', 0.7549515247344971
            ]]

            keywords_prediction = obj.extra_data.get("keywords_prediction")
            assert keywords_prediction
            assert {
                "label": "galaxy",
                "score": 0.29424679279327393,
                "accept": True
            } in keywords_prediction['keywords']

            # This record should not have been touched yet
            assert "approved" not in obj.extra_data

            # Now let's resolve it as rejected and continue
            # FIXME Should be accept, but record validation prevents us.
            obj.remove_action()
            obj.extra_data["approved"] = False
            # obj.extra_data["core"] = True
            obj.save()
            db.session.commit()

    with small_app.app_context():
        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]
        obj_id = obj.id
        obj.continue_workflow()

        obj = workflow_object_class.get(obj_id)
        # It was rejected
        assert obj.status == ObjectStatus.COMPLETED