def _exists_in_holding_pen(obj, eng):
    """Look up the record's identifiers in the ``kb_name`` knowledge base.

    The identifiers are read from the record under ``identifier_key``.
    A truthy lookup result is logged on ``obj.log``; the lookup result is
    returned unchanged either way.
    """
    from inspire.utils.knowledge import get_value

    keys = get_record_from_obj(obj, eng).get(identifier_key, [])
    match = get_value(kb_name, keys)
    if not match:
        # Nothing found: hand back whatever falsy value the lookup gave.
        return match

    message = "Record already found in Holding Pen ({0})".format(match)
    obj.log.info(message)
    return match
def test_harvesting_workflow_accepted(self, search):
    """Test a full harvesting workflow where the halted record is accepted.

    The Holding Pen search is mocked to return nothing and the remote
    endpoints (matching server, arXiv tarball and PDF, robotupload) are
    stubbed with ``responses``. The halted object is then marked as
    approved and core, the workflow is continued, and the resulting record
    is expected to carry the ``CORE`` primary collection.
    """
    from invenio_base.globals import cfg
    from invenio_workflows.api import start
    from inspire.utils.helpers import get_record_from_obj

    # Mock Elasticsearch search for Holding Pen check
    search.return_value = []

    responses.add(
        responses.GET,
        cfg["WORKFLOWS_MATCH_REMOTE_SERVER_URL"],
        body="[]",
        status=200,
    )
    responses.add(
        responses.GET,
        "http://arxiv.org/e-print/1511.01097",
        content_type="application/x-eprint-tar",
        body=self.arxiv_tarball_accept.read(),
        status=200,
        adding_headers={"Content-Encoding": "x-gzip"},
    )
    responses.add(
        responses.GET,
        "http://arxiv.org/pdf/1511.01097",
        content_type="application/pdf",
        body=self.arxiv_pdf_accept.read(),
        status=200,
        stream=True,
    )

    robotupload_url = os.path.join(
        cfg.get("CFG_ROBOTUPLOAD_SUBMISSION_BASEURL"),
        "batchuploader/robotupload/insert"
    )
    responses.add(
        responses.POST,
        robotupload_url,
        body="[INFO] bibupload batchupload --insert /dummy/file/path\n",
        status=200,
    )

    workflow = start("harvesting_fixture",
                     data=[self.record_oai_arxiv_accept],
                     module_name="unit_tests")

    # Let's get the halted record
    obj = workflow.halted_objects[0]

    # Now let's resolve it as accepted and continue
    obj.remove_action()
    obj.extra_data["approved"] = True
    obj.extra_data["core"] = True
    obj.set_extra_data(obj.extra_data)
    obj.save()
    workflow = obj.continue_workflow()

    record = get_record_from_obj(obj, workflow)
    # Now it is CORE (assertIn reports the actual collections on failure)
    self.assertIn("CORE", record.get("collections.primary"))
def test_harvesting_workflow_rejected(self, search):
    """Test a full harvesting workflow where the record ends up rejected.

    The Holding Pen search is mocked to return nothing and the remote
    endpoints (matching server, arXiv tarball and PDF) are stubbed with
    ``responses``. The workflow is expected to complete without halting,
    leaving the object not approved and the record outside ``CORE``.
    """
    from invenio_base.globals import cfg
    from invenio_workflows.api import start
    from inspire.utils.helpers import get_record_from_obj

    # Mock Elasticsearch search for Holding Pen check
    search.return_value = []

    responses.add(
        responses.GET,
        cfg["WORKFLOWS_MATCH_REMOTE_SERVER_URL"],
        body="[]",
        status=200,
    )
    responses.add(
        responses.GET,
        "http://arxiv.org/e-print/1407.7587",
        content_type="application/x-eprint-tar",
        body=self.arxiv_tarball.read(),
        status=200,
        adding_headers={"Content-Encoding": "x-gzip"},
    )
    responses.add(
        responses.GET,
        "http://arxiv.org/pdf/1407.7587",
        content_type="application/pdf",
        body=self.arxiv_pdf.read(),
        status=200,
        stream=True,
    )

    workflow = start("harvesting_fixture",
                     data=[self.record_oai_arxiv_plots],
                     module_name="unit_tests")

    # Let's get the record metadata and check contents
    obj = workflow.completed_objects[0]
    record = get_record_from_obj(obj, workflow)

    # This record should be rejected
    self.assertFalse(obj.extra_data["approved"])

    # Files should have been attached (tarball + pdf)
    self.assertEqual(len(obj.data["files"]), 2)

    # Some plots/files should have been added to FFTs
    self.assertTrue(record.get("fft"))

    # A publication note should have been extracted
    self.assertTrue(record.get("publication_info"))

    # A prediction should have been made
    self.assertTrue(obj.get_tasks_results().get("arxiv_guessing"))

    # It is not CORE
    self.assertNotIn("CORE", record.get("collections.primary"))
def _update_old_object(obj, eng):
    """Overwrite the data of a previously stored object with this one's.

    The record's ``arxiv_eprints.value`` identifiers are looked up in the
    ``kb_name`` knowledge base. When that yields an object id and a
    ``BibWorkflowObject`` with that id exists, its data is replaced with
    ``obj.data`` and saved. Otherwise nothing happens.
    """
    from inspire.utils.knowledge import get_value
    from invenio_workflows.models import BibWorkflowObject

    record = get_record_from_obj(obj, eng)
    identifiers = record.get('arxiv_eprints.value', [])

    object_id = get_value(kb_name, identifiers)
    if not object_id:
        return

    old_object = BibWorkflowObject.query.get(object_id)
    if old_object:
        # record waiting approval
        old_object.set_data(obj.data)
        old_object.save()
def exists_in_holding_pen(obj, eng):
    """Check if a matching record already exists in the Holding Pen.

    For every ``field -> lookup`` pair in the ``HOLDING_PEN_MATCH_MAPPING``
    application config, each value found in the record under ``lookup``
    becomes an exact-match search term ``field:"value"``. The terms are
    OR-ed together and searched for in the Holding Pen.

    :param obj: object being processed; its own id is excluded from the
        result and any matches are stored in
        ``obj.extra_data["holdingpen_ids"]``.
    :param eng: forwarded to ``get_record_from_obj``.
    :return: set of ids of the other matching objects (empty set when there
        is nothing to match on or nothing was found).
    """
    from invenio_workflows.search import search as hp_search

    record = get_record_from_obj(obj, eng)

    # An unset mapping is treated like an empty one instead of crashing.
    mapping = current_app.config.get("HOLDING_PEN_MATCH_MAPPING") or {}

    identifiers = []
    for field, lookup in six.iteritems(mapping):
        # Add quotes around to make the search exact
        identifiers += ['{0}:"{1}"'.format(field, i)
                        for i in record.get(lookup, [])]

    if not identifiers:
        # Never search with an empty query: it carries no identifier
        # restriction, so unrelated objects could be reported as matches.
        return set()

    # Search for any existing record in Holding Pen, exclude self
    result = set(hp_search(query=" OR ".join(identifiers))) - {obj.id}
    if result:
        obj.log.info("Record already found in Holding Pen ({0})".format(
            result
        ))
        obj.extra_data["holdingpen_ids"] = list(result)
    return result
def test_harvesting_workflow_without_match(self):
    """Test a full harvesting workflow for a record with no remote match.

    Real network access is disabled and the remote endpoints (matching
    server, arXiv tarball and PDF, robotupload) are stubbed with
    ``httpretty``. The workflow is expected to halt with a record that is
    not yet ``CORE``; after the object is marked as approved and core and
    the workflow is continued, the record must be ``CORE``.
    """
    from invenio.base.globals import cfg
    from invenio_workflows.api import start
    from inspire.utils.helpers import (
        get_record_from_obj,
    )

    httpretty.HTTPretty.allow_net_connect = False

    httpretty.register_uri(
        httpretty.GET,
        cfg['WORKFLOWS_MATCH_REMOTE_SERVER_URL'],
        body='[]',
        status=200
    )
    httpretty.register_uri(
        httpretty.GET,
        'http://arxiv.org/e-print/1407.7587',
        content_type="application/x-eprint-tar",
        body=self.arxiv_tarball.read(),
        status=200,
        adding_headers={
            "Content-Encoding": 'x-gzip',
        }
    )
    httpretty.register_uri(
        httpretty.GET,
        'http://arxiv.org/pdf/1407.7587.pdf',
        content_type="application/pdf",
        body=self.arxiv_pdf.read(),
        status=200,
    )

    robotupload_url = os.path.join(
        cfg.get("CFG_ROBOTUPLOAD_SUBMISSION_BASEURL"),
        "batchuploader/robotupload/insert"
    )
    httpretty.register_uri(
        httpretty.POST,
        robotupload_url,
        body="[INFO] bibupload batchupload --insert /dummy/file/path\n",
        status=200,
    )

    workflow = start('harvesting_fixture',
                     data=[self.record_oai_arxiv_plots],
                     module_name='unit_tests')

    # Let's get the record metadata and check contents
    obj = workflow.halted_objects[0]
    record = get_record_from_obj(obj, workflow)

    # Files should have been attached (tarball + pdf)
    self.assertEqual(len(obj.data["files"]), 2)

    # Some plots/files should have been added to FFTs
    self.assertTrue(record.get('fft'))

    # A publication note should have been extracted
    self.assertTrue(record.get('publication_info'))

    # A prediction should have been made
    self.assertTrue(obj.get_tasks_results().get("arxiv_guessing"))

    record = get_record_from_obj(obj, workflow)

    # This one is not yet CORE
    self.assertNotIn("CORE", record.get("collections.primary"))

    # Now let's resolve it as accepted and continue
    obj.remove_action()
    obj.extra_data["approved"] = True
    obj.extra_data["core"] = True
    obj.set_extra_data(obj.extra_data)
    obj.save()
    workflow = obj.continue_workflow()

    record = get_record_from_obj(obj, workflow)
    # Now it is CORE
    self.assertIn("CORE", record.get("collections.primary"))
def arxiv_set_category_field(obj, eng):
    """Temporary measure to enable sorting by primary category.

    Stores the first ``report_number.arxiv_category`` value of the record
    in ``obj.uri``, or an empty string when the record has none.
    """
    record = get_record_from_obj(obj, eng)
    # ``or [""]`` also covers a present-but-empty (or None) lookup result,
    # which would otherwise make ``[0]`` raise instead of falling back to
    # the empty-string default.
    categories = record.get("report_number.arxiv_category", [""]) or [""]
    obj.uri = categories[0]
def _save_identifiers_to_kb(obj, eng):
    """Store the record's identifiers in the ``kb_name`` knowledge base.

    The identifiers are read from the record under ``identifier_key`` and
    passed to ``save_keys_to_kb`` together with ``obj.id``.
    """
    from inspire.utils.knowledge import save_keys_to_kb

    keys = get_record_from_obj(obj, eng).get(identifier_key, [])
    save_keys_to_kb(kb_name, keys, obj.id)
def arxiv_set_category_field(obj, eng):
    """Temporary measure to enable sorting by primary category.

    Stores the first ``arxiv_eprints.categories`` value of the record in
    ``obj.uri``, or an empty string when the record has none.
    """
    record = get_record_from_obj(obj, eng)
    # ``or [""]`` also covers a present-but-empty (or None) lookup result,
    # which would otherwise make ``[0]`` raise instead of falling back to
    # the empty-string default.
    categories = record.get("arxiv_eprints.categories", [""]) or [""]
    obj.uri = categories[0]