Esempio n. 1
0
    def _exists_in_holding_pen(obj, eng):
        """Look up the record's identifiers in the knowledge base.

        :param obj: workflow object; its record is obtained through
            ``get_record_from_obj`` and its ``log`` receives the notice
            written when the lookup gives a truthy result.
        :param eng: workflow engine, handed to ``get_record_from_obj``.
        :return: the value returned by ``get_value`` for the identifiers.
        """
        from inspire.utils.knowledge import get_value

        keys = get_record_from_obj(obj, eng).get(identifier_key, [])
        match = get_value(kb_name, keys)
        if not match:
            # Nothing to report; hand the falsy lookup result back unchanged.
            return match

        message = "Record already found in Holding Pen ({0})".format(match)
        obj.log.info(message)
        return match
Esempio n. 2
0
    def test_harvesting_workflow_accepted(self, search):
        """Test a full harvesting workflow where the record gets accepted.

        The workflow is expected to halt on the harvested record. The test
        then flags the halted object as approved and CORE, resumes the
        workflow and checks that the record lists the CORE collection.

        :param search: mock of the Holding Pen search (presumably injected
            by a patch decorator that is outside this view).
        """
        from invenio_base.globals import cfg
        from invenio_workflows.api import start
        from inspire.utils.helpers import get_record_from_obj

        # Mock Elasticsearch search for Holding Pen check
        search.return_value = []

        # The remote matching service answers with an empty result list.
        responses.add(responses.GET, cfg["WORKFLOWS_MATCH_REMOTE_SERVER_URL"], body="[]", status=200)

        responses.add(
            responses.GET,
            "http://arxiv.org/e-print/1511.01097",
            content_type="application/x-eprint-tar",
            body=self.arxiv_tarball_accept.read(),
            status=200,
            adding_headers={"Content-Encoding": "x-gzip"},
        )

        responses.add(
            responses.GET,
            "http://arxiv.org/pdf/1511.01097",
            content_type="application/pdf",
            body=self.arxiv_pdf_accept.read(),
            status=200,
            stream=True,
        )

        robotupload_url = os.path.join(
            cfg.get("CFG_ROBOTUPLOAD_SUBMISSION_BASEURL"), "batchuploader/robotupload/insert"
        )

        responses.add(
            responses.POST, robotupload_url, body="[INFO] bibupload batchupload --insert /dummy/file/path\n", status=200
        )
        workflow = start("harvesting_fixture", data=[self.record_oai_arxiv_accept], module_name="unit_tests")

        # Let's get the halted record
        obj = workflow.halted_objects[0]

        # Now let's resolve it as accepted and continue
        obj.remove_action()
        obj.extra_data["approved"] = True
        obj.extra_data["core"] = True
        obj.set_extra_data(obj.extra_data)
        obj.save()
        workflow = obj.continue_workflow()

        record = get_record_from_obj(obj, workflow)
        # Now it is CORE (assertIn reports the actual collections on failure)
        self.assertIn("CORE", record.get("collections.primary"))
Esempio n. 3
0
    def test_harvesting_workflow_rejected(self, search):
        """Test a full harvesting workflow where the record gets rejected.

        The workflow is expected to complete without halting. The completed
        object must not be approved, must carry the two downloaded files,
        FFTs, publication info and an ``arxiv_guessing`` task result, and
        its record must not list the CORE collection.

        :param search: mock of the Holding Pen search (presumably injected
            by a patch decorator that is outside this view).
        """
        from invenio_base.globals import cfg
        from invenio_workflows.api import start
        from inspire.utils.helpers import get_record_from_obj

        # Mock Elasticsearch search for Holding Pen check
        search.return_value = []

        # The remote matching service answers with an empty result list.
        responses.add(responses.GET, cfg["WORKFLOWS_MATCH_REMOTE_SERVER_URL"], body="[]", status=200)

        responses.add(
            responses.GET,
            "http://arxiv.org/e-print/1407.7587",
            content_type="application/x-eprint-tar",
            body=self.arxiv_tarball.read(),
            status=200,
            adding_headers={"Content-Encoding": "x-gzip"},
        )

        responses.add(
            responses.GET,
            "http://arxiv.org/pdf/1407.7587",
            content_type="application/pdf",
            body=self.arxiv_pdf.read(),
            status=200,
            stream=True,
        )

        workflow = start("harvesting_fixture", data=[self.record_oai_arxiv_plots], module_name="unit_tests")

        # Let's get the record metadata and check contents
        obj = workflow.completed_objects[0]
        record = get_record_from_obj(obj, workflow)

        # This record should be rejected
        self.assertFalse(obj.extra_data["approved"])

        # Files should have been attached (tarball + pdf)
        self.assertEqual(len(obj.data["files"]), 2)

        # Some plots/files should have been added to FFTs
        self.assertTrue(record.get("fft"))

        # A publication note should have been extracted
        self.assertTrue(record.get("publication_info"))

        # A prediction should have been made
        self.assertTrue(obj.get_tasks_results().get("arxiv_guessing"))

        # It is not CORE
        self.assertNotIn("CORE", record.get("collections.primary"))
Esempio n. 4
0
    def _update_old_object(obj, eng):
        """Copy this object's data onto the object already stored for it.

        The record's ``arxiv_eprints.value`` identifiers are looked up with
        ``get_value`` in the knowledge base named by ``kb_name``. When that
        yields an id and a ``BibWorkflowObject`` with that id exists, the
        existing object receives ``obj.data`` and is saved. Otherwise nothing
        is changed.

        :param obj: workflow object carrying the new data.
        :param eng: workflow engine, handed to ``get_record_from_obj``.
        """
        from inspire.utils.knowledge import get_value
        from invenio_workflows.models import BibWorkflowObject

        record = get_record_from_obj(obj, eng)
        identifiers = record.get('arxiv_eprints.value', [])

        object_id = get_value(kb_name, identifiers)
        if not object_id:
            # No previous object registered for these identifiers.
            return

        old_object = BibWorkflowObject.query.get(object_id)
        if old_object:
            # record waiting approval
            old_object.set_data(obj.data)
            old_object.save()
Esempio n. 5
0
def exists_in_holding_pen(obj, eng):
    """Check if a record already exists in the Holding Pen.

    For every ``field -> lookup`` pair of the ``HOLDING_PEN_MATCH_MAPPING``
    application setting, the values found in the record under ``lookup``
    are turned into exact-match ``field:"value"`` clauses. The clauses are
    OR-ed together and sent to the Holding Pen search; the id of ``obj``
    itself is removed from the hits.

    The matching ids are also stored, as a list, in
    ``obj.extra_data["holdingpen_ids"]``.

    :param obj: workflow object being checked.
    :param eng: workflow engine, handed to ``get_record_from_obj``.
    :return: set of ids of other matching objects (empty when none).
    """
    from invenio_workflows.search import search as hp_search
    record = get_record_from_obj(obj, eng)

    # A missing setting means there is nothing to match on, not a crash.
    mapping = current_app.config.get("HOLDING_PEN_MATCH_MAPPING") or {}

    identifiers = []
    for field, lookup in six.iteritems(mapping):
        # Add quotes around to make the search exact
        identifiers.extend(
            '{0}:"{1}"'.format(field, value)
            for value in record.get(lookup, [])
        )

    if identifiers:
        # Search for any existing record in Holding Pen, exclude self
        result = set(hp_search(query=" OR ".join(identifiers))) - {obj.id}
    else:
        # Without identifiers the query string would be empty, which is not
        # an identifier lookup at all; report "no match" instead of searching.
        result = set()

    if result:
        obj.log.info("Record already found in Holding Pen ({0})".format(
            result
        ))
    obj.extra_data["holdingpen_ids"] = list(result)
    return result
Esempio n. 6
0
    def test_harvesting_workflow_without_match(self):
        """Test a full harvesting workflow when the match server finds nothing.

        The mocked matching service returns an empty list, so the workflow
        is expected to halt on the record. The test checks the attached
        files, FFTs, publication info and ``arxiv_guessing`` result, verifies
        the record is not CORE yet, then accepts it as CORE, resumes the
        workflow and verifies the record now lists the CORE collection.
        """
        from invenio.base.globals import cfg
        from invenio_workflows.api import start
        from inspire.utils.helpers import (
            get_record_from_obj,
        )

        # Any request that is not registered below must fail instead of
        # reaching the network.
        httpretty.HTTPretty.allow_net_connect = False

        httpretty.register_uri(
            httpretty.GET,
            cfg['WORKFLOWS_MATCH_REMOTE_SERVER_URL'],
            body='[]',
            status=200
        )

        httpretty.register_uri(
            httpretty.GET,
            'http://arxiv.org/e-print/1407.7587',
            content_type="application/x-eprint-tar",
            body=self.arxiv_tarball.read(),
            status=200,
            adding_headers={
                "Content-Encoding": 'x-gzip',
            }
        )

        httpretty.register_uri(
            httpretty.GET,
            'http://arxiv.org/pdf/1407.7587.pdf',
            content_type="application/pdf",
            body=self.arxiv_pdf.read(),
            status=200,
        )

        robotupload_url = os.path.join(
            cfg.get("CFG_ROBOTUPLOAD_SUBMISSION_BASEURL"),
            "batchuploader/robotupload/insert"
        )

        httpretty.register_uri(
            httpretty.POST,
            robotupload_url,
            body="[INFO] bibupload batchupload --insert /dummy/file/path\n",
            status=200,
        )
        workflow = start('harvesting_fixture',
                         data=[self.record_oai_arxiv_plots],
                         module_name='unit_tests')

        # Let's get the record metadata and check contents
        obj = workflow.halted_objects[0]
        record = get_record_from_obj(obj, workflow)

        # Files should have been attached (tarball + pdf)
        self.assertEqual(len(obj.data["files"]), 2)

        # Some plots/files should have been added to FFTs
        self.assertTrue(record.get('fft'))

        # A publication note should have been extracted
        self.assertTrue(record.get('publication_info'))

        # A prediction should have been made
        self.assertTrue(obj.get_tasks_results().get("arxiv_guessing"))

        record = get_record_from_obj(obj, workflow)

        # This one is not yet CORE
        self.assertNotIn("CORE", record.get("collections.primary"))

        # Now let's resolve it as accepted and continue
        obj.remove_action()
        obj.extra_data["approved"] = True
        obj.extra_data["core"] = True
        obj.set_extra_data(obj.extra_data)
        obj.save()
        workflow = obj.continue_workflow()

        record = get_record_from_obj(obj, workflow)

        # Now it is CORE
        self.assertIn("CORE", record.get("collections.primary"))
Esempio n. 7
0
def arxiv_set_category_field(obj, eng):
    """Temporary measure to enable sorting by primary category.

    Stores the first ``report_number.arxiv_category`` value of the record
    in ``obj.uri``; an empty string is stored when the record has none.

    :param obj: workflow object whose ``uri`` attribute is set.
    :param eng: workflow engine, handed to ``get_record_from_obj``.
    """
    record = get_record_from_obj(obj, eng)
    # The ``[""]`` default only covers a missing key; ``or`` also covers a
    # field that is present but empty, where ``[0]`` would raise IndexError.
    categories = record.get("report_number.arxiv_category", [""]) or [""]
    obj.uri = categories[0]
Esempio n. 8
0
    def _save_identifiers_to_kb(obj, eng):
        """Hand the record's identifiers to ``save_keys_to_kb``.

        The identifiers are read from the record under ``identifier_key``
        (an empty list when absent) and passed on together with the
        knowledge base name ``kb_name`` and ``obj.id``.

        :param obj: workflow object providing the record and the id.
        :param eng: workflow engine, handed to ``get_record_from_obj``.
        """
        from inspire.utils.knowledge import save_keys_to_kb

        keys = get_record_from_obj(obj, eng).get(identifier_key, [])
        save_keys_to_kb(kb_name, keys, obj.id)
Esempio n. 9
0
def arxiv_set_category_field(obj, eng):
    """Temporary measure to enable sorting by primary category.

    Stores the first ``arxiv_eprints.categories`` value of the record in
    ``obj.uri``; an empty string is stored when the record has none.

    :param obj: workflow object whose ``uri`` attribute is set.
    :param eng: workflow engine, handed to ``get_record_from_obj``.
    """
    record = get_record_from_obj(obj, eng)
    # The ``[""]`` default only covers a missing key; ``or`` also covers a
    # field that is present but empty, where ``[0]`` would raise IndexError.
    categories = record.get("arxiv_eprints.categories", [""]) or [""]
    obj.uri = categories[0]