def test_match_in_holdingpen_previously_rejected_wf_stop(
    mocked_download_arxiv,
    mocked_api_request_beard,
    mocked_api_request_magpie,
    mocked_package_download,
    workflow_app,
    mocked_external_services,
):
    """Check that a resubmission is flagged as matching a rejected workflow.

    A first ``article`` workflow is started, marked as not approved and
    continued until completion.  The same record (with another title) is then
    submitted again and must list the first workflow object as a previously
    rejected match.
    """
    record = generate_record()

    # First submission: mark it as rejected and let the workflow finish.
    first_uuid = start('article', [record])
    rejected_id = WorkflowEngine.from_uuid(first_uuid).objects[0].id
    rejected = workflow_object_class.get(rejected_id)
    rejected.extra_data["approved"] = False
    rejected.continue_workflow()

    rejected = workflow_object_class.get(rejected_id)
    assert rejected.status == ObjectStatus.COMPLETED
    assert rejected.extra_data.get('approved') is False

    # Refresh the index before the second submission is started.
    es.indices.refresh('holdingpen-hep')

    # Second submission: same record, different title.
    record['titles'][0]['title'] = (
        'This is an update that will match the wf in the holdingpen'
    )
    second_uuid = start('article', [record])
    resubmitted = WorkflowEngine.from_uuid(second_uuid).objects[0]

    flags = resubmitted.extra_data
    assert flags['already-in-holding-pen'] is False
    assert flags['previously_rejected'] is True
    assert flags['previously_rejected_matches'] == [rejected_id]
Esempio n. 2
0
def test_merge_with_conflicts_callback_url(
    mocked_api_request_magpie,
    mocked_beard_api,
    workflow_app,
    mocked_external_services,
    disable_file_upload,
    enable_merge_on_update,
):
    """Check that PUTting unresolved conflicts to the callback keeps the halt.

    The workflow built from ``RECORD_WITH_CONFLICTS`` must halt with exactly
    one conflict and expose the merge-conflict callback URL.  Sending the
    object back unchanged must answer 200 with a "saved with conflicts"
    message, leave the workflow halted and leave no stored ``arxiv`` root.
    """
    filters_target = (
        'inspire_json_merger.config.ArxivOnArxivOperations.conflict_filters'
    )
    with patch(filters_target, ['acquisition_source.source']):
        factory = TestRecordMetadata.create_from_file(
            __name__, 'merge_record_arxiv.json', index_name='records-hep')

        wf_id = build_workflow(RECORD_WITH_CONFLICTS).id
        eng_uuid = start('article', object_id=wf_id)
        halted = WorkflowEngine.from_uuid(eng_uuid).objects[0]

        conflicts = halted.extra_data.get('conflicts')
        expected_url = 'http://localhost:5000/callback/workflows/resolve_merge_conflicts'

        assert halted.status == ObjectStatus.HALTED
        assert expected_url == halted.extra_data.get('callback_url')
        assert len(conflicts) == 1

        assert halted.extra_data.get('is-update') is True
        assert halted.extra_data['merger_root'] == RECORD_WITH_CONFLICTS

        # Send the halted object back as-is, conflicts included.
        body = json.dumps({
            'id': halted.id,
            'metadata': halted.data,
            '_extra_data': halted.extra_data
        })
        with workflow_app.test_client() as client:
            response = client.put(
                halted.extra_data.get('callback_url'),
                data=body,
                content_type='application/json',
            )

        answer = json.loads(response.get_data())
        expected_message = 'Workflow {} has been saved with conflicts.'.format(
            halted.id)

        assert response.status_code == 200
        assert expected_message == answer['message']

        # Reload the object: it has to be still waiting for a resolution.
        reloaded = WorkflowEngine.from_uuid(eng_uuid).objects[0]
        assert reloaded.status == ObjectStatus.HALTED

        stored_root = read_wf_record_source(
            factory.record_metadata.id, 'arxiv')
        assert stored_root is None
def test_harvesting_arxiv_workflow_manual_accepted(
    mocked_refextract_extract_refs,
    mocked_matching_match,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_download_utils,
    mocked_download_arxiv,
    mocked_package_download,
    workflow_app,
    mocked_external_services,
):
    """Test a full harvesting workflow.

    The halted workflow is accepted as core, then the robotupload and webcoll
    callbacks are triggered; the object must be WAITING after each of the
    first two steps and COMPLETED and approved at the end.
    """
    # The docstring has to be the first statement of the function, otherwise
    # it is only a discarded string expression.
    record = generate_record()

    workflow_uuid, eng, obj = get_halted_workflow(app=workflow_app, record=record)

    do_accept_core(app=workflow_app, workflow_id=obj.id)

    eng = WorkflowEngine.from_uuid(workflow_uuid)
    obj = eng.processed_objects[0]
    assert obj.status == ObjectStatus.WAITING

    do_robotupload_callback(app=workflow_app, workflow_id=obj.id, recids=[12345])

    obj = workflow_object_class.get(obj.id)
    assert obj.status == ObjectStatus.WAITING

    do_webcoll_callback(app=workflow_app, recids=[12345])

    eng = WorkflowEngine.from_uuid(workflow_uuid)
    obj = eng.processed_objects[0]
    # It was accepted
    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data["approved"] is True
Esempio n. 4
0
def test_harvesting_arxiv_workflow_manual_accepted(
    mocked_refextract_extract_refs,
    mocked_matching_match,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_download_utils,
    mocked_download_arxiv,
    mocked_package_download,
    workflow_app,
    mocked_external_services,
):
    """Test a full harvesting workflow.

    The halted workflow is accepted as core and must be WAITING; after the
    robotupload callback the object must be COMPLETED and approved.
    """
    # The docstring has to be the first statement of the function, otherwise
    # it is only a discarded string expression.
    record = generate_record()

    workflow_uuid, eng, obj = get_halted_workflow(app=workflow_app, record=record)

    do_accept_core(app=workflow_app, workflow_id=obj.id)

    eng = WorkflowEngine.from_uuid(workflow_uuid)
    obj = eng.processed_objects[0]
    assert obj.status == ObjectStatus.WAITING

    do_robotupload_callback(app=workflow_app, workflow_id=obj.id, recids=[12345])

    eng = WorkflowEngine.from_uuid(workflow_uuid)
    obj = eng.processed_objects[0]
    # It was accepted
    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data["approved"] is True
Esempio n. 5
0
def test_match_in_holdingpen_previously_rejected_wf_stop(
    mocked_download_arxiv,
    mocked_api_request_beard,
    mocked_api_request_magpie,
    workflow_app,
    mocked_external_services,
):
    """Check that a resubmission is flagged as matching a rejected workflow.

    The first ``article`` workflow is marked as not approved and completed;
    the second one, started from the same record with a new title, must
    report the first object id in ``previously_rejected_matches``.
    """
    record = generate_record()

    # Run the record once and reject it.
    initial_uuid = start('article', [record])
    rejected_id = WorkflowEngine.from_uuid(initial_uuid).objects[0].id
    rejected = workflow_object_class.get(rejected_id)
    rejected.extra_data["approved"] = False
    rejected.continue_workflow()

    rejected = workflow_object_class.get(rejected_id)
    assert rejected.status == ObjectStatus.COMPLETED
    assert rejected.extra_data.get('approved') is False

    es.indices.refresh('holdingpen-hep')

    # Submit the same record again under a different title.
    new_title = 'This is an update that will match the wf in the holdingpen'
    record['titles'][0]['title'] = new_title
    followup_uuid = start('article', [record])
    followup = WorkflowEngine.from_uuid(followup_uuid).objects[0]

    assert followup.extra_data['already-in-holding-pen'] is False
    assert followup.extra_data['previously_rejected'] is True
    assert followup.extra_data['previously_rejected_matches'] == [rejected_id]
Esempio n. 6
0
def test_match_in_holdingpen_different_sources_continues(
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_package_download,
    mocked_is_pdf_link,
    mocked_download_arxiv,
    workflow_app,
    mocked_external_services,
):
    """Check that a holdingpen match from another source is not stopped.

    A first workflow is left halted.  A second one is built from the same
    record with a different title and acquisition source; it must report the
    first as a holdingpen match without being flagged as stopped.
    """
    record = generate_record()

    # First workflow: has to end up halted, i.e. pending in the holdingpen.
    pending_uuid = start('article', object_id=build_workflow(record).id)
    es.indices.refresh('holdingpen-hep')
    wf_to_match = WorkflowEngine.from_uuid(pending_uuid).objects[0].id
    pending = workflow_object_class.get(wf_to_match)
    assert pending.status == ObjectStatus.HALTED

    # Second workflow: same record, new title, different source.
    new_title = 'This is an update that will match the wf in the holdingpen'
    record['titles'][0]['title'] = new_title
    record['acquisition_source']['source'] = 'but not the source'
    second_uuid = start('article', object_id=build_workflow(record).id)
    second = WorkflowEngine.from_uuid(second_uuid).objects[0]

    outcome = second.extra_data
    assert outcome['already-in-holding-pen'] is True
    assert outcome['holdingpen_matches'] == [wf_to_match]
    assert outcome['previously_rejected'] is False
    assert not outcome.get('stopped-matched-holdingpen-wf')
def test_halt(app, halt_workflow, halt_workflow_conditional):
    """Test halt task.

    Runs the ``halttest`` and ``halttestcond`` workflows: each must leave its
    object WAITING with the engine HALTED, and the object must be COMPLETED
    once ``continue_workflow`` has been called on it.
    """
    # Both workflows must be registered on the extension.
    assert 'halttest' in app.extensions['invenio-workflows'].workflows
    assert 'halttestcond' in app.extensions['invenio-workflows'].workflows

    with app.app_context():
        data = [{'foo': 'bar'}]

        eng_uuid = start('halttest', data)

        eng = WorkflowEngine.from_uuid(eng_uuid)
        obj = eng.processed_objects[0]

        assert obj.known_statuses.WAITING == obj.status
        assert WorkflowStatus.HALTED == eng.status

        # Keep the id so the object can be fetched again after continuing.
        obj_id = obj.id
        obj.continue_workflow()

        obj = WorkflowObject.get(obj_id)
        assert obj.known_statuses.COMPLETED == obj.status

        # Check conditional workflows and pass data not as a list (to check).
        eng_uuid = start('halttestcond', data[0])
        eng = WorkflowEngine.from_uuid(eng_uuid)
        obj = eng.processed_objects[0]

        assert obj.known_statuses.WAITING == obj.status
        assert WorkflowStatus.HALTED == eng.status

        obj_id = obj.id
        obj.continue_workflow()

        obj = WorkflowObject.get(obj_id)
        assert obj.known_statuses.COMPLETED == obj.status
Esempio n. 8
0
def test_match_in_holdingpen_previously_rejected_wf_stop(
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_package_download,
    mocked_is_pdf_link,
    mocked_download_arxiv,
    workflow_app,
    mocked_external_services,
):
    """Check that a resubmission is flagged as matching a rejected workflow.

    Both workflows are created with ``build_workflow`` and started by object
    id.  The first is marked as not approved and completed; the second must
    list it in ``previously_rejected_matches``.
    """
    record = generate_record()

    # Run the record once and reject it.
    first_uuid = start("article", object_id=build_workflow(record).id)
    rejected_id = WorkflowEngine.from_uuid(first_uuid).objects[0].id
    rejected = workflow_object_class.get(rejected_id)
    rejected.extra_data["approved"] = False
    rejected.continue_workflow()

    rejected = workflow_object_class.get(rejected_id)
    assert rejected.status == ObjectStatus.COMPLETED
    assert rejected.extra_data.get("approved") is False

    es.indices.refresh("holdingpen-hep")

    # Build a second workflow from the same record with another title.
    new_title = "This is an update that will match the wf in the holdingpen"
    record["titles"][0]["title"] = new_title
    second_uuid = start("article", object_id=build_workflow(record).id)
    second = WorkflowEngine.from_uuid(second_uuid).objects[0]

    assert second.extra_data["previously_rejected"] is True
    assert second.extra_data["previously_rejected_matches"] == [rejected_id]
def test_match_in_holdingpen_different_sources_continues(
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_package_download,
    mocked_is_pdf_link,
    mocked_download_arxiv,
    workflow_app,
    mocked_external_services,
):
    """Check that a holdingpen match from another source is not stopped.

    The first workflow must be halted.  The second, built from the same
    record with a changed title and acquisition source, must name the first
    in ``holdingpen_matches`` and carry no ``stopped-matched-holdingpen-wf``
    flag.
    """
    record = generate_record()

    # Workflow that stays pending in the holdingpen.
    halted_uuid = start('article', object_id=build_workflow(record).id)
    es.indices.refresh('holdingpen-hep')
    wf_to_match = WorkflowEngine.from_uuid(halted_uuid).objects[0].id
    halted = workflow_object_class.get(wf_to_match)
    assert halted.status == ObjectStatus.HALTED

    # Same record again, but with another title and another source.
    record['titles'][0]['title'] = (
        'This is an update that will match the wf in the holdingpen'
    )
    record['acquisition_source']['source'] = 'but not the source'
    update_uuid = start('article', object_id=build_workflow(record).id)
    update = WorkflowEngine.from_uuid(update_uuid).objects[0]

    assert update.extra_data['already-in-holding-pen'] is True
    assert update.extra_data['holdingpen_matches'] == [wf_to_match]
    assert update.extra_data['previously_rejected'] is False
    assert not update.extra_data.get('stopped-matched-holdingpen-wf')
Esempio n. 10
0
def test_harvesting_arxiv_workflow_accepted(
    mocked, db_only_app, record_oai_arxiv_plots):
    """Run an arXiv harvest up to the halt, accept it and let it complete.

    The OAI fixture is converted to HEP JSON and fed to the ``article``
    workflow, which must halt with files and publication info attached.  The
    object is then marked approved/core by hand and continued to completion.
    """
    from invenio_workflows import (
        start, WorkflowEngine, ObjectStatus, workflow_object_class
    )
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # OAI arXiv XML -> MARCXML -> dict -> HEP JSON
    marcxml = convert(record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    hep_json = hep.do(create_record(marcxml))

    with db_only_app.app_context():
        workflow_uuid = start('article', [hep_json])
        halted = WorkflowEngine.from_uuid(workflow_uuid).processed_objects[0]

        assert halted.status == ObjectStatus.HALTED
        assert halted.data_type == "hep"

        # Both the pdf and the tarball have to be attached.
        assert halted.files["1407.7587.pdf"]
        assert halted.files["1407.7587.tar.gz"]

        # The publication note must have been extracted.
        pub_info = halted.data.get('publication_info')
        assert pub_info
        assert pub_info[0]
        first_note = pub_info[0]
        assert first_note.get('year') == "2014"
        assert first_note.get('journal_title') == "J. Math. Phys."

        # No decision has been recorded so far.
        assert "approved" not in halted.extra_data

        # Accept by hand.
        # FIXME Should be accept, but record validation prevents us.
        halted.remove_action()
        halted.extra_data["approved"] = True
        halted.extra_data["core"] = True
        halted.save()

        db.session.commit()

    with db_only_app.app_context():
        resumed = WorkflowEngine.from_uuid(workflow_uuid).processed_objects[0]
        resumed_id = resumed.id
        resumed.continue_workflow()

        # Accepted: the workflow ran to the end.
        final = workflow_object_class.get(resumed_id)
        assert final.status == ObjectStatus.COMPLETED
Esempio n. 11
0
def test_halt(app, halt_workflow, halt_workflow_conditional):
    """Test halt task.

    Runs the ``halttest`` and ``halttestcond`` workflows: each must leave its
    object WAITING with the engine HALTED, and the object must be COMPLETED
    once ``continue_workflow`` has been called on it.
    """
    # Both workflows must be registered on the extension.
    assert 'halttest' in app.extensions['invenio-workflows'].workflows
    assert 'halttestcond' in app.extensions['invenio-workflows'].workflows

    with app.app_context():
        data = [{'foo': 'bar'}]

        eng_uuid = start('halttest', data)

        eng = WorkflowEngine.from_uuid(eng_uuid)
        obj = eng.processed_objects[0]

        assert obj.known_statuses.WAITING == obj.status
        assert WorkflowStatus.HALTED == eng.status

        # Keep the id so the object can be fetched again after continuing.
        obj_id = obj.id
        obj.continue_workflow()

        obj = WorkflowObject.get(obj_id)
        assert obj.known_statuses.COMPLETED == obj.status

        # Check conditional workflows and pass data not as a list (to check).
        eng_uuid = start('halttestcond', data[0])
        eng = WorkflowEngine.from_uuid(eng_uuid)
        obj = eng.processed_objects[0]

        assert obj.known_statuses.WAITING == obj.status
        assert WorkflowStatus.HALTED == eng.status

        obj_id = obj.id
        obj.continue_workflow()

        obj = WorkflowObject.get(obj_id)
        assert obj.known_statuses.COMPLETED == obj.status
Esempio n. 12
0
def test_merge_callback_url_with_malformed_workflow(
        mocked_api_request_magpie,
        mocked_beard_api,
        workflow_app,
        mocked_external_services,
        disable_file_upload,
        enable_merge_on_update,
):
    """Check that a malformed callback payload is refused with a 400.

    After the workflow halts on one conflict, a payload whose ``metadata``
    and ``_extra_data`` are plain strings is PUT to the callback URL.  The
    answer must be a 400 with the "malformed" message, and the workflow must
    keep its halted state, callback URL, conflicts and merger root.
    """
    filters_target = (
        'inspire_json_merger.config.ArxivOnArxivOperations.conflict_filters'
    )
    with patch(filters_target, ['acquisition_source.source']):
        factory = TestRecordMetadata.create_from_file(
            __name__, 'merge_record_arxiv.json', index_name='records-hep')

        wf_id = build_workflow(RECORD_WITH_CONFLICTS).id
        eng_uuid = start('article', object_id=wf_id)
        halted = WorkflowEngine.from_uuid(eng_uuid).objects[0]

        conflicts = halted.extra_data.get('conflicts')
        expected_url = 'http://localhost:5000/callback/workflows/resolve_merge_conflicts'

        assert halted.status == ObjectStatus.HALTED
        assert expected_url == halted.extra_data.get('callback_url')
        assert len(conflicts) == 1

        assert halted.extra_data.get('is-update') is True
        assert halted.extra_data['merger_root'] == RECORD_WITH_CONFLICTS

        # Strings where the test sends workflow data in the valid case.
        body = json.dumps({
            'id': halted.id,
            'metadata': 'Jessica Jones',
            '_extra_data': 'Frank Castle'
        })
        with workflow_app.test_client() as client:
            response = client.put(
                halted.extra_data.get('callback_url'),
                data=body,
                content_type='application/json',
            )

        answer = json.loads(response.get_data())
        expected_message = 'The workflow request is malformed.'

        assert response.status_code == 400
        assert expected_message == answer['message']

        # The stored workflow must still hold its merge state.
        reloaded = WorkflowEngine.from_uuid(eng_uuid).objects[0]
        assert reloaded.status == ObjectStatus.HALTED
        assert reloaded.extra_data.get('callback_url') is not None
        assert reloaded.extra_data.get('conflicts') is not None
        assert reloaded.extra_data['merger_root'] is not None

        stored_root = read_wf_record_source(factory.record_metadata.id, 'arxiv')
        assert stored_root is None
Esempio n. 13
0
def test_harvesting_arxiv_workflow_manual_accepted(
    mocked_refextract_extract_refs,
    mocked_matching_search,
    mocked_api_request_beard_block,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_download,
    workflow_app,
    record,
):
    """Accept a halted harvest by hand and follow it through both callbacks.

    Requests to indexer/localhost URLs go through for real while POSTs to
    ``localhost:1234`` get a canned 200 answer.  The object must be WAITING
    after the accept and after the robotupload callback, and COMPLETED after
    the webcoll callback.
    """
    with requests_mock.Mocker() as http_mock:
        http_mock.register_uri(
            requests_mock.ANY,
            re.compile('.*(indexer|localhost).*'),
            real_http=True,
        )
        http_mock.register_uri(
            'POST',
            re.compile('https?://localhost:1234.*'),
            text=u'[INFO]',
            status_code=200,
        )

        workflow_uuid, eng, obj = get_halted_workflow(
            app=workflow_app,
            extra_config={'PRODUCTION_MODE': False},
            record=record,
        )

        _do_accept_core(app=workflow_app, workflow_id=obj.id)

        accepted = WorkflowEngine.from_uuid(workflow_uuid).processed_objects[0]
        assert accepted.status == ObjectStatus.WAITING

        upload_response = _do_robotupload_callback(
            app=workflow_app,
            workflow_id=accepted.id,
            recids=[12345],
        )
        assert upload_response.status_code == 200

        # Still waiting after the robotupload callback.
        uploaded = workflow_object_class.get(accepted.id)
        assert uploaded.status == ObjectStatus.WAITING

        webcoll_response = _do_webcoll_callback(
            app=workflow_app, recids=[12345])
        assert webcoll_response.status_code == 200

        # Accepted: the workflow ran to the end.
        final = WorkflowEngine.from_uuid(workflow_uuid).processed_objects[0]
        assert final.status == ObjectStatus.COMPLETED
def test_harvesting_arxiv_workflow_accepted(mocked, small_app,
                                            record_oai_arxiv_plots):
    """Run an arXiv harvest up to the halt, accept it and let it complete.

    The OAI fixture is converted to HEP JSON and fed to the ``article``
    workflow, which must halt with files and publication info attached.  The
    object is then marked approved/core by hand and continued to completion.
    """
    from invenio_workflows import (start, WorkflowEngine, ObjectStatus,
                                   workflow_object_class)
    from dojson.contrib.marc21.utils import create_record
    from invenio_db import db
    from inspirehep.dojson.hep import hep
    from inspirehep.modules.converter.xslt import convert

    # OAI arXiv XML -> MARCXML -> dict -> HEP JSON
    marcxml = convert(record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    hep_json = hep.do(create_record(marcxml))

    with small_app.app_context():
        workflow_uuid = start('article', [hep_json])
        halted = WorkflowEngine.from_uuid(workflow_uuid).processed_objects[0]

        assert halted.status == ObjectStatus.HALTED
        assert halted.data_type == "hep"

        # Both the pdf and the tarball have to be attached.
        assert halted.files["1407.7587.pdf"]
        assert halted.files["1407.7587.tar.gz"]

        # The publication note must have been extracted.
        pub_info = halted.data.get('publication_info')
        assert pub_info
        assert pub_info[0]
        first_note = pub_info[0]
        assert first_note.get('year') == "2014"
        assert first_note.get('journal_title') == "J. Math. Phys."

        # No decision has been recorded so far.
        assert "approved" not in halted.extra_data

        # Accept by hand.
        # FIXME Should be accept, but record validation prevents us.
        halted.remove_action()
        halted.extra_data["approved"] = True
        halted.extra_data["core"] = True
        halted.save()

        db.session.commit()

    with small_app.app_context():
        resumed = WorkflowEngine.from_uuid(workflow_uuid).processed_objects[0]
        resumed_id = resumed.id
        resumed.continue_workflow()

        # Accepted: the workflow ran to the end.
        final = workflow_object_class.get(resumed_id)
        assert final.status == ObjectStatus.COMPLETED
Esempio n. 15
0
def test_harvesting_arxiv_workflow_manual_accepted(
    mocked_refextract_extract_refs,
    mocked_matching_search,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_download_utils,
    mocked_download_arxiv,
    workflow_app,
):
    """Test a full harvesting workflow.

    Requests to indexer/localhost URLs go through for real while POSTs to
    ``localhost:1234`` get a canned 200 answer.  The object must be WAITING
    after the accept and after the robotupload callback, and COMPLETED after
    the webcoll callback.
    """
    # The docstring has to be the first statement of the function, otherwise
    # it is only a discarded string expression.
    record = generate_record()
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            requests_mock.ANY,
            re.compile('.*(indexer|localhost).*'),
            real_http=True,
        )
        requests_mocker.register_uri(
            'POST',
            re.compile('https?://localhost:1234.*', ),
            text=u'[INFO]',
            status_code=200,
        )

        workflow_uuid, eng, obj = get_halted_workflow(
            app=workflow_app,
            extra_config={'PRODUCTION_MODE': False},
            record=record,
        )

        do_accept_core(
            app=workflow_app,
            workflow_id=obj.id,
        )

        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]
        assert obj.status == ObjectStatus.WAITING

        response = do_robotupload_callback(
            app=workflow_app,
            workflow_id=obj.id,
            recids=[12345],
        )
        assert response.status_code == 200

        obj = workflow_object_class.get(obj.id)
        assert obj.status == ObjectStatus.WAITING

        response = do_webcoll_callback(app=workflow_app, recids=[12345])
        assert response.status_code == 200

        eng = WorkflowEngine.from_uuid(workflow_uuid)
        obj = eng.processed_objects[0]
        # It was accepted
        assert obj.status == ObjectStatus.COMPLETED
Esempio n. 16
0
def test_merge_with_conflicts_callback_url_and_resolve(workflow_app,
                                                       enable_merge_on_update,
                                                       disable_file_upload):
    """Check that resolving the conflicts through the callback completes.

    The update record carries the DOIs of the stored record.  The workflow
    must halt with one conflict and a callback URL; after the conflicts are
    removed and the object is PUT back, it must be COMPLETED, approved and
    merged.
    """
    factory = TestRecordMetadata.create_from_file(__name__,
                                                  'record_for_merging.json')

    # Work on a copy: updating RECORD_WITH_CONFLICTS in place would change the
    # module-level constant for every test that runs afterwards.
    record_update = dict(
        RECORD_WITH_CONFLICTS,
        dois=factory.record_metadata.json.get('dois'),
    )

    eng_uuid = start('article', [record_update])

    eng = WorkflowEngine.from_uuid(eng_uuid)
    obj = eng.objects[0]

    conflicts = obj.extra_data.get('conflicts')

    expected_url = 'http://localhost:5000/callback/workflows/resolve_merge_conflicts'

    assert obj.status == ObjectStatus.HALTED
    assert expected_url == obj.extra_data.get('callback_url')
    assert len(conflicts) == 1
    assert obj.extra_data.get('is-update') is True

    # resolve conflicts
    obj.data['$schema'] = factory.record_metadata.json.get('$schema')
    del obj.extra_data['conflicts']

    payload = {
        'id': obj.id,
        'metadata': obj.data,
        '_extra_data': obj.extra_data
    }

    with workflow_app.test_client() as client:
        response = client.put(
            obj.extra_data.get('callback_url'),
            data=json.dumps(payload),
            content_type='application/json',
        )
    assert response.status_code == 200

    # Reload the object to see the state after the callback.
    eng = WorkflowEngine.from_uuid(eng_uuid)
    obj = eng.objects[0]

    conflicts = obj.extra_data.get('conflicts')

    assert obj.status == ObjectStatus.COMPLETED
    assert conflicts is None
    assert obj.extra_data.get('approved') is True
    assert obj.extra_data.get('is-update') is True
    assert obj.extra_data.get('merged') is True
Esempio n. 17
0
def test_merge_callback_url_with_malformed_workflow(workflow_app,
                                                    enable_merge_on_update,
                                                    disable_file_upload):
    """Check that a malformed callback payload is refused with a 400.

    The update record carries the DOIs of the stored record.  After the
    workflow halts on one conflict, a payload whose ``metadata`` and
    ``_extra_data`` are plain strings is PUT to the callback URL; the answer
    must be a 400 and the workflow must stay halted with its conflicts.
    """
    factory = TestRecordMetadata.create_from_file(__name__,
                                                  'record_for_merging.json')

    # Work on a copy: updating RECORD_WITH_CONFLICTS in place would change the
    # module-level constant for every test that runs afterwards.
    record_update = dict(
        RECORD_WITH_CONFLICTS,
        dois=factory.record_metadata.json.get('dois'),
    )

    eng_uuid = start('article', [record_update])

    eng = WorkflowEngine.from_uuid(eng_uuid)
    obj = eng.objects[0]

    conflicts = obj.extra_data.get('conflicts')

    expected_url = 'http://localhost:5000/callback/workflows/resolve_merge_conflicts'

    assert obj.status == ObjectStatus.HALTED
    assert expected_url == obj.extra_data.get('callback_url')
    assert len(conflicts) == 1
    assert obj.extra_data.get('is-update') is True

    # Strings where the resolving test sends the workflow data.
    payload = {
        'id': obj.id,
        'metadata': 'Jessica Jones',
        '_extra_data': 'Frank Castle'
    }

    with workflow_app.test_client() as client:
        response = client.put(
            obj.extra_data.get('callback_url'),
            data=json.dumps(payload),
            content_type='application/json',
        )

    data = json.loads(response.get_data())
    expected_message = 'The workflow request is malformed.'

    assert response.status_code == 400
    assert expected_message == data['message']

    # Reload the object: the refused request must not have changed it.
    eng = WorkflowEngine.from_uuid(eng_uuid)
    obj = eng.objects[0]

    assert obj.status == ObjectStatus.HALTED
    assert obj.extra_data.get('callback_url') is not None
    assert obj.extra_data.get('conflicts') is not None
Esempio n. 18
0
def transform_example_file(obj, eng: WorkflowEngine):
    """Write a title-cased copy of the file at ``obj.data`` to a scratch file.

    The file whose path is stored in ``obj.data`` is read, its text is passed
    through ``str.title()`` and written to a new file obtained from
    ``obj.scratch.create_file``.  ``obj.data`` is then pointed at that file.

    :param obj: workflow object; ``obj.data`` holds the input file path.
    :param eng: workflow engine, used to abort when the input is unreadable.
    :return: the same ``obj``, with ``obj.data`` set to the output path.
    """
    input_data = ''
    try:
        # Named ``source_file`` so the ``input`` builtin is not shadowed.
        with open(obj.data, 'r') as source_file:
            input_data = source_file.read()
    except OSError:
        # NOTE(review): presumably ``abort`` raises and nothing below runs;
        # if it returns, an empty output file is still written - confirm.
        eng.abort()  # Cannot read input data, abort workflow execution

    output = obj.scratch.create_file(task_name='example_output')
    with open(output, 'w') as tf:
        tf.write(input_data.title())

    obj.data = output
    return obj
Esempio n. 19
0
def test_restart(app, restart_workflow):
    """Test restart task.

    Starts the ``restarttest`` workflow, restarts it while halted, resumes
    the object until completion and finally restarts it once more.
    """
    assert 'restarttest' in app.extensions['invenio-workflows'].workflows

    with app.app_context():
        data = {}

        eng_uuid = start('restarttest', data)

        eng = WorkflowEngine.from_uuid(eng_uuid)
        obj = eng.processed_objects[0]

        # The first run halts with an action and a message set on the object.
        assert obj.known_statuses.HALTED == obj.status
        assert WorkflowStatus.HALTED == eng.status
        assert obj.data == {"title": "foo"}
        assert obj.get_action() == "foo"
        assert obj.get_action_message() == "Test"

        # Restart shall have no effect (still halted)
        new_eng_uuid = restart(eng_uuid)

        # The restart reuses the same engine uuid.
        assert new_eng_uuid == eng_uuid

        eng = WorkflowEngine.from_uuid(eng_uuid)
        obj = eng.processed_objects[0]

        # Statuses are unchanged, but the object data now differs.
        assert obj.known_statuses.HALTED == obj.status
        assert WorkflowStatus.HALTED == eng.status
        assert obj.data == {"title": {"value": "bar"}}
        assert obj.get_action() == "foo"

        obj.remove_action()
        assert obj.get_action() is None

        obj_id = obj.id

        # Now it should resume the next task
        resume(obj_id)

        obj = WorkflowObject.get(obj_id)
        assert obj.known_statuses.COMPLETED == obj.status
        assert obj.extra_data.get('test') == 'test'
        assert obj.data.get('title').get('source') == 'TEST'

        # We restart the object again
        restart(obj.workflow.uuid, data=obj)
        # These checks reuse ``obj`` as passed to restart() and the ``eng``
        # loaded before resume(); neither is fetched again here.
        assert obj.known_statuses.HALTED == obj.status
        assert WorkflowStatus.HALTED == eng.status
        assert obj.data == {"title": {"value": "bar"}}
def test_restart(app, restart_workflow):
    """Test restart task.

    Starts the ``restarttest`` workflow, restarts it while halted, resumes
    the object until completion and finally restarts it once more.
    """
    assert 'restarttest' in app.extensions['invenio-workflows'].workflows

    with app.app_context():
        data = {}

        eng_uuid = start('restarttest', data)

        eng = WorkflowEngine.from_uuid(eng_uuid)
        obj = eng.processed_objects[0]

        # The first run halts with an action and a message set on the object.
        assert obj.known_statuses.HALTED == obj.status
        assert WorkflowStatus.HALTED == eng.status
        assert obj.data == {"title": "foo"}
        assert obj.get_action() == "foo"
        assert obj.get_action_message() == "Test"

        # Restart shall have no effect (still halted)
        new_eng_uuid = restart(eng_uuid)

        # The restart reuses the same engine uuid.
        assert new_eng_uuid == eng_uuid

        eng = WorkflowEngine.from_uuid(eng_uuid)
        obj = eng.processed_objects[0]

        # Statuses are unchanged, but the object data now differs.
        assert obj.known_statuses.HALTED == obj.status
        assert WorkflowStatus.HALTED == eng.status
        assert obj.data == {"title": {"value": "bar"}}
        assert obj.get_action() == "foo"

        obj.remove_action()
        assert obj.get_action() is None

        obj_id = obj.id

        # Now it should resume the next task
        resume(obj_id)

        obj = WorkflowObject.get(obj_id)
        assert obj.known_statuses.COMPLETED == obj.status
        assert obj.extra_data.get('test') == 'test'
        assert obj.data.get('title').get('source') == 'TEST'

        # We restart the object again
        restart(obj.workflow.uuid, data=obj)
        # These checks reuse ``obj`` as passed to restart() and the ``eng``
        # loaded before resume(); neither is fetched again here.
        assert obj.known_statuses.HALTED == obj.status
        assert WorkflowStatus.HALTED == eng.status
        assert obj.data == {"title": {"value": "bar"}}
def test_harvesting_arxiv_workflow_core_record_auto_accepted(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """Check that a core record comes out approved, auto-approved and core.

    The workflow is started with the Beard/Magpie URLs and the arXiv
    categories returned by ``core_record`` patched into the app config.
    """
    record, categories = core_record()

    # Settings applied only while the workflow is being started.
    patched_settings = dict(
        BEARD_API_URL="http://example.com/beard",
        MAGPIE_API_URL="http://example.com/magpie",
        ARXIV_CATEGORIES=categories,
    )

    with workflow_app.app_context():
        wf_object_id = build_workflow(record).id
        with mock.patch.dict(workflow_app.config, patched_settings):
            engine_uuid = start("article", object_id=wf_object_id)

        processed = WorkflowEngine.from_uuid(engine_uuid).processed_objects[0]

        assert processed.extra_data["approved"] is True
        assert processed.extra_data["auto-approved"] is True
        assert processed.data["core"] is True
Esempio n. 22
0
def start_edit_article_workflow(recid):
    """Start an ``edit_article`` workflow for a record and redirect to it.

    The ``lit`` record is loaded and the ``update`` permission is checked.
    The workflow is then started with the record as data and the current
    user is stored on the workflow object.  When the referrer is the RT link
    of a ticket, the ticket id is stored in ``extra_data`` under
    ``curation_ticket_id``.

    :param recid: identifier of the record to edit.
    :return: a 302 redirect to ``WORKFLOWS_EDITOR_API_URL`` followed by the
        workflow object id.
    :raises CallbackRecordNotFoundError: when the record cannot be loaded.
    """
    try:
        record = get_db_record('lit', recid)
    except RecordGetterError:
        raise CallbackRecordNotFoundError(recid)

    record_permission = RecordPermission.create(action='update', record=record)
    if not record_permission.can():
        abort(403, record_permission)
    # has to be done before start() since, it is deattaching this session
    user_id = current_user.get_id()
    eng_uuid = start('edit_article', data=record)
    workflow_id = WorkflowEngine.from_uuid(eng_uuid).objects[0].id
    workflow = workflow_object_class.get(workflow_id)
    workflow.id_user = user_id
    if request.referrer:
        # Escape the whole base link, so that every character of it (the dots
        # as well, not only the '?') is matched literally.
        base_rt_url = re.escape(get_rt_link_for_ticket(''))
        ticket_match = re.match(
            base_rt_url + r'(?P<ticket_id>\d+)', request.referrer)
        if ticket_match:
            ticket_id = int(ticket_match.group('ticket_id'))
            workflow.extra_data['curation_ticket_id'] = ticket_id

    workflow.save()
    db.session.commit()
    url = "{}{}".format(current_app.config['WORKFLOWS_EDITOR_API_URL'], workflow_id)
    return redirect(location=url, code=302)
Esempio n. 23
0
def test_harvesting_arxiv_workflow_core_record_auto_accepted(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """Run the ``article`` workflow on a core record and expect auto-approval."""
    core, arxiv_categories = core_record()

    config_overrides = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        "ARXIV_CATEGORIES": arxiv_categories,
    }
    with workflow_app.app_context():
        wf_object_id = build_workflow(core).id
        # The overrides are only active while the workflow itself runs.
        with mock.patch.dict(workflow_app.config, config_overrides):
            engine_uuid = start("article", object_id=wf_object_id)

        processed = WorkflowEngine.from_uuid(engine_uuid).processed_objects[0]

        assert processed.extra_data["approved"] is True
        assert processed.extra_data["auto-approved"] is True
        assert processed.data["core"] is True
Esempio n. 24
0
def test_merge_without_conflicts(workflow_app, enable_merge_on_update, record_to_merge):
    """An update without conflicts completes and is flagged approved, update and merged."""
    record_update = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'document_type': ['article'],
        'titles': [
            {'title': name}
            for name in ('Jessica Jones', 'Luke Cage', 'Frank Castle')
        ],
        'dois': [{'value': '10.1007/978-3-319-15001-7'}],
    }

    engine = WorkflowEngine.from_uuid(start('article', [record_update]))
    wf_obj = engine.objects[0]

    assert wf_obj.status == ObjectStatus.COMPLETED

    # Nothing was left for a curator to resolve.
    for absent_key in ('callback_url', 'conflicts'):
        assert wf_obj.extra_data.get(absent_key) is None

    for flag in ('approved', 'is-update', 'merged'):
        assert wf_obj.extra_data.get(flag) is True
def test_validation_error_callback_with_malformed_with_invalid_types(workflow_app):
    """The validation callback answers 400 / ``MALFORMED`` for string id and extra_data."""
    invalid_record = {
        '_collections': ['Literature'],
        'document_type': ['article'],
        'titles': [{'title': 'A title'}],
    }

    engine = WorkflowEngine.from_uuid(start('article', [invalid_record]))
    wf_obj = engine.objects[0]

    # The second argument (id) and the last one (extra_data) are plain strings.
    response = do_validation_callback(
        workflow_app,
        'Alias Investigations',
        wf_obj.data,
        'Jessica Jones',
    )
    payload = json.loads(response.get_data())

    assert response.status_code == 400
    assert 'MALFORMED' == payload['error_code']
    assert 'The workflow request is malformed.' == payload['message']
    assert 'errors' in payload
def test_validation_error_callback_with_a_valid(workflow_app):
    """The validation callback answers 400 for a workflow that is not in error state."""
    valid_record = {
        '_collections': ['Literature'],
        'document_type': ['article'],
        'titles': [{'title': 'A title'}],
    }

    engine = WorkflowEngine.from_uuid(start('article', [valid_record]))
    wf_obj = engine.objects[0]

    assert wf_obj.status != ObjectStatus.ERROR

    response = do_validation_callback(
        workflow_app, wf_obj.id, wf_obj.data, wf_obj.extra_data)
    payload = json.loads(response.get_data())

    assert response.status_code == 400
    assert 'WORKFLOW_NOT_IN_ERROR_STATE' == payload['error_code']
def test_validation_error_callback_with_missing_worfklow(workflow_app):
    """The validation callback answers 404 when called with workflow id 1111."""
    invalid_record = {
        '_collections': ['Literature'],
        'document_type': ['article'],
        'titles': [{'title': 'A title'}],
    }

    engine = WorkflowEngine.from_uuid(start('article', [invalid_record]))
    wf_obj = engine.objects[0]

    # The id 1111 is used instead of the id of the workflow started above.
    response = do_validation_callback(
        workflow_app, 1111, wf_obj.data, wf_obj.extra_data)
    payload = json.loads(response.get_data())

    assert response.status_code == 404
    assert 'WORKFLOW_NOT_FOUND' == payload['error_code']
    assert 'The workflow with id "1111" was not found.' == payload['message']
Esempio n. 28
0
def edit_workflow(workflow_app):
    """Start an ``edit_article`` workflow as a logged-in user and return its object.

    Returns:
        The first object of the started workflow, after asserting that it is
        ``WAITING`` and has a truthy ``callback_url`` in its extra data.
    """
    client = workflow_app.test_client()
    login_user_via_session(
        client,
        user=User.query.filter_by(email='*****@*****.**').one(),
    )

    hep_schema_url = 'http://localhost:5000/schemas/records/hep.json'
    record = {
        '$schema': hep_schema_url,
        'arxiv_eprints': [
            {'categories': ['nucl-th'], 'value': '1802.03287'},
        ],
        'control_number': 123,
        'document_type': ['article'],
        'titles': [{'title': 'Resource Pooling in Large-Scale Content Delivery Systems'}],
        'self': {'$ref': hep_schema_url},
        '_collections': ['Literature'],
    }
    stored = TestRecordMetadata.create_from_kwargs(json=record)
    engine_uuid = start('edit_article', data=stored.record_metadata.json)
    wf_obj = WorkflowEngine.from_uuid(engine_uuid).objects[0]

    assert wf_obj.status == ObjectStatus.WAITING
    assert wf_obj.extra_data['callback_url']
    return wf_obj
def test_previously_rejected_from_not_fully_harvested_category_is_not_auto_approved(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """A ``q-bio.GN`` record matching a rejected workflow is not auto-approved."""
    record, categories = core_record()
    record["arxiv_eprints"][0]["categories"] = ["q-bio.GN"]

    # Seed the holdingpen with a completed workflow that was rejected.
    rejected_wf = workflow_object_class.create(
        data=record,
        status=ObjectStatus.COMPLETED,
        data_type="hep",
    )
    rejected_wf.extra_data["approved"] = False
    rejected_wf.save()
    es.indices.refresh("holdingpen-hep")

    config_overrides = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        "ARXIV_CATEGORIES": categories,
    }
    with workflow_app.app_context(), mock.patch.dict(workflow_app.config, config_overrides):
        new_wf_id = build_workflow(record).id
        engine = WorkflowEngine.from_uuid(start("article", object_id=new_wf_id))
        new_wf = engine.processed_objects[0]

        assert not new_wf.extra_data["auto-approved"]
        assert len(new_wf.extra_data["previously_rejected_matches"]) > 0
        assert new_wf.status == ObjectStatus.COMPLETED
def test_validation_error_callback_with_malformed_with_invalid_types(workflow_app):
    """The validation callback answers 400 / ``MALFORMED`` for string id and extra_data."""
    invalid_record = {
        "_collections": ["Literature"],
        "document_type": ["article"],
        "titles": [{"title": "A title"}],
    }

    wf_id = build_workflow(invalid_record).id
    engine = WorkflowEngine.from_uuid(start("article", object_id=wf_id))
    wf_obj = engine.objects[0]

    # The second argument (id) and the last one (extra_data) are plain strings.
    response = do_validation_callback(
        workflow_app, "Alias Investigations", wf_obj.data, "Jessica Jones")
    payload = json.loads(response.get_data())

    assert response.status_code == 400
    assert "MALFORMED" == payload["error_code"]
    assert "The workflow request is malformed." == payload["message"]
    assert "errors" in payload
def test_keep_previously_rejected_from_fully_harvested_category_is_auto_approved(
        mocked_refextract_extract_refs,
        mocked_api_request_magpie,
        mocked_api_request_beard,
        mocked_is_pdf_link,
        mocked_package_download,
        mocked_arxiv_download,
        workflow_app,
        mocked_external_services,
):
    """A core record matching a rejected workflow is still auto-approved.

    A completed, rejected workflow object is created for the record first;
    the same record is then run through the ``article`` workflow with
    ``ARXIV_CATEGORIES`` set to the categories returned by ``core_record()``.
    """
    record, categories = core_record()
    obj = workflow_object_class.create(
        data=record,
        status=ObjectStatus.COMPLETED,
        data_type='hep',
    )
    obj.extra_data['approved'] = False  # reject it
    obj.save()
    es.indices.refresh('holdingpen-hep')

    extra_config = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        'ARXIV_CATEGORIES': categories,
    }
    with workflow_app.app_context():
        with mock.patch.dict(workflow_app.config, extra_config):
            workflow_uuid = start('article', [record])
            eng = WorkflowEngine.from_uuid(workflow_uuid)
            obj2 = eng.processed_objects[0]
            assert obj2.extra_data['auto-approved']
            assert len(obj2.extra_data['previously_rejected_matches']) > 0
            # Check the newly processed workflow: ``obj`` was created as
            # COMPLETED above, so asserting on it could never fail.
            assert obj2.status == ObjectStatus.COMPLETED
Esempio n. 32
0
def test_harvesting_arxiv_workflow_manual_rejected(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_beard_api,
    mocked_actions_download,
    mocked_is_pdf_link,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """A halted workflow that gets rejected completes with ``approved`` False."""
    record = generate_record()
    halted_uuid, _, halted_obj = get_halted_workflow(
        app=workflow_app,
        extra_config={
            "BEARD_API_URL": "http://example.com/beard",
            "MAGPIE_API_URL": "http://example.com/magpie",
        },
        record=record,
    )

    # Reject the record and persist the decision before resuming.
    halted_obj.extra_data["approved"] = False
    halted_obj.save()
    db.session.commit()

    resumed = WorkflowEngine.from_uuid(halted_uuid).processed_objects[0]
    resumed_id = resumed.id
    resumed.continue_workflow()

    final_obj = workflow_object_class.get(resumed_id)
    assert final_obj.status == ObjectStatus.COMPLETED
    assert final_obj.extra_data["approved"] is False
Esempio n. 33
0
def test_harvesting_arxiv_workflow_already_on_legacy(
    mocked_refextract_extract_refs,
    mocked_api_request_beard_block,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_download,
    small_app,
    already_harvested_on_legacy_record,
):
    """A record already harvested on legacy completes flagged ``already-ingested``."""
    config_overrides = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
    }

    with small_app.app_context():
        # The overrides are only active while the workflow itself runs.
        with mock.patch.dict(small_app.config, config_overrides):
            engine_uuid = start('article', [already_harvested_on_legacy_record])

        processed = WorkflowEngine.from_uuid(engine_uuid).processed_objects[0]

        assert processed.status == ObjectStatus.COMPLETED
        assert 'already-ingested' in processed.extra_data
        assert processed.extra_data['already-ingested']
Esempio n. 34
0
def test_merge_with_disabled_merge_on_update_feature_flag(
    mocked_api_request_magpie,
    mocked_beard_api,
    workflow_app,
    mocked_external_services,
    disable_file_upload,
):
    """With ``FEATURE_FLAG_ENABLE_MERGER`` off, no merger root is kept for the update."""
    merger_off = {'FEATURE_FLAG_ENABLE_MERGER': False}
    with patch.dict(workflow_app.config, merger_off):
        stored = TestRecordMetadata.create_from_file(
            __name__,
            'merge_record_arxiv.json',
            index_name='records-hep',
        )

        wf_id = build_workflow(RECORD_WITHOUT_CONFLICTS).id
        engine = WorkflowEngine.from_uuid(start('article', object_id=wf_id))
        wf_obj = engine.objects[0]

        assert wf_obj.status == ObjectStatus.COMPLETED

        assert wf_obj.extra_data.get('callback_url') is None
        assert wf_obj.extra_data.get('conflicts') is None
        assert wf_obj.extra_data.get('merged') is True
        assert wf_obj.extra_data.get('merger_root') is None
        assert wf_obj.extra_data.get('is-update') is True

        # No root is stored for the 'arxiv' source either.
        assert read_wf_record_source(stored.record_metadata.id, 'arxiv') is None
Esempio n. 35
0
def test_validation_error_callback_with_malformed_with_invalid_types(workflow_app):
    """The validation callback answers 400 / ``MALFORMED`` for string id and extra_data."""
    invalid_record = {
        "_collections": ["Literature"],
        "document_type": ["article"],
        "titles": [{"title": "A title"}],
    }

    wf_id = build_workflow(invalid_record).id
    engine = WorkflowEngine.from_uuid(start("article", object_id=wf_id))
    wf_obj = engine.objects[0]

    # The second argument (id) and the last one (extra_data) are plain strings.
    response = do_validation_callback(
        workflow_app, "Alias Investigations", wf_obj.data, "Jessica Jones")
    payload = json.loads(response.get_data())

    assert response.status_code == 400
    assert "MALFORMED" == payload["error_code"]
    assert "The workflow request is malformed." == payload["message"]
    assert "errors" in payload
Esempio n. 36
0
def test_previously_rejected_from_not_fully_harvested_category_is_not_auto_approved(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """A ``q-bio.GN`` record matching a rejected workflow is not auto-approved."""
    record, categories = core_record()
    record["arxiv_eprints"][0]["categories"] = ["q-bio.GN"]

    # Seed the holdingpen with a completed workflow that was rejected.
    rejected_wf = workflow_object_class.create(
        data=record,
        status=ObjectStatus.COMPLETED,
        data_type="hep",
    )
    rejected_wf.extra_data["approved"] = False
    rejected_wf.save()
    es.indices.refresh("holdingpen-hep")

    config_overrides = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        "ARXIV_CATEGORIES": categories,
    }
    with workflow_app.app_context(), mock.patch.dict(workflow_app.config, config_overrides):
        new_wf_id = build_workflow(record).id
        engine = WorkflowEngine.from_uuid(start("article", object_id=new_wf_id))
        new_wf = engine.processed_objects[0]

        assert not new_wf.extra_data["auto-approved"]
        assert len(new_wf.extra_data["previously_rejected_matches"]) > 0
        assert new_wf.status == ObjectStatus.COMPLETED
Esempio n. 37
0
def start_edit_article_workflow(recid):
    """Start an ``edit_article`` workflow for a record and redirect to the editor.

    The record is loaded with ``get_db_record('lit', recid)`` and the current
    user must hold the ``update`` permission on it. When the request has a
    referrer that starts with the RT ticket link prefix followed by digits,
    the ticket id is stored in ``extra_data['curation_ticket_id']``.

    Args:
        recid: identifier of the literature record to edit.

    Returns:
        A 302 redirect to ``WORKFLOWS_EDITOR_API_URL`` followed by the id of
        the newly created workflow object.

    Raises:
        CallbackRecordNotFoundError: if loading the record raises
            ``RecordGetterError``.
    """
    try:
        record = get_db_record('lit', recid)
    except RecordGetterError:
        raise CallbackRecordNotFoundError(recid)

    record_permission = RecordPermission.create(action='update', record=record)
    if not record_permission.can():
        abort(403, record_permission)
    # Has to be read before start(), since start() detaches this session.
    user_id = current_user.get_id()
    eng_uuid = start('edit_article', data=record)
    workflow_id = WorkflowEngine.from_uuid(eng_uuid).objects[0].id
    workflow = workflow_object_class.get(workflow_id)
    workflow.id_user = user_id
    if request.referrer:
        # Escape the whole link prefix (not only '?') so that characters such
        # as '.' are matched literally, and keep the pattern in a raw string
        # to avoid invalid escape sequences.
        ticket_pattern = re.escape(get_rt_link_for_ticket('')) + r'(?P<ticket_id>\d+)'
        ticket_match = re.match(ticket_pattern, request.referrer)
        if ticket_match:
            ticket_id = int(ticket_match.group('ticket_id'))
            workflow.extra_data['curation_ticket_id'] = ticket_id

    workflow.save()
    db.session.commit()
    url = "{}{}".format(current_app.config['WORKFLOWS_EDITOR_API_URL'],
                        workflow_id)
    return redirect(location=url, code=302)
Esempio n. 38
0
def test_validation_error_callback_with_missing_worfklow(workflow_app):
    """The validation callback answers 404 when called with workflow id 1111."""
    invalid_record = {
        "_collections": ["Literature"],
        "document_type": ["article"],
        "titles": [{"title": "A title"}],
    }

    wf_id = build_workflow(invalid_record).id
    engine = WorkflowEngine.from_uuid(start("article", object_id=wf_id))
    wf_obj = engine.objects[0]

    # The id 1111 is used instead of the id of the workflow started above.
    response = do_validation_callback(
        workflow_app,
        1111,
        wf_obj.data,
        wf_obj.extra_data,
    )
    payload = json.loads(response.get_data())

    assert response.status_code == 404
    assert "WORKFLOW_NOT_FOUND" == payload["error_code"]
    assert 'The workflow with id "1111" was not found.' == payload["message"]
Esempio n. 39
0
def test_merge_with_conflicts_rootful(
    mocked_api_request_magpie,
    mocked_beard_api,
    workflow_app,
    mocked_external_services,
    disable_file_upload,
    enable_merge_on_update,
):
    """A conflicting update halts with one conflict and records the merger roots."""
    filters_target = 'inspire_json_merger.config.ArxivOnArxivOperations.conflict_filters'
    with patch(filters_target, ['acquisition_source.source']):
        TestRecordMetadata.create_from_file(
            __name__,
            'merge_record_arxiv.json',
            index_name='records-hep',
        )

        wf_id = build_workflow(RECORD_WITH_CONFLICTS).id

        # No root is inserted here: the original root is expected to be {}.
        engine = WorkflowEngine.from_uuid(start('article', object_id=wf_id))
        wf_obj = engine.objects[0]

        found_conflicts = wf_obj.extra_data.get('conflicts')
        assert wf_obj.status == ObjectStatus.HALTED
        assert len(found_conflicts) == 1

        assert wf_obj.extra_data.get('callback_url') is not None
        assert wf_obj.extra_data.get('is-update') is True
        assert wf_obj.extra_data['merger_root'] == RECORD_WITH_CONFLICTS
        assert wf_obj.extra_data['merger_head_revision'] == 0
        assert wf_obj.extra_data['merger_original_root'] == {}
Esempio n. 40
0
def test_update_exact_matched_goes_trough_the_workflow(
    mocked_is_pdf_link,
    mocked_download_arxiv,
    mocked_api_request_beard,
    mocked_api_request_magpie,
    workflow_app,
    mocked_external_services,
    record_from_db,
):
    """A record taken from the db is exact-matched, approved and completes as an update."""
    record = record_from_db
    engine_uuid = start("article", object_id=build_workflow(record).id)
    wf_id = WorkflowEngine.from_uuid(engine_uuid).objects[0].id
    # Reload the object by id before checking it.
    wf_obj = workflow_object_class.get(wf_id)

    assert wf_obj.extra_data["holdingpen_matches"] == []
    assert wf_obj.extra_data["previously_rejected"] is False
    assert not wf_obj.extra_data.get("stopped-matched-holdingpen-wf")
    assert wf_obj.extra_data["is-update"]
    assert wf_obj.extra_data["exact-matched"]

    found_matches = wf_obj.extra_data["matches"]
    assert found_matches["exact"] == [record.get("control_number")]
    assert found_matches["approved"] == record.get("control_number")

    assert wf_obj.extra_data["approved"]
    assert wf_obj.status == ObjectStatus.COMPLETED
Esempio n. 41
0
def test_stop_matched_holdingpen_wfs(app, simple_record):
    """A halted workflow matched in the holdingpen is completed and marked as stopped."""
    # The first object is run through a workflow so that it gets a workflow
    # definition and a uuid assigned.
    seed = workflow_object_class.create(data_type='hep', **simple_record)
    engine = WorkflowEngine.from_uuid(start('article', object_id=seed.id))
    halted_wf = engine.processed_objects[0]
    halted_wf.status = ObjectStatus.HALTED
    halted_wf.save()
    halted_wf_id = halted_wf.id
    es.indices.refresh('holdingpen-hep')

    new_wf = WorkflowObject.create(data_type='hep', **simple_record)
    new_wf_id = new_wf.id

    match_non_completed_wf_in_holdingpen(new_wf, None)
    assert new_wf.extra_data['holdingpen_matches'] == [halted_wf_id]

    stop_matched_holdingpen_wfs(new_wf, None)

    reloaded = workflow_object_class.get(halted_wf_id)
    assert reloaded.status == ObjectStatus.COMPLETED
    assert reloaded.extra_data['stopped-by-wf'] == new_wf_id
Esempio n. 42
0
def test_merge_without_conflicts_rootful(
    mocked_api_request_magpie,
    mocked_beard_api,
    workflow_app,
    mocked_external_services,
    disable_file_upload,
    enable_merge_on_update,
):
    """With a stored arXiv root the update merges cleanly and replaces that root."""
    filters_target = 'inspire_json_merger.config.ArxivOnArxivOperations.conflict_filters'
    with patch(filters_target, ['acquisition_source.source']):
        stored = TestRecordMetadata.create_from_file(
            __name__,
            'merge_record_arxiv.json',
            index_name='records-hep',
        )

        wf_id = build_workflow(RECORD_WITH_CONFLICTS).id

        # Store a root for the 'arxiv' source before the workflow runs.
        insert_wf_record_source(
            json=ARXIV_ROOT,
            record_uuid=stored.record_metadata.id,
            source='arxiv',
        )

        engine = WorkflowEngine.from_uuid(start('article', object_id=wf_id))
        wf_obj = engine.objects[0]

        found_conflicts = wf_obj.extra_data.get('conflicts')

        assert wf_obj.status == ObjectStatus.COMPLETED
        assert not found_conflicts

        assert wf_obj.extra_data.get('callback_url') is None
        assert wf_obj.extra_data.get('is-update') is True
        assert wf_obj.extra_data['merger_head_revision'] == 0
        assert wf_obj.extra_data['merger_original_root'] == ARXIV_ROOT

        new_root = read_wf_record_source(stored.record_metadata.id, 'arxiv')
        assert new_root.json == RECORD_WITH_CONFLICTS
def test_harvesting_arxiv_workflow_manual_rejected(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_beard_api,
    mocked_actions_download,
    mocked_is_pdf_link,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """A halted workflow that gets rejected completes with ``approved`` False."""
    record = generate_record()
    halted_uuid, _, halted_obj = get_halted_workflow(
        app=workflow_app,
        extra_config={
            "BEARD_API_URL": "http://example.com/beard",
            "MAGPIE_API_URL": "http://example.com/magpie",
        },
        record=record,
    )

    # Reject the record and persist the decision before resuming.
    halted_obj.extra_data["approved"] = False
    halted_obj.save()
    db.session.commit()

    resumed = WorkflowEngine.from_uuid(halted_uuid).processed_objects[0]
    resumed_id = resumed.id
    resumed.continue_workflow()

    final_obj = workflow_object_class.get(resumed_id)
    assert final_obj.status == ObjectStatus.COMPLETED
    assert final_obj.extra_data["approved"] is False
Esempio n. 44
0
def stop_matched_holdingpen_wfs(obj, eng):
    """Stop every workflow listed in ``obj.extra_data['holdingpen_matches']``.

    For each matched workflow, the callbacks of its engine are replaced by two
    steps -- ``mark('stopped-by-wf', <id of obj>)`` followed by
    ``stop_processing`` -- and the workflow is processed again with that
    engine. The ``'stopped-by-wf'`` mark records which workflow caused the
    stop. The current object is saved with ``save_workflow`` before any
    matched workflow is touched.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None
    """
    current_wf_id = int(obj.id)
    replacement_steps = [mark('stopped-by-wf', current_wf_id), stop_processing]

    save_workflow(obj, eng)

    matched_ids = obj.extra_data['holdingpen_matches']
    for matched_id in matched_ids:
        matched_wf = workflow_object_class.get(matched_id)
        matched_engine = WorkflowEngine.from_uuid(matched_wf.id_workflow)

        # Swap in the stopping steps, then run the matched workflow through them.
        matched_engine.callbacks.replace(replacement_steps)
        matched_engine.process([matched_wf])
Esempio n. 45
0
def stop_matched_holdingpen_wfs(obj, eng):
    """Stop every workflow listed in ``obj.extra_data['holdingpen_matches']``.

    For each matched workflow, the callbacks of its engine are replaced by two
    steps -- ``mark('stopped-by-wf', <id of obj>)`` followed by
    ``stop_processing`` -- and the workflow is processed again with that
    engine. The ``'stopped-by-wf'`` mark records which workflow caused the
    stop. The current object is saved with ``obj.save()`` before any matched
    workflow is touched.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None
    """
    current_wf_id = int(obj.id)
    replacement_steps = [mark('stopped-by-wf', current_wf_id), stop_processing]

    obj.save()

    matched_ids = obj.extra_data['holdingpen_matches']
    for matched_id in matched_ids:
        matched_wf = workflow_object_class.get(matched_id)
        matched_engine = WorkflowEngine.from_uuid(matched_wf.id_workflow)

        # Swap in the stopping steps, then run the matched workflow through them.
        matched_engine.callbacks.replace(replacement_steps)
        matched_engine.process([matched_wf])
Esempio n. 46
0
def test_merge_without_conflicts_handles_update_without_acquisition_source_and_acts_as_rootless(
    mocked_api_request_magpie,
    mocked_beard_api,
    workflow_app,
    mocked_external_services,
    disable_file_upload,
    enable_merge_on_update,
):
    """An update without acquisition source merges cleanly and stores no root."""
    filters_target = 'inspire_json_merger.config.PublisherOnArxivOperations.conflict_filters'
    with patch(filters_target, ['acquisition_source.source']):
        stored = TestRecordMetadata.create_from_file(
            __name__,
            'merge_record_arxiv.json',
            index_name='records-hep',
        )

        wf_id = build_workflow(RECORD_WITHOUT_ACQUISITION_SOURCE_AND_NO_CONFLICTS).id
        engine = WorkflowEngine.from_uuid(start('article', object_id=wf_id))
        wf_obj = engine.objects[0]

        found_conflicts = wf_obj.extra_data.get('conflicts')

        assert wf_obj.status == ObjectStatus.COMPLETED
        assert not found_conflicts

        assert wf_obj.extra_data.get('callback_url') is None
        assert wf_obj.extra_data.get('is-update') is True
        assert wf_obj.extra_data['merger_head_revision'] == 0
        assert wf_obj.extra_data['merger_original_root'] == {}

        # The source is unknown, so no new root is expected to be saved.
        assert not read_all_wf_record_sources(stored.record_metadata.id)
Esempio n. 47
0
def test_validation_error_callback_with_a_valid(workflow_app):
    """The validation callback answers 400 for a workflow that is not in error state."""
    valid_record = {
        "_collections": ["Literature"],
        "document_type": ["article"],
        "titles": [{"title": "A title"}],
    }

    wf_id = build_workflow(valid_record).id
    engine = WorkflowEngine.from_uuid(start("article", object_id=wf_id))
    wf_obj = engine.objects[0]

    assert wf_obj.status != ObjectStatus.ERROR

    response = do_validation_callback(
        workflow_app,
        wf_obj.id,
        wf_obj.data,
        wf_obj.extra_data,
    )
    payload = json.loads(response.get_data())

    assert response.status_code == 400
    assert "WORKFLOW_NOT_IN_ERROR_STATE" == payload["error_code"]
def test_refextract_from_pdf(
    mocked_indexing_task,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services
):
    """Run the whole workflow on a citing record and check its extracted references.

    A cited record with control number 1000 is stored first; afterwards the
    eighth reference of the citing record must point to it and the first
    reference must carry ``arXiv`` as the source of its raw reference.
    """
    cited_eprint = {
        'categories': ['quant-ph', 'cond-mat.mes-hall', 'cond-mat.str-el', 'math-ph', 'math.MP'],
        'value': '1308.0815',
    }
    cited_title = {
        'source': 'arXiv',
        'title': 'Solving a two-electron quantum dot model in terms of polynomial solutions of a Biconfluent Heun equation',
    }
    cited_record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'arxiv_eprints': [cited_eprint],
        'control_number': 1000,
        'document_type': ['article'],
        'titles': [cited_title],
    }

    TestRecordMetadata.create_from_kwargs(
        json=cited_record_json,
        index='records-hep',
        pid_type='lit',
    )
    citing_record, categories = insert_citing_record()

    config_overrides = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        'ARXIV_CATEGORIES': categories,
    }

    # Check the acquisition source against its part of the hep schema first.
    acquisition_subschema = load_schema('hep')['properties']['acquisition_source']
    assert validate(citing_record['acquisition_source'], acquisition_subschema) is None

    with mock.patch.dict(workflow_app.config, config_overrides):
        wf_id = build_workflow(citing_record).id
        engine_uuid = start('article', object_id=wf_id)

    citing_obj = WorkflowEngine.from_uuid(engine_uuid).processed_objects[0]
    references = citing_obj.data['references']

    assert references[7]['record']['$ref'] == 'http://localhost:5000/api/literature/1000'
    assert citing_obj.data['references'][0]['raw_refs'][0]['source'] == 'arXiv'
def test_update_record_goes_through_api_version_of_store_record_without_issue(
    mocked_is_pdf_link,
    mocked_download_arxiv,
    mocked_api_request_beard,
    mocked_api_request_magpie,
    workflow_app,
    mocked_external_services,
    record_from_db,
):
    """With REST record management on, the update is sent as a PUT to ``INSPIREHEP_URL``."""
    record = record_from_db
    workflow_id = build_workflow(record).id
    expected_control_number = record['control_number']
    expected_head_uuid = str(record.id)

    rest_config = {
        "FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT": True,
        "INSPIREHEP_URL": "http://web:8000",
    }
    with mock.patch.dict(workflow_app.config, rest_config):
        with requests_mock.Mocker(real_http=True) as requests_mocker:
            put_url = '{url}/literature/{cn}'.format(
                url=workflow_app.config.get("INSPIREHEP_URL"),
                cn=expected_control_number,
            )
            mocked_answer = {
                'metadata': {
                    'control_number': expected_control_number,
                },
                'id_': expected_head_uuid,
            }
            requests_mocker.register_uri(
                'PUT',
                put_url,
                headers={'content-type': 'application/json'},
                status_code=200,
                json=mocked_answer,
            )
            eng_uuid = start("article", object_id=workflow_id)

            # Inspect every request seen by the mocker during the run.
            url_paths = [req.path for req in requests_mocker.request_history]
            url_hostnames = [req.hostname for req in requests_mocker.request_history]

            assert 'web' in url_hostnames
            assert "/literature/{cn}".format(cn=expected_control_number) in url_paths

    wf_id = WorkflowEngine.from_uuid(eng_uuid).objects[0].id
    wf_obj = workflow_object_class.get(wf_id)

    assert wf_obj.data['control_number'] == expected_control_number

    assert wf_obj.extra_data["holdingpen_matches"] == []
    assert wf_obj.extra_data["previously_rejected"] is False
    assert not wf_obj.extra_data.get("stopped-matched-holdingpen-wf")
    assert wf_obj.extra_data["is-update"]
    assert wf_obj.extra_data["exact-matched"]
    assert wf_obj.extra_data["matches"]["exact"] == [record.get("control_number")]
    assert wf_obj.extra_data["matches"]["approved"] == record.get("control_number")
    assert wf_obj.extra_data["approved"]
    assert wf_obj.status == ObjectStatus.COMPLETED
def test_match_in_holdingpen_stops_pending_wf(
    mocked_download_arxiv,
    mocked_api_request_beard,
    mocked_api_request_magpie,
    mocked_package_download,
    workflow_app,
    mocked_external_services,
):
    """A second workflow for the same record stops the halted one already in the holdingpen."""
    record = generate_record()

    first_engine_uuid = start('article', [record])
    es.indices.refresh('holdingpen-hep')
    old_wf = WorkflowEngine.from_uuid(first_engine_uuid).objects[0]
    old_wf_id = old_wf.id

    assert old_wf.status == ObjectStatus.HALTED
    assert old_wf.extra_data['previously_rejected'] is False

    # ``updated_record`` is the very same dict as ``record`` (no copy is
    # made); only its first title is changed before the second run.
    updated_record = record
    record['titles'][0]['title'] = 'This is an update that will match the wf in the holdingpen'
    second_engine_uuid = start('article', [updated_record])
    es.indices.refresh('holdingpen-hep')
    update_wf = WorkflowEngine.from_uuid(second_engine_uuid).objects[0]

    assert update_wf.status == ObjectStatus.HALTED
    assert update_wf.extra_data['already-in-holding-pen'] is True
    assert update_wf.extra_data['previously_rejected'] is False
    assert update_wf.extra_data['stopped-matched-holdingpen-wf'] is True
    assert update_wf.extra_data['is-update'] is False

    # Reload the first workflow to see what the second one did to it.
    old_wf = workflow_object_class.get(old_wf_id)
    assert old_wf.extra_data['already-in-holding-pen'] is False
    assert old_wf.extra_data['previously_rejected'] is False
    assert old_wf.extra_data['stopped-by-wf'] == update_wf.id
    assert old_wf.extra_data.get('approved') is None
    assert old_wf.extra_data['is-update'] is False
    assert old_wf.status == ObjectStatus.COMPLETED
def get_halted_workflow(mocked_is_pdf_link, app, record, extra_config=None):
    """Run the ``article`` workflow on ``record`` and verify it halted.

    Args:
        mocked_is_pdf_link: mock object whose ``return_value`` is set to True.
        app: application whose ``config`` is patched while the workflow runs.
        record: record data handed to ``build_workflow``.
        extra_config: optional mapping of config overrides applied for the
            duration of the run; a falsy value means no overrides.

    Returns:
        A ``(workflow_uuid, engine, workflow_object)`` tuple.

    Raises:
        AssertionError: if the processed object does not look like the
            expected halted workflow (status, files, publication info,
            predictions, approval flag).
    """
    mocked_is_pdf_link.return_value = True

    config_overrides = extra_config if extra_config else {}
    with mock.patch.dict(app.config, config_overrides):
        object_id = build_workflow(record).id
        run_uuid = start("article", object_id=object_id)

    engine = WorkflowEngine.from_uuid(run_uuid)
    halted = engine.processed_objects[0]

    assert halted.status == ObjectStatus.HALTED
    assert halted.data_type == "hep"

    # Files should have been attached (tarball + pdf, and plots)
    assert halted.files["1407.7587.pdf"]
    assert halted.files["1407.7587.tar.gz"]
    assert len(halted.files) > 2

    # A publication note should have been extracted
    publication_info = halted.data.get("publication_info")
    assert publication_info
    first_publication = publication_info[0]
    assert first_publication
    assert first_publication.get("year") == 2014
    assert first_publication.get("journal_title") == "J. Math. Phys."

    # A relevance prediction should have been made
    relevance = halted.extra_data.get("relevance_prediction")
    assert relevance
    assert relevance["decision"] == "Non-CORE"
    assert relevance["scores"]["Non-CORE"] == 0.8358207729691823

    # The experiments prediction must match this exact payload
    assert halted.extra_data.get("experiments_prediction") == {
        "experiments": [{"label": "CMS", "score": 0.75495152473449707}]
    }

    # The keywords prediction must contain this accepted keyword
    expected_keyword = {
        "label": "galaxy",
        "score": 0.29424679279327393,
        "accept": True,
    }
    keywords = halted.extra_data.get("keywords_prediction")
    assert keywords
    assert expected_keyword in keywords["keywords"]

    # This record should not have been touched yet
    assert halted.extra_data["approved"] is None

    return run_uuid, engine, halted
def test_match_in_holdingpen_previously_rejected_wf_stop(
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_package_download,
    mocked_is_pdf_link,
    mocked_download_arxiv,
    workflow_app,
    mocked_external_services,
):
    """Check that a resubmission matching a rejected workflow is flagged.

    A first workflow is run and rejected (``approved`` set to False, then
    continued until COMPLETED).  The same record, with a changed title, is
    then submitted again and the new workflow object must report
    ``previously_rejected`` together with the id of the rejected object.
    """
    new_title = "This is an update that will match the wf in the holdingpen"
    record = generate_record()

    first_object_id = build_workflow(record).id
    first_uuid = start("article", object_id=first_object_id)
    first_engine = WorkflowEngine.from_uuid(first_uuid)
    rejected_id = first_engine.objects[0].id

    # Reject the record and let the workflow run to completion.
    rejected = workflow_object_class.get(rejected_id)
    rejected.extra_data["approved"] = False
    rejected.continue_workflow()

    rejected = workflow_object_class.get(rejected_id)
    assert rejected.status == ObjectStatus.COMPLETED
    assert rejected.extra_data.get("approved") is False

    es.indices.refresh("holdingpen-hep")

    record["titles"][0]["title"] = new_title
    # this workflow matches in the holdingpen and stops because the
    # matched one was rejected
    second_object_id = build_workflow(record).id
    second_uuid = start("article", object_id=second_object_id)
    second_engine = WorkflowEngine.from_uuid(second_uuid)
    resubmitted = second_engine.objects[0]

    resubmitted_extra = resubmitted.extra_data
    assert resubmitted_extra["previously_rejected"] is True
    assert resubmitted_extra["previously_rejected_matches"] == [rejected_id]
def test_article_workflow_continues_when_record_is_valid(workflow_app):
    """Check that a minimal valid record does not put the workflow in error.

    After running the ``article`` workflow, the object status must not be
    ERROR and no ``_error_msg`` key may be present in its extra data.
    """
    minimal_record = dict(
        _collections=["Literature"],
        document_type=["article"],
        titles=[{"title": "A title"}],
    )

    object_id = build_workflow(minimal_record).id
    run_uuid = start("article", object_id=object_id)

    engine = WorkflowEngine.from_uuid(run_uuid)
    wf_object = engine.objects[0]

    assert wf_object.status != ObjectStatus.ERROR
    assert "_error_msg" not in wf_object.extra_data
# Example no. 54
# 0
def get_halted_workflow(app, record, extra_config=None):
    """Start the ``article`` workflow on ``record`` and verify it halted.

    Args:
        app: application whose ``config`` is patched while the workflow runs.
        record: record data passed (wrapped in a list) to ``start``.
        extra_config: optional mapping of config overrides applied for the
            duration of the run; a falsy value means no overrides.

    Returns:
        A ``(workflow_uuid, engine, workflow_object)`` tuple.

    Raises:
        AssertionError: if the processed object does not look like the
            expected halted workflow (status, files, publication info,
            predictions, absence of an approval flag).
    """
    config_overrides = extra_config if extra_config else {}
    with mock.patch.dict(app.config, config_overrides):
        run_uuid = start('article', [record])

    engine = WorkflowEngine.from_uuid(run_uuid)
    halted = engine.processed_objects[0]

    assert halted.status == ObjectStatus.HALTED
    assert halted.data_type == "hep"

    # Files should have been attached (tarball + pdf, and plots)
    assert halted.files["1407.7587.pdf"]
    assert halted.files["1407.7587.tar.gz"]
    assert len(halted.files) > 2

    # A publication note should have been extracted
    publication_info = halted.data.get('publication_info')
    assert publication_info
    first_publication = publication_info[0]
    assert first_publication
    assert first_publication.get('year') == 2014
    assert first_publication.get('journal_title') == "J. Math. Phys."

    # A relevance prediction should have been made
    relevance = halted.extra_data.get("relevance_prediction")
    assert relevance
    assert relevance['decision'] == 'Non-CORE'
    assert relevance['scores']['Non-CORE'] == 0.8358207729691823

    # TODO: add the experiments predictions to the workflow
    # object (see issue #2054).
    assert halted.extra_data.get("experiments_prediction") is None

    # The keywords prediction must contain this accepted keyword
    expected_keyword = {
        "label": "galaxy",
        "score": 0.29424679279327393,
        "accept": True
    }
    keywords = halted.extra_data.get("keywords_prediction")
    assert keywords
    assert expected_keyword in keywords['keywords']

    # This record should not have been touched yet
    assert "approved" not in halted.extra_data

    return run_uuid, engine, halted
def test_match_in_holdingpen_stops_pending_wf(
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_package_download,
    mocked_is_pdf_link,
    mocked_download_arxiv,
    workflow_app,
    mocked_external_services,
):
    """Check that a matching resubmission stops the workflow already pending.

    Two workflow objects are built from the same record (only the title
    differs for the second) and run through the ``article`` workflow.  The
    assertions pin the expected outcome: the second object is HALTED and
    flagged as having stopped a holdingpen match, while the first one ends
    up COMPLETED and carries the id of the workflow that stopped it.
    """
    new_title = "This is an update that will match the wf in the holdingpen"
    record = generate_record()

    # First submission; the holdingpen index is refreshed right after start.
    first_object_id = build_workflow(record).id
    first_uuid = start("article", object_id=first_object_id)
    es.indices.refresh("holdingpen-hep")
    first_engine = WorkflowEngine.from_uuid(first_uuid)
    pending_wf = first_engine.objects[0]
    pending_id = pending_wf.id

    assert pending_wf.status == ObjectStatus.HALTED
    assert pending_wf.extra_data["previously_rejected"] is False

    # NOTE: plain assignment, so ``resubmission`` is the very same dict as
    # ``record``; the title change below is visible through both names.
    resubmission = record
    record["titles"][0]["title"] = new_title
    second_object_id = build_workflow(resubmission).id
    start("article", object_id=second_object_id)
    es.indices.refresh("holdingpen-hep")

    update_wf = workflow_object_class.get(second_object_id)

    assert update_wf.status == ObjectStatus.HALTED
    update_extra = update_wf.extra_data
    # NOTE(review): an earlier comment here said the workflow stops (in
    # error) before setting this flag, yet the status asserted above is
    # HALTED -- confirm which is intended.
    assert update_extra["previously_rejected"] is False
    assert update_extra['already-in-holding-pen'] is True
    assert update_extra["stopped-matched-holdingpen-wf"] is True
    assert update_extra["is-update"] is False

    # Re-fetch the first workflow object to observe its final state.
    stopped_wf = workflow_object_class.get(pending_id)
    stopped_extra = stopped_wf.extra_data
    assert stopped_extra['already-in-holding-pen'] is False
    assert stopped_extra['previously_rejected'] is False
    assert stopped_extra['stopped-by-wf'] == update_wf.id
    assert stopped_extra.get('approved') is None
    assert stopped_extra['is-update'] is False
    assert stopped_wf.status == ObjectStatus.COMPLETED
def test_delayed_execution_api(app, halt_workflow):
    """Start and resume a workflow through the ``.delay`` task API.

    The ``halttest`` workflow is started via ``start.delay``; its processed
    object must be WAITING and the engine HALTED.  After ``resume.delay`` on
    that object, a fresh lookup must report it as COMPLETED.
    """
    payload = [{'foo': 'bar'}]

    with app.app_context():
        pending_start = start.delay('halttest', payload)

        engine = WorkflowEngine.from_uuid(pending_start.get())
        waiting = engine.processed_objects[0]

        assert waiting.known_statuses.WAITING == waiting.status
        assert WorkflowStatus.HALTED == engine.status

        waiting_id = waiting.id
        resume.delay(waiting_id)

        # Fetch the object again to see its status after the resume call.
        resumed = WorkflowObject.get(waiting_id)
        assert resumed.known_statuses.COMPLETED == resumed.status