def test_store_record_inspirehep_api_author_new_wrong_response_code(workflow_app):
    """A non-2xx reply from the authors REST endpoint makes store_record raise."""
    author = {
        '$schema': 'http://localhost:5000/schemas/records/authors.json',
        'name': {'value': 'Robert Johnson'}, '_collections': ['Authors']
    }
    wf = workflow_object_class.create({})
    wf.extra_data['is-update'] = False
    wf.data = author

    engine = MagicMock(workflow_definition=MagicMock(data_type='authors'))
    config_override = {
        'FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT': True,
        'INSPIREHEP_URL': "http://web:8000",
    }
    with patch.dict(workflow_app.config, config_override):
        with requests_mock.Mocker() as mocker:
            # Simulate an unauthorized response from the REST API.
            mocker.register_uri(
                'POST',
                '{url}/authors'.format(url=workflow_app.config.get("INSPIREHEP_URL")),
                headers={'content-type': 'application/json'},
                status_code=401,
                json={"message": "Something"},
            )
            with pytest.raises(WorkflowsError):
                store_record(wf, engine)
# Example #2
def start_workflow_for_literature_submission():
    """Create a literature ("hep") workflow object from the request payload and start it."""
    payload = request.get_json()
    submission_data = payload['data']

    obj = workflow_object_class.create(
        data={},
        id_user=submission_data['acquisition_source']['internal_uid'],
        data_type="hep",
    )
    # The submission number is the freshly assigned workflow id.
    submission_data['acquisition_source']['submission_number'] = str(obj.id)
    obj.data = submission_data
    obj.extra_data['formdata'] = payload['form_data']
    # Snapshot data + extra_data so the workflow can later be replayed from scratch.
    obj.extra_data['source_data'] = {
        'extra_data': copy.deepcopy(obj.extra_data),
        'data': copy.deepcopy(obj.data),
    }

    obj.save()
    db.session.commit()

    obj_id = obj.id

    start.delay("article", object_id=obj.id)

    return jsonify({'workflow_object_id': obj_id})
def test_regression_store_record_does_not_commit_when_error(workflow_app):
    """Regression test: a failure inside store_record must not commit a record.

    BUGFIX: the original used ``try/except`` with the assertions inside the
    ``except`` body, so if ``store_record`` unexpectedly succeeded the test
    passed without checking anything.  ``pytest.raises`` makes the absence of
    the exception a test failure, and the post-condition is asserted
    unconditionally afterwards.
    """
    data = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'document_type': ['article'],
        'titles': [{'title': 'title'}],
    }
    eng = MagicMock(workflow_definition=MagicMock(data_type='hep'))

    obj = workflow_object_class.create(data)

    # Precondition: nothing committed yet.
    assert RecordMetadata.query.count() == 0

    with patch.object(
        InspireRecord,
        'download_documents_and_figures',
        side_effect=Exception
    ):
        with pytest.raises(Exception):
            store_record(obj, eng)

    # The failed store must not have committed anything.
    assert RecordMetadata.query.count() == 0
def test_store_root_new_record(workflow_app):
    """With the merger flag on, store_root writes a new root entry for the head."""
    engine = MagicMock(workflow_definition=MagicMock(data_type='hep'))

    with patch.dict(current_app.config, {'FEATURE_FLAG_ENABLE_MERGER': True}):
        head = TestRecordMetadata.create_from_kwargs(index=False, has_pid=False)
        head_uuid = head.record_metadata.id

        wf = workflow_object_class.create(head.record_metadata.json)

        root = {
            'version': 'original',
            'acquisition_source': {'source': 'arXiv'}
        }
        wf.extra_data = {
            'head_uuid': str(head_uuid),
            'merger_root': root,
        }

        store_root(wf, engine)

        # The root must now be retrievable under the lowercased source name.
        root_entry = read_wf_record_source(head_uuid, 'arxiv')
        assert root_entry.json == root
def test_store_record_inspirehep_api_author_new(workflow_app):
    """A 201 from the authors REST endpoint populates control_number and head_uuid."""
    author = {
        '$schema': 'http://localhost:5000/schemas/records/authors.json',
        'name': {'value': 'Robert Johnson'}, '_collections': ['Authors']
    }
    wf = workflow_object_class.create({})
    wf.extra_data['is-update'] = False
    wf.data = author

    head_uuid = 'uuid_number_123456'
    control_number = 222

    engine = MagicMock(workflow_definition=MagicMock(data_type='authors'))
    config_override = {
        'FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT': True,
        'INSPIREHEP_URL': "http://web:8000",
    }
    with patch.dict(workflow_app.config, config_override):
        with requests_mock.Mocker() as mocker:
            mocker.register_uri(
                'POST',
                '{url}/authors'.format(url=workflow_app.config.get("INSPIREHEP_URL")),
                headers={'content-type': 'application/json'},
                status_code=201,
                json={
                    "metadata": {"control_number": control_number},
                    'id_': head_uuid,
                },
            )
            store_record(wf, engine)  # not throwing exception
    assert wf.data['control_number'] == control_number
    assert wf.extra_data['head_uuid'] == head_uuid
# Example #6
def submit():
    """Get form data and start workflow.

    Redirects to the book-parent success page when a chapter is submitted
    without a parent book; otherwise to the generic success page.
    """
    form = LiteratureForm(formdata=request.form)
    visitor = DataExporter()
    visitor.visit(form)

    workflow_object = workflow_object_class.create(
        data={},
        id_user=current_user.get_id(),
        data_type="hep"
    )
    workflow_object.extra_data['formdata'] = copy.deepcopy(visitor.data)
    visitor.data = normalize_formdata(workflow_object, visitor.data)
    workflow_object.data = formdata_to_model(workflow_object, visitor.data)
    # Snapshot of data + extra_data so the workflow can be replayed from scratch.
    workflow_object.extra_data['source_data'] = {
        'extra_data': copy.deepcopy(workflow_object.extra_data),
        'data': copy.deepcopy(workflow_object.data),
    }
    workflow_object.save()
    db.session.commit()

    # Start workflow. delayed=True will execute the workflow in the
    # background using, for example, Celery.
    start.delay("article", object_id=workflow_object.id)

    # BUGFIX: ``type_of_doc`` may be absent from the form data, in which case
    # ``'chapter' in None`` raised a TypeError.  Fall back to an empty value so
    # the membership test is always safe.
    if 'chapter' in (visitor.data.get('type_of_doc') or '') and not visitor.data.get('parent_book'):
        return redirect(url_for('.success_book_parent'))
    else:
        return redirect(url_for('.success'))
# Example #7
def submitnew():
    """Form action handler for INSPIRE author new form."""
    author_form = AuthorUpdateForm(formdata=request.form)
    exporter = DataExporter()
    exporter.visit(author_form)

    wf = workflow_object_class.create(
        data={},
        id_user=current_user.get_id(),
        data_type="authors"
    )
    wf.extra_data['formdata'] = copy.deepcopy(exporter.data)
    wf.data = formdata_to_model(wf, exporter.data)
    wf.save()
    db.session.commit()

    # Start workflow. delayed=True will execute the workflow in the
    # background using, for example, Celery.
    start.delay("author", object_id=wf.id)

    return render_template(
        'authors/forms/new_success.html',
        inspire_url=get_inspire_url(exporter.data),
    )
def test_has_same_source(app, simple_record):
    """Workflows with identical data share a source; a changed
    acquisition_source must not count as the same source.

    NOTE(review): relies on the holdingpen ES index being refreshed after the
    first workflow is saved — keep the save/refresh ordering intact.
    """
    # Halted workflow already sitting in the holdingpen.
    obj = workflow_object_class.create(
        data=simple_record,
        status=ObjectStatus.HALTED,
        data_type='hep',
    )
    obj_id = obj.id
    obj.save()
    es.indices.refresh('holdingpen-hep')

    # A second workflow with identical data matches the first one.
    obj2 = WorkflowObject.create(data=simple_record, data_type='hep')
    match_non_completed_wf_in_holdingpen(obj2, None)

    same_source_func = has_same_source('holdingpen_matches')

    assert same_source_func(obj2, None)
    assert obj2.extra_data['holdingpen_matches'] == [obj_id]

    # change source and match the wf in the holdingpen
    different_source_rec = dict(simple_record)
    different_source_rec['acquisition_source'] = {'source': 'different'}
    obj3 = WorkflowObject.create(data=different_source_rec, data_type='hep')

    # Still matches in the holdingpen, but the source check now fails.
    assert match_non_completed_wf_in_holdingpen(obj3, None)
    assert not same_source_func(obj3, None)
def test_previously_rejected_from_not_fully_harvested_category_is_not_auto_approved(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """A record previously rejected in a non-core category is matched against
    that rejection and therefore not auto-approved.

    NOTE(review): depends on the holdingpen ES index being refreshed after the
    rejected workflow is saved — keep the save/refresh ordering intact.
    """
    record, categories = core_record()
    # Move the record to a category outside the fully-harvested set.
    record["arxiv_eprints"][0]["categories"] = ["q-bio.GN"]

    # Seed the holdingpen with a completed-and-rejected workflow for the record.
    obj = workflow_object_class.create(
        data=record, status=ObjectStatus.COMPLETED, data_type="hep"
    )
    obj.extra_data["approved"] = False  # reject it
    obj.save()
    es.indices.refresh("holdingpen-hep")

    extra_config = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        "ARXIV_CATEGORIES": categories,
    }
    with workflow_app.app_context():
        with mock.patch.dict(workflow_app.config, extra_config):
            # Run a fresh article workflow on the same record.
            workflow_id = build_workflow(record).id
            eng_uuid = start("article", object_id=workflow_id)
            eng = WorkflowEngine.from_uuid(eng_uuid)
            obj2 = eng.processed_objects[0]
            # Matched against the earlier rejection, so no auto-approval.
            assert not obj2.extra_data["auto-approved"]
            assert len(obj2.extra_data["previously_rejected_matches"]) > 0
            assert obj2.status == ObjectStatus.COMPLETED
def test_stop_matched_holdingpen_wfs(app, simple_record):
    """Stopping matched holdingpen workflows completes them and records
    which workflow stopped them (``stopped-by-wf``).
    """
    # need to run a wf in order to assign to it the wf definition and a uuid
    # for it

    obj = workflow_object_class.create(
        data_type='hep',
        **simple_record
    )
    workflow_uuid = start('article', object_id=obj.id)
    eng = WorkflowEngine.from_uuid(workflow_uuid)
    obj = eng.processed_objects[0]
    # Halt it so it becomes a match candidate in the holdingpen.
    obj.status = ObjectStatus.HALTED
    obj.save()
    obj_id = obj.id
    es.indices.refresh('holdingpen-hep')

    # A second workflow with the same record matches the halted one.
    obj2 = WorkflowObject.create(data_type='hep', **simple_record)
    obj2_id = obj2.id

    match_non_completed_wf_in_holdingpen(obj2, None)
    assert obj2.extra_data['holdingpen_matches'] == [obj_id]

    stop_matched_holdingpen_wfs(obj2, None)

    # The matched workflow is now completed and points back at the stopper.
    stopped_wf = workflow_object_class.get(obj_id)
    assert stopped_wf.status == ObjectStatus.COMPLETED
    assert stopped_wf.extra_data['stopped-by-wf'] == obj2_id
def test_article_workflow_stops_when_record_is_not_valid(workflow_app):
    """Starting the article workflow with a schema-invalid record errors out."""
    record_missing_fields = {
        'document_type': [
            'article',
        ],
        'titles': [
            {'title': 'A title'},
        ],
    }

    workflow_obj = workflow_object_class.create(
        data=record_missing_fields,
        data_type='hep',
        id_user=1,
    )
    workflow_id = workflow_obj.id

    with pytest.raises(ValidationError):
        start('article', record_missing_fields, workflow_id)

    # Reload to observe the persisted error state.
    workflow_obj = workflow_object_class.get(workflow_id)

    assert workflow_obj.status == ObjectStatus.ERROR
    assert '_error_msg' in workflow_obj.extra_data
    assert 'required' in workflow_obj.extra_data['_error_msg']
# Example #12
def start_author_workflow():
    """Create an author workflow object from the request payload and start it."""
    submission_data = request.get_json()['data']
    wf = workflow_object_class.create(
        data={},
        # can be changed to get the user id from the current user once we implement authentication
        id_user=submission_data['acquisition_source']['internal_uid'],
        data_type='authors'
    )
    submission_data['acquisition_source']['submission_number'] = str(wf.id)
    wf.data = submission_data
    # A control number means we are updating an existing author record.
    wf.extra_data['is-update'] = bool(submission_data.get('control_number'))
    # Snapshot data + extra_data so the workflow can be replayed from scratch.
    wf.extra_data['source_data'] = {
        'data': copy.deepcopy(wf.data),
        'extra_data': copy.deepcopy(wf.extra_data)
    }

    wf.save()
    db.session.commit()

    wf_id = wf.id

    start.delay('author', object_id=wf.id)

    return jsonify({'workflow_object_id': wf_id})
def test_save_roots(workflow_app):
    """save_roots merges the update record's root entries onto the head record."""

    head = InspireRecord.create_or_update(fake_record('title1', 123), skip_files=False)
    head.commit()
    update = InspireRecord.create_or_update(fake_record('title2', 456), skip_files=False)
    update.commit()

    obj = workflow_object_class.create(
        data={},
        data_type='hep'
    )
    obj.extra_data['head_uuid'] = str(head.id)
    obj.extra_data['update_uuid'] = str(update.id)
    obj.save()

    # Union: keep the most recently created/updated root from each source.
    insert_wf_record_source(json={'version': 'original'}, record_uuid=head.id, source='arxiv')

    insert_wf_record_source(json={'version': 'updated'}, record_uuid=update.id, source='arxiv')

    insert_wf_record_source(json={'version': 'updated'}, record_uuid=update.id, source='publisher')

    save_roots(obj, None)

    # The newer 'arxiv' root (from the update) wins on the head...
    arxiv_rec = read_wf_record_source(head.id, 'arxiv')
    assert arxiv_rec.json == {'version': 'updated'}

    # ...and the 'publisher' root is moved over as well.
    pub_rec = read_wf_record_source(head.id, 'publisher')
    assert pub_rec.json == {'version': 'updated'}

    # The update record keeps no roots of its own afterwards.
    assert not read_wf_record_source(update.id, 'arxiv')
    assert not read_wf_record_source(update.id, 'publisher')
def test_store_record_does_not_raise_in_the_orcid_receiver(mock_attempt_push, app):
    """store_record succeeds even though the ORCID push receiver fires."""
    eng = MagicMock(workflow_definition=MagicMock(data_type='hep'))
    overrides = {
        'FEATURE_FLAG_ENABLE_ORCID_PUSH': True,
        'RECORDS_SKIP_FILES': False,
    }

    # A minimal literature record carrying an author with an ORCID id, so the
    # ORCID receiver is actually triggered on store.
    record = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'authors': [{
            'full_name': 'Patra, Asim',
            'ids': [{'schema': 'ORCID', 'value': '0000-0003-1166-2790'}],
        }],
        'document_type': ['article'],
        'titles': [{'title': 'title'}],
    }

    with patch.dict(current_app.config, overrides):
        obj = workflow_object_class.create(record)
        store_record(obj, eng)  # Does not raise.
def test_save_roots(workflow_app):
    """save_roots moves the update's root entries onto the head record.

    NOTE(review): a head entry with the same source takes precedence over the
    update's entry (the duplicate 'a' case below).
    """
    # XXX: for some reason, this must be internal.
    from inspirehep.modules.migrator.tasks import record_insert_or_replace

    head = record_insert_or_replace(fake_record('title1', 123))
    update = record_insert_or_replace(fake_record('title2', 456))

    obj = workflow_object_class.create(
        data={},
        data_type='hep'
    )
    obj.extra_data['head_uuid'] = str(head.id)
    obj.extra_data['update_uuid'] = str(update.id)
    obj.save()

    insert_wf_record_source(json={}, record_uuid=head.id, source='a')
    insert_wf_record_source(json={}, record_uuid=head.id, source='b')

    # this will not be saved because there's already an entry with source `a`
    insert_wf_record_source(json={}, record_uuid=update.id, source='a')
    insert_wf_record_source(json={}, record_uuid=update.id, source='c')

    save_roots(obj, None)

    # Head ends up with the union of sources: its own 'a'/'b' plus 'c' moved over.
    assert read_wf_record_source(str(head.id), 'a')
    assert read_wf_record_source(str(head.id), 'b')
    assert read_wf_record_source(str(head.id), 'c')
# Example #16
def test_audit(small_app):
    """Audit entries can be saved directly and via log_workflows_action.

    Two app contexts are used deliberately: the first creates the user and
    workflow object, later contexts re-open the session to write and query
    audit rows — keep the context boundaries intact.
    """
    user_id = None
    workflow_id = None
    with small_app.app_context():
        user = User(email="*****@*****.**", active=True)
        user.password = "******"
        db.session.add(user)

        workflows_object = workflow_object_class.create({}, data_type="hep")
        db.session.commit()

        # Capture the ids while the session is still open.
        user_id = user.id
        workflow_id = workflows_object.id

    with small_app.app_context():
        # Direct save through the WorkflowsAudit model.
        logging_info = {
            'object_id': workflow_id,
            'user_id': user_id,
            'score': 0.222113,
            'user_action': "Non-CORE",
            'decision': "Rejected",
            'source': "test",
            'action': "accept"
        }
        audit = WorkflowsAudit(**logging_info)
        audit.save()
        db.session.commit()

        assert WorkflowsAudit.query.count() == 1

        audit_entry = WorkflowsAudit.query.filter(
            WorkflowsAudit.object_id == workflow_id
        ).one()
        assert audit_entry
        assert audit_entry.action == "accept"
        assert audit_entry.score == 0.222113

    relevance_prediction = dict(
        max_score=0.222113, decision="Rejected"
    )
    with small_app.app_context():
        # Same score/decision recorded through the helper instead.
        log_workflows_action(
            action="accept_core",
            relevance_prediction=relevance_prediction,
            object_id=workflow_id,
            user_id=None,
            source="test",
            user_action="accept"
        )
        db.session.commit()

        assert WorkflowsAudit.query.count() == 2

        audit_entry = WorkflowsAudit.query.filter(
            WorkflowsAudit.action == "accept_core"
        ).one()
        assert audit_entry
        assert audit_entry.action == "accept_core"
        assert audit_entry.score == 0.222113
def test_is_stale_data_is_true(workflow_app):
    """Data is stale when the workflow holds an outdated head version id."""
    head = TestRecordMetadata.create_from_kwargs(index=False, has_pid=False)
    wf = workflow_object_class.create({})
    wf.extra_data['is-update'] = True
    wf.extra_data['head_uuid'] = head.record_metadata.id
    # One version behind the current record => stale.
    wf.extra_data['head_version_id'] = head.record_metadata.version_id - 1

    assert is_stale_data(wf, None)
# Example #18
 def create_wf(arxiv_id, control_number):
     """Create and save a COMPLETED 'hep' workflow for the given arXiv id and recid."""
     wf = workflow_object_class.create(
         data_type='hep',
         data={
             'arxiv_eprints': [{'value': arxiv_id}],
             'control_number': control_number
         }
     )
     # Mark as already finished so it only serves as match material.
     wf.status = ObjectStatus.COMPLETED
     wf.save()
def test_match_wf_in_error_goes_in_initial_state(workflow_app):
    """An INITIAL-state workflow in the holdingpen makes the article workflow error out."""
    record = generate_record()

    existing = workflow_object_class.create(data=record, data_type="hep")
    existing.status = ObjectStatus.INITIAL
    existing.save()
    es.indices.refresh("holdingpen-hep")

    with pytest.raises(WorkflowsError):
        new_workflow_id = build_workflow(record).id
        start("article", object_id=new_workflow_id)
def build_workflow(workflow_data, data_type='hep', **kwargs):
    """Create a workflow object pre-seeded with ``source_data`` for restarts."""
    source_data = {
        'data': deepcopy(workflow_data),
        'extra_data': {},
    }
    return workflow_object_class.create(
        data_type=data_type,
        data=workflow_data,
        extra_data={'source_data': source_data},
        **kwargs
    )
def start_merger(head_id, update_id, current_user_id=None):
    """Start a new ManualMerge workflow to merge two records manually.

    Args:
        head_id: the id of the first record to merge. This record is the one
            that will be updated with the new information.
        update_id: the id of the second record to merge. This record is the
            one that is going to be deleted and replaced by `head`.
        current_user_id: Id of the current user provided by the Flask app.

    Returns:
        (int): the current workflow object's id.
    """
    data = {
        'pid_type': 'lit',  # TODO: support
        'recid_head': head_id,
        'recid_update': update_id,
    }

    head = get_db_record('lit', head_id)
    update = get_db_record('lit', update_id)

    # The workflow object itself starts with no data; everything the merge
    # needs is carried in extra_data below.
    workflow_object = workflow_object_class.create(
        data=None,
        id_user=current_user_id,
        data_type='hep'
    )

    wf_id = workflow_object.id    # to retrieve it later
    workflow_object.extra_data.update(data)

    # Default the update's source to 'arxiv' when the record declares none.
    update_source = LiteratureReader(update).source
    update_source = update_source if update_source else 'arxiv'

    workflow_object.extra_data['update_source'] = update_source.lower()

    workflow_object.extra_data['head_control_number'] = head_id
    workflow_object.extra_data['update_control_number'] = update_id

    workflow_object.extra_data['head_uuid'] = str(head.id)
    workflow_object.extra_data['update_uuid'] = str(update.id)

    # Full record snapshots for the merge steps.
    workflow_object.extra_data['head'] = head
    workflow_object.extra_data['update'] = update

    workflow_object.save()

    start('manual_merge', object_id=wf_id)

    return wf_id
def workflow():
    """Yield a saved "hep" workflow object; delete it on teardown."""
    wf = workflow_object_class.create(
        data={},
        id_user=1,
        data_type="hep"
    )
    wf.save()
    db.session.commit()
    # Pretend the engine can always continue.
    wf.continue_workflow = lambda **args: True

    yield wf

    # Teardown: remove the row created above.
    WorkflowObjectModel.query.filter_by(id=wf.id).delete()
    db.session.commit()
# Example #23
def test_inspect_merge_view_returns_400(workflow_app):
    """inspect_merge returns 400 for a workflow without merger data."""
    factory = TestRecordMetadata.create_from_kwargs(
        json={'titles': [{'title': 'Curated version'}]}
    )

    wf = workflow_object_class.create(
        data=factory.record_metadata.json,
        data_type='hep',
    )
    wf.save()
    db.session.commit()

    with workflow_app.test_client() as client:
        response = client.get('/workflows/inspect_merge/{}'.format(wf.id))
        assert response.status_code == 400
def test_workflow_loads_from_source_data_fails_on_no_source_data(
    load_from_source_data_workflow,
    workflow_app,
    record_from_db,
):
    """load_source_data raises when ``source_data`` is absent from extra_data."""
    wf = workflow_object_class.create(
        data_type='hep',
        data=record_from_db,
        extra_data={},  # no ``source_data`` key
    )
    wf_id = wf.id

    with pytest.raises(ValueError) as exc:
        start('load_source_data', object_id=wf_id)

    assert exc.match(r'source_data.*missing')
def test_normalize_journal_titles_known_journals_with_ref(workflow_app, insert_journals_in_db):
    """Known journal titles are normalized to their short form while the
    existing ``journal_record`` $refs are preserved unchanged.
    """
    record = {
        "_collections": [
            "Literature"
        ],
        "titles": [
            "A title"
        ],
        "document_type": [
            "book",
            "note",
            "report"
        ],
        "publication_info": [
            {
                "journal_title": "A Test Journal1",
                "journal_record": {
                    "$ref": "http://localhost:5000/api/journals/1936475"
                }
            },
            {
                # Entry without a journal title: must be left alone.
                "cnum": "C01-01-01"
            },
            {
                "journal_title": "Test.Jou.2",
                "journal_record": {
                    "$ref": "http://localhost:5000/api/journals/1936476"
                }
            }
        ]
    }

    obj = workflow_object_class.create(
        data=record,
        id_user=1,
        data_type='hep'
    )

    normalize_journal_titles(obj, None)

    # Titles normalized, $refs untouched.
    assert obj.data['publication_info'][0]['journal_title'] == 'Test.Jou.1'
    assert obj.data['publication_info'][2]['journal_title'] == 'Test.Jou.2'
    assert obj.data['publication_info'][0]['journal_record'] == {'$ref': 'http://localhost:5000/api/journals/1936475'}
    assert obj.data['publication_info'][2]['journal_record'] == {'$ref': 'http://localhost:5000/api/journals/1936476'}
# Example #26
    def start_workflow_for_submission(self, endpoint, submission_data,
                                      control_number=None):
        """Create, save and asynchronously start a submission workflow.

        Args:
            endpoint: REST endpoint name; mapped to a workflow data type and
                workflow name via ``self.endpoint_to_data_type`` /
                ``self.endpoint_to_workflow_name``.
            submission_data: payload from the submitter; enriched in place
                with an ``acquisition_source`` block.
            control_number: if given, this is an update of an existing record.

        Returns:
            the id of the created workflow object.
        """
        workflow_object = workflow_object_class.create(
            data={},
            id_user=current_user.get_id(),
            data_type=self.endpoint_to_data_type[endpoint]
        )

        # Provenance of the submission; the submission number is the fresh
        # workflow id itself.
        submission_data['acquisition_source'] = dict(
            email=current_user.email,
            datetime=datetime.datetime.utcnow().isoformat(),
            method='submitter',
            submission_number=str(workflow_object.id),
            internal_uid=int(workflow_object.id_user),
        )

        orcid = self._get_user_orcid()
        if orcid:
            submission_data['acquisition_source']['orcid'] = orcid

        serializer = self._get_serializer_from_endpoint(endpoint)
        serialized_data = serializer().load(submission_data).data

        if control_number:
            serialized_data['control_number'] = int(control_number)

        workflow_object.data = serialized_data
        # A control number means this submission updates an existing record.
        workflow_object.extra_data['is-update'] = bool(control_number)

        # Snapshot data + extra_data so the workflow can be replayed from scratch.
        workflow_object.extra_data['source_data'] = {
            'data': copy.deepcopy(workflow_object.data),
            'extra_data': copy.deepcopy(workflow_object.extra_data)
        }

        workflow_object.save()
        db.session.commit()

        workflow_object_id = workflow_object.id

        start.delay(
            self.endpoint_to_workflow_name[endpoint], object_id=workflow_object.id)

        return workflow_object_id
# Example #27
def test_inspect_merge_view(workflow_app):
    """inspect_merge returns root/head/update/merged for a workflow that
    carries merger data.

    NOTE(review): the head snapshot is taken BEFORE the record is mutated and
    re-committed — keep that ordering intact.
    """
    factory = TestRecordMetadata.create_from_kwargs(
        json={'titles': [{'title': 'Curated version'}]}
    )

    obj = workflow_object_class.create(
        data=factory.record_metadata.json,
        data_type='hep',
    )
    obj.save()
    db.session.commit()

    # Snapshot the head, then bump the stored record to a newer revision.
    head = deepcopy(factory.record_metadata.json)
    factory.record_metadata.json['titles'][0]['title'] = 'second curated version'
    db.session.add(factory.record_metadata)
    db.session.commit()

    obj.extra_data['merger_root'] = {
        'titles': [{'title': 'Second version'}],
        'document_type': ['article'],
        '_collections': ['Literature'],
    }
    obj.extra_data['merger_original_root'] = {
        'titles': [{'title': 'First version'}],
        'document_type': ['article'],
        '_collections': ['Literature'],
    }
    obj.extra_data['merger_head_revision'] = factory.inspire_record.revision_id

    expected = {
        'root': obj.extra_data['merger_original_root'],
        'head': head,
        'update': obj.extra_data['merger_root'],
        'merged': factory.record_metadata.json
    }

    with workflow_app.test_client() as client:
        response = client.get('/workflows/inspect_merge/{}'.format(obj.id))
        assert response.status_code == 200
        assert json.loads(response.data) == expected
def test_authors_workflow_continues_when_record_is_valid(workflow_app, mocked_external_services):
    """A schema-valid author record halts (awaiting curation) without errors."""
    valid_record = {
        '_collections': ['Authors'],
        'name': {
            'preferred_name': 'John Smith',
            'value': 'Smith, John'
        }
    }

    wf = workflow_object_class.create(
        data=valid_record,
        data_type='authors',
        id_user=1,
    )

    start('author', valid_record, wf.id)

    # Reload to observe the persisted state.
    wf = workflow_object_class.get(wf.id)

    assert wf.status == ObjectStatus.HALTED
    assert '_error_msg' not in wf.extra_data
# Example #29
def submit():
    """Get form data and start workflow."""
    literature_form = LiteratureForm(formdata=request.form)
    exporter = DataExporter()
    exporter.visit(literature_form)

    wf = workflow_object_class.create(
        data={},
        id_user=current_user.get_id(),
        data_type="hep"
    )
    wf.extra_data['formdata'] = copy.deepcopy(exporter.data)
    wf.data = formdata_to_model(wf, exporter.data)
    wf.save()
    db.session.commit()

    # Start workflow. delayed=True will execute the workflow in the
    # background using, for example, Celery.
    start.delay("article", object_id=wf.id)

    return redirect(url_for('.success'))
def test_pending_holdingpen_matches_wf_if_not_completed(app, simple_record):
    """Only non-COMPLETED holdingpen workflows are considered matches.

    NOTE(review): relies on the ES index being refreshed after each status
    change — keep the save/refresh ordering intact.
    """
    # Halted workflow sitting in the holdingpen.
    obj = workflow_object_class.create(
        data=simple_record,
        status=ObjectStatus.HALTED,
        data_type='hep',
    )
    obj_id = obj.id
    obj.save()
    es.indices.refresh('holdingpen-hep')

    # A second workflow with the same record matches while obj is HALTED.
    obj2 = WorkflowObject.create(data=simple_record, data_type='hep')
    assert match_non_completed_wf_in_holdingpen(obj2, None)
    assert obj2.extra_data['holdingpen_matches'] == [obj_id]

    obj = workflow_object_class.get(obj_id)
    obj.status = ObjectStatus.COMPLETED
    obj.save()
    es.indices.refresh('holdingpen-hep')

    # doesn't match anymore because obj is COMPLETED
    assert not match_non_completed_wf_in_holdingpen(obj2, None)
def test_store_record_does_not_raise_in_the_orcid_receiver(
        mock_attempt_push, app):
    """store_record completes even with the ORCID push receiver enabled."""
    eng = MagicMock(workflow_definition=MagicMock(data_type='hep'))

    # A minimal literature record with an ORCID-identified author, so the
    # ORCID receiver is actually triggered on store.
    record = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'authors': [{
            'full_name': 'Patra, Asim',
            'ids': [{'schema': 'ORCID', 'value': '0000-0003-1166-2790'}],
        }],
        'document_type': ['article'],
        'titles': [{'title': 'title'}],
    }

    overrides = {
        'FEATURE_FLAG_ENABLE_ORCID_PUSH': True,
        'RECORDS_SKIP_FILES': False,
    }
    with patch.dict(current_app.config, overrides):
        obj = workflow_object_class.create(record)
        store_record(obj, eng)  # Does not raise.
def test_normalize_journal_titles_known_journals_no_ref(workflow_app, insert_journals_in_db):
    """Known journal titles are normalized and a ``journal_record`` $ref is
    added even when the input carried none.
    """
    record = {
        "_collections": [
            "Literature"
        ],
        "titles": [
            "A title"
        ],
        "document_type": [
            "book",
            "note",
            "report"
        ],
        "publication_info": [
            {
                "journal_title": "A Test Journal1"
            },
            {
                # Entry without a journal title: must be left alone.
                "cnum": "C01-01-01"
            },
            {
                "journal_title": "Test.Jou.2"
            }
        ]
    }

    obj = workflow_object_class.create(
        data=record,
        id_user=1,
        data_type='hep'
    )

    normalize_journal_titles(obj, None)

    # Titles normalized and $refs filled in from the journals database.
    assert obj.data['publication_info'][0]['journal_title'] == 'Test.Jou.1'
    assert obj.data['publication_info'][2]['journal_title'] == 'Test.Jou.2'
    assert obj.data['publication_info'][0]['journal_record'] == {'$ref': 'http://localhost:5000/api/journals/1936475'}
    assert obj.data['publication_info'][2]['journal_record'] == {'$ref': 'http://localhost:5000/api/journals/1936476'}
def test_normalize_journal_titles_unknown_journals_no_ref(workflow_app, insert_journals_in_db):
    """Unknown journal titles are left untouched and gain no $ref."""
    record = {
        "_collections": ["Literature"],
        "titles": ["A title"],
        "document_type": ["book", "note", "report"],
        "publication_info": [
            {"journal_title": "Unknown1"},
            {"cnum": "C01-01-01"},
            {"journal_title": "Unknown2"}
        ]
    }

    wf = workflow_object_class.create(
        data=record,
        id_user=1,
        data_type='hep'
    )

    normalize_journal_titles(wf, None)

    pub_info = wf.data['publication_info']
    assert pub_info[0]['journal_title'] == 'Unknown1'
    assert pub_info[2]['journal_title'] == 'Unknown2'
    assert 'journal_record' not in pub_info[0]
    assert 'journal_record' not in pub_info[2]
# Example #34
def test_store_record_inspirehep_api_literature_new(workflow_app):
    """A 201 from the literature REST endpoint populates control_number and head_uuid."""
    literature = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'titles': [{
            'title': 'Follow hour including staff wrong.'
        }],
        'document_type': ['article'],
        '_collections': ['Literature']
    }
    wf = workflow_object_class.create({})
    wf.extra_data['is-update'] = False
    wf.data = literature

    head_uuid = 'uuid_number_123456'
    control_number = 111

    engine = MagicMock(workflow_definition=MagicMock(data_type='hep'))
    overrides = {
        'FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT': True,
        'INSPIREHEP_URL': "http://web:8000",
    }
    with patch.dict(workflow_app.config, overrides):
        with requests_mock.Mocker() as mocker:
            mocker.register_uri(
                'POST',
                '{url}/literature'.format(url=workflow_app.config.get("INSPIREHEP_URL")),
                headers={'content-type': 'application/json'},
                status_code=201,
                json={
                    "metadata": {"control_number": control_number},
                    'uuid': head_uuid,
                })
            store_record(wf, engine)  # not throwing exception
    assert wf.data['control_number'] == control_number
    assert wf.extra_data['head_uuid'] == head_uuid
# Example #35
def test_authors_workflow_stops_when_record_is_not_valid(workflow_app):
    """Starting an author workflow on an invalid record raises and marks ERROR."""
    invalid_record = {
        'name': {'preferred_name': 'John Smith', 'value': 'Smith, John'},
    }

    workflow_object = workflow_object_class.create(
        data=invalid_record,
        data_type='authors',
        id_user=1,
    )
    object_id = workflow_object.id

    with pytest.raises(ValidationError):
        start('author', invalid_record, object_id)

    # Reload the object to observe the state left behind by the failure.
    workflow_object = workflow_object_class.get(object_id)

    assert workflow_object.status == ObjectStatus.ERROR
    assert '_error_msg' in workflow_object.extra_data
    assert 'required' in workflow_object.extra_data['_error_msg']
def test_store_record_inspirehep_api_author_new(workflow_app):
    """A new author record POSTed via REST stores control number and head uuid."""
    record_data = {
        '$schema': 'http://localhost:5000/schemas/records/authors.json',
        'name': {'value': 'Robert Johnson'},
        '_collections': ['Authors'],
    }
    workflow = workflow_object_class.create({})
    workflow.extra_data['is-update'] = False
    workflow.data = record_data

    expected_head_uuid = 'uuid_number_123456'
    expected_control_number = 222

    eng = MagicMock(workflow_definition=MagicMock(data_type='authors'))
    config_override = {
        'FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT': True,
        'INSPIREHEP_URL': "http://web:8000",
    }
    with patch.dict(workflow_app.config, config_override):
        with requests_mock.Mocker() as requests_mocker:
            authors_url = '{url}/authors'.format(
                url=workflow_app.config.get("INSPIREHEP_URL"))
            requests_mocker.register_uri(
                'POST',
                authors_url,
                headers={'content-type': 'application/json'},
                status_code=201,
                json={
                    "metadata": {"control_number": expected_control_number},
                    # the authors endpoint reports the record uuid as ``id_``
                    'id_': expected_head_uuid,
                },
            )
            store_record(workflow, eng)  # must not raise

    assert workflow.data['control_number'] == expected_control_number
    assert workflow.extra_data['head_uuid'] == expected_head_uuid
Exemple #37
0
def test_save_roots(workflow_app):
    """save_roots keeps, for each source, the latest root and attaches it to head."""
    head = InspireRecord.create_or_update(fake_record('title1', 123),
                                          skip_files=False)
    head.commit()
    update = InspireRecord.create_or_update(fake_record('title2', 456),
                                            skip_files=False)
    update.commit()

    merge_obj = workflow_object_class.create(data={}, data_type='hep')
    merge_obj.extra_data['head_uuid'] = str(head.id)
    merge_obj.extra_data['update_uuid'] = str(update.id)
    merge_obj.save()

    # Union: keep the most recently created/updated root from each source.
    # Insertion order matters, so the roots are created one by one, in order.
    roots = [
        (head.id, 'arxiv', {'version': 'original'}),
        (update.id, 'arxiv', {'version': 'updated'}),
        (update.id, 'publisher', {'version': 'updated'}),
    ]
    for record_uuid, source, root_json in roots:
        insert_wf_record_source(json=root_json,
                                record_uuid=record_uuid,
                                source=source)

    save_roots(merge_obj, None)

    # Both sources end up on the head record with the newest payload...
    arxiv_rec = read_wf_record_source(head.id, 'arxiv')
    assert arxiv_rec.json == {'version': 'updated'}

    pub_rec = read_wf_record_source(head.id, 'publisher')
    assert pub_rec.json == {'version': 'updated'}

    # ...and nothing remains attached to the update record.
    assert not read_wf_record_source(update.id, 'arxiv')
    assert not read_wf_record_source(update.id, 'publisher')
Exemple #38
0
def start_workflow_for_author_submission():
    """Create an ``author`` workflow object from the request payload and launch it."""
    submission_data = request.get_json()['data']

    workflow_object = workflow_object_class.create(
        data={},
        id_user=submission_data['acquisition_source']['internal_uid'],
        data_type='authors')

    # The object id only exists after creation, so backfill it into the data.
    submission_data['acquisition_source']['submission_number'] = str(
        workflow_object.id)
    workflow_object.data = submission_data

    is_update = bool(submission_data.get('control_number'))
    workflow_object.extra_data['is-update'] = is_update
    # Snapshot the pristine inputs (taken before 'source_data' itself exists).
    workflow_object.extra_data['source_data'] = {
        'data': copy.deepcopy(workflow_object.data),
        'extra_data': copy.deepcopy(workflow_object.extra_data)
    }

    workflow_object.save()
    db.session.commit()

    workflow_object_id = workflow_object.id

    # Run asynchronously; the response only carries the object id.
    start.delay('author', object_id=workflow_object.id)

    return jsonify({'workflow_object_id': workflow_object_id})
def test_save_roots(workflow_app):
    """save_roots merges update roots onto head, first entry per source wins."""
    # XXX: for some reason, this must be internal.
    from inspirehep.modules.migrator.tasks import record_insert_or_replace

    head = record_insert_or_replace(fake_record('title1', 123))
    update = record_insert_or_replace(fake_record('title2', 456))

    wf_obj = workflow_object_class.create(data={}, data_type='hep')
    wf_obj.extra_data['head_uuid'] = str(head.id)
    wf_obj.extra_data['update_uuid'] = str(update.id)
    wf_obj.save()

    # Insertion order matters: the update root for source ``a`` will not be
    # saved because there's already an entry with source ``a`` on head.
    for record_uuid, source in [(head.id, 'a'),
                                (head.id, 'b'),
                                (update.id, 'a'),
                                (update.id, 'c')]:
        insert_wf_record_source(json={}, record_uuid=record_uuid, source=source)

    save_roots(wf_obj, None)

    # All sources are now readable from the head record.
    for source in ('a', 'b', 'c'):
        assert read_wf_record_source(str(head.id), source)
Exemple #40
0
def test_previously_rejected_from_not_fully_harvested_category_is_not_auto_approved(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """A record matching a prior rejection is not auto-approved on re-harvest."""
    record, categories = core_record()
    record['arxiv_eprints'][0]['categories'] = ['q-bio.GN']

    # Seed the holdingpen with an already-completed, rejected workflow object.
    previously_rejected = workflow_object_class.create(
        data=record,
        status=ObjectStatus.COMPLETED,
        data_type='hep',
    )
    previously_rejected.extra_data['approved'] = False  # reject it
    previously_rejected.save()
    es.indices.refresh('holdingpen-hep')

    config_override = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        'ARXIV_CATEGORIES': categories,
    }
    with workflow_app.app_context():
        with mock.patch.dict(workflow_app.config, config_override):
            workflow_id = build_workflow(record).id
            eng_uuid = start('article', object_id=workflow_id)
            engine = WorkflowEngine.from_uuid(eng_uuid)
            processed = engine.processed_objects[0]

            assert not processed.extra_data['auto-approved']
            assert len(processed.extra_data['previously_rejected_matches']) > 0
            assert processed.status == ObjectStatus.COMPLETED
def test_is_stale_data_returns_false_if_is_update_is_falsy(workflow_app):
    """Without the ``is-update`` flag, is_stale_data reports the data as fresh."""
    TestRecordMetadata.create_from_kwargs(index=False, has_pid=False)
    workflow = workflow_object_class.create({})
    assert is_stale_data(workflow, None) is False
Exemple #42
0
def submit_results(job_id, errors, log_file, results_uri, results_data=None):
    """Receive the submission of the results of a crawl job.

    Then it spawns the appropriate workflow according to whichever workflow
    the crawl job specifies.

    :param job_id: Id of the crawler job.
    :param errors: Errors that happened, if any (seems ambiguous)
    :param log_file: Path to the log file of the crawler job.
    :param results_uri: URI to the file containing the results of the crawl
       job, namely the records extracted.
    :param results_data: Optional data payload with the results list, to skip
        retrieving them from the `results_uri`, useful for slow or unreliable
        storages.
    :raises CrawlerJobError: when ``errors`` is non-empty; the job is marked
        ERROR and committed before the exception is raised.
    """
    results_path = urlparse(results_uri).path
    job = CrawlerJob.get_by_job(job_id)
    job.logs = log_file
    job.results = results_uri

    # A failed crawl short-circuits: persist the ERROR status and bail out.
    if errors:
        job.status = JobStatus.ERROR
        job.save()
        db.session.commit()
        raise CrawlerJobError(str(errors))

    if results_data is None:
        results_data = _extract_results_data(results_path)

    for crawl_result in results_data:
        # Work on a copy so the caller's payload is never mutated.
        crawl_result = copy.deepcopy(crawl_result)
        try:
            _check_crawl_result_format(crawl_result)
        except KeyError as e:
            # Malformed result: replace it with a synthetic error result so
            # processing continues and the problem is recorded on the object.
            crawl_result = _crawl_result_from_exception(e, crawl_result)

        record = crawl_result.pop('record')
        crawl_errors = crawl_result['errors']

        current_app.logger.debug('Parsing record: {}'.format(record))
        # Each record gets its own engine instance of the job's workflow.
        engine = WorkflowEngine.with_name(job.workflow)
        engine.save()
        obj = workflow_object_class.create(data=record)
        obj.id_workflow = str(engine.uuid)
        if crawl_errors:
            # Keep the errored payload on the object for later inspection.
            obj.status = ObjectStatus.ERROR
            obj.extra_data['crawl_errors'] = crawl_result

        else:
            extra_data = {
                'crawler_job_id': job_id,
                'crawler_results_path': results_path,
            }
            # Crawler-provided extras ride along under a dedicated key.
            record_extra = record.pop('extra_data', {})
            if record_extra:
                extra_data['record_extra'] = record_extra

            # Snapshot of the pristine inputs; presumably used to restart the
            # workflow from scratch — confirm against the workflow definitions.
            obj.extra_data['source_data'] = {
                'data': copy.deepcopy(record),
                'extra_data': copy.deepcopy(extra_data),
            }
            obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        # Each object is committed individually before the next is processed.
        db.session.commit()

        # Link the workflow object back to the crawler job that produced it.
        crawler_object = CrawlerWorkflowObject(job_id=job_id, object_id=obj.id)
        db.session.add(crawler_object)
        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        # Only clean records are dispatched; errored ones stay in ERROR state.
        if not crawl_errors:
            start.apply_async(
                kwargs={
                    'workflow_name': job.workflow,
                    'object_id': obj.id,
                },
                queue=queue,
            )

    current_app.logger.info('Parsed {} records.'.format(len(results_data)))

    job.status = JobStatus.FINISHED
    job.save()
    db.session.commit()