def test_manual_merge_existing_records(workflow_app):
    """Run a manual merge of two stored records and verify the result.

    Two fake records with conflicting ``core`` values are inserted, a
    manual merge workflow is started for ids 1 and 2 and then resolved.
    The test checks the final workflow state, that no root was stored for
    either record, and that the two records reference each other through
    ``deleted_records`` / ``new_record``.
    """
    # XXX: for some reason, this import must stay inside the test body.
    from inspirehep.modules.migrator.tasks import record_insert_or_replace

    head_json = fake_record('This is the HEAD', 1)
    update_json = fake_record('While this is the update', 2)

    # opposite ``core`` values are what creates the merging conflict
    head_json['core'] = True
    update_json['core'] = False

    head = record_insert_or_replace(head_json)
    update = record_insert_or_replace(update_json)
    stored_head_id, stored_update_id = head.id, update.id

    obj_id = start_merger(head_id=1, update_id=2, current_user_id=1)

    do_resolve_manual_merge_wf(workflow_app, obj_id)

    # fetch the object again, otherwise Detached Instance Error
    obj = workflow_object_class.get(obj_id)

    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data['approved'] is True
    assert obj.extra_data['auto-approved'] is False

    # no root present before
    assert read_wf_record_source(stored_head_id, 'arxiv') is None
    assert read_wf_record_source(stored_update_id, get_source(update)) is None

    # check that head's content has been replaced by merged
    deleted_record = RecordMetadata.query.filter_by(
        id=stored_update_id,
    ).one()
    latest_record = get_db_record('lit', 1)

    assert deleted_record.json['deleted'] is True

    # the deleted record must be linked from the latest one
    expected_deleted_refs = [
        {'$ref': 'http://localhost:5000/api/literature/2'},
    ]
    assert expected_deleted_refs == latest_record['deleted_records']

    # the merged record must be linked from the deleted one
    expected_new_record = {'$ref': 'http://localhost:5000/api/literature/1'}
    assert expected_new_record == deleted_record.json['new_record']

    # apart from ``deleted_records`` the stored head is the merged payload
    del latest_record['deleted_records']
    assert latest_record == obj.data  # -> resulted merged record
def test_manual_merge_existing_records(workflow_app):
    """Check the end state of a resolved manual merge of two records.

    Inserts a head and an update record that disagree on ``core``, starts
    the merger for ids 1 and 2, resolves the workflow, and asserts on the
    workflow object, the (absent) stored roots and the cross references
    between the surviving and the deleted record.
    """
    # XXX: for some reason, this must be imported locally.
    from inspirehep.modules.migrator.tasks import record_insert_or_replace

    json_head = fake_record('This is the HEAD', 1)
    json_update = fake_record('While this is the update', 2)

    # these two fields will create a merging conflict
    json_head['core'] = True
    json_update['core'] = False

    head_rec = record_insert_or_replace(json_head)
    update_rec = record_insert_or_replace(json_update)
    head_rec_id = head_rec.id
    update_rec_id = update_rec.id

    merger_kwargs = {
        'head_id': 1,
        'update_id': 2,
        'current_user_id': 1,
    }
    obj_id = start_merger(**merger_kwargs)

    do_resolve_manual_merge_wf(workflow_app, obj_id)

    # retrieve it again, otherwise Detached Instance Error
    obj = workflow_object_class.get(obj_id)

    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data['approved'] is True
    assert obj.extra_data['auto-approved'] is False

    # no root present before
    head_root = read_wf_record_source(head_rec_id, 'arxiv')
    assert head_root is None
    update_root = read_wf_record_source(update_rec_id, get_source(update_rec))
    assert update_root is None

    # check that head's content has been replaced by merged
    deleted_record = RecordMetadata.query.filter_by(id=update_rec_id).one()
    latest_record = get_db_record('lit', 1)

    assert deleted_record.json['deleted'] is True

    # check deleted record is linked in the latest one
    ref_to_deleted = {'$ref': 'http://localhost:5000/api/literature/2'}
    assert [ref_to_deleted] == latest_record['deleted_records']

    # check the merged record is linked in the deleted one
    ref_to_merged = {'$ref': 'http://localhost:5000/api/literature/1'}
    assert ref_to_merged == deleted_record.json['new_record']

    # drop the link list so the record can be compared with the payload
    del latest_record['deleted_records']
    assert latest_record == obj.data  # -> resulted merged record
Ejemplo n.º 3
0
def start_merger(head_id, update_id, current_user_id=None):
    """Create and launch a ``manual_merge`` workflow for two records.

    Both records are loaded with the ``lit`` pid type. Their ids, sources
    and full content are stored in the new workflow object's
    ``extra_data`` before the workflow is started.

    Args:
        head_id: the id of the first record to merge. This record is the one
            that will be updated with the new information.
        update_id: the id of the second record to merge. This record is the
            one that is going to be deleted and replaced by `head`.
        current_user_id: Id of the current user provided by the Flask app.

    Returns:
        (int): the current workflow object's id.
    """
    head = get_db_record('lit', head_id)
    update = get_db_record('lit', update_id)

    workflow_object = workflow_object_class.create(
        data=None,
        id_user=current_user_id,
        data_type='hep',
    )
    wf_id = workflow_object.id  # kept so the workflow can be started below

    workflow_object.extra_data.update({
        'pid_type': 'lit',  # TODO: support
        'recid_head': head_id,
        'recid_update': update_id,
    })

    # resolve the sources up front to do as few requests as possible later
    head_source = get_head_source(head.id) or merger_get_source(head)
    # an update without a source is treated as coming from arXiv
    update_source = get_source(update) or 'arxiv'

    workflow_object.extra_data.update([
        ('head_source', head_source.lower()),
        ('update_source', update_source.lower()),
        ('head_control_number', head_id),
        ('update_control_number', update_id),
        ('head_uuid', str(head.id)),
        ('update_uuid', str(update.id)),
        ('head', head),
        ('update', update),
    ])

    workflow_object.save()

    start('manual_merge', object_id=wf_id)

    return wf_id
Ejemplo n.º 4
0
def refextract(obj, eng):
    """Extract references and attach the matched ones to the workflow.

    When the payload already has ``references``, they are re-extracted from
    their raw form, deduplicated, matched and written back, and nothing
    else is done. Otherwise references are extracted both from the document
    attached to the workflow (if any) and from the ``formdata.references``
    text in ``extra_data`` (if any); the larger matched set is stored, with
    ties going to the text references.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    if 'references' in obj.data:
        from_raw_refs = extract_references_from_raw_refs(
            obj.data['references'])
        unique_raw_refs = dedupe_list(from_raw_refs)
        obj.log.info('Extracted %d references from raw refs.',
                     len(unique_raw_refs))
        obj.data['references'] = match_references(unique_raw_refs)
        return

    source = get_source(obj.data)
    pdf_matches = []
    text_matches = []

    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            from_pdf = extract_references_from_pdf(tmp_document, source)
            pdf_matches = match_references(dedupe_list(from_pdf))

    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        from_text = extract_references_from_text(text, source)
        text_matches = match_references(dedupe_list(from_text))

    pdf_count = len(pdf_matches)
    text_count = len(text_matches)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    if pdf_count > text_count:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = pdf_matches
    else:
        # text wins whenever it yields at least as many references
        obj.log.info('Extracted %d references from text.', text_count)
        obj.data['references'] = text_matches
Ejemplo n.º 5
0
def test_get_source():
    """``get_source`` returns the ``acquisition_source.source`` value."""
    acquisition_source = {
        'method': 'oai',
        'source': 'arxiv',
    }

    # make sure the fixture is valid against the ``hep`` schema first
    subschema = load_schema('hep')['properties']['acquisition_source']
    assert validate(acquisition_source, subschema) is None

    record = {'acquisition_source': acquisition_source}

    assert 'arxiv' == get_source(record)
Ejemplo n.º 6
0
def merge_articles(obj, eng):
    """Merge the workflow payload into the approved matching record.

    The head is the record whose control number is stored in
    ``extra_data.matches.approved``; the root is the one stored for that
    head and the update's source, or an empty dict when there is none.
    The workflow payload is overwritten by the merged record, the conflicts
    are stored in ``extra_data.conflicts`` together with a ``callback_url``
    pointing to the endpoint which resolves the merge conflicts.

    Note:
        When the feature flag ``FEATURE_FLAG_ENABLE_MERGER`` is ``False`` it
        will skip the merge.

    """
    merger_enabled = current_app.config.get('FEATURE_FLAG_ENABLE_MERGER')
    if not merger_enabled:
        return None

    approved_control_number = obj.extra_data['matches']['approved']
    head_pid = PersistentIdentifier.get('lit', approved_control_number)
    head_uuid = head_pid.object_uuid

    obj.extra_data['head_uuid'] = str(head_uuid)

    head = InspireRecord.get_record(head_uuid)
    update = obj.data

    stored_root = read_wf_record_source(
        record_uuid=head.id,
        source=get_source(update).lower(),
    )
    if stored_root:
        head_root = stored_root.json
    else:
        head_root = {}

    # remember the state the merge started from
    obj.extra_data['merger_head_revision'] = head.revision_id
    obj.extra_data['merger_original_root'] = deepcopy(head_root)

    merged, conflicts = merge(head=head.dumps(), root=head_root, update=update)
    obj.data = merged

    if conflicts:
        callback_url = get_resolve_merge_conflicts_callback_url()
        obj.extra_data['conflicts'] = conflicts
        obj.extra_data['callback_url'] = callback_url

    obj.save()
Ejemplo n.º 7
0
def is_arxiv_paper(obj, eng):
    """Tell whether the workflow payload is an arXiv paper.

    A payload counts as an arXiv paper when its method is ``submitter`` and
    it has ``arxiv_eprints``, or when its method is ``hepcrawl`` and its
    source is ``arxiv`` (compared case-insensitively).

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: whether the workflow contains a paper from arXiv.

    """
    payload = obj.data
    method = get_method(payload)
    source = get_source(payload)

    submitted_with_arxiv = False
    if method == 'submitter':
        submitted_with_arxiv = 'arxiv_eprints' in payload

    harvested_from_arxiv = False
    if method == 'hepcrawl':
        harvested_from_arxiv = source.lower() == 'arxiv'

    return submitted_with_arxiv or harvested_from_arxiv
Ejemplo n.º 8
0
def store_root(obj, eng):
    """Save ``extra_data.merger_root`` as the head's root for its source.

    The root is merged into the ``WorkflowsRecordSources`` table, keyed by
    ``extra_data.head_uuid`` and the root's source, and the session is
    committed. Nothing is stored when the ``FEATURE_FLAG_ENABLE_MERGER``
    flag is off or when the root has an empty source.
    """
    merger_enabled = current_app.config.get('FEATURE_FLAG_ENABLE_MERGER', False)
    if not merger_enabled:
        obj.log.info(
            'skipping storing source root, feature flag ``FEATURE_FLAG_ENABLE_MERGER`` is disabled.'
        )
        return

    extra_data = obj.extra_data
    root = extra_data['merger_root']
    head_uuid = extra_data['head_uuid']

    source = get_source(root).lower()
    if source:
        db.session.merge(
            WorkflowsRecordSources(
                source=get_source_for_root(source),
                record_uuid=head_uuid,
                json=root,
            )
        )
        db.session.commit()