def test_manual_merge_existing_records(workflow_app):
    """End-to-end check of the manual-merge workflow on two stored records.

    Inserts a HEAD and an UPDATE record with a conflicting ``core`` field,
    runs the merger workflow, then verifies the workflow completed, no prior
    roots existed, the update record was marked deleted, and the two records
    cross-link each other via ``deleted_records`` / ``new_record``.

    NOTE(review): an identical function with the same name is defined again
    below in this file; the later definition shadows this one, so only one
    copy is ever collected and run.
    """
    # XXX: for some reason, this must be internal.
    from inspirehep.modules.migrator.tasks import record_insert_or_replace

    json_head = fake_record('This is the HEAD', 1)
    json_update = fake_record('While this is the update', 2)

    # these two fields will create a merging conflict
    json_head['core'] = True
    json_update['core'] = False

    head = record_insert_or_replace(json_head)
    update = record_insert_or_replace(json_update)
    head_id = head.id
    update_id = update.id

    obj_id = start_merger(
        head_id=1,
        update_id=2,
        current_user_id=1,
    )

    do_resolve_manual_merge_wf(workflow_app, obj_id)

    # retrieve it again, otherwise Detached Instance Error
    obj = workflow_object_class.get(obj_id)

    assert obj.status == ObjectStatus.COMPLETED
    assert obj.extra_data['approved'] is True
    assert obj.extra_data['auto-approved'] is False

    # no root present before
    last_root = read_wf_record_source(head_id, 'arxiv')
    assert last_root is None

    root_update = read_wf_record_source(update_id, get_source(update))
    assert root_update is None

    # check that head's content has been replaced by merged
    deleted_record = RecordMetadata.query.filter_by(id=update_id).one()
    latest_record = get_db_record('lit', 1)

    assert deleted_record.json['deleted'] is True

    # check deleted record is linked in the latest one
    deleted_rec_ref = {'$ref': 'http://localhost:5000/api/literature/2'}
    assert [deleted_rec_ref] == latest_record['deleted_records']

    # check the merged record is linked in the deleted one
    new_record_metadata = {'$ref': 'http://localhost:5000/api/literature/1'}
    assert new_record_metadata == deleted_record.json['new_record']

    del latest_record['deleted_records']
    assert latest_record == obj.data  # -> resulted merged record
def test_manual_merge_existing_records(workflow_app):
    """Merge two existing records by hand and verify the merged outcome."""
    # XXX: for some reason, this must be internal.
    from inspirehep.modules.migrator.tasks import record_insert_or_replace

    head_json = fake_record('This is the HEAD', 1)
    update_json = fake_record('While this is the update', 2)

    # give the two records conflicting values to force a merge conflict
    head_json['core'] = True
    update_json['core'] = False

    head_rec = record_insert_or_replace(head_json)
    update_rec = record_insert_or_replace(update_json)
    head_uuid = head_rec.id
    update_uuid = update_rec.id

    wf_id = start_merger(
        head_id=1,
        update_id=2,
        current_user_id=1,
    )

    do_resolve_manual_merge_wf(workflow_app, wf_id)

    # fetch a fresh copy, otherwise Detached Instance Error
    wf_obj = workflow_object_class.get(wf_id)

    assert wf_obj.status == ObjectStatus.COMPLETED
    assert wf_obj.extra_data['approved'] is True
    assert wf_obj.extra_data['auto-approved'] is False

    # neither record had a root stored beforehand
    assert read_wf_record_source(head_uuid, 'arxiv') is None
    assert read_wf_record_source(update_uuid, get_source(update_rec)) is None

    # the head's content must have been replaced by the merged record
    replaced = RecordMetadata.query.filter_by(id=update_uuid).one()
    merged = get_db_record('lit', 1)

    assert replaced.json['deleted'] is True

    # the surviving record links the deleted one...
    expected_deleted_ref = {'$ref': 'http://localhost:5000/api/literature/2'}
    assert [expected_deleted_ref] == merged['deleted_records']

    # ...and the deleted record links back to the merged one
    expected_new_ref = {'$ref': 'http://localhost:5000/api/literature/1'}
    assert expected_new_ref == replaced.json['new_record']

    del merged['deleted_records']
    assert merged == wf_obj.data  # -> resulted merged record
def start_merger(head_id, update_id, current_user_id=None):
    """Start a new ManualMerge workflow to merge two records manually.

    Args:
        head_id: the id of the first record to merge. This record is the
            one that will be updated with the new information.
        update_id: the id of the second record to merge. This record is
            the one that is going to be deleted and replaced by `head`.
        current_user_id: Id of the current user provided by the Flask app.

    Returns:
        (int): the current workflow object's id.
    """
    head_record = get_db_record('lit', head_id)
    update_record = get_db_record('lit', update_id)

    wf_object = workflow_object_class.create(
        data=None,
        id_user=current_user_id,
        data_type='hep',
    )
    wf_object_id = wf_object.id  # to retrieve it later

    wf_object.extra_data.update({
        'pid_type': 'lit',  # TODO: support
        'recid_head': head_id,
        'recid_update': update_id,
    })

    # resolve the source identifiers up front so fewer requests are
    # needed later in the workflow
    source_of_head = get_head_source(head_record.id) or merger_get_source(head_record)
    source_of_update = get_source(update_record) or 'arxiv'

    extra = wf_object.extra_data
    extra['head_source'] = source_of_head.lower()
    extra['update_source'] = source_of_update.lower()
    extra['head_control_number'] = head_id
    extra['update_control_number'] = update_id
    extra['head_uuid'] = str(head_record.id)
    extra['update_uuid'] = str(update_record.id)
    extra['head'] = head_record
    extra['update'] = update_record

    wf_object.save()

    start('manual_merge', object_id=wf_object_id)

    return wf_object_id
def refextract(obj, eng):
    """Extract references from various sources and add them to the workflow.

    Runs ``refextract`` on both the PDF attached to the workflow and the
    references provided by the submitter, if any, then chooses the one
    that generated the most and attaches them to the workflow object.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None
    """
    # If raw references are already present, extract from those and stop.
    if 'references' in obj.data:
        raw_refs = dedupe_list(
            extract_references_from_raw_refs(obj.data['references']))
        obj.log.info('Extracted %d references from raw refs.',
                     len(raw_refs))
        obj.data['references'] = match_references(raw_refs)
        return

    source = get_source(obj.data)

    refs_from_pdf = []
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            pdf_refs = dedupe_list(
                extract_references_from_pdf(tmp_document, source))
            refs_from_pdf = match_references(pdf_refs)

    refs_from_text = []
    submitted_text = get_value(obj.extra_data, 'formdata.references')
    if submitted_text:
        text_refs = dedupe_list(
            extract_references_from_text(submitted_text, source))
        refs_from_text = match_references(text_refs)

    # Keep whichever extraction produced more references.
    if not refs_from_pdf and not refs_from_text:
        obj.log.info('No references extracted.')
    elif len(refs_from_pdf) > len(refs_from_text):
        obj.log.info('Extracted %d references from PDF.',
                     len(refs_from_pdf))
        obj.data['references'] = refs_from_pdf
    else:
        # here len(refs_from_text) >= len(refs_from_pdf) always holds
        obj.log.info('Extracted %d references from text.',
                     len(refs_from_text))
        obj.data['references'] = refs_from_text
def test_get_source():
    """``get_source`` reads the source out of ``acquisition_source``."""
    schema = load_schema('hep')
    subschema = schema['properties']['acquisition_source']

    record = {
        'acquisition_source': {
            'method': 'oai',
            'source': 'arxiv',
        },
    }
    # sanity-check the fixture against the schema first
    assert validate(record['acquisition_source'], subschema) is None

    assert get_source(record) == 'arxiv'
def merge_articles(obj, eng):
    """Merge two articles.

    The workflow payload is overwritten by the merged record, the
    conflicts are stored in ``extra_data.conflicts``. Also, it adds a
    ``callback_url`` which contains the endpoint which resolves the
    merge conflicts.

    Note:
        When the feature flag ``FEATURE_FLAG_ENABLE_MERGER`` is ``False``
        it will skip the merge.

    Args:
        obj: a workflow object; ``obj.data`` is the incoming update record.
        eng: a workflow engine (unused here).

    Returns:
        None
    """
    if not current_app.config.get('FEATURE_FLAG_ENABLE_MERGER'):
        return None

    # the record previously approved as the match is the merge head
    matched_control_number = obj.extra_data['matches']['approved']

    head_uuid = PersistentIdentifier.get(
        'lit', matched_control_number).object_uuid

    obj.extra_data['head_uuid'] = str(head_uuid)

    head = InspireRecord.get_record(head_uuid)
    update = obj.data
    update_source = get_source(update).lower()
    head_root = read_wf_record_source(record_uuid=head.id, source=update_source)
    # fall back to an empty root when no source root was stored before
    head_root = head_root.json if head_root else {}

    # snapshot revision and root before merging — presumably used later
    # to detect concurrent edits / restore state; TODO confirm consumer
    obj.extra_data['merger_head_revision'] = head.revision_id
    obj.extra_data['merger_original_root'] = deepcopy(head_root)

    merged, conflicts = merge(
        head=head.dumps(),
        root=head_root,
        update=update,
    )

    # the workflow payload becomes the merged record
    obj.data = merged

    if conflicts:
        obj.extra_data['conflicts'] = conflicts
        obj.extra_data['callback_url'] = \
            get_resolve_merge_conflicts_callback_url()
    obj.save()
def is_arxiv_paper(obj, eng):
    """Check if a workflow contains a paper from arXiv.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: whether the workflow contains a paper from arXiv.
    """
    method = get_method(obj.data)
    source = get_source(obj.data)

    is_submission_with_arxiv = method == 'submitter' and 'arxiv_eprints' in obj.data

    # Bug fix: ``get_source`` can return a falsy value when the record has
    # no acquisition source (elsewhere in this file it is guarded with
    # ``or 'arxiv'`` / ``if not source``), so calling ``source.lower()``
    # unconditionally could raise AttributeError on None.
    is_harvested_from_arxiv = (
        method == 'hepcrawl' and bool(source) and source.lower() == 'arxiv'
    )

    return is_submission_with_arxiv or is_harvested_from_arxiv
def store_root(obj, eng):
    """Insert or update the current record head's root into the
    ``WorkflowsRecordSources`` table.

    Args:
        obj: a workflow object carrying ``merger_root`` and ``head_uuid``
            in its ``extra_data``.
        eng: a workflow engine (unused here).

    Returns:
        None
    """
    if not current_app.config.get('FEATURE_FLAG_ENABLE_MERGER', False):
        obj.log.info(
            'skipping storing source root, feature flag ``FEATURE_FLAG_ENABLE_MERGER`` is disabled.'
        )
        return

    root = obj.extra_data['merger_root']
    head_uuid = obj.extra_data['head_uuid']

    # Bug fix: the original computed ``get_source(root).lower()`` *before*
    # the ``if not source`` guard, so a record without a source raised
    # AttributeError on None instead of being skipped. Check first, then
    # normalize.
    source = get_source(root)
    if not source:
        return
    source = source.lower()

    root_record = WorkflowsRecordSources(
        source=get_source_for_root(source),
        record_uuid=head_uuid,
        json=root,
    )

    # merge (upsert) rather than add: a root for this head/source pair
    # may already exist
    db.session.merge(root_record)
    db.session.commit()