def refextract(obj, eng):
    """Attach extracted and matched references to the workflow object.

    If ``obj.data`` already has a ``references`` key, those raw references
    are re-extracted, deduplicated, matched and written back; no other
    source is consulted.

    Otherwise two candidate lists are built: one from the document yielded
    by ``get_document_in_workflow`` (when there is one) and one from the
    text stored under ``formdata.references`` in ``obj.extra_data`` (when
    it is non-empty).  The longer matched list is stored in
    ``obj.data['references']``; on a tie the text-based list is used.  When
    both lists are empty the record is left untouched.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this task).

    Returns:
        None
    """
    # Short-circuit: the record already carries references of its own.
    if 'references' in obj.data:
        raw_refs = extract_references_from_raw_refs(obj.data['references'])
        unique_raw_refs = dedupe_list(raw_refs)
        obj.log.info(
            'Extracted %d references from raw refs.',
            len(unique_raw_refs),
        )
        obj.data['references'] = match_references(unique_raw_refs)
        return

    from_pdf = []
    from_text = []
    source = get_source(obj.data)

    # Candidate 1: references found in the attached document, if any.
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            from_pdf = match_references(
                dedupe_list(extract_references_from_pdf(tmp_document, source))
            )

    # Candidate 2: references typed in by the submitter, if any.
    submitted_text = get_value(obj.extra_data, 'formdata.references')
    if submitted_text:
        from_text = match_references(
            dedupe_list(extract_references_from_text(submitted_text, source))
        )

    pdf_count = len(from_pdf)
    text_count = len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    if pdf_count > text_count:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
        return

    # Remaining case: the text produced at least as many references.
    obj.log.info('Extracted %d references from text.', text_count)
    obj.data['references'] = from_text
def test_match_references_finds_match_when_repeated_record_with_different_scores(
        mocked_inspire_matcher_match):
    """A single JHEP reference ends up linked to literature record 1.

    The input and the output of ``match_references`` must both validate
    against the ``references`` subschema of the ``hep`` schema.

    NOTE(review): ``mocked_inspire_matcher_match`` is not used in the body;
    presumably it is injected by a patch decorator or fixture that stubs the
    matcher -- confirm against the surrounding test module.
    """
    publication_info = {
        'artid': '045',
        'journal_title': 'JHEP',
        'journal_volume': '06',
        'page_start': '045',
        'year': 2007,
    }
    unmatched = [{'reference': {'publication_info': publication_info}}]

    references_subschema = load_schema('hep')['properties']['references']
    assert validate(unmatched, references_subschema) is None

    matched = match_references(unmatched)

    assert len(matched) == 1
    expected_ref = 'http://localhost:5000/api/literature/1'
    assert matched[0]['record']['$ref'] == expected_ref
    assert validate(matched, references_subschema) is None
def test_match_references_no_match_when_multiple_match_different_from_previous(
):
    """A reference whose publication info appears in two records stays unlinked.

    Two literature records are created: record 1 lists the original article
    plus its erratum, record 2 lists only the erratum.  A reference carrying
    the erratum's publication info is then matched, and the test asserts
    that no ``record`` key is set on it and that it still validates against
    the ``references`` subschema of the ``hep`` schema.
    """

    def erratum_info():
        # Fresh dict per call so the two records never share an object.
        return {
            'artid': '074',
            'journal_title': 'JHEP',
            'journal_volume': '05',
            'material': 'erratum',
            'page_start': '074',
            'year': 2017,
        }

    def literature_record(control_number, publication_info):
        return {
            '$schema': 'http://localhost:5000/schemas/records/hep.json',
            '_collections': ['Literature'],
            'control_number': control_number,
            'document_type': ['article'],
            'publication_info': publication_info,
        }

    original_article_info = {
        'artid': '159',
        'journal_title': 'JHEP',
        'journal_volume': '03',
        'page_start': '159',
        'year': 2016,
    }
    original_cited_record_json = literature_record(
        1, [original_article_info, erratum_info()])
    errata_cited_record_json = literature_record(2, [erratum_info()])

    # Same creation order as before: original first, erratum-only second.
    for record_json in (original_cited_record_json, errata_cited_record_json):
        TestRecordMetadata.create_from_kwargs(
            json=record_json, index_name='records-hep')

    # The reference itself carries no 'material' key.
    cited_publication = {
        'artid': '074',
        'journal_title': 'JHEP',
        'journal_volume': '05',
        'page_start': '074',
        'year': 2017,
    }
    unmatched = [{'reference': {'publication_info': cited_publication}}]

    references_subschema = load_schema('hep')['properties']['references']
    assert validate(unmatched, references_subschema) is None

    matched = match_references(unmatched)

    assert get_value(matched[0], 'record') is None
    assert validate(matched, references_subschema) is None