Python match_references Beispiele, inspirehep.modules.refextract.matcher.match_references Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: actions.py Projekt: theleestarr/inspire-next

def refextract(obj, eng):
    """Populate ``obj.data['references']`` with matched references.

    When the workflow data already carries a ``references`` key, those raw
    references are extracted, deduplicated, matched and written back, and
    nothing else is attempted.

    Otherwise references are extracted from the document attached to the
    workflow (when there is one) and from the ``formdata.references`` value
    of ``obj.extra_data`` (when it is truthy). Each set is deduplicated and
    matched; the larger set is stored, with the text set winning ties. When
    both sets are empty ``obj.data`` is left untouched.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        None

    """
    if 'references' in obj.data:
        raw_refs = extract_references_from_raw_refs(obj.data['references'])
        unique_raw_refs = dedupe_list(raw_refs)
        obj.log.info('Extracted %d references from raw refs.',
                     len(unique_raw_refs))
        obj.data['references'] = match_references(unique_raw_refs)
        return

    from_pdf = []
    from_text = []
    source = LiteratureReader(obj.data).source

    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            extracted = extract_references_from_pdf(tmp_document, source)
            from_pdf = match_references(dedupe_list(extracted))

    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        extracted = extract_references_from_text(text, source)
        from_text = match_references(dedupe_list(extracted))

    pdf_count = len(from_pdf)
    text_count = len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    if pdf_count > text_count:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
    else:
        # Ties go to the references supplied as text.
        obj.log.info('Extracted %d references from text.', text_count)
        obj.data['references'] = from_text

Beispiel #2

0

Datei anzeigen

Datei: test_refextract_matcher.py Projekt: harunurhan/inspire-next

def test_match_references_finds_match_when_repeated_record_with_different_scores(
    mocked_inspire_matcher_match,
    isolated_app
):
    """A single journal reference ends up linked to literature record 1.

    The reference list has to validate against the ``references`` subschema
    of the ``hep`` schema both before and after matching.
    """
    references_schema = load_schema('hep')['properties']['references']
    publication_info = {
        'artid': '045',
        'journal_title': 'JHEP',
        'journal_volume': '06',
        'page_start': '045',
        'year': 2007,
    }
    unmatched = [{'reference': {'publication_info': publication_info}}]

    assert validate(unmatched, references_schema) is None

    matched = match_references(unmatched)

    assert len(matched) == 1
    expected_ref = 'http://localhost:5000/api/literature/1'
    assert matched[0]['record']['$ref'] == expected_ref
    assert validate(matched, references_schema) is None

Beispiel #3

0

Datei anzeigen

Datei: test_refextract_matcher.py Projekt: theleestarr/inspire-next

def test_match_references_finds_match_when_repeated_record_with_different_scores(
    mocked_inspire_matcher_match,
    isolated_app
):
    """Matching one JHEP reference yields a link to literature record 1.

    Schema validity of the reference list is checked before and after the
    call to ``match_references``.
    """
    subschema = load_schema('hep')['properties']['references']

    reference = {
        'publication_info': {
            'artid': '045',
            'journal_title': 'JHEP',
            'journal_volume': '06',
            'page_start': '045',
            'year': 2007,
        },
    }
    to_match = [{'reference': reference}]
    assert validate(to_match, subschema) is None

    result = match_references(to_match)

    assert len(result) == 1
    linked_record = result[0]['record']
    assert linked_record['$ref'] == 'http://localhost:5000/api/literature/1'
    assert validate(result, subschema) is None

Beispiel #4

0

Datei anzeigen

Datei: api.py Projekt: theleestarr/inspire-next

def refextract_url():
    """Extract, map and match the references found at a URL.

    The URL is read from the ``url`` key of the request JSON. Extraction runs
    with the knowledge-base files from ``local_refextract_kbs_path``; the
    result is mapped to the schema, matched and returned through ``jsonify``.
    """
    with local_refextract_kbs_path() as kbs_path:
        # The request is read inside the context, exactly where the
        # extraction call needs it.
        raw_references = extract_references_from_url(
            request.json['url'],
            override_kbs_files=kbs_path,
            reference_format=u'{title},{volume},{page}',
        )

    schema_references = map_refextract_to_schema(raw_references)
    return jsonify(match_references(schema_references))

Beispiel #5

0

Datei anzeigen

Datei: actions.py Projekt: harunurhan/inspire-next

def refextract(obj, eng):
    """Attach matched references to the workflow object.

    Raw references already present under ``obj.data['references']`` take
    precedence: they are extracted, deduplicated, matched and stored, and the
    function returns immediately.

    Without them, two candidate sets are built: one from the document
    attached to the workflow (if any) and one from the
    ``formdata.references`` value of ``obj.extra_data`` (if truthy). The set
    with more matched references is stored; on a tie the text set is used.
    If both are empty only a log message is emitted.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        None

    """
    if 'references' in obj.data:
        deduped = dedupe_list(
            extract_references_from_raw_refs(obj.data['references'])
        )
        obj.log.info('Extracted %d references from raw refs.', len(deduped))
        obj.data['references'] = match_references(deduped)
        return

    pdf_matches = []
    text_matches = []
    source = LiteratureReader(obj.data).source

    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            pdf_refs = extract_references_from_pdf(tmp_document, source)
            pdf_matches = match_references(dedupe_list(pdf_refs))

    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        text_refs = extract_references_from_text(text, source)
        text_matches = match_references(dedupe_list(text_refs))

    n_pdf = len(pdf_matches)
    n_text = len(text_matches)

    if n_pdf == 0 and n_text == 0:
        obj.log.info('No references extracted.')
    elif n_pdf > n_text:
        obj.log.info('Extracted %d references from PDF.', n_pdf)
        obj.data['references'] = pdf_matches
    else:
        # Equal counts fall through to the text references.
        obj.log.info('Extracted %d references from text.', n_text)
        obj.data['references'] = text_matches

Beispiel #6

0

Datei anzeigen

Datei: api.py Projekt: harunurhan/inspire-next

def refextract_text():
    """Extract, map and match the references found in a piece of text.

    The text is read from the ``text`` key of the request JSON. Extraction
    runs with the knowledge-base files from ``local_refextract_kbs_path``;
    the result is mapped to the schema, matched and returned through
    ``jsonify``.
    """
    with local_refextract_kbs_path() as kbs_path:
        # The request is read inside the context, exactly where the
        # extraction call needs it.
        raw_references = extract_references_from_string(
            request.json['text'],
            override_kbs_files=kbs_path,
            reference_format=u'{title},{volume},{page}',
        )

    schema_references = map_refextract_to_schema(raw_references)
    return jsonify(match_references(schema_references))

Beispiel #7

0

Datei anzeigen

Datei: test_refextract_matcher.py Projekt: harunurhan/inspire-next

def test_match_references_no_match_when_multiple_match_different_from_previous(isolated_app):
    """Test reference matcher for when inspire-matcher returns multiple matches
    where the matched record id is not the same as the previous matched record id.

    Two records sharing the same erratum publication info are created, so
    the reference pointing at that publication info must stay unlinked.
    """

    def erratum_publication_info():
        # A fresh dict per call, so the two records never share an object.
        return {
            'artid': '074',
            'journal_title': 'JHEP',
            'journal_volume': '05',
            'material': 'erratum',
            'page_start': '074',
            'year': 2017,
        }

    original_cited_record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        'publication_info': [
            {
                'artid': '159',
                'journal_title': 'JHEP',
                'journal_volume': '03',
                'page_start': '159',
                'year': 2016,
            },
            erratum_publication_info(),
        ],
    }
    errata_cited_record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'control_number': 2,
        'document_type': ['article'],
        'publication_info': [erratum_publication_info()],
    }

    # Creation order is kept: the original record first, then the erratum.
    for record_json in (original_cited_record_json, errata_cited_record_json):
        TestRecordMetadata.create_from_kwargs(
            json=record_json, index_name='records-hep')

    cited_publication_info = {
        'artid': '074',
        'journal_title': 'JHEP',
        'journal_volume': '05',
        'page_start': '074',
        'year': 2017,
    }
    unmatched = [{'reference': {'publication_info': cited_publication_info}}]

    references_schema = load_schema('hep')['properties']['references']
    assert validate(unmatched, references_schema) is None

    matched = match_references(unmatched)

    assert get_value(matched[0], 'record') is None
    assert validate(matched, references_schema) is None

Beispiel #8

0

Datei anzeigen

Datei: test_refextract_matcher.py Projekt: turtle321/inspire-next

def test_match_references_no_match_when_multiple_match_different_from_previous(
        isolated_app):
    """Test reference matcher for when inspire-matcher returns multiple matches
    where the matched record id is not the same as the previous matched record id.

    Records 1 and 2 both carry the JHEP 05 (2017) 074 erratum entry; the
    reference citing it is expected to come back without a ``record`` key.
    """
    hep_schema_url = 'http://localhost:5000/schemas/records/hep.json'

    first_publication = {
        'artid': '159',
        'journal_title': 'JHEP',
        'journal_volume': '03',
        'page_start': '159',
        'year': 2016,
    }
    erratum_of_original = {
        'artid': '074',
        'journal_title': 'JHEP',
        'journal_volume': '05',
        'material': 'erratum',
        'page_start': '074',
        'year': 2017,
    }
    erratum_standalone = {
        'artid': '074',
        'journal_title': 'JHEP',
        'journal_volume': '05',
        'material': 'erratum',
        'page_start': '074',
        'year': 2017,
    }

    original_cited_record_json = {
        '$schema': hep_schema_url,
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        'publication_info': [first_publication, erratum_of_original],
    }
    errata_cited_record_json = {
        '$schema': hep_schema_url,
        '_collections': ['Literature'],
        'control_number': 2,
        'document_type': ['article'],
        'publication_info': [erratum_standalone],
    }

    TestRecordMetadata.create_from_kwargs(
        json=original_cited_record_json,
        index_name='records-hep',
    )
    TestRecordMetadata.create_from_kwargs(
        json=errata_cited_record_json,
        index_name='records-hep',
    )

    citation = {
        'reference': {
            'publication_info': {
                'artid': '074',
                'journal_title': 'JHEP',
                'journal_volume': '05',
                'page_start': '074',
                'year': 2017,
            },
        },
    }
    to_match = [citation]

    subschema = load_schema('hep')['properties']['references']
    assert validate(to_match, subschema) is None

    result = match_references(to_match)

    assert get_value(result[0], 'record') is None
    assert validate(result, subschema) is None

Beispiel #9

0

Datei anzeigen

def get_linked_refs():
    """Match the references from the request JSON and return them.

    Reads the ``references`` key of the request JSON, passes it to
    ``match_references`` and returns the result under a ``references`` key
    through ``jsonify``.
    """
    payload = {'references': match_references(request.json['references'])}
    return jsonify(payload)

Beispiel #10

0

Datei anzeigen

Datei: api.py Projekt: harunurhan/inspire-next

def get_linked_refs():
    """Return the request's references after running them through the matcher.

    The ``references`` entry of the request JSON is handed to
    ``match_references``; the outcome is wrapped in a dict under the
    ``references`` key and serialised with ``jsonify``.
    """
    submitted_references = request.json['references']
    response_body = {'references': match_references(submitted_references)}
    return jsonify(response_body)

Beispiel #11

0

Datei anzeigen

Datei: actions.py Projekt: pazembrz/inspire-next

def match_references_based_on_flag(references):
    """Match references with the matcher selected by a feature flag.

    Uses ``match_references_hep`` when the application config value
    ``FEATURE_FLAG_ENABLE_MATCH_REFERENCES_HEP`` is truthy, and
    ``match_references`` otherwise.

    Args:
        references: the references to hand to the selected matcher.

    Returns:
        Whatever the selected matcher returns.
    """
    use_hep_matcher = current_app.config.get(
        "FEATURE_FLAG_ENABLE_MATCH_REFERENCES_HEP")
    matcher = match_references_hep if use_hep_matcher else match_references
    return matcher(references)