def test_refextract_references_from_text_removes_duplicate_urls():
    """A URL that appears twice in one reference line is reported only once.

    The extracted result must also validate against the ``references``
    sub-schema of the ``hep`` schema.
    """
    references_schema = load_schema('hep')['properties']['references']
    raw_reference = u'[4] CALICE Collaboration webpage. http://twiki.cern.ch/CALICE hello http://twiki.cern.ch/CALICE'

    extracted = extract_references_from_text(raw_reference)

    # ``validate`` returning ``None`` is what this test treats as success.
    assert validate(extracted, references_schema) is None

    first_reference_urls = extracted[0]['reference']['urls']
    assert len(first_reference_urls) == 1
Beispiel #2
0
def test_extract_references_from_text_handles_unicode():
    """Text with a non-ASCII character yields a schema-valid, non-empty result.

    The input contains the letter ``Ł``; the extracted list is checked against
    the ``references`` sub-schema of the ``hep`` schema.
    """
    references_schema = load_schema('hep')['properties']['references']
    raw_reference = u'Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581'

    extracted = extract_references_from_text(raw_reference)

    # ``validate`` returning ``None`` is what this test treats as success.
    assert validate(extracted, references_schema) is None
    assert len(extracted) > 0
Beispiel #3
0
def refextract(obj, eng):
    """Extract, dedupe and match references, then store them on the workflow.

    If ``obj.data`` already has a ``references`` key, those references are
    run through ``extract_references_from_raw_refs``, deduplicated, matched
    and written back; nothing else is attempted.

    Otherwise references are extracted from the document attached to the
    workflow (when there is one) and from the ``formdata.references`` text in
    ``obj.extra_data`` (when present). Each candidate list is deduplicated
    and matched, and the longer one is stored under ``obj.data['references']``.
    On a tie the text-based list wins. When both are empty ``obj.data`` is
    left untouched.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        None

    """
    if 'references' in obj.data:
        from_raw_refs = extract_references_from_raw_refs(
            obj.data['references'])
        unique_raw_refs = dedupe_list(from_raw_refs)
        obj.log.info('Extracted %d references from raw refs.',
                     len(unique_raw_refs))
        obj.data['references'] = match_references_based_on_flag(
            unique_raw_refs)
        return

    source = LiteratureReader(obj.data).source

    # Candidate 1: references found in the attached document, if any.
    from_pdf = []
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            unique_pdf_refs = dedupe_list(
                extract_references_from_pdf(tmp_document, source))
            from_pdf = match_references_based_on_flag(unique_pdf_refs)

    # Candidate 2: references typed in by the submitter, if any.
    from_text = []
    submitted_text = get_value(obj.extra_data, 'formdata.references')
    if submitted_text:
        unique_text_refs = dedupe_list(
            extract_references_from_text(submitted_text, source))
        from_text = match_references_based_on_flag(unique_text_refs)

    pdf_count, text_count = len(from_pdf), len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    # Strictly more PDF references are needed to prefer the PDF; ties go to
    # the submitter's text.
    if pdf_count > text_count:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
    else:
        obj.log.info('Extracted %d references from text.', text_count)
        obj.data['references'] = from_text
Beispiel #4
0
def refextract(obj, eng):
    """Extract references and attach the larger candidate set to the workflow.

    If ``obj.data`` already has a ``references`` key, it is replaced by the
    output of ``extract_references_from_raw_refs`` and the function returns.

    Otherwise references are extracted from the document attached to the
    workflow (when there is one) and from the ``formdata.references`` text in
    ``obj.extra_data`` (when present). A ``TimeoutError`` raised by either
    extraction is logged as an error and that candidate stays empty. The
    longer list is stored under ``obj.data['references']``; on a tie the
    PDF-based list wins. When both are empty ``obj.data`` is left untouched.

    Note:
        We might want to compare the number of *matched* references instead.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        None
    """
    if 'references' in obj.data:
        obj.log.info('Found references in metadata, extracting unextracted raw_refs')
        already_present = obj.data['references']
        obj.data['references'] = extract_references_from_raw_refs(already_present)
        return

    source = get_value(obj.data, 'acquisition_source.source')

    # Candidate 1: references found in the attached document, if any.
    from_pdf = []
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            try:
                from_pdf = extract_references_from_pdf(tmp_document, source)
            except TimeoutError:
                obj.log.error('Timeout when extracting references from PDF.')

    # Candidate 2: references typed in by the submitter, if any.
    from_text = []
    submitted_text = get_value(obj.extra_data, 'formdata.references')
    if submitted_text:
        try:
            from_text = extract_references_from_text(submitted_text, source)
        except TimeoutError:
            obj.log.error('Timeout when extracting references from text.')

    pdf_count, text_count = len(from_pdf), len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    # Text wins only when it produced strictly more references; ties go to
    # the PDF.
    if text_count > pdf_count:
        obj.log.info('Extracted %d references from text.', text_count)
        obj.data['references'] = from_text
    else:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
Beispiel #5
0
def refextract(obj, eng):
    """Extract references and attach the larger candidate set to the workflow.

    References are extracted from the PDF returned by ``get_pdf_in_workflow``
    (when there is one) and from the ``formdata.references`` text in
    ``obj.extra_data`` (when present). A ``TimeoutError`` raised by either
    extraction is logged as an error and that candidate stays empty. The PDF
    path is removed from disk after the extraction attempt if it still
    exists. The longer list is stored under ``obj.data['references']``; on a
    tie the PDF-based list wins. When both are empty ``obj.data`` is left
    untouched.

    Note:
        We might want to compare the number of *matched* references instead.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        None
    """
    source = get_value(obj.data, 'acquisition_source.source')

    # Candidate 1: references found in the workflow's PDF, if any.
    from_pdf = []
    tmp_pdf = get_pdf_in_workflow(obj)
    if tmp_pdf:
        try:
            from_pdf = extract_references_from_pdf(tmp_pdf, source)
        except TimeoutError:
            obj.log.error('Timeout when extracting references from PDF.')
        finally:
            # Clean up the file whether or not the extraction succeeded.
            if os.path.exists(tmp_pdf):
                os.unlink(tmp_pdf)

    # Candidate 2: references typed in by the submitter, if any.
    from_text = []
    submitted_text = get_value(obj.extra_data, 'formdata.references')
    if submitted_text:
        try:
            from_text = extract_references_from_text(submitted_text, source)
        except TimeoutError:
            obj.log.error('Timeout when extracting references from text.')

    pdf_count, text_count = len(from_pdf), len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    # Text wins only when it produced strictly more references; ties go to
    # the PDF.
    if text_count > pdf_count:
        obj.log.info('Extracted %d references from text.',
                     text_count)
        obj.data['references'] = from_text
    else:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
Beispiel #6
0
def refextract(obj, eng):
    """Extract, dedupe and match references, then store them on the workflow.

    If ``obj.data`` already has a ``references`` key, those references are
    run through ``extract_references_from_raw_refs``, deduplicated, passed to
    ``match_references`` and written back; nothing else is attempted.

    Otherwise references are extracted from the document attached to the
    workflow (when there is one) and from the ``formdata.references`` text in
    ``obj.extra_data`` (when present). Each candidate list is deduplicated
    and matched, and the longer one is stored under ``obj.data['references']``.
    On a tie the text-based list wins. When both are empty ``obj.data`` is
    left untouched.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this function).

    Returns:
        None

    """
    if 'references' in obj.data:
        unique_raw_refs = dedupe_list(
            extract_references_from_raw_refs(obj.data['references'])
        )
        obj.log.info('Extracted %d references from raw refs.', len(unique_raw_refs))
        obj.data['references'] = match_references(unique_raw_refs)
        return

    source = LiteratureReader(obj.data).source

    # Candidate 1: references found in the attached document, if any.
    from_pdf = []
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            extracted_from_pdf = extract_references_from_pdf(tmp_document, source)
            from_pdf = match_references(dedupe_list(extracted_from_pdf))

    # Candidate 2: references typed in by the submitter, if any.
    from_text = []
    submitted_text = get_value(obj.extra_data, 'formdata.references')
    if submitted_text:
        extracted_from_text = extract_references_from_text(submitted_text, source)
        from_text = match_references(dedupe_list(extracted_from_text))

    pdf_count, text_count = len(from_pdf), len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    # Strictly more PDF references are needed to prefer the PDF; ties go to
    # the submitter's text.
    if pdf_count > text_count:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
    else:
        obj.log.info('Extracted %d references from text.', text_count)
        obj.data['references'] = from_text
Beispiel #7
0
def test_extract_references_from_text_populates_raw_refs_source():
    """The ``source`` keyword is recorded on the first raw ref of the result."""
    raw_reference = u'Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581'

    extracted = extract_references_from_text(raw_reference, source='submitter')

    first_raw_ref = extracted[0]['raw_refs'][0]
    assert first_raw_ref['source'] == 'submitter'