コード例 #1
0
ファイル: actions.py プロジェクト: pazembrz/inspire-next
def refextract(obj, eng):
    """Fill ``obj.data['references']`` from the best available source.

    If the record already has a ``references`` key, its entries are run
    through ``extract_references_from_raw_refs``, deduplicated, passed to
    ``match_references_based_on_flag`` and written back; nothing else is
    attempted.

    Otherwise references are extracted from the document attached to the
    workflow (if any) and from the ``formdata.references`` value found in
    ``obj.extra_data`` (if any). Both candidate lists are deduplicated and
    matched, and the longer one is stored. On a tie the text-based list is
    used. When both are empty the record is left untouched.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this task).

    Returns:
        None
    """
    if 'references' in obj.data:
        from_raw_refs = extract_references_from_raw_refs(
            obj.data['references'])
        deduped_raw_refs = dedupe_list(from_raw_refs)
        obj.log.info(
            'Extracted %d references from raw refs.', len(deduped_raw_refs))
        obj.data['references'] = match_references_based_on_flag(
            deduped_raw_refs)
        return

    source = LiteratureReader(obj.data).source

    # Candidate 1: references found in the attached document.
    from_pdf = []
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            extracted = extract_references_from_pdf(tmp_document, source)
            from_pdf = match_references_based_on_flag(dedupe_list(extracted))

    # Candidate 2: references typed in by the submitter.
    from_text = []
    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        extracted = extract_references_from_text(text, source)
        from_text = match_references_based_on_flag(dedupe_list(extracted))

    pdf_count, text_count = len(from_pdf), len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    if pdf_count > text_count:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
    else:
        # Ties are resolved in favour of the submitter-provided text.
        obj.log.info('Extracted %d references from text.', text_count)
        obj.data['references'] = from_text
コード例 #2
0
def test_get_document_in_workflow():
    """A lone fulltext document is yielded as its file URI."""
    fulltext_key = 'fulltext.xml'
    record = {
        'documents': [
            {'key': fulltext_key, 'fulltext': True},
        ],
    }

    workflow_files = MockFiles({})
    workflow_files[fulltext_key] = None

    obj = MockObj(record, {}, files=workflow_files)

    with get_document_in_workflow(obj) as local_file:
        expected_uri = workflow_files[fulltext_key].file.uri
        assert local_file == expected_uri
コード例 #3
0
def test_get_document_in_workflow():
    """Check that the only document of the workflow is the one handed back."""
    only_document = {'key': 'fulltext.xml', 'fulltext': True}
    data = {'documents': [only_document]}

    stored = MockFiles({})
    stored['fulltext.xml'] = None
    obj = MockObj(data, {}, files=stored)

    with get_document_in_workflow(obj) as document_uri:
        # The context manager must yield the URI of the stored file.
        assert document_uri == stored['fulltext.xml'].file.uri
コード例 #4
0
ファイル: actions.py プロジェクト: david-caro/inspire-next
def refextract(obj, eng):
    """Attach extracted references to the workflow record.

    If the record already has a ``references`` key, its entries are passed
    to ``extract_references_from_raw_refs`` and the result is written back;
    no other source is consulted.

    Otherwise references are extracted from the document attached to the
    workflow (if any) and from the ``formdata.references`` value in
    ``obj.extra_data`` (if any). A ``TimeoutError`` during either
    extraction is logged as an error and that source counts as empty. The
    longer of the two lists is stored, with the PDF list winning a tie.
    When both are empty the record is left untouched.

    Note:
        The comparison uses the number of *extracted* references; comparing
        the number of *matched* references might be preferable.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this task).

    Returns:
        None
    """
    if 'references' in obj.data:
        obj.log.info('Found references in metadata, extracting unextracted raw_refs')
        existing = obj.data['references']
        obj.data['references'] = extract_references_from_raw_refs(existing)
        return

    source = get_value(obj.data, 'acquisition_source.source')

    # Candidate 1: references found in the attached document.
    from_pdf = []
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            try:
                from_pdf = extract_references_from_pdf(tmp_document, source)
            except TimeoutError:
                obj.log.error('Timeout when extracting references from PDF.')

    # Candidate 2: references typed in by the submitter.
    from_text = []
    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        try:
            from_text = extract_references_from_text(text, source)
        except TimeoutError:
            obj.log.error('Timeout when extracting references from text.')

    pdf_count, text_count = len(from_pdf), len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    if pdf_count >= text_count:
        # Ties are resolved in favour of the PDF.
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
    else:
        obj.log.info('Extracted %d references from text.', text_count)
        obj.data['references'] = from_text
コード例 #5
0
def test_get_document_in_workflow_prefers_fulltext():
    """The document flagged ``fulltext`` wins even when it is listed second."""
    toc_key = 'table_of_contents.pdf'
    fulltext_key = 'fulltext.xml'
    record = {
        'documents': [
            {'key': toc_key},
            {'key': fulltext_key, 'fulltext': True},
        ],
    }

    workflow_files = MockFiles({})
    workflow_files[fulltext_key] = None
    workflow_files[toc_key] = None

    obj = MockObj(record, {}, files=workflow_files)

    with get_document_in_workflow(obj) as local_file:
        expected_uri = workflow_files[fulltext_key].file.uri
        assert local_file == expected_uri
コード例 #6
0
def test_get_document_in_workflow_prefers_fulltext():
    """Check that a ``fulltext`` document is chosen over an unflagged one."""
    unflagged = {'key': 'table_of_contents.pdf'}
    flagged = {'key': 'fulltext.xml', 'fulltext': True}
    data = {'documents': [unflagged, flagged]}

    stored = MockFiles({})
    stored['fulltext.xml'] = None
    stored['table_of_contents.pdf'] = None
    obj = MockObj(data, {}, files=stored)

    with get_document_in_workflow(obj) as document_uri:
        # The flagged document comes second in the list but must still win.
        assert document_uri == stored['fulltext.xml'].file.uri
コード例 #7
0
ファイル: actions.py プロジェクト: harunurhan/inspire-next
def refextract(obj, eng):
    """Fill ``obj.data['references']`` from the best available source.

    If the record already has a ``references`` key, its entries are run
    through ``extract_references_from_raw_refs``, deduplicated, passed to
    ``match_references`` and written back; nothing else is attempted.

    Otherwise references are extracted from the document attached to the
    workflow (if any) and from the ``formdata.references`` value found in
    ``obj.extra_data`` (if any). Both candidate lists are deduplicated and
    matched, and the longer one is stored. On a tie the text-based list is
    used. When both are empty the record is left untouched.

    Args:
        obj: a workflow object.
        eng: a workflow engine (not used by this task).

    Returns:
        None
    """
    if 'references' in obj.data:
        from_raw_refs = extract_references_from_raw_refs(obj.data['references'])
        deduped_raw_refs = dedupe_list(from_raw_refs)
        obj.log.info('Extracted %d references from raw refs.', len(deduped_raw_refs))
        obj.data['references'] = match_references(deduped_raw_refs)
        return

    source = LiteratureReader(obj.data).source

    # Candidate 1: references found in the attached document.
    from_pdf = []
    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            extracted = extract_references_from_pdf(tmp_document, source)
            from_pdf = match_references(dedupe_list(extracted))

    # Candidate 2: references typed in by the submitter.
    from_text = []
    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        extracted = extract_references_from_text(text, source)
        from_text = match_references(dedupe_list(extracted))

    pdf_count, text_count = len(from_pdf), len(from_text)

    if pdf_count == 0 and text_count == 0:
        obj.log.info('No references extracted.')
        return

    if pdf_count > text_count:
        obj.log.info('Extracted %d references from PDF.', pdf_count)
        obj.data['references'] = from_pdf
    else:
        # Ties are resolved in favour of the submitter-provided text.
        obj.log.info('Extracted %d references from text.', text_count)
        obj.data['references'] = from_text
コード例 #8
0
def test_get_document_in_workflow_takes_first_among_equals():
    """With no fulltext flag, the first listed document is used and an error logged."""
    first_key = 'table_of_contents.pdf'
    second_key = 'document.pdf'
    record = {
        'documents': [
            {'key': first_key},
            {'key': second_key},
        ],
    }

    workflow_files = MockFiles({})
    workflow_files[second_key] = None
    workflow_files[first_key] = None

    obj = MockObj(record, {}, files=workflow_files)

    with get_document_in_workflow(obj) as local_file:
        expected_uri = workflow_files[first_key].file.uri
        assert local_file == expected_uri

    logged_errors = obj.log._error.getvalue()
    assert 'More than one document in workflow, first one used' in logged_errors
コード例 #9
0
def test_get_document_in_workflow_takes_first_among_equals():
    """Check the tie-break between two documents that are both unflagged."""
    data = {
        'documents': [
            {'key': 'table_of_contents.pdf'},
            {'key': 'document.pdf'},
        ],
    }

    stored = MockFiles({})
    stored['document.pdf'] = None
    stored['table_of_contents.pdf'] = None
    obj = MockObj(data, {}, files=stored)

    with get_document_in_workflow(obj) as document_uri:
        # The entry listed first in ``documents`` is the one expected back.
        assert document_uri == stored['table_of_contents.pdf'].file.uri

    # The ambiguity must have been reported on the error log.
    expected_message = 'More than one document in workflow, first one used'
    assert expected_message in obj.log._error.getvalue()
コード例 #10
0
def test_get_document_in_workflow_returns_None_when_no_documents():
    """A workflow object with empty data and no files yields ``None``."""
    obj = MockObj({}, {}, files=MockFiles({}))

    with get_document_in_workflow(obj) as document:
        assert document is None
コード例 #11
0
def test_get_document_in_workflow_returns_None_when_no_documents():
    """Check that ``None`` is yielded when the record lists no documents."""
    empty_files = MockFiles({})
    empty_record, empty_extra_data = {}, {}
    obj = MockObj(empty_record, empty_extra_data, files=empty_files)

    with get_document_in_workflow(obj) as yielded:
        # Nothing to hand back: neither documents in data nor stored files.
        assert yielded is None