Example #1
@contextmanager  # from contextlib; needed for the generator to act as a context manager
def get_document_in_workflow(obj):
    """Context manager giving the path to the document attached to a workflow object.

    Args:
        obj: a workflow object

    Yields:
        Optional[str]: the path to a local copy of the document. If no
        document is present, it yields None. If several documents are
        present, it prioritizes the fulltext. If several documents with
        the same priority are present, it takes the first one and logs
        an error.
    """
    documents = obj.data.get('documents', [])
    fulltexts = [
        document for document in documents if document.get('fulltext')
    ]
    documents = fulltexts or documents

    if not documents:
        obj.log.info('No document available')
        yield None
        return
    elif len(documents) > 1:
        obj.log.error('More than one document in workflow, first one used')

    key = documents[0]['key']
    obj.log.info('Using document with key "%s"', key)
    with retrieve_uri(obj.files[key].file.uri) as local_file:
        yield local_file
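
With the contextlib.contextmanager decoration in place, a hypothetical caller looks like the following sketch (document_size is illustrative, not part of this page):

import os

def document_size(obj):
    # Hypothetical caller of the example above; a yielded None is the
    # "no document attached" signal, per the docstring.
    with get_document_in_workflow(obj) as document_path:
        if document_path is None:
            return 0  # no document attached to this workflow object
        return os.path.getsize(document_path)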
Example #2
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        with TemporaryDirectory(prefix='plot_extract') as scratch_space:
            tarball_file = retrieve_uri(tarball.file.uri, outdir=scratch_space)
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            if 'figures' in obj.data:
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                if plot_name in obj.files.keys:
                    key = '{number}_{name}'.format(number=index,
                                                   name=plot_name)
                with open(plot.get('url')) as plot_file:
                    obj.files[key] = plot_file

                lb.add_figure(key=key,
                              caption=''.join(plot.get('captions', [])),
                              label=plot.get('label'),
                              material='preprint',
                              url='/api/files/{bucket}/{key}'.format(
                                  bucket=obj.files[key].bucket_id,
                                  key=key,
                              ))

            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))
Example #3
def get_pdf_in_workflow(obj):
    """Return the fullpath to the PDF attached to a workflow object"""
    for filename in obj.files.keys:
        if filename.endswith('.pdf'):
            return retrieve_uri(obj.files[filename].file.uri)

    obj.log.info('No PDF available')
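
This variant hands the caller whatever retrieve_uri returns; the try/finally variant of local_refextract_kbs_path below treats that value as a plain local path, so a hypothetical caller under the same assumption:

def workflow_pdf_looks_valid(obj):
    # Hypothetical helper; assumes this older retrieve_uri variant returns
    # a plain local path rather than a context manager.
    pdf_path = get_pdf_in_workflow(obj)
    if pdf_path is None:
        return False
    with open(pdf_path, 'rb') as pdf:
        return pdf.read(5) == b'%PDF-'  # magic bytes of a well-formed PDF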
Example #4
@contextmanager  # from contextlib; needed for the generator to act as a context manager
def local_refextract_kbs_path():
    """Get the path to the temporary refextract kbs from the application config.
    """
    journal_kb_path = current_app.config.get('REFEXTRACT_JOURNAL_KB_PATH')
    temp_journal_kb_path = retrieve_uri(journal_kb_path)
    try:
        yield {'journals': temp_journal_kb_path}
    finally:
        if os.path.exists(temp_journal_kb_path):
            os.unlink(temp_journal_kb_path)
Example #5
    def _author_list(obj, eng):
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found'
                % filename)
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_files_list = [
                path for path in file_list if path.endswith('.xml')
            ]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            extracted_authors = []
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )

                    extracted_authors.extend(
                        marcxml2record(authors_xml).get('authors', []))

            if extracted_authors:
                for author in extracted_authors:
                    author['full_name'] = decode_latex(author['full_name'])

                obj.data['authors'] = extracted_authors
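
The XMLSyntaxError fallback above handles arXiv sources that start with a literal %auto-ignore line, which is not valid XML. A self-contained sketch of the same trick, with invented sample content:

from lxml.etree import XMLSyntaxError, fromstring

# Invented sample: the leading '%auto-ignore' line breaks XML parsing,
# so the first line is dropped on failure, mirroring the fallback above.
xml_content = b'%auto-ignore\n<?xml version="1.0"?>\n<collaborationauthorlist/>'
try:
    tree = fromstring(xml_content)
except XMLSyntaxError:
    tree = fromstring(xml_content.split(b'\n', 1)[1])  # skip the first line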
Example #6
def test_retrieve_uri(tmpdir):
    test_file = tmpdir.join('file.txt')
    test_file.write('some content')

    uri = 'file://' + binary_type(test_file)

    with retrieve_uri(uri) as local_path, open(local_path) as local_file:
        path_copy = local_path
        assert local_file.read() == 'some content'

    assert not os.path.exists(path_copy)
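
This test pins down the contract of retrieve_uri: used as a context manager, it yields the path of a local copy and deletes that copy on exit. A minimal sketch consistent with the test and with the outdir keyword seen in other examples; it only handles file:// URIs and plain paths, and it is an assumption, not the real implementation:

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def retrieve_uri(uri, outdir=None):
    # Sketch only: copy the resource behind ``uri`` to a temporary file,
    # yield the local path, and remove the copy on exit.
    source = uri[len('file://'):] if uri.startswith('file://') else uri
    fd, local_path = tempfile.mkstemp(dir=outdir)
    os.close(fd)
    try:
        shutil.copy(source, local_path)
        yield local_path
    finally:
        if os.path.exists(local_path):
            os.unlink(local_path)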
Example #7
    def _author_list(obj, eng):
        arxiv_id = LiteratureReader(obj.data).arxiv_id
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found' % filename
            )
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_files_list = [path for path in file_list if path.endswith('.xml')]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            extracted_authors = []
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )

                    extracted_authors.extend(marcxml2record(authors_xml).get('authors', []))

            if extracted_authors:
                for author in extracted_authors:
                    author['full_name'] = decode_latex(author['full_name'])

                obj.data['authors'] = extracted_authors
Example #8
    def _author_list(obj, eng):
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        tarball = obj.files[filename]

        if tarball:
            with TemporaryDirectory(prefix='author_list') as scratch_space:
                tarball_file = retrieve_uri(
                    tarball.file.uri,
                    outdir=scratch_space,
                )
                try:
                    file_list = untar(tarball_file, scratch_space)
                except InvalidTarball:
                    obj.log.info(
                        'Invalid tarball %s for arxiv_id %s',
                        tarball.file.uri,
                        arxiv_id,
                    )
                    return
                obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

                xml_files_list = [
                    path for path in file_list if path.endswith('.xml')
                ]
                obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

                for xml_file in xml_files_list:
                    with open(xml_file, 'r') as xml_file_fd:
                        xml_content = xml_file_fd.read()

                    match = REGEXP_AUTHLIST.findall(xml_content)
                    if match:
                        obj.log.info('Found a match for author extraction')
                        try:
                            authors_xml = convert(xml_content, stylesheet)
                        except XMLSyntaxError:
                            # Probably the %auto-ignore comment exists, so we skip the
                            # first line. See: inspirehep/inspire-next/issues/2195
                            authors_xml = convert(
                                xml_content.split('\n', 1)[1],
                                stylesheet,
                            )
                        authorlist_record = marcxml2record(authors_xml)
                        obj.data.update(authorlist_record)
                        break
Example #9
def test_refextract_from_pdf(mock_get_pdf_in_workflow):
    mock_get_pdf_in_workflow.return_value = retrieve_uri(
        pkg_resources.resource_filename(
            __name__,
            os.path.join('fixtures', '1704.00452.pdf'),
        ))

    schema = load_schema('hep')
    subschema = schema['properties']['acquisition_source']

    data = {'acquisition_source': {'source': 'arXiv'}}
    extra_data = {}
    assert validate(data['acquisition_source'], subschema) is None

    obj = MockObj(data, extra_data)
    eng = MockEng()

    assert refextract(obj, eng) is None
    assert obj.data['references'][0]['raw_refs'][0]['source'] == 'arXiv'
Example #10
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            if 'figures' in obj.data:
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                if plot_name in obj.files.keys:
                    key = 'w{number}_{name}'.format(
                        number=index,
                        name=plot_name,
                    )
                with open(plot.get('url')) as plot_file:
                    obj.files[key] = plot_file

                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key,
                    )
                )

            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))
Example #11
@contextmanager  # from contextlib; needed for the generator to act as a context manager
def local_refextract_kbs_path():
    """Get the path to the temporary refextract kbs from the application config.
    """
    journal_kb_path = current_app.config.get('REFEXTRACT_JOURNAL_KB_PATH')
    with retrieve_uri(journal_kb_path) as temp_journal_kb_path:
        yield {'journals': temp_journal_kb_path}
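
A hypothetical caller for the example above, assuming the yielded dict matches refextract's override_kbs_files parameter (that parameter name is an assumption here, not something shown on this page):

from refextract import extract_references_from_file

def extract_references(document_path):
    # Hypothetical usage; override_kbs_files is assumed to be refextract's
    # hook for swapping in the custom journal knowledge base.
    with local_refextract_kbs_path() as kbs:
        return extract_references_from_file(
            document_path,
            override_kbs_files=kbs,
        )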
Example #12
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    # Crude way to set memory limits for wand globally.
    mem_limit = current_app.config.get("WAND_MEMORY_LIMIT")
    if mem_limit and limits['memory'] != mem_limit:
        limits['memory'] = mem_limit
        # Also set the disk limit; if left unset, wand swaps data to disk
        # instead of raising an exception.
        limits['disk'] = current_app.config.get("WAND_DISK_LIMIT", 0)
        # With both limits set, wand raises once they are exceeded, so at
        # least the workflow status gets saved.

    arxiv_id = LiteratureReader(obj.data).arxiv_id
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))

    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info('No file named=%s for arxiv_id %s', filename, arxiv_id)
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_name = os.path.basename(plot.get('url'))
            key = plot_name
            if plot_name in obj.files.keys:
                key = 'w{number}_{name}'.format(
                    number=index,
                    name=plot_name,
                )
            with open(plot.get('url')) as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[key].bucket_id,
                    key=key,
                )
            )

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))
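
The limits logic at the top of this variant reads two Flask config values; a hedged illustration of setting them (the byte values are invented, deployment-specific numbers):

# Illustrative configuration only; wand's limits mapping takes bytes.
WAND_MEMORY_LIMIT = 512 * 1024 * 1024       # cap ImageMagick memory use
WAND_DISK_LIMIT = 2 * 1024 * 1024 * 1024    # cap spill-to-disk before erroring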