Example #1
0
def arxiv_refextract(obj, eng):
    """Run reference extraction on the arXiv PDF of a workflow object.

    The PDF is taken from ``obj.files`` when it is already attached under
    the expected filename; otherwise it is downloaded first. Any extracted
    references are stored in ``obj.data["references"]``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    if filename in obj.files:
        pdf = obj.files[filename]
    else:
        pdf = download_file_to_workflow(
            workflow=obj,
            name=filename,
            url=current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)
        )

    # A falsy result means there is no PDF to work with.
    if not pdf:
        obj.log.error("Not able to download and process the PDF")
        return

    references = extract_references(pdf.file.uri)
    if not references:
        obj.log.info("No references extracted")
        return

    obj.data["references"] = references
    obj.log.info("Extracted {0} references".format(len(references)))
Example #2
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball is taken from ``obj.files`` when already attached, otherwise
    it is downloaded. Every plot returned by ``process_tarball`` is attached
    to ``obj.files`` under its name, tagged with doctype ``"Plot"`` and a
    description made of its zero-padded index followed by its captions.

    Nothing is attached (and an error is logged) when the tarball is not
    available, is invalid, contains no TeX files, or plot conversion fails.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    from wand.exceptions import DelegateError

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id))
    else:
        tarball = obj.files[filename]

    # Same guard as arxiv_refextract applies to its PDF: without a tarball
    # the attribute access below would fail.
    if not tarball:
        obj.log.error("Not able to download and process the tarball")
        return

    try:
        plots = process_tarball(tarball.file.uri)
    except (InvalidTarball, NoTexFilesFound):
        obj.log.error('Invalid tarball {0}'.format(tarball.file.uri))
        return
    except DelegateError as err:
        obj.log.error("Error extracting plots. Report and skip.")
        current_app.logger.exception(err)
        return

    for idx, plot in enumerate(plots):
        plot_name = plot.get('name')
        # Read the image as bytes and close the handle right away; BytesIO
        # needs the file *content*, not the file object itself.
        with open(plot.get('url'), 'rb') as plot_file:
            obj.files[plot_name] = BytesIO(plot_file.read())
        obj.files[plot_name]["doctype"] = "Plot"
        obj.files[plot_name]["description"] = "{0:05d} {1}".format(
            idx, "".join(plot.get('captions', [])))
    obj.log.info("Added {0} plots.".format(len(plots)))
Example #3
0
def arxiv_refextract(obj, eng):
    """Run reference extraction on the arXiv PDF of a workflow object.

    The PDF is taken from ``obj.files`` when it is already attached under
    the expected filename; otherwise it is downloaded first. Any extracted
    references are stored in ``obj.extra_data["references"]``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    if filename in obj.files:
        pdf = obj.files[filename]
    else:
        pdf = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)
        )

    # A falsy result means there is no PDF to work with.
    if not pdf:
        obj.log.error("Not able to download and process the PDF")
        return

    references = extract_references(pdf.file.uri)
    if not references:
        obj.log.info("No references extracted")
        return

    # FIXME For now we do not add these references to the final record.
    obj.extra_data["references"] = references
    obj.log.info("Extracted {0} references".format(len(references)))
Example #4
0
def is_arxiv_paper(obj, *args, **kwargs):
    """Tell whether the record has an arXiv identifier or arXiv categories.

    :param obj: object whose ``data`` attribute holds the record
    :return: ``True`` when either a cleaned arXiv ID or a value under
        ``arxiv_eprints.categories`` is found, ``False`` otherwise
    """
    found_id = get_clean_arXiv_id(obj.data)
    found_categories = get_value(obj.data, 'arxiv_eprints.categories')

    return bool(found_id or found_categories)
Example #5
0
def match_by_arxiv_id(record):
    """Search for matches on the record's arXiv identifier.

    :param record: record from which the arXiv ID is read
    :return: the result of ``search`` on a ``035:"<id>"`` query, or an
        empty list when the record has no arXiv ID
    """
    arxiv_id = get_clean_arXiv_id(record)
    if not arxiv_id:
        return []

    return search('035:"{0}"'.format(arxiv_id))
Example #6
0
def match_by_arxiv_id(record):
    """Search for matches on the record's arXiv identifier.

    :param record: record from which the arXiv ID is read
    :return: the result of ``search`` on a ``035:"<id>"`` query, or an
        empty list when the record has no arXiv ID
    """
    arxiv_id = get_clean_arXiv_id(record)
    if not arxiv_id:
        return []

    return search('035:"{0}"'.format(arxiv_id))
Example #7
0
def test_get_clean_arXiv_id_from_arxiv_eprints_with_oai_prefix():
    """The ``oai:arXiv.org:`` prefix is dropped from the eprint value."""
    eprint = {'value': 'oai:arXiv.org:physics/0112006'}
    record = {'arxiv_eprints': [eprint]}

    assert 'physics/0112006' == get_clean_arXiv_id(record)
Example #8
0
def test_get_clean_arXiv_id_from_arxiv_eprints_using_new_style():
    """The ``arxiv:`` prefix is dropped from a new-style identifier."""
    eprint = {'value': 'arxiv:1002.2647'}
    record = {'arxiv_eprints': [eprint]}

    assert '1002.2647' == get_clean_arXiv_id(record)
Example #9
0
def test_get_clean_arXiv_id_from_arxiv_eprints_using_old_style():
    """An old-style identifier without prefix is returned unchanged."""
    eprint = {'value': 'physics/0112006'}
    record = {'arxiv_eprints': [eprint]}

    assert 'physics/0112006' == get_clean_arXiv_id(record)
Example #10
0
def arxiv_fulltext_download(obj, eng):
    """Download the arXiv fulltext PDF unless it is already attached.

    A freshly downloaded file gets its ``doctype`` set to ``"arXiv"``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))
    if filename in obj.files:
        # Already attached under this name: nothing to fetch.
        return

    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)
    pdf = download_file_to_record(record=obj, name=filename, url=pdf_url)
    pdf['doctype'] = "arXiv"
Example #11
0
def test_get_clean_arXiv_id_from_arxiv_eprints_selects_first():
    """With several eprints, the identifier of the first one is returned."""
    first = {'value': 'oai:arXiv.org:0801.4782'}
    second = {'value': 'oai:arXiv.org:0805.1410'}
    record = {'arxiv_eprints': [first, second]}

    assert '0801.4782' == get_clean_arXiv_id(record)
Example #12
0
def arxiv_fulltext_download(obj, eng):
    """Download the arXiv fulltext PDF unless it is already attached.

    A freshly downloaded file gets its ``doctype`` set to ``"arXiv"``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))
    if filename in obj.files:
        # Already attached under this name: nothing to fetch.
        return

    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)
    pdf = download_file_to_record(record=obj, name=filename, url=pdf_url)
    pdf['doctype'] = "arXiv"
Example #13
0
    def _author_list(obj, eng):
        """Update ``obj.data`` with an author list found in the arXiv tarball.

        The tarball is taken from ``obj.files`` or downloaded, then unpacked
        next to itself in a ``<tarball uri>_files`` directory. The first
        ``.xml`` file whose content matches ``REGEXP_AUTHLIST`` is converted
        with ``stylesheet`` and merged into ``obj.data``; remaining files are
        not inspected.

        NOTE(review): ``stylesheet`` is not defined in this function; it is
        presumably bound in the enclosing scope -- confirm.

        :param obj: Workflow Object to process
        :param eng: Workflow Engine processing the object
        """
        from inspirehep.modules.converter import convert

        arxiv_id = get_clean_arXiv_id(obj.data)
        filename = secure_filename("{0}.tar.gz".format(arxiv_id))
        if filename not in obj.files:
            tarball = download_file_to_workflow(
                workflow=obj,
                name=filename,
                url=current_app.config['ARXIV_TARBALL_URL'].format(
                    arxiv_id=arxiv_id
                )
            )
        else:
            tarball = obj.files[filename]

        sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
        try:
            file_list = untar(tarball.file.uri, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [path for path in file_list
                          if path.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # The context manager closes the handle even if read() raises.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authors_rec = create_record(authors_xml)
                authorlist_record = hep.do(authors_rec)
                obj.data.update(authorlist_record)
                # Only the first matching file is used.
                break
Example #14
0
    def _author_list(obj, eng):
        """Update ``obj.data`` with an author list found in the arXiv tarball.

        The tarball is taken from ``obj.files`` or downloaded, then unpacked
        next to itself in a ``<tarball uri>_files`` directory. The first
        ``.xml`` file whose content matches ``REGEXP_AUTHLIST`` is converted
        with ``stylesheet`` and merged into ``obj.data``; remaining files are
        not inspected.

        NOTE(review): ``stylesheet`` is not defined in this function; it is
        presumably bound in the enclosing scope -- confirm.

        :param obj: Workflow Object to process
        :param eng: Workflow Engine processing the object
        """
        from inspirehep.modules.converter import convert

        arxiv_id = get_clean_arXiv_id(obj.data)
        filename = secure_filename("{0}.tar.gz".format(arxiv_id))
        if filename not in obj.files:
            tarball = download_file_to_record(
                record=obj,
                name=filename,
                url=current_app.config['ARXIV_TARBALL_URL'].format(
                    arxiv_id=arxiv_id
                )
            )
        else:
            tarball = obj.files[filename]

        sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
        try:
            file_list = untar(tarball.file.uri, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [path for path in file_list
                          if path.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # The context manager closes the handle even if read() raises.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authors_rec = create_record(authors_xml)
                authorlist_record = hep.do(authors_rec)
                obj.data.update(authorlist_record)
                # Only the first matching file is used.
                break
Example #15
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball is taken from ``obj.files`` when already attached, otherwise
    it is downloaded. Every plot returned by ``process_tarball`` is attached
    to ``obj.files`` under its name, tagged with doctype ``"Plot"`` and a
    description made of its zero-padded index followed by its captions.

    Nothing is attached (and an error is logged) when the tarball is not
    available, is invalid, contains no TeX files, or plot conversion fails.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    from wand.exceptions import DelegateError

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    # Same guard as arxiv_refextract applies to its PDF: without a tarball
    # the attribute access below would fail.
    if not tarball:
        obj.log.error("Not able to download and process the tarball")
        return

    try:
        plots = process_tarball(tarball.file.uri)
    except (InvalidTarball, NoTexFilesFound):
        obj.log.error(
            'Invalid tarball {0}'.format(tarball.file.uri)
        )
        return
    except DelegateError as err:
        obj.log.error("Error extracting plots. Report and skip.")
        current_app.logger.exception(err)
        return

    for idx, plot in enumerate(plots):
        plot_name = plot.get('name')
        # Read the image as bytes and close the handle right away; BytesIO
        # needs the file *content*, not the file object itself.
        with open(plot.get('url'), 'rb') as plot_file:
            obj.files[plot_name] = BytesIO(plot_file.read())
        obj.files[plot_name]["doctype"] = "Plot"
        obj.files[plot_name]["description"] = "{0:05d} {1}".format(
            idx, "".join(plot.get('captions', []))
        )
    obj.log.info("Added {0} plots.".format(len(plots)))
Example #16
0
def test_get_clean_arXiv_id_returns_none_when_no_arxiv_eprints():
    """A record without ``arxiv_eprints`` yields ``None``."""
    empty_record = {}

    result = get_clean_arXiv_id(empty_record)

    assert result is None
Example #17
0
def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv.

    :param obj: object whose ``data`` attribute holds the record
    :return: ``True`` when the first ``arxiv_eprints.categories`` entry is
        non-empty; otherwise whatever ``get_clean_arXiv_id`` returns for
        the record (so the result is truthy/falsy, not strictly a bool)
    """
    categories = get_value(obj.data, "arxiv_eprints.categories", [[]])
    if categories[0]:
        return True
    return get_clean_arXiv_id(obj.data)
Example #18
0
def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv.

    :param obj: object whose ``data`` attribute holds the record
    :return: ``True`` when the first ``arxiv_eprints.categories`` entry is
        non-empty; otherwise whatever ``get_clean_arXiv_id`` returns for
        the record (so the result is truthy/falsy, not strictly a bool)
    """
    categories = get_value(obj.data, "arxiv_eprints.categories", [[]])
    if categories[0]:
        return True
    return get_clean_arXiv_id(obj.data)
def test_arxiv_id_getter(arxiv_record, arxiv_record_old, arxiv_record_oai):
    """Check the cleaned arXiv ID for each fixture and for an empty record."""
    cases = (
        ("1002.2647", arxiv_record),
        ("physics/0112006", arxiv_record_old),
        ("physics/0112006", arxiv_record_oai),
    )
    for expected, record in cases:
        assert expected == get_clean_arXiv_id(record)

    assert get_clean_arXiv_id({}) is None