Python get_clean_arXiv_id Examples, inspirehep.utils.arxiv.get_clean_arXiv_id Python Examples

Example #1

0

Show file

def arxiv_refextract(obj, eng):
    """Pull the reference list out of the record's arXiv PDF.

    The PDF is looked up in ``obj.files`` under a sanitized
    ``<arxiv_id>.pdf`` name and is downloaded from the configured
    ``ARXIV_PDF_URL`` only when it is not attached yet. Any extracted
    references are stored under ``obj.data["references"]``; every
    outcome is reported through ``obj.log``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    if filename in obj.files:
        pdf = obj.files[filename]
    else:
        pdf_url = current_app.config['ARXIV_PDF_URL'].format(
            arxiv_id=arxiv_id
        )
        pdf = download_file_to_workflow(
            workflow=obj, name=filename, url=pdf_url
        )

    # Guard clauses: bail out as soon as there is nothing left to do.
    if not pdf:
        obj.log.error("Not able to download and process the PDF")
        return

    mapped_references = extract_references(pdf.file.uri)
    if not mapped_references:
        obj.log.info("No references extracted")
        return

    obj.data["references"] = mapped_references
    reference_count = len(mapped_references)
    obj.log.info("Extracted {0} references".format(reference_count))

Example #2

0

Show file

File: arxiv.py Project: spirosdelviniotis/inspire-next

def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball is looked up in ``obj.files`` under a sanitized
    ``<arxiv_id>.tar.gz`` name and downloaded from the configured
    ``ARXIV_TARBALL_URL`` only when it is not attached yet. Each plot
    returned by ``process_tarball`` is stored in ``obj.files`` under its
    name, tagged with doctype ``"Plot"`` and a description made of a
    zero-padded index followed by the concatenated captions.

    If the tarball cannot be processed (``InvalidTarball``,
    ``NoTexFilesFound`` or ``DelegateError``) the problem is logged and
    the task returns without adding any plot.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    from wand.exceptions import DelegateError

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id))
    else:
        tarball = obj.files[filename]

    try:
        plots = process_tarball(tarball.file.uri)
    except (InvalidTarball, NoTexFilesFound):
        obj.log.error('Invalid tarball {0}'.format(tarball.file.uri))
        return
    except DelegateError as err:
        obj.log.error("Error extracting plots. Report and skip.")
        current_app.logger.exception(err)
        return

    for idx, plot in enumerate(plots):
        plot_name = plot.get('name')
        # BytesIO must be given the file's bytes, not the open file object;
        # reading inside ``with`` also guarantees the handle is closed.
        # Binary mode because plots are presumably image files.
        with open(plot.get('url'), 'rb') as plot_file:
            obj.files[plot_name] = BytesIO(plot_file.read())
        obj.files[plot_name]["doctype"] = "Plot"
        obj.files[plot_name]["description"] = "{0:05d} {1}".format(
            idx, "".join(plot.get('captions', [])))
    obj.log.info("Added {0} plots.".format(len(plots)))

Example #3

0

Show file

File: arxiv.py Project: kaplun/inspire-next

def arxiv_refextract(obj, eng):
    """Run reference extraction on the arXiv PDF of a workflow object.

    Reuses the ``<arxiv_id>.pdf`` file already attached to ``obj.files``
    or downloads it from the configured ``ARXIV_PDF_URL``. Extracted
    references are kept in ``obj.extra_data["references"]``; the result
    of each step is reported through ``obj.log``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    if filename in obj.files:
        pdf = obj.files[filename]
    else:
        source_url = current_app.config['ARXIV_PDF_URL'].format(
            arxiv_id=arxiv_id
        )
        pdf = download_file_to_record(
            record=obj, name=filename, url=source_url
        )

    if not pdf:
        obj.log.error("Not able to download and process the PDF")
        return

    mapped_references = extract_references(pdf.file.uri)
    if not mapped_references:
        obj.log.info("No references extracted")
        return

    # FIXME The references are parked in extra_data for now; they are
    # not added to the final record yet.
    obj.extra_data["references"] = mapped_references
    total = len(mapped_references)
    obj.log.info("Extracted {0} references".format(total))

Example #4

0

Show file

File: actions.py Project: michamos/inspire-next

def is_arxiv_paper(obj, *args, **kwargs):
    """Tell whether the record carries an arXiv identifier or categories.

    :param obj: object whose ``data`` holds the record to inspect
    :return: ``True`` when either an arXiv ID or arXiv categories are
        found in the record, ``False`` otherwise
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    categories = get_value(obj.data, 'arxiv_eprints.categories')

    return bool(arxiv_id or categories)

Example #5

0

Show file

File: matching.py Project: michamos/inspire-next

def match_by_arxiv_id(record):
    """Look up records that share this record's arXiv identifier.

    :param record: record from which the arXiv identifier is read
    :return: the result of ``search`` for a ``035`` query on the
        identifier, or an empty list when the record has none
    """
    arxiv_id = get_clean_arXiv_id(record)
    if not arxiv_id:
        return []

    return search('035:"{0}"'.format(arxiv_id))

Example #6

0

Show file

File: matching.py Project: kaplun/inspire-next

def match_by_arxiv_id(record):
    """Find matches for a record through its arXiv identifier.

    :param record: record from which the arXiv identifier is read
    :return: whatever ``search`` yields for the ``035`` query built from
        the identifier; an empty list if no identifier is present
    """
    identifier = get_clean_arXiv_id(record)
    matches = []
    if identifier:
        matches = search('035:"{0}"'.format(identifier))
    return matches

Example #7

0

Show file

def test_get_clean_arXiv_id_from_arxiv_eprints_with_oai_prefix():
    """An ``oai:arXiv.org:`` prefix is removed from the eprint value."""
    eprint = {'value': 'oai:arXiv.org:physics/0112006'}
    record = {'arxiv_eprints': [eprint]}

    assert 'physics/0112006' == get_clean_arXiv_id(record)

Example #8

0

Show file

def test_get_clean_arXiv_id_from_arxiv_eprints_using_new_style():
    """A new-style ID loses its ``arxiv:`` prefix."""
    eprint = {'value': 'arxiv:1002.2647'}
    record = {'arxiv_eprints': [eprint]}

    assert '1002.2647' == get_clean_arXiv_id(record)

Example #9

0

Show file

def test_get_clean_arXiv_id_from_arxiv_eprints_using_old_style():
    """An old-style ``category/number`` ID is returned unchanged."""
    old_style_id = 'physics/0112006'
    record = {'arxiv_eprints': [{'value': old_style_id}]}

    assert 'physics/0112006' == get_clean_arXiv_id(record)

Example #10

0

Show file

File: arxiv.py Project: spirosdelviniotis/inspire-next

def arxiv_fulltext_download(obj, eng):
    """Attach the arXiv fulltext PDF to the object unless already present.

    When ``<arxiv_id>.pdf`` is missing from ``obj.files`` it is fetched
    from the configured ``ARXIV_PDF_URL`` and tagged with doctype
    ``"arXiv"``; otherwise nothing is done.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))
    if filename in obj.files:
        return

    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)
    pdf = download_file_to_record(record=obj, name=filename, url=pdf_url)
    pdf['doctype'] = "arXiv"

Example #11

0

Show file

def test_get_clean_arXiv_id_from_arxiv_eprints_selects_first():
    """With several eprints, the ID of the first one is returned."""
    eprint_values = ('oai:arXiv.org:0801.4782', 'oai:arXiv.org:0805.1410')
    record = {
        'arxiv_eprints': [{'value': value} for value in eprint_values],
    }

    assert '0801.4782' == get_clean_arXiv_id(record)

Example #12

0

Show file

File: arxiv.py Project: kaplun/inspire-next

def arxiv_fulltext_download(obj, eng):
    """Download the fulltext PDF of an arXiv record when it is missing.

    Nothing happens if ``<arxiv_id>.pdf`` is already in ``obj.files``.
    Otherwise the file is fetched from the configured ``ARXIV_PDF_URL``
    and its doctype is set to ``"arXiv"``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    already_attached = filename in obj.files
    if not already_attached:
        download_url = current_app.config['ARXIV_PDF_URL'].format(
            arxiv_id=arxiv_id
        )
        pdf = download_file_to_record(
            record=obj, name=filename, url=download_url
        )
        pdf['doctype'] = "arXiv"

Example #13

0

Show file

    def _author_list(obj, eng):
        """Update the record with an author list found in the arXiv tarball.

        The ``<arxiv_id>.tar.gz`` file is reused from ``obj.files`` or
        downloaded from the configured ``ARXIV_TARBALL_URL``, then
        unpacked next to itself in a ``<uri>_files`` directory. The first
        extracted ``.xml`` file matching ``REGEXP_AUTHLIST`` is converted
        with ``stylesheet`` (taken from the enclosing scope) and the
        resulting record is merged into ``obj.data``.

        An ``InvalidTarball`` error is logged and ends the task early.

        :param obj: Workflow Object to process
        :param eng: Workflow Engine processing the object
        """
        from inspirehep.modules.converter import convert

        arxiv_id = get_clean_arXiv_id(obj.data)
        filename = secure_filename("{0}.tar.gz".format(arxiv_id))
        if filename not in obj.files:
            tarball = download_file_to_workflow(
                workflow=obj,
                name=filename,
                url=current_app.config['ARXIV_TARBALL_URL'].format(
                    arxiv_id=arxiv_id
                )
            )
        else:
            tarball = obj.files[filename]

        sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
        try:
            file_list = untar(tarball.file.uri, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [path for path in file_list
                          if path.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # ``with`` closes the file even when reading raises.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authors_rec = create_record(authors_xml)
                authorlist_record = hep.do(authors_rec)
                obj.data.update(authorlist_record)
                # Only the first matching XML file is used.
                break

Example #14

0

Show file

File: arxiv.py Project: kaplun/inspire-next

    def _author_list(obj, eng):
        """Update the record with an author list found in the arXiv tarball.

        The ``<arxiv_id>.tar.gz`` file is reused from ``obj.files`` or
        downloaded from the configured ``ARXIV_TARBALL_URL``, then
        unpacked next to itself in a ``<uri>_files`` directory. The first
        extracted ``.xml`` file matching ``REGEXP_AUTHLIST`` is converted
        with ``stylesheet`` (taken from the enclosing scope) and the
        resulting record is merged into ``obj.data``.

        An ``InvalidTarball`` error is logged and ends the task early.

        :param obj: Workflow Object to process
        :param eng: Workflow Engine processing the object
        """
        from inspirehep.modules.converter import convert

        arxiv_id = get_clean_arXiv_id(obj.data)
        filename = secure_filename("{0}.tar.gz".format(arxiv_id))
        if filename not in obj.files:
            tarball = download_file_to_record(
                record=obj,
                name=filename,
                url=current_app.config['ARXIV_TARBALL_URL'].format(
                    arxiv_id=arxiv_id
                )
            )
        else:
            tarball = obj.files[filename]

        sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
        try:
            file_list = untar(tarball.file.uri, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [path for path in file_list
                          if path.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # ``with`` closes the file even when reading raises.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authors_rec = create_record(authors_xml)
                authorlist_record = hep.do(authors_rec)
                obj.data.update(authorlist_record)
                # Only the first matching XML file is used.
                break

Example #15

0

Show file

File: arxiv.py Project: kaplun/inspire-next

def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball is looked up in ``obj.files`` under a sanitized
    ``<arxiv_id>.tar.gz`` name and downloaded from the configured
    ``ARXIV_TARBALL_URL`` only when it is not attached yet. Each plot
    returned by ``process_tarball`` is stored in ``obj.files`` under its
    name, tagged with doctype ``"Plot"`` and a description made of a
    zero-padded index followed by the concatenated captions.

    If the tarball cannot be processed (``InvalidTarball``,
    ``NoTexFilesFound`` or ``DelegateError``) the problem is logged and
    the task returns without adding any plot.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    from wand.exceptions import DelegateError

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    try:
        plots = process_tarball(tarball.file.uri)
    except (InvalidTarball, NoTexFilesFound):
        obj.log.error(
            'Invalid tarball {0}'.format(tarball.file.uri)
        )
        return
    except DelegateError as err:
        obj.log.error("Error extracting plots. Report and skip.")
        current_app.logger.exception(err)
        return

    for idx, plot in enumerate(plots):
        plot_name = plot.get('name')
        # BytesIO must be given the file's bytes, not the open file object;
        # reading inside ``with`` also guarantees the handle is closed.
        # Binary mode because plots are presumably image files.
        with open(plot.get('url'), 'rb') as plot_file:
            obj.files[plot_name] = BytesIO(plot_file.read())
        obj.files[plot_name]["doctype"] = "Plot"
        obj.files[plot_name]["description"] = "{0:05d} {1}".format(
            idx, "".join(plot.get('captions', []))
        )
    obj.log.info("Added {0} plots.".format(len(plots)))

Example #16

0

Show file

def test_get_clean_arXiv_id_returns_none_when_no_arxiv_eprints():
    """A record without ``arxiv_eprints`` yields no identifier."""
    empty_record = {}

    result = get_clean_arXiv_id(empty_record)

    assert result is None

Example #17

0

Show file

File: actions.py Project: spirosdelviniotis/inspire-next

def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv.

    The record counts as an arXiv paper when the first entry of
    ``arxiv_eprints.categories`` is non-empty or, failing that, when an
    arXiv identifier can be read from it.

    :param obj: object whose ``data`` holds the record to inspect
    :return: ``True`` if the record comes from arXiv, ``False`` otherwise
    """
    categories = get_value(obj.data, "arxiv_eprints.categories", [[]])
    if categories[0]:
        return True
    # get_clean_arXiv_id returns the identifier itself (or None); coerce it
    # so this predicate always answers with a real boolean.
    return bool(get_clean_arXiv_id(obj.data))

Example #18

0

Show file

File: actions.py Project: bittirousku/inspire-next

def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv.

    The record counts as an arXiv paper when the first entry of
    ``arxiv_eprints.categories`` is non-empty or, failing that, when an
    arXiv identifier can be read from it.

    :param obj: object whose ``data`` holds the record to inspect
    :return: ``True`` if the record comes from arXiv, ``False`` otherwise
    """
    categories = get_value(obj.data, "arxiv_eprints.categories", [[]])
    if categories[0]:
        return True
    # get_clean_arXiv_id returns the identifier itself (or None); coerce it
    # so this predicate always answers with a real boolean.
    return bool(get_clean_arXiv_id(obj.data))

Example #19

0

Show file

File: test_utils_arxiv.py Project: spirosdelviniotis/inspire-next

def test_arxiv_id_getter(arxiv_record, arxiv_record_old, arxiv_record_oai):
    """Check ID retrieval for each fixture record and for an empty one."""
    expectations = (
        ("1002.2647", arxiv_record),
        ("physics/0112006", arxiv_record_old),
        ("physics/0112006", arxiv_record_oai),
    )
    for expected_id, record in expectations:
        assert expected_id == get_clean_arXiv_id(record)

    assert get_clean_arXiv_id({}) is None