Example #1
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Reuses the file registered on the workflow model under the arXiv
    identifier when there is one; otherwise the tarball is fetched with
    ``get_tarball_for_model`` and attached to the model. The plots are then
    extracted as MARCXML, converted to JSON and merged into the record.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, arxiv_id)

    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)

        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    # Pre-bind the result: if the extraction times out, the check below must
    # see "nothing extracted" instead of failing with UnboundLocalError.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error("Timeout during tarball extraction on {0}".format(tarball))

    if marcxml:
        # Merge the first converted record (it carries the "fft" entries for
        # the plots) into the workflow record and expose it as a task result.
        new_dict = get_json_from_marcxml(marcxml)[0]
        record.update(new_dict)
        obj.update_task_results(
            "Plots", [{"name": "Plots", "result": new_dict["fft"], "template": "workflows/results/plots.html"}]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        model.update()
Example #2
0
    def _author_list(obj, eng):
        """Extract an author list from the XML files of the arXiv tarball.

        The tarball is taken from the workflow model when already attached
        (looked up as ``<arxiv_id>.tar.gz``), otherwise it is fetched and
        attached. It is unpacked next to itself in ``<tarball>_files``; the
        first XML file matching ``REGEXP_AUTHLIST`` is converted with the
        enclosing scope's ``stylesheet`` and merged into the record.

        :param obj: Bibworkflow Object to process
        :param eng: BibWorkflowEngine processing the object
        """
        from inspire.modules.converter.xslt import convert

        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        arxiv_id = get_arxiv_id_from_record(record)
        existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))

        if not existing_file:
            # We download it
            tarball = get_tarball_for_model(eng, arxiv_id)

            if tarball is None:
                obj.log.error("No tarball found")
                return
            add_file_by_name(model, tarball)
        else:
            tarball = existing_file.get_syspath()

        sub_dir = os.path.abspath("{0}_files".format(tarball))
        try:
            file_list = untar(tarball, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [filename for filename in file_list
                          if filename.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # Context manager guarantees the handle is released even when
            # reading the file fails.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authorlist_record = get_json_from_marcxml(authors_xml)[0]
                record.update(authorlist_record)
                obj.update_task_results(
                    "authors",
                    [{
                        "name": "authors",
                        "results": authorlist_record["authors"]
                    }]
                )
                obj.update_task_results(
                    "number_of_authors",
                    [{
                        "name": "number_of_authors",
                        "results": authorlist_record["number_of_authors"]
                    }]
                )
                # Only the first matching file is used.
                break
        model.update()
Example #3
0
    def _author_list(obj, eng):
        """Extract an author list from the XML files of the arXiv tarball.

        The tarball is taken from the workflow model when already attached
        (looked up by arXiv identifier), otherwise it is fetched and
        attached. It is unpacked into the directory returned by
        ``get_defaults``; the first XML file matching ``REGEXP_AUTHLIST`` is
        converted with the enclosing scope's ``stylesheet`` and merged into
        the record.

        :param obj: Bibworkflow Object to process
        :param eng: BibWorkflowEngine processing the object
        """
        from invenio_oaiharvester.utils import find_matching_files

        from invenio_utils.plotextractor.cli import get_defaults
        from invenio_utils.plotextractor.converter import untar
        from invenio_utils.shell import Timeout

        from inspire.modules.converter.xslt import convert

        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        arxiv_id = get_arxiv_id_from_record(record)
        existing_file = get_file_by_name(model, arxiv_id)

        if not existing_file:
            # We download it
            tarball = get_tarball_for_model(eng, arxiv_id)

            if tarball is None:
                obj.log.error("No tarball found")
                return
            add_file_by_name(model, tarball)
        else:
            tarball = existing_file.get_syspath()

        # Only the extraction directory is needed from get_defaults.
        sub_dir, _ = get_defaults(str(tarball), cfg["CFG_TMPDIR"], "")

        try:
            untar(str(tarball), sub_dir)
        except Timeout:
            eng.log.error("Timeout during tarball extraction on {0}".format(tarball))
            # The extraction did not finish, so the directory may be missing
            # or hold truncated files; do not try to parse them.
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = find_matching_files(sub_dir, ["xml"])
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # Context manager guarantees the handle is released even when
            # reading the file fails.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authorlist_record = get_json_from_marcxml(authors_xml)[0]
                record.update(authorlist_record)
                obj.update_task_results("authors", [{"name": "authors", "results": authorlist_record["authors"]}])
                obj.update_task_results(
                    "number_of_authors",
                    [{"name": "number_of_authors", "results": authorlist_record["number_of_authors"]}],
                )
                # Only the first matching file is used.
                break
        model.update()
Example #4
0
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Uses the PDF registered on the workflow model as ``<arxiv_id>.pdf`` when
    there is one; otherwise the PDF is fetched with ``get_pdf_for_model`` and
    attached. References extracted from the PDF are wrapped in a MARCXML
    collection, converted to JSON and stored under ``record["references"]``.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))

    if not existing_file:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)

        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)
    else:
        pdf = existing_file.get_syspath()

    if pdf and os.path.isfile(pdf):
        references_xml = extract_references_from_file_xml(pdf)
        if references_xml:
            # The extractor output is wrapped in a <collection> root before
            # being handed to the MARCXML-to-JSON converter.
            updated_xml = (
                '<?xml version="1.0" encoding="UTF-8"?>\n' "<collection>\n" + references_xml + "\n</collection>"
            )
            new_dict = get_json_from_marcxml(updated_xml)[0]
            if "references" in new_dict:
                references = new_dict["references"]
                record["references"] = references
                # Count what was just extracted: obj.data is not refreshed
                # until model.update() runs below, so reading it here could
                # raise KeyError or report a stale number.
                obj.log.info("Extracted {0} references".format(len(references)))
                obj.update_task_results(
                    "References",
                    [
                        {
                            "name": "References",
                            "result": references,
                            "template": "workflows/results/refextract.html",
                        }
                    ],
                )
                model.update()
        else:
            obj.log.info("No references extracted")
    else:
        obj.log.error("Not able to download and process the PDF")