def _plot_extract(obj, eng):
        """Extract plots from the record's arXiv tarball and attach them.

        Extraction only runs when ``'latex'`` is among the selected types.
        The types come from ``plotextractor_types`` or, when that is empty,
        from the ``p_extraction-source`` argument of the repository stored in
        ``obj.extra_data``. The tarball is reused from
        ``obj.extra_data["_result"]["tarball"]`` when present; otherwise it
        is fetched with ``get_tarball_from_arxiv`` and cached there.

        If no tarball can be obtained, or extraction times out, an error is
        logged and no plots are attached.

        :param obj: workflow object; ``obj.data`` is read and
            ``obj.extra_data`` / task results are updated.
        :param eng: workflow engine; ``eng.uuid`` names the download
            directory and ``eng.log`` receives timeout errors.
        """
        from invenio.utils.plotextractor.api import (
            get_tarball_from_arxiv,
            get_marcxml_plots_from_tarball
        )
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.shell import Timeout

        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        repository = obj.extra_data.get("repository", {})
        arguments = repository.get("arguments", {})

        # Explicitly configured types take precedence over the repository's.
        chosen_type = plotextractor_types

        if not chosen_type:
            chosen_type = arguments.get('p_extraction-source', [])

        # A single value is accepted as shorthand for a one-element list.
        if not isinstance(chosen_type, list):
            chosen_type = [chosen_type]

        if 'latex' in chosen_type:
            # Run LaTeX plotextractor
            if "tarball" not in obj.extra_data["_result"]:
                extract_path = os.path.join(
                    cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
                    str(eng.uuid)
                )
                tarball = get_tarball_from_arxiv(
                    obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                    extract_path
                )
                if tarball is None:
                    obj.log.error("No tarball found")
                    return
                # Cache the result so it is not fetched again.
                obj.extra_data["_result"]["tarball"] = tarball
            else:
                tarball = obj.extra_data["_result"]["tarball"]

            try:
                marcxml = get_marcxml_plots_from_tarball(tarball)
            except Timeout:
                eng.log.error(
                    'Timeout during tarball extraction on {0}'.format(tarball)
                )
                # Without this binding the check below would raise
                # UnboundLocalError right after the timeout was logged.
                marcxml = None
            if marcxml:
                # Convert the extracted MARCXML to a record dict and hand it
                # to _attach_files_to_obj before publishing the plot list.
                new_dict = convert_marcxml_to_bibfield(marcxml)
                _attach_files_to_obj(obj, new_dict)
                obj.update_task_results(
                    "Plots",
                    [{
                        "name": "Plots",
                        "result": new_dict["fft"],
                        "template": "workflows/results/plots.html"
                    }]
                )
Exemple #2
0
def get_tarball_for_model(eng, arxiv_id):
    """Fetch the arXiv tarball for ``arxiv_id`` into a per-engine directory.

    The target directory is ``<storage dir>/<eng.uuid>``, where the storage
    dir is the ``OAIHARVESTER_STORAGEDIR`` setting, falling back to
    ``CFG_TMPSHAREDDIR``.

    :param eng: workflow engine whose ``uuid`` names the sub-directory.
    :param arxiv_id: identifier passed on to ``get_tarball_from_arxiv``.
    :return: whatever ``get_tarball_from_arxiv`` returns.
    """
    fallback_dir = cfg.get('CFG_TMPSHAREDDIR')
    storage_root = cfg.get('OAIHARVESTER_STORAGEDIR', fallback_dir)
    target_dir = os.path.join(storage_root, str(eng.uuid))
    return get_tarball_from_arxiv(arxiv_id, target_dir)
Exemple #3
0
    def _plot_extract(obj, eng):
        """Extract plots from the record's arXiv tarball and attach them.

        Extraction only runs when ``'latex'`` is among the selected types.
        The types come from ``plotextractor_types`` or, when that is empty,
        from the ``p_extraction-source`` argument of the repository stored in
        ``obj.extra_data``. The tarball is reused from
        ``obj.extra_data["_result"]["tarball"]`` when present; otherwise it
        is fetched with ``get_tarball_from_arxiv`` and cached there.

        If no tarball can be obtained, or extraction times out, an error is
        logged and no plots are attached.

        :param obj: workflow object; ``obj.data`` is read and
            ``obj.extra_data`` / task results are updated.
        :param eng: workflow engine; ``eng.uuid`` names the download
            directory and ``eng.log`` receives timeout errors.
        """
        from invenio.utils.plotextractor.api import (
            get_tarball_from_arxiv, get_marcxml_plots_from_tarball)
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.shell import Timeout

        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        repository = obj.extra_data.get("repository", {})
        arguments = repository.get("arguments", {})

        # Explicitly configured types take precedence over the repository's.
        chosen_type = plotextractor_types

        if not chosen_type:
            chosen_type = arguments.get('p_extraction-source', [])

        # A single value is accepted as shorthand for a one-element list.
        if not isinstance(chosen_type, list):
            chosen_type = [chosen_type]

        if 'latex' in chosen_type:
            # Run LaTeX plotextractor
            if "tarball" not in obj.extra_data["_result"]:
                extract_path = os.path.join(
                    cfg.get('OAIHARVESTER_STORAGEDIR',
                            cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
                tarball = get_tarball_from_arxiv(
                    obj.data.get(
                        cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                    extract_path)
                if tarball is None:
                    obj.log.error("No tarball found")
                    return
                # Cache the result so it is not fetched again.
                obj.extra_data["_result"]["tarball"] = tarball
            else:
                tarball = obj.extra_data["_result"]["tarball"]

            try:
                marcxml = get_marcxml_plots_from_tarball(tarball)
            except Timeout:
                eng.log.error(
                    'Timeout during tarball extraction on {0}'.format(tarball))
                # Without this binding the check below would raise
                # UnboundLocalError right after the timeout was logged.
                marcxml = None
            if marcxml:
                # Convert the extracted MARCXML to a record dict and hand it
                # to _attach_files_to_obj before publishing the plot list.
                new_dict = convert_marcxml_to_bibfield(marcxml)
                _attach_files_to_obj(obj, new_dict)
                obj.update_task_results(
                    "Plots", [{
                        "name": "Plots",
                        "result": new_dict["fft"],
                        "template": "workflows/results/plots.html"
                    }])
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball is reused from ``obj.extra_data["_result"]["tarball"]`` when
    present; otherwise it is fetched with ``get_tarball_from_arxiv`` into
    ``<storage dir>/<eng.uuid>`` and cached there. Plots found in the tarball
    are converted to a record dict, passed to ``_attach_files_to_obj`` and
    published as the ``"Plots"`` task result.

    If no tarball can be obtained, or extraction times out, an error is
    logged and no plots are attached.

    :param obj: workflow object; ``obj.data`` is read and
        ``obj.extra_data`` / task results are updated.
    :param eng: workflow engine; ``eng.uuid`` names the download directory
        and ``eng.log`` receives timeout errors.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
        # Cache the result so it is not fetched again.
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball)
        )
        # Without this binding the check below would raise
        # UnboundLocalError right after the timeout was logged.
        marcxml = None
    if marcxml:
        # Convert the extracted MARCXML to a record dict and hand it to
        # _attach_files_to_obj before publishing the plot list.
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball is reused from ``obj.extra_data["_result"]["tarball"]`` when
    present; otherwise it is fetched with ``get_tarball_from_arxiv`` into
    ``<storage dir>/<eng.uuid>`` and cached there. Plots found in the tarball
    are converted to a record dict, passed to ``_attach_files_to_obj`` and
    published as the ``"Plots"`` task result.

    If no tarball can be obtained, or extraction times out, an error is
    logged and no plots are attached.

    :param obj: workflow object; ``obj.data`` is read and
        ``obj.extra_data`` / task results are updated.
    :param eng: workflow engine; ``eng.uuid`` names the download directory
        and ``eng.log`` receives timeout errors.
    """
    from invenio.utils.plotextractor.api import (get_tarball_from_arxiv,
                                                 get_marcxml_plots_from_tarball
                                                 )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        # Cache the result so it is not fetched again.
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball))
        # Without this binding the check below would raise
        # UnboundLocalError right after the timeout was logged.
        marcxml = None
    if marcxml:
        # Convert the extracted MARCXML to a record dict and hand it to
        # _attach_files_to_obj before publishing the plot list.
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results("Plots",
                                [{
                                    "name": "Plots",
                                    "result": new_dict["fft"],
                                    "template": "workflows/results/plots.html"
                                }])
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
    def _author_list(obj, eng):
        """Extract an author list from XML files found in the arXiv tarball.

        The tarball is reused from ``obj.extra_data["_result"]["tarball"]``
        when present; otherwise it is fetched with ``get_tarball_from_arxiv``
        and cached there, as the plot extraction tasks do. It is unpacked
        with ``untar`` and the files returned by
        ``find_matching_files(sub_dir, ["xml"])`` are scanned in order. The
        first file that matches ``REGEXP_AUTHLIST`` and yields non-empty
        record XML after conversion with ``stylesheet`` is used:
        ``obj.data["authors"]`` is replaced, and ``authors`` and
        ``number_of_authors`` task results are recorded.

        If no tarball can be obtained an error is logged and nothing is done.

        :param obj: workflow object; ``obj.data`` and ``obj.extra_data`` are
            read and updated.
        :param eng: workflow engine; ``eng.uuid`` names the download
            directory and ``eng.log`` receives error messages.
        """
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(
            cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path)
            if tarball is None:
                obj.log.error("No tarball found")
                return
            # Cache the result so it is not fetched again.
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, _ = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            # Report the local path: looking it up in extra_data raised
            # KeyError when the tarball had just been downloaded.
            # Processing continues and scans sub_dir regardless.
            eng.log.error('Timeout during tarball extraction on %s' %
                          (tarball, ))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # The context manager closes the file even if read() fails.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error(
                            "Error parsing authorlist record for id: %s" %
                            (identifiers, ))
                    # NOTE(review): a None record is logged above but still
                    # passed to record_xml_output below - confirm whether
                    # this file should be skipped instead.
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    # Reuse the serialised record instead of rebuilding it.
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + author_xml + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(
                        updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }])
                    obj.update_task_results("number_of_authors", [{
                        "name":
                        "number_of_authors",
                        "results":
                        new_dict_representation["number_of_authors"]
                    }])
                    # Only the first usable author list is taken.
                    break
    def _author_list(obj, eng):
        """Extract an author list from XML files found in the arXiv tarball.

        The tarball is reused from ``obj.extra_data["_result"]["tarball"]``
        when present; otherwise it is fetched with ``get_tarball_from_arxiv``
        and cached there, as the plot extraction tasks do. It is unpacked
        with ``untar`` and the files returned by
        ``find_matching_files(sub_dir, ["xml"])`` are scanned in order. The
        first file that matches ``REGEXP_AUTHLIST`` and yields non-empty
        record XML after conversion with ``stylesheet`` is used:
        ``obj.data["authors"]`` is replaced, and ``authors`` and
        ``number_of_authors`` task results are recorded.

        If no tarball can be obtained an error is logged and nothing is done.

        :param obj: workflow object; ``obj.data`` and ``obj.extra_data`` are
            read and updated.
        :param eng: workflow engine; ``eng.uuid`` names the download
            directory and ``eng.log`` receives error messages.
        """
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid)
            )
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path
            )
            if tarball is None:
                obj.log.error("No tarball found")
                return
            # Cache the result so it is not fetched again.
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, _ = get_defaults(tarball,
                                  cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            # Report the local path: looking it up in extra_data raised
            # KeyError when the tarball had just been downloaded.
            # Processing continues and scans sub_dir regardless.
            eng.log.error('Timeout during tarball extraction on %s' % (
                tarball,))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # The context manager closes the file even if read() fails.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error("Error parsing authorlist record for id: %s" % (
                            identifiers,))
                    # NOTE(review): a None record is logged above but still
                    # passed to record_xml_output below - confirm whether
                    # this file should be skipped instead.
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    # Reuse the serialised record instead of rebuilding it.
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + author_xml + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }]
                    )
                    obj.update_task_results(
                        "number_of_authors",
                        [{
                            "name": "number_of_authors",
                            "results": new_dict_representation["number_of_authors"]
                        }]
                    )
                    # Only the first usable author list is taken.
                    break