Example #1
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive and merge them into the record.

    Looks up a file named after the record's arXiv id on the workflow model.
    If there is none, a tarball is requested via ``get_tarball_for_model``
    and added to the model; otherwise the existing file's system path is
    used. The MARCXML produced from the tarball is converted to a dict,
    merged into the record, and its ``"fft"`` entry is published as the
    "Plots" task result.

    :param obj: workflow object; used for logging and task results.
    :param eng: workflow engine; provides ``workflow_definition.model`` and
        a logger.
    :return: ``None``. Returns early (after logging) when no tarball is
        found; a timeout during extraction is logged and nothing is added.
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, arxiv_id)

    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)

        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    # Pre-bind the result: if extraction times out the name would otherwise
    # be unbound and the check below would raise UnboundLocalError.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error("Timeout during tarball extraction on {0}".format(tarball))

    if marcxml:
        # Only the first element of the converted result is used.
        new_dict = get_json_from_marcxml(marcxml)[0]
        record.update(new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        model.update()
    def _plot_extract(obj, eng):
        """Run the LaTeX plot extractor on the object's arXiv tarball.

        The extraction types come from ``plotextractor_types`` (a name from
        the enclosing scope, not visible here); when that is empty, the
        ``'p_extraction-source'`` repository argument stored in
        ``obj.extra_data`` is used instead. Work is only done when
        ``'latex'`` is among the chosen types.

        The tarball path is cached in ``obj.extra_data["_result"]["tarball"]``
        so it is not fetched again when that key is already present.

        :param obj: workflow object (provides ``data``, ``extra_data``,
            ``log`` and ``update_task_results``).
        :param eng: workflow engine (provides ``uuid`` and ``log``).
        :return: ``None``. Returns early when no tarball is found; a timeout
            during extraction is logged and no plots are attached.
        """
        from invenio.utils.plotextractor.api import (
            get_tarball_from_arxiv,
            get_marcxml_plots_from_tarball
        )
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.shell import Timeout

        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        repository = obj.extra_data.get("repository", {})
        arguments = repository.get("arguments", {})

        chosen_type = plotextractor_types

        if not chosen_type:
            chosen_type = arguments.get('p_extraction-source', [])

        # A single value is accepted as well as a list of values.
        if not isinstance(chosen_type, list):
            chosen_type = [chosen_type]

        if 'latex' in chosen_type:
            # Run LaTeX plotextractor
            if "tarball" not in obj.extra_data["_result"]:
                # Each engine run gets its own directory, named by its uuid.
                extract_path = os.path.join(
                    cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
                    str(eng.uuid)
                )
                tarball = get_tarball_from_arxiv(
                    obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                    extract_path
                )
                if tarball is None:
                    obj.log.error("No tarball found")
                    return
                obj.extra_data["_result"]["tarball"] = tarball
            else:
                tarball = obj.extra_data["_result"]["tarball"]

            # Pre-bind the result: if extraction times out the name would
            # otherwise be unbound and the check below would raise
            # UnboundLocalError.
            marcxml = None
            try:
                marcxml = get_marcxml_plots_from_tarball(tarball)
            except Timeout:
                eng.log.error(
                    'Timeout during tarball extraction on {0}'.format(tarball)
                )
            if marcxml:
                new_dict = convert_marcxml_to_bibfield(marcxml)
                _attach_files_to_obj(obj, new_dict)
                obj.update_task_results(
                    "Plots",
                    [{
                        "name": "Plots",
                        "result": new_dict["fft"],
                        "template": "workflows/results/plots.html"
                    }]
                )
Example #3
0
    def _plot_extract(obj, eng):
        """Run the LaTeX plot extractor on the object's arXiv tarball.

        The extraction types come from ``plotextractor_types`` (a name from
        the enclosing scope, not visible here); when that is empty, the
        ``'p_extraction-source'`` repository argument stored in
        ``obj.extra_data`` is used instead. Work is only done when
        ``'latex'`` is among the chosen types.

        The tarball path is cached in ``obj.extra_data["_result"]["tarball"]``
        so it is not fetched again when that key is already present.

        :param obj: workflow object (provides ``data``, ``extra_data``,
            ``log`` and ``update_task_results``).
        :param eng: workflow engine (provides ``uuid`` and ``log``).
        :return: ``None``. Returns early when no tarball is found; a timeout
            during extraction is logged and no plots are attached.
        """
        from invenio.utils.plotextractor.api import (
            get_tarball_from_arxiv, get_marcxml_plots_from_tarball)
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.shell import Timeout

        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        repository = obj.extra_data.get("repository", {})
        arguments = repository.get("arguments", {})

        chosen_type = plotextractor_types

        if not chosen_type:
            chosen_type = arguments.get('p_extraction-source', [])

        # A single value is accepted as well as a list of values.
        if not isinstance(chosen_type, list):
            chosen_type = [chosen_type]

        if 'latex' in chosen_type:
            # Run LaTeX plotextractor
            if "tarball" not in obj.extra_data["_result"]:
                # Each engine run gets its own directory, named by its uuid.
                extract_path = os.path.join(
                    cfg.get('OAIHARVESTER_STORAGEDIR',
                            cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
                tarball = get_tarball_from_arxiv(
                    obj.data.get(
                        cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                    extract_path)
                if tarball is None:
                    obj.log.error("No tarball found")
                    return
                obj.extra_data["_result"]["tarball"] = tarball
            else:
                tarball = obj.extra_data["_result"]["tarball"]

            # Pre-bind the result: if extraction times out the name would
            # otherwise be unbound and the check below would raise
            # UnboundLocalError.
            marcxml = None
            try:
                marcxml = get_marcxml_plots_from_tarball(tarball)
            except Timeout:
                eng.log.error(
                    'Timeout during tarball extraction on {0}'.format(tarball))
            if marcxml:
                new_dict = convert_marcxml_to_bibfield(marcxml)
                _attach_files_to_obj(obj, new_dict)
                obj.update_task_results(
                    "Plots", [{
                        "name": "Plots",
                        "result": new_dict["fft"],
                        "template": "workflows/results/plots.html"
                    }])
Example #4
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Fetches the arXiv tarball for the object (unless a path is already
    cached in ``obj.extra_data["_result"]["tarball"]``), extracts plot
    MARCXML from it, attaches the resulting files to the object and
    publishes the ``"fft"`` entry as the "Plots" task result.

    :param obj: workflow object (provides ``data``, ``extra_data``, ``log``
        and ``update_task_results``).
    :param eng: workflow engine (provides ``uuid`` and ``log``).
    :return: ``None``. Returns early when no tarball is found; a timeout
        during extraction is logged and no plots are attached.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        # Each engine run gets its own directory, named by its uuid.
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # Pre-bind the result: if extraction times out the name would otherwise
    # be unbound and the check below would raise UnboundLocalError.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball)
        )
    if marcxml:
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
Example #5
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Fetches the arXiv tarball for the object (unless a path is already
    cached in ``obj.extra_data["_result"]["tarball"]``), extracts plot
    MARCXML from it, attaches the resulting files to the object and
    publishes the ``"fft"`` entry as the "Plots" task result.

    :param obj: workflow object (provides ``data``, ``extra_data``, ``log``
        and ``update_task_results``).
    :param eng: workflow engine (provides ``uuid`` and ``log``).
    :return: ``None``. Returns early when no tarball is found; a timeout
        during extraction is logged and no plots are attached.
    """
    from invenio.utils.plotextractor.api import (get_tarball_from_arxiv,
                                                 get_marcxml_plots_from_tarball
                                                 )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        # Each engine run gets its own directory, named by its uuid.
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # Pre-bind the result: if extraction times out the name would otherwise
    # be unbound and the check below would raise UnboundLocalError.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball))
    if marcxml:
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results("Plots",
                                [{
                                    "name": "Plots",
                                    "result": new_dict["fft"],
                                    "template": "workflows/results/plots.html"
                                }])
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))