def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Resolves the record and arXiv id from the workflow model, reuses the
    tarball already attached to the model under that id (or fetches it and
    attaches it), runs the plot extractor on it and merges the resulting
    data into the record.

    :param obj: workflow object; used for logging and task results.
    :param eng: workflow engine; used to resolve the model, to fetch the
        tarball and to log extraction timeouts.
    :return: ``None``. Returns early when no tarball can be obtained.
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, arxiv_id)

    if not existing_file:
        # Nothing attached under this arXiv id yet: fetch the tarball.
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error("Timeout during tarball extraction on {0}".format(tarball))
        # Without this binding the check below would raise
        # UnboundLocalError instead of simply skipping the plots.
        marcxml = None

    if marcxml:
        # Only the first element of the converted MARCXML is used.
        new_dict = get_json_from_marcxml(marcxml)[0]
        record.update(new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html",
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        # NOTE(review): the model is updated only when plots were added;
        # confirm this matches the intended nesting of the original code.
        model.update()
def _plot_extract(obj, eng):
    """Run the LaTeX plot extractor on the arXiv tarball of ``obj``.

    The extraction types come from the enclosing ``plotextractor_types``
    value or, when that is empty, from the ``p_extraction-source`` entry of
    the repository arguments stored in ``obj.extra_data``. Only the
    ``'latex'`` type is handled here. The tarball path is remembered in
    ``obj.extra_data["_result"]["tarball"]`` so it is fetched only once.

    :param obj: workflow object carrying ``data``, ``extra_data`` and ``log``.
    :param eng: workflow engine; its ``uuid`` names the extraction directory
        and its ``log`` receives timeout errors.
    :return: ``None``. Returns early when no tarball can be obtained.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})

    chosen_type = plotextractor_types
    if not chosen_type:
        chosen_type = arguments.get('p_extraction-source', [])
    if not isinstance(chosen_type, list):
        # A single value is normalised to a one-element list.
        chosen_type = [chosen_type]

    if 'latex' in chosen_type:
        # Run LaTeX plotextractor
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid)
            )
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path
            )
            if tarball is None:
                obj.log.error("No tarball found")
                return
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        try:
            marcxml = get_marcxml_plots_from_tarball(tarball)
        except Timeout:
            eng.log.error(
                'Timeout during tarball extraction on {0}'.format(tarball)
            )
            # Without this binding the check below would raise
            # UnboundLocalError instead of simply skipping the plots.
            marcxml = None

        if marcxml:
            # Convert the extracted MARCXML and attach the files to obj.
            new_dict = convert_marcxml_to_bibfield(marcxml)
            _attach_files_to_obj(obj, new_dict)
            obj.update_task_results(
                "Plots",
                [{
                    "name": "Plots",
                    "result": new_dict["fft"],
                    "template": "workflows/results/plots.html"
                }]
            )
def _plot_extract(obj, eng):
    """Extract plots from the arXiv tarball when LaTeX extraction is chosen.

    Extraction types are taken from the enclosing ``plotextractor_types``;
    if that is empty, the ``p_extraction-source`` repository argument found
    in ``obj.extra_data`` is used instead. Nothing happens unless
    ``'latex'`` is among the chosen types. The tarball path is stored in
    ``obj.extra_data["_result"]["tarball"]`` and reused on later calls.

    :param obj: workflow object carrying ``data``, ``extra_data`` and ``log``.
    :param eng: workflow engine; provides ``uuid`` and ``log``.
    :return: ``None``. Returns early when no tarball can be obtained.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball)
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})

    chosen_type = plotextractor_types
    if not chosen_type:
        chosen_type = arguments.get('p_extraction-source', [])
    if not isinstance(chosen_type, list):
        # A single value is normalised to a one-element list.
        chosen_type = [chosen_type]

    if 'latex' in chosen_type:
        # Run LaTeX plotextractor
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid))
            tarball = get_tarball_from_arxiv(
                obj.data.get(
                    cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path)
            if tarball is None:
                obj.log.error("No tarball found")
                return
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        try:
            marcxml = get_marcxml_plots_from_tarball(tarball)
        except Timeout:
            eng.log.error(
                'Timeout during tarball extraction on {0}'.format(tarball))
            # Bind the name so the check below skips the plots rather
            # than failing with UnboundLocalError.
            marcxml = None

        if marcxml:
            # Convert the extracted MARCXML and attach the files to obj.
            new_dict = convert_marcxml_to_bibfield(marcxml)
            _attach_files_to_obj(obj, new_dict)
            obj.update_task_results(
                "Plots",
                [{
                    "name": "Plots",
                    "result": new_dict["fft"],
                    "template": "workflows/results/plots.html"
                }])
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Fetches the arXiv tarball for ``obj`` (or reuses the path remembered in
    ``obj.extra_data["_result"]["tarball"]``), runs the plot extractor on
    it, attaches the resulting files to ``obj`` and records them as the
    "Plots" task result.

    :param obj: workflow object carrying ``data``, ``extra_data`` and ``log``.
    :param eng: workflow engine; its ``uuid`` names the extraction directory
        and its ``log`` receives timeout errors.
    :return: ``None``. Returns early when no tarball can be obtained.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
        # Remember the path so a later run does not fetch it again.
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball)
        )
        # Without this binding the check below would raise
        # UnboundLocalError instead of simply skipping the plots.
        marcxml = None

    if marcxml:
        # Convert the extracted MARCXML and attach the files to obj.
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Obtains the arXiv tarball for ``obj``, reusing the path stored in
    ``obj.extra_data["_result"]["tarball"]`` when present, runs the plot
    extractor on it, attaches the resulting files to ``obj`` and publishes
    them as the "Plots" task result.

    :param obj: workflow object carrying ``data``, ``extra_data`` and ``log``.
    :param eng: workflow engine; provides ``uuid`` and ``log``.
    :return: ``None``. Returns early when no tarball can be obtained.
    """
    from invenio.utils.plotextractor.api import (get_tarball_from_arxiv,
                                                 get_marcxml_plots_from_tarball
                                                 )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        # Remember the path so a later run does not fetch it again.
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball))
        # Bind the name so the check below skips the plots rather than
        # failing with UnboundLocalError.
        marcxml = None

    if marcxml:
        # Convert the extracted MARCXML and attach the files to obj.
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results("Plots", [{
            "name": "Plots",
            "result": new_dict["fft"],
            "template": "workflows/results/plots.html"
        }])
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))