def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        try:
            plots = process_tarball(tarball.file.uri)
        except (InvalidTarball, NoTexFilesFound):
            obj.log.error(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        for idx, plot in enumerate(plots):
            # Hoist the key: it is used three times below.
            key = plot.get('name')
            # BUGFIX: plots are image files — open in binary mode so the
            # raw bytes are stored; text mode would attempt to decode them
            # (and fails outright on Python 3 for non-UTF-8 image bytes).
            with open(plot.get('url'), 'rb') as plot_file:
                obj.files[key] = plot_file
            obj.files[key]['description'] = u'{0:05d} {1}'.format(
                idx, ''.join(plot.get('captions', []))
            )
        obj.log.info('Added {0} plots.'.format(len(plots)))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        # Work in a throwaway directory; the extracted plot files only
        # need to live long enough to be copied into ``obj.files``.
        with TemporaryDirectory(prefix='plot_extract') as scratch_space:
            tarball_file = retrieve_uri(tarball.file.uri, outdir=scratch_space)
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            # Drop figures left over from a previous run so we do not
            # accumulate stale entries in the record or the file store.
            if 'figures' in obj.data:
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                # BUGFIX: ``obj.files.keys`` (no call) is a bound method;
                # ``in`` against it raises TypeError. Membership must be
                # tested against the mapping itself.
                if plot_name in obj.files:
                    key = '{number}_{name}'.format(number=index, name=plot_name)
                # BUGFIX: open image files in binary mode — text mode can
                # corrupt or fail to decode the raw bytes.
                with open(plot.get('url'), 'rb') as plot_file:
                    obj.files[key] = plot_file
                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key,
                    ),
                )
            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    # Deferred import: wand is only needed when this task actually runs.
    from wand.exceptions import DelegateError
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        # Tarball not yet attached to the record: fetch it from arXiv.
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id))
    else:
        tarball = obj.files[filename]

    try:
        plots = process_tarball(tarball.file.uri)
    except (InvalidTarball, NoTexFilesFound):
        obj.log.error('Invalid tarball {0}'.format(tarball.file.uri))
        return
    except DelegateError as err:
        obj.log.error("Error extracting plots. Report and skip.")
        current_app.logger.exception(err)
        return

    for idx, plot in enumerate(plots):
        key = plot.get('name')
        # BUGFIX: ``BytesIO(open(...))`` passed a *file object* where
        # BytesIO requires a bytes initializer (TypeError), and leaked
        # the handle. Read the raw bytes in binary mode and close the
        # file via the context manager.
        with open(plot.get('url'), 'rb') as plot_file:
            obj.files[key] = BytesIO(plot_file.read())
        obj.files[key]["doctype"] = "Plot"
        obj.files[key]["description"] = "{0:05d} {1}".format(
            idx, "".join(plot.get('captions', [])))
    obj.log.info("Added {0} plots.".format(len(plots)))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive."""
    # Resolve the workflow model and the record payload it wraps.
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    # Reuse a previously attached tarball when one exists.
    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            # Nothing to extract from; bail out of the task.
            obj.log.error("No tarball found")
            return
        # Attach the downloaded tarball to the model for later runs.
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()
    try:
        plots = process_tarball(tarball)
    except InvalidTarball:
        # NOTE(review): logs via ``eng.log`` while the rest of the task
        # uses ``obj.log`` — presumably intentional; confirm.
        eng.log.error(
            'Invalid tarball {0}'.format(tarball)
        )
        return
    if plots:
        # We store the path to the directory the tarball contents lives
        new_dict = get_json_for_plots(plots)
        # Merge the plot metadata (the "fft" entries) into the record.
        record.update(new_dict)
        # Surface the extracted plots in the workflow's task-results UI.
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
    # Persist changes back to the model. NOTE(review): appears to run even
    # when no plots were found — confirm this nesting against the original.
    model.update()
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    # Deferred import: wand is only needed when this task actually runs.
    from wand.exceptions import DelegateError
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        # Tarball not yet attached to the record: fetch it from arXiv.
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    try:
        plots = process_tarball(tarball.file.uri)
    except (InvalidTarball, NoTexFilesFound):
        obj.log.error(
            'Invalid tarball {0}'.format(tarball.file.uri)
        )
        return
    except DelegateError as err:
        obj.log.error("Error extracting plots. Report and skip.")
        current_app.logger.exception(err)
        return

    for idx, plot in enumerate(plots):
        key = plot.get('name')
        # BUGFIX: ``BytesIO(open(...))`` passed a *file object* where
        # BytesIO requires a bytes initializer (TypeError), and leaked
        # the handle. Read the raw bytes in binary mode and close the
        # file via the context manager.
        with open(plot.get('url'), 'rb') as plot_file:
            obj.files[key] = BytesIO(plot_file.read())
        obj.files[key]["doctype"] = "Plot"
        obj.files[key]["description"] = "{0:05d} {1}".format(
            idx, "".join(plot.get('captions', []))
        )
    obj.log.info("Added {0} plots.".format(len(plots)))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        # Scratch directory and the locally retrieved tarball are both
        # context-managed so cleanup happens even on early return.
        with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            # Drop figures left over from a previous run so we do not
            # accumulate stale entries in the record or the file store.
            if 'figures' in obj.data:
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                # BUGFIX: ``obj.files.keys`` (no call) is a bound method;
                # ``in`` against it raises TypeError. Membership must be
                # tested against the mapping itself.
                if plot_name in obj.files:
                    key = 'w{number}_{name}'.format(
                        number=index,
                        name=plot_name,
                    )
                # BUGFIX: open image files in binary mode — text mode can
                # corrupt or fail to decode the raw bytes.
                with open(plot.get('url'), 'rb') as plot_file:
                    obj.files[key] = plot_file
                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key,
                    )
                )
            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    # Crude way to set memory limits for wand globally.
    mem_limit = current_app.config.get("WAND_MEMORY_LIMIT")
    if mem_limit and limits['memory'] != mem_limit:
        limits['memory'] = mem_limit
        # This sets disk limit, if not set it will swap data on disk
        # instead of throwing exception
        limits['disk'] = current_app.config.get("WAND_DISK_LIMIT", 0)
        # It will throw an exception when memory and disk limit exceeds.
        # At least workflow status will be saved.

    arxiv_id = LiteratureReader(obj.data).arxiv_id
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info('No file named=%s for arxiv_id %s', filename, arxiv_id)
        return

    # Scratch directory and the locally retrieved tarball are both
    # context-managed so cleanup happens even on early return.
    with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        # Drop figures left over from a previous run so we do not
        # accumulate stale entries in the record or the file store.
        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_name = os.path.basename(plot.get('url'))
            key = plot_name
            # BUGFIX: ``obj.files.keys`` (no call) is a bound method;
            # ``in`` against it raises TypeError. Membership must be
            # tested against the mapping itself.
            if plot_name in obj.files:
                key = 'w{number}_{name}'.format(
                    number=index,
                    name=plot_name,
                )
            # BUGFIX: open image files in binary mode — text mode can
            # corrupt or fail to decode the raw bytes.
            with open(plot.get('url'), 'rb') as plot_file:
                obj.files[key] = plot_file
            lb.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[key].bucket_id,
                    key=key,
                )
            )
        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))