def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        with TemporaryDirectory(prefix='plot_extract') as scratch_space:
            tarball_file = retrieve_uri(tarball.file.uri, outdir=scratch_space)
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            if 'figures' in obj.data:
                # Drop any figures already attached so they can be replaced.
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                # Avoid clobbering an existing key in the files store.
                if plot_name in obj.files.keys:
                    key = '{number}_{name}'.format(number=index, name=plot_name)
                with open(plot.get('url')) as plot_file:
                    obj.files[key] = plot_file

                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key,
                    ),
                )

            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))
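The collision handling in the plot loop can be exercised in isolation. A minimal sketch, with made-up file names and a plain set standing in for ``obj.files.keys``:

import os

existing = {'figure_1.png'}  # keys already in the files store (made up)

plots = [
    {'url': '/tmp/scratch/figure_1.png'},
    {'url': '/tmp/scratch/figure_2.png'},
]
for index, plot in enumerate(plots):
    plot_name = os.path.basename(plot.get('url'))
    key = plot_name
    if plot_name in existing:
        # Same scheme as above: prefix the enumeration index.
        key = '{number}_{name}'.format(number=index, name=plot_name)
    existing.add(key)
    print(key)  # prints '0_figure_1.png', then 'figure_2.png'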
def add_document_or_figure(
    self,
    metadata,
    stream=None,
    is_document=True,
    file_name=None,
    key=None,
):
    """Add a document or figure to the record.

    Args:
        metadata(dict): metadata of the document or figure, see the
            schemas for more details, will be validated.
        stream(file-like object): if passed, the file contents will be
            extracted from it.
        is_document(bool): if the given information is for a document,
            set to ``False`` for a figure.
        file_name(str): name of the file, used as a basis for the key in
            the files store.
        key(str): if passed, will be used as the key for the files store
            and ``file_name`` will be ignored; use it to overwrite
            existing keys.

    Returns:
        dict: metadata of the added document or figure.

    Raises:
        TypeError: if neither ``file_name`` nor ``key`` is passed (one of
            them is required).
    """
    if not key and not file_name:
        raise TypeError(
            'No file_name and no key passed, at least one of them is '
            'needed.'
        )

    if not key:
        key = self._get_unique_files_key(base_file_name=file_name)

    if stream is not None:
        self.files[key] = stream

    builder = LiteratureBuilder(record=self.to_dict())
    metadata['key'] = key
    metadata['url'] = '/api/files/{bucket}/{key}'.format(
        bucket=self.files[key].bucket_id,
        key=key,
    )

    if is_document:
        builder.add_document(**metadata)
    else:
        builder.add_figure(**metadata)

    super(InspireRecord, self).update(builder.record)

    return metadata
def add_document_or_figure(
    self,
    metadata,
    stream=None,
    is_document=True,
    file_name=None,
    key=None,
):
    """Add a document or figure to the record.

    Args:
        metadata(dict): metadata of the document or figure, see the
            schemas for more details, will be validated.
        stream(file-like object): if passed, the file contents will be
            extracted from it.
        is_document(bool): if the given information is for a document,
            set to ``False`` for a figure.
        file_name(str): name of the file, used as a basis for the key in
            the files store.
        key(str): if passed, will be used as the key for the files store
            and ``file_name`` will be ignored; use it to overwrite
            existing keys.

    Returns:
        dict: metadata of the added document or figure.

    Raises:
        TypeError: if neither ``file_name`` nor ``key`` is passed (one of
            them is required).
    """
    if not key and not file_name:
        raise TypeError(
            'No file_name and no key passed, at least one of them is '
            'needed.'
        )

    if not key:
        key = self._get_unique_files_key(base_file_name=file_name)

    if stream is not None:
        self.files[key] = stream

    builder = LiteratureBuilder(record=self.dumps())
    metadata['key'] = key
    metadata['url'] = '/api/files/{bucket}/{key}'.format(
        bucket=self.files[key].bucket_id,
        key=key,
    )

    if is_document:
        builder.add_document(**metadata)
    else:
        builder.add_figure(**metadata)

    super(InspireRecord, self).update(builder.record)

    return metadata
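A minimal usage sketch covering either variant above; the ``record`` instance, the file name, and the metadata fields are assumptions for illustration:

# Hypothetical: ``record`` is an existing InspireRecord instance.
with open('fulltext.pdf', 'rb') as pdf:
    metadata = record.add_document_or_figure(
        metadata={'source': 'arxiv', 'fulltext': True},
        stream=pdf,
        is_document=True,
        file_name='fulltext.pdf',
    )
# The returned dict now carries the generated 'key' and the
# '/api/files/<bucket>/<key>' URL filled in by the method.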
def add_files(self, documents=None, figures=None):
    """Public method for adding documents and figures.

    Args:
        documents (list[dict]): list of documents which should be added
            to this record.
        figures (list[dict]): list of figures which should be added to
            this record.

        Documents and figures are lists of dicts. The minimal dict which
        should be provided for each file is:
        {
            'url': 'http://... or /api/files/bucket_id/file_key',
            'is_document': True or False (default)
        }

    Returns:
        list: list of dicts with the metadata of the added files.
    """
    if not documents and not figures:
        raise TypeError("No files passed, at least one is needed")

    if not current_app.config.get("FEATURE_FLAG_ENABLE_FILES", False):
        # Files support is disabled: only update the record metadata.
        if figures:
            self.setdefault("figures", []).extend(figures)
        if documents:
            self.setdefault("documents", []).extend(documents)
        return []

    files = []
    builder = LiteratureBuilder(record=self)
    if documents:
        doc_keys = [
            doc_metadata["key"] for doc_metadata in self.get("documents", [])
        ]
        for doc in documents:
            metadata = self._add_file(document=True, **doc)
            if metadata["key"] not in doc_keys:
                builder.add_document(**metadata)
            files.append(metadata)
    if figures:
        fig_keys = [
            fig_metadata["key"] for fig_metadata in self.get("figures", [])
        ]
        for fig in figures:
            metadata = self._add_file(**fig)
            if metadata["key"] not in fig_keys:
                builder.add_figure(**metadata)
            files.append(metadata)
    # FIXME: this is wrong: every time this goes through the ``update``
    # function, which means updating refs, pidstore, etc.
    super().update(builder.record.dumps())
    return files
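A hypothetical call, following the dict shape from the docstring (the URLs are placeholders, ``record`` is an existing record, and the internal ``_add_file`` helper is assumed to accept these fields):

added = record.add_files(
    documents=[{'url': 'http://example.org/fulltext.pdf'}],
    figures=[{'url': 'http://example.org/figure_1.png'}],
)
# ``added`` is the list of metadata dicts collected above, each with a 'key'.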
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball = obj.files[filename]

    if tarball:
        with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            if 'figures' in obj.data:
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                if plot_name in obj.files.keys:
                    key = 'w{number}_{name}'.format(
                        number=index,
                        name=plot_name,
                    )
                with open(plot.get('url')) as plot_file:
                    obj.files[key] = plot_file

                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key,
                    ),
                )

            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))
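Here ``retrieve_uri`` is used as a context manager that yields the downloaded path. The real helper is not shown in this section; a hypothetical equivalent for local ``file://`` URIs could look like:

import os
import shutil
from contextlib import contextmanager

@contextmanager
def retrieve_uri(uri, outdir):
    # Hypothetical sketch: copy a local ``file://`` URI into ``outdir``
    # and yield the resulting path; cleanup is left to TemporaryDirectory.
    local_path = os.path.join(outdir, os.path.basename(uri))
    shutil.copy(uri.replace('file://', '', 1), local_path)
    yield local_path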
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    # Crude way to set memory limits for wand globally.
    mem_limit = current_app.config.get("WAND_MEMORY_LIMIT")
    if mem_limit and limits['memory'] != mem_limit:
        limits['memory'] = mem_limit
    # Set the disk limit too: if it is not set, wand swaps data to disk
    # instead of throwing an exception.
    limits['disk'] = current_app.config.get("WAND_DISK_LIMIT", 0)
    # An exception is thrown when the memory or disk limit is exceeded,
    # so at least the workflow status will be saved.

    arxiv_id = LiteratureReader(obj.data).arxiv_id
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info('No file named=%s for arxiv_id %s', filename, arxiv_id)
        return

    with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            plots = process_tarball(
                tarball_file,
                output_directory=scratch_space,
            )
        except (InvalidTarball, NoTexFilesFound):
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        except DelegateError as err:
            obj.log.error(
                'Error extracting plots for %s. Report and skip.',
                arxiv_id,
            )
            current_app.logger.exception(err)
            return

        if 'figures' in obj.data:
            for figure in obj.data['figures']:
                if figure['key'] in obj.files:
                    del obj.files[figure['key']]
            del obj.data['figures']

        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        for index, plot in enumerate(plots):
            plot_name = os.path.basename(plot.get('url'))
            key = plot_name
            if plot_name in obj.files.keys:
                key = 'w{number}_{name}'.format(
                    number=index,
                    name=plot_name,
                )
            with open(plot.get('url')) as plot_file:
                obj.files[key] = plot_file

            lb.add_figure(
                key=key,
                caption=''.join(plot.get('captions', [])),
                label=plot.get('label'),
                material='preprint',
                url='/api/files/{bucket}/{key}'.format(
                    bucket=obj.files[key].bucket_id,
                    key=key,
                ),
            )

        obj.data = lb.record
        obj.log.info('Added {0} plots.'.format(len(plots)))
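The two config knobs consumed above would be set in the Flask application config. The names come from the code; the example values and the byte unit (as used by wand's resource limits) are assumptions:

# Hypothetical config values (assumed to be bytes).
WAND_MEMORY_LIMIT = 512 * 1024 * 1024  # cap ImageMagick memory use at 512 MB
WAND_DISK_LIMIT = 0                    # 0 forbids swapping to disk, so wand raises instead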