def _resized_image(self, file):
    '''Return a dimension-reduced version of "file", or None on failure.

    Reuses a ".handprint" file left over from a previous run when its
    dimensions already fit within self._max_dimensions.
    '''
    max_width, max_height = self._max_dimensions
    suffix = '.handprint' + filename_extension(file)
    resized_path = file if suffix in file else filename_basename(file) + suffix
    if path.exists(resized_path) and readable(resized_path):
        from handprint.images import image_dimensions
        width, height = image_dimensions(resized_path)
        if width < max_width and height < max_height:
            inform(f'Using reduced image found in {relative(resized_path)}')
            return resized_path
        # A previously-reduced file exists, but for the current set of
        # services its dimensions are too large; reduce it again below.
        if __debug__: log('existing resized file larger than'
                          + f' {max_width}x{max_height}: {resized_path}')
    inform(f'Dimensions too large; reducing dimensions: {relative(file)}')
    from handprint.images import reduced_image_dimensions
    result, error = reduced_image_dimensions(file, resized_path,
                                             max_width, max_height)
    if error:
        alert(f'Failed to re-dimension {relative(file)}: {error}')
        return None
    return result
def _write_report(self, report_file, report_format, title, article_list):
    '''Write a report about the articles in "article_list".

    "report_format" is a comma-separated list of formats; "csv" and "html"
    are supported.  One output file per format is written, named after
    "report_file" with the format as the extension.  Raises ValueError if
    an unsupported format is requested.
    '''
    # Local import in keeping with this file's delayed-import style.
    import html
    for fmt in report_format.split(','):
        dest_file = filename_basename(report_file) + '.' + fmt
        if fmt == "csv":
            with open(dest_file, 'w', newline='') as file:
                file.write('Status,DOI,Date,URL\n')
                csvwriter = csv.writer(file, delimiter=',')
                for article in article_list:
                    csvwriter.writerow([article.status, article.doi,
                                        article.date, article.pdf])
        elif fmt == "html":
            with open(dest_file, 'w', newline='') as file:
                file.write(_HTML_REPORT_TOP.format(
                    title or 'Report for ' + timestamp()))
                for article in article_list:
                    # Escape field values so stray markup characters in the
                    # metadata cannot break (or inject into) the HTML table.
                    file.write('<tr>')
                    file.write('<td>' + html.escape(article.status) + '</td>')
                    file.write('<td>' + html.escape(article.doi) + '</td>')
                    file.write('<td>' + html.escape(article.date) + '</td>')
                    file.write('<td><a href="{0}">{0}</a></td>'.format(
                        html.escape(article.pdf, quote=True)))
                    file.write('</tr>')
                file.write(_HTML_REPORT_BOTTOM)
        else:
            raise ValueError('Unsupported report format "' + fmt + '"')
def _smaller_file(self, file):
    '''Return a size-reduced version of "file", or None on failure.

    Reuses a ".handprint" file left over from a previous run when it is
    already smaller than self._max_size.  Returns None if "file" is empty
    or if resizing fails.
    '''
    if not file:
        return None
    file_ext = filename_extension(file)
    name_tail = '.handprint' + file_ext
    new_file = file if name_tail in file else filename_basename(file) + name_tail
    # Also test readability, for consistency with _resized_image(); an
    # unreadable leftover file cannot be reused anyway.
    if path.exists(new_file) and readable(new_file):
        from handprint.images import image_size
        if image_size(new_file) < self._max_size:
            inform(f'Reusing resized image found in {relative(new_file)}')
            return new_file
        else:
            # We found a ".handprint.ext" file, perhaps from a previous run,
            # but for the current set of services, it's larger than allowed.
            if __debug__: log('existing resized file larger than'
                              + f' {self._max_size}b: {new_file}')
    inform(f'Size too large; reducing size: {relative(file)}')
    from handprint.images import reduced_image_size
    (resized, error) = reduced_image_size(file, new_file, self._max_size)
    if error:
        alert(f'Failed to resize {relative(file)}: {error}')
        return None
    return resized
def _converted_file(self, file, to_format, dest_dir):
    '''Return a copy of "file" converted to "to_format", placed in "dest_dir".

    Returns None if the conversion fails.  Reuses a converted file left
    over from a previous run when one exists.
    '''
    stem = path.basename(filename_basename(file))
    target = path.join(dest_dir, stem + '.handprint.' + to_format)
    if path.exists(target):
        inform(f'Using existing converted image in {relative(target)}')
        return target
    inform(f'Converting to {to_format} format: {relative(file)}')
    from handprint.images import converted_image
    result, error = converted_image(file, to_format, target)
    if error:
        alert(f'Failed to convert {relative(file)}: {error}')
        return None
    return result
def converted_image(orig_file, to_format, dest_file=None):
    '''Convert image "orig_file" to format "to_format".

    Returns a tuple of (new_file, error).  The value of 'error' will be
    None if no error occurred; otherwise, the value will be a string
    summarizing the error that occurred and 'new_file' will be set to None.
    If "dest_file" is not given, it is derived from "orig_file".
    '''
    dest_format = canonical_format_name(to_format)
    if dest_file is None:
        # Bug fix: this previously referenced an undefined name "file",
        # raising NameError whenever dest_file was not supplied.
        dest_file = filename_basename(orig_file) + '.' + dest_format
    # PIL is unable to read PDF files, so in that particular case, we have to
    # convert it using another tool.
    if filename_extension(orig_file) == '.pdf':
        import fitz
        doc = fitz.open(orig_file)
        if len(doc) >= 1:
            if len(doc) >= 2:
                if __debug__: log(f'{orig_file} has > 1 images; using only 1st')
                # FIXME: if there's more than 1 image, we could extract the
                # rest.  Doing so will require some architectural changes first.
            if __debug__: log(f'extracting 1st image from {relative(dest_file)}')
            page = doc[0]
            pix = page.getPixmap(alpha = False)
            if __debug__: log(f'writing {relative(dest_file)}')
            pix.writeImage(dest_file, dest_format)
            return (dest_file, None)
        else:
            if __debug__: log(f'fitz says there is no image in {relative(orig_file)}')
            return (None, f'Cannot find an image inside {relative(orig_file)}')
    else:
        # When converting images, PIL may issue a DecompressionBombWarning but
        # it's not a concern in our application.  Ignore it.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                im = Image.open(orig_file)
                if __debug__: log(f'converting {relative(orig_file)} to RGB')
                # Bug fix: Image.convert() returns a new image; the previous
                # code discarded the result, so no conversion took place.
                im = im.convert('RGB')
                if __debug__: log(f'saving converted image to {relative(dest_file)}')
                if orig_file == dest_file:
                    im.seek(0)
                im.save(dest_file, dest_format)
                return (dest_file, None)
            except Exception as ex:
                return (None, str(ex))
def targets_from_arguments(self):
    '''Return a list of files and URLs to process, from the user's arguments.

    Sources are either the file named by self.from_file (one target per
    line) or the items in self.files (URLs, image files, or directories to
    scan).  Outputs from past runs (names containing ".handprint") are
    filtered out, and when both our output format and another format of
    the same file are present, only ours is kept.
    '''
    # Validator_collection takes a long time to load.  Delay loading it
    # until needed, so that overall application startup time is faster.
    from validator_collection.checkers import is_url

    targets = []
    if self.from_file:
        if __debug__: log(f'reading {self.from_file}')
        # Use a context manager so the file handle is closed promptly
        # (the previous code leaked the handle).
        with open(self.from_file) as input_file:
            targets = [line for line in input_file.read().splitlines() if line]
    else:
        for item in self.files:
            if is_url(item):
                targets.append(item)
            elif isfile(item) and filename_extension(item) in ACCEPTED_FORMATS:
                targets.append(item)
            elif isdir(item):
                # It's a directory, so look for files within.
                targets += files_in_directory(item, extensions = ACCEPTED_FORMATS)
            else:
                warn(f'"{item}" not a file or directory')

    # Filter files created in past runs.
    targets = [name for name in targets if '.handprint' not in name]

    # If there is both a file in the format we generate and another format
    # of that file, ignore the other formats and just use ours.  A set
    # makes each membership test below O(1) instead of O(n).
    targets_set = set(targets)
    keep = []
    for item in targets:
        ext  = filename_extension(item)
        base = filename_basename(item)
        if ext != _OUTPUT_EXT and (base + _OUTPUT_EXT in targets_set):
            # Our-format version of this file is also present => skip this one.
            continue
        keep.append(item)
    return keep
def run_services(self, item, index, base_name):
    '''Run all requested services on the image indicated by "item", using
    "index" and "base_name" to construct a download copy of the item if
    it has to be downloaded from a URL first.
    '''
    # Shortcuts to make the code more readable.
    services = self._services

    inform(f'Starting on [white]{item}[/]')
    (item_file, item_fmt) = self._get(item, base_name, index)
    if not item_file:
        return

    dest_dir = self._output_dir if self._output_dir else path.dirname(item_file)
    if not writable(dest_dir):
        alert(f'Cannot write output in {dest_dir}.')
        return

    # Normalize input image to the lowest common denominator.
    image = self._normalized(item, item_fmt, item_file, dest_dir)
    if not image.file:
        warn(f'Skipping {relative(item_file)}')
        return

    # Send the file to the services and get Result tuples back.
    self._senders = []
    if self._num_threads == 1:
        # For 1 thread, avoid thread pool to make debugging easier.
        results = [self._send(image, s) for s in services]
    else:
        # Bug fix: use a "with" block so the executor's worker threads are
        # shut down when we're done, instead of leaking them on every call.
        with ThreadPoolExecutor(max_workers = self._num_threads,
                                thread_name_prefix = 'ServiceThread') as executor:
            for service in services:
                future = executor.submit(self._send, image, service)
                self._senders.append(future)
            results = [future.result() for future in self._senders]

    # If a service failed for some reason (e.g., a network glitch), we
    # get no result back.  Remove empty results & go on with the rest.
    results = [x for x in results if x is not None]
    if not results:
        warn(f'Nothing to do for {item}')
        return

    # Create grid file if requested.
    if self._make_grid:
        base = path.basename(filename_basename(item_file))
        grid_file = path.realpath(path.join(dest_dir, base + '.handprint-all.png'))
        inform(f'Creating results grid image: {relative(grid_file)}')
        all_results = [r.annotated for r in results]
        width = math.ceil(math.sqrt(len(all_results)))
        from handprint.images import create_image_grid
        create_image_grid(all_results, grid_file, max_horizontal = width)

    # Clean up after ourselves.
    if not self._extended_results:
        for file in set(image.temp_files | {r.annotated for r in results}):
            if file and path.exists(file):
                delete_existing(file)
    elif image.file != image.item_file:
        # Delete the resized file.  While it would help efficiency to
        # reuse it on subsequent runs, the risk is that those runs might
        # target different services and would end up using a different-
        # sized image than if we sized it appropriately for _this_ run.
        delete_existing(image.file)

    inform(f'Done with {relative(item)}')
def _save_article_pmc(self, dest_dir, article, xml, zip_articles):
    '''Download and save the files for one PMC article into "dest_dir".

    Fetches the article's PDF, JATS XML, and (if present) image; converts
    the image to uncompressed TIFF; and, if "zip_articles" is True and no
    step failed, bundles everything into a per-article ZIP archive and
    deletes the individual files.  Failures are recorded by setting
    article.status to a 'failed-*' value rather than by raising.
    NOTE(review): the "xml" parameter is not used in this body — confirm
    whether it is needed by callers or vestigial.
    '''
    inform('Writing ' + article.doi)
    # Files to be bundled into the article's ZIP archive at the end.
    to_archive = []
    pdf_file = pmc_pdf_filename(article, dest_dir)
    if __debug__: log(f'downloading PDF to {pdf_file}')
    if not download_file(article.pdf, pdf_file):
        warn(f'Could not download PDF file for {article.doi}')
        article.status = 'failed-pdf-download'
    # Note: appended even if the download failed; the failed status
    # prevents the ZIP from being created below anyway.
    to_archive.append(pdf_file)
    jats_file = jats_filename(article, dest_dir)
    if __debug__: log(f'downloading JATS XML to {jats_file}')
    if not download_file(article.jats, jats_file):
        warn(f'Could not download JATS file for {article.doi}')
        article.status = 'failed-jats-download'
    if self.do_validate:
        # Validate the downloaded JATS against the DTD loaded at startup.
        if not valid_xml(jats_file, self._dtd):
            warn(f'Failed to validate JATS for article {article.doi}')
            article.status = 'failed-jats-validation'
    else:
        if __debug__: log(f'skipping DTD validation of {jats_file}')
    to_archive.append(jats_file)
    # We need to store the image with the name that appears in the
    # JATS file.  That requires a little extra work to extract.
    image_extension = filename_extension(article.image)
    image_file = image_filename(article, dest_dir, ext = image_extension)
    if article.image:
        if __debug__: log(f'downloading image file to {image_file}')
        if download_file(article.image, image_file):
            with Image.open(image_file) as img:
                # Drop any alpha channel and force RGB before writing TIFF.
                converted_img = image_without_alpha(img)
                converted_img = converted_img.convert('RGB')
                if __debug__: log(f'converting image to TIFF format')
                tiff_file = filename_basename(image_file) + '.tif'
                # Using save() means that only the 1st frame of a
                # multiframe image will be saved.
                converted_img.save(tiff_file, dpi = _TIFF_DPI,
                                   compression = None,
                                   description = tiff_comments(article))
                to_archive.append(tiff_file)
            # We keep only the uncompressed TIFF version.
            if __debug__: log(f'deleting original image file {image_file}')
            delete_existing(image_file)
        else:
            warn(f'Failed to download image for {article.doi}')
            article.status = 'failed-image-download'
    else:
        if __debug__: log(f'skipping empty image file URL for {article.doi}')
    # Finally, put the files into their own zip archive.
    if zip_articles:
        if not article.status.startswith('failed'):
            zip_file = pmc_zip_filename(article, dest_dir)
            inform(f'Creating ZIP archive file "{zip_file}"')
            archive_files(zip_file, to_archive)
            if __debug__: log(f'verifying ZIP file {zip_file}')
            verify_archive(zip_file, 'zip')
            # The archived copies replace the loose files.
            for file in to_archive:
                if __debug__: log(f'deleting file {file}')
                delete_existing(file)
        else:
            warn(f'ZIP archive for {article.doi} not created due to errors')
def _save_article_portico(self, dest_dir, article, xmldict):
    '''Save the files for one article in the layout expected by Portico.

    Creates a per-article directory under "dest_dir" (with a "jats"
    subdirectory when the journal uses JATS), writes the metadata XML from
    "xmldict", downloads the PDF, and — for JATS journals — downloads the
    JATS XML and article image, converting the image to uncompressed TIFF.
    Failures are recorded by setting article.status to a 'failed-*' value
    rather than by raising.
    '''
    article_dir = path.join(dest_dir, article.basename)
    jats_dir = path.join(article_dir, 'jats')
    try:
        os.makedirs(article_dir)
        if self.journal.uses_jats:
            os.makedirs(jats_dir)
    except FileExistsError:
        # The directories may already exist from a previous run; that's fine.
        pass
    inform('Writing ' + article.doi)
    xml_file = xml_filename(article, article_dir)
    with open(xml_file, 'w', encoding = 'utf8') as f:
        if __debug__: log(f'writing XML to {xml_file}')
        f.write(xmltodict.unparse(xmldict, pretty = True))
    pdf_file = pdf_filename(article, article_dir)
    if __debug__: log(f'downloading PDF to {pdf_file}')
    if not download_file(article.pdf, pdf_file):
        warn(f'Could not download PDF file for {article.doi}')
        article.status = 'failed-pdf-download'
    if not self.journal.uses_jats:
        # Nothing more to do.
        return
    jats_file = jats_filename(article, jats_dir)
    if __debug__: log(f'downloading JATS XML to {jats_file}')
    if not download_file(article.jats, jats_file):
        warn(f'Could not download JATS file for {article.doi}')
        article.status = 'failed-jats-download'
    if self.do_validate:
        # Validate the downloaded JATS against the DTD loaded at startup.
        if not valid_xml(jats_file, self._dtd):
            warn(f'Failed to validate JATS for article {article.doi}')
            article.status = 'failed-jats-validation'
    else:
        if __debug__: log(f'skipping DTD validation of {jats_file}')
    # We need to store the image with the name that appears in the
    # JATS file.  That requires a little extra work to extract.
    image_extension = filename_extension(article.image)
    image_file = image_filename(article, jats_dir, ext = image_extension)
    if article.image:
        if __debug__: log(f'downloading image file to {image_file}')
        if download_file(article.image, image_file):
            with Image.open(image_file) as img:
                # Drop any alpha channel and force RGB before writing TIFF.
                converted = image_without_alpha(img)
                converted = converted.convert('RGB')
                if __debug__: log(f'converting image to TIFF format')
                tiff_name = filename_basename(image_file) + '.tif'
                comments = tiff_comments(article, self.journal.name)
                # Using save() means only the 1st frame of a multiframe
                # image will be saved.
                converted.save(tiff_name, compression = None,
                               dpi = _TIFF_DPI, description = comments)
            # We keep only the uncompressed TIFF version.
            if __debug__: log(f'deleting original image file {image_file}')
            delete_existing(image_file)
        else:
            warn(f'Failed to download image for {article.doi}')
            article.status = 'failed-image-download'
    else:
        if __debug__: log(f'skipping empty image URL for {article.doi}')