def test_converted_image():
    '''Test that converted_image() produces a TIFF file without error.

    Expects converted_image() to return a (path, error) tuple where the
    path is a string and the error is None on success.
    '''
    # mkstemp() returns an OPEN file descriptor as its first element.
    # The original code discarded it ("_"), leaking one descriptor per
    # call; close it immediately since only the pathname is needed.
    fd, tmpfile = tempfile.mkstemp(dir='/tmp', suffix='.tiff')
    os.close(fd)
    thisdir = path.dirname(os.path.abspath(__file__))
    f1_file = path.join(thisdir, 'data', 'fragments', 'f1.png')
    (a, b) = converted_image(f1_file, 'tif', tmpfile)
    assert isinstance(a, str)
    assert b is None
    delete_existing(tmpfile)
def test_reduced_image_dimensions():
    '''Test that reduced_image_dimensions() resizes to fit a bounding box.

    The 100x100 request yields (100, 31), so the helper evidently
    preserves the source image's aspect ratio rather than stretching.
    '''
    # mkstemp() returns an OPEN file descriptor as its first element.
    # The original code discarded it ("_"), leaking one descriptor per
    # call; close it immediately since only the pathname is needed.
    fd, tmpfile = tempfile.mkstemp(dir='/tmp', suffix='.png')
    os.close(fd)
    thisdir = path.dirname(os.path.abspath(__file__))
    f1_file = path.join(thisdir, 'data', 'fragments', 'f1.png')
    (a, b) = reduced_image_dimensions(f1_file, tmpfile, 100, 100)
    assert isinstance(a, str)
    assert b is None
    assert image_dimensions(tmpfile) == (100, 31)
    delete_existing(tmpfile)
def _save_article_pmc(self, dest_dir, article, xml, zip_articles):
    '''Download a PMC article's PDF, JATS XML, and image into dest_dir.

    Downloads each component in sequence; a failed download or
    validation sets article.status to a 'failed-*' value but does NOT
    abort the remaining steps.  If zip_articles is true and no step
    failed, the collected files are zipped, verified, and the loose
    copies deleted.  NOTE(review): the "xml" parameter is never used in
    this body — confirm whether it can be dropped at the call sites.
    '''
    inform('Writing ' + article.doi)
    # Files that should end up in the per-article ZIP archive.
    to_archive = []
    pdf_file = pmc_pdf_filename(article, dest_dir)
    if __debug__: log(f'downloading PDF to {pdf_file}')
    if not download_file(article.pdf, pdf_file):
        warn(f'Could not download PDF file for {article.doi}')
        article.status = 'failed-pdf-download'
    # Appended unconditionally; on failure the ZIP step below is
    # skipped anyway because status starts with 'failed'.
    to_archive.append(pdf_file)
    jats_file = jats_filename(article, dest_dir)
    if __debug__: log(f'downloading JATS XML to {jats_file}')
    if not download_file(article.jats, jats_file):
        warn(f'Could not download JATS file for {article.doi}')
        article.status = 'failed-jats-download'
    if self.do_validate:
        # DTD validation is optional; self._dtd is presumably loaded
        # elsewhere in this class — not visible here.
        if not valid_xml(jats_file, self._dtd):
            warn(f'Failed to validate JATS for article {article.doi}')
            article.status = 'failed-jats-validation'
    else:
        if __debug__: log(f'skipping DTD validation of {jats_file}')
    to_archive.append(jats_file)
    # We need to store the image with the name that appears in the
    # JATS file. That requires a little extra work to extract.
    image_extension = filename_extension(article.image)
    image_file = image_filename(article, dest_dir, ext=image_extension)
    if article.image:
        if __debug__: log(f'downloading image file to {image_file}')
        if download_file(article.image, image_file):
            with Image.open(image_file) as img:
                # Flatten alpha and force RGB so the TIFF is plain RGB.
                converted_img = image_without_alpha(img)
                converted_img = converted_img.convert('RGB')
                if __debug__: log(f'converting image to TIFF format')
                tiff_file = filename_basename(image_file) + '.tif'
                # Using save() means that only the 1st frame of a
                # multiframe image will be saved.
                converted_img.save(tiff_file, dpi=_TIFF_DPI, compression=None, description=tiff_comments(article))
            to_archive.append(tiff_file)
            # We keep only the uncompressed TIFF version.
            if __debug__: log(f'deleting original image file {image_file}')
            delete_existing(image_file)
        else:
            warn(f'Failed to download image for {article.doi}')
            article.status = 'failed-image-download'
    else:
        if __debug__: log(f'skipping empty image file URL for {article.doi}')
    # Finally, put the files into their own zip archive.
    if zip_articles:
        if not article.status.startswith('failed'):
            zip_file = pmc_zip_filename(article, dest_dir)
            inform(f'Creating ZIP archive file "{zip_file}"')
            archive_files(zip_file, to_archive)
            if __debug__: log(f'verifying ZIP file {zip_file}')
            verify_archive(zip_file, 'zip')
            # Once zipped and verified, the loose files are redundant.
            for file in to_archive:
                if __debug__: log(f'deleting file {file}')
                delete_existing(file)
        else:
            warn(f'ZIP archive for {article.doi} not created due to errors')
def run_services(self, item, index, base_name):
    '''Run all requested services on the image indicated by "item",
    using "index" and "base_name" to construct a download copy of the
    item if it has to be downloaded from a URL first.

    Returns early (with a warning or alert) if the item cannot be
    fetched, the output directory is not writable, normalization fails,
    or every service call came back empty.
    '''
    # Shortcuts to make the code more readable.
    services = self._services

    inform(f'Starting on [white]{item}[/]')
    (item_file, item_fmt) = self._get(item, base_name, index)
    if not item_file:
        return

    dest_dir = self._output_dir if self._output_dir else path.dirname(item_file)
    if not writable(dest_dir):
        alert(f'Cannot write output in {dest_dir}.')
        return

    # Normalize input image to the lowest common denominator.
    image = self._normalized(item, item_fmt, item_file, dest_dir)
    if not image.file:
        warn(f'Skipping {relative(item_file)}')
        return

    # Send the file to the services and get Result tuples back.
    self._senders = []
    if self._num_threads == 1:
        # For 1 thread, avoid thread pool to make debugging easier.
        results = [self._send(image, s) for s in services]
    else:
        # BUG FIX: the executor was previously created without ever
        # being shut down, leaking its worker threads on every call.
        # The "with" block guarantees shutdown (and joins the workers)
        # once all futures have been collected.
        with ThreadPoolExecutor(max_workers=self._num_threads,
                                thread_name_prefix='ServiceThread') as executor:
            for service in services:
                future = executor.submit(self._send, image, service)
                self._senders.append(future)
            results = [future.result() for future in self._senders]

    # If a service failed for some reason (e.g., a network glitch), we
    # get no result back. Remove empty results & go on with the rest.
    results = [x for x in results if x is not None]
    if not results:
        warn(f'Nothing to do for {item}')
        return

    # Create grid file if requested.
    if self._make_grid:
        base = path.basename(filename_basename(item_file))
        grid_file = path.realpath(path.join(dest_dir, base + '.handprint-all.png'))
        inform(f'Creating results grid image: {relative(grid_file)}')
        all_results = [r.annotated for r in results]
        # Arrange the annotated images in a roughly square grid.
        width = math.ceil(math.sqrt(len(all_results)))
        from handprint.images import create_image_grid
        create_image_grid(all_results, grid_file, max_horizontal=width)

    # Clean up after ourselves.
    if not self._extended_results:
        for file in set(image.temp_files | {r.annotated for r in results}):
            if file and path.exists(file):
                delete_existing(file)
    elif image.file != image.item_file:
        # Delete the resized file. While it would help efficiency to
        # reuse it on subsequent runs, the risk is that those runs might
        # target different services and would end up using a different-
        # sized image than if we sized it appropriately for _this_ run.
        delete_existing(image.file)

    inform(f'Done with {relative(item)}')
def _save_article_portico(self, dest_dir, article, xmldict):
    '''Write a Portico article's metadata XML, PDF, JATS, and image.

    Creates the per-article directory (and a "jats" subdirectory when
    the journal uses JATS), writes the unparsed xmldict as XML, then
    downloads the PDF.  If the journal does not use JATS, it stops
    there; otherwise it also downloads (and optionally validates) the
    JATS file and converts the article image to uncompressed TIFF.
    Failures set article.status to a 'failed-*' value but do not abort
    the remaining steps.
    '''
    article_dir = path.join(dest_dir, article.basename)
    jats_dir = path.join(article_dir, 'jats')
    try:
        os.makedirs(article_dir)
        if self.journal.uses_jats:
            os.makedirs(jats_dir)
    except FileExistsError:
        # Directories may already exist from a previous run; reuse them.
        pass
    inform('Writing ' + article.doi)
    xml_file = xml_filename(article, article_dir)
    with open(xml_file, 'w', encoding='utf8') as f:
        if __debug__: log(f'writing XML to {xml_file}')
        f.write(xmltodict.unparse(xmldict, pretty=True))
    pdf_file = pdf_filename(article, article_dir)
    if __debug__: log(f'downloading PDF to {pdf_file}')
    if not download_file(article.pdf, pdf_file):
        warn(f'Could not download PDF file for {article.doi}')
        article.status = 'failed-pdf-download'
    if not self.journal.uses_jats:
        # Nothing more to do.
        return
    jats_file = jats_filename(article, jats_dir)
    if __debug__: log(f'downloading JATS XML to {jats_file}')
    if not download_file(article.jats, jats_file):
        warn(f'Could not download JATS file for {article.doi}')
        article.status = 'failed-jats-download'
    if self.do_validate:
        # DTD validation is optional; self._dtd is presumably loaded
        # elsewhere in this class — not visible here.
        if not valid_xml(jats_file, self._dtd):
            warn(f'Failed to validate JATS for article {article.doi}')
            article.status = 'failed-jats-validation'
    else:
        if __debug__: log(f'skipping DTD validation of {jats_file}')
    # We need to store the image with the name that appears in the
    # JATS file. That requires a little extra work to extract.
    image_extension = filename_extension(article.image)
    image_file = image_filename(article, jats_dir, ext=image_extension)
    if article.image:
        if __debug__: log(f'downloading image file to {image_file}')
        if download_file(article.image, image_file):
            with Image.open(image_file) as img:
                # Flatten alpha and force RGB so the TIFF is plain RGB.
                converted = image_without_alpha(img)
                converted = converted.convert('RGB')
                if __debug__: log(f'converting image to TIFF format')
                tiff_name = filename_basename(image_file) + '.tif'
                comments = tiff_comments(article, self.journal.name)
                # Using save() means only the 1st frame of a multiframe
                # image will be saved.
                converted.save(tiff_name, compression=None, dpi=_TIFF_DPI, description=comments)
            # We keep only the uncompressed TIFF version.
            if __debug__: log(f'deleting original image file {image_file}')
            delete_existing(image_file)
        else:
            warn(f'Failed to download image for {article.doi}')
            article.status = 'failed-image-download'
    else:
        if __debug__: log(f'skipping empty image URL for {article.doi}')