def _do_main_work(self):
    '''Announce the run settings, then write a link into every target file.

    For each file in self._targets, the Zotero record is looked up via
    self._zotero.record_for_file(); files for which the lookup reports a
    failure are skipped with a warning.  Each writer in self._writers is
    then asked to write the record's link, unless the writer declares a
    file extension that differs from the file's extension.
    '''
    if self.overwrite:
        warn('Overwrite mode in effect.')
    if self.dry_run:
        warn('Running in dry run mode – will not modify files.')

    files_text = pluralized("file", self._targets, True)
    methods_text = pluralized("method", self.methods)
    inform(f'Will process {files_text} using {methods_text}'
           f' [cyan2]{", ".join(self.methods)}[/].')

    # Give the user a heads-up when the amount of work is unusually large.
    num_targets = len(self._targets)
    if num_targets > 10000:
        inform("(That's a huge number of files – this will take a long time.)")
    elif num_targets > 1000:
        inform("(That's a lot of files – this will take some time.)")

    for file in self._targets:
        record, failure = self._zotero.record_for_file(file)
        if failure:
            warn(failure)
            continue
        ext = filename_extension(file)
        for writer in self._writers:
            # A writer that names a file extension only applies to files
            # having exactly that extension.
            required_ext = writer.file_extension()
            if required_ext and ext != required_ext:
                shown = antiformat(f'[steel_blue3]{file}[/]')
                warn(f"Method [cyan2]{writer.name()}[/] can't be used on {shown}")
                continue
            writer.write_link(file, record.link)
def _resized_image(self, file):
    '''Return the path of a version of "file" within the maximum dimensions.

    The reduced copy is named by replacing the extension of "file" with
    ".handprint" + the original extension (or is "file" itself if its name
    already contains that tail).  If such a file already exists, is
    readable, and is strictly smaller than self._max_dimensions in both
    width and height, it is reused.  Otherwise a new reduced image is
    produced with reduced_image_dimensions().  Returns None on failure.
    '''
    width_limit, height_limit = self._max_dimensions
    suffix = '.handprint' + filename_extension(file)
    if suffix in file:
        candidate = file
    else:
        candidate = filename_basename(file) + suffix

    if path.exists(candidate) and readable(candidate):
        from handprint.images import image_dimensions
        width, height = image_dimensions(candidate)
        if width < width_limit and height < height_limit:
            inform(f'Using reduced image found in {relative(candidate)}')
            return candidate
        # A ".handprint" file exists (perhaps from a previous run), but it
        # exceeds the limits in effect now, so it has to be regenerated.
        if __debug__: log(f'existing resized file larger than {width_limit}x{height_limit}: {candidate}')

    inform(f'Dimensions too large; reducing dimensions: {relative(file)}')
    from handprint.images import reduced_image_dimensions
    outcome = reduced_image_dimensions(file, candidate, width_limit, height_limit)
    resized, error = outcome
    if error:
        alert(f'Failed to re-dimension {relative(file)}: {error}')
        return None
    return resized
def _smaller_file(self, file):
    '''Return the path of a version of "file" smaller than self._max_size.

    Returns None if "file" is empty/None or if the size reduction fails.
    A previously generated ".handprint" + extension file is reused when it
    exists and image_size() reports it as strictly below self._max_size;
    otherwise reduced_image_size() is asked to produce a new one.
    '''
    if not file:
        return None

    suffix = '.handprint' + filename_extension(file)
    candidate = filename_basename(file) + suffix if suffix not in file else file

    if path.exists(candidate):
        from handprint.images import image_size
        if image_size(candidate) < self._max_size:
            inform(f'Reusing resized image found in {relative(candidate)}')
            return candidate
        # A ".handprint.ext" file exists (perhaps from a previous run), but
        # it is larger than what is allowed now, so it must be regenerated.
        if __debug__: log(f'existing resized file larger than {self._max_size}b: {candidate}')

    inform(f'Size too large; reducing size: {relative(file)}')
    from handprint.images import reduced_image_size
    outcome = reduced_image_size(file, candidate, self._max_size)
    resized, error = outcome
    if error:
        alert(f'Failed to resize {relative(file)}: {error}')
        return None
    return resized
def targets_from_arguments(self):
    '''Return the list of targets (files and/or URLs) to be processed.

    If self.from_file is set, the targets are the non-empty lines of that
    file.  Otherwise, each item in self.files is examined: URLs are taken
    as-is, files are taken if their extension is in ACCEPTED_FORMATS, and
    directories are searched with files_in_directory().  Anything else
    produces a warning.

    Two filters are then applied: names containing ".handprint" (files
    created in past runs) are dropped, and when a file with extension
    _OUTPUT_EXT is present alongside other formats of the same base name,
    only the _OUTPUT_EXT version is kept.
    '''
    if self.from_file:
        if __debug__: log(f'reading {self.from_file}')
        # Use a context manager so the file handle is closed promptly.
        with open(self.from_file) as input_file:
            targets = [line for line in input_file.read().splitlines() if line]
    else:
        # Validator_collection takes a long time to load.  Delay loading it
        # until needed, so that overall application startup time is faster.
        from validator_collection.checkers import is_url
        targets = []
        for item in self.files:
            if is_url(item):
                targets.append(item)
            elif isfile(item) and filename_extension(item) in ACCEPTED_FORMATS:
                targets.append(item)
            elif isdir(item):
                # It's a directory, so look for files within.
                targets += files_in_directory(item, extensions=ACCEPTED_FORMATS)
            else:
                warn(f'"{item}" not a file or directory')

    # Filter files created in past runs.
    targets = [name for name in targets if '.handprint' not in name]

    # If there is both a file in the format we generate and another
    # format of that file, ignore the other formats and just use ours.
    # A set makes each membership test O(1) instead of a scan of the list.
    present = set(targets)
    keep = []
    for item in targets:
        if (filename_extension(item) != _OUTPUT_EXT
                and filename_basename(item) + _OUTPUT_EXT in present):
            # Our output-format version of the file is also present, so
            # skip this other version.
            continue
        keep.append(item)
    return keep
def converted_image(orig_file, to_format, dest_file=None):
    '''Convert the image in "orig_file" to format "to_format".

    If "dest_file" is None, the output path is the base name of "orig_file"
    with the canonical name of the destination format as its extension.

    Returns a tuple of (new_file, error).  The value of 'error' will be None
    if no error occurred; otherwise, the value will be a string summarizing
    the error that occurred and 'new_file' will be set to None.
    '''
    dest_format = canonical_format_name(to_format)
    if dest_file is None:
        dest_file = filename_basename(orig_file) + '.' + dest_format

    # PIL is unable to read PDF files, so in that particular case, we have to
    # convert it using another tool.
    if filename_extension(orig_file).lower() == '.pdf':
        import fitz
        doc = fitz.open(orig_file)
        try:
            if len(doc) < 1:
                if __debug__: log(f'fitz says there is no image image in {relative(orig_file)}')
                return (None, f'Cannot find an image inside {relative(orig_file)}')
            if len(doc) >= 2:
                if __debug__: log(f'{orig_file} has > 1 images; using only 1st')
            # FIXME: if there's more than 1 image, we could extract the rest.
            # Doing so will require some architectural changes first.
            if __debug__: log(f'extracting 1st image from {relative(orig_file)}')
            page = doc[0]
            pix = page.getPixmap(alpha=False)
            if __debug__: log(f'writing {relative(dest_file)}')
            pix.writeImage(dest_file, dest_format)
            return (dest_file, None)
        finally:
            # Release the document whether or not the extraction succeeded.
            doc.close()

    # When converting images, PIL may issue a DecompressionBombWarning but
    # it's not a concern in our application.  Ignore it.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        try:
            with Image.open(orig_file) as source:
                if __debug__: log(f'converting {relative(orig_file)} to RGB')
                # convert() returns a new image rather than modifying the
                # source in place, so the result must be kept and saved.
                converted = source.convert('RGB')
            # The source file is closed at this point and the converted copy
            # is held in memory, so writing is safe even when dest_file is
            # the same path as orig_file.
            if __debug__: log(f'saving converted image to {relative(dest_file)}')
            converted.save(dest_file, dest_format)
            return (dest_file, None)
        except Exception as ex:
            # By contract, failures are reported via the returned tuple.
            return (None, str(ex))
def _get(self, item, base_name, index):
    '''Resolve "item" (a URL or a file path) to a local image file.

    For a URL, the content type is checked first; if it is an image, the
    file is downloaded as "<base_name>-<index>.<subtype>" into the output
    directory (or the current directory if none was given), and the URL is
    recorded in a companion ".url" file.  For a file path, the path is
    resolved relative to the current directory and checked for zero length.

    Returns a tuple (file, format), or (None, None) if the item is skipped.
    '''
    # Shortcuts to make the code more readable.
    output_dir = self._output_dir

    # For URLs, we download the corresponding files and name them with
    # the base_name.
    from validator_collection.checkers import is_url
    if is_url(item):
        # First make sure the URL actually points to an image.
        if __debug__: log(f'testing if URL contains an image: {item}')
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}
        try:
            request = urllib.request.Request(item, None, headers)
            # Only the response headers are needed here; the context manager
            # makes sure the connection is closed once they have been read.
            with urllib.request.urlopen(request) as response:
                main_type = response.headers.get_content_maintype()
                orig_fmt = response.headers.get_content_subtype()
        except Exception as ex:
            warn(f'Skipping URL due to error: {ex}')
            return (None, None)
        if main_type != 'image':
            warn(f'Did not find an image at {item}')
            return (None, None)
        base = f'{base_name}-{index}'
        # If we weren't given an output dir, then for URLs, we have no
        # choice but to use the current dir to download the file.
        # Important: don't change self._output_dir because if other
        # inputs *are* files, then those files will need other output dirs.
        if not output_dir:
            output_dir = os.getcwd()
        file = path.realpath(path.join(output_dir, base + '.' + orig_fmt))
        if not download_file(item, file):
            warn(f'Unable to download {item}')
            return (None, None)
        url_file = path.realpath(path.join(output_dir, base + '.url'))
        with open(url_file, 'w') as f:
            f.write(url_file_content(item))
            inform(f'Wrote URL to [white on grey42]{relative(url_file)}[/]')
    else:
        file = path.realpath(path.join(os.getcwd(), item))
        # Drop the leading character of the extension (presumably the dot).
        orig_fmt = filename_extension(file)[1:]

    if not path.getsize(file) > 0:
        warn(f'File has zero length: {relative(file)}')
        return (None, None)
    if __debug__: log(f'{relative(file)} has original format {orig_fmt}')
    return (file, orig_fmt)
def _do_preflight(self):
    '''Check the option values given by the user, and do other prep.

    Validates the arguments, normalizes self.after_date and self.file_ext,
    creates the Zotero connection in self._zotero, and gathers the list of
    files to be processed into self._targets.

    Raises CannotProceed if there is no network, if an argument is invalid,
    or if no files remain to be processed after filtering.
    '''
    if not network_available():
        alert_fatal('No network connection.')
        raise CannotProceed(ExitCode.no_network)

    # Sanity-check the arguments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    hint = '(Hint: use -h for help.)'

    if not self.files:
        alert_fatal(f'Need at least one folder path or file as argument. {hint}')
        raise CannotProceed(ExitCode.bad_arg)

    # An item starting with a dash is most likely a mistyped option.
    bad = next((item for item in self.files if item.startswith('-')), None)
    if bad is not None:
        alert_fatal(f'Unrecognized option "{bad}" in arguments. {hint}')
        raise CannotProceed(ExitCode.bad_arg)

    if not self.use_keyring and not any([self.api_key, self.user_id]):
        alert_fatal(f"Need Zotero credentials if not using keyring. {hint}")
        raise CannotProceed(ExitCode.bad_arg)

    if self.after_date:
        try:
            # Convert user's input into a canonical format.
            self.after_date = parsed_datetime(self.after_date)
            self.after_date_str = self.after_date.strftime(DATE_FORMAT)
            if __debug__: log(f'parsed after_date as {self.after_date_str}')
        except KeyboardInterrupt as ex:
            if __debug__: log(f'got exception {str(ex)}')
            raise
        except Exception as ex:
            alert_fatal(f'Unable to parse after_date: "{str(ex)}". {hint}')
            raise CannotProceed(ExitCode.bad_arg)

    if self.file_ext:
        # Normalize to lower-case extensions that each begin with a dot.
        # Extensions already given with a leading dot are kept as they are,
        # and empty entries (e.g., from a trailing comma) are ignored.
        requested = (e.strip() for e in self.file_ext.lower().split(','))
        self.file_ext = [e if e.startswith('.') else '.' + e
                         for e in requested if e]

    # Set up Zotero connection and gather files for work ~~~~~~~~~~~~~~~~~~

    inform('Connecting to Zotero network servers ...')
    self._zotero = Zotero(self.api_key, self.user_id, self.use_keyring)

    if len(self.files) > 1 or path.isdir(self.files[0]):
        inform('Examining folders and looking for files ...')

    # 2 passes: traverse subdirectories recursively, then filter results.
    candidates = []
    for item in self.files:
        if path.isfile(item):
            candidates.append(item)
        elif path.isdir(item):
            if __debug__: log(f'adding files in subdir {antiformat(item)}')
            candidates += files_in_directory(item)
        else:
            warn(f'Not a file nor a folder of files: "{antiformat(item)}"')

    if __debug__: log('gathering list of files ...')
    self._targets = []
    for file in candidates:
        ext = filename_extension(file)
        if path.basename(file).startswith('.') or ext in _IGNORED_EXT:
            if __debug__: log(f'ignoring ignorable file {antiformat(file)}')
            continue
        if self.file_ext and ext not in self.file_ext:
            warn(f'Skipping file without desired extension: {antiformat(file)}')
            continue
        if file_is_alias(file):
            if __debug__: log(f'ignoring macOS alias {antiformat(file)}')
            continue
        self._targets.append(file)
    if __debug__: log(f'gathered {pluralized("file", self._targets, True)}')

    if self.after_date:
        if __debug__: log(f'filtering files by date {self.after_date_str}')
        kept = []
        tzinfo = self.after_date.tzinfo
        # Filter the gathered targets, not the raw arguments: the latter may
        # include directories and files already excluded above.
        for file in self._targets:
            # NOTE(review): the modification time is read as a naive local
            # time and then labeled with after_date's time zone -- confirm
            # that this matches how after_date is parsed.
            mtime = datetime.fromtimestamp(Path(file).stat().st_mtime)
            if mtime.replace(tzinfo=tzinfo) >= self.after_date:
                if __debug__: log(f'keeping {file}')
                kept.append(file)
        self._targets = kept

    if not self._targets:
        alert_fatal('No files to process; quitting.')
        raise CannotProceed(ExitCode.bad_arg)
def _save_article_pmc(self, dest_dir, article, xml, zip_articles):
    '''Download the files for "article" into "dest_dir" and optionally zip them.

    Downloads the article's PDF and JATS XML files, validates the JATS file
    if self.do_validate is set, and, if the article has an image, downloads
    it and saves an uncompressed RGB TIFF version (the originally downloaded
    image file is then deleted).  Failures are reported with a warning and
    recorded in article.status as a value beginning with "failed".

    If "zip_articles" is true and no failure was recorded, the files are put
    into a ZIP archive, the archive is verified, and the individual files
    are deleted.  The "xml" parameter is not used by this method.
    '''
    inform('Writing ' + article.doi)
    to_archive = []

    pdf_file = pmc_pdf_filename(article, dest_dir)
    if __debug__: log(f'downloading PDF to {pdf_file}')
    if not download_file(article.pdf, pdf_file):
        warn(f'Could not download PDF file for {article.doi}')
        article.status = 'failed-pdf-download'
    to_archive.append(pdf_file)

    jats_file = jats_filename(article, dest_dir)
    if __debug__: log(f'downloading JATS XML to {jats_file}')
    if not download_file(article.jats, jats_file):
        # Validation is pointless without a downloaded file, and skipping it
        # preserves the more specific failure status set here.
        warn(f'Could not download JATS file for {article.doi}')
        article.status = 'failed-jats-download'
    elif self.do_validate:
        if not valid_xml(jats_file, self._dtd):
            warn(f'Failed to validate JATS for article {article.doi}')
            article.status = 'failed-jats-validation'
    else:
        if __debug__: log(f'skipping DTD validation of {jats_file}')
    to_archive.append(jats_file)

    if article.image:
        # We need to store the image with the name that appears in the
        # JATS file.  That requires a little extra work to extract.
        # (The names are derived only when there is an image URL to derive
        # them from.)
        image_extension = filename_extension(article.image)
        image_file = image_filename(article, dest_dir, ext=image_extension)
        if __debug__: log(f'downloading image file to {image_file}')
        if download_file(article.image, image_file):
            with Image.open(image_file) as img:
                converted_img = image_without_alpha(img)
                converted_img = converted_img.convert('RGB')
                if __debug__: log('converting image to TIFF format')
                tiff_file = filename_basename(image_file) + '.tif'
                # Using save() means that only the 1st frame of a
                # multiframe image will be saved.
                converted_img.save(tiff_file, dpi=_TIFF_DPI, compression=None,
                                   description=tiff_comments(article))
            to_archive.append(tiff_file)
            # We keep only the uncompressed TIFF version.
            if __debug__: log(f'deleting original image file {image_file}')
            delete_existing(image_file)
        else:
            warn(f'Failed to download image for {article.doi}')
            article.status = 'failed-image-download'
    else:
        if __debug__: log(f'skipping empty image file URL for {article.doi}')

    # Finally, put the files into their own zip archive.
    if zip_articles:
        if not article.status.startswith('failed'):
            zip_file = pmc_zip_filename(article, dest_dir)
            inform(f'Creating ZIP archive file "{zip_file}"')
            archive_files(zip_file, to_archive)
            if __debug__: log(f'verifying ZIP file {zip_file}')
            verify_archive(zip_file, 'zip')
            for file in to_archive:
                if __debug__: log(f'deleting file {file}')
                delete_existing(file)
        else:
            warn(f'ZIP archive for {article.doi} not created due to errors')
def _save_article_portico(self, dest_dir, article, xmldict):
    '''Write the files for "article" into its own subdirectory of "dest_dir".

    Creates the directory "<dest_dir>/<article.basename>" (plus a "jats"
    subdirectory if the journal uses JATS), writes "xmldict" there as an XML
    file, and downloads the article's PDF.  If the journal uses JATS, also
    downloads the JATS XML into the "jats" subdirectory, validates it if
    self.do_validate is set, and, if the article has an image, downloads it
    and saves an uncompressed RGB TIFF version (the originally downloaded
    image file is then deleted).  Failures are reported with a warning and
    recorded in article.status as a value beginning with "failed".
    '''
    article_dir = path.join(dest_dir, article.basename)
    jats_dir = path.join(article_dir, 'jats')
    # Create each directory independently, so that an already-existing
    # article directory does not prevent the creation of its jats subdir.
    os.makedirs(article_dir, exist_ok=True)
    if self.journal.uses_jats:
        os.makedirs(jats_dir, exist_ok=True)

    inform('Writing ' + article.doi)
    xml_file = xml_filename(article, article_dir)
    with open(xml_file, 'w', encoding='utf8') as f:
        if __debug__: log(f'writing XML to {xml_file}')
        f.write(xmltodict.unparse(xmldict, pretty=True))

    pdf_file = pdf_filename(article, article_dir)
    if __debug__: log(f'downloading PDF to {pdf_file}')
    if not download_file(article.pdf, pdf_file):
        warn(f'Could not download PDF file for {article.doi}')
        article.status = 'failed-pdf-download'

    if not self.journal.uses_jats:
        # Nothing more to do.
        return

    jats_file = jats_filename(article, jats_dir)
    if __debug__: log(f'downloading JATS XML to {jats_file}')
    if not download_file(article.jats, jats_file):
        # Validation is pointless without a downloaded file, and skipping it
        # preserves the more specific failure status set here.
        warn(f'Could not download JATS file for {article.doi}')
        article.status = 'failed-jats-download'
    elif self.do_validate:
        if not valid_xml(jats_file, self._dtd):
            warn(f'Failed to validate JATS for article {article.doi}')
            article.status = 'failed-jats-validation'
    else:
        if __debug__: log(f'skipping DTD validation of {jats_file}')

    if article.image:
        # We need to store the image with the name that appears in the
        # JATS file.  That requires a little extra work to extract.
        # (The names are derived only when there is an image URL to derive
        # them from.)
        image_extension = filename_extension(article.image)
        image_file = image_filename(article, jats_dir, ext=image_extension)
        if __debug__: log(f'downloading image file to {image_file}')
        if download_file(article.image, image_file):
            with Image.open(image_file) as img:
                converted = image_without_alpha(img)
                converted = converted.convert('RGB')
                if __debug__: log('converting image to TIFF format')
                tiff_name = filename_basename(image_file) + '.tif'
                comments = tiff_comments(article, self.journal.name)
                # Using save() means only the 1st frame of a multiframe
                # image will be saved.
                converted.save(tiff_name, compression=None, dpi=_TIFF_DPI,
                               description=comments)
            # We keep only the uncompressed TIFF version.
            if __debug__: log(f'deleting original image file {image_file}')
            delete_existing(image_file)
        else:
            warn(f'Failed to download image for {article.doi}')
            article.status = 'failed-image-download'
    else:
        if __debug__: log(f'skipping empty image URL for {article.doi}')