def targets_from_arguments(images, from_file, given_urls, say):
    '''Return the list of files or URLs to process.

    If 'from_file' is given, entries come from that file (one per line);
    otherwise 'images' is taken as-is when 'given_urls' is true, or is
    treated as a mix of files and directories to search for image files.
    '''
    targets = []
    if from_file:
        if __debug__: log('Opening {}', from_file)
        with open(from_file) as input_file:
            targets = [entry.rstrip('\n') for entry in input_file.readlines()]
        if __debug__: log('Read {} lines from {}.', len(targets), from_file)
        if not given_urls:
            targets = filter_urls(targets, say)
    elif given_urls:
        # We assume that the arguments are URLs and take them as-is.
        targets = images
    else:
        # We were given files and/or directories.  Look for image files,
        # ignoring anything that appears to be previous Handprint output
        # (files that end in, e.g., ".google.jpg").
        endings = ['.' + name + '.jpg' for name in KNOWN_METHODS]
        candidates = filter_endings(filter_urls(images, say), endings)
        for item in candidates:
            if path.isdir(item):
                found = files_in_directory(item, extensions=ACCEPTED_FORMATS)
                targets += filter_endings(found, endings)
            elif path.isfile(item) and filename_extension(item) in ACCEPTED_FORMATS:
                targets.append(item)
            else:
                say.warn('"{}" not a file or directory'.format(item))
    return targets
def _resized_image(self, file):
    '''Return a copy of 'file' whose dimensions fit within the maximum
    width and height in self._max_dimensions, reusing a previously
    produced "-reduced" copy when a suitable one exists.  Returns None
    if a reduction was attempted but failed.
    '''
    (max_width, max_height) = self._max_dimensions
    say = self._say
    if file.find('-reduced') > 0:
        new_file = file
    else:
        new_file = filename_basename(file) + '-reduced' + filename_extension(file)
    if path.exists(new_file) and readable(new_file):
        (width, height) = image_dimensions(new_file)
        if width < max_width and height < max_height:
            say.info('Using reduced image found in {}'.format(relative(new_file)))
            return new_file
        # A "-reduced" file exists, perhaps from a previous run, but its
        # dimensions are too large for the current set of services.
        if __debug__: log('existing resized file larger than {}x{}: {}',
                          max_width, max_height, new_file)
    say.info('Dimensions too large; reducing dimensions: {}'.format(relative(file)))
    (resized, error) = reduced_image_dimensions(file, new_file, max_width, max_height)
    if error:
        say.error('Failed to re-dimension {}: {}'.format(relative(file), error))
        return None
    return resized
def _smaller_file(self, file):
    '''Return a copy of 'file' whose size in bytes is below
    self._max_size, reusing a previously produced "-reduced" copy when a
    suitable one exists.  Returns None if 'file' is empty/None or if
    size reduction failed.
    '''
    if not file:
        return None
    say = self._say
    if file.find('-reduced') > 0:
        new_file = file
    else:
        new_file = filename_basename(file) + '-reduced' + filename_extension(file)
    if path.exists(new_file):
        if image_size(new_file) < self._max_size:
            say.info('Reusing resized image found in {}'.format(relative(new_file)))
            return new_file
        # A "-reduced" file exists, perhaps from a previous run, but for
        # the current set of services, it's larger than allowed.
        if __debug__: log('existing resized file larger than {}b: {}',
                          humanize.intcomma(self._max_size), new_file)
    say.info('Size too large; reducing size: {}'.format(relative(file)))
    (resized, error) = reduced_image_size(file, new_file, self._max_size)
    if error:
        say.error('Failed to resize {}: {}'.format(relative(file), error))
        return None
    return resized
def targets_from_arguments(self, files, from_file):
    '''Return the list of image files to process.

    If 'from_file' is given, entries are read from it (one per line);
    otherwise 'files' may contain URLs, image files, and directories to
    search.  The result excludes files produced by previous Handprint
    runs and, when both our output format and another format of the same
    image are present, the other formats.
    '''
    targets = []
    if from_file:
        if __debug__: log('Opening {}', from_file)
        with open(from_file) as f:
            targets = f.readlines()
        targets = [line.rstrip('\n') for line in targets]
        if __debug__: log('Read {} lines from {}.', len(targets), from_file)
    else:
        # Files that end in, e.g., ".google.png" are previous Handprint
        # output.  Compute the endings once here: the list is loop-invariant
        # (the original rebuilt it for every directory encountered).
        handprint_endings = ['.' + x + _OUTPUT_EXT for x in services_list()]
        for item in files:
            if is_url(item):
                targets.append(item)
            elif path.isfile(item) and filename_extension(item) in ACCEPTED_FORMATS:
                targets.append(item)
            elif path.isdir(item):
                # It's a directory, so look for image files within.  Use a
                # fresh name here; the original rebound the parameter
                # "files" -- the very sequence being iterated.
                found = files_in_directory(item, extensions=ACCEPTED_FORMATS)
                found = filter_by_extensions(found, handprint_endings)
                targets += found
            else:
                self._say.warn('"{}" not a file or directory'.format(item))
    # Filter files we created in past runs.
    targets = [x for x in targets if x.find('-reduced') < 0]
    targets = [x for x in targets if x.find('all-results') < 0]
    # If there is both a file in the format we generate and another
    # format of that file, ignore the other formats and just use ours.
    keep = []
    for item in targets:
        ext = filename_extension(item)
        base = filename_basename(item)
        if ext != _OUTPUT_EXT and (base + _OUTPUT_EXT in targets):
            # Our output-format version of this file is also present =>
            # skip this other version.
            continue
        keep.append(item)
    return keep
def file_after_resizing(file, tool, spinner):
    '''Return a version of 'file' resized to fit the tool's maximum
    dimensions, reusing a previously produced "-reduced" copy when one
    exists.  Returns None if resizing failed.
    '''
    file_ext = filename_extension(file)
    new_file = filename_basename(file) + '-reduced.' + file_ext
    if path.exists(new_file):
        spinner.update('Using reduced image found in {}'.format(
            relative(new_file)))
        return new_file
    else:
        spinner.update('Original image too large; reducing size')
        (resized, error) = reduced_image(file, tool.max_dimensions())
        if not resized:
            # Fix: the original called relative(file, error), which passed
            # 'error' into relative() and left format() one argument short,
            # raising IndexError instead of reporting the failure.
            spinner.fail('Failed to resize {}: {}'.format(relative(file), error))
            return None
        return resized
def targets_from_arguments(self, files, from_file):
    '''Return the list of images to process, from 'from_file' (one entry
    per line) or from the URLs, files, and directories in 'files'.

    Files created by past Handprint runs (names containing ".handprint")
    are excluded, and when both our output format and another format of
    the same image are present, only ours is kept.
    '''
    targets = []
    if from_file:
        if __debug__: log('reading {}', from_file)
        # Fix: the original used open() without ever closing the file.
        with open(from_file) as f:
            # Drop empty lines, as filter(None, ...) did in the original.
            targets = [line for line in f.read().splitlines() if line]
    else:
        for item in files:
            if is_url(item):
                targets.append(item)
            elif path.isfile(item) and filename_extension(item) in ACCEPTED_FORMATS:
                targets.append(item)
            elif path.isdir(item):
                # It's a directory, so look for files within.
                targets += files_in_directory(item, extensions=ACCEPTED_FORMATS)
            else:
                warn('"{}" not a file or directory', item)
    # Filter files created in past runs.  Using list comprehensions keeps
    # 'targets' a concrete list, which the membership test below requires.
    targets = [name for name in targets if '.handprint' not in name]
    # If there is both a file in the format we generate and another
    # format of that file, ignore the other formats and just use ours.
    keep = []
    for item in targets:
        ext = filename_extension(item)
        base = filename_basename(item)
        if ext != _OUTPUT_EXT and (base + _OUTPUT_EXT in targets):
            # Our output-format version of file is also present => skip
            # this other version.
            continue
        keep.append(item)
    return keep
def converted_image(orig_file, to_format, dest_file=None):
    '''Convert image 'orig_file' to the format 'to_format'.

    Returns a tuple of (new_file, error).  The value of 'error' will be None
    if no error occurred; otherwise, the value will be a string summarizing
    the error that occurred and 'new_file' will be set to None.
    '''
    dest_format = canonical_format_name(to_format)
    if dest_file is None:
        # Fix: the original referenced an undefined name "file" here,
        # which raised NameError whenever dest_file was not supplied.
        dest_file = filename_basename(orig_file) + '.' + dest_format
    # PIL is unable to read PDF files, so in that particular case, we have to
    # convert it using another tool.
    if filename_extension(orig_file) == '.pdf':
        import fitz
        doc = fitz.open(orig_file)
        if len(doc) >= 1:
            if len(doc) >= 2:
                if __debug__: log('{} has > 1 images; using only 1st', orig_file)
                # FIXME: if there's more than 1 image, we could extract the
                # rest.  Doing so will require some architectural changes.
            # Fix: log the source being read, not the destination.
            if __debug__: log('extracting 1st image from {}', orig_file)
            page = doc[0]
            pix = page.getPixmap(alpha=False)
            if __debug__: log('writing {}', dest_file)
            pix.writeImage(dest_file, dest_format)
            return (dest_file, None)
        else:
            if __debug__: log('fitz says there is no image in {}', orig_file)
            return (None, 'Cannot find an image inside {}'.format(orig_file))
    else:
        # When converting images, PIL may issue a DecompressionBombWarning but
        # it's not a concern in our application.  Ignore it.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                im = Image.open(orig_file)
                if __debug__: log('converting {} to RGB', orig_file)
                # Fix: Image.convert() returns a *new* image; the original
                # discarded the result, so no conversion actually happened.
                im = im.convert('RGB')
                if __debug__: log('saving converted image to {}', dest_file)
                if orig_file == dest_file:
                    im.seek(0)
                im.save(dest_file, dest_format)
                return (dest_file, None)
            except Exception as ex:
                return (None, str(ex))
def _get(self, item, base_name, index):
    '''Obtain the image for 'item' (a URL or a file path).

    Returns a tuple (file, orig_fmt), where 'file' is the local path to
    the image and 'orig_fmt' is its format name without a leading dot.
    Returns (None, None) if a URL could not be fetched or did not point
    to an image.
    '''
    # Shortcuts to make the code more readable.
    output_dir = self._output_dir
    say = self._say
    # For URLs, we download the corresponding files and name them with
    # the base_name.
    if is_url(item):
        # First make sure the URL actually points to an image.
        if __debug__: log('testing if URL contains an image: {}', item)
        try:
            # Fix: close the network response when done -- the original
            # leaked the connection after reading only the headers.
            with urllib.request.urlopen(item) as response:
                maintype = response.headers.get_content_maintype()
                orig_fmt = response.headers.get_content_subtype()
        except Exception as ex:
            say.warn('Skipping URL due to error: {}'.format(ex))
            return (None, None)
        if maintype != 'image':
            say.warn('Did not find an image at {}'.format(item))
            return (None, None)
        base = '{}-{}'.format(base_name, index)
        # If we weren't given an output dir, then for URLs, we have no
        # choice but to use the current dir to download the file.
        # Important: don't change self._output_dir because if other
        # inputs *are* files, then those files will need other output dirs.
        if not output_dir:
            output_dir = os.getcwd()
        file = path.realpath(path.join(output_dir, base + '.' + orig_fmt))
        if not download_file(item, file, say):
            say.warn('Unable to download {}'.format(item))
            return (None, None)
        url_file = path.realpath(path.join(output_dir, base + '.url'))
        with open(url_file, 'w') as f:
            f.write(url_file_content(item))
        say.info('Wrote URL to {}'.format(relative(url_file)))
    else:
        file = path.realpath(path.join(os.getcwd(), item))
        # Strip the leading dot from the extension to get the bare format.
        orig_fmt = filename_extension(file)[1:]
    if __debug__: log('{} has original format {}', relative(file), orig_fmt)
    return (file, orig_fmt)
def targets_from_arguments(images, from_file, given_urls, say):
    '''Return the list of files or URLs to process, determined from the
    given command-line arguments.'''
    if from_file:
        with open(from_file) as source:
            entries = [line.rstrip('\n') for line in source.readlines()]
        if __debug__: log('Read {} lines from "{}".', len(entries), from_file)
        return entries if given_urls else filter_urls(entries, say)
    if given_urls:
        # We assume that the arguments are URLs and take them as-is.
        return images
    # We were given files and/or directories.  Look for image files.
    found = []
    for item in filter_urls(images, say):
        if path.isfile(item) and filename_extension(item) in ACCEPTED_FORMATS:
            found.append(item)
        elif path.isdir(item):
            found += files_in_directory(item, extensions=ACCEPTED_FORMATS)
        else:
            say.warn('"{}" not a file or directory'.format(item))
    return found
def run(classes, item, index, base_name, output_dir, creds_dir, annotate, say):
    '''Run every method in 'classes' over the image named by 'item' (a URL
    or a file path), saving text, JSON, and (optionally) annotated-image
    outputs, and reporting progress via a spinner.
    '''
    spinner = ProgressIndicator(say.use_color(), say.be_quiet())
    try:
        spinner.start('Starting on {}'.format(relative(item)))
        if is_url(item):
            # Make sure the URLs point to images.
            if __debug__: log('Testing if URL contains an image: {}', item)
            try:
                response = request.urlopen(item)
            except Exception as err:
                if __debug__: log('Network access resulted in error: {}', str(err))
                spinner.fail('Skipping URL due to error: {}'.format(err))
                return
            if response.headers.get_content_maintype() != 'image':
                spinner.fail('Did not find an image at {}'.format(item))
                return
            fmt = response.headers.get_content_subtype()
            base = '{}-{}'.format(base_name, index)
            file = path.realpath(path.join(output_dir, base + '.' + fmt))
            error = download(item, file)
            if not error:
                spinner.update('Wrote contents to {}'.format(relative(file)))
            else:
                spinner.fail('Failed to download {}: {}'.format(item, error))
                return
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
            spinner.update('Wrote URL to {}'.format(relative(url_file)))
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            # NOTE(review): elsewhere in this file filename_extension()
            # is sliced with [1:] to drop a leading dot, while the URL
            # branch above yields a bare subtype like "jpeg" -- confirm
            # that accepted_formats() below handles both spellings.
            fmt = filename_extension(file)
        dest_dir = output_dir if output_dir else path.dirname(file)
        if not writable(dest_dir):
            say.fatal('Cannot write output in {}.'.format(dest_dir))
            return
        # Iterate over the methods.
        for method_class in classes:
            method = method_class()
            method.init_credentials(creds_dir)
            last_time = timer()
            # If we need to convert the format, best to do it after resizing
            # the original format.
            need_convert = fmt not in method.accepted_formats()
            # Test the dimensions, not bytes, because of compression.
            if image_dimensions(file) > method.max_dimensions():
                file = file_after_resizing(file, method, spinner)
            if file and need_convert:
                file = file_after_converting(file, 'jpg', method, spinner)
            if not file:
                return
            spinner.update('Sending to {} {}'.format(
                color(method, 'white', say.use_color()),
                # Explicitly colorize the 2nd part too, or the spinner's
                # colorization goes wrong.
                color('and waiting for response', 'info', say.use_color())))
            try:
                result = method.result(file)
            except RateLimitExceeded as err:
                # Back off until the method's rate limit allows another
                # attempt, then retry once.  (Fix: the original never
                # assigned 'result' on this path, so the 'result.error'
                # test below raised NameError.)
                time_passed = timer() - last_time
                if time_passed < 1 / method.max_rate():
                    spinner.warn('Pausing due to rate limits')
                    time.sleep(1 / method.max_rate() - time_passed)
                result = method.result(file)
            if result.error:
                spinner.fail(result.error)
                return
            file_name = path.basename(file)
            base_path = path.join(dest_dir, file_name)
            txt_file = alt_extension(base_path, str(method) + '.txt')
            json_file = alt_extension(base_path, str(method) + '.json')
            annot_file = alt_extension(base_path, str(method) + '.jpg')
            spinner.update('Text -> {}'.format(relative(txt_file)))
            save_output(result.text, txt_file)
            spinner.update('All data -> {}'.format(relative(json_file)))
            save_output(json.dumps(result.data), json_file)
            if annotate:
                spinner.update('Annotated image -> {}'.format(
                    relative(annot_file)))
                save_output(annotated_image(file, result.boxes), annot_file)
        spinner.stop('Done with {}'.format(relative(item)))
    except (KeyboardInterrupt, UserCancelled) as err:
        spinner.warn('Interrupted')
        raise
    except AuthenticationFailure as err:
        spinner.fail('Unable to continue using {}: {}'.format(method, err))
        return
    except Exception as err:
        spinner.fail(say.error_text('Stopping due to a problem'))
        raise
def run(method_class, targets, given_urls, output_dir, root_name, creds_dir, say):
    '''Run the OCR method 'method_class' over every entry in 'targets',
    saving per-item text and JSON results.  When 'given_urls' is true,
    targets are URLs to be downloaded first; otherwise they are files.
    '''
    spinner = ProgressIndicator(say.use_color(), say.be_quiet())
    try:
        tool = method_class()
        tool_name = tool.name()
        say.info('Using method "{}".'.format(tool_name))
        tool.init_credentials(creds_dir)
        for index, item in enumerate(targets, 1):
            if not given_urls and (item.startswith('http') or item.startswith('ftp')):
                say.warn('Skipping URL "{}"'.format(item))
                continue
            if say.use_color() and not say.be_quiet():
                action = 'Downloading' if given_urls else 'Reading'
                spinner.start('{} {}'.format(action, item))
            fmt = None
            if given_urls:
                # Make sure the URLs point to images.  (Fix: close the
                # response instead of leaking the connection.)
                with request.urlopen(item) as response:
                    maintype = response.headers.get_content_maintype()
                    fmt = response.headers.get_content_subtype()
                if maintype != 'image':
                    spinner.fail('Did not find an image at "{}"'.format(item))
                    continue
                if fmt not in ACCEPTED_FORMATS:
                    spinner.fail('Cannot use image format {} in "{}"'.format(fmt, item))
                    continue
                # If we're given URLs, we have to invent file names to store
                # the images and the OCR results.
                base = '{}-{}'.format(root_name, index)
                url_file = path.realpath(path.join(output_dir, base + '.url'))
                if __debug__: log('Writing URL to {}', url_file)
                with open(url_file, 'w') as f:
                    f.write(url_file_content(item))
                file = path.realpath(path.join(output_dir, base + '.' + fmt))
                if __debug__: log('Starting wget on {}', item)
                (success, error) = download_url(item, file)
                if not success:
                    spinner.fail('Failed to download {}: {}'.format(item, error))
                    continue
            else:
                file = path.realpath(path.join(os.getcwd(), item))
                fmt = filename_extension(file)
            dest_dir = output_dir if output_dir else path.dirname(file)
            if not writable(dest_dir):
                say.fatal('Cannot write output in "{}".'.format(dest_dir))
                return
            if fmt in FORMATS_MUST_CONVERT:
                spinner.update('Converting file format to JPEG: "{}"'.format(file))
                (success, converted_file, msg) = convert_image(file, fmt, 'jpeg')
                if not success:
                    spinner.fail('Failed to convert "{}": {}'.format(file, msg))
                    # Fix: the original fell through and assigned the failed
                    # conversion result to 'file', then kept processing it.
                    continue
                # Note: 'file' now points to the converted file, not the original
                file = converted_file
            file_name = path.basename(file)
            base_path = path.join(dest_dir, file_name)
            txt_file = replace_extension(base_path, '.' + tool_name + '.txt')
            json_file = replace_extension(base_path, '.' + tool_name + '.json')
            spinner.update('Sending to {} for text extraction'.format(tool_name))
            save_output(tool.document_text(file), txt_file)
            spinner.update('Text from {} saved in {}'.format(tool_name, txt_file))
            spinner.update('All data from {} saved in {}'.format(tool_name, json_file))
            save_output(json.dumps(tool.all_results(file)), json_file)
            if say.use_color() and not say.be_quiet():
                short_path = path.relpath(txt_file, os.getcwd())
                spinner.stop('{} -> {}'.format(item, short_path))
    except (KeyboardInterrupt, UserCancelled) as err:
        if spinner:
            spinner.stop()
        raise
    except Exception as err:
        if spinner:
            spinner.fail(say.error_text('Stopping due to a problem'))
        raise