Example #1
0
def targets_from_arguments(images, from_file, given_urls, say):
    '''Return the list of targets (URLs or image file paths) to process.

    Three cases are handled, in this order:
      1. 'from_file' is given: each line of that file (minus its trailing
         newline) is a target.  Unless 'given_urls' is true, the lines are
         passed through filter_urls() first.
      2. 'given_urls' is true: 'images' is returned unchanged.
      3. Otherwise: 'images' is treated as files and/or directories, and
         image files with an extension in ACCEPTED_FORMATS are collected.

    Problems with individual items are reported through 'say'.
    '''
    if from_file:
        if __debug__: log('Opening {}', from_file)
        with open(from_file) as f:
            lines = f.readlines()
        lines = [text.rstrip('\n') for text in lines]
        if __debug__: log('Read {} lines from {}.', len(lines), from_file)
        return lines if given_urls else filter_urls(lines, say)

    if given_urls:
        # We assume that the arguments are URLs and take them as-is.
        return images

    # We were given files and/or directories.  Look for image files, but
    # skip anything that looks like the output of a previous Handprint run
    # (i.e., names ending in something like ".google.jpg").
    found = []
    old_output_endings = ['.' + name + '.jpg' for name in KNOWN_METHODS.keys()]
    candidates = filter_urls(images, say)
    candidates = filter_endings(candidates, old_output_endings)
    for entry in candidates:
        if path.isfile(entry) and filename_extension(entry) in ACCEPTED_FORMATS:
            found.append(entry)
            continue
        if not path.isdir(entry):
            say.warn('"{}" not a file or directory'.format(entry))
            continue
        contents = files_in_directory(entry, extensions=ACCEPTED_FORMATS)
        found.extend(filter_endings(contents, old_output_endings))
    return found
Example #2
0
 def _resized_image(self, file):
     '''Return the path of a version of 'file' that fits the size limits.

     The limits are the (width, height) pair in self._max_dimensions.  The
     reduced image is stored next to the original under the same name with
     "-reduced" inserted before the extension.  If such a file already
     exists, is readable, and is strictly smaller than the limits in both
     dimensions, it is reused; otherwise the image is re-dimensioned.
     Returns None (after reporting the error via self._say) on failure.
     '''
     limit_width, limit_height = self._max_dimensions
     extension = filename_extension(file)
     say = self._say
     # A name that already contains "-reduced" is used as its own output.
     reduced = file if file.find('-reduced') > 0 else (
         filename_basename(file) + '-reduced' + extension)

     if path.exists(reduced) and readable(reduced):
         width, height = image_dimensions(reduced)
         if width >= limit_width or height >= limit_height:
             # We found a "-reduced" file, perhaps from a previous run, but
             # for the current set of services, dimension are too large.
             if __debug__:
                 log('existing resized file larger than {}x{}: {}',
                     limit_width, limit_height, reduced)
         else:
             say.info('Using reduced image found in {}'.format(
                 relative(reduced)))
             return reduced

     say.info('Dimensions too large; reducing dimensions: {}'.format(
         relative(file)))
     outcome = reduced_image_dimensions(file, reduced, limit_width,
                                        limit_height)
     (resized, error) = outcome
     if not error:
         return resized
     say.error('Failed to re-dimension {}: {}'.format(
         relative(file), error))
     return None
Example #3
0
 def _smaller_file(self, file):
     if not file:
         return None
     say = self._say
     file_ext = filename_extension(file)
     if file.find('-reduced') > 0:
         new_file = file
     else:
         new_file = filename_basename(file) + '-reduced' + file_ext
     if path.exists(new_file):
         if image_size(new_file) < self._max_size:
             say.info('Reusing resized image found in {}'.format(
                 relative(new_file)))
             return new_file
         else:
             # We found a "-reduced" file, perhaps from a previous run, but
             # for the current set of services, it's larger than allowed.
             if __debug__:
                 log('existing resized file larger than {}b: {}',
                     humanize.intcomma(self._max_size), new_file)
     say.info('Size too large; reducing size: {}'.format(relative(file)))
     (resized, error) = reduced_image_size(file, new_file, self._max_size)
     if error:
         say.error('Failed to resize {}: {}'.format(relative(file), error))
         return None
     return resized
Example #4
0
    def targets_from_arguments(self, files, from_file):
        '''Return the list of targets (URLs or image file paths) to process.

        If 'from_file' is given, the targets are the non-empty lines of that
        file and 'files' is ignored.  Otherwise each item in 'files' may be
        a URL, an image file whose extension is in ACCEPTED_FORMATS, or a
        directory to be searched for such image files; anything else
        produces a warning via self._say.

        The result excludes files that this program generated in past runs,
        and, when a file exists both in our output format (_OUTPUT_EXT) and
        in another format, only the version in our format is kept.
        '''
        targets = []
        if from_file:
            if __debug__: log('Opening {}', from_file)
            with open(from_file) as f:
                lines = [line.rstrip('\n') for line in f]
            if __debug__:
                log('Read {} lines from {}.', len(lines), from_file)
            # Blank lines cannot name a target; drop them here rather than
            # letting an empty string flow into the rest of the pipeline.
            targets = [line for line in lines if line]
        else:
            for item in files:
                if is_url(item):
                    targets.append(item)
                elif path.isfile(item) and filename_extension(
                        item) in ACCEPTED_FORMATS:
                    targets.append(item)
                elif path.isdir(item):
                    # It's a directory, so look for files within.
                    # Ignore files that appear to be the previous output of Handprint.
                    # (These are files that end in, e.g., ".google.png")
                    handprint_endings = [
                        '.' + x + _OUTPUT_EXT for x in services_list()
                    ]
                    # Use a separate name: rebinding the 'files' parameter
                    # while looping over it is needlessly confusing.
                    found = files_in_directory(item,
                                               extensions=ACCEPTED_FORMATS)
                    found = filter_by_extensions(found, handprint_endings)
                    targets += found
                else:
                    self._say.warn('"{}" not a file or directory'.format(item))

        # Filter files we created in past runs.
        targets = [x for x in targets
                   if '-reduced' not in x and 'all-results' not in x]

        # If there is both a file in the format we generate and another
        # format of that file, ignore the other formats and just use ours.
        # A set makes each membership test O(1) instead of scanning the list.
        present = set(targets)
        keep = []
        for item in targets:
            ext = filename_extension(item)
            base = filename_basename(item)
            if ext != _OUTPUT_EXT and (base + _OUTPUT_EXT in present):
                # png version of file is also present => skip this other version
                continue
            keep.append(item)
        return keep
Example #5
0
def file_after_resizing(file, tool, spinner):
    '''Return the path of a reduced-size version of 'file', or None.

    If a file named like 'file' but with "-reduced" inserted before the
    extension already exists, it is reused.  Otherwise the image is reduced
    to the dimensions given by tool.max_dimensions().  Progress and failure
    are reported through 'spinner'.  Returns None if the reduction fails.
    '''
    file_ext = filename_extension(file)
    # NOTE(review): if filename_extension() returns the extension with its
    # leading period, this produces a doubled period -- confirm.
    new_file = filename_basename(file) + '-reduced.' + file_ext
    if path.exists(new_file):
        spinner.update('Using reduced image found in {}'.format(
            relative(new_file)))
        return new_file

    spinner.update('Original image too large; reducing size')
    (resized, error) = reduced_image(file, tool.max_dimensions())
    if not resized:
        # Both values must be arguments to format(), not to relative();
        # the template has two placeholders.
        spinner.fail('Failed to resize {}: {}'.format(
            relative(file), error))
        return None
    return resized
Example #6
0
    def targets_from_arguments(self, files, from_file):
        '''Return the list of targets (URLs or image file paths) to process.

        If 'from_file' is given, the targets are the non-empty lines of that
        file and 'files' is ignored.  Otherwise each item in 'files' may be
        a URL, an image file whose extension is in ACCEPTED_FORMATS, or a
        directory to be searched for such image files; anything else
        produces a warning.

        Names containing ".handprint" (files created in past runs) are
        dropped, and, when a file exists both in our output format
        (_OUTPUT_EXT) and in another format, only our format is kept.
        '''
        targets = []
        if from_file:
            if __debug__: log('reading {}', from_file)
            # Use a context manager so the file handle is closed promptly
            # instead of being left for the garbage collector.
            with open(from_file) as f:
                targets = [line for line in f.read().splitlines() if line]
        else:
            for item in files:
                if is_url(item):
                    targets.append(item)
                elif path.isfile(item) and filename_extension(
                        item) in ACCEPTED_FORMATS:
                    targets.append(item)
                elif path.isdir(item):
                    # It's a directory, so look for files within.
                    targets += files_in_directory(item,
                                                  extensions=ACCEPTED_FORMATS)
                else:
                    warn('"{}" not a file or directory', item)

        # Filter files created in past runs.
        targets = [name for name in targets if '.handprint' not in name]

        # If there is both a file in the format we generate and another
        # format of that file, ignore the other formats and just use ours.
        # A set makes each membership test O(1) instead of scanning the list.
        present = set(targets)
        keep = []
        for item in targets:
            ext = filename_extension(item)
            base = filename_basename(item)
            if ext != _OUTPUT_EXT and (base + _OUTPUT_EXT in present):
                # png version of file is also present => skip this other version
                continue
            keep.append(item)
        return keep
Example #7
0
def converted_image(orig_file, to_format, dest_file=None):
    '''Convert 'orig_file' to image format 'to_format'.

    If 'dest_file' is None, the output is written to a file named after
    'orig_file' with its extension replaced by the canonical name of the
    destination format.

    Returns a tuple of (new_file, error).  The value of 'error' will be None
    if no error occurred; otherwise, the value will be a string summarizing the
    error that occurred and 'new_file' will be set to None.
    '''
    dest_format = canonical_format_name(to_format)
    if dest_file is None:
        # The base name must come from the input file ('file' is not a
        # defined name in this function).
        dest_file = filename_basename(orig_file) + '.' + dest_format
    # PIL is unable to read PDF files, so in that particular case, we have to
    # convert it using another tool.
    if filename_extension(orig_file) == '.pdf':
        import fitz
        doc = fitz.open(orig_file)
        try:
            if len(doc) >= 1:
                if len(doc) >= 2:
                    if __debug__:
                        log('{} has > 1 images; using only 1st', orig_file)
                # FIXME: if there's more than 1 image, we could extract the
                # rest.  Doing so will require some architectural changes.
                if __debug__: log('extracting 1st image from {}', orig_file)
                page = doc[0]
                pix = page.getPixmap(alpha=False)
                if __debug__: log('writing {}', dest_file)
                pix.writeImage(dest_file, dest_format)
                return (dest_file, None)
            else:
                if __debug__:
                    log('fitz says there is no image image in {}', orig_file)
                return (None, 'Cannot find an image inside {}'.format(orig_file))
        finally:
            # Release the document whether or not extraction succeeded.
            doc.close()
    else:
        # When converting images, PIL may issue a DecompressionBombWarning but
        # it's not a concern in our application.  Ignore it.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                im = Image.open(orig_file)
                if __debug__: log('converting {} to RGB', orig_file)
                # convert() returns a new image rather than modifying 'im'
                # in place, so the result has to be kept.
                im = im.convert('RGB')
                if __debug__: log('saving converted image to {}', dest_file)
                if orig_file == dest_file:
                    im.seek(0)
                im.save(dest_file, dest_format)
                return (dest_file, None)
            except Exception as ex:
                return (None, str(ex))
Example #8
0
    def _get(self, item, base_name, index):
        '''Return a tuple (file, format) for the input 'item'.

        If 'item' is a URL, its content is downloaded to a file named
        "<base_name>-<index>.<format>" in the output directory (or the
        current directory if no output directory is set), and the URL
        itself is recorded in a companion ".url" file.  The format is the
        subtype of the content type reported by the server.  If 'item' is
        not a URL, it is taken to be a file path relative to the current
        directory, and the format is its file name extension without the
        leading character.

        Returns (None, None) if the URL cannot be opened, does not point to
        an image, or cannot be downloaded; the reason is reported through
        self._say.
        '''
        # Shortcuts to make the code more readable.
        output_dir = self._output_dir
        say = self._say

        # For URLs, we download the corresponding files and name them with
        # the base_name.
        if is_url(item):
            # First make sure the URL actually points to an image.
            if __debug__: log('testing if URL contains an image: {}', item)
            try:
                response = urllib.request.urlopen(item)
            except Exception as ex:
                say.warn('Skipping URL due to error: {}'.format(ex))
                return (None, None)
            # Only the headers are needed from this probe, so release the
            # connection right away instead of leaking it.
            with response:
                main_type = response.headers.get_content_maintype()
                orig_fmt = response.headers.get_content_subtype()
            if main_type != 'image':
                say.warn('Did not find an image at {}'.format(item))
                return (None, None)
            base = '{}-{}'.format(base_name, index)
            # If we weren't given an output dir, then for URLs, we have no
            # choice but to use the current dir to download the file.
            # Important: don't change self._output_dir because if other
            # inputs *are* files, then those files will need other output dirs.
            if not output_dir:
                output_dir = os.getcwd()
            file = path.realpath(path.join(output_dir, base + '.' + orig_fmt))
            if not download_file(item, file, say):
                say.warn('Unable to download {}'.format(item))
                return (None, None)
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
                say.info('Wrote URL to {}'.format(relative(url_file)))
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            orig_fmt = filename_extension(file)[1:]

        if __debug__:
            log('{} has original format {}', relative(file), orig_fmt)
        return (file, orig_fmt)
Example #9
0
def targets_from_arguments(images, from_file, given_urls, say):
    '''Return the list of targets (URLs or image file paths) to process.

    With 'from_file', the targets are the lines of that file (minus their
    trailing newlines), passed through filter_urls() unless 'given_urls' is
    true.  Without 'from_file', 'images' is returned unchanged when
    'given_urls' is true; otherwise 'images' is treated as files and/or
    directories and searched for image files with an extension in
    ACCEPTED_FORMATS.  Unusable items are reported through 'say'.
    '''
    if from_file:
        with open(from_file) as f:
            lines = f.readlines()
        lines = [text.rstrip('\n') for text in lines]
        if __debug__: log('Read {} lines from "{}".', len(lines), from_file)
        return lines if given_urls else filter_urls(lines, say)

    if given_urls:
        # We assume that the arguments are URLs and take them as-is.
        return images

    # We were given files and/or directories.  Look for image files.
    found = []
    for entry in filter_urls(images, say):
        if path.isfile(entry) and filename_extension(entry) in ACCEPTED_FORMATS:
            found.append(entry)
            continue
        if not path.isdir(entry):
            say.warn('"{}" not a file or directory'.format(entry))
            continue
        found.extend(files_in_directory(entry, extensions = ACCEPTED_FORMATS))
    return found
Example #10
0
def run(classes, item, index, base_name, output_dir, creds_dir, annotate, say):
    '''Process one input 'item' with every method class in 'classes'.

    If 'item' is a URL, it must point to an image; the image is downloaded
    to "<base_name>-<index>.<format>" in 'output_dir' and the URL is saved
    in a companion ".url" file.  Otherwise 'item' is taken to be a file path
    relative to the current directory.

    For each method, credentials are loaded from 'creds_dir', the image is
    resized and/or converted if the method requires it, the method is
    invoked, and the resulting text and JSON data are written next to the
    image (or in 'output_dir' if given).  If 'annotate' is true, an
    annotated copy of the image is written too.

    Progress is shown with a spinner configured from 'say'.  Interruptions
    and unexpected exceptions are re-raised after updating the spinner; an
    AuthenticationFailure is reported and ends processing of this item.
    '''
    spinner = ProgressIndicator(say.use_color(), say.be_quiet())
    try:
        spinner.start('Starting on {}'.format(relative(item)))
        if is_url(item):
            # Make sure the URLs point to images.
            if __debug__: log('Testing if URL contains an image: {}', item)
            try:
                response = request.urlopen(item)
            except Exception as err:
                if __debug__:
                    log('Network access resulted in error: {}', str(err))
                spinner.fail('Skipping URL due to error: {}'.format(err))
                return
            # Only the headers are needed from this probe, so release the
            # connection right away instead of leaking it.
            with response:
                main_type = response.headers.get_content_maintype()
                fmt = response.headers.get_content_subtype()
            if main_type != 'image':
                spinner.fail('Did not find an image at {}'.format(item))
                return
            base = '{}-{}'.format(base_name, index)
            file = path.realpath(path.join(output_dir, base + '.' + fmt))
            error = download(item, file)
            if not error:
                spinner.update('Wrote contents to {}'.format(relative(file)))
            else:
                spinner.fail('Failed to download {}: {}'.format(item, error))
                return
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
                spinner.update('Wrote URL to {}'.format(relative(url_file)))
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            # NOTE(review): the URL branch yields a bare subtype such as
            # "jpeg"; confirm filename_extension() returns a comparable
            # form for the accepted_formats() test below.
            fmt = filename_extension(file)

        dest_dir = output_dir if output_dir else path.dirname(file)
        if not writable(dest_dir):
            say.fatal('Cannot write output in {}.'.format(dest_dir))
            return

        # Iterate over the methods.
        for method_class in classes:
            method = method_class()
            method.init_credentials(creds_dir)
            last_time = timer()

            # If need to convert format, best do it after resizing original fmt.
            need_convert = fmt not in method.accepted_formats()
            # Test the dimensions, not bytes, because of compression.
            if image_dimensions(file) > method.max_dimensions():
                file = file_after_resizing(file, method, spinner)
            if file and need_convert:
                file = file_after_converting(file, 'jpg', method, spinner)
            if not file:
                return

            spinner.update('Sending to {} {}'.format(
                color(method, 'white', say.use_color()),
                # Need explicit color research or colorization goes wrong.
                color('and waiting for response', 'info', say.use_color())))
            try:
                result = method.result(file)
            except RateLimitExceeded:
                time_passed = timer() - last_time
                if time_passed < 1 / method.max_rate():
                    spinner.warn('Pausing due to rate limits')
                    time.sleep(1 / method.max_rate() - time_passed)
                # Try once more after pausing.  Without this, 'result'
                # would be unbound (or left over from the previous method)
                # when it is examined below.  A second rate-limit error
                # propagates to the generic handler at the bottom.
                result = method.result(file)
            if result.error:
                spinner.fail(result.error)
                return

            file_name = path.basename(file)
            base_path = path.join(dest_dir, file_name)
            txt_file = alt_extension(base_path, str(method) + '.txt')
            json_file = alt_extension(base_path, str(method) + '.json')
            annot_file = alt_extension(base_path, str(method) + '.jpg')
            spinner.update('Text -> {}'.format(relative(txt_file)))
            save_output(result.text, txt_file)
            spinner.update('All data -> {}'.format(relative(json_file)))
            save_output(json.dumps(result.data), json_file)
            if annotate:
                spinner.update('Annotated image -> {}'.format(
                    relative(annot_file)))
                save_output(annotated_image(file, result.boxes), annot_file)
        spinner.stop('Done with {}'.format(relative(item)))
    except (KeyboardInterrupt, UserCancelled) as err:
        spinner.warn('Interrupted')
        raise
    except AuthenticationFailure as err:
        spinner.fail('Unable to continue using {}: {}'.format(method, err))
        return
    except Exception as err:
        spinner.fail(say.error_text('Stopping due to a problem'))
        raise
Example #11
0
def run(method_class, targets, given_urls, output_dir, root_name, creds_dir, say):
    '''Run the method 'method_class' on every item in 'targets'.

    If 'given_urls' is true, each target is a URL that must point to an
    image in one of ACCEPTED_FORMATS; the image is downloaded to
    "<root_name>-<index>.<format>" in 'output_dir' and the URL is saved in
    a companion ".url" file.  Otherwise each target is a file path relative
    to the current directory, and targets that look like URLs are skipped.

    Images whose format is in FORMATS_MUST_CONVERT are converted to JPEG
    first.  The extracted text and the full results are written as ".txt"
    and ".json" files in 'output_dir' if given, else next to the image.
    Credentials are loaded from 'creds_dir'.  Messages go through 'say' and
    a spinner; exceptions are re-raised after updating the spinner.
    '''
    spinner = ProgressIndicator(say.use_color(), say.be_quiet())
    try:
        tool = method_class()
        tool_name = tool.name()
        say.info('Using method "{}".'.format(tool_name))
        tool.init_credentials(creds_dir)
        for index, item in enumerate(targets, 1):
            if not given_urls and item.startswith(('http', 'ftp')):
                say.warn('Skipping URL "{}"'.format(item))
                continue
            if say.use_color() and not say.be_quiet():
                action = 'Downloading' if given_urls else 'Reading'
                spinner.start('{} {}'.format(action, item))
            fmt = None
            if given_urls:
                # Make sure the URLs point to images.  Only the headers are
                # needed from this probe, so release the connection right
                # away instead of leaking it.
                with request.urlopen(item) as response:
                    main_type = response.headers.get_content_maintype()
                    fmt = response.headers.get_content_subtype()
                if main_type != 'image':
                    spinner.fail('Did not find an image at "{}"'.format(item))
                    continue
                if fmt not in ACCEPTED_FORMATS:
                    spinner.fail('Cannot use image format {} in "{}"'.format(fmt, item))
                    continue
                # If we're given URLs, we have to invent file names to store
                # the images and the OCR results.
                base = '{}-{}'.format(root_name, index)
                url_file = path.realpath(path.join(output_dir, base + '.url'))
                if __debug__: log('Writing URL to {}', url_file)
                with open(url_file, 'w') as f:
                    f.write(url_file_content(item))
                file = path.realpath(path.join(output_dir, base + '.' + fmt))
                if __debug__: log('Starting wget on {}', item)
                (success, error) = download_url(item, file)
                if not success:
                    spinner.fail('Failed to download {}: {}'.format(item, error))
                    continue
            else:
                file = path.realpath(path.join(os.getcwd(), item))
                fmt = filename_extension(file)
            if output_dir:
                dest_dir = output_dir
            else:
                dest_dir = path.dirname(file)
                if not writable(dest_dir):
                    say.fatal('Cannot write output in "{}".'.format(dest_dir))
                    return
            if fmt in FORMATS_MUST_CONVERT:
                spinner.update('Converting file format to JPEG: "{}"'.format(file))
                (success, converted_file, msg) = convert_image(file, fmt, 'jpeg')
                if not success:
                    spinner.fail('Failed to convert "{}": {}'.format(file, msg))
                    # There is no usable image for this item, so move on
                    # rather than sending a failed conversion to the service.
                    continue
                # Note: 'file' now points to the converted file, not the original
                file = converted_file
            file_name = path.basename(file)
            base_path = path.join(dest_dir, file_name)
            txt_file  = replace_extension(base_path, '.' + tool_name + '.txt')
            json_file = replace_extension(base_path, '.' + tool_name + '.json')
            spinner.update('Sending to {} for text extraction'.format(tool_name))
            save_output(tool.document_text(file), txt_file)
            spinner.update('Text from {} saved in {}'.format(tool_name, txt_file))
            spinner.update('All data from {} saved in {}'.format(tool_name, json_file))
            save_output(json.dumps(tool.all_results(file)), json_file)
            if say.use_color() and not say.be_quiet():
                short_path = path.relpath(txt_file, os.getcwd())
                spinner.stop('{} -> {}'.format(item, short_path))
    except (KeyboardInterrupt, UserCancelled) as err:
        if spinner:
            spinner.stop()
        raise
    except Exception as err:
        if spinner:
            spinner.fail(say.error_text('Stopping due to a problem'))
        raise