Ejemplo n.º 1
0
 def _resized_image(self, file):
     '''Return the path of a copy of "file" that fits self._max_dimensions.

     The candidate output name is "file" itself when its name already
     contains "-reduced" (past the first character), and otherwise the
     base name of "file" with "-reduced" inserted before the extension.
     If that candidate already exists, is readable, and is strictly
     smaller than both limits, it is reused.  Otherwise the image is
     re-dimensioned via reduced_image_dimensions().  Returns None, after
     reporting the error, if the re-dimensioning reports an error.
     '''
     say = self._say
     width_limit, height_limit = self._max_dimensions
     extension = filename_extension(file)
     already_reduced = file.find('-reduced') > 0
     target = (file if already_reduced
               else filename_basename(file) + '-reduced' + extension)

     if path.exists(target) and readable(target):
         width, height = image_dimensions(target)
         fits = width < width_limit and height < height_limit
         if fits:
             say.info('Using reduced image found in {}'.format(
                 relative(target)))
             return target
         # A "-reduced" file is present (perhaps from an earlier run), but
         # it is still too big for the limits currently in force.
         if __debug__:
             log('existing resized file larger than {}x{}: {}',
                 width_limit, height_limit, target)

     say.info('Dimensions too large; reducing dimensions: {}'.format(
         relative(file)))
     outcome = reduced_image_dimensions(file, target, width_limit,
                                        height_limit)
     (resized, error) = outcome
     if not error:
         return resized
     say.error('Failed to re-dimension {}: {}'.format(
         relative(file), error))
     return None
Ejemplo n.º 2
0
def annotated_image(file, text_boxes, service):
    '''Return an in-memory PNG of image "file" annotated with "text_boxes".

    Arguments:
      file: path of the image, read with mpimg.imread().
      text_boxes: iterable of items having "boundingBox" and "text"
        attributes; may be empty or None, in which case no text is drawn.
        Only the first two values of each boundingBox are used, as the x
        and y position of the label.
        (presumably boundingBox is a flat x0, y0, x1, y1, ... sequence --
        TODO confirm against the services that produce it)
      service: object whose name() is used as the figure title.

    Returns an io.BytesIO positioned at offset 0 containing the PNG data.
    '''
    service_name = service.name()

    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
    # Always release the figure, even if reading or rendering fails;
    # otherwise pyplot keeps every unclosed figure alive.
    try:
        axes.get_xaxis().set_visible(False)
        axes.get_yaxis().set_visible(False)
        axes.set_title(service_name, color='r', fontweight='bold',
                       fontsize=22)

        if __debug__:
            log('reading image file for {}: {}', service_name, relative(file))
        img = mpimg.imread(file)
        axes.imshow(img, cmap="gray")

        props = dict(facecolor='white', alpha=0.7)
        if text_boxes:
            if __debug__:
                log('adding {} annotations for {}', len(text_boxes),
                    service_name)
            for item in text_boxes:
                corners = item.boundingBox
                # Offset the label slightly up and left of the first
                # vertex, without going outside the image.
                x = max(0, corners[0] - 4)
                y = max(0, corners[1] - 8)
                # Draw on our own axes rather than via plt.text(), which
                # targets whatever pyplot considers the "current" axes.
                axes.text(x, y, item.text, color='r', fontsize=11, va="top",
                          bbox=props)

        if __debug__:
            log('generating png for {} for {}', service_name, relative(file))
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=300, bbox_inches='tight',
                    pad_inches=0)
        buf.seek(0)
    finally:
        plt.close(fig)

    return buf
Ejemplo n.º 3
0
 def _smaller_file(self, file):
     '''Return the path of a copy of "file" smaller than self._max_size.

     A false "file" yields None.  The candidate output name is "file"
     itself when its name already contains "-reduced" (past the first
     character), and otherwise the base name with "-reduced" inserted
     before the extension.  An existing candidate whose image_size() is
     below the limit is reused; otherwise reduced_image_size() is called.
     Returns None, after reporting the error, if that call reports one.
     '''
     if not file:
         return None

     say = self._say
     extension = filename_extension(file)
     already_reduced = file.find('-reduced') > 0
     target = (file if already_reduced
               else filename_basename(file) + '-reduced' + extension)

     if path.exists(target):
         small_enough = image_size(target) < self._max_size
         if small_enough:
             say.info('Reusing resized image found in {}'.format(
                 relative(target)))
             return target
         # A "-reduced" file is present (perhaps from an earlier run), but
         # it exceeds the size allowed by the current set of services.
         if __debug__:
             log('existing resized file larger than {}b: {}',
                 humanize.intcomma(self._max_size), target)

     say.info('Size too large; reducing size: {}'.format(relative(file)))
     outcome = reduced_image_size(file, target, self._max_size)
     (resized, error) = outcome
     if not error:
         return resized
     say.error('Failed to resize {}: {}'.format(relative(file), error))
     return None
Ejemplo n.º 4
0
def file_after_converting(file, to_format, tool, spinner):
    '''Return the path of "file" converted to "to_format", or None.

    If a file with the same base name and the extension "to_format"
    already exists, it is reused.  Otherwise converted_image() is called;
    when it yields no converted file, the failure is reported through
    "spinner" and None is returned.  "tool" is accepted but not used here.
    '''
    target = ''.join((filename_basename(file), '.', to_format))

    if path.exists(target):
        spinner.update('Using converted image found in {}'.format(
            relative(target)))
        return target

    spinner.update('Converting to {} format: {}'.format(
        to_format, relative(file)))
    outcome = converted_image(file, to_format)
    (converted, error) = outcome
    if converted:
        return converted
    spinner.fail('Failed to convert {}: {}'.format(
        relative(file), error))
    return None
Ejemplo n.º 5
0
def file_after_resizing(file, tool, spinner):
    '''Return the path of a reduced-size version of "file", or None.

    If a file named <base>-reduced.<ext> already exists, it is reused.
    Otherwise reduced_image() is called with the limits given by
    tool.max_dimensions(); when it yields no file, the failure is
    reported through "spinner" and None is returned.
    '''
    file_ext = filename_extension(file)
    # NOTE(review): if filename_extension() returns the extension with its
    # leading dot, this produces "name-reduced..ext" -- confirm.
    new_file = filename_basename(file) + '-reduced.' + file_ext
    if path.exists(new_file):
        spinner.update('Using reduced image found in {}'.format(
            relative(new_file)))
        return new_file

    spinner.update('Original image too large; reducing size')
    (resized, error) = reduced_image(file, tool.max_dimensions())
    if not resized:
        # Both placeholders need their own argument: the path and the error.
        spinner.fail('Failed to resize {}: {}'.format(
            relative(file), error))
        return None
    return resized
Ejemplo n.º 6
0
    def _save_output(self, result, file):
        '''Write "result" to the path "file".

        "result" may be None (a warning is issued and nothing is written),
        a (data, error) tuple (a true error is reported and nothing is
        written; otherwise data is used), a str (written in text mode), or
        an io.BytesIO (copied in binary mode).  Anything else raises
        InternalError.
        '''
        say = self._say

        # Sanity checks come first.
        if result is None:
            say.warn('No data for {}'.format(file))
            return

        payload = result
        if isinstance(result, tuple):
            # A tuple is assumed to hold exactly two elements: data, error.
            (payload, problem) = result
            if problem:
                say.error('Error: {}'.format(problem))
                say.warn('Unable to write {}'.format(file))
                return

        if __debug__:
            log('writing output to file {}', relative(file))

        if isinstance(payload, str):
            with open(file, 'w') as out:
                out.write(payload)
            return
        if isinstance(payload, io.BytesIO):
            with open(file, 'wb') as out:
                shutil.copyfileobj(payload, out)
            return

        # No other type is produced by the code, so reaching here is a bug.
        raise InternalError(
            'Unexpected data in save_output() -- please report this.')
Ejemplo n.º 7
0
 def _converted_file(self, file, to_format, dest_dir):
     '''Return the path in "dest_dir" of "file" converted to "to_format".

     The output is named after the base name of "file" with the extension
     "to_format".  If it already exists it is reused; otherwise
     converted_image() is called.  Returns None, after reporting the
     error, if the conversion reports one.
     '''
     say = self._say
     stem = path.basename(filename_basename(file))
     target = path.join(dest_dir, ''.join((stem, '.', to_format)))

     if path.exists(target):
         say.info('Using already converted image in {}'.format(
             relative(target)))
         return target

     say.info('Converting to {} format: {}'.format(
         to_format, relative(file)))
     outcome = converted_image(file, to_format, target)
     (converted, error) = outcome
     if not error:
         return converted
     say.error('Failed to convert {}: {}'.format(
         relative(file), error))
     return None
Ejemplo n.º 8
0
    def _get(self, item, base_name, index):
        '''Return a tuple (file, format) for the input "item".

        If "item" is a URL, it is checked to have an image content type,
        downloaded to <base_name>-<index>.<subtype> in the output directory
        (or the current directory if none is set), and a companion ".url"
        file recording the URL is written next to it.  Otherwise "item" is
        taken as a file path relative to the current directory.

        Returns (path, format), where format is the content subtype for
        URLs or the file name extension minus its first character for
        files.  Returns (None, None), after a warning, if a URL cannot be
        opened, is not an image, or cannot be downloaded.
        '''
        # Shortcuts to make the code more readable.
        output_dir = self._output_dir
        say = self._say

        # For URLs, we download the corresponding files and name them with
        # the base_name.
        if is_url(item):
            # First make sure the URL actually points to an image.
            if __debug__: log('testing if URL contains an image: {}', item)
            try:
                response = urllib.request.urlopen(item)
            except Exception as ex:
                say.warn('Skipping URL due to error: {}'.format(ex))
                return (None, None)
            # Only the headers are needed; close the response right away so
            # that the connection is not left open.
            with response:
                content_type = response.headers.get_content_maintype()
                orig_fmt = response.headers.get_content_subtype()
            if content_type != 'image':
                say.warn('Did not find an image at {}'.format(item))
                return (None, None)
            base = '{}-{}'.format(base_name, index)
            # If we weren't given an output dir, then for URLs, we have no
            # choice but to use the current dir to download the file.
            # Important: don't change self._output_dir because if other
            # inputs *are* files, then those files will need other output dirs.
            if not output_dir:
                output_dir = os.getcwd()
            file = path.realpath(path.join(output_dir, base + '.' + orig_fmt))
            if not download_file(item, file, say):
                say.warn('Unable to download {}'.format(item))
                return (None, None)
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
                say.info('Wrote URL to {}'.format(relative(url_file)))
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            # Drop the first character of the extension to get the format.
            orig_fmt = filename_extension(file)[1:]

        if __debug__:
            log('{} has original format {}', relative(file), orig_fmt)
        return (file, orig_fmt)
Ejemplo n.º 9
0
    def _send(self, file, service, dest_dir):
        '''Send the "file" to the service named "service" and write output in
        directory "dest_dir".

        An annotated PNG is always written; if self._extended_results is
        true, the service's data (as JSON) and extracted text are written
        too.  If the service raises RateLimitExceeded, the request is sent
        once more after pausing for the remainder of the interval implied by
        service.max_rate().

        Returns the path of the annotated image, or None if the service
        reported an error or the rate limit was hit again on the resend.
        Raises AuthFailure if the service raises it.
        '''
        say = self._say
        use_color = say.use_color()
        color = service.name_color()
        service_name = styled(service.name(),
                              color) if use_color else service.name()

        say.info(
            'Sending to {} and waiting for response ...'.format(service_name))
        # One initial attempt plus one resend after a rate-limit pause.
        for retries_left in (1, 0):
            last_time = timer()
            try:
                result = service.result(file)
            except AuthFailure as ex:
                raise AuthFailure(
                    'Unable to use {}: {}'.format(service, ex)) from ex
            except RateLimitExceeded:
                if retries_left:
                    time_passed = timer() - last_time
                    if time_passed < 1 / service.max_rate():
                        say.warn('Pausing {} due to rate limits'.format(
                            service_name))
                        time.sleep(1 / service.max_rate() - time_passed)
            else:
                break
        else:
            # Still rate-limited after the resend: there is no result to
            # use, so report it the same way as any other service failure.
            say.error('{} failed: {}'.format(service_name,
                                             'rate limit exceeded'))
            say.warn('No result from {} for {}'.format(service_name,
                                                       relative(file)))
            return None

        if result.error:
            say.error('{} failed: {}'.format(service_name, result.error))
            say.warn('No result from {} for {}'.format(service_name,
                                                       relative(file)))
            return None

        say.info('Got result from {}.'.format(service_name))
        file_name = path.basename(file)
        base_path = path.join(dest_dir, file_name)
        annot_path = alt_extension(base_path, str(service) + '.png')
        say.info('Creating annotated image for {}.'.format(service_name))
        self._save_output(annotated_image(file, result.boxes, service),
                          annot_path)
        if self._extended_results:
            txt_file = alt_extension(base_path, str(service) + '.txt')
            json_file = alt_extension(base_path, str(service) + '.json')
            say.info('Saving all data for {}.'.format(service_name))
            self._save_output(json.dumps(result.data), json_file)
            say.info('Saving extracted text for {}.'.format(service_name))
            self._save_output(result.text, txt_file)

        # Return the annotated image file b/c we use it for the summary grid.
        return annot_path
Ejemplo n.º 10
0
def run(classes, item, index, base_name, output_dir, creds_dir, annotate, say):
    '''Run every method in "classes" on the image "item" and save the results.

    Arguments:
      classes: iterable of classes; each is instantiated with no arguments
        and given init_credentials(creds_dir).
      item: a URL or a file path relative to the current directory.
      index, base_name: used to name the downloaded copy of a URL item,
        as <base_name>-<index>.<subtype>.
      output_dir: where outputs are written; if false, the directory of the
        input file is used for the results.
      creds_dir: passed to each method's init_credentials().
      annotate: if true, an annotated image is also saved for each method.
      say: messaging object providing use_color(), be_quiet(), fatal() and
        error_text().

    For each method, the text and JSON data are written next to each other
    with the method name in the file extension.  If a method raises
    RateLimitExceeded, the request is sent once more after a pause.
    Returns None in all cases; progress and failures are reported through a
    ProgressIndicator.  KeyboardInterrupt, UserCancelled and unexpected
    exceptions are re-raised after being reported.
    '''
    spinner = ProgressIndicator(say.use_color(), say.be_quiet())
    try:
        spinner.start('Starting on {}'.format(relative(item)))
        if is_url(item):
            # Make sure the URLs point to images.
            if __debug__: log('Testing if URL contains an image: {}', item)
            try:
                response = request.urlopen(item)
            except Exception as err:
                if __debug__:
                    log('Network access resulted in error: {}', str(err))
                spinner.fail('Skipping URL due to error: {}'.format(err))
                return
            # Only the headers are needed; close the response right away so
            # that the connection is not left open.
            with response:
                content_type = response.headers.get_content_maintype()
                fmt = response.headers.get_content_subtype()
            if content_type != 'image':
                spinner.fail('Did not find an image at {}'.format(item))
                return
            base = '{}-{}'.format(base_name, index)
            file = path.realpath(path.join(output_dir, base + '.' + fmt))
            error = download(item, file)
            if not error:
                spinner.update('Wrote contents to {}'.format(relative(file)))
            else:
                spinner.fail('Failed to download {}: {}'.format(item, error))
                return
            url_file = path.realpath(path.join(output_dir, base + '.url'))
            with open(url_file, 'w') as f:
                f.write(url_file_content(item))
                spinner.update('Wrote URL to {}'.format(relative(url_file)))
        else:
            file = path.realpath(path.join(os.getcwd(), item))
            fmt = filename_extension(file)

        dest_dir = output_dir if output_dir else path.dirname(file)
        if not writable(dest_dir):
            say.fatal('Cannot write output in {}.'.format(dest_dir))
            return

        # Iterate over the methods.
        for method_class in classes:
            method = method_class()
            method.init_credentials(creds_dir)
            last_time = timer()

            # If need to convert format, best do it after resizing original fmt.
            need_convert = fmt not in method.accepted_formats()
            # Test the dimensions, not bytes, because of compression.
            # NOTE(review): if both sides are (width, height) tuples, ">"
            # compares them lexicographically, so an over-tall image with an
            # acceptable width would not be resized -- confirm the types.
            if image_dimensions(file) > method.max_dimensions():
                file = file_after_resizing(file, method, spinner)
            if file and need_convert:
                file = file_after_converting(file, 'jpg', method, spinner)
            if not file:
                return

            spinner.update('Sending to {} {}'.format(
                color(method, 'white', say.use_color()),
                # Need explicit color research or colorization goes wrong.
                color('and waiting for response', 'info', say.use_color())))
            # One initial attempt plus one resend after a rate-limit pause.
            for retries_left in (1, 0):
                try:
                    result = method.result(file)
                except RateLimitExceeded:
                    if retries_left:
                        time_passed = timer() - last_time
                        if time_passed < 1 / method.max_rate():
                            spinner.warn('Pausing due to rate limits')
                            time.sleep(1 / method.max_rate() - time_passed)
                        last_time = timer()
                else:
                    break
            else:
                # Still rate-limited after the resend: no result to use.
                spinner.fail('Rate limit exceeded for {}'.format(method))
                return
            if result.error:
                spinner.fail(result.error)
                return

            file_name = path.basename(file)
            base_path = path.join(dest_dir, file_name)
            txt_file = alt_extension(base_path, str(method) + '.txt')
            json_file = alt_extension(base_path, str(method) + '.json')
            annot_file = alt_extension(base_path, str(method) + '.jpg')
            spinner.update('Text -> {}'.format(relative(txt_file)))
            save_output(result.text, txt_file)
            spinner.update('All data -> {}'.format(relative(json_file)))
            save_output(json.dumps(result.data), json_file)
            if annotate:
                spinner.update('Annotated image -> {}'.format(
                    relative(annot_file)))
                # NOTE(review): called with two arguments here; confirm this
                # matches the signature of annotated_image() in this module.
                save_output(annotated_image(file, result.boxes), annot_file)
        spinner.stop('Done with {}'.format(relative(item)))
    except (KeyboardInterrupt, UserCancelled):
        spinner.warn('Interrupted')
        raise
    except AuthenticationFailure as err:
        spinner.fail('Unable to continue using {}: {}'.format(method, err))
        return
    except Exception:
        spinner.fail(say.error_text('Stopping due to a problem'))
        raise
Ejemplo n.º 11
0
    def run_services(self, item, index, base_name):
        '''Apply every configured service to the image denoted by "item".

        "index" and "base_name" are handed to self._get() to name the local
        copy when "item" has to be downloaded from a URL.  The image is
        normalized with self._normalized(), sent to each service with
        self._send() (in a thread pool unless self._num_threads is 1), and,
        if self._make_grid is set, the per-service images are combined into
        a single "<basename>.all-results.png" grid.  When a grid is made and
        extended results are not requested, intermediate and per-service
        image files are deleted afterwards.

        Returns None.  Exceptions are reported and then re-raised.
        '''
        # Local aliases keep the code below shorter.
        say = self._say
        services = self._services
        output_dir = self._output_dir

        try:
            label = styled(item, 'white') if say.use_color() else item
            say.info('Starting on {}'.format(label))

            (file, orig_fmt) = self._get(item, base_name, index)
            if not file:
                return

            dest_dir = output_dir or path.dirname(file)
            if not writable(dest_dir):
                say.error('Cannot write output in {}.'.format(dest_dir))
                return

            # Sanity check: nothing to do for an empty file.
            if path.getsize(file) <= 0:
                say.warn('Skipping zero-length file {}'.format(relative(file)))
                return

            # The grid file name derives from the original file, so compute
            # it before "file" is replaced by its normalized version.
            stem = path.basename(filename_basename(file))
            grid_file = path.realpath(
                path.join(dest_dir, stem + '.all-results.png'))

            # Temporary files we create and will usually remove at the end.
            to_delete = set()

            # Normalize to the lowest common denominator.
            normalized = self._normalized(file, orig_fmt, dest_dir)
            (new_file, intermediate_files) = normalized
            if not new_file:
                say.warn('Skipping {}'.format(relative(file)))
                return
            file = new_file
            if intermediate_files:
                to_delete.update(intermediate_files)

            # Send the file to the services.  With a single thread we avoid
            # the thread pool altogether, which makes debugging easier.
            if self._num_threads == 1:
                outputs = []
                for service in services:
                    outputs.append(self._send(file, service, dest_dir))
            else:
                with ThreadPoolExecutor(
                        max_workers=self._num_threads) as pool:
                    outputs = list(pool.map(
                        lambda service: self._send(file, service, dest_dir),
                        services))

            # A service that failed (e.g., a network glitch) gives us None.
            # Drop those and carry on with whatever did come back.
            results = [out for out in outputs if out is not None]
            to_delete.update(results)

            # Create grid file if requested.
            if self._make_grid:
                say.info('Creating results grid image: {}'.format(
                    relative(grid_file)))
                create_image_grid(results, grid_file, max_horizontal=2)

            # Clean up after ourselves.
            if self._make_grid and not self._extended_results:
                for leftover in to_delete:
                    delete_existing(leftover)

            say.info('Done with {}'.format(relative(item)))
        except (KeyboardInterrupt, UserCancelled):
            say.warn('Interrupted')
            raise
        except Exception:
            say.error('Stopping due to a problem')
            raise