def __init__(self, base_name, extended, from_file, output_dir, threads):
        '''Initialize internal state and prepare for running services.

        The environment is validated before any state is stored:

         * network_available() must report a usable network;
         * if "from_file" is given, it must exist and be readable;
         * if "output_dir" is given and is an existing directory, it must be
           writable; if it is not an existing directory, it is created
           (including any missing parent directories).

        The arguments "base_name", "extended" and "threads" are stored as
        given, without validation.

        Raises ServiceFailure if there is no network, RuntimeError if one of
        the file or directory checks fails, and OSError if the output
        directory cannot be created.
        '''

        if not network_available():
            raise ServiceFailure('No network.')

        if from_file:
            if not path.exists(from_file):
                raise RuntimeError('File not found: {}'.format(from_file))
            if not readable(from_file):
                raise RuntimeError('File not readable: {}'.format(from_file))

        if output_dir:
            if path.isdir(output_dir):
                if not writable(output_dir):
                    raise RuntimeError(
                        'Directory not writable: {}'.format(output_dir))
            else:
                # makedirs() rather than mkdir(), so that a destination such
                # as "results/run1" works even when "results" does not exist.
                os.makedirs(output_dir)
                if __debug__:
                    log('created output_dir directory {}', output_dir)

        self._base_name = base_name
        self._extended = extended
        self._from_file = from_file
        self._output_dir = output_dir
        self._threads = threads
Exemple #2
0
 def _resized_image(self, file):
     '''Return the path of a version of "file" reduced to fit the size limits.

     The limits are read from self._max_dimensions as (max_width, max_height).
     The reduced version is written to a file named after "file" with
     "-reduced" inserted before the extension, unless the path of "file"
     already contains "-reduced" (anywhere past its first character), in
     which case "file" itself is used as the destination.

     If a readable reduced file already exists and fits within the limits,
     it is reused as-is.  Otherwise, reduced_image_dimensions() is called to
     create (or recreate) it.

     Returns the first value returned by reduced_image_dimensions(), or None
     if it reported an error (the error is also reported via self._say).
     '''
     (max_width, max_height) = self._max_dimensions
     file_ext = filename_extension(file)
     say = self._say
     if file.find('-reduced') > 0:
         new_file = file
     else:
         new_file = filename_basename(file) + '-reduced' + file_ext
     if path.exists(new_file) and readable(new_file):
         (image_width, image_height) = image_dimensions(new_file)
         # Treat the limits as inclusive maxima: an image that is exactly
         # max_width x max_height does not need to be reduced again.
         if image_width <= max_width and image_height <= max_height:
             say.info('Using reduced image found in {}'.format(
                 relative(new_file)))
             return new_file
         else:
             # We found a "-reduced" file, perhaps from a previous run, but
             # for the current set of services, dimensions are too large.
             if __debug__:
                 log('existing resized file larger than {}x{}: {}',
                     max_width, max_height, new_file)
     say.info('Dimensions too large; reducing dimensions: {}'.format(
         relative(file)))
     (resized, error) = reduced_image_dimensions(file, new_file, max_width,
                                                 max_height)
     if error:
         say.error('Failed to re-dimension {}: {}'.format(
             relative(file), error))
         return None
     return resized
Exemple #3
0
    def _image_from_file(self, file_path):
        '''Helper function for subclasses to read image files.

        Returns a tuple, (image, error).  On success, "image" is the bytes
        of the file and "error" is None.  On failure, "image" is None and
        "error" is a TRResult whose error field describes the problem.

        A failure is reported if the file is not readable, is empty, is
        larger in bytes than self.max_size(), or (when self.max_dimensions()
        returns a value) is wider or taller than those dimensions.
        '''

        def error_result(error_text):
            return (None, TRResult(path = file_path, data = {}, text = '',
                                   error = error_text, boxes = []))

        if not readable(file_path):
            return error_result('Unable to read file: {}'.format(file_path))
        if __debug__: log('reading image file {} for {}', file_path, self.name())
        # Use a context manager so the file handle is closed promptly instead
        # of being left for the garbage collector.
        with open(file_path, 'rb') as image_file:
            image = image_file.read()
        if not image:
            return error_result('Empty file: {}'.format(file_path))
        max_size = self.max_size()
        if len(image) > max_size:
            text = 'Exceeds {} byte limit for service: {}'.format(max_size, file_path)
            return error_result(text)
        width, height = imagesize.get(file_path)
        if __debug__: log('image size is width = {}, height = {}', width, height)
        limits = self.max_dimensions()
        if limits:
            max_width, max_height = limits
            if width > max_width or height > max_height:
                text = 'Image dimensions {}x{} exceed {} limits: {}'.format(
                    width, height, self.name(), file_path)
                return error_result(text)
        return (image, None)
Exemple #4
0
    def __init__(self):
        '''Locate the Google credentials file and publish its location.

        The file is expected to be named "google_credentials.json" inside the
        directory returned by self.credentials_dir().  Raises AuthFailure if
        the file does not exist or is not readable.  On success, the path is
        stored in self.creds_file and exported through the environment
        variable GOOGLE_APPLICATION_CREDENTIALS.
        '''
        creds_path = path.join(self.credentials_dir(), 'google_credentials.json')

        if not path.exists(creds_path):
            raise AuthFailure('Credentials for Google have not been installed')
        if not readable(creds_path):
            message = 'Google credentials file unreadable: {}'.format(creds_path)
            raise AuthFailure(message)

        # Note from the original author: loading the JSON contents directly
        # and keeping them on the object did not work; only the environment
        # variable approach has been working.
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = creds_path
        self.creds_file = creds_path
    def __init__(self):
        '''Locate and load the Amazon credentials file.

        The file name comes from credentials_filename('amazon') and is looked
        up in the directory returned by self.credentials_dir().  On success,
        the parsed JSON content is stored in self.credentials.

        Raises AuthFailure if the file does not exist, is not readable, or
        cannot be opened and parsed as JSON.
        '''
        cfile = path.join(self.credentials_dir(),
                          credentials_filename('amazon'))
        if __debug__: log('credentials file for amazon is {}', cfile)
        if not path.exists(cfile):
            raise AuthFailure('Credentials for Amazon have not been installed')
        elif not readable(cfile):
            raise AuthFailure(
                'Amazon credentials file unreadable: {}'.format(cfile))

        try:
            with open(cfile, 'r') as file:
                self.credentials = json.load(file)
        except Exception as ex:
            # Any failure here (I/O or malformed JSON) is reported uniformly
            # as an authentication problem.
            raise AuthFailure(
                'Unable to parse Amazon credentials file: {}'.format(str(ex)))
Exemple #6
0
    def __init__(self):
        '''Locate and load the Microsoft credentials file.

        The file is expected to be named "microsoft_credentials.json" inside
        the directory returned by self.credentials_dir().  On success, the
        path is stored in self.creds_file and the value of the
        "subscription_key" entry of the JSON content is stored in
        self.credentials.

        Raises AuthFailure if the file does not exist, is not readable,
        cannot be parsed as JSON, or lacks a "subscription_key" entry.
        '''
        cfile = path.join(self.credentials_dir(), 'microsoft_credentials.json')
        if not path.exists(cfile):
            raise AuthFailure(
                'Credentials for Microsoft have not been installed')
        elif not readable(cfile):
            raise AuthFailure(
                'Microsoft credentials file unreadable: {}'.format(cfile))

        self.creds_file = cfile
        try:
            with open(self.creds_file, 'r') as file:
                creds = json.load(file)
                self.credentials = creds['subscription_key']
        except Exception as ex:
            # Any failure here (I/O, malformed JSON, or a missing key) is
            # reported uniformly as an authentication problem.
            raise AuthFailure(
                'Unable to parse Microsoft credentials file: {}'.format(
                    str(ex)))
def main(add_creds='A',
         base_name='B',
         compare=False,
         no_color=False,
         extended=False,
         from_file='F',
         no_grid=False,
         list=False,
         output_dir='O',
         quiet=False,
         relaxed=False,
         services='S',
         threads='T',
         version=False,
         debug='OUT',
         *files):
    '''Handprint (a loose acronym of "HANDwritten Page RecognitIoN Test") runs
alternative text recognition services on images of handwritten document pages.

Installing credentials for cloud-based services
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the command-line flag -l (or /l on Windows), Handprint will print a
list of the known services, and exit.

Before a given service can be used, if it is cloud-based commercial OCR/HTR
service, Handprint needs to be supplied with user credentials for accessing
that service.  The credentials must be stored in a JSON file with a certain
format; see the Handprint user documentation for details about the formats
for each service.  To add a new credentials file, use the -a option (/a on
Windows) in combination with the name of a service and a single file path on
the command line.  The name supplied right after the -a option must be the
name of a recognized service (such as "google", "amazon", "microsoft"), and
the file argument must be a JSON file containing the credentials data in the
required format for that service.  Here is an example of adding credentials
for Google (assuming you created the JSON file as described in the docs):

  handprint -a google mygooglecreds.json

Run Handprint with the -a option multiple times to install credentials for
each different service.  Handprint will copy the credential files to its own
configuration directory and exit without doing anything else.  The directory
is different on different operating systems; for example, on macOS it
is ~/Library/Application Support/Handprint/.

Basic usage
~~~~~~~~~~~

After credentials are installed, running Handprint without the -a option will
invoke one or more OCR/HTR services on files, directories of files, or URLs.
The image paths or URLs can be supplied in any of the following ways:

 a) one or more directory paths or one or more image file paths, which will
    be interpreted as images (either individually or in directories) to be
    processed;

 b) one or more URLs, which will be interpreted as network locations of image
    files to be processed; or

 c) if given the -f option (/f on Windows), a file containing either image
    paths or image URLs.

If given URLs, Handprint will first download the images found at the URLs to
a local directory indicated by the option -o (/o on Windows).  Handprint can
accept input images in JP2, JPEG, PDF, PNG, GIF, BMP, and TIFF formats.  To
make the results from different services more easily comparable, Handprint
will always convert all input images to the same format (PNG) no matter if
some services may accept other formats; it will also resize input images to
the smallest size accepted by any of the services invoked if an image exceeds
that size.  (For example, if service A accepts files up to 10 MB in size and
service B accepts files up to 5 MB, all input images will be resized to 5 MB
before sending them to A and B, even if A could accept a higher-resolution
image.)  In addition, a limitation of Handprint's current PDF support is that
only the first image in a PDF file is read -- if a PDF file contains
more than one image, the remaining images are ignored.

The default action is to run all known services.  The option -s (/s on
Windows) can be used to select only one service or a list of services
instead.  Lists of services should be separated by commas; e.g.,
"google,microsoft".  To find out which services are supported by Handprint, run
it with the command-line flag -l (or /l on Windows), which will make Handprint
print a list of the known services and exit immediately.

When performing OCR/HTR on images, Handprint temporarily (unless the -e
option is given -- see below) writes the results to new files that it creates
either in the same directories as the original files, or (if given the -o
option) the directory indicated by the -o option (/o on Windows).  The
results will be written in files named after the original files with the
addition of a string that indicates the service used.  For example, a file
named "somefile.jpg" will result in

  somefile.handprint-google.png
  somefile.handprint-microsoft.png
  somefile.handprint-amazon.png
  ...

and so on for each image and each service used.  THESE FILES ARE DELETED
after the final results grid image is created, UNLESS the -e option (/e on
Windows) is used to indicate that extended results should be produced; in that
case, these individual annotated image files are kept.

Visual display of recognition results
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

After gathering the results of each service for a given input, Handprint will
create a single compound image consisting of all the annotated results images
arranged in a grid.  This is intended to make it easier to compare the
results of multiple services against each other.  To skip the creation of the
results grid, use the -G option (/G on Windows).  The grid image will be named

  somefile.handprint-all.png

If given the -e option (/e on Windows), Handprint will produce extended
output that includes the complete response from the service (converted to a
JSON file by Handprint) and the text extracted (stored as a .txt file).  The
output of -e will be multiple files like this:

  somefile.handprint-google.png
  somefile.handprint-google.json
  somefile.handprint-google.txt
  somefile.handprint-microsoft.png
  somefile.handprint-microsoft.json
  somefile.handprint-microsoft.txt
  somefile.handprint-amazon.png
  somefile.handprint-amazon.json
  somefile.handprint-amazon.txt
  ...

The files will be written to the directory indicated by -o, or (if -o is not
used) the directory where "somefile" is located.  When -o is not used and
the input images are given as URLs, then the files are written to the current
working directory instead.

If an image is too large for any of the services invoked, then Handprint will
resize it prior to sending the image to any of the services (as noted above).
It will write the reduced image to a file named "FILENAME.handprint.EXT", where
"FILENAME" is the original file name and "EXT" is the file extension.  This
file is normally deleted, unless you use the -e option (/e on Windows)
mentioned above, in which case you will find this additional file in the same
location as the others:

  somefile.handprint.png

When the inputs are URLs, Handprint must download a copy of the image located
at the network address (because it is not possible to write the results in
the network locations represented by the URLs.).  The images and other
results will be stored in files whose root names have the form "document-N",
where "N" is an integer.  The root name can be changed using the -b option
(/b on Windows).  The image at networked locations will be converted to
ordinary PNG format for maximum compatibility with the different OCR
services and written to "document-N.png", and the URL corresponding to each
document will be written in a file named "document-N.url" so that it is
possible to connect each "document-N.png" to the URL it came from.

Finally, note that the use of the -G option (/G on Windows) WITHOUT either
the -e or -c option is an error because it means no output would be produced.

Comparing results to expected output
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Handprint supports comparing the output of HTR services to expected output
(i.e., ground truth) using the option -c (or /c on Windows).  This facility
requires that the user provides text files that contain the expected text
for each input image.  The ground-truth text files must have the following
characteristics:

 a) The file containing the expected results should be named ".gt.txt", with
    a base name identical to the image file.  For example, an image file named
    "somefile.jpg" should have a corresponding text file "somefile.gt.txt".

 b) The ground-truth text file should be located in the same directory as the
    input image file.

 c) The text should be line oriented, with each line representing a line of
    text in the image.

 d) The text should be plain text only.  No Unicode or binary encodings.
    (This limitation comes from the HTR services, which -- as of this
    writing -- return results in plain text format.)

Handprint will write the comparison results to a tab-delimited file named
after the input image and service but with the extension ".tsv".  For
example, for an input image "somefile.jpg" and results received from Google,
the comparison results will be written to "somefile.handprint-google.tsv".
(The use of a tab-delimited format rather than comma-delimited format avoids
the need to quote commas and other characters in the text.)

Handprint reports, for each text line, the number of errors (the Levenshtein
edit distance) and the character error rate (CER), and at the end it also
reports a sum total of errors.  The CER is computed as the Levenshtein edit
distance of each line divided by the number of characters in the expected
line text, multiplied by 100; this approach to normalizing the CER value is
conventional but note that it can lead to values greater than 100%.

By default, comparisons are done on an exact basis; character case is not
changed, punctuation is not removed, and stop words are not removed.
However, multiple contiguous spaces are converted to one space, and leading
spaces are removed from text lines.  If given the option -r (/r on Windows),
Handprint will relax the comparison algorithm as follows:

 i) convert all text to lower case
 ii) ignore certain sentence punctuation characters, namely , . : ;

Handprint attempts to cope with possibly-missing text in the HTR results by
matching up likely corresponding lines in the expected and received results.
It does this by comparing each line of ground-truth text to each line of the
HTR results using longest common subsequence similarity, as implemented by
the LCSSEQ function in the Python "textdistance" package.  If the lines do
not pass a threshold score, Handprint looks at subsequent lines of the HTR
results and tries to reestablish correspondence to ground truth.  If nothing
else in the HTR results appears close enough to the expected ground-truth
line, the line is assumed to be missing from the HTR results and scored
appropriately.

Additional command-line arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Handprint will send files to the different services in parallel, using a
number of process threads equal to 1/2 of the number of cores on the computer
it is running on.  (E.g., if your computer has 4 cores, it will by default use
at most 2 threads.)  The option -t (/t on Windows) can be used to change this
number.

If given the -q option (/q on Windows), Handprint will not print its usual
informational messages while it is working.  It will only print messages
for warnings or errors.  By default messages printed by Handprint are also
color-coded.  If given the option -C (/C on Windows), Handprint will not color
the text of messages it prints.  (This latter option is useful when running
Handprint within subshells inside other environments such as Emacs.)

If given the -V option (/V on Windows), this program will print the version
and other information, and exit without doing anything else.

If given the -@ argument (/@ on Windows), this program will output a detailed
trace of what it is doing to the terminal window, and will also drop into a
debugger upon the occurrence of any errors.  The debug trace will be sent to
the given destination, which can be '-' to indicate console output, or a file
path to send the output to a file.

Command-line arguments summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''

    # NOTE(review): the docstring above reads as end-user help text and ends
    # with a heading that has no body, so it is presumably completed by the
    # command-line framework -- confirm before adding anything after it.
    #
    # The single-letter string defaults ('A', 'B', 'F', 'O', 'S', 'T', 'OUT')
    # act as "option not given" sentinels; they are compared against below
    # and replaced by real default values.

    # Initial setup -----------------------------------------------------------

    debugging = debug != 'OUT'
    use_color = not no_color
    make_grid = not no_grid
    prefix = '/' if sys.platform.startswith('win') else '-'
    hint = '(Hint: use {}h for help.)'.format(prefix)
    ui = UI('Handprint', 'HANDwritten Page RecognitIoN Test', False, use_color,
            quiet)

    # Preprocess arguments and handle early exits -----------------------------

    if debugging:
        set_debug(True, debug)

    if version:
        print_version()
        exit(0)
    if list:
        inform('Known services: {}', ', '.join(services_list()))
        exit(0)

    print_intro(ui)

    if add_creds != 'A':
        # Credential installation mode: copy one file and exit.
        service = add_creds.lower()
        if service not in services_list():
            alert('Unknown service: "{}". {}', service, hint)
            exit(1)
        if len(files) != 1:
            alert('Option {}a requires one file. {}', prefix, hint)
            exit(1)
        creds_file = files[0]
        if not readable(creds_file):
            alert('File not readable: {}', creds_file)
            exit(1)
        Credentials.save_credentials(service, creds_file)
        inform('Saved credentials for service "{}".', service)
        exit(0)
    if no_grid and not extended and not compare:
        alert('{0}G without {0}e or {0}c produces no output. {1}', prefix,
              hint)
        exit(1)
    if any(item.startswith('-') for item in files):
        alert('Unrecognized option in arguments. {}', hint)
        exit(1)
    if not files and from_file == 'F':
        alert('Need images or URLs to have something to do. {}', hint)
        exit(1)
    if relaxed and not compare:
        warn('Option {0}r without {0}c has no effect. {1}', prefix, hint)

    known_services = services_list()
    if services == 'S':
        services = known_services
    else:
        # Tolerate whitespace around the commas, e.g. "google, microsoft".
        services = [name.strip() for name in services.lower().split(',')]
    unknown = [name for name in services if name not in known_services]
    if unknown:
        # Report only the offending names, not the whole requested list.
        alert('Unknown service: "{}". {}', ', '.join(unknown), hint)
        exit(1)

    base_name = 'document' if base_name == 'B' else base_name
    from_file = None if from_file == 'F' else from_file
    output_dir = None if output_dir == 'O' else output_dir
    compare = 'relaxed' if (compare and relaxed) else compare
    if threads == 'T':
        # Default: half of the available CPUs, but never fewer than 1.
        threads = max(1, int(available_cpus() / 2))
    else:
        try:
            threads = max(1, int(threads))
        except ValueError:
            alert('Number of threads must be an integer: "{}". {}', threads,
                  hint)
            exit(1)

    # Do the real work --------------------------------------------------------

    try:
        body = MainBody(base_name, extended, from_file, output_dir, threads)
        body.run(services, files, make_grid, compare)
    except (KeyboardInterrupt, UserCancelled):
        if __debug__: log('received {}', sys.exc_info()[0].__name__)
        inform('Quitting.')
        exit(0)
    except Exception as ex:
        if debugging:
            # In debug mode, show the traceback and drop into the debugger
            # rather than exiting; execution continues below afterwards.
            import traceback
            alert('{}\n{}', str(ex), traceback.format_exc())
            import pdb
            pdb.set_trace()
        else:
            alert(str(ex))
            exit(2)
    inform('Done.')
Exemple #8
0
def main(base_name='B',
         creds_dir='C',
         from_file='F',
         list=False,
         method='M',
         output='O',
         given_urls=False,
         quiet=False,
         no_annot=False,
         no_color=False,
         debug=False,
         version=False,
         *images):
    '''Handprint (a loose acronym of "HANDwritten Page RecognitIoN Test") can
run alternative text recognition methods on images of document pages.

If given the command-line flag -l (or /l on Windows), Handprint will print a
list of the known methods and then exit.  The option -m (/m on Windows) can
be used to select a specific method.  (The default method is to run them all.)

When invoked, the command-line arguments should contain one of the following:

 a) one or more directory paths or one or more image file paths, which will
    be interpreted as images (either individually or in directories) to be
    processed;

 b) if given the -u option (/u on Windows), one or more URLs, which will be
    interpreted as network locations of image files to be processed;

 c) if given the -f option (/f on Windows), a file containing either image
    paths or (if combined with the -u option), image URLs

If given URLs (via the -u option), Handprint will first download the images
found at the URLs to a local directory indicated by the option -o (/o on
Windows).  Handprint will send each image file to OCR/HTR services from
Google, Microsoft and others.  It will write the results to new files placed
either in the same directories as the original files, or (if given the -o
option) to the directory indicated by the -o option value (/o on Windows).
The results will be written in files named after the original files with the
addition of a string that indicates the method used.  For example, a file
named "somefile.jpg" will produce

  somefile.jpg
  somefile.google.txt
  somefile.google.json
  somefile.microsoft.txt
  somefile.microsoft.json
  somefile.amazon.txt
  somefile.amazon.json
  ...

and so on for each image and each service used.  The .txt files will contain
the text extracted (if any).  The .json files will contain the complete
response from the service, converted to JSON by Handprint.  In some cases,
such as Google's API, the service may offer multiple operations and will
return individual results for different API calls or options; in those cases,
Handprint combines the results of multiple API calls into a single JSON
object.

Unless given the do-not-annotate option, -A (/A on Windows), Handprint will
also generate a copy of the image with superimposed bounding boxes and text
to show the recognition results.  The annotated images will include the name
of the service; in other words, the list of files produced by Handprint will
include

  somefile.google.jpg
  somefile.microsoft.jpg
  ...

and so on.  (They are distinguished from the original unannotated image, which
will be left in somefile.jpg.)

Note that if -u (/u on Windows) is given, then an output directory MUST also
be specified using the option -o (/o on Windows) because it is not possible
to write the results in the network locations represented by the URLs.  Also,
when -u is used, the images and text results will be stored in files whose
root names have the form "document-N", where "N" is an integer.  The root
name can be changed using the -r option (/r on Windows).  The image will be
converted to ordinary JPEG format for maximum compatibility with the
different OCR services and written to "document-N.jpg", and the URL
corresponding to each document will be written in a file named
"document-N.url" so that it is possible to connect each "document-N.jpg" to
the URL it came from.

If images are too large for a method/service, then Handprint will resize them
prior to sending them.  It will write the reduced image to a file named
"FILENAME-reduced.EXT", where "FILENAME" is the original file name and "EXT"
is the file extension.  This means that if an image needs to be resized, the
results of applying the text recognition methods will be, e.g.,

  somefile-reduced.jpg
  somefile-reduced.google.txt
  somefile-reduced.google.jpg
  somefile-reduced.google.json
  somefile-reduced.microsoft.txt
  somefile-reduced.microsoft.jpg
  somefile-reduced.microsoft.json
  somefile-reduced.amazon.txt
  somefile-reduced.amazon.jpg
  somefile-reduced.amazon.json
  ...

Credentials for different services need to be provided to Handprint in the
form of JSON files.  Each service needs a separate JSON file named after the
service (e.g., "microsoft_credentials.json") and placed in a directory that
Handprint searches.  By default, Handprint searches for the files in a
subdirectory named "creds" where Handprint is installed, but an alternative
directory can be indicated at run-time using the -c command-line option (/c
on Windows).  The specific format of each credentials file is different for
each service; please consult the Handprint documentation for more details.

If given the -q option (/q on Windows), Handprint will not print its usual
informational messages while it is working.  It will only print messages
for warnings or errors.

If given the -V option (/V on Windows), this program will print version
information and exit without doing anything else.
'''

    # The single-letter string defaults ('B', 'C', 'F', 'M', 'O') act as
    # "option not given" sentinels; they are compared against below and
    # replaced by real default values.

    # Reverse some flags for easier code readability
    annotate = not no_annot

    # Prepare notification methods and hints.
    say = MessageHandlerCLI(not no_color, quiet)
    prefix = '/' if ON_WINDOWS else '-'
    hint = '(Hint: use {}h for help.)'.format(prefix)

    # Process arguments.
    if debug:
        set_debug(True)
    if version:
        print_version()
        exit()
    if list:
        say.info('Known methods:')
        for key in KNOWN_METHODS:
            say.info('   {}'.format(key))
        exit()
    if not network_available():
        exit(say.fatal_text('No network.'))

    if from_file == 'F':
        from_file = None
    else:
        if not path.isabs(from_file):
            from_file = path.realpath(path.join(os.getcwd(), from_file))
        if not path.exists(from_file):
            exit(say.error_text('File not found: {}'.format(from_file)))
        if not readable(from_file):
            exit(say.error_text('File not readable: {}'.format(from_file)))

    if not images and not from_file:
        exit(say.error_text('Need provide images or URLs. {}'.format(hint)))
    if any(item.startswith('-') for item in images):
        exit(
            say.error_text(
                'Unrecognized option in arguments. {}'.format(hint)))

    if creds_dir == 'C':
        creds_dir = path.join(handprint_path(), 'creds')
    if not readable(creds_dir):
        exit(say.error_text('Directory not readable: {}'.format(creds_dir)))
    else:
        if __debug__: log('Assuming credentials found in {}.', creds_dir)

    if method == 'M':
        method = 'all'
    method = method.lower()
    if method != 'all' and method not in KNOWN_METHODS:
        exit(
            say.error_text('"{}" is not a known method. {}'.format(
                method, hint)))

    if output == 'O':
        output = None
    else:
        if not path.isabs(output):
            output = path.realpath(path.join(os.getcwd(), output))
        if path.isdir(output):
            if not writable(output):
                exit(
                    say.error_text(
                        'Directory not writable: {}'.format(output)))
        else:
            # makedirs() also creates missing parent directories.  This runs
            # outside the try block below, so report failures explicitly
            # instead of letting a raw OSError traceback reach the user.
            try:
                os.makedirs(output)
            except OSError as err:
                exit(
                    say.error_text('Unable to create directory {}: {}'.format(
                        output, str(err))))
            if __debug__: log('Created output directory {}', output)
    if given_urls and not output:
        exit(say.error_text('Must provide an output directory if using URLs.'))
    if base_name != 'B' and not given_urls:
        exit(
            say.error_text(
                'Option {}r can only be used with URLs.'.format(prefix)))
    if base_name == 'B':
        base_name = 'document'

    # Create a list of files to be processed.
    targets = targets_from_arguments(images, from_file, given_urls, say)
    if not targets:
        exit(say.warn_text('No images to process; quitting.'))

    # Let's do this thing.
    try:
        num_items = len(targets)
        print_separators = num_items > 1 and not say.be_quiet()
        if method == 'all':
            # Order doesn't really matter; just make it consistent run-to-run.
            methods = sorted(KNOWN_METHODS.values(), key=lambda x: str(x))
            say.info(
                'Will apply all known methods to {} images.'.format(num_items))
        else:
            methods = [KNOWN_METHODS[method]]
            say.info('Will apply method "{}" to {} images.'.format(
                method, num_items))
        for index, item in enumerate(targets, start=1):
            if print_separators:
                say.msg('=' * 70, 'dark')
            run(methods, item, index, base_name, output, creds_dir, annotate,
                say)
        if print_separators:
            say.msg('=' * 70, 'dark')
    except (KeyboardInterrupt, UserCancelled):
        exit(say.info_text('Quitting.'))
    except ServiceFailure as err:
        exit(say.error_text(str(err)))
    except Exception as err:
        # Import locally so that this last-resort handler does not depend on
        # a module-level import of traceback being present.
        import traceback
        if debug:
            import pdb
            pdb.set_trace()
        exit(say.error_text('{}\n{}'.format(str(err), traceback.format_exc())))
    say.info('Done.')
Exemple #9
0
def main(add_creds='A',
         base_name='B',
         no_color=False,
         extended=False,
         from_file='F',
         no_grid=False,
         list=False,
         output_dir='O',
         quiet=False,
         services='S',
         threads='T',
         version=False,
         debug=False,
         *files):
    '''Handprint (a loose acronym of "HANDwritten Page RecognitIoN Test") runs
alternative text recognition services on images of handwritten document pages.

Installing credentials for cloud-based services
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If given the command-line flag -l (or /l on Windows), Handprint will print a
list of the known services, and exit.

Before a given service can be used, if it is a cloud-based commercial OCR/HTR
service, Handprint needs to be supplied with user credentials for accessing
that service.  The credentials must be stored in a JSON file with a certain
format; see the Handprint user documentation for details about the formats
for each service.  To add a new credentials file, use the -a option (/a on
Windows) in combination with the name of a service and a single file path on
the command line.  The name supplied right after the -a option must be the
name of a recognized service (such as "google", "amazon", "microsoft"), and
the file argument must be a JSON file containing the credentials data in the
required format for that service.  Here is an example of adding credentials
for Google (assuming you created the JSON file as described in the docs):

  handprint -a google mygooglecreds.json

Run Handprint with the -a option multiple times to install credentials for
each different service.  Handprint will copy the credential files to its own
configuration directory and exit without doing anything else.  The directory
is different on different operating systems; for example, on macOS it
is ~/Library/Application Support/Handprint/.

Basic usage
~~~~~~~~~~~

After credentials are installed, running Handprint without the -a option will
invoke one or more OCR/HTR services on files, directories of files, or URLs.
The image paths or URLs can be supplied in any of the following ways:

 a) one or more directory paths or one or more image file paths, which will
    be interpreted as images (either individually or in directories) to be
    processed;

 b) one or more URLs, which will be interpreted as network locations of image
    files to be processed; or

 c) if given the -f option (/f on Windows), a file containing either image
    paths or image URLs.

If given URLs, Handprint will first download the images found at the URLs to
a local directory indicated by the option -o (/o on Windows).  Handprint can
accept input images in JPEG, PNG, GIF, BMP, and TIFF formats.  To make the
results from different services more easily comparable, Handprint will always
convert all input images to the same format (PNG) no matter if some services
may accept other formats; it will also resize input images to the smallest
size accepted by any of the services invoked if an image exceeds that size.
(For example, if service A accepts files up to 10 MB in size and service B
accepts files up to 5 MB, all input images will be resized to 5 MB before
sending them to A and B, even if A could accept a higher-resolution image.)

The default action is to run all known services; the option -s (/s on
Windows) can be used to select only one service or a list of services
instead.  Lists of services should be separated by commas; e.g.,
"google,microsoft".

When performing OCR/HTR on images, Handprint temporarily (unless the -e
option is given -- see below) writes the results to new files that it creates
either in the same directories as the original files, or (if given the -o
option) the directory indicated by the -o option (/o on Windows).  The
results will be written in files named after the original files with the
addition of a string that indicates the service used.  For example, a file
named "somefile.jpg" will result in

  somefile.google.png
  somefile.microsoft.png
  somefile.amazon.png
  ...

and so on for each image and each service used.  These files are deleted
after the final results grid image is created, unless the -e option (/e on
Windows) is used to indicate that extended results should be produced; in that
case, these individual annotated image files are kept.

After gathering the results of each service for a given input, Handprint will
create a single compound image consisting of all the annotated results images
arranged in a grid.  This is intended to make it easier to compare the
results of multiple services against each other.  To skip the creation of the
results grid, use the -G option (/G on Windows).  The grid image will be named

  somefile.all-results.png

If given the -e option (/e on Windows), Handprint will produce extended
output that includes the complete response from the service (converted to a
JSON file by Handprint) and the text extracted (stored as a .txt file).  The
output of -e will be multiple files like this:

  somefile.google.png
  somefile.google.json
  somefile.google.txt
  somefile.microsoft.png
  somefile.microsoft.json
  somefile.microsoft.txt
  somefile.amazon.png
  somefile.amazon.json
  somefile.amazon.txt
  ...

The files will be written to the directory indicated by -o, or (if -o is not
used) the directory where "somefile" is located.  When -o is not used and
the input images are given as URLs, then the files are written to the current
working directory instead.

If an image is too large for any of the services invoked, then Handprint will
resize it prior to sending the image to any of the services (as noted above).
It will write the reduced image to a file named "FILENAME-reduced.EXT", where
"FILENAME" is the original file name and "EXT" is the file extension.  This
means that if an image needs to be resized, the results of applying the text
recognition services will be, e.g.,

  somefile-reduced.png
  somefile-reduced.google.png
  somefile-reduced.google.json
  somefile-reduced.google.txt
  somefile-reduced.microsoft.png
  somefile-reduced.microsoft.json
  somefile-reduced.microsoft.txt
  somefile-reduced.amazon.json
  somefile-reduced.amazon.png
  somefile-reduced.amazon.txt
  ...

When the inputs are URLs, Handprint must download a copy of the image located
at the network address (because it is not possible to write the results in
the network locations represented by the URLs.).  The images and other
results will be stored in files whose root names have the form "document-N",
where "N" is an integer.  The root name can be changed using the -b option
(/b on Windows).  The image at networked locations will be converted to
ordinary PNG format for maximum compatibility with the different OCR
services and written to "document-N.png", and the URL corresponding to each
document will be written in a file named "document-N.url" so that it is
possible to connect each "document-N.png" to the URL it came from.

Finally, note that the use of the -G option (/G on Windows) WITHOUT the -e
option is an error because it means no output would be produced.

Additional command-line arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Handprint will send files to the different services in parallel, using a
number of process threads equal to 1/2 of the number of cores on the computer
it is running on.  (E.g., if your computer has 4 cores, it will by default use
at most 2 threads.)  The option -t (/t on Windows) can be used to change this
number.

If given the -q option (/q on Windows), Handprint will not print its usual
informational messages while it is working.  It will only print messages
for warnings or errors.  By default messages printed by Handprint are also
color-coded.  If given the option -C (/C on Windows), Handprint will not color
the text of messages it prints.  (This latter option is useful when running
Handprint within subshells inside other environments such as Emacs.)

If given the -V option (/V on Windows), this program will print the version
and other information, and exit without doing anything else.

If given the -@ option (/@ on Windows), this program will print additional
diagnostic output as it runs; in addition, it will start the Python debugger
(pdb) when an exception occurs, instead of simply exiting.

Command-line arguments summary
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''

    # Notes for maintainers (kept out of the docstring, which presumably
    # doubles as the user-facing help text -- confirm against the caller):
    #
    #  * The single-letter string defaults 'A', 'B', 'F', 'O' and 'S' are
    #    sentinels meaning "option not given"; each one is tested for below
    #    and replaced by None or a real default.  The 'T' default of
    #    'threads' is not interpreted here; it is handed to MainBody as-is.
    #
    #  * The parameter named 'list' shadows the builtin inside this function,
    #    so the builtin list() must not be called in this body.
    #
    #  * 'files' collects the positional arguments (image paths, URLs, or the
    #    credentials file when the add_creds option is used).

    # Initial setup -----------------------------------------------------------

    say = MessageHandlerCLI(not no_color, quiet)
    prefix = '/' if sys.platform.startswith('win') else '-'
    hint = '(Hint: use {}h for help.)'.format(prefix)
    make_grid = not no_grid

    # Preprocess arguments and handle early exits -----------------------------

    if debug:
        set_debug(True)
    if version:
        print_version()
        exit()
    if list:
        say.info('Known services: {}'.format(', '.join(services_list())))
        exit()

    if add_creds != 'A':
        # Credentials installation mode: exactly one positional argument (the
        # credentials file) is expected, and we exit as soon as it is saved.
        service = add_creds.lower()
        if service not in services_list():
            exit(
                say.error_text('Unknown service: "{}". {}'.format(
                    service, hint)))
        if len(files) != 1:
            exit(
                say.error_text('Option {}a requires one file. {}'.format(
                    prefix, hint)))
        creds_file = files[0]
        if not readable(creds_file):
            exit(say.error_text('File not readable: {}'.format(creds_file)))
        Credentials.save_credentials(service, creds_file)
        exit(
            say.info_text(
                'Saved credentials for service "{}".'.format(service)))

    if no_grid and not extended:
        exit(
            say.error_text('{}G without {}e produces no output. {}'.format(
                prefix, prefix, hint)))
    # A positional argument that starts with a dash is treated as a mistyped
    # option rather than as a file name.
    if any(item.startswith('-') for item in files):
        exit(
            say.error_text(
                'Unrecognized option in arguments. {}'.format(hint)))
    if not files and from_file == 'F':
        exit(say.error_text('Need provide images or URLs. {}'.format(hint)))

    if services == 'S':
        services = services_list()
    else:
        # Tolerate whitespace around the commas, e.g. "google, microsoft".
        services = [name.strip() for name in services.lower().split(',')]
    known_services = services_list()
    unknown = [name for name in services if name not in known_services]
    if unknown:
        # Report only the offending names, not the whole requested list.
        exit(
            say.error_text('Unknown service(s): {}. {}'.format(
                ', '.join('"{}"'.format(name) for name in unknown), hint)))

    base_name = 'document' if base_name == 'B' else base_name
    from_file = None if from_file == 'F' else from_file
    output_dir = None if output_dir == 'O' else output_dir

    # Do the real work --------------------------------------------------------

    try:
        print_intro(say)
        body = MainBody(base_name, extended, from_file, output_dir, threads,
                        say)
        body.run(services, files, make_grid)
    except (KeyboardInterrupt, UserCancelled) as ex:
        if __debug__: log('received {}', sys.exc_info()[0].__name__)
        exit(say.info_text('Quitting.'))
    except Exception as ex:
        if debug:
            # In debug mode, show the traceback and drop into the debugger
            # instead of exiting; control then continues to the final
            # message below once the debugger session ends.
            import traceback
            say.error('{}\n{}'.format(str(ex), traceback.format_exc()))
            import pdb
            pdb.set_trace()
        else:
            exit(say.error_text(str(ex)))
    say.info('Done.')
Exemple #10
0
def image_size(file):
    '''Return the size in bytes of the image stored in 'file'.

    The result is 0 when 'file' is empty or None, or when readable() reports
    that the file cannot be read.
    '''
    # Short-circuit so that readable() is only consulted for a non-empty path.
    usable = bool(file) and readable(file)
    return path.getsize(file) if usable else 0