Ejemplo n.º 1
0
    def _do_main_work(self):
        '''Process every target image with the configured services.

        The targets come from self.targets_from_arguments().  After telling
        the user what is about to happen, each target is handed in turn to
        self._manager.run_services() together with its 1-based index and
        self.base_name.

        Raises CannotProceed(ExitCode.bad_arg) if there are no targets.
        '''
        # Gather up some things and get prepared.
        targets = self.targets_from_arguments()
        if not targets:
            alert_fatal('No images to process; quitting.')
            raise CannotProceed(ExitCode.bad_arg)
        num_targets = len(targets)

        inform(f'Given {pluralized("image", num_targets, True)} to work on.')
        # The template has exactly two slots: the pluralized service count
        # and the comma-separated service names.
        inform('Will apply results of {}: {}'.format(
            pluralized('service', len(self.services), True),
            ', '.join(self.services)))
        inform(
            f'Will use credentials stored in {Credentials.credentials_dir()}/.'
        )
        if self.extended:
            inform('Will save extended results.')
        # There is no point in announcing more threads than services.
        num_threads = min(self.threads, len(self.services))
        inform(f'Will use up to {num_threads} process threads.')

        # Get to work.
        if __debug__: log('initializing manager and starting processes')
        import shutil
        # Separator rules are only printed when there is more than one item.
        print_separators = num_targets > 1
        # Fall back to 80 columns if the terminal width is reported as 0.
        rule = '─' * (shutil.get_terminal_size().columns or 80)
        for index, item in enumerate(targets, start=1):
            # Check whether we've been interrupted before doing another item.
            raise_for_interrupts()
            # Process next item.
            if print_separators:
                inform(rule)
            self._manager.run_services(item, index, self.base_name)
        if print_separators:
            inform(rule)
Ejemplo n.º 2
0
    def record_for_file(self, file):
        '''Look up the Zotero record corresponding to the given local PDF file.

        Returns a tuple (record, error).  On success, "record" is a
        ZoteroRecord and "error" is None.  If no library has the item, or the
        item has no parent record, "record" is None and "error" is a string
        describing the problem.

        Raises ValueError if "file" does not exist.  Exceptions other than
        "resource not found" raised by the library lookup are logged and
        re-raised.
        '''
        f = antiformat(file)
        if not path.exists(file):
            # The file should always exist, because of how the list of files is
            # gathered, so something is wrong but we don't know what. Give up.
            raise ValueError(f'File not found: {f}')

        # Zotero stores content in the subdirectory like .../storage/N743ZXDF.
        # The item key is the alphanumeric directory name.
        # Given the key, there's no way to know whether the record is in a user
        # library or a group library, so we have to iterate over the options.
        itemkey = path.basename(path.dirname(file))
        record = None
        for library in self._libraries:
            found = False
            try:
                record = library.item(itemkey)
                found = True
                if __debug__: log(f'{itemkey} found in library {library.library_id}')
            except zotero_errors.ResourceNotFound:
                if __debug__: log(f'{itemkey} not found in library {library.library_id}')
            except KeyboardInterrupt as ex:
                if __debug__: log(f'interrupted: {str(ex)}')
                raise
            except Exception as ex:
                if __debug__: log(f'got exception {str(ex)}')
                raise
            # pyzotero calls urllib3 for network connections. The latter uses
            # try-except clauses that broadly catch all Exceptions but don't
            # check for KeyboardInterrupt. Thus, ^C during a network call
            # will show up as a failure to return data, not a KeyboardInterrupt.
            # This check must therefore run after every lookup, whether or
            # not the lookup found anything, before deciding what to do next.
            raise_for_interrupts()
            if found:
                break
        if not record:
            if __debug__: log(f'could not find a record for item key "{itemkey}"')
            return (None, f'Unable to retrieve Zotero record for {f}')

        # If the PDF isn't associated with a bib record, it won't have a parent.
        parentkey = self.parent_key(record, file)
        if not parentkey:
            if __debug__: log(f'file not associated with a parent record: {f}')
            return (None, f'File lacks a parent Zotero record: {f}')

        # We have an item record and a parent. We are happy campers.
        if __debug__: log(f'{parentkey} is parent of {itemkey} for {f}')
        r = ZoteroRecord(key = itemkey, parent_key = parentkey, file = file,
                         link = self.item_link(record, file), record = record)
        return (r, None)
Ejemplo n.º 3
0
    def __init__(self, key, user_id, use_keyring):
        '''Validate the credentials and connect to the user's Zotero libraries.

        "key" is the API key and "user_id" the Zotero user ID; either may be
        None.  "use_keyring" controls whether missing values may be read
        from, and the final values saved to, the keyring.

        On return, self._libraries holds a Zotero object for the user library
        followed by one for each group library that could be retrieved.

        Raises CannotProceed(ExitCode.bad_arg) if a supplied value is
        malformed or the server rejects the credentials.  Any other
        connection failure for the user library is re-raised after alerting
        the user; a failure while retrieving groups is only reported.
        '''
        if key and (not key.isalnum() or len(key) < 20):
            alert_fatal(f'"{key}" does not appear to be a valid API key.')
            raise CannotProceed(ExitCode.bad_arg)
        if user_id and not user_id.isdigit():
            alert_fatal(f'"{user_id}" does not appear to be a Zotero user ID.')
            raise CannotProceed(ExitCode.bad_arg)

        # If the user supplied all the values on the command line, those are
        # the values used.  If none were supplied by the user on the command
        # line, all the values are retrieved from the user's keyring.  If
        # some were supplied and others missing, they're filled in from
        # either the keyring or by prompting the user.

        if __debug__: log('keyring ' + ('enabled' if use_keyring else 'disabled'))
        if key is None and user_id is None and use_keyring:
            # We weren't given a key and id, but we can look in the keyring.
            if __debug__: log('getting id & key from keyring')
            key, user_id = keyring_credentials()
        if not key:
            key = validated_input('API key', key, lambda x: x.isalnum())
        if not user_id:
            user_id = validated_input('User ID', user_id, lambda x: x.isdigit())
        if use_keyring:
            if __debug__: log('saving credentials to keyring')
            save_keyring_credentials(key, user_id)

        self._key = key
        self._user_id = user_id

        # Get connected and store the Zotero connection object for the user
        # library first, then look up the group libraries that the user can
        # access and create Zotero objects for that too.  The way we use them,
        # we don't need to separate between them, so they all go into one list.

        self._libraries = []
        try:
            if __debug__: log(f'connecting to Zotero as user {user_id}')
            user = zotero.Zotero(user_id, 'user', key)
            # pyzotero will return an object but that doesn't mean the user
            # actually gave valid credentials. Need to try an operation.
            user.count_items()
            self._libraries.append(user)
            raise_for_interrupts()
        except zotero_errors.UserNotAuthorised as ex:
            if __debug__: log(f'got exception {str(ex)}')
            alert_fatal('Unable to connect to Zotero: invalid ID and/or API key.',
                        'The Zotero servers rejected attempts to connect.')
            raise CannotProceed(ExitCode.bad_arg)
        except KeyboardInterrupt as ex:
            if __debug__: log(f'got exception {str(ex)}')
            raise
        except Exception as ex:
            if __debug__: log(f'failed to create Zotero user object: {str(ex)}')
            alert_fatal('Unable to connect to Zotero API.')
            raise

        # Group libraries are a best-effort addition: a failure here is
        # reported to the user but does not stop the program.
        try:
            for group in user.groups():
                if __debug__: log(f'user can access group id {group["id"]}')
                self._libraries.append(zotero.Zotero(group['id'], 'group', key))
                raise_for_interrupts()
        except KeyboardInterrupt as ex:
            if __debug__: log(f'got exception {str(ex)}')
            raise
        except Exception as ex:
            if __debug__: log(f'failed to create Zotero group object: {str(ex)}')
            alert('Unable to retrieve Zotero group library; proceeding anyway.')
Ejemplo n.º 4
0
    def _result_from_api(self, path):
        '''Submit the image at "path" to Microsoft and return the analysis.

        On success, the return value is the decoded JSON of the final polling
        response.  On failure, the return value is either the error value
        produced by self._image_from_file(), a tuple produced by self._api(),
        or a TRResult whose "error" field describes the problem.

        Raises ServiceFailure if the response to the submission has no
        Operation-Location header.
        '''
        # Nothing can be done without the image content.
        (image, error) = self._image_from_file(path)
        if error:
            return error

        def failure(reason):
            # Wrap a failure description in an otherwise empty result.
            return TRResult(path = path, data = {}, text = '',
                            boxes = [], error = reason)

        endpoint = self._credentials['endpoint']
        key = self._credentials['subscription_key']
        url = f'{endpoint}/vision/v3.2/read/analyze'
        headers = {'Ocp-Apim-Subscription-Key': key,
                   'Content-Type': 'application/octet-stream'}

        # The Microsoft API works in 2 phases.  Phase 1: submit the image.
        if __debug__: log(f'contacting Microsoft for {relative(path)}')
        response = self._api('post', url, headers, image)
        if isinstance(response, tuple):
            # A tuple from _api() signals an error; pass it along untouched.
            return response

        if 'Operation-Location' not in response.headers:
            if __debug__: log('no operation-location in response headers')
            raise ServiceFailure('Unexpected response from Microsoft server')
        poll_url = response.headers['Operation-Location']

        # Phase 2: poll the given URL until the text is ready (or it fails).
        if __debug__: log('polling MS for results ...')
        while True:
            raise_for_interrupts()
            # Have never seen results returned in 1 s, and meanwhile, polling
            # still counts against our rate limit.  Wait 2 s to reduce calls.
            wait(2)
            response = self._api('get', poll_url, headers = headers, polling = True)
            if isinstance(response, tuple):
                return response

            # Sometimes the response has no content.  It's not clear what
            # else can be done except to keep trying.
            if not response.text:
                if __debug__: log('received empty result from Microsoft.')
                continue

            analysis = response.json()
            if 'status' not in analysis:
                # No status key in JSON results means something's wrong.
                return failure('Error: Microsoft results not in expected format')

            status = analysis['status']
            if status in ('notStarted', 'running'):
                if __debug__: log('Microsoft still processing image')
                continue
            if status == 'succeeded':
                if __debug__: log('Microsoft returned success code')
                break
            if status == 'failed':
                return failure('Microsoft analysis failed')
            return failure('Error: Microsoft returned unexpected result')

        if __debug__: log(f'results received from Microsoft for {relative(path)}')
        return analysis
Ejemplo n.º 5
0
    def _send(self, image, service):
        '''Get results from service named "service" for the "image".

        If reuse of saved results is enabled and a readable JSON file from a
        previous run exists, the results are rebuilt from it; otherwise the
        service is contacted.  An annotated image is then written and,
        depending on the settings, the raw data, the extracted text and a
        comparison against a ground-truth file as well.

        Returns a Result, or None if the service reported an error.
        Raises AuthFailure (prefixed with the service name) if the service
        reports an authentication problem.
        '''

        service_name = f'[{service.name_color()}]{service.name()}[/]'
        base_path = path.join(image.dest_dir, path.basename(image.file))
        json_file = self._renamed(base_path, str(service), 'json')

        saved_results = None
        if self._reuse_json and readable(json_file):
            inform(
                f'Reading saved results for {service_name} from {relative(json_file)}'
            )
            with open(json_file, 'r') as f:
                saved_results = json.load(f)
            output = service.result(image.file, saved_results)
        else:
            inform(f'Sending to {service_name} and waiting for response ...')
            last_time = timer()
            try:
                output = service.result(image.file, None)
            except AuthFailure as ex:
                raise AuthFailure(f'Service {service}: {str(ex)}') from ex
            except RateLimitExceeded:
                # Pause only for whatever remains of the minimum interval
                # between calls, then try again.  The retry must happen even
                # when no pause is needed; otherwise control would fall
                # through to the code below with "output" never assigned.
                time_passed = timer() - last_time
                min_interval = 1 / service.max_rate()
                if time_passed < min_interval:
                    warn(f'Pausing {service_name} due to rate limits')
                    wait(min_interval - time_passed)
                    warn(f'Continuing {service_name}')
                return self._send(image, service)
            if output.error:
                # Sanitize the error string in case it contains '{' characters.
                msg = output.error.replace('{', '{{{{').replace('}', '}}}}')
                alert(f'{service_name} failed: {msg}')
                warn(
                    f'No result from {service_name} for {relative(image.file)}'
                )
                return None
            inform(f'Got result from {service_name}.')

        raise_for_interrupts()
        inform(f'Creating annotated image for {service_name}.')
        annot_path = self._renamed(base_path, str(service), 'png')
        report_path = None
        from handprint.images import annotated_image
        # Image annotation and saving are done while holding the lock.
        with self._lock:
            img = annotated_image(image.file, output.boxes, service,
                                  self._text_size, self._text_color,
                                  self._text_shift, self._display,
                                  self._confidence)
            self._save(img, annot_path)

        # Results that were read from a saved file are not written again.
        if self._extended_results and (saved_results is None):
            inform(f'Saving all data for {service_name}.')
            raw_json = json.dumps(output.data, sort_keys=True, indent=2)
            self._save(raw_json, json_file)
            inform(f'Saving extracted text for {service_name}.')
            txt_file = self._renamed(base_path, str(service), 'txt')
            self._save(output.text, txt_file)
        if self._compare:
            gt_file = alt_extension(image.item_file, 'gt.txt')
            gt_path = relative(gt_file)
            report_path = self._renamed(image.item_file, str(service), 'tsv')
            relaxed = (self._compare == 'relaxed')
            # Test readability first so that a missing or unreadable file is
            # reported as unavailable rather than as empty.
            if not readable(gt_file):
                warn(
                    f'Skipping {service_name} comparison because {gt_path} not available'
                )
            elif not nonempty(gt_file):
                warn(
                    f'Skipping {service_name} comparison because {gt_path} is empty'
                )
            else:
                if __debug__: log(f'reading ground truth from {gt_file}')
                with open(gt_file, 'r') as gt:
                    gt_text = gt.read()
                inform(f'Saving {service_name} comparison to ground truth')
                from handprint.comparison import text_comparison
                self._save(text_comparison(output.text, gt_text, relaxed),
                           report_path)
        return Result(service, image, annot_path, report_path)
Ejemplo n.º 6
0
    def amazon_result(self, file_path, variant, method, image_keyword,
                      result_key, value_key, block_key, result):
        '''Returns the result from calling the service on the 'file_path'.
        The result is returned as an TRResult named tuple.

        "variant" is the name of the Amazon client to create and "method" the
        name of the client method to call; the image bytes are passed to it
        under the keyword "image_keyword".  "result_key" is the key of the
        block list in the response, "value_key" the key in each block that
        gives its kind, and "block_key" the key that gives its text.  If
        "result" is given (e.g., from a previous run), the service is not
        contacted and "result" is parsed instead.

        Raises AuthFailure for credential problems, CorruptedContent if
        Amazon rejects the image, and ServiceFailure for any other error
        reported while calling the service.
        '''

        # Delay loading the API packages until needed because they take time to
        # load.  Doing this speeds up overall application start time.
        import boto3
        import botocore

        if not result:
            # If any exceptions occur, let them be passed to caller.
            (image, error) = self._image_from_file(file_path)
            if error:
                return TRResult(path=file_path,
                                data={},
                                boxes=[],
                                text='',
                                error=error)
            try:
                if __debug__:
                    log(f'setting up Amazon client function "{variant}"')
                creds = self._credentials
                session = boto3.session.Session()
                client = session.client(
                    variant,
                    region_name=creds['region_name'],
                    aws_access_key_id=creds['aws_access_key_id'],
                    aws_secret_access_key=creds['aws_secret_access_key'])
                if __debug__: log('calling Amazon API function')
                result = getattr(client, method)(**{
                    image_keyword: {
                        'Bytes': image
                    }
                })
                if __debug__: log(f'received {len(result[result_key])} blocks')
            except botocore.exceptions.EndpointConnectionError as ex:
                raise AuthFailure(
                    f'Problem with credentials file -- {str(ex)}') from ex
            except KeyboardInterrupt:
                raise
            except KeyError as ex:
                # The args of an exception are not guaranteed to be strings.
                missing = ",".join(str(arg) for arg in ex.args)
                msg = f'Amazon credentials file is missing {missing}'
                raise AuthFailure(msg) from ex
            except Exception as ex:
                # Start from the generic description and refine it if the
                # exception carries a structured error response.
                where = relative(file_path)
                reason = str(ex)
                response = getattr(ex, 'response', None)
                if response and 'Error' in response:
                    details = response['Error']
                    code = details.get('Code', '')
                    reason = details.get('Message', reason)
                    if code in [
                            'UnsupportedDocumentException',
                            'BadDocumentException'
                    ]:
                        msg = f'Amazon {variant} reports bad or corrupted image in {where}'
                        raise CorruptedContent(msg) from ex
                    elif code in [
                            'InvalidSignatureException',
                            'UnrecognizedClientException'
                    ]:
                        raise AuthFailure(
                            f'Problem with credentials file -- {reason}') from ex
                # Fallback if we can't get details.
                if __debug__: log(f'Amazon returned exception {str(ex)}')
                msg = f'Amazon {variant} failure for {where} -- {reason}'
                raise ServiceFailure(msg) from ex

        raise_for_interrupts()
        lines = []
        boxes = []
        width, height = imagesize.get(file_path)
        if __debug__: log(f'parsing Amazon result for {relative(file_path)}')
        for block in result[result_key]:
            if value_key not in block:
                continue
            kind = block[value_key].lower()
            if kind in ['word', 'line']:
                text = block[block_key]
                polygon = block['Geometry']['Polygon']
                corners = corner_list(polygon, width, height)
                if corners:
                    boxes.append(
                        Box(kind=kind,
                            bb=corners,
                            text=text,
                            score=block['Confidence'] / 100))
                else:
                    # Something's wrong with the vertex list. Skip & continue.
                    if __debug__: log(f'bad bb for {text}: {polygon}')
            if kind == "line":
                # The key holding the text is one of these two names.
                if 'Text' in block:
                    lines.append(block['Text'])
                elif 'DetectedText' in block:
                    lines.append(block['DetectedText'])
        # Every line of text, including the last, ends with a newline.
        full_text = ''.join(line + '\n' for line in lines)
        return TRResult(path=file_path,
                        data=result,
                        boxes=boxes,
                        text=full_text,
                        error=None)
Ejemplo n.º 7
0
    def result(self, path, result=None):
        '''Returns the result from calling the service on the file 'path'.
        The result is returned as an TRResult named tuple.

        If "result" is given (e.g., from a previous run), Google is not
        contacted and "result" is parsed instead.  If the image cannot be
        read, the error value from self._image_from_file() is returned as-is.

        Raises AuthFailure for permission or credentials problems and
        ServiceFailure if the service is unavailable.  Other errors while
        calling the service are reported in the "error" field of the
        returned TRResult.
        '''

        # Delay loading the API packages until needed because they take time to
        # load.  Doing this speeds up overall application start time.
        import google
        from google.cloud import vision_v1 as gv
        from google.api_core.exceptions import PermissionDenied
        from google.protobuf.json_format import MessageToDict

        if not result:
            # Read the image and proceed with contacting the service.
            (image, error) = self._image_from_file(path)
            if error:
                return error

            if __debug__:
                log(f'building Google API object for {relative(path)}')
            try:
                client = gv.ImageAnnotatorClient()
                params = gv.TextDetectionParams(
                    mapping={'enable_text_detection_confidence_score': True})
                context = gv.ImageContext(language_hints=['en-t-i0-handwrit'],
                                          text_detection_params=params)
                img = gv.Image(content=image)
                if __debug__:
                    log(f'sending image to Google for {relative(path)} ...')
                response = client.document_text_detection(
                    image=img, image_context=context)
                if __debug__:
                    log(f'received result from Google for {relative(path)}')
                result = dict_from_response(response)
            except google.api_core.exceptions.PermissionDenied as ex:
                text = f'Authentication failure for Google service -- {ex}'
                raise AuthFailure(text) from ex
            except google.auth.exceptions.DefaultCredentialsError as ex:
                text = f'Credentials file error for Google service -- {ex}'
                raise AuthFailure(text) from ex
            except google.api_core.exceptions.ServiceUnavailable as ex:
                text = f'Network, service, or Google configuration error -- {ex}'
                raise ServiceFailure(text) from ex
            except KeyboardInterrupt:
                raise
            except Exception as ex:
                if isinstance(ex, KeyError):
                    # Can happen if you control-C in the middle of the Google call.
                    # Result is "Exception ignored in: 'grpc._cython.cygrpc._next'"
                    # printed to the terminal and we end up here.
                    raise KeyboardInterrupt
                else:
                    text = f'Error: {str(ex)} -- {path}'
                    return TRResult(path=path,
                                    data={},
                                    boxes=[],
                                    text='',
                                    error=text)

        raise_for_interrupts()
        boxes = []
        # See this page for more information about the structure:
        # https://cloud.google.com/vision/docs/handwriting#python
        annotation = result['full_text_annotation']
        pages = annotation['pages']
        if len(pages) > 1:
            warn('More than one page received from Google; using only first.')
        # An empty page list simply produces no boxes.
        blocks = pages[0]['blocks'] if pages else []
        for block in blocks:
            for para in block['paragraphs']:
                corners = corner_list(para['bounding_box']['vertices'])
                boxes.append(
                    Box(bb=corners,
                        kind='para',
                        text='',
                        score=para['confidence']))
                for word in para['words']:
                    text = ''.join(symbol['text'] for symbol in word['symbols'])
                    vertices = word['bounding_box']['vertices']
                    corners = corner_list(vertices)
                    if corners:
                        # NOTE(review): the score recorded for a word is the
                        # confidence of its enclosing paragraph -- confirm
                        # that this is intended.
                        boxes.append(
                            Box(bb=corners,
                                kind='word',
                                text=text,
                                score=para['confidence']))
                    else:
                        # Something is wrong with the vertex list.
                        # Skip it and continue.
                        if __debug__: log(f'bad bb for {text}: {vertices}')
        full_text = annotation['text']
        return TRResult(path=path,
                        data=result,
                        boxes=boxes,
                        text=full_text,
                        error=None)