def _do_main_work(self):
    '''Announce the run configuration, then process each target image.

    The targets come from self.targets_from_arguments().  If there are
    none, a fatal alert is issued and CannotProceed is raised with
    ExitCode.bad_arg.  Otherwise each target is handed, with its 1-based
    position, to the manager's run_services().  When more than one target
    is being processed, a horizontal rule is printed before every item and
    once more after the last one.
    '''
    import shutil

    # Collect the inputs and bail out early if there is nothing to do.
    images = self.targets_from_arguments()
    if not images:
        alert_fatal('No images to process; quitting.')
        raise CannotProceed(ExitCode.bad_arg)

    # Tell the user what is about to happen.
    total = len(images)
    inform(f'Given {pluralized("image", total, True)} to work on.')
    service_count = pluralized('service', len(self.services), True)
    service_names = ', '.join(self.services)
    inform('Will apply results of {}: {}'.format(service_count, service_names))
    inform(f'Will use credentials stored in {Credentials.credentials_dir()}/.')
    if self.extended:
        inform('Will save extended results.')
    thread_limit = min(self.threads, len(self.services))
    inform(f'Will use up to {thread_limit} process threads.')

    # Do the actual work, one image at a time.
    if __debug__: log('initializing manager and starting processes')
    several = total > 1
    separator = '─' * (shutil.get_terminal_size().columns or 80)
    for position, image in enumerate(images, start=1):
        # Stop before starting another item if the user interrupted us.
        raise_for_interrupts()
        if several:
            inform(separator)
        self._manager.run_services(image, position, self.base_name)
    if several:
        inform(separator)
def record_for_file(self, file):
    '''Look up the Zotero record that corresponds to a local PDF file.

    Returns a tuple (record, error).  On success, "record" is a
    ZoteroRecord and "error" is None.  If none of the known libraries has
    the item, or the item has no parent record, "record" is None and
    "error" is a message describing the problem.

    Raises ValueError if "file" does not exist.  Exceptions from the
    library lookup other than "resource not found" are logged and
    re-raised.
    '''
    shown = antiformat(file)
    if not path.exists(file):
        # The list of files is gathered from disk, so a missing file means
        # something unexpected happened.  There is nothing sensible to do.
        raise ValueError(f'File not found: {shown}')

    # Zotero keeps attachments in directories such as .../storage/N743ZXDF,
    # and the directory name is the item key.  The key alone does not say
    # whether the item lives in the user library or a group library, so
    # every library is tried in turn.
    itemkey = path.basename(path.dirname(file))
    record = None
    for library in self._libraries:
        try:
            record = library.item(itemkey)
            if __debug__: log(f'{itemkey} found in library {library.library_id}')
            break
        except zotero_errors.ResourceNotFound:
            if __debug__: log(f'{itemkey} not found in library {library.library_id}')
            continue
        except KeyboardInterrupt as ex:
            if __debug__: log(f'interrupted: {str(ex)}')
            raise
        except Exception as ex:
            if __debug__: log(f'got exception {str(ex)}')
            raise

    # pyzotero uses urllib3 for its network calls, and urllib3's broad
    # exception handlers can absorb a ^C.  An interrupt during a lookup
    # therefore looks like "no data returned", so check for it explicitly.
    raise_for_interrupts()

    if not record:
        if __debug__: log(f'could not find a record for item key "{itemkey}"')
        return (None, f'Unable to retrieve Zotero record for {shown}')

    # A PDF that is not attached to a bibliographic record has no parent.
    parentkey = self.parent_key(record, file)
    if not parentkey:
        if __debug__: log(f'file not associated with a parent record: {shown}')
        return (None, f'File lacks a parent Zotero record: {shown}')

    # Both the item and its parent are known; build the result.
    if __debug__: log(f'{parentkey} is parent of {itemkey} for {shown}')
    link = self.item_link(record, file)
    found = ZoteroRecord(key = itemkey, parent_key = parentkey, file = file,
                         link = link, record = record)
    return (found, None)
def __init__(self, key, user_id, use_keyring):
    '''Validate the credentials and connect to the user's Zotero libraries.

    Arguments:
      key: the Zotero API key, or None.  When given, it must be
        alphanumeric and at least 20 characters long.
      user_id: the Zotero user ID, or None.  When given, it must consist
        of digits only.
      use_keyring: if true, credentials are read from the keyring when
        neither "key" nor "user_id" was given, and the credentials finally
        used are saved back to the keyring.

    Any value still missing after that is requested interactively via
    validated_input().  On success, self._libraries holds a Zotero object
    for the user library followed by one for every group library that
    could be listed.

    Raises CannotProceed (ExitCode.bad_arg) if a supplied value is
    malformed or the server rejects the credentials.  Any other failure
    while connecting to the user library is reported with alert_fatal()
    and re-raised.  A failure while listing group libraries is reported
    with alert() and otherwise ignored.
    '''
    if key and (not key.isalnum() or len(key) < 20):
        alert_fatal(f'"{key}" does not appear to be a valid API key.')
        raise CannotProceed(ExitCode.bad_arg)
    if user_id and not user_id.isdigit():
        alert_fatal(f'"{user_id}" does not appear to be a Zotero user ID.')
        raise CannotProceed(ExitCode.bad_arg)

    # If the user supplied all the values on the command line, those are
    # the values used.  If none were supplied by the user on the command
    # line, all the values are retrieved from the user's keyring.  If
    # some were supplied and others missing, they're filled in from
    # either the keyring or by prompting the user.
    if __debug__: log('keyring ' + ('enabled' if use_keyring else 'disabled'))
    if key is None and user_id is None and use_keyring:
        # We weren't given a key and id, but we can look in the keyring.
        if __debug__: log('getting id & key from keyring')
        key, user_id = keyring_credentials()
    if not key:
        key = validated_input('API key', key, lambda x: x.isalnum())
    if not user_id:
        user_id = validated_input('User ID', user_id, lambda x: x.isdigit())
    if use_keyring:
        if __debug__: log('saving credentials to keyring')
        save_keyring_credentials(key, user_id)

    self._key = key
    self._user_id = user_id

    # Get connected and store the Zotero connection object for the user
    # library first, then look up the group libraries that the user can
    # access and create Zotero objects for those too.  The way we use them,
    # we don't need to separate between them, so they all go into one list.
    self._libraries = []
    try:
        if __debug__: log(f'connecting to Zotero as user {user_id}')
        user = zotero.Zotero(user_id, 'user', key)
        # pyzotero will return an object but that doesn't mean the user
        # actually gave valid credentials.  Need to try an operation.
        user.count_items()
        self._libraries.append(user)
        raise_for_interrupts()
    except zotero_errors.UserNotAuthorised as ex:
        if __debug__: log(f'got exception {str(ex)}')
        alert_fatal('Unable to connect to Zotero: invalid ID and/or API key.',
                    'The Zotero servers rejected attempts to connect.')
        raise CannotProceed(ExitCode.bad_arg)
    except KeyboardInterrupt as ex:
        if __debug__: log(f'got exception {str(ex)}')
        raise
    except Exception as ex:
        # Interpolate the exception text; the message previously contained
        # the literal characters "str(ex)" instead of the actual error.
        if __debug__: log(f'failed to create Zotero user object: {str(ex)}')
        alert_fatal('Unable to connect to Zotero API.')
        raise

    try:
        for group in user.groups():
            if __debug__: log(f'user can access group id {group["id"]}')
            self._libraries.append(zotero.Zotero(group['id'], 'group', key))
        raise_for_interrupts()
    except KeyboardInterrupt as ex:
        if __debug__: log(f'got exception {str(ex)}')
        raise
    except Exception as ex:
        # Group libraries are optional: report the problem and carry on
        # with whatever libraries were collected so far.
        if __debug__: log(f'failed to create Zotero group object: {str(ex)}')
        alert('Unable to retrieve Zotero group library; proceeding anyway.')
def _result_from_api(self, path):
    '''Send the image at "path" to Microsoft and return the parsed analysis.

    The image is posted to the "read/analyze" endpoint, and the URL given
    in the response's Operation-Location header is then polled every two
    seconds until the reported status is no longer "notStarted" or
    "running".

    Returns the decoded JSON analysis on success.  Returns the error value
    from self._image_from_file() if the image cannot be read, the tuple
    returned by self._api() if a request fails, or a TRResult with its
    "error" field set if the analysis fails or is malformed.  Raises
    ServiceFailure if the initial response lacks an Operation-Location
    header.
    '''
    def failure(text):
        # Build the TRResult that reports an analysis-level problem.
        return TRResult(path = path, data = {}, text = '',
                        boxes = [], error = text)

    # Read the image before contacting the service.
    (image, error) = self._image_from_file(path)
    if error:
        return error

    endpoint = self._credentials['endpoint']
    key = self._credentials['subscription_key']
    url = f'{endpoint}/vision/v3.2/read/analyze'
    headers = {'Ocp-Apim-Subscription-Key': key,
               'Content-Type': 'application/octet-stream'}

    # Phase 1: submit the image for processing.
    if __debug__: log(f'contacting Microsoft for {relative(path)}')
    response = self._api('post', url, headers, image)
    if isinstance(response, tuple):
        # A tuple from _api() signals an error.
        return response
    if 'Operation-Location' not in response.headers:
        if __debug__: log('no operation-location in response headers')
        raise ServiceFailure('Unexpected response from Microsoft server')
    poll_url = response.headers['Operation-Location']

    # Phase 2: wait and poll until the text is ready to be retrieved.
    if __debug__: log('polling MS for results ...')
    while True:
        raise_for_interrupts()
        # Results never seem to arrive within 1 s, and every poll counts
        # against the rate limit, so wait 2 s between attempts.
        wait(2)
        response = self._api('get', poll_url, headers = headers, polling = True)
        if isinstance(response, tuple):
            # A tuple from _api() signals an error.
            return response

        # The response is sometimes empty for unknown reasons; the only
        # option appears to be to try again.
        if not response.text:
            if __debug__: log('received empty result from Microsoft.')
            continue

        analysis = response.json()
        if 'status' not in analysis:
            # Without a status key the results cannot be interpreted.
            return failure('Error: Microsoft results not in expected format')

        status = analysis['status']
        if status in ('notStarted', 'running'):
            if __debug__: log('Microsoft still processing image')
            continue
        if status == 'succeeded':
            if __debug__: log('Microsoft returned success code')
            break
        if status == 'failed':
            return failure('Microsoft analysis failed')
        return failure('Error: Microsoft returned unexpected result')

    if __debug__: log(f'results received from Microsoft for {relative(path)}')
    return analysis
def _send(self, image, service):
    '''Get results from service named "service" for the "image".

    If reuse of saved JSON is enabled and a readable JSON file from an
    earlier run exists, it is passed to the service instead of contacting
    it afresh.  Otherwise the image is sent to the service; if the service
    reports RateLimitExceeded, the request is retried (after a pause when
    the attempt finished sooner than the service's maximum rate allows).

    On success, an annotated image and the extracted text are saved, plus
    the raw JSON when extended results are enabled and the data did not
    come from a saved file.  When comparison is enabled and a non-empty
    ground-truth file is available, a comparison report is saved too.

    Returns a Result, or None if the service reported an error.  Raises
    AuthFailure, prefixed with the service name, if the service raises it.
    '''
    service_name = f'[{service.name_color()}]{service.name()}[/]'
    base_path = path.join(image.dest_dir, path.basename(image.file))
    json_file = self._renamed(base_path, str(service), 'json')

    saved_results = None
    if self._reuse_json and readable(json_file):
        inform(f'Reading saved results for {service_name} from {relative(json_file)}')
        with open(json_file, 'r') as f:
            saved_results = json.load(f)
        output = service.result(image.file, saved_results)
    else:
        inform(f'Sending to {service_name} and waiting for response ...')
        last_time = timer()
        try:
            output = service.result(image.file, None)
        except AuthFailure as ex:
            # Add the service name, keeping the original as explicit cause.
            raise AuthFailure(f'Service {service}: {str(ex)}') from ex
        except RateLimitExceeded as ex:
            time_passed = timer() - last_time
            if time_passed < 1 / service.max_rate():
                warn(f'Pausing {service_name} due to rate limits')
                wait(1 / service.max_rate() - time_passed)
                warn(f'Continuing {service_name}')
            # Always retry here: "output" is unbound on this path, so
            # falling through would fail on the error check below.
            return self._send(image, service)

    if output.error:
        # Sanitize the error string in case it contains '{' characters.
        msg = output.error.replace('{', '{{{{').replace('}', '}}}}')
        alert(f'{service_name} failed: {msg}')
        warn(f'No result from {service_name} for {relative(image.file)}')
        return None
    inform(f'Got result from {service_name}.')
    raise_for_interrupts()

    inform(f'Creating annotated image for {service_name}.')
    annot_path = self._renamed(base_path, str(service), 'png')
    report_path = None
    from handprint.images import annotated_image
    with self._lock:
        img = annotated_image(image.file, output.boxes, service,
                              self._text_size, self._text_color,
                              self._text_shift, self._display,
                              self._confidence)
        self._save(img, annot_path)

    if self._extended_results and (saved_results is None):
        inform(f'Saving all data for {service_name}.')
        raw_json = json.dumps(output.data, sort_keys=True, indent=2)
        self._save(raw_json, json_file)
        inform(f'Saving extracted text for {service_name}.')
        txt_file = self._renamed(base_path, str(service), 'txt')
        self._save(output.text, txt_file)

    if self._compare:
        gt_file = alt_extension(image.item_file, 'gt.txt')
        gt_path = relative(gt_file)
        report_path = self._renamed(image.item_file, str(service), 'tsv')
        relaxed = (self._compare == 'relaxed')
        # Test readability first, so that a missing or unreadable file is
        # reported as unavailable instead of being probed for emptiness.
        if not readable(gt_file):
            warn(f'Skipping {service_name} comparison because {gt_path} not available')
        elif not nonempty(gt_file):
            warn(f'Skipping {service_name} comparison because {gt_path} is empty')
        else:
            if __debug__: log(f'reading ground truth from {gt_file}')
            # Use a context manager so the file handle is always closed.
            with open(gt_file, 'r') as gt:
                gt_text = gt.read()
            inform(f'Saving {service_name} comparison to ground truth')
            from handprint.comparison import text_comparison
            self._save(text_comparison(output.text, gt_text, relaxed),
                       report_path)
    return Result(service, image, annot_path, report_path)
def amazon_result(self, file_path, variant, method, image_keyword,
                  result_key, value_key, block_key, result):
    '''Returns the result from calling the service on the 'file_path'.

    Arguments:
      file_path: path of the image file.
      variant: name of the Amazon service passed to session.client().
      method: name of the client method to invoke.
      image_keyword: name of the keyword argument that carries the image
        bytes in the call to "method".
      result_key: key in the response that holds the list of blocks.
      value_key: key in each block that holds the block's kind.
      block_key: key in each block that holds the block's text.
      result: a previously obtained response to parse; if false, the
        service is contacted to obtain one.

    The result is returned as a TRResult named tuple.  Raises AuthFailure
    for credential or endpoint problems, CorruptedContent if Amazon
    reports an unsupported or bad document, and ServiceFailure for any
    other failure of the API call.
    '''
    # Delay loading the API packages until needed because they take time to
    # load.  Doing this speeds up overall application start time.
    import boto3
    import botocore

    if not result:
        (image, error) = self._image_from_file(file_path)
        if error:
            return TRResult(path=file_path, data={}, boxes=[],
                            text='', error=error)
        try:
            if __debug__: log(f'setting up Amazon client function "{variant}"')
            creds = self._credentials
            session = boto3.session.Session()
            client = session.client(
                variant,
                region_name=creds['region_name'],
                aws_access_key_id=creds['aws_access_key_id'],
                aws_secret_access_key=creds['aws_secret_access_key'])
            if __debug__: log('calling Amazon API function')
            result = getattr(client, method)(**{image_keyword: {'Bytes': image}})
            if __debug__: log(f'received {len(result[result_key])} blocks')
        except botocore.exceptions.EndpointConnectionError as ex:
            raise AuthFailure(f'Problem with credentials file -- {str(ex)}') from ex
        except KeyboardInterrupt as ex:
            raise
        except KeyError as ex:
            # Convert the args to str so that a non-string key cannot make
            # the join itself fail.
            missing = ",".join(str(arg) for arg in ex.args)
            raise AuthFailure(f'Amazon credentials file is missing {missing}') from ex
        except Exception as ex:
            # Set these up before inspecting the exception, so that the
            # fallback below works even if it has no structured response.
            path = relative(file_path)
            details = str(ex)
            response = getattr(ex, 'response', None)
            if response and 'Error' in response:
                error = response['Error']
                code = error.get('Code')
                details = error.get('Message', details)
                if code in ['UnsupportedDocumentException',
                            'BadDocumentException']:
                    msg = f'Amazon {variant} reports bad or corrupted image in {path}'
                    raise CorruptedContent(msg) from ex
                elif code in ['InvalidSignatureException',
                              'UnrecognizedClientException']:
                    raise AuthFailure(f'Problem with credentials file -- {details}') from ex
            # Fallback if we can't get details.
            if __debug__: log(f'Amazon returned exception {str(ex)}')
            msg = f'Amazon {variant} failure for {path} -- {details}'
            raise ServiceFailure(msg) from ex

    raise_for_interrupts()
    text_lines = []
    boxes = []
    width, height = imagesize.get(file_path)
    if __debug__: log(f'parsing Amazon result for {relative(file_path)}')
    for block in result[result_key]:
        if value_key not in block:
            continue
        kind = block[value_key].lower()
        if kind in ['word', 'line']:
            text = block[block_key]
            polygon = block['Geometry']['Polygon']
            corners = corner_list(polygon, width, height)
            if corners:
                boxes.append(Box(kind=kind, bb=corners, text=text,
                                 score=block['Confidence'] / 100))
            else:
                # Something's wrong with the vertex list.  Skip & continue.
                if __debug__: log(f'bad bb for {text}: {polygon}')
        if kind == "line":
            # The text of a line is stored under one of two keys.
            if 'Text' in block:
                text_lines.append(block['Text'])
            elif 'DetectedText' in block:
                text_lines.append(block['DetectedText'])
    full_text = ''.join(line + '\n' for line in text_lines)
    return TRResult(path=file_path, data=result, boxes=boxes,
                    text=full_text, error=None)
def result(self, path, result=None):
    '''Returns the result from calling the service on the file at 'path'.

    If 'result' is given (and true), it is taken to be a previously
    obtained response and is parsed instead of contacting Google.

    The result is returned as a TRResult named tuple.  If the image cannot
    be read, the error value from self._image_from_file() is returned.
    Most failures of the API call are returned as a TRResult whose "error"
    field is set.  Raises AuthFailure for permission or credential
    problems, ServiceFailure if the service is unavailable, and
    KeyboardInterrupt if the call is interrupted.
    '''
    # Delay loading the API packages until needed because they take time to
    # load.  Doing this speeds up overall application start time.
    import google
    from google.cloud import vision_v1 as gv
    # NOTE(review): the next two names are not referenced directly below.
    # The imports are kept in case the attribute-style lookups in the
    # exception handlers (google.api_core.exceptions...) depend on these
    # submodules having been loaded -- confirm before removing them.
    from google.api_core.exceptions import PermissionDenied
    from google.protobuf.json_format import MessageToDict

    if not result:
        # Read the image and proceed with contacting the service.
        (image, error) = self._image_from_file(path)
        if error:
            return error

        if __debug__: log(f'building Google API object for {relative(path)}')
        try:
            client = gv.ImageAnnotatorClient()
            params = gv.TextDetectionParams(
                mapping={'enable_text_detection_confidence_score': True})
            context = gv.ImageContext(language_hints=['en-t-i0-handwrit'],
                                      text_detection_params=params)
            img = gv.Image(content=image)
            if __debug__: log(f'sending image to Google for {relative(path)} ...')
            response = client.document_text_detection(image=img,
                                                      image_context=context)
            if __debug__: log(f'received result from Google for {relative(path)}')
            result = dict_from_response(response)
        except google.api_core.exceptions.PermissionDenied as ex:
            text = 'Authentication failure for Google service -- {}'.format(ex)
            raise AuthFailure(text) from ex
        except google.auth.exceptions.DefaultCredentialsError as ex:
            text = 'Credentials file error for Google service -- {}'.format(ex)
            raise AuthFailure(text) from ex
        except google.api_core.exceptions.ServiceUnavailable as ex:
            text = 'Network, service, or Google configuration error -- {}'.format(ex)
            raise ServiceFailure(text) from ex
        except KeyboardInterrupt as ex:
            raise
        except Exception as ex:
            if isinstance(ex, KeyError):
                # Can happen if you control-C in the middle of the Google call.
                # Result is "Exception ignored in: 'grpc._cython.cygrpc._next'"
                # printed to the terminal and we end up here.
                raise KeyboardInterrupt
            else:
                text = 'Error: {} -- {}'.format(str(ex), path)
                return TRResult(path=path, data={}, boxes=[],
                                text='', error=text)

    raise_for_interrupts()
    boxes = []
    # See this page for more information about the structure:
    # https://cloud.google.com/vision/docs/handwriting#python
    pages = result['full_text_annotation']['pages']
    if len(pages) > 1:
        warn('More than one page received from Google; using only first.')
    for block in pages[0]['blocks']:
        for para in block['paragraphs']:
            corners = corner_list(para['bounding_box']['vertices'])
            boxes.append(Box(bb=corners, kind='para', text='',
                             score=para['confidence']))
            for word in para['words']:
                text = ''.join(symbol['text'] for symbol in word['symbols'])
                vertices = word['bounding_box']['vertices']
                corners = corner_list(vertices)
                if corners:
                    # NOTE(review): words are scored with the enclosing
                    # paragraph's confidence; presumably intentional --
                    # confirm whether a per-word value should be used.
                    boxes.append(Box(bb=corners, kind='word', text=text,
                                     score=para['confidence']))
                else:
                    # Something is wrong with the vertex list.
                    # Skip it and continue.
                    if __debug__: log(f'bad bb for {text}: {vertices}')
    full_text = result['full_text_annotation']['text']
    return TRResult(path=path, data=result, boxes=boxes,
                    text=full_text, error=None)