def extract_text(self, data, languages=None):
    """Run OCR on an image via the gRPC service, with kv-store caching.

    Results are cached under the SHA1 of the raw image bytes, so a
    repeated call for the same image never hits the service.

    :param data: raw image bytes.
    :param languages: optional language hints for the recognizer.
    :return: recognized text, or None if the image was rejected by
        ``ensure_size``.
    """
    key = make_key('ocr', sha1(data).hexdigest())
    text = kv.get(key)
    if text is not None:
        return text.decode('utf-8')
    data = self.ensure_size(data)
    if data is None:
        return
    for attempt in range(1000):
        try:
            service = RecognizeTextStub(self.channel)
            languages = ensure_list(languages)
            image = Image(data=data, languages=languages)
            response = service.Recognize(image)
            text = response.text or ''
            log.info('OCR: %s chars', len(text))
            kv.set(key, text.encode('utf-8'))
            return text
        except self.Error as e:
            if e.code() == self.Status.RESOURCE_EXHAUSTED:
                # FIX: the original did a bare `continue` here, skipping
                # backoff() and busy-looping against a service that just
                # told us it is overloaded. Back off first, then retry.
                backoff(failures=attempt)
                continue
            log.warning("gRPC [%s]: %s", e.code(), e.details())
            backoff(failures=attempt)
            self.reset_channel()
def extract_text(self, data, languages=None):
    """Run OCR on an image via the gRPC service, caching by content hash.

    :param data: raw image bytes.
    :param languages: optional language hints for the recognizer.
    :return: recognized text, or None if the image was rejected by
        ``ensure_size`` or all retries failed.
    """
    key = sha1(data).hexdigest()
    text = Cache.get_cache(key)
    if text is not None:
        log.info('OCR: %s chars cached', len(text))
        return text
    data = self.ensure_size(data)
    if data is None:
        return
    for attempt in range(10):
        try:
            service = RecognizeTextStub(self.channel)
            languages = ensure_list(languages)
            image = RPCImage(data=data, languages=languages)
            response = service.Recognize(image)
            if response.text is not None:
                # FIX: log only after the None check — the original
                # called len(response.text) first, which would raise
                # TypeError if the field were ever None.
                log.info('OCR: %s chars recognized', len(response.text))
                Cache.set_cache(key, response.text)
                return response.text
        except self.Error:
            log.exception("gRPC Error: %s", self.SERVICE)
            self.reset_channel()
            backoff(failures=attempt)
def extract_text(self, data, languages=None):
    """Run OCR on an image via the gRPC service, with kv-store caching.

    Images outside the configured size bounds are rejected up front.
    Temporary gRPC errors trigger a channel reset and backoff; any
    other error aborts silently.

    :param data: raw image bytes.
    :param languages: optional language hints for the recognizer.
    :return: recognized text, or None on rejection or failure.
    """
    if not MIN_SIZE < len(data) < MAX_SIZE:
        log.info('OCR: file size out of range (%d)', len(data))
        return None
    key = make_key('ocr', sha1(data).hexdigest())
    # FIX: single round-trip — kv.get() returns None on a miss, so the
    # original kv.exists(key) pre-check was redundant and racy (the key
    # could expire between exists() and get()).
    text = kv.get(key)
    if text is not None:
        text = text.decode('utf-8')
        log.info('OCR: %s chars cached', len(text))
        return text
    for attempt in service_retries():
        try:
            service = RecognizeTextStub(self.channel)
            languages = ensure_list(languages)
            image = Image(data=data, languages=languages)
            response = service.Recognize(image)
            text = response.text
            if text is not None:
                log.info('OCR: %s chars (from %s bytes)',
                         len(text), len(data))
                kv.set(key, text)
            return text
        except self.Error as e:
            # Permanent errors are terminal; only temporary ones retry.
            if e.code() not in self.TEMPORARY_ERRORS:
                return
            self.reset_channel()
            log.warning("gRPC [%s]: %s", e.code(), e.details())
            backoff(failures=attempt)
def backoff_cluster(failures=0):
    """Block until the cluster reports that it is ready.

    Intended to halt traffic to the cluster while it is in a recovery
    state, e.g. after a master failure or severe node failures.
    """
    attempt = failures
    while True:
        # Sleep first, then probe; the wait grows with each failure.
        backoff(failures=attempt)
        if check_cluster_ready():
            return
        log.warning("Cluster is flustered.")
        attempt += 1
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, doc_type='doc', id=id,
                     body=body, **kwargs)
        except RequestError:
            # Malformed requests will never succeed on retry.
            raise
        except Exception as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
        else:
            # Stored successfully: return a trimmed copy of the body
            # with its id attached and the bulky text field removed.
            body['id'] = str(id)
            body.pop('text', None)
            return body
def extract_text(self, text, languages):
    """Stream entity-extraction results from the gRPC service.

    Yields ``(text, clazz, start, end)`` tuples for each entity the
    service returns. Temporary gRPC errors reset the channel and back
    off before retrying; any other error ends the generator.

    NOTE(review): a mid-stream temporary failure restarts the whole
    Extract() call, so entities already yielded by the failed attempt
    are yielded again — presumably consumers de-duplicate; confirm.
    """
    for attempt in service_retries():
        try:
            service = EntityExtractStub(self.channel)
            req = Text(text=text, languages=languages)
            for res in service.Extract(req):
                # Map the service's type tag to a local class; unknown
                # tags produce clazz=None.
                clazz = self.TYPES.get(res.type)
                yield (res.text, clazz, res.start, res.end)
            # Stream consumed without error: stop retrying.
            break
        except self.Error as e:
            # Permanent errors terminate the generator silently.
            if e.code() not in self.TEMPORARY_ERRORS:
                return
            self.reset_channel()
            log.warning("gRPC [%s]: %s", e.code(), e.details())
            backoff(failures=attempt)
def query_delete(index, query, **kwargs):
    "Delete all documents matching the given query inside the index."
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={'query': query},
                               conflicts='proceed',
                               timeout=TIMEOUT,
                               request_timeout=REQUEST_TIMEOUT,
                               **kwargs)
        except RequestError:
            # A malformed query will never succeed; surface it.
            raise
        except Exception as exc:
            log.warning("Query delete failed: %s", exc)
            backoff(failures=attempt)
        else:
            return
def search_safe(*args, **kwargs):
    """Run a backend search, retrying with backoff on transient errors.

    Not meant for every search call-site — only for backend searches
    where backing off does not hurt the user experience.
    """
    # Invariant across attempts, so set it once up front.
    kwargs['doc_type'] = 'doc'
    for attempt in service_retries():
        try:
            return es.search(*args,
                             timeout=TIMEOUT,
                             request_timeout=REQUEST_TIMEOUT,
                             **kwargs)
        except RequestError:
            # Bad request: retrying cannot help.
            raise
        except Exception:
            log.exception("Search error: %r", kwargs)
            backoff(failures=attempt)
def extract(self, text, languages):
    """Extract entities from text, wrapping long inputs into chunks.

    Yields ``(text, clazz, start, end)`` tuples per entity. Note that
    start/end offsets are relative to the chunk sent to the service,
    not the original text.

    :param text: the text to analyze; None or too-short input yields
        nothing.
    :param languages: language hints passed to the service.
    """
    if text is None or len(text) < self.MIN_LENGTH:
        return
    chunks = textwrap.wrap(text, self.MAX_LENGTH)
    # FIX: the original loop variable shadowed the `text` parameter.
    for chunk in chunks:
        for attempt in range(10):
            try:
                service = EntityExtractStub(self.channel)
                req = Text(text=chunk, languages=languages)
                for res in service.Extract(req):
                    clazz = self.TYPES.get(res.type)
                    yield (res.text, clazz, res.start, res.end)
                break
            except self.Error as e:
                if e.code() == self.Status.RESOURCE_EXHAUSTED:
                    # FIX: the original did a bare `continue`, skipping
                    # backoff() and busy-looping against a service that
                    # just reported it is overloaded.
                    backoff(failures=attempt)
                    continue
                log.warning("gRPC [%s]: %s", e.code(), e.details())
                backoff(failures=attempt)
                self.reset_channel()
def extract_text(self, data, languages=None):
    """Run OCR on an image via the gRPC service, caching by content hash.

    :param data: raw image bytes.
    :param languages: optional language hints for the recognizer.
    :return: recognized text, or None if the image was rejected by
        ``ensure_size`` or all retries failed.
    """
    key = sha1(data).hexdigest()
    text = Cache.get_cache(key)
    if text is not None:
        log.info('%s chars cached', len(text))
        return text
    data = self.ensure_size(data)
    if data is None:
        return
    for attempt in range(1000):
        try:
            service = RecognizeTextStub(self.channel)
            languages = ensure_list(languages)
            image = Image(data=data, languages=languages)
            response = service.Recognize(image)
            if response.text is not None:
                # FIX: log only after the None check — the original
                # called len(response.text) before checking, which would
                # raise TypeError if the field were ever None.
                log.info('%s chars recognized', len(response.text))
                Cache.set_cache(key, response.text)
                return response.text
        except self.Error as e:
            # NOTE(review): unlike the sibling variants, this handler
            # never calls reset_channel() — confirm that is intentional.
            log.warning("gRPC [%s]: %s", e.code(), e.details())
            backoff(failures=attempt)
def backoff_cluster(failures=0):
    """Pause before sending further traffic to the cluster.

    Intended to halt traffic to the cluster if it is in a recovery
    state, e.g. after a master failure or severe node failures. Simply
    delegates to backoff() with the caller's failure count.
    """
    backoff(failures=failures)