Example #1
File: ocr.py Project: jbaehne/aleph
    def extract_text(self, data, languages=None):
        key = make_key('ocr', sha1(data).hexdigest())
        text = kv.get(key)
        if text is not None:
            # log.info('%s chars cached', len(text))
            return text.decode('utf-8')

        data = self.ensure_size(data)
        if data is None:
            return

        for attempt in range(1000):
            try:
                service = RecognizeTextStub(self.channel)
                languages = ensure_list(languages)
                image = Image(data=data, languages=languages)
                response = service.Recognize(image)
                text = response.text or ''
                log.info('OCR: %s chars', len(text))
                kv.set(key, text.encode('utf-8'))
                return text
            except self.Error as e:
                if e.code() == self.Status.RESOURCE_EXHAUSTED:
                    continue
                log.warning("gRPC [%s]: %s", e.code(), e.details())
                backoff(failures=attempt)
                self.reset_channel()
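
All of these snippets lean on two helpers that are not shown here: `backoff`, which sleeps for a while between attempts, and (in later examples) `service_retries`, which bounds the number of attempts. A minimal sketch of what they could look like, assuming a roughly linear randomised delay and a small random retry budget:

import logging
import time
from random import random, randrange

log = logging.getLogger(__name__)


def service_retries():
    # Assumed helper: a bounded, slightly randomised number of attempts.
    return range(randrange(5, 10))


def backoff(failures=0):
    # Assumed helper: wait longer the more consecutive failures there were.
    sleep = max(1, failures) + random()
    log.debug("Back-off of %.2fs after %s failures.", sleep, failures)
    time.sleep(sleep)
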
Example #2
    def extract_text(self, data, languages=None):
        key = sha1(data).hexdigest()
        text = Cache.get_cache(key)
        if text is not None:
            log.info('OCR: %s chars cached', len(text))
            return text

        # log.info("Size: %s", len(data))
        data = self.ensure_size(data)
        if data is None:
            return

        for attempt in range(10):
            try:
                service = RecognizeTextStub(self.channel)
                languages = ensure_list(languages)
                image = RPCImage(data=data, languages=languages)
                response = service.Recognize(image)
                log.info('OCR: %s chars recognized', len(response.text))
                if response.text is not None:
                    Cache.set_cache(key, response.text)
                return response.text
            except self.Error as exc:
                log.exception("gRPC Error: %s", self.SERVICE)
                self.reset_channel()
                backoff(failures=attempt)
Example #3
    def extract_text(self, data, languages=None):
        if not MIN_SIZE < len(data) < MAX_SIZE:
            log.info('OCR: file size out of range (%d)', len(data))
            return None

        key = make_key('ocr', sha1(data).hexdigest())
        if kv.exists(key):
            text = kv.get(key)
            if text is not None:
                text = text.decode('utf-8')
                log.info('OCR: %s chars cached', len(text))
            return text

        # data = self.ensure_size(data)
        # if data is None:
        #     return

        for attempt in service_retries():
            try:
                service = RecognizeTextStub(self.channel)
                languages = ensure_list(languages)
                image = Image(data=data, languages=languages)
                response = service.Recognize(image)
                text = response.text
                if text is not None:
                    log.info('OCR: %s chars (from %s bytes)', len(text),
                             len(data))
                kv.set(key, text)
                return text
            except self.Error as e:
                if e.code() not in self.TEMPORARY_ERRORS:
                    return
                self.reset_channel()
                log.warning("gRPC [%s]: %s", e.code(), e.details())
                backoff(failures=attempt)
Example #4
def backoff_cluster(failures=0):
    """This is intended to halt traffic to the cluster if it is in a
    recovery state, e.g. after a master failure or severe node failures.
    """
    for attempt in count(failures):
        backoff(failures=attempt)
        if check_cluster_ready():
            return
        log.warning("Cluster is flustered.")
Example #5
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, doc_type='doc', id=id, body=body, **kwargs)
            body['id'] = str(id)
            body.pop('text', None)
            return body
        except RequestError:
            raise
        except Exception as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
        backoff(failures=attempt)
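
For context, a caller hands over the index, the document id and the body; on success the returned body carries the stringified id and has the bulky `text` field stripped. The index and field names below are purely illustrative:

# Illustrative call; 'entity-index' and the body fields are made up.
doc = index_safe('entity-index', 'a1b2c3', {
    'schema': 'Document',
    'text': 'full extracted text ...',
})
# doc is now {'schema': 'Document', 'id': 'a1b2c3'}; 'text' has been popped.
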
Example #6
    def extract_text(self, text, languages):
        for attempt in service_retries():
            try:
                service = EntityExtractStub(self.channel)
                req = Text(text=text, languages=languages)
                for res in service.Extract(req):
                    clazz = self.TYPES.get(res.type)
                    yield (res.text, clazz, res.start, res.end)
                break
            except self.Error as e:
                if e.code() not in self.TEMPORARY_ERRORS:
                    return
                self.reset_channel()
                log.warning("gRPC [%s]: %s", e.code(), e.details())
                backoff(failures=attempt)
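
Since this version is a generator, callers consume the matches lazily; a hypothetical caller holding such an extractor might iterate it like this:

# Hypothetical usage; 'extractor' stands in for whatever object owns the channel.
for text, clazz, start, end in extractor.extract_text(raw_text, ['en', 'de']):
    log.info("Entity %r (%s) at offset %d-%d", text, clazz, start, end)
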
Example #7
def query_delete(index, query, **kwargs):
    "Delete all documents matching the given query inside the index."
    for attempt in service_retries():
        try:
            es.delete_by_query(index=index,
                               body={'query': query},
                               conflicts='proceed',
                               timeout=TIMEOUT,
                               request_timeout=REQUEST_TIMEOUT,
                               **kwargs)
            return
        except RequestError:
            raise
        except Exception as exc:
            log.warning("Query delete failed: %s", exc)
        backoff(failures=attempt)
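
A typical caller passes a filter query; for instance, dropping every document that belongs to one collection (the index and field names are illustrative):

# Illustrative: delete all documents tagged with a given collection_id.
query_delete('entity-index', {'term': {'collection_id': 23}})
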
Example #8
def search_safe(*args, **kwargs):
    # This is not supposed to be used in every location where search is
    # run, but only where it's a backend search that we could back off of
    # without hurting UX.
    for attempt in service_retries():
        try:
            kwargs['doc_type'] = 'doc'
            return es.search(*args,
                             timeout=TIMEOUT,
                             request_timeout=REQUEST_TIMEOUT,
                             **kwargs)
        except RequestError:
            raise
        except Exception:
            log.exception("Search error: %r", kwargs)
        backoff(failures=attempt)
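
Usage mirrors `es.search`, with the retry loop layered on top; for example, an illustrative backend query that no user is waiting on:

# Illustrative call; the index name and query body are made up.
result = search_safe(index='entity-index',
                     body={'query': {'match_all': {}}, 'size': 0})
total = result.get('hits', {}).get('total')
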
Example #9
    def extract(self, text, languages):
        if text is None or len(text) < self.MIN_LENGTH:
            return
        texts = textwrap.wrap(text, self.MAX_LENGTH)
        for text in texts:
            for attempt in range(10):
                try:
                    service = EntityExtractStub(self.channel)
                    req = Text(text=text, languages=languages)
                    for res in service.Extract(req):
                        clazz = self.TYPES.get(res.type)
                        yield (res.text, clazz, res.start, res.end)
                    break
                except self.Error as e:
                    if e.code() == self.Status.RESOURCE_EXHAUSTED:
                        continue
                    log.warning("gRPC [%s]: %s", e.code(), e.details())
                    backoff(failures=attempt)
                    self.reset_channel()
Example #10
    def extract_text(self, data, languages=None):
        key = sha1(data).hexdigest()
        text = Cache.get_cache(key)
        if text is not None:
            log.info('%s chars cached', len(text))
            return text

        data = self.ensure_size(data)
        if data is None:
            return

        for attempt in range(1000):
            try:
                service = RecognizeTextStub(self.channel)
                languages = ensure_list(languages)
                image = Image(data=data, languages=languages)
                response = service.Recognize(image)
                log.info('%s chars recognized', len(response.text))
                if response.text is not None:
                    Cache.set_cache(key, response.text)
                return response.text
            except self.Error as e:
                log.warning("gRPC [%s]: %s", e.code(), e.details())
                backoff(failures=attempt)
Example #11
def backoff_cluster(failures=0):
    """This is intended to halt traffic to the cluster if it is in a
    recovery state, e.g. after a master failure or severe node failures.
    """
    backoff(failures=failures)
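
In this variant the function waits exactly once instead of polling until the cluster recovers; a hypothetical caller might use it to pause briefly before a heavy write phase:

# Hypothetical use: give a struggling cluster a breather before bulk writes.
backoff_cluster(failures=2)
bulk_write_documents(docs)  # assumed bulk-indexing helper, not shown here
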