コード例 #1
0
ファイル: tesseract.py プロジェクト: wilbrodn/aleph
def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data via Tesseract OCR.

    The OCR cache is consulted first, keyed on ``data`` and the language
    list returned by ``get_languages_iso3``; a cached result is returned
    without running OCR. Fresh results are written back to the cache.

    :param data: raw bytes of the image file.
    :param languages: language hints, passed through ``get_languages_iso3``.
    :returns: the recognised text decoded as UTF-8, or ``None`` when the
        image is rejected as too large or cannot be opened.
    :raises IngestorException: if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text
    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # The format string previously ended in a bare "%", which makes the
        # logging call fail to format and drops the actual message.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None
    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    extractor.set_image(img)
    extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    # get_text() may yield a falsy value; normalise to an empty string so
    # that decode() and len() below are always safe.
    text = extractor.get_text() or ''
    text = text.decode(encoding="UTF-8")
    # extractor.clear()
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
コード例 #2
0
ファイル: ocr.py プロジェクト: dkhurshudian/aleph
    def extract_text(self, data, languages=None):
        """Recognise text in ``data`` through the gRPC OCR service.

        Results are cached under the SHA-1 hex digest of the original
        ``data``. Up to ten attempts are made; each ``self.Error`` is logged,
        the channel is reset and ``backoff`` is invoked before retrying.
        Returns ``None`` if ``ensure_size`` rejects the data or if every
        attempt fails.
        """
        cache_key = sha1(data).hexdigest()
        cached = Cache.get_cache(cache_key)
        if cached is not None:
            log.info('OCR: %s chars cached', len(cached))
            return cached

        # log.info("Size: %s", len(data))
        data = self.ensure_size(data)
        if data is None:
            return None

        for failures in range(10):
            try:
                stub = RecognizeTextStub(self.channel)
                languages = ensure_list(languages)
                request = RPCImage(data=data, languages=languages)
                recognized = stub.Recognize(request).text
                log.info('OCR: %s chars recognized', len(recognized))
                if recognized is not None:
                    Cache.set_cache(cache_key, recognized)
                return recognized
            except self.Error:
                # Log with traceback, then rebuild the channel and wait
                # before the next attempt.
                log.exception("gRPC Error: %s", self.SERVICE)
                self.reset_channel()
                backoff(failures=failures)
コード例 #3
0
ファイル: tesseract.py プロジェクト: CodeForAfrica/aleph
def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data via Tesseract OCR.

    The OCR cache is consulted first, keyed on ``data`` and the language
    list returned by ``get_languages_iso3``; a cached result is returned
    without running OCR. Fresh results are written back to the cache.

    :param data: raw bytes of the image file.
    :param languages: language hints, passed through ``get_languages_iso3``.
    :returns: the text produced by ``ocr_image``, or ``None`` when the image
        is rejected as too large or cannot be opened.
    :raises IngestorException: if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text
    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # The format string previously ended in a bare "%", which makes the
        # logging call fail to format and drops the actual message.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None
    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    text = extractor.ocr_image(img)
    extractor.clear()
    log.debug('OCR done: %s, %s characters extracted',
              languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
コード例 #4
0
    def extract_text(self, data, languages=None):
        """Recognise text in ``data`` through the gRPC OCR service.

        Results are cached under the SHA-1 hex digest of the original
        ``data``. Up to 1000 attempts are made. A ``RESOURCE_EXHAUSTED``
        error retries immediately; any other ``self.Error`` is logged,
        followed by ``backoff`` and a channel reset. Returns ``None`` if
        ``ensure_size`` rejects the data or if every attempt fails.
        """
        cache_key = sha1(data).hexdigest()
        cached = Cache.get_cache(cache_key)
        if cached is not None:
            # log.info('%s chars cached', len(text))
            return cached

        data = self.ensure_size(data)
        if data is None:
            return None

        for failures in range(1000):
            try:
                stub = RecognizeTextStub(self.channel)
                languages = ensure_list(languages)
                request = Image(data=data, languages=languages)
                recognized = stub.Recognize(request).text
                log.info('OCR: %s chars', len(recognized))
                if recognized is not None:
                    Cache.set_cache(cache_key, recognized)
                return recognized
            except self.Error as exc:
                # Retry straight away on RESOURCE_EXHAUSTED, without
                # backoff or a channel reset.
                if exc.code() == self.Status.RESOURCE_EXHAUSTED:
                    continue
                log.warning("gRPC [%s]: %s", exc.code(), exc.details())
                backoff(failures=failures)
                self.reset_channel()
0
 def test_cache_basic(self):
     """Check miss, insert and overwrite behaviour of the Cache helpers."""
     # Nothing stored yet: the lookup misses and no rows exist.
     missing = Cache.get_cache('foo')
     assert missing is None, missing
     assert 0 == db.session.query(Cache).count()

     # First write creates exactly one row.
     Cache.set_cache('foo', 'bar')
     stored = Cache.get_cache('foo')
     assert stored == 'bar', stored
     assert 1 == db.session.query(Cache).count()

     # Writing the same key again replaces the value; row count stays at one.
     Cache.set_cache('foo', 'quuux')
     replaced = Cache.get_cache('foo')
     assert replaced == 'quuux', replaced
     assert 1 == db.session.query(Cache).count()
コード例 #6
0
    def extract_text(self, data, languages=None):
        """Recognise text in ``data`` with the Vision API client.

        Results are cached under the SHA-1 hex digest of the original
        ``data``. Returns ``None`` when ``ensure_size`` rejects the data.
        The ``languages`` argument is accepted but not used here.
        """
        cache_key = sha1(data).hexdigest()
        cached = Cache.get_cache(cache_key)
        if cached is not None:
            log.info('Vision API: %s chars cached', len(cached))
            return cached

        data = self.ensure_size(data)
        if data is None:
            return None

        request_image = types.Image(content=data)
        result = self.client.document_text_detection(request_image)
        recognized = result.full_text_annotation.text
        log.info('Vision API: %s chars recognized', len(recognized))
        Cache.set_cache(cache_key, recognized)
        return recognized
コード例 #7
0
ファイル: tesseract.py プロジェクト: tomjie/aleph
def extract_image_data(data, languages=None):
    """Run Tesseract OCR over raw image bytes and return the text.

    A cached result for the same data and languages is returned when one
    exists; otherwise the image is opened, OCR'd and the result cached.
    Raises ``IngestorException`` if ``TESSDATA_PREFIX`` is not configured.
    """
    prefix = get_config('TESSDATA_PREFIX')
    if prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    iso3_languages = get_languages_iso3(languages)
    cached = Cache.get_ocr(data, iso3_languages)
    if cached is not None:
        return cached

    image = Image.open(StringIO(data))
    # TODO: play with contrast and sharpening the images.
    ocr = Tesseract(prefix, lang=iso3_languages)
    ocr.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    recognized = ocr.ocr_image(image)
    log.debug('OCR done: %s, %s characters extracted',
              iso3_languages, len(recognized))
    Cache.set_ocr(data, iso3_languages, recognized)
    return recognized
コード例 #8
0
ファイル: tesseract.py プロジェクト: nivertech/aleph
def extract_image_data(data, languages=None):
    """Return the OCR text for a binary string of image data.

    Looks for a cached result first (by data and languages). On a miss the
    image is opened, processed by Tesseract, and the outcome is cached.
    Raises ``IngestorException`` if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_dir = get_config('TESSDATA_PREFIX')
    if tessdata_dir is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    langs = get_languages_iso3(languages)
    hit = Cache.get_ocr(data, langs)
    if hit is not None:
        return hit

    picture = Image.open(StringIO(data))
    # TODO: play with contrast and sharpening the images.
    engine = Tesseract(tessdata_dir, lang=langs)
    engine.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    result = engine.ocr_image(picture)
    log.debug('OCR done: %s, %s characters extracted', langs, len(result))
    Cache.set_ocr(data, langs, result)
    return result
コード例 #9
0
ファイル: manager.py プロジェクト: renesugar/aleph
 def set_cache(self, key, value):
     """Store ``value`` under ``key`` by delegating to ``Cache.set_cache``."""
     Cache.set_cache(key, value)
コード例 #10
0
ファイル: manager.py プロジェクト: renesugar/aleph
 def get_cache(self, key):
     """Return whatever ``Cache.get_cache`` yields for ``key``."""
     return Cache.get_cache(key)