def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data via Tesseract OCR.

    The result is looked up in, and stored to, ``Cache`` keyed on the raw
    data and the normalised language list, so repeated calls with the same
    input skip the OCR pass.

    :param data: raw image bytes.
    :param languages: languages to recognise; passed through
        ``get_languages_iso3`` before being handed to Tesseract.
    :returns: the extracted text, or ``None`` when the image could not be
        opened (unknown format or flagged as a decompression bomb).
    :raises IngestorException: if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text

    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # NOTE(review): a warning class only reaches an ``except`` clause if
        # the warnings filter turns it into an error -- confirm that is
        # configured elsewhere.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None

    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    try:
        extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
        text = extractor.ocr_image(img)
    finally:
        # Always reset the engine, even when recognition fails.
        extractor.clear()

    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
def ocr_text(img):
    """Run English-language OCR over ``img`` and return the recognised text.

    ``img`` is converted with ``pil.Image.fromarray`` before being handed to
    the engine (presumably a NumPy-style array -- verify against callers).
    """
    engine = Tesseract(lang='eng')
    # Reset the freshly created engine before loading the image.
    engine.clear()
    engine.set_image(pil.Image.fromarray(img))
    return engine.get_text()
def ocr(img, idioma):
    """OCR ``img`` and return the recognised lines that contain text.

    :param img: image data accepted by ``Image.fromarray`` (presumably a
        NumPy-style array -- verify against callers).
    :param idioma: language code passed to Tesseract as ``lang``.
    :returns: list of non-empty lines holding at least one ASCII letter
        or digit, in the order they were recognised.
    """
    picture = Image.fromarray(img)
    engine = Tesseract(lang=idioma)
    engine.set_image(picture)

    has_alnum = re.compile('[a-zA-Z0-9]').search
    recognised = engine.get_utf8_text()

    kept = []
    for line in recognised.splitlines():
        # Drop blank lines and lines made up solely of noise characters.
        if line != '' and has_alnum(line):
            kept.append(line)

    engine.clear()
    return kept
def ocr_text(img):
    """OCR a single line of upper-case alphanumeric text from ``img``.

    The engine is configured with its word dictionaries switched off, a
    character whitelist of ``A-Z`` and ``0-9``, and single-line page
    segmentation.

    :param img: image data accepted by ``pil.Image.fromarray`` (presumably a
        NumPy-style array -- verify against callers).
    :returns: the recognised text as a ``unicode`` object.
    """
    tr = Tesseract(lang='eng')
    tr.clear()
    pil_image = pil.Image.fromarray(img)

    # Turn off OCR word dictionaries.
    # NOTE(review): some Tesseract builds only honour these at init time --
    # confirm they take effect when set here.
    tr.set_variable('load_system_dawg', "F")
    tr.set_variable('load_freq_dawg', "F")

    # Treat the image as a single text line. '-psm' is the command-line flag
    # spelling, not a parameter name; the engine variable is
    # 'tessedit_pageseg_mode'.
    tr.set_variable('tessedit_pageseg_mode', "7")
    tr.set_variable('tessedit_char_whitelist',
                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")

    tr.set_image(pil_image)
    utf8_text = tr.get_text()
    return unicode(utf8_text)