def lambda_handler(event, context):
    """Run OCR on the S3 object named in the first record of ``event``.

    Copies ``tesseract`` and ``test.png`` from the working directory into
    ``tmp_dir``, makes the directories and binary accessible, downloads the
    object via ``download_file`` and OCRs it twice: once through the
    ``tesseract`` helper and once through tesserocr's ``PyTessBaseAPI``.
    Progress and results are printed.

    Args:
        event: S3 notification event; only ``event['Records'][0]['s3']`` is
            read.
        context: Lambda context object (unused).

    Returns:
        None.
    """
    s3_info = event['Records'][0]['s3']
    bucket = s3_info['bucket']['name']
    # NOTE(review): urllib.unquote_plus(...).decode(...) is the Python 2
    # spelling; confirm the target runtime before porting this line.
    key = urllib.unquote_plus(s3_info['object']['key']).decode('utf8')

    shutil.copyfile("tesseract", tmp_dir + '/tesseract')
    shutil.copyfile("test.png", tmp_dir + '/test.png')
    # 0o755 is the same mode as the legacy literal 0755, but it parses on
    # both Python 2.6+ and Python 3.
    os.chmod(tmp_dir + "/tesseract", 0o755)
    os.chmod(tmp_dir, 0o755)
    os.chmod('/tmp', 0o755)

    print("before image file from s3")
    image_file = download_file(bucket, key)
    print("before image file to PIL")
    print("before OCR")
    # The helper's return value was never used; it is called for its effects.
    tesseract(image_file)

    print("Print files in firectory")
    for entry in os.listdir('/tmp'):
        print(entry)

    try:
        print("before PyTessBaseAPI set 2")
        # The context manager releases the Tesseract handle even on failure.
        with PyTessBaseAPI(path=os.path.join(SCRIPT_DIR, 'tessdata'),
                           lang='eng', psm=PSM.AUTO_OSD) as api:
            print("After API set")
            api.SetImageFile(image_file)
            print("After API set image")
            print("TEXT from tesserocr: %s" % api.GetUTF8Text())
            print("CONFIDENCE from tesserocr: %s" % api.AllWordConfidences())
    except Exception as exc:
        # Still best-effort (the handler must not crash), but the failure is
        # now reported instead of being silently discarded.
        print("tesserocr OCR failed: %r" % (exc,))
class TextExtractor:
    """Context-managed wrapper around a ``PyTessBaseAPI`` bound to one image.

    Use as ``with TextExtractor(path) as te: text, conf = te.extract()``;
    leaving the ``with`` block calls :meth:`close`.
    """

    def __init__(self, image_path, seg_mode=PSM.SPARSE_TEXT):
        """Create the API, set the page segmentation mode and load the image.

        Args:
            image_path: Path of the image file handed to ``SetImageFile``.
            seg_mode: Page segmentation mode passed to ``SetPageSegMode``.
        """
        self.api = PyTessBaseAPI()
        self.api.SetPageSegMode(seg_mode)
        self.api.SetImageFile(image_path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _extract(self) -> Tuple:
        """Return ``(text, mean_confidence)`` for the current recognition area."""
        text = self.api.GetUTF8Text()
        conf = self.api.MeanTextConf()
        return text, conf

    def _extract_from_rect(self, x, y, w, h) -> Tuple:
        """Restrict recognition to the given rectangle, then extract."""
        # NOTE(review): the rectangle presumably stays active on the API for
        # later calls until another image is set; confirm whether a following
        # extract() without arguments should reset it.
        self.api.SetRectangle(x, y, w, h)
        return self._extract()

    def extract(self, x=None, y=None, w=None, h=None) -> Tuple:
        """Extract text and confidence, optionally from a sub-rectangle.

        Args:
            x, y, w, h: Rectangle origin and size. The rectangle is used only
                when all four are given; otherwise the current recognition
                area is used.

        Returns:
            Tuple of ``(text, mean_confidence)``.
        """
        # Compare against None explicitly: 0 is a valid coordinate (e.g. a
        # rectangle anchored at the image origin) and must not count as
        # "not provided".
        if any(value is None for value in (x, y, w, h)):
            return self._extract()
        return self._extract_from_rect(x, y, w, h)

    def close(self):
        """Release the underlying Tesseract API."""
        self.api.End()
def __init__(self, image_file, tessdata):
    """Build an API for ``image_file`` with table finding on and recognise it.

    Args:
        image_file: Path of the image handed to ``SetImageFile``.
        tessdata: Value passed as the ``path`` argument of ``PyTessBaseAPI``.
    """
    engine = PyTessBaseAPI(path=tessdata, psm=PSM.AUTO_OSD)
    engine.SetImageFile(image_file)
    # Both table-related switches are set to "T" before recognition runs.
    for option in ("textord_tablefind_recognize_tables",
                   "textord_tabfind_find_tables"):
        engine.SetVariable(option, "T")
    engine.Recognize()
    # Only a fully prepared API is published on the instance.
    self.api = engine
def get_lines(filename):
    """Return the OCR'd text lines of an image, dropping very short ones.

    Args:
        filename (str): Relative or absolute path of the image file.

    Returns:
        list: Recognised lines, each still ending with its newline character.
        Lines shorter than 5 characters (newline included) are skipped, and
        so is any trailing text that is not terminated by a newline.
        According to the original author, each line holds the stop times of
        one trip (format ``H:MM``).
    """
    # The context manager releases the Tesseract handle once the text is read.
    with PyTessBaseAPI() as api:
        api.SetImageFile(filename)
        text = api.GetUTF8Text()

    # split() leaves the unterminated remainder (possibly empty) as the last
    # piece; it is discarded, exactly as the character loop used to do.
    terminated = text.split("\n")[:-1]
    # Minimum length is 5 including the newline, i.e. 4 visible characters.
    return [piece + "\n" for piece in terminated if len(piece) + 1 >= 5]
def handler(event, context):
    """OCR the fixed file ``sample.jpg`` and return the recognised text.

    The text and the per-word confidences are logged at INFO level.

    Args:
        event: Invocation event (unused).
        context: Invocation context (unused).

    Returns:
        The text returned by ``GetUTF8Text``.
    """
    # Release the Tesseract handle deterministically, even if OCR raises.
    with PyTessBaseAPI() as api:
        api.SetImageFile("sample.jpg")
        txt = api.GetUTF8Text()
        logging.info(txt)
        logging.info(api.AllWordConfidences())
    return txt
def __init__(self, image_file, tessdata, oem_mode):
    """Create an API for ``image_file`` using the requested OCR engine.

    Args:
        image_file: Path of the image handed to ``SetImageFile``.
        tessdata: Value passed as the ``path`` argument of ``PyTessBaseAPI``.
        oem_mode: ``"v3"`` selects ``OEM.TESSERACT_ONLY``; any other value
            selects ``OEM.LSTM_ONLY``.
    """
    engine_mode = OEM.TESSERACT_ONLY if oem_mode == "v3" else OEM.LSTM_ONLY
    ocr = PyTessBaseAPI(path=tessdata, psm=PSM.AUTO_OSD, oem=engine_mode)
    ocr.SetImageFile(image_file)
    self.api = ocr
def orientation_stuff():
    """Detect and print orientation and script of a fixed sample image.

    Uses an OSD-only ``PyTessBaseAPI`` with ``MY_TESSDATA_PATH`` on a
    hard-coded ``eurotext.tif`` path.
    """
    with PyTessBaseAPI(psm=PSM.OSD_ONLY, path=MY_TESSDATA_PATH) as api2:
        api2.SetImageFile('/home/johannes/Repos/tesseract/testing/eurotext.tif')
        # Author's note (translated): both DetectOS() and
        # DetectOrientationScript() caused a 'segmentation fault (core
        # dumped)' in the original environment.
        osd = api2.DetectOrientationScript()

    # presumably None signals a failed detection -- verify against the
    # installed tesserocr version.
    if osd is None:
        print("Orientation and script detection failed")
        return

    # DetectOrientationScript() reports orient_deg / orient_conf /
    # script_name / script_conf; the previous placeholders were the keys of
    # DetectOS() and raised KeyError with this result.
    print(
        "Orientation: {orient_deg}\nOrientation confidence: {orient_conf}\n Script: {script_name}\nScript confidence: {script_conf}"
        .format(**osd))
def getText(self):
    """Return classifier output plus OCR text for ``self.document_text``.

    The bytes in ``self.document_text`` are written to ``test.png`` in the
    working directory. ``classify_image.py`` is run on that file through a
    shell and its stdout is decoded as UTF-8. The Tesseract text for the same
    file is then appended.

    Returns:
        The classifier output followed by the OCR text.
    """
    with open("test.png", "wb") as image_out:
        image_out.write(self.document_text)

    try:
        # The command line is a fixed string; no caller data reaches the shell.
        out, _ = Popen(
            'python ../../models/tutorials/image/imagenet/classify_image.py --image_file test.png',
            shell=True, stdout=PIPE).communicate()
        text = out.decode("utf-8")

        with PyTessBaseAPI() as api:
            api.SetImageFile("test.png")
            text += api.GetUTF8Text()
    finally:
        # Remove the scratch file even when classification or OCR fails.
        os.remove("test.png")
    return text
def preprocess_title(filename):
    """Return the first text line of an image with at least 5 usable characters.

    Each text-line component is OCR'd on its own. The matches of ``alpha_re``
    in the stripped result are joined with single spaces, and the first
    joined string of length 5 or more becomes the title.

    Args:
        filename: Path of the image handed to ``SetImageFile``.

    Returns:
        The title string, or ``''`` when no line qualifies.
    """
    title = ''
    # The context manager releases the Tesseract handle when the scan ends.
    with PyTessBaseAPI() as api:
        api.SetImageFile(filename)
        components = api.GetComponentImages(RIL.TEXTLINE, True)
        # Only the bounding box of each component is needed.
        for _, box, _, _ in components:
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            recognised = api.GetUTF8Text()
            candidate = ' '.join(alpha_re.findall(recognised.strip()))
            if len(candidate) >= 5:
                title = candidate
                break

    if title:
        logger.info("%s: %s", filename, title)
    return title
def getText(self):
    """Return the text of the PDF held in ``self.document_text``.

    The bytes are written to ``test.pdf`` in the working directory. The text
    is first extracted with the ``PDFPageInterpreter`` / ``TextConverter``
    pipeline and returned with all newlines removed. If that raises, the PDF
    is rendered to PNG and the pages are OCR'd with Tesseract instead.

    Returns:
        The extracted text.
    """
    with open("test.pdf", "wb") as pdf_out:
        pdf_out.write(self.document_text)

    try:
        try:
            return self._pdf_text_layer()
        except Exception:
            # Any extraction failure falls back to OCR. Unlike a bare
            # ``except:`` this lets KeyboardInterrupt/SystemExit propagate.
            return self._ocr_rendered_pages()
    finally:
        # The scratch PDF is removed on every path, including the fallback.
        if os.path.exists("test.pdf"):
            os.remove("test.pdf")

def _pdf_text_layer(self):
    """Extract the text of ``test.pdf`` and strip its newlines."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        with open("test.pdf", 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before it is closed below.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return "".join(text.split("\n"))

def _ocr_rendered_pages(self):
    """Render ``test.pdf`` to ``kek*.png`` and OCR every rendered page."""
    with Image(filename="test.pdf") as img:
        img.save(filename="kek.png")

    rendered = [name for name in os.listdir(os.curdir)
                if name.endswith(".png") and name.startswith("kek")]
    # presumably a multi-page PDF is saved as kek-0.png, kek-1.png, ...;
    # ordering by (length, name) then yields page order instead of the
    # arbitrary order of os.listdir -- verify against the renderer in use.
    rendered.sort(key=lambda name: (len(name), name))

    text = ""
    # One API instance serves all pages and is released afterwards.
    with PyTessBaseAPI() as api:
        for name in rendered:
            try:
                api.SetImageFile(name)
                text += api.GetUTF8Text()
            finally:
                os.remove(name)
    return text
def tess_ocr(img):
    """Get text from an image.

    Recognition uses the ``chi_sim`` language data with automatic page
    segmentation plus orientation/script detection (``PSM.AUTO_OSD``).

    Args:
        img: The file path of image.

    Returns:
        A string.

    Raises:
        IOError: An error occurred accessing the img object.
    """
    # tesserocr is imported and used inside c_locale(); presumably the
    # library requires the C locale while it initialises -- confirm with the
    # helper's definition.
    with c_locale():
        from tesserocr import PyTessBaseAPI, PSM

        # The context manager ends the API even if loading or OCR raises;
        # the explicit End() call used to be skipped on errors.
        with PyTessBaseAPI(lang='chi_sim', psm=PSM.AUTO_OSD) as api:
            api.SetImageFile(img)
            return api.GetUTF8Text()
def runTessTest(folderpath):
    """Measure single-character OCR accuracy over the PNG files of a folder.

    The expected label of each image is the part of its file name between the
    last ``_`` and the 4-character extension (``foo_a.png`` -> ``a``). Each
    image is recognised with the ``frk`` language data in page segmentation
    mode 10, and the right-stripped result is compared with the label.

    Args:
        folderpath: Directory containing the ``*png`` test images.

    Returns:
        Tuple ``(accuracy, accuracyDict)``: the overall fraction of correct
        predictions, and a dict mapping each label to its own fraction
        rounded to 3 decimals. ``(0.0, {})`` when the folder has no images.
    """
    images = sorted(name for name in os.listdir(folderpath)
                    if name.endswith('png'))
    if not images:
        # Nothing to evaluate; this also avoids a division by zero below.
        return (0.0, {})

    actualLabels = [name[name.rfind('_') + 1:-4] for name in images]
    countDict = dict(Counter(actualLabels))
    correctDict = {label: 0 for label in countDict}

    correctCount = 0
    # The context manager releases the Tesseract handle after the run.
    with PyTessBaseAPI(lang='frk', psm=10) as api:
        for name, label in zip(images, actualLabels):
            # listdir() yields bare names; resolve them against the folder so
            # the function works from any working directory.
            api.SetImageFile(os.path.join(folderpath, name))
            if api.GetUTF8Text().rstrip() == label:
                correctCount += 1
                correctDict[label] += 1

    # float() keeps true division on Python 2 as well.
    accuracy = float(correctCount) / len(images)
    accuracyDict = {label: round(float(correctDict[label]) / total, 3)
                    for label, total in countDict.items()}
    return (accuracy, accuracyDict)
class OCREngine:
    """Tesseract wrapper that turns an image into a ``DocumentGraph`` of words.

    Usable as a context manager; leaving the ``with`` block closes the API.
    """

    def __init__(self, psm: int = 3, config: dict = None):
        """Create the API and apply Tesseract variables.

        Args:
            psm: Page segmentation mode passed to ``PyTessBaseAPI``.
            config: Mapping of Tesseract variable names to values, applied
                with ``SetVariable``. ``None`` means no variables.
        """
        # A fresh dict per call replaces the shared mutable default.
        config = {} if config is None else config
        logging.info('Initializing OCR engine with PSM=%d and configs=%s' % (psm, config))
        self.api = PyTessBaseAPI(psm=psm)
        for key, value in config.items():
            self.api.SetVariable(key, value)
        logging.debug('OCR engine initialized')

    def build_graph(self, image_path: str, scheme: str = None) -> DocumentGraph:
        """OCR ``image_path`` and wrap the recognised words in a graph.

        Args:
            image_path: Path of the image to read.
            scheme: Coordinate scheme forwarded to ``_get_words``.

        Returns:
            A ``DocumentGraph`` built from the extracted word nodes.
        """
        hocr = self._get_hocr(image_path)
        words = self._get_words(hocr, scheme)
        return DocumentGraph(words)

    def _get_hocr(self, image_path: str) -> str:
        """Return the hOCR markup Tesseract produces for ``image_path``."""
        logging.info('Reading to hOCR from image: %s' % image_path)
        self.api.SetImageFile(image_path)
        hocr_text = self.api.GetHOCRText(0)
        logging.debug('Image read')
        return hocr_text

    def _get_words(self, hocr: str, scheme: str = None):
        """Parse hOCR markup into a list of word nodes.

        Args:
            hocr: hOCR markup.
            scheme: Coordinate scheme; defaults to ``'xyxy'`` with a warning.

        Returns:
            List of nodes for every ``.ocrx_word`` element that parsed;
            malformed elements are left out.
        """
        logging.info('Extracting words from hOCR.')
        if scheme is None:
            logging.warning('No scheme specified. Assuming xyxy')
            scheme = 'xyxy'
        soup = BeautifulSoup(hocr, 'html.parser')
        word_tags = soup.select('.ocrx_word')
        candidates = (self._make_node(tag, scheme=scheme) for tag in word_tags)
        # _make_node signals a malformed tag with None.
        return [node for node in candidates if node is not None]

    def _make_node(self, tag: dict, scheme: str) -> WordNode:
        """Build a ``WordNode`` from one hOCR word element.

        The element's ``title`` must consist of exactly two ``;``-separated
        fields: the first carries the box coordinates after its leading
        token, the second the confidence as its second token.

        Returns:
            The node, or ``None`` when the title does not have two fields.
        """
        fields = tag['title'].split(';')
        if len(fields) != 2:
            # logging.warn is a deprecated alias (removed in Python 3.13).
            logging.warning('Malformed tag: %s. Skipping.' % tag)
            return None
        word = tag.text
        coordinates = tuple(map(int, fields[0].split()[1:]))
        conf = int(fields[1].split()[1])
        wn = WordNode(word, WordNode.convert_coords(coordinates, scheme), conf)
        logging.debug('Made word: %s' % repr(wn))
        return wn

    def close(self):
        """Release the underlying Tesseract API."""
        self.api.End()
        logging.debug('OCR engine closed')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Report an in-flight exception, then always close. Returning None
        # lets that exception propagate.
        if exc_type:
            print("type: %s\nvalue: %s\ntrace: %s" % (exc_type, exc_value, traceback))
        self.close()
def __init__(self, image_file, tessdata):
    """Remember the inputs and open an OSD-only API on the image.

    Args:
        image_file: Path of the image handed to ``SetImageFile``.
        tessdata: Value passed as the ``path`` argument of ``PyTessBaseAPI``.
    """
    self.image_file, self.tessdata = image_file, tessdata
    osd_api = PyTessBaseAPI(path=tessdata, psm=PSM.OSD_ONLY)
    osd_api.SetImageFile(image_file)
    self.api = osd_api
def run_tesseract(image_file):
    """OCR ``image_file`` and return its layout as nested dictionaries.

    The API is created with ``PSM.AUTO_OSD`` (using the module-level
    ``tessdata`` path when it is truthy) and both table-finding variables
    set to ``"T"``.

    Args:
        image_file: Path of the image handed to ``SetImageFile``.

    Returns:
        dict: ``{}`` when layout analysis or the result iterator is
        unavailable. Otherwise a document with the keys ``orientation``,
        ``writing_direction``, ``text_direction``, ``deskew_angle`` and
        ``blocks``. Blocks contain ``paragraphs``, which contain ``lines``,
        which contain ``words``; every level carries its bounding box, OCR
        text and confidence.
    """
    if tessdata:
        api = PyTessBaseAPI(path=tessdata, psm=PSM.AUTO_OSD)
    else:
        api = PyTessBaseAPI(psm=PSM.AUTO_OSD)
    api.SetImageFile(image_file)
    api.SetVariable("textord_tablefind_recognize_tables", "T")
    api.SetVariable("textord_tabfind_find_tables", "T")
    # NOTE(review): Recognize() runs here and again after AnalyseLayout();
    # confirm that both passes are required.
    api.Recognize()
    document = {}
    it = api.AnalyseLayout()
    if it is not None:
        orientation, direction, order, deskew_angle = it.Orientation()
        api.Recognize()
        ri = api.GetIterator()
        if ri is not None:
            document = {
                "orientation": orientation,
                "writing_direction": direction,
                "text_direction": order,
                "deskew_angle": deskew_angle,
                "blocks": []
            }
            # One pass of this loop consumes one block. The inner loops walk
            # its paragraphs, lines and words with the same shared iterator.
            while ri.IsAtBeginningOf(RIL.BLOCK):
                block = {
                    "block_type": ri.BlockType(),
                    "block_type_str": BlockType[ri.BlockType()],
                    "box": ri.BoundingBox(RIL.BLOCK),
                    "ocr_text": ri.GetUTF8Text(RIL.BLOCK),
                    "confidence": ri.Confidence(RIL.BLOCK),
                    "paragraphs": []
                }
                break_para = False
                while True:
                    # The "last element" flag is read before descending,
                    # because the inner loops advance the iterator.
                    if ri.IsAtFinalElement(RIL.BLOCK, RIL.PARA):
                        break_para = True
                    break_line = False
                    paragraph = {
                        "box": ri.BoundingBox(RIL.PARA),
                        "ocr_text": ri.GetUTF8Text(RIL.PARA),
                        "paragraph_info": list(ri.ParagraphInfo()),
                        "confidence": ri.Confidence(RIL.PARA),
                        "lines": []
                    }
                    while True:
                        if ri.IsAtFinalElement(RIL.PARA, RIL.TEXTLINE):
                            break_line = True
                        break_word = False
                        line = {
                            "box": ri.BoundingBox(RIL.TEXTLINE),
                            "ocr_text": ri.GetUTF8Text(RIL.TEXTLINE),
                            "confidence": ri.Confidence(RIL.TEXTLINE),
                            "words": []
                        }
                        while True:
                            word = {
                                "box": ri.BoundingBox(RIL.WORD),
                                "ocr_text": ri.GetUTF8Text(RIL.WORD),
                                "confidence": ri.Confidence(RIL.WORD),
                                "attributes": ri.WordFontAttributes()
                            }
                            if ri.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
                                break_word = True
                            # The final word is stored before the loop exits.
                            line["words"].append(word)
                            if break_word:
                                break
                            ri.Next(RIL.WORD)
                        paragraph["lines"].append(line)
                        if break_line:
                            break
                        ri.Next(RIL.TEXTLINE)
                    block["paragraphs"].append(paragraph)
                    if break_para:
                        break
                    ri.Next(RIL.PARA)
                document["blocks"].append(block)
                # Move to the next block; the while condition decides whether
                # another block follows.
                ri.Next(RIL.BLOCK)
    return document
from tesserocr import file_to_text
from tesserocr import PyTessBaseAPI

# Save the thresholded version of the sample image, then OCR that file with
# the Russian language data.
# The language is passed to the constructor instead of re-invoking __init__
# on an already constructed object, and the context manager releases the API.
with PyTessBaseAPI(lang="rus") as api:
    api.SetImageFile("images/okey-micro.jpg")
    api.GetThresholdedImage().save("dfd.png")

# The function-call form of print works on both Python 2 and Python 3.
print(file_to_text("dfd.png", lang="rus"))
# for img in images: # api.SetImageFile(img) # print( api.GetUTF8Text()) # print( api.AllWordConfidences()) img = Image.open( glo.DATA_FOLDER + '/number_range_predictorcropped3.png' ) #glo.UNCLASSIFIED_GLOBAL_CAPTURES_FOLDER + '/fullcapture961 .png') #img = img.convert('L') from tesserocr import PyTessBaseAPI, RIL, iterate_level, PSM #print(help(tesserocr)) api = PyTessBaseAPI() api.Init() api.SetImageFile(glo.DATA_FOLDER + '/number_range_predictorcropped3.png') api.SetVariable("tessedit_pageseg_mode", "7") api.SetVariable("language_model_penalty_non_dict_word", "0") api.SetVariable("doc_dict_enable", "0") print("recognized txt:", api.GetUTF8Text().encode('utf-8').strip()) #api.Recognize() """ ri = api.GetIterator() level = RIL.SYMBOL for r in iterate_level(ri, level): symbol = r.GetUTF8Text(level) # r == ri conf = r.Confidence(level) print(u'symbol {}, conf: {}'.format(symbol, conf).encode('utf-8').strip()) indent = False ci = r.GetChoiceIterator() for c in ci: