def test_image_to_alto_xml(test_file):
    """Check that image_to_alto_xml() yields bytes holding a full ALTO document."""
    raw = image_to_alto_xml(test_file)
    assert isinstance(raw, bytes)

    # Turn the bytes into text in a way that works on both Python 2 and 3.
    if IS_PYTHON_2:
        decoded = raw.decode('utf-8')
    else:
        decoded = str(raw, 'utf-8')
    xml_text = str(decoded).strip()

    # The document has to open with an XML declaration and close the root tag.
    assert xml_text.startswith('<?xml')
    assert xml_text.endswith('</alto>')
def _extract_lines(img: np.ndarray, contour: Contour, lang: str) -> List[alto.TextLine]:
    """Run Tesseract on the part of ``img`` selected by ``contour`` and return its lines.

    ``img`` and ``contour`` are handed to ``_truncate``; the result is
    recognised with language ``lang``, the ALTO output is passed through
    ``_decode`` and parsed, and the lines of the page returned by
    ``assert_one_page_and_get_it`` are extracted.
    """
    region = _truncate(img, contour)
    raw_alto = pytesseract.image_to_alto_xml(region, lang=lang)
    document = alto.parse(_decode(raw_alto))
    page = assert_one_page_and_get_it(document)
    return page.extract_lines()
def convert_file_to_xml(input_file=image_file):
    """
    Convert an image or DjVu file to ALTO XML.

    The lower-cased extension of ``input_file`` selects the conversion:

    * raster images are loaded with ``cv2.imread`` and recognised by
      Tesseract with ``lang='rus+eng'``;
    * ``.djvu`` / ``.djv`` files are passed to convert_djvu_to_xml();
    * ``.pdf`` is not implemented yet.

    :param input_file: path of the file to convert.
    :return: the ALTO XML produced by ``pytesseract.image_to_alto_xml`` or by
        convert_djvu_to_xml(); ``None`` for PDF files and for any extension
        that is not recognised.
    """
    # If image file:
    # Windows bitmaps - *.bmp, *.dib
    # JPEG files - *.jpeg, *.jpg, *.jpe
    # JPEG 2000 files - *.jp2
    # Portable Network Graphics - *.png
    # WebP - *.webp
    # Portable image format - *.pbm, *.pgm, *.ppm *.pxm, *.pnm
    # Sun rasters - *.sr, *.ras
    # TIFF files - *.tiff, *.tif
    # OpenEXR Image files - *.exr
    # Radiance HDR - *.hdr, *.pic
    #
    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated, which previously merged ".webp" and ".pbm" into
    # the single bogus entry ".webp.pbm".
    # NOTE(review): *.pxm and *.pnm are listed above but were never accepted
    # by this list - confirm whether they should be added.
    valid_image_extensions = (
        ".bmp", ".dib",
        ".jpeg", ".jpg", ".jpe",
        ".jp2",
        ".png",
        ".webp",
        ".pbm", ".pgm", ".ppm",
        ".sr", ".ras",
        ".tiff", ".tif",
        ".exr",
        ".hdr", ".pic",
    )

    extention = os.path.splitext(input_file)[1].lower()

    if extention in valid_image_extensions:
        # Tesseract command file in installation directory
        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
        # Load image
        image = cv2.imread(input_file)
        # Run tesseract, returning binary text ALTO xml
        alto_xml = pytesseract.image_to_alto_xml(image, lang='rus+eng')
        return alto_xml
    elif extention == ".pdf":
        pass  # TODO: pdf convertion function
    elif extention in (".djvu", ".djv"):
        alto_xml = convert_djvu_to_xml(input_file)
        return alto_xml
    else:
        pass
def _run_tesseract(page: str, lang: str) -> str:
    """Recognise ``page`` with Tesseract language ``lang``.

    The ALTO XML returned by pytesseract is passed through ``_decode``
    before being returned.
    """
    raw_alto = pytesseract.image_to_alto_xml(page, lang=lang)
    return _decode(raw_alto)
def _tesseract(page: Any) -> str:
    """Recognise ``page`` with the language configured in ``CONFIG.tesseract.lang``.

    The ALTO XML returned by pytesseract is passed through ``_decode``
    before being returned.
    """
    raw_alto = pytesseract.image_to_alto_xml(page, lang=CONFIG.tesseract.lang)
    return _decode(raw_alto)
def test_image_to_alto_xml_support(test_file):
    """Check that image_to_alto_xml() raises ALTONotSupported for ``test_file``."""
    expected_error = ALTONotSupported
    with pytest.raises(expected_error):
        image_to_alto_xml(test_file)
# Timeout/terminate the tesseract job after a period of time
try:
    # First allow 2 seconds, then only half a second.
    for time_limit in (2, 0.5):
        print(pytesseract.image_to_string('test.jpg', timeout=time_limit))
except RuntimeError:
    # Tesseract processing is terminated
    pass

# Get bounding box estimates
boxes_report = pytesseract.image_to_boxes(Image.open('test.png'))
print(boxes_report)

# Get verbose data including boxes, confidences, line and page numbers
data_report = pytesseract.image_to_data(Image.open('test.png'))
print(data_report)

# Get information about orientation and script detection
osd_report = pytesseract.image_to_osd(Image.open('test.png'))
print(osd_report)

# Get a searchable PDF (pdf type is bytes by default, hence the binary mode)
pdf = pytesseract.image_to_pdf_or_hocr('test.png', extension='pdf')
with open('test.pdf', 'w+b') as f:
    f.write(pdf)

# Get HOCR output
hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')

# Get ALTO XML output
xml = pytesseract.image_to_alto_xml('test.png')
# Single place for the sample image used by every step below.
image_path = '/home/moshe/Downloads/test.png'

# Get bounding box estimates
#boxes = pytesseract.image_to_boxes(Image.open('/home/moshe/Downloads/test.png'))

# Get verbose data including boxes, confidences, line and page numbers
img = cv2.imread(image_path)
d = pytesseract.image_to_data(img, lang="heb", output_type=Output.DICT)
print(d)

# Draw a green, 2 px rectangle for every entry reported by Tesseract.
n_boxes = len(d['level'])
for i in range(n_boxes):
    left, top = d['left'][i], d['top'][i]
    right, bottom = left + d['width'][i], top + d['height'][i]
    cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)

# Show the annotated image and wait for a key press.
cv2.imshow('img', img)
cv2.waitKey(0)

# Get information about orientation and script detection
osd_report = pytesseract.image_to_osd(Image.open(image_path))
print(osd_report)

# Get a searchable PDF (pdf type is bytes by default, hence the binary mode)
pdf = pytesseract.image_to_pdf_or_hocr(image_path, extension='pdf')
with open('test.pdf', 'w+b') as f:
    f.write(pdf)

# Get HOCR output
hocr = pytesseract.image_to_pdf_or_hocr(image_path, extension='hocr')

# Get ALTO XML output
xml = pytesseract.image_to_alto_xml(image_path)
import cv2
import pytesseract

# Tesseract executable in its installation directory.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

filename = r'C:\Data2\OCR\test_page.png'
output_file = r'C:\Data2\OCR\alto_page.xml'

# read the image
img = cv2.imread(filename)

# For testing:
# # Prepare image to OCR
# image_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# image_bin = cv2.adaptiveThreshold(image_gray,255,cv2.ADAPTIVE_THRESH_MEAN_C,cv2.THRESH_BINARY ,41,3)
# cv2.imshow("Original", img)
# cv2.imshow("Gray", image_gray)
# cv2.imshow("Bin", image_bin)

# run tesseract, returning binary text ALTO xml
alto_xml = pytesseract.image_to_alto_xml(img, lang='rus+eng')

# save output xml; binary mode because the ALTO document is bytes, and a
# context manager so the file is closed even when the write fails
with open(output_file, "wb") as f:
    f.write(alto_xml)

#input('pause…')
def image_to_alto_xml(filename):
    """Return whatever ``tesserakti.image_to_alto_xml`` produces for ``filename``."""
    # Disabled alternative: hand over an opened image instead of the file name.
    # return tesserakti.image_to_alto_xml(kuva.open(filename))
    return tesserakti.image_to_alto_xml(filename)
import io
import os
import sys
import pytesseract
import cv2

# Command-line OCR helper.
#
# Usage: <script> IMAGE_PATH OUT_TYPE
#   OUT_TYPE == '-t'  -> plain text written next to the image as <name>.txt
#   anything else     -> ALTO XML written next to the image as <name>.alto
if __name__ == '__main__':
    if len(sys.argv) == 3:
        img_path = sys.argv[1]
        outType = sys.argv[2]
        ext = '.txt' if outType == '-t' else '.alto'

        img = cv2.imread(img_path)
        # preprocessing
        # Read a parameter file to simplify the process

        # ocr
        # splitext() strips only the final extension. The previous
        # split('\.') looked for a literal backslash followed by a dot, never
        # matched an ordinary path, and produced names such as "page.png.txt".
        out_name = os.path.splitext(img_path)[0] + ext
        if ext == '.txt':
            txt = pytesseract.image_to_string(img)
            # Explicit UTF-8 so recognised non-ASCII text does not depend on
            # the platform's default encoding.
            with io.open(out_name, 'w', encoding='utf-8') as outFile:
                outFile.write(txt)
        else:  # alto
            alto = pytesseract.image_to_alto_xml(img)
            # The ALTO document is bytes, hence binary mode.
            with io.open(out_name, 'wb') as outFile:
                outFile.write(alto)
    else:
        # Report the problem instead of silently doing nothing.
        sys.stderr.write(
            "usage: {} IMAGE_PATH OUT_TYPE  (OUT_TYPE '-t' for text, "
            "anything else for ALTO XML)\n".format(sys.argv[0]))
import cv2
import pytesseract
from PIL import Image
import xml.etree.ElementTree as ET

# Tesseract executable in its installation directory.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# img = cv2.imread('page_1.jpg')
# print(pytesseract.image_to_string(img))
# print(pytesseract.image_to_boxes(Image.open('page_1.jpg')))
# print(pytesseract.image_to_data(Image.open('page_1.jpg')))
# print(pytesseract.image_to_osd(Image.open('page_1.jpg')))

x = pytesseract.image_to_alto_xml('page_1.jpg')

# image_to_alto_xml() returns the XML document itself, not a path to it.
# ET.parse() expects a file name or file object, so the document has to be
# parsed with ET.fromstring() instead.
tree = ET.ElementTree(ET.fromstring(x))
root = tree.getroot()

print(root.tag)
def main():
    """Locate the subtitle band of a video, OCR it frame by frame and save the results.

    The path of a YAML configuration file is taken from ``sys.argv[1]``. The
    keys read from it are ``detection``, ``recognition`` (``model``,
    ``weights``, ``dictionary``), ``video``, ``video_offset_start``,
    ``video_offset_end``, ``output_sub_video`` and ``output_sub_ocr``.

    For every frame from ``video_offset_start`` onwards the function writes
    the lower part of the frame to ``output_sub_video``, binarises the
    detected text span and runs Tesseract on it. The ALTO XML of each frame
    is collected and finally dumped as JSON to ``output_sub_ocr``.

    Returns None; returns early when the detected character width or
    character distance is 0.

    NOTE(review): this function was reconstructed from whitespace-collapsed
    source. Block nesting, and in particular the position of the ``continue``
    in the character-region loop, should be confirmed against the original
    file.
    """
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

    with open(sys.argv[1], "r") as config_file:
        cfg = yaml.safe_load(config_file)
    print(str(cfg))
    det_cfg = cfg["detection"]
    rec_cfg = cfg["recognition"]

    # basicConfig() leaves the root logger at WARNING, so progress messages
    # are logged at warning level to stay visible. logging.warning() replaces
    # the deprecated logging.warn() alias.
    logging.basicConfig(format="%(asctime)s %(module)-12s %(levelname)-8s %(message)s")
    logging.warning("Starting detection")

    detection = Detection(det_cfg)
    # The result is only referenced by the disabled loop header further down.
    found_frames = detection.detect_subtitle_region(cfg["video"])
    y_start, y_end = detection.get_subtitle_region()
    char_width = detection.get_char_width()
    char_dist = detection.get_char_dist()
    if char_width == 0 or char_dist == 0:
        logging.error("Char width is 0")
        return
    logging.warning(
        "Found y pos ({}, {}), character width {}, character distance {}".format(
            y_start, y_end, char_width, char_dist))

    recognition = Recognition(rec_cfg["model"], rec_cfg["weights"], rec_cfg["dictionary"])

    # NOTE(review): the loop has no break, so font/font2 end up holding the
    # result for the last entry of FONTS and cyk is False as soon as any
    # entry fails to load - confirm this is intended.
    cyk = True
    for index, f in enumerate(FONTS):
        font = load_font(f, char_width)
        font2 = load_font(f, char_width // 2)
        if font is None:
            logging.error("No CYK font found")
            cyk = False
        else:
            logging.warning("Loaded font {}".format(FONTS[index]))

    cap = cv2.VideoCapture(cfg["video"])
    save_image_seq = cfg["video_offset_start"]
    save_image_seq_end = cfg["video_offset_end"]
    cap.set(cv2.CAP_PROP_POS_FRAMES, save_image_seq)

    # NOTE(review): frame size is hard-coded; assumes a 1920x1080 source and
    # 120 extra rows above the subtitle band - TODO confirm.
    vout = cv2.VideoWriter(cfg["output_sub_video"], cv2.VideoWriter_fourcc(*'mp4v'), 29.97,
                           (1920, 1080 - y_start + 120))

    custom_config = r'--psm 7 -l chi_sim'
    frames_ocr = {}

    # try/finally so the capture and the writer are released even when
    # detection or OCR raises in the middle of the video.
    try:
        vout.set(cv2.VIDEOWRITER_PROP_QUALITY, 0.1)
        print(vout)

        #for frame in found_frames:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            text = []
            img = Image.fromarray(frame)
            draw = ImageDraw.Draw(img)

            # Horizontal extent covered by all character regions of this frame.
            x_start = 1920
            x_end = 0
            for char_region, start, stop in detection.detect_char_regions(
                    frame[y_start:y_end, ],
                    save_image=False,
                    save_image_name="fill/seq_{}_{:06d}.tiff".format("{}", save_image_seq)):
                x_start = min(x_start, start)
                x_end = max(x_end, stop)
                # NOTE(review): reconstructed as an unconditional continue,
                # which leaves the per-character recognition below
                # unreachable and `text` empty - confirm against the
                # original indentation.
                continue
                res = recognition.recognize_character(char_region)
                text.append((start, stop, res[1], res[2]))
                logging.warning(
                    "Detected Region {} {} in ({} {})".format(start, stop, y_start, y_end))

            save_image_seq += 1
            if save_image_seq > save_image_seq_end:
                break

            for start, stop, char, prob in text:
                # Three nested outlines give a 3 px thick rectangle.
                draw.rectangle([(start, y_start), (stop, y_end)], outline=RECTANGLE_COLOR)
                draw.rectangle([(start + 1, y_start + 1), (stop - 1, y_end - 1)],
                               outline=RECTANGLE_COLOR)
                draw.rectangle([(start + 2, y_start + 2), (stop - 2, y_end - 2)],
                               outline=RECTANGLE_COLOR)
                probability = str(int(prob * 100)) + "%"
                if cyk:
                    draw.text((start, y_start - (stop - start)), char,
                              fill=FONT_COLOR, font=font)
                    draw.text((start, y_start - 1.5 * (stop - start)), probability,
                              fill=FONT_COLOR, font=font2)
                #logging.warn("Detected character {} ({})".format(char, probability))

            #cv2.imshow('image', np.array(img))
            #cv2.resizeWindow('image', int(1920/2), int(1080/2))
            #cv2.waitKey(0)
            #cv2.destroyAllWindows()
            vout.write(frame[y_start-120:1080, ])

            if x_start < x_end:
                gray = cv2.cvtColor(frame[y_start:y_end, x_start:x_end], cv2.COLOR_BGR2GRAY)
                #gray = img
                # threshold (only the image is needed; named `binary` so the
                # builtin bin() is not shadowed)
                _, binary = cv2.threshold(gray, 245, 255, cv2.THRESH_BINARY)
                # closing
                kernel = np.ones((3, 3), np.uint8)
                closing = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
                # invert black/white
                inv = cv2.bitwise_not(closing)
                img_rgb = cv2.cvtColor(inv, cv2.COLOR_GRAY2RGB)
                #print(img_rgb)
                data_xml = pytesseract.image_to_alto_xml(img_rgb, config=custom_config)
                # Decode once and reuse the text for both the log and the result.
                ocr_text = data_xml.decode('utf-8')
                print(str(save_image_seq) + " " + ocr_text)
                #print(str(i) + " " + json.dumps(data_xml.decode('utf-8')))
                frames_ocr[save_image_seq] = ocr_text
    finally:
        cap.release()
        vout.release()

    with open(cfg['output_sub_ocr'], 'w') as outfile:
        json.dump(frames_ocr, outfile, sort_keys=True, indent=2)