Ejemplo n.º 1
0
def test_image_to_alto_xml(test_file):
    """Check that ALTO output is bytes holding a complete ``<alto>`` document."""
    raw = image_to_alto_xml(test_file)
    assert isinstance(raw, bytes)

    # Decode with whichever spelling the running interpreter supports.
    if IS_PYTHON_2:
        text = raw.decode('utf-8')
    else:
        text = str(raw, 'utf-8')
    text = str(text).strip()

    # The document must open with an XML declaration and close the root tag.
    assert text.startswith('<?xml')
    assert text.endswith('</alto>')
def _extract_lines(img: np.ndarray, contour: Contour,
                   lang: str) -> List[alto.TextLine]:
    """OCR the part of ``img`` selected by ``contour`` and return its lines.

    The image is reduced with ``_truncate``, run through Tesseract in ALTO
    mode using ``lang``, and the decoded XML is parsed with ``alto.parse``.
    The lines come from the page returned by ``assert_one_page_and_get_it``.
    """
    cropped = _truncate(img, contour)
    raw_xml = pytesseract.image_to_alto_xml(cropped, lang=lang)
    document = alto.parse(_decode(raw_xml))
    page = assert_one_page_and_get_it(document)
    return page.extract_lines()
Ejemplo n.º 3
0
def convert_file_to_xml(input_file=image_file):
    """
    Convert an image or DjVu file to ALTO XML.

    The (case-insensitive) file extension selects the conversion:
    image files are loaded with OpenCV and recognised by Tesseract with
    the ``rus+eng`` languages, DjVu files are handed to
    convert_djvu_to_xml(), and PDF conversion is not implemented yet.

    :param input_file: path of the file to convert.
    :return: the ALTO XML produced by the selected converter, or ``None``
        for PDF files and unsupported extensions.
    """

    # Extensions accepted as images (formats readable by cv2.imread):
    # Windows bitmaps - *.bmp, *.dib
    # JPEG files - *.jpeg, *.jpg, *.jpe
    # JPEG 2000 files - *.jp2
    # Portable Network Graphics - *.png
    # WebP - *.webp
    # Portable image format - *.pbm, *.pgm, *.ppm *.pxm, *.pnm
    # Sun rasters - *.sr, *.ras
    # TIFF files - *.tiff, *.tif
    # OpenEXR Image files - *.exr
    # Radiance HDR - *.hdr, *.pic
    valid_image_extensions = (
        ".bmp", ".dib",
        ".jpeg", ".jpg", ".jpe",
        ".jp2",
        ".png",
        # A comma was missing here, which silently merged ".webp" and ".pbm"
        # into the single entry ".webp.pbm".
        ".webp",
        ".pbm", ".pgm", ".ppm", ".pxm", ".pnm",
        ".sr", ".ras",
        ".tiff", ".tif",
        ".exr",
        ".hdr", ".pic",
    )
    djvu_extensions = (".djvu", ".djv")

    extension = os.path.splitext(input_file)[1].lower()

    if extension in valid_image_extensions:
        # Tesseract command file in installation directory
        # NOTE(review): hard-coded Windows path; other platforms need a
        # different value here.
        pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
        # Load image
        image = cv2.imread(input_file)
        # Run tesseract, returning binary text ALTO xml
        return pytesseract.image_to_alto_xml(image, lang='rus+eng')

    if extension in djvu_extensions:
        return convert_djvu_to_xml(input_file)

    # TODO: pdf conversion function. Until then ".pdf" is treated like any
    # other unsupported extension.
    return None
Ejemplo n.º 4
0
def _run_tesseract(page: str, lang: str) -> str:
    """Run Tesseract on ``page`` with ``lang`` and return the ALTO XML.

    The raw result of ``pytesseract.image_to_alto_xml`` is passed through
    ``_decode`` before being returned.
    """
    alto_output = pytesseract.image_to_alto_xml(page, lang=lang)
    return _decode(alto_output)
Ejemplo n.º 5
0
def _tesseract(page: Any) -> str:
    """Return the ALTO XML Tesseract produces for ``page``.

    The OCR language is read from ``CONFIG.tesseract.lang`` and the raw
    result is passed through ``_decode``.
    """
    language = CONFIG.tesseract.lang
    alto_output = pytesseract.image_to_alto_xml(page, lang=language)
    return _decode(alto_output)
Ejemplo n.º 6
0
def test_image_to_alto_xml_support(test_file):
    """Expect ``image_to_alto_xml`` to raise ``ALTONotSupported`` here."""
    # Call form of pytest.raises: invokes the function with the given
    # argument and fails unless the expected exception is raised.
    pytest.raises(ALTONotSupported, image_to_alto_xml, test_file)
Ejemplo n.º 7
0
# Timeout/terminate the tesseract job after a period of time
try:
    print(pytesseract.image_to_string('test.jpg',
                                      timeout=2))  # Timeout after 2 seconds
    print(pytesseract.image_to_string(
        'test.jpg', timeout=0.5))  # Timeout after half a second
except RuntimeError as timeout_error:
    # Tesseract processing is terminated; report it rather than dropping
    # the error silently, then continue with the remaining examples.
    print('Tesseract timed out: {}'.format(timeout_error))

# Open the sample image once and release the file handle when done,
# instead of leaving three separately opened images unclosed.
with Image.open('test.png') as image:
    # Get bounding box estimates
    print(pytesseract.image_to_boxes(image))

    # Get verbose data including boxes, confidences, line and page numbers
    print(pytesseract.image_to_data(image))

    # Get information about orientation and script detection
    print(pytesseract.image_to_osd(image))

# Get a searchable PDF
pdf = pytesseract.image_to_pdf_or_hocr('test.png', extension='pdf')
with open('test.pdf', 'w+b') as f:
    f.write(pdf)  # pdf type is bytes by default

# Get HOCR output
hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')

# Get ALTO XML output
xml = pytesseract.image_to_alto_xml('test.png')
Ejemplo n.º 8
0
# Single place to change the sample image used by every step below.
image_path = '/home/moshe/Downloads/test.png'

# Get bounding box estimates
#boxes = pytesseract.image_to_boxes(Image.open('/home/moshe/Downloads/test.png'))

# Get verbose data including boxes, confidences, line and page numbers
img = cv2.imread(image_path)
d = pytesseract.image_to_data(img, lang="heb", output_type=Output.DICT)
print(d)

# Draw a green rectangle around every element Tesseract reported; the
# 'left'/'top'/'width'/'height' lists are walked in lockstep.
for x, y, w, h in zip(d['left'], d['top'], d['width'], d['height']):
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

# Show the annotated image and block until a key is pressed.
cv2.imshow('img', img)
cv2.waitKey(0)

# Get information about orientation and script detection
# (the image file is closed again as soon as the call returns)
with Image.open(image_path) as pil_image:
    print(pytesseract.image_to_osd(pil_image))

# Get a searchable PDF
pdf = pytesseract.image_to_pdf_or_hocr(image_path, extension='pdf')
with open('test.pdf', 'w+b') as f:
    f.write(pdf)  # pdf type is bytes by default

# Get HOCR output
hocr = pytesseract.image_to_pdf_or_hocr(image_path, extension='hocr')

# Get ALTO XML output
xml = pytesseract.image_to_alto_xml(image_path)
Ejemplo n.º 9
0
import cv2
import pytesseract

# Tesseract executable in its installation directory (Windows path).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

filename = r'C:\Data2\OCR\test_page.png'
output_file = r'C:\Data2\OCR\alto_page.xml'

# read the image
img = cv2.imread(filename)

# For testing:
# # Prepare image to OCR
# image_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# image_bin = cv2.adaptiveThreshold(image_gray,255,cv2.ADAPTIVE_THRESH_MEAN_C,cv2.THRESH_BINARY ,41,3)
# cv2.imshow("Original", img)
# cv2.imshow("Gray", image_gray)
# cv2.imshow("Bin", image_bin)

# run tesseract, returning binary text ALTO xml
alto_xml = pytesseract.image_to_alto_xml(img, lang='rus+eng')

# save output xml; binary mode because the result is bytes, and the
# context manager closes the file even if the write fails
with open(output_file, "wb") as f:
    f.write(alto_xml)

#input('pause…')
Ejemplo n.º 10
0
def image_to_alto_xml(filename):
    """Return whatever ``tesserakti.image_to_alto_xml`` yields for ``filename``.

    ``filename`` is forwarded unchanged; an alternative that opened the image
    first (``kuva.open(filename)``) was left disabled by the author.
    """
    return tesserakti.image_to_alto_xml(filename)
Ejemplo n.º 11
0
import io
import sys
import pytesseract
import cv2

if __name__ == '__main__':
    # Command-line entry point: <image> <mode>, where mode "-t" writes plain
    # text and any other value writes ALTO XML next to the input image.
    # Local import: only this entry point needs os.path.
    import os

    if len(sys.argv) == 3:
        img_path = sys.argv[1]
        outType = sys.argv[2]
        ext = '.txt' if outType == '-t' else '.alto'
        img = cv2.imread(img_path)

        # preprocessing
        # (idea: read a parameter file to simplify the procedure)
        # ocr
        # Replace the image extension with the output one. The previous
        # split('\.') looked for a literal backslash-dot and therefore never
        # removed the extension.
        out_name = os.path.splitext(img_path)[0] + ext
        if ext == '.txt':
            txt = pytesseract.image_to_string(img)
            # Explicit UTF-8 so recognised non-ASCII text cannot fail on a
            # narrower platform default encoding.
            with io.open(out_name, 'w', encoding='utf-8') as outFile:
                outFile.write(txt)
        else:  # alto
            alto = pytesseract.image_to_alto_xml(img)
            # ALTO output is bytes, hence binary mode.
            with io.open(out_name, 'wb') as outFile:
                outFile.write(alto)
    else:
        # Wrong argument count used to be ignored silently.
        sys.exit('usage: {} <image> <-t for text | any other value for ALTO>'
                 .format(sys.argv[0]))
import cv2
import pytesseract
from PIL import Image
import xml.etree.ElementTree as ET
# Tesseract executable in its installation directory (Windows path).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# img  = cv2.imread('page_1.jpg')

# print(pytesseract.image_to_string(img))

# print(pytesseract.image_to_boxes(Image.open('page_1.jpg')))

# print(pytesseract.image_to_data(Image.open('page_1.jpg')))

# print(pytesseract.image_to_osd(Image.open('page_1.jpg')))

# OCR the page; the result is the ALTO XML document itself, not a file name.
x = pytesseract.image_to_alto_xml('page_1.jpg')

# ET.parse() expects a file name or file object, so the in-memory XML must be
# parsed with ET.fromstring(); `tree` is kept for code that wants the wrapper.
root = ET.fromstring(x)
tree = ET.ElementTree(root)
print(root.tag)
Ejemplo n.º 13
0
def main():
    """Locate the subtitle band of a video and OCR it frame by frame.

    The YAML file named by ``sys.argv[1]`` provides the ``detection`` and
    ``recognition`` sections and the ``video``, ``video_offset_start``,
    ``video_offset_end``, ``output_sub_video`` and ``output_sub_ocr`` keys.

    Frames are read starting at ``video_offset_start`` until the running
    frame counter passes ``video_offset_end`` or the video ends. For each
    frame the lower part of the picture is appended to the output video and,
    if character regions were found, the subtitle band is binarised and sent
    to Tesseract. The resulting ALTO XML strings, keyed by frame counter, are
    written as JSON to ``output_sub_ocr``.

    Returns early, after logging an error, when the detected character width
    or character distance is 0.
    """
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

    with open(sys.argv[1], "r") as config_file:
        cfg = yaml.safe_load(config_file)

    print(str(cfg))

    det_cfg = cfg["detection"]
    rec_cfg = cfg["recognition"]

    logging.basicConfig(format="%(asctime)s %(module)-12s %(levelname)-8s %(message)s")

    # logging.warning replaces the deprecated logging.warn alias throughout.
    logging.warning("Starting detection")

    detection = Detection(det_cfg)

    # The return value is not needed, but the call must run before the
    # region and character metrics are queried below.
    detection.detect_subtitle_region(cfg["video"])

    y_start, y_end = detection.get_subtitle_region()
    char_width = detection.get_char_width()
    char_dist = detection.get_char_dist()
    if char_width == 0 or char_dist == 0:
        logging.error("Char width is 0")
        return

    logging.warning(
        "Found y pos ({}, {}), character width {}, character distance {}".format(y_start, y_end, char_width, char_dist))

    recognition = Recognition(rec_cfg["model"], rec_cfg["weights"], rec_cfg["dictionary"])

    # Use the first font that loads. Initialising the names first avoids a
    # NameError when FONTS is empty, and stopping at the first success keeps
    # a later failing candidate from discarding a font that did load.
    font = font2 = None
    font_name = None
    for candidate in FONTS:
        font = load_font(candidate, char_width)
        if font is not None:
            # Half-size variant, used for the probability label.
            font2 = load_font(candidate, char_width // 2)
            font_name = candidate
            break
    cyk = font is not None
    if cyk:
        logging.warning("Loaded font {}".format(font_name))
    else:
        logging.error("No CYK font found")

    cap = cv2.VideoCapture(cfg["video"])
    save_image_seq = cfg["video_offset_start"]
    save_image_seq_end = cfg["video_offset_end"]
    cap.set(cv2.CAP_PROP_POS_FRAMES, save_image_seq)
    # NOTE(review): the 1920x1080 frame size and the 29.97 fps rate are
    # hard-coded here and below; confirm they match the input video.
    vout = cv2.VideoWriter(cfg["output_sub_video"], cv2.VideoWriter_fourcc(*'mp4v'), 29.97, (1920, 1080 - y_start + 120))
    vout.set(cv2.VIDEOWRITER_PROP_QUALITY, 0.1)
    print(vout)

    custom_config = r'--psm 7 -l chi_sim'
    frames_ocr = {}
    # Structuring element for the morphological closing; loop-invariant.
    kernel = np.ones((3, 3), np.uint8)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        text = []
        img = Image.fromarray(frame)
        draw = ImageDraw.Draw(img)

        # Horizontal extent covered by all detected character regions.
        x_start = 1920
        x_end = 0
        for char_region, start, stop in detection.detect_char_regions(frame[y_start:y_end, ], save_image=False, save_image_name="fill/seq_{}_{:06d}.tiff".format("{}", save_image_seq)):
            x_start = min(x_start, start)
            x_end = max(x_end, stop)
            # NOTE(review): this `continue` skips per-character recognition,
            # so `text` stays empty and the annotation loop below draws
            # nothing. Remove it to re-enable recognition.
            continue
            res = recognition.recognize_character(char_region)
            text.append((start, stop, res[1], res[2]))
            logging.warning("Detected Region {} {} in ({} {})".format(start, stop, y_start, y_end))

        save_image_seq += 1
        if save_image_seq > save_image_seq_end:
            break

        for start, stop, char, prob in text:
            # Three nested outlines give a 3-pixel-wide box.
            draw.rectangle([(start, y_start), (stop, y_end)], outline=RECTANGLE_COLOR)
            draw.rectangle([(start + 1, y_start + 1), (stop - 1, y_end - 1)], outline=RECTANGLE_COLOR)
            draw.rectangle([(start + 2, y_start + 2), (stop - 2, y_end - 2)], outline=RECTANGLE_COLOR)

            probability = str(int(prob * 100)) + "%"
            if cyk:
                draw.text((start, y_start - (stop - start)), char, fill=FONT_COLOR, font=font)
                draw.text((start, y_start - 1.5 * (stop - start)), probability, fill=FONT_COLOR, font=font2)

            #logging.warn("Detected character {} ({})".format(char, probability))

        #cv2.imshow('image', np.array(img))
        #cv2.resizeWindow('image', int(1920/2), int(1080/2))
        #cv2.waitKey(0)
        #cv2.destroyAllWindows()

        # The raw frame (not the annotated PIL image) goes to the video.
        vout.write(frame[y_start-120:1080, ])
        if x_start < x_end:
            gray = cv2.cvtColor(frame[y_start:y_end, x_start:x_end], cv2.COLOR_BGR2GRAY)

            # threshold: keep only pixels brighter than 245
            _, binary = cv2.threshold(gray, 245, 255, cv2.THRESH_BINARY)

            # closing
            closing = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

            # invert black/white
            inv = cv2.bitwise_not(closing)

            img_rgb = cv2.cvtColor(inv, cv2.COLOR_GRAY2RGB)
            data_xml = pytesseract.image_to_alto_xml(img_rgb, config=custom_config)
            # Decode once and reuse for both the console and the JSON output.
            ocr_xml = data_xml.decode('utf-8')
            print(str(save_image_seq) + " " + ocr_xml)
            frames_ocr[save_image_seq] = ocr_xml

    cap.release()
    vout.release()

    with open(cfg['output_sub_ocr'], 'w') as outfile:
        json.dump(frames_ocr, outfile, sort_keys=True, indent=2)