def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separators instead of white
                              ones.

    Returns:
        (unicode, unicode): Storage tuple of the output segmentation file.
    """
    input_path = storage.get_abs_path(*doc)
    output_path, ext = os.path.splitext(storage.insert_suffix(input_path,
                                                              method))
    logger.debug('Reading image using PIL')
    img = Image.open(input_path)
    with open(output_path + '.xml', 'w') as fp:
        logger.debug('Initializing TEI with {} ({} {})'.format(doc[1],
                                                               *img.size))
        tei = OCRRecord()
        tei.img = storage.get_url(*doc)
        tei.dimensions = img.size
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps=black_colseps)['boxes']:
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(output_path + '.xml')

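# Illustrative sketch, not part of nidaba: what segmentation_kraken() does at
# its core once the storage layer and TEI serialization are stripped away.
# kraken's pageseg.segment() takes a binarized PIL image and returns a dict
# whose 'boxes' entry is a list of (x0, y0, x1, y1) line bounding boxes. The
# file name below is a hypothetical placeholder.
def _example_kraken_pageseg(path=u'page.png', black_colseps=False):
    img = Image.open(path).convert('1')
    boxes = pageseg.segment(img, black_colseps=black_colseps)['boxes']
    for x0, y0, x1, y1 in boxes:
        logger.debug('line at {} {} {} {}'.format(x0, y0, x1, y1))
    return boxes
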
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    output_path = (doc[0],
                   os.path.splitext(storage.insert_suffix(doc[1], method,
                                                          model))[0] + '.xml')
    logger.debug('Loading model {}'.format(model))
    try:
        rnn = models.load_any(mod_db[model])
    except Exception as e:
        raise NidabaInvalidParameterException(str(e))
    logger.debug('Reading TEI segmentation from {}'.format(doc))
    tei = OCRRecord()
    with storage.StorageFile(*doc) as seg:
        tei.load_tei(seg)
    img = Image.open(storage.get_abs_path(*storage.get_storage_path_url(tei.img)))
    if is_bitonal(img):
        img = img.convert('1')
    else:
        raise NidabaInvalidParameterException('Input image is not bitonal')
    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines
    i = 0
    logger.debug('Start recognizing characters')
    for line_id, rec in izip(lines,
                             rpred.rpred(rnn, img,
                                         {'text_direction': 'horizontal-tb',
                                          'boxes': [list(x['bbox']) for x in
                                                    lines.itervalues()]})):
        # scope the current line and add all graphemes recognized by kraken
        # to it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        i += 1
        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(segment)])
                logger.debug('Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                tei.add_graphemes([{'grapheme': x[0],
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)}
                                   for x in rec[line_offset:line_offset + len(segment)]])
                line_offset += len(segment)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset + len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{'grapheme': x[0],
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)}
                                   for x in rec[line_offset:line_offset + len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path

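# Illustrative sketch, not part of nidaba: the kraken rpred interface relied
# on above. rpred.rpred() is a generator yielding one record per input box;
# rec.prediction holds the recognized text, rec.cuts the per-character boxes,
# and indexing into the record gives (grapheme, box, confidence) triples. The
# model path and the bounding box below are hypothetical placeholders.
def _example_kraken_rpred(img, model_path=u'/path/to/model'):
    net = models.load_any(model_path)
    bounds = {'text_direction': 'horizontal-tb',
              'boxes': [[10, 10, 300, 40]]}
    for rec in rpred.rpred(net, img, bounds):
        logger.debug('prediction: {}'.format(rec.prediction))
        for grapheme, box, confidence in rec[0:len(rec.prediction)]:
            logger.debug(u'{} at {} ({})'.format(grapheme, box, confidence))
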
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output segmentation file.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path, method))[0] + '.xml'

    ver = tesseract.TessVersion()
    major, minor = ver.split('.')[:2]
    if int(major) < 3 or (int(major) == 3 and int(minor) < 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs,
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We use fork as the multiprocessing module thinks
    # programmers are too stupid to reap their children.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException('Tesseract killed by signal: '
                                           '{0}'.format(os.WTERMSIG(status)))
        return storage.get_storage_path(output_path)

    api = tesseract.TessBaseAPICreate()
    rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
    if rc:
        tesseract.TessBaseAPIDelete(api)
        raise NidabaTesseractException('Tesseract initialization failed.')

    # only do segmentation and script detection
    logger.debug('Setting page seg mode to 2')
    tesseract.TessBaseAPISetPageSegMode(api, 2)
    logger.debug('Reading {} using leptonica'.format(input_path))
    pix = leptonica.pixRead(input_path.encode('utf-8'))
    logger.debug('Setting PIX as input image')
    tesseract.TessBaseAPISetImage2(api, pix)
    logger.debug('Analyzing page layout')
    it = tesseract.TessBaseAPIAnalyseLayout(api)
    logger.debug('Destroying PIX')
    leptonica.pixDestroy(ctypes.byref(pix))
    x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(),
                      ctypes.c_int(), ctypes.c_int())

    w, h = Image.open(input_path).size
    logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(w, h, *doc))
    tei = OCRRecord()
    tei.dimensions = (w, h)
    tei.img = storage.get_url(*doc)
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('tesseract', 'page segmentation')

    while True:
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
            tesseract.TessPageIteratorBoundingBox(it, RIL_TEXTLINE,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_line((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new line at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))
        if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
            tesseract.TessPageIteratorBoundingBox(it, RIL_WORD,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_segment((x0.value, y0.value, x1.value, y1.value))
            logger.debug('Segmenter found new word at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))

        tesseract.TessPageIteratorBoundingBox(it, RIL_SYMBOL,
                                              ctypes.byref(x0),
                                              ctypes.byref(y0),
                                              ctypes.byref(x1),
                                              ctypes.byref(y1))
        tei.add_graphemes([{'grapheme': '',
                            'bbox': (x0.value, y0.value, x1.value, y1.value)}])
        logger.debug('Segmenter found new symbol at {} {} {} {}'.format(
            x0.value, y0.value, x1.value, y1.value))
        if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
            logger.debug('No more elements on page')
            break

    logger.debug('Deleting page iterator and base API')
    tesseract.TessPageIteratorDelete(it)
    tesseract.TessBaseAPIEnd(api)
    tesseract.TessBaseAPIDelete(api)
    logger.info('Writing segmentation to {}'.format(output_path))
    with open(output_path, 'w') as fp:
        tei.write_tei(fp)
    logger.info('Quitting child process')
    os._exit(os.EX_OK)
    return storage.get_storage_path(output_path)

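# Illustrative sketch, not part of nidaba: the fork/wait pattern used above to
# isolate crash-prone ctypes code. The parent only waits and inspects the exit
# status; the child does the risky work and leaves via os._exit() so it never
# returns into the caller's control flow. risky_work() is a hypothetical
# placeholder for the actual ctypes calls.
def _example_forked_call(risky_work):
    pid = os.fork()
    if pid != 0:
        # parent: wait for the child and surface abnormal termination
        _, status = os.waitpid(pid, 0)
        if os.WIFSIGNALED(status):
            raise RuntimeError('child killed by signal '
                               '{}'.format(os.WTERMSIG(status)))
        return os.WEXITSTATUS(status)
    # child: do the risky work and exit without returning to the caller
    risky_work()
    os._exit(os.EX_OK)
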
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc ((unicode, unicode), (unicode, unicode)): The input document
            tuple; the first element is the storage tuple of the TEI
            segmentation, the second the storage tuple of the source image
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    input_path = storage.get_abs_path(*doc[1])
    output_path = (doc[1][0],
                   os.path.splitext(storage.insert_suffix(doc[1][1], method,
                                                          model))[0] + '.xml')
    logger.debug('Searching for model {}'.format(model))
    if model in nidaba_cfg['kraken_models']:
        model = storage.get_abs_path(*(nidaba_cfg['kraken_models'][model]))
    elif model in nidaba_cfg['ocropus_models']:
        model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')
    img = Image.open(input_path)
    logger.debug('Reading TEI segmentation from {}'.format(doc[1]))
    tei = OCRRecord()
    with storage.StorageFile(*doc[0]) as seg:
        tei.load_tei(seg)
    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines
    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    i = 0
    logger.debug('Start recognizing characters')
    for line_id, rec in zip(lines,
                            rpred.rpred(rnn, img,
                                        [x['bbox'] for x in lines.itervalues()])):
        # scope the current line and add all graphemes recognized by kraken
        # to it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        i += 1
        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if len(segment):
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(segment)])
                logger.debug('Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:line_offset + len(segment)]))
                tei.add_graphemes([{'grapheme': x[0],
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)}
                                   for x in rec[line_offset:line_offset + len(segment)]])
                line_offset += len(segment)
            if whitespace:
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:line_offset + len(whitespace)]))
                seg_bbox = max_bbox(rec.cuts[line_offset:line_offset + len(whitespace)])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{'grapheme': x[0],
                                    'bbox': x[1],
                                    'confidence': int(x[2] * 100)}
                                   for x in rec[line_offset:line_offset + len(whitespace)]])
                line_offset += len(whitespace)
    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path

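# Illustrative sketch, not part of nidaba: the segment/whitespace pairing used
# by both ocr_kraken() variants. Splitting the prediction on a capturing
# whitespace group keeps the separators, and izip_longest() pairs every word
# with the whitespace run that follows it (None after the last word); the
# running offset then indexes back into rec.cuts to build the segment boxes.
def _example_segment_pairing(prediction=u'fiat lux'):
    splits = regex.split(u'(\s+)', prediction)    # [u'fiat', u' ', u'lux']
    pairs = []
    for word, whitespace in izip_longest(splits[0::2], splits[1::2]):
        pairs.append((word, whitespace))          # (u'fiat', u' '), (u'lux', None)
    return pairs
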
def ocr(image_path, segmentation_path, output_path, model_path):
    """
    Scan a single image with ocropus.

    Reads a single image file from ``image_path`` and writes the recognized
    text as a TEI document into ``output_path``.

    Args:
        image_path (unicode): Path of the input file
        segmentation_path (unicode): Path of the segmentation XML file.
        output_path (unicode): Path of the output file
        model_path (unicode): Path of the recognition model. Must be a
                              pyrnn.gz pickle dump interoperable with
                              ocropus-rpred.

    Returns:
        (unicode): A string of the output file that is actually written. As
                   Ocropus rewrites output file paths without notice it may
                   be different from the ``output_path`` argument.

    Raises:
        NidabaOcropusException: Ocropus somehow failed. The error output is
                                contained in the message but as it is de facto
                                unusable as a library it's impossible to
                                deduce the nature of the problem.
    """
    try:
        logger.debug('Loading pyrnn from {}'.format(model_path))
        network = ocrolib.load_object(model_path, verbose=0)
        lnorm = getattr(network, "lnorm")
    except Exception as e:
        raise NidabaOcropusException('Something somewhere broke: ' + str(e))
    im = Image.open(image_path)

    logger.debug('Loading TEI segmentation {}'.format(segmentation_path))
    tei = OCRRecord()
    with open(segmentation_path, 'r') as seg_fp:
        tei.load_tei(seg_fp)

    logger.debug('Clearing out word/grapheme boxes')
    # ocropus is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('ocropus', 'character recognition')

    for line_id, box in tei.lines.iteritems():
        logger.debug('Recognizing line {}'.format(box['bbox']))
        line = ocrolib.pil2array(im.crop(box['bbox']))
        temp = np.amax(line) - line
        temp = temp * 1.0 / np.amax(temp)
        lnorm.measure(temp)
        line = lnorm.normalize(line, cval=np.amax(line))
        if line.ndim == 3:
            line = np.mean(line, 2)
        line = ocrolib.lstm.prepare_line(line, 16)
        pred = network.predictString(line)
        pred = ocrolib.normalize_text(pred)
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        logger.debug('Adding graphemes: {}'.format(pred))
        tei.add_graphemes({'grapheme': x} for x in pred)

    with open(output_path, 'wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.name))
        tei.write_tei(fp)
    return output_path

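# Illustrative usage sketch, not part of nidaba: unlike the kraken tasks,
# ocr() is driven with plain file system paths. All file names below are
# hypothetical placeholders.
def _example_ocropus_ocr():
    return ocr(image_path=u'page_0001.png',
               segmentation_path=u'page_0001_segment.xml',
               output_path=u'page_0001_ocr.xml',
               model_path=u'en-default.pyrnn.gz')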