def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separator instead of white
                              ones.

    Returns:
        The storage path of the written TEI segmentation file, i.e. the
        result of ``storage.get_storage_path`` for the ``.xml`` output.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = os.path.splitext(
        storage.insert_suffix(input_path, method))[0] + '.xml'

    logger.debug('Reading image using PIL')
    # The context manager releases the underlying image file handle as soon
    # as the segmenter is done with the pixel data.
    with Image.open(input_path) as img:
        logger.debug('Initializing TEI with {} ({} {})'.format(
            doc[1], *img.size))
        tei = OCRRecord()
        tei.img = storage.get_url(*doc)
        tei.dimensions = img.size
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps=black_colseps)['boxes']:
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)

    # Only create the output file once segmentation succeeded so that a
    # failing segmenter does not leave an empty XML file behind.
    with open(output_path, 'w') as fp:
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(output_path)
def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separator instead of white
                              ones.

    Returns:
        The storage path of the written TEI segmentation file, i.e. the
        result of ``storage.get_storage_path`` for the ``.xml`` output.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = os.path.splitext(
        storage.insert_suffix(input_path, method))[0] + '.xml'

    logger.debug('Reading image using PIL')
    # The context manager releases the underlying image file handle as soon
    # as the segmenter is done with the pixel data.
    with Image.open(input_path) as img:
        logger.debug('Initializing TEI with {} ({} {})'.format(
            doc[1], *img.size))
        tei = OCRRecord()
        tei.img = storage.get_url(*doc)
        tei.dimensions = img.size
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('kraken', 'page segmentation')
        for seg in pageseg.segment(img, black_colseps=black_colseps)['boxes']:
            logger.debug('Found line at {} {} {} {}'.format(*seg))
            tei.add_line(seg)

    # Only create the output file once segmentation succeeded so that a
    # failing segmenter does not leave an empty XML file behind.
    with open(output_path, 'w') as fp:
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(output_path)
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    The libtesseract calls are executed in a forked child process; the parent
    only waits for the child and inspects its exit status.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        The storage path of the written TEI segmentation file, i.e. the
        result of ``storage.get_storage_path`` for the ``.xml`` output.

    Raises:
        NidabaTesseractException: if libtesseract is older than 3.02, if the
                                  child process was killed by a signal, or if
                                  it exited with a non-zero status.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path,
                                                 method))[0] + '.xml'

    ver = tesseract.TessVersion()
    if isinstance(ver, bytes):
        # ctypes may hand back a byte string
        ver = ver.decode('utf-8')
    major, minor = (int(part) for part in ver.split('.')[:2])
    # Compare (major, minor) as a tuple: testing the minor number on its own
    # would reject newer major releases such as 4.0.
    if (major, minor) < (3, 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We fork manually (instead of using the
    # multiprocessing module) so the child can be reaped right here.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        # The child reports any failure (e.g. initialization errors) through
        # a non-zero exit status.
        if os.WIFEXITED(status) and os.WEXITSTATUS(status) != os.EX_OK:
            raise NidabaTesseractException(
                'Tesseract child process exited with status {0}'.format(
                    os.WEXITSTATUS(status)))
        return storage.get_storage_path(output_path)

    # Child process from here on. It must never return into the caller's
    # stack (that would leave a duplicate of the worker running), so every
    # path ends in os._exit().
    exit_code = 1
    try:
        api = tesseract.TessBaseAPICreate()
        rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
        if rc:
            tesseract.TessBaseAPIDelete(api)
            raise NidabaTesseractException('Tesseract initialization failed.')

        # only do segmentation and script detection
        logger.debug('Setting page set mode to 2')
        tesseract.TessBaseAPISetPageSegMode(api, 2)

        logger.debug('Reading {} using leptonica'.format(input_path))
        pix = leptonica.pixRead(input_path.encode('utf-8'))
        logger.debug('Setting PIX as input image')
        tesseract.TessBaseAPISetImage2(api, pix)
        logger.debug('Analyzing page layout')
        it = tesseract.TessBaseAPIAnalyseLayout(api)
        logger.debug('Destroying PIX')
        leptonica.pixDestroy(ctypes.byref(pix))
        x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                          ctypes.c_int())

        w, h = Image.open(input_path).size
        logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
            w, h, *doc))
        tei = OCRRecord()
        tei.dimensions = (w, h)
        tei.img = storage.get_url(*doc)
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('tesseract', 'page segmentation')

        # Layout analysis yields a NULL iterator on error or for an empty
        # page; in that case an empty segmentation is written instead of
        # dereferencing the NULL pointer.
        has_elements = bool(it)
        if not has_elements:
            logger.info('Layout analysis returned no page iterator')
        while has_elements:
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
                tesseract.TessPageIteratorBoundingBox(it,
                                                      RIL_TEXTLINE,
                                                      ctypes.byref(x0),
                                                      ctypes.byref(y0),
                                                      ctypes.byref(x1),
                                                      ctypes.byref(y1))
                tei.add_line((x0.value, y0.value, x1.value, y1.value))
                logger.debug(
                    'Segmenter found new line at {} {} {} {}'.format(
                        x0.value, y0.value, x1.value, y1.value))
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
                tesseract.TessPageIteratorBoundingBox(it,
                                                      RIL_WORD,
                                                      ctypes.byref(x0),
                                                      ctypes.byref(y0),
                                                      ctypes.byref(x1),
                                                      ctypes.byref(y1))
                tei.add_segment((x0.value, y0.value, x1.value, y1.value))
                logger.debug(
                    'Segmenter found new word at {} {} {} {}'.format(
                        x0.value, y0.value, x1.value, y1.value))

            # every iteration step is a symbol, recorded without text
            tesseract.TessPageIteratorBoundingBox(it,
                                                  RIL_SYMBOL,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_graphemes([{
                'grapheme': '',
                'bbox': (x0.value, y0.value, x1.value, y1.value)
            }])
            logger.debug('Segmenter found new symbol at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))
            if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
                logger.debug('No more elements on page')
                break

        logger.debug('Deleting page iterator and base API')
        if it:
            tesseract.TessPageIteratorDelete(it)
        tesseract.TessBaseAPIEnd(api)
        tesseract.TessBaseAPIDelete(api)
        logger.info('Writing segmentation to {}'.format(output_path))
        with open(output_path, 'w') as fp:
            tei.write_tei(fp)
        exit_code = os.EX_OK
    except Exception:
        logger.exception('Tesseract segmentation failed in child process')
    finally:
        logger.info('Quitting child process')
        os._exit(exit_code)
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    The libtesseract calls are executed in a forked child process; the parent
    only waits for the child and inspects its exit status.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        The storage path of the written TEI segmentation file, i.e. the
        result of ``storage.get_storage_path`` for the ``.xml`` output.

    Raises:
        NidabaTesseractException: if libtesseract is older than 3.02, if the
                                  child process was killed by a signal, or if
                                  it exited with a non-zero status.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path,
                                                 method))[0] + '.xml'

    ver = tesseract.TessVersion()
    if isinstance(ver, bytes):
        # ctypes may hand back a byte string
        ver = ver.decode('utf-8')
    major, minor = (int(part) for part in ver.split('.')[:2])
    # Compare (major, minor) as a tuple: testing the minor number on its own
    # would reject newer major releases such as 4.0.
    if (major, minor) < (3, 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. We fork manually (instead of using the
    # multiprocessing module) so the child can be reaped right here.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        # The child reports any failure (e.g. initialization errors) through
        # a non-zero exit status.
        if os.WIFEXITED(status) and os.WEXITSTATUS(status) != os.EX_OK:
            raise NidabaTesseractException(
                'Tesseract child process exited with status {0}'.format(
                    os.WEXITSTATUS(status)))
        return storage.get_storage_path(output_path)

    # Child process from here on. It must never return into the caller's
    # stack (that would leave a duplicate of the worker running), so every
    # path ends in os._exit().
    exit_code = 1
    try:
        api = tesseract.TessBaseAPICreate()
        rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
        if rc:
            tesseract.TessBaseAPIDelete(api)
            raise NidabaTesseractException('Tesseract initialization failed.')

        # only do segmentation and script detection
        logger.debug('Setting page set mode to 2')
        tesseract.TessBaseAPISetPageSegMode(api, 2)

        logger.debug('Reading {} using leptonica'.format(input_path))
        pix = leptonica.pixRead(input_path.encode('utf-8'))
        logger.debug('Setting PIX as input image')
        tesseract.TessBaseAPISetImage2(api, pix)
        logger.debug('Analyzing page layout')
        it = tesseract.TessBaseAPIAnalyseLayout(api)
        logger.debug('Destroying PIX')
        leptonica.pixDestroy(ctypes.byref(pix))
        x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                          ctypes.c_int())

        w, h = Image.open(input_path).size
        logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
            w, h, *doc))
        tei = OCRRecord()
        tei.dimensions = (w, h)
        tei.img = storage.get_url(*doc)
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('tesseract', 'page segmentation')

        # Layout analysis yields a NULL iterator on error or for an empty
        # page; in that case an empty segmentation is written instead of
        # dereferencing the NULL pointer.
        has_elements = bool(it)
        if not has_elements:
            logger.info('Layout analysis returned no page iterator')
        while has_elements:
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
                tesseract.TessPageIteratorBoundingBox(it,
                                                      RIL_TEXTLINE,
                                                      ctypes.byref(x0),
                                                      ctypes.byref(y0),
                                                      ctypes.byref(x1),
                                                      ctypes.byref(y1))
                tei.add_line((x0.value, y0.value, x1.value, y1.value))
                logger.debug(
                    'Segmenter found new line at {} {} {} {}'.format(
                        x0.value, y0.value, x1.value, y1.value))
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
                tesseract.TessPageIteratorBoundingBox(it,
                                                      RIL_WORD,
                                                      ctypes.byref(x0),
                                                      ctypes.byref(y0),
                                                      ctypes.byref(x1),
                                                      ctypes.byref(y1))
                tei.add_segment((x0.value, y0.value, x1.value, y1.value))
                logger.debug(
                    'Segmenter found new word at {} {} {} {}'.format(
                        x0.value, y0.value, x1.value, y1.value))

            # every iteration step is a symbol, recorded without text
            tesseract.TessPageIteratorBoundingBox(it,
                                                  RIL_SYMBOL,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            tei.add_graphemes([{
                'grapheme': '',
                'bbox': (x0.value, y0.value, x1.value, y1.value)
            }])
            logger.debug('Segmenter found new symbol at {} {} {} {}'.format(
                x0.value, y0.value, x1.value, y1.value))
            if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
                logger.debug('No more elements on page')
                break

        logger.debug('Deleting page iterator and base API')
        if it:
            tesseract.TessPageIteratorDelete(it)
        tesseract.TessBaseAPIEnd(api)
        tesseract.TessBaseAPIDelete(api)
        logger.info('Writing segmentation to {}'.format(output_path))
        with open(output_path, 'w') as fp:
            tei.write_tei(fp)
        exit_code = os.EX_OK
    except Exception:
        logger.exception('Tesseract segmentation failed in child process')
    finally:
        logger.info('Quitting child process')
        os._exit(exit_code)