def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Segments a page image into lines with kraken's built-in algorithm and
    stores the result as a skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string inserted into the output file
                          name
        black_colseps (bool): Forwarded to kraken's segmenter; assume black
                              column separators instead of white ones.

    Returns:
        The value of ``storage.get_storage_path`` for the TEI file that
        contains the segmentation.
    """
    image_file = storage.get_abs_path(*doc)
    stem = os.path.splitext(storage.insert_suffix(image_file, method))[0]
    tei_file = stem + '.xml'

    logger.debug('Reading image using PIL')
    page = Image.open(image_file)
    with open(tei_file, 'w') as out:
        logger.debug('Initializing TEI with {} ({} {})'.format(doc[1], *page.size))
        record = OCRRecord()
        record.img = storage.get_url(*doc)
        record.dimensions = page.size
        record.title = os.path.basename(doc[1])
        record.add_respstmt('kraken', 'page segmentation')
        boxes = pageseg.segment(page, black_colseps=black_colseps)['boxes']
        for box in boxes:
            logger.debug('Found line at {} {} {} {}'.format(*box))
            record.add_line(box)
        logger.debug('Write segmentation to {}'.format(out.name))
        record.write_tei(out)
    return storage.get_storage_path(tei_file)
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    The graphemes of every segment of the TEI document are joined, sanitized
    and stripped of non-word characters; the resulting token is looked up in
    the dictionary configured for ``language``. The result is the share of
    segments whose token is not found.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Key into ``nidaba_cfg['lang_dicts']`` selecting
                            the dictionary to search.
        divert (bool): If set, nothing is written and a dictionary is
                       returned instead of an output location.

    Returns:
        With ``divert`` disabled the ratio is written to storage and the
        output path is returned. Otherwise a dictionary with the keys
        ``edit_ratio``, ``ground_truth`` and ``doc``.
    """
    # ``doc`` is the storage tuple itself (it is unpacked the same way for
    # StorageFile below), so it must not be indexed before unpacking.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)

    cnt = 0
    err_cnt = 0
    # Iterate the segments of the record loaded above; the name previously
    # used here was never defined.
    for segment in tei.segments.itervalues():
        tok = alg.sanitize(''.join(x['grapheme'] for x in
                                   segment['content'].itervalues()))
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(tok, dictionary,
                                   entryparser_fn=alg.key_for_single_word):
            err_cnt += 1

    # A document without segments contains no unknown words; avoid dividing
    # by zero.
    ratio = err_cnt / float(cnt) if cnt else 0.0
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    return {'edit_ratio': ratio, 'ground_truth': '', 'doc': doc}
def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None,
                  extended=False):
    """
    Recognizes text on a segmented page with tesseract.

    The TEI segmentation in ``doc`` is loaded and its line boxes are written
    to a UZN file named after the segmentation file. The image referenced by
    the segmentation is then processed with the call method selected by the
    module-level ``implementation`` value; hOCR results are converted to TEI
    and the intermediate hOCR file is removed.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers. A
                          single string is wrapped into a list.
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. Only forwarded to
                         the 'capi' implementation.

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaTesseractException: ``implementation`` is none of 'legacy',
                                  'direct' or 'capi'.
    """
    record = OCRRecord()
    with storage.StorageFile(*doc) as tei_fp:
        record.load_tei(tei_fp)

    uzn_name = splitext(doc[1])[0] + '.uzn'
    with storage.StorageFile(doc[0], uzn_name, mode='wb') as uzn_fp:
        writer = UZNWriter(uzn_fp)
        for entry in record.lines.itervalues():
            writer.writerow(*entry['bbox'])

    image_path = storage.get_abs_path(*storage.get_storage_path_url(record.img))
    if isinstance(languages, basestring):
        languages = [languages]
    output_path = storage.insert_suffix(image_path, method, *languages)
    tei_path = output_path + '.xml'

    logger.debug('Invoking tesseract with {} call method'.format(implementation))
    if implementation == 'capi':
        result_path = tei_path
        ocr_capi(image_path, result_path, record, languages, extended)
    elif implementation in ('legacy', 'direct'):
        # both variants shell out; they only differ in the extension
        # tesseract gives its hOCR output
        extension = '.html' if implementation == 'legacy' else '.hocr'
        result_path = output_path + extension
        ocr_direct(image_path, output_path, languages)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    if result_path.endswith('.xml'):
        return storage.get_storage_path(result_path)

    logger.debug('Converting hOCR ({}) -> TEI ({})'.format(result_path, tei_path))
    converted = OCRRecord()
    with open(result_path) as hocr_fp:
        converted.load_hocr(hocr_fp)
    os.unlink(result_path)
    with open(tei_path, 'wb') as tei_out:
        converted.write_tei(tei_out)
    return storage.get_storage_path(tei_path)
def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None,
                  extended=False):
    """
    Recognizes text on a segmented page with tesseract.

    The TEI segmentation (``doc[0]``) is rewritten into a UZN file named
    after the image (``doc[1]``). The image is then processed with the call
    method selected by the module-level ``implementation`` value; hOCR
    results are converted to TEI and the intermediate hOCR file is removed.

    Args:
        doc: Pair of storage tuples; the first locates the TEI segmentation,
             the second the image to recognize.
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers. A
                          single string is wrapped into a list.
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. Only forwarded to
                         the 'capi' implementation.

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaTesseractException: ``implementation`` is none of 'legacy',
                                  'direct' or 'capi'.
    """
    segmentation_doc, image_doc = doc[0], doc[1]
    image_path = storage.get_abs_path(*image_doc)
    uzn_name = splitext(image_doc[1])[0] + '.uzn'

    # rewrite the segmentation file to lines in UZN format
    logger.debug('Rewriting TEI ({}) -> UZN ({})'.format(segmentation_doc[1], uzn_name))
    record = OCRRecord()
    with storage.StorageFile(*segmentation_doc) as tei_fp:
        record.load_tei(tei_fp)
    with storage.StorageFile(image_doc[0], uzn_name, mode='wb') as uzn_fp:
        writer = UZNWriter(uzn_fp)
        for entry in record.lines.itervalues():
            writer.writerow(*entry['bbox'])

    if isinstance(languages, basestring):
        languages = [languages]
    output_path = storage.insert_suffix(image_path, method, *languages)
    tei_path = output_path + '.xml'

    logger.debug('Invoking tesseract with {} call method'.format(implementation))
    if implementation == 'capi':
        result_path = tei_path
        ocr_capi(image_path, result_path, record, languages, extended)
    elif implementation in ('legacy', 'direct'):
        # both variants shell out; they only differ in the extension
        # tesseract gives its hOCR output
        extension = '.html' if implementation == 'legacy' else '.hocr'
        result_path = output_path + extension
        ocr_direct(image_path, output_path, languages)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    if result_path.endswith('.xml'):
        return storage.get_storage_path(result_path)

    logger.debug('Converting hOCR ({}) -> TEI ({})'.format(result_path, tei_path))
    converted = OCRRecord()
    with open(result_path) as hocr_fp:
        converted.load_hocr(hocr_fp)
    os.unlink(result_path)
    with open(tei_path, 'wb') as tei_out:
        converted.write_tei(tei_out)
    return storage.get_storage_path(tei_path)
def tei2hocr(doc, method=u'tei2hocr'):
    """
    Converts a TEI Facsimile into hOCR, preserving as much metadata as
    possible.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): The suffix string inserted into the output file
                          name

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    record = OCRRecord()
    with storage.StorageFile(*doc) as source:
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        record.load_tei(source)

    # the result is stored next to the input, under the suffixed name
    hocr_name = storage.insert_suffix(doc[1], method)
    with storage.StorageFile(doc[0], hocr_name, 'wb') as target:
        logger.debug('Writing hOCR to {}'.format(target.abs_path))
        record.write_hocr(target)
    return (doc[0], hocr_name)
def tei2txt(doc, method=u'tei2txt'):
    """
    Converts a TEI Facsimile into a plain text file.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): The suffix string inserted into the output file
                          name

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    record = OCRRecord()
    with storage.StorageFile(*doc) as source:
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        record.load_tei(source)

    # the result is stored next to the input, under the suffixed name
    text_name = storage.insert_suffix(doc[1], method)
    with storage.StorageFile(doc[0], text_name, 'wb') as target:
        logger.debug('Writing text to {}'.format(target.abs_path))
        record.write_text(target)
    return (doc[0], text_name)
def tei2abbyyxml(doc, method=u'abbyyxml'):
    """
    Converts a TEI Facsimile into a format similar to Abbyy FineReader's XML
    output.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): The suffix string inserted into the output file
                          name

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    record = OCRRecord()
    with storage.StorageFile(*doc) as source:
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        record.load_tei(source)

    # the result is stored next to the input, under the suffixed name
    abbyy_name = storage.insert_suffix(doc[1], method)
    with storage.StorageFile(doc[0], abbyy_name, 'wb') as target:
        logger.debug('Writing abbyyxml to {}'.format(target.abs_path))
        record.write_abbyyxml(target)
    return (doc[0], abbyy_name)
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    The graphemes of every segment of the TEI document are joined, sanitized
    and stripped of non-word characters; the resulting token is looked up in
    the dictionary configured for ``language``. The result is the share of
    segments whose token is not found.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Key into ``nidaba_cfg['lang_dicts']`` selecting
                            the dictionary to search.
        divert (bool): If set, nothing is written and a dictionary is
                       returned instead of an output location.

    Returns:
        With ``divert`` disabled the ratio is written to storage and the
        output path is returned. Otherwise a dictionary with the keys
        ``edit_ratio``, ``ground_truth`` and ``doc``.
    """
    # ``doc`` is the storage tuple itself (it is unpacked the same way for
    # StorageFile below), so it must not be indexed before unpacking.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)

    cnt = 0
    err_cnt = 0
    # Iterate the segments of the record loaded above; the name previously
    # used here was never defined.
    for segment in tei.segments.itervalues():
        tok = alg.sanitize(''.join(x['grapheme'] for x in
                                   segment['content'].itervalues()))
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(tok, dictionary,
                                   entryparser_fn=alg.key_for_single_word):
            err_cnt += 1

    # A document without segments contains no unknown words; avoid dividing
    # by zero.
    ratio = err_cnt / float(cnt) if cnt else 0.0
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    return {'edit_ratio': ratio, 'ground_truth': '', 'doc': doc}
def text_rep_confidence(doc, method=u'text_rep_confidence', divert=True):
    """
    Extracts self reported confidence values from input documents.

    The result is the arithmetic mean of the ``confidence`` value of every
    grapheme in the TEI document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        divert (bool): If set, nothing is written and a dictionary is
                       returned instead of an output location.

    Returns:
        With ``divert`` disabled the mean is written to storage and the
        output path is returned. Otherwise a dictionary with the keys
        ``edit_ratio``, ``ground_truth`` and ``doc``.
    """
    # ``doc`` is the storage tuple itself (it is unpacked the same way for
    # StorageFile below), so it must not be indexed before unpacking.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    edist = numpy.mean([x['confidence'] for x in tei.graphemes.itervalues()])
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(edist))
        return output_path
    return {'edit_ratio': edist, 'ground_truth': '', 'doc': doc}
def spell_check(doc, method=u'spell_check', language=u'',
                filter_punctuation=False):
    """
    Adds spelling suggestions to a TEI XML document.

    Alternative spellings for each segment will be included in a choice tag
    containing a series of corr tags with the original segment appearing
    beneath a sic element. Correct words, i.e. words appearing verbatim in
    the dictionary, are left untouched.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier defined in the nidaba configuration as
                            a valid dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   ``seg``

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    source_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(source_path, method, language,
                                        unicode(filter_punctuation))
    dict_cfg = nidaba_cfg['lang_dicts'][language]
    dictionary = storage.get_abs_path(*dict_cfg['dictionary'])
    del_dictionary = storage.get_abs_path(*dict_cfg['deletion_dictionary'])

    record = OCRRecord()
    with storage.StorageFile(*doc) as source:
        logger.debug('Reading TEI ({})'.format(source.abs_path))
        record.load_tei(source)
        logger.debug('Performing spell check')
        checked = lex.tei_spellcheck(record, dictionary, del_dictionary,
                                     filter_punctuation)

    with storage.StorageFile(*storage.get_storage_path(output_path),
                             mode='wb') as target:
        logger.debug('Writing TEI ({})'.format(target.abs_path))
        checked.write_tei(target)
    return storage.get_storage_path(output_path)
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    The TEI segmentation in ``doc`` supplies the line boxes and the location
    of the page image. Existing segments and graphemes are discarded, each
    line is recognized, and the prediction is split on whitespace so that
    words and whitespace runs become separate segments.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string append to all output files
        model (unicode): Identifier for the font model to use

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaInvalidParameterException: The model could not be loaded or the
                                         page image is not bitonal.
    """
    output_path = (
        doc[0],
        os.path.splitext(storage.insert_suffix(doc[1], method, model))[0] + '.xml')

    # The model is loaded exactly once, inside the try block, so that every
    # loading failure is reported as an invalid parameter.
    logger.debug('Loading model {}'.format(model))
    try:
        rnn = models.load_any(mod_db[model])
    except Exception as e:
        raise NidabaInvalidParameterException(str(e))

    logger.debug('Reading TEI segmentation from {}'.format(doc))
    tei = OCRRecord()
    with storage.StorageFile(*doc) as seg:
        tei.load_tei(seg)

    img = Image.open(
        storage.get_abs_path(*storage.get_storage_path_url(tei.img)))
    if not is_bitonal(img):
        raise NidabaInvalidParameterException('Input image is not bitonal')
    img = img.convert('1')

    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    bounds = {
        'text_direction': 'horizontal-tb',
        'boxes': [list(x['bbox']) for x in lines.itervalues()]
    }
    logger.debug('Start recognizing characters')
    for line_id, rec in izip(lines, rpred.rpred(rnn, img, bounds)):
        # scope the current line and add all graphemes recognized by kraken
        # to it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        # the capture group keeps the whitespace runs: even indices hold
        # words, odd indices the whitespace following them
        splits = regex.split(u'(\\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if segment:
                end = line_offset + len(segment)
                seg_bbox = max_bbox(rec.cuts[line_offset:end])
                logger.debug(
                    'Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(
                    rec.prediction[line_offset:end]))
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:end]])
                line_offset = end
            # whitespace is None when the prediction ends with a word
            if whitespace:
                end = line_offset + len(whitespace)
                logger.debug('Adding graphemes (whitespace): {}'.format(
                    rec.prediction[line_offset:end]))
                seg_bbox = max_bbox(rec.cuts[line_offset:end])
                tei.add_segment(seg_bbox)
                tei.add_graphemes([{
                    'grapheme': x[0],
                    'bbox': x[1],
                    'confidence': int(x[2] * 100)
                } for x in rec[line_offset:end]])
                line_offset = end

    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path
def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Segments a page image into lines with kraken's built-in algorithm and
    stores the result as a skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string inserted into the output file
                          name
        black_colseps (bool): Forwarded to kraken's segmenter; assume black
                              column separators instead of white ones.

    Returns:
        The value of ``storage.get_storage_path`` for the TEI file that
        contains the segmentation.
    """
    image_file = storage.get_abs_path(*doc)
    stem = os.path.splitext(storage.insert_suffix(image_file, method))[0]
    tei_file = stem + '.xml'

    logger.debug('Reading image using PIL')
    page = Image.open(image_file)
    with open(tei_file, 'w') as out:
        logger.debug('Initializing TEI with {} ({} {})'.format(doc[1], *page.size))
        record = OCRRecord()
        record.img = storage.get_url(*doc)
        record.dimensions = page.size
        record.title = os.path.basename(doc[1])
        record.add_respstmt('kraken', 'page segmentation')
        boxes = pageseg.segment(page, black_colseps=black_colseps)['boxes']
        for box in boxes:
            logger.debug('Found line at {} {} {} {}'.format(*box))
            record.add_line(box)
        logger.debug('Write segmentation to {}'.format(out.name))
        record.write_tei(out)
    return storage.get_storage_path(tei_file)
def ocr(image_path, segmentation_path, output_path, model_path):
    """
    Scan a single image with ocropus.

    Reads a single image file from ``image_path`` and the line boxes from the
    TEI document at ``segmentation_path``, recognizes every line, and writes
    the recognized text as a TEI document into ``output_path``.

    Args:
        image_path (unicode): Path of the input file
        segmentation_path (unicode): Path of the segmentation XML file.
        output_path (unicode): Path of the output file
        model_path (unicode): Path of the recognition model. Must be a
                              pyrnn.gz pickle dump interoperable with
                              ocropus-rpred.

    Returns:
        (unicode): ``output_path``, the file the TEI document was written to.

    Raises:
        NidabaOcropusException: The recognition model could not be loaded.
                                The underlying error is contained in the
                                message.
    """
    try:
        logger.debug('Loading pyrnn from {}'.format(model_path))
        network = ocrolib.load_object(model_path, verbose=0)
        lnorm = getattr(network, "lnorm")
    except Exception as e:
        # str(e) works for every exception type; most exceptions do not have
        # a ``msg`` attribute, so reading it would hide the real error.
        raise NidabaOcropusException('Something somewhere broke: ' + str(e))

    im = Image.open(image_path)

    logger.debug('Loading TEI segmentation {}'.format(segmentation_path))
    tei = OCRRecord()
    with open(segmentation_path, 'r') as seg_fp:
        tei.load_tei(seg_fp)

    logger.debug('Clearing out word/grapheme boxes')
    # ocropus is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('ocropus', 'character recognition')

    for line_id, box in tei.lines.iteritems():
        logger.debug('Recognizing line {}'.format(box['bbox']))
        line = ocrolib.pil2array(im.crop(box['bbox']))
        # the line normalizer is measured on the inverted line scaled to a
        # maximum of 1
        temp = np.amax(line) - line
        temp = temp * 1.0 / np.amax(temp)
        lnorm.measure(temp)
        line = lnorm.normalize(line, cval=np.amax(line))
        if line.ndim == 3:
            # collapse the channel axis; the mean has to be assigned or the
            # line stays three-dimensional
            line = np.mean(line, 2)
        line = ocrolib.lstm.prepare_line(line, 16)
        pred = network.predictString(line)
        pred = ocrolib.normalize_text(pred)
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        logger.debug('Adding graphemes: {}'.format(pred))
        tei.add_graphemes({'grapheme': x} for x in pred)

    with open(output_path, 'wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.name))
        tei.write_tei(fp)
    return output_path
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and
    writes a TEI XML segmentation file.

    All libtesseract calls run in a forked child process. The parent only
    waits for the child and reports where the segmentation was written.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        The value of ``storage.get_storage_path`` for the TEI file that
        contains the segmentation.

    Raises:
        NidabaTesseractException: libtesseract is older than 3.2, the child
                                  process was killed by a signal, or the
                                  child process reported a failure.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path, method))[0] + '.xml'

    ver = tesseract.TessVersion()
    # Compare (major, minor) as one tuple. Testing the components separately
    # wrongly rejects releases such as 4.0 whose minor number is below 2.
    major, minor = [int(part) for part in ver.split('.')[:2]]
    if (major, minor) < (3, 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. os.fork is used directly so this function reaps the
    # child itself.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        # parent process
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        if os.WIFEXITED(status) and os.WEXITSTATUS(status) != os.EX_OK:
            raise NidabaTesseractException(
                'Tesseract child process exited with status {0}'.format(
                    os.WEXITSTATUS(status)))
        return storage.get_storage_path(output_path)

    # Child process from here on. It must never return into the caller's
    # code, so every outcome ends in os._exit.
    exit_code = 1
    try:
        api = tesseract.TessBaseAPICreate()
        rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
        if (rc):
            tesseract.TessBaseAPIDelete(api)
            raise NidabaTesseractException('Tesseract initialization failed.')

        # only do segmentation and script detection
        logger.debug('Setting page set mode to 2')
        tesseract.TessBaseAPISetPageSegMode(api, 2)

        logger.debug('Reading {} using leptonica'.format(input_path))
        pix = leptonica.pixRead(input_path.encode('utf-8'))
        logger.debug('Setting PIX as input image')
        tesseract.TessBaseAPISetImage2(api, pix)
        logger.debug('Analyzing page layout')
        it = tesseract.TessBaseAPIAnalyseLayout(api)
        logger.debug('Destroying PIX')
        leptonica.pixDestroy(ctypes.byref(pix))

        x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                          ctypes.c_int())

        def bounding_box(level):
            # Query the box of the iterator's current element at ``level``.
            tesseract.TessPageIteratorBoundingBox(it, level,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            return (x0.value, y0.value, x1.value, y1.value)

        w, h = Image.open(input_path).size
        logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
            w, h, *doc))
        tei = OCRRecord()
        tei.dimensions = (w, h)
        tei.img = storage.get_url(*doc)
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('tesseract', 'page segmentation')

        # Walk the page symbol by symbol, opening a new line or word
        # whenever the current symbol starts one.
        while True:
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
                box = bounding_box(RIL_TEXTLINE)
                tei.add_line(box)
                logger.debug(
                    'Segmenter found new line at {} {} {} {}'.format(*box))
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
                box = bounding_box(RIL_WORD)
                tei.add_segment(box)
                logger.debug(
                    'Segmenter found new word at {} {} {} {}'.format(*box))
            box = bounding_box(RIL_SYMBOL)
            tei.add_graphemes([{'grapheme': '', 'bbox': box}])
            logger.debug(
                'Segmenter found new symbol at {} {} {} {}'.format(*box))
            if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
                logger.debug('No more elements on page')
                break

        logger.debug('Deleting page iterator and base API')
        tesseract.TessPageIteratorDelete(it)
        tesseract.TessBaseAPIEnd(api)
        tesseract.TessBaseAPIDelete(api)
        logger.info('Writing segmentation to {}'.format(output_path))
        with open(output_path, 'w') as fp:
            tei.write_tei(fp)
        exit_code = os.EX_OK
    except Exception:
        logger.exception('Tesseract segmentation failed in child process')
    finally:
        logger.info('Quitting child process')
        os._exit(exit_code)
def tei_metadata(doc, method=u'metadata', metadata=None, validate=False):
    """
    Enriches a TEI-XML document with various metadata from an user-supplied
    YAML file.

    The following fields may be contained in the metadata file with the
    bolded subset mandatory for a valid TEI-XML file. They are grouped by
    their place in the header. Unknown fields are ignored and input is
    escaped as to disable injection.

    Some element may also be extended by increasing their arity, the second
    value is then usually used as a global identifier/locator, i.e. an URL or
    authority control ID.

    titleStmt:

        * ``title``: Title of the resource
        * author: Name of the author of the resource (may be extended)
        * editor: Name of the editor, compiler, translator, etc. of the
                  resource (may be extended)
        * funder: Institution responsible for the funding of the text (may be
                  extended)
        * principal: PI responsible for the creation of the text (may be
                     extended)
        * sponsor: Name of the sponsoring institution (may be extended)
        * meeting: Conference/meeting resulting in the text (may be extended)

    editionStmt:

        * edition: Peculiarities to the underlying edition of the text

    publicationStmt:

        * ``licence``: Licence of the content (may be extended)
        * ``publisher``: Person or agency responsible for the publication of
                         the text (may be extended)
        * distributor: Person or agency responsible for the text's
                       distribution (may be extended)
        * authority: Authority responsible for making the work available
        * idno: Identifier of the publication (may be extended with the type
                of identifier)
        * pub_place: Place of publication
        * date: Date of publication

    seriesStmt:

        * series_title: Title of the series to which the publication belongs

    notesStmt:

        * note: Misc. notes about the text

    sourceDesc:

        * ``source_desc``: Description of the source document

    other:

        * lang: Abbreviation of the language used in the header

    There is a sample file from the OpenPhilology project in the example
    directory.

    Args:
        doc (unicode, unicode): Storage tuple of the input document
        method (unicode): The suffix string inserted into the output file
                          name
        metadata (unicode, unicode): Storage tuple of the metadata YAML file
        validate (bool): Request validation of the result. Validation is not
                         implemented, so enabling it always raises.

    Returns:
        (unicode, unicode): Storage tuple of the output document

    Raises:
        NidabaTEIException if the resulting document is not TEI compatible
        and validation is enabled.
    """
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        logger.debug('Reading TEI ({}/{})'.format(*doc))
        tei.load_tei(fp)

    logger.debug('Reading metadata file ({}/{})'.format(*metadata))
    with storage.StorageFile(*metadata) as fp:
        # safe_load returns None for an empty file; treat that as "no fields"
        # instead of failing on the membership test below.
        meta = yaml.safe_load(fp) or {}

    for field in tei.fields:
        if field in meta:
            logger.debug('Adding field {} ({})'.format(field, meta[field]))
            setattr(tei, field, meta[field])

    if validate:
        raise NidabaTEIException('Validation not yet implemented.')

    output_path = storage.insert_suffix(doc[1], method, metadata[1])
    with storage.StorageFile(doc[0], output_path, 'wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return (doc[0], output_path)
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and
    writes a TEI XML segmentation file.

    All libtesseract calls run in a forked child process. The parent only
    waits for the child and reports where the segmentation was written.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        The value of ``storage.get_storage_path`` for the TEI file that
        contains the segmentation.

    Raises:
        NidabaTesseractException: libtesseract is older than 3.2, the child
                                  process was killed by a signal, or the
                                  child process reported a failure.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path, method))[0] + '.xml'

    ver = tesseract.TessVersion()
    # Compare (major, minor) as one tuple. Testing the components separately
    # wrongly rejects releases such as 4.0 whose minor number is below 2.
    major, minor = [int(part) for part in ver.split('.')[:2]]
    if (major, minor) < (3, 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. os.fork is used directly so this function reaps the
    # child itself.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        # parent process
        try:
            logger.info('Waiting for child to complete')
            _, status = os.waitpid(pid, 0)
        except OSError as e:
            if e.errno not in (errno.EINTR, errno.ECHILD):
                raise
            return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        if os.WIFEXITED(status) and os.WEXITSTATUS(status) != os.EX_OK:
            raise NidabaTesseractException(
                'Tesseract child process exited with status {0}'.format(
                    os.WEXITSTATUS(status)))
        return storage.get_storage_path(output_path)

    # Child process from here on. It must never return into the caller's
    # code, so every outcome ends in os._exit.
    exit_code = 1
    try:
        api = tesseract.TessBaseAPICreate()
        rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
        if (rc):
            tesseract.TessBaseAPIDelete(api)
            raise NidabaTesseractException('Tesseract initialization failed.')

        # only do segmentation and script detection
        logger.debug('Setting page set mode to 2')
        tesseract.TessBaseAPISetPageSegMode(api, 2)

        logger.debug('Reading {} using leptonica'.format(input_path))
        pix = leptonica.pixRead(input_path.encode('utf-8'))
        logger.debug('Setting PIX as input image')
        tesseract.TessBaseAPISetImage2(api, pix)
        logger.debug('Analyzing page layout')
        it = tesseract.TessBaseAPIAnalyseLayout(api)
        logger.debug('Destroying PIX')
        leptonica.pixDestroy(ctypes.byref(pix))

        x0, y0, x1, y1 = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                          ctypes.c_int())

        def bounding_box(level):
            # Query the box of the iterator's current element at ``level``.
            tesseract.TessPageIteratorBoundingBox(it, level,
                                                  ctypes.byref(x0),
                                                  ctypes.byref(y0),
                                                  ctypes.byref(x1),
                                                  ctypes.byref(y1))
            return (x0.value, y0.value, x1.value, y1.value)

        w, h = Image.open(input_path).size
        logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
            w, h, *doc))
        tei = OCRRecord()
        tei.dimensions = (w, h)
        tei.img = storage.get_url(*doc)
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('tesseract', 'page segmentation')

        # Walk the page symbol by symbol, opening a new line or word
        # whenever the current symbol starts one.
        while True:
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
                box = bounding_box(RIL_TEXTLINE)
                tei.add_line(box)
                logger.debug(
                    'Segmenter found new line at {} {} {} {}'.format(*box))
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
                box = bounding_box(RIL_WORD)
                tei.add_segment(box)
                logger.debug(
                    'Segmenter found new word at {} {} {} {}'.format(*box))
            box = bounding_box(RIL_SYMBOL)
            tei.add_graphemes([{'grapheme': '', 'bbox': box}])
            logger.debug(
                'Segmenter found new symbol at {} {} {} {}'.format(*box))
            if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
                logger.debug('No more elements on page')
                break

        logger.debug('Deleting page iterator and base API')
        tesseract.TessPageIteratorDelete(it)
        tesseract.TessBaseAPIEnd(api)
        tesseract.TessBaseAPIDelete(api)
        logger.info('Writing segmentation to {}'.format(output_path))
        with open(output_path, 'w') as fp:
            tei.write_tei(fp)
        exit_code = os.EX_OK
    except Exception:
        logger.exception('Tesseract segmentation failed in child process')
    finally:
        logger.info('Quitting child process')
        os._exit(exit_code)
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True,
                    clean_gt=True, divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of
                                ground truths to choose from. When more than
                                one is given, the file sharing the longest
                                prefix with the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid
                             values are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT
                         DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT
                         DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the
                       output will be added to the tracking arguments and the
                       input document will be returned as the result of the
                       task. Use this to insert a statistical measure into a
                       chain without affecting the results.

    Returns:
        With ``divert`` disabled the ratio is written to storage and the
        output path is returned. Otherwise a dictionary with the keys
        ``diff_ratio``, ``ground_truth`` and ``doc``.

    Raises:
        NidabaInvalidParameterException: No ground truth was given or
                                         ``gt_format`` is unknown.
    """
    if not ground_truth:
        # fail with a clear message instead of a TypeError on indexing None
        raise NidabaInvalidParameterException('No ground truth given.')

    # ``doc`` is the storage tuple itself (it is unpacked the same way for
    # StorageFile below), so it must not be indexed before unpacking.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    # a list of candidates: pick the one matching the input document
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)

    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = OCRRecord()
            tei.load_tei(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            # NOTE(review): if ``html`` is lxml.html, parse() returns an
            # ElementTree which has no text_content(); ``.getroot()`` may be
            # needed here - confirm against the module's imports.
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format
                                                  + ' unknown.')

    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = OCRRecord()
            tei.load_tei(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()

    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))

    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    # compute the ratio once and reuse it for logging and the result
    ratio = sm.ratio()
    logger.debug('Accuracy: {}'.format(ratio))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    return {'diff_ratio': ratio, 'ground_truth': ground_truth, 'doc': doc}
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True,
                    clean_gt=True, divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    NOTE(review): a definition of ``text_diff_ratio`` with the same signature
    precedes this one in the file; unless a decorator not visible here
    registers them separately, this later definition is the one in effect.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode, unicode): Ground truth location tuple or a list
                                         of ground truths to choose from. When
                                         more than one is given, the matching
                                         one is selected by ``find_matching``.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT
                         DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT
                         DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain
                       without affecting the results.

    Returns:
        If ``divert`` is disabled the ratio is written to the output document
        and the output path (as produced by ``storage.insert_suffix``) is
        returned. Otherwise a dict with the keys 'diff_ratio', 'ground_truth'
        and 'doc' is returned.

    Raises:
        NidabaInvalidParameterException: if no ground truth is given or
                                         ``gt_format`` is not a known format.
    """
    def _read_tei_text(handle):
        # Parse a TEI file object and return its plain text serialization.
        rec = OCRRecord()
        rec.load_tei(handle)
        sink = StringIO.StringIO()
        rec.write_text(sink)
        return sink.getvalue()

    # Without this guard the ``ground_truth[0]`` lookup below dies with an
    # unhelpful TypeError (None) or IndexError (empty list).
    if not ground_truth:
        raise NidabaInvalidParameterException('No ground truth given.')

    # ``doc`` is one storage tuple (see the StorageFile call further down),
    # so the whole tuple is unpacked rather than its first element.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))

    # Anything but a string in first position indicates a list of candidate
    # ground truth tuples; pick the one belonging to this document.
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)

    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            gt = _read_tei_text(fp)
        elif gt_format == 'hocr':
            # NOTE(review): if ``html`` is lxml.html, parse() returns an
            # ElementTree which has no text_content(); confirm whether a
            # .getroot() call is needed here.
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format +
                                                  ' unknown.')

    with storage.StorageFile(*doc) as fp:
        text = _read_tei_text(fp) if xml_in else fp.read()

    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)

    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    # Compute the ratio once and reuse it for logging and output.
    ratio = sm.ratio()
    logger.debug('Accuracy: {}'.format(ratio))

    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    return {
        'diff_ratio': ratio,
        'ground_truth': ground_truth,
        'doc': doc,
    }
def ocr_kraken(doc, method=u'ocr_kraken', model=None):
    """
    Runs kraken on an input document and writes a TEI file.

    Args:
        doc (tuple): Pair of storage tuples. ``doc[0]`` is the TEI
                     segmentation that is loaded, ``doc[1]`` is the image the
                     recognizer is run on.
        method (unicode): The suffix string append to all output files
        model (unicode): Identifier for the font model to use. It is looked up
                         in the 'kraken_models' and then the 'ocropus_models'
                         section of the configuration.

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        NidabaInvalidParameterException: if ``model`` is in neither model
                                         section of the configuration.
    """
    input_path = storage.get_abs_path(*doc[1])
    # The output name is derived from the model *identifier*, so it has to be
    # built before the identifier is resolved to a file path below.
    output_path = (doc[1][0],
                   os.path.splitext(storage.insert_suffix(doc[1][1], method,
                                                          model))[0] + '.xml')

    logger.debug('Searching for model {}'.format(model))
    if model in nidaba_cfg['kraken_models']:
        model = storage.get_abs_path(*(nidaba_cfg['kraken_models'][model]))
    elif model in nidaba_cfg['ocropus_models']:
        model = storage.get_abs_path(*(nidaba_cfg['ocropus_models'][model]))
    else:
        raise NidabaInvalidParameterException('Model not defined in '
                                              'configuration')

    img = Image.open(input_path)
    # The segmentation lives in doc[0]; doc[1] is the image.
    logger.debug('Reading TEI segmentation from {}'.format(doc[0]))
    tei = OCRRecord()
    with storage.StorageFile(*doc[0]) as seg:
        tei.load_tei(seg)

    logger.debug('Clearing out word/grapheme boxes')
    # kraken is a line recognizer
    tei.clear_graphemes()
    tei.clear_segments()
    # add and scope new responsibility statement
    tei.add_respstmt('kraken', 'character recognition')
    lines = tei.lines

    logger.debug('Loading model {}'.format(model))
    rnn = models.load_any(model)
    logger.debug('Start recognizing characters')
    for line_id, rec in zip(lines, rpred.rpred(rnn, img, [x['bbox'] for x in lines.itervalues()])):
        # scope the current line and add all graphemes recognized by kraken to
        # it.
        logger.debug('Scoping line {}'.format(line_id))
        tei.scope_line(line_id)
        # Splitting on a capturing group alternates text and whitespace runs:
        # even indices are text, odd indices the whitespace between them.
        splits = regex.split(u'(\s+)', rec.prediction)
        line_offset = 0
        for segment, whitespace in izip_longest(splits[0::2], splits[1::2]):
            if segment:
                end = line_offset + len(segment)
                seg_bbox = max_bbox(rec.cuts[line_offset:end])
                logger.debug('Creating new segment at {} {} {} {}'.format(*seg_bbox))
                tei.add_segment(seg_bbox)
                logger.debug('Adding graphemes (segment): {}'.format(rec.prediction[line_offset:end]))
                tei.add_graphemes(_kraken_grapheme_records(rec, line_offset, end))
                line_offset = end
            # whitespace is None for the last text run (izip_longest fill)
            if whitespace:
                end = line_offset + len(whitespace)
                logger.debug('Adding graphemes (whitespace): {}'.format(rec.prediction[line_offset:end]))
                seg_bbox = max_bbox(rec.cuts[line_offset:end])
                tei.add_segment(seg_bbox)
                tei.add_graphemes(_kraken_grapheme_records(rec, line_offset, end))
                line_offset = end

    with storage.StorageFile(*output_path, mode='wb') as fp:
        logger.debug('Writing TEI to {}'.format(fp.abs_path))
        tei.write_tei(fp)
    return output_path


def _kraken_grapheme_records(rec, start, stop):
    """
    Builds the grapheme dicts passed to ``add_graphemes`` from the slice
    ``rec[start:stop]`` of a recognition record, scaling each confidence by
    100 and truncating it to an integer.
    """
    return [{'grapheme': x[0], 'bbox': x[1], 'confidence': int(x[2] * 100)}
            for x in rec[start:stop]]