def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separators instead of white
                              ones.

    Returns:
        (unicode, unicode): Storage tuple of the TEI file containing the
        segmentation. The record's ``img`` attribute is set to the URL of the
        input document the segmentation was calculated upon.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = os.path.splitext(
        storage.insert_suffix(input_path, method))[0] + '.xml'

    logger.debug('Reading image using PIL')
    img = Image.open(input_path)

    logger.debug('Initializing TEI with {} ({} {})'.format(doc[1], *img.size))
    tei = OCRRecord()
    tei.img = storage.get_url(*doc)
    tei.dimensions = img.size
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('kraken', 'page segmentation')

    for seg in pageseg.segment(img, black_colseps=black_colseps)['boxes']:
        logger.debug('Found line at {} {} {} {}'.format(*seg))
        tei.add_line(seg)

    # The output file is only created once segmentation has succeeded so a
    # failing run does not leave an empty XML file behind.
    with open(output_path, 'w') as fp:
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(output_path)
def any_to_png(doc, method=u'any_to_png'):
    """
    Re-encodes an image of any pillow-readable format (color or otherwise) as
    PNG.

    Pillow delegates decoding and encoding of image data to external
    libraries. To cover the formats most commonly used for digital archival
    the following are needed:

        - libtiff
        - zlib
        - libjpeg
        - openjpeg (version 2.0 +)
        - libwebp

    On Debian/Ubuntu all of them can be installed with:

    .. code-block:: console

        # apt-get -y install libtiff5-dev libjpeg62-turbo-dev zlib1g-dev libwebp-dev libopenjp2-dev

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    source = storage.get_abs_path(*doc)
    suffixed = storage.insert_suffix(source, method)
    # swap whatever extension the input had for .png
    stem = os.path.splitext(suffixed)[0]
    target = stem + '.png'
    converted = image.any_to_png(source, target)
    return storage.get_storage_path(converted)
def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None,
                  extended=False):
    """
    Runs tesseract on an input document.

    Args:
        doc (tuple): A pair of storage tuples: ``doc[0]`` is the TEI
                     segmentation file, ``doc[1]`` the image to recognize.
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers. A single
                          identifier given as a string is accepted as well.
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. Has no effect when
                         direct or legacy implementation is used.

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        TypeError: No languages were given.
        NidabaTesseractException: The module-level ``implementation`` setting
                                  is not one of legacy, direct, or capi.
    """
    # Fail before anything is written to the storage; previously a missing
    # language list only blew up after the UZN file had been created.
    if languages is None:
        raise TypeError('languages must be a list of tesseract classifier '
                        'identifiers, not None')
    if isinstance(languages, basestring):
        languages = [languages]

    image_path = storage.get_abs_path(*doc[1])
    uzn_name = splitext(doc[1][1])[0] + '.uzn'

    # rewrite the segmentation file to lines in UZN format
    logger.debug('Rewriting TEI ({}) -> UZN ({})'.format(doc[0][1], uzn_name))
    seg = TEIFacsimile()
    with storage.StorageFile(*doc[0]) as fp:
        seg.read(fp)
    with storage.StorageFile(doc[1][0], uzn_name, mode='wb') as fp:
        uzn = UZNWriter(fp)
        for line in seg.lines:
            # only the first four fields of a line are written
            uzn.writerow(*line[:4])

    output_path = storage.insert_suffix(image_path, method, *languages)
    tei_path = output_path + '.xml'

    logger.debug('Invoking tesseract with {} call method'.format(implementation))
    if implementation == 'legacy':
        result_path = output_path + '.html'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'direct':
        result_path = output_path + '.hocr'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'capi':
        result_path = tei_path
        ocr_capi(image_path, result_path, seg, languages, extended)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    # legacy/direct produce hOCR which is converted to TEI and then removed
    if not result_path.endswith('.xml'):
        logger.debug('Converting hOCR ({}) -> TEI ({})'.format(result_path,
                                                               tei_path))
        tei = TEIFacsimile()
        with open(result_path) as fp:
            tei.load_hocr(fp)
        os.unlink(result_path)
        with open(tei_path, 'wb') as fp:
            tei.write(fp)
        result_path = tei_path
    return storage.get_storage_path(result_path)
def segmentation_kraken(doc, method=u'segment_kraken', black_colseps=False):
    """
    Performs page segmentation using kraken's built-in algorithm and writes a
    skeleton TEI file.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        black_colseps (bool): Assume black column separators instead of white
                              ones.

    Returns:
        (unicode, unicode): Storage tuple of the TEI file holding the
        segmentation. The URL of the input document is stored in the record's
        ``img`` attribute.
    """
    input_path = storage.get_abs_path(*doc)
    stem = os.path.splitext(storage.insert_suffix(input_path, method))[0]
    xml_path = stem + '.xml'

    logger.debug('Reading image using PIL')
    img = Image.open(input_path)

    logger.debug('Initializing TEI with {} ({} {})'.format(doc[1], *img.size))
    tei = OCRRecord()
    tei.img = storage.get_url(*doc)
    tei.dimensions = img.size
    tei.title = os.path.basename(doc[1])
    tei.add_respstmt('kraken', 'page segmentation')

    boxes = pageseg.segment(img, black_colseps=black_colseps)['boxes']
    for seg in boxes:
        logger.debug('Found line at {} {} {} {}'.format(*seg))
        tei.add_line(seg)

    # Open the output only after the segmenter has returned: a crash inside
    # kraken must not leave a truncated XML file in the storage.
    with open(xml_path, 'w') as fp:
        logger.debug('Write segmentation to {}'.format(fp.name))
        tei.write_tei(fp)
    return storage.get_storage_path(xml_path)
def sauvola(doc, method=u'sauvola', whsize=10, factor=0.35):
    """
    Binarizes an input document with Sauvola thresholding as described in
    [0]. Expects 8bpp grayscale images as input.

    [0] Sauvola, Jaakko, and Matti Pietikäinen. "Adaptive document image
    binarization." Pattern recognition 33.2 (2000): 225-236.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files
        whsize (int): Half the width and height of the window local
                      statistics are calculated on. The minimal value is 2.
        factor (float): The threshold reduction factor due to variance.
                        0 =< factor < 1.

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: Input parameters are outside the
                                         valid range. NOTE(review): nothing
                                         is validated in this function;
                                         presumably raised by lept_sauvola.
    """
    source = storage.get_abs_path(*doc)
    # both parameters become part of the output file name
    suffix_parts = (unicode(whsize), unicode(factor))
    target = storage.insert_suffix(source, method, *suffix_parts)
    lept_sauvola(source, target, whsize, factor)
    return storage.get_storage_path(target)
def otsu(doc, method=u'otsu', thresh=100, mincount=50, bgval=255, smoothx=2,
         smoothy=2):
    """
    Binarizes an input document utilizing a naive implementation of Otsu's
    thresholding.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.
        thresh (int): Forwarded unchanged to ``leper.otsu_binarize``.
        mincount (int): Forwarded unchanged to ``leper.otsu_binarize``.
        bgval (int): Forwarded unchanged to ``leper.otsu_binarize``.
        smoothx (int): Forwarded unchanged to ``leper.otsu_binarize``.
        smoothy (int): Forwarded unchanged to ``leper.otsu_binarize``.

        NOTE(review): the meaning of the five numeric parameters is defined by
        leper; all of them have to be non-negative.

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: One of the numeric parameters is
                                         negative.
    """
    params = (thresh, mincount, bgval, smoothx, smoothy)
    # validate first so invalid calls never touch the storage
    if any(p < 0 for p in params):
        raise NidabaInvalidParameterException(
            'Parameters (' + u','.join(unicode(p) for p in params) +
            ') outside of valid range')

    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        *[unicode(p) for p in params])
    return storage.get_storage_path(
        leper.otsu_binarize(input_path, output_path, thresh, mincount, bgval,
                            smoothx, smoothy))
def sauvola(doc, method=u'sauvola', whsize=10, factor=0.35):
    """
    Applies Sauvola thresholding ([0]) to an input document. Expects 8bpp
    grayscale images as input.

    [0] Sauvola, Jaakko, and Matti Pietikäinen. "Adaptive document image
    binarization." Pattern recognition 33.2 (2000): 225-236.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files
        whsize (int): The window local statistics are calculated on is twice
                      this value in width and height. The minimal value is 2.
        factor (float): The threshold reduction factor due to variance.
                        0 =< factor < 1.

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: Input parameters are outside the
                                         valid range. NOTE(review): no check
                                         happens here; presumably raised by
                                         lept_sauvola.
    """
    src = storage.get_abs_path(*doc)
    window = unicode(whsize)
    reduction = unicode(factor)
    # window size and factor are encoded in the output file name
    dst = storage.insert_suffix(src, method, window, reduction)
    lept_sauvola(src, dst, whsize, factor)
    return storage.get_storage_path(dst)
def any_to_png(doc, method=u'any_to_png'):
    """
    Converts an image (color or otherwise) in any format pillow can read to
    PNG.

    Pillow needs external libraries to load and save image data. For the
    formats most commonly used in digital archival these are:

        - libtiff
        - zlib
        - libjpeg
        - openjpeg (version 2.0 +)
        - libwebp

    To install all of them on Debian/Ubuntu run:

    .. code-block:: console

        # apt-get -y install libtiff5-dev libjpeg62-turbo-dev zlib1g-dev libwebp-dev libopenjp2-dev

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    src = storage.get_abs_path(*doc)
    # drop the original extension, the result is always a .png
    base, _ = os.path.splitext(storage.insert_suffix(src, method))
    return storage.get_storage_path(image.any_to_png(src, base + '.png'))
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents, i.e. the share of
    segments that are not found in the dictionary configured for a language.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Key into the ``lang_dicts`` section of the nidaba
                            configuration selecting the dictionary.
        divert (bool): If set the ratio is returned in a dictionary together
                       with the input document, otherwise it is written to
                       the output file.

    Returns:
        With ``divert`` a dictionary with the keys ``edit_ratio``,
        ``ground_truth``, and ``doc``; otherwise the absolute path of the
        file the ratio was written to.

    Raises:
        ZeroDivisionError: The document does not contain any segments.
    """
    # NOTE(review): the path is derived from ``doc[0]`` while the file is
    # opened with ``doc``; one of the two probably expects a different shape
    # -- confirm against the callers.
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)

    cnt = 0
    err_cnt = 0
    # the record just loaded is the one to walk (was an undefined name)
    for segment in tei.segments.itervalues():
        tok = alg.sanitize(''.join(x['grapheme'] for x in
                                   segment['content'].itervalues()))
        # strip everything but word characters from the token itself
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        if not alg.mmap_bin_search(tok, dictionary,
                                   entryparser_fn=alg.key_for_single_word):
            err_cnt += 1

    ratio = err_cnt / float(cnt)
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    return {'edit_ratio': ratio, 'ground_truth': '', 'doc': doc}
def nlbin(doc, method=u'nlbin', threshold=0.5, zoom=0.5, escale=1.0,
          border=0.1, perc=80, range=20, low=5, high=90):
    """
    Binarizes an input document with ocropus'/kraken's nlbin algorithm.

    All eight numeric parameters are handed to ``kraken_nlbin`` unchanged and
    are also encoded in the name of the output file.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.
        threshold (float):
        zoom (float):
        escale (float):
        border (float):
        perc (int):
        range (int):
        low (int):
        high (int):

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: Input parameters are outside the
                                         valid range. NOTE(review): nothing
                                         is validated in this function;
                                         presumably raised by kraken_nlbin.
    """
    settings = (threshold, zoom, escale, border, perc, range, low, high)
    source = storage.get_abs_path(*doc)
    target = storage.insert_suffix(source, method,
                                   *[unicode(s) for s in settings])
    kraken_nlbin(source, target, *settings)
    return storage.get_storage_path(target)
def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None,
                  extended=False):
    """
    Runs tesseract on an input document.

    Args:
        doc (unicode, unicode): Storage tuple of the TEI segmentation file.
                                The image to recognize is taken from the
                                ``img`` attribute of the loaded record.
        method (unicode): The suffix string appended to all output files
        languages (list): A list of tesseract classifier identifiers. A single
                          identifier given as a string is accepted as well.
        extended (bool): Switch to enable extended hOCR generation containing
                         character cuts and confidences. Has no effect when
                         direct or legacy implementation is used.

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        TypeError: No languages were given.
        NidabaTesseractException: The module-level ``implementation`` setting
                                  is not one of legacy, direct, or capi.
    """
    # Fail before anything is written to the storage; previously a missing
    # language list only blew up after the UZN file had been created.
    if languages is None:
        raise TypeError('languages must be a list of tesseract classifier '
                        'identifiers, not None')
    if isinstance(languages, basestring):
        languages = [languages]

    seg = OCRRecord()
    with storage.StorageFile(*doc) as fp:
        seg.load_tei(fp)

    # write the line bounding boxes next to the segmentation as a UZN file
    uzn_name = splitext(doc[1])[0] + '.uzn'
    with storage.StorageFile(doc[0], uzn_name, mode='wb') as fp:
        uzn = UZNWriter(fp)
        for line in seg.lines.itervalues():
            uzn.writerow(*line['bbox'])

    image_path = storage.get_abs_path(
        *storage.get_storage_path_url(seg.img))
    output_path = storage.insert_suffix(image_path, method, *languages)
    tei_path = output_path + '.xml'

    logger.debug(
        'Invoking tesseract with {} call method'.format(implementation))
    if implementation == 'legacy':
        result_path = output_path + '.html'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'direct':
        result_path = output_path + '.hocr'
        ocr_direct(image_path, output_path, languages)
    elif implementation == 'capi':
        result_path = tei_path
        ocr_capi(image_path, result_path, seg, languages, extended)
    else:
        raise NidabaTesseractException('Invalid implementation selected',
                                       implementation)

    # legacy/direct produce hOCR which is converted to TEI and then removed
    if not result_path.endswith('.xml'):
        logger.debug('Converting hOCR ({}) -> TEI ({})'.format(
            result_path, tei_path))
        tei = OCRRecord()
        with open(result_path) as fp:
            tei.load_hocr(fp)
        os.unlink(result_path)
        with open(tei_path, 'wb') as fp:
            tei.write_tei(fp)
        result_path = tei_path
    return storage.get_storage_path(result_path)
def spell_check(doc, method=u'spell_check', language=u'',
                filter_punctuation=False):
    """
    Adds spelling suggestions to a TEI XML document.

    Alternative spellings for each segment will be included in a choice tag
    containing a series of corr tags with the original segment appearing
    beneath a sic element. Correct words, i.e. words appearing verbatim in the
    dictionary, are left untouched.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier defined in the nidaba configuration as
                            a valid dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   ``seg``

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    source = storage.get_abs_path(*doc)
    target = storage.insert_suffix(source, method, language,
                                   unicode(filter_punctuation))

    # both dictionaries live under the same language entry of the config
    words = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    deletions = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['deletion_dictionary'])

    with storage.StorageFile(*doc) as in_fp:
        logger.debug('Reading TEI ({})'.format(in_fp.abs_path))
        record = TEIFacsimile()
        record.read(in_fp)
        logger.debug('Performing spell check')
        corrected = lex.tei_spellcheck(record, words, deletions,
                                       filter_punctuation)

    with storage.StorageFile(*storage.get_storage_path(target),
                             mode='wb') as out_fp:
        logger.debug('Writing TEI ({})'.format(out_fp.abs_path))
        corrected.write(out_fp)
    return storage.get_storage_path(target)
def deskew(doc, method=u'deskew'):
    """
    Removes skew (rotational distortion) from an 1bpp input image.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    source = storage.get_abs_path(*doc)
    target = storage.insert_suffix(source, method)
    # the storage tuple is built from whatever path leper reports back
    deskewed = leper.deskew(source, target)
    return storage.get_storage_path(deskewed)
def rgb_to_gray(doc, method=u'rgb_to_gray'):
    """
    Converts an arbitrary bit depth image to grayscale and writes it back
    with a suffix appended to the file name.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    source = storage.get_abs_path(*doc)
    target = storage.insert_suffix(source, method)
    # the storage tuple is built from the path the conversion reports back
    gray = image.rgb_to_gray(source, target)
    return storage.get_storage_path(gray)
def deskew(doc, method=u'deskew'):
    """
    Removes skew (rotational distortion) from an 1bpp input image.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    src = storage.get_abs_path(*doc)
    dst = storage.insert_suffix(src, method)
    # lept_deskew writes to dst; its return value is not used
    lept_deskew(src, dst)
    return storage.get_storage_path(dst)
def deskew(doc, method=u'deskew'):
    """
    Corrects rotational distortion (skew) of an 1bpp input image.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    skewed_path = storage.get_abs_path(*doc)
    straight_path = storage.insert_suffix(skewed_path, method)
    lept_deskew(skewed_path, straight_path)
    # the result is addressed by the path computed above
    return storage.get_storage_path(straight_path)
def spell_check(doc, method=u'spell_check', language=u'',
                filter_punctuation=False):
    """
    Adds spelling suggestions to a TEI XML document.

    Alternative spellings for each segment will be included in a choice tag
    containing a series of corr tags with the original segment appearing
    beneath a sic element. Correct words, i.e. words appearing verbatim in the
    dictionary, are left untouched.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier defined in the nidaba configuration as
                            a valid dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   ``seg``

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    source = storage.get_abs_path(*doc)
    target = storage.insert_suffix(source, method, language,
                                   unicode(filter_punctuation))

    # resolve the word list and the deletion dictionary of the language
    word_list = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    deletion_list = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['deletion_dictionary'])

    with storage.StorageFile(*doc) as in_fp:
        logger.debug('Reading TEI ({})'.format(in_fp.abs_path))
        record = OCRRecord()
        record.load_tei(in_fp)
        logger.debug('Performing spell check')
        checked = lex.tei_spellcheck(record, word_list, deletion_list,
                                     filter_punctuation)

    with storage.StorageFile(*storage.get_storage_path(target),
                             mode='wb') as out_fp:
        logger.debug('Writing TEI ({})'.format(out_fp.abs_path))
        checked.write_tei(out_fp)
    return storage.get_storage_path(target)
def rgb_to_gray(doc, method=u'rgb_to_gray'):
    """
    Writes a grayscale version of an image of arbitrary bit depth, appending
    a suffix to the file name.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    color_path = storage.get_abs_path(*doc)
    gray_path = storage.insert_suffix(color_path, method)
    # the converter's return value, not gray_path, identifies the result
    result = image.rgb_to_gray(color_path, gray_path)
    return storage.get_storage_path(result)
def dewarp(doc, method=u'dewarp'):
    """
    Removes perspective distortion (as commonly exhibited by overhead scans)
    from an 1bpp input image.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    src = storage.get_abs_path(*doc)
    dst = storage.insert_suffix(src, method)
    # lept_dewarp writes to dst; its return value is not used
    lept_dewarp(src, dst)
    return storage.get_storage_path(dst)
def dewarp(doc, method=u'dewarp'):
    """
    Corrects the perspective distortion commonly exhibited by overhead scans
    in an 1bpp input image.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    warped_path = storage.get_abs_path(*doc)
    flat_path = storage.insert_suffix(warped_path, method)
    lept_dewarp(warped_path, flat_path)
    # the result is addressed by the path computed above
    return storage.get_storage_path(flat_path)
def otsu(doc, method=u'otsu'):
    """
    Binarizes an input document utilizing a naive implementation of Otsu's
    thresholding.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the output file
    """
    source = storage.get_abs_path(*doc)
    target = storage.insert_suffix(source, method)
    # the storage tuple is built from the path image.otsu reports back
    binarized = image.otsu(source, target)
    return storage.get_storage_path(binarized)
def ocr_tesseract(doc, method=u'ocr_tesseract', languages=None):
    """
    Runs tesseract on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        languages (list of unicode): A list of languages for the tesseract
                                     language model. A single language given
                                     as a string is accepted as well.

    Returns:
        (unicode, unicode): Storage tuple for the output file

    Raises:
        TypeError: No languages were given.
    """
    if languages is None:
        raise TypeError('languages must be a list of tesseract language '
                        'identifiers, not None')
    # a bare string would otherwise be unpacked character by character into
    # the output file name
    if isinstance(languages, basestring):
        languages = [languages]

    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method, *languages)
    return storage.get_storage_path(
        tesseract.ocr(input_path, output_path, languages))
def nlbin(doc, method=u'nlbin', threshold=0.5, zoom=0.5, escale=1.0,
          border=0.1, perc=80, range=20, low=5, high=90):
    """
    Binarizes an input document with ocropus'/kraken's nlbin algorithm.

    The image is opened with PIL, run through ``binarization.nlbin`` and the
    result saved under a name encoding all eight numeric parameters.

    Args:
        doc (unicode, unicode): The input document tuple.
        method (unicode): The suffix string appended to all output files.
        threshold (float):
        zoom (float):
        escale (float):
        border (float):
        perc (int):
        range (int):
        low (int):
        high (int):

    Returns:
        (unicode, unicode): Storage tuple of the output file

    Raises:
        NidabaInvalidParameterException: Input parameters are outside the
                                         valid range. NOTE(review): nothing
                                         is validated in this function;
                                         confirm where this is raised.
    """
    settings = (threshold, zoom, escale, border, perc, range, low, high)
    source = storage.get_abs_path(*doc)
    target = storage.insert_suffix(source, method,
                                   *[unicode(s) for s in settings])
    page = Image.open(source)
    binarized = binarization.nlbin(page, *settings)
    binarized.save(target)
    return storage.get_storage_path(target)
def ocr_ocropus(doc, method=u'ocr_ocropus', model=None):
    """
    Runs ocropus on an input document.

    Args:
        doc (tuple): A pair of storage tuples: ``doc[0]`` is the segmentation
                     file, ``doc[1]`` the image to recognize.
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use; a key of the
                         ``ocropus_models`` section of the configuration.

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    image_path = storage.get_abs_path(*doc[1])
    segmentation_path = storage.get_abs_path(*doc[0])

    # the model identifier goes into the file name, the extension becomes .xml
    suffixed = storage.insert_suffix(image_path, method, model)
    output_path = os.path.splitext(suffixed)[0] + '.xml'

    model_location = nidaba_cfg['ocropus_models'][model]
    model_path = storage.get_abs_path(*model_location)
    recognized = ocr(image_path, segmentation_path, output_path, model_path)
    return storage.get_storage_path(recognized)
def ocr_ocropus(doc, method=u'ocr_ocropus', model=None):
    """
    Runs ocropus on an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files
        model (unicode): Identifier for the font model to use; a key of the
                         ``ocropus_models`` section of the configuration.

    Returns:
        (unicode, unicode): Storage tuple for the output file
    """
    source = storage.get_abs_path(*doc)

    # the model identifier goes into the file name, the extension becomes
    # .html
    suffixed = storage.insert_suffix(source, method, model)
    target = os.path.splitext(suffixed)[0] + '.html'

    model_location = nidaba_cfg['ocropus_models'][model]
    model_path = storage.get_abs_path(*model_location)
    recognized = ocropus.ocr(source, target, model_path)
    return storage.get_storage_path(recognized)
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents, i.e. the share of
    segments that are not found in the dictionary configured for a language.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Key into the ``lang_dicts`` section of the nidaba
                            configuration selecting the dictionary.
        divert (bool): If set the ratio is returned in a dictionary together
                       with the input document, otherwise it is written to
                       the output file.

    Returns:
        With ``divert`` a dictionary with the keys ``edit_ratio``,
        ``ground_truth``, and ``doc``; otherwise the absolute path of the
        file the ratio was written to.

    Raises:
        ZeroDivisionError: The document does not contain any segments.
    """
    # NOTE(review): the path is derived from ``doc[0]`` while the file is
    # opened with ``doc``; one of the two probably expects a different shape
    # -- confirm against the callers.
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)

    total = 0
    misses = 0
    # iterate the record that was just loaded (was an undefined name)
    for segment in tei.segments.itervalues():
        graphemes = segment['content'].itervalues()
        tok = alg.sanitize(''.join(x['grapheme'] for x in graphemes))
        # reduce the token itself to its word characters
        tok = regex.sub(r'[^\w]', '', tok)
        total += 1
        if not alg.mmap_bin_search(
                tok, dictionary, entryparser_fn=alg.key_for_single_word):
            misses += 1

    ratio = misses / float(total)
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(ratio))
        return output_path
    return {
        'edit_ratio': ratio,
        'ground_truth': '',
        'doc': doc
    }
def blend_hocr(docs, method=u'blend_hocr', language=u''):
    """
    Blends multiple hOCR files using the algorithm from Bruce Robertson's
    rigaudon. It requires a working spell checker for the input document's
    language; otherwise all matched bboxes will be bunched together without
    any scoring.

    Args:
        docs [(id, path), ...]: A list of storage module tuples that will be
                                merged into a single output document.
        language (unicode): Language used for spell-checking based scoring. If
                            not defined no scoring will be used.
        method (unicode): The suffix string appended to the output file.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    # the name of the merged document is derived from the first input
    first_path = storage.get_abs_path(*docs[0])
    merged_path = storage.insert_suffix(first_path, method)
    destination = storage.get_storage_path(merged_path)
    return merge_hocr.merge(docs, language, destination)
def text_rep_confidence(doc, method=u'text_rep_confidence', divert=True):
    """
    Extracts self reported confidence values from input documents and
    averages them over all graphemes.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        divert (bool): If set the mean confidence is returned in a dictionary
                       together with the input document, otherwise it is
                       written to the output file.

    Returns:
        With ``divert`` a dictionary with the keys ``edit_ratio``,
        ``ground_truth``, and ``doc``; otherwise the absolute path of the
        file the value was written to.
    """
    # NOTE(review): the path is derived from ``doc[0]`` while the file is
    # opened with ``doc`` -- confirm the expected shape of ``doc``.
    source = storage.get_abs_path(*doc[0])
    target = storage.insert_suffix(source, method, os.path.basename(source))

    with storage.StorageFile(*doc) as fp:
        record = OCRRecord()
        record.load_tei(fp)
        confidences = [g['confidence']
                       for g in record.graphemes.itervalues()]
        mean_confidence = numpy.mean(confidences)

    if divert:
        return {'edit_ratio': mean_confidence, 'ground_truth': '', 'doc': doc}
    storage.write_text(*storage.get_storage_path(target),
                       text=unicode(mean_confidence))
    return target
def text_rep_confidence(doc, method=u'text_rep_confidence', divert=True):
    """
    Computes the mean of the confidence values an OCR engine reported for
    the graphemes of an input document.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        divert (bool): Selects between returning the value in a dictionary
                       (set) and writing it to the output file (unset).

    Returns:
        With ``divert`` a dictionary with the keys ``edit_ratio``,
        ``ground_truth``, and ``doc``; otherwise the absolute path of the
        file the value was written to.
    """
    # NOTE(review): ``doc[0]`` is unpacked here while ``doc`` is unpacked
    # for reading below -- verify against the caller.
    abs_in = storage.get_abs_path(*doc[0])
    abs_out = storage.insert_suffix(abs_in, method, os.path.basename(abs_in))

    with storage.StorageFile(*doc) as handle:
        ocr_record = OCRRecord()
        ocr_record.load_tei(handle)
        values = []
        for grapheme in ocr_record.graphemes.itervalues():
            values.append(grapheme['confidence'])
        average = numpy.mean(values)

    if not divert:
        out_tuple = storage.get_storage_path(abs_out)
        storage.write_text(*out_tuple, text=unicode(average))
        return abs_out
    return {'edit_ratio': average, 'ground_truth': '', 'doc': doc}
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    The libtesseract calls run in a forked child process; the parent only
    waits for it and inspects its exit status.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        Two storage tuples with the first one containing the segmentation and
        the second one being the file the segmentation was calculated upon.

    Raises:
        NidabaTesseractException: libtesseract is older than 3.02, the child
                                  process was killed by a signal, or it
                                  exited with an error.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path, method))[0] + '.xml'

    # Compare (major, minor) as one tuple. Testing the components separately
    # rejected newer releases with a small minor number, e.g. 4.0.
    ver = tesseract.TessVersion()
    if tuple(int(x) for x in ver.split('.')[:2]) < (3, 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. A plain fork is used so the child can be reaped
    # explicitly below.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        logger.info('Waiting for child to complete')
        while True:
            try:
                _, status = os.waitpid(pid, 0)
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    # interrupted by a signal; the child is still running
                    continue
                if e.errno != errno.ECHILD:
                    raise
                # the child has been reaped elsewhere, its status is unknown
                return storage.get_storage_path(output_path), doc
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        if os.WIFEXITED(status) and os.WEXITSTATUS(status) != os.EX_OK:
            raise NidabaTesseractException(
                'Tesseract segmentation failed in child process.')
        return storage.get_storage_path(output_path), doc

    # Child process from here on. Whatever happens it has to leave through
    # os._exit() so it never unwinds into the caller's stack and keeps running
    # as a second copy of the worker.
    exit_code = 1
    try:
        api = tesseract.TessBaseAPICreate()
        rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
        if rc:
            tesseract.TessBaseAPIDelete(api)
            raise NidabaTesseractException('Tesseract initialization failed.')

        # only do segmentation and script detection
        logger.debug('Setting page set mode to 2')
        tesseract.TessBaseAPISetPageSegMode(api, 2)
        logger.debug('Reading {} using leptonica'.format(input_path))
        pix = leptonica.pixRead(input_path.encode('utf-8'))
        logger.debug('Setting PIX as input image')
        tesseract.TessBaseAPISetImage2(api, pix)
        logger.debug('Analyzing page layout')
        it = tesseract.TessBaseAPIAnalyseLayout(api)
        logger.debug('Destroying PIX')
        leptonica.pixDestroy(ctypes.byref(pix))

        # output slots reused for every bounding box query
        coords = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                  ctypes.c_int())
        refs = [ctypes.byref(c) for c in coords]

        def bbox(level):
            """Returns (x0, y0, x1, y1) of the current element at level."""
            tesseract.TessPageIteratorBoundingBox(it, level, *refs)
            return tuple(c.value for c in coords)

        w, h = Image.open(input_path).size
        logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
            w, h, *doc))
        tei = TEIFacsimile()
        tei.document((w, h), os.path.join(*doc))
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('tesseract', 'page segmentation')

        # The iterator advances symbol by symbol; lines and words are opened
        # whenever the current symbol is the first one of such an element.
        while True:
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
                box = bbox(RIL_TEXTLINE)
                tei.add_line(box)
                logger.debug(
                    'Segmenter found new line at {} {} {} {}'.format(*box))
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
                box = bbox(RIL_WORD)
                tei.add_segment(box)
                logger.debug(
                    'Segmenter found new word at {} {} {} {}'.format(*box))

            box = bbox(RIL_SYMBOL)
            tei.add_graphemes([(None, box)])
            logger.debug(
                'Segmenter found new symbol at {} {} {} {}'.format(*box))
            if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
                logger.debug('No more elements on page')
                break

        logger.debug('Deleting page iterator and base API')
        tesseract.TessPageIteratorDelete(it)
        tesseract.TessBaseAPIEnd(api)
        tesseract.TessBaseAPIDelete(api)
        logger.info('Writing segmentation to {}'.format(output_path))
        with open(output_path, 'w') as fp:
            tei.write(fp)
        logger.info('Quitting child process')
        exit_code = os.EX_OK
    except Exception:
        logger.exception('Segmentation failed in child process')
    finally:
        os._exit(exit_code)
def segmentation_tesseract(doc, method=u'segment_tesseract'):
    """
    Performs page segmentation using tesseract's built-in algorithm and writes
    a TEI XML segmentation file.

    The libtesseract calls run in a forked child process; the parent only
    waits for it and inspects its exit status.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        (unicode, unicode): Storage tuple of the segmentation file. The URL
        of the input document is stored in the record's ``img`` attribute.

    Raises:
        NidabaTesseractException: libtesseract is older than 3.02, the child
                                  process was killed by a signal, or it
                                  exited with an error.
    """
    input_path = storage.get_abs_path(*doc)
    output_path = splitext(storage.insert_suffix(input_path, method))[0] + '.xml'

    # Compare (major, minor) as one tuple. Testing the components separately
    # rejected newer releases with a small minor number, e.g. 4.0.
    ver = tesseract.TessVersion()
    if tuple(int(x) for x in ver.split('.')[:2]) < (3, 2):
        raise NidabaTesseractException('libtesseract version is too old. Set '
                                       'implementation to direct.')

    # tesseract has a tendency to crash arbitrarily on some inputs
    # necessitating execution in a separate process to ensure the worker
    # doesn't just die. A plain fork is used so the child can be reaped
    # explicitly below.
    logger.info('Forking before entering unstable ctypes code')
    pid = os.fork()
    if pid != 0:
        logger.info('Waiting for child to complete')
        while True:
            try:
                _, status = os.waitpid(pid, 0)
                break
            except OSError as e:
                if e.errno == errno.EINTR:
                    # interrupted by a signal; the child is still running
                    continue
                if e.errno != errno.ECHILD:
                    raise
                # the child has been reaped elsewhere, its status is unknown
                return storage.get_storage_path(output_path)
        if os.WIFSIGNALED(status):
            raise NidabaTesseractException(
                'Tesseract killed by signal: {0}'.format(os.WTERMSIG(status)))
        if os.WIFEXITED(status) and os.WEXITSTATUS(status) != os.EX_OK:
            raise NidabaTesseractException(
                'Tesseract segmentation failed in child process.')
        return storage.get_storage_path(output_path)

    # Child process from here on. Whatever happens it has to leave through
    # os._exit() so it never unwinds into the caller's stack and keeps running
    # as a second copy of the worker.
    exit_code = 1
    try:
        api = tesseract.TessBaseAPICreate()
        rc = tesseract.TessBaseAPIInit3(api, tessdata.encode('utf-8'), None)
        if rc:
            tesseract.TessBaseAPIDelete(api)
            raise NidabaTesseractException('Tesseract initialization failed.')

        # only do segmentation and script detection
        logger.debug('Setting page set mode to 2')
        tesseract.TessBaseAPISetPageSegMode(api, 2)
        logger.debug('Reading {} using leptonica'.format(input_path))
        pix = leptonica.pixRead(input_path.encode('utf-8'))
        logger.debug('Setting PIX as input image')
        tesseract.TessBaseAPISetImage2(api, pix)
        logger.debug('Analyzing page layout')
        it = tesseract.TessBaseAPIAnalyseLayout(api)
        logger.debug('Destroying PIX')
        leptonica.pixDestroy(ctypes.byref(pix))

        # output slots reused for every bounding box query
        coords = (ctypes.c_int(), ctypes.c_int(), ctypes.c_int(),
                  ctypes.c_int())
        refs = [ctypes.byref(c) for c in coords]

        def bbox(level):
            """Returns (x0, y0, x1, y1) of the current element at level."""
            tesseract.TessPageIteratorBoundingBox(it, level, *refs)
            return tuple(c.value for c in coords)

        w, h = Image.open(input_path).size
        logger.info('Initializing TEI XML file with {}x{} {}/{}'.format(
            w, h, *doc))
        tei = OCRRecord()
        tei.dimensions = (w, h)
        tei.img = storage.get_url(*doc)
        tei.title = os.path.basename(doc[1])
        tei.add_respstmt('tesseract', 'page segmentation')

        # The iterator advances symbol by symbol; lines and words are opened
        # whenever the current symbol is the first one of such an element.
        while True:
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_TEXTLINE):
                box = bbox(RIL_TEXTLINE)
                tei.add_line(box)
                logger.debug(
                    'Segmenter found new line at {} {} {} {}'.format(*box))
            if tesseract.TessPageIteratorIsAtBeginningOf(it, RIL_WORD):
                box = bbox(RIL_WORD)
                tei.add_segment(box)
                logger.debug(
                    'Segmenter found new word at {} {} {} {}'.format(*box))

            box = bbox(RIL_SYMBOL)
            tei.add_graphemes([{
                'grapheme': '',
                'bbox': box
            }])
            logger.debug(
                'Segmenter found new symbol at {} {} {} {}'.format(*box))
            if not tesseract.TessPageIteratorNext(it, RIL_SYMBOL):
                logger.debug('No more elements on page')
                break

        logger.debug('Deleting page iterator and base API')
        tesseract.TessPageIteratorDelete(it)
        tesseract.TessBaseAPIEnd(api)
        tesseract.TessBaseAPIDelete(api)
        logger.info('Writing segmentation to {}'.format(output_path))
        with open(output_path, 'w') as fp:
            tei.write_tei(fp)
        logger.info('Quitting child process')
        exit_code = os.EX_OK
    except Exception:
        logger.exception('Segmentation failed in child process')
    finally:
        os._exit(exit_code)
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True,
                    clean_gt=True, divert=True):
    """
    Calculates the similarity of the input document and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of
                                ground truths to choose from. When more than
                                one is given, the file sharing the longest
                                prefix with the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid
                             values are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT
                         DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT
                         DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the
                       output will be added to the tracking arguments and the
                       input document will be returned as the result of the
                       task. Use this to insert a statistical measure into a
                       chain without affecting the results.

    Returns:
        dict: If ``divert`` is enabled, a dictionary with the keys
        ``diff_ratio``, ``ground_truth``, and ``doc``.

        (unicode, unicode): Otherwise the storage tuple of the output
        document the ratio was written to.

    Raises:
        NidabaInvalidParameterException: if no ground truth is given or
                                         ``gt_format`` is not one of the
                                         supported values.
    """
    # NOTE(review): a function with the same name is defined again directly
    # below in this module; the later definition replaces this one at import
    # time.
    if not ground_truth:
        # The default is None; fail with a meaningful message instead of a
        # TypeError/IndexError from the subscript below.
        raise NidabaInvalidParameterException('No ground truth given.')

    # doc is a single storage tuple (it is unpacked as such for StorageFile
    # further down), so it has to be unpacked as a whole here as well.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))

    # A list of candidate tuples was given: pick the one matching the input.
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)

    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            # NOTE(review): if ``html`` is lxml.html, parse() returns an
            # ElementTree which may need .getroot() before text_content()
            # -- confirm against the module's imports.
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format +
                                                  ' unknown.')

    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()

    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)

    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))

    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    # Compute the ratio once and reuse it for logging and output.
    ratio = sm.ratio()
    logger.debug('Accuracy: {}'.format(ratio))

    if divert:
        return {'diff_ratio': ratio, 'ground_truth': ground_truth, 'doc': doc}

    # Return a storage tuple (as documented and as every other task does)
    # rather than the absolute path.
    output_doc = storage.get_storage_path(output_path)
    storage.write_text(*output_doc, text=unicode(ratio))
    return output_doc
def text_diff_ratio(doc, method=u'text_diff_ratio', ground_truth=None,
                    xml_in=True, gt_format=u'tei', clean_in=True,
                    clean_gt=True, divert=True):
    """
    Calculates the similarity of the input document and a given ground truth
    using the algorithm of python's difflib SequenceMatcher. The result is a
    value between 0.0 (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of
                                ground truths to choose from. When more than
                                one is given, the file sharing the longest
                                prefix with the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid
                             values are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT
                         DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT
                         DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the
                       output will be added to the tracking arguments and the
                       input document will be returned as the result of the
                       task. Use this to insert a statistical measure into a
                       chain without affecting the results.

    Returns:
        dict: If ``divert`` is enabled, a dictionary with the keys
        ``diff_ratio``, ``ground_truth``, and ``doc``.

        (unicode, unicode): Otherwise the storage tuple of the output
        document the ratio was written to.

    Raises:
        NidabaInvalidParameterException: if no ground truth is given or
                                         ``gt_format`` is not one of the
                                         supported values.
    """
    # NOTE(review): an earlier definition with the same name precedes this
    # one in the module; this later definition is the one that remains bound
    # after import.

    def _tei_text(handle):
        # Reads a TEI facsimile from the open file and returns its plain
        # text serialization.
        record = TEIFacsimile()
        record.read(handle)
        buf = StringIO.StringIO()
        record.write_text(buf)
        return buf.getvalue()

    if not ground_truth:
        # ground_truth defaults to None; indexing it below would only
        # produce an opaque TypeError/IndexError.
        raise NidabaInvalidParameterException('No ground truth given.')

    # The document tuple is unpacked as a whole, matching the StorageFile
    # call below; unpacking doc[0] would splat a single string.
    input_path = storage.get_abs_path(*doc)
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))

    # Several candidates given: select the one matching the input document.
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)

    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            gt = _tei_text(fp)
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format +
                                                  ' unknown.')

    with storage.StorageFile(*doc) as fp:
        text = _tei_text(fp) if xml_in else fp.read()

    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)

    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))

    sm = difflib.SequenceMatcher()
    sm.set_seqs(text, gt)
    # Evaluate the ratio a single time; it is needed for the log line and
    # for whichever output path is taken.
    ratio = sm.ratio()
    logger.debug('Accuracy: {}'.format(ratio))

    if not divert:
        # Hand back a storage tuple, as documented, instead of the absolute
        # file system path.
        output_doc = storage.get_storage_path(output_path)
        storage.write_text(*output_doc, text=unicode(ratio))
        return output_doc

    return {
        'diff_ratio': ratio,
        'ground_truth': ground_truth,
        'doc': doc
    }