def spellcheck(tokens, dictionary, deletion_dictionary):
    """
    Performs a spell check on a sequence of tokens.

    The spelling of each sequence of characters is compared against a
    dictionary containing deletions of valid words and a dictionary of
    correct words.

    Args:
        tokens (iterable): An iterable returning sequences of unicode
                           characters.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.

    Returns:
        A dictionary mapping each character sequence that is not contained
        verbatim in the dictionary to a list of suggestions sorted from
        lowest to highest edit distance. Tokens that are not recognized as
        valid words but have no spelling suggestions either are also
        contained in the result dictionary.
    """
    suggestions = {}
    for tok in tokens:
        tok = alg.sanitize(tok)
        # skip tokens found verbatim in the base dictionary
        if alg.mmap_bin_search(tok, dictionary,
                               entryparser_fn=alg.key_for_single_word):
            continue
        # skip tokens that have already been looked up
        if tok in suggestions:
            continue
        ret = alg.mapped_sym_suggest(tok, deletion_dictionary, dictionary, 1)
        suggestions[tok] = alg.suggestions(tok, set.union(*ret.itervalues()))
    return suggestions
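
# A minimal usage sketch for ``spellcheck``. The dictionary paths below are
# hypothetical stand-ins for real base/deletion dictionary files:
#
#   sugg = spellcheck([u'tset', u'word'], u'/data/dict.bin',
#                     u'/data/deletions.bin')
#   # e.g. {u'tset': [u'test', u'set', ...]} -- u'word' is absent from the
#   # result because it was found verbatim in the base dictionary
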
def tei_spellcheck(facsimile, dictionary, deletion_dictionary,
                   filter_punctuation=False):
    """
    Performs a spell check on a TEI XML document.

    Each ``seg`` element is treated as a single word and spelling corrections
    will be inserted using a choice tag. Correct words will be left untouched
    and correction candidates will be sorted by edit distance.

    Args:
        facsimile (nidaba.tei.OCRRecord): OCR record object.
        dictionary (unicode): Path to a base dictionary.
        deletion_dictionary (unicode): Path to a deletion dictionary.
        filter_punctuation (bool): Switch to filter punctuation inside
                                   segments.

    Returns:
        An OCRRecord object containing the spelling corrections.
    """
    text_tokens = set(''.join(y['grapheme'] for y in
                              x.get('content').itervalues())
                      for x in facsimile.segments.itervalues())
    # drop the empty string produced by empty segments; discard() does not
    # raise if it is absent
    text_tokens.discard('')
    text_tokens = list(text_tokens)
    if filter_punctuation:
        text_tokens = [regex.sub(r'[^\w]', '', x) for x in text_tokens]
    suggestions = spellcheck(text_tokens, dictionary, deletion_dictionary)
    facsimile.add_respstmt('spell-checker', 'nidaba-levenshtein')
    for seg_id, segment in facsimile.segments.iteritems():
        key = alg.sanitize(''.join(x['grapheme'] for x in
                                   segment['content'].itervalues()))
        if filter_punctuation:
            key = regex.sub(r'[^\w]', '', key)
        if key not in suggestions:
            continue
        for sugg in suggestions[key]:
            facsimile.add_choices(seg_id,
                                  [{'alternative': sugg,
                                    'confidence': 100 - 10 * alg.edit_distance(key, sugg)}])
    return facsimile
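
# Hedged usage sketch for ``tei_spellcheck``: ``rec`` is assumed to be an
# OCRRecord populated from a TEI facsimile; the file and dictionary paths
# are hypothetical:
#
#   rec = OCRRecord()
#   with open('ocr_output.xml') as fp:
#       rec.load_tei(fp)
#   rec = tei_spellcheck(rec, u'/data/dict.bin', u'/data/deletions.bin',
#                        filter_punctuation=True)
#   # each misspelled segment now carries choice alternatives whose
#   # confidence decreases with edit distance
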
def cleanlines(path, encoding=u'utf-8', normalization=u'NFD'):
    """
    Reads in lines from a file and returns them as a sanitized list.
    Non-unique lines will be repeated.

    Args:
        path (unicode): Absolute path of the file to be read
        encoding (unicode): Encoding to use for decoding the file
        normalization (unicode): Normalization format to use

    Returns:
        list: List of lines containing the sanitized output, i.e. normalized
              unicode objects.
    """
    words = []
    with codecs.open(path, u'r', encoding=encoding) as lines:
        for line in lines:
            words.append(alg.sanitize(line, normalization=normalization))
    return words
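
# Example (assuming a plain-text word list with one entry per line; the path
# is hypothetical):
#
#   lines = cleanlines(u'/data/wordlist.txt')
#   # [u'abacus', u'abacus', ...] -- duplicates are preserved
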
def text_lexicality(doc, method=u'text_lexicality', language=u'', divert=True):
    """
    Calculates the lexicality of text in input documents.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        language (unicode): Identifier of the language dictionary configured
                            in ``nidaba_cfg['lang_dicts']``.
        divert (bool): Switch to return the calculated ratio directly
                       instead of writing it to the output document.

    Returns:
        (unicode, unicode): Storage tuple of the output document, or a
        dictionary containing the calculated ratio if ``divert`` is set.
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    dictionary = storage.get_abs_path(
        *nidaba_cfg['lang_dicts'][language]['dictionary'])
    with storage.StorageFile(*doc) as fp:
        tei = OCRRecord()
        tei.load_tei(fp)
    cnt = 0
    err_cnt = 0
    for seg_id, segment in tei.segments.iteritems():
        tok = alg.sanitize(''.join(x['grapheme'] for x in
                                   segment['content'].itervalues()))
        tok = regex.sub(r'[^\w]', '', tok)
        cnt += 1
        # count tokens not found in the dictionary
        if not alg.mmap_bin_search(tok, dictionary,
                                   entryparser_fn=alg.key_for_single_word):
            err_cnt += 1
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(err_cnt / float(cnt)))
        return output_path
    else:
        return {'edit_ratio': err_cnt / float(cnt),
                'ground_truth': '',
                'doc': doc}
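
# Sketch of a ``text_lexicality`` call under assumed configuration: a
# 'greek' entry in nidaba_cfg['lang_dicts'] and a TEI document reachable
# through the given storage tuple (both hypothetical):
#
#   ret = text_lexicality(doc, language=u'greek')
#   # with divert=True (the default) this returns e.g.
#   # {'edit_ratio': 0.12, 'ground_truth': '', 'doc': doc} -- the ratio of
#   # tokens not found in the dictionary
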
def cleanwords(path, encoding=u'utf-8', normalization=u'NFD'):
    """
    Reads in every word from a file as separated by lines and spaces.
    Non-unique words will be repeated as they are read in. Only words
    divided by a standard space are detected.

    Args:
        path (unicode): Absolute path of the file to be read
        encoding (unicode): Encoding to use for decoding the file
        normalization (unicode): Normalization format to use

    Returns:
        list: List of words containing the sanitized output, i.e. normalized
              unicode objects.
    """
    words = []
    with codecs.open(path, u'r', encoding=encoding) as lines:
        for line in lines:
            for seg in line.split(u' '):
                clean = alg.sanitize(seg, normalization=normalization)
                if clean != u'':
                    words.append(clean)
    return words
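
# Example (hypothetical path; note that only standard spaces and newlines
# split words, other whitespace such as tabs does not):
#
#   words = cleanwords(u'/data/corpus.txt')
#   # empty strings produced by consecutive spaces are dropped
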
def cleanup(text):
    """
    Removes lines containing only whitespace and normalizes to NFD.
    """
    text = alg.sanitize(text)
    return '\n'.join([s for s in text.splitlines() if len(s.strip())])
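
# Example:
#
#   cleanup(u'foo\n   \nbar')
#   # returns u'foo\nbar' (NFD-normalized, whitespace-only line dropped)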