def getCloseWords(wordIn, word_dicts, rules, max_weight, threshold=3, fast=True, debug=False):
    """Return candidate dictionary words within `threshold` Levenshtein edits of
    wordIn, as (word, lev_distance, n_edits, weight, 'xxx', 'yyy') tuples sorted
    by edit weight (ties broken by corpus frequency on the fast path)."""
    import Levenshtein
    (dict_words, words_clean, words_freq) = word_dicts
    wordInTrans = leven.transIn(wordIn)
    if debug:
        print
        print "getCloseWords for", wordInTrans.encode('utf-8'), "(", wordIn.encode('utf-8'), ")"
        dump(wordIn)
    output_words = []
    if wordInTrans in dict_words:
        # Exact dictionary match: short-circuit, no correction candidates needed.
        pass
    else:
        for word in dict_words:
            lev_distance = Levenshtein.distance(wordInTrans, word)
            if lev_distance <= threshold:
                edits = Levenshtein.editops(wordInTrans, word)
                w = weight_for_leven_edits(wordInTrans, word, edits, rules,
                                           max_weight, debug=False)
                output_words.append((word, lev_distance, len(edits), w, 'xxx', 'yyy'))
                if lev_distance == 0 and fast:
                    # An exact match cuts the search short. Some close matches may
                    # already have been collected, so this list is not complete.
                    output_words = sorted(output_words,
                                          key=lambda candidate: int(words_freq[candidate[0]]))
                    return sorted(output_words, key=lambda candidate: int(candidate[3]))
    return sorted(output_words, key=lambda candidate: candidate[3])
def weight_for_leven_edits(wordFrom, wordTo, edits, weight_rules, max_weight, debug=False):
    """Score a list of Levenshtein editops. Each operation gets a default weight
    (replace 10, delete 15, insert 18) unless a weight rule
    (editop, source chars or '*', target chars or '*', weight) overrides it.
    Scoring stops early once the cumulative weight reaches max_weight."""
    import unicodedata
    if debug:
        print
        print
        print "Weight Analysis"
        print "word in: ", wordFrom
        dump(wordFrom)
        print
        print "word to: ", wordTo
        dump(wordTo)
    cumulative_weight = 0
    for edit in edits:
        edit_weight = 0
        if debug:
            print edit
        (command, char_num_in_word_one, char_num_in_word_two) = edit
        # An index past the end of a word (e.g. an insert at the end) has no
        # corresponding character.
        if char_num_in_word_one > len(wordFrom) - 1:
            char_in_word_one = ''
        else:
            char_in_word_one = wordFrom[char_num_in_word_one]
        if char_num_in_word_two > len(wordTo) - 1:
            char_in_word_two = ''
        else:
            char_in_word_two = wordTo[char_num_in_word_two]
        if debug:
            print '\t', command
            if char_in_word_one:
                print '\t', unicodedata.name(char_in_word_one)
            else:
                print '\tx'
            if char_in_word_two:
                print '\t', unicodedata.name(char_in_word_two)
            else:
                print '\tx'
        # Default weights per edit operation.
        if command == 'replace':
            edit_weight = 10
        elif command == 'delete':
            edit_weight = 15
        elif command == 'insert':
            edit_weight = 18
        else:
            raise ValueError('unknown Levenshtein edit operation: ' + command)
        # The first matching rule overrides the default weight. The truthiness
        # checks keep an empty character from matching every rule's character set.
        for weight_rule in weight_rules:
            if (weight_rule[0] == command
                    and (weight_rule[1] == '*' or
                         (char_in_word_one and char_in_word_one in weight_rule[1]))
                    and (weight_rule[2] == '*' or
                         (char_in_word_two and char_in_word_two in weight_rule[2]))):
                if debug:
                    print '\t weight rule applied:'
                    print '\t', weight_rule
                edit_weight = weight_rule[3]
                break
        if debug:
            print '\tweight: ', edit_weight
        cumulative_weight += edit_weight
        if cumulative_weight >= max_weight:
            break
    return cumulative_weight
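# --- Illustrative sketch (not part of the original module) -----------------
# weight_for_leven_edits expects each weight rule as a 4-tuple:
#   (editop, source characters or '*', target characters or '*', weight).
# The rules and words below are invented examples; the real table used
# elsewhere in this project is teubner_serif_weights.
def _weight_rules_example():
    import Levenshtein
    example_rules = [
        (u'replace', u'o', u'0', 2),   # hypothetical: o -> 0 is a cheap OCR confusion
        (u'insert', '*', u',', 5),     # hypothetical: a spurious comma is fairly cheap
    ]
    word_from = u'word'
    word_to = u'w0rd'
    edits = Levenshtein.editops(word_from, word_to)   # [('replace', 1, 1)]
    # The replace rule applies, so this prints 2 rather than the default 10.
    print weight_for_leven_edits(word_from, word_to, edits, example_rules, 100)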
def unicode_test(word):
    """Print a diagnostic dump of a word, its NFD decomposition and its NFC
    composition, after mapping the plain combining circumflex to the Greek
    perispomeni."""
    import unicodedata
    print
    print
    circumflex = unicode(u"\N{COMBINING GREEK PERISPOMENI}")
    other_circumflex = unicode(u"\N{COMBINING CIRCUMFLEX ACCENT}")
    word = word.replace(other_circumflex, circumflex)
    print "*** ", word, ": "
    print "Input: "
    dump(word)
    nfd = unicodedata.normalize('NFD', word)
    if word != nfd:
        print "The decomposed version is NOT the same: ",
        print nfd
        dump(nfd)
    else:
        print "(NFD is the same)"
    try:
        nfc = unicodedata.normalize('NFC', word)
    except Exception as err:
        print err
        return
    print "NFC: ", nfc
    dump(nfc)
def process_vocab((vocab, word_dicts, max_weight)):
    """Spellcheck an iterable of OCR tokens against word_dicts and return the
    corrections as lines of the form original + EURO SIGN + correction."""
    from greek_tools import is_capitalized
    debug = True
    (dict_words, words_clean, words_freq) = word_dicts
    output_string = ''
    euro_sign = unicode(u"\N{EURO SIGN}")
    for wordIn in vocab:
        wordIn_original = wordIn
        wordIn = preprocess_word(wordIn)
        output_words = getCloseWords(wordIn, word_dicts, teubner_serif_weights,
                                     max_weight, threshold=3)
        # If the word has no acceptable match and it is capitalized, retry with
        # an uncapitalized version.
        isCapitalized = False
        hasBeenLowered = False
        if debug:
            print
            print wordIn.encode('utf-8')
        if is_capitalized(wordIn):
            if debug:
                print wordIn.encode('utf-8'), "is capitalized"
            isCapitalized = True
        min_weight = max_weight + 1
        for output_word in output_words:
            if output_word[3] < min_weight:
                min_weight = output_word[3]
        if debug:
            print "min_weight is ", min_weight
        if isCapitalized and (len(output_words) == 0 or min_weight > max_weight):
            if debug:
                for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                    print word, words_clean[word].encode('utf-8'), w, lev_distance, words_freq[word]
                print "not found directly, so using", wordIn.lower().encode('utf-8')
            output_words = getCloseWords(wordIn.lower(), word_dicts,
                                         teubner_serif_weights, max_weight, threshold=3)
            hasBeenLowered = True
        # If the input word is in the dictionary, leave it alone.
        if len(output_words) > 0 and output_words[0][1] == 0:
            if debug:
                print "*"
        else:
            if len(output_words) > 0 and output_words[0][3] < max_weight:
                best_result_word = words_clean[output_words[0][0]]
                if hasBeenLowered:
                    best_result_word = best_result_word.capitalize()
                if not (best_result_word == wordIn_original or
                        best_result_word == wordIn_original.lower()):
                    output_string += wordIn_original + euro_sign + best_result_word + '\n'
                if debug:
                    dump(wordIn_original)
                    print
                    dump(wordIn)
                    print
                    dump(best_result_word)
            for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                if hasBeenLowered:
                    word_to_print = word.capitalize()
                else:
                    word_to_print = word
                if debug:
                    print word_to_print, words_clean[word].encode('utf-8'), w, lev_distance, words_freq[word]
                if lev_distance == 0:
                    break
    return output_string
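# --- Illustrative sketch (not part of the original script) -----------------
# word_dicts is the triple that getCloseWords and process_vocab both unpack:
#   dict_words  : container of transliterated dictionary forms
#   words_clean : transliterated form -> clean Unicode form
#   words_freq  : transliterated form -> corpus frequency
# The entries and the max_weight of 40 below are invented placeholders; in the
# real pipeline the transliterated keys come from leven.transIn, so the
# candidates returned for a given OCR token depend on that transliteration.
def _word_dicts_example():
    dict_words = [u'paideia', u'paideias']
    words_clean = {u'paideia': u'παιδεία', u'paideias': u'παιδείας'}
    words_freq = {u'paideia': 120, u'paideias': 45}
    word_dicts = (dict_words, words_clean, words_freq)
    vocab = [u'παιδεια']   # OCR tokens to spellcheck
    print process_vocab((vocab, word_dicts, 40))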
root = treeIn.getroot()
# Select every hOCR word span, with or without the XHTML namespace.
hocr_word_elements = treeIn.xpath(
    "//html:span[@class='ocr_word'] | //span[@class='ocr_word']",
    namespaces={'html': "http://www.w3.org/1999/xhtml"})
for word_element in hocr_word_elements:
    try:
        word = unicodedata.normalize('NFD', word_element.text)
    except TypeError:
        word = unicodedata.normalize('NFD', unicode(word_element.text))
    # Split off leading and trailing punctuation; parts[1] is the bare token.
    parts = split_text_token(word)
    try:
        error_word = parts[1]
        # Look the token up in the corrections dictionary; a KeyError means
        # there is no correction for this word.
        parts = (parts[0], spellcheck_dict[parts[1]], parts[2])
        print "replaced", error_word, "with", parts[1]
        dump(error_word)
        print
        dump(parts[1])
        # Keep the uncorrected text on the element for later inspection.
        word_element.set('data-pre-spellcheck', word)
    except KeyError:
        pass
    word_element.text = parts[0] + parts[1] + parts[2]

fileOut.write(html_parser.unescape(
    etree.tostring(root, encoding="UTF-8", xml_declaration=True)))
fileOut.close()