def makeDict(fileName):
    """Load a comma-separated "word,frequency" lexicon file.

    Parameters:
        fileName: path to a UTF-8 text file with one "word,frequency"
            pair per line.

    Returns:
        A tuple (words_transformed, words_clean, words_freq) where
        words_transformed is the list of transliterated words (via
        leven.transIn), words_clean maps transliterated word -> cleaned
        original word, and words_freq maps transliterated word ->
        integer frequency.
    """
    words_transformed = []
    words_clean = {}
    words_freq = {}
    # 'with' guarantees the file handle is closed; the original opened
    # the file and never closed it.
    with codecs.open(fileName, 'r', 'utf-8') as lexicon:
        for line in lexicon:
            (word, freq) = line.split(',')
            freq = int(freq.rstrip('\r\n'))
            # NOTE(review): r'\x11' strips the literal characters
            # backslash, 'x', '1' -- it was possibly meant to be the
            # control character '\x11'. Preserved as-is; confirm intent
            # before changing.
            word = preprocess_word(word.rstrip('\n\r' + r'\x11'))
            thisWord = leven.transIn(word)
            words_transformed.append(thisWord)
            words_clean[thisWord] = word
            words_freq[thisWord] = freq
    return (words_transformed, words_clean, words_freq)
def makeDict(fileName):
    """Build the lexicon lookup structures from a "word,frequency" file.

    NOTE(review): this is a verbatim duplicate of an earlier makeDict
    definition in this file; this later definition is the one that wins
    at import time. Consider deleting one copy.

    Parameters:
        fileName: path to a UTF-8 file, one "word,frequency" pair per line.

    Returns:
        (words_transformed, words_clean, words_freq): the list of
        transliterated words, a dict from transliterated word to cleaned
        original word, and a dict from transliterated word to integer
        frequency.
    """
    words_transformed = []
    words_clean = {}
    words_freq = {}
    # Context manager closes the file even on error (the original
    # leaked the handle).
    with codecs.open(fileName, 'r', 'utf-8') as source:
        for line in source:
            (word, freq) = line.split(',')
            freq = int(freq.rstrip('\r\n'))
            # NOTE(review): r'\x11' is the three literal characters
            # '\', 'x', '1' -- likely a typo for the '\x11' control
            # char. Behavior preserved; verify before fixing.
            word = preprocess_word(word.rstrip('\n\r' + r'\x11'))
            thisWord = leven.transIn(word)
            words_transformed.append(thisWord)
            words_clean[thisWord] = word
            words_freq[thisWord] = freq
    return (words_transformed, words_clean, words_freq)
def getCloseWords(wordIn, word_dicts, rules, max_weight,
                  threshold=3, fast=True, debug=False):
    """Find dictionary words within Levenshtein distance of wordIn.

    Parameters:
        wordIn: the query word; transliterated via leven.transIn before
            comparison.
        word_dicts: the (words_transformed, words_clean, words_freq)
            triple produced by makeDict.
        rules, max_weight: passed through to weight_for_leven_edits.
        threshold: maximum Levenshtein distance for a candidate to be kept.
        fast: when True, finding an exact match mid-scan aborts the scan,
            so the returned candidate list may be incomplete.
        debug: when True, print diagnostic information.

    Returns:
        A list of (word, lev_distance, num_edits, weight, 'xxx', 'yyy')
        tuples sorted by weight. If wordInTrans is already a dictionary
        word, returns [] without scanning.
    """
    import Levenshtein

    (dict_words, words_clean, words_freq) = word_dicts
    wordInTrans = leven.transIn(wordIn)
    if debug:
        # Single-string print() form works identically under Py2 and Py3
        # (the original used Py2-only multi-argument print statements).
        print("")
        print("getCloseWords for %s ( %s )" %
              (wordInTrans.encode('utf-8'), wordIn.encode('utf-8')))
        dump(wordIn)
    output_words = []
    if wordInTrans in dict_words:
        # Exact dictionary word: short-circuit with no candidates.
        pass
    else:
        for word in dict_words:
            lev_distance = Levenshtein.distance(wordInTrans, word)
            if lev_distance <= threshold:
                edits = Levenshtein.editops(wordInTrans, word)
                w = weight_for_leven_edits(wordInTrans, word, edits,
                                           rules, max_weight, debug=False)
                output_words.append(
                    (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
                if lev_distance == 0 and fast:
                    # Exact match found mid-scan: cut the search short.
                    # We may already hold some close matches, so this
                    # list is not complete. Sort by frequency first so
                    # the stable weight sort breaks weight ties by
                    # frequency. NOTE(review): the non-fast return below
                    # omits this frequency tie-break -- confirm whether
                    # that asymmetry is intended.
                    output_words = sorted(
                        output_words,
                        key=lambda cand: int(words_freq[cand[0]]))
                    return sorted(output_words,
                                  key=lambda cand: int(cand[3]))
    return sorted(output_words, key=lambda cand: cand[3])
def getCloseWords(wordIn, word_dicts, rules, max_weight,
                  threshold=3, fast=True, debug=False):
    """Return weighted near-matches for wordIn from the lexicon.

    NOTE(review): verbatim duplicate of an earlier getCloseWords
    definition in this file; this later one wins at import time.
    Consider removing one copy.

    Parameters:
        wordIn: query word, transliterated with leven.transIn.
        word_dicts: (words_transformed, words_clean, words_freq) from
            makeDict.
        rules, max_weight: forwarded to weight_for_leven_edits.
        threshold: keep candidates whose Levenshtein distance is <= this.
        fast: abort the scan on an exact match (result may be partial).
        debug: emit diagnostics.

    Returns:
        List of (word, lev_distance, num_edits, weight, 'xxx', 'yyy')
        sorted by weight; [] when wordInTrans is itself in the dictionary.
    """
    import Levenshtein

    (dict_words, words_clean, words_freq) = word_dicts
    wordInTrans = leven.transIn(wordIn)
    if debug:
        # print() with one pre-formatted string behaves the same on
        # Py2 and Py3 (original used Py2-only print statements).
        print("")
        print("getCloseWords for %s ( %s )" %
              (wordInTrans.encode('utf-8'), wordIn.encode('utf-8')))
        dump(wordIn)
    output_words = []
    if wordInTrans in dict_words:
        # Already a dictionary word: no candidates to collect.
        pass
    else:
        for candidate in dict_words:
            lev_distance = Levenshtein.distance(wordInTrans, candidate)
            if lev_distance <= threshold:
                edits = Levenshtein.editops(wordInTrans, candidate)
                w = weight_for_leven_edits(wordInTrans, candidate, edits,
                                           rules, max_weight, debug=False)
                output_words.append(
                    (candidate, lev_distance, len(edits), w, 'xxx', 'yyy'))
                if lev_distance == 0 and fast:
                    # Early exit on exact match: the list may be
                    # incomplete. Pre-sorting by frequency lets the
                    # stable sort by weight break ties by frequency.
                    # NOTE(review): the final return below lacks this
                    # tie-break -- confirm the asymmetry is intended.
                    output_words = sorted(
                        output_words,
                        key=lambda entry: int(words_freq[entry[0]]))
                    return sorted(output_words,
                                  key=lambda entry: int(entry[3]))
    return sorted(output_words, key=lambda entry: entry[3])