def unify_dirs(key_dir, value_dir, encode=True):
    """Join two lookup tables into a single key -> values dictionary.

    For every key of ``key_dir`` the associated indices are looked up in
    ``value_dir`` and the sequences found there are concatenated, in order,
    into one list.

    Args:
        key_dir: mapping from a key to an iterable of indices into
            ``value_dir``.
        value_dir: mapping from an index to an iterable of values.
        encode: when true, both the key and every collected value are
            passed through ``code_digraphs`` before being stored.

    Returns:
        A new dict; the input mappings are not modified.
    """
    merged = {}
    for stem in key_dir:
        # Flatten the value lists referenced by this key, keeping order.
        endings = [
            ending
            for slot in key_dir[stem]
            for ending in value_dir[slot]
        ]
        if not encode:
            merged[stem] = endings
            continue
        coded_stem = code_digraphs(stem)
        merged[coded_stem] = [code_digraphs(ending) for ending in endings]
    return merged

# forms = build_forms(u'resources/lab2/pocz.dat',
#                     u'resources/lab2/konc.dat')
# print(forms.stats())
def build_forms_histogram(filename, forms2basic, hist_size=0):
    """Build a histogram of the basic forms found in a text file.

    The file is read with ``tools.read_text`` (``clean_txt=False``), split
    into words by ``tools.find_words``, and each word is passed through
    ``code_digraphs``. Words that are keys of ``forms2basic`` are replaced
    by their mapped value; all other words are dropped.

    Args:
        filename: path handed to ``tools.read_text``.
        forms2basic: mapping from an encoded word to its basic form.
        hist_size: forwarded to ``build_histogram`` as ``size``.

    Returns:
        Whatever ``build_histogram`` returns for the collected forms, with
        ``laplace_smoothing=True``.
    """
    raw_text = tools.read_text(filename, clean_txt=False)
    coded_words = list(map(code_digraphs, tools.find_words(raw_text)))

    basic_forms = []
    for coded in coded_words:
        if coded in forms2basic:
            basic_forms.append(forms2basic[coded])

    # The histogram is built from tools.unique_list's output, not raw hits.
    distinct_forms = tools.unique_list(basic_forms)
    return build_histogram(
        distinct_forms,
        laplace_smoothing=True,
        size=hist_size,
    )
def build_forms_histogram(filename, forms2basic, hist_size=0):
    """Build a histogram of the basic forms found in a text file.

    The file is read with ``tools.read_text`` (``clean_txt=False``), split
    into words by ``tools.find_words``, and each word is passed through
    ``code_digraphs``. Words that are keys of ``forms2basic`` are replaced
    by their mapped value; all other words are dropped.

    Args:
        filename: path handed to ``tools.read_text``.
        forms2basic: mapping from an encoded word to its basic form.
        hist_size: forwarded to ``build_histogram`` as ``size``.

    Returns:
        Whatever ``build_histogram`` returns for the collected forms, with
        ``laplace_smoothing=True``.
    """
    raw_text = tools.read_text(filename, clean_txt=False)
    coded_words = list(map(code_digraphs, tools.find_words(raw_text)))

    basic_forms = []
    for coded in coded_words:
        if coded in forms2basic:
            basic_forms.append(forms2basic[coded])

    # The histogram is built from tools.unique_list's output, not raw hits.
    distinct_forms = tools.unique_list(basic_forms)
    return build_histogram(
        distinct_forms,
        laplace_smoothing=True,
        size=hist_size,
    )
def correct(self, new_word, unique=False):
    """Return candidate corrections for ``new_word``, most probable first.

    The word is encoded with ``code_digraphs`` and ``knn.nearest_k`` is
    asked for ``k=5`` entries from ``self.forms_dict.all_basic()`` using
    ``norm_begin_metric``. The selected entries are expanded through
    ``self.forms_dict.full_words`` and each expansion is scored with
    ``self.p``.

    Args:
        new_word: the word to correct.
        unique: when true, every selected entry is expanded on its own and
            only the first resulting word is kept; otherwise all selected
            entries are expanded in a single call.

    Returns:
        A list of ``(word, score)`` tuples sorted by descending score.
    """
    encoded = code_digraphs(new_word)
    basic_forms = self.forms_dict.all_basic()
    neighbours = knn.nearest_k(
        encoded, basic_forms, k=5, metric=norm_begin_metric
    )

    if unique:
        # One expansion call per neighbour; keep only its first word.
        candidates = [
            self.forms_dict.full_words([neighbour])[0]
            for neighbour in neighbours
        ]
    else:
        candidates = self.forms_dict.full_words(neighbours)

    scored = []
    for candidate in candidates:
        scored.append((candidate, self.p(encoded, candidate)))
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
def correct(self, new_word, unique=False):
    """Return candidate corrections for ``new_word``, most probable first.

    The word is encoded with ``code_digraphs`` and ``knn.nearest_k`` is
    asked for ``k=5`` entries from ``self.forms_dict.all_basic()`` using
    ``norm_begin_metric``. The selected entries are expanded through
    ``self.forms_dict.full_words`` and each expansion is scored with
    ``self.p``.

    Args:
        new_word: the word to correct.
        unique: when true, every selected entry is expanded on its own and
            only the first resulting word is kept; otherwise all selected
            entries are expanded in a single call.

    Returns:
        A list of ``(word, score)`` tuples sorted by descending score.
    """
    encoded = code_digraphs(new_word)
    basic_forms = self.forms_dict.all_basic()
    neighbours = knn.nearest_k(
        encoded, basic_forms, k=5, metric=norm_begin_metric
    )

    if unique:
        # One expansion call per neighbour; keep only its first word.
        candidates = [
            self.forms_dict.full_words([neighbour])[0]
            for neighbour in neighbours
        ]
    else:
        candidates = self.forms_dict.full_words(neighbours)

    scored = []
    for candidate in candidates:
        scored.append((candidate, self.p(encoded, candidate)))
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
def main_loop():
    """Run the interactive correction loop.

    Reads the lines of ``FORMS_SOURCE`` via ``tools.read_lines``, encodes
    each with ``distance.code_digraphs``, builds a corrector from both lists
    with ``distance.curry_correct`` and passes it, together with
    ``get_word``, to ``tools.ui_loop``.
    """
    original_forms = tools.read_lines(FORMS_SOURCE, clean_text=False)

    coded_forms = []
    for form in original_forms:
        coded_forms.append(distance.code_digraphs(form))

    corrector = distance.curry_correct(coded_forms, original_forms)
    tools.ui_loop(corrector, get_word)
def main_loop():
    """Run the interactive correction loop.

    Reads the lines of ``FORMS_SOURCE`` via ``tools.read_lines``, encodes
    each with ``distance.code_digraphs``, builds a corrector from both lists
    with ``distance.curry_correct`` and passes it, together with
    ``get_word``, to ``tools.ui_loop``.
    """
    original_forms = tools.read_lines(FORMS_SOURCE, clean_text=False)

    coded_forms = []
    for form in original_forms:
        coded_forms.append(distance.code_digraphs(form))

    corrector = distance.curry_correct(coded_forms, original_forms)
    tools.ui_loop(corrector, get_word)