def build_dict(segfun, n=50, stopword_list=None, filename=None):
    """Train a WeightEngine and return the first ``n`` entries of its sorted dict.

    Args:
        segfun: Segmentation function handed to ``WeightEngine``.
        n: Number of leading entries of the sorted dictionary to return.
        stopword_list: Forwarded to ``WeightEngine`` as ``stopwordList``.
        filename: When truthy, the engine's record is saved under this name.

    Returns:
        ``sort_dict(engine._dict)[:n]``; the ordering is whatever ``sort_dict``
        produces.
    """
    engine = WeightEngine(segfun, stopwordList=stopword_list)
    corpus = read_arbitrary_mock(2000000)
    engine.weight_learning(corpus)

    # Persisting the learned record is optional.
    if filename:
        engine.save_record(filename)

    ranked = sort_dict(engine._dict)
    return ranked[:n]
def _score_keywords(words, expect_list, none_list, hit_weight):
    """Return the score of ``words``: +hit_weight per expected word, -0.1 per unwanted one."""
    score = 0.0
    for word in words:
        # A word found in both lists counts as expected only.
        if word in expect_list:
            score += hit_weight
        elif word in none_list:
            score -= 0.1
    return score


def export_mmseg(we, text, funcList, stopword=None, expectList=None, noneList=None, n=5):
    """Build a comparison table of top keywords per weighting scheme and segmenter.

    Every combination of the engine's tf variants (``tf``, ``log_tf``,
    ``a_tf``, ``b_tf``, ``L_tf``) and df variants (``n_df``, ``idf``,
    ``prob_idf``) is evaluated on ``text`` once per segmentation function.

    Args:
        we: Engine object providing the tf/df methods, ``set_stopword``,
            ``set_seg_fun`` and ``tf_idf_dict``.
        text: Text passed to ``we.tf_idf_dict``.
        funcList: Segmentation functions to compare; their ``__name__`` is
            used for the column titles.
        stopword: Stopword list passed to ``we.set_stopword``; when falsy the
            engine's stopwords are reset with ``set_stopword(None)``.
        expectList: Words that raise the score when found among the top ``n``.
            Defaults to an empty list.
        noneList: Words that lower the score by 0.1 when found among the
            top ``n``. Defaults to an empty list.
        n: Number of top-ranked words kept per cell.

    Returns:
        A list of rows. The first row is the header; each following row is
        ``["<tf name> <df name>:", cell, ...]`` with one cell per function in
        ``funcList``. A cell is the space-joined top words immediately
        followed by the score.
    """
    # None sentinels replace the original mutable ``[]`` defaults.
    expectList = [] if expectList is None else expectList
    noneList = [] if noneList is None else noneList

    tf_list = [we.tf, we.log_tf, we.a_tf, we.b_tf, we.L_tf]
    df_list = [we.n_df, we.idf, we.prob_idf]

    # The title contains a literal backslash; it is escaped explicitly because
    # "\s" is an invalid escape sequence. The runtime text is unchanged.
    header = ["weighting method\\seg system"]
    if stopword:
        header += [func.__name__ + "(stoplist added)" for func in funcList]
        active_stopword = stopword
        # NOTE(review): a hit is worth 0.25 with a stopword list but 0.2
        # without one; both weights are kept exactly as the original had them.
        hit_weight = 0.25
    else:
        header += [func.__name__ for func in funcList]
        active_stopword = None
        hit_weight = 0.2

    compareList = [header]
    for tf in tf_list:
        for df in df_list:
            head = "%s %s:" % (tf.__name__, df.__name__)
            # Re-applied for every tf/df pair, matching the original call order.
            we.set_stopword(active_stopword)
            cells = []
            for func in funcList:
                we.set_seg_fun(func)
                ranked = sort_dict(we.tf_idf_dict(text, tf, df))
                top_words = [item[0] for item in ranked[:n]]
                score = _score_keywords(top_words, expectList, noneList, hit_weight)
                # NOTE(review): the score is appended with no separator after
                # the last word; kept for output compatibility.
                cells.append(" ".join(top_words) + str(score))
            compareList.append([head] + cells)
    return compareList
def show_range(MIN, MAX):
    """Plot entries ``MIN:MAX`` of each saved record, one curve per file.

    For every file in the hard-coded list a fresh ``WeightEngine`` loads the
    record, its ``_dict`` is passed through ``sort_dict``, and the second
    element of each entry in the ``MIN:MAX`` slice is plotted. The figure is
    shown once all curves are drawn.

    Args:
        MIN: Start index of the slice taken from the sorted entries.
        MAX: End index (exclusive) of that slice.
    """
    # Single-letter colour codes:
    # blue, cyan, green, black, magenta, red, white, yellow
    colorlist = "bcgkmrwy"
    filelist = [
        "mmseg_save.bak",
        "ICTCLAS_save.bak",
        "mmseg_save_with_stopwords.bak",
        "ICTCLAS_save_with_stopwords.bak",
    ]

    # Files pair with colours in order: blue, cyan, green, black.
    for record_file, color in zip(filelist, colorlist):
        engine = WeightEngine()
        engine.load_record(record_file)
        ranked = sort_dict(engine._dict)
        values = [entry[1] for entry in ranked[MIN:MAX]]
        plt.plot(values, color)

    plt.show()
def test_integrated_tf_idf(self):
    """Run every tf/df weighting pair over ``self.text`` and display the results.

    Each pair's full ranking is shown with ``showTable``; the first five
    words of every ranking are collected and printed together with
    ``col_printtable`` at the end.
    """
    engine = self.we
    tf_variants = [engine.tf, engine.log_tf, engine.a_tf, engine.b_tf, engine.L_tf]
    df_variants = [engine.n_df, engine.idf, engine.prob_idf]

    summary_rows = []
    for tf_func in tf_variants:
        for df_func in df_variants:
            ranked = sort_dict(engine.tf_idf_dict(self.text, tf_func, df_func))
            head = "%s %s:" % (tf_func.__name__, df_func.__name__)

            # Each ranked entry is indexed as (word, weight).
            words = [entry[0] for entry in ranked]
            weights = [entry[1] for entry in ranked]
            showTable(words, weights, title_name=head)

            # Only the five leading words go into the summary table.
            summary_rows.append([head] + words[:5])

    col_printtable(summary_rows)
def load_dict_from_save(filename, n=50):
    """Load a saved record and return the first ``n`` entries of its sorted dict.

    Args:
        filename: Record file passed to ``WeightEngine.load_record``.
        n: Number of leading entries of the sorted dictionary to return.

    Returns:
        ``sort_dict(engine._dict)[:n]``; the ordering is whatever ``sort_dict``
        produces.
    """
    engine = WeightEngine(mmseg_segfun)
    engine.load_record(filename)
    ranked = sort_dict(engine._dict)
    return ranked[:n]