Esempio n. 1
0
def build_dict(segfun, n=50, stopword_list=None, filename=None):
    """Train a fresh WeightEngine and return the first ``n`` sorted entries.

    Args:
        segfun: Segmentation callable handed to ``WeightEngine``.
        n: Number of leading entries of the sorted dictionary to return.
        stopword_list: Forwarded to ``WeightEngine`` as ``stopwordList``.
        filename: When truthy, the trained engine is saved through
            ``save_record(filename)`` before returning.

    Returns:
        ``sort_dict(engine._dict)[:n]``.
    """
    engine = WeightEngine(segfun, stopwordList=stopword_list)

    # The training input always comes from read_arbitrary_mock(2000000).
    training_data = read_arbitrary_mock(2000000)
    engine.weight_learning(training_data)

    if filename:
        engine.save_record(filename)

    ranked = sort_dict(engine._dict)
    return ranked[:n]
Esempio n. 2
0
def export_mmseg(we, text, funcList, stopword=None, expectList=(), noneList=(), n=5):
    """Build a table comparing top-ranked words per weighting scheme and segmenter.

    Every combination of the engine's term-frequency callables
    (``tf``, ``log_tf``, ``a_tf``, ``b_tf``, ``L_tf``) and document-frequency
    callables (``n_df``, ``idf``, ``prob_idf``) yields one table row; every
    function in ``funcList`` yields one column.

    Args:
        we: Engine object providing the tf/df callables listed above plus
            ``set_stopword``, ``set_seg_fun`` and ``tf_idf_dict``.
        text: Text passed to ``we.tf_idf_dict``.
        funcList: Segmentation functions; each is installed with
            ``we.set_seg_fun`` and named in the header via ``__name__``.
        stopword: When truthy it is installed with ``we.set_stopword`` and
            the header names get a "(stoplist added)" suffix; otherwise
            ``we.set_stopword(None)`` is used.
        expectList: Words that raise a cell's score when they appear among
            the top ``n`` results.
        noneList: Words that lower a cell's score by 0.1 when they appear
            among the top ``n`` results (checked only if the word is not in
            ``expectList``).
        n: Number of top results kept per cell.

    Returns:
        A list of rows. The first row is the header; every other row is
        ``[head, cell, ...]`` where each cell is the space-joined top words
        immediately followed by the score (no separator in between).
    """
    tf_list = [we.tf, we.log_tf, we.a_tf, we.b_tf, we.L_tf]
    df_list = [we.n_df, we.idf, we.prob_idf]

    use_stopword = bool(stopword)
    name_suffix = "(stoplist added)" if use_stopword else ""
    # NOTE(review): the bonus for an expected word is 0.25 with a stop list
    # and 0.2 without one. The asymmetry is kept as found - confirm whether
    # it is intentional.
    hit_bonus = 0.25 if use_stopword else 0.2
    miss_penalty = 0.1

    # The backslash is escaped explicitly; the resulting string is unchanged.
    header = ["weighting method\\seg system"]
    header += [func.__name__ + name_suffix for func in funcList]
    compareList = [header]

    for tf in tf_list:
        for df in df_list:
            head = "%s %s:" % (tf.__name__, df.__name__)
            elementList = []

            # Re-applied for every tf/df pair, exactly as often as before.
            we.set_stopword(stopword if use_stopword else None)
            for func in funcList:
                we.set_seg_fun(func)
                result = sort_dict(we.tf_idf_dict(text, tf, df))
                resultList = [item[0] for item in result[:n]]

                # Accumulate step by step from 0.0 so the printed float is
                # the same as with the previous implementation.
                p = 0.0
                for item in resultList:
                    if item in expectList:
                        p += hit_bonus
                    elif item in noneList:
                        p -= miss_penalty
                elementList.append(" ".join(resultList) + str(p))

            compareList.append([head] + elementList)
    return compareList
Esempio n. 3
0
def show_range(MIN, MAX):
    """Plot one slice of sorted values for each of four saved record files.

    Each file is loaded into a fresh ``WeightEngine``; the second element of
    every entry in ``sort_dict(engine._dict)[MIN:MAX]`` is drawn with
    ``plt.plot``, and the figure is displayed with ``plt.show()``.

    Args:
        MIN: Start index of the slice taken from the sorted entries.
        MAX: End index (exclusive) of the slice taken from the sorted entries.
    """
    # One single-letter code per curve, passed as the second plt.plot argument:
    # blue, cyan, green, black, magenta, red, white, yellow
    colorlist = "bcgkmrwy"

    filelist = [
        "mmseg_save.bak",                   # blue
        "ICTCLAS_save.bak",                 # cyan
        "mmseg_save_with_stopwords.bak",    # green
        "ICTCLAS_save_with_stopwords.bak",  # black
    ]

    for position, record_file in enumerate(filelist):
        engine = WeightEngine()
        engine.load_record(record_file)
        ranked = sort_dict(engine._dict)
        values = [entry[1] for entry in ranked[MIN:MAX]]
        plt.plot(values, colorlist[position])

    plt.show()
Esempio n. 4
0
    def test_integrated_tf_idf(self):
        """Show the ranking of ``self.text`` for every tf/df combination.

        For each pairing of the engine's term-frequency callables with its
        document-frequency callables, the sorted ``tf_idf_dict`` result is
        displayed with ``showTable``. The first five words of each ranking
        are collected and printed together with ``col_printtable``.
        """
        engine = self.we
        tf_variants = (engine.tf, engine.log_tf, engine.a_tf, engine.b_tf, engine.L_tf)
        df_variants = (engine.n_df, engine.idf, engine.prob_idf)
        summary_rows = []

        for tf_func in tf_variants:
            for df_func in df_variants:
                ranked = sort_dict(engine.tf_idf_dict(self.text, tf_func, df_func))
                label = "%s %s:" % (tf_func.__name__, df_func.__name__)

                # Split the ranked pairs into two parallel columns.
                words = [entry[0] for entry in ranked]
                scores = [entry[1] for entry in ranked]
                showTable(words, scores, title_name=label)

                summary_rows.append([label] + words[:5])

        col_printtable(summary_rows)
0
def load_dict_from_save(filename, n=50):
    """Load a saved record and return the first ``n`` sorted entries.

    Args:
        filename: Path handed to ``WeightEngine.load_record``.
        n: Number of leading entries of the sorted dictionary to return.

    Returns:
        ``sort_dict(engine._dict)[:n]`` for an engine built with
        ``mmseg_segfun``.
    """
    engine = WeightEngine(mmseg_segfun)
    engine.load_record(filename)
    ranked = sort_dict(engine._dict)
    return ranked[:n]