Example #1
0
class WordExtractWithStopwordTestCase(unittest.TestCase):
    # Only use setUp() and tearDown() if necessary
    def setUp(self):
        # stopword list
        # Using mmseg to test extract word.
        self.we = WeightEngine(segment_function, stopwordList = STOPLIST)
        self.we.load_record("Hugerecord_save.dat")
        self.text = u"""
你说我在休假期间,大半夜哄完孩子来看代码也不容易。为什么要恶心我呢?
貌似GIT练习的强度不太够了,应该根据大家反馈搞一个新版本出来。不过今天要说的是注释的写法。
大家可以参看这个链接:http://www.kernel.org/pub/linux/kernel/v3.0/ChangeLog-3.3.3

要求带人的同学仔细看看,并且回复邮件说明我们和上面的之间的差距。这些同学有@克毅 @雄飞 @贺贺。

其他人仔细阅读,有心得的回复邮件参与讨论;没有心得的等我回去挨收拾吧。

再强调一遍,GIT是我们研发工作的基础,就好比我们说的话一样,都要说普通话,基础一样才能沟通交流。否则别的都是白扯。"""
        self.term = u"基础"

    def tearDown(self):
        pass
    def teststop_words(self):
        result1 = self.we.tf_idf_dict(self.text)
        show_dict(result1)
        result2 = self.we.tf_idf(self.term, self.text)
        print self.term, result2
        return
Example #2
0
def build_dict(segfun, n=50, stopword_list=None, filename=None):
    """Train a WeightEngine on mock data and return the first ``n`` sorted entries.

    Args:
        segfun: segmentation function handed to ``WeightEngine``.
        n: number of leading entries of the sorted dictionary to return.
        stopword_list: forwarded to ``WeightEngine`` as ``stopwordList``.
        filename: when truthy, the learned record is saved to this path.

    Returns:
        ``sort_dict(engine._dict)[:n]`` -- the ordering is whatever
        ``sort_dict`` produces (defined elsewhere).
    """
    engine = WeightEngine(segfun, stopwordList=stopword_list)
    # NOTE(review): 2000000 is presumably the size of the mock corpus --
    # confirm against read_arbitrary_mock.
    corpus = read_arbitrary_mock(2000000)
    engine.weight_learning(corpus)

    if filename:
        engine.save_record(filename)

    ranked = sort_dict(engine._dict)
    return ranked[:n]
Example #3
0
def AnalysisComparison(text, record, expectList=None, noneList=None):
    """Run the segmentation comparison for ``text`` and report the outcome.

    A ``WeightEngine`` built on ``mmseg_segfun`` is loaded from ``record``.
    The text is then passed to ``export_mmseg`` together with both
    segmentation functions and an extended stop list, and the result is
    handed to ``compareStatus``.

    Args:
        text: text to analyse.
        record: record file passed to ``WeightEngine.load_record``.
        expectList: forwarded to ``export_mmseg``; a new empty list is used
            when omitted. (Presumably the terms expected in the result --
            confirm against ``export_mmseg``.)
        noneList: forwarded to ``export_mmseg``; a new empty list is used
            when omitted. (Presumably the terms expected to be absent --
            confirm against ``export_mmseg``.)

    Returns:
        None. The comparison is reported through ``compareStatus``.
    """
    # Default arguments are evaluated once at definition time, so ``[]``
    # defaults would be shared (and possibly mutated by export_mmseg) across
    # calls. Create fresh lists per call instead.
    if expectList is None:
        expectList = []
    if noneList is None:
        noneList = []

    funcList = [mmseg_segfun, ICTCLAS_segfun]
    we = WeightEngine(mmseg_segfun)
    we.load_record(record)

    # A few very common tokens on top of the shared stop list.
    stop_list = [u"的", u"。", u",", u"是"] + STOPLIST

    result = export_mmseg(we, text, funcList, stop_list, expectList, noneList)
    compareStatus(result)
def show_range(MIN, MAX):
    """Plot, for each saved record, the values of its sorted entries MIN:MAX.

    One curve is drawn per record file, each with its own colour code, and
    the figure is shown once all curves have been added.

    Args:
        MIN: start index of the slice taken from the sorted dictionary.
        MAX: end index (exclusive) of that slice.
    """
    # Single-letter colour codes passed to plt.plot:
    # blue,cyan,green,black,magenta,red,white,yellow
    colorlist = "bcgkmrwy"

    # Files are paired with colours in order: blue, cyan, green, black.
    filelist = [
        "mmseg_save.bak",
        "ICTCLAS_save.bak",
        "mmseg_save_with_stopwords.bak",
        "ICTCLAS_save_with_stopwords.bak",
    ]

    for f, color in zip(filelist, colorlist):
        we = WeightEngine()
        we.load_record(f)
        ranked = sort_dict(we._dict)
        x = [item[1] for item in ranked[MIN:MAX]]
        plt.plot(x, color)

    plt.show()
Example #5
0
class WordExtractTestCase2(unittest.TestCase):

    # Only use setUp() and tearDown() if necessary
    def setUp(self):
        # Using mmseg to test extract word.
        self.we = WeightEngine(segment_function)
        self.we.load_record("Hugerecord_save.dat")
        self.text = u"这里要说的是,转义虽然对前端展示的时候带来了好处,但是确带来了数据的不一致性。"
        self.term = u"这里"
    def tearDown(self):
        pass
    def test_extract_tags(self):
        """
        TODO: HOW TO test this kind of function
        """
        result1 = self.we.tf_idf_dict(self.text)
        show_dict(result1)
        result2 = self.we.tf_idf(self.term, self.text)
        print self.term, result2
    def test_stop_words_filter(self):
        wordlst = [u"aa", u"bb"]
        self.assertEqual([stop_words_filter(wordlst, [u"aa"])], [u"bb"])
        pass
    def test_integrated_tf_idf(self):
        """
        tf_list
        idf_list
        """
        tf_list = [self.we.tf, self.we.log_tf, self.we.a_tf, self.we.b_tf, self.we.L_tf]
        df_list = [self.we.n_df, self.we.idf, self.we.prob_idf]
        print_format = list()

        for tf in tf_list:
            for df in df_list:
                result = sort_dict(self.we.tf_idf_dict(self.text, tf, df))
                head = "%s %s:" % (tf.__name__, df.__name__)
                showTable([word[0] for word in result],
                    [word[1] for word in result], title_name = head)
                print_format.append([head] + [word[0] for word in result[:5]])
        col_printtable(print_format)
Example #6
0
    def setUp(self):
        # stopword list
        # Using mmseg to test extract word.
        self.we = WeightEngine(segment_function, stopwordList = STOPLIST)
        self.we.load_record("Hugerecord_save.dat")
        self.text = u"""
你说我在休假期间,大半夜哄完孩子来看代码也不容易。为什么要恶心我呢?
貌似GIT练习的强度不太够了,应该根据大家反馈搞一个新版本出来。不过今天要说的是注释的写法。
大家可以参看这个链接:http://www.kernel.org/pub/linux/kernel/v3.0/ChangeLog-3.3.3

要求带人的同学仔细看看,并且回复邮件说明我们和上面的之间的差距。这些同学有@克毅 @雄飞 @贺贺。

其他人仔细阅读,有心得的回复邮件参与讨论;没有心得的等我回去挨收拾吧。

再强调一遍,GIT是我们研发工作的基础,就好比我们说的话一样,都要说普通话,基础一样才能沟通交流。否则别的都是白扯。"""
        self.term = u"基础"
Example #7
0
 def setUp(self):
     """Create the WeightEngine and the sample term/text used by the tests."""
     # Using mmseg to test extract word.
     engine = WeightEngine(segment_function)
     sample_term = u"发财"
     sample_text = u"发财啊,发财啊,操蛋啊,发财啊,不知道什么情况啊。"
     self.we = engine
     self.term = sample_term
     self.text = sample_text
Example #8
0
class WordExtractTestCase(unittest.TestCase):
    """Tests for WeightEngine record handling and its tf / df functions.

    Record-based tests load the ``record.dat`` fixture. The tf tests run on
    the sample ``self.term`` / ``self.text`` prepared in ``setUp`` and mostly
    check that the calls complete without raising.
    """

    # Only use setUp() and tearDown() if necessary
    def setUp(self):
        """Create the WeightEngine and the sample term/text."""
        # Using mmseg to test extract word.
        self.we = WeightEngine(segment_function)
        self.term = u"发财"
        self.text = u"发财啊,发财啊,操蛋啊,发财啊,不知道什么情况啊。"

    def tearDown(self):
        """Nothing to clean up."""

    def test_add_record(self):
        """Adding a term twice creates one entry and accumulates its count."""
        self.we.load_record("record.dat")
        self.we.add_record(u"操蛋", 15)
        self.assertEqual(len(self.we._dict), 4)
        self.we.add_record(u"操蛋", 15)
        self.assertEqual(len(self.we._dict), 4)
        self.assertEqual(self.we._dict[u"操蛋"], 30)
        self.assertEqual(self.we.N, 15)

    def test_load_record(self):
        """The fixture holds exactly three known terms with known counts."""
        self.we.load_record("record.dat")
        self.assertEqual(len(self.we._dict), 3)
        self.assertEqual(self.we._dict[u"理想"], 15)
        self.assertEqual(self.we._dict[u"文化"], 1)
        self.assertEqual(self.we._dict[u"德行"], 12)
        self.assertNotIn(u"操蛋", self.we._dict)
        self.assertNotIn(u"毛线", self.we._dict)

    def test_save_record(self):
        """Loading and re-saving the record must not raise."""
        self.we.load_record("record.dat")
        self.we.save_record("record_save.dat")

    # Disabled by the original author:
    # def test_weight_learning(self):
    #     self.we.weight_learning(read_tiny_mock())
    #     self.we.show_dict(sort=True)
    #     self.we.show_dict()

    def test_df(self):
        """df() of a term missing from the record is 1; a stored 15 gives 16."""
        self.we.load_record("record.dat")
        self.assertEqual(self.we.df(u"转发"), 1)
        self.assertEqual(self.we.df(u"理想"), 16)

    def test_idf(self):
        """idf() must accept terms that are not in the record."""
        self.we.load_record("record.dat")
        self.assertEqual(self.we.df(u"转发"), 1)
        self.assertEqual(self.we.df(u"理想"), 16)
        # Unicode literals, like every other test here: the record keys are
        # looked up with unicode terms, and non-ASCII byte strings do not
        # compare equal to unicode under Python 2.
        self.we.idf(u"美丽")
        self.we.idf(u"转发")
        self.we.idf(u"阿拉")

    def test_prob_idf(self):
        """prob_idf() must accept terms that are not in the record."""
        self.we.load_record("record.dat")
        # Unicode literals for consistency with the other lookups.
        self.we.prob_idf(u"美丽")
        self.we.prob_idf(u"转发")
        self.we.prob_idf(u"阿拉")

    def test_text_word(self):
        """The sample term must appear among the words segmented from the text.

        Previously named ``text_word``; without the ``test`` prefix unittest
        never collected it, so the assertion was never executed.
        """
        self.assertIn(self.term, self.we.text_word(self.text))

    def test_tf(self):
        """tf() runs on the segmented sample text."""
        self.we.tf(self.term, self.we.text_word(self.text))

    def test_log_tf(self):
        """log_tf() runs on the segmented sample text."""
        self.we.log_tf(self.term, self.we.text_word(self.text))

    def test_a_tf(self):
        """a_tf() runs on the segmented sample text."""
        self.we.a_tf(self.term, self.we.text_word(self.text))

    def test_b_tf(self):
        """b_tf() runs for the sample term and for the term ``dd``."""
        self.we.b_tf(self.term, self.we.text_word(self.text))
        # u"dd" does not occur in the sample sentence.
        self.we.b_tf(u"dd", self.we.text_word(self.text))

    def test_L_tf(self):
        """L_tf() runs for the sample term and for the term ``dd``."""
        self.we.L_tf(self.term, self.we.text_word(self.text))
        # u"dd" does not occur in the sample sentence.
        self.we.L_tf(u"dd", self.we.text_word(self.text))
Example #9
0
 def setUp(self):
     """Build the WeightEngine, load the saved record, set sample input."""
     # Using mmseg to test extract word.
     engine = WeightEngine(segment_function)
     self.we = engine
     engine.load_record("Hugerecord_save.dat")
     sample_text = u"这里要说的是,转义虽然对前端展示的时候带来了好处,但是确带来了数据的不一致性。"
     sample_term = u"这里"
     self.text = sample_text
     self.term = sample_term
Example #10
0
def load_dict_from_save(filename, n=50):
    """Load a saved record and return the first ``n`` entries after sorting.

    Args:
        filename: record file passed to ``WeightEngine.load_record``.
        n: number of leading entries of the sorted dictionary to return.

    Returns:
        ``sort_dict(engine._dict)[:n]`` -- the ordering is whatever
        ``sort_dict`` produces (defined elsewhere).
    """
    engine = WeightEngine(mmseg_segfun)
    engine.load_record(filename)
    ranked = sort_dict(engine._dict)
    return ranked[:n]