class WordExtractWithStopwordTestCase(unittest.TestCase): # Only use setUp() and tearDown() if necessary def setUp(self): # stopword list # Using mmseg to test extract word. self.we = WeightEngine(segment_function, stopwordList = STOPLIST) self.we.load_record("Hugerecord_save.dat") self.text = u""" 你说我在休假期间,大半夜哄完孩子来看代码也不容易。为什么要恶心我呢? 貌似GIT练习的强度不太够了,应该根据大家反馈搞一个新版本出来。不过今天要说的是注释的写法。 大家可以参看这个链接:http://www.kernel.org/pub/linux/kernel/v3.0/ChangeLog-3.3.3 要求带人的同学仔细看看,并且回复邮件说明我们和上面的之间的差距。这些同学有@克毅 @雄飞 @贺贺。 其他人仔细阅读,有心得的回复邮件参与讨论;没有心得的等我回去挨收拾吧。 再强调一遍,GIT是我们研发工作的基础,就好比我们说的话一样,都要说普通话,基础一样才能沟通交流。否则别的都是白扯。""" self.term = u"基础" def tearDown(self): pass def teststop_words(self): result1 = self.we.tf_idf_dict(self.text) show_dict(result1) result2 = self.we.tf_idf(self.term, self.text) print self.term, result2 return
def build_dict(segfun, n=50, stopword_list=None, filename = None):
    """Train a WeightEngine on mock data and return its first ``n`` sorted entries.

    Args:
        segfun: segmentation function handed to ``WeightEngine``.
        n: number of leading entries of the sorted dictionary to return.
        stopword_list: forwarded to ``WeightEngine`` as ``stopwordList``.
        filename: when truthy, the learned record is saved to this path.

    Returns:
        ``sort_dict(engine._dict)[:n]``.
    """
    engine = WeightEngine(segfun, stopwordList=stopword_list)
    corpus = read_arbitrary_mock(2000000)
    engine.weight_learning(corpus)
    if filename:
        engine.save_record(filename)
    ranked = sort_dict(engine._dict)
    return ranked[:n]
def AnalysisComparison(text, record, expectList=None, noneList=None):
    """Run the segmenter comparison on ``text`` and report the result.

    A ``WeightEngine`` using ``mmseg_segfun`` is loaded from ``record``; the
    text is then passed to ``export_mmseg`` together with both segmentation
    functions and an extended stopword list, and the outcome is handed to
    ``compareStatus``.

    Args:
        text: text to analyse.
        record: path of the saved record to load into the engine.
        expectList: forwarded to ``export_mmseg``; defaults to a new empty list.
        noneList: forwarded to ``export_mmseg``; defaults to a new empty list.
    """
    # Fresh lists per call: a literal [] default would be one shared object
    # that any callee mutation would leak into later calls.
    if expectList is None:
        expectList = []
    if noneList is None:
        noneList = []

    funcList = [mmseg_segfun, ICTCLAS_segfun]
    we = WeightEngine(mmseg_segfun)
    we.load_record(record)

    # A few very common tokens are filtered in addition to STOPLIST.
    stop_list = [u"的", u"。", u",", u"是"] + STOPLIST
    result = export_mmseg(we, text, funcList, stop_list, expectList, noneList)
    compareStatus(result)
def show_range(MIN, MAX):
    """Plot entries ``MIN:MAX`` of the sorted dictionary of each saved record.

    One curve is drawn per record file, each in its own colour, and the
    figure is shown once all curves are plotted.
    """
    # blue,cyan,green,black,magenta,red,white,yellow
    palette = "bcgkmrwy"
    # The four files below therefore plot as blue, cyan, green and black.
    record_files = [
        "mmseg_save.bak",
        "ICTCLAS_save.bak",
        "mmseg_save_with_stopwords.bak",
        "ICTCLAS_save_with_stopwords.bak",
    ]
    for color, record_file in zip(palette, record_files):
        engine = WeightEngine()
        engine.load_record(record_file)
        ranked = sort_dict(engine._dict)
        # Only the second element of each sorted entry is plotted.
        plt.plot([entry[1] for entry in ranked[MIN:MAX]], color)
    plt.show()
class WordExtractTestCase2(unittest.TestCase): # Only use setUp() and tearDown() if necessary def setUp(self): # Using mmseg to test extract word. self.we = WeightEngine(segment_function) self.we.load_record("Hugerecord_save.dat") self.text = u"这里要说的是,转义虽然对前端展示的时候带来了好处,但是确带来了数据的不一致性。" self.term = u"这里" def tearDown(self): pass def test_extract_tags(self): """ TODO: HOW TO test this kind of function """ result1 = self.we.tf_idf_dict(self.text) show_dict(result1) result2 = self.we.tf_idf(self.term, self.text) print self.term, result2 def test_stop_words_filter(self): wordlst = [u"aa", u"bb"] self.assertEqual([stop_words_filter(wordlst, [u"aa"])], [u"bb"]) pass def test_integrated_tf_idf(self): """ tf_list idf_list """ tf_list = [self.we.tf, self.we.log_tf, self.we.a_tf, self.we.b_tf, self.we.L_tf] df_list = [self.we.n_df, self.we.idf, self.we.prob_idf] print_format = list() for tf in tf_list: for df in df_list: result = sort_dict(self.we.tf_idf_dict(self.text, tf, df)) head = "%s %s:" % (tf.__name__, df.__name__) showTable([word[0] for word in result], [word[1] for word in result], title_name = head) print_format.append([head] + [word[0] for word in result[:5]]) col_printtable(print_format)
def setUp(self):
    """Create the stopword-aware engine, load its record, set text and term."""
    # Stopword list is supplied; mmseg is used to test word extraction.
    engine = WeightEngine(segment_function, stopwordList=STOPLIST)
    self.we = engine
    engine.load_record("Hugerecord_save.dat")
    self.text = u""" 你说我在休假期间,大半夜哄完孩子来看代码也不容易。为什么要恶心我呢? 貌似GIT练习的强度不太够了,应该根据大家反馈搞一个新版本出来。不过今天要说的是注释的写法。 大家可以参看这个链接:http://www.kernel.org/pub/linux/kernel/v3.0/ChangeLog-3.3.3 要求带人的同学仔细看看,并且回复邮件说明我们和上面的之间的差距。这些同学有@克毅 @雄飞 @贺贺。 其他人仔细阅读,有心得的回复邮件参与讨论;没有心得的等我回去挨收拾吧。 再强调一遍,GIT是我们研发工作的基础,就好比我们说的话一样,都要说普通话,基础一样才能沟通交流。否则别的都是白扯。"""
    self.term = u"基础"
def setUp(self):
    """Create a bare engine (no record loaded) plus the sample term and text."""
    # Using mmseg to test extract word.
    engine = WeightEngine(segment_function)
    self.we = engine
    self.term = u"发财"
    self.text = u"发财啊,发财啊,操蛋啊,发财啊,不知道什么情况啊。"
class WordExtractTestCase(unittest.TestCase):
    """Tests for WeightEngine record handling and its tf/idf helpers.

    Record tests load ``record.dat``; the term-frequency tests only need the
    sample ``self.term`` / ``self.text`` prepared in ``setUp``.
    """

    # Only use setUp() and tearDown() if necessary

    def setUp(self):
        """Create a fresh engine and the shared sample term and text."""
        # Using mmseg to test extract word.
        self.we = WeightEngine(segment_function)
        self.term = u"发财"
        self.text = u"发财啊,发财啊,操蛋啊,发财啊,不知道什么情况啊。"

    def tearDown(self):
        """Nothing to clean up."""

    def _words(self):
        """Return the sample text as processed by ``WeightEngine.text_word``."""
        return self.we.text_word(self.text)

    def test_add_record(self):
        """Adding a new term grows the dict; re-adding it accumulates."""
        self.we.load_record("record.dat")
        # record.dat holds three terms (see test_load_record).
        self.we.add_record(u"操蛋", 15)
        self.assertEqual(len(self.we._dict), 4)
        self.we.add_record(u"操蛋", 15)
        self.assertEqual(len(self.we._dict), 4)
        self.assertEqual(self.we._dict[u"操蛋"], 30)
        self.assertEqual(self.we.N, 15)

    def test_load_record(self):
        """record.dat yields exactly the three expected terms and counts."""
        self.we.load_record("record.dat")
        self.assertEqual(len(self.we._dict), 3)
        self.assertEqual(self.we._dict[u"理想"], 15)
        self.assertEqual(self.we._dict[u"文化"], 1)
        self.assertEqual(self.we._dict[u"德行"], 12)
        self.assertNotIn(u"操蛋", self.we._dict)
        self.assertNotIn(u"毛线", self.we._dict)

    def test_save_record(self):
        """Smoke test: a loaded record can be written back out."""
        self.we.load_record("record.dat")
        self.we.save_record("record_save.dat")

    # Disabled test, kept for reference:
    # def test_weight_learning(self):
    #     self.we.weight_learning(read_tiny_mock())
    #     self.we.show_dict(sort=True)
    #     self.we.show_dict()

    def test_df(self):
        """df is 1 for a term missing from the record, stored count + 1 otherwise."""
        self.we.load_record("record.dat")
        self.assertEqual(self.we.df(u"转发"), 1)
        self.assertEqual(self.we.df(u"理想"), 16)

    def test_idf(self):
        """Check df, then smoke-test idf on a few terms."""
        self.we.load_record("record.dat")
        self.assertEqual(self.we.df(u"转发"), 1)
        self.assertEqual(self.we.df(u"理想"), 16)
        # Unicode literals, like every other term in this class; the byte
        # strings used before were a different type from the unicode terms.
        self.we.idf(u"美丽")
        self.we.idf(u"转发")
        self.we.idf(u"阿拉")

    def test_prob_idf(self):
        """Smoke test: prob_idf accepts arbitrary unicode terms."""
        self.we.load_record("record.dat")
        self.we.prob_idf(u"美丽")
        self.we.prob_idf(u"转发")
        self.we.prob_idf(u"阿拉")

    def test_text_word(self):
        """The sample term must appear in the processed sample text.

        Previously named ``text_word``; without the ``test`` prefix unittest
        never collected it, so the assertion was never executed.
        """
        self.assertIn(self.term, self._words())

    def test_tf(self):
        """Smoke test for tf."""
        self.we.tf(self.term, self._words())

    def test_log_tf(self):
        """Smoke test for log_tf."""
        self.we.log_tf(self.term, self._words())

    def test_a_tf(self):
        """Smoke test for a_tf."""
        self.we.a_tf(self.term, self._words())

    def test_b_tf(self):
        """Smoke test for b_tf with the sample term and with ``dd``."""
        self.we.b_tf(self.term, self._words())
        self.we.b_tf(u"dd", self._words())

    def test_L_tf(self):
        """Smoke test for L_tf with the sample term and with ``dd``."""
        self.we.L_tf(self.term, self._words())
        self.we.L_tf(u"dd", self._words())
def setUp(self):
    """Create the engine, load the large saved record, set text and term."""
    # Using mmseg to test extract word.
    engine = WeightEngine(segment_function)
    self.we = engine
    engine.load_record("Hugerecord_save.dat")
    self.text = u"这里要说的是,转义虽然对前端展示的时候带来了好处,但是确带来了数据的不一致性。"
    self.term = u"这里"
def load_dict_from_save(filename, n=50):
    """Load a saved record and return the first ``n`` entries of its sorted dict.

    Args:
        filename: path of the record to load into an mmseg-based WeightEngine.
        n: number of leading entries to return.

    Returns:
        ``sort_dict(engine._dict)[:n]``.
    """
    engine = WeightEngine(mmseg_segfun)
    engine.load_record(filename)
    ranked = sort_dict(engine._dict)
    return ranked[:n]