class WordExtractWithStopwordTestCase(unittest.TestCase):
    """Smoke test for a ``WeightEngine`` constructed with a stopword list.

    The test only prints the computed values and makes no assertions, so it
    fails only when the engine (or loading its record file) raises.
    """

    # Only use setUp() and tearDown() if necessary
    def setUp(self):
        """Build the engine, load its saved record and prepare sample input."""
        # Using mmseg to test word extraction, with the stopword list enabled.
        self.we = WeightEngine(segment_function, stopwordList=STOPLIST)
        self.we.load_record("Hugerecord_save.dat")
        # Sample document. Adjacent literals are concatenated into one string;
        # every fragment starts with the space that separates it from the
        # previous one.
        self.text = (
            u" 你说我在休假期间,大半夜哄完孩子来看代码也不容易。为什么要恶心我呢?"
            u" 貌似GIT练习的强度不太够了,应该根据大家反馈搞一个新版本出来。不过今天要说的是注释的写法。"
            u" 大家可以参看这个链接:http://www.kernel.org/pub/linux/kernel/v3.0/ChangeLog-3.3.3"
            u" 要求带人的同学仔细看看,并且回复邮件说明我们和上面的之间的差距。这些同学有@克毅 @雄飞 @贺贺。"
            u" 其他人仔细阅读,有心得的回复邮件参与讨论;没有心得的等我回去挨收拾吧。"
            u" 再强调一遍,GIT是我们研发工作的基础,就好比我们说的话一样,都要说普通话,基础一样才能沟通交流。否则别的都是白扯。"
        )
        self.term = u"基础"

    def teststop_words(self):
        """Print ``tf_idf_dict`` of the sample text and ``tf_idf`` of ``self.term``."""
        result1 = self.we.tf_idf_dict(self.text)
        show_dict(result1)
        result2 = self.we.tf_idf(self.term, self.text)
        # One pre-formatted argument prints the same "term value" line whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("%s %s" % (self.term, result2))
class WordExtractTestCase2(unittest.TestCase):
    """Tests for a ``WeightEngine`` built without a stopword list.

    Most tests here only print results for manual inspection;
    ``test_stop_words_filter`` is the only one that asserts a value.
    """

    # Only use setUp() and tearDown() if necessary
    def setUp(self):
        """Build the engine, load its saved record and prepare sample input."""
        # Using mmseg to test word extraction.
        self.we = WeightEngine(segment_function)
        self.we.load_record("Hugerecord_save.dat")
        self.text = u"这里要说的是,转义虽然对前端展示的时候带来了好处,但是确带来了数据的不一致性。"
        self.term = u"这里"

    def test_extract_tags(self):
        """Print ``tf_idf_dict`` of the sample text and ``tf_idf`` of ``self.term``.

        TODO: work out how to test this kind of function; nothing is asserted yet.
        """
        result1 = self.we.tf_idf_dict(self.text)
        show_dict(result1)
        result2 = self.we.tf_idf(self.term, self.text)
        # One pre-formatted argument prints the same "term value" line whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("%s %s" % (self.term, result2))

    def test_stop_words_filter(self):
        """Check that filtering ``[aa, bb]`` with stopword ``aa`` leaves ``[bb]``."""
        wordlst = [u"aa", u"bb"]
        # The result used to be wrapped in an extra list, so a returned
        # [u"bb"] was compared as [[u"bb"]] and could never match.
        # NOTE(review): assumes stop_words_filter returns an iterable of the
        # kept words (list or generator) -- confirm against its definition.
        self.assertEqual(list(stop_words_filter(wordlst, [u"aa"])), [u"bb"])

    def test_integrated_tf_idf(self):
        """Print a result table for every tf/df weighting combination.

        Each function in ``tf_list`` is paired with each function in
        ``df_list``; the sorted result of every pair is shown with
        ``showTable`` and its first five words are collected into a summary
        printed by ``col_printtable``.
        """
        tf_list = [self.we.tf, self.we.log_tf, self.we.a_tf,
                   self.we.b_tf, self.we.L_tf]
        df_list = [self.we.n_df, self.we.idf, self.we.prob_idf]
        print_format = []
        for tf in tf_list:
            for df in df_list:
                result = sort_dict(self.we.tf_idf_dict(self.text, tf, df))
                head = "%s %s:" % (tf.__name__, df.__name__)
                # Entries are indexed rather than unpacked so nothing is
                # assumed about their length beyond positions 0 and 1.
                words = [entry[0] for entry in result]
                scores = [entry[1] for entry in result]
                showTable(words, scores, title_name=head)
                print_format.append([head] + words[:5])
        col_printtable(print_format)