class TestTfIdf(TestCase): def setUp(self): self.rec1 = RecordStub(Counter({"a": 1, "b": 2, "c": 3})) self.rec2 = RecordStub(Counter({"b": 5})) self.container = {"one": self.rec1, "two": self.rec2} self.itemgetter_func = lambda x: x.item_counter self.tf_idf = TfIdf(self.container, self.itemgetter_func) def compute_expected_idf_dict(self): expected_idf_dict = Counter(set(self.rec1.item_counter)) expected_idf_dict.update(set(self.rec2.item_counter)) return expected_idf_dict def test_refresh(self): self.tf_idf.refresh() expected_idf_dict = self.compute_expected_idf_dict() self.assertDictEqual(self.tf_idf.idf_dict, expected_idf_dict) def test__collect_idf(self): expected_idf_dict = self.compute_expected_idf_dict() counter_dict_list = [self.rec1.item_counter, self.rec2.item_counter] self.assertDictEqual(self.tf_idf.idf_dict, expected_idf_dict) def compute_tf_idf(self, counter_dict, idf_dict): import math num_items = 2 return Counter({key: (1+math.log10(tf_val)) * (math.log10(float(num_items)/float((1 + idf_dict[key])))) for key, tf_val in counter_dict.iteritems()}) def test__tf_idf_multiply(self): idf_dict = self.compute_expected_idf_dict() tf_idf_rec1_expected = self.compute_tf_idf(self.rec1.item_counter, idf_dict) tf_idf_rec2_expected = self.compute_tf_idf(self.rec2.item_counter, idf_dict) print tf_idf_rec2_expected self.assertDictEqual(tf_idf_rec1_expected, self.tf_idf["one"]) self.assertDictEqual(tf_idf_rec2_expected, self.tf_idf["two"])
def setUp(self): self.rec1 = RecordStub(Counter({"a": 1, "b": 2, "c": 3})) self.rec2 = RecordStub(Counter({"b": 5})) self.container = {"one": self.rec1, "two": self.rec2} self.itemgetter_func = lambda x: x.item_counter self.tf_idf = TfIdf(self.container, self.itemgetter_func)