def test_reduce_terms(self):
		"""Check that reduce_terms caps the row count and leaves its input alone.

		The scores handed to AutoTermSelector.reduce_terms are the row sums
		of the term-frequency frame taken modulo 10, with num_term_to_keep=10.
		"""
		source_tdm = make_a_test_term_doc_matrix()
		term_scores = source_tdm.get_term_freq_df().sum(axis=1) % 10
		reduced_tdm = AutoTermSelector.reduce_terms(
			source_tdm, term_scores, num_term_to_keep=10
		)
		kept_index = reduced_tdm.get_term_freq_df().index
		self.assertLessEqual(len(kept_index), 10)
		# The source matrix is queried again after the reduction and must
		# still report all 58 rows.
		source_index = source_tdm.get_term_freq_df().index
		self.assertEqual(len(source_index), 58)
Exemple #2
0
 def test_absolute_frequency_ranker(self):
     """Check the size of AbsoluteFrequencyRanker's output and three of its rows."""
     ranks = AbsoluteFrequencyRanker(make_a_test_term_doc_matrix()).get_ranks()
     self.assertEqual(len(ranks), 58)
     # (row label, expected row values) pairs, checked in this order.
     expected_rows = (
         ('hello', [1, 0]),
         ('blah', [0, 3]),
         ('name', [1, 1]),
     )
     for label, expected in expected_rows:
         self.assertEqual(ranks.loc[label].tolist(), expected)
	def test_absolute_frequency_ranker(self):
		"""Check the size of AbsoluteFrequencyRanker's output and three of its rows.

		The rank frame must have 58 rows, and the rows labelled 'hello',
		'blah' and 'name' must match the hard-coded expected values.
		"""
		tdm = make_a_test_term_doc_matrix()
		ranker = AbsoluteFrequencyRanker(tdm)
		rank_df = ranker.get_ranks()
		self.assertEqual(len(rank_df), 58)
		# Label-based lookup via .loc: the .ix indexer was deprecated in
		# pandas 0.20 and removed in pandas 1.0.
		self.assertEqual(rank_df.loc['hello'].tolist(), [1, 0])
		self.assertEqual(rank_df.loc['blah'].tolist(), [0, 3])
		self.assertEqual(rank_df.loc['name'].tolist(), [1, 1])
Exemple #4
0
    def test_once_per_doc_frequency_ranker(self):
        """Check OncePerDocFrequencyRanker against fixed expected rows.

        Its rank frame must also have as many rows as the one produced by
        DocLengthDividedFrequencyRanker for the same matrix.
        """
        tdm = make_a_test_term_doc_matrix()
        reference_ranker = DocLengthDividedFrequencyRanker(tdm)
        once_ranker = OncePerDocFrequencyRanker(tdm)

        reference_ranks = reference_ranker.get_ranks()
        once_ranks = once_ranker.get_ranks()
        self.assertEqual(len(reference_ranks), len(once_ranks))

        # (row label, expected row values) pairs, checked in this order.
        expected_rows = (('blah', [0, 1]), ('name', [1, 1]))
        for label, expected in expected_rows:
            np.testing.assert_almost_equal(
                np.array(once_ranks.loc[label]), expected)
Exemple #5
0
 def test_doc_length_divided_frequency_ranker(self):
     """Check DocLengthDividedFrequencyRanker against fixed expected rows.

     Its rank frame must have as many rows as AbsoluteFrequencyRanker's
     for the same matrix, and the 'blah' and 'name' rows must match the
     hard-coded quotients below.
     """
     tdm = make_a_test_term_doc_matrix()
     len_ranker = DocLengthDividedFrequencyRanker(tdm)
     abs_ranker = AbsoluteFrequencyRanker(tdm)
     abs_rank_df = abs_ranker.get_ranks()
     len_ranker_df = len_ranker.get_ranks()
     self.assertEqual(len(abs_rank_df), len(len_ranker_df))
     # NOTE(review): the denominators 12, 35 and 29 are presumably the
     # lengths of the fixture's documents -- confirm against the fixture.
     # They were previously also held in an unused local list.
     np.testing.assert_almost_equal(np.array(len_ranker_df.loc['blah']),
                                    [0, 3. / 12])
     np.testing.assert_almost_equal(np.array(len_ranker_df.loc['name']),
                                    [1. / 35, 1. / 29])
	def test_once_per_doc_frequency_ranker(self):
		"""Check OncePerDocFrequencyRanker against fixed expected rows.

		Its rank frame must have as many rows as the one produced by
		DocLengthDividedFrequencyRanker for the same matrix, and the 'blah'
		and 'name' rows must match the hard-coded expected values.
		"""
		tdm = make_a_test_term_doc_matrix()
		# Locals are named after the rankers they actually hold.
		len_ranker = DocLengthDividedFrequencyRanker(tdm)

		one_ranker = OncePerDocFrequencyRanker(tdm)
		len_rank_df = len_ranker.get_ranks()
		one_rank_df = one_ranker.get_ranks()
		self.assertEqual(len(len_rank_df), len(one_rank_df))
		# Label-based lookup via .loc: the .ix indexer was deprecated in
		# pandas 0.20 and removed in pandas 1.0.
		np.testing.assert_almost_equal(np.array(one_rank_df.loc['blah']),
		                               [0, 1])
		np.testing.assert_almost_equal(np.array(one_rank_df.loc['name']),
		                               [1, 1])
	def test_doc_length_divided_frequency_ranker(self):
		"""Check DocLengthDividedFrequencyRanker against fixed expected rows.

		Its rank frame must have as many rows as AbsoluteFrequencyRanker's
		for the same matrix, and the 'blah' and 'name' rows must match the
		hard-coded quotients below.
		"""
		tdm = make_a_test_term_doc_matrix()
		len_ranker = DocLengthDividedFrequencyRanker(tdm)
		abs_ranker = AbsoluteFrequencyRanker(tdm)
		abs_rank_df = abs_ranker.get_ranks()
		len_ranker_df = len_ranker.get_ranks()
		self.assertEqual(len(abs_rank_df), len(len_ranker_df))
		# NOTE(review): the denominators 12, 35 and 29 are presumably the
		# lengths of the fixture's documents -- confirm against the fixture.
		# They were previously also held in an unused local list.
		# Label-based lookup via .loc: the .ix indexer was deprecated in
		# pandas 0.20 and removed in pandas 1.0.
		np.testing.assert_almost_equal(np.array(len_ranker_df.loc['blah']),
		                               [0, 3. / 12])
		np.testing.assert_almost_equal(np.array(len_ranker_df.loc['name']),
		                               [1. / 35, 1. / 29])
	def test_doc_length_normalized_frequency_ranker(self):
		"""Check DocLengthNormalizedFrequencyRanker against fixed expected rows.

		Its rank frame must have as many rows as AbsoluteFrequencyRanker's
		for the same matrix. The expected 'blah' and 'name' rows are the
		hard-coded per-length quotients scaled by the mean of doc_lengths.
		"""
		tdm = make_a_test_term_doc_matrix()
		len_ranker = DocLengthNormalizedFrequencyRanker(tdm)
		abs_ranker = AbsoluteFrequencyRanker(tdm)
		abs_rank_df = abs_ranker.get_ranks()
		len_ranker_df = len_ranker.get_ranks()
		self.assertEqual(len(abs_rank_df), len(len_ranker_df))
		doc_lengths = [12, 35, 29]
		# Multiplying by 1. forces float division on Python 2 as well.
		avg_length = sum(doc_lengths) * 1. / len(doc_lengths)
		# Label-based lookup via .loc: the .ix indexer was deprecated in
		# pandas 0.20 and removed in pandas 1.0.
		np.testing.assert_almost_equal(np.array(len_ranker_df.loc['blah']),
		                               [0, avg_length * 3. / 12])
		np.testing.assert_almost_equal(np.array(len_ranker_df.loc['name']),
		                               [avg_length * 1. / 35, avg_length * 1. / 29])
Exemple #9
0
 def test_doc_length_normalized_frequency_ranker(self):
     """Check DocLengthNormalizedFrequencyRanker against fixed expected rows.

     The normalized rank frame must have as many rows as the one from
     AbsoluteFrequencyRanker, and its 'blah' and 'name' rows must equal
     the hard-coded quotients scaled by the mean of doc_lengths.
     """
     tdm = make_a_test_term_doc_matrix()
     normalized_ranker = DocLengthNormalizedFrequencyRanker(tdm)
     absolute_ranker = AbsoluteFrequencyRanker(tdm)
     absolute_ranks = absolute_ranker.get_ranks()
     normalized_ranks = normalized_ranker.get_ranks()
     self.assertEqual(len(absolute_ranks), len(normalized_ranks))

     doc_lengths = [12, 35, 29]
     avg_length = sum(doc_lengths) * 1. / len(doc_lengths)
     expected_blah = [0, avg_length * 3. / 12]
     expected_name = [avg_length * 1. / 35, avg_length * 1. / 29]

     actual_blah = np.array(normalized_ranks.loc['blah'])
     np.testing.assert_almost_equal(actual_blah, expected_blah)
     actual_name = np.array(normalized_ranks.loc['name'])
     np.testing.assert_almost_equal(actual_name, expected_name)