def _create_tfidf_matrix(self, document_set, dictionary):
    """Build the TF-IDF matrix of shape |unique words| x |sentences|.

    For summarization each sentence is treated as its own document: term
    frequencies are computed per sentence and inverse document frequencies
    over the whole collection of sentences.

    :param document_set: object exposing ``sentences``; every sentence
        exposes ``words``.
    :param dictionary: mapping from a normalized word to its row index
        in the matrix. Words missing from it are skipped.
    :returns: ``numpy`` array where cell ``[row, col]`` holds
        ``tf * idf`` of the word with index ``row`` in sentence ``col``.
        Cells for words that do not occur in a sentence stay at zero.
    """
    # Normalize every sentence exactly once; the same word lists feed the
    # TF computation, the IDF computation and the fill loop below.
    words_per_sentence = [
        self._normalize_words(sentence.words)
        for sentence in document_set.sentences
    ]
    tf_per_sentence = compute_tf(words_per_sentence)
    idf_by_word = compute_idf(words_per_sentence)

    # Start from zeros so that absent words need no explicit handling.
    matrix = numpy.zeros((len(dictionary), len(words_per_sentence)))

    for col, words in enumerate(words_per_sentence):
        tf_by_word = tf_per_sentence[col]
        for word in words:
            if word in dictionary:
                row = dictionary[word]
                matrix[row, col] = tf_by_word[word] * idf_by_word[word]

    return matrix
def test_compute_tf_idf(self):
    """Check ``compute_tf`` and ``compute_idf`` on three tiny documents.

    Every word occurs once within its own document, and only "test" is
    shared between two documents.
    """
    documents = (
        ("this", "is", "a", "example"),
        ("just", "for", "test"),
        ("test", "tf", "and", "idf"),
    )

    actual_tf = compute_tf(documents)
    actual_idf = compute_idf(documents)

    # Each word appears once in its document, so the expected term
    # frequency is simply 1 / (number of words in that document).
    expected_tf = [
        dict.fromkeys(document, 1 / len(document)) for document in documents
    ]

    # Words found in a single document are expected at log(3 / 2); "test",
    # found in two documents, is expected at log(3 / 3).
    # NOTE(review): consistent with idf = log(N / (1 + df)) -- the formula
    # itself lives in compute_idf, confirm there.
    single_document_words = (
        "this", "is", "a", "example",
        "just", "for",
        "tf", "and", "idf",
    )
    expected_idf = dict.fromkeys(single_document_words, math.log(3 / 2))
    expected_idf["test"] = math.log(3 / 3)

    self.assertEqual(actual_tf, expected_tf)
    self.assertEqual(actual_idf, expected_idf)