def _get_characteristic_terms(
        corpus: Corpus,
        n_keywords: int = 20,
        progress_callback: Callable = None) -> List[List[Tuple[str, float]]]:
    """Rank the keywords produced by ``tfidf_keywords`` for each document.

    Args:
        corpus: Passed on unchanged to ``tfidf_keywords``.
        n_keywords: Upper slice bound applied to every ranked keyword list.
        progress_callback: Passed on unchanged to ``tfidf_keywords``.

    Returns:
        One list per entry returned by ``tfidf_keywords``. Each list holds
        that entry's items ordered by descending second element (the
        score, per the return annotation) and sliced with ``[:n_keywords]``.
    """
    def score_of(pair):
        return pair[1]

    ranked_per_document = []
    for document_keywords in tfidf_keywords(corpus, progress_callback):
        # Sort a copy so the extractor's own lists are left untouched.
        ranked = list(document_keywords)
        ranked.sort(key=score_of, reverse=True)
        ranked_per_document.append(ranked[:n_keywords])
    return ranked_per_document
Example #2
0
    def test_extractor(self):
        # Three documents: four tokens (one repeated), a single token,
        # and a lone space.
        documents = [["foo", "bar", "baz", "baz"], ["foobar"], [" "]]
        keywords = tfidf_keywords(corpus_mock(documents))

        # One result per document, with the expected number of keywords.
        self.assertEqual(len(keywords), 3)
        for position, expected_count in enumerate((3, 1, 0)):
            self.assertEqual(len(keywords[position]), expected_count)

        # The repeated token sits in the middle slot, score within [0.8, 1].
        middle_entry = keywords[0][1]
        self.assertEqual(middle_entry[0], "baz")
        self.assertGreaterEqual(middle_entry[1], 0.8)
        self.assertLessEqual(middle_entry[1], 1)

        # The two single-occurrence tokens surround it.
        first_document = keywords[0]
        self.assertEqual(first_document[0][0], "bar")
        self.assertEqual(first_document[2][0], "foo")

        second_document = keywords[1]
        self.assertEqual(second_document[0][0], "foobar")
Example #3
0
    def test_extractor(self):
        # Raw token lists: four tokens (one repeated), a single token,
        # and an empty document.
        first_tokens = ["foo", "bar", "baz", "baz"]
        keywords = tfidf_keywords([first_tokens, ["foobar"], []])

        # One result per document, with the expected number of keywords.
        self.assertEqual(len(keywords), 3)
        for position, expected_count in enumerate((3, 1, 0)):
            self.assertEqual(len(keywords[position]), expected_count)

        # The repeated token comes first, with a score within [0.8, 1].
        top_entry = keywords[0][0]
        self.assertEqual(top_entry[0], "baz")
        self.assertGreaterEqual(top_entry[1], 0.8)
        self.assertLessEqual(top_entry[1], 1)

        # The single-occurrence tokens follow in this order.
        first_document = keywords[0]
        self.assertEqual(first_document[1][0], "bar")
        self.assertEqual(first_document[2][0], "foo")

        second_document = keywords[1]
        self.assertEqual(second_document[0][0], "foobar")
Example #4
0
 def dummy_embedding(tokens, language, progress_callback=None):
     """Return the ``tfidf_keywords`` result for ``tokens``.

     ``language`` is accepted but not used by this function;
     ``progress_callback`` is passed on unchanged.
     """
     keywords = tfidf_keywords(tokens, progress_callback)
     return keywords
Example #5
0
 def test_single_letter_tokens(self):
     # One-character tokens, including a lone space, must all show up
     # in the single document's keywords, in this exact order.
     keywords = tfidf_keywords([["a", "b", "b", " "]])
     expected_words = (" ", "b", "a")
     for rank, expected in enumerate(expected_words):
         self.assertEqual(keywords[0][rank][0], expected)
Example #6
0
 def test_single_letter_tokens(self):
     # One-character tokens must be kept; the first two keywords of the
     # single document are expected in this exact order.
     corpus = corpus_mock([["a", "b", "b"]])
     keywords = tfidf_keywords(corpus)
     for rank, expected in enumerate(("a", "b")):
         self.assertEqual(keywords[0][rank][0], expected)
Example #7
0
 def test_empty_tokens(self):
     # A document whose only token is a single space still produces one
     # result entry, and that entry holds no keywords.
     corpus = corpus_mock([[" "]])
     keywords = tfidf_keywords(corpus)

     document_count = len(keywords)
     self.assertEqual(1, document_count)

     keyword_count = len(keywords[0])
     self.assertEqual(0, keyword_count)