def _get_characteristic_terms(
    corpus: Corpus,
    n_keywords: int = 20,
    progress_callback: Callable = None,
) -> List[List[Tuple[str, float]]]:
    keywords = tfidf_keywords(corpus, progress_callback)
    return [
        sorted(k, key=lambda x: x[1], reverse=True)[:n_keywords]
        for k in keywords
    ]
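# A minimal usage sketch, not taken from the repository: it assumes the
# Orange3-text Corpus class and its bundled "deerwester" dataset are
# available and that tfidf_keywords accepts the corpus as loaded here.
# The dataset name and the printed output are illustrative only.
if __name__ == "__main__":
    from orangecontrib.text import Corpus

    corpus = Corpus.from_file("deerwester")
    top_terms = _get_characteristic_terms(corpus, n_keywords=5)
    print(top_terms[0])  # e.g. [("term", 0.42), ...] for the first document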
def test_extractor(self):
    corpus = corpus_mock([["foo", "bar", "baz", "baz"], ["foobar"], [" "]])
    keywords = tfidf_keywords(corpus)
    self.assertEqual(len(keywords), 3)
    self.assertEqual(len(keywords[0]), 3)
    self.assertEqual(len(keywords[1]), 1)
    self.assertEqual(len(keywords[2]), 0)

    self.assertEqual(keywords[0][1][0], "baz")
    self.assertGreaterEqual(keywords[0][1][1], 0.8)
    self.assertLessEqual(keywords[0][1][1], 1)

    self.assertEqual(keywords[0][0][0], "bar")
    self.assertEqual(keywords[0][2][0], "foo")
    self.assertEqual(keywords[1][0][0], "foobar")
def test_extractor(self):
    tokens = [["foo", "bar", "baz", "baz"], ["foobar"], []]
    keywords = tfidf_keywords(tokens)
    self.assertEqual(len(keywords), 3)
    self.assertEqual(len(keywords[0]), 3)
    self.assertEqual(len(keywords[1]), 1)
    self.assertEqual(len(keywords[2]), 0)

    self.assertEqual(keywords[0][0][0], "baz")
    self.assertGreaterEqual(keywords[0][0][1], 0.8)
    self.assertLessEqual(keywords[0][0][1], 1)

    self.assertEqual(keywords[0][1][0], "bar")
    self.assertEqual(keywords[0][2][0], "foo")
    self.assertEqual(keywords[1][0][0], "foobar")
def dummy_embedding(tokens, language, progress_callback=None):
    return tfidf_keywords(tokens, progress_callback)
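# A hedged sketch (not from the repository) of exercising the stub directly:
# dummy_embedding mirrors the (tokens, language, progress_callback) signature
# of an embedding-based extractor but delegates to TF-IDF, so it runs without
# a model download. The token lists and language code below are illustrative,
# and assume a tfidf_keywords version that accepts plain token lists.
if __name__ == "__main__":
    scores = dummy_embedding([["foo", "bar", "baz", "baz"], ["foobar"]], language="en")
    assert len(scores) == 2  # one (word, score) list per input document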
def test_single_letter_tokens(self):
    keywords = tfidf_keywords([["a", "b", "b", " "]])
    self.assertEqual(keywords[0][0][0], " ")
    self.assertEqual(keywords[0][1][0], "b")
    self.assertEqual(keywords[0][2][0], "a")
def test_single_letter_tokens(self):
    keywords = tfidf_keywords(corpus_mock([["a", "b", "b"]]))
    self.assertEqual(keywords[0][0][0], "a")
    self.assertEqual(keywords[0][1][0], "b")
def test_empty_tokens(self):
    keywords = tfidf_keywords(corpus_mock([[" "]]))
    self.assertEqual(1, len(keywords))
    self.assertEqual(0, len(keywords[0]))