def test_aggregate_max(self):
    keywords = [
        [("foo", 0.1)],
        [("foo", 0.3), ("bar", 0.6)],
        [("foo", 0.5)],
    ]
    scores = AggregationMethods.max(keywords)
    self.assertEqual(scores[0], ("foo", 0.5))
    self.assertEqual(scores[1], ("bar", 0.6))
def test_aggregate_mean(self):
    keywords = [
        [("foo", 0.1)],
        [("foo", 0.3), ("bar", 0.6)],
        [("foo", 0.5)],
    ]
    scores = AggregationMethods.mean(keywords)
    self.assertEqual(scores[0][0], "foo")
    self.assertEqual(scores[1][0], "bar")
    self.assertAlmostEqual(scores[0][1], 0.3)
    self.assertAlmostEqual(scores[1][1], 0.2)
def test_aggregate(self):
    keywords = [
        [("foo", 0.1)],
        [("foo", 0.3), ("bar", 0.6)],
        [("foo", 0.5)],
    ]

    scores = AggregationMethods.aggregate(keywords, AggregationMethods.MEAN)
    self.assertEqual(scores[0][0], "foo")
    self.assertEqual(scores[1][0], "bar")
    self.assertAlmostEqual(scores[0][1], 0.3)
    self.assertAlmostEqual(scores[1][1], 0.2)

    scores = AggregationMethods.aggregate(keywords, AggregationMethods.MEDIAN)
    self.assertEqual(scores[0], ("foo", 0.3))
    self.assertEqual(scores[1], ("bar", 0.6))

    scores = AggregationMethods.aggregate(keywords, AggregationMethods.MIN)
    self.assertEqual(scores[0], ("foo", 0.1))
    self.assertEqual(scores[1], ("bar", 0.6))

    scores = AggregationMethods.aggregate(keywords, AggregationMethods.MAX)
    self.assertEqual(scores[0], ("foo", 0.5))
    self.assertEqual(scores[1], ("bar", 0.6))
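# A minimal sketch of the aggregation behavior the tests above assume
# (hypothetical class name; the shipped AggregationMethods differs in
# implementation detail). Each scoring method yields one
# [(word, score), ...] list per document; aggregation folds each word's
# per-document scores into a single value. Note the asymmetry the tests
# encode: mean divides by the total document count (a missing word
# implicitly scores 0 there), while median/min/max use only the documents
# where the word actually appears.
from statistics import median as _median


class AggregationMethodsSketch:
    MEAN, MEDIAN, MIN, MAX = range(4)

    @staticmethod
    def _grouped(keywords):
        # Collect each word's scores across documents, in first-seen order.
        groups = {}
        for document in keywords:
            for word, score in document:
                groups.setdefault(word, []).append(score)
        return groups

    @classmethod
    def mean(cls, keywords):
        # Divide by the number of documents, not the number of occurrences:
        # this is why ("bar", 0.6), present in one of three documents,
        # averages to 0.2 in the tests above.
        return [(word, sum(scores) / len(keywords))
                for word, scores in cls._grouped(keywords).items()]

    @classmethod
    def median(cls, keywords):
        return [(word, _median(scores))
                for word, scores in cls._grouped(keywords).items()]

    @classmethod
    def min(cls, keywords):
        return [(word, min(scores))
                for word, scores in cls._grouped(keywords).items()]

    @classmethod
    def max(cls, keywords):
        return [(word, max(scores))
                for word, scores in cls._grouped(keywords).items()]

    @classmethod
    def aggregate(cls, keywords, agg_method):
        # agg_method is one of the integer constants MEAN/MEDIAN/MIN/MAX.
        return [cls.mean, cls.median, cls.min, cls.max][agg_method](keywords)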
def run(
    corpus: Optional[Corpus],
    words: Optional[List],
    cached_keywords: Dict,
    scoring_methods: Set,
    scoring_methods_kwargs: Dict,
    agg_method: int,
    state: TaskState,
) -> Results:
    results = Results(scores=[], labels=[], all_keywords={})
    if not corpus:
        return results

    # passed by reference (and not copied) - to save partial results
    results.all_keywords = cached_keywords
    if not scoring_methods:
        return results

    def callback(i: float, status=""):
        # Report progress as a percentage and propagate any status text;
        # abort by raising if the user cancelled the task.
        state.set_progress_value(i * 100)
        if status:
            state.set_status(status)
        if state.is_interruption_requested():
            raise Exception

    callback(0, "Calculating...")
    scores = {}
    tokens = corpus.tokens
    documents = corpus.documents
    # Each selected scoring method gets an equal slice of the progress bar.
    step = 1 / len(scoring_methods)
    for method_name, func in ScoringMethods.ITEMS:
        if method_name in scoring_methods:
            if method_name not in results.all_keywords:
                i = len(results.labels)
                cb = wrap_callback(callback, start=i * step, end=(i + 1) * step)
                needs_tokens = method_name in ScoringMethods.TOKEN_METHODS
                kw = {"progress_callback": cb}
                kw.update(scoring_methods_kwargs.get(method_name, {}))
                keywords = func(tokens if needs_tokens else documents, **kw)
                results.all_keywords[method_name] = keywords
            keywords = results.all_keywords[method_name]
            scores[method_name] = dict(
                AggregationMethods.aggregate(keywords, agg_method))
            results.labels.append(method_name)

    scores = pd.DataFrame(scores)
    if words:
        # Normalize words with the same normalizers applied to the corpus
        for preprocessor in corpus.used_preprocessor.preprocessors:
            if isinstance(preprocessor, BaseNormalizer):
                words = [preprocessor.normalizer(w) for w in words]

        # Filter scores using words
        existing_words = [w for w in set(words) if w in scores.index]
        scores = (scores.loc[existing_words] if existing_words
                  else scores.iloc[:0])

    # Sort by the first scoring method (descending), breaking ties by word.
    results.scores = scores.reset_index().sort_values(
        by=[results.labels[0], "index"],
        ascending=[False, True],
    ).values.tolist()

    return results
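# A hedged sketch of the progress-rescaling helper `run` relies on. The real
# `wrap_callback` is imported from elsewhere (it is not defined in this
# module); an equivalent would map a child callback's [0, 1] progress into
# the [start, end] slice reserved for one scoring method:
def wrap_callback_sketch(callback, start=0.0, end=1.0):
    def wrapped(progress, *args, **kwargs):
        # Rescale progress from [0, 1] into [start, end].
        return callback(start + progress * (end - start), *args, **kwargs)
    return wrapped


# Usage as in `run` above: with two selected methods, step == 0.5, so the
# first method reports into [0, 0.5] and the second into [0.5, 1.0],
# keeping the overall progress bar monotonic across methods.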