def get_scores_for_category(self):
    '''
    Check ScaledFScore.get_scores_for_category against stored reference values.

    The counts come from self._get_counts() and the scorer is called with
    its default arguments.

    NOTE(review): the method name has no ``test`` prefix, so standard
    unittest/pytest discovery would presumably not collect it -- confirm
    whether that is intentional.
    '''
    in_category, out_of_category = self._get_counts()
    actual = ScaledFScore.get_scores_for_category(in_category, out_of_category)
    # One expected score per term, in the order the counts are supplied.
    expected = [
        0.23991183969723384,
        0.24969810634506373,
        0.23991183969723384,
        0.27646711056272855,
        0.92885244834997516,
        0.42010144843632563,
        0.49166017105966719,
        0.0,
        0.0,
        0.50262304057984664,
    ]
    np.testing.assert_almost_equal(actual, expected)
def test_get_scores(self):
    '''
    ScaledFScore.get_scores with beta=1. reproduces the stored reference scores.

    Counts are taken from self._get_counts().
    '''
    in_category, out_of_category = self._get_counts()
    actual = ScaledFScore.get_scores(in_category, out_of_category, beta=1.)
    # One expected score per term, in the order the counts are supplied.
    expected = np.array([
        0.2689108,
        0.,
        0.2689108,
        0.1266617,
        1.,
        0.5,
        0.5590517,
        0.5,
        0.5,
        0.5720015,
    ])
    np.testing.assert_almost_equal(actual, expected)
def get_scores_for_category(self):
    '''
    Compare ScaledFScore.get_scores_for_category output with reference values.

    Uses the counts returned by self._get_counts() and the scorer's default
    arguments.

    NOTE(review): without a ``test`` prefix in its name this method is
    presumably skipped by unittest/pytest discovery -- confirm intent.
    '''
    category_counts, other_counts = self._get_counts()
    # Reference scores, one per term.
    reference = [
        0.23991183969723384, 0.24969810634506373,
        0.23991183969723384, 0.27646711056272855,
        0.92885244834997516, 0.42010144843632563,
        0.49166017105966719, 0.0,
        0.0, 0.50262304057984664,
    ]
    np.testing.assert_almost_equal(
        ScaledFScore.get_scores_for_category(category_counts, other_counts),
        reference,
    )
def _get_scaled_f_score_from_counts(self,
                                    cat_word_counts,
                                    not_cat_word_counts,
                                    scaler_algo,
                                    beta=DEFAULT_BETA):
    '''
    Compute scaled f-scores by delegating to ScaledFScore.get_scores.

    Parameters
    ----------
    cat_word_counts
        Counts for the category of interest, forwarded unchanged
        (presumably one entry per term -- TODO confirm).
    not_cat_word_counts
        Counts outside the category, forwarded unchanged.
    scaler_algo
        Scaler selector, forwarded positionally to ScaledFScore.get_scores.
    beta
        Forwarded as the ``beta`` keyword argument, default DEFAULT_BETA.

    Returns
    -------
    Whatever ScaledFScore.get_scores returns for these arguments.

    NOTE(review): this docstring previously held a disabled inline
    computation (a harmonic mean of two probabilities); it appears to have
    been superseded by the ScaledFScore call -- confirm.
    '''
    scaled_scores = ScaledFScore.get_scores(
        cat_word_counts,
        not_cat_word_counts,
        scaler_algo,
        beta=beta,
    )
    return scaled_scores
def get_scaled_f_scores_vs_background(self, scaler_algo='none', beta=1.):
    '''
    Score the corpus counts against the background counts.

    The frame returned by self.get_term_and_background_counts() gains a
    'Scaled f-score' column computed from its 'corpus' and 'background'
    columns, and is then sorted by that column in descending order.

    Parameters
    ----------
    scaler_algo : str
        see get_scaled_f_scores, default 'none'
    beta : float
        default 1.

    Returns
    -------
    pd.DataFrame of scaled_f_score scores compared to background corpus,
    highest score first
    '''
    score_column = 'Scaled f-score'
    counts_frame = self.get_term_and_background_counts()
    corpus_counts = counts_frame['corpus']
    background_counts = counts_frame['background']
    # Arguments stay positional so they bind exactly as in the original call.
    counts_frame[score_column] = ScaledFScore.get_scores_for_category(
        corpus_counts, background_counts, scaler_algo, beta
    )
    ranked = counts_frame.sort_values(by=score_column, ascending=False)
    return ranked
def test_get_scores_zero_median(self):
    '''
    Smoke-test ScaledFScore.get_scores when the category counts have a zero median.

    The return value is discarded; the test only passes if the call does
    not raise.
    '''
    # Six of the eight category counts are zero, so their median is 0.
    in_category = np.array([0, 0, 0, 0, 0, 0, 1, 2])
    out_of_category = np.array([1, 1, 2, 1, 1, 1, 1, 3])
    ScaledFScore.get_scores(in_category, out_of_category)
def test_get_scores_zero_all_same(self):
    '''
    ScaledFScore.get_scores on mostly-zero category counts matches stored values.

    The scorer is called with its default arguments.
    '''
    in_category = np.array([0, 0, 0, 0, 0, 0, 1, 2])
    out_of_category = np.array([1, 1, 2, 1, 1, 1, 1, 2])
    actual = ScaledFScore.get_scores(in_category, out_of_category)
    # One expected score per term, in input order.
    expected = [0.5, 0.5, 0, 0.5, 0.5, 0.5, 0.5, 1.]
    np.testing.assert_almost_equal(actual, expected)
def test_get_scores(self):
    '''
    Verify ScaledFScore.get_scores (beta=1.) against stored reference scores.

    Input counts are obtained from self._get_counts().
    '''
    category_counts, other_counts = self._get_counts()
    # Reference scores, one per term.
    reference = np.array([
        0.2689108, 0., 0.2689108, 0.1266617, 1.,
        0.5, 0.5590517, 0.5, 0.5, 0.5720015,
    ])
    np.testing.assert_almost_equal(
        ScaledFScore.get_scores(category_counts, other_counts, beta=1.),
        reference,
    )