def _get_default_scores(self, category, df):
    '''
    Score terms for one category against everything outside it.

    Parameters
    ----------
    category : str
        Category name; its counts are read from the column
        ``category + ' freq'`` of `df`.
    df : pd.DataFrame
        Term frequency table containing the ``'<category> freq'`` column.

    Returns
    -------
    Whatever ``ScaledFScore.get_scores`` returns for the in-category
    counts versus the out-of-category counts.
    '''
    freq_col = category + ' freq'
    # Arguments are evaluated left to right: in-category counts first,
    # then the complement counts supplied by the helper on this object.
    return ScaledFScore.get_scores(
        df[freq_col],
        self._get_not_category_term_frequency(freq_col, df),
    )
def _get_default_scores(self, category, other_categories, df):
    '''
    Score terms for one category against an explicit set of other categories.

    Parameters
    ----------
    category : str
        Category name; its counts are read from ``category + ' freq'``.
    other_categories : iterable of str
        Category names whose ``'<name> freq'`` columns are summed row-wise
        to form the comparison counts.
    df : pd.DataFrame
        Term frequency table holding the ``' freq'`` columns named above.

    Returns
    -------
    Whatever ``ScaledFScore.get_scores`` returns for the two count vectors.
    '''
    in_category = df[category + ' freq']
    other_columns = [name + ' freq' for name in other_categories]
    # Row-wise total across the comparison categories.
    out_of_category = df[other_columns].sum(axis=1)
    return ScaledFScore.get_scores(in_category, out_of_category)
def get_scaled_f_scores_vs_background(
        self,
        scaler_algo=DEFAULT_BACKGROUND_SCALER_ALGO,
        beta=DEFAULT_BACKGROUND_BETA):
    '''
    Rank terms by scaled F-score of corpus counts versus background counts.

    Parameters
    ----------
    scaler_algo
        Forwarded to ``ScaledFScore.get_scores_for_category``;
        defaults to ``DEFAULT_BACKGROUND_SCALER_ALGO``.
    beta
        Forwarded to ``ScaledFScore.get_scores_for_category``;
        defaults to ``DEFAULT_BACKGROUND_BETA``.

    Returns
    -------
    pd.DataFrame
        The frame from ``get_term_and_background_counts`` with an added
        ``'Scaled f-score'`` column, sorted by that column, highest first.
    '''
    counts = self.get_term_and_background_counts()
    corpus_counts = counts['corpus']
    background_counts = counts['background']
    # The score column is written onto the counts frame itself before sorting.
    counts['Scaled f-score'] = ScaledFScore.get_scores_for_category(
        corpus_counts, background_counts, scaler_algo, beta)
    return counts.sort_values(by='Scaled f-score', ascending=False)
def _get_default_scores(self, category, df):
    '''
    Score terms for one category against the sum of all other columns.

    Parameters
    ----------
    category : str
        Category name; its counts are read from ``category + ' freq'``.
    df : pd.DataFrame
        Term frequency table. Every column whose name differs from
        ``category + ' freq'`` is added into the comparison counts.
        NOTE(review): this includes any non-frequency columns present in
        `df` -- presumably callers pass a frame with only ' freq' columns;
        confirm against callers.

    Returns
    -------
    Whatever ``ScaledFScore.get_scores`` returns for the two count vectors.
    '''
    target_column = category + ' freq'
    in_category = df[target_column]
    remaining_columns = [
        column for column in df.columns if column != target_column
    ]
    # Row-wise total over everything except the target column.
    out_of_category = df[remaining_columns].sum(axis=1)
    return ScaledFScore.get_scores(in_category, out_of_category)
def get_scaled_f_scores_vs_background(self,
                                      scaler_algo=DEFAULT_BACKGROUND_SCALER_ALGO,
                                      beta=DEFAULT_BACKGROUND_BETA):
    '''
    Compute scaled F-scores of corpus term counts against background counts.

    Parameters
    ----------
    scaler_algo
        Passed through to ``ScaledFScore.get_scores_for_category``;
        defaults to ``DEFAULT_BACKGROUND_SCALER_ALGO``.
    beta
        Passed through to ``ScaledFScore.get_scores_for_category``;
        defaults to ``DEFAULT_BACKGROUND_BETA``.

    Returns
    -------
    pd.DataFrame
        Term/background count table extended with a ``'Scaled f-score'``
        column and ordered by it in descending order.
    '''
    score_column = 'Scaled f-score'
    term_df = self.get_term_and_background_counts()
    scores = ScaledFScore.get_scores_for_category(
        term_df['corpus'], term_df['background'], scaler_algo, beta
    )
    # Assigned in place on the frame returned by the counts helper.
    term_df[score_column] = scores
    return term_df.sort_values(by=score_column, ascending=False)
def get_scaled_f_scores_vs_background(self,
                                      scaler_algo=DEFAULT_BACKGROUND_SCALER_ALGO,
                                      beta=DEFAULT_BACKGROUND_BETA):
    '''
    Score each term's corpus count against its background count.

    Parameters
    ----------
    scaler_algo : str
        Scaling algorithm handed to
        ``ScaledFScore.get_scores_for_category``; defaults to
        ``DEFAULT_BACKGROUND_SCALER_ALGO``.
    beta : float
        Beta value handed to ``ScaledFScore.get_scores_for_category``;
        defaults to ``DEFAULT_BACKGROUND_BETA``.

    Returns
    -------
    pd.DataFrame
        The term and background counts with a ``'Scaled f-score'`` column
        added, sorted so the highest-scoring terms come first.
    '''
    background_df = self.get_term_and_background_counts()
    # Positional argument order: corpus counts, background counts,
    # scaler algorithm, beta.
    scoring_args = (
        background_df['corpus'],
        background_df['background'],
        scaler_algo,
        beta,
    )
    background_df['Scaled f-score'] = ScaledFScore.get_scores_for_category(
        *scoring_args
    )
    return background_df.sort_values(by='Scaled f-score', ascending=False)
def get_p_vals(self, X):
    '''
    Impute p-values from Z-scores of the `ScaledFScore` scores.

    This treats the scaled f-scores as if they were normally distributed,
    which is not actually the case.

    Parameters
    ----------
    X : np.array
        Word counts of shape (N, 2), N being the vocabulary size.
        Column 0 holds the positive-class counts and column 1 the
        negative-class counts.

    Returns
    -------
    np.array of p-values
    '''
    positive_counts = X[:, 0]
    negative_counts = X[:, 1]
    scores = ScaledFScore.get_scores(
        positive_counts, negative_counts, self.scaler_algo, self.beta
    )
    centered = scores - np.mean(scores)
    # The denominator is the standard deviation divided by sqrt(N),
    # not the plain standard deviation.
    denominator = np.std(scores) / np.sqrt(len(scores))
    return norm.cdf(centered / denominator)