def compute_mics(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    """Score every feature row by its maximal information coefficient (MIC) with the label.

    :param data_set_df: feature matrix, one feature per row (columns are instances)
    :param user_info_df: profile frame the target label is read from
    :param label: target column name, defaults to 'gender'
    :param min_not_nan: if >= 0, NaNs are dropped per feature and at least this many
        values must remain; if negative, the raw row (NaNs included) is scored
    :return: one-column ('importance') DataFrame sorted descending, NaNs last
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_mics = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index,
                             columns=['importance'])
    for index, values in df_filtered.iterrows():
        m = minepy.MINE()
        try:
            if min_not_nan < 0:
                m.compute_score(values, y_v)
                feature_mics.loc[index] = m.mic()
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    # too few observations for a trustworthy score
                    feature_mics.loc[index] = np.nan
                else:
                    # realign y to the surviving instances of this feature
                    m.compute_score(nan_removed, y_v[nan_removed.index.astype(int)])
                    feature_mics.loc[index] = m.mic()
        except ValueError:
            # Fix: previously the pre-initialized 0.0 was left in place, ranking a
            # failed feature among genuinely zero-MIC ones. NaN sorts to the bottom.
            feature_mics.loc[index] = np.nan
            continue
    feature_mics.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_mics
def compute_fscore(data_set_df, user_info_df, label='gender', min_not_nan=-1): df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label) feature_fs = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance']) i = 0 for index, values in df_filtered.iterrows(): try: if min_not_nan < 0: f_score, p_val = f_classif(values.fillna(values.mean())[:, np.newaxis], y_v) feature_fs.loc[index] = f_score if f_score != np.inf and f_score != -np.inf else np.nan else: nan_removed = values.dropna() if len(nan_removed) < min_not_nan: feature_fs.loc[index] = np.nan else: f_score, p_val = f_classif(nan_removed[:, np.newaxis], y_v[nan_removed.index.astype(int)]) feature_fs.loc[index] = f_score if f_score != np.inf and f_score != -np.inf else np.nan if float(i) % 10000 == 0 and i > 0: print "\t\t\t%s features are done" % i i += 1 # print index, feature_fs.loc[index].values[0] except ValueError: # print "value error occurs during processing %r" % index continue feature_fs.sort_values('importance', ascending=False, inplace=True, na_position='last') return feature_fs
def compute_importances(data_set_df, user_info_df, label='gender', split_modal=False, n_est=10, max_depth=None): print "\t\t\tfilling nan values..." df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label) feature_importances = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance']) modalities = data_set_df.index.levels[0] def compute(x): x_imp = pc.fill_nan_features(x) try: m = ExtraTreesClassifier(n_estimators=n_est) if max_depth is None \ else ExtraTreesClassifier(n_estimators=n_est, max_depth=3) print "\t\t\tfitting RF model..." m.fit(x_imp.T, y_v) # if len(feature_mics) > 1000: # break # print m.feature_importances_ for order, index in enumerate(x.index): feature_importances.loc[index] = m.feature_importances_[order] if float(order) % 10000 == 0 and order > 0: print "\t\t\t%s features are done" % order except ValueError as e: # print "value error occurs during processing %r" % index pass if split_modal is True: for modal in modalities: x = df_filtered.loc[modal].dropna(how='all') compute(x) else: x = df_filtered.dropna(how='all') compute(x) feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last') return feature_importances
def preprocess_xy(self, label):
    """Build the (features, labels) pair for *label*.

    Drops all-NaN rows and columns, imputes only when NaN/inf values remain,
    and aligns the label vector to the columns that survived.
    """
    features, targets = pc.get_filtered_x_y(self.feature_set_df, self.profile_df, label)
    features = features.dropna(how='all', axis=0).dropna(how='all', axis=1)
    has_nan = features.isnull().any().any()
    has_inf = (features == np.inf).any().any() or (features == -np.inf).any().any()
    if has_nan or has_inf:
        imputed = pc.fill_nan_features(features)
    else:
        # already dense and finite — no copy needed
        imputed = features
    kept_ids = map(int, features.columns.values)
    return imputed, targets[kept_ids]
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    """Score features by randomized-logistic-regression stability selection.

    :param data_set_df: feature matrix, one feature per row (columns are instances)
    :param user_info_df: profile frame the target label is read from
    :param label: target column name, defaults to 'gender'
    :return: one-column ('importance') DataFrame sorted descending, NaNs last
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values
    clf = RandomizedLogisticRegression()
    clf.fit(x_imp.T, y_v)
    # Bug fix: scores_ has one entry per row of x (post-dropna), but the frame was
    # built with df_filtered.index (pre-dropna), mis-aligning scores whenever
    # all-NaN rows were dropped. Index with x.index so rows match scores_.
    feature_importances = DataFrame(clf.scores_, index=x.index, columns=['importance'])
    feature_importances.sort_values('importance', ascending=False, inplace=True,
                                    na_position='last')
    return feature_importances
def classify(data_set_df, user_info_df, feat_set_name, features=None, label='gender',
             classifier=None, reg_param=1.0, selection=False, num_feat=20,
             sel_method='LR', cv=10):
    """Cross-validated classification of users from the feature set.

    Runs (up to) `cv`-fold CV with `classifier` (default: LogisticRegression with
    C=reg_param), optionally doing per-fold feature selection, appends per-fold
    results and confusion matrices to CSV files under param.EXPERIMENT_PATH, and
    returns (mean fold accuracy, fraction of instances that had no usable label).

    :param feat_set_name: tag used both for result-file naming and to detect the
        FILL/SCALING preprocessing suffixes logged in the result rows
    :param features: optional row subset of the feature matrix to use
    :param selection: when True, select num_feat features inside each fold
    :param sel_method: 'LR'/'RF*' select on the (possibly imputed) train split;
        anything else selects on the raw (non-imputed) train split
    :return: (score_mean, miss_clf_rate)
    """
    instance_num = len(data_set_df.columns)
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered if features is None else df_filtered.loc[features]
    # drop all-NaN feature rows, then all-NaN instance columns
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        # impute only when NaN/inf values remain after the dropna passes
        x_imp = pc.fill_nan_features(x)
        # x_imp = dense_df.loc[x.index, x.columns]
    else:
        x_imp = x
    # align the label vector to the instance columns that survived
    y_filtered = y_v[(map(int, x.columns.values))]
    clf = LogisticRegression(C=reg_param) if classifier is None else classifier
    # cannot have more folds than labelled instances
    cv_num = min(len(y_filtered), cv)
    score_mean = 0.0
    miss_clf_rate = 1.0
    # need at least 2 folds and 2 classes for CV to be meaningful
    if cv_num > 1 and len(y_filtered.unique()) > 1:
        kf = KFold(y_filtered.shape[0], n_folds=cv_num, shuffle=True)
        # skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
        fold = 0
        result_str = ""
        matrix_str = ""
        for tr_index, te_index in kf:
            fold += 1
            # x_imp is features-by-instances, so transpose before row-slicing folds
            x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]
            if selection:
                if sel_method == 'LR' or 'RF' in sel_method:
                    # LR/RF selection works on the imputed training split
                    feat_index = fimp.feature_selection(x_train.T, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                else:
                    # other methods select on the raw (non-imputed) training split
                    x_tr_df, x_te_df = x.T.iloc[tr_index].T, x.T.iloc[te_index].T
                    feat_index = fimp.feature_selection(x_tr_df, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                x_train = x_train.loc[:, feat_index].values
                x_test = x_test.loc[:, feat_index].values
            try:
                clf.fit(x_train, y_train)
                score = clf.score(x_test, y_test)
                score_mean += score
                # one CSV row per fold; NOTE(review): the method column is the literal
                # 'LR' even when a custom classifier is passed in
                result_str += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" \
                              % (label, True if param.FILL_SUFFIX in feat_set_name else False,
                                 True if param.SCALING_SUFFIX in feat_set_name else False,
                                 selection, 'LR', reg_param, cv, fold, x_train.shape[1], score)
                cf_mat = confusion_matrix(y_test, clf.predict(x_test),
                                          labels=range(len(info.LABEL_CATEGORY[label])))
                matrix_str += np.array_str(cf_mat) + "\n"
            except ValueError:
                # best-effort: a fold that fails to fit is skipped (but still counted
                # in `fold`, so the mean below divides by the total fold count)
                pass
                # traceback.print_exc()
                # print i, "why error? skip!"
        print result_str
        # append per-fold rows and confusion matrices to the experiment CSVs
        file_name = "%s/new_%s.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(result_str)
        file_name = "%s/new_%s_mat.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(matrix_str)
        if fold > 0:
            score_mean = score_mean / fold
    # fraction of instances dropped for lacking a usable label/features
    miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
    return score_mean, miss_clf_rate