def write_whole_set(): # p_df = loader.load_profile_info() # print "profile information is loaded" # # user_feature_df = loader.read_csv_dataset(profile_ids=list(p_df.index)) user_feature_df = loader.read_csv_feature_set(param.FEATURE_SET) print "user feature data set is loaded" feat_num_org = len(user_feature_df) user_feature_df = user_feature_df.dropna(how='all', axis=0) user_feature_df = user_feature_df.dropna(how='all', axis=1) print "some %s features are dropped." % (feat_num_org - len(user_feature_df)) # user_feature_df.reset_index(level=2, inplace=True) # user_feature_df = user_feature_df[~user_feature_df['feature'].str.contains("DiscretVD")] # user_feature_df = user_feature_df[~user_feature_df['feature'].str.contains("_h")] # user_feature_df.set_index('feature', append=True, inplace=True) # user_feature_df = user_feature_df.reorder_levels(['modality', 'field', 'feature', 'type']) # user_feature_df.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.REDUCED_SUFFIX + ".csv") # print "filling nan values.." # dense_df = DataFrame(pc.fill_nan_features(user_feature_df), # index=user_feature_df.index, columns=user_feature_df.columns) # # dense_df = user_feature_df # dense_df.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.REDUCED_SUFFIX + param.FILL_SUFFIX + ".csv") # print "normalize.." # norm_df = dense_df.T # norm_df = (norm_df - norm_df.mean()) / norm_df.std() # norm_df = norm_df.T # norm_df = norm_df.replace(np.nan, 0) # norm_df.to_csv(param.DATA_PATH + "/" + param.FEATURE_SET + param.FILL_SUFFIX + param.NORMALIZE_SUFFIX + ".csv") print "zero-one scaling.." 
scaled = user_feature_df.T # scaled = dense_df.T s_max, s_min = scaled.max(), scaled.min() # id_to_set_v = (s_max != s_min).index scaled = (scaled - s_min) / (s_max - s_min) # scaled[id_to_set_v] -= s_min[id_to_set_v] # scaled[id_to_set_v] /= (s_max - s_min)[id_to_set_v] scaled = scaled.replace(np.nan, 0) scaled = scaled.replace(np.inf, 0) scaled = scaled.replace(-np.inf, 0) scaled = scaled.T # scaled.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.SCALING_SUFFIX + ".csv") # scaled.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.REDUCED_SUFFIX + param.FILL_SUFFIX + param.SCALING_SUFFIX + ".csv") # scaled.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.REDUCED_SUFFIX + param.SCALING_SUFFIX + ".csv") scaled.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.SCALING_SUFFIX + ".csv")
def test_feature_set_performance(target_label=info.LABEL_GEN, base_fset=param.FEATURE_SET_ORIGINAL, reduced=False, fillna=True, scaling=True, feat_sel=False, feat_num=None, with_pred=False, clf_name='LR', reg_param=1.0): p_df = loader.load_profile_info() print "profile information is loaded" feature_set_name = base_fset feature_set_name += param.REDUCED_SUFFIX if reduced else "" feature_set_name += param.FILL_SUFFIX if fillna else "" feature_set_name += param.SCALING_SUFFIX if scaling else "" print feature_set_name user_feature_df = loader.read_csv_feature_set(feature_set_name) user_feature_df.columns = map(lambda x: int(x), user_feature_df.columns) # user_feature_df = loader.read_csv_feature_set(param.FEATURE_SET_EXTENSION_APP, # fill_na=True, normalize=False) print "user feature data set is loaded" if with_pred: pred_df = pd.read_csv(param.DATA_PATH + "/predictions/item_preds.csv", index_col='profile_id') # pred_df = pd.read_csv(param.DATA_PATH + "/predictions/content_preds.csv", index_col='profile_id') for col in pred_df.columns: uls = list(pred_df[col].unique()) uls.sort() pred_df.loc[:, col] = pred_df[col].apply(lambda x: uls.index(x)) p_df = p_df.loc[pred_df.index] user_feature_df = user_feature_df[pred_df.index] pred_df.columns = [[info.APP] * len(pred_df.columns), ['itemBased_prediction'] * len(pred_df.columns), list(pred_df.columns), [feat.NOMINAL_VAL] * len(pred_df.columns)] pred_df.columns.names = ['modality', 'field', 'feature', 'type'] user_feature_df = pd.concat([user_feature_df.T, pred_df], axis=1).T # method_types = ["LR", "MI", "MI-min10", "FS", "FS-min10", "RF-100"] method_type = "MI" if feat_sel else None cv = 10 repeat_num = 20 nf = feat_num if feat_sel else None if not os.path.isdir(param.EXPERIMENT_PATH): os.makedirs(param.EXPERIMENT_PATH) print "\nlabel, fillna, scaling, feat_sel, clf_name, reg_param, k-CV, ith-fold, featNum, accuracy" for repeat in range(repeat_num): temp_score = clf.classify(user_feature_df, p_df, feature_set_name, 
features=None, label=target_label, reg_param=reg_param, selection=feat_sel, num_feat=nf, sel_method=method_type, cv=cv)
def load_feature_set(self):
    """Load this experiment's feature-set CSV and return it as a DataFrame.

    The CSV round-trip stores the profile-id columns as strings, so they
    are converted back to ints before the frame is returned.
    """
    feature_set_name = self.get_feature_set_name()
    user_feature_df = loader.read_csv_feature_set(feature_set_name)
    user_feature_df.columns = map(int, user_feature_df.columns)
    return user_feature_df