def write_feature_importance_files(n_est=10, max_depth=None):
    # n_est / max_depth are unused here; presumably reserved for a random-forest scorer.
    profile_ids = map(lambda x: int(x.split("_")[1]), os.listdir(param.FEATURE_AGG_PATH))
    profile_ids.sort()
    user_feature_df = loader.read_csv_dataset(profile_ids=profile_ids)
    print "user feature data set is loaded"
    # mod_feature_dfs = {mod: loader.read_csv_dataset(profile_ids=profile_ids, modality=mod)
    #                    for mod in info.FREE_MODE_LIST}
    # print "each modality feature data sets are loaded"
    p_df = loader.load_profile_info()
    print "profile information is loaded"

    target_labels = info.LABEL_LIST
    # target_labels = ['gender', 'age', 'job', 'religion', 'marriage', 'numberOfChildren', 'income', 'education']
    # target_label = 'religion'
    ranking_limits = [-1]
    # min_not_nans = [-1, 10]

    if not os.path.isdir(param.IMPORTANCE_PATH):
        os.makedirs(param.IMPORTANCE_PATH)

    for target_label in target_labels:
        features_powers_lr = fimp.compute_randomized_lr_score(user_feature_df, p_df, target_label)
        # features_powers_mi = fimp.compute_mics(user_feature_df, p_df, target_label)
        # features_powers_fs_10 = fimp.compute_fscore(user_feature_df, p_df, target_label, min_not_nan=10)
        for r_l in ranking_limits:
            # r_l == -1 means "no limit": write the full ranking rather than
            # iloc[:-1], which would silently drop the last row.
            ranked = features_powers_lr.iloc[:r_l] if r_l > -1 else features_powers_lr
            ranked.to_csv("%s/%s_%s_%s-%s.csv"
                          % (param.IMPORTANCE_PATH, target_label, "withAppFeat", "LR",
                             r_l if r_l > -1 else "all"))
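# Usage sketch (assumes fimp, loader, param, and info are importable in this module):
#
#     write_feature_importance_files()
#
# writes one "<label>_withAppFeat_LR-all.csv" ranking per target label under
# param.IMPORTANCE_PATH; n_est and max_depth only matter once an RF scorer is enabled.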
def test_feature_set_performance(target_label=info.LABEL_GEN, base_fset=param.FEATURE_SET_ORIGINAL,
                                 reduced=False, fillna=True, scaling=True,
                                 feat_sel=False, feat_num=None, with_pred=False,
                                 clf_name='LR', reg_param=1.0):
    p_df = loader.load_profile_info()
    print "profile information is loaded"

    feature_set_name = base_fset
    feature_set_name += param.REDUCED_SUFFIX if reduced else ""
    feature_set_name += param.FILL_SUFFIX if fillna else ""
    feature_set_name += param.SCALING_SUFFIX if scaling else ""
    print feature_set_name

    user_feature_df = loader.read_csv_feature_set(feature_set_name)
    user_feature_df.columns = map(int, user_feature_df.columns)  # column labels are profile ids
    # user_feature_df = loader.read_csv_feature_set(param.FEATURE_SET_EXTENSION_APP,
    #                                               fill_na=True, normalize=False)
    print "user feature data set is loaded"

    if with_pred:
        pred_df = pd.read_csv(param.DATA_PATH + "/predictions/item_preds.csv", index_col='profile_id')
        # pred_df = pd.read_csv(param.DATA_PATH + "/predictions/content_preds.csv", index_col='profile_id')
        # Re-encode each prediction column as the rank of its value among the column's sorted uniques.
        for col in pred_df.columns:
            uls = sorted(pred_df[col].unique())
            pred_df.loc[:, col] = pred_df[col].apply(lambda x: uls.index(x))
        # Keep only users that have predictions; user columns are profile ids here.
        p_df = p_df.loc[pred_df.index]
        user_feature_df = user_feature_df[pred_df.index]
        # Attach the predictions as extra nominal features under the repo's 4-level
        # column index (modality, field, feature, type).
        pred_df.columns = [[info.APP] * len(pred_df.columns),
                           ['itemBased_prediction'] * len(pred_df.columns),
                           list(pred_df.columns),
                           [feat.NOMINAL_VAL] * len(pred_df.columns)]
        pred_df.columns.names = ['modality', 'field', 'feature', 'type']
        user_feature_df = pd.concat([user_feature_df.T, pred_df], axis=1).T

    # method_types = ["LR", "MI", "MI-min10", "FS", "FS-min10", "RF-100"]
    method_type = "MI" if feat_sel else None
    cv = 10
    repeat_num = 20
    nf = feat_num if feat_sel else None

    if not os.path.isdir(param.EXPERIMENT_PATH):
        os.makedirs(param.EXPERIMENT_PATH)

    print "\nlabel, fillna, scaling, feat_sel, clf_name, reg_param, k-CV, ith-fold, featNum, accuracy"
    for repeat in range(repeat_num):
        # clf.classify presumably prints the per-fold rows matching the header above;
        # its return value is not used here.
        temp_score = clf.classify(user_feature_df, p_df, feature_set_name, features=None,
                                  label=target_label, reg_param=reg_param, selection=feat_sel,
                                  num_feat=nf, sel_method=method_type, cv=cv)
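# Usage sketch: evaluate the NaN-filled, scaled original feature set on the gender
# label with mutual-information feature selection (values below are illustrative):
#
#     test_feature_set_performance(target_label=info.LABEL_GEN,
#                                  base_fset=param.FEATURE_SET_ORIGINAL,
#                                  feat_sel=True, feat_num=100, reg_param=0.1)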
def __init__(self, exp_name="", base_set=param.FEATURE_SET_ORIGINAL, reduced_set=False,
             fill=False, scale=True, label_categorize=True):
    self.exp_name = exp_name
    self.feature_set_info = {"base": base_set, "reduced": reduced_set,
                             "fillNa": fill, "scaling": scale}
    self.profile_df = loader.load_profile_info(categorize=label_categorize)
    self.feature_set_df = self.load_feature_set()
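# Instantiation sketch; the enclosing class is not shown in this excerpt, so the
# "Experiment" name below is hypothetical:
#
#     exp = Experiment(exp_name="gender_lr", base_set=param.FEATURE_SET_ORIGINAL,
#                      fill=True, scale=True)
#     exp.feature_set_df   # populated via self.load_feature_set() in __init__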
    # Tail of the importance computation: drop all-NaN rows, score the remaining
    # features, and return the ranking sorted by descending importance.
    x = df_filtered.dropna(how='all')
    compute(x)
    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances


if __name__ == '__main__':
    profile_ids = map(lambda x: int(x.split("_")[1]), os.listdir(param.FEATURE_AGG_PATH))[:20]
    user_feature_df = data_loader.read_csv_dataset(profile_ids=profile_ids)
    # user_feature_df = ida_data_loader.read_csv_dataset(file_name=csv_file)
    print "user feature data set is loaded"
    # mod_feature_dfs = {mod: ida_data_loader.read_csv_dataset(profile_ids=profile_ids, modality=mod)
    #                    for mod in info.FREE_MODE_LIST}
    # print "each modality feature data sets are loaded"
    profile_df = data_loader.load_profile_info()
    print "profile information is loaded"

    # target_labels = info.LABEL_LIST
    target_labels = [info.LABEL_GEN, info.LABEL_AGE]
    # target_labels = ['gender', 'age', 'job', 'religion', 'marriage', 'numberOfChildren', 'income', 'education']

    compute_randomized_lr_score(user_feature_df, profile_df)
    # compute_fscore(user_feature_df, profile_df)
    #
    # ranking_limits = [1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 75, 100]
    # ranking_limits = [1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 75, 100, 125, 150, 200, 250, 300, 400, 500,
    #                   750, 1000, 1500, 2000, 3000, 4000, 5000, 7500, 10000, 15000, 20000, 50000, -1]
    #
    # if not os.path.isdir(param.IMPORTANCE_PATH):
    #     os.makedirs(param.IMPORTANCE_PATH)
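# A minimal sketch of the randomized-LR scorer referenced above. Assumption: it wraps
# scikit-learn's stability-selection estimator from the same 0.x API generation as the
# sklearn.cross_validation imports used elsewhere in this repo; the project's real
# implementation is not shown in this excerpt.
from pandas import DataFrame
from sklearn.linear_model import RandomizedLogisticRegression

def compute_randomized_lr_score_sketch(x_df, y):
    rlr = RandomizedLogisticRegression(n_resampling=200, selection_threshold=0.25)
    rlr.fit(x_df.values, y)
    # One stability score per feature; higher means the feature is selected in more resamples.
    scores = DataFrame({'importance': rlr.scores_}, index=x_df.columns)
    return scores.sort_values('importance', ascending=False)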
def insert_new_features_to_csv(modality, file_name):
    p_df = loader.load_profile_info()
    ids = list(p_df.index)
    ids.sort()
def data_set_random_partition_algorithm():
    user_table = loader.load_profile_info()
    train, test, error = train_test_random_partition(user_table)
    write_data_set_partition(train, test, error)
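# A minimal sketch of what train_test_random_partition might do; the real
# implementation is not shown in this excerpt. The 80/20 ratio and the empty error
# list are assumptions for illustration only.
import random

def train_test_random_partition_sketch(user_table, test_ratio=0.2, seed=0):
    ids = list(user_table.index)
    random.Random(seed).shuffle(ids)   # deterministic shuffle of profile ids
    n_test = int(len(ids) * test_ratio)
    test_ids, train_ids = ids[:n_test], ids[n_test:]
    error_ids = []                     # profiles that fail validation would go here
    return user_table.loc[train_ids], user_table.loc[test_ids], error_ids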
import os
import sys
import time

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# param, loader, and info are project-local modules, assumed to be imported elsewhere
# in this file.
uid_list = sorted(map(lambda x: int(x.split("_")[1]), os.listdir(param.CHUNK_PATH)))
user_exp_dict = {uid: sorted(map(lambda x: int(x.split("_")[1].split(".")[0]),
                                 os.listdir("%s/user_%s" % (param.CHUNK_PATH, uid))))
                 for uid in uid_list}
user_info_df = loader.load_profile_info(categorize=False)
user_info_cat_df = loader.load_profile_info(categorize=True)


def current_timestamp():
    # Current wall-clock time in milliseconds.
    return int(round(time.time() * 1000))


def make_data_set():
    x_df = []
    y_df_dict = {lb: [] for lb in info.LABEL_LIST}
    for uid in uid_list:
        print uid, "done"
        sys.stdout.flush()
        u_info_series = user_info_df.query('profileId == %s' % uid).iloc[0]
    if info.MOD_FIELD_TYPE[modality][field] in info.VAR_NUMERIC:
        bin_split_dic = feat.BIN_INFO['freq']
        # one category per frequency bin, plus one for values past the last split point
        categories += range(len(bin_split_dic[(modality, field)]) + 1)
    elif info.MOD_FIELD_TYPE[modality][field] in info.VAR_CATEGORICAL:
        if info.MOD_FIELD_TYPE[modality][field] == info.VAR_BINARY:
            categories += range(2)
        else:
            pass
    return categories


if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('utf-8')  # Python 2: allow implicit utf-8 conversions

    profile_df = loader.load_profile_info()
    target_tables = info.MOD_LIST
    if param.PERMISSION_FREE:
        target_tables = info.FREE_MODE_LIST
    data_set = None

    # file_names = os.listdir('data_set/features/')
    # profile_ids = []
    # for file_name in file_names:
    #     modiff_time = datetime.datetime.strptime(time.ctime(os.path.getmtime('data_set/features/%s' % file_name)),
    #                                              "%a %b %d %H:%M:%S %Y")
    #     if (datetime.datetime.now() - modiff_time).seconds > 3600:
    #         profile_ids.append(file_name)
    # profile_ids = map(lambda x: int(x.split("_")[1]), profile_ids)
    # profile_ids.sort()
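# Sketch of how the frequency-bin split points above map raw numeric values onto the
# category indices range(len(splits) + 1); np.digitize is a stand-in for whatever
# binning helper the project actually uses (not shown in this excerpt).
import numpy as np

def to_bin_category(values, bin_splits):
    # k split points define k + 1 categories, matching the enumeration above.
    return np.digitize(values, bin_splits)

# e.g. to_bin_category([0.5, 3.0, 10.0], [1.0, 5.0]) -> array([0, 1, 2])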