def write_feature_importance_files(n_est=10, max_depth=None): profile_ids = map(lambda x: int(x.split("_")[1]), os.listdir(param.FEATURE_AGG_PATH)) profile_ids.sort() user_feature_df = loader.read_csv_dataset(profile_ids=profile_ids) print "user feature data set is loaded" # mod_feature_dfs = {mod: loader.read_csv_dataset(profile_ids=profile_ids, modality=mod) # for mod in info.FREE_MODE_LIST} # print "each modality feature data sets are loaded" p_df = loader.load_profile_info() print "profile information is loaded" target_labels = info.LABEL_LIST # target_labels = ['gender', 'age', 'job', 'religion', 'marriage', 'numberOfChildren', 'income', 'education'] # target_label = 'religion' ranking_limits = [-1] # min_not_nans = [-1, 10] if not os.path.isdir(param.IMPORTANCE_PATH): os.makedirs(param.IMPORTANCE_PATH) for target_label in target_labels: features_powers_lr = fimp.compute_randomized_lr_score(user_feature_df, p_df, target_label) # features_powers_mi = fimp.compute_mics(user_feature_df, p_df, target_label) # features_powers_fs_10 = fimp.compute_fscore(user_feature_df, p_df, target_label, min_not_nan=10) for r_l in ranking_limits: features_powers_lr.iloc[:r_l].to_csv( "%s/%s_%s_%s-%s.csv" % (param.IMPORTANCE_PATH, target_label, "withAppFeat", "LR", r_l if r_l > -1 else "all"))
    # NOTE(review): this is the TAIL of a function whose `def` line lies above
    # this chunk -- `split_modal`, `modalities`, `df_filtered`, `compute`, and
    # `feature_importances` are all bound earlier in that function.
    if split_modal is True:
        # Score each modality's feature rows separately.
        for modal in modalities:
            x = df_filtered.loc[modal].dropna(how='all')
            compute(x)
    else:
        # Score the full (cross-modality) feature frame in one pass.
        x = df_filtered.dropna(how='all')
        compute(x)
    # In-place sort; NaN importances go to the bottom of the ranking.
    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances


if __name__ == '__main__':
    # Profile directories are named "<prefix>_<id>"; extract the numeric ids.
    # Only the first 20 profiles are used -- presumably a debugging subset; confirm.
    profile_ids = map(lambda x: int(x.split("_")[1]), os.listdir(param.FEATURE_AGG_PATH))[:20]
    user_feature_df = data_loader.read_csv_dataset(profile_ids=profile_ids)
    # user_feature_df = ida_data_loader.read_csv_dataset(file_name=csv_file)
    print "user feature data set is loaded"
    # mod_feature_dfs = {mod: ida_data_loader.read_csv_dataset(profile_ids=profile_ids, modality=mod)
    #                    for mod in info.FREE_MODE_LIST}
    # print "each modality feature data sets are loaded"
    profile_df = data_loader.load_profile_info()
    print "profile information is loaded"
    # target_labels = info.LABEL_LIST
    # NOTE(review): target_labels is assigned but never passed to the call
    # below -- looks like leftover configuration; verify intent.
    target_labels = [info.LABEL_GEN, info.LABEL_AGE]
    # target_labels = ['gender', 'age', 'job', 'religion', 'marriage', 'numberOfChildren', 'income', 'education']
    compute_randomized_lr_score(user_feature_df, profile_df)
    # compute_fscore(user_feature_df, profile_df)