Example 1
import numpy as np

import loader  # project-local loading helpers (import path assumed)
import param   # project-local path/suffix constants (import path assumed)


def write_whole_set():
    # p_df = loader.load_profile_info()
    # print "profile information is loaded"
    #
    # user_feature_df = loader.read_csv_dataset(profile_ids=list(p_df.index))
    user_feature_df = loader.read_csv_feature_set(param.FEATURE_SET)
    print "user feature data set is loaded"

    # rows are features, columns are users; drop any that are entirely NaN
    feat_num_org = len(user_feature_df)
    user_feature_df = user_feature_df.dropna(how='all', axis=0)
    user_feature_df = user_feature_df.dropna(how='all', axis=1)
    print "%d all-NaN features were dropped." % (feat_num_org - len(user_feature_df))

    # user_feature_df.reset_index(level=2, inplace=True)
    # user_feature_df = user_feature_df[~user_feature_df['feature'].str.contains("DiscretVD")]
    # user_feature_df = user_feature_df[~user_feature_df['feature'].str.contains("_h")]
    # user_feature_df.set_index('feature', append=True, inplace=True)
    # user_feature_df = user_feature_df.reorder_levels(['modality', 'field', 'feature', 'type'])
    # user_feature_df.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.REDUCED_SUFFIX + ".csv")

    # print "filling nan values.."
    # dense_df = DataFrame(pc.fill_nan_features(user_feature_df),
    #                      index=user_feature_df.index, columns=user_feature_df.columns)
    # # dense_df = user_feature_df
    # dense_df.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.REDUCED_SUFFIX + param.FILL_SUFFIX + ".csv")

    # print "normalize.."
    # norm_df = dense_df.T
    # norm_df = (norm_df - norm_df.mean()) / norm_df.std()
    # norm_df = norm_df.T
    # norm_df = norm_df.replace(np.nan, 0)
    # norm_df.to_csv(param.DATA_PATH + "/" + param.FEATURE_SET + param.FILL_SUFFIX + param.NORMALIZE_SUFFIX + ".csv")

    print "zero-one scaling.."
    scaled = user_feature_df.T
    # scaled = dense_df.T
    s_max, s_min = scaled.max(), scaled.min()
    # id_to_set_v = (s_max != s_min).index

    scaled = (scaled - s_min) / (s_max - s_min)
    # scaled[id_to_set_v] -= s_min[id_to_set_v]
    # scaled[id_to_set_v] /= (s_max - s_min)[id_to_set_v]
    scaled = scaled.replace(np.nan, 0)
    scaled = scaled.replace(np.inf, 0)
    scaled = scaled.replace(-np.inf, 0)
    scaled = scaled.T
    # scaled.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.SCALING_SUFFIX + ".csv")
    # scaled.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.REDUCED_SUFFIX + param.FILL_SUFFIX + param.SCALING_SUFFIX + ".csv")
    # scaled.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.REDUCED_SUFFIX + param.SCALING_SUFFIX + ".csv")
    scaled.to_csv(param.FEATURE_SET_PATH + "/" + param.FEATURE_SET + param.SCALING_SUFFIX + ".csv")
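
The scaling block above is plain per-feature min-max normalization; any constant feature makes (s_max - s_min) zero, so its scaled values come out as NaN/inf and are then zeroed by the replace call. A minimal sketch of the same steps on a toy frame (names and values here are illustrative, not from the project):

import numpy as np
from pandas import DataFrame

# toy feature set laid out like the real one: rows are features, columns are users
toy = DataFrame([[1.0, 2.0, 3.0],
                 [5.0, 5.0, 5.0]],          # constant feature -> max == min
                index=['f1', 'f2'], columns=[101, 102, 103])

scaled = toy.T                              # features become columns
s_max, s_min = scaled.max(), scaled.min()
scaled = (scaled - s_min) / (s_max - s_min) # 'f2' divides by zero -> NaN
scaled = scaled.replace([np.nan, np.inf, -np.inf], 0)
print(scaled.T)                             # f1 -> 0.0, 0.5, 1.0; f2 -> all zeros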
Example 2
import os

import pandas as pd

import clf     # project-local classification helpers (import path assumed)
import feat    # project-local feature-type constants (import path assumed)
import info    # project-local label constants (import path assumed)
import loader  # project-local loading helpers (import path assumed)
import param   # project-local path/suffix constants (import path assumed)


def test_feature_set_performance(target_label=info.LABEL_GEN, base_fset=param.FEATURE_SET_ORIGINAL,
                                 reduced=False, fillna=True, scaling=True,
                                 feat_sel=False, feat_num=None, with_pred=False, clf_name='LR', reg_param=1.0):
    p_df = loader.load_profile_info()
    print "profile information is loaded"

    # build the feature-set file name by appending the suffixes selected by the flags
    feature_set_name = base_fset
    feature_set_name += param.REDUCED_SUFFIX if reduced else ""
    feature_set_name += param.FILL_SUFFIX if fillna else ""
    feature_set_name += param.SCALING_SUFFIX if scaling else ""
    print feature_set_name

    user_feature_df = loader.read_csv_feature_set(feature_set_name)
    # column labels are profile ids, read back from CSV as strings; cast to int
    user_feature_df.columns = map(int, user_feature_df.columns)
    # user_feature_df = loader.read_csv_feature_set(param.FEATURE_SET_EXTENSION_APP,
    #                                               fill_na=True, normalize=False)
    print "user feature data set is loaded"

    if with_pred:
        pred_df = pd.read_csv(param.DATA_PATH + "/predictions/item_preds.csv", index_col='profile_id')
        # pred_df = pd.read_csv(param.DATA_PATH + "/predictions/content_preds.csv", index_col='profile_id')
        # ordinal-encode each prediction column: replace every value with its rank
        # among the column's sorted unique values
        for col in pred_df.columns:
            uls = sorted(pred_df[col].unique())
            pred_df.loc[:, col] = pred_df[col].apply(lambda x: uls.index(x))

        # keep only the users that actually have predictions
        p_df = p_df.loc[pred_df.index]
        user_feature_df = user_feature_df[pred_df.index]

        # give the prediction columns the same 4-level (modality, field, feature, type)
        # header as the feature set, then append them to it as extra feature rows
        pred_df.columns = [[info.APP] * len(pred_df.columns), ['itemBased_prediction'] * len(pred_df.columns),
                           list(pred_df.columns), [feat.NOMINAL_VAL] * len(pred_df.columns)]
        pred_df.columns.names = ['modality', 'field', 'feature', 'type']
        user_feature_df = pd.concat([user_feature_df.T, pred_df], axis=1).T

    # method_types = ["LR", "MI", "MI-min10", "FS", "FS-min10", "RF-100"]
    # the "MI" feature-selection method is used only when feat_sel is requested
    method_type = "MI" if feat_sel else None
    cv = 10
    repeat_num = 20
    nf = feat_num if feat_sel else None

    if not os.path.isdir(param.EXPERIMENT_PATH):
        os.makedirs(param.EXPERIMENT_PATH)

    print "\nlabel, fillna, scaling, feat_sel, clf_name, reg_param, k-CV, ith-fold, featNum, accuracy"

    for repeat in range(repeat_num):
        temp_score = clf.classify(user_feature_df, p_df, feature_set_name, features=None, label=target_label,
                                  reg_param=reg_param, selection=feat_sel, num_feat=nf,
                                  sel_method=method_type, cv=cv)
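
The with_pred branch above ordinal-encodes each prediction column before attaching it: every value is replaced by its position in the sorted list of the column's unique values. A small self-contained illustration (toy data, not from the project):

import pandas as pd

preds = pd.Series(['often', 'rare', 'often', 'never'])
uls = sorted(preds.unique())        # ['never', 'often', 'rare']
encoded = preds.apply(lambda x: uls.index(x))
print(encoded.tolist())             # [1, 2, 1, 0]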
Example 3
def load_feature_set(self):
    feature_set_name = self.get_feature_set_name()
    user_feature_df = loader.read_csv_feature_set(feature_set_name)
    # column labels are profile ids, read back from CSV as strings; cast to int
    user_feature_df.columns = map(int, user_feature_df.columns)
    return user_feature_df
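
The int cast in load_feature_set (and in Example 2) matters because a round-trip through CSV turns the integer profile-id column labels into strings, so later integer-based lookups such as user_feature_df[pred_df.index] would fail. A toy round-trip showing the effect (the file name is illustrative):

import pandas as pd

df = pd.DataFrame([[0.1, 0.2]], index=['f1'], columns=[101, 102])
df.to_csv('toy_feature_set.csv')

back = pd.read_csv('toy_feature_set.csv', index_col=0)
print(back.columns.tolist())                  # ['101', '102'] -- strings now
back.columns = [int(c) for c in back.columns] # restore integer profile ids
print(back[101].tolist())                     # [0.1] -- integer lookup works again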