Example #1
0
def compute_mics(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    """Score each feature row by its maximal information coefficient (MIC)
    against the target label.

    Parameters
    ----------
    data_set_df : DataFrame, one feature per row and one user per column.
    user_info_df : DataFrame with user profile data (source of the label).
    label : profile column used as the classification target.
    min_not_nan : if >= 0, NaNs are dropped per feature and any feature with
        fewer than this many non-NaN values is scored NaN; if < 0, the raw
        row is scored against the full label vector.

    Returns
    -------
    DataFrame with a single 'importance' column holding the MIC per feature,
    sorted descending with NaNs last.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_mics = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    for index, values in df_filtered.iterrows():
        m = minepy.MINE()
        try:
            if min_not_nan < 0:
                m.compute_score(values, y_v)
                feature_mics.loc[index] = m.mic()
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_mics.loc[index] = np.nan
                else:
                    # align the label vector with the samples that survived dropna
                    m.compute_score(nan_removed, y_v[nan_removed.index.astype(int)])
                    feature_mics.loc[index] = m.mic()
        except ValueError:
            # MINE rejects some degenerate inputs; keep the 0.0 placeholder
            continue
    feature_mics.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_mics
Example #2
0
def compute_fscore(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    """Score each feature row by its ANOVA F-score against the target label.

    Parameters
    ----------
    data_set_df : DataFrame, one feature per row and one user per column.
    user_info_df : DataFrame with user profile data (source of the label).
    label : profile column used as the classification target.
    min_not_nan : if >= 0, NaNs are dropped per feature and any feature with
        fewer than this many non-NaN values is scored NaN; if < 0, NaNs are
        mean-filled and the full label vector is used.

    Returns
    -------
    DataFrame with a single 'importance' column (F-score per feature, NaN in
    place of +/-inf), sorted descending with NaNs last.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_fs = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    i = 0
    for index, values in df_filtered.iterrows():
        try:
            if min_not_nan < 0:
                # f_classif needs a 2-D sample matrix; go through .values
                # because Series[:, np.newaxis] relies on removed pandas
                # fallback indexing
                filled = values.fillna(values.mean()).values[:, np.newaxis]
                f_score, p_val = f_classif(filled, y_v)
                fs = f_score[0]
                feature_fs.loc[index] = np.nan if np.isinf(fs) else fs
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_fs.loc[index] = np.nan
                else:
                    # align the label vector with the surviving samples
                    f_score, p_val = f_classif(nan_removed.values[:, np.newaxis],
                                               y_v[nan_removed.index.astype(int)])
                    fs = f_score[0]
                    feature_fs.loc[index] = np.nan if np.isinf(fs) else fs
            if i % 10000 == 0 and i > 0:
                print("\t\t\t%s features are done" % i)
            i += 1
        except ValueError:
            # degenerate feature (e.g. constant column) — skip, keep placeholder
            continue
    feature_fs.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_fs
Example #3
0
def compute_importances(data_set_df, user_info_df, label='gender', split_modal=False, n_est=10, max_depth=None):
    """Rank features by extra-trees (randomized forest) importance.

    Parameters
    ----------
    data_set_df : DataFrame with a (modality, feature) MultiIndex on rows.
    user_info_df : DataFrame with user profile data (source of the label).
    label : profile column used as the classification target.
    split_modal : if True, fit one model per top-level modality instead of a
        single model over all features.
    n_est : number of trees in the ensemble.
    max_depth : tree depth limit, or None for unbounded trees.

    Returns
    -------
    DataFrame with a single 'importance' column, sorted descending with NaNs
    last. Features never reached by a fit keep their 0.0 placeholder.
    """
    print("\t\t\tfilling nan values...")
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_importances = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    modalities = data_set_df.index.levels[0]

    def compute(x):
        # trees cannot handle missing values; impute before fitting
        x_imp = pc.fill_nan_features(x)
        try:
            # BUG FIX: max_depth was previously hard-coded to 3 here,
            # silently ignoring the caller's max_depth argument
            m = ExtraTreesClassifier(n_estimators=n_est) if max_depth is None \
                else ExtraTreesClassifier(n_estimators=n_est, max_depth=max_depth)
            print("\t\t\tfitting RF model...")
            # x is features-by-users; transpose to samples-by-features
            m.fit(x_imp.T, y_v)
            for order, index in enumerate(x.index):
                feature_importances.loc[index] = m.feature_importances_[order]
                if order % 10000 == 0 and order > 0:
                    print("\t\t\t%s features are done" % order)
        except ValueError:
            # degenerate inputs (e.g. a single class) — leave zeros in place
            pass

    if split_modal is True:
        for modal in modalities:
            compute(df_filtered.loc[modal].dropna(how='all'))
    else:
        compute(df_filtered.dropna(how='all'))

    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
Example #4
0
    def preprocess_xy(self, label):
        """Build an imputed feature matrix and aligned label vector.

        Drops all-NaN feature rows and all-NaN user columns from the filtered
        feature set, imputes remaining NaN/inf values, and returns the label
        entries for the users that survived the column drop.

        Returns (x_imp, y_filtered).
        """
        x, y = pc.get_filtered_x_y(self.feature_set_df, self.profile_df, label)

        # drop features, then users, that are entirely NaN
        x = x.dropna(how='all', axis=0)
        x = x.dropna(how='all', axis=1)
        if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
            x_imp = pc.fill_nan_features(x)
        else:
            x_imp = x
        # columns are user ids; list(...) keeps this working on Python 3
        # where map() returns a lazy iterator rather than a list
        y_filtered = y[list(map(int, x.columns.values))]
        return x_imp, y_filtered
Example #5
0
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    """Score features by stability selection with randomized logistic regression.

    Returns a DataFrame with a single 'importance' column (the model's
    selection scores per feature), sorted descending with NaNs last.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    # impute only when missing values remain after the row drop
    if x.isnull().any().any():
        x_imp = pc.fill_nan_features(x)
    else:
        x_imp = x.values

    model = RandomizedLogisticRegression()
    # x_imp is features-by-users; fit expects samples-by-features
    model.fit(x_imp.T, y_v)

    scores = DataFrame(model.scores_, index=df_filtered.index, columns=['importance'])
    scores.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return scores
Example #6
0
def classify(data_set_df, user_info_df, feat_set_name, features=None, label='gender',
             classifier=None, reg_param=1.0, selection=False, num_feat=20, sel_method='LR',
             cv=10):
    """Cross-validated classification of a profile label from the feature set.

    Runs up to `cv`-fold CV with the given classifier (default: logistic
    regression with C=reg_param), optionally doing per-fold feature selection.
    Appends one result row and one confusion matrix per fold to CSV files
    under param.EXPERIMENT_PATH.

    Returns (mean fold accuracy, fraction of original instances excluded by
    label/feature filtering). Both keep their defaults (0.0, 1.0) when there
    are too few samples or only one class.
    """
    # total number of users before any filtering (columns are users)
    instance_num = len(data_set_df.columns)
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    # optionally restrict to an explicit feature (row) subset
    x = df_filtered if features is None else df_filtered.loc[features]

    # drop features, then users, that are entirely NaN
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        x_imp = pc.fill_nan_features(x)
        # x_imp = dense_df.loc[x.index, x.columns]
    else:
        x_imp = x
    # align labels with the users that survived the column drop
    y_filtered = y_v[(map(int, x.columns.values))]

    clf = LogisticRegression(C=reg_param) if classifier is None else classifier
    # cannot have more folds than samples
    cv_num = min(len(y_filtered), cv)
    score_mean = 0.0
    miss_clf_rate = 1.0
    if cv_num > 1 and len(y_filtered.unique()) > 1:
        # NOTE: old scikit-learn KFold API (length first, n_folds keyword)
        kf = KFold(y_filtered.shape[0], n_folds=cv_num, shuffle=True)
        # skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
        fold = 0
        result_str = ""
        matrix_str = ""
        for tr_index, te_index in kf:
            fold += 1
            # x_imp is features-by-users; transpose to samples-by-features
            x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]

            if selection:
                # per-fold feature selection (fit on training fold only)
                if sel_method == 'LR' or 'RF' in sel_method:
                    # model-based selectors run on the imputed matrix
                    feat_index = fimp.feature_selection(x_train.T, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                else:
                    # other selectors run on the raw (un-imputed) values
                    x_tr_df, x_te_df = x.T.iloc[tr_index].T, x.T.iloc[te_index].T
                    feat_index = fimp.feature_selection(x_tr_df, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                x_train = x_train.loc[:, feat_index].values
                x_test = x_test.loc[:, feat_index].values

            try:
                clf.fit(x_train, y_train)
                score = clf.score(x_test, y_test)
                score_mean += score

                # one CSV row per fold: label, filled?, scaled?, selection,
                # classifier tag, C, cv, fold, #features, accuracy.
                # NOTE(review): the classifier tag is hard-coded to 'LR' even
                # when a custom classifier is passed — confirm intent
                result_str += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" \
                              % (label, True if param.FILL_SUFFIX in feat_set_name else False,
                                 True if param.SCALING_SUFFIX in feat_set_name else False, selection, 'LR',
                                 reg_param, cv, fold, x_train.shape[1], score)
                cf_mat = confusion_matrix(y_test, clf.predict(x_test),
                                          labels=range(len(info.LABEL_CATEGORY[label])))
                matrix_str += np.array_str(cf_mat) + "\n"
            except ValueError:
                # best-effort: skip folds the classifier cannot fit
                pass
                # traceback.print_exc()
                # print i, "why error? skip!"

        print result_str
        # append per-fold rows and confusion matrices to the experiment logs
        file_name = "%s/new_%s.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(result_str)

        file_name = "%s/new_%s_mat.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(matrix_str)

        if fold > 0:
            score_mean = score_mean / fold
            # fraction of original users lost to filtering, not a CV error rate
            miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
    return score_mean, miss_clf_rate