Example #1
0
    def preprocess_xy(self, label):
        """Build an imputed feature matrix and aligned label vector for `label`.

        Pulls (x, y) from pc.get_filtered_x_y using this object's feature and
        profile DataFrames, drops all-NaN rows/columns, imputes remaining
        NaN/inf cells, and returns labels restricted to the surviving columns.

        :param label: name of the target attribute (e.g. 'gender') passed
                      through to pc.get_filtered_x_y.
        :return: (x_imp, y_filtered) — the (possibly imputed) feature
                 DataFrame and the labels indexed by x's remaining columns.
        """
        x, y = pc.get_filtered_x_y(self.feature_set_df, self.profile_df, label)

        # Drop rows, then columns, that contain nothing but NaN.
        x = x.dropna(how='all', axis=0)
        x = x.dropna(how='all', axis=1)
        # Impute only if residual NaN or +/-inf cells remain; otherwise keep
        # x untouched to avoid a needless copy.
        if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
            x_imp = pc.fill_nan_features(x)
        else:
            x_imp = x
        # list(...) is a no-op on Python 2 (map already returns a list) but
        # keeps this valid on Python 3, where a lazy map object cannot be
        # used as a pandas indexer.
        y_filtered = y[list(map(int, x.columns.values))]
        return x_imp, y_filtered
Example #2
0
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    """Rank features by randomized (stability-selection) logistic regression.

    Filters the data set against `user_info_df` for the given label, drops
    all-NaN rows, imputes remaining NaNs when present, fits a
    RandomizedLogisticRegression on instances-by-features, and returns a
    one-column DataFrame ('importance') indexed by the pre-drop feature
    index, sorted descending with NaNs last.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    cleaned = df_filtered.dropna(how='all')
    has_missing = cleaned.isnull().any().any()
    # Impute only when needed; otherwise hand the raw ndarray to the model.
    x_imp = pc.fill_nan_features(cleaned) if has_missing else cleaned.values

    model = RandomizedLogisticRegression()
    model.fit(x_imp.T, y_v)
    # NOTE: indexed by the pre-drop df_filtered.index, so features removed
    # by dropna surface as NaN importances and sort to the bottom.
    scores = DataFrame(model.scores_, index=df_filtered.index, columns=['importance'])
    scores.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return scores
Example #3
0
    def compute(x):
        x_imp = pc.fill_nan_features(x)
        try:
            m = ExtraTreesClassifier(n_estimators=n_est) if max_depth is None \
                else ExtraTreesClassifier(n_estimators=n_est, max_depth=3)
            print "\t\t\tfitting RF model..."
            m.fit(x_imp.T, y_v)

            # if len(feature_mics) > 1000:
            #     break
            # print m.feature_importances_
            for order, index in enumerate(x.index):
                feature_importances.loc[index] = m.feature_importances_[order]
                if float(order) % 10000 == 0 and order > 0:
                    print "\t\t\t%s features are done" % order
        except ValueError as e:
            # print "value error occurs during processing %r" % index
            pass
Example #4
0
def classify(data_set_df, user_info_df, feat_set_name, features=None, label='gender',
             classifier=None, reg_param=1.0, selection=False, num_feat=20, sel_method='LR',
             cv=10):
    """Cross-validated classification of `label`, with optional per-fold
    feature selection; appends per-fold scores and confusion matrices to CSVs.

    Data layout (from the code below): rows of `data_set_df` are features,
    columns are instances (instance_num = number of columns; the matrix is
    transposed before fitting).

    :param data_set_df: features-by-instances DataFrame.
    :param user_info_df: per-user profile data holding the target label.
    :param feat_set_name: tag used to name output CSVs; also scanned for
        param.FILL_SUFFIX / param.SCALING_SUFFIX when logging results.
    :param features: optional row-index subset to restrict the feature set.
    :param label: target attribute name (default 'gender').
    :param classifier: estimator to use; defaults to LogisticRegression(C=reg_param).
    :param reg_param: inverse regularization strength for the default classifier.
    :param selection: if True, run fimp.feature_selection inside each fold.
    :param num_feat: number of features kept when selection is on.
    :param sel_method: selection method; 'LR'/RF-style methods select on the
        training split directly, others on the untransformed frame.
    :param cv: requested fold count (capped at the number of labeled samples).
    :return: (score_mean, miss_clf_rate) — mean accuracy over completed folds
        (0.0 if CV never ran), and the fraction of instances dropped for
        lacking a usable label (1.0 if CV never ran).
    """
    instance_num = len(data_set_df.columns)
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered if features is None else df_filtered.loc[features]

    # Drop all-NaN rows, then all-NaN columns.
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    # Impute only when NaN or +/-inf cells survive the drops.
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        x_imp = pc.fill_nan_features(x)
        # x_imp = dense_df.loc[x.index, x.columns]
    else:
        x_imp = x
    # Restrict labels to the surviving instance columns (Py2: map returns a list).
    y_filtered = y_v[(map(int, x.columns.values))]

    clf = LogisticRegression(C=reg_param) if classifier is None else classifier
    # Cannot have more folds than labeled samples.
    cv_num = min(len(y_filtered), cv)
    score_mean = 0.0
    miss_clf_rate = 1.0
    # CV only makes sense with >1 fold and >1 distinct class.
    if cv_num > 1 and len(y_filtered.unique()) > 1:
        # Old sklearn (<0.18) KFold API: first arg is the sample count.
        kf = KFold(y_filtered.shape[0], n_folds=cv_num, shuffle=True)
        # skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
        fold = 0
        result_str = ""
        matrix_str = ""
        for tr_index, te_index in kf:
            fold += 1
            # Transpose to instances-by-features before splitting.
            x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]

            if selection:
                # Select features inside the fold to avoid leaking test data.
                if sel_method == 'LR' or 'RF' in sel_method:
                    feat_index = fimp.feature_selection(x_train.T, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                else:
                    # Other methods operate on the raw (un-imputed) frame.
                    x_tr_df, x_te_df = x.T.iloc[tr_index].T, x.T.iloc[te_index].T
                    feat_index = fimp.feature_selection(x_tr_df, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                x_train = x_train.loc[:, feat_index].values
                x_test = x_test.loc[:, feat_index].values

            try:
                clf.fit(x_train, y_train)
                score = clf.score(x_test, y_test)
                score_mean += score

                # One CSV row per fold: config flags, fold id, #features, score.
                result_str += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" \
                              % (label, True if param.FILL_SUFFIX in feat_set_name else False,
                                 True if param.SCALING_SUFFIX in feat_set_name else False, selection, 'LR',
                                 reg_param, cv, fold, x_train.shape[1], score)
                cf_mat = confusion_matrix(y_test, clf.predict(x_test),
                                          labels=range(len(info.LABEL_CATEGORY[label])))
                matrix_str += np.array_str(cf_mat) + "\n"
            except ValueError:
                # Best-effort: a fold that fails to fit is skipped (note:
                # `fold` was already incremented, so the mean still divides
                # by the attempted-fold count).
                pass
                # traceback.print_exc()
                # print i, "why error? skip!"

        print result_str
        # Append (mode='a') so repeated runs accumulate into the same files.
        file_name = "%s/new_%s.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(result_str)

        file_name = "%s/new_%s_mat.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(matrix_str)

        if fold > 0:
            score_mean = score_mean / fold
            miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
    return score_mean, miss_clf_rate