Example #1
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder


def evaluate(X, y, args):
    # Cross-validated score for the model/metric named in args; MAE and
    # MSE come back negated, per sklearn's scoring convention.
    if args.task == 'regression':
        if args.model == 'LR':
            model = Lasso()
        elif args.model == 'RF':
            model = RandomForestRegressor(n_estimators=10, random_state=0)
        if args.evaluate == 'mae':
            r_mae = cross_val_score(model,
                                    X,
                                    y,
                                    cv=5,
                                    scoring='neg_mean_absolute_error').mean()
            return r_mae
        elif args.evaluate == 'mse':
            r_mse = cross_val_score(model,
                                    X,
                                    y,
                                    cv=5,
                                    scoring='neg_mean_squared_error').mean()
            return r_mse
        elif args.evaluate == 'r2':
            r_r2 = cross_val_score(model, X, y, cv=5).mean()
            return r_r2

    elif args.task == 'classification':
        le = LabelEncoder()
        y = le.fit_transform(y)

        if args.model == 'RF':
            model = RandomForestClassifier(n_estimators=10, random_state=0)
        elif args.model == 'LR':
            model = LogisticRegression(multi_class='ovr')
        elif args.model == 'SVM':
            model = svm.SVC()
        if args.evaluate == 'f_score':
            # 'f1' scoring assumes a binary target.
            s = cross_val_score(model, X, y, scoring='f1', cv=5).mean()
        elif args.evaluate == 'auc':
            # AUC is computed on a fixed 90/10 holdout with a fresh forest;
            # evaluate_ is a scoring helper defined elsewhere in the project.
            model = RandomForestClassifier(max_depth=10, random_state=0)
            split_pos = X.shape[0] // 10
            X_train, X_test = X[:9 * split_pos], X[9 * split_pos:]
            y_train, y_test = y[:9 * split_pos], y[9 * split_pos:]
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_test)
            s = evaluate_(y_test, y_pred)
        return s
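
A minimal smoke test for evaluate(), assuming the imports above; the synthetic data and the SimpleNamespace stand-in for args are illustrative, not from the original project:

from types import SimpleNamespace
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, random_state=0)
args_demo = SimpleNamespace(task='regression', model='LR', evaluate='mse')
print(evaluate(X_demo, y_demo, args_demo))  # mean negated MSE over 5 folds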
Example #2
# Requires numpy as np, sklearn's Lasso and LogisticRegression, and a
# project-specific Model base class providing get_splits() and .binary.
class SLearner(Model):
    def __init__(self, *args, **kwargs):
        self.reg = None
        self.l1_penalty = 0
        super(SLearner, self).__init__(*args, **kwargs)

    def fit(self,
            x,
            t,
            y,
            nfolds=5,
            lambdas=[1e3, 1e2, 1e1, 1e0, 1e-1, 1e-2, 1e-3],
            seed=282):

        splits = super().get_splits(x, nfolds, seed)

        #### CLASSIFICATION ####
        if self.binary:
            avg_bces = []
            for l in lambdas:
                bce_lst = []
                for train_index, valid_index in splits:
                    x_train, x_valid = x[train_index], x[valid_index]
                    t_train, t_valid = t[train_index], t[valid_index]
                    y_train, y_valid = y[train_index], y[valid_index]
                    reg = LogisticRegression(C=1 / l).fit(
                        self.get_predictors(x_train, t_train), y_train)
                    log_phat_valid = reg.predict_log_proba(
                        self.get_predictors(x_valid, t_valid))
                    # Negative mean log-likelihood of the true class, i.e.
                    # binary cross-entropy, so that argmin picks the best fit.
                    bce = -np.mean(log_phat_valid[np.arange(len(y_valid)),
                                                  y_valid.astype('int')])
                    bce_lst.append(bce)
                avg_bces.append(np.mean(bce_lst))
            self.l1_penalty = lambdas[np.argmin(avg_bces)]
            self.reg = LogisticRegression(C=1 / self.l1_penalty).fit(
                self.get_predictors(x, t), y)

        #### REGRESSION ####
        else:
            avg_rmses = []
            for l in lambdas:
                rmse_lst = []
                for train_index, valid_index in splits:
                    x_train, x_valid = x[train_index], x[valid_index]
                    t_train, t_valid = t[train_index], t[valid_index]
                    y_train, y_valid = y[train_index], y[valid_index]
                    reg = Lasso(alpha=l).fit(
                        self.get_predictors(x_train, t_train), y_train)
                    yhat_valid = reg.predict(
                        self.get_predictors(x_valid, t_valid))
                    rmse = np.sqrt(np.mean((yhat_valid - y_valid)**2))
                    rmse_lst.append(rmse)
                avg_rmses.append(np.mean(rmse_lst))
            self.l1_penalty = lambdas[np.argmin(avg_rmses)]
            self.reg = Lasso(alpha=self.l1_penalty).fit(
                self.get_predictors(x, t), y)

    def predict(self, x, t):
        if self.reg is None:
            raise RuntimeError('SLearner has not been fitted')

        #### CLASSIFICATION ####
        if self.binary:
            return self.reg.predict_proba(self.get_predictors(x, t))[:, 1]

        #### REGRESSION ####
        else:
            return self.reg.predict(self.get_predictors(x, t))

    def get_predictors(self, x, t):
        return np.hstack([x, (t - 0.5).reshape(-1, 1) * x])
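
The S-learner fits one penalized model on covariates plus treatment interactions, so a treatment-effect estimate is just the difference of two predict() calls. A minimal sketch, assuming a fitted instance `learner` and a covariate matrix `x_new` (both hypothetical names):

import numpy as np

n = x_new.shape[0]
# CATE estimate: predicted outcome under treatment minus under control.
tau_hat = learner.predict(x_new, np.ones(n)) - learner.predict(x_new, np.zeros(n))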
Example #3
import statistics

import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder


def evaluate1(X, y, num_op_unary, num_op_binary, max_order, num_batch,
              optimizer, lr, epochs, evaluate, task, dataset, model, alpha,
              lr_value, RL_model, reg, controller, num_random_sample, lambd,
              multiprocessing, package):

    if task == 'regression':
        if model == 'LR':
            model = Lasso()
        elif model == 'RF':
            model = RandomForestRegressor(n_estimators=10, random_state=0)

        if evaluate == 'mae':
            r_mae = cross_val_score(model,
                                    X,
                                    y,
                                    cv=5,
                                    scoring='neg_mean_absolute_error').mean()
            return r_mae
        elif evaluate == 'mse':
            r_mse = cross_val_score(model,
                                    X,
                                    y,
                                    cv=5,
                                    scoring='neg_mean_squared_error').mean()
            return r_mse
        elif evaluate == 'r2':
            r_r2 = cross_val_score(model, X, y, cv=5).mean()
            return r_r2
        elif evaluate == 'rae':
            # Relative absolute error via a manual 5-fold split; X is
            # assumed to be a pandas DataFrame and y a pandas Series.
            y_mean = statistics.mean(y)
            X1 = X.copy()
            y1 = y.copy()
            r_rae = 0
            Num = len(X.index)
            Seg = Num // 5

            for i in range(5):
                if (i == 0):
                    X_test = X1[i * Seg:Seg]
                    X_train = X1[Seg:]

                    y_test = y1[i * Seg:Seg]
                    y_train = y1[Seg:]

                elif (i == 4):
                    X_test = X1[i * Seg:]
                    X_train = X1[:i * Seg]

                    y_test = y1[i * Seg:]
                    y_train = y1[:i * Seg]

                else:
                    X_test = X1[i * Seg:(i + 1) * Seg]
                    y_test = y1[i * Seg:(i + 1) * Seg]

                    X_train = X1[:i * Seg]
                    y_train = y1[:i * Seg]

                    X_train2 = X1[(i + 1) * Seg:]
                    y_train2 = y1[(i + 1) * Seg:]

                    # DataFrame.append was removed in pandas 2.x; concat instead.
                    X_train = pd.concat([X_train, X_train2])
                    y_train = pd.concat([y_train, y_train2])

                model.fit(X_train, y_train)

                y_pred = model.predict(X_test)
                diff1 = y_pred - y_test
                diff2 = y_mean - y_test

                diff1_sum = 0
                for d in diff1:  # avoid shadowing the fold index i
                    diff1_sum += abs(d)

                diff2_sum = 0
                for d in diff2:
                    diff2_sum += abs(d)

                r_rae = 1 - (diff1_sum / diff2_sum) + r_rae

            return (r_rae / 5)

    elif task == 'classification':
        le = LabelEncoder()
        y = le.fit_transform(y)

        if model == 'RF':
            model = RandomForestClassifier(n_estimators=10, random_state=0)
        elif model == 'LR':
            model = LogisticRegression(multi_class='ovr')
        elif model == 'SVM':
            model = svm.SVC()
        if evaluate == 'f_score':
            s = cross_val_score(model, X, y, scoring='f1', cv=5).mean()
        elif evaluate == 'auc':
            # Same fixed 90/10 holdout as in evaluate() above; evaluate_
            # is a scoring helper defined elsewhere in the project.
            model = RandomForestClassifier(max_depth=10, random_state=0)
            split_pos = X.shape[0] // 10
            X_train, X_test = X[:9 * split_pos], X[9 * split_pos:]
            y_train, y_test = y[:9 * split_pos], y[9 * split_pos:]
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_test)
            s = evaluate_(y_test, y_pred)
        return s
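
The manual slicing in the 'rae' branch reimplements sequential 5-fold splitting by hand. A compact equivalent sketch using sklearn's KFold (the helper name rae_cv is illustrative); it computes the same per-fold score, 1 - sum|y_pred - y| / sum|y_mean - y|, with y_mean taken over the full series as above:

import numpy as np
from sklearn.model_selection import KFold

def rae_cv(model, X, y, n_splits=5):
    # X and y are assumed to be NumPy arrays here.
    y_mean = np.mean(y)
    scores = []
    for train_idx, test_idx in KFold(n_splits=n_splits).split(X):
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])
        scores.append(1 - np.sum(np.abs(y_pred - y[test_idx]))
                          / np.sum(np.abs(y_mean - y[test_idx])))
    return np.mean(scores)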
Example #4
# Notebook fragment: df, cl, C, architecture, and get_locations_train()
# are defined earlier in the source notebook.
y = get_locations_train()

X = df.copy()
X, y, cl = shuffle(X, y, cl)

dftest = pd.read_csv("test.bottleneck.{}.csv".format(architecture), index_col=0)
dftest.head()

p = []
pt = []
cv = StratifiedKFold(8)
# Note: StratifiedKFold.split() ignores its third (groups) argument;
# StratifiedGroupKFold would be needed to actually group by cl.
for i, (train, test) in enumerate(cv.split(X, y, cl)):
    print(np.unique(y[train]))
    print(X.iloc[train].shape, y[train].shape)
    # Lasso has no C parameter and no predict_proba; this multi-class
    # log-loss workflow needs a classifier, e.g. LogisticRegression.
    model = LogisticRegression(C=C).fit(X.iloc[train], y[train])
    print(log_loss(y[test], model.predict_proba(X.iloc[test]), labels=[0, 1, 2, 3, 4, 5, 6, 7]))
    p.append((y[test], model.predict_proba(X.iloc[test])))
    pt.append(model.predict_proba(dftest))

model = LogisticRegression(C=C).fit(X, y)
p = model.predict_proba(dftest.values)
sub = pd.read_csv("sample_submission_stg1.csv", index_col=0)
samp = sub.copy()
print(samp.mean(axis=0))

sub.loc[dftest.index, :] = p
print(sub.mean(axis=0))

sub.to_csv("sub.lr{:.5f}.bottleneck_{}.csv.gz".format(C, architecture), compression="gzip")

print(sub.head())
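
The loop above collects per-fold test-set probabilities in pt, but the final submission refits a single model on all of the data. A common alternative, sketched here, averages the fold models' predictions instead:

import numpy as np

p_avg = np.mean(pt, axis=0)  # ensemble of the 8 fold models
sub.loc[dftest.index, :] = p_avg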
Example #5
import numpy as np
import pandas as pd
from sklearn.linear_model import (Lasso, LassoCV, LogisticRegression,
                                  LogisticRegressionCV)
from sklearn.metrics import (balanced_accuracy_score, f1_score,
                             mean_squared_error, r2_score, roc_auc_score)
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score


def regularize_by_l1(X_train,
                     X_test,
                     y_train,
                     y_test,
                     all_features,
                     N_k,
                     task,
                     N_repeat,
                     seed_no=0):
    ## 0. Input arguments:
    # X_train: array that contains training feature data
    # X_test: array that contains testing feature data
    # y_train: array that contains training response data
    # y_test: array that contains testing response data
    # all_features: names of all features (column names of X_train)
    # N_k: number of folds to split into
    # task: type of supervised learning task: 'regression' or 'classification'
    # N_repeat: number of independent cross-validation runs, each run will generate one performance score
    # seed_no: seed for the initial split and final model; the repeated CV runs below use seeds 1..N_repeat

    ## 1. Perform regularized classification/regression based on the specified task
    # regression
    if task == 'regression':
        # split data into K folds
        kf = KFold(n_splits=N_k, random_state=seed_no, shuffle=True)
        # find the optimal alpha (regularization factor) using K-fold cross validation on training data
        cv_regressor = LassoCV(cv=kf, random_state=seed_no)
        cv_regressor.fit(X_train, y_train)
        best_alpha = cv_regressor.alpha_
        # fit lasso regression using the optimal alpha
        final_learner = Lasso(alpha=best_alpha)
        final_learner.fit(X_train, y_train)
        # obtain features selected by the fitted lasso model (nonzero coefficients)
        select_features = all_features[(final_learner.coef_ != 0).flatten()]
        N_select = len(select_features)
        # perform K-fold cross validation to obtain the training performance of fitted lasso regression model
        train_metric = []
        for i in range(0, N_repeat):
            cv_kf = KFold(n_splits=N_k, random_state=i + 1, shuffle=True)
            r2 = cross_val_score(final_learner,
                                 X_train,
                                 y_train,
                                 cv=cv_kf,
                                 scoring='r2')
            mse = cross_val_score(final_learner,
                                  X_train,
                                  y_train,
                                  cv=cv_kf,
                                  scoring='neg_mean_squared_error')
            train_metric.append({'r2': np.mean(r2), 'mse': np.mean(mse)})
        train_metric_df = pd.DataFrame(train_metric)
        # implement fitted lasso regression model on the testing set and obtain the testing performance
        y_pred = final_learner.predict(X_test)
        test_r2 = r2_score(y_test, y_pred)
        test_mse = mean_squared_error(y_test, y_pred)
        test_metric = {'r2': test_r2, 'mse': test_mse}

    # classification
    elif task == 'classification':
        # stratified split for classification tasks
        kf = StratifiedKFold(n_splits=N_k, random_state=seed_no, shuffle=True)
        # find the optimal C (regularization factor) using K-fold cross validation on training data
        cv_classifier = LogisticRegressionCV(penalty='l1',
                                             solver='liblinear',
                                             cv=kf,
                                             random_state=seed_no)
        cv_classifier.fit(X_train, y_train)
        best_c = float(cv_classifier.C_[0])
        # fit logistic regression using the optimal C
        final_learner = LogisticRegression(penalty='l1',
                                           solver='liblinear',
                                           C=best_c,
                                           random_state=seed_no)
        final_learner.fit(X_train, y_train)
        # obtain features selected by the fitted logistic regression model (nonzero coefficients)
        select_features = all_features[(final_learner.coef_ != 0).flatten()]
        N_select = len(select_features)
        # perform K-fold cross validation to obtain the training performance of fitted logistic regression model
        train_metric = []
        for i in range(0, N_repeat):
            cv_kf = StratifiedKFold(n_splits=N_k,
                                    random_state=i + 1,
                                    shuffle=True)
            auc = cross_val_score(final_learner,
                                  X_train,
                                  y_train,
                                  cv=cv_kf,
                                  scoring='roc_auc')
            bac = cross_val_score(final_learner,
                                  X_train,
                                  y_train,
                                  cv=cv_kf,
                                  scoring='balanced_accuracy')
            f1 = cross_val_score(final_learner,
                                 X_train,
                                 y_train,
                                 cv=cv_kf,
                                 scoring='f1')
            train_metric.append({
                'auc': np.mean(auc),
                'bac': np.mean(bac),
                'f1': np.mean(f1)
            })
        train_metric_df = pd.DataFrame(train_metric)
        # compare with testing response data, compute metrics
        y_pred_prob = final_learner.predict_proba(X_test)[:, 1]
        y_pred = final_learner.predict(X_test)
        test_auc = roc_auc_score(y_test, y_pred_prob)
        test_bac = balanced_accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)
        test_metric = {'auc': test_auc, 'bac': test_bac, 'f1': test_f1}

    return final_learner, select_features, train_metric_df, test_metric
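
A minimal call sketch for regularize_by_l1(), using synthetic data; feature names are passed as a NumPy array so the boolean mask over coef_ can index them directly (all names below are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=300, n_features=20, random_state=0)
names = np.array(['f%d' % j for j in range(X_all.shape[1])])
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=0)
learner, feats, train_df, test_scores = regularize_by_l1(
    X_tr, X_te, y_tr, y_te, names, N_k=5, task='classification', N_repeat=3)
print(list(feats), test_scores)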
Example #6
# Fragment: the head of this cross_validate() call was truncated in the
# source; `clf` stands in for the scaled-data model being evaluated.
scores = cross_validate(clf,
                        X_train_scale,
                        y_train,
                        cv=5,
                        scoring=['accuracy', 'roc_auc'],
                        return_train_score=True)

print('Cross Validation Results')
print('Train accuracy CV: %.4f' % scores['train_accuracy'].mean())
print('Test accuracy CV: %.4f' % scores['test_accuracy'].mean())
print('Test AUC: %.4f' % scores['test_roc_auc'].mean())

# In[408]:

clf = grid_search.best_estimator_
clf.fit(X_train_scale, y_train)
y_pred_prob = clf.predict_proba(X_test_scale)[:, 1]
y_pred = clf.predict(X_test_scale)

print('Training accuracy: %.2f%%' % (clf.score(X_train_scale, y_train) * 100))
print('Test accuracy: %.2f%%' % (clf.score(X_test_scale, y_test) * 100))
print('AUC: %.2f%%' % (metrics.roc_auc_score(y_test, y_pred_prob) * 100))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_prob)
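
fpr_log and tpr_log are computed above but never plotted in this excerpt; a minimal matplotlib sketch:

import matplotlib.pyplot as plt

plt.plot(fpr_log, tpr_log, label='logistic regression')
plt.plot([0, 1], [0, 1], 'k--', label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()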

# In[409]:

coefs = pd.DataFrame(clf.coef_[0],
                     index=X_train.columns,