# Consolidated imports for the snippets below.
import statistics

import numpy as np
import pandas as pd
from sklearn import metrics, svm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import (Lasso, LassoCV, LogisticRegression,
                                  LogisticRegressionCV)
from sklearn.metrics import (balanced_accuracy_score, classification_report,
                             confusion_matrix, f1_score, log_loss,
                             mean_squared_error, r2_score, roc_auc_score,
                             roc_curve)
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     cross_validate)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle


def evaluate(X, y, args):
    if args.task == 'regression':
        if args.model == 'LR':
            model = Lasso()
        elif args.model == 'RF':
            model = RandomForestRegressor(n_estimators=10, random_state=0)
        # The 'neg_*' scorers return negated errors, so higher is better.
        if args.evaluate == 'mae':
            return cross_val_score(model, X, y, cv=5,
                                   scoring='neg_mean_absolute_error').mean()
        elif args.evaluate == 'mse':
            return cross_val_score(model, X, y, cv=5,
                                   scoring='neg_mean_squared_error').mean()
        elif args.evaluate == 'r2':
            # R^2 is the default scorer for regressors.
            return cross_val_score(model, X, y, cv=5).mean()
    elif args.task == 'classification':
        le = LabelEncoder()
        y = le.fit_transform(y)
        if args.model == 'RF':
            model = RandomForestClassifier(n_estimators=10, random_state=0)
        elif args.model == 'LR':
            model = LogisticRegression(multi_class='ovr')
        elif args.model == 'SVM':
            model = svm.SVC()
        if args.evaluate == 'f_score':
            s = cross_val_score(model, X, y, scoring='f1', cv=5).mean()
        elif args.evaluate == 'auc':
            # AUC is scored on a single 90/10 holdout split with a fixed
            # random forest rather than by cross-validation.
            model = RandomForestClassifier(max_depth=10, random_state=0)
            split_pos = X.shape[0] // 10
            X_train, X_test = X[:9 * split_pos], X[9 * split_pos:]
            y_train, y_test = y[:9 * split_pos], y[9 * split_pos:]
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_test)
            s = evaluate_(y_test, y_pred)
        return s
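# `evaluate_` is not defined in this excerpt. Below is a minimal sketch of
# a compatible helper, assuming it scores predict_proba output with ROC
# AUC (binary or one-vs-rest multiclass); the original helper may differ.
def evaluate_(y_true, y_prob):
    # Binary case: score the positive-class column.
    if y_prob.shape[1] == 2:
        return roc_auc_score(y_true, y_prob[:, 1])
    # Multiclass case: macro-averaged one-vs-rest AUC.
    return roc_auc_score(y_true, y_prob, multi_class='ovr')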
class SLearner(Model):

    def __init__(self, *args, **kwargs):
        self.reg = None
        self.l1_penalty = 0
        super(SLearner, self).__init__(*args, **kwargs)

    def fit(self, x, t, y, nfolds=5,
            lambdas=[1e3, 1e2, 1e1, 1e0, 1e-1, 1e-2, 1e-3], seed=282):
        splits = super().get_splits(x, nfolds, seed)

        #### CLASSIFICATION ####
        if self.binary:
            avg_bces = []
            for l in lambdas:
                bce_lst = []
                for train_index, valid_index in splits:
                    x_train, x_valid = x[train_index], x[valid_index]
                    t_train, t_valid = t[train_index], t[valid_index]
                    y_train, y_valid = y[train_index], y[valid_index]
                    reg = LogisticRegression(C=1 / l).fit(
                        self.get_predictors(x_train, t_train), y_train)
                    log_phat_valid = reg.predict_log_proba(
                        self.get_predictors(x_valid, t_valid))
                    # Mean negative log-likelihood of the true class
                    # (binary cross-entropy); negated so that the argmin
                    # below selects the best lambda, not the worst.
                    bce = -np.mean(log_phat_valid[np.arange(len(y_valid)),
                                                  y_valid.astype('int')])
                    bce_lst.append(bce)
                avg_bces.append(np.mean(bce_lst))
            self.l1_penalty = lambdas[np.argmin(avg_bces)]
            self.reg = LogisticRegression(C=1 / self.l1_penalty).fit(
                self.get_predictors(x, t), y)

        #### REGRESSION ####
        else:
            avg_rmses = []
            for l in lambdas:
                rmse_lst = []
                for train_index, valid_index in splits:
                    x_train, x_valid = x[train_index], x[valid_index]
                    t_train, t_valid = t[train_index], t[valid_index]
                    y_train, y_valid = y[train_index], y[valid_index]
                    reg = Lasso(alpha=l).fit(
                        self.get_predictors(x_train, t_train), y_train)
                    yhat_valid = reg.predict(
                        self.get_predictors(x_valid, t_valid))
                    rmse = np.sqrt(np.mean((yhat_valid - y_valid) ** 2))
                    rmse_lst.append(rmse)
                avg_rmses.append(np.mean(rmse_lst))
            self.l1_penalty = lambdas[np.argmin(avg_rmses)]
            self.reg = Lasso(alpha=self.l1_penalty).fit(
                self.get_predictors(x, t), y)

    def predict(self, x, t):
        if self.reg is None:
            raise Exception('SLearner not initialized')
        #### CLASSIFICATION ####
        if self.binary:
            return self.reg.predict_proba(self.get_predictors(x, t))[:, 1]
        #### REGRESSION ####
        else:
            return self.reg.predict(self.get_predictors(x, t))

    def get_predictors(self, x, t):
        # S-learner design matrix: raw covariates plus a centered
        # treatment interaction (t - 0.5) * x.
        return np.hstack([x, (t - 0.5).reshape(-1, 1) * x])
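# Minimal usage sketch. The `Model` base class is not shown in this
# excerpt, so a hypothetical stub with the two members SLearner relies on
# (`binary` and `get_splits`) stands in for it here.
class _StubModel:
    def __init__(self, binary=False):
        self.binary = binary

    def get_splits(self, x, nfolds, seed):
        # Materialize the folds so they can be iterated once per lambda.
        return list(KFold(n_splits=nfolds, shuffle=True,
                          random_state=seed).split(x))

# With SLearner subclassing _StubModel instead of Model:
#     learner = SLearner(binary=False)
#     learner.fit(x, t, y)
#     cate = learner.predict(x, 1) - learner.predict(x, 0)  # per-unit effect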
def evaluate1(X, y, num_op_unary, num_op_binary, max_order, num_batch,
              optimizer, lr, epochs, evaluate, task, dataset, model, alpha,
              lr_value, RL_model, reg, controller, num_random_sample, lambd,
              multiprocessing, package):
    if task == 'regression':
        if model == 'LR':
            model = Lasso()
        elif model == 'RF':
            model = RandomForestRegressor(n_estimators=10, random_state=0)
        if evaluate == 'mae':
            return cross_val_score(model, X, y, cv=5,
                                   scoring='neg_mean_absolute_error').mean()
        elif evaluate == 'mse':
            return cross_val_score(model, X, y, cv=5,
                                   scoring='neg_mean_squared_error').mean()
        elif evaluate == 'r2':
            return cross_val_score(model, X, y, cv=5).mean()
        elif evaluate == 'rae':
            # print_file("rae")
            # Manual 5-fold CV computing the relative absolute error score
            # 1 - sum|y_hat - y| / sum|y_mean - y| on each fold.
            y_mean = statistics.mean(y)
            X1 = X.copy()
            y1 = y.copy()
            r_rae = 0
            Num = len(X.index)
            Seg = int(Num / 5)
            for i in range(5):
                if i == 0:
                    X_test, X_train = X1[:Seg], X1[Seg:]
                    y_test, y_train = y1[:Seg], y1[Seg:]
                elif i == 4:
                    X_test, X_train = X1[i * Seg:], X1[:i * Seg]
                    y_test, y_train = y1[i * Seg:], y1[:i * Seg]
                else:
                    X_test = X1[i * Seg:(i + 1) * Seg]
                    y_test = y1[i * Seg:(i + 1) * Seg]
                    # Training data is everything outside the test fold.
                    X_train = pd.concat([X1[:i * Seg], X1[(i + 1) * Seg:]])
                    y_train = pd.concat([y1[:i * Seg], y1[(i + 1) * Seg:]])
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                diff1 = y_pred - y_test
                diff2 = y_mean - y_test
                # Use a separate loop variable so the outer fold counter
                # `i` is not clobbered.
                diff1_sum = sum(abs(d) for d in diff1)
                diff2_sum = sum(abs(d) for d in diff2)
                r_rae += 1 - (diff1_sum / diff2_sum)
            return r_rae / 5
    elif task == 'classification':
        le = LabelEncoder()
        y = le.fit_transform(y)
        if model == 'RF':
            model = RandomForestClassifier(n_estimators=10, random_state=0)
        elif model == 'LR':
            model = LogisticRegression(multi_class='ovr')
        elif model == 'SVM':
            model = svm.SVC()
        if evaluate == 'f_score':
            s = cross_val_score(model, X, y, scoring='f1', cv=5).mean()
        elif evaluate == 'auc':
            model = RandomForestClassifier(max_depth=10, random_state=0)
            split_pos = X.shape[0] // 10
            X_train, X_test = X[:9 * split_pos], X[9 * split_pos:]
            y_train, y_test = y[:9 * split_pos], y[9 * split_pos:]
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_test)
            s = evaluate_(y_test, y_pred)
        return s
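# The 'rae' branch above computes, per fold, the relative absolute error
# score 1 - sum|y_hat - y| / sum|y_mean - y|. A vectorized equivalent of
# the per-fold metric (a sketch; `rae_score` is not part of the original):
def rae_score(y_true, y_pred, y_mean):
    # y_mean is the mean of the full response vector, matching the
    # baseline used in evaluate1.
    return 1 - np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(y_mean - y_true))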
y = get_locations_train()
X = df.copy()
X, y, cl = shuffle(X, y, cl)

dftest = pd.read_csv("test.bottleneck.{}.csv".format(architecture),
                     index_col=0)
dftest.head()

p = []
pt = []
cv = StratifiedKFold(8)
for i, (train, test) in enumerate(cv.split(X, y, cl)):
    print(np.unique(y[train]))
    print(X.iloc[train].shape, y[train].shape)
    # Regularized classifier with inverse strength C; Lasso takes no C
    # parameter and has no predict_proba, so LogisticRegression is used.
    model = LogisticRegression(C=C).fit(X.iloc[train], y[train])
    print(log_loss(y[test], model.predict_proba(X.iloc[test]),
                   labels=[0, 1, 2, 3, 4, 5, 6, 7]))
    p.append((y[test], model.predict_proba(X.iloc[test])))
    pt.append(model.predict_proba(dftest))

# Refit on all training data and predict the test set.
model = LogisticRegression(C=C).fit(X, y)
p = model.predict_proba(dftest.values)

sub = pd.read_csv("sample_submission_stg1.csv", index_col=0)
samp = sub.copy()
print(samp.mean(axis=0))
sub.loc[dftest.index, :] = p
print(sub.mean(axis=0))
sub.to_csv("sub.lr{:.5f}.bottleneck_{}.csv.gz".format(C, architecture),
           compression="gzip")
print(sub.head())
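# The per-fold (y_test, probabilities) pairs collected in `p` above can be
# aggregated into an overall cross-validated log-loss. A minimal sketch
# (`cv_log_loss` is illustrative; call it before `p` is overwritten by the
# test-set predictions):
def cv_log_loss(fold_preds):
    y_all = np.concatenate([y_t for y_t, _ in fold_preds])
    prob_all = np.vstack([pr for _, pr in fold_preds])
    return log_loss(y_all, prob_all, labels=[0, 1, 2, 3, 4, 5, 6, 7])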
def regularize_by_l1(X_train, X_test, y_train, y_test, all_features, N_k,
                     task, N_repeat, seed_no=0):
    ## 0. Input arguments:
    # X_train: array that contains training feature data
    # X_test: array that contains testing feature data
    # y_train: array that contains training response data
    # y_test: array that contains testing response data
    # all_features: names of all features (column names of X_train)
    # N_k: number of folds to split into
    # task: type of supervised learning task: 'regression' or 'classification'
    # N_repeat: number of independent cross-validation runs; each run generates one performance score
    # seed_no: seed number used in the first run; 'seed_no + 1' is used for the second run, ...

    ## 1. Perform regularized classification/regression based on the specified task
    # regression
    if task == 'regression':
        # split data into K folds
        kf = KFold(n_splits=N_k, random_state=seed_no, shuffle=True)
        # find the optimal alpha (regularization factor) using K-fold cross-validation on training data
        cv_regressor = LassoCV(cv=kf, random_state=seed_no)
        cv_regressor.fit(X_train, y_train)
        best_alpha = cv_regressor.alpha_
        # fit lasso regression using the optimal alpha
        final_learner = Lasso(alpha=best_alpha)
        final_learner.fit(X_train, y_train)
        # obtain features selected by the fitted lasso model (features with nonzero coefficients)
        select_features = all_features[(final_learner.coef_ != 0).flatten()]
        N_select = len(select_features)
        # perform K-fold cross-validation to obtain the training performance of the fitted lasso model
        train_metric = []
        for i in range(N_repeat):
            cv_kf = KFold(n_splits=N_k, random_state=i + 1, shuffle=True)
            r2 = cross_val_score(final_learner, X_train, y_train, cv=cv_kf,
                                 scoring='r2')
            mse = cross_val_score(final_learner, X_train, y_train, cv=cv_kf,
                                  scoring='neg_mean_squared_error')
            train_metric.append({'r2': np.mean(r2), 'mse': np.mean(mse)})
        train_metric_df = pd.DataFrame(train_metric)
        # apply the fitted lasso model to the testing set and obtain the testing performance
        y_pred = final_learner.predict(X_test)
        test_metric = {'r2': r2_score(y_test, y_pred),
                       'mse': mean_squared_error(y_test, y_pred)}

    # classification
    if task == 'classification':
        # stratified split for classification tasks
        kf = StratifiedKFold(n_splits=N_k, random_state=seed_no, shuffle=True)
        # find the optimal C (inverse regularization factor) using K-fold cross-validation on training data
        cv_classifier = LogisticRegressionCV(penalty='l1', solver='liblinear',
                                             cv=kf, random_state=seed_no)
        cv_classifier.fit(X_train, y_train)
        best_c = float(cv_classifier.C_[0])
        # fit logistic regression using the optimal C
        final_learner = LogisticRegression(penalty='l1', solver='liblinear',
                                           C=best_c, random_state=seed_no)
        final_learner.fit(X_train, y_train)
        # obtain features selected by the fitted logistic regression model (features with nonzero coefficients)
        select_features = all_features[(final_learner.coef_ != 0).flatten()]
        N_select = len(select_features)
        # perform K-fold cross-validation to obtain the training performance of the fitted model
        train_metric = []
        for i in range(N_repeat):
            cv_kf = StratifiedKFold(n_splits=N_k, random_state=i + 1,
                                    shuffle=True)
            auc = cross_val_score(final_learner, X_train, y_train, cv=cv_kf,
                                  scoring='roc_auc')
            bac = cross_val_score(final_learner, X_train, y_train, cv=cv_kf,
                                  scoring='balanced_accuracy')
            f1 = cross_val_score(final_learner, X_train, y_train, cv=cv_kf,
                                 scoring='f1')
            train_metric.append({'auc': np.mean(auc), 'bac': np.mean(bac),
                                 'f1': np.mean(f1)})
        train_metric_df = pd.DataFrame(train_metric)
        # compare with testing response data and compute metrics
        y_pred_prob = final_learner.predict_proba(X_test)[:, 1]
        y_pred = final_learner.predict(X_test)
        test_metric = {'auc': roc_auc_score(y_test, y_pred_prob),
                       'bac': balanced_accuracy_score(y_test, y_pred),
                       'f1': f1_score(y_test, y_pred)}

    return final_learner, select_features, train_metric_df, test_metric
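# A minimal usage sketch with synthetic data (the feature names and split
# below are illustrative, not from the original code):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=200, n_features=20,
                                   random_state=0)
feature_names = np.array(['f%d' % j for j in range(X_all.shape[1])])
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.25,
                                          random_state=0)
learner, selected, train_df, test_scores = regularize_by_l1(
    X_tr, X_te, y_tr, y_te, feature_names, N_k=5,
    task='classification', N_repeat=3)
print(len(selected), test_scores)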
# The head of this call is missing in the source; it is reconstructed
# here assuming sklearn's cross_validate (which matches the score keys
# used below) applied to the grid-searched estimator.
scores = cross_validate(grid_search.best_estimator_, X_train_scale, y_train,
                        cv=5, scoring=['accuracy', 'roc_auc'],
                        return_train_score=True)
print('Cross Validation Results')
print('Train accuracy CV: %.4f' % scores['train_accuracy'].mean())
print('Test accuracy CV: %.4f' % scores['test_accuracy'].mean())
print('Test AUC: %.4f' % scores['test_roc_auc'].mean())

# In[408]:

clf = grid_search.best_estimator_
clf.fit(X_train_scale, y_train)
y_pred_prob = clf.predict_proba(X_test_scale)[:, 1]
y_pred = clf.predict(X_test_scale)
print('Training accuracy: %.2f%%' % (clf.score(X_train_scale, y_train) * 100))
print('Test accuracy: %.2f%%' % (clf.score(X_test_scale, y_test) * 100))
print('AUC: %.2f%%' % (metrics.roc_auc_score(y_test, y_pred_prob) * 100))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_prob)

# In[409]:

# Coefficients of the fitted linear classifier, indexed by feature name.
# The source referenced an undefined `clf_gridsearch`; `clf` is the
# fitted estimator in scope. The trailing arguments of this call were
# cut off in the source and are left out here.
coefs = pd.DataFrame(clf.coef_[0], index=X_train.columns)
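# fpr_log / tpr_log computed above feed a ROC curve plot. A minimal
# sketch (assumes matplotlib, which this excerpt does not import):
import matplotlib.pyplot as plt

plt.plot(fpr_log, tpr_log, label='logistic regression')
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()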