import math

import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss


def learn(learning_rate, X_train, y_train, X_test, y_test):
    model = GradientBoostingClassifier(
        n_estimators=250,
        verbose=True,
        random_state=241,
        learning_rate=learning_rate
    )
    model.fit(X_train, y_train)

    # compute per-iteration log-loss, then plot the curves
    test_score = list(range(250))
    train_score = list(range(250))
    for i, predictions in enumerate(model.staged_decision_function(X_test)):
        predictions = [x[0] for x in predictions.tolist()]  # flatten the (n_samples, 1) output
        predictions = [1 / (1 + math.exp(-x)) for x in predictions]  # sigmoid
        test_score[i] = log_loss(y_test, predictions)
    for i, predictions in enumerate(model.staged_decision_function(X_train)):
        predictions = [x[0] for x in predictions.tolist()]  # flatten the (n_samples, 1) output
        predictions = [1 / (1 + math.exp(-x)) for x in predictions]  # sigmoid
        train_score[i] = log_loss(y_train, predictions)

    plt.figure()
    plt.plot(test_score, 'r', linewidth=2)
    plt.plot(train_score, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.show()
    return train_score, test_score
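Most snippets here apply the sigmoid 1 / (1 + e^{-score}) to the output of staged_decision_function before calling log_loss. A minimal sanity-check sketch of why this works (not part of any solution above; it assumes a small synthetic dataset from sklearn.datasets): for a binary GradientBoostingClassifier, the sigmoid of the staged scores matches the positive-class column of staged_predict_proba, so either form can feed log_loss.

# Sanity-check sketch: sigmoid(staged_decision_function) == staged_predict_proba[:, 1]
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=200, random_state=0)  # assumed toy data
clf = GradientBoostingClassifier(n_estimators=10, random_state=0)
clf.fit(X, y)
for scores, probs in zip(clf.staged_decision_function(X),
                         clf.staged_predict_proba(X)):
    # scores has shape (n_samples, 1); probs has shape (n_samples, 2)
    assert np.allclose(1 / (1 + np.exp(-scores.ravel())), probs[:, 1])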
test_deviance = {}


def sigmoid(y_pred):
    return 1 / (1 + math.e ** (-y_pred))


learning_rates = [1, 0.5, 0.3, 0.2, 0.1]
for learning_rate in learning_rates:
    model = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                       random_state=241, learning_rate=learning_rate)
    model.fit(X_train, y_train)

    # compute test set deviance
    test_deviance[learning_rate] = np.zeros((250,), dtype=np.float64)
    for i, y_pred in enumerate(model.staged_decision_function(X_test)):
        # clf.loss_ assumes that y_test[i] in {0, 1}
        test_deviance[learning_rate][i] = log_loss(y_test, sigmoid(y_pred))
    plt.plot((np.arange(test_deviance[learning_rate].shape[0]) + 1)[::5],
             test_deviance[learning_rate][::5],
             '-', label='learning_rate = {}'.format(learning_rate))

plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Test Set Deviance')
plt.show()

# 3. Starting from some iteration, how should the quality curve on the test set
# be characterized: overfitting or underfitting?
# Answer with one of the words "overfitting" or "underfitting".
print('overfitting')
X = df.iloc[:, 1:].values  # .values replaces the removed DataFrame.as_matrix()
y = df.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

for l in [1, 0.5, 0.3, 0.2, 0.1]:
    cf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                    random_state=241, learning_rate=l)
    cf.fit(X_train, y_train)

    train_loss = []
    test_loss = []

    # log loss for train set
    for stage, array in enumerate(cf.staged_decision_function(X_train)):
        # apply sigmoid function
        transformed = []
        for row in array:
            transformed.append(float(1) / (1 + np.exp(-row[0])))
        # calculate metric
        score = log_loss(y_train, transformed)
        train_loss.append(score)

    # log loss for test set
    for stage, array in enumerate(cf.staged_decision_function(X_test)):
        # apply sigmoid function
        transformed = []
        for row in array:
            transformed.append(float(1) / (1 + np.exp(-row[0])))
        # calculate metric
        score = log_loss(y_test, transformed)
        test_loss.append(score)
from sklearn.ensemble import RandomForestClassifier

clfR = RandomForestClassifier(n_estimators=250, verbose=True, random_state=241)
clfR.fit(X_train, y_train)
print(log_loss(y_test, clfR.predict_proba(X_test)))

########################################

clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                 random_state=241, learning_rate=0.2)
clf.fit(X_train, y_train)

# per-iteration sigmoid-transformed predictions on the test set
sdf = []
for k, y_pred in enumerate(clf.staged_decision_function(X_test)):
    sdf.append([])
    for i in y_pred:
        sdf[k].append(1 / (1 + math.exp(-i[0])))

for k, i in enumerate(sdf):
    print(str(k) + " " + str(log_loss(y_true=y_test, y_pred=i)))

a = []
for i in sdf:
    a.append(log_loss(y_true=y_test, y_pred=i))
print(min(a))
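An aside on the log_loss call above: sklearn.metrics.log_loss accepts either the full predict_proba matrix or just the positive-class column for a binary task, and both forms appear in the snippets in this collection. A minimal check (the toy arrays are illustrative, not from any solution):

import numpy as np
from sklearn.metrics import log_loss

y_true = np.array([0, 1, 1, 0])
proba = np.array([[0.8, 0.2], [0.3, 0.7], [0.4, 0.6], [0.9, 0.1]])
# full (n_samples, 2) matrix and the positive-class column give the same loss
assert np.isclose(log_loss(y_true, proba), log_loss(y_true, proba[:, 1]))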
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

data = pandas.read_csv('gbm-data.csv').values
y = data[:, 0]
X = data[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=24)

train_loss_learning_rate = []
test_loss_learning_rate = []
for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
    GBC = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=250,
                                     verbose=True, random_state=241)
    GBC.fit(X_train, y_train)
    train_loss = []
    test_loss = []
    for i, pred in enumerate(GBC.staged_decision_function(X_train)):
        y_ = 1 / (1 + np.exp(-pred))  # sigmoid
        train_loss.append(log_loss(y_train, y_))
    train_loss_learning_rate.append(train_loss)
    for i, pred in enumerate(GBC.staged_decision_function(X_test)):
        y_ = 1 / (1 + np.exp(-pred))
        test_loss.append(log_loss(y_test, y_))
    test_loss_learning_rate.append(test_loss)

a = np.array(train_loss_learning_rate)
b = np.array(test_loss_learning_rate)
farbe = ['orange', 'turquoise', 'blue', 'gray', 'magenta']
learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
plt.figure(num=1)
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.8, random_state=241)

learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
loss_train = []
loss_test = []
min_loss = []

gbc = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                 random_state=241, learning_rate=0.2)
gbc.fit(X_train, y=y_train)
score_train = gbc.staged_decision_function(X_train)
score_test = gbc.staged_decision_function(X_test)
for pred in score_train:
    loss_train.append(
        log_loss(y_train, [1 / (1 + math.exp((-1) * y_pred)) for y_pred in pred]))
for pred in score_test:
    loss_test.append(
        log_loss(y_test, [1 / (1 + math.exp((-1) * y_pred)) for y_pred in pred]))
min_loss.append(min(loss_test))

min_value = min(min_loss)
min_index = min_loss.index(min_value)

rfc = RandomForestClassifier(n_estimators=300, random_state=241)
y = data[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)


def sigmoid(arr):
    return 1. / (1. + np.exp(-arr))


# for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
for learning_rate in [0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250,
                                     learning_rate=learning_rate,
                                     # verbose=True,
                                     random_state=241)
    clf.fit(X_train, y_train)

    predict_train_by_iter = clf.staged_decision_function(X_train)
    predict_test_by_iter = clf.staged_decision_function(X_test)

    loss_train_by_iter = []
    loss_test_by_iter = []
    for predict in predict_train_by_iter:
        loss_value = log_loss(y_train, sigmoid(predict))
        loss_train_by_iter.append(loss_value)
    for predict in predict_test_by_iter:
        loss_value = log_loss(y_test, sigmoid(predict))
        loss_test_by_iter.append(loss_value)

    min_loss_index = np.argmin(loss_test_by_iter)
    print('learning_rate=%s, min_loss_value=%s, iteration(from 1)=%s' % (
        learning_rate, loss_test_by_iter[min_loss_index], min_loss_index + 1))
def sigmoid(y):
    return 1. / (1 + np.exp(-y))


for i in [1, 0.5, 0.3, 0.2, 0.1]:
    gbt = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=i)
    gbt.fit(X_train, y_train)

    train_loss = []
    test_loss = []
    for j, y_pred in enumerate(gbt.staged_decision_function(X_train)):
        train_loss.append(log_loss(y_train, sigmoid(y_pred)))
    for j, y_pred in enumerate(gbt.staged_decision_function(X_test)):
        test_loss.append(log_loss(y_test, sigmoid(y_pred)))

    min_train_loss = np.min(train_loss)
    iter_train = np.argmin(train_loss)
    min_test_loss = np.min(test_loss)
    iter_test = np.argmin(test_loss)
    print("{}:\nmin train_loss {} on iteration {}".format(
        gbt, min_train_loss, iter_train))
    print("{}:\nmin test_loss {} on iteration {}".format(
        gbt, min_test_loss, iter_test))
X = df.drop(['Activity'], axis=1).values


# In[4]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)


# In[17]:

gbm_model = GradientBoostingClassifier(n_estimators=250, learning_rate=0.2,
                                       verbose=True, random_state=241)
gbm_model.fit(X_train, y_train)


# In[18]:

arr = []
for i in gbm_model.staged_decision_function(X_test):
    arr.append(log_loss(y_test, [(1.0 / (1.0 + math.exp(-j))) for j in i]))
min(arr)


# In[27]:

rf_model = RandomForestClassifier(n_estimators=36, random_state=241)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict_proba(X_test)[:, 1]
log_loss(y_test, y_pred)
"predicted_den.npy") true_density_RF = np.load( "/Users/lls/Documents/CODE/stored_files/shear/classification/density_only/true_den.npy" ) pred_all = np.array([pred_i, pred_density_RF]) true_all = np.array([true_test, true_density_RF]) fpr, tpr, auc, fig = get_multiple_rocs(pred_all, true_all, labels=["GBT", "RF"]) plt.savefig(path + "roc_vs_RF.png") # score test vs train score_test = np.zeros(clf.n_estimators, ) for i, y_pred in enumerate(clf.staged_decision_function(testing_features)): score_test[i] = clf.loss_(true_test, y_pred) score_train = np.zeros(clf.n_estimators, ) for i, y_pred in enumerate(clf.staged_decision_function(training_features)): score_train[i] = clf.loss_(true_train, y_pred) score_train -= score_train[0] score_test -= score_test[0] plt.figure() plt.plot(np.arange(clf.n_estimators), score_train, label="score train") plt.plot(np.arange(clf.n_estimators), score_test, label="score test") plt.ylabel("Loss") plt.legend(loc="best") plt.xlabel("N estimators")
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  # cross_validation was removed in newer sklearn
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as met

df = pandas.read_csv("gbm-data.csv")
vals = df.values
X_train, X_test, y_train, y_test = train_test_split(vals[:, 1:], vals[:, 0],
                                                    test_size=0.8, random_state=241)

# for lr in [1, 0.5, 0.3, 0.2, 0.1]:
clf = GradientBoostingClassifier(learning_rate=1, n_estimators=250,
                                 verbose=False, random_state=241)
clf.fit(X_train, y_train)

sc_train = enumerate(clf.staged_decision_function(X_train))
sc_test = enumerate(clf.staged_decision_function(X_test))
train_loss = {}
test_loss = {}
for i, y_predicted in sc_train:
    train_loss[i] = met.log_loss(y_train, 1 / (1 + np.exp(-y_predicted)))
for i, y_predicted in sc_test:
    test_loss[i] = met.log_loss(y_test, 1 / (1 + np.exp(-y_predicted)))

plt.figure()
plt.plot(list(test_loss.values()), 'r', linewidth=2)
plt.plot(list(train_loss.values()), 'g', linewidth=2)
plt.legend(['test', 'train'])
plt.show()
import math

import pandas
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

data = pandas.read_csv('gbm-data.csv')
X = data.drop('Activity', axis=1)
y = data['Activity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

for learning_rate in [0.2]:
    cls = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=learning_rate)
    cls.fit(X_train, y_train)
    print(cls.learning_rate)

    sigma_func = lambda x: 1 / (1 + math.e ** (-x))
    sdc_train = list(cls.staged_decision_function(X_train))
    sdc_test = list(cls.staged_decision_function(X_test))
    for i in range(250):
        pred_train = list(map(sigma_func, sdc_train[i]))
        pred_test = list(map(sigma_func, sdc_test[i]))
        loss_train = log_loss(y_train, pred_train)
        loss_test = log_loss(y_test, pred_test)
        print(i, loss_train, loss_test)

clf = RandomForestClassifier(n_estimators=36, random_state=241)
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)
print(log_loss(y_test, pred))
y = np_data[:, 0]
X = np_data[:, 1:]
# X = data.drop('Activity', axis=1).values
# y = data.Activity.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

# for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
for learning_rate in [0.2]:
    gbc = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=250,
                                     verbose=True, random_state=241)
    gbc.fit(X=X_train, y=y_train)

    staged_decision_test = gbc.staged_decision_function(X_test)

    test_loss = np.empty(250)
    for i, y_pred in enumerate(staged_decision_test):
        y_pred = 1.0 / (1.0 + np.exp(-y_pred))  # sigmoid
        test_loss[i] = log_loss(y_test, y_pred)
    print(test_loss.max())

    if learning_rate == 0.2:
        print('learning_rate == 0.2')
        lr02_min = test_loss.min()
        lr02_idxmin = test_loss.argmin()
        print(lr02_min)
        print(lr02_idxmin)

with open('/home/dima/lr_w5_z2_1_1.txt', 'w') as out:
y = np.array(df['Activity'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)


def sigmoid(y_pred):
    return 1 / (1 + np.exp(-y_pred))


rates = [1, 0.5, 0.3, 0.2, 0.1]
r = 0.2
clf = GradientBoostingClassifier(n_estimators=250, learning_rate=r,
                                 verbose=True, random_state=241)
clf.fit(X_train, y_train)

test_loss = [
    (i, log_loss(y_test, sigmoid(y_pred)))
    for i, y_pred in enumerate(clf.staged_decision_function(X_test))
]
train_loss = [
    (i, log_loss(y_train, sigmoid(y_pred)))
    for i, y_pred in enumerate(clf.staged_decision_function(X_train))
]

# plt.figure()
# plt.plot([loss for i, loss in test_loss], 'r', linewidth=2)
# plt.plot([loss for i, loss in train_loss], 'g', linewidth=2)
# plt.legend(['test', 'train'])
# plt.show()

printAndWriteAnswer(1, 'overfitting')
def gb(data):
    X = data[data.columns.values[1:]].values
    y = data[data.columns.values[:1]].values.ravel()
    N = len(y)
    X_train, X_test, y_train, y_test = \
        cv.train_test_split(X, y, test_size=0.8, random_state=241)

    # ------------------------------------------------------
    # Deal with Gradient Boosting
    # ------------------------------------------------------
    # Reserve an array to store iteration with min log_loss for each learning rate
    min_iterations_train = []
    min_iterations_test = []

    # Fit Gradient Boosting Classifiers with different learning rates
    learning_rates = [1, 0.5, 0.3, 0.2, 0.1]
    for lr in learning_rates:
        print("GB learning rate = ", lr)

        # Fit the classifier
        gbclf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                           random_state=241, learning_rate=lr)
        gbclf.fit(X_train, y_train)

        # Get log_loss errors after every iteration of the Gradient Boosting
        y_train_pred = gbclf.staged_decision_function(X_train)
        log_loss_train = []
        for y_t_p in y_train_pred:
            log_loss_train.append(log_loss(y_train, 1 / (1 + np.exp(-y_t_p))))

        y_test_pred = gbclf.staged_decision_function(X_test)
        log_loss_test = []
        for y_t_p in y_test_pred:
            log_loss_test.append(log_loss(y_test, 1 / (1 + np.exp(-y_t_p))))

        # Min log-loss and the corresponding iteration
        log_loss_train_min_ind = np.argmin(log_loss_train) + 1
        log_loss_test_min_ind = np.argmin(log_loss_test) + 1
        log_loss_train_min = np.min(log_loss_train)
        log_loss_test_min = np.min(log_loss_test)
        min_iterations_train.append((log_loss_train_min, log_loss_train_min_ind))
        min_iterations_test.append((log_loss_test_min, log_loss_test_min_ind))

        # Plot the errors for both TRAIN and TEST sets (w/ the curr Learning Rate)
        plt.figure('GB learning rate: ' + str(lr))
        plt.plot(log_loss_test, 'r', linewidth=2)
        plt.plot(log_loss_train, 'g', linewidth=2)
        plt.legend(['log_loss_test', 'log_loss_train'])
        plt.draw()

    # Optimal TEST iteration for the learning rate 0.2
    print('Optimal iterations TEST vs. learning rate:')
    for t in zip(min_iterations_test, learning_rates):
        print('min: ', t[0][0], 'min_ind: ', t[0][1], 'learning rate: ', t[1])
    t = [(x[0], x[1]) for x, y in zip(min_iterations_test, learning_rates) if y == 0.2]
    opt_log_loss = t[0][0]
    opt_log_loss_ind = t[0][1]
    writefile('%0.2f %d' % (opt_log_loss, opt_log_loss_ind), 'log-loss-0.2.out')

    # ------------------------------------------------------
    # Deal with Random Forests
    # ------------------------------------------------------
    clf = RandomForestClassifier(n_estimators=opt_log_loss_ind, random_state=241)
    clf.fit(X_train, y_train)
    y_test_pred_rf = clf.predict_proba(X_test)
    log_loss_test_rf = log_loss(y_test, y_test_pred_rf)

    # log-loss over the test set using Random Forests
    writefile('%0.2f' % (log_loss_test_rf), 'log-loss-rf.out')

    return 0
plt.style.use('ggplot')

df = pd.read_csv('gbm-data.csv')
val = df.values
X = val[:, 1:]
y = val[:, 0]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=241)

# learning_rates = [1, 0.5, 0.3, 0.2, 0.1]
learning_rates = [0.2]
sigmoid = lambda x: 1 / (1 + np.exp(-x))
log_loss_test = []
for l in learning_rates:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=l)
    print('fitting...')
    clf.fit(X_train, y_train)
    print('building staged_decision_function')
    staged_dec = clf.staged_decision_function(X_test)
    for pred in staged_dec:
        y_pred = sigmoid(pred)
        log_loss_test.append(log_loss(y_test, y_pred))

best_iter = [np.argmin(log_loss_test), log_loss_test[np.argmin(log_loss_test)]]

# clf1 = RandomForestClassifier(n_estimators=37, random_state=241)
# clf1.fit(X_train, y_train)
# prediction = clf1.predict_proba(X_test)
# res = log_loss(y_test, prediction)
df = pandas.read_csv('gbm-data.csv', index_col=None)  # 1
dfa = df.values
X = dfa[:, 1:]
y = dfa[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)


def sigma(y_pred):
    return 1 / (1 + np.exp(-y_pred))


# for rate in [1, 0.5, 0.3, 0.2, 0.1]:  # 2
for rate in [0.2]:  # 2
    print(rate)
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=rate)
    clf.fit(X_train, y_train)

    sigma_y_train = [sigma(y) for y in clf.staged_decision_function(X_train)]
    sigma_y_test = [sigma(y) for y in clf.staged_decision_function(X_test)]
    log_loss_train = [log_loss(y_train, y) for y in sigma_y_train]
    log_loss_test = [log_loss(y_test, y) for y in sigma_y_test]

    min_log_loss_test = min(log_loss_test)
    it_min_log_loss_test = log_loss_test.index(min_log_loss_test)
    print(">>>> it: ", it_min_log_loss_test, " val: ", min_log_loss_test)  # 4

    if rate == 0.2:  # 5
        rf = RandomForestClassifier(random_state=241, n_estimators=it_min_log_loss_test)
        rf.fit(X_train, y_train)
        tree_log_loss_test = log_loss(y_test, rf.predict_proba(X_test)[:, 1])
        print(">>>>>>>> rf log_loss val: ", tree_log_loss_test)

plt.figure()
X = data_values[:, 1:]
y = data_values[:, 0]

# Split the sample into train and test parts with train_test_split,
# using test_size=0.8 and random_state=241.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

# 2
# Fit a GradientBoostingClassifier with n_estimators=250, verbose=True, random_state=241,
# and do the following for each learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
for lr in [1, 0.5, 0.3, 0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=lr)
    clf.fit(X_train, y_train)

    # Use staged_decision_function to predict the score on the train and test
    # sets at every iteration, and transform each prediction with the sigmoid
    # 1 / (1 + e^{-y_pred}), where y_pred is the predicted score.
    score_prediction_train_mod = [1 / (1 + np.exp(-s))
                                  for s in clf.staged_decision_function(X_train)]
    score_prediction_test_mod = [1 / (1 + np.exp(-s))
                                 for s in clf.staged_decision_function(X_test)]

    # Compute and plot the log-loss values (sklearn.metrics.log_loss) on the
    # train and test sets, and find the minimum of the metric and the iteration
    # at which it is reached.
    log_loss_graph_train = [log_loss(y_train, s) for s in score_prediction_train_mod]
    log_loss_graph_test = [log_loss(y_test, s) for s in score_prediction_test_mod]
    print("%s -> min ll[train] = %s -> min ll[test] = %s"
          % (lr, min(log_loss_graph_train), min(log_loss_graph_test)))
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data_ar[:, 1:], data_ar[:, 0], random_state=241, test_size=0.8)

learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
res = {}
plt.figure(figsize=(6, 30))
for i, lr in enumerate(learning_rate):
    classifier = GradientBoostingClassifier(learning_rate=lr, n_estimators=250,
                                            random_state=241, verbose=True)
    classifier.fit(X_train, y_train)

    train_staged_decision = classifier.staged_decision_function(X_train)
    test_staged_decision = classifier.staged_decision_function(X_test)
    sigmoid_train = [
        1 / (1 + np.exp(-y_pred)) for y_pred in train_staged_decision
    ]
    sigmoid_test = [
        1 / (1 + np.exp(-y_pred)) for y_pred in test_staged_decision
    ]

    predictions_train = classifier.predict_proba(X_train)
    predictions_test = classifier.predict_proba(X_test)

    log_loss_train = [
        metrics.log_loss(y_train, iteration_pred)
        for iteration_pred in sigmoid_train
    ]
    log_loss_test = [
        metrics.log_loss(y_test, iteration_pred)
        for iteration_pred in sigmoid_test
    ]
# iteration = train_loss.index(min_metric)
# elif test_min < min_metric:
#     min_metric = test_min
#     iteration = test_loss.index(min_metric)
# print('iter = {} val = {:.2}'.format(iteration, test_min))
# list_of_mins.append(test_min)

# plt.figure()
# plt.plot(test_loss, 'r', linewidth=2)
# plt.plot(train_loss, 'g', linewidth=2)
# plt.legend(['test', 'train'])
# plt.show()

# print('FINAL:\niter = {} val = {:.2}'.format(iteration, test_min))
# -------------------------------------------------------------------------------------------------------------------------

gbc = GradientBoostingClassifier(n_estimators=250, random_state=241, learning_rate=0.2)
gbc.fit(train_matrix, train_vec)

test_loss = []
iter_list = []
for i, pred in enumerate(gbc.staged_decision_function(test_matrix)):
    iter_list.append(i)
    test_loss.append(log_loss(test_vec, sigmoid(pred)))
test_min = np.amin(test_loss)

# prints iteration with lowest loss (loss = 0.53, iteration = 36)
# print('{:.2} {}'.format(test_min, iter_list[test_loss.index(test_min)]), end='')

# -------------------------------------------------------------------------------------------------------------------------
# find log loss of rfc's prediction using iterations found in prev task as amount of estimators
rfc = RandomForestClassifier(n_estimators=iter_list[test_loss.index(test_min)], random_state=241)
rfc.fit(train_matrix, train_vec)
print('{:.2}'.format(log_loss(test_vec, rfc.predict_proba(test_matrix))), end='')  # log loss is 0.54
for rate in [1, 0.5, 0.3, 0.2, 0.1]:
    # training the model
    clf = GradientBoostingClassifier(learning_rate=rate, n_estimators=250,
                                     verbose=True, random_state=241)
    clf.fit(X_train, y_train)

    # initializing lists for losses
    test_loss = []
    train_loss = []

    # filling them with values from stages of model's decision-making,
    # using log_loss between the true class and the sigmoid of the prediction
    for y_pred in clf.staged_decision_function(X_train):
        train_loss.append(log_loss(y_train, 1 / (1 + np.exp(-y_pred))))
    for y_pred in clf.staged_decision_function(X_test):
        test_loss.append(log_loss(y_test, 1 / (1 + np.exp(-y_pred))))

    # observing minimum losses for each rate
    print(f'''
    for learning_rate={rate}
    train loss: min, iteration are {min(train_loss)}, {np.argmin(train_loss)}
    test loss: min, iteration are {min(test_loss)}, {np.argmin(test_loss)}
    ''')

    # plotting losses without blocking the loop
    plt.figure()
    plt.title(f'Train and test losses for learning_rate={rate}')
    plt.xlabel('Iteration')
gbt_noRand05 = GradientBoostingClassifier(loss='deviance', learning_rate=0.05,
                                          n_estimators=500, subsample=1.0,
                                          min_samples_split=20,
                                          min_samples_leaf=10, max_depth=4)

# Fit the model
gbt_noRand05.fit(X_train, y_train)

niter = 500
iter = np.arange(niter) + 1
test_deviance = np.zeros((niter, ), dtype=np.float64)

# staged_decision_function: decision function at each iteration
for i, y_pred in enumerate(gbt_noRand05.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    test_deviance[i] = gbt_noRand05.loss_(y_test, y_pred)

plt.figure(figsize=(8, 6))
# Test error (deviance per iteration)
plt.plot(iter, test_deviance, label='Test', color='darkorange')  # min around 100
# Training error (deviance per iteration)
plt.plot(iter, gbt_noRand05.train_score_, label='Train', color='navy')
# Improvement over the previous model (relative to the oob estimate)
# plt.plot(iter, gbt_noRand05.oob_improvement_)
plt.legend(loc="upper right", fontsize=12)

# Predicted probabilities of class 1, 2d array
probas_test = gbt_noRand05.predict_proba(X_test)[:, 1]
probas_train = gbt_noRand05.predict_proba(X_train)[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

learn_rates = [1, 0.5, 0.3, 0.2, 0.1]
for lr in learn_rates:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     learning_rate=lr, random_state=241)
    clf.fit(X_train, y_train)

    # compute quality on training set
    train_loss = []
    for i, y_pred in enumerate(clf.staged_decision_function(X_train)):
        y_pred_sigmoid = 1.0 / (1 + np.exp(-y_pred))
        loss = log_loss(y_train, y_pred_sigmoid)
        train_loss.append(loss)

    # compute quality and find minimum loss on test set
    min_loss = [0, 10]
    test_loss = []
    for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
        y_pred_sigmoid = 1.0 / (1 + np.exp(-y_pred))
        loss = log_loss(y_test, y_pred_sigmoid)
        test_loss.append(loss)
        if loss < min_loss[1]:
            min_loss[0] = i
            min_loss[1] = loss
y = data[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8,
                                                    random_state=42)

# choose lr (learning rate) out of [1, 0.5, 0.3, 0.2, 0.1]
lr = 0.2
print('Learning rate =', lr)

# fit gradient boosting
clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                 learning_rate=lr, random_state=241)
clf.fit(X_train, y_train)

# retrieve predictions on each iteration
stage_train = list(clf.staged_decision_function(X_train))
stage_test = list(clf.staged_decision_function(X_test))

# convert predictions to the probability range
for i in range(len(stage_train)):
    for j in range(len(stage_train[0])):
        stage_train[i][j] = 1 / (1 + math.exp(-stage_train[i][j]))
for i in range(len(stage_test)):
    for j in range(len(stage_test[0])):
        stage_test[i][j] = 1 / (1 + math.exp(-stage_test[i][j]))

# calculate logloss on each iteration
logloss_train = [sklearn.metrics.log_loss(y_train, stage_train[i])
                 for i in range(len(stage_train))]
logloss_test = [sklearn.metrics.log_loss(y_test, stage_test[i])
                for i in range(len(stage_test))]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)


def sigm(y_pred):
    # sigmoid transform (defined here so the snippet is self-contained)
    return 1 / (1 + np.exp(-y_pred))


iter_number = 0
learning_rates = [1, 0.5, 0.3, 0.2, 0.1]
for rate in learning_rates:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=rate)
    clf.fit(X_train, y_train)
    sdf_train = clf.staged_decision_function(X_train)
    sdf_test = clf.staged_decision_function(X_test)

    score_train = []
    for y_pred in sdf_train:
        score_train.append(log_loss(y_train, sigm(y_pred)))

    score_test = []
    min_loss = 1
    for i, y_pred in enumerate(sdf_test):
        loss = log_loss(y_test, sigm(y_pred))
        score_test.append(loss)
        if rate == 0.2 and loss < min_loss:
            min_loss = loss
            iter_number = i
data = pandas.read_csv('Data/gbm-data.csv')
datanp = data.values
y = datanp[:, 0]
x = datanp[:, 1:]  # all 1776 feature columns
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.8, random_state=241)

test_score = dict()
for learning_rate in [0.2]:
    print("learning rate:", learning_rate)
    cls = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=learning_rate)
    cls.fit(X_train, y_train)
    for i, pred in enumerate(cls.staged_decision_function(X_test)):
        predicted = sigmoid(pred)
        test_score[i] = log_loss(y_test, predicted)
    # train_score = dict()
    # for i, pred in enumerate(cls.staged_decision_function(X_train)):
    #     train_score[i] = cls.loss_(y_train, pred)

pp.pprint(test_score)
res = min(test_score, key=test_score.get)
print(res)

cls2 = GradientBoostingClassifier(n_estimators=36, verbose=True, random_state=241)
cls2.fit(X_train, y_train)
import pandas as pd
import matplotlib.pyplot as plt
from numpy import exp
from sklearn.ensemble import GradientBoostingClassifier as GBC, RandomForestClassifier as RFC
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# %matplotlib inline

data = pd.read_csv('gbm-data.csv')
y = data['Activity'].values
X = data.drop('Activity', axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
for i in learning_rate:
    clf = GBC(n_estimators=250, verbose=True, random_state=241, learning_rate=i)
    clf.fit(X_train, y_train)
    train_loss = [log_loss(y_train, 1.0 / (1 + exp(-y_pred)))
                  for y_pred in clf.staged_decision_function(X_train)]
    test_loss = [log_loss(y_test, 1.0 / (1 + exp(-y_pred)))
                 for y_pred in clf.staged_decision_function(X_test)]
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])

clf = GBC(n_estimators=250, verbose=True, random_state=241, learning_rate=0.2)
clf.fit(X_train, y_train)
test_loss = [log_loss(y_test, 1.0 / (1 + exp(-y_pred)))
             for y_pred in clf.staged_decision_function(X_test)]
with open('log-loss.txt', 'w') as f:
    f.write(str(round(min(test_loss), 2)) + ' ' + str(test_loss.index(min(test_loss))))

clf = RFC(random_state=241, n_estimators=test_loss.index(min(test_loss)))
import pandas
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split  # cross_validation was removed in newer sklearn

data = pandas.read_csv('gbm-data.csv')
train = data.drop('Activity', axis=1)
target = data['Activity']
train = train.values
target = target.values
X_train, X_test, y_train, y_test = train_test_split(train, target,
                                                    test_size=0.8, random_state=241)

rates = [1, 0.5, 0.3, 0.2, 0.1]
for i in rates:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=i)
    clf.fit(X_train, y_train)  # the classifier must be fitted before staged predictions
    train_loss = []
    for predict in clf.staged_decision_function(X=X_train):
        predict = 1 / (1 + np.exp(-predict))  # sigmoid (the original formula lacked the reciprocal)
        train_loss.append(log_loss(y_true=y_train, y_pred=predict))
import matplotlib.pyplot as plt

data = pandas.read_csv('gbm-data.csv')
X = data.iloc[:, 1:].values  # positional selection needs iloc with a default header
y = np.ravel(data.iloc[:, [0]])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=learning_rate)
    clf.fit(X_train, y_train)

    log_train = []
    log_test = []
    for y_pred in clf.staged_decision_function(X_train):
        log_train.append(log_loss(y_train, 1 / (1 + np.exp(-y_pred))))
    for y_pred in clf.staged_decision_function(X_test):
        log_test.append(log_loss(y_test, 1 / (1 + np.exp(-y_pred))))

    if learning_rate == 0.2:
        mini = min(log_test)
        ind = log_test.index(mini)

    plt.figure()
    plt.plot(log_test, 'r', linewidth=2)
    plt.plot(log_train, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.show()
t0 = DT.datetime.now()
model_gbc.fit(X_train, y_train3)
t1 = DT.datetime.now()
print('GBC took ' + str(t1 - t0))

z_gbc = model_gbc.predict_proba(X_test)[:, 1]

# ROC
fpr_gbc, tpr_gbc, thresh_gbc = skm.roc_curve(y_test3, z_gbc)
plt.figure(3)
plt.plot(fpr_gbc, tpr_gbc, 'r-')

# AUC
skm.auc(fpr_gbc, tpr_gbc)

# Deviance (see https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regularization.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regularization-py)
# compute test set deviance
test_deviance = np.zeros((params['n_estimators'], ), dtype=np.float64)
for i, y_pred in enumerate(model_gbc.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    test_deviance[i] = model_gbc.loss_(y_test3, y_pred)

plt.plot((np.arange(test_deviance.shape[0]) + 1)[::1], test_deviance[::1],
         '-', color='red', label=str(params))
# plt.close()
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.8, random_state=241)

# for lr in [1, 0.5, 0.3, 0.2, 0.1]:
for lr in [0.2]:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=lr)
    clf.fit(X_train, y_train)

    sigmoid_test_arr, sigmoid_train_arr = [], []
    train_pred = clf.staged_decision_function(X_train)
    test_pred = clf.staged_decision_function(X_test)
    test_pred_arr, train_pred_arr = [], []
    for i, val in enumerate(train_pred):
        sigmoid = 1 / (1 + np.exp(-val))
        train_pred_arr.append(log_loss(y_train, sigmoid))
    for i, val in enumerate(test_pred):
        sigmoid = 1 / (1 + np.exp(-val))
        test_pred_arr.append(log_loss(y_test, sigmoid))

    test_tuples, train_tuples = [], []
    i = 0
Min_Loss = []

# Train GradientBoostingClassifier (n_estimators = 250, verbose = True, random_state = 241).
for lr in [1, 0.5, 0.3, 0.2, 0.1]:
    gbc = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=lr)
    gbc.fit(X_train, y_train)

    test_loss, train_loss = [], []
    # Use the staged_decision_function method to predict
    # the scores of the training and test samples at each iteration.
    # Transform the resulting prediction using the sigmoid function.
    for iter_ in gbc.staged_decision_function(X_train):
        train_loss.append(
            log_loss(y_train, [1.0 / (1 + np.exp(-x)) for x in iter_]))
    for iter_ in gbc.staged_decision_function(X_test):
        test_loss.append(
            log_loss(y_test, [1.0 / (1 + np.exp(-x)) for x in iter_]))
    Min_Loss.append(
        (test_loss[np.argmin(test_loss)], np.argmin(test_loss) + 1))

    # Calculate and plot the log-loss values on the training and test samples.
    plt.figure()
    plt.ylabel('log_loss')
    plt.xlabel('iteration')
    plt.plot(test_loss, 'r', linewidth=2)
X = data[:, 1:]

# split into train test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

# train
clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                 random_state=241, learning_rate=0.2)
clf.fit(X_train, y_train)

# verify log loss
loss_on_test = []
for i, pred1 in enumerate(clf.staged_decision_function(X_test)):
    # print(i)
    # print(pred1)
    # print(y_test)
    x = log_loss(y_test, 1.0 / (1.0 + np.exp(-pred1)))
    # print(x)
    loss_on_test.append(x)

grd2 = clf.staged_predict_proba(X_test)
loss_on_test_proba = []
for i, pred2 in enumerate(grd2):
    loss_on_test_proba.append(log_loss(y_test, pred2))
learning_rate = [1, 0.5, 0.3, 0.2, 0.1]
for rate in learning_rate:
    # fit the classifier
    clf = GradientBoostingClassifier(learning_rate=rate, n_estimators=250,
                                     verbose=True, random_state=241)
    clf.fit(X_train, Y_train)

    # prepare arrays for the loss function values
    train_loss = np.zeros(250, dtype=np.float64)
    test_loss = np.zeros(250, dtype=np.float64)

    # compute the loss function on the training data
    for i, Y_train_pred in enumerate(clf.staged_decision_function(X_train)):
        Y_train_pred = 1 / (1 + np.exp(-Y_train_pred))
        train_loss[i] = log_loss(Y_train, Y_train_pred)

    # compute the loss function on the test data
    for i, Y_test_pred in enumerate(clf.staged_decision_function(X_test)):
        Y_test_pred = 1 / (1 + np.exp(-Y_test_pred))
        test_loss[i] = log_loss(Y_test, Y_test_pred)

    # plot the curves
    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.title('learning_rate=%f' % rate)
f = open(filename, 'w')
f.write(s)
f.close()

data = pd.read_csv('gbm-data.csv', header=0).values
x_train, x_test, y_train, y_test = train_test_split(data[:, 1:], data[:, 0],
                                                    test_size=0.8, random_state=241)

# for lr in [1, 0.5, 0.3, 0.2, 0.1]:
for lr in [0.2]:
    print('############## RATE %s ##########' % lr)
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=lr)
    clf.fit(x_train, y_train)

    train_score = []
    test_score = []
    for i, y_predicted in enumerate(clf.staged_decision_function(x_train)):
        train_score.append(log_loss(y_train, 1 / (1 + np.exp(-y_predicted))))
    for i, y_predicted in enumerate(clf.staged_decision_function(x_test)):
        test_score.append(log_loss(y_test, 1 / (1 + np.exp(-y_predicted))))

    plt.figure()
    plt.plot(test_score, 'g', linewidth=2)
    plt.plot(train_score, 'r', linewidth=2)
    plt.legend(['test', 'train'])
    # plt.show()

    n_iter = np.argmin(np.array(test_score))
    best = np.amin(np.array(test_score))
    res = '%.2f %d' % (best, n_iter)
    print(res)
    out('5_3.txt', res)

clf2 = RandomForestClassifier(n_estimators=n_iter, random_state=241)
X_data_test = arrays[1]
Y_data_train = arrays[2]
Y_data_test = arrays[3]

answer2_argmin = None
answer2_value = None
for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250, verbose=True,
                                     random_state=241, learning_rate=learning_rate)
    clf.fit(X_data_train, Y_data_train)

    train_probs = clf.predict_proba(X_data_train)
    test_probs = clf.predict_proba(X_data_test)

    train_losts = []
    for pred in clf.staged_decision_function(X_data_train):
        train_losts.append(log_loss(Y_data_train, [1 / (1 + exp(-x)) for x in pred]))
    train_losts = np.array(train_losts)

    test_losts = []
    for pred in clf.staged_decision_function(X_data_test):
        test_losts.append(log_loss(Y_data_test, [1 / (1 + exp(-x)) for x in pred]))
    test_losts = np.array(test_losts)

    figure()
    plot(test_losts, 'g', linewidth=2)
    plot(train_losts, 'r', linewidth=2)
    legend(['test', 'train'])
    savefig('image-%s.png' % learning_rate)

    if learning_rate == 0.2: