def regression_sample(true_func=np.sin, x_scale=3.):
    """Regression problem for continuous targets.

    :param float x_scale: scale of the data. Samples are generated in
        the range [-x_scale, x_scale].
    :return:
    """
    x, t = generate_continuous_data(true_function=true_func, x_scale=x_scale)

    trained_models = []
    iteration_dist = [5, 10, 20, 40, 100]
    for n_iter in iteration_dist:
        # Define the GradientBoostedDT.
        # Since this is regression on a continuous target:
        #   objective:  squared loss (LeastSquare)
        #   activation: identity map (f(x) = x)
        # The squared loss is also passed as the loss function that
        # evaluates how well the current model fits.
        rmse_objective = gb.LeastSquare()
        loss_function = gb.functions.least_square
        clf = gb.GradientBoostedDT(objective=rmse_objective, loss=loss_function,
                                   max_depth=4, num_iter=n_iter, gamma=.01,
                                   lam=.1, eta=.1)
        clf.fit(x=x, t=t)
        trained_models.append(clf)

    x_test = np.linspace(-x_scale, x_scale, 100).reshape(100, 1)

    fig = plt.figure(figsize=(6, 6))
    ax_i = fig.add_subplot(1, 1, 1)
    ax_i.plot(x_test, true_func(x_test), "--", label='True Function', color="C0")
    ax_i.scatter(x, t, s=50, label='Training Data', linewidth=1.,
                 edgecolors="C0", color="white")
    ax_i.set_xlabel("Input")
    ax_i.set_ylabel("Target")

    for i, (n_iter, model) in enumerate(zip(iteration_dist, trained_models)):
        y = model.predict(x_test)
        ax_i.plot(x_test, y, "-", label='n_iter: {}'.format(n_iter),
                  color=cm.viridis(i / len(iteration_dist), 1))
    ax_i.legend(loc=4)
    ax_i.set_title("Transition by Number of Iterations")
    fig.savefig('experiment_figures/regression.png')
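
# `generate_continuous_data` is defined elsewhere in this repository and is
# not shown here. A minimal sketch of the behavior assumed above (the
# signature, sample count, and noise level are hypothetical; the real
# helper may differ): draw inputs uniformly from [-x_scale, x_scale] and
# add Gaussian noise to true_function(x).
def generate_continuous_data(true_function=np.sin, x_scale=3.,
                             n_samples=20, noise_std=.2):
    x = np.random.uniform(-x_scale, x_scale, size=(n_samples, 1))
    t = true_function(x).ravel() + np.random.normal(scale=noise_std,
                                                    size=n_samples)
    return x, t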
np.random.seed(71)
perm = np.random.permutation(len(y))  # y holds the targets loaded above
n_train = 5000
x_train, t_train = x[perm[:n_train]], y[perm[:n_train]]
x_test, t_test = x[perm[n_train:]], y[perm[n_train:]]
logger.info('training datasize: {0}'.format(t_train.shape[0]))
logger.info('test datasize: {0}'.format(t_test.shape[0]))

# setup the regression objective for training and
# the loss function for evaluating prediction quality
regobj = fn.CrossEntropy()
loss = fn.logistic_loss
clf = gb.GradientBoostedDT(regobj, loss, num_iter=100, eta=.2,
                           max_leaves=15, max_depth=5, gamma=.01)
clf.fit(x_train, t_train, validation_data=(x_test, t_test), verbose=1)

f_importance = clf.feature_importance()
pd.Series(f_importance).reset_index().to_csv(
    os.path.join(OUTPUT_DIR, 'feature_importance.csv'), index=False)

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('sequence of training and test loss')
ax.plot(clf.training_loss, 'o-', label='training loss')
ax.plot(clf.validation_loss, 'o-', label='test loss')
ax.set_yscale('log')
ax.legend()
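
# For reference: with a sigmoid activation, the per-sample gradient and
# hessian that a cross-entropy objective typically supplies to the boosting
# step look like the sketch below. This is an assumption about what
# fn.CrossEntropy computes internally, not its actual code.
def cross_entropy_grad_hess(f, t):
    # f: raw ensemble score, t: 0/1 target
    p = 1. / (1. + np.exp(-f))  # sigmoid activation
    grad = p - t                # d(loss)/df
    hess = p * (1. - p)         # d^2(loss)/df^2
    return grad, hess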
def binary_classification_sample():
    """Sample problem with two-dimensional inputs."""
    np.random.seed(71)
    x = (
        np.random.normal(loc=.7, scale=1., size=400).reshape(200, 2),
        np.random.normal(loc=-.7, scale=1., size=400).reshape(200, 2),
    )
    t = np.zeros_like(x[0]), np.ones_like(x[1])
    x = np.append(x[0], x[1], axis=0)
    t = np.append(t[0], t[1], axis=0)[:, 0]
    x_train, x_test, t_train, t_test = train_test_split(x, t, test_size=.3,
                                                        random_state=71)

    # Binary classification: set the objective to cross entropy and the
    # activation function to the sigmoid.
    regobj = gb.CrossEntropy()
    # The loss function is the logistic loss.
    loss = gb.logistic_loss
    clf = gb.GradientBoostedDT(regobj, loss, max_depth=5, gamma=.05,
                               lam=3e-2, eta=.1, num_iter=50)
    clf.fit(x=x_train, t=t_train, validation_data=(x_test, t_test))

    networks = clf.show_network()
    import json
    with open('./view/src/assets/node_edge.json', "w") as f:
        json.dump(list(networks), f)

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('Training Transitions')
    ax.plot(clf.training_loss, 'o-', label='Training')
    ax.plot(clf.validation_loss, 'o-', label='Validation')
    ax.set_xlabel("Iterations")
    ax.set_ylabel("Loss Transition")
    ax.legend()
    fig.savefig("experiment_figures/training_transitions.png", dpi=150)

    plt.figure(figsize=(6, 6))
    xx = np.linspace(start=-4, stop=4, num=50)
    yy = np.linspace(start=-4, stop=4, num=50)
    X, Y = np.meshgrid(xx, yy)
    Z = [
        clf.predict(np.array([a, b]).reshape(1, 2))[0]
        for a in xx for b in yy
    ]
    # reshape to (len(xx), len(yy)) then transpose so rows follow yy,
    # matching the meshgrid layout that contourf expects
    Z = np.array(Z).reshape(len(xx), len(yy)).T
    levels = np.linspace(0, 1, 11)
    plt.contour(X, Y, Z, levels, colors=["gray"], alpha=.05)
    plt.contourf(X, Y, Z, levels, cmap=cm.GnBu)
    # plt.contour(X, Y, Z, levels, cmap=cm.PuBu_r)
    cbar = plt.colorbar()
    plt.scatter(x[:200, 0], x[:200, 1], s=80, label="t = 0", edgecolors="C2",
                alpha=.6, linewidth=2, facecolor="white")
    plt.scatter(x[200:, 0], x[200:, 1], s=80, label="t = 1", edgecolors="C0",
                alpha=.6, linewidth=2, facecolor="white")
    plt.legend(loc=2)
    plt.title("Binary Classification")
    plt.tight_layout()
    plt.savefig('experiment_figures/binary_classification.png', dpi=100)

    pred_prob = clf.predict(x_test)
    pred_t = np.where(pred_prob >= .5, 1, 0)
    acc = np.where(pred_t == t_test, 1, 0).sum() / len(t_test)
    acc_str = 'test accuracy: {0:.2f}'.format(acc)
    print(acc_str)
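
# A batched alternative to the per-point grid loop above: assuming
# clf.predict accepts an (n, 2) array, as it does for x_test in
# binary_classification_sample, the whole decision surface can be computed
# in a single call. A sketch (predict_surface is a hypothetical helper):
def predict_surface(clf, xx, yy):
    X, Y = np.meshgrid(xx, yy)
    # stack the flattened grid coordinates column-wise into an (n, 2) array
    grid = np.c_[X.ravel(), Y.ravel()]
    return X, Y, clf.predict(grid).reshape(X.shape)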
# split train and test dataset
# I should have used sklearn.model_selection.train_test_split...
np.random.seed(71)
perm = np.random.permutation(len(t))
x_train, t_train = x[perm[:2000]], t[perm[:2000]]
x_test, t_test = x[perm[2000:]], t[perm[2000:]]
logger.info('training datasize: {0}'.format(t_train.shape[0]))
logger.info('test datasize: {0}'.format(t_test.shape[0]))

# setup the regression objective for training and
# the loss function for evaluating prediction quality
regobj = fn.CrossEntropy()
loss = fn.logistic_loss
clf = gb.GradientBoostedDT(regobj, loss, num_iter=30, eta=.4)
clf.fit(x_train, t_train, validation_data=(x_test, t_test))

plt.title('sequence of training and test loss')
plt.plot(clf.training_loss, 'o-', label='training loss')
plt.plot(clf.validation_loss, 'o-', label='test loss')
plt.yscale('log')
plt.legend()
plt.show()

pred_prob = clf.predict(x_test)
pred_cls = np.where(pred_prob > .5, 1., .0)
df_pred = pd.DataFrame({
    'probability': pred_prob,
    'predict': pred_cls,
    'true': t_test,
})
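
# A natural follow-up (a sketch mirroring the accuracy check in
# binary_classification_sample): summarize test accuracy from df_pred.
acc = (df_pred['predict'] == df_pred['true']).mean()
logger.info('test accuracy: {0:.3f}'.format(acc))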