Code Example #1
# Assumed imports for this snippet; `gb` is this repository's gradient
# boosting module and generate_continuous_data its data helper.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm


def regression_sample(true_func=np.sin, x_scale=3.):
    """
    Regression problem for continuous targets.
    :param float x_scale: Scale of the data. Generates data in the range [-x_scale, x_scale].
    :return:
    """
    x, t = generate_continuous_data(true_function=true_func, x_scale=x_scale)

    trained_models = []
    iteration_dist = [5, 10, 20, 40, 100]
    for n_iter in iteration_dist:
        # Define the GradientBoostedDT.
        # This is a regression problem on a continuous variable, so:
        #   objective:  squared loss (LeastSquare)
        #   activation: identity map (f(x) = x)
        # The loss used to evaluate the current fit is also the squared loss.
        rmse_objective = gb.LeastSquare()
        loss_function = gb.functions.least_square
        clf = gb.GradientBoostedDT(objective=rmse_objective,
                                   loss=loss_function,
                                   max_depth=4,
                                   num_iter=n_iter,
                                   gamma=.01,
                                   lam=.1,
                                   eta=.1)
        clf.fit(x=x, t=t)
        trained_models.append(clf)

    x_test = np.linspace(-x_scale, x_scale, 100).reshape(100, 1)
    fig = plt.figure(figsize=(6, 6))
    ax_i = fig.add_subplot(1, 1, 1)
    ax_i.plot(x_test,
              true_func(x_test),
              "--",
              label='True Function',
              color="C0")
    ax_i.scatter(x,
                 t,
                 s=50,
                 label='Training Data',
                 linewidth=1.,
                 edgecolors="C0",
                 color="white")
    ax_i.set_xlabel("Input")
    ax_i.set_ylabel("Target")

    for i, (n_iter, model) in enumerate(zip(iteration_dist, trained_models)):
        y = model.predict(x_test)
        ax_i.plot(x_test,
                  y,
                  "-",
                  label='n_iter: {}'.format(n_iter),
                  color=cm.viridis(i / len(iteration_dist), 1))
    ax_i.legend(loc=4)
    ax_i.set_title("Transition by Number of Iterations")
    fig.savefig('experiment_figures/regression.png')
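The comments in this example describe the usual second-order boosting setup: the objective supplies the gradient and Hessian of the squared loss, `lam` is the L2 penalty on leaf weights, `gamma` the cost of adding a leaf, and `eta` the shrinkage applied to each tree's output. A minimal sketch of the standard formulas behind these parameters, under that assumption (the function names here are illustrative, not this library's API):

import numpy as np

# Squared loss L(y, t) = (y - t)^2 / 2 with identity activation:
def least_square_grad(y, t):
    return y - t                      # dL/dy

def least_square_hess(y, t):
    return np.ones_like(y)            # d^2L/dy^2 is constant

# Given the gradients g and hessians h of the samples in a leaf,
# the optimal regularized leaf weight is w* = -G / (H + lam):
def leaf_weight(g, h, lam):
    return -g.sum() / (h.sum() + lam)

# Split gain: a split is kept only when this exceeds zero, so larger
# `gamma` prunes more aggressively:
def split_gain(g_l, h_l, g_r, h_r, lam, gamma):
    def score(g, h):
        return g.sum() ** 2 / (h.sum() + lam)
    g_all, h_all = np.append(g_l, g_r), np.append(h_l, h_r)
    return 0.5 * (score(g_l, h_l) + score(g_r, h_r) - score(g_all, h_all)) - gamma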
Code Example #2
    # (Excerpt: x, y, perm, logger, fn, gb, pd, os, and OUTPUT_DIR are
    # defined earlier in the original script.)
    n_train = 5000
    x_train, t_train = x[perm[:n_train]], y[perm[:n_train]]
    x_test, t_test = x[perm[n_train:]], y[perm[n_train:]]

    logger.info('training datasize: {0}'.format(t_train.shape[0]))
    logger.info('test datasize: {0}'.format(t_test.shape[0]))

    # Set up the regression objective for training and the
    # loss function for evaluating prediction quality.
    regobj = fn.CrossEntropy()
    loss = fn.logistic_loss

    clf = gb.GradientBoostedDT(regobj,
                               loss,
                               num_iter=100,
                               eta=.2,
                               max_leaves=15,
                               max_depth=5,
                               gamma=.01)
    clf.fit(x_train, t_train, validation_data=(x_test, t_test), verbose=1)
    f_importance = clf.feature_importance()
    pd.Series(f_importance).reset_index().to_csv(
        os.path.join(OUTPUT_DIR, 'feature_importance.csv'), index=False)

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_title('sequence of training and test loss')
    ax.plot(clf.training_loss, 'o-', label='training loss')
    ax.plot(clf.validation_loss, 'o-', label='test loss')
    ax.set_yscale('log')
    ax.legend()
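With a sigmoid activation, the cross-entropy objective and the logistic evaluation loss used above have simple closed forms. A minimal sketch of what an objective like `fn.CrossEntropy` and a loss like `fn.logistic_loss` typically compute; the class and method names here are assumptions, not the library's actual interface:

import numpy as np

def sigmoid(y):
    return 1.0 / (1.0 + np.exp(-y))

class CrossEntropySketch:
    """Binary cross entropy on raw scores y, with sigmoid activation."""

    def activate(self, y):
        return sigmoid(y)

    def gradient(self, y, t):
        return sigmoid(y) - t         # dL/dy = p - t

    def hessian(self, y, t):
        p = sigmoid(y)
        return p * (1.0 - p)          # d^2L/dy^2

def logistic_loss_sketch(t, y):
    """-[t log p + (1 - t) log(1 - p)] with p = sigmoid(y)."""
    p = sigmoid(y)
    return -(t * np.log(p) + (1.0 - t) * np.log(1.0 - p))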
Code Example #3
# Assumed imports for this snippet; `gb` is this repository's gradient
# boosting module.
import json

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.model_selection import train_test_split


def binary_classification_sample():
    """Sample problem with two-dimensional inputs.
    """
    np.random.seed(71)  # seed must be called, not assigned
    x = (
        np.random.normal(loc=.7, scale=1., size=400).reshape(200, 2),
        np.random.normal(loc=-.7, scale=1., size=400).reshape(200, 2),
    )
    t = np.zeros_like(x[0]), np.ones_like(x[1])
    x = np.append(x[0], x[1], axis=0)
    t = np.append(t[0], t[1], axis=0)[:, 0]

    x_train, x_test, t_train, t_test = train_test_split(x,
                                                        t,
                                                        test_size=.3,
                                                        random_state=71)

    # Binary classification, so the objective is cross entropy and the activation is the sigmoid.
    regobj = gb.CrossEntropy()

    # The loss function is the logistic loss.
    loss = gb.logistic_loss

    clf = gb.GradientBoostedDT(regobj,
                               loss,
                               max_depth=5,
                               gamma=.05,
                               lam=3e-2,
                               eta=.1,
                               num_iter=50)
    clf.fit(x=x_train, t=t_train, validation_data=(x_test, t_test))

    # Dump the tree network structure for the bundled viewer app.
    networks = clf.show_network()
    with open('./view/src/assets/node_edge.json', "w") as f:
        json.dump(list(networks), f)

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('Training Transitions')
    ax.plot(clf.training_loss, 'o-', label='Training')
    ax.plot(clf.validation_loss, 'o-', label='Validation')
    ax.set_xlabel("Iterations")
    ax.set_ylabel("Loss Transition")
    ax.legend()
    fig.savefig("experiment_figures/training_transitions.png", dpi=150)

    plt.figure(figsize=(6, 6))

    xx = np.linspace(start=-4, stop=4, num=50)
    yy = np.linspace(start=-4, stop=4, num=50)
    X, Y = np.meshgrid(xx, yy)
    Z = [
        clf.predict(np.array([a, b]).reshape(1, 2))[0] for a in xx for b in yy
    ]
    # The comprehension varies `a` (the x axis) over rows, while contour
    # expects rows indexed by y, so transpose after reshaping.
    Z = np.array(Z).reshape(len(xx), len(yy)).T
    levels = np.linspace(0, 1, 11)
    plt.contour(X, Y, Z, levels, colors=["gray"], alpha=.05)
    plt.contourf(X, Y, Z, levels, cmap=cm.GnBu)
    # plt.contour(X, Y, Z, levels, cmap=cm.PuBu_r)
    cbar = plt.colorbar()
    plt.scatter(x[:200, 0],
                x[:200, 1],
                s=80,
                label="t = 0",
                edgecolors="C2",
                alpha=.6,
                linewidth=2,
                facecolor="white")
    plt.scatter(x[200:, 0],
                x[200:, 1],
                s=80,
                label="t = 1",
                edgecolors="C0",
                alpha=.6,
                linewidth=2,
                facecolor="white")
    plt.legend(loc=2)
    plt.title("binary regression")
    plt.tight_layout()
    plt.savefig('experiment_figures/binary_classification.png', dpi=100)

    pred_prob = clf.predict(x_test)
    pred_t = np.where(pred_prob >= .5, 1, 0)
    acc = np.where(pred_t == t_test, 1, 0).sum() / len(t_test)
    acc_str = 'test accuracy:{0:.2f}'.format(acc)
    print(acc_str)
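As a usage note, the per-point comprehension that builds `Z` above can be replaced by a single batched call, assuming `clf.predict` accepts an (N, 2) array as it does for `x_test` in the accuracy check:

# One batched prediction over the whole grid (illustrative alternative).
grid = np.c_[X.ravel(), Y.ravel()]      # shape (2500, 2), one row per grid point
Z = clf.predict(grid).reshape(X.shape)  # already in meshgrid order, no transpose needed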
Code Example #4
    # (Excerpt: x, t, logger, fn, gb, plt, and pd are defined earlier
    # in the original script.)
    # Split into train and test datasets.
    # I should have used sklearn.model_selection.train_test_split...
    np.random.seed(71)
    perm = np.random.permutation(len(t))
    x_train, t_train = x[perm[:2000]], t[perm[:2000]]
    x_test, t_test = x[perm[2000:]], t[perm[2000:]]

    logger.info('training datasize: {0}'.format(t_train.shape[0]))
    logger.info('test datasize: {0}'.format(t_test.shape[0]))

    # Set up the regression objective for training and the
    # loss function for evaluating prediction quality.
    regobj = fn.CrossEntropy()
    loss = fn.logistic_loss

    clf = gb.GradientBoostedDT(regobj, loss, num_iter=30, eta=.4)
    clf.fit(x_train, t_train, validation_data=(x_test, t_test))

    plt.title('sequence of training and test loss')
    plt.plot(clf.training_loss, 'o-', label='training loss')
    plt.plot(clf.validation_loss, 'o-', label='test loss')
    plt.yscale('log')
    plt.legend()
    plt.show()

    pred_prob = clf.predict(x_test)
    pred_cls = np.where(pred_prob > .5, 1., .0)
    df_pred = pd.DataFrame({
        'probability': pred_prob,
        'predict': pred_cls,
        'true': t_test
    })
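Once `df_pred` is built, a few pandas one-liners summarize it; this follow-up is illustrative and uses nothing beyond the frame constructed above:

# Accuracy and a confusion table from the prediction frame.
accuracy = (df_pred['predict'] == df_pred['true']).mean()
confusion = pd.crosstab(df_pred['true'], df_pred['predict'])
print('test accuracy: {0:.3f}'.format(accuracy))
print(confusion)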