def compare_linear_model_decision_tree():
    # Look closely at how correlation between the datasets affects the scores.
    # With noisy data, the training and test sets are fully correlated only if they are the exact same samples; otherwise they are uncorrelated.
    # The conclusions below therefore only hold when the training and test data are correlated (i.e. noise-free data).
    # Conclusion 1: more training samples raise the decision tree's prediction score; more test samples may lower it. (This does not help the model; it only helps in understanding the evaluation metric.)
    # Conclusion 2: more training samples lower the linear regression's prediction score; more test samples may raise it. (This does not help the model; it only helps in understanding the evaluation metric.)

    for train_number in [30, 60, 90]:
        # Prepare the training data: a noisy sine wave
        from mglearn.datasets import make_wave
        X_train, y_train = make_wave(n_samples=train_number)

        # # Prepare the training data: a noise-free sine wave
        # X_train = np.linspace(-3, 3, train_number).reshape(-1, 1)
        # y_train = np.sin(X_train)

        print('=' * 40)
        fig, axes = plt.subplots(2, 4, figsize=(20, 10))
        for test_number, ax in zip([15, 30, 45, 60, 75, 90, 105, 120],
                                   axes.ravel()):
            # Prepare the test data: a noisy sine wave
            from mglearn.datasets import make_wave
            X_test, y_test = make_wave(n_samples=test_number)
            X_test, y_test = (np.array(t)
                              for t in zip(*sorted(zip(X_test, y_test))))

            # # Prepare the test data: a noise-free sine wave
            # X_test = np.linspace(-3, 3, test_number).reshape(-1, 1)
            # y_test = np.sin(X_test)

            show_title("{}个训练数据;{}个测试数据".format(train_number, test_number))
            # 决策树预测
            from sklearn.tree import DecisionTreeRegressor
            reg_dtr = DecisionTreeRegressor(min_samples_split=3)
            reg_dtr.fit(X_train, y_train)
            print("使用决策树预测的结果:「R^2」评价 = {}".format(
                reg_dtr.score(X_test, y_test)))

            # Linear regression prediction
            from sklearn.linear_model import LinearRegression
            reg_lr = LinearRegression().fit(X_train, y_train)
            print("Linear regression prediction: R^2 score = {}".format(
                reg_lr.score(X_test, y_test)))

            # Plot
            ax.plot(X_test, reg_dtr.predict(X_test), label='decision tree')
            ax.plot(X_test, reg_lr.predict(X_test), label='linear regression')
            ax.plot(X_train[:, 0], y_train, 'o', c='b')
            ax.plot(X_test[:, 0], y_test, '^', c='r')
            ax.legend(loc='best')
            ax.set_xlabel('Input feature')
            ax.set_ylabel('Regression output')
            ax.set_title("{} training samples; {} test samples".format(train_number, test_number))
            plt.suptitle("Figure 4-1: Comparing predictions of linear regression and a decision tree")
            print()
            pass
        pass
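# The R^2 returned by score() explains the conclusions above: it compares the
# model against a baseline that always predicts the mean of the *test* targets,
# so it depends on the spread of the test set, not only on the model. A minimal
# sketch (not part of the original example) checking the formula
# R^2 = 1 - SS_res / SS_tot against sklearn's r2_score:
import numpy as np
from sklearn.metrics import r2_score

def r2_by_hand(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred) ** 2)           # residual sum of squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # total sum of squares
    return 1 - ss_res / ss_tot

y_true = np.array([0.5, -1.2, 2.0, 0.1])
y_pred = np.array([0.4, -1.0, 1.7, 0.3])
assert np.isclose(r2_by_hand(y_true, y_pred), r2_score(y_true, y_pred))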
def linear_regression_add_new_feature_in_wave():
    train_number = 100
    test_number = 50
    bin_number = 12
    # Prepare the training data
    from mglearn.datasets import make_wave
    X_train, y_train = make_wave(n_samples=train_number)

    # Map the values of X to bins, i.e. discretize the continuous values
    bins = np.linspace(-3, 3, bin_number)
    x_train_binned = np.digitize(X_train, bins=bins)

    # One-hot encode the discrete features with OneHotEncoder
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder(sparse=False, categories='auto')
    encoder.fit(x_train_binned)
    X_train_one_hot = encoder.transform(x_train_binned)

    # Prepare the test data
    # X_test = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
    X_test, y_test = make_wave(n_samples=test_number)
    X_test, y_test = (np.array(t) for t in zip(*sorted(zip(X_test, y_test))))

    # Encode the binned test values with the encoder fitted on the training data
    X_test_one_hot = encoder.transform(np.digitize(X_test, bins=bins))

    import random
    rand_train_number_list = random.sample(range(0, train_number), 5)

    # Build product (interaction) features so the one-hot features carry richer information
    # Ten features: the slope is still influenced by all of the training data, so it points the same way in every bin
    number_title = "Ten features in total"
    show_title(number_title)
    # X_train * X_train_one_hot is elementwise (broadcast) multiplication, not a matrix product
    X_train_product = np.hstack([X_train * X_train_one_hot])
    X_test_product = np.hstack([X_test * X_test_one_hot])

    print("X_train_product.shape=", X_train_product.shape)
    print("X_train_product=\n", X_train_product[rand_train_number_list])

    show_linear_regression(X_train, X_train_product, y_train, X_test,
                           X_test_product, y_test, bins)
    plt.suptitle("图4-4:每个箱子具有不同斜率的线性回归--" + number_title)

    # Twenty features: the slope is no longer influenced by all of the training data, so its direction differs between bins
    number_title = "Twenty features in total"
    print('-' * 5, number_title, '-' * 5)
    # X_train * X_train_one_hot is elementwise (broadcast) multiplication, not a matrix product
    X_train_product = np.hstack([X_train_one_hot, X_train * X_train_one_hot])
    X_test_product = np.hstack([X_test_one_hot, X_test * X_test_one_hot])
    print("X_train_product.shape=", X_train_product.shape)
    print("X_train_product=\n", X_train_product[rand_train_number_list])

    show_linear_regression(X_train, X_train_product, y_train, X_test,
                           X_test_product, y_test, bins)
    plt.suptitle("图4-4:每个箱子具有不同斜率的线性回归--" + number_title)
Example #3
def __init__(self):
    self.x, self.y = make_wave(n_samples=40)
    self.x_train, self.x_test, self.y_train, self.y_test = \
        train_test_split(self.x, self.y, random_state=0, test_size=0.3)
    self.knn_reg = KNeighborsRegressor(n_neighbors=3, n_jobs=-1)
    # n_jobs: number of CPU cores to use; -1 means use all cores
    self.knn_reg.fit(self.x_train, self.y_train)
def svm_in_wave():
    # For comparison, a kernel SVM: by tuning the gamma parameter it can learn predictions of
    # similar complexity to polynomial regression, with no explicit feature transformation needed
    # Prepare the training data
    from mglearn.datasets import make_wave
    X, y = make_wave(n_samples=100)

    # Prepare the test data
    X_test = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
    y_test = np.sin(X_test)

    from sklearn.svm import SVR

    # The RBF kernel parameter gamma is the inverse of the Gaussian kernel width: larger gamma, sharper Gaussian, smaller variance
    for gamma in [0.1, 1, 3, 6, 10]:
        svr = SVR(gamma=gamma)
        svr.fit(X, y)
        print(f"SVR gamma={gamma} : ", svr.score(X_test, y_test))
        plt.plot(X_test,
                 svr.predict(X_test),
                 label='SVR gamma={}'.format(gamma))

    # Plot
    plt.plot(X[:, 0], y, 'o', c='k')
    plt.xlabel('Input feature')
    plt.ylabel('Regression output')
    plt.legend(loc='best')
    plt.suptitle("图4-6:对于RBF核的SVM,使用不同的gamma参数的对比\n"
                 "不需要显式地特征变换就可以学习得到与多项式回归一样复杂的模型")
def fg9():
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from mglearn.datasets import make_wave
    from sklearn.preprocessing import OneHotEncoder

    X, y = make_wave(n_samples=100)
    line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

    bins = np.linspace(-3, 3, 11)
    which_bin = np.digitize(X, bins=bins)

    encoder = OneHotEncoder(sparse=False)
    encoder.fit(which_bin)
    X_binned = encoder.transform(which_bin)
    X_combined = np.hstack([X, X_binned])
    X_product = np.hstack([X_binned, X * X_binned])
    line_binned = encoder.transform(np.digitize(line, bins=bins))
    print(X_product.shape)

    reg = LinearRegression().fit(X_product, y)

    line_product = np.hstack([line_binned, line * line_binned])
    plt.plot(line, reg.predict(line_product), label='linear regression product')

    for bin in bins:
        plt.plot([bin, bin], [-3, 3], ':', c='k')

    plt.plot(X[:, 0], y, 'o', c='k')
    plt.legend(loc='best')
    plt.ylabel("Regression output")
    plt.xlabel("Input features")

    plt.show()
Example #6
def plot_knn_regression(n_neighbors=1):
    X, y = make_wave(n_samples=40)
    X_test = np.array([[-1.5], [0.9], [1.5]])

    dist = euclidean_distances(X, X_test)
    closest = np.argsort(dist, axis=0)

    plt.figure(figsize=(10, 6))

    reg = KNeighborsRegressor(n_neighbors=n_neighbors).fit(X, y)
    y_pred = reg.predict(X_test)

    for x, y_, neighbors in zip(X_test, y_pred, closest.T):
        for neighbor in neighbors[:n_neighbors]:
            plt.arrow(x[0],
                      y_,
                      X[neighbor, 0] - x[0],
                      y[neighbor] - y_,
                      head_width=0,
                      fc='k',
                      ec='k')

    plt.plot(X, y, 'o')
    plt.plot(X, -3 * np.ones(len(X)), 'o')
    plt.plot(X_test, -3 * np.ones(len(X_test)), 'x', c='g', markersize=20)
    plt.plot(X_test, y_pred, 'x', c='b', markersize=20)

    plt.ylim(-3.1, 3.1)
def plot_knn_regression(n_neighbors=1):
    X, y = make_wave(n_samples=40)
    X_test = np.array([[-1.5], [0.9], [1.5]])

    dist = euclidean_distances(X, X_test)
    closest = np.argsort(dist, axis=0)

    plt.figure(figsize=(10, 6))

    reg = KNeighborsRegressor(n_neighbors=n_neighbors).fit(X, y)
    y_pred = reg.predict(X_test)

    for x, y_, neighbors in zip(X_test, y_pred, closest.T):
        for neighbor in neighbors[:n_neighbors]:
            plt.arrow(x[0], y_, X[neighbor, 0] - x[0], y[neighbor] - y_,
                      head_width=0, fc='k', ec='k')

    train, = plt.plot(X, y, 'o')
    test, = plt.plot(X_test, -3 * np.ones(len(X_test)), '*', c='g', markersize=20)
    pred, = plt.plot(X_test, y_pred, '*', c='b', markersize=20)
    plt.vlines(X_test, -3.1, 3.1, linestyle="--")
    plt.legend([train, test, pred],
               ["training data/target", "test data", "test prediction"], ncol=3, loc=(.1, 1.025))
    plt.ylim(-3.1, 3.1)
    plt.xlabel("Feature")
    plt.ylabel("Target")
Example #8
def linear_regression_housing():
    X, y = make_wave(n_samples=60)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    line = np.linspace(-3, 3, 100).reshape(-1, 1)

    lr = LinearRegression().fit(X_train, y_train)
    print("w[0]: %f  b: %f" % (lr.coef_[0], lr.intercept_))
    print("Training set score : {:.2f}".format(lr.score(X_train, y_train)))
    print("Test set score ] {:.2f}".format(lr.score(X_test, y_test)))
def create_wave():
    X, y = datasets.make_wave()
    mglearn.discrete_scatter(X, y)
    # plt.figure()
    # plt.plot(X, y, '^')
    plt.ylim(-3, 3)
    plt.xlabel('Feature')
    plt.ylabel('Target')
    plt.suptitle("Figure 2-3: Plot of the wave dataset\n"
                 "x-axis: feature; y-axis: regression target")
    print('X.shape: {}'.format(X.shape))
    print('y.shape: {}'.format(y.shape))
def fg8():
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from mglearn.datasets import make_wave
    from sklearn.preprocessing import OneHotEncoder

    X, y = make_wave(n_samples=100)
    line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

    reg = LinearRegression().fit(X, y)
    plt.plot(line, reg.predict(line), label="linear regression")

    reg = DecisionTreeRegressor(min_samples_split=3).fit(X, y)
    plt.plot(line, reg.predict(line), label="decision tree")

    plt.plot(X[:, 0], y, 'o', c='r')
    plt.ylabel("regression output")
    plt.xlabel("input features")
    plt.legend(loc='best')

    bins = np.linspace(-3, 3, 11)
    print("bins: {}".format(bins))
    which_bin = np.digitize(X, bins=bins)
    print("\nData points: ", X[:5])
    print("\nBin membership for the data points:\n", which_bin[:5])

    encoder = OneHotEncoder(sparse=False)
    encoder.fit(which_bin)
    X_binned = encoder.transform(which_bin)
    print(X_binned[:5])

    line_binned = encoder.transform(np.digitize(line, bins=bins))
    reg = LinearRegression().fit(X_binned, y)

    plt.show()

    plt.plot(line, reg.predict(line_binned), '--', label="linear regression binned", c='r')

    reg = DecisionTreeRegressor(min_samples_split=3).fit(X_binned, y)

    plt.plot(line, reg.predict(line_binned), label="decision tree binned", alpha=.5)
    plt.plot(X[:, 0], y, 'o', c='k')
    plt.vlines(bins, -3, 3, linewidth=1, alpha=.2)
    plt.legend(loc='best')
    plt.ylabel("Regression output")
    plt.xlabel("Input features")
    plt.show()
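# np.digitize, used above, returns for each value the index of the bin it falls
# into: with increasing edges, a value v gets index i such that
# bins[i-1] <= v < bins[i] (0 below the first edge, len(bins) above the last).
# A tiny sketch (illustrative values):
import numpy as np
bins = np.linspace(-3, 3, 11)  # 11 edges spanning [-3, 3]
print(np.digitize([-2.9, 0.0, 2.9], bins=bins))  # -> [ 1  6 10]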
def scikit_data_binning():
    train_number = 100
    from mglearn.datasets import make_wave
    X_train, y_train = make_wave(n_samples=train_number)

    from sklearn.preprocessing import KBinsDiscretizer
    show_title("使用稀疏数组返回封箱后的数据")
    kb = KBinsDiscretizer(n_bins=10, strategy='uniform')
    kb.fit(X_train)
    print("bin edges: \n", kb.bin_edges_)
    X_binned = kb.transform(X_train)
    print("X_binned 数据类别(稀疏数组):", type(X_binned))
    print("封箱前的前十条数据:\n", X_train[:10])
    print("封箱后的前十条数据:\n", X_binned[:10])
    print("封箱后的前十条数据转化为数组:\n", X_binned[:10].toarray())

    show_title("使用OneHot编码返回封箱后的数据")
    kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense')
    kb.fit(X_train)
    X_binned = kb.transform(X_train)
    print("封箱前的前十条数据:\n", X_train[:10])
    print("封箱后的前十条数据:\n", X_binned[:10])

    line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
    line_binned = kb.transform(line)

    from sklearn.linear_model import LinearRegression
    line_reg = LinearRegression().fit(X_binned, y_train)
    plt.plot(line,
             line_reg.predict(line_binned),
             label='linear regression binned')

    from sklearn.tree import DecisionTreeRegressor
    dt_reg = DecisionTreeRegressor(min_samples_split=3).fit(X_binned, y_train)
    plt.plot(line, dt_reg.predict(line_binned), label="decision tree binned")

    plt.plot(X_train[:, 0], y_train, 'o', c='k')
    plt.vlines(kb.bin_edges_[0], -3, 3, linewidth=1, alpha=.2)
    plt.legend(loc='best')
    plt.xlabel('Input feature')
    plt.ylabel("Regression output")
    pass
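# With strategy='uniform', KBinsDiscretizer is roughly np.digitize with
# n_bins + 1 equally spaced edges between the feature's min and max, followed
# by the chosen encoding. A minimal sketch of that correspondence (assuming no
# sample lands exactly on an interior edge):
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[-2.5], [0.3], [2.8]])
kb = KBinsDiscretizer(n_bins=4, strategy='uniform', encode='ordinal').fit(X)
edges = np.linspace(X.min(), X.max(), 4 + 1)
print(kb.transform(X).ravel())              # bin index per sample
print(np.digitize(X.ravel(), edges[1:-1]))  # same indices via numpy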
Example #12
def plot_linear_regression_wave():
    X, y = make_wave(n_samples=60)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    line = np.linspace(-3, 3, 100).reshape(-1, 1)

    lr = LinearRegression().fit(X_train, y_train)
    print("w[0]: %f  b: %f" % (lr.coef_[0], lr.intercept_))

    plt.figure(figsize=(8, 8))
    plt.plot(line, lr.predict(line))
    plt.plot(X, y, 'o', c=cm2(0))
    ax = plt.gca()
    ax.spines['left'].set_position('center')
    ax.spines['right'].set_color('none')
    ax.spines['bottom'].set_position('center')
    ax.spines['top'].set_color('none')
    ax.set_ylim(-3, 3)
    ax.legend(["model", "training data"], loc="best")
    ax.grid(True)
    ax.set_aspect('equal')
def fg10():
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from mglearn.datasets import make_wave
    from sklearn.preprocessing import OneHotEncoder

    X, y = make_wave(n_samples=100)
    poly = PolynomialFeatures(degree=10, include_bias=False)
    X_poly = poly.fit_transform(X)
    print("Polynomial feature names: \n{}".format(poly.get_feature_names()))

    reg = LinearRegression().fit(X_poly, y)

    line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
    line_poly = poly.transform(line)
    plt.plot(line, reg.predict(line_poly), label="polynomial linear regression")
    plt.plot(X[:, 0], y, 'o', c='k')
    plt.ylabel("Regression output")
    plt.xlabel("Input features")
    plt.legend(loc='best')
    plt.show()
def plot_knn_regression(n_neighbors=1):
    X, y = make_wave(n_samples=40)
    X_test = np.array([[-1.5], [0.9], [1.5]])

    dist = euclidean_distances(X, X_test)
    closest = np.argsort(dist, axis=0)

    plt.figure(figsize=(10, 6))

    reg = KNeighborsRegressor(n_neighbors=n_neighbors).fit(X, y)
    y_pred = reg.predict(X_test)

    for x, y_, neighbors in zip(X_test, y_pred, closest.T):
        for neighbor in neighbors[:n_neighbors]:
            plt.arrow(x[0], y_, X[neighbor, 0] - x[0], y[neighbor] - y_,
                      head_width=0, fc='k', ec='k')

    plt.plot(X, y, 'o')
    plt.plot(X, -3 * np.ones(len(X)), 'o')
    plt.plot(X_test, -3 * np.ones(len(X_test)), 'x', c='g', markersize=20)
    plt.plot(X_test, y_pred, 'x', c='b', markersize=20)

    plt.ylim(-3.1, 3.1)
Example #15
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mglearn.datasets import make_wave

X, y = make_wave(n_samples=100)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lin_reg = LinearRegression().fit(X_train, y_train)

print(lin_reg.score(X_test, y_test))

from sklearn.metrics import mean_squared_error

y_pred = lin_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(mse)

rmse = np.sqrt(mse)
print(rmse)
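# On a fixed test set, R^2 and MSE carry the same information in different
# units: R^2 = 1 - MSE / Var(y_test). A quick sanity check with the variables
# defined above:
print("R^2 from MSE:", 1 - mse / np.var(y_test))  # matches lin_reg.score(X_test, y_test)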
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from mglearn.datasets import make_wave

X, y = make_wave(n_samples=40)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
line = np.linspace(-3, 3, 1000).reshape(-1, 1)

for n_neighbors, ax in zip([1, 3, 9], axes):
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train, y_train)
    ax.plot(line, reg.predict(line))
    ax.scatter(X_train,
               y_train,
               marker='^',
               c='pink',
               s=18,
               alpha=.4,
               edgecolor='red')
    ax.scatter(X_test,
               y_test,
               marker='o',
               c='lightblue',
               s=18,
               alpha=.4,
               edgecolor='blue')
def linear_regression_add_polynomial_feature_in_wave():
    train_number = 100
    test_number = 100
    bin_number = 12
    # Prepare the training data
    from mglearn.datasets import make_wave
    X_train, y_train = make_wave(n_samples=train_number)

    # Map the values of X to bins, i.e. discretize the continuous values
    bins = np.linspace(-3, 3, bin_number)
    X_train_binned = np.digitize(X_train, bins=bins)

    # One-hot encode the discrete features with OneHotEncoder
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder(sparse=False, categories='auto')
    encoder.fit(X_train_binned)
    X_train_one_hot = encoder.transform(X_train_binned)

    # Prepare the test data
    X_test = np.linspace(-3, 3, test_number, endpoint=False).reshape(-1, 1)
    y_test = np.sin(X_test)

    # Encode the binned test values with the encoder fitted on the training data
    X_test_one_hot = encoder.transform(np.digitize(X_test, bins=bins))

    # Expand the continuous feature with polynomials of the original feature -- essentially a mapping into a higher-dimensional space
    from sklearn.preprocessing import PolynomialFeatures

    import random
    rand_train_number_list = random.sample(range(0, train_number), 5)

    # include_bias: without the bias there are bin_number - 1 features; with it, bin_number (the bias is x^0 = 1)
    # poly_feature = PolynomialFeatures(degree=bin_number - 1)
    poly_feature = PolynomialFeatures(degree=bin_number - 1,
                                      include_bias=False)
    poly_feature.fit(X_train)
    X_train_poly = poly_feature.transform(X_train)
    X_test_poly = poly_feature.transform(X_test)

    print('-' * 20)
    print('X_train_poly.shape= {}'.format(X_train_poly.shape))
    print('X_train_poly=')
    print(X_train_poly[rand_train_number_list])
    print('Polynomial feature names=\n{}'.format(
        poly_feature.get_feature_names()))

    show_linear_regression(X_train, X_train_poly, y_train, X_test, X_test_poly,
                           y_test, bins)
    plt.suptitle("图4-5:具有10次多项式特征的线性回归--没有One-Hot编码")

    # Adding the one-hot encoding fits the polynomial features piecewise, one segment per bin
    # This does not actually improve the model fit
    X_train_poly = np.hstack([X_train_one_hot, X_train_poly * X_train_one_hot])
    X_test_poly = np.hstack([X_test_one_hot, X_test_poly * X_test_one_hot])

    print('-' * 20)
    print('X_train_poly.shape= {}'.format(X_train_poly.shape))
    print('X_train_poly=')
    print(X_train_poly[rand_train_number_list])
    print('Polynomial feature names=\n{}'.format(
        poly_feature.get_feature_names()))

    show_linear_regression(X_train, X_train_poly, y_train, X_test, X_test_poly,
                           y_test, bins)
    plt.suptitle("图4-5:具有10次多项式特征的线性回归--加入One-Hot编码")
def numpy_data_binning():
    for train_number in [100, 150, 200]:
        # Prepare the training data: a noisy sine wave
        from mglearn.datasets import make_wave
        X_train, y_train = make_wave(n_samples=train_number)

        # # Prepare the training data: a noise-free sine wave
        # X_train = np.linspace(-3, 3, train_number).reshape(-1, 1)
        # y_train = np.sin(X_train)

        print('=' * 40)
        fig, axes = plt.subplots(3, 3, figsize=(20, 10))
        for test_number, axs in zip([100, 150, 200], axes):
            # Prepare the test data: a noisy sine wave
            from mglearn.datasets import make_wave
            X_test, y_test = make_wave(n_samples=test_number)
            X_test, y_test = (np.array(t)
                              for t in zip(*sorted(zip(X_test, y_test))))

            # # Prepare the test data: a noise-free sine wave
            # X_test = np.linspace(-3, 3, test_number).reshape(-1, 1)
            # y_test = np.sin(X_test)

            for bin_number, ax in zip([5, 10, 20], axs):
                number_title = "{}个训练数据;{}个测试数据;{}个箱子".format(
                    train_number, test_number, bin_number)
                show_title(number_title)
                # 将X_train的值与箱子的值对应,即将连续值离散化
                bins = np.linspace(-3, 3, bin_number)
                X_train_bin = np.digitize(X_train, bins=bins)

                print('-' * 5, "训练数据", '-' * 5)
                import random
                rand_train_number_list = random.sample(range(0, train_number),
                                                       10)
                print('十个原始的训练数据:', X_train[rand_train_number_list].T)
                print('十个分箱后的训练数据:', X_train_bin[rand_train_number_list].T)

                # One-hot encode the discretized training data with OneHotEncoder
                from sklearn.preprocessing import OneHotEncoder
                encoder = OneHotEncoder(sparse=False, categories='auto')
                encoder.fit(X_train_bin)
                X_train_one_hot = encoder.transform(X_train_bin)
                print('-' * 5, "分箱后的训练数据使用OneHot编码", '-' * 5)
                print('使用OneHot编码的分箱后的训练数据的形状= {}'.format(
                    X_train_one_hot.shape))
                # print('十个使用OneHot编码的分箱后的训练数据: \n', X_train_one_hot[rand_train_number_list])

                # Map the X_test values to bins, i.e. discretize the continuous values
                X_test_bin = np.digitize(X_test, bins=bins)
                print('-' * 5, "测试数据", '-' * 5)
                import random
                rand_train_number_list = random.sample(range(0, test_number),
                                                       10)
                print('十个原始的测试数据:', X_test[rand_train_number_list].T)
                print('十个分箱后的测试数据:', X_test_bin[rand_train_number_list].T)

                # One-hot encode the discretized test data
                # Must use the encoder fitted on the training set -- never fit it again on the test data
                X_test_one_hot = encoder.transform(X_test_bin)
                print('-' * 5, "Binned test data with one-hot encoding", '-' * 5)
                print('Shape of the one-hot-encoded binned test data = {}'.format(
                    X_test_one_hot.shape))
                # print('Ten one-hot-encoded binned test samples: \n', X_test_one_hot[rand_test_number_list])

                print('-' * 40)
                # Decision tree prediction
                from sklearn.tree import DecisionTreeRegressor
                reg_dtr = DecisionTreeRegressor(min_samples_split=3)
                reg_dtr.fit(X_train_one_hot, y_train)
                print("使用决策树预测的结果R^2评价 = {}".format(
                    reg_dtr.score(X_test_one_hot, y_test)))

                # Linear regression prediction
                from sklearn.linear_model import LinearRegression
                reg_lr = LinearRegression().fit(X_train_one_hot, y_train)
                print("使用线性回归预测的结果R^2评价 = {}".format(
                    reg_lr.score(X_test_one_hot, y_test)))

                # Plot the data points
                ax.plot(X_test, reg_dtr.predict(X_test_one_hot), label='decision tree')
                ax.plot(X_test, reg_lr.predict(X_test_one_hot), label='linear regression')
                ax.plot(X_train[:, 0], y_train, 'o', c='b')
                ax.plot(X_test[:, 0], y_test, '^', c='r')
                if test_number == 200:
                    ax.set_xlabel('Input feature')
                if bin_number == 5:
                    ax.set_ylabel('Regression output')
                ax.legend(loc='best')
                ax.set_title(number_title)
                plt.suptitle("图4-2:在分箱特征上比较线性回归和决策树回归")

                print()
                pass
            pass
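# Why the encoder fitted on the training set must be reused: refitting on the
# test set can yield a different number of one-hot columns (some bins may be
# empty in one set), so the regression coefficients would no longer line up
# with the features. A minimal sketch (illustrative values):
import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse=False, categories='auto').fit([[1], [2], [3]])
print(enc.transform([[2], [3]]).shape)  # (2, 3): columns match the training layout
print(OneHotEncoder(sparse=False, categories='auto').fit_transform([[2], [3]]).shape)  # (2, 2): misaligned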
def linear_regression_binning_data():
    import random
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import OneHotEncoder
    # train_number = 100
    # test_number = 100
    # bin_number = 11

    for train_number in [500, 1000, 1500]:
        # Prepare the training data: a noisy sine wave
        from mglearn.datasets import make_wave
        X_train, y_train = make_wave(n_samples=train_number)

        # Prepare the training data: a noise-free sine wave
        # X_train = np.linspace(-3, 3, train_number).reshape(-1, 1)
        # y_train = np.sin(X_train)

        fig, axes = plt.subplots(3, 3, figsize=(20, 10))
        plt.suptitle("图4-3:使用分箱特征和单一全局斜率的线性回归\n")
        for test_number, axs in zip([50, 100, 150], axes):
            # Prepare the test data: a noisy sine wave
            from mglearn.datasets import make_wave
            X_test, y_test = make_wave(n_samples=test_number)
            # The * in zip(*sorted(zip(X_test, y_test))) unpacks the sorted list of (x, y) pairs back into separate arguments for zip
            X_test, y_test = (np.array(t)
                              for t in zip(*sorted(zip(X_test, y_test))))

            # Prepare the test data: a noise-free sine wave
            # X_test = np.linspace(-3, 3, test_number).reshape(-1, 1)
            # y_test = np.sin(X_test)

            # Map the values of X to bins, i.e. discretize the continuous values
            for bin_number, ax in zip([6, 12, 18], axs):
                number_title = "{}个训练数据;{}个测试数据;{}个箱子".format(
                    train_number, test_number, bin_number)
                show_title(number_title)

                bins = np.linspace(-3, 3, bin_number)
                # bins = np.linspace(X_train.min(), X_train.max(), bin_number)
                print("箱子的区间值:", bins)
                X_train_bin = np.digitize(X_train, bins=bins)

                # One-hot encode the discretized training data with OneHotEncoder
                encoder = OneHotEncoder(sparse=False, categories='auto')
                encoder.fit(X_train_bin)
                X_train_one_hot = encoder.transform(X_train_bin)

                # Map the X_test values to bins, i.e. discretize the continuous values
                X_test_bin = np.digitize(X_test, bins=bins)

                # One-hot encode the discretized test data
                # Must use the encoder fitted on the training set -- never fit it again on the test data
                X_test_one_hot = encoder.transform(X_test_bin)

                rand_train_number_list = random.sample(range(0, train_number), 5)
                X_train_combined = np.hstack([X_train, X_train_one_hot])

                print('-' * 50)
                print("Concatenate the raw training data with its one-hot encoding")
                print("Shape of the combined training data =", X_train_combined.shape)
                print("Five combined samples =")
                print(X_train_combined[rand_train_number_list])

                # When concatenating, keep the test columns aligned with the training columns.
                X_test_combined = np.hstack([X_test, X_test_one_hot])
                print('-' * 50)
                print("Concatenate the raw test data with its one-hot encoding")
                print("Shape of the combined test data =", X_test_combined.shape)

                # Linear regression prediction
                reg_lr = LinearRegression().fit(X_train_combined, y_train)
                print('-' * 50)
                print("Linear regression prediction: R^2 score = {}".format(
                    reg_lr.score(X_test_combined, y_test)))

                # Plot
                # Draw the bin boundaries
                for bin in bins:
                    ax.plot([bin, bin], [-3, 3], ':', c='k')
                    pass

                # Plot the data points
                ax.plot(X_train[:, 0], y_train, 'o', c='b')
                ax.plot(X_test[:, 0], y_test, '^', c='r')
                ax.plot(X_test,
                        reg_lr.predict(X_test_combined),
                        label='binned linear regression')
                ax.legend(loc='best')
                if test_number == 150:
                    ax.set_xlabel('Input feature')
                if bin_number == 6:
                    ax.set_ylabel('Regression output')
                ax.set_title(number_title)
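# The zip(*sorted(zip(X_test, y_test))) idiom above sorts the test pairs by
# feature value so the line plots are drawn left to right. An equivalent,
# arguably clearer numpy version (a standalone sketch with its own toy data):
import numpy as np
X_demo = np.array([[0.7], [-1.2], [0.1]])
y_demo = np.array([1.0, 2.0, 3.0])
order = np.argsort(X_demo[:, 0])
print(X_demo[order].ravel(), y_demo[order])  # [-1.2  0.1  0.7] [2. 3. 1.]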
Example #20
def main():  # TEST
    from mglearn.datasets import make_forge
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import train_test_split

    X, y = make_forge()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    n = 5
    md = KnnClassifier(n_neighbors=n)
    md.fit(X_train, y_train)
    y_pred_my = md.predict(X_test)

    md = KNeighborsClassifier(n_neighbors=n)
    md.fit(X_train, y_train)
    y_pred_sk = md.predict(X_test)

    ab = y_pred_my == y_pred_sk
    print(ab.all())

    ########################
    import numpy as np
    X = np.random.randint(0, 100, size=(50, 3))
    y = np.random.randint(0, 3, size=len(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    n = 7
    md = KnnClassifier(n_neighbors=n)
    md.fit(X_train, y_train)
    y_pred_my = md.predict(X_test)

    md = KNeighborsClassifier(n_neighbors=n)
    md.fit(X_train, y_train)
    y_pred_sk = md.predict(X_test)

    ab = y_pred_my == y_pred_sk
    print(ab.all())

    ############################################

    from pylab import scatter, plot, show
    from sklearn.neighbors import KNeighborsRegressor
    from mglearn.datasets import make_wave
    X, y = make_wave()
    import numpy as np
    X_test = np.arange(-3, 3, 0.001).reshape(-1, 1)

    n = 4
    md = KNeighborsRegressor(n_neighbors=n).fit(X, y)
    scatter(X.ravel(), y, marker='.', color='blue')

    y_pred = md.predict(X_test)
    plot(X_test.ravel(), y_pred, linewidth=0.4)

    md = KnnRegressor(n_neighbors=n).fit(X, y)
    y_pred = md.predict(X_test)
    plot(X_test.ravel(), y_pred, linewidth=0.6, linestyle='--', alpha=0.5, color='red')

    show()

    #------------------------------------

    X, y = np.split(np.random.randint(-10, 10, size=(10, 5)), axis=1, indices_or_sections=[4])
    y = y.ravel()

    n = 5
    md = KnnRegressor(n_neighbors=n).fit(X, y)
    X_test = np.random.randint(-10, 10, size=(25, 4))
    y_pred_my = md.predict(X_test)

    md = KNeighborsRegressor(n_neighbors=n).fit(X, y)
    y_pred_sk = md.predict(X_test)

    b = np.allclose(y_pred_my, y_pred_sk)
    print(b)
Example #21
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split

from mglearn.datasets import make_wave

X, y = make_wave(n_samples=60)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
line = np.linspace(-3, 3, 100).reshape(-1, 1)
lr = LinearRegression().fit(X_train, y_train)
print(lr.coef_[0], lr.intercept_)

plt.figure(figsize=(8, 8))
plt.plot(line, lr.predict(line))
plt.scatter(X_train, y_train)
ax = plt.gca()
ax.spines['top'].set_color('none')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['left'].set_position('center')
ax.set_ylim(-3, 3)
ax.set_title("Training score {:.2f}, test score {:.2f}.\n"
             "Least squares performs OK on small data".format(
                 lr.score(X_train, y_train), lr.score(X_test, y_test)))
ax.legend(['model', 'training data'], loc='best')
ax.grid(True)
ax.set_aspect('equal')

plt.show()
Example #22
from sklearn.neighbors import KNeighborsRegressor
from mglearn.datasets import make_wave
from util.util import plot_training_vs_test_accuracy_by_neighbors

X, y = make_wave()

plot_training_vs_test_accuracy_by_neighbors(X,
                                            y,
                                            method_class=KNeighborsRegressor,
                                            neighbors_range=range(1, 21),
                                            stratify=None,
                                            random_state=0)
Example #23
# Chapter 4, page 222 Binning

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib.pyplot as plt
from mglearn import datasets
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

X, y = datasets.make_wave(n_samples=100)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Create 1000 input values between -3 and 3, predict y for each,
# and plot the predictions of DecisionTreeRegressor and LinearRegression
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

reg_model = DecisionTreeRegressor(min_samples_split=3)
reg_model.fit(X, y)

plt.plot(line, reg_model.predict(line), label="decision tree regressor")

reg_model = LinearRegression()
reg_model.fit(X, y)

plt.plot(line, reg_model.predict(line), label="linear regression")

plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression Output")
Example #24
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import mglearn.datasets as data_sets

X, y = data_sets.make_wave(n_samples=40)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

reg = KNeighborsRegressor(n_neighbors=3)
reg.fit(X_train, y_train)
print(f'Test set prediction: \n{reg.predict(X_test)}')
print(f'Test regression score: {reg.score(X_test, y_test)}')
Example #25
class GreenStar(object):
    
    x, y = make_wave(n_samples=40)  # number of samples drawn from mglearn's make_wave dataset
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.3)

    def __init__(self):
        self._neighbors = 0  # number of nearest neighbors; the example uses 3
        self._jobs = 0  # number of CPU cores to use; -1 means use all cores

    @property
    def neighbors(self) -> int:
        return self._neighbors

    @neighbors.setter
    def neighbors(self, neighbors):
        self._neighbors = neighbors

    @property
    def jobs(self) -> int:
        return self._jobs

    @jobs.setter
    def jobs(self, jobs):
        self._jobs = jobs

    def get_knn_reg_score(self):
        knn_reg = KNeighborsRegressor(n_neighbors=self.neighbors, n_jobs=self.jobs)  # 3, -1
        knn_reg.fit(self.x_train, self.y_train)
        return knn_reg.score(self.x_test, self.y_test)

    def plot_knn_reg(self):
        _, axes = plt.subplots(1, 3)  # the figure handle is discarded into the underscore
        xtrain = self.x_train
        xtest = self.x_test
        ytrain = self.y_train
        ytest = self.y_test
        line = np.linspace(-5, 5, num=1000)
        line = line.reshape(-1, 1)
        for i, ax in zip([1, 3, 9], axes.ravel()):
            knn_reg = KNeighborsRegressor(n_neighbors=i, n_jobs=-1)
            knn_reg.fit(xtrain, ytrain)
            prediction = knn_reg.predict(line)
            ax.plot(line, prediction, label='model predict', c='k')
            ax.scatter(xtrain, ytrain, marker='^', c='darkred', label='train target')
            ax.scatter(xtest, ytest, marker='v', c='darkblue', label='test target')
            train_score = knn_reg.score(xtrain, ytrain)
            test_score = knn_reg.score(xtest, ytest)
            ax.set_title('k={}\n test score={:.3f}\n train score={:.3f}'.format(i, train_score, test_score))
            ax.set_xlabel('feature')
            ax.set_ylabel('target')
        axes[0].legend(loc=2)
        plt.show()

    @staticmethod
    def main():
        knn = GreenStar()
        while True:
            menu = input('0.Exit\n1.Plot\n2.Score\n3.Plot kNN subplots\n')
            if menu == '0':
                break
            elif menu == '1':
                knn.neighbors = int(input('Please Enter a Neighbors Value.'))
                mglearn.plots.plot_knn_regression(n_neighbors=knn.neighbors)
                plt.show()
            elif menu == '2':
                knn.neighbors = int(input('Please Enter a Neighbors Value.'))
                knn.jobs = int(input('Please Enter a Jobs Value.'))
                score = knn.get_knn_reg_score()
                print("{:.3f}".format(score))  # 0.697
            elif menu == '3':
                knn.plot_knn_reg()
            else:
                print('Wrong number. Enter another number.')