# Example #1
# 0
def regress_by_random_forest():
    """Fit a random forest regressor on the full housing dataset and report errors.

    Trains on a 60/40 train/test split, prints MSE and R^2 for both splits,
    then plots the residuals over the combined data.
    """
    # load all features and the target variable
    data = HousingData(None)
    X, y = data.X, data.y

    # hold out 40% of the samples for testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=1)
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    # train a 1000-tree forest on the training split, using all CPU cores
    regressor = RandomForestRegressor(
        n_estimators=1000, random_state=1, n_jobs=-1).fit(X_train, y_train)

    # report MSE and R^2 on both splits
    for scorer, label in ((mean_squared_error, 'MSE'), (r2_score, 'R2')):
        print('[{scorer_name}]'.format(scorer_name=label))
        print('training data:{score:.3f}'.format(
            score=scorer(y_train, regressor.predict(X_train))))
        print('test data:{score:.3f}'.format(
            score=scorer(y_test, regressor.predict(X_test))))

    # residual plot; test samples occupy the tail of the combined arrays
    plot_residuals(X_combined, y_combined, regressor,
                   test_idx=range(len(y_train), len(y)))
def main():
    """Fit a RANSAC linear regressor on RM vs MEDV and visualize inliers/outliers."""
    # load the RM feature and the target variable
    data = HousingData(['RM'])
    X, y = data.X, data.y

    # robust linear fit: up to 100 trials on 50-sample subsets; points within
    # 5.0 absolute residual of the candidate line count as inliers
    ransac = RANSACRegressor(LinearRegression(),
                             max_trials=100,
                             min_samples=50,
                             loss='absolute_loss',
                             residual_threshold=5.0,
                             random_state=0)
    ransac.fit(X, y)

    # show prediction over the raw data
    plot_predictions(X.flatten(), y, ransac, xlabel='RM', ylabel='MEDV')

    # scatter the inlier/outlier partition together with the fitted line
    inlier_mask = ransac.inlier_mask_
    outlier_mask = ~inlier_mask
    line_X = np.arange(3, 10, 1)
    line_y = ransac.predict(line_X[:, np.newaxis])
    plt.scatter(X[inlier_mask], y[inlier_mask],
                c='steelblue', edgecolor='white',
                marker='o', label='inliers')
    plt.scatter(X[outlier_mask], y[outlier_mask],
                c='limegreen', edgecolor='white',
                marker='s', label='outliers')
    plt.plot(line_X, line_y, color='black')
    plt.xlabel('RM')
    plt.ylabel('MEDV')
    plt.legend()
    plt.show()

    # predict MEDV for a single non-standardized sample (RM = 5.0)
    medv = ransac.predict([[5.0]])[0]
    print('RM = 5.0 -> MEDV = {:.3e}'.format(medv))

    # weights of the underlying linear model
    print('intercept = {i:.3e}, slope = {s:.3e}'.format(
        i=ransac.estimator_.intercept_, s=ransac.estimator_.coef_[0]))
# Example #3
# 0
def regress_by_decision_tree():
    """Fit a depth-3 decision tree on LSTAT vs MEDV and plot its prediction."""
    # load the LSTAT feature and the target variable
    data = HousingData(['LSTAT'])
    X, y = data.X, data.y

    # depth limit keeps the piecewise-constant fit readable
    tree = DecisionTreeRegressor(max_depth=3).fit(X, y)

    # sort samples by LSTAT so the prediction curve is drawn left-to-right
    order = X.flatten().argsort()
    plot_predictions(X[order].flatten(), y[order], tree,
                     xlabel='LSTAT', ylabel='MEDV')
# Example #4
# 0
def show_residual():
    """Fit ordinary linear regression on all features and plot its residuals."""
    # load all features and the target variable
    data = HousingData(None)
    X, y = data.X, data.y

    # 70/30 train/test split; residual plot spans the combined set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    model = LinearRegression().fit(X_train, y_train)

    # test samples occupy the tail of the combined arrays
    plot_residuals(X_combined, y_combined, model,
                   test_idx=range(len(y_train), len(y)),
                   xlabel='RM', ylabel='MEDV')
# Example #5
# 0
def show_prediction():
    """Fit linear regression on RM vs MEDV, plot it, and print a sample prediction."""
    # load the RM feature and the target variable
    data = HousingData(['RM'])
    X, y = data.X, data.y

    model = LinearRegression().fit(X, y)

    # prediction curve over the training data
    plot_predictions(X.flatten(), y, model, xlabel='RM', ylabel='MEDV')

    # predict MEDV for a single non-standardized sample (RM = 5.0)
    medv = model.predict([[5.0]])[0]
    print('RM = 5.0 -> MEDV = {:.3e}'.format(medv))

    # fitted weights
    print('intercept = {i:.3e}, slope = {s:.3e}'.format(
        i=model.intercept_, s=model.coef_[0]))
# Example #6
# 0
def main():
    """Compare ridge, LASSO and elastic-net regressors on the housing data.

    For each model: plot residuals over the combined data and print MSE
    and R^2 on both splits.
    """
    # load all features and the target variable
    data = HousingData(None)
    X, y = data.X, data.y

    # 70/30 train/test split; residual plots span the combined set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    # one regularized linear model per penalty type, all fit on the training split
    models = [
        ('ridge', Ridge(alpha=1.0).fit(X_train, y_train)),
        ('LASSO', Lasso(alpha=1.0).fit(X_train, y_train)),
        ('elastic net', ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train, y_train)),
    ]

    for name, model in models:
        # residual plot; test samples occupy the tail of the combined arrays
        plot_residuals(X_combined, y_combined, model,
                       test_idx=range(len(y_train), len(y)),
                       xlabel='RM', ylabel='MEDV', title=name)

        # scores of this model
        print('<{}>'.format(name))
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        # mean squared error on both splits
        mse_train = mean_squared_error(y_train, y_pred_train)
        mse_test = mean_squared_error(y_test, y_pred_test)
        print('[MSE] train:{0:.3f} / test:{1:.3f}'.format(mse_train, mse_test))

        # R^2 score on both splits
        r2_train = r2_score(y_train, y_pred_train)
        r2_test = r2_score(y_test, y_pred_test)
        print('[R^2 score] train:{0:.3f} / test:{1:.3f}'.format(r2_train, r2_test))
# Example #7
# 0
def show_metrics():
    """Fit linear regression on all features and print MSE and R^2 for both splits."""
    # load all features and the target variable
    data = HousingData(None)
    X, y = data.X, data.y

    # 70/30 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # fit and predict on both splits
    model = LinearRegression().fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # mean squared error on both splits
    print('[MSE] train:{0:.3f} / test:{1:.3f}'.format(
        mean_squared_error(y_train, y_pred_train),
        mean_squared_error(y_test, y_pred_test)))

    # R^2 score on both splits
    print('[R^2 score] train:{0:.3f} / test:{1:.3f}'.format(
        r2_score(y_train, y_pred_train),
        r2_score(y_test, y_pred_test)))
# Example #8
# 0
def main():
    """Compare gradient-descent linear regressors on standardized RM vs MEDV."""
    # load the RM feature and the target variable
    data = HousingData(['RM'])
    X, y = data.X, data.y

    # standardize both input and target (the scaler needs a 2-D view of y)
    sc_x = StandardScaler().fit(X)
    sc_y = StandardScaler().fit(y[:, np.newaxis])
    X_std = sc_x.transform(X)
    y_std = sc_y.transform(y[:, np.newaxis]).flatten()

    # batch gradient descent at two learning rates, plus stochastic GD
    regressors = [
        LinearRegressionGD(eta=0.01, n_iter=20).fit(X_std, y_std),
        LinearRegressionGD(eta=0.001, n_iter=20).fit(X_std, y_std),
        LinearRegressionSGD(eta=0.01, n_iter=100).fit(X_std, y_std),
    ]

    for model in regressors:
        # cost history over the training iterations
        plot_update_history(model)

        # prediction in standardized space, titled with the learning rate
        plot_predictions(
            X_std.flatten(), y_std, model,
            xlabel='RM (standardized)', ylabel='MEDV (standardized)',
            title=r'$\eta$ = {}'.format(model.eta))

        # predict MEDV for RM = 5.0: standardize input, de-standardize output
        rm_std = sc_x.transform([[5.0]])
        medv = sc_y.inverse_transform(model.predict(rm_std))[0]
        print('RM = 5.0 -> MEDV = {:.3e}'.format(medv))

        # learned weights: w_[0] printed as intercept, w_[1] as slope
        print('intercept = {i:.3e}, slope = {s:.3e}'.format(
            i=model.w_[0], s=model.w_[1]))
def regress_housing_data():
    """Compare polynomial fits of LSTAT vs MEDV, then a log/sqrt-transformed fit.

    First plots linear, quadratic and cubic fits with their R^2 scores;
    then fits a line in (log LSTAT, sqrt MEDV) space and maps the prediction
    back to the original space.
    """
    # load the LSTAT feature and the target variable
    data = HousingData(['LSTAT'])
    X, y = data.X, data.y

    # one feature transformer + linear model per polynomial degree
    transformers = [
        PolynomialFeatures(degree=d).fit_transform for d in (1, 2, 3)
    ]
    regressors = [
        LinearRegression().fit(transform(X), y) for transform in transformers
    ]
    regressor_names = ['linear', 'polynomial (d=2)', 'polynomial (d=3)']

    # R^2 of each fit on the training data
    print('[R2 score]')
    for transform, regressor, name in zip(transformers, regressors,
                                          regressor_names):
        score = r2_score(y, regressor.predict(transform(X)))
        print('{name}:{score:.3f}'.format(name=name, score=score))

    # scatter the training data, then overlay each model's prediction curve
    plt.scatter(X.flatten(), y, label='training data', edgecolor='white')
    X_pred = np.arange(X.min(), X.max(), 1)[:, np.newaxis]
    for transform, regressor, name in zip(transformers, regressors,
                                          regressor_names):
        plt.plot(X_pred.flatten(), regressor.predict(transform(X_pred)),
                 label=name)
    plt.xlabel('LSTAT')
    plt.ylabel('MEDV')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # linear fit after transforming to log(LSTAT) vs sqrt(MEDV)
    X_log = np.log(X)
    y_sqrt = np.sqrt(y)
    regressor = LinearRegression().fit(X_log, y_sqrt)

    # R^2 in the original space (predictions squared back) and transformed space
    print('[R2 score]')
    print('original space:{score:.3f}'.format(
        score=r2_score(y, (regressor.predict(X_log))**2)))
    print('transformed space:{score:.3f}'.format(
        score=r2_score(y_sqrt, regressor.predict(X_log))))

    # prediction curve mapped back to the original space
    X_pred = np.log(np.arange(X.min(), X.max(), 1))[:, np.newaxis]
    y_pred = regressor.predict(X_pred)
    plt.scatter(X.flatten(), y, label='training data', edgecolor='white')
    plt.plot(np.exp(X_pred), y_pred**2, label='prediction')
    plt.xlabel('LSTAT')
    plt.ylabel('MEDV')
    plt.legend()
    plt.tight_layout()
    plt.show()