def regress_by_random_forest():
    """Fit a random forest on the full housing feature set and report scores.

    Prints MSE and R2 for the training and test splits, then draws a
    residual plot over the combined data.
    """
    # None selects the full feature set of the housing data.
    data = HousingData(None)
    X, y = data.X, data.y

    # Hold out 40% of the samples for testing; the combined arrays feed
    # the residual plot, with the test samples appended after the train ones.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=1)
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    # Train the forest on the training split only.
    regressor = RandomForestRegressor(
        n_estimators=1000, random_state=1, n_jobs=-1).fit(X_train, y_train)

    # Report each metric on both splits.
    for scorer_name, score in (('MSE', mean_squared_error), ('R2', r2_score)):
        print('[{scorer_name}]'.format(scorer_name=scorer_name))
        print('training data:{score:.3f}'.format(
            score=score(y_train, regressor.predict(X_train))))
        print('test data:{score:.3f}'.format(
            score=score(y_test, regressor.predict(X_test))))

    # Test samples occupy the tail of the combined arrays.
    plot_residuals(X_combined, y_combined, regressor,
                   test_idx=range(len(y_train), len(y)))
def main():
    """Robust-fit MEDV on RM with RANSAC and visualize inliers vs outliers."""
    data = HousingData(['RM'])
    X, y = data.X, data.y

    # RANSAC repeatedly fits OLS on random 50-sample subsets and keeps the
    # consensus model; residuals within 5.0 mark a sample as an inlier.
    # NOTE(review): loss='absolute_loss' was renamed to 'absolute_error' in
    # scikit-learn 1.0 and removed in 1.2 -- confirm the pinned version.
    ransac = RANSACRegressor(LinearRegression(),
                             max_trials=100,
                             min_samples=50,
                             loss='absolute_loss',
                             residual_threshold=5.0,
                             random_state=0)
    ransac.fit(X, y)

    plot_predictions(X.flatten(), y, ransac, xlabel='RM', ylabel='MEDV')

    # Scatter the consensus set against the rejected points, with the
    # fitted line drawn on top over RM in [3, 10).
    inliers = ransac.inlier_mask_
    outliers = np.logical_not(inliers)
    grid = np.arange(3, 10, 1)
    fit_line = ransac.predict(grid[:, np.newaxis])
    plt.scatter(X[inliers], y[inliers], c='steelblue',
                edgecolor='white', marker='o', label='inliers')
    plt.scatter(X[outliers], y[outliers], c='limegreen',
                edgecolor='white', marker='s', label='outliers')
    plt.plot(grid, fit_line, color='black')
    plt.xlabel('RM')
    plt.ylabel('MEDV')
    plt.legend()
    plt.show()

    # One un-standardized sample prediction, then the fitted weights of the
    # underlying OLS estimator.
    sample = [[5.0]]
    medv = ransac.predict(sample)[0]
    print('RM = 5.0 -> MEDV = {:.3e}'.format(medv))
    print('intercept = {i:.3e}, slope = {s:.3e}'.format(
        i=ransac.estimator_.intercept_, s=ransac.estimator_.coef_[0]))
def regress_by_decision_tree():
    """Fit a depth-3 decision tree of MEDV on LSTAT and plot the fit."""
    data = HousingData(['LSTAT'])
    X, y = data.X, data.y

    # Shallow tree keeps the piecewise-constant fit interpretable.
    tree = DecisionTreeRegressor(max_depth=3).fit(X, y)

    # Sort by LSTAT so the step-function prediction draws left to right.
    order = X.flatten().argsort()
    plot_predictions(X[order].flatten(), y[order], tree,
                     xlabel='LSTAT', ylabel='MEDV')
def show_residual():
    """Fit OLS on the full housing feature set and draw a residual plot."""
    # None selects the full feature set of the housing data.
    data = HousingData(None)
    X, y = data.X, data.y

    # 70/30 split; combined arrays carry train samples first, test after.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    X_all = np.vstack((X_train, X_test))
    y_all = np.hstack((y_train, y_test))

    model = LinearRegression().fit(X_train, y_train)

    # NOTE(review): labels say RM/MEDV although all features are used here --
    # presumably plot_residuals uses its own axes; verify against its impl.
    plot_residuals(
        X_all, y_all, model,
        test_idx=range(len(y_train), len(y)),
        xlabel='RM', ylabel='MEDV')
def show_prediction():
    """Fit OLS of MEDV on RM, plot the fit, and print a sample prediction."""
    data = HousingData(['RM'])
    X, y = data.X, data.y

    model = LinearRegression().fit(X, y)

    plot_predictions(
        X.flatten(), y, model,
        xlabel='RM', ylabel='MEDV')

    # Predict one un-standardized sample (RM = 5.0 rooms).
    sample = [[5.0]]
    predicted = model.predict(sample)[0]
    print('RM = 5.0 -> MEDV = {:.3e}'.format(predicted))

    # The fitted line's weights.
    print('intercept = {i:.3e}, slope = {s:.3e}'.format(i=model.intercept_,
                                                        s=model.coef_[0]))
def main():
    """Compare Ridge, LASSO and ElasticNet on the housing data.

    For each regularized model: draw a residual plot, then print MSE and
    R^2 on the training and test splits.
    """
    # None selects the full feature set of the housing data.
    data = HousingData(None)
    X, y = data.X, data.y

    # 70/30 split; combined arrays carry train samples first, test after.
    X_combined_parts = train_test_split(X, y, test_size=0.3, random_state=0)
    X_train, X_test, y_train, y_test = X_combined_parts
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    # (name, fitted model) pairs, all trained on the training split with the
    # same regularization strength.
    fitted = [
        ('ridge', Ridge(alpha=1.0).fit(X_train, y_train)),
        ('LASSO', Lasso(alpha=1.0).fit(X_train, y_train)),
        ('elastic net',
         ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train, y_train)),
    ]

    for name, model in fitted:
        # NOTE(review): labels say RM/MEDV although all features are used --
        # presumably plot_residuals uses its own axes; verify.
        plot_residuals(
            X_combined, y_combined, model,
            test_idx=range(len(y_train), len(y)),
            xlabel='RM', ylabel='MEDV', title=name)

        print('<{}>'.format(name))
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)

        # Mean squared error on both splits.
        print('[MSE] train:{0:.3f} / test:{1:.3f}'.format(
            mean_squared_error(y_train, pred_train),
            mean_squared_error(y_test, pred_test)))

        # R^2 on both splits.
        print('[R^2 score] train:{0:.3f} / test:{1:.3f}'.format(
            r2_score(y_train, pred_train),
            r2_score(y_test, pred_test)))
def show_metrics():
    """Fit OLS on the housing data and print MSE / R^2 for both splits."""
    # None selects the full feature set of the housing data.
    data = HousingData(None)
    X, y = data.X, data.y

    # 70/30 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    model = LinearRegression().fit(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    # Mean squared error on both splits.
    print('[MSE] train:{0:.3f} / test:{1:.3f}'.format(
        mean_squared_error(y_train, pred_train),
        mean_squared_error(y_test, pred_test)))

    # R^2 on both splits.
    print('[R^2 score] train:{0:.3f} / test:{1:.3f}'.format(
        r2_score(y_train, pred_train),
        r2_score(y_test, pred_test)))
def main():
    """Fit gradient-descent linear regressors on standardized RM/MEDV data.

    For each model: plot the cost history and the fitted line, then print
    one un-standardized sample prediction and the learned weights.
    """
    data = HousingData(['RM'])
    X, y = data.X, data.y

    # Standardize both feature and target; gradient descent converges
    # poorly on unscaled data.
    sc_x = StandardScaler().fit(X)
    sc_y = StandardScaler().fit(y[:, np.newaxis])
    X_std = sc_x.transform(X)
    y_std = sc_y.transform(y[:, np.newaxis]).flatten()

    # Batch GD at two learning rates, plus stochastic GD.
    models = [
        LinearRegressionGD(eta=0.01, n_iter=20).fit(X_std, y_std),
        LinearRegressionGD(eta=0.001, n_iter=20).fit(X_std, y_std),
        LinearRegressionSGD(eta=0.01, n_iter=100).fit(X_std, y_std),
    ]

    for model in models:
        # Cost history shows whether the chosen eta converges.
        plot_update_history(model)

        plot_predictions(
            X_std.flatten(), y_std, model,
            xlabel='RM (standardized)', ylabel='MEDV (standardized)',
            title=r'$\eta$ = {}'.format(model.eta))

        # Predict in standardized space, then map back to MEDV units.
        # NOTE(review): inverse_transform on a 1-D array works only on
        # older scikit-learn; newer versions expect 2-D -- confirm version.
        rm_std = sc_x.transform([[5.0]])
        medv_std = model.predict(rm_std)
        medv = sc_y.inverse_transform(medv_std)[0]
        print('RM = 5.0 -> MEDV = {:.3e}'.format(medv))

        # w_[0] is the bias term, w_[1] the RM weight.
        print('intercept = {i:.3e}, slope = {s:.3e}'.format(i=model.w_[0],
                                                            s=model.w_[1]))
def regress_housing_data():
    """Regress MEDV on LSTAT with polynomial and log/sqrt-transformed models.

    First compares linear and polynomial (degree 2 and 3) fits, printing
    their training R^2 scores and plotting the prediction curves; then fits
    a linear model in (log LSTAT, sqrt MEDV) space and reports R^2 in both
    the original and the transformed space.
    """
    data = HousingData(['LSTAT'])
    X, y = data.X, data.y

    # Fit each polynomial feature map ONCE on the training data and reuse
    # the fitted transformer afterwards. (The previous code stored bound
    # fit_transform methods, which silently RE-FIT the transformer on every
    # later call -- harmless for PolynomialFeatures, but incorrect
    # fit-once/transform-many usage in general.)
    feature_maps = [PolynomialFeatures(degree=d).fit(X) for d in (1, 2, 3)]
    regressors = [
        LinearRegression().fit(fm.transform(X), y) for fm in feature_maps
    ]
    regressor_names = ['linear', 'polynomial (d=2)', 'polynomial (d=3)']

    # Training-data R^2 for each model.
    print('[R2 score]')
    for fm, regressor, regressor_name in zip(feature_maps, regressors,
                                             regressor_names):
        score = r2_score(y, regressor.predict(fm.transform(X)))
        print('{name}:{score:.3f}'.format(name=regressor_name, score=score))

    # Training data plus each model's prediction curve on an LSTAT grid.
    plt.scatter(X.flatten(), y, label='training data', edgecolor='white')
    X_pred = np.arange(X.min(), X.max(), 1)[:, np.newaxis]
    for fm, regressor, regressor_name in zip(feature_maps, regressors,
                                             regressor_names):
        y_pred = regressor.predict(fm.transform(X_pred))
        plt.plot(X_pred.flatten(), y_pred, label=regressor_name)
    plt.xlabel('LSTAT')
    plt.ylabel('MEDV')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Linearize the relationship instead: fit OLS of sqrt(MEDV) on
    # log(LSTAT).
    X_log = np.log(X)
    y_sqrt = np.sqrt(y)
    regressor = LinearRegression().fit(X_log, y_sqrt)

    # Score in the original MEDV space (squaring the prediction undoes the
    # sqrt) and in the transformed space.
    print('[R2 score]')
    score_original = r2_score(y, (regressor.predict(X_log))**2)
    score_transform = r2_score(y_sqrt, regressor.predict(X_log))
    print('original space:{score:.3f}'.format(score=score_original))
    print('transformed space:{score:.3f}'.format(score=score_transform))

    # Prediction curve mapped back to the original axes (exp on x,
    # square on y).
    X_pred = np.log(np.arange(X.min(), X.max(), 1))[:, np.newaxis]
    y_pred = regressor.predict(X_pred)
    plt.scatter(X.flatten(), y, label='training data', edgecolor='white')
    plt.plot(np.exp(X_pred), y_pred**2, label='prediction')
    plt.xlabel('LSTAT')
    plt.ylabel('MEDV')
    plt.legend()
    plt.tight_layout()
    plt.show()