def test_boston_housing_regression():
    """Check XGBRegressor prediction quality on the Boston housing data.

    A shuffled 2-fold split is used.  For every fold a default
    ``XGBRegressor`` is fitted on the training part and the held-out part
    is predicted four ways (default, and three ``output_margin`` /
    ``ntree_limit`` combinations); each prediction must stay below a
    fixed mean-squared-error bound.
    """
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    dataset = load_boston()
    y = dataset['target']
    X = dataset['data']

    # `rng` and `xgb` are module-level names defined outside this block.
    splitter = KFold(n_splits=2, shuffle=True, random_state=rng)
    for fit_rows, eval_rows in splitter.split(X, y):
        regressor = xgb.XGBRegressor().fit(X[fit_rows], y[fit_rows])
        held_out = X[eval_rows]

        # Exercise the optional arguments of predict().
        default_preds = regressor.predict(held_out)
        margin_3_trees = regressor.predict(
            held_out, output_margin=True, ntree_limit=3)
        margin_no_limit = regressor.predict(
            held_out, output_margin=True, ntree_limit=0)
        value_3_trees = regressor.predict(
            held_out, output_margin=False, ntree_limit=3)

        expected = y[eval_rows]
        # Predictions truncated to 3 trees get a much looser bound.
        assert mean_squared_error(default_preds, expected) < 25
        assert mean_squared_error(margin_3_trees, expected) < 350
        assert mean_squared_error(margin_no_limit, expected) < 25
        assert mean_squared_error(value_3_trees, expected) < 350
def compute_ALS(R, n_iter, lambda_, k):
    '''Approximate the utility matrix R with alternating least squares.

    A random user-factor matrix X (m x k) and a random movie-factor
    matrix Y (k x n) are created, then updated in turn for ``n_iter``
    rounds.

    R(ndarray) : utility matrix of shape (m, n).
    n_iter(int) : number of times X and Y are updated.
    lambda_(float) : regularization parameter.
    k(int) : number of latent factors (columns of X / rows of Y).

    Returns a tuple ``(R_hat, errors)`` where ``R_hat`` is the final
    product X.Y and ``errors`` holds the mean squared error between R and
    X.Y recorded after every update round.
    '''
    n_users, n_items = R.shape
    X = np.random.rand(n_users, k)
    Y = np.random.rand(k, n_items)

    # lambda * I is the same in every update, so build the k x k term once.
    ridge = lambda_ * np.eye(k)

    # Error measured after each update round.
    errors = []
    for step in range(n_iter):
        # Implements [Eq. 6-4]: solve for X with Y fixed, then for Y with
        # the freshly updated X.
        X = np.linalg.solve(np.dot(Y, Y.T) + ridge, np.dot(Y, R.T)).T
        Y = np.linalg.solve(np.dot(X.T, X) + ridge, np.dot(X.T, R))
        errors.append(mean_squared_error(R, np.dot(X, Y)))
        if step % 10 == 0:
            print('iteration %d is completed'%(step))

    R_hat = np.dot(X, Y)
    print('Error of rated movies: %.5f'%(mean_squared_error(R, R_hat)))
    return R_hat, errors
def exercise_2b():
    """Plot the variance of the k-NN error rate for k = 1..49.

    Synthetic blob data is generated, and for every ``k`` a
    ``KNeighborsClassifier`` is fitted on each shuffle split.  The mean
    and variance of the per-split error rate (``1 - score``) are stored;
    only the variance column is plotted.
    """
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2,
                      random_state=0)
    # NOTE(review): iterating `kf` directly presumably relies on the legacy
    # sklearn.cross_validation.ShuffleSplit, where the first argument is the
    # number of samples (100 here, although X has 1000 rows) - confirm
    # against the file's imports.
    kf = ShuffleSplit(100, train_size=0.9, test_size=0.1, random_state=0)

    # Column 0: mean error rate per k; column 1: variance of the error rate.
    accuracy_lst = np.zeros([49, 2], dtype=float)
    # One slot per split; sized for 10 splits.
    accuracy_current = np.zeros(10, dtype=float)

    for k in range(1, 50):
        for fold, (train_index, test_index) in enumerate(kf):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X_train, y_train)
            # Despite the variable name this stores the error rate.
            accuracy_current[fold] = (1. - clf.score(X_test, y_test))
            # NOTE(review): the original indentation was lost; the print is
            # assumed to run once per split.
            print(mean_squared_error(y_test, clf.predict(X_test)))
        accuracy_lst[k - 1, 0] = accuracy_current.mean()
        accuracy_lst[k - 1, 1] = accuracy_current.var()

    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 1], '#009999', marker='o')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K')
    plt.ylabel('Variance')
    plt.show()
def test_regression():
    """Fit a line to noisy cubic data, then plot a correlation matrix.

    Part 1 fits ``LinearRegression`` to 40 random points of
    ``y = x**3 + noise``, plots the fit and prints its mean squared error.
    Part 2 prints and plots the correlation matrix of the module-level
    ``data`` array, labelling the axes with the four iris feature names.

    ``plot``, ``show`` and ``data`` are module-level names defined outside
    this block.
    """
    from numpy.random import rand
    x = rand(40, 1)  # explanatory variable
    y = x * x * x + rand(40, 1) / 5  # dependent variable

    from sklearn.linear_model import LinearRegression
    linreg = LinearRegression()
    linreg.fit(x, y)

    from numpy import linspace
    xx = linspace(0, 1, 40)
    # A plain (40, 1) ndarray is used instead of numpy.matrix, which current
    # scikit-learn releases reject as estimator input.
    plot(x, y, 'o', xx, linreg.predict(xx.reshape(-1, 1)), '--r')
    show()

    from sklearn.metrics import mean_squared_error
    print(mean_squared_error(linreg.predict(x), y))

    from numpy import corrcoef
    corr = corrcoef(data.T)  # .T gives the transpose
    print(corr)

    from pylab import pcolor, colorbar, xticks, yticks
    # Fixed: the original imported the non-existent name `arrange`.
    from numpy import arange
    pcolor(corr)
    colorbar()

    # Arrange the names of the variables on both axes.
    feature_names = ['sepal length', 'sepal width',
                     'petal length', 'petal width']
    xticks(arange(0.5, 4.5), feature_names, rotation=-20)
    yticks(arange(0.5, 4.5), feature_names, rotation=-20)
    show()
def simple_cv(valence_regressors, arousal_regressors, valence_movie_matrices,
              arousal_movie_matrices, valence_labels_movies,
              arousal_labels_movies, threshold, valence_movie_t,
              arousal_movie_t):
    """Run 10 folds of valence/arousal regression and print RMSE and r.

    For each fold ``fold_training`` returns the test predictions, which are
    appended to the running prediction vectors.  After all folds the root
    mean squared error and the Pearson correlation against the joined
    labels are printed for valence and then for arousal.

    ``join_vectors``, ``threshold_n_features`` and ``fold_training`` are
    helpers defined outside this block; every argument is forwarded to
    them unchanged.
    """
    n_train_matrices = 21
    n_valid_matrices = 6
    n_test_matrices = 3

    valence_labels = join_vectors(valence_labels_movies)
    arousal_labels = join_vectors(arousal_labels_movies)
    print('{} {}'.format(len(valence_labels), len(arousal_labels)))

    n_valence_features, n_arousal_features = threshold_n_features(
        threshold, valence_movie_t, arousal_movie_t)

    valence_predictions = np.array([], dtype='float')
    arousal_predictions = np.array([], dtype='float')
    for i in range(0, 10):
        valence_test_predictions, arousal_test_predictions = fold_training(
            valence_predictions, arousal_predictions, i,
            valence_regressors, arousal_regressors,
            valence_movie_matrices, arousal_movie_matrices,
            valence_labels_movies, arousal_labels_movies,
            n_test_matrices, n_train_matrices, n_valid_matrices,
            n_valence_features, n_arousal_features)
        valence_predictions = np.append(valence_predictions,
                                        valence_test_predictions)
        arousal_predictions = np.append(arousal_predictions,
                                        arousal_test_predictions)

    # One formatted string per line keeps the output identical on
    # Python 2 and Python 3.
    print('{} {}'.format(
        math.sqrt(mean_squared_error(valence_labels, valence_predictions)),
        np.corrcoef(valence_labels, valence_predictions)[0][1]))
    print('{} {}'.format(
        math.sqrt(mean_squared_error(arousal_labels, arousal_predictions)),
        np.corrcoef(arousal_labels, arousal_predictions)[0][1]))
def test_als_warm_start():
    """Check that 5 + 5 warm-started ALS iterations equal 10 cold ones.

    An ``FMRegression`` trained for 10 iterations must give exactly the
    same test error as one trained for 5 iterations and then continued
    with ``n_more_iter=5``.
    """
    X, y, coef = make_user_item_regression(label_stdev=0)
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # older scikit-learn without model_selection
        from sklearn.cross_validation import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    # Reference: 10 iterations in one go.
    fm = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X_train, y_train)
    y_pred = fm.predict(X_test)
    error_10_iter = mean_squared_error(y_pred, y_test)

    # Warm start: 5 iterations ...
    fm = als.FMRegression(n_iter=5, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X_train, y_train)
    print(fm.iter_count)
    y_pred = fm.predict(X_test)
    error_5_iter = mean_squared_error(y_pred, y_test)

    # ... followed by 5 more on the same model (X_train is already CSC).
    fm.fit(X_train, y_train, n_more_iter=5)
    print(fm.iter_count)
    y_pred = fm.predict(X_test)
    error_5_iter_plus_5 = mean_squared_error(y_pred, y_test)

    print('{} {} {}'.format(error_5_iter, error_5_iter_plus_5,
                            error_10_iter))
    # Exact equality is intentional: the warm start must reproduce the
    # 10-iteration run, not merely approximate it.
    assert error_10_iter == error_5_iter_plus_5
def test_regression_synthetic():
    """Test on synthetic regression datasets used in Leo Breiman,
    `Bagging Predictors`. Machine Learning 24(2): 123-140 (1996).

    Each Friedman dataset has 1200 samples: the first 200 train a
    ``GradientBoostingRegressor`` and the remaining 1000 must be predicted
    with a mean squared error below a dataset-specific bound.
    """
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 1, 'learning_rate': 0.1,
                         'loss': 'ls'}

    def _holdout_mse(X, y, **gbr_params):
        # Fit on the first 200 rows and score on the rest.
        model = GradientBoostingRegressor(**gbr_params)
        model.fit(X[:200], y[:200])
        return mean_squared_error(y[200:], model.predict(X[200:]))

    # Friedman1 (noisy) is fitted with the estimator's default parameters.
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    mse = _holdout_mse(X, y)
    assert mse < 5.0, "Failed on Friedman1 with mse = %.4f" % mse

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    mse = _holdout_mse(X, y, **regression_params)
    assert mse < 1700.0, "Failed on Friedman2 with mse = %.4f" % mse

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    mse = _holdout_mse(X, y, **regression_params)
    assert mse < 0.015, "Failed on Friedman3 with mse = %.4f" % mse
def model_metrics(model, X, y, data_split):
    """Print error metrics and R^2-style scores for a fitted model.

    Args:
        model: fitted estimator exposing ``score(X, y)``.
        X: full feature table; it is indexed with the module-level ``cols``
            and NaNs are filled with 0 before scoring.
        y: full target matching ``X``.
        data_split: mapping with the keys ``'X_train'``, ``'X_test'``,
            ``'y_train'``, ``'y_test'`` and ``'y_pred'``.

    Returns:
        tuple: ``(score_test, score_general)``.
    """
    rule = '-----------------------------------------'
    print(rule)
    print('Metrics:')
    print(rule)

    y_test = data_split['y_test']
    y_pred = data_split['y_pred']
    X_train = data_split['X_train']
    X_test = data_split['X_test']
    y_train = data_split['y_train']

    # The value follows the tab directly, matching what the Python 2 print
    # statement emitted (it writes no space after a tab).
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('MSE\t{}'.format(mse))
    print('RMSE\t{}'.format(np.sqrt(mse)))

    score_train = model.score(X_train, y_train)
    score_test = model.score(X_test, y_test)
    score_general = model.score(X[cols].fillna(0), y)

    print('\n')
    print(rule)
    print('Scores:')
    print(rule)
    print('Train\t{}'.format(score_train))
    print('Test\t{}'.format(score_test))
    print('General\t{}'.format(score_general))
    print(rule + '\n')

    return score_test, score_general
def gradient_boosting(features_values_temp, rows_temp, columns_temp,
                      prediction_values_temp, kernel, threshold):
    """Fit a gradient-boosting regressor on Friedman #1 and return its MSE.

    The caller-supplied values are normalised into local structures, but
    the model is currently trained and scored on the synthetic
    ``make_friedman1`` data only (first 200 samples train, the rest test).

    Args:
        features_values_temp: iterable of feature values.
        rows_temp: row count; counted up one by one into a plain int.
        columns_temp: column count, handled like ``rows_temp``.
        prediction_values_temp: iterable of target values.
        kernel: SVR kernel name (linear, poly, rbf, sigmoid, precomputed).
        threshold: value convertible to ``float``.

    Returns:
        The test mean squared error of the fitted regressor.  The original
        computed this value and discarded it, returning ``None``.
    """
    def _count_up(remaining):
        # Count by repeated decrement, exactly as the original loops did;
        # `remaining` is rebound rather than modified in place.
        count = 0
        while remaining > 0:
            count = count + 1
            remaining = remaining - 1
        return count

    rows = _count_up(rows_temp)
    columns = _count_up(columns_temp)
    features_values = list(features_values_temp)
    prediction_values = list(prediction_values_temp)

    # NOTE(review): the values below are prepared but not used by the model
    # that follows; they are kept because the helper's side effects are not
    # visible from here.
    rotated = convert_list_to_matrix(features_values, rows, columns)
    scores = np.array(prediction_values)
    threshold = float(threshold)
    # Try changing this to the model the test is going to run for
    # (lasso, ridge, etc.).
    estimator = SVR(kernel=kernel)

    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                    max_depth=1, random_state=0,
                                    loss='ls').fit(X_train, y_train)
    return mean_squared_error(y_test, est.predict(X_test))
def train_test(features,labels,features_test,labels_test):
    """Fit a skflow linear regressor and report MSE / RMSLE on test data.

    Shapes of the four inputs are logged, the regressor is trained on
    ``features``/``labels``, negative predictions are clipped to 0, and
    both metrics are logged for the raw and the rounded predictions.
    All reporting goes through the external ``verbose`` helper; nothing is
    returned.  An earlier xgboost-based variant existed only as
    commented-out code and is not part of the code path.
    """
    for caption, array in (("Features size", features),
                           ("Labels size", labels),
                           ("Features size", features_test),
                           ("Labels size", labels_test)):
        verbose(caption, array.shape)

    verbose("Training...")
    regressor = skflow.TensorFlowLinearRegressor()#TODO convert uint32 to TensorFlow DType
    regressor.fit(features, labels)

    verbose("Predict...")
    preds = regressor.predict(features_test)
    # Clip negative predictions to zero in place.
    preds[preds < 0] = 0
    verbose(preds)
    verbose(len(preds))

    # Same reporting for both metrics: raw predictions first, then rounded.
    for heading, metric in (("MSE: ", metrics.mean_squared_error),
                            ("RMSLE:", rmsle)):
        verbose(heading)
        verbose("Original", metric(preds, labels_test))
        verbose("round", metric(np.round(preds), labels_test))
def get_grid_search_values(model, grid_params, x_train, y_train, x_test, y_test, scoring_criteria = 'mean_squared_error'):
    """Run a grid search and report scores and MSE of the best estimator.

    Args:
        model: scikit-learn model.
        grid_params: dict describing the parameter space.
        x_train: independent variables, training set.
        y_train: dependent variable, training set.
        x_test: independent variables, test set.
        y_test: dependent variable, test set.
        scoring_criteria: scoring name passed to ``GridSearchCV``.

    Returns:
        tuple: ``(best_model, best_params, train_score, test_score,
        train_mse, test_mse)`` where the scores come from
        ``best_model.score`` and the MSE values from
        ``metrics.mean_squared_error``.
    """
    searcher = grid_search.GridSearchCV(
        model, grid_params, scoring = scoring_criteria, cv = 5)
    para_search = searcher.fit(x_train, y_train)
    best_model = para_search.best_estimator_

    train_score = best_model.score(x_train, y_train)
    test_score = best_model.score(x_test, y_test)

    train_mse = metrics.mean_squared_error(best_model.predict(x_train), y_train)
    test_mse = metrics.mean_squared_error(best_model.predict(x_test), y_test)

    return (best_model, para_search.best_params_,
            train_score, test_score, train_mse, test_mse)
def classify():
    """Train a linear SVM separating melanoma from non-melanoma samples.

    Both training files are loaded with ``np.loadtxt`` and stacked;
    melanoma rows are labelled 1 and the others -1.  All columns except
    the last are rescaled, the data is split 75/25, and the train/test
    accuracy and RMSE of a ``LinearSVC`` are printed.
    """
    mela = np.loadtxt('TrainingData_Melanoma')
    notmela = np.loadtxt('TrainingData_NotMelanoma')

    # Label counts follow the loaded data instead of the hard-coded 32/26,
    # so labels and rows cannot get out of step when the files change.
    mela_labels = [1] * len(mela)
    notmela_labels = [-1] * len(notmela)

    data = np.append(mela, notmela, axis=0)
    means = np.mean(data, axis=0)
    varis = np.var(data, axis=0)

    # Rescale every column except the last one.
    # NOTE(review): `/ 3 * varis` divides by 3 and then MULTIPLIES by the
    # variance; `/ (3 * std)` may have been intended. Behaviour is left
    # unchanged until that is confirmed.
    data[:, :-1] = (((data[:, :-1] - means[:-1]) / 3 * varis[:-1]) + 1) / 2

    labels_d = np.append(mela_labels, notmela_labels)
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels_d, test_size=0.25)

    clf = svm.LinearSVC()
    clf = clf.fit(X_train, y_train)
    p_test = clf.predict(X_test)
    p_train = clf.predict(X_train)

    print(clf.score(X_train, y_train))
    print(clf.score(X_test, y_test))

    rmse_train = mean_squared_error(y_train, p_train) ** 0.5
    rmse_test = mean_squared_error(y_test, p_test) ** 0.5
    print('{} {}'.format(rmse_train, rmse_test))
def execute(model, data, savepath, *args, **kwargs):
    """Plot extrapolation quality across fluence and flux cut-offs.

    For each cut-off the model is trained on one side of the threshold and
    tested on the other; measured vs. predicted values are drawn in one of
    three side-by-side panels annotated with the RMSE and the train/test
    sizes.  Two figures are saved through ``savepath.format(...)``: one
    for fluence (train below, test at or above) and one for flux (train
    above, test at or below).
    """
    fluence_divisions = [3.3E18, 3.3E19, 3.3E20]
    flux_divisions = [5e11,2e11,1e11]

    def _extrapolation_figure(column, cutoffs, train_op, test_op,
                              title_template, figure_tag):
        # One figure: a panel per cut-off, then layout, save and close.
        fig, ax = plt.subplots(1, 3, figsize=(30, 10))
        for idx, cutoff in enumerate(cutoffs):
            panel = ax[idx]

            # Train on the rows selected by `train_op`.
            data.remove_all_filters()
            data.add_inclusive_filter(column, train_op, cutoff)
            l_train = len(data.get_y_data())
            model.fit(data.get_x_data(), np.array(data.get_y_data()).ravel())

            # Test on the complementary rows.
            data.remove_all_filters()
            data.add_inclusive_filter(column, test_op, cutoff)
            l_test = len(data.get_y_data())
            Ypredict = model.predict(data.get_x_data())
            RMSE = np.sqrt(mean_squared_error(
                Ypredict, np.array(data.get_y_data()).ravel()))

            matplotlib.rcParams.update({'font.size': 26})
            panel.scatter(data.get_y_data(), Ypredict, color='black', s=10)
            # Dashed reference line spanning the current y-limits.
            panel.plot(panel.get_ylim(), panel.get_ylim(), ls="--", c=".3")
            panel.set_xlabel('Measured ∆sigma (Mpa)')
            panel.set_ylabel('Predicted ∆sigma (Mpa)')
            panel.set_title(title_template.format(cutoff))
            panel.text(.1, .88, 'RMSE: {:.3f}'.format(RMSE), fontsize=30,
                       transform=panel.transAxes)
            panel.text(.1, .83, 'Train: {}, Test: {}'.format(l_train, l_test),
                       transform=panel.transAxes)

        fig.tight_layout()
        plt.subplots_adjust(bottom=.2)
        fig.savefig(savepath.format(figure_tag), dpi=150, bbox_inches='tight')
        plt.close()

    _extrapolation_figure("fluence n/cm2", fluence_divisions, '<', '>=',
                          'Testing Fluence > {}', "fluence_extrapolation")
    _extrapolation_figure("flux n/cm2/s", flux_divisions, '>', '<=',
                          'Testing Flux < {:.0e}', "flux_extrapolation")
def stats_by_latlev(x_ppi, y_ppi, x_pp, y_pp, r_mlp, lat, lev, datafile):
    """Compute per-latitude, per-level statistics of T and q predictions.

    For every latitude index the true and predicted T and q are loaded
    with ``load_one_lat`` and the following are stored for each level: the
    mean of the true values, the bias of the predicted mean, the RMSE and
    the Pearson correlation coefficient.

    Returns:
        tuple: ``(Tmean, qmean, Tbias, qbias, rmseT, rmseq, rT, rq)``, each
        transposed to shape ``(len(lev), len(lat))``.
    """
    n_lat, n_lev = len(lat), len(lev)
    # Eight independent (lat, lev) result tables.
    Tmean, qmean, Tbias, qbias, rmseT, rmseq, rT, rq = (
        np.zeros((n_lat, n_lev)) for _ in range(8))

    for i in range(n_lat):
        print('Loading data for latitude {:d} of {:d}'.format(i, len(lat)))
        T_true, q_true, T_pred, q_pred = load_one_lat(
            x_ppi, y_ppi, x_pp, y_pp, r_mlp, i, datafile,
            minlev=np.min(lev))

        # Means of the true output.
        Tmean[i, :] = np.mean(T_true, axis=0)
        qmean[i, :] = np.mean(q_true, axis=0)

        # Bias: predicted mean minus true mean.
        Tbias[i, :] = np.mean(T_pred, axis=0) - Tmean[i, :]
        qbias[i, :] = np.mean(q_pred, axis=0) - qmean[i, :]

        # RMSE, one value per output column.
        rmseT[i, :] = np.sqrt(metrics.mean_squared_error(
            T_true, T_pred, multioutput='raw_values'))
        rmseq[i, :] = np.sqrt(metrics.mean_squared_error(
            q_true, q_pred, multioutput='raw_values'))

        # Correlation coefficients, level by level (p-values discarded).
        for j in range(n_lev):
            rT[i, j] = scipy.stats.pearsonr(T_true[:, j], T_pred[:, j])[0]
            rq[i, j] = scipy.stats.pearsonr(q_true[:, j], q_pred[:, j])[0]

    return (Tmean.T, qmean.T, Tbias.T, qbias.T,
            rmseT.T, rmseq.T, rT.T, rq.T)
def plot_stages(reg,X_train,y_train,X_test,y_test,ax,title=""):
    """Plot train and test RMSE of a boosted regressor per boosting stage.

    Args:
        reg: fitted estimator exposing ``staged_predict``, ``n_estimators``,
            ``learning_rate`` and ``max_depth``.
        X_train, y_train: training data used for the blue curve.
        X_test, y_test: test data used for the red curve.
        ax: Matplotlib axes to draw on.
        title: text appended to the axes title.

    The minimum test RMSE is annotated and marked with dashed guide lines.
    Nothing is returned.
    """
    test_score = np.zeros(reg.n_estimators, dtype=np.float64)
    train_score = np.zeros(reg.n_estimators, dtype=np.float64)

    for i, y_pred in enumerate(reg.staged_predict(X_test)):
        test_score[i] = np.sqrt(mean_squared_error(y_test, y_pred))
    for i, y_pred_train in enumerate(reg.staged_predict(X_train)):
        train_score[i] = np.sqrt(mean_squared_error(y_train, y_pred_train))

    min_test_score = min(test_score)
    min_test_score_stage = np.argmin(test_score)
    learning_rate = reg.learning_rate
    max_depth = reg.max_depth

    # Axes.hold() was removed in Matplotlib 3.0; call it only where it still
    # exists so the function works on both old and new releases.
    has_hold = callable(getattr(ax, "hold", None))
    if has_hold:
        ax.hold("on")

    stages = np.arange(reg.n_estimators)
    ax.set_title('RMSE per stage for :'+str(title),fontsize=9)
    ax.plot(stages, train_score, 'b-', label='Training Set RMSE')
    ax.plot(stages, test_score, 'r-', label='Test Set RMSE')
    ax.set_xlim((0,reg.n_estimators))

    ymin, ymax = ax.get_ylim()
    xmin, xmax = ax.get_xlim()
    ax.annotate('Learning rate : '+str(learning_rate),
                xy=(0.8*xmax, 0.85*ymax), xytext=(0.8*xmax, 0.85*ymax))
    ax.annotate('Max depth : '+str(max_depth),
                xy=(0.8*xmax, 0.8*ymax), xytext=(0.8*xmax, 0.8*ymax))
    # Place the minimum label slightly right of and above the minimum.
    label_xy = (min_test_score_stage+10, min_test_score+0.1)
    ax.annotate('Min RMSE : '+str(round(min_test_score,3)),
                xy=label_xy, xytext=label_xy, color = "red")

    ax.legend(loc='upper right')
    ax.grid(True)
    ax.hlines(y=min_test_score,xmin=0,xmax=reg.n_estimators,
              linestyles="dashed",color="grey")
    ax.vlines(x=min_test_score_stage,ymin=0,ymax=1,
              linestyles="dashed",color="grey")
    ax.set_xlabel('Boosting Iterations')
    ax.set_ylabel('RMSE')

    if has_hold:
        ax.hold("off")
def find_init_stdev(fm, X_train, y_train, X_vali=None, y_vali=None,
                    stdev_range=None, ):
    """Pick the ``init_stdev`` giving the lowest mean squared error.

    Each candidate is assigned to ``fm.init_stdev`` and the model is
    refitted.  With validation data the error is measured on
    ``X_vali``/``y_vali``; without it the error is measured on the
    training data.

    Args:
        fm: an ``FMRegression`` instance (other types raise ``Exception``).
        X_train, y_train: training data.
        X_vali, y_vali: optional validation data.
        stdev_range: candidates; defaults to ``[0.1, 0.1, 0.2, 0.5, 1.0]``.

    Returns:
        tuple: ``(best_init_stdev, best_mse)``.
    """
    if not stdev_range:
        stdev_range = [0.1, 0.1, 0.2, 0.5, 1.0]

    if not isinstance(fm, FMRegression):
        raise Exception("only implemented for FMRegression")

    use_validation = X_vali is not None
    # fit_predict needs some test matrix; without validation data two
    # training rows serve as a dummy.
    X_test = X_vali if use_validation else X_train[:2, :]

    best_init_stdev, best_mse = 0, np.finfo(np.float64).max
    for candidate in stdev_range:
        fm.init_stdev = candidate
        y_pred_vali = fm.fit_predict(X_train, y_train, X_test)
        if use_validation:
            mse = mean_squared_error(y_pred_vali, y_vali)
        else:
            mse = mean_squared_error(fm.predict(X_train), y_train)
        # Strict comparison: the first of several equal candidates wins.
        if mse < best_mse:
            best_init_stdev, best_mse = candidate, mse
    return best_init_stdev, best_mse
def train_learning_model_decision_tree_ada_boost(df):
    """Compare a single decision tree with AdaBoost over trees, by MSE.

    ``df`` is preprocessed and split by the external ``preprocess_data``
    and ``split_data`` helpers.  A depth-6 tree and an AdaBoost ensemble
    of 500 depth-6 trees are fitted, and their test and train mean squared
    errors are printed.  Nothing is returned.
    """
    # Code taken from sklearn.
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    tree_regressor = DecisionTreeRegressor(max_depth = 6)
    ada_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6),
                                      n_estimators = 500,
                                      learning_rate = 0.01,
                                      random_state = 1)
    for estimator in (tree_regressor, ada_regressor):
        estimator.fit(X_train, y_train)

    # Test-set errors first, then training-set errors.
    y_pred_tree = tree_regressor.predict(X_test)
    y_pred_ada = ada_regressor.predict(X_test)
    mse_tree = mean_squared_error(y_test, y_pred_tree)
    mse_ada = mean_squared_error(y_test, y_pred_ada)
    mse_tree_train = mean_squared_error(y_train,
                                        tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train,
                                       ada_regressor.predict(X_train))

    print ("MSE tree: %.4f " %mse_tree)
    print ("MSE ada: %.4f " %mse_ada)
    print ("MSE tree train: %.4f " %mse_tree_train)
    print ("MSE ada train: %.4f " %mse_ada_train)
def execute(model, data, savepath, *args, **kwargs):
    """Fit on all data and plot measured vs. predicted per data set.

    The model is trained on the full data, then each "Data Set code" value
    from 0 to the maximum is filtered in turn, predicted, drawn in its own
    colour and annotated with its RMS error.  The figure is saved through
    ``savepath.format(<axes title>)``.
    """
    # Train the model on everything and keep the overall RMS error.
    y_all = np.asarray(data.get_y_data()).ravel()
    model.fit(data.get_x_data(), y_all)
    overall_rms = np.sqrt(mean_squared_error(
        model.predict(data.get_x_data()),
        np.asarray(data.get_y_data()).ravel()))

    datasets = ['IVAR', 'ATR-1', 'ATR-2']
    colors = ['#BCBDBD', '#009AFF', '#FF0A09']
    fig, ax = plt.subplots()

    # RMS for each data set; the codes index `datasets` and `colors`.
    highest_code = max(np.asarray(data.get_data("Data Set code")).ravel())
    for code in range(highest_code + 1):
        data.remove_all_filters()
        data.add_inclusive_filter("Data Set code", '=', code)
        Ypredict = model.predict(data.get_x_data())
        Ydata = np.asarray(data.get_y_data()).ravel()

        rms = np.sqrt(mean_squared_error(Ypredict, Ydata))

        ax.scatter(Ydata, Ypredict, s=7, color=colors[code],
                   label= datasets[code], lw = 0)
        # Stack the per-set labels downwards, one line per data set.
        ax.text(.05, .83 - .05*code,
                '{} RMS: {:.3f}'.format(datasets[code],rms),
                fontsize=14, transform=ax.transAxes)

    ax.legend()
    # Dashed reference line spanning the current y-limits.
    ax.plot(ax.get_ylim(), ax.get_ylim(), ls="--", c=".3")
    ax.set_xlabel('Measured (MPa)')
    ax.set_ylabel('Predicted (MPa)')
    ax.set_title('Full Fit')
    ax.text(.05, .88, 'Overall RMS: %.4f' % (overall_rms), fontsize=14,
            transform=ax.transAxes)
    fig.savefig(savepath.format(ax.get_title()), dpi=300,
                bbox_inches='tight')
    plt.clf()
    plt.close()
def test_rrf_vs_sklearn_reg(self):
    """Test R vs. sklearn on boston housing dataset.

    Both random forests use 100 trees, no bootstrap, and the same number
    of candidate features per split; their test mean squared errors must
    agree to within the tolerance of ``assert_almost_equal(decimal=0)``.
    """
    from sklearn.datasets import load_boston
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # older scikit-learn without model_selection
        from sklearn.cross_validation import train_test_split
    from sklearn.metrics import mean_squared_error
    from sklearn.ensemble import RandomForestRegressor

    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, test_size=0.2, random_state=13)

    n_samples, n_features = X_train.shape
    # Use 30% of the features (rounded down) as split candidates.
    mtry = int(np.floor(0.3 * n_features))

    # 100 trees on the R side; the dict is needed because 'corr.bias' is
    # not a valid Python keyword name.
    r_rf = RRFEstimatorR(**{'ntree': 100, 'nodesize': 1, 'replace': 0,
                            'mtry': mtry, 'corr.bias': False,
                            'sampsize': n_samples, 'random_state': 1234})
    r_rf.fit(X_train, y_train)
    y_pred = r_rf.predict(X_test)
    r_mse = mean_squared_error(y_test, y_pred)

    p_rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=1,
                                 bootstrap=False, max_features=mtry,
                                 random_state=1)
    p_rf.fit(X_train, y_train)
    y_pred = p_rf.predict(X_test)
    p_mse = mean_squared_error(y_test, y_pred)

    print('%.4f vs %.4f' % (r_mse, p_mse))
    # should be roughly the same (7.6 vs. 7.2)
    np.testing.assert_almost_equal(r_mse, p_mse, decimal=0)
def test_boston_housing_regression_with_sample_weights():
    """Check XGBRegressor on Boston housing when sample weights are given.

    All weights are 1.0.  For each of two shuffled folds the regressor is
    fitted with the matching slice of weights and four prediction variants
    must stay below fixed mean-squared-error bounds.
    """
    tm._skip_if_no_sklearn()
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    # sklearn.cross_validation was removed from scikit-learn; this uses the
    # model_selection API, as test_boston_housing_regression does.
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    sample_weight = np.ones_like(y, 'float')

    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(
            X[train_index], y[train_index],
            sample_weight=sample_weight[train_index]
        )

        # Exercise the optional arguments of predict().
        preds = xgb_model.predict(X[test_index])
        preds2 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False,
                                   ntree_limit=3)

        labels = y[test_index]
        # Predictions truncated to 3 trees get a much looser bound.
        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 370
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 370
def main():
    """Test SVM from scikit learn on mnist data set.

    A degree-2 polynomial ``SVC`` is tuned with a 5-fold grid search over
    ``C``, ``gamma`` and ``coef0``; the best parameters, the scaled mean
    squared error and the accuracy on the train and test sets are printed.
    """
    (X_train, Y_train), (X_test, Y_test) = data.preprocess_mnist()

    search_space = {
        "C" : np.logspace(0, 3, 4),
        "gamma" : np.logspace(-7, 2, 4),
        "coef0" : np.logspace(-4,4,4),
    }
    grid = GridSearchCV(SVC(kernel='poly', degree=2),
                        param_grid = search_space, cv=5, n_jobs = 5,
                        pre_dispatch = "n_jobs")
    grid.fit(X_train, Y_train)
    print(grid.best_params_)

    train_yy = grid.predict(X_train)
    test_yy = grid.predict(X_test)

    # "Error" here is the mean squared error of the labels, times 100.
    train_err, test_err = (100*mean_squared_error(train_yy, Y_train),
                           100*mean_squared_error(test_yy, Y_test))
    print("Train. err:", train_err)
    print("Test err:", test_err)

    train_acc, test_acc = (accuracy_score(Y_train, train_yy),
                           accuracy_score(Y_test, test_yy))
    print("Train. acc:", train_acc)
    print("Test acc:", test_acc)
def train(self):
    """Preprocess the data, tune the tree count by CV and fit ``self.clf``.

    ``self.df`` is replaced by its preprocessed form, every column except
    the target and id columns becomes a predictor, ``xgb.cv`` picks the
    number of boosting rounds, and the classifier is refitted on the full
    frame.  Training-set metrics are printed: accuracy and AUC when
    ``self.target_type`` is ``'binary'``, MSE and RMSE when ``'linear'``.
    """
    print('#### preprocessing ####')
    self.df = self.preprocess(self.df)

    print('#### training ####')
    excluded = [self.target_column, self.id_column]
    self.predictors = [x for x in self.df.columns if x not in excluded]

    xgb_param = self.clf.get_xgb_params()
    xgtrain = xgb.DMatrix(self.df[self.predictors],
                          label=self.df[self.target_column],
                          missing=np.nan)

    cv_kwargs = dict(
        num_boost_round=self.clf.get_params()['n_estimators'],
        nfold=5,
        metrics=[self.scoring],
        early_stopping_rounds=self.early_stopping_rounds,
    )
    # The verbosity keyword of xgb.cv differs between xgboost versions.
    # Try each spelling and fall back to none.  An unknown keyword raises
    # TypeError; catching only that keeps KeyboardInterrupt and genuine
    # training errors from being swallowed, as the bare `except:` did.
    for verbosity in ({'show_progress': self.verbose},
                      {'verbose_eval': self.verbose}):
        try:
            cvresult = xgb.cv(xgb_param, xgtrain,
                              **dict(cv_kwargs, **verbosity))
            break
        except TypeError:
            continue
    else:
        cvresult = xgb.cv(xgb_param, xgtrain, **cv_kwargs)

    # One row of the CV result per boosting round that was kept.
    self.clf.set_params(n_estimators=cvresult.shape[0])
    self.clf.fit(self.df[self.predictors], self.df[self.target_column],
                 eval_metric=self.scoring)

    # Predict training set:
    train_df_predictions = self.clf.predict(self.df[self.predictors])

    if self.target_type == 'binary':
        train_df_predprob = self.clf.predict_proba(
            self.df[self.predictors])[:, 1]
        print("Accuracy : %.4g" % metrics.accuracy_score(
            self.df[self.target_column].values, train_df_predictions))
        print("AUC Score (Train): %f" % metrics.roc_auc_score(
            self.df[self.target_column], train_df_predprob))
    elif self.target_type == 'linear':
        print("Mean squared error: %f" % metrics.mean_squared_error(
            self.df[self.target_column].values, train_df_predictions))
        print("Root mean squared error: %f" % np.sqrt(
            metrics.mean_squared_error(
                self.df[self.target_column].values, train_df_predictions)))
def rf_test(X,y):
    """Fit a 100-tree random forest and print its test MSE and R^2.

    The data is split 80/20 with ``random_state=1``; the two metrics are
    printed on one line separated by a space.  Nothing is returned.
    """
    RF_model = RandomForestRegressor(100, n_jobs=-1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)
    RF_model.fit(X_train, y_train)
    y_pred = RF_model.predict(X_test)
    # A single formatted string prints the same on Python 2 and Python 3.
    print('{} {}'.format(mean_squared_error(y_test, y_pred),
                         r2_score(y_test, y_pred)))
def test():
    """Evaluate the pickled RBF SVM on the MNIST train and test sets.

    Images are flattened to 784 features and scaled to [0, 1]; the model
    is loaded from ``svm_rbf.pickle`` and its scaled mean squared error
    and accuracy are printed for both sets.
    """
    (X_train, Y_train), (X_test, Y_test) = mnist.load_data()

    # Preprocess data: flatten, convert to float32, scale to [0, 1].
    X_train = X_train.reshape(60000, 784)
    X_test = X_test.reshape(10000, 784)
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255

    # The context manager closes the file handle, which the original
    # leaked.  NOTE: unpickling executes code from the file - only load
    # model files from a trusted source.
    with open("svm_rbf.pickle", "rb") as model_file:
        model = pickle.load(model_file)

    train_yy = model.predict(X_train)
    test_yy = model.predict(X_test)

    # "Error" here is the mean squared error of the labels, times 100.
    train_err = 100*mean_squared_error(train_yy, Y_train)
    test_err = 100*mean_squared_error(test_yy, Y_test)
    print("Train. err:", train_err)
    print("Test err:", test_err)

    train_acc = accuracy_score(Y_train, train_yy)
    test_acc = accuracy_score(Y_test, test_yy)
    print("Train acc:", train_acc)
    print("Test acc:", test_acc)
def main():
    """Test SVM from scikit learn on mnist data set.

    Images are flattened to 784 features and scaled to [0, 1]; an RBF
    ``SVC`` (``gamma=0.02``, ``C=10``) is trained, its scaled mean squared
    error and accuracy are printed for both sets, and the fitted model is
    pickled to ``svm_rbf``.
    """
    (X_train, Y_train), (X_test, Y_test) = mnist.load_data()

    # Preprocess data: flatten, convert to float32, scale to [0, 1].
    X_train = X_train.reshape(60000, 784)
    X_test = X_test.reshape(10000, 784)
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255
    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    model = SVC(kernel='rbf', gamma=0.02, C=10)
    model.fit(X_train, Y_train)

    train_yy = model.predict(X_train)
    test_yy = model.predict(X_test)

    # "Error" here is the mean squared error of the labels, times 100.
    train_err = 100*mean_squared_error(train_yy, Y_train)
    test_err = 100*mean_squared_error(test_yy, Y_test)
    print("Train. err:", train_err)
    print("Test err:", test_err)

    # The accuracies were computed but never reported; print them the way
    # the other SVM scripts in this file do.
    train_acc = accuracy_score(Y_train, train_yy)
    test_acc = accuracy_score(Y_test, test_yy)
    print("Train. acc:", train_acc)
    print("Test acc:", test_acc)

    # The context manager flushes and closes the file, which the original
    # left open.
    # NOTE(review): test() loads "svm_rbf.pickle" while this writes
    # "svm_rbf" - confirm which file name is intended.
    with open("svm_rbf", "wb") as model_file:
        pickle.dump(model, model_file)
def polyRegressionKFold(inputFiles, deg=2):
    """Run 10-fold polynomial regression on each input file.

    Args:
        inputFiles: iterable of data-file identifiers accepted by
            ``tools.readData``; the last column of each data set is the
            target and the remaining columns are the features.
        deg: polynomial degree of the feature expansion.

    Returns:
        ndarray with one ``[mean test error, deg]`` row per input file.
    """
    print("***************************")
    print("Degree: %s" % deg)
    start_time = time.time()
    errors = []

    for File in inputFiles:
        print("___________________________")
        print("Data Set: %s" % File)
        data = tools.readData(File)
        # Sort the rows by the first column.
        data = data[np.argsort(data[:, 0])]
        X = data[:, :-1]
        Y = data[:, -1]

        # NOTE(review): this uses the legacy KFold(n, n_folds=...) API,
        # which is iterable and has a length - confirm against the import.
        kf = KFold(len(data), n_folds = 10, shuffle = True)
        TrainError = 0
        TestError = 0
        for train, test in kf:
            pol = PolynomialFeatures(deg)
            Z = pol.fit_transform(X[train])
            # Only transform the test fold; the expansion is fitted on the
            # training fold above.
            Z_test = pol.transform(X[test])
            theta = regress(Z, Y[train])
            Y_hat = np.dot(Z, theta)
            Y_hat_test = np.dot(Z_test, theta)
            TrainError += mean_squared_error(Y[train], Y_hat)
            TestError += mean_squared_error(Y[test], Y_hat_test)

        # Average the accumulated errors over the folds.
        TestError /= len(kf)
        TrainError /= len(kf)
        errors.append([TestError, deg])
        print("---------------------------")
        print("Test Error: %s" % TestError)
        print("Train Error: %s" % TrainError)

    # Fixed: the original subtracted in the wrong order and reported a
    # negative duration.
    time_taken = time.time() - start_time
    print("Time Taken for primal: %s" % str(time_taken))
    return np.asarray(errors)
def FindPolyregDegree():
    """Plot train and test error of ridge regression for degrees 0 to 9.

    Points come from ``getAllPoints2(30)`` after ``loadDB()``.  The single
    feature is ``fuelrate`` and the target is
    ``vehicleSpeed / enginespeed``.  The data is split with
    ``test_size=0.8`` and the mean squared error of a
    ``PolynomialFeatures`` + ``Ridge`` pipeline is plotted per degree.
    """
    loadDB()
    points = getAllPoints2(30)
    print(len(points))

    # One pass over the points: target first, then the feature row.
    samples = [(point['vehicleSpeed']/point['enginespeed'],
                [point['fuelrate']]) for point in points]
    Y = [target for target, _ in samples]
    X = [row for _, row in samples]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.8)

    n_degrees = 10
    train_error = np.empty(n_degrees)
    test_error = np.empty(n_degrees)
    for degree in range(n_degrees):
        est = make_pipeline(PolynomialFeatures(degree), Ridge())
        est.fit(X_train, y_train)
        train_error[degree] = mean_squared_error(y_train,
                                                 est.predict(X_train))
        test_error[degree] = mean_squared_error(y_test, est.predict(X_test))

    degrees = np.arange(10)
    plt.plot(degrees, train_error, color='green', label='train')
    plt.plot(degrees, test_error, color='red', label='test')
    plt.title("Degree vs Error - Finding optimal model for regression")
    plt.ylabel('log(mean squared error)')
    plt.xlabel('degree')
    plt.legend(loc='lower left')
    plt.show()
def multi_regression():
    '''
    Multiple regression.

    Fits a linear regression on every column of the module-level ``df``
    except the last, with ``MEDV`` as the target, prints the train/test
    MSE and R^2 and shows a residual plot.
    :return: None
    '''
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # older scikit-learn without model_selection
        from sklearn.cross_validation import train_test_split

    X = df.iloc[:, :-1].values
    y = df['MEDV'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    slr = LinearRegression()
    slr.fit(X_train, y_train)
    y_train_pred = slr.predict(X_train)
    y_test_pred = slr.predict(X_test)

    # Compute the mean squared error (MSE).
    print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
    # MSE train: 19.958, test: 27.196 => overfitting

    # Compute R^2.
    # If R^2 = 1, the model fits the data perfectly, with a corresponding
    # MSE = 0.
    print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

    # Residual plot: predicted value vs. (predicted - actual).
    plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o',
                label='Training data')
    plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen',
                marker='s', label='Test data')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
    plt.xlim([-10, 50])
    plt.show()
def testingGBM(X_train, Y_train, X_test, Y_test):
    """Fit a gradient-boosting regressor on the "Ca" target and plot deviance.

    Args:
        X_train, X_test: feature tables.
        Y_train, Y_test: target tables; only the ``"Ca"`` column is used.

    The test MSE and RMSE are printed (the original computed them and
    threw the values away), then the training score and the per-stage test
    MSE are plotted against the boosting iteration.
    """
    params = {'verbose': 2, 'n_estimators': 100, 'max_depth': 50,
              'min_samples_leaf': 20, 'learning_rate': 0.1, 'loss': 'ls',
              'max_features': None}
    # The unused Ridge(normalize=True) instance was removed: it was never
    # passed to the model, and current scikit-learn rejects `normalize`.
    gbm2 = GradientBoostingRegressor(**params)
    gbm2.fit(X_train, Y_train["Ca"])

    yhat_gbm = gbm2.predict(X_test)
    test_mse = mean_squared_error(Y_test["Ca"], yhat_gbm)
    print("Test MSE: %s" % test_mse)
    print("Test RMSE: %s" % math.sqrt(test_mse))

    # staged_predict yields the regressor's prediction after each stage;
    # staged_decision_function is not available on regressors in current
    # scikit-learn.
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(gbm2.staged_predict(X_test)):
        test_score[i] = mean_squared_error(Y_test["Ca"], y_pred)

    iterations = np.arange(params['n_estimators']) + 1
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(iterations, gbm2.train_score_, 'b-',
             label='Training Set Deviance')
    plt.plot(iterations, test_score, 'r-', label='Test Set Deviance')
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')
    plt.show()
def demo(X=None, y=None, test_size=0.1):
    """Train and evaluate ``BasicSegmenterEG_FEMPO`` on a random train/test split.

    Args:
        X: Feature table. When None, the Boston housing data is loaded and
            both ``X`` and ``y`` are replaced by it.
        y: Targets matching ``X``.
        test_size: Fraction of rows held out for testing.

    Returns:
        Tuple ``(clf, X_test, y_test)``: the fitted model and the held-out split.
    """
    # Identity test: ``X == None`` is element-wise on arrays/DataFrames and
    # cannot be used as an ``if`` condition.
    if X is None:
        # NOTE(review): load_boston is absent from recent scikit-learn - confirm.
        boston = load_boston()
        X = pd.DataFrame(boston.data)
        y = pd.DataFrame(boston.target)

    base_estimator = DecisionTreeRegressor(max_depth=5)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)
    print(X_train.shape)

    # To compare with BaggingRegressor:
    # bench = BaggingRegressor(base_estimator = base_estimator, n_estimators = 10, max_samples = 1, oob_score = True).fit(X_train, y_train)
    # print(bench.score(X_test, y_test))
    # print(mean_squared_error(bench.predict(X_test), y_test))

    clf = BasicSegmenterEG_FEMPO(ngen=30, init_sample_percentage=1,
                                 n_votes=10, n=10,
                                 base_estimator=base_estimator,
                                 unseen_x=X_test, unseen_y=y_test)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))

    y_pred = clf.predict(X_test)
    print(mean_squared_error(y_pred, y_test))
    print(y_pred.shape)
    return clf, X_test, y_test
# Four base models: regression and neighbour based, each with and without
# normalisation.
regression1 = regression()
regression2 = regression(normalize=True)
neighbor1 = neighbor()
neighbor2 = neighbor(normalize=True)

X_pred1 = regression1.predict(X_train)
X_pred2 = neighbor1.predict(X_train)
X_pred3 = regression2.predict(X_train)
X_pred4 = neighbor2.predict(X_train)

# Meta-learner training set: one column per base model, restricted to the
# positions where X_test holds a positive value.
stack_train = np.array([
    X_pred1[X_test > 0],
    X_pred2[X_test > 0],
    X_pred3[X_test > 0],
    X_pred4[X_test > 0]
]).T
clf = LinearRegression()
clf.fit(stack_train, X_test[X_test > 0])

# Blend every position, then restore the shape of X.
stack_test = np.array(
    [X_pred1.ravel(), X_pred2.ravel(), X_pred3.ravel(), X_pred4.ravel()]).T
predicted = clf.predict(stack_test).reshape(X.shape)

# Score only where y is positive. print() with a single argument behaves the
# same on Python 2 and 3.
r2 = r2_score(y[y > 0], predicted[y > 0])
print(r2)  # 0.345168402139
rmse = np.sqrt(mean_squared_error(y[y > 0], predicted[y > 0]))
print(rmse)  # 0.905436328833
#%%
rf = RandomForestRegressor()

#%%
rf.fit(X_train, y_train)

#%%
pred2 = rf.predict(X_test)

#%%
from sklearn import metrics

# Error metrics on the held-out set; the MSE is computed once and reused.
mae = metrics.mean_absolute_error(y_test, pred2)
mse = metrics.mean_squared_error(y_test, pred2)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

#%%
from sklearn.metrics import r2_score

r2_score(y_test, pred2)

#%%
# Distribution of the residuals (truth minus prediction).
residuals = y_test - pred2
sns.distplot(residuals, bins=50)

#%%
sep="\t", index_col="frame", usecols=["frame", "/actuator/inflate"]) #read time series from the exchange.csv file series = GetData('intensity_2P_breathing signal.txt') #view top 10 records print(series.head(10)) print(series.dtypes) X = series.values size = int(len(X) * 0.75) train, test = X[0:size], X[size:len(X)] # walk-forward validation history = [x for x in train] predictions = list() for i in range(len(test)): # make prediction predictions.append(history[-1]) # observation history.append(test[i]) # report performance rmse = sqrt(mean_squared_error(test, predictions)) print('RMSE: %.3f' % rmse) # line plot of observed vs predicted pyplot.plot(test) pyplot.plot(predictions) pyplot.show()
test_err8 = [0] * len(max_n_estimators)
max_depths = [4, 6, 8]

# For each ensemble size, boost decision trees of depth 4, 6 and 8 and record
# train/test error into the matching *_err lists.
for i, o in enumerate(max_n_estimators):
    # print() with a single argument behaves the same on Python 2 and 3.
    print('AdaBoostClassifier: learning a decision tree with n_estimators=' + str(o))
    dt4 = DecisionTreeClassifier(max_depth=4)
    dt6 = DecisionTreeClassifier(max_depth=6)
    dt8 = DecisionTreeClassifier(max_depth=8)
    bdt4 = AdaBoostClassifier(base_estimator=dt4, n_estimators=o)
    bdt6 = AdaBoostClassifier(base_estimator=dt6, n_estimators=o)
    bdt8 = AdaBoostClassifier(base_estimator=dt8, n_estimators=o)
    bdt4.fit(X_train, y_train)
    bdt6.fit(X_train, y_train)
    bdt8.fit(X_train, y_train)
    train_err4[i] = mean_squared_error(y_train, bdt4.predict(X_train))
    test_err4[i] = mean_squared_error(y_test, bdt4.predict(X_test))
    train_err6[i] = mean_squared_error(y_train, bdt6.predict(X_train))
    test_err6[i] = mean_squared_error(y_test, bdt6.predict(X_test))
    train_err8[i] = mean_squared_error(y_train, bdt8.predict(X_train))
    test_err8[i] = mean_squared_error(y_test, bdt8.predict(X_test))
    print('---')

# Plot results
print('plotting results')
plt.figure()
selector = preprocessing.Selector(datax, datay) selector.load(opt.partition) trainx, trainy = selector.training_set() ymax = np.max(np.abs(trainy)) trainy = trainy.flatten() / ymax scaler = StandardScaler() scaler.fit(trainx) trainx = scaler.transform(trainx) n_feature = len(trainx[0]) svr = SVR(kernel='linear') svr.fit(trainx, trainy) mse = mean_squared_error(trainy, svr.predict(trainx)) print(mse) pool = [] output = open(opt.output, 'w') output.write('STEP\tRMFEA\tMSE\tFEATURES\n') output.write('0\tfull\t%.4e\t[FULL]\n' % mse) step = 1 while n_feature > opt.end: svr = SVR(kernel='linear') rfe = RFE(svr, n_feature - 1, step=1) rfe.fit(trainx, trainy)
#(kernel(x_train[0,:],x_test[0,:]))
##a = np.matrix([[0],[0.2],[1],[3]])
##print(a.shape)
##b = np.matrix([[0],[0.2],[1]])
##print(kernel_matrix(a,b))
#print(y_test)

# Gaussian-process predictions for the test inputs.
g = gaussian_process(x_train, y_train, x_test, train_samples, test_samples, sigma)
#print(g)

# Baseline: ordinary linear regression fitted on the same training data.
regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)
# NOTE(review): predicts from test[:,1] rather than x_test - confirm they
# hold the same inputs in the layout LinearRegression expects.
y_pred = regr.predict(test[:,1])
print("Mean squared error for the linear model is: %.10f" % mean_squared_error(y_test, y_pred))

# Accuracy of the GP regression: mean of the squared residuals.
error = g - y_test
#print(error)
sq_error = np.square(error)
#print(sq_error)
mean_sq_error = sum(sq_error) / sq_error.shape[0]
# Message previously read "errorfor" (missing space).
print("Mean squared error for GPR is: %.10f" % mean_sq_error)
import pickle
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the training table, drop its first column and the 'cut'/'clarity'
# columns, then one-hot encode whatever categorical columns remain.
df = pd.read_csv('./data/train.csv')
df = df.iloc[:, 1:]
df.drop(columns=['cut', 'clarity'], inplace=True)
df = pd.get_dummies(df)

X = df.drop(columns='price')
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)

# Report RMSE on the held-out split.
y_pred = model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

# Persist the fitted model; the context manager guarantees the file is
# flushed and closed (the bare open() previously leaked the handle).
with open('./models/gb_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)
Pred_YList.append(round(ansY[index])) print(Pred_YList) accuracy = accuracy_score(Test_YList, Pred_YList) print("Y accuarcy: %.2f%%" % (accuracy * 100.0)) # draw real and predict points plt.scatter(Pred_XList, Pred_YList, linewidths=0) plt.scatter(Test_XList, Test_YList, linewidths=0) plt.ylabel('real and predict') plt.show() # create predict XY list Pred_XYList = [] for index in range(0, test_num): tmp_list = [] tmp_list.append(Pred_XList[index]) tmp_list.append(Pred_YList[index]) Pred_XYList.append(tmp_list) # create real XY list Real_XYList = [] for index in range(0, test_num): tmp_list = [] tmp_list.append(Test_XList[index]) tmp_list.append(Test_YList[index]) Real_XYList.append(tmp_list) # calculate MSE MSE_XY = mean_squared_error(Real_XYList, Pred_XYList) print("XY MSE: %.2f" % MSE_XY)
epochs=1, batch_size=batch_size, verbose=2, shuffle=False) model.reset_states() # make predictions trainPredict = model.predict(trainX, batch_size=batch_size) model.reset_states() testPredict = model.predict(testX, batch_size=batch_size) # invert predictions trainPredict = scaler.inverse_transform(trainPredict) trainY = scaler.inverse_transform([trainY]) testPredict = scaler.inverse_transform(testPredict) testY = scaler.inverse_transform([testY]) # calculate root mean squared error trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0])) print('Train Score: %.2f RMSE' % (trainScore)) testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0])) print('Test Score: %.2f RMSE' % (testScore)) # shift train predictions for plotting trainPredictPlot = numpy.empty_like(dataset) trainPredictPlot[:, :] = numpy.nan trainPredictPlot[look_back:len(trainPredict) + look_back, :] = trainPredict # shift test predictions for plotting testPredictPlot = numpy.empty_like(dataset) testPredictPlot[:, :] = numpy.nan testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(dataset) - 1, :] = testPredict # plot baseline and predictions plt.plot(scaler.inverse_transform(dataset)) plt.plot(trainPredictPlot)
def RMSE(ytest, y_predict):
    """Return the root mean squared error between ``ytest`` and ``y_predict``.

    Args:
        ytest: Ground-truth target values.
        y_predict: Predicted values, same shape as ``ytest``.
    """
    # Use the ``ytest`` argument; the original read a global ``y_test`` and
    # silently ignored what the caller passed in.
    return np.sqrt(mean_squared_error(ytest, y_predict))
''' # Create an input function for predictions. # Note: Since we're making just one prediction for each example, we don't # need to repeat or shuffle the data here. prediction_input_fn = lambda: my_input_fn( my_feature, targets, num_epochs=1, shuffle=False) # Call predict() on the linear_regressor to make predictions. predictions = linear_regressor.predict(input_fn=prediction_input_fn) # Format predictions as a NumPy array, so we can calculate error metrics. predictions = np.array([item['predictions'][0] for item in predictions]) # Print Mean Squared Error and Root Mean Squared Error. mean_squared_error = metrics.mean_squared_error(predictions, targets) root_mean_squared_error = math.sqrt(mean_squared_error) print("Mean Squared Error (on training data): %0.3f" % mean_squared_error) print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error) #Mean Squared Error (on training data): 56367.025 #Root Mean Squared Error (on training data): 237.417 ''' 这是出色的模型吗?您如何判断误差有多大? 由于均方误差 (MSE) 很难解读,因此我们经常查看的是均方根误差 (RMSE)。RMSE 的一个很好的特性是,它可以在与原目标相同的规模下解读。 我们来比较一下 RMSE 与目标最大值和最小值的差值: ''' min_house_value = california_housing_dataframe["median_house_value"].min() max_house_value = california_housing_dataframe["median_house_value"].max() min_max_difference = max_house_value - min_house_value
y_test = ss_y.transform(y_test)

# Support-vector regression with a linear kernel.
linear_svr = SVR(kernel='linear')
linear_svr.fit(x_train, y_train)
linear_svr_y_predict = linear_svr.predict(x_test)

# Support-vector regression with a polynomial kernel.
ploy_svr = SVR(kernel='poly')
ploy_svr.fit(x_train, y_train)
ploy_svr_y_predict = ploy_svr.predict(x_test)

# Support-vector regression with a radial-basis-function kernel.
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(x_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(x_test)

# Report the three models in the order linear, poly, rbf. R2 is computed on
# the scaled targets; MSE and MAE after undoing the target scaling.
for scaled_prediction in (linear_svr_y_predict,
                          ploy_svr_y_predict,
                          rbf_svr_y_predict):
    print('The R2 ', r2_score(y_test, scaled_prediction))
    print('The MSE ', mean_squared_error(
        ss_y.inverse_transform(y_test),
        ss_y.inverse_transform(scaled_prediction)))
    print('The MAE ', mean_absolute_error(
        ss_y.inverse_transform(y_test),
        ss_y.inverse_transform(scaled_prediction)))
if "L" in direction: label = row.split(",")[3] else: label = row.split(",")[6] break if "V" in file: label = "3" if "8" not in label and "9" not in label and "X" not in label and '.' not in label: #if "." in label: #label='4' labelList.append(int(label)) nameList.append(naming) return np.array(labelList), np.array(nameList) Y_true, N_true = load_valY() Y_pre, N_pre = load_va() true = [] pre = [] for i in range(len(Y_true) - 1): for j in range(len(Y_pre) - 1): if N_true[i] == N_pre[j]: true.append(Y_true[i]) pre.append(Y_pre[j]) break print(sklm.accuracy_score(true, pre)) print(sklm.classification_report(true, pre)) print(sklm.confusion_matrix(true, pre)) print(sklm.mean_squared_error(true, pre))
def train_model(learning_rate, steps, batch_size, input_feature="total_rooms"):
    """Train a single-feature linear regressor and chart its progress.

    Training is split into 10 periods. After each period the model is scored
    (RMSE) against the full ``california_housing_dataframe`` and its current
    line is drawn over a 300-row sample. A second panel shows RMSE per
    period, and a summary of predictions vs. targets is displayed at the end.

    Args:
      learning_rate: A `float`, the learning rate.
      steps: A non-zero `int`, the total number of training steps. One step
        is a forward and backward pass over a single batch.
      batch_size: A non-zero `int`, the batch size.
      input_feature: A `string` naming the column of
        `california_housing_dataframe` used as the input feature.
    """
    periods = 10
    steps_per_period = steps / periods

    label_name = "median_house_value"
    feature_frame = california_housing_dataframe[[input_feature]]
    targets = california_housing_dataframe[label_name]

    # Feature columns.
    feature_columns = [tf.feature_column.numeric_column(input_feature)]

    # Input functions: batched for training, one ordered pass for prediction.
    def training_input_fn():
        return my_input_fn(feature_frame, targets, batch_size=batch_size)

    def prediction_input_fn():
        return my_input_fn(feature_frame, targets, num_epochs=1, shuffle=False)

    # Gradient descent with gradient-norm clipping at 5.0.
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
        tf.train.GradientDescentOptimizer(learning_rate=learning_rate), 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=my_optimizer)

    # Left panel: scatter of a data sample, one fitted line per period.
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.title("Learned Line by Period")
    plt.ylabel(label_name)
    plt.xlabel(input_feature)
    sample = california_housing_dataframe.sample(n=300)
    plt.scatter(sample[input_feature], sample[label_name])
    colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]

    # Train period by period so the loss can be assessed along the way.
    print("Training model...")
    print("RMSE (on training data):")
    root_mean_squared_errors = []
    for period in range(periods):
        # Continue training from the previous state.
        linear_regressor.train(
            input_fn=training_input_fn,
            steps=steps_per_period)

        # Score the current model.
        raw_predictions = linear_regressor.predict(input_fn=prediction_input_fn)
        predictions = np.array(
            [item['predictions'][0] for item in raw_predictions])
        period_rmse = math.sqrt(
            metrics.mean_squared_error(predictions, targets))
        print(" period %02d : %0.2f" % (period, period_rmse))
        root_mean_squared_errors.append(period_rmse)

        # Draw the current line, with its x-range clamped to the sample so
        # the plot stays tidy.
        weight = linear_regressor.get_variable_value(
            'linear/linear_model/%s/weights' % input_feature)[0]
        bias = linear_regressor.get_variable_value(
            'linear/linear_model/bias_weights')
        y_extents = np.array([0, sample[label_name].max()])
        x_extents = (y_extents - bias) / weight
        capped = np.minimum(x_extents, sample[input_feature].max())
        x_extents = np.maximum(capped, sample[input_feature].min())
        y_extents = weight * x_extents + bias
        plt.plot(x_extents, y_extents, color=colors[period])
    print("Model training finished.")

    # Right panel: loss per period.
    plt.subplot(1, 2, 2)
    plt.ylabel('RMSE')
    plt.xlabel('Periods')
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)

    # Summary table of the last predictions against the targets.
    calibration_data = pd.DataFrame()
    calibration_data["predictions"] = pd.Series(predictions)
    calibration_data["targets"] = pd.Series(targets)
    display.display(calibration_data.describe())

    print("Final RMSE (on training data): %0.2f" % period_rmse)
from sklearn import metrics

# Synthetic data: y = 8 + 4x plus uniform noise in [0, 3).
rdm = np.random.RandomState(2)
xtrain = 10 * rdm.rand(30)
ytrain = 8 + 4 * xtrain + rdm.rand(30) * 3

# Fit a linear regression; the estimator needs a 2-D feature array.
model = LinearRegression()
model.fit(xtrain[:, np.newaxis], ytrain)

# Predictions for the training inputs.
ytest = model.predict(xtrain[:, np.newaxis])

# Mean squared error and its square root.
mse = metrics.mean_squared_error(ytrain, ytest)
rmse = np.sqrt(mse)

# Sum of squares of (prediction - mean of the observed targets).
observed_mean = ytrain.mean()
ssr = np.square(ytest - observed_mean).sum()
# Sum of squares of (observed target - mean of the observed targets).
sst = np.square(ytrain - observed_mean).sum()

# Coefficient of determination, first by hand ...
r2 = ssr / sst
# ... then as reported by the model itself (this value is the one kept).
r2 = model.score(xtrain[:, np.newaxis], ytrain)
label=y_val[:, i], weight=items["perishable"] * 0.25 + 1) watchlist = [(dtrain, 'train'), (dval, 'val')] model = xgb.train(plst, dtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=50) val_pred.append(model.predict(dval)) test_pred.append(model.predict(dtest)) print("Validation mse:", mean_squared_error(y_val, np.array(val_pred).transpose())**0.5) p_val = np.array(val_pred).transpose() df_val = pd.DataFrame( p_val, index=df_2017.index, columns=pd.date_range("2017-07-26", periods=16)).stack().to_frame("unit_sales") df_val.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True) df_val.to_csv('out/xgb_0115_pred.csv', float_format='%.4f', index=None) df_true.to_csv('out/xgb_0115_true.csv', float_format='%.4f', index=None) print("Making submission...") y_test = np.array(test_pred).transpose() df_preds = pd.DataFrame(
# To see the predicted values and the actual values for comparison

# In[17]:

df1.plot(kind='bar', figsize=(10, 8))
# Major grid: solid green; minor grid: dotted black.
for grid_kind, grid_style, grid_color in (('major', '-', 'green'),
                                          ('minor', ':', 'black')):
    plt.grid(which=grid_kind, linestyle=grid_style, linewidth='0.5',
             color=grid_color)
plt.show()

# In[18]:

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
squared_error = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', squared_error)
print('Root Mean Squared Error:', np.sqrt(squared_error))

# # Model 2: Random Forest Regressor

# In[19]:

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# A forest of 1000 trees with a fixed seed, trained on the training split.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(X_train, y_train)
# # ## mean_squared_error(Y_true, Y_predict) # print("Simple Linear Regression MSE: " + str(mean_squared_error(df['price'], Yhat))) # # ###5 Model 2: Multiple Linear Regression # # Price = -15678.742628061467 + 52.65851272 x horsepower + 4.69878948 x curb-weight + 81.95906216 x engine-size + 33.58258185 x highway-mpg # # calculate the R^2 # # fit the model Z = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']] lm.fit(Z, df['price']) # Find the R^2 print("Multiple Linear Regression R^2: " + str(lm.score(Z, df['price']))) #that ~ 80.896 % # calculate the MSE Y_predict_multifit = lm.predict(Z) print("Multiple Linear Regression MSE: " + str(mean_squared_error(df['price'], Y_predict_multifit))) # # ###6 Model 3: Polynomial Fit # poly = PolynomialFeatures(degree = 3) # X_poly = poly.fit_transform(X) # # poly.fit(X_poly, Y) # lin2 = LinearRegression() # lin2.fit(X_poly, Y) # Ypred = lin2.predict(X_poly) # r2 = r2_score(Y,Ypred) #0.651793603702672 # print("Polynomial Fit R^2: " + str(r2)) # print("Polynomial Fit MSE: " + str(mean_squared_error(df['price'], Ypred))) ##5 Multiple Linear Regression # lm = LinearRegression()
# Time the fit.
t0 = time.time()
regr.fit(x_train, y_train.ravel())
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit)

# Time the prediction.
t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

# Compute each metric once; it is reported both to the log file and stdout.
mae = metrics.mean_absolute_error(y_test, y_regr)
mse = metrics.mean_squared_error(y_test, y_regr)
rmse = np.sqrt(mse)

# Append the run to the log; the context manager closes the file even if a
# write fails.
with open("output.txt", "a") as outF:
    print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit, file=outF)
    print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict), file=outF)
    print('Mean Absolute Error (MAE):', mae, file=outF)
    print('Mean Squared Error (MSE):', mse, file=outF)
    print('Root Mean Squared Error (RMSE):', rmse, file=outF)

print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', rmse)

# Undo the scaling so the plot is in the original units.
x_test_dim = sc_x.inverse_transform(x_test)
y_test_dim = sc_y.inverse_transform(y_test)
y_regr_dim = sc_y.inverse_transform(y_regr)

plt.scatter(x_test_dim, y_test_dim, s=5, c='k', marker='o', label='Matlab')
plt.scatter(x_test_dim, y_regr_dim, s=5, c='r', marker='+', label='Multi-layer Perceptron')
#plt.title('Relaxation term $R_{ci}$ regression')
plt.ylabel('$R_{ci}$ $[J/m^3/s]$')
X = iris.data[:, :2]  # use the first two features
Y = iris.target

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=3)
x, y = X_train, y_train

# The model input must be 2-D: one row per sample, one column per feature.
# x = x.reshape(-1, 1)  # n rows, 1 column

model = lm.Ridge(150, fit_intercept=True, max_iter=1000)
model.fit(x, y)
pred_y = model.predict(x)  # predictions for the training samples

# Print the evaluation metrics of the model.
for metric_label, metric_fn in (('平均绝对值误差:', sm.mean_absolute_error),
                                ('平均平方误差:', sm.mean_squared_error),
                                ('中位绝对值误差:', sm.median_absolute_error),
                                ('R2得分:', sm.r2_score)):
    print(metric_label, metric_fn(y, pred_y))

# Draw the figure.
mp.figure("Linear Regression", facecolor='lightgray')
mp.title('Linear Regression', fontsize=16)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.xlabel('x')
mp.ylabel('y')
mp.scatter(x, y, s=60, marker='o', c='dodgerblue', label='Points')
mp.plot(x, pred_y, c='orangered', label='LR Line')
mp.tight_layout()
mp.legend()
def rmse(prediction, ground_truth):
    """Return the RMSE over the entries where ``ground_truth`` is non-zero.

    Args:
        prediction: Array-like of predicted values, same shape as
            ``ground_truth``.
        ground_truth: Array-like of true values; zero entries are excluded
            from the score.

    Returns:
        float: root mean squared error over the non-zero positions.
    """
    # Plain ndarrays instead of np.mat: the matrix constructor is gone from
    # NumPy 2 and scikit-learn no longer accepts np.matrix inputs.
    prediction = np.asarray(prediction)
    ground_truth = np.asarray(ground_truth)
    # Index tuple of the non-zero positions; indexing with it yields 1-D
    # arrays in matching order.
    rated = ground_truth.nonzero()
    return sqrt(mean_squared_error(prediction[rated], ground_truth[rated]))
# Greedy forward pass over `features`: each candidate is added to the running
# feature set, a normalised linear regression is refitted on the training
# split, and the candidate is recorded in `best_features` when the score
# condition below holds.
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 8]
regressor = LinearRegression(normalize = True)
current_features = []
# Splitting training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)
best_features = []
size = len(X_train)
# `i` is never incremented: the loop always examines features[0], which is
# removed at the end of every iteration. NOTE: `max` shadows the builtin.
i = 0; max = 0
prev_rmse= 0.0
m_mse = 0.0
while len(features) > 0 and i < len(features):
    # Fetching the feature
    current_features.append(features[i])
    X_in_test = X_train[current_features]
    y_in_test = y_train.values
    regressor.fit(X_in_test, y_in_test)
    # R^2 and MSE on the training rows themselves.
    scr = regressor.score(X_in_test, y_in_test)
    mse = met.mean_squared_error(y_in_test, regressor.predict(X_in_test))
    # NOTE(review): labelled RMSE but prints sqrt(mse * size) - confirm intent.
    print('\n ADDED FEATURES ' + str(features[i]) + ' RMSE ', math.sqrt(mse * size))
    print('\n R2 SCORE IS ',scr)
    # Record the candidate when R^2 beats the best seen so far, or when the
    # MSE stored at the last recorded candidate is below the previous
    # iteration's MSE.
    if scr > max or m_mse < prev_rmse :
        max = scr
        best_features.append(features[i])
        m_mse = mse
    # The candidate stays in current_features whether or not it was recorded.
    features.remove(features[i])
    prev_rmse = mse
print(best_features)
print(' MAX ',max)
def _report_regression_metrics(y_true, y_pred):
    """Compute, print and return (MSE, MAE, median AE, R2, explained variance)."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    exvs = explained_variance_score(y_true, y_pred)
    print('\t Mean Squared Error :', mse)
    print('\t Mean Absolute Error :', mae)
    print('\t Median Absolute Error :', medae)
    print('\t R2 Score :', r2)
    print('\t Explained Variance Score:', exvs)
    return mse, mae, medae, r2, exvs


def DT_main_seq(start, stop, testGroup, segmentName):
    """Train a gradient-boosting regressor on a flow-rate range and test it on one group.

    The model is fitted on flow rates ``start..stop`` (step 10) and scored on
    the training data and on ``testGroup``. It is then refitted on the
    training range plus the test group and scored on ``testGroup`` again.
    Both test results are written to a CSV file in ``destinationFolder``,
    and feature rankings are produced before and after the refit.

    Args:
        start: First flow rate of the training range.
        stop: Last flow rate of the training range (inclusive).
        testGroup: Flow rate held out for testing.
        segmentName: Segment identifier forwarded to ``getXData`` and used in
            the output names.

    Returns:
        None. Results are printed and written to disk.
    """
    print('\n----------Start-----------\n')
    # Hyper-parameters are fixed here; an earlier version obtained them,
    # together with the function arguments, from parsingInit().
    n_estimators = 1000
    max_depth = 2
    min_samples_split = 2
    learning_rate = 0.01
    loss = 'ls'

    flowRates_Train = np.array([i for i in range(start, stop + 10, 10)])
    flowRates_Test = np.array(
        [i for i in range(testGroup, testGroup + 10, 10)])
    flowRates_reTrain = np.append(flowRates_Train, flowRates_Test)

    #The 160 flow rate data is corrupted!!
    #TODO: recollect the data
    flowRates_Train = np.delete(flowRates_Train,
                                np.where(flowRates_Train == 160))
    flowRates_Test = np.delete(flowRates_Test,
                               np.where(flowRates_Test == 160))
    flowRates_reTrain = np.delete(flowRates_reTrain,
                                  np.where(flowRates_reTrain == 160))

    print('Train: ', flowRates_Train)
    print('Test: ', flowRates_Test)
    print('reTrain: ', flowRates_reTrain)

    print('1. Extracting Data... ')
    #Train Data
    X_Train, y_thic_Train, y_flow_Train = getXData(KPI_fileName, objectName,
                                                   segment_Numbers,
                                                   flowRates_Train,
                                                   segmentName, features)
    featureNames = X_Train.columns
    #Test Data
    X_Test, y_thic_Test, y_flow_Test = getXData(KPI_fileName, objectName,
                                                segment_Numbers,
                                                flowRates_Test,
                                                segmentName, features)
    #ReTrain Data
    X_reTrain, y_thic_reTrain, y_flow_reTrain = getXData(
        KPI_fileName, objectName, segment_Numbers, flowRates_reTrain,
        segmentName, features)

    #%% Preprocessing Data converting to float32 and removing NaN
    print('2. Preprocessing Data...')
    # Each data set is imputed with its own column means (the imputer is
    # refitted on every call).
    imp1 = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X_Train, y_thic_Train = preProcess(X_Train, y_thic_Train)
    X_Train = imp1.fit_transform(X_Train)
    X_Test, y_thic_Test = preProcess(X_Test, y_thic_Test)
    X_Test = imp1.fit_transform(X_Test)
    X_reTrain, y_thic_reTrain = preProcess(X_reTrain, y_thic_reTrain)
    X_reTrain = imp1.fit_transform(X_reTrain)

    #%%
    if not os.path.exists(destinationFolder):
        os.makedirs(destinationFolder)

    paramsGBR = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'learning_rate': learning_rate,
        'loss': loss
    }
    model = ensemble.GradientBoostingRegressor(**paramsGBR)

    #%%
    print('3. Building Model with all the Samples...')
    X_Train, y_thic_Train = shuffle(X_Train, y_thic_Train)
    print('\t Shape Train: ', X_Train.shape)
    print('\t DataType Train: ', X_Train.dtype)
    print('\t Shape Train: ', y_thic_Train.shape)
    print('\t DataType Train: ', y_thic_Train.dtype)

    # NOTE(review): the min-max scaler is fitted on the raw features but
    # applied to the standardised ones - confirm this is intended.
    min_max_scaler_Train_X = preprocessing.MinMaxScaler().fit(X_Train)
    scaler_Train_X = preprocessing.StandardScaler().fit(X_Train)
    X_Tr = scaler_Train_X.transform(X_Train)
    X_Tr = min_max_scaler_Train_X.transform(X_Tr)
    clf_Tr = model.fit(X_Tr, y_thic_Train)

    #%%
    print('4. Results for Training:')
    y_pred1 = clf_Tr.predict(X_Tr)
    featureImportance(clf_Tr, featureNames,
                      str(testGroup) + '_initialRankings_' + segmentName)
    _report_regression_metrics(y_thic_Train, y_pred1)

    #%%
    print('\n5. Processing emissions Signals for Group ', flowRates_Test,
          ' ...')
    X_Test, y_thic_Test = shuffle(X_Test, y_thic_Test)
    print('\t Shape Test: ', X_Test.shape)
    print('\t DataType Train: ', X_Test.dtype)
    print('\t Shape y Test: ', y_thic_Test.shape)
    print('\t DataType y Test: ', y_thic_Test.dtype)

    print('6. Transforming emissions Signals for Group ', flowRates_Test,
          ' ...')
    X_Te = scaler_Train_X.transform(X_Test)
    X_Te = min_max_scaler_Train_X.transform(X_Te)
    print('\t Shape X_Te: ', X_Te.shape)
    print('\t DataType X_te: ', X_Te.dtype)

    print('7. Predicting KPI for Signals for Group ', flowRates_Test, ' ...')
    y_pred_Te = clf_Tr.predict(X_Te)

    print('8. Results for Predicting KPI for Signals for Group ',
          flowRates_Test, ' ...')
    mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test = \
        _report_regression_metrics(y_thic_Test, y_pred_Te)

    fileNamecsv = destinationFolder + '/FeatureRanking_' + str(
        testGroup) + '_' + segmentName + '.csv'
    print('9. Saving Results', fileNamecsv, ' ...')
    np.savetxt(
        fileNamecsv, [[mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test]],
        delimiter=',',
        header=
        'Mean Squared Error, Mean Absolute Error, Median Absolute Error,R2 Score, Explained Variance Score',
        comments='')

    print('10. Retraining the Model with new emission Signal...')
    X_reTrain, y_thic_reTrain = shuffle(X_reTrain, y_thic_reTrain)
    # Report the retraining arrays themselves; the original printed the
    # target array under the feature labels and the *test* targets under
    # the retrain-target labels.
    print('\t Shape reTrain: ', X_reTrain.shape)
    print('\t DataType reTrain: ', X_reTrain.dtype)
    print('\t Shape y reTrain: ', y_thic_reTrain.shape)
    print('\t DataType y reTrain: ', y_thic_reTrain.dtype)

    min_max_scaler_Train_X2 = preprocessing.MinMaxScaler().fit(X_reTrain)
    scaler_Train_X2 = preprocessing.StandardScaler().fit(X_reTrain)
    X_reTr = scaler_Train_X2.transform(X_reTrain)
    X_reTr = min_max_scaler_Train_X2.transform(X_reTr)
    print('\t Shape X_reTr: ', X_reTr.shape)
    print('\t DataType X_reTr: ', X_reTr.dtype)

    # The test features are still scaled with the scalers fitted on the
    # original training data, not with the retraining ones.
    X_Te = scaler_Train_X.transform(X_Test)
    X_Te = min_max_scaler_Train_X.transform(X_Te)
    print('\t Shape X_Te: ', X_Te.shape)
    print('\t DataType X_Te: ', X_Te.dtype)

    # Refits the same estimator object from scratch on the enlarged data.
    clf_reTr = model.fit(X_reTr, y_thic_reTrain)

    print('11. New Results with emission signals Incorporated:')
    y_pred_Te = clf_reTr.predict(X_Te)
    mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test = \
        _report_regression_metrics(y_thic_Test, y_pred_Te)

    print('12. Saving the new Results', fileNamecsv, ' ...')
    df = pd.DataFrame([[mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test]])
    # Append below the first result row; the context manager closes the
    # handle even if the write fails.
    with open(fileNamecsv, 'a') as f:
        df.to_csv(f, index=False, header=False)

    featureImportance(clf_reTr, featureNames,
                      str(testGroup) + '_reTrainedRankings_' + segmentName)
    print('-----------:Finished!:--------------- \n')
def randomforest_predict():
    """Grid-search a random-forest regressor on the housing data and report it.

    Reads ``data/housing.data``, holds out 15% of the rows, tunes the forest
    with 3-fold cross-validation, then:

    * writes one tree of the best forest to ``result/rf_reg.png``;
    * prints and plots the feature importances;
    * plots test labels against predictions;
    * prints the RMSE over the whole data set;
    * writes the test predictions to
      ``result/price_predict_randomforest.csv`` and a comparison chart to
      ``result/price_predict_random_forest.png``.

    Returns:
        None.
    """
    warnings.filterwarnings('ignore')
    df_data = pd.read_csv("data/housing.data", delim_whitespace=True)
    X = df_data.drop(["MEDV"], axis=1)
    y = df_data["MEDV"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=128)

    param_grid = {
        'n_estimators': [5, 10, 20, 50, 100, 200],  # tree number
        'max_depth': [3, 5, 7],  # max depth
        # NOTE(review): the integer 1 means "one feature", not "all
        # features" (that would be 1.0) - confirm which was intended.
        'max_features': [0.6, 0.7, 0.8, 1]  # max features
    }
    rf = RandomForestRegressor()
    grid = GridSearchCV(rf, param_grid=param_grid, cv=3)
    grid.fit(X_train, y_train)
    print("best_params", grid.best_params_)
    rf_reg = grid.best_estimator_
    print(rf_reg)

    # Render the fourth tree of the best forest.
    estimator = rf_reg.estimators_[3]
    dot_data = tree.export_graphviz(estimator, out_file=None, filled=True,
                                    rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png("result/rf_reg.png")

    # Feature importances, most important first.
    feature_names = X.columns
    feature_importances = rf_reg.feature_importances_
    indices = np.argsort(feature_importances)[::-1]
    for index in indices:
        print("feature %s (%f)" % (feature_names[index],
                                   feature_importances[index]))
    plt.figure(figsize=(16, 8))
    plt.title("feature importance of random forest")
    plt.bar(range(len(feature_importances)), feature_importances[indices],
            color='b')
    plt.xticks(range(len(feature_importances)),
               np.array(feature_names)[indices], color='b')
    plt.show()

    # Predict the held-out rows once and reuse the result below.
    y_predict = rf_reg.predict(X_test)

    rst = pd.DataFrame({"label": y_test, "prediction": y_predict})
    print(rst.head())
    rst['label'].plot(style='k.', figsize=(15, 5))
    rst['prediction'].plot(style='r.')
    plt.legend(fontsize=15, markerscale=3)
    plt.tick_params(labelsize=25)
    plt.grid()
    plt.show()

    # RMSE over the full data set (training rows included).
    MSE = metrics.mean_squared_error(y, rf_reg.predict(X))
    print(np.sqrt(MSE))

    submission = pd.DataFrame({"prediction": y_predict})
    submission.to_csv("result/price_predict_randomforest.csv")

    # Column vectors for plotting. Convert to NumPy first: pandas Series no
    # longer support multi-dimensional indexing such as ``s[:, np.newaxis]``.
    x_data = np.arange(len(y_test))[:, np.newaxis]
    y_test_data = np.asarray(y_test)[:, np.newaxis]
    y_predict_data = y_predict[:, np.newaxis]
    plt.plot(x_data, y_test_data, label='Price')
    plt.plot(x_data, y_predict_data, label='Predict price')
    plt.xlabel('Entity')
    plt.ylabel('Price')
    plt.title('Price prediction (random forest)')
    plt.legend()
    plt.savefig('result/price_predict_random_forest.png')
    plt.show()
from sklearn.linear_model import LinearRegression

# Plain linear regression: fit, show the estimator, score both splits.
model = LinearRegression()
model.fit(X_train, y_train)
print(model)
score_train = model.score(X_train, y_train)
score_test = model.score(X_test, y_test)

# Same estimator wrapped in a 5-fold grid search over an empty grid.
parameters = {}
model = GridSearchCV(LinearRegression(), parameters, cv=5)
model.fit(X_train, y_train)
output = model.predict(X_test)
score_r2_pred = r2_score(y_test, output)
rmse = np.sqrt(mean_squared_error(y_test, output))

# First row of the comparison table:
# name, train score, test score, R2 of the predictions, RMSE.
linear_row = ['Linear Regressor', score_train, score_test, score_r2_pred, rmse]
Obsv_tbl = [linear_row]

# XGBoost regression model
import xgboost as xgb
from sklearn.metrics import mean_squared_error as ms
from math import sqrt

xgb_settings = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    n_estimators=2500,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    random_state=7,
)
model = xgb.XGBRegressor(**xgb_settings)
kfcv = KFold(n_splits=10)

# RidgeCV with 10-fold cross-validation (similar to ISLR)
#rcv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', normalize=True)
rcv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', normalize=True, cv=kfcv)
rcv.fit(X_train, Y_train)
print('\nBest RidgeCV alpha value:')
print(rcv.alpha_)

# Ridge regression using the best alpha
rbest = Ridge(alpha=rcv.alpha_, normalize=True)
rbest.fit(X_train, Y_train)
print('\nBest Ridge MSE:')
print(mean_squared_error(Y_test, rbest.predict(X_test)))
print('\nRidge Coeficients:')
print(pd.Series(rbest.coef_, index=xcols))

# Full Lasso regression: one fit per alpha, collecting the coefficient path.
lasso = Lasso(max_iter=10000, normalize=True)
coefs2 = []
for a in alphas:
    lasso.set_params(alpha=a)
    lasso.fit(scale(X), Y)
    coefs2.append(lasso.coef_)

ax2 = plt.gca()
# Plot the Lasso path collected above; the original plotted `coefs`, which
# this section never fills.
ax2.plot(alphas*2, coefs2)
print("setp = {}, loss = {:.5f}".format(step+1, loss_val)) # model 최적화 a_up, b_up = sess.run([a, b]) print("수정된 기울기 : {}, 절편 : {}".format(a_up, b_up)) # 테스트용 공급 data feed_data_test = {X : x_test, Y : y_test} # Y(정답) vs model(예측치) y_true = sess.run(Y, feed_dict = feed_data_test) y_pred = sess.run(model, feed_dict = feed_data_test) # model 평가 mse = mean_squared_error(y_true, y_pred) print("MSE = ", mse) ''' 1차 : 학습율 = 0.5, 반복학습 100회 MSE = 0.72902936 2차 : 학습율 = 0.4, 반복학습 100회 MSE = 0.5829428 3차 : 학습율 = 0.4, 반복학습 200회 MSE = 0.7733004 '''
def test(self):
    """Predict on ``self.testX`` and record the mean squared error.

    Stores the predictions in ``self.results`` and their MSE against
    ``self.testY`` in ``self.finalError``.
    """
    predictions = self.model.predict(self.testX)
    self.results = predictions
    self.finalError = mean_squared_error(predictions, self.testY)
plt.plot(X_plot[:,0], y_plot, color='r') plt.axis([-3, 3, 0, 6]) plt.show() print("==============岭回归解决多项式回归问题=======================") def RidgeRegression(degree=2, alpha=1): return Pipeline([ ("poly", PolynomialFeatures(degree=degree)), #多项式的增加特征 ("std_scaler", StandardScaler()), #归一化 ("ridge_reg", Ridge(alpha=alpha)) #岭回归替代了线性回归 ]) ridge1_reg=RidgeRegression(20,0.0001) ridge1_reg.fit(X_train,y_train) y1_predict=ridge1_reg.predict(X_test) print("MSE: ",mean_squared_error(y_test,y1_predict)) plot_model(ridge1_reg) ridge1_reg=RidgeRegression(20,3) ridge1_reg.fit(X_train,y_train) y1_predict=ridge1_reg.predict(X_test) print("MSE: ",mean_squared_error(y_test,y1_predict)) plot_model(ridge1_reg) ridge1_reg=RidgeRegression(20,10000) ridge1_reg.fit(X_train,y_train) y1_predict=ridge1_reg.predict(X_test) print("MSE: ",mean_squared_error(y_test,y1_predict)) plot_model(ridge1_reg)
def fmean_squared_error(ground_truth, predictions):
    """Return the square root of the mean squared error of ``predictions``."""
    return mean_squared_error(ground_truth, predictions) ** 0.5