def train_dummy_regressors(features, target):
    """Fit mean and median ``DummyRegressor`` baselines and print their scores.

    Each baseline is fitted and scored on the same data, so the printed
    value is a training-set score expressed as a percentage.

    Args:
        features: Feature matrix passed to ``DummyRegressor.fit``.
        target: Target array; it must provide ``.flatten()``.

    Returns:
        None. Results are printed.
    """
    # Flatten once and reuse, so fit and score see the same 1-D target.
    flat_target = target.flatten()
    for strat in ['mean', 'median']:
        dr = DummyRegressor(strategy=strat)
        dr.fit(features, y=flat_target)
        dummy_score = 100 * dr.score(features, flat_target)
        print('{:.1f} % score for a dummy regressor using the {} strategy'.format(
            dummy_score, dr.get_params()['strategy']))
feat_imps.plot(x='feature', y='importance', kind='barh') # LINEAR REGRESSION X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=0.2, random_state=44) # Create a dummy regressor dummy_mean = DummyRegressor(strategy='mean') # "Train" dummy regressor dummy_mean.fit(X_train, y_train) # Get R-squared score dummy_mean.score(X_test, y_test) # -0.11 R2 using mean # vanilla linear regression - R2 is -4! model = regr.fit(X_train, y_train) model.score(X_test, y_test) # LASSO # Standarize features scaler = StandardScaler() X_std = scaler.fit_transform(X_train) # Create lasso regression with alpha value lasso = Lasso(alpha=0.1)
# Targets, features and the Vegas closing total for the 10-game data set.
y_train_10 = train_10['GAME_TOTAL'].to_numpy()
X_test_10 = test_10.drop('GAME_TOTAL', axis=1).to_numpy()
y_test_10 = test_10['GAME_TOTAL'].to_numpy()
Test_Vegas = test_10['TOTAL_CLOSE'].to_numpy()
Train_Vegas = train_10['TOTAL_CLOSE'].to_numpy()

# RMSE is computed as sqrt(MSE) rather than with `squared=False`, because
# that keyword is deprecated and absent from recent scikit-learn releases.

#Vegas BASELINE = 17.565434708173875
mean_squared_error(y_test_10, Test_Vegas) ** 0.5

#DUMMY REGRESSOR:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_10, y_train_10)
#returns -0.0011412
#second run with new data = -0.00201585
dummy_regr.score(X_test_10, y_test_10)
#returns 21.1452
#second run = 21.1599
mean_squared_error(y_test_10, dummy_regr.predict(X_test_10)) ** 0.5

#OLS
# NOTE(review): sm.OLS does not add an intercept on its own, so the R**2
# values quoted below are likely from a no-constant fit - confirm whether
# X_train_10 already contains a constant column.
regressor = sm.OLS(y_train_10, X_train_10)
regressor = regressor.fit()
#evidently this returned a 0.991 R**2
#second run gave us 0.993
regressor.summary()
preds = regressor.predict(X_test_10)
#this returns a RMSE of 19.29939303517463
#second run gives 17.708329120934696, which is close to vegas without any tuning...
mean_squared_error(y_test_10, preds) ** 0.5
# Features, targets and the Vegas closing total for the season-level data set.
X_train_s = train_season.drop('GAME_TOTAL', axis=1).to_numpy()
y_train_s = train_season['GAME_TOTAL'].to_numpy()
X_test_s = test_season.drop('GAME_TOTAL', axis=1).to_numpy()
y_test_s = test_season['GAME_TOTAL'].to_numpy()
Test_Vegas = test_season['TOTAL_CLOSE'].to_numpy()
Train_Vegas = train_season['TOTAL_CLOSE'].to_numpy()

# RMSE is computed as sqrt(MSE) rather than with `squared=False`, because
# that keyword is deprecated and absent from recent scikit-learn releases.

#Vegas BASELINE = 17.650007402704748
# Here the Vegas line is evaluated over train and test rows combined.
mean_squared_error(np.append(y_train_s, y_test_s),
                   np.append(Train_Vegas, Test_Vegas)) ** 0.5

#DUMMY REGRESSOR:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_s, y_train_s)
#-0.7833193001644205
dummy_regr.score(X_test_s, y_test_s)
#27.845427872989156
mean_squared_error(y_test_s, dummy_regr.predict(X_test_s)) ** 0.5

#OLS
# NOTE(review): sm.OLS does not add an intercept on its own - confirm
# whether X_train_s already contains a constant column.
regressor = sm.OLS(y_train_s, X_train_s)
regressor = regressor.fit()
#evidently this returned a 0.991 R**2
#second run gave us 0.993
regressor.summary()
preds = regressor.predict(X_test_s)
#18.5802074596655
mean_squared_error(y_test_s, preds) ** 0.5

#RANDOM FOREST
rf = RandomForestRegressor(oob_score=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # # Baseline Model # In[36]: from sklearn.dummy import DummyRegressor dummy_regr = DummyRegressor(strategy="mean") dummy_regr.fit(X_train, y_train) dummy_regr.predict(X_train) baseline = dummy_regr.score(X_train, y_train) print("Baseline R^2: %f" % baseline) # # Multiple Linear Regression # In[37]: ols = linear_model.LinearRegression() ols.fit(X_train, y_train) print("Coefficients: %s" % ols.coef_) print("Intercept: %f" % ols.intercept_) y_test_prediction = ols.predict(X_test) ols.score(X_train, y_train) # In[40]:
def test_regressor_score_with_None(y, y_test):
    """Check that ``DummyRegressor`` can fit and score with ``X=None``.

    Args:
        y: Target values used for fitting.
        y_test: Target values used for scoring; the score is expected to
            be exactly 1.0.
    """
    reg = DummyRegressor()
    reg.fit(None, y)
    # Plain assert instead of the legacy assert_equal helper.
    assert reg.score(None, y_test) == 1.0
def train_rf(self, features, labels):
    """Cross-validate a random-forest regressor and a follow-up classifier.

    In each outer fold ``self.model`` is fitted and scored against a
    ``DummyRegressor``. A Theil-Sen line is then fitted between the second
    label column and its prediction; test samples whose prediction lies
    within 0.2 of that line are flagged (``dy``), and ``self.model2`` is
    cross-validated on predicting that flag from the features.

    Reads ``self.kfold`` (number of splits for both K-fold loops) and sets
    ``self.model``, ``self.model2``, ``self.lr0`` and ``self.lr1``.

    Args:
        features: 2-D array of samples; indexed with fold index arrays.
        labels: 2-D array with at least two columns (column 1 is used for
            the Theil-Sen fit).

    Returns:
        None.

    NOTE(review): the source this was recovered from had lost its
    indentation. The plotting block and the final ``return`` are placed
    inside the inner loop (so the function stops after the first figure);
    confirm this nesting against the original file.
    """
    print('Training random forest ...')
    # scikit-learn expects an integer depth; np.ceil returns a float.
    tree_depth = int(np.ceil(len(features[0]) / 5))
    self.model = RandomForestRegressor(n_estimators=100,
                                       max_features='sqrt',
                                       max_depth=tree_depth,
                                       min_samples_leaf=3,
                                       n_jobs=-1)
    self.model2 = RandomForestClassifier(
        n_estimators=100,
        max_features='sqrt',
        max_depth=tree_depth,
        min_samples_leaf=3,
        n_jobs=-1
    )
    self.lr0 = linear_model.TheilSenRegressor()
    self.lr1 = linear_model.TheilSenRegressor()
    reg_dummy = DummyRegressor()
    clf_dummy = DummyClassifier()
    kfold = KFold(n_splits=self.kfold, shuffle=True)
    kfold2 = KFold(n_splits=self.kfold, shuffle=True)
    features, labels = shuffle(features, labels)

    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(style='whitegrid', context='paper')

    for ifold, (train, test) in enumerate(kfold.split(labels)):
        # Regressor vs. dummy baseline on this outer fold.
        self.model.fit(features[train], labels[train])
        score_train = self.model.score(features[train], labels[train])
        score_test = self.model.score(features[test], labels[test])
        reg_dummy.fit(features[train], labels[train])
        score_dummy = reg_dummy.score(features[test], labels[test])
        print('Fold %d: %.4f / %.4f (%.4f)' % (ifold, score_test, score_train, score_dummy))

        labels_t = labels.transpose()
        y_pred = self.model.predict(features)
        y_pred_t = y_pred.transpose()
        # self.lr0.fit(labels_t[0][train].reshape(-1, 1), y_pred_t[0][train])
        self.lr1.fit(labels_t[1][train].reshape(-1, 1), y_pred_t[1][train])
        y_lr = self.lr1.predict(labels_t[1][test].reshape(-1, 1))
        # True where the prediction is within 0.2 of the Theil-Sen line.
        dy = np.abs(y_pred_t[1][test] - y_lr) < 0.2
        print('\t%d / %d' % (np.sum(dy), np.sum(1 - dy)))

        for jfold, (train2, test2) in enumerate(kfold2.split(dy)):
            # Classifier predicting the flag, evaluated by precision.
            self.model2.fit(features[test[train2]], dy[train2])
            y_pred2 = self.model2.predict(features[test[test2]])
            score_train2 = precision_score(dy[train2],
                                           self.model2.predict(features[test[train2]]),
                                           average='binary')
            score_test2 = precision_score(dy[test2], y_pred2, average='binary')
            clf_dummy.fit(features[test[train2]], dy[train2])
            score_dummy = precision_score(dy[test2],
                                          clf_dummy.predict(features[test[test2]]),
                                          average='binary')
            print('\tFold %d: %.4f / %.4f (%.4f)' % (jfold, score_test2, score_train2, score_dummy))

            # Regressor score restricted to samples the classifier accepted.
            score_final_train = self.model.score(features[test[train2]], labels[test[train2]])
            score_final_test = self.model.score(features[test[test2[y_pred2]]],
                                                labels[test[test2[y_pred2]]])
            print('\tFinal: %.4f / %.4f' % (score_final_test, score_final_train))

            fig, axs = plt.subplots(2, 2)
            train_truth = labels[train].transpose()
            train_pred = self.model.predict(features[train]).transpose()
            test_truth = labels[test].transpose()
            test_pred = y_pred[test].transpose()
            sns.scatterplot(x=train_truth[0], y=train_pred[0], ax=axs[0, 0])
            sns.scatterplot(x=train_truth[1], y=train_pred[1], ax=axs[0, 1])
            sns.scatterplot(x=test_truth[0][test2[y_pred2]], y=test_pred[0][test2[y_pred2]], ax=axs[1, 0])
            sns.scatterplot(x=test_truth[1][test2[y_pred2]], y=test_pred[1][test2[y_pred2]], ax=axs[1, 1])
            plt.draw()
            plt.show()
            return
from sklearn.linear_model import LinearRegression # 加载数据 boston = load_boston() features, target = boston.data, boston.target # 将数据分为测试集和训练集 features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=0) # 创建DummyRegressor对象 dummy = DummyRegressor(strategy='mean') # 训练回归模型 dummy.fit(features_train, target_train) # 计算R方得分 print(dummy.score(features_test, target_test)) ols = LinearRegression() ols.fit(features_train, target_train) # 计算R方得分 print(ols.score(features_test, target_test)) # 创建一个讲所有样本预测为20的DummyRegressor clf = DummyRegressor(strategy='constant', constant=20) clf.fit(features_train, target_train) # 计算模型的得分 print(clf.score(features_test, target_test))
# Linear-kernel SVM with C=1.0, scored on the test split
svm_linear_c1 = SVC(kernel='linear', C=1.0)
svm_linear_c1.fit(X_train, y_train)
svm_linear_c1.score(X_test, y_test)

# Lesson 6.2 - Evaluation of Classification and Regression Models

# Dummy models
from sklearn.dummy import DummyClassifier, DummyRegressor
# NOTE(review): load_boston is no longer available in recent scikit-learn
# releases - confirm the pinned version.
from sklearn.datasets import load_iris, load_boston

X, y = load_iris(return_X_y=True)
dc = DummyClassifier(strategy='stratified')
dc.fit(X, y)
dc.score(X, y)

X, y = load_boston(return_X_y=True)
dr = DummyRegressor(strategy='mean')
dr.fit(X, y)
dr.score(X, y)

# Confusion matrix, recall and precision
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix, classification_report

X, y = load_breast_cancer(return_X_y=True)
dc = DummyClassifier(strategy='stratified')
dc.fit(X, y)
# predict is called twice here; no random_state is passed to the classifier
confusion_matrix(y, dc.predict(X))
print(classification_report(y, dc.predict(X)))

# Cross-validation
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
dc = DummyClassifier(strategy='stratified')
cross_val_score(dc, X, y, cv=5)
#Create Dummy Regression Always Predicts The Mean Value Of Target # Create a dummy regressor dummy_mean = DummyRegressor(strategy='mean') # "Train" dummy regressor dummy_mean.fit(X, y) # In[ ]: dummy_mean.predict(X) # In[ ]: # Get R-squared score dummy_mean.score(X, y) # ### Making a model # In[ ]: X_train.head() # In[ ]: X_train.rename({"unnamed: 0": "a"}, axis="columns", inplace=True) X_train.drop(["a"], axis=1, inplace=True) # In[ ]: X_test.rename({"unnamed: 0": "a"}, axis="columns", inplace=True)
print('Mean Absolute Error: ', mean_absolute_error(test_data, predictions)) print('Mean Squared Error: ', mean_squared_error(test_data, predictions)) print('Root MSE: ', rmse(test_data, predictions)) print('Mean Absolute Percentage Error', np.mean(np.abs((test_data - predictions) / test_data)) * 100) """Now I'll run the data through a dummy regressor to see how well an 'empty' model performs.""" X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=35) dummy_mean = DummyRegressor(strategy='mean') dummy_mean.fit(X_train, y_train) predictions = dummy_mean.predict(X_test) dummy_score = dummy_mean.score(X_test, y_test) print('Dummy Score: ', dummy_score) metrics(y_test, predictions) dummy_median = DummyRegressor(strategy='median') dummy_median.fit(X_train, y_train) predictions = dummy_median.predict(X_test) dummy_score = dummy_median.score(X_test, y_test) print('Dummy Score: ', dummy_score) metrics(y_test, predictions) """So the R^2 values to beat are -0.005 and -0.162
model.add(Dense(1)) model.compile( optimizer=Adam(), loss="mse", #metrics=['accuracy'] ) return model model = KerasRegressor(build_fn, epochs=10) N = int(len(X) * 0.7) #X_train, X_test, y_train, y_test = train_test_split(X, Y) X_train, X_test, y_train, y_test = X[:N], X[N:], Y[:N], Y[N:] estimator = make_pipeline( StandardScalerNDim(), make_pipeline(FlattenNDim(), LinearSVR()), #model ) estimator.fit(X_train, y_train) print("Model score:", estimator.score(X_test, y_test)) dummy = DummyRegressor() dummy.fit(y_train[:, np.newaxis], y_train) print("Dummy score:", dummy.score(y_test[:, np.newaxis], y_test)) y_pred = estimator.predict(X_test) print("R2 score: %s" % r2_score(y_test, y_pred)) plot_data(X_test, y_test, y_pred)
return df X, Y = rgf.drop('revenue', axis=1), rgf['revenue'] X = regression_engineering(X) train_X, test_X, train_Y, test_Y = train_test_split( X, Y, train_size=0.75, test_size=0.25) #randomly separating training and test set reg = GradientBoostingRegressor() reg.fit(train_X, train_Y) #Train regressor model print('Regressor Score: ', reg.score(test_X, test_Y)) #Compare with dummy regressor!! dummy = DummyRegressor() dummy.fit(train_X, train_Y) print('Dummy Regressor Score: ', dummy.score(test_X, test_Y)) sns.set_style('whitegrid') plt.figure(figsize=(12, 14)) sns.barplot(x=reg.feature_importances_, y=X.columns) plt.savefig('regressor.png') #Classification: Predicting Movie Sucess cls = movies_df[movies_df['return'].notnull()] cls = cls.drop(['revenue'], axis=1) cls['return'] = cls['return'].apply( lambda x: 1 if x >= 1 else 0) #create binary output for classification cls['return'].value_counts() #balanced classes cls['belongs_to_collection'] = cls['belongs_to_collection'].fillna('').apply(
def main():
    """Evaluate a median ``DummyRegressor`` baseline on the trial data.

    The first command-line argument selects the data set: ``-S``, ``-s`` or
    ``--single`` loads the single-token files, ``-M``, ``-m`` or ``--multi``
    loads the multi-token files. A missing or unrecognized argument exits
    with a usage message.

    Prints regression metrics and a table of actual vs. predicted scores,
    and saves a bar chart to ``dummy_results.png``.

    :return: None
    """
    usage = (
        "Please specify which type of trial information you would"
        " like to use (-S for single trial, -M for multi trial"
        " information)!")

    try:
        mode = sys.argv[1]
    except IndexError:
        sys.exit(usage)

    # Membership test: `x == "-S" or "--single"` is always truthy, which
    # made the multi-trial branch unreachable.
    if mode in ("-S", "--single", "-s"):
        training_data = load(
            f"../data/train/{SINGLE_TRAIN_FILE_NAME}")
        trial_data = load(f"../data/{SINGLE_TRIAL_FILE_NAME}")
    elif mode in ("-M", "--multi", "-m"):
        training_data = load(
            f"../data/train/{MULTI_TRAIN_FILE_NAME}")
        trial_data = load(f"../data/{MULTI_TRIAL_FILE_NAME}")
    else:
        # An unknown flag would otherwise fail later with a NameError.
        sys.exit(usage)

    training_data = training_data.dropna()
    trial_data = trial_data.dropna()

    print("Extracting training features...")
    X_train = extract_features(training_data,
                               use_sentence=True,
                               use_word_embeddings=False,
                               use_token=True,
                               use_readability_measures=False)
    y_train = training_data[['complexity']]

    print("Extracting trial features...")
    X_trial = extract_features(trial_data,
                               use_sentence=True,
                               use_word_embeddings=False,
                               use_token=True,
                               use_readability_measures=False)
    y_trial = trial_data[['complexity']]

    # Keep the text columns for the report before dropping non-features.
    tokens = X_trial[['token', "sentence"]]

    X_train.drop(["complexity", "id", "token", "sentence"], axis=1,
                 inplace=True)
    X_trial.drop(["complexity", "id", "token", "sentence"], axis=1,
                 inplace=True)
    print("Finished feature processing!\n")

    regressor = DummyRegressor(strategy="median")
    regressor.fit(X_train, y_train)
    y_guess = regressor.predict(X_trial)

    print(f"Mean squared error: {mean_squared_error(y_trial, y_guess)}")
    print(f"R^2 score: {r2_score(y_trial, y_guess)}")
    print(f"Explained variance score:"
          f" {explained_variance_score(y_trial, y_guess)}")
    print(f"Max error: {max_error(y_trial, y_guess)}")
    print(f"Mean absolute error:"
          f" {mean_absolute_error(y_trial, y_guess)}")

    # Give the predictions the same index as y_trial: after dropna() that
    # index can have gaps, and a fresh RangeIndex would misalign or drop
    # rows in the index-based merge.
    predicted = pd.DataFrame(y_guess, index=y_trial.index)
    results = y_trial.merge(predicted, left_index=True, right_index=True)
    results = results.merge(tokens, left_index=True, right_index=True)
    results.columns = ["Actual", "Predicted", "Token", "Sentence"]
    print(results[['Actual', "Predicted", "Token"]])

    fig = results.plot(kind='bar', rot=0,
                       title="Actual and predicted complexity scores"
                             " by dummy (single token)",
                       xlabel="Sample ID",
                       ylabel="Complexity score", grid=False,
                       figsize=(20, 9)
                       ).get_figure()
    fig.savefig("dummy_results.png")
def benchmark(X_train, X_test, y_train, y_test):
    """Fit a default ``DummyRegressor`` and print its score on the test split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        The fitted ``DummyRegressor``.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3;
    # the bare print statements were a SyntaxError on Python 3.
    print("********************DummyRegressor Model******************")
    model = DummyRegressor()
    model.fit(X_train, y_train)
    print('{}'.format(model.score(X_test, y_test)))
    return model
# Load libraries from sklearn.datasets import load_boston from sklearn.dummy import DummyRegressor from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler # Load data boston = load_boston() # Create features X, y = boston.data, boston.target # Make test and training split X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # Create a dummy regressor dummy_mean = DummyRegressor(strategy='mean') # "Train" dummy regressor dummy_mean.fit(X_train, y_train) # Create a dummy regressor that always predit a contant value dummy_constant = DummyRegressor(strategy='constant', constant=20) # "Train" dummy regressor dummy_constant.fit(X_train, y_train) # Get R-squared score dummy_constant.score(X_test, y_test)
Dummy = DummyRegressor() # DummyRegressor 알고리즘 선언 Dummy.fit(X_train,y_train) # DummyRegressor 알고리즘에 나의 데이터를 적용시켜본다. Dummy_y_pred = Dummy.predict(X_test) # Dummy 알고리즘을 사용해서 Y값을 예측한다. print('DummyRegressor Mean Absolute Error:', metrics.mean_absolute_error(y_test,Dummy_y_pred)) print('DummyRegressor Mean Squared Error:', metrics.mean_squared_error(y_test,Dummy_y_pred)) print('DummyRegressor Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test,Dummy_y_pred))) print('DummyRegressor Accuracy:', metrics.r2_score(y_test,Dummy_y_pred)) Dummy_df = pd.DataFrame({'Actual':y_test, 'Predicted':Dummy_y_pred}) print(Dummy_df) print("R-squared for Train: %.2f" %Dummy.score(X_train,y_train)) print("R-squared for Test: %.2f" %Dummy.score(X_test,y_test)) # ------------------------------------------- Output # random_state=0일 때 # DummyRegressor Mean Absolute Error: 603.3633884521669 # DummyRegressor Mean Squared Error: 880156.477329049 # DummyRegressor Root Mean Squared Error: 938.1665509540665 # DummyRegressor Accuracy: -0.00020030937269321925 # R-squared for Train: 0.00 # R-squared for Test: -0.00 # random_state=43일 때 # DummyRegressor Mean Absolute Error: 606.3160356253877 # DummyRegressor Mean Squared Error: 1342621.3396696597
# Plot the training points and the three models' predictions on one axis.
fig = plt.figure()
ax = fig.add_subplot(111)
red = ax.scatter(Xtrain, ytrain, color='red', marker='+')
knn_plot = ax.plot(Xtest, knn.predict(Xtest), color='green')
kridge_plot = ax.plot(Xtest, kridge.predict(Xtest), color='blue')
base = ax.plot(Xtest, dummy.predict(Xtest), color='orange', linestyle='--')
ax.set_ylabel("output Y", fontsize=20)
ax.set_xlabel("input X", fontsize=20)
# Pass the handles explicitly: with labels only, matplotlib pairs labels
# with artists by an implicit order, so the scatter could be labelled
# "kNN". ax.plot returns a list, hence the [0].
fig.legend([knn_plot[0], kridge_plot[0], base[0], red],
           ["kNN", "KernelRidge", "baseline", "train"],
           scatterpoints=1,
           loc='right',
           ncol=2,
           fontsize=15)
ax.set_title(
    "kNN & KernelRidge Predictions",
    fontsize=20)

# Score each model on the training data
knn_accuracy = knn.score(Xtrain, ytrain)
kridge_accuracy = kridge.score(Xtrain, ytrain)
baseline_accuracy = dummy.score(Xtrain, ytrain)

# Print outputs
print("base model accuracy score: ", baseline_accuracy,
      " - knn model accuracy score: ", knn_accuracy,
      " - kridge accuracy: ", kridge_accuracy)
plt.show()
test_size=0.2, random_state=0) print(X_train.shape, y_train.shape) print(X_test.shape, y_test.shape) # average SalePrice in train and test print('mean SalePrice in train : {0:.3f}'.format(np.mean(y_train))) print('mean SalePrice in test : {0:.3f}'.format(np.mean(y_test))) from sklearn.dummy import DummyRegressor # baseline model model_dummy = DummyRegressor(strategy='mean') model_dummy.fit(X_train, y_train) print('score for baseline model: {0:.2f}'.format( model_dummy.score(X_test, y_test))) def find_best_model_using_gridsearchcv(X, y): algos = { 'linear_regression': { 'model': LinearRegression(), 'params': { 'normalize': [True, False] } }, 'lasso': { 'model': Lasso(), 'params': { 'alpha': [1, 2, 3, 4, 5], 'selection': ['random', 'cyclic']
ml_desafio02['Machile-Learning-RD'] = ridge_predicoes ml_desafio02['Diferença-ML-RD'] = ((y_teste - ml_desafio02['Machile-Learning-RD'])**2)**.5 plt.figure(figsize=(14,8)) plt.title('Comparação Da diferença da Machine Learning SVM x Ridge Gerando uma nota') sns.histplot(data = ml_desafio02, x = 'Diferença-ML-SVM', kde = True, stat='density', color='blue',alpha=1,fill=True,multiple='stack') aula04_d2 = sns.histplot(data = ml_desafio02, x = 'Diferença-ML-RD', kde = True, stat='density', color='green',alpha=.5,fill=True,multiple='stack') aula04_d2.set_xlabel('Valor da Diferença') aula04_d2.set_ylabel('Quantidade Diferença') aula04_d2.legend(labels=('SVM','Ridge'),title=('Machine Learning')) """Desafio 03""" modelo_dummyv2 = DummyRegressor(quantile=1,constant=9) modelo_dummyv2.fit(x_treino,y_treino) dummy_predicoesv2 = modelo_dummyv2.predict(x_teste) print(modelo_dummyv2.score(X=x_treino,y=y_treino)) print(mean_squared_error(y_teste,ridge_predicoes)**.5) print(mean_squared_error(y_teste,predicoes_matematica)**.5) print(mean_squared_error(y_teste,dummy_predicoes)**.5) print(mean_squared_error(y_teste,dummy_predicoesv2)**.5) """Desafio 04""" from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score validadores = [explained_variance_score,max_error,mean_absolute_error,mean_squared_error,median_absolute_error,r2_score] validadores_nome = ['explained_variance_score','max_error','mean_absolute_error','mean_squared_error','median_absolute_error','r2_score'] predicoes = [ridge_predicoes,predicoes_matematica,dummy_predicoes] predicoes_nome = [' ridge_predicoes ',' predicoes_matematica ',' dummy_predicoes '] for posy, y in enumerate(predicoes): print(f'{predicoes_nome[posy-1]:-^50}') for posx, x in enumerate(validadores):
bottom_99_dataset.iloc[0, :57] # **DummyRegressor with mean strategy as a baseline** # In[24]: from sklearn.dummy import DummyRegressor from sklearn.model_selection import train_test_split X = bottom_99_dataset[['year']] y = bottom_99_dataset['total_net_value'] X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) model = DummyRegressor(strategy='mean') model.fit(X_train, y_train) model.score(X_test, y_test) # In[25]: from sklearn.preprocessing import LabelEncoder le_state = LabelEncoder() le_city = LabelEncoder() factor_columns = ['state_company', 'city'] model_dataset = bottom_99_dataset.dropna(subset=factor_columns) model_dataset['state_company'] = le_state.fit_transform( model_dataset['state_company']) model_dataset['city'] = le_city.fit_transform(model_dataset['city']) model_columns = [ 'cnpj', 'issue_date_day', 'issue_date_month', 'issue_date_year'
def test_regressor_score_with_None(y, y_test):
    """A ``DummyRegressor`` fitted and scored with ``X=None`` must score 1.0.

    Args:
        y: Target values used for fitting.
        y_test: Target values used for scoring.
    """
    baseline = DummyRegressor()
    baseline.fit(None, y)
    observed = baseline.score(None, y_test)
    assert observed == 1.0
# NOTE(review): this fragment was recovered from whitespace-mangled source.
# The extent of the for-loop body below is a best guess - confirm it
# against the original file.

# Ridge scores on both splits, plus test predictions
R_ridge_train_score = ridge.score(X_train, y_train)
R_ridge_test_score = ridge.score(X_test, y_test)
y_pred_ridge = ridge.predict(X_test)

# regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=param_grid, n_jobs=n_jobs)
# regressor.fit(X_train, y_train)

# Refit the estimator on each cross-validation fold
for k, (train, test) in enumerate(cv.split(X, y)):
    pdb.set_trace()  # debugger breakpoint at the start of every fold
    estimator.fit(X[train], y[train])
    y_pred = estimator.predict(X_test)
    # # best_est= regressor.best_estimator_
    # # print "Best Estimator Parameters"
    # # print"---------------------------"
    # # print "n_estimators: %d" %best_est.n_estimators
    # # print "max_depth: %d" %best_est.max_depth
    # # print "Learning Rate: %.1f" %best_est.learning_rate
    # # print "min_samples_leaf: %d" %best_est.min_samples_leaf
    # # print "max_features: %.1f" %best_est.max_features
    # # print "Train R-squared: %.2f" %best_est.score(X_train,y_train)
    print "Feature Importances"
    print estimator.feature_importances_
    print "R-squared for Train: %.2f" % estimator.score(X[train], y[train])
    print "R-squared for Test: %.2f" % estimator.score(X[test], y[test])

# Dummy baseline: scored on the training split, predictions for the test split
dummy = DummyRegressor()
dummy.fit(X_train, y_train)
R_dummy = dummy.score(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)
pdb.set_trace()
import pandas as pd
from sklearn.dummy import DummyRegressor

# Loading in the data
canucks = pd.read_csv('data/canucks_subbed.csv')

# Define X and y
X = canucks.loc[:, ['No.', 'Age', 'Height', 'Weight', 'Experience']]
y = canucks['Salary']

# Create a model
model = DummyRegressor(strategy="mean")

# Fit your data
model.fit(X, y)

# Predict the labels of X
model.predict(X)

# The model score on the data it was fitted on, rounded to 2 decimals
# (for a regressor, `score` is R-squared rather than classification accuracy)
accuracy = round(model.score(X, y), 2)
accuracy
# cv里 k折交叉验证时迄今最常见的方法,也有一些其他方法如 (leave-one-out-cross-validation) 该方法的折数等于样本数 # scoring 参数 指定了衡量模型性能的标准。 本章其他节会讨论 # n_jobs=-1 是使用所有可用的CPU核进行计算。 # 11.2 创建一个基准回归模型 from sklearn.datasets import load_boston from sklearn.dummy import DummyRegressor from sklearn.model_selection import train_test_split boston = load_boston() features, target = boston.data, boston.target features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=0) dummy = DummyRegressor(strategy='mean') # 创建DummyRegressor对象 简单的基准回归模型 dummy.fit(features_train, target_train) # 训练回归模型 dummy.score(features_test, target_test) # 计算R方得分 返回的是R-squared from sklearn.linear_model import LinearRegression # 训练自己的模型并与基准模型做比较 ols = LinearRegression() ols.fit(features_train, target_train) ols.score(features_test, target_test) # DummyRegressor 允许我们创建一个简单的模型,以此作为基准和实际的模型进行对比。通常使用这种办法来模拟某个产品或系统中已有的原始预测系统。 # 可选的方法包括训练集的均值或者中位数。此外如果将strategy设置成constant 并使用constant参数。则模型的预测结果都为这个常数。 clf = DummyRegressor(strategy="constant",constant=20) clf.fit(features_train, target_train) clf.score(features_test, target_test) # R-squared 越接近1,代表特征对目标向量的解释越好(即相关性越高) # 11.3 创建一个基准分类模型 from sklearn.datasets import load_iris
# Wczytanie bibliotek. from sklearn.datasets import load_boston from sklearn.dummy import DummyRegressor from sklearn.model_selection import train_test_split # Wczytanie danych. boston = load_boston() # Utworzenie cech. features, target = boston.data, boston.target # Podział na zbiory uczący i testowy. features_train, features_test, target_train, target_test = train_test_split( features, target, random_state=0) # Utworzenie sztucznego regresora. dummy = DummyRegressor(strategy='mean') # "Wytrenowanie" sztucznego regresora. dummy.fit(features_train, target_train) # Pobranie kwadratu wartości. dummy.score(features_test, target_test)
# In[5]: # Calculate and print RMSE training set error of the dummy model from sklearn.metrics import mean_squared_error dummy_r_training_rsme = np.sqrt(mean_squared_error(y_train_r, dummy_r.predict(X_train_r))) print('dummy RMSE: {:.3f}'.format(dummy_r_training_rsme)) # In[6]: # Calculate and print the R2 training set score of the dummy model # hint: can use models 'score' function dummy_r_training_r2 = dummy_r.score(X_train_r, y_train_r) print('dummy R2: {:.3f}'.format(dummy_r_training_r2)) # In[7]: # Calculate and print the mean 5-fold cross valication R2 score of the dummy model from sklearn.model_selection import cross_val_score dummy_r_cv = cross_val_score(dummy_r, X_train_r, y_train_r, cv=5) print('dummy mean cv R2: {:.3f}'.format(np.mean(dummy_r_cv))) # ### Measure performance of Linear Regression # In[8]: