def model_cv_df(model_cv, X_train, X_test, y_train, y_test, cv_mapping, model_name):
    """Print train/test metrics of a fitted CV regressor and record them.

    Parameters
    ----------
    model_cv : fitted estimator
        Must expose ``alpha_``, ``predict`` and ``score``.
    X_train, y_train : training features and target (train R-squared).
    X_test, y_test : hold-out features and target (all test metrics).
    cv_mapping : dict
        Mutated in place: ``cv_mapping[model_name]`` is set to
        ``[alpha, train R^2, test R^2, MAE, MSE, RMSE, MAPE]``.
    model_name : hashable
        Key under which the metric list is stored.

    Returns
    -------
    None
    """
    from sklearn.metrics import mean_absolute_error
    import numpy as np
    from statsmodels.tools.eval_measures import mse, rmse

    # We are making predictions here
    y_preds_test = model_cv.predict(X_test)

    # Compute each metric exactly once; the same values are printed and stored.
    best_alpha = model_cv.alpha_
    r2_train = model_cv.score(X_train, y_train)
    r2_test = model_cv.score(X_test, y_test)
    mae_test = mean_absolute_error(y_test, y_preds_test)
    mse_test = mse(y_test, y_preds_test)
    rmse_test = rmse(y_test, y_preds_test)
    # MAPE in percent; a zero in y_test makes the ratio infinite.
    mape_test = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

    print("\t-----TRAIN SET-----")
    print("\tBest alpha: {}".format(best_alpha))
    print("\tR-squared: {}\n".format(r2_train))
    print("\t-----TEST SET-----")
    print("\tR-squared: {}".format(r2_test))
    print("\tMean absolute error: {}".format(mae_test))
    print("\tMean squared error: {}".format(mse_test))
    print("\tRoot mean squared error: {}".format(rmse_test))
    print("\tMean absolute percentage error: {}".format(mape_test))

    cv_mapping[model_name] = [
        best_alpha,
        r2_train,
        r2_test,
        mae_test,
        mse_test,
        rmse_test,
        mape_test,
    ]
def overall_single_regressions(x_train, x_test, y_train, y_test):
    """
    Regress the target on each predictor column separately.

    For every column of ``x_train`` this saves a scatter plot and a histogram
    under ``results/single-regressions/``, fits a one-regressor OLS model
    (no constant is added) and writes the model summary plus the training
    and testing MSE to a text file in the same directory.

    x and y should be Pandas dataframes. Returns None.
    """
    for column in x_train.columns:
        feature = x_train[column]
        pretty_name = column.replace("-", " ").title()

        # Predictor against target.
        plt.scatter(feature, y_train)
        plt.ylabel("Days to first infection")
        plt.xlabel(pretty_name)
        plt.savefig(f"results/single-regressions/{column}-scatter.png")
        plt.clf()

        # Distribution of the predictor.
        plt.hist(feature)
        plt.ylabel(pretty_name)
        plt.savefig(f"results/single-regressions/{column}-histogram.png")
        plt.clf()

        # Use StatsModels to create the Linear Model and Output R-squared
        results = sm.OLS(y_train, feature).fit()

        with open(f"results/single-regressions/{column}-result-summary.txt", "w+") as rs:
            rs.write(results.summary().as_text())

            train_pred = pd.DataFrame(results.predict(feature))
            training_mse = eval_measures.mse(y_train, train_pred)

            test_pred = pd.DataFrame(results.predict(x_test[column]))
            testing_mse = eval_measures.mse(y_test, test_pred)

            # mse is indexed with [0]: the inputs are one-column frames.
            rs.write("\n training MSE: " + str(training_mse[0]))
            rs.write("\n testing MSE: " + str(testing_mse[0]))
def MLR(train, test, train_label, test_label):
    """Fit an OLS model with an intercept and return ``(train MSE, test MSE)``.

    ``train``/``test`` are the feature matrices, ``train_label``/``test_label``
    the matching targets.
    """
    # NOTE(review): add_constant is called with its defaults on both sets;
    # confirm a fold can never already contain a constant column, otherwise
    # the two design matrices may end up with different widths.
    design_train = sm.add_constant(train)
    design_test = sm.add_constant(test)

    fitted = sm.OLS(train_label, design_train).fit()

    train_error = eval_measures.mse(train_label, fitted.predict(design_train))
    test_error = eval_measures.mse(test_label, fitted.predict(design_test))
    return train_error, test_error
def test_eval_measures():
    """Regression test pinning the eval measures on a fixed 4x5 input."""
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))
    by_row = {"axis": 1}

    # (assertion, measure, keyword arguments, expected result), checked in order.
    cases = [
        (assert_equal, iqr, {}, 5 * np.ones(5)),
        (assert_equal, iqr, by_row, 2 * np.ones(4)),
        (assert_equal, iqr, {"axis": None}, 9),
        (assert_equal, mse, {}, np.array([73.5, 87.5, 103.5, 121.5, 141.5])),
        (assert_equal, mse, by_row, np.array([3., 38., 123., 258.])),
        (assert_almost_equal, rmse, {},
         np.array([8.5732141, 9.35414347, 10.17349497,
                   11.02270384, 11.89537725])),
        (assert_almost_equal, rmse, by_row,
         np.array([1.73205081, 6.164414, 11.09053651, 16.0623784])),
        (assert_equal, maxabs, {}, np.array([14., 15., 16., 17., 18.])),
        (assert_equal, maxabs, by_row, np.array([3., 8., 13., 18.])),
        (assert_equal, meanabs, {}, np.array([7., 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, meanabs, by_row, np.array([1.4, 6., 11., 16.])),
        (assert_equal, meanabs, {"axis": 0},
         np.array([7., 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianabs, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianabs, by_row, np.array([1., 6., 11., 16.])),
        (assert_equal, bias, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, bias, by_row, np.array([1., 6., 11., 16.])),
        (assert_equal, medianbias, {}, np.array([6.5, 7.5, 8.5, 9.5, 10.5])),
        (assert_equal, medianbias, by_row, np.array([1., 6., 11., 16.])),
        (assert_equal, vare, {},
         np.array([31.25, 31.25, 31.25, 31.25, 31.25])),
        (assert_equal, vare, by_row, np.array([2., 2., 2., 2.])),
    ]
    for check, measure, kwargs, expected in cases:
        check(measure(x, y, **kwargs), expected)
def test_eval_measures():
    """Regression test for the eval measures, including ``rmspe``."""
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    def expect(measure, expected, check=assert_equal, **kwargs):
        # Compare measure(x, y, **kwargs) with the pinned value.
        check(measure(x, y, **kwargs), expected)

    expect(iqr, 5 * np.ones(5))
    expect(iqr, 2 * np.ones(4), axis=1)
    expect(iqr, 9, axis=None)
    expect(mse, np.array([73.5, 87.5, 103.5, 121.5, 141.5]))
    expect(mse, np.array([3.0, 38.0, 123.0, 258.0]), axis=1)
    expect(
        rmse,
        np.array([8.5732141, 9.35414347, 10.17349497, 11.02270384, 11.89537725]),
        check=assert_almost_equal,
    )
    expect(
        rmse,
        np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]),
        check=assert_almost_equal,
        axis=1,
    )

    # rmspe: build the expected value by hand. Entries where x == 0 become
    # NaN first (skipped by nanmean), then 0.0 for the ``zeros=0`` variant.
    err = x - y
    nonzero = x != 0
    err[nonzero] /= x[nonzero]
    err[~nonzero] = np.nan
    expected = np.sqrt(np.nanmean(err ** 2, 0) * 100)
    assert_almost_equal(rmspe(x, y), expected)

    err[np.isnan(err)] = 0.0
    expected = np.sqrt(np.nanmean(err ** 2, 0) * 100)
    assert_almost_equal(rmspe(x, y, zeros=0), expected)

    expect(maxabs, np.array([14.0, 15.0, 16.0, 17.0, 18.0]))
    expect(maxabs, np.array([3.0, 8.0, 13.0, 18.0]), axis=1)
    expect(meanabs, np.array([7.0, 7.5, 8.5, 9.5, 10.5]))
    expect(meanabs, np.array([1.4, 6.0, 11.0, 16.0]), axis=1)
    expect(meanabs, np.array([7.0, 7.5, 8.5, 9.5, 10.5]), axis=0)
    expect(medianabs, np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    expect(medianabs, np.array([1.0, 6.0, 11.0, 16.0]), axis=1)
    expect(bias, np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    expect(bias, np.array([1.0, 6.0, 11.0, 16.0]), axis=1)
    expect(medianbias, np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    expect(medianbias, np.array([1.0, 6.0, 11.0, 16.0]), axis=1)
    expect(vare, np.array([31.25, 31.25, 31.25, 31.25, 31.25]))
    expect(vare, np.array([2.0, 2.0, 2.0, 2.0]), axis=1)
def perfstats(col, Y):
    """Fit one OLS model per feature set and tabulate its statistics.

    ``col`` is an iterable of feature sets; each is split 80/20 against
    ``Y`` (``random_state=42``), standardised with statistics learned on the
    training part, and fitted with an added constant. Returns a DataFrame
    with one row per feature set: rsq, rsq_adj, f_value, aic, bic (from the
    fit) and mae, mse, rmse, mape (on the held-out part).

    Side effect: sets the global pandas float display format to 3 decimals.
    """
    stat_names = [
        'model', 'rsq', 'rsq_adj', 'f_value', 'aic', 'bic', 'mae', 'mse',
        'rmse', 'mape'
    ]
    pf = pd.DataFrame(columns=stat_names)
    pd.options.display.float_format = '{:.3f}'.format

    for num, X in enumerate(col, 1):
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42)

        # Scale with training statistics only, then add the intercept column.
        scaler = StandardScaler()
        design_train = sm.add_constant(scaler.fit_transform(x_train))
        design_test = sm.add_constant(scaler.transform(x_test))

        results = sm.OLS(y_train, design_train).fit()
        y_pred = results.predict(design_test)

        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        pf.loc[num] = ('model_' + str(num), results.rsquared,
                       results.rsquared_adj, results.fvalue, results.aic,
                       results.bic, mean_absolute_error(y_test, y_pred),
                       mse(y_test, y_pred), rmse(y_test, y_pred), mape)
    return pf
def regstats(col, Y):
    """Fit one scikit-learn linear regression per feature set and tabulate it.

    Parameters
    ----------
    col : iterable
        Feature sets; each one is fitted against ``Y`` separately.
    Y : array-like
        Target values shared by all feature sets.

    Returns
    -------
    pandas.DataFrame
        One row per feature set (index starting at 1) with the columns
        model, rsq_train, rsq_test, subt_rsq (train minus test R^2),
        mae_test, mse_test, rmse_test and mape_test (percent).
    """
    pf = pd.DataFrame(columns=[
        'model', 'rsq_train', 'rsq_test', 'subt_rsq', 'mae_test', 'mse_test',
        'rmse_test', 'mape_test'
    ])

    # Iterate over the feature sets: the body needs ``X`` and ``num``, which
    # were previously undefined here while ``col`` went unused.
    for num, X in enumerate(col, 1):
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42)

        # Standardise with statistics learned on the training part only.
        standardscaler = StandardScaler()
        x_train = standardscaler.fit_transform(x_train)
        x_test = standardscaler.transform(x_test)

        results = LinearRegression().fit(x_train, y_train)
        y_pred = results.predict(x_test)

        rsq_train = results.score(x_train, y_train)
        rsq_test = results.score(x_test, y_test)
        pf.loc[num] = ('model_' + str(num), rsq_train, rsq_test,
                       rsq_train - rsq_test,
                       mean_absolute_error(y_test, y_pred),
                       mse(y_test, y_pred), rmse(y_test, y_pred),
                       (np.mean(np.abs((y_test - y_pred) / y_test)) * 100))
    return pf
def fit(self, folds=3, thetas=(-2, -1, 0, 0.25, 0.5, 0.75, 1.25, 1.5, 1.75, 2)):
    """Select the best theta for every series by time-series cross-validation.

    Theta models based on Kevin Sheppard's code. For each column of
    ``self.data`` the observations up to ``self.train_ix[series] - 1`` are
    split with ``TimeSeriesSplit``. On every split each candidate theta is
    scored by the MSE of its forecast, and the theta with the lowest mean
    score is kept and re-estimated on the full training slice.

    Parameters
    ----------
    folds : int
        Number of ``TimeSeriesSplit`` splits.
    thetas : tuple of float
        Theta values to evaluate.

    Returns
    -------
    None
        Results are stored on the instance: ``self.best_theta[series]``,
        ``self.fitted[series]`` and ``self.fit_success = True``.
    """
    kf = TimeSeriesSplit(n_splits=folds)
    # Options forwarded unchanged to ExponentialSmoothing(...).fit().
    fit_args = {"disp": False, "iprint": -1, "low_memory": True}

    for series in self.data.columns:
        x = self.data.loc[:self.train_ix[series] - 1, series]
        mspes = {theta: np.empty((folds, 1)) for theta in thetas}
        # One parameter frame per fold (previously one frame was shared by
        # every fold, so folds overwrote each other's columns).
        params = {
            fold: pd.DataFrame(None, index=["a0", "b0"], dtype=np.double)
            for fold in range(folds)
        }

        for fold_ix, (tr_ix, te_ix) in enumerate(kf.split(x)):
            # Set up data
            x_tr, x_te = x.iloc[tr_ix], x.iloc[te_ix]
            t = x_tr.shape[0]
            k = x_te.shape[0]

            # Exp. smoothing term: independent of theta, so fit it once per
            # fold instead of once per (fold, theta) pair.
            ses = ExponentialSmoothing(x_tr).fit(**fit_args)
            alpha = ses.params.smoothing_level
            ses_forecast = ses.forecast(k)
            trend_base = np.arange(k) + 1 / alpha - ((1 - alpha) ** t) / alpha

            for theta in thetas:
                # Estimate the theta model and read its slope.
                params[fold_ix][theta] = self.estimate(x_tr, theta)
                b0 = params[fold_ix][theta]["b0"]

                # Actual forecasting
                trend = trend_base * (0.5 * b0)
                forecast = np.array(ses_forecast + trend)
                mspes[theta][fold_ix] = mse(x_te, forecast)

        # Average the per-fold errors and keep the best theta.
        mean_mspes = {theta: np.mean(errors) for theta, errors in mspes.items()}
        self.best_theta[series] = min(mean_mspes, key=mean_mspes.get)
        self.fitted[series] = self.estimate(x, self.best_theta[series])

    self.fit_success = True
def print_evaluation_metrics(true, predicted):
    """Print MAE, MSE, RMSE and MAPE (in percent) of ``predicted`` vs ``true``."""

    def mape(actual, estimate):
        # Mean absolute percentage error; a zero in ``actual`` gives inf.
        return np.mean(np.abs((actual - estimate) / actual)) * 100

    # Each metric is computed and printed in turn, in this order.
    reports = (
        ("Mean absolute error of the prediction is: {}", mean_absolute_error),
        ("Mean squared error of the prediction is: {}", mse),
        ("Root mean squared error of the prediction is: {}", rmse),
        ("Mean absolute percentage error of the prediction is: {}", mape),
    )
    for template, metric in reports:
        print(template.format(metric(true, predicted)))
def overall_multiregression(x_train, x_test, y_train, y_test):
    """
    Fit one OLS model on all provided variables (with a constant).

    Writes the model summary plus training and testing MSE to
    ``results/multiregression-result-summary.txt``.

    x and y should be Pandas dataframes.
    Returns ``(R-squared, training MSE, testing MSE)``.
    """
    # Use StatsModels to create the Linear Model and Output R-squared
    design_train = sm.add_constant(x_train)
    design_test = sm.add_constant(x_test)
    results = sm.OLS(y_train, design_train).fit()

    with open("results/multiregression-result-summary.txt", "w+") as rs:
        rs.write(results.summary().as_text())

        # TODO: Something is messed up here
        train_pred = pd.DataFrame(results.predict(design_train))
        test_pred = pd.DataFrame(results.predict(design_test))
        training_mse = eval_measures.mse(y_train, train_pred)
        testing_mse = eval_measures.mse(y_test, test_pred)

        # mse is indexed with [0]: the inputs are one-column frames.
        rs.write("\n training MSE: " + str(training_mse[0]))
        rs.write("\n testing MSE: " + str(testing_mse[0]))

    return (results.rsquared, training_mse[0], testing_mse[0])
def printErrors(test, pred, model):
    '''
    Print the error measures of a model's predictions.

    Inputs:
        test: observed values
        pred: predictions
        model: name of the model, used as a label in the output
    Outputs:
        Prints mean absolute error, mean squared error and root mean
        squared error (4 significant digits); returns None.
    '''
    measures = (('MAE of ', meanabs), ('MSE of ', mse), ('RMSE of ', rmse))
    for prefix, measure in measures:
        value = measure(test, pred, axis=0)
        print(prefix + model + ': {:.4}'.format(value))
def _evaluate_arima_model(X: Union[pd.Series, pd.DataFrame], arima_order: Tuple[int, int, int],
                          train_size: Union[float, int, None], freq: str) -> Tuple[float, dict]:
    """Fit an ARIMA model on the head of ``X`` and score it on the tail.

    ``train_size`` may be ``None`` (75% of the data), a float (fraction of
    the data) or an int (number of observations). Returns the test MSE and
    the fitted model object.
    """
    n_obs = len(X)
    if train_size is None:
        n_train = int(n_obs * 0.75)
    elif isinstance(train_size, float):
        n_train = int(n_obs * train_size)
    else:
        n_train = train_size

    train = X[:n_train].astype(float)
    test = X[n_train:].astype(float)

    model_fit = ARIMA(train, order=arima_order, freq=freq).fit(
        disp=False, method='css', trend='nc')

    # calculate test error; element 0 of the forecast result is used as the
    # point forecast
    yhat = model_fit.forecast(len(test))[0]
    return mse(test, yhat), model_fit
def model_evaluation(y_test, X_test, fitted_model, training_time, start_time):
    """
    Score a fitted model on the test set and report timings.

    :param y_test: target variable of the test set (must expose ``.values``)
    :param X_test: predictor variables of the test set (must expose ``.values``)
    :param fitted_model: a trained model implementing a predict() method
    :param training_time: the duration of training
    :param start_time: the timestamp at which training started
    :return dict: ``mse_test``, ``training_time`` and ``prediction_time``
        (time elapsed since ``start_time`` minus ``training_time``)
    """
    actual = y_test.values
    predicted = fitted_model.predict(X_test.values)
    test_error = mse(actual, predicted)

    # Taken after predicting, so it covers the prediction itself.
    prediction_time = time.time() - start_time - training_time

    return {
        'mse_test': test_error,
        'training_time': training_time,
        'prediction_time': prediction_time,
    }
def main():
    """Cross-validate the interest-rate classifier in two ways and print both.

    Method 1 uses ``cross_val_score`` with a ``LogisticRegression``.
    Method 2 loops over the same folds with ``statsmodels`` ``Logit`` and
    collects MSE, MAE and the pseudo R-squared of a model fitted on the
    training part and of one fitted on the test part of each fold.

    Reads the module-level ``loansData`` and ``indVars``; returns None.
    """
    # Calculating via cross_val_score function
    kf = KFold(len(loansData['IR_TF']), 10)
    X1 = loansData[indVars]
    y1 = loansData['IR_TF']
    lr = LogisticRegression()
    msescores = cross_val_score(lr, X1, y1, scoring='mean_squared_error', cv=kf, n_jobs=1)
    r2score = cross_val_score(lr, X1, y1, scoring='r2', cv=kf, n_jobs=1)
    maescore = cross_val_score(lr, X1, y1, scoring='mean_absolute_error', cv=kf, n_jobs=1)

    # Below is an alternative means to calculating the cross validation stats.
    # Seemed to give a more intuitive answer, however not fully certain it is
    # correct
    mselist = []
    maelist = []
    r2list = []
    r2listtrain = []
    r2listtest = []
    X = loansData[indVars]
    y = loansData['IR_TF']
    for train, test in kf:
        # setting up train and test models
        logit = sm.Logit(y.ix[train], X.ix[train]).fit()
        logittest = sm.Logit(y.ix[test], X.ix[test]).fit()
        testR = logittest.prsquared
        trainR = logit.prsquared

        # Predict the held-out fold once and reuse it for both error measures.
        test_pred = np.asarray(logit.predict(X.ix[test]))
        test_true = np.asarray(y.ix[test])

        # calculate MSE
        mselist.append(ste.mse(test_pred, test_true, axis=0))
        # calculate MAE: mean of absolute errors (previously a difference of
        # sums, which lets positive and negative errors cancel)
        maelist.append(np.mean(np.abs(test_pred - test_true)))
        # calculate % difference in R2 of each train-test pair
        r2list.append(abs((trainR - testR) / trainR))
        # Train values go to the train list and test values to the test list
        # (they were swapped before).
        r2listtrain.append(trainR)
        r2listtest.append(testR)

    # printing results for both methods
    print('First Method Using cross_val_score function:')
    print('MSE: '+str(np.mean(msescores))+' ,'+'RSquared: '+str(np.mean(r2score))+' ,'+
          'MAE: '+str(np.mean(maescore)))
    print('Second Method Using for loop (answers seemed more intuitive):')
    print('Mean of MSE is '+str(np.mean(mselist)))
    print('MAE is '+str(np.mean(maelist)))
    print('The mean of R^2 percentage difference in each training and test samples is '+"{:.0%}".format(np.mean(r2list)))
    print('The overall percentage difference of the total means of R^2 for training and test samples is '+"{:.0%}".format(
        abs((np.mean(r2listtrain) - np.mean(r2listtest))/np.mean(r2listtrain))))
def regression_stats(model):
    """Return a two-column DataFrame of summary statistics for ``model``.

    Uses the module-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. The ``stat`` column names each statistic (train R^2,
    test R^2, MAE, MSE, RMSE, MAPE in percent); ``model`` holds the values.
    """
    y_preds_test = model.predict(X_test)

    r2_train = model.score(X_train, y_train)
    r2_test = model.score(X_test, y_test)
    mae_test = mean_absolute_error(y_test, y_preds_test)
    mse_test = mse(y_test, y_preds_test)
    rmse_test = rmse(y_test, y_preds_test)
    mape_test = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

    # create df for model results
    return pd.DataFrame.from_dict({
        "stat": ["train R^2", "test R^2", "MAE", "MSE", "RMSE", "MAPE"],
        "model": [r2_train, r2_test, mae_test, mse_test, rmse_test, mape_test],
    })
def daily_analysis(file_path):
    """Run a day-by-day analysis of the performance of the viral pressure metric.

    For every date, regresses the target values on the viral-pressure values
    (OLS without a constant), then plots the per-day training MSE and
    R-squared against the date and saves both figures under ``results/``.

    :param file_path: passed to the data-loading helpers
    :return: None
    """
    X = get_viral_pressure_data(get_connectedness_data(file_path), file_path)
    y = process_y(get_case_data(file_path))

    # Pair every (date, country) pressure value with its target value.
    for (country, date, val) in y:
        X[date][country] = (X[date][country], val)

    daily_results = {}
    for (date, data) in X.items():
        [x_pressure, y_days] = zip(*data.values())
        results = sm.OLS(y_days, x_pressure).fit()
        # Predict from the regressor the model was fitted on (this used to
        # pass the target, so the "training MSE" was not a model error).
        training_mse = eval_measures.mse(y_days, results.predict(x_pressure))
        daily_results[date] = (training_mse, results.rsquared)

    [mse_values, rsquared_values] = zip(*daily_results.values())
    dates = [
        pd.to_datetime(d, format="%Y-%m-%d") for d in daily_results.keys()
    ]

    # Same plot recipe for both series: date axis formatted as month-day.
    plots = (
        (mse_values, "MSE", "results/MSE_daily_scatter.png"),
        (rsquared_values, "R-Squared", "results/R_Squared_daily_scatter.png"),
    )
    for values, label, out_path in plots:
        ax = plt.gca()
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%m-%d"))
        plt.plot_date(dates, values)
        plt.ylabel(label)
        plt.xlabel("Date")
        plt.savefig(out_path)
        plt.clf()
def __evaluate_arima_model(self, ts, order, seasonal_order, test_ratio):
    """
    Train on the head of the series and score the forecast on its tail.

    :param ts: time series
    :param order: (p,d,q)
    :param seasonal_order: (P,D,Q,m)
    :param test_ratio: fraction of the series held out for testing
    :return: error (out-of-sample MSE), model, model_fit
    """
    # prepare training dataset
    split_at = int(len(ts) * (1 - test_ratio))
    train = ts[0:split_at]
    test = ts[split_at:]

    model, model_fit = self.__train_model(train, order, seasonal_order)

    # For the non-seasonal model element 0 of the forecast result is used;
    # the seasonal model's forecast is used as returned.
    seasonal = self.__seasonal
    forecast = model_fit.forecast(len(test))
    yhat = forecast if seasonal else forecast[0]

    # calculate out of sample error
    return mse(test, yhat), model, model_fit
def model_evaluation(pred_data, test_date_index, test_airport_index):
    """Print the mae, rmse, wae and rwse metrics of the predictions.

    The reference values are the rows/columns of ``DelayRatio.csv`` selected
    by ``test_date_index`` / ``test_airport_index``, with missing values
    replaced by 0. Returns None.
    """
    delay_ratio = pd.read_csv("DelayRatio.csv", index_col=0)
    observed = delay_ratio.fillna(0).iloc[test_date_index, test_airport_index]

    mae_score = np.mean(mae(observed, pred_data, axis=0))
    print('mae metric: ', mae_score)

    # Root of the mean of the per-column MSE values.
    rmse_score = np.mean(mse(observed, pred_data, axis=0)) ** 0.5
    print('rmse metric: ', rmse_score)

    wae_score = wae_eval(pred_data, test_date_index, test_airport_index)
    print('wae metric: ', wae_score)

    rwse_score = rwse(pred_data, test_date_index, test_airport_index)
    print('rwse metric: ', rwse_score)

    # Flight counts feed only the weighted metrics, which are disabled below.
    DelayFlights = (pd.read_csv("ArrDelayFlights.csv", index_col=0)
                    + pd.read_csv("DepDelayFlights.csv", index_col=0))
    TotalFlights = (pd.read_csv("ArrTotalFlights.csv", index_col=0)
                    + pd.read_csv("DepTotalFlights.csv", index_col=0))
    w_pre_data = TotalFlights.iloc[test_date_index, test_airport_index] * pred_data
    #display(w_pre_data)
    # w_mae_score = np.mean(mae(DelayFlights.iloc[test_date_index, test_airport_index],w_pre_data,axis=0))
    # print ('w_mae metric: ',w_mae_score)
    # w_rmse_score = np.mean(mse(DelayFlights.iloc[test_date_index, test_airport_index],w_pre_data,axis=0))**0.5
    # print ('w_rmse metric: ',w_rmse_score)
    return
def regframe(X, Y, mod, idx):
    """Fit ``mod`` on an unshuffled 80/20 split and return its statistics.

    Returns a pandas Series named ``idx`` with train/test R-squared and their
    difference, test MAE/MSE/RMSE/MAPE (percent), and the mean 10-fold
    (unshuffled) cross-validation score on the full data and on the
    training part.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, shuffle=False)

    model = mod.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    k_fold = KFold(n_splits=10, shuffle=False)

    rsq_train = model.score(x_train, y_train)
    rsq_test = model.score(x_test, y_test)

    stats = {
        'rsq_train': rsq_train,
        'rsq_test': rsq_test,
        'subt_rsq': rsq_train - rsq_test,
        'mae_test': mean_absolute_error(y_test, y_pred),
        'mse_test': mse(y_test, y_pred),
        'rmse_test': rmse(y_test, y_pred),
        'mape_test': (np.mean(np.abs((y_test - y_pred) / y_test)) * 100),
    }
    stats['cross-score'] = cross_val_score(
        estimator=mod, X=X, y=Y, cv=k_fold).mean()
    stats['cross-train'] = cross_val_score(
        estimator=mod, X=x_train, y=y_train, cv=k_fold).mean()

    return pd.Series(stats, name=idx)
], axis=1) y = data_df['is_winner'] return X, y X, y = load_file("tennis.csv") print(X) X = sm.add_constant(X) x_train, x_test, y_train, y_test = train_test_split(X.values, y, p) model = sm.OLS(y_train, x_train) results = model.fit() print(results.summary()) train_y_cap = results.predict(x_train) y_cap = results.predict(x_test) training_MSE = eval_measures.mse(y_train, train_y_cap) testing_MSE = eval_measures.mse(y_test, y_cap) print('training r-squared: ' + str(results.rsquared)) print('training MSE: ' + str(training_MSE)) print('testing MSE: ' + str(testing_MSE)) ################################################################################## # TODO: use train test split to split data into x_train, x_test, y_train, y_test # ################################################################################# ################################################################################## # TODO: Use StatsModels to create the Linear Model and Output R-squared ################################################################################# # Prints out the Report # TODO: print R-squared, test MSE & train MSE
ax.set_xlabel('YEAR')
ax.set_ylabel('DEC')
plt.show()

from sklearn import linear_model, feature_selection, preprocessing
from sklearn.model_selection import train_test_split
# NOTE(review): recent statsmodels releases may no longer expose ``OLS`` from
# ``statsmodels.formula.api`` -- confirm against the installed version.
import statsmodels.formula.api as sm
from statsmodels.tools.eval_measures import mse
from statsmodels.tools import add_constant
from sklearn.metrics import mean_squared_error

# Last column of ``df`` is the target, all others are predictors; 80% of the
# rows are used for training.
X = df.values.copy()
X_train, X_valid, y_train, y_valid = train_test_split(X[:, :-1],
                                                      X[:, -1],
                                                      train_size=0.80)

# Fit OLS with an intercept (the identical fit used to be run twice).
result = sm.OLS(y_train, add_constant(X_train)).fit()
result.summary()

# Validation error of the fitted model.
ypred = result.predict(add_constant(X_valid))
print(mse(ypred, y_valid))

# Predicted against actual values on the validation rows.
fig, ax = plt.subplots(1, 1)
ax.scatter(y_valid, ypred)
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
plt.show()

# In[ ]:

# In[ ]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

###############################
# run OLS
from statsmodels.tools.eval_measures import mse, rmse

X_train_const = sm.add_constant(X_train)
# The model is fitted on a design matrix with a constant column, so the test
# design matrix needs the same column before predicting.
X_test_const = sm.add_constant(X_test)
#X_train_const['carat_2'] = np.pow(X_train_const['carat'],2)
res = sm.OLS(y_train, X_train_const).fit()
print(res.summary())

ols_test_pred = res.predict(X_test_const)
#np.sqrt(mse(ols_test_pred, y_test))
mse(ols_test_pred, y_test)
plot_pred_vs_actual(ols_test_pred, y_test)

res.params.plot(kind='bar')
# NOTE(review): ``lasso`` is only created further down in this script; this
# line presumably relies on a previous interactive run -- confirm.
pd.Series(lasso.coef_, index=X.columns).plot(kind='bar')

################################
# run LASSO
lassocv = LassoCV(alphas=None, cv=10, max_iter=100000, normalize=True)
lassocv.fit(X_train, y_train)
print("Alpha=", lassocv.alpha_)

# Reuse the cross-validated alpha for a plain Lasso model.
lasso = Lasso()
lasso.set_params(alpha=lassocv.alpha_)
def get_mse(self):
    """Return ``mse`` of ``self.y_test`` against ``self.y_predictor``."""
    observed = self.y_test
    predicted = self.y_predictor
    return mse(observed, predicted)
def cross_validation(all_variables, labels, ind_variables, title):
    """Compare four models by 5-fold cross-validated MSE and plot the result.

    On every fold a neural network (``build_model``), a multiple linear
    regression (``MLR``), a depth-5 decision tree and a constant baseline
    (mean of all labels) are trained and scored on the train and test parts.
    The per-fold MSEs are drawn as a grouped bar plot that is saved as SVG
    and then shown.

    Parameters
    ----------
    all_variables : pandas.DataFrame
        Table holding at least the columns named in ``ind_variables``.
    labels : pandas object with ``to_numpy``
        Target values, aligned with the rows of ``all_variables``.
    ind_variables : list
        Names of the predictor columns.
    title : str
        Plot title; also used as the file name of the saved figure.

    Returns
    -------
    tuple
        ``(mlr, nn, dt, base, res)``: the mean test MSE of each model over
        the folds and the flat list of all per-fold MSEs, ordered per fold as
        MLR train/test, DT train/test, NN train/test, baseline train/test.
    """
    n_splits = 5
    epoch = 200

    features = all_variables[ind_variables].to_numpy()
    labels = labels.to_numpy()
    k_fold = KFold(n_splits=n_splits, random_state=0, shuffle=True)

    nn_test_mse_sum = mlr_test_mse_sum = dtr_test_mse_sum = base_test_mse_sum = 0
    res = []
    for train_indices, test_indices in k_fold.split(features):
        train = features[train_indices]
        train_label = labels[train_indices]
        test = features[test_indices]
        test_label = labels[test_indices]

        # neural network
        model = build_model(features.shape[1])
        history = model.fit(train, train_label, epochs=epoch, verbose=0)
        # plot_history(history)
        loss, mae, nn_test_mse = model.evaluate(test, test_label, verbose=2)
        # Training MSE reported after the last epoch.
        nn_train_mse = history.history['mse'][epoch-1]
        print('nn test mse: ' + str(nn_test_mse))
        print('nn train mse: ' + str(nn_train_mse))
        nn_test_mse_sum += nn_test_mse

        # multiple linear regression
        mlr_train_mse, mlr_test_mse = MLR(train, test, train_label, test_label)
        print('mlr test MSE: ' + str(mlr_test_mse))
        print('mlr train mse: ' + str(mlr_train_mse))
        mlr_test_mse_sum += mlr_test_mse

        # decision tree regression
        regr = DecisionTreeRegressor(max_depth=5)
        regr.fit(train, train_label)
        target_train = regr.predict(train)
        target_test = regr.predict(test)
        regression_train_mse = eval_measures.mse(train_label, target_train)
        print("Decision Tree Regression train MSE: ", regression_train_mse)
        regression_test_mse = eval_measures.mse(test_label, target_test)
        print("Decision Tree Regression test MSE: ", regression_test_mse)
        dtr_test_mse_sum += regression_test_mse

        # baseline: predict the mean of all labels (train and test together)
        mean = np.mean(list(train_label) + list(test_label))
        base_test_mse = sum([(label - mean)**2 for label in test_label])/len(test_label)
        print('baseline (mean) test mse: ' + str(base_test_mse))
        base_train_mse = sum([(label - mean)**2 for label in train_label])/len(train_label)
        print('baseline (mean) train mse: ' + str(base_train_mse))
        base_test_mse_sum += base_test_mse

        res.extend([mlr_train_mse, mlr_test_mse,
                    regression_train_mse, regression_test_mse,
                    nn_train_mse, nn_test_mse,
                    base_train_mse, base_test_mse])

    # One row per (fold, model, train/test) combination, in ``res`` order.
    df = pd.DataFrame([])
    df['Model'] = ['MLR', 'MLR', 'DT', 'DT', 'NN', 'NN', 'Baseline', 'Baseline'] * n_splits
    df['data'] = ['Train', 'Test', 'Train', 'Test', 'Train', 'Test', 'Train', 'Test'] * n_splits
    df['MSE'] = res

    sns.barplot(x="Model", y="MSE", hue="data", data=df, palette="Paired")
    plt.title(title)
    plt.legend(loc=[1, 1])
    plt.tight_layout()
    #plt.box(False)
    # Save before showing: once the window opened by show() is closed the
    # figure may be gone and savefig would write an empty image.
    # Backslashes are escaped explicitly; the resulting path is unchanged.
    plt.savefig(fname="D:\\Brown_MS\\spring2020\\cs1951a\\cs1951a_final_project\\pictures\\models\\"+title, format="svg")
    plt.show()

    mlr = mlr_test_mse_sum / n_splits
    nn = nn_test_mse_sum / n_splits
    dt = dtr_test_mse_sum / n_splits
    base = base_test_mse_sum / n_splits
    return mlr, nn, dt, base, res
130, {'1%': -3.4816817173418295, '5%': -2.8840418343195267, '10%': -2.578770059171598}, 996.6929308390189) ''' help(adfuller) dftest = adfuller(df1['Thousands of Passengers']) from statsmodels.tools.eval_measures import mse, rmse, meanabs mse(df['test'],df['predictions']) #: 17.02 df4 = pd.read_csv('airline_passengers.csv', index_col = 'Month',parse_dates= True ) df4.index.freq = 'MS' from statsmodels.graphics.tsaplots import month_plot, quarter_plot month_plot(df4['Thousands of Passengers']); dfq = df4['Thousands of Passengers'].resample(rule = 'Q').mean() quarter_plot(dfq)
def MSE(y_pred, y_true):
    """Return ``mse(y_pred, y_true)``; thin wrapper around the module's ``mse``."""
    error = mse(y_pred, y_true)
    return error
np.logical_and(y >= LOW_THRESHOLD, y <= HIGH_THRESHOLD)).tolist() data_groups["HIGH"] = np.argwhere(y > HIGH_THRESHOLD).tolist() data_groups["ALL"] = list(range(len(y))) # Loops through each testing group to perform the regression: for label, indices_list in data_groups.items(): # Indices list is plain list of numbers (had to do this because np.argwhere returns list of lists) indices = indices_list if label != "ALL": indices = [item for sublist in indices_list for item in sublist] print('Current Testing Group: ' + label) print('Variables used: ' + ", ".join(variables)) print('Number of countries: ' + str(len(indices))) cur_X = X[indices] cur_y = y[indices] cur_X = sm.add_constant(cur_X) model = sm.OLS(cur_y, cur_X) results = model.fit() mse = eval_measures.mse(cur_y, results.predict(cur_X)) print(results.summary()) print('R-squared = ' + str(results.rsquared)) print('MSE = ' + str(mse)) print("-----------------------------------------------")
if 0 <= datetime.weekday() <= 4: one_hot_date[i][0] = 1 X = np.append(X, one_hot_weather, axis=1) X = np.append(X, one_hot_borough, axis=1) X = np.append(X, one_hot_time, axis=1) X = np.append(X, one_hot_date, axis=1) return (X, y) X, y = load_file("fullDeliverable.db") x_train, x_test, y_train, y_test = train_test_split(X, y, p) X = sm.add_constant(x_train) model = sm.OLS(y_train, X) results = model.fit() print(results.summary()) average = np.mean(y_train) train_predict = results.predict(X) train_MSE = eval_measures.mse(y_train, train_predict) regular_train_MSE = eval_measures.mse(y_train, average) regular_test_MSE = eval_measures.mse(y_test, average) X1 = sm.add_constant(x_test) test_predict = results.predict(X1) test_MSE = eval_measures.mse(y_test, test_predict) print("Baseline Train MSE: " + str(regular_train_MSE)) print("Baseline Test MSE: " + str(regular_test_MSE)) print("Regression Train MSE: " + str(train_MSE)) print("Regression Test MSE: " + str(test_MSE))
def ModelLinearRegression1(self):
    """Fit and compare linear regressions on the CSV file at ``self.path``.

    Prints the data, its description and correlation matrix, then fits
    (a) a statsmodels OLS model on all predictors, (b) an OLS model on
    predictor column 2 only and (c) a scikit-learn ``LinearRegression``,
    printing summaries, validation MSEs, coefficients and cross-validation
    scores. The last column of the data is the dependent variable; 80% of
    the rows are used for training. Returns None.

    Output uses the single-argument ``print(...)`` form, which prints the
    same text on Python 2 and Python 3.
    """
    print('+++++++++++++++++++++++++ MULTI-DIM LINEAR REGRESSION 1 +++++++++++++++++++++++++')
    # Read csv file.. First get handle
    comLRHandle = ReadCSV.Read_CSV()
    data = comLRHandle.Read(self.path)

    # Lets print data and some maths
    print(data)
    print(data.describe())

    # As we can see data is multi-Dimensional (more than 2 dim). We need to find something
    # which help us to use Linear Regression on this Multi-Dim data.
    # Lets print correlation Matrix to get some info..
    print('---------------------------- CORRELATION MATRIX ---------------------------- ')
    print(data.corr())
    print('-----------------------------------------------------------------------------')

    # Lets plot data to have visual info
    #fig, ax = plt.subplots(1,1)
    #ax.scatter(data.height, data.avg_points_scored)
    #ax.set_xlabel('height')
    #ax.set_ylabel('Average points scored per game')
    #plt.show()
    #fig, ax = plt.subplots(1,1)
    #ax.scatter(data.weight, data.avg_points_scored)
    #ax.set_xlabel('weight')
    #ax.set_ylabel('Average points scored per game')
    #plt.show()
    #fig, ax = plt.subplots(1,1)
    #ax.scatter(data.success_field_goals, data.avg_points_scored)
    #ax.set_xlabel('success_field_goals')
    #ax.set_ylabel('Average points scored per game')
    #plt.show()
    #fig, ax = plt.subplots(1,1)
    #ax.scatter(data.success_free_throws, data.avg_points_scored)
    #ax.set_xlabel('success_free_throws')
    #ax.set_ylabel('Average points scored per game')
    #plt.show()

    # This is highly deviated data against avg_points_scored. It will be difficult for the model
    # to analyze and predict. Let see what happens..

    # Lets break data in 2 pieces (training data (80%) and test data (20%))
    # NOTE(review): ``cross_validation`` is presumably the legacy scikit-learn
    # module of that name -- confirm against the file's imports.
    dataX = data.values.copy()
    TdataX, VdataX, TdataY, VdataY = cross_validation.train_test_split(
        dataX[:, :-1], dataX[:, -1], train_size=0.80)
    # height|weight|success_field_goals|success_free_throws|avg_points_scored
    # |<--------------------------------------------------->|<--------------->|
    # |<-----------------TdataX/VdataX--------------------->|<-TdataY/VdataY->|
    # |<------------Independent Variables ----------------->|<-Dep Variable-->|

    # Lets use Ordinary Least Square (OLS) regression model
    ols = sm.OLS(TdataY, add_constant(TdataX)).fit()
    print(ols.summary())

    # Lets use Ordinary Least Square (OLS) regression model over success_field_goals
    olsSFG = sm.OLS(TdataY, add_constant(TdataX[:, 2])).fit()
    print(olsSFG.summary())

    # Lets see MSE of above two models
    TPredictedOLSY = ols.predict(add_constant(VdataX))
    print(mse(TPredictedOLSY, VdataY))
    TPredictedOLSSFGY = olsSFG.predict(add_constant(VdataX[:, 2]))
    print(mse(TPredictedOLSSFGY, VdataY))

    # Lets plot them.. Get visuals
    #fig, ax = plt.subplots(1, 1)
    #ax.scatter(VdataY, TPredictedOLSY)
    #ax.set_xlabel('Actual')
    #ax.set_ylabel('Predicted')
    #plt.show()
    #fig, ax = plt.subplots(1, 1)
    #ax.scatter(VdataY, TPredictedOLSSFGY)
    #ax.set_xlabel('Actual')
    #ax.set_ylabel('Predicted')
    #plt.show()

    # LETS USE sklearn PACKAGE NOW..
    # create Linear Regression model handle
    lm = linear_model.LinearRegression()
    # Let train the model
    lm.fit(TdataX, TdataY)

    # Intercept and Weights
    print('Intercept is %f' % lm.intercept_)
    # zip() is materialised so the frame is built from a concrete list on
    # both Python 2 and Python 3.
    print(pd.DataFrame(list(zip(data.columns, lm.coef_)),
                       columns=['features', 'estimatedCoefficients']))

    # Cross validate
    CV = cross_validation.cross_val_score(lm, TdataX, TdataY, scoring='r2')
    print(CV)

    # Lets see how system predicts and what is MSE
    TPredictedLMY = lm.predict(VdataX)
    print(mean_squared_error(TPredictedLMY, VdataY))
from statsmodels.tsa.stattools import grangercausalitytests

# Granger causality tests on the column pairs (a, d) and (b, d) of df3,
# with lags up to 5.
grangercausalitytests(df3[['a', 'd']], maxlag=5)
grangercausalitytests(df3[['b', 'd']], maxlag=5)

# Reproducible toy frame: 50 rows of random integers drawn with
# randint(20, 30) in the columns 'test' and 'predictions'.
np.random.seed(42)
df = pd.DataFrame(np.random.randint(20, 30, (50, 2)), columns=['test', 'predictions'])
df.head()
df.plot(figsize=(12, 8))

from statsmodels.tools.eval_measures import mse, rmse, meanabs

# Error measures between the two columns.
mse(df['test'], df['predictions'])
rmse(df['test'], df['predictions'])
meanabs(df['test'], df['predictions'])

df1.head()
df1.index

from statsmodels.graphics.tsaplots import month_plot, quarter_plot

# Month plot of the 'Pass_K' column of df1.
month_plot(df1['Pass_K'])
# Resample with rule 'Q' and sum each bin before the quarter plot.
df1q = df1['Pass_K'].resample(rule='Q').sum()
quarter_plot(df1q)
# Report section of the OLS script. Output uses the single-argument
# ``print(...)`` form, which prints the same text on Python 2 and Python 3.
print("Printing OLS fit summary: ")
print(result.summary())
print("\n")

# use different combination of variables
print("\n")
print("Printing alternative OLS fit summary: ")
# Alternative model: intercept plus predictor column 2 only.
result_alt = sm.OLS(y_train, add_constant(X_train[:, 2])).fit()
print(result_alt.summary())
print("\n")

# apply model on test data set
ypred = result.predict(add_constant(X_valid))
print("Printing mean squared error: ")
# mean-square error
print(mse(ypred, y_valid))
print("\n")

# predict test set
ypred_alt = result_alt.predict(add_constant(X_valid[:, 2]))
print("Printing mean squared error of alternate model: ")
print(mse(ypred_alt, y_valid))
print("\n")

# Predicted against actual values for the full model.
fig7, ax = plt.subplots(1, 1)
ax.scatter(y_valid, ypred)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")

# alternate model
fig8, ax = plt.subplots(1, 1)
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.tools.eval_measures as ste

# Set seed for reproducible results
np.random.seed(414)

# Gen toy data
X = np.linspace(0, 15, 1000)
y = 3 * np.sin(X) + np.random.normal(1 + X, 0.2, 1000)

# First 700 points train, remaining 300 test. The test slice used to start
# at 300 and so overlapped 400 training points.
train_X, train_y = X[:700], y[:700]
test_X, test_y = X[700:], y[700:]

train_df = pd.DataFrame({"X": train_X, "y": train_y})
test_df = pd.DataFrame({"X": test_X, "y": test_y})

# Linear Fit
poly_1 = smf.ols(formula="y ~ 1 + X", data=train_df).fit()
print(ste.mse(poly_1.predict(test_df), test_y, axis=0))

# Quadratic Fit
poly_2 = smf.ols(formula="y ~ 1 + X + I(X**2)", data=train_df).fit()
print(ste.mse(poly_2.predict(test_df), test_y, axis=0))
# Since the 3rd variable is significant and the others aren't based on the pvalue. We'll recreate the model only using that variable.

# <codecell>

result_alternate = sm.OLS(y_train, add_constant(X_train[:, 2])).fit()
result_alternate.summary()

# <markdowncell>

# Lets predict on the test data and see how much is the error

# <codecell>

# Output uses the single-argument ``print(...)`` form, which prints the same
# text on Python 2 and Python 3.
ypred = result.predict(add_constant(X_valid))
print(mse(ypred, y_valid))
ypred_alternate = result_alternate.predict(add_constant(X_valid[:, 2]))
print(mse(ypred_alternate, y_valid))

# <markdowncell>

# Lets see the actual vs predicted for the 1st model

# <codecell>

fig, ax = plt.subplots(1, 1)
ax.scatter(y_valid, ypred)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
plt.show()
################################################################################## # TODO: Use StatsModels to create the Linear Model and Output R-squared ################################################################################# x_train = sm.add_constant(x_train) model = sm.OLS(y_train, x_train) results = model.fit() # Prints out the Report # TODO: print R-squared, test MSE & train MSE print('r-squared') print(results.rsquared) x_test = sm.add_constant(x_test) test_pred = results.predict(x_test) print('MSE test') print(eval_measures.mse(y_test, test_pred)) train_pred = results.predict(x_train) print('MSE train') print(eval_measures.mse(y_train, train_pred)) print('Summary') print(results.summary()) # r-squared # 0.4610741855183632 # MSE test # 0.12307815170253002 # MSE train # 0.12598380510417417