def moving_average(dataframe, window, ahead, file_name):
    # Create `ahead` monthly entries beyond the observed range
    # (note: `format` is not a pd.date_range argument, so it is not passed here)
    date_range = pd.date_range(start=dataframe.index[0],
                               periods=dataframe.count() + ahead, freq='MS')
    # Create a new series to hold the forecast
    forecast_full_frame = pd.Series(data=[np.nan] * len(date_range),
                                    index=date_range)
    # Begin forecasting
    for idx in range(window, len(forecast_full_frame.index)):
        if idx < dataframe.count():
            # In-sample: average the last `window` observed values
            forecast_full_frame.iloc[idx] = round(
                np.mean([dataframe.iloc[idx - i] for i in range(1, window + 1)]))
        else:
            # Out-of-sample: average the last `window` forecasted values
            forecast_full_frame.iloc[idx] = round(
                np.mean([forecast_full_frame.iloc[idx - i]
                         for i in range(1, window + 1)]))
    # Drop all NaN values
    forecast_full_frame.dropna(inplace=True)
    # Future timeframe only
    forecast_partial_frame = forecast_full_frame[
        ~forecast_full_frame.index.isin(dataframe.index)]
    # Root mean squared error of the in-sample one-step forecasts
    rmse = eval_measures.rmse(dataframe[window:dataframe.count()],
                              forecast_full_frame[:dataframe.count() - window])
    return forecast_full_frame, forecast_partial_frame, rmse
def get_rmse():
    """Compute the RMSE based on the relevant parameterization."""
    fname = '../truth/start/data.respy.info'
    probs_true = get_choice_probabilities(fname, is_flatten=True)

    fname = 'start/data.respy.info'
    probs_start = get_choice_probabilities(fname, is_flatten=True)

    fname = 'stop/data.respy.info'
    probs_stop = get_choice_probabilities(fname, is_flatten=True)

    rmse_stop = rmse(probs_stop, probs_true)
    rmse_start = rmse(probs_start, probs_true)

    return rmse_start, rmse_stop
def RMSE(params, *args):
    dataframe = args[0]
    model_type = args[1]  # 'multiplicative'
    alpha, beta, gamma = params
    period_len = args[2]

    smooth = [0] * period_len
    trend = [0] * period_len
    smooth[-1] = sum(dataframe.iloc[0:period_len]) / float(period_len)
    trend[-1] = (sum(dataframe.iloc[period_len:2 * period_len]) -
                 sum(dataframe.iloc[0:period_len])) / period_len ** 2
    forecast = []

    if model_type == 'multiplicative':
        season = [dataframe.iloc[i] / smooth[-1] for i in range(period_len)]
        for i in range(period_len, dataframe.count()):
            smooth.append(alpha * (dataframe.iloc[i] / season[-period_len]) +
                          (1 - alpha) * (smooth[-1] + trend[-1]))
            # trend update uses the change in level l_t - l_{t-1}; after the
            # append above those are smooth[-1] and smooth[-2]
            trend.append(beta * (smooth[-1] - smooth[-2]) + (1 - beta) * trend[-1])
            season.append(gamma * (dataframe.iloc[i] / smooth[-1]) +
                          (1 - gamma) * season[-period_len])
            forecast.append((smooth[-1] + trend[-1]) * season[-period_len])
    else:
        exit('Type must be multiplicative')

    rmse = eval_measures.rmse(dataframe[period_len:], forecast)
    return rmse
def sarimax_fc(train, test, order, seas_order, exog_train=None, exog_test=None):
    model = SARIMAX(train, order=order, exog=exog_train,
                    seasonal_order=seas_order)
    results = model.fit()
    start, end = len(train), len(test) + len(train) - 1
    pred = results.predict(start, end, exog=exog_test,
                           typ='levels').rename('sarima_predictions')
    # compute the RMSE once and reuse it for the percentage-of-mean figure
    rmse_pred = rmse(test, pred)
    rmse_pred_pct = rmse_pred / test.mean()
    results = {
        'prediction': pred,
        'rmse': rmse_pred,
        'rmse_pct': rmse_pred_pct
    }
    return results
def insample_performance(test, forecast_dict, as_dict=False):
    forecasts = forecast_frame(test, forecast_dict)
    dict_perf = {}
    for col, _ in forecasts.items():
        dict_perf[col] = {}
        dict_perf[col]["rmse"] = rmse(forecasts["Target"], forecasts[col])
        dict_perf[col]["mse"] = dict_perf[col]["rmse"] ** 2
        dict_perf[col]["mean"] = forecasts[col].mean()
    if as_dict:
        return dict_perf
    return pd.DataFrame.from_dict(dict_perf)
def get_rms(model, df, y):
    """
    Get the RMSE for a statsmodels model and new data

    :param model: a statsmodels model
    :param df: pandas dataframe containing all the data
    :param y: (array-like) the true responses of the response variable
    :return: a numeric RMSE
    """
    result = model.fit()
    predictions = result.predict(df)
    return rmse(predictions, y)
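# Hypothetical usage sketch for get_rms: fit an OLS model on the first 80
# rows and score the RMSE on a 20-row holdout. The column names and data are
# illustrative; `smf` is statsmodels.formula.api, and `rmse` is assumed to
# come from statsmodels.tools.eval_measures as in the snippet above.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
frame = pd.DataFrame({'x': rng.normal(size=100)})
frame['y'] = 2.0 * frame['x'] + rng.normal(scale=0.1, size=100)
ols_model = smf.ols('y ~ x', data=frame.iloc[:80])
print(get_rms(ols_model, frame.iloc[80:], frame['y'].iloc[80:]))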
def printErrors(test, pred, model):
    '''
    Objective: print the error measures of a model

    Inputs:
        test: test dataframe
        pred: predictions
        model: name of the model that is used

    Outputs:
        Prints the mean absolute error, mean squared error and
        root mean squared error
    '''
    print('MAE of ' + model + ': {:.4}'.format(meanabs(test, pred, axis=0)))
    print('MSE of ' + model + ': {:.4}'.format(mse(test, pred, axis=0)))
    print('RMSE of ' + model + ': {:.4}'.format(rmse(test, pred, axis=0)))
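# Hypothetical call for printErrors, assuming `meanabs`, `mse` and `rmse`
# come from statsmodels.tools.eval_measures; the arrays are illustrative.
import numpy as np

test_vals = np.array([10.0, 12.0, 9.0, 11.0])
pred_vals = np.array([9.0, 12.5, 9.5, 10.0])
printErrors(test_vals, pred_vals, 'naive baseline')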
def log_rrmse(y_tst, y_hat):
    # unwrap pandas objects to plain arrays if needed
    try:
        y_tst = y_tst.values
    except AttributeError:
        pass
    try:
        y_hat = y_hat.values
    except AttributeError:
        pass
    # undo the log transform before computing the relative error
    y_tst = np.exp(y_tst)
    y_hat = np.exp(y_hat)
    rrmse = em.rmse(y_tst, y_hat, axis=0) / y_tst.mean() * 100
    return rrmse
def insample_performance(forecast_frame, as_dict=False):
    dict_perf = {}
    for col, _ in forecast_frame.items():
        dict_perf[col] = {}
        dict_perf[col]["rmse"] = rmse(forecast_frame["Target"],
                                      forecast_frame[col])
        dict_perf[col]["mse"] = dict_perf[col]["rmse"] ** 2
        dict_perf[col]["mean"] = forecast_frame[col].mean()
    if as_dict:
        return dict_perf
    return pd.DataFrame.from_dict(dict_perf)
def rrmse(y_tst, y_hat):
    # unwrap pandas objects to plain arrays if needed
    try:
        y_tst = y_tst.values
    except AttributeError:
        pass
    try:
        y_hat = y_hat.values
    except AttributeError:
        pass
    rrmse = em.rmse(y_tst, y_hat, axis=0) / y_tst.mean() * 100
    return rrmse
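# Quick sanity check for rrmse with plain arrays (assumes numpy as np and
# `import statsmodels.tools.eval_measures as em`, as the function requires);
# the values below are illustrative.
import numpy as np

y_true = np.array([100.0, 110.0, 120.0])
y_pred = np.array([102.0, 108.0, 123.0])
print(rrmse(y_true, y_pred))  # RMSE expressed as a percent of mean(y_true)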
def test_eval_measures():
    # mainly regression tests
    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))

    assert_equal(iqr(x, y), 5 * np.ones(5))
    assert_equal(iqr(x, y, axis=1), 2 * np.ones(4))
    assert_equal(iqr(x, y, axis=None), 9)

    assert_equal(mse(x, y), np.array([73.5, 87.5, 103.5, 121.5, 141.5]))
    assert_equal(mse(x, y, axis=1), np.array([3., 38., 123., 258.]))

    assert_almost_equal(
        rmse(x, y),
        np.array([8.5732141, 9.35414347, 10.17349497, 11.02270384,
                  11.89537725]))
    assert_almost_equal(
        rmse(x, y, axis=1),
        np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]))

    assert_equal(maxabs(x, y), np.array([14., 15., 16., 17., 18.]))
    assert_equal(maxabs(x, y, axis=1), np.array([3., 8., 13., 18.]))

    assert_equal(meanabs(x, y), np.array([7., 7.5, 8.5, 9.5, 10.5]))
    assert_equal(meanabs(x, y, axis=1), np.array([1.4, 6., 11., 16.]))
    assert_equal(meanabs(x, y, axis=0), np.array([7., 7.5, 8.5, 9.5, 10.5]))

    assert_equal(medianabs(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianabs(x, y, axis=1), np.array([1., 6., 11., 16.]))

    assert_equal(bias(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(bias(x, y, axis=1), np.array([1., 6., 11., 16.]))

    assert_equal(medianbias(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianbias(x, y, axis=1), np.array([1., 6., 11., 16.]))

    assert_equal(vare(x, y), np.array([31.25, 31.25, 31.25, 31.25, 31.25]))
    assert_equal(vare(x, y, axis=1), np.array([2., 2., 2., 2.]))
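# Hedged sketch: for NaN-free arrays, the expected values in the test above
# follow from definitions equivalent to the numpy one-liners below (axis=0
# by default). This is an illustration, not the statsmodels source.
import numpy as np

def mse_sketch(x1, x2, axis=0):
    # mean of squared differences along `axis`
    return np.mean((np.asarray(x1) - np.asarray(x2)) ** 2, axis=axis)

def rmse_sketch(x1, x2, axis=0):
    # square root of the mean squared error
    return np.sqrt(mse_sketch(x1, x2, axis=axis))

x = np.arange(20).reshape(4, 5)
y = np.ones((4, 5))
assert np.allclose(mse_sketch(x, y), [73.5, 87.5, 103.5, 121.5, 141.5])
assert np.allclose(rmse_sketch(x, y, axis=1),
                   [1.73205081, 6.164414, 11.09053651, 16.0623784])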
def setErrorData(self, trainingFit, testPredictions):
    auxList = []

    # Root Mean Squared Error - RMSE
    trainingErrorRMSE = round(ms.rmse(self.trainData, trainingFit),
                              ModelSelector._decimalPlaces)
    testErrorRMSE = round(ms.rmse(self.testData, testPredictions),
                          ModelSelector._decimalPlaces)
    auxList.append(obj.ForecastErro('RMSE', testErrorRMSE, 'TEST'))
    auxList.append(obj.ForecastErro('RMSE', trainingErrorRMSE, 'TRAIN'))

    # MAPE is only defined when all values are > 0
    if 0 not in self.data.values:
        trainingErrorMAPE = round(ut.Utils.mape(self.trainData, trainingFit),
                                  ModelSelector._decimalPlaces)
        testErrorMape = round(ut.Utils.mape(self.testData, testPredictions),
                              ModelSelector._decimalPlaces)
        auxList.append(obj.ForecastErro('MAPE', trainingErrorMAPE, 'TRAIN'))
        auxList.append(obj.ForecastErro('MAPE', testErrorMape, 'TEST'))

    # Mean Absolute Scaled Error
    trainingErrorMASE = round(
        ut.Utils.mase(self.trainData.to_numpy(), self.trainData.to_numpy(),
                      trainingFit.to_numpy()),
        ModelSelector._decimalPlaces)
    testErrorMASE = round(
        ut.Utils.mase(self.trainData.to_numpy(), self.testData.to_numpy(),
                      testPredictions.to_numpy()),
        ModelSelector._decimalPlaces)
    auxList.append(obj.ForecastErro('MASE', trainingErrorMASE, 'TRAIN'))
    auxList.append(obj.ForecastErro('MASE', testErrorMASE, 'TEST'))

    return auxList
def app(window, train, test, pred, interval, windo):
    window = window.append(
        {
            'Current test': test.values,
            'Current prediction': pred,
            'MSE': np.square(np.subtract(test.values, pred)).mean(),
            'Glycemia prediction RMSE (mg/dl)': rmse(test.values, pred),
            'PSW': int(round(windo)),
            'Prediction Horizon (minutes)': interval
        },
        ignore_index=True)
    return window
def accuracy(y1, y2):
    accuracy_df = pandas.DataFrame()
    rms_error = numpy.round(rmse(y1, y2), 4)
    map_error = numpy.round(MAPE(y1, y2), 4)
    accuracy_df = accuracy_df.append({
        "RMSE": rms_error,
        "%MAPE": map_error
    }, ignore_index=True)
    return accuracy_df
def croston_method(dataframe, next_periods, alpha=None):
    # Get size of original dataframe
    size = dataframe.count()
    # Create `next_periods` monthly entries beyond the observed range
    # (note: `format` is not a pd.date_range argument, so it is not passed here)
    date_range = pd.date_range(start=dataframe.index[0],
                               periods=size + next_periods, freq='MS')
    # Create a new series for the forecast
    forecast_full_frame = pd.Series(data=[0] * len(date_range),
                                    index=date_range)

    # Non-zero demands, their inter-arrival intervals, and an index map back
    # into the original series
    non_zero_demand, q, index_map = prepare(dataframe)
    # n-th non-zero demand
    n = len(q)
    forecast_non_zero_demand = [0] * n
    inter_arrival = [0] * n

    if alpha is None:
        initial_values = np.array([0.0])
        boundaries = [(0, 1)]
        parameters = fmin_l_bfgs_b(RMSE, x0=initial_values,
                                   args=(non_zero_demand, next_periods),
                                   bounds=boundaries, approx_grad=True)
        alpha = parameters[0]

    forecast_non_zero_demand[1] = non_zero_demand[0]
    inter_arrival[1] = q[0]
    for i in range(2, n):
        forecast_non_zero_demand[i] = (alpha * non_zero_demand[i - 1] +
                                       (1 - alpha) * forecast_non_zero_demand[i - 1])
        inter_arrival[i] = alpha * q[i - 1] + (1 - alpha) * inter_arrival[i - 1]

    # Smoothed values at the observed demand points
    for i in range(n):
        forecast_full_frame.iloc[index_map[i]] = forecast_non_zero_demand[i]
    # Croston forecast for the future: demand estimate / interval estimate
    for i in range(size, size + next_periods):
        forecast_full_frame.iloc[i] = (forecast_non_zero_demand[n - 1] /
                                       inter_arrival[n - 1])

    rmse = eval_measures.rmse(non_zero_demand[1:], forecast_non_zero_demand[1:])
    return forecast_full_frame, forecast_full_frame[-next_periods:], rmse, alpha
def prophet_analysis(df, split, freq, changepoints=3):
    train = df.iloc[:split]
    test = df.iloc[split:]

    # m_eval = Prophet(growth='linear')
    m_eval = Prophet(
        growth='linear',
        n_changepoints=changepoints,
        changepoint_range=0.8,
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=False,
        seasonality_mode='additive',
        seasonality_prior_scale=20,
        changepoint_prior_scale=.5,
        mcmc_samples=0,
        interval_width=0.8,
        uncertainty_samples=500,
    ).add_seasonality(
        name='monthly',
        period=30.5,
        fourier_order=5
    ).add_seasonality(
        name='yearly',
        period=365.25,
        fourier_order=20
    ).add_seasonality(
        name='quarterly',
        period=365.25 / 4,
        fourier_order=5,
        prior_scale=15)

    m_eval.fit(train)
    eval_future = m_eval.make_future_dataframe(periods=test.shape[0], freq=freq)
    eval_forecast = m_eval.predict(eval_future)

    fig, axs = plt.subplots(1, 1, figsize=(15, 4))
    ax1 = sns.lineplot(x='ds', y='yhat', data=eval_forecast,
                       label='Predictions', legend='full')
    ax1 = sns.lineplot(x='ds', y='y', data=train, label='Train True',
                       legend='full', linestyle='-.')
    ax1 = sns.lineplot(x='ds', y='y', data=test, label='Test True',
                       legend='full')
    ax = m_eval.plot(eval_forecast)
    ax = add_changepoints_to_plot(fig.gca(), m_eval, eval_forecast)

    # grab predictions to compare with the test set
    predictions = eval_forecast.iloc[-test.shape[0]:]['yhat']
    print('MAPE = ' + str((abs(np.array(test.y) - predictions) /
                           np.array(test.y)).mean()))
    print('RMSE = ' + str(rmse(predictions, test['y'])))
    print('MEAN = ' + str(df.y.mean()))
    return
def predict(X, Y):
    pre = model.predict(X)
    # both series are modeled on the log scale, so undo the transform
    actual_y_test = np.exp(Y)
    actual_predicted = np.exp(pre)
    # difference on the original price scale
    diff = abs(actual_y_test - actual_predicted)
    compare_actual = pd.DataFrame({
        'Test Data': actual_y_test,
        'Predicted Price': actual_predicted,
        'Difference': diff
    })
    compare_actual = compare_actual.astype(int)
    print("Root Mean Squared Error: ", rmse(actual_predicted, actual_y_test))
    print("Variance Score: ",
          explained_variance_score(actual_y_test, actual_predicted))
    compare_actual.to_csv('results.csv')
def RMSE(params, *args):
    # objective for simple exponential smoothing: one-step-ahead RMSE
    data_frame = args[0]
    alpha = params[0]  # params arrives as a 1-d array from the optimizer
    forecast = [0] * len(data_frame)
    forecast[1] = data_frame[0]
    for index in range(2, len(data_frame)):
        forecast[index] = (alpha * data_frame[index - 1] +
                           (1 - alpha) * forecast[index - 1])
    rmse = eval_measures.rmse(forecast[1:], data_frame[1:])
    return rmse
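# Sketch of how this objective is typically minimized (mirroring the calls
# in the smoothing functions elsewhere in this collection): L-BFGS-B over
# alpha in [0, 1] with a numeric gradient. The demand array is illustrative,
# and the second element of `args` stands in for the unused next_periods.
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

demand = np.array([3.0, 10.0, 12.0, 13.0, 12.0, 10.0, 12.0])
best_alpha, best_rmse, _ = fmin_l_bfgs_b(RMSE, x0=np.array([0.3]),
                                         args=(demand, 0), bounds=[(0, 1)],
                                         approx_grad=True)
print(best_alpha[0], best_rmse)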
def accuracy(y1, y2):
    accuracy_df = pd.DataFrame()
    rms_error = np.round(rmse(y1, y2), 1)
    # MAPE in percent
    map_error = np.round(
        np.mean(np.abs((np.array(y1) - np.array(y2)) / np.array(y1))) * 100, 1)
    accuracy_df = accuracy_df.append({
        "RMSE": rms_error,
        "%MAPE": map_error
    }, ignore_index=True)
    return accuracy_df
def train_and_fit_arima(x, test_split=0.2):
    # run auto-arima grid search
    stepwise_model = auto_arima(x, exogenous=None,
                                start_p=0, d=1, start_q=0,
                                max_p=3, max_d=1, max_q=3,
                                start_P=0, D=1, start_Q=0,
                                max_P=3, max_D=3, max_Q=3,
                                max_order=10, m=12, seasonal=True,
                                trace=True, error_action='ignore',
                                suppress_warnings=True, stepwise=False,
                                approximation=False)
    print(stepwise_model.aic())
    print(stepwise_model.summary())

    split = len(x) - int(test_split * len(x))
    train = x[0:split]
    test = x[split:]
    stepwise_model.fit(train)

    future_forecast = stepwise_model.predict(n_periods=len(test))
    future_forecast = pd.DataFrame(future_forecast, index=test.index,
                                   columns=['Prediction'])

    line_objects = plt.plot(pd.concat([test, future_forecast], axis=1))
    plt.xlabel("Years")
    plt.ylabel("CO2 Levels (ppm)")
    plt.legend(iter(line_objects), ('CO2 Levels', 'Predictions'))
    plt.savefig("Forecast.png")
    plt.show()
    plt.close()

    line_objects = plt.plot(pd.concat([x, future_forecast], axis=1))
    plt.xlabel("Years")
    plt.ylabel("CO2 Levels (ppm)")
    plt.legend(iter(line_objects), ('CO2 Levels', 'Predictions'))
    plt.savefig("Forecast_conc.png")
    plt.show()
    plt.close()

    # compare against the prediction column, not the whole frame
    pred_error = rmse(test, future_forecast['Prediction'])
    print("rmse:", pred_error)

    stepwise_model.plot_diagnostics(figsize=(15, 12))
    plt.savefig("Diagnostic.png")
    plt.show()
    plt.close()
def regression_stats(model):
    y_preds_test = model.predict(X_test)
    # create df for model results
    model_vals = [
        model.score(X_train, y_train),
        model.score(X_test, y_test),
        mean_absolute_error(y_test, y_preds_test),
        mse(y_test, y_preds_test),
        rmse(y_test, y_preds_test),
        np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100,
    ]
    mapping = {
        "stat": ["train R^2", "test R^2", "MAE", "MSE", "RMSE", "MAPE"],
        "model": model_vals,
    }
    stats_df = pd.DataFrame.from_dict(mapping)
    return stats_df
def gen_results(d):
    # exp = d.query('Trial > 10 and Time > 5')
    exp = d.query('Time > 5')

    error = exp.groupby(['Day', 'Subject', 'Trial']).ae.mean().reset_index()
    error.columns = ['Day', 'Subject', 'Trial', 'AbsoluteError']

    rms = exp.groupby(['Day', 'Subject', 'Trial']).apply(
        lambda x: rmse(x.y, x.yg)).reset_index()
    rms.columns = ['Day', 'Subject', 'Trial', 'RMSE']

    var = exp.groupby(['Day', 'Subject', 'Trial']).apply(
        lambda x: vare(x.y, x.yg)).reset_index()
    var.columns = ['Day', 'Subject', 'Trial', 'VARE']

    crossings = exp.groupby(['Day', 'Subject', 'Trial']).apply(
        lambda x: len(cross(x.e))).reset_index()
    crossings.columns = ['Day', 'Subject', 'Trial', 'Crossings']

    rt = find_response_times(exp, trials)
    response_time = rt.groupby(
        ['Day', 'Subject', 'Trial']).mean().ResponseTime.reset_index()

    td = exp.groupby(['Day', 'Subject', 'Trial']).apply(
        lambda x: recover_shift(x['Time'], x['y'], x['yg'])).reset_index()
    time_delay = td.groupby(
        ['Day', 'Subject', 'Trial']).mean()[0].reset_index().abs()
    time_delay.columns = ['Day', 'Subject', 'Trial', 'LagTime']

    # entropy = generate_entropy_results(exp)
    # res = error.merge(rms).merge(var).merge(crossings).merge(response_time, how='outer').merge(time_delay).merge(entropy)
    res = error.merge(rms).merge(var).merge(crossings).merge(
        response_time, how='outer').merge(time_delay)
    res['Feedback'] = res.Subject % 2 == 1
    res = res.merge(trials[['Trial']])
    res = res.sort_values(['Day', 'Subject', 'Trial'])
    # res['SecondaryTask'] = res['Secondary_Task']
    res = res[[
        'Day', 'Subject', 'Trial', 'AbsoluteError', 'RMSE', 'VARE',
        'ResponseTime', 'LagTime', 'Crossings', 'Feedback'
    ]]
    res = res.reset_index(drop=True)
    res['ID'] = (res.Day - 1) * res.Trial.max() + res.Trial
    return res
def multiplicative(input_dataframe, period_len, next_periods,
                   alpha=None, beta=None, gamma=None):
    dataframe = input_dataframe.copy()
    # Get size of original dataframe
    t = dataframe.count()
    # Create the forecast index starting one season into the data
    date_range = pd.date_range(start=dataframe.index[period_len],
                               periods=t, freq='MS')
    forecast = pd.Series(data=[np.nan] * len(date_range), index=date_range)

    if alpha is None or beta is None or gamma is None:
        initial_values = np.array([0.0, 1.0, 0.0])
        boundaries = [(0, 1), (0, 1), (0, 1)]
        model_type = 'multiplicative'
        parameters = fmin_l_bfgs_b(RMSE, x0=initial_values,
                                   args=(dataframe, model_type, period_len),
                                   bounds=boundaries, approx_grad=True)
        alpha, beta, gamma = parameters[0]

    smooth = [0] * period_len
    trend = [0] * period_len
    smooth[-1] = sum(dataframe.iloc[0:period_len]) / float(period_len)
    trend[-1] = (sum(dataframe.iloc[period_len:2 * period_len]) -
                 sum(dataframe.iloc[0:period_len])) / period_len ** 2
    season = [dataframe.iloc[i] / smooth[-1] for i in range(period_len)]

    for i in range(period_len, t + next_periods):
        if i >= t:
            # out-of-sample: project level + T * trend, reuse the seasonals
            T = i - t
            forecast.iloc[i - period_len] = ((smooth[t - 1] + T * trend[t - 1]) *
                                             season[i - period_len])
        else:
            smooth.append(alpha * (dataframe.iloc[i] / season[-period_len]) +
                          (1 - alpha) * (smooth[-1] + trend[-1]))
            # trend update uses the change in level (smooth[-1] - smooth[-2])
            trend.append(beta * (smooth[-1] - smooth[-2]) +
                         (1 - beta) * trend[-1])
            season.append(gamma * (dataframe.iloc[i] / smooth[-1]) +
                          (1 - gamma) * season[-period_len])
            forecast.iloc[i - period_len] = ((smooth[-1] + trend[-1]) *
                                             season[-period_len])

    rmse = eval_measures.rmse(dataframe[period_len:], forecast[:-period_len])
    return forecast, alpha, beta, gamma, rmse
def Function(params):
    a, b = params
    res = []
    for ibasin in range(0, 1):  # 10):
        for istation in good_stations[ibasin]:
            # print(ibasin, istation)
            data = scipy.io.loadmat('%s/%s_AP.mat' % (datadir, ibasin + 1))
            index = np.where(
                geoinfo[:, 0] == data['station_name'][0, istation])[0]
            # pan_obs = data['pan'][0, istation][0:tstep].flatten()
            pan_obs_gapfill = Gapfill(data['pan'][0, istation][0:tstep].flatten())
            # Prepare the input data for the Epan calculation
            INPUT = {vars_penpan[i]: Gapfill(data[v][0, istation][0:tstep].flatten())
                     for i, v in enumerate(vars_penpan[:-2])}
            INPUT['doy'] = doys.flatten()
            INPUT['lat'] = geoinfo[index, 1]
            INPUT['elev'] = geoinfo[index, 3]
            pan_mod = Data(INPUT, 'cloud').Penpan_u2(a, b)
            res.append(evaluate.rmse(daily2monthly(pan_mod),
                                     daily2monthly(pan_obs_gapfill)))
    return vstack(res).mean()
def simple_exponential_smoothing(dataframe, next_periods, alpha=None):
    # Get size of original dataframe
    size = dataframe.count()
    # Create `next_periods` monthly entries beyond the observed range
    # (note: `format` is not a pd.date_range argument, so it is not passed here)
    date_range = pd.date_range(start=dataframe.index[0],
                               periods=size + next_periods, freq='MS')
    # Create a new series for the forecast
    forecast_full_frame = pd.Series(data=[np.nan] * len(date_range),
                                    index=date_range)

    if alpha is None:
        initial_values = np.array([0.0])
        boundaries = [(0, 1)]
        parameters = fmin_l_bfgs_b(RMSE, x0=initial_values,
                                   args=(dataframe, next_periods),
                                   bounds=boundaries, approx_grad=True)
        alpha = parameters[0]

    # Begin forecasting
    for idx in range(len(forecast_full_frame.index) - 1):
        if idx == 0:
            forecast_full_frame.iloc[idx + 1] = dataframe.iloc[idx]
        elif idx < size:
            forecast_full_frame.iloc[idx + 1] = (
                alpha * dataframe.iloc[idx] +
                (1 - alpha) * forecast_full_frame.iloc[idx])
        else:
            forecast_full_frame.iloc[idx + 1] = forecast_full_frame.iloc[idx]

    # Drop all NaN values
    forecast_full_frame.dropna(inplace=True)
    # Future timeframe only
    forecast_partial_frame = forecast_full_frame[
        ~forecast_full_frame.index.isin(dataframe.index)]
    # Root mean squared error of the one-step-ahead fit
    rmse = eval_measures.rmse(dataframe[1:size],
                              forecast_full_frame[0:size - 1])
    return forecast_full_frame, forecast_partial_frame, rmse, alpha
def get_best_model(train, test):
    # Step 1: specify the form of the model
    model_formula = ("total_cases ~ 1 + "
                     "reanalysis_specific_humidity_g_per_kg + "
                     "reanalysis_dew_point_temp_k + "
                     "reanalysis_min_air_temp_k + "
                     "station_min_temp_c + "
                     "station_max_temp_c + "
                     "station_avg_temp_c")

    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula, data=full_dataset,
                    family=sm.families.Gaussian())
    fitted_model = model.fit()
    # score the fitted model on the full dataset
    acc = eval_measures.rmse(fitted_model.predict(full_dataset).astype(int),
                             full_dataset.total_cases)
    return fitted_model, acc
def rolling_tscv(series, trend, seasonal, seasonal_periods, damped, boxcox,
                 initial_train_window, test_window):
    i = 0
    x = initial_train_window
    t = test_window
    errors_roll = []
    while (i + x + t) < len(series):
        train_ts = series[i:(i + x)].values
        test_ts = series[(i + x):(i + x + t)].values
        model_roll = ExponentialSmoothing(
            train_ts, trend=trend, seasonal=seasonal,
            seasonal_periods=seasonal_periods,
            damped=damped).fit(use_boxcox=boxcox)
        fcast = model_roll.forecast(t)
        error_roll = rmse(test_ts, fcast)
        errors_roll.append(error_roll)
        i = i + 1
    return np.mean(errors_roll).round(1)
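# Hypothetical call for rolling_tscv: score a damped additive Holt-Winters
# specification with a 36-month training window and a 12-month test window.
# The synthetic series is illustrative, and this assumes a statsmodels
# version whose ExponentialSmoothing still accepts the `damped` keyword
# (newer releases rename it to `damped_trend`).
import numpy as np
import pandas as pd

idx = pd.date_range('2010-01-01', periods=96, freq='MS')
y = pd.Series(100 + 0.5 * np.arange(96) +
              10 * np.sin(np.arange(96) * 2 * np.pi / 12), index=idx)
score = rolling_tscv(y, trend='add', seasonal='add', seasonal_periods=12,
                     damped=True, boxcox=False,
                     initial_train_window=36, test_window=12)
print(score)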
def eval_metrics(forecast, observed):
    '''Return forecast evaluation metrics.

    Parameters
    ----------
    forecast : pd.Series
        Forecasted values.
    observed : pd.Series
        Observed values.

    Returns
    -------
    mae : float
        Mean Absolute Error metric.
    rmserr : float
        Root Mean Squared Error metric. Named rmserr to avoid conflicting
        with the statsmodels rmse function.
    mape : float
        Mean Absolute Percentage Error metric, in percent.
    '''
    return (meanabs(forecast, observed),
            rmse(forecast, observed),
            (((forecast - observed).abs() / observed).mean()) * 100)
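# Usage sketch for eval_metrics: the three return values are MAE, RMSE and
# MAPE (percent). The series below are illustrative.
import pandas as pd

observed = pd.Series([10.0, 12.0, 11.0, 13.0])
forecast = pd.Series([9.5, 12.5, 10.0, 13.5])
mae, rmserr, mape = eval_metrics(forecast, observed)
print(mae, rmserr, mape)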
def RMSE(params, *args):
    # objective for double exponential smoothing (Holt's linear trend)
    data_frame = args[0]
    alpha, beta = params
    # Init
    smooth, trend = data_frame.iloc[0], data_frame.iloc[1] - data_frame.iloc[0]
    forecast = pd.Series(data=[np.nan] * data_frame.count(),
                         index=data_frame.index)
    forecast.iloc[0] = data_frame.iloc[0]
    size = data_frame.count()
    for n in range(1, size):
        last_smooth, smooth = smooth, (alpha * data_frame.iloc[n] +
                                       (1 - alpha) * (smooth + trend))
        trend = beta * (smooth - last_smooth) + (1 - beta) * trend
        forecast.iloc[n] = smooth + trend
    rmse = eval_measures.rmse(data_frame, forecast)
    return rmse
def plotPrediction(self, fit, ax):
    """Plot predicted vs. test"""
    sub = self.sub
    if len(sub) == 0:
        sub = self.X.index
    # hold-out rows: everything not used in the fit
    Xout = self.X.loc[~self.X.index.isin(sub)]
    yout = self.y.loc[~self.y.index.isin(sub)]
    ypred = fit.predict(Xout)
    ax.scatter(yout, ypred, alpha=0.6, edgecolor='black', color='blue',
               lw=0.5, label='fit')
    ax.plot(ax.get_xlim(), ax.get_xlim(), ls="--", lw=2, c=".2")
    ax.set_xlabel('test')
    ax.set_ylabel('predicted')
    ax.set_title('predicted vs test data')
    import statsmodels.tools.eval_measures as em
    yt = yout.squeeze().values
    rmse = em.rmse(yt, ypred)
    ax.text(0.9, 0.1, 'rmse: ' + str(round(rmse, 3)),
            ha='right', va='top', transform=ax.transAxes)
    return
def calculate_total_error(actual, predictions, df):
    """
    Calculate the root mean square error (RMSE), the mean of the actual
    series, and the error as a percentage of that mean.

    Inputs:
        actual: values of the actual data series
        predictions: values of the prediction data series
        df: dataframe of all values

    Outputs (printed):
        root mean squared error of the two series
        mean of the actual series
        percent: RMSE as a percentage of the actual mean

    Means and errors are formatted as integers; percent is formatted with
    one decimal place.
    """
    end_date = df.index[-1]
    actual = actual[:end_date]
    predictions = predictions[:end_date]
    error = rmse(actual, predictions)
    print(f'{error:.0f}', 'RMSE')
    CancMean = actual.mean()
    print(f'{CancMean:.0f}', 'Mean')
    percent = error / CancMean * 100
    print(f'{percent:.1f}', '% Error')
def mlr_array(Y, X, MASK=None, MASKnodata=None, Ynodata=None, Xnodata=None):
    if MASK is not None:
        X = [np.where(MASK == MASKnodata, np.NaN, x) for x in X]
        Y = np.where(MASK == MASKnodata, np.NaN, Y)
    # also mask the arrays' nodata (all X's must share one nodata value!)
    if Ynodata is not None:
        Y = np.where(Y == Ynodata, np.NaN, Y)
    if Xnodata is not None:
        # compare each predictor grid, not the list itself, to the nodata value
        X = [np.where(x == Xnodata, np.NaN, x) for x in X]
    # reshape 2-D grids to 1-D vectors
    Y = np.reshape(Y, (Y.shape[0] * Y.shape[1]))
    X = [np.reshape(x, (x.shape[0] * x.shape[1])) for x in X]
    # mask NaNs
    mask = 0
    for x in X:
        mask = np.where(np.isnan(x), 1, mask)
    mask = np.where(np.isnan(Y), 1, mask)
    X = [np.where(mask == 1, np.NaN, x) for x in X]
    Y = np.where(mask == 1, np.NaN, Y)
    # retrieve valid values
    X = [x[~np.isnan(x)] for x in X]
    Y = Y[~np.isnan(Y)]
    # prepare predictors
    X = np.array(X).T
    X = sm.add_constant(X)
    model = sm.OLS(Y, X).fit()
    m_predict = model.predict(X)
    print('Model', 'RMSE', 'Predict', 'Y', 'X')
    m_list = [model, rmse(Y, m_predict), m_predict, Y, X]
    return m_list
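# Hypothetical call for mlr_array: regress a 2-D response grid on two
# predictor grids of the same shape. Assumes `import statsmodels.api as sm`
# and the eval_measures `rmse` import used by the function; the synthetic
# grids and coefficients are illustrative.
import numpy as np

rng = np.random.default_rng(1)
x1 = rng.normal(size=(10, 10))
x2 = rng.normal(size=(10, 10))
Y = 1.5 * x1 - 0.5 * x2 + rng.normal(scale=0.1, size=(10, 10))
ols_model, rms_err, *_ = mlr_array(Y, [x1, x2])
print(rms_err)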
def forecast_arima(df: pd.DataFrame, cols: list, with_graph: bool = True):
    # ARIMA(p, d, q) specification
    p = 0
    d = 1
    q = 0
    steps = 50
    for col in cols:
        model = ARIMA(df[col].iloc[:-steps], order=(p, d, q))
        model_fit = model.fit()
        model_for = model_fit.get_forecast(steps=steps, alpha=0.05)
        print('\t==== Summary of forecast ARIMA(%d, %d, %d) ====\n' % (p, d, q))
        print(model_for.summary_frame(), model_for.conf_int(), sep='\n')
        print('RMSE: %f\nMAE: %f' %
              (rmse(df[col][-steps:], model_for.predicted_mean),
               meanabs(df[col][-steps:], model_for.predicted_mean)))
        print()
        if with_graph is True:
            plt.figure(figsize=(12, 5))
            plt.xlabel(col)
            plt.title('Forecast for %s using ARIMA(%d, %d, %d)' %
                      (col, p, d, q))
            # the forecast is the estimate; the held-out data are the actuals
            ax1 = model_for.predicted_mean.plot(color='blue', grid=True,
                                                label='Estimated')
            ax2 = df[col][-steps:].plot(color='red', grid=True,
                                        secondary_y=True, label='Actual')
            h1, l1 = ax1.get_legend_handles_labels()
            h2, l2 = ax2.get_legend_handles_labels()
            plt.legend(h1 + h2, l1 + l2, loc=2)
            plt.show()
def execute(self, filename):
    df1 = pd.read_csv('data/' + filename, index_col='Date', parse_dates=True)
    df1.index.freq = 'MS'

    df = pd.read_csv('./data/' + filename)
    df.columns = ['ds', 'y']
    df['ds'] = pd.to_datetime(df['ds'])

    m = Prophet()
    m.fit(df)
    future = m.make_future_dataframe(periods=24, freq='MS')
    forecast = m.predict(future)
    filename = filename[:-4]
    m.plot(forecast).savefig(os.path.join(
        'static/', secure_filename(filename + '_prophetPredict.jpg')))

    # 80% for training
    train = df.iloc[:int(len(df) * 0.8)]
    test = df.iloc[len(train):]
    m = Prophet()
    m.fit(train)
    future = m.make_future_dataframe(periods=len(test), freq='MS')
    forecast = m.predict(future)
    # print(forecast.tail())

    ax = forecast.plot(x='ds', y='yhat', label='Predictions', legend=True,
                       figsize=(12, 8))
    g = test.plot(x='ds', y='y', label='True Miles', legend=True, ax=ax,
                  xlim=('2018-01-01', '2019-01-01'))
    g.figure.savefig(os.path.join(
        'static/', secure_filename(filename + '_prophetCompare.jpg')))

    predictions = forecast.iloc[len(train):]['yhat']
    error = rmse(predictions, test['y'])
    # use the mean of the observed column, not of the whole frame
    mean = test['y'].mean()
    print('percentage')
    self.accuracy = 100 - (error / mean * 100)

    data = dict()
    data['stationary'] = self.adf_test(df1)
    data['accuracy'] = str(self.accuracy)
    return data

# a = ProphetModel()
# print(a.execute('BeerWineLiquor.csv'))
def double_exponential_smoothing(series, next_periods, alpha=None, beta=None):
    # Get size of original series
    size = series.count()
    # Create `next_periods` monthly entries beyond the observed range
    # (note: `format` is not a pd.date_range argument, so it is not passed here)
    date_range = pd.date_range(start=series.index[0],
                               periods=size + next_periods, freq='MS')
    forecast = pd.Series(data=[np.nan] * len(date_range), index=date_range)
    forecast.iloc[0] = series.iloc[0]
    # Init
    smooth, trend = series.iloc[0], series.iloc[1] - series.iloc[0]
    # Estimate alpha and beta if they were not supplied
    if alpha is None or beta is None:
        initial_values = np.array([0.0, 1.0])
        boundaries = [(0, 1), (0, 1)]
        parameters = fmin_l_bfgs_b(RMSE, x0=initial_values,
                                   args=(series, next_periods),
                                   bounds=boundaries, approx_grad=True)
        alpha, beta = parameters[0]

    for n in range(1, size):
        last_smooth, smooth = smooth, (alpha * series.iloc[n] +
                                       (1 - alpha) * (smooth + trend))
        trend = beta * (smooth - last_smooth) + (1 - beta) * trend
        forecast.iloc[n] = smooth + trend
    # project level + m * trend beyond the data
    for n in range(size, size + next_periods):
        m = n - size + 1
        forecast.iloc[n] = smooth + m * trend

    rmse = eval_measures.rmse(series, forecast[:-next_periods])
    return forecast, rmse, alpha, beta
def regframe(X, Y, mod, idx):
    # rsq, mae, mse, rmse, mape
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                        shuffle=False)
    model = mod.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    k_fold = KFold(n_splits=10, shuffle=False)
    df = pd.Series(
        {
            'rsq_train': model.score(x_train, y_train),
            'rsq_test': model.score(x_test, y_test),
            'subt_rsq': (model.score(x_train, y_train) -
                         model.score(x_test, y_test)),
            'mae_test': mean_absolute_error(y_test, y_pred),
            'mse_test': mse(y_test, y_pred),
            'rmse_test': rmse(y_test, y_pred),
            'mape_test': np.mean(np.abs((y_test - y_pred) / y_test)) * 100,
            'cross-score': cross_val_score(estimator=mod, X=X, y=Y,
                                           cv=k_fold).mean(),
            'cross-train': cross_val_score(estimator=mod, X=x_train,
                                           y=y_train, cv=k_fold).mean()
        },
        name=idx)
    return df
def sm_fit(X, Y, alpha=None, L1_wt=0.0):
    actual_v_predicted_plot = bokeh.plotting.figure(tools=['save'],
                                                    x_axis_type='log',
                                                    y_axis_type='log')
    resid_v_actual_plot = bokeh.plotting.figure(tools=['save'])
    cv_rmse = []
    ts = TimeSeriesSplit(7)
    for train_index, test_index in ts.split(X):
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]
        model = sm.OLS(Y_train, X_train)
        if alpha is None:
            reg_results = model.fit()
        else:
            reg_results = model.fit_regularized(alpha=alpha, L1_wt=L1_wt)
        sm_plot_actual_v_predicted(actual_v_predicted_plot, reg_results,
                                   X_test, Y_test[:, 0])
        sm_plot_resid_v_actual(resid_v_actual_plot, reg_results,
                               X_test, Y_test[:, 0])
        cv_rmse.append(rmse(reg_results.predict(X_test), Y_test[:, 0]))
    cv_rmse = pd.Series(cv_rmse, name='rmse').reset_index()
    return reg_results, resid_v_actual_plot, actual_v_predicted_plot, cv_rmse
def calc_nrmse(df_target, df_new):
    """
    Calculates the normalized root mean square error (NRMSE) between the
    target input dataframe and the simulated output dataframe to determine
    whether the error decreased with a new mutation or not.

    Input(s):
        df_target is the user-inputted tsv file containing transcript
        abundances for each gene.
        df_new is the simulator-generated tsv file containing transcript
        abundances for each gene.

    Output(s):
        A floating point number: the RMSE of each column normalized by that
        column's target mean, averaged across columns.
    """
    # Confirms that the dataframes are the same shape
    assert df_target.shape == df_new.shape
    assert all(df_target.columns == df_new.columns)
    assert all(df_target.index == df_new.index)

    norm_errs = []
    # Performs a normalized RMSE to help determine the fitness of the new genome
    for column in df_target.columns:
        nrmse = rmse(df_target[column], df_new[column]) / \
            np.mean(df_target[column])
        norm_errs.append(nrmse)
    return np.mean(norm_errs)
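# Minimal check for calc_nrmse with two small abundance tables; the gene
# names and values are illustrative only.
import pandas as pd

df_target = pd.DataFrame({'geneA': [10.0, 12.0], 'geneB': [5.0, 7.0]})
df_new = pd.DataFrame({'geneA': [11.0, 12.0], 'geneB': [5.0, 6.0]})
print(calc_nrmse(df_target, df_new))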
def sm_forest_fit(X, Y, tuning_parameters=None):
    if tuning_parameters is not None:
        max_depth, min_samples_leaf, n_estimators, max_features = tuning_parameters
        max_depth = int(round(max_depth))
        min_samples_leaf = int(round(min_samples_leaf))
        n_estimators = int(round(n_estimators))
    else:
        max_depth = 3
        min_samples_leaf = 1
        n_estimators = 10
        max_features = 'auto'
    actual_v_predicted_plot = bokeh.plotting.figure(tools=['save'],
                                                    x_axis_type='log',
                                                    y_axis_type='log')
    resid_v_actual_plot = bokeh.plotting.figure(tools=['save'])
    cv_rmse = []
    ts = TimeSeriesSplit(7)
    for train_index, test_index in ts.split(X):
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]
        reg_results = RandomForestRegressor(max_depth=max_depth,
                                            min_samples_leaf=min_samples_leaf,
                                            n_estimators=n_estimators,
                                            max_features=max_features,
                                            n_jobs=-1)
        # fit on the training fold only; fitting on all of X and Y here
        # would leak the test fold into the model
        reg_results.fit(X_train, Y_train[:, 0])
        sm_plot_actual_v_predicted(actual_v_predicted_plot, reg_results,
                                   X_test, Y_test[:, 0])
        sm_plot_resid_v_actual(resid_v_actual_plot, reg_results,
                               X_test, Y_test[:, 0])
        cv_rmse.append(rmse(reg_results.predict(X_test), Y_test[:, 0]))
    cv_rmse = pd.Series(cv_rmse, name='rmse').reset_index()
    return reg_results, resid_v_actual_plot, actual_v_predicted_plot, cv_rmse
lstm_predictions_scaled.append(lstm_pred)
current_batch = np.append(current_batch[:, 1:, :], [[lstm_pred]], axis=1)

lstm_predictions_scaled
lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
lstm_predictions

test_data_sa['LSTM_Predictions'] = lstm_predictions
test_data_sa
test_data_sa['New Cases'].plot(figsize=(16, 5), legend=True)
test_data_sa['LSTM_Predictions'].plot(legend=True)

lstm_rmse_error_sa = rmse(test_data_sa['New Cases'],
                          test_data_sa['LSTM_Predictions'])
lstm_mse_error_sa = lstm_rmse_error_sa ** 2
mean_value = sa['value'].mean()

#%%
scaler = MinMaxScaler()
scaler.fit(train_data_korea)
scaled_train_data = scaler.transform(train_data_korea)
scaled_test_data = scaler.transform(test_data_korea)

n_input = 7
n_features = 1
generator = TimeseriesGenerator(scaled_train_data, scaled_train_data,
                                length=n_input,
def regression(dependent_str, model_list, dataset, independents_filter):
    # first we create the model for this dependent variable with the entire dataset
    independents_str = " + ".join(model_list)
    print(independents_str)
    # https://stackoverflow.com/questions/48522609/how-to-retrieve-model-estimates-from-statsmodels
    X = dataset[sorted(independents_filter)]
    y = dataset[dependent_str]
    model = smf.ols(formula=dependent_str + " ~ " + independents_str,
                    data=dataset).fit()

    # then we calculate the average fitness (normalized RMSE) using k-fold
    # cross validation
    kf = KFold(n_splits=configuration.kfold, shuffle=True, random_state=1)
    fitness_norm = 0
    fitness = 0
    for train, test in kf.split(dataset):
        model_t = smf.ols(formula=dependent_str + " ~ " + independents_str,
                          data=dataset.iloc[train]).fit()
        # filter columns
        X = dataset[sorted(independents_filter)]
        y = dataset[dependent_str]
        # filter rows
        X = X.iloc[test]
        y = y.iloc[test]
        # generate predictions and metric
        ypred = model_t.predict(X)
        r = rmse(y, ypred)
        rmse_norm = round(r / (max(y) - min(y)), 3)
        # print("rmse_norm = ", rmse_norm)
        fitness_norm = fitness_norm + rmse_norm
        fitness = fitness + r
        # to be able to check manually
        # print(y)
        # print(ypred)
    fitness_norm = round(fitness_norm / configuration.kfold, 3)
    fitness = round(fitness / configuration.kfold, 3)
    # print(fitness_norm, fitness)

    rsquared = round(model.rsquared, 3)
    rsquared_adj = round(model.rsquared_adj, 3)

    X = dataset[sorted(independents_filter)]
    model_y = dataset[dependent_str]
    # model_y_pred = model_t.predict(X)

    # compare with random values
    # df_random = pd.DataFrame(np.random.randint(1, 100, size=(len(model_y), 1)))
    randomlist = random.sample(range(1, 100), len(model_y))
    rmse_random = rmse(model_y, randomlist)
    # print("rmse_random", rmse_random)

    # return (dep + " ~ " + independents, rsquared, rsquared_adj, fitness_norm,
    #         fitness, model.summary(), model_y, model_y_pred)
    return (dependent_str + " ~ " + independents_str, rsquared, rsquared_adj,
            fitness_norm, fitness, model.summary(), 0, 0, rmse_random)
def hw(x, horizon, params, quantile=None, verbose=True, boxcox=False):
    """
    Holt-Winters point prediction
    :param x: input time series (assume equally spaced)
    :param horizon: number of points to predict
    :param params: dict with keys and values for specified parameters.
           Keys must be in the set level, trend, season, damped.
           All keys must be specified. Examples (the first component in the
           tuple prevails):
           - params['level'] = None: compute the optimal level alpha
           - params['level'] = <positive_number>: set alpha = <positive_number> (between 0 and 1)
           - params['trend'] = [None, None]: set beta = 0
           - params['trend'] = ['A', None]: use additive trend and compute optimal beta
           - params['trend'] = ['M', <positive_number>]: use multiplicative trend and set beta = <positive_number> (between 0 and 1)
           - params['season'] = [None, <seasonality>, None]: set s_len = 1, gamma = 0 (no seasons). The value of <seasonality> is ignored
           - params['season'] = ['A', None, None]: try season detection to set the 2nd component (season length)
           - params['season'] = ['A', <seasonality>, None]: use additive seasonality, compute optimal gamma assuming season length equals <seasonality>
           - params['season'] = ['M', <seasonality>, <positive_number>]: use multiplicative seasonality, set gamma = <positive_number> and assume season length equals <seasonality>
           - params['damped'] = [None, <anything>]: error: damped can only be True or False
           - params['damped'] = [False, <anything>]: set <anything> = 1. No damping
           - params['damped'] = [True, None]: damping present. Compute optimal value for phi
           - params['damped'] = [True, <positive_number>]: damping present. Set phi = <positive_number> (between 0 and 1)
    :param quantile: error band to return, when not None
    :param verbose: whether to print some information while executing
    :return: dict with df_out, rmse, params and states
             df_out is a DataFrame with columns:
             - y: input data
             - yhat: point forecasts and forecasts
             - ylwr: lower forecast bound on forecasts
             - yupr: upper forecast bound on forecasts
             - lj_pval: Ljung-Box independence test p-values on transformed errors (the min of the p-values is shown: worst case)
             - sw_pval: Shapiro-Wilks test p-value on transformed errors
             - lbda: BoxCox transform parameter used for the forecast bounds
             rmse is the rmse on the data
             params is the model parameter dictionary. It includes the values
             for alpha, beta, gamma, phi and s_len that minimize the one-step
             prediction rmse, or that were supplied.
             states holds the level, trend and season state series.
    """
    if set(params.keys()) != {'level', 'trend', 'season', 'damped'}:
        print('invalid params keys: ' + str(params.keys()))
        return None

    opt_pars = list()  # list of params to optimize

    # damping
    if len(params['damped']) != 2:
        print('invalid damping parameters: ' + str(params['damped']))
        return None
    if params['damped'][0] not in [True, False]:
        print('invalid damped parameter: ' + str(params['damped']))
        return None
    if params['damped'][0] is True:
        if params['damped'][1] is None:
            opt_pars.append('damped')
        else:
            if not (0.0 < params['damped'][1] <= 0.98):  # 0.98: see Hyndman
                print('invalid damped parameter: ' + str(params['damped']))
                return None
    else:
        params['damped'][1] = 1.0  # no damping

    # seasons
    if len(params['season']) != 3:
        print('invalid seasonality parameters: ' + str(params['season']))
        return None
    if params['season'][0] not in [None, 'A', 'M']:
        print('invalid seasonality parameters: ' + str(params['season']))
        return None
    if params['season'][0] is None:
        params['season'] = [None, 1, 0.0]
    else:
        if params['season'][1] <= 1:
            params['season'][1] = hwu.get_season(x)
            if params['season'][1] <= 1.0:
                print('invalid seasonality parameters: ' + str(params['season']))
                return None
            else:
                if verbose:
                    print('using automated season detection. Seasonality: ' +
                          str(params['season'][1]))
        if params['season'][2] is None:
            opt_pars.append('season')
        else:
            if not (0.0 <= params['season'][2] <= 1.0):
                print('invalid season parameter: ' + str(params['season']))
                return None
    if params['season'][1] > 1 and len(x) < 4 * params['season'][1]:
        print('not enough data for seasonality')
        return None

    # trend
    if len(params['trend']) != 2:
        print('invalid trend parameters: ' + str(params['trend']))
        return None
    if params['trend'][0] not in [None, 'A', 'M']:
        print('invalid trend parameters: ' + str(params['trend']))
        return None
    if params['trend'][0] is None:
        params['trend'][1] = 0.0
    else:
        if params['trend'][1] is None:
            opt_pars.append('trend')
        else:
            if not (0.0 <= params['trend'][1] <= 1.0):
                print('invalid trend parameter: ' + str(params['trend']))
                return None

    # level
    if params['level'] is None:
        opt_pars.append('level')
    else:
        if not (0.0 <= params['level'] <= 1.0):
            print('invalid level parameter: ' + str(params['level']))
            return None

    # update params to get optimal parameters if needed
    Y = list(x[:])
    if len(opt_pars) > 0:
        get_pars(Y, params, opt_pars)

    alpha = params['level']
    trend, beta = params['trend']
    season, s_len, gamma = params['season']
    phi = params['damped'][1]
    if beta <= EPS:
        trend, beta = None, 0.0
        params['trend'] = [None, 0.0]
    if gamma <= EPS:
        season, s_len, gamma = None, 1, 0.0
        params['season'] = [None, 1, 0.0]
    if phi <= EPS:
        phi = 0.0
        params['damped'] = [True, 0.0]  # record the effective damping
    if verbose:
        print('model parameters: ' + str(params))

    # initialize: set a[0], b[0], s[0]
    a, b, s = hwu.initialize(trend, season, Y, s_len, verbose)

    # HW main iteration
    yhat = list()  # one-step point forecast: yhat_{t|t-1}, 1 <= t <= N
    yint = list()  # multi-step predictions: yhat_{t+h|t-1}, 0 <= h <= N-t, 1 <= t <= N
    for i in range(len(Y)):  # note that index = i fills position i + 1
        # update the HW parameters
        a.append(hwu.level(alpha, phi, Y[i], a[i], b[i], s[i], trend, season))
        b.append(hwu.trend(beta, phi, hwu.get_aval(a[i + 1], a[i], trend),
                           b[i], trend, season))
        s.append(hwu.season(gamma, phi, Y[i], a[i], b[i], s[i], trend, season))
        # one-step prediction. s is initialized with s_len values, so s[i] is
        # s_len values behind, as it should be
        # yhat[i] = y(i+1|Y_0, ..., Y_i), 0 <= i < N
        yhat.append(hwu.one_step_pred(a[i], b[i], s[i], phi, trend, season))
        # multi-step forecasts (used for error bounds): generate all the
        # predictions at each step
        # yint[i][h] = y(i + 1 + h|y_0, .., y_i), 0 <= h < N - i, 0 <= i < N
        if quantile is not None:
            yint.append(hwu.point_fcast(len(Y), phi, s_len, a[i], b[i],
                                        s[-s_len:], trend, season))

    # states
    df_states = pd.DataFrame({'level': a, 'trend': b, 'season': s})

    # point predictions outside the data range
    yhat += hwu.point_fcast(horizon, phi, s_len, a[-1], b[-1], s[-s_len:],
                            trend, season)
    rms_err = sm.rmse(Y, yhat[:len(Y)])
    yhat = np.array(yhat)
    df_hat = pd.DataFrame({'yhat': yhat[:len(Y)], 'y': Y})

    # interval predictions (errors)
    if quantile is not None and horizon > 0:
        df_int, df_errs, df_detail = hwu.interval_fcast(np.array(x),
                                                        np.array(yint),
                                                        horizon, quantile)
        df_int['yhat'] = yhat[-horizon:]
        df_int['yupr'], df_int['ylwr'] = (df_int['yhat'] + df_int['upr'],
                                          df_int['yhat'] + df_int['lwr'])
        df_hat['yupr'], df_hat['ylwr'] = df_hat['yhat'], df_hat['yhat']
        df_int.drop(['upr', 'lwr'], axis=1, inplace=True)
        df_out = pd.concat([df_hat, df_int], axis=0) if horizon > 0 else df_hat
    else:  # no error bounds computed
        df_int = pd.DataFrame({'yhat': yhat[-horizon:],
                               'yupr': yhat[-horizon:],
                               'ylwr': yhat[-horizon:]})
        df_out = pd.concat([df_hat, df_int], axis=0) if horizon > 0 else df_hat

    df_out.reset_index(inplace=True, drop=True)
    return {'df_out': df_out, 'rmse': rms_err, 'params': params,
            'states': df_states}
vars.remove('TRAIN')  # this is also useless for prediction
vars.remove('ID')  # as is ID

min_rmse = 100  # initialise with something large
f_val = ""  # variable to hold our formula
resample_count = 3  # number of CV folds

for iter in range(1000):
    # generate a random expression
    f = 'DEATHS ~ {0}'.format(rand_expr(vars))
    # for 3 re-samples of the training set, fit the lm on the fitting set
    # and calculate the rmse on the validation set
    rmse_val = 0
    for _ in range(resample_count):
        sampleIndices = random.sample(train_data.index, int(0.75 * recs))
        fitting_data = train_data.loc[sampleIndices]
        validation_data = train_data.drop(sampleIndices)
        death_glm = smf.ols(formula=f, data=fitting_data).fit()
        rmse_val += rmse(death_glm.predict(validation_data),
                         validation_data.DEATHS)
    rmse_val /= resample_count
    if rmse_val < min_rmse:
        print('new minimum: {0}'.format(rmse_val))
        min_rmse = rmse_val
        f_val = f

print('BEST EXPRESSION : {0} \n\n {1}'.format(min_rmse, f_val))
def rmse(par_vals, *args):
    # objective used when optimizing the Holt-Winters parameters in hw()
    Y, par_names, param_dict = args[0], args[1], args[2]
    set_dict(par_names, par_vals, param_dict)
    results = hw(Y, 0, param_dict, verbose=False, quantile=None)
    Yhat = results['df_out']['yhat'].values
    return sm.rmse(Y, Yhat)
def make_plot(image_dir, run_dirs, run_names=None, cmu0=0.5544):
    """
    Make plot of runs against analytical solution.

    params:
    -------
    image_dir - str - Directory to save the file
    run_dirs - list - Names of the simulation directories
    run_names - str, optional - Names of simulation for legend
    cmu0 - float, optional - Parameter in GLS to calculate TKE
           (default 0.5544 Kantha-Clayson)
    """
    names = []
    m_sed = []
    m_vel = []
    m_dif = []
    for r in run_dirs:
        print(r)
        names.append(r)
        fileBase = r + '/Warner/data/profile'
        dataFile = os.path.join(fileBase,
                                'Warner-1K_trcr_1_0_2012-01-01_2012-01-02.nc')
        m_sed.append(dc.loadFromNetCDF(dataFile))
        dataFile = os.path.join(fileBase,
                                'Warner-1K_hvel_0_2012-01-01_2012-01-02.nc')
        m_vel.append(dc.loadFromNetCDF(dataFile))
        dataFile = os.path.join(fileBase,
                                'Warner-1K_tdff_0_2012-01-01_2012-01-02.nc')
        m_dif.append(dc.loadFromNetCDF(dataFile))

    # Calculate analytical values using actual water column depth H
    # z - Height above the bed.
    # H - Water column height
    z = abs(m_sed[0].z[0, -1] - m_sed[0].z[:, -1])
    z[0] = Z0
    depths = m_sed[0].z[:, -1]
    H = z[-1]
    u_star = calcFrictionVelocity(U, H, Z0)
    a_vel = calcVelocity(z, u_star, Z0)
    a_vis = calcEddyViscosity(z, u_star, H)
    a_dif = calcEddyDiffusivity(a_vis)
    a_sed = calcSediment(z)

    # "Analytical" values from imposing parabolic eddy viscosity
    print(' - test_warner_channel_analytical')
    fileBase = 'test_warner_channel_analytical/Warner/data/profile'
    dataFile = os.path.join(fileBase,
                            'Warner-1K_trcr_1_0_2012-01-01_2012-01-03.nc')
    m_sed.append(dc.loadFromNetCDF(dataFile))
    dataFile = os.path.join(fileBase,
                            'Warner-1K_hvel_0_2012-01-01_2012-01-03.nc')
    m_vel.append(dc.loadFromNetCDF(dataFile))
    dataFile = os.path.join(fileBase,
                            'Warner-1K_tdff_0_2012-01-01_2012-01-03.nc')
    m_dif.append(dc.loadFromNetCDF(dataFile))

    # Calculate RMSE
    vel_rmse = []
    dif_rmse = []
    sed_rmse = []
    for i, r in enumerate(run_dirs):
        vel_rmse.append(stats.rmse(m_vel[i].data[:, 0, -1], a_vel))
        dif_rmse.append(stats.rmse(m_dif[i].data[:, 0, -1], a_dif))
        sed_rmse.append(stats.rmse(m_sed[i].data[:, 0, -1], a_sed))

    # Plots
    ticks_font = matplotlib.font_manager.FontProperties(size=6)
    f, ax = plt.subplots(1, 3, sharey=True, figsize=(18, 7))
    f.subplots_adjust(wspace=0.4, top=0.9)
    # plt.rc('axes', color_cycle=['r', 'g', 'b', 'y'])
    for vel in m_vel:
        ax[0].plot(np.squeeze(vel.data[:, 0, -1]), depths, marker='.')
    p2 = ax[0].plot(a_vel, depths, color='k')
    ax[0].set_xlim([0, 1.5])
    ax[0].set_ylim([-10, 0.5])
    ax[0].xaxis.set_ticks([0, 0.5, 1, 1.5])
    ax[0].grid(True)
    ax[0].set_ylabel('Z-coordinate $m$')
    ax[0].set_xlabel('Velocity $m/s$')
    # ax[0].set_title('RMSE: %4.3f' % vel_rmse, fontsize=12)
    if run_names is None:
        legend_str = names
    else:
        legend_str = run_names
    legend_str.append('Analytical')
    ax[0].legend(legend_str, loc='upper left', fontsize=8)
    ax[0].fill_between([0, 1.5], -10, m_sed[0].z[0, -1], facecolor='brown')
    ax[0].fill_between([0, 1.5], -10, m_sed[0].z[-1, -1], facecolor='blue',
                       alpha=0.1)

    for dif in m_dif:
        ax[1].plot(np.squeeze(dif.data[:, :, -1]), depths, marker='.')
    ax[1].plot(a_dif, depths, color='k')
    ax[1].set_xlim([0, 0.1])
    # ax[1].xaxis.set_ticks([0, 0.02, 0.04, 0.06, 0.08])
    ax[1].grid(True)
    ax[1].set_xlabel('Eddy diffusivity $m^2/s$')
    # ax[1].set_title('RMSE: %4.3f' % dif_rmse, fontsize=12)
    ax[1].fill_between([0, 0.1], -10, m_sed[0].z[0, -1], facecolor='brown')
    ax[1].fill_between([0, 0.1], -10, m_sed[0].z[-1, -1], facecolor='blue',
                       alpha=0.1)

    for sed in m_sed:
        ax[2].plot(np.squeeze(sed.data[:, :, -1]), depths, marker='.')
    ax[2].plot(a_sed, depths, color='k')
    # ax[2].xaxis.set_ticks([0.15, 0.2, 0.25, 0.3, 0.35, 0.4])
    ax[2].set_xlim([0.05, 0.35])
    ax[2].grid(True)
    ax[2].set_xlabel('Sediment $kg/m^3$')
    ax[2].fill_between([0.05, 0.35], -10, m_sed[0].z[0, -1], facecolor='brown')
    ax[2].fill_between([0.05, 0.35], -10, m_sed[0].z[-1, -1], facecolor='blue',
                       alpha=0.1)

    f.suptitle('Warner et al. 2008 open channel test', fontsize=14)

    # Save fig
    print('saving image : warner_comparison.png')
    runName = 'warner_comparison'
    f.savefig(runName, dpi=200)
    plt.close('all')