Example #1
def test_eval_measures():
    #mainly regression tests

    x = np.arange(20).reshape(4,5)
    y = np.ones((4,5))
    assert_equal(iqr(x, y), 5*np.ones(5))
    assert_equal(iqr(x, y, axis=1), 2*np.ones(4))
    assert_equal(iqr(x, y, axis=None), 9)

    assert_equal(mse(x, y),
                 np.array([  73.5,   87.5,  103.5,  121.5,  141.5]))
    assert_equal(mse(x, y, axis=1),
                 np.array([   3.,   38.,  123.,  258.]))

    assert_almost_equal(rmse(x, y),
                        np.array([  8.5732141 ,   9.35414347,  10.17349497,
                                   11.02270384,  11.89537725]))
    assert_almost_equal(rmse(x, y, axis=1),
                        np.array([  1.73205081,   6.164414,
                                   11.09053651,  16.0623784 ]))

    assert_equal(maxabs(x, y),
                 np.array([ 14.,  15.,  16.,  17.,  18.]))
    assert_equal(maxabs(x, y, axis=1),
                 np.array([  3.,   8.,  13.,  18.]))

    assert_equal(meanabs(x, y),
                 np.array([  7. ,   7.5,   8.5,   9.5,  10.5]))
    assert_equal(meanabs(x, y, axis=1),
                 np.array([  1.4,   6. ,  11. ,  16. ]))
    assert_equal(meanabs(x, y, axis=0),
                 np.array([  7. ,   7.5,   8.5,   9.5,  10.5]))

    assert_equal(medianabs(x, y),
                 np.array([  6.5,   7.5,   8.5,   9.5,  10.5]))
    assert_equal(medianabs(x, y, axis=1),
                 np.array([  1.,   6.,  11.,  16.]))

    assert_equal(bias(x, y),
                 np.array([  6.5,   7.5,   8.5,   9.5,  10.5]))
    assert_equal(bias(x, y, axis=1),
                 np.array([  1.,   6.,  11.,  16.]))

    assert_equal(medianbias(x, y),
                 np.array([  6.5,   7.5,   8.5,   9.5,  10.5]))
    assert_equal(medianbias(x, y, axis=1),
                 np.array([  1.,   6.,  11.,  16.]))

    assert_equal(vare(x, y),
                 np.array([ 31.25,  31.25,  31.25,  31.25,  31.25]))
    assert_equal(vare(x, y, axis=1),
                 np.array([ 2.,  2.,  2.,  2.]))
def moving_average(dataframe, window, ahead, file_name):
    # Create ahead-day entries in future (pd.date_range takes no format argument)
    date_range = pd.date_range(start=dataframe.index[0], periods=dataframe.count() + ahead,
                               freq='MS')

    # Create new dataframe for forecasting
    forecast_full_frame = pd.Series(data=[np.nan] * (len(date_range)), index=date_range)

    # Begin forecasting
    for idx in range(window, len(forecast_full_frame.index)):
        # Estimation of actual data
        if idx < dataframe.count():
            forecast_full_frame.iloc[idx] = round(
                np.mean([dataframe.iloc[idx - i] for i in range(1, window + 1)]))
        # Calculation of future data
        else:
            forecast_full_frame.iloc[idx] = round(
                np.mean([forecast_full_frame.iloc[idx - i] for i in range(1, window + 1)]))

    # Drop all NaN values
    forecast_full_frame.dropna(inplace=True)

    # Future timeframe only
    forecast_partial_frame = forecast_full_frame[~forecast_full_frame.index.isin(dataframe.index)]

    # Root mean squared error
    rmse = eval_measures.rmse(dataframe[window:dataframe.count()], forecast_full_frame[:dataframe.count() - window])

    # Return result
    return forecast_full_frame, forecast_partial_frame, rmse
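A hypothetical usage sketch: forecast two months beyond a toy monthly series with a 3-month moving average. The series and names here are invented, and note that the file_name argument is never used inside the function body above:

import numpy as np
import pandas as pd
from statsmodels.tools import eval_measures

demand = pd.Series([10.0, 12.0, 13.0, 11.0, 14.0, 15.0],
                   index=pd.date_range('2020-01-01', periods=6, freq='MS'))
full, future, err = moving_average(demand, window=3, ahead=2, file_name='demand')
print(future, err)

Example #3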
def get_rmse():
    """ Compute the RMSE based on the relevant parameterization.
    """
    fname = '../truth/start/data.respy.info'
    probs_true = get_choice_probabilities(fname, is_flatten=True)

    fname = 'start/data.respy.info'
    probs_start = get_choice_probabilities(fname, is_flatten=True)

    fname = 'stop/data.respy.info'
    probs_stop = get_choice_probabilities(fname, is_flatten=True)

    rmse_stop = rmse(probs_stop, probs_true)
    rmse_start = rmse(probs_start, probs_true)

    return rmse_start, rmse_stop
Example #4
def RMSE(params, *args):
    dataframe = args[0]
    model_type = args[1]  # multiplicative; renamed to avoid shadowing the built-in type
    rmse = 0

    alpha, beta, gamma = params
    period_len = args[2]
    smooth = [0] * period_len
    trend = [0] * period_len
    smooth[-1] = sum(dataframe.iloc[0:period_len]) / float(period_len)
    trend[-1] = (sum(dataframe.iloc[period_len:2 * period_len]) - sum(dataframe.iloc[0:period_len])) / period_len ** 2
    forecast = []

    if model_type == 'multiplicative':
        season = [dataframe.iloc[i] / smooth[-1] for i in range(period_len)]

        for i in range(period_len, dataframe.count()):
            smooth.append(alpha * (dataframe.iloc[i] / season[-period_len]) + (1 - alpha) * (smooth[-1] + trend[-1]))
            # compare the new level with the previous one; after the append above
            # smooth[-1] is smooth[i] itself, which would zero the trend update
            trend.append(beta * (smooth[i] - smooth[i - 1]) + (1 - beta) * trend[-1])
            season.append(gamma * (dataframe.iloc[i] / (smooth[i])) + (1 - gamma) * season[-period_len])
            forecast.append((smooth[-1] + trend[-1]) * season[-period_len])

    else:
        exit('Type must be multiplicative')

    rmse = eval_measures.rmse(dataframe[period_len:], forecast)

    return rmse
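A hypothetical driver for this objective, mirroring the fmin_l_bfgs_b call used in Example #24 below; series stands for a monthly pd.Series covering at least two full seasons:

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

# series: hypothetical pd.Series with period_len = 12
parameters = fmin_l_bfgs_b(RMSE, x0=np.array([0.0, 1.0, 0.0]),
                           args=(series, 'multiplicative', 12),
                           bounds=[(0, 1)] * 3, approx_grad=True)
alpha, beta, gamma = parameters[0]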
Example #5
def sarimax_fc(train,
               test,
               order,
               seas_order,
               exog_train=None,
               exog_test=None):
    model = SARIMAX(train,
                    order=order,
                    exog=exog_train,
                    seasonal_order=seas_order)
    results = model.fit()
    start, end = len(train), len(test) + len(train) - 1
    pred = results.predict(start, end, exog=exog_test,
                           typ='levels').rename('sarima_predictions')
    rmse_pred, rmse_pred_pct = rmse(test, pred), rmse(test, pred) / test.mean()
    results = {
        'prediction': pred,
        'rmse': rmse_pred,
        'rmse_pct': rmse_pred_pct
    }
    return results
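A minimal usage sketch on a synthetic monthly series, assuming SARIMAX comes from statsmodels.tsa.statespace.sarimax and rmse from statsmodels.tools.eval_measures. Note that typ='levels' in the snippet is an ARIMA-era argument that a given statsmodels version may ignore or reject:

import numpy as np
import pandas as pd

y = pd.Series(50 + 0.5 * np.arange(60) + 10 * np.sin(np.arange(60) * np.pi / 6),
              index=pd.date_range('2015-01-01', periods=60, freq='MS'))
out = sarimax_fc(y[:48], y[48:], order=(1, 1, 1), seas_order=(1, 1, 1, 12))
print(out['rmse'], out['rmse_pct'])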
Example #6
def insample_performance(test, forecast_dict, as_dict=False):
    forecasts = forecast_frame(test, forecast_dict)
    dict_perf = {}
    for col, _ in forecasts.items():
        dict_perf[col] = {}
        dict_perf[col]["rmse"] = rmse(forecasts["Target"], forecasts[col])
        dict_perf[col]["mse"] = dict_perf[col]["rmse"]**2
        dict_perf[col]["mean"] = forecasts[col].mean()
    if as_dict:
        return dict_perf
    else:
        return pd.DataFrame.from_dict(dict_perf)
Example #7
def get_rms(model, df, y):
    """
    Get the RMSE for a stats.models model and new data

    :param model:  a stats.models model
    :param df:     pandas dataframe containing all the data
    :param y:      (array-like) the true responses of the response variable
    
    :return:       a numeric RMSE
    """
    result = model.fit()
    predictions = result.predict(df)
    return rmse(predictions, y)
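A hypothetical usage sketch. get_rms expects an unfitted model (it calls fit() itself), and rmse is assumed to be imported from statsmodels.tools.eval_measures:

import pandas as pd
import statsmodels.formula.api as smf

df = pd.DataFrame({'x': range(10), 'y': [2.0 * i + 1.0 for i in range(10)]})
print(get_rms(smf.ols('y ~ x', data=df), df, df['y']))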
Example #8
def printErrors(test, pred, model):
    '''
    Objective: to print errors of the models
    Inputs:
    test: test dataframe
    pred: predictions
    model: model that is used
    Outputs:
    Mean absolute error, mean squared error, root mean squared error
    '''
    print('MAE of ' + model + ': {:.4}'.format(meanabs(test, pred, axis=0)))
    print('MSE of ' + model + ': {:.4}'.format(mse(test, pred, axis=0)))
    print('RMSE of ' + model + ': {:.4}'.format(rmse(test, pred, axis=0)))
Example #9
def log_rrmse(y_tst, y_hat):
    try:
        y_tst = y_tst.values
    except AttributeError:
        pass
    try:
        y_hat = y_hat.values
    except AttributeError:
        pass
    y_tst = np.exp(y_tst)
    y_hat = np.exp(y_hat)
    rrmse = em.rmse(y_tst, y_hat, axis=0) / y_tst.mean() * 100
    return rrmse
Example #10
    def insample_performance(forecast_frame, as_dict=False):

        dict_perf = {}
        for col, _ in forecast_frame.items():
            dict_perf[col] = {}
            dict_perf[col]["rmse"] = rmse(forecast_frame["Target"],
                                          forecast_frame[col])
            dict_perf[col]["mse"] = dict_perf[col]["rmse"]**2
            dict_perf[col]["mean"] = forecast_frame[col].mean()
        if as_dict:
            return dict_perf

        return pd.DataFrame.from_dict(dict_perf)
Example #11
def rrmse(y_tst, y_hat):
    try:
        y_tst = y_tst.values
    except AttributeError:
        pass
    try:
        y_hat = y_hat.values
    except AttributeError:
        pass
    rrmse = em.rmse(y_tst, y_hat, axis=0) / y_tst.mean() * 100
    return rrmse
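Both helpers compute the same quantity: RMSE expressed as a percentage of the mean observed value. A self-contained equivalent for 1-D arrays, using the fact that statsmodels' rmse is sqrt(mean((x1 - x2) ** 2)):

import numpy as np

def rrmse_equiv(y_tst, y_hat):
    # relative RMSE, in percent of the mean observed value
    y_tst, y_hat = np.asarray(y_tst), np.asarray(y_hat)
    return np.sqrt(np.mean((y_tst - y_hat) ** 2)) / y_tst.mean() * 100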
Example #12
def test_eval_measures():
    #mainly regression tests

    x = np.arange(20).reshape(4, 5)
    y = np.ones((4, 5))
    assert_equal(iqr(x, y), 5 * np.ones(5))
    assert_equal(iqr(x, y, axis=1), 2 * np.ones(4))
    assert_equal(iqr(x, y, axis=None), 9)

    assert_equal(mse(x, y), np.array([73.5, 87.5, 103.5, 121.5, 141.5]))
    assert_equal(mse(x, y, axis=1), np.array([3., 38., 123., 258.]))

    assert_almost_equal(
        rmse(x, y),
        np.array(
            [8.5732141, 9.35414347, 10.17349497, 11.02270384, 11.89537725]))
    assert_almost_equal(
        rmse(x, y, axis=1),
        np.array([1.73205081, 6.164414, 11.09053651, 16.0623784]))

    assert_equal(maxabs(x, y), np.array([14., 15., 16., 17., 18.]))
    assert_equal(maxabs(x, y, axis=1), np.array([3., 8., 13., 18.]))

    assert_equal(meanabs(x, y), np.array([7., 7.5, 8.5, 9.5, 10.5]))
    assert_equal(meanabs(x, y, axis=1), np.array([1.4, 6., 11., 16.]))
    assert_equal(meanabs(x, y, axis=0), np.array([7., 7.5, 8.5, 9.5, 10.5]))

    assert_equal(medianabs(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianabs(x, y, axis=1), np.array([1., 6., 11., 16.]))

    assert_equal(bias(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(bias(x, y, axis=1), np.array([1., 6., 11., 16.]))

    assert_equal(medianbias(x, y), np.array([6.5, 7.5, 8.5, 9.5, 10.5]))
    assert_equal(medianbias(x, y, axis=1), np.array([1., 6., 11., 16.]))

    assert_equal(vare(x, y), np.array([31.25, 31.25, 31.25, 31.25, 31.25]))
    assert_equal(vare(x, y, axis=1), np.array([2., 2., 2., 2.]))
Example #13
    def setErrorData(self, trainingFit, testPredictions):

        auxList = []

        # Root Mean Squared Error - RMSE
        trainingErrorRMSE = round(ms.rmse(self.trainData, trainingFit),
                                  ModelSelector._decimalPlaces)
        testErrorRMSE = round(ms.rmse(self.testData, testPredictions),
                              ModelSelector._decimalPlaces)
        auxList.append(obj.ForecastErro('RMSE', testErrorRMSE, 'TEST'))
        auxList.append(obj.ForecastErro('RMSE', trainingErrorRMSE, 'TRAIN'))

        #MAPE only all values > 0
        if 0 not in self.data.values:

            trainingErrorMAPE = round(
                ut.Utils.mape(self.trainData, trainingFit),
                ModelSelector._decimalPlaces)
            testErrorMape = round(
                ut.Utils.mape(self.testData, testPredictions),
                ModelSelector._decimalPlaces)
            auxList.append(obj.ForecastErro('MAPE', trainingErrorMAPE,
                                            'TRAIN'))
            auxList.append(obj.ForecastErro('MAPE', testErrorMape, 'TEST'))

        # Mean Absolute Scaled Error
        trainingErrorMASE = round(
            ut.Utils.mase(self.trainData.to_numpy(), self.trainData.to_numpy(),
                          trainingFit.to_numpy()),
            ModelSelector._decimalPlaces)
        testErrorMASE = round(
            ut.Utils.mase(self.trainData.to_numpy(), self.testData.to_numpy(),
                          testPredictions.to_numpy()),
            ModelSelector._decimalPlaces)
        auxList.append(obj.ForecastErro('MASE', trainingErrorMASE, 'TRAIN'))
        auxList.append(obj.ForecastErro('MASE', testErrorMASE, 'TEST'))

        return auxList
Example #14
def app(window, train, test, pred, interval, windo):

    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    window = pd.concat([window, pd.DataFrame([{
        'Current test': test.values,
        'Current prediction': pred,
        'MSE': np.square(np.subtract(test.values, pred)).mean(),
        'Glycemia prediction RMSE (mg/dl)': rmse(test.values, pred),
        'PSW': int(round(windo)),
        'Prediction Horizon (minutes)': interval
    }])], ignore_index=True)

    return window
Example #15
def accuracy(y1, y2):

    rms_error = numpy.round(rmse(y1, y2), 4)

    map_error = numpy.round(MAPE(y1, y2), 4)

    # DataFrame.append was removed in pandas 2.0; build the frame directly
    accuracy_df = pandas.DataFrame([{"RMSE": rms_error, "%MAPE": map_error}])

    return accuracy_df
Example #16
def croston_method(dataframe, next_periods, alpha=None):
    # Get size of original dataframe
    size = dataframe.count()

    # Create ahead-day entries in future (pd.date_range takes no format argument)
    date_range = pd.date_range(start=dataframe.index[0], periods=size + next_periods, freq='MS')

    # Create new dataframe for forecasting
    forecast_full_frame = pd.Series(data=[0] * (len(date_range)), index=date_range)

    # prepare non-zero demand (demand_map avoids shadowing the built-in map)
    non_zero_demand, q, demand_map = prepare(dataframe)

    # n-th non-zero demand
    n = len(q)

    forecast_non_zero_demand = [0] * n
    inter_arrival = [0] * n

    if alpha is None:
        initial_values = np.array([0.0])
        boundaries = [(0, 1)]

        parameters = fmin_l_bfgs_b(RMSE, x0=initial_values, args=(non_zero_demand, next_periods), bounds=boundaries,
                                   approx_grad=True)
        # fmin_l_bfgs_b returns (x, f, d); x is a 1-element array, take the scalar
        alpha = parameters[0][0]

    forecast_non_zero_demand[1] = non_zero_demand[0]
    inter_arrival[1] = q[0]

    for i in range(2, n):
        forecast_non_zero_demand[i] = alpha * non_zero_demand[i - 1] + (1 - alpha) * forecast_non_zero_demand[i - 1]
        inter_arrival[i] = alpha * q[i - 1] + (1 - alpha) * inter_arrival[i - 1]

    # predict values
    for i in range(n):
        forecast_full_frame.iloc[demand_map[i]] = forecast_non_zero_demand[i]

    # forecast new values
    for i in range(size, size + next_periods):
        forecast_full_frame.iloc[i] = forecast_non_zero_demand[n - 1] / inter_arrival[n - 1]

    rmse = eval_measures.rmse(non_zero_demand[1:], forecast_non_zero_demand[1:])

    return forecast_full_frame, forecast_full_frame[-next_periods:], rmse, alpha
Example #17
def prophet_analysis(df,split,freq,changepoints=3):
    train = df.iloc[:split]
    test = df.iloc[split:]

    # m_eval = Prophet(growth='linear')
    m_eval = Prophet(
        growth='linear',
        n_changepoints=changepoints,
        changepoint_range=0.8,
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=False,
        seasonality_mode='additive',
        seasonality_prior_scale=20,
        changepoint_prior_scale=.5,
        mcmc_samples=0,
        interval_width=0.8,
        uncertainty_samples=500,
        ).add_seasonality(
            name='monthly',
            period=30.5,
            fourier_order=5
        ).add_seasonality(
            name='yearly',
            period=365.25,
            fourier_order=20
        ).add_seasonality(
            name='quarterly',
            period=365.25/4,
            fourier_order=5,
            prior_scale=15)
    m_eval.fit(train)
    eval_future=m_eval.make_future_dataframe(periods=test.shape[0],freq=freq)
    eval_forecast=m_eval.predict(eval_future)

    fig,axs=plt.subplots(1,1,figsize=(15,4))
    ax1 = sns.lineplot(x='ds',y='yhat',data=eval_forecast,label='Predictions',legend='full')
    ax1 = sns.lineplot(x='ds',y='y',data=train,label='Train True',legend='full',linestyle='-.')
    ax1 = sns.lineplot(x='ds',y='y',data=test,label='Test True',legend='full')

    ax =m_eval.plot(eval_forecast)
    ax = add_changepoints_to_plot(fig.gca(),m_eval,eval_forecast)

    predictions = eval_forecast.iloc[-test.shape[0]:]['yhat'] #grab predictions to compare with test set
    print('MAPE = ' + str((abs(np.array(test.y)-predictions)/(np.array(test.y))).mean()))
    print('RMSE = ' + str(rmse(predictions,test['y'])))
    print('MEAN = ' + str(df.y.mean()))
    return
Example #18
def predict(X, Y):
    pre = model.predict(X)
    actual_y_test = np.exp(Y)
    actual_predicted = np.exp(pre)
    # compare like with like: both values are on the original (exponentiated) scale
    diff = abs(actual_y_test - actual_predicted)
    compare_actual = pd.DataFrame({
        'Test Data': actual_y_test,
        'Predicted Price': actual_predicted,
        'Difference': diff
    })
    compare_actual = compare_actual.astype(int)
    print("Root Mean Squared Error: ", rmse(actual_predicted, actual_y_test))

    print("Variance Score: ",
          explained_variance_score(actual_y_test, actual_predicted))
    compare_actual.to_csv('results.csv')
Example #19
def RMSE(params, *args):
    data_frame = args[0]
    # the optimizer passes params as a 1-element array; use the scalar
    alpha = params[0]

    forecast = [0] * len(data_frame)

    forecast[1] = data_frame[0]

    for index in range(2, len(data_frame)):
        forecast[index] = alpha * data_frame[index - 1] + (1 - alpha) * forecast[index - 1]

    rmse = eval_measures.rmse(forecast[1:], data_frame[1:])

    return rmse
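A hypothetical driver, mirroring how Example #26 below minimizes this objective over alpha in [0, 1]; series stands for a plain 1-D array of observations, and the second element of args is unused by this objective:

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

series = np.array([12.0, 14.0, 13.0, 15.0, 16.0, 15.0, 17.0, 18.0])
parameters = fmin_l_bfgs_b(RMSE, x0=np.array([0.0]), args=(series, 0),
                           bounds=[(0, 1)], approx_grad=True)
alpha = parameters[0][0]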
Example #20
def accuracy(y1, y2):

    rms_error = np.round(rmse(y1, y2), 1)

    map_error = np.round(
        np.mean(np.abs((np.array(y1) - np.array(y2)) / np.array(y1))) * 100, 1)

    # DataFrame.append was removed in pandas 2.0; build the frame directly
    accuracy_df = pd.DataFrame([{"RMSE": rms_error, "%MAPE": map_error}])

    return accuracy_df
Example #21
def train_and_fit_arima(x, test_split = 0.2):

    # run auto-arima grid search 
    stepwise_model= auto_arima(x, exogenous=None, start_p=0, d=1, start_q=0,
                               max_p=3, max_d=1, max_q=3,
                               start_P=0, D=1, start_Q=0, max_P=3, max_D=3, 
                               max_Q=3, max_order=10, m=12, seasonal=True,
                               trace=True,error_action='ignore',
                               suppress_warnings=True,stepwise=False,
                               approximation=False)

    print(stepwise_model.aic())
    print(stepwise_model.summary())

    split=len(x) - int(test_split * len(x))
    train = x[0:split]
    test = x[split:]

    stepwise_model.fit(train)

    future_forecast = stepwise_model.predict(n_periods=len(test))
    future_forecast = pd.DataFrame(future_forecast, index=test.index, columns=['Prediction'])
    lineObjects=plt.plot(pd.concat([test, future_forecast], axis=1))
    plt.xlabel("Years")
    plt.ylabel("CO2 Levels (ppm)")
    plt.legend(iter(lineObjects), ('CO2 Levels', 'Predictions'))
    plt.savefig("Forecast.png")
    plt.show()
    plt.close()

    line1bjects=plt.plot(pd.concat([x, future_forecast], axis=1))
    plt.xlabel("Years")
    plt.ylabel("CO2 Levels (ppm)")
    plt.legend(iter(line1bjects), ('CO2 Levels', 'Predictions'))

    plt.savefig("Forecast_conc.png")
    plt.show()
    plt.close()

    # score against the column, not the DataFrame, so the shapes match
    pred_error = rmse(test, future_forecast['Prediction'])
    print("rmse:", pred_error)

    stepwise_model.plot_diagnostics(figsize=(15, 12))
    plt.savefig("Diagnostic.png")
    plt.show()
    plt.close()
Example #22
def regression_stats(model):
    y_preds_test = model.predict(X_test)
    # create df for model results
    model_vals = [
        model.score(X_train, y_train),
        model.score(X_test, y_test),
        mean_absolute_error(y_test, y_preds_test),
        mse(y_test, y_preds_test),
        rmse(y_test, y_preds_test),
        np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100,
    ]
    mapping = {
        "stat": ["train R^2", "test R^2", "MAE", "MSE", "RMSE", "MAPE"],
        "model": model_vals,
    }
    stats_df = pd.DataFrame.from_dict(mapping)
    return stats_df
Example #23
def gen_results(d):
    #exp = d.query('Trial > 10 and Time > 5')
    exp = d.query('Time > 5')

    # pandas expects a list of group keys; a tuple is treated as a single label
    by = ['Day', 'Subject', 'Trial']
    error = exp.groupby(by).ae.mean().reset_index()
    error.columns = ['Day', 'Subject', 'Trial', 'AbsoluteError']
    rms = exp.groupby(by).apply(lambda x: rmse(x.y, x.yg)).reset_index()
    rms.columns = ['Day', 'Subject', 'Trial', 'RMSE']
    var = exp.groupby(by).apply(lambda x: vare(x.y, x.yg)).reset_index()
    var.columns = ['Day', 'Subject', 'Trial', 'VARE']
    crossings = exp.groupby(by).apply(lambda x: len(cross(x.e))).reset_index()
    crossings.columns = ['Day', 'Subject', 'Trial', 'Crossings']

    rt = find_response_times(exp, trials)
    response_time = rt.groupby(by).mean().ResponseTime.reset_index()
    td = exp.groupby(by).apply(
        lambda x: recover_shift(x['Time'], x['y'], x['yg'])).reset_index()
    time_delay = td.groupby(by).mean()[0].reset_index().abs()
    time_delay.columns = ['Day', 'Subject', 'Trial', 'LagTime']
    time_delay.columns = ['Day', 'Subject', 'Trial', 'LagTime']
    #entropy = generate_entropy_results(exp)

    #res = error.merge(rms).merge(var).merge(crossings).merge(response_time, how='outer').merge(time_delay).merge(entropy)
    res = error.merge(rms).merge(var).merge(crossings).merge(
        response_time, how='outer').merge(time_delay)
    res['Feedback'] = res.Subject % 2 == 1
    res = res.merge(trials[['Trial']])
    res = res.sort_values(['Day', 'Subject', 'Trial'])
    #res['SecondaryTask'] = res['Secondary_Task']
    res = res[[
        'Day', 'Subject', 'Trial', 'AbsoluteError', 'RMSE', 'VARE',
        'ResponseTime', 'LagTime', 'Crossings', 'Feedback'
    ]]
    res = res.reset_index(drop=True)
    res['ID'] = (res.Day - 1) * res.Trial.max() + res.Trial

    return res
Example #24
def multiplicative(input_dataframe, period_len, next_periods, alpha=None, beta=None, gamma=None):
    dataframe = input_dataframe.copy()

    # Get size of original dataframe
    t = dataframe.count()

    # Create ahead-day entries in future (pd.date_range takes no format argument)
    date_range = pd.date_range(start=dataframe.index[period_len], periods=t, freq='MS')

    forecast = pd.Series(data=[np.nan] * len(date_range), index=date_range)

    if alpha is None or beta is None or gamma is None:
        initial_values = np.array([0.0, 1.0, 0.0])
        boundaries = [(0, 1), (0, 1), (0, 1)]
        model_type = 'multiplicative'

        parameters = fmin_l_bfgs_b(RMSE, x0=initial_values, args=(dataframe, model_type, period_len),
                                   bounds=boundaries, approx_grad=True)
        alpha, beta, gamma = parameters[0]

    smooth = [0] * period_len
    trend = [0] * period_len
    smooth[-1] = sum(dataframe.iloc[0:period_len]) / float(period_len)
    trend[-1] = (sum(dataframe.iloc[period_len:2 * period_len]) - sum(dataframe.iloc[0:period_len])) / period_len ** 2
    season = [dataframe.iloc[i] / smooth[-1] for i in range(period_len)]

    rmse = 0

    for i in range(period_len, t + next_periods):
        if i >= t:
            T = i - t
            forecast.iloc[i - period_len] = (smooth[t - 1] + T * trend[t - 1]) * season[i - period_len]
        else:
            smooth.append(alpha * (dataframe.iloc[i] / season[-period_len]) + (1 - alpha) * (smooth[-1] + trend[-1]))
            # compare the new level with the previous one; after the append above
            # smooth[-1] is smooth[i] itself, which would zero the trend update
            trend.append(beta * (smooth[i] - smooth[i - 1]) + (1 - beta) * trend[-1])
            season.append(gamma * (dataframe.iloc[i] / (smooth[i])) + (1 - gamma) * season[-period_len])
            forecast.iloc[i - period_len] = (smooth[-1] + trend[-1]) * season[-period_len]

    rmse = eval_measures.rmse(dataframe[period_len:], forecast[:-period_len])

    return forecast, alpha, beta, gamma, rmse
Example #25
	def Function(params):
		a, b = params
		res = []
		for ibasin in range(0, 1):  # 10):
			for istation in good_stations[ibasin]:
				# print ibasin, istation
				data = scipy.io.loadmat('%s/%s_AP.mat' %(datadir, ibasin+1))
				index = np.where(geoinfo[:, 0]==data['station_name'][0, istation])[0]
				# pan_obs = data['pan'][0, istation][0:tstep].flatten()
				pan_obs_gapfill = Gapfill(data['pan'][0, istation][0:tstep].flatten())
				## Prepare for the input data for Epan calculation
				INPUT = {vars_penpan[i]: Gapfill(data[v][0, istation][0:tstep].flatten()) for i, v in enumerate(vars_penpan[:-2])}
				INPUT['doy'] = doys.flatten()
				INPUT['lat'] = geoinfo[index, 1]
				INPUT['elev'] = geoinfo[index, 3]
				pan_mod = Data(INPUT, 'cloud').Penpan_u2(a, b)
				res.append(evaluate.rmse(daily2monthly(pan_mod), daily2monthly(pan_obs_gapfill)))

		return vstack(res).mean()
Example #26
def simple_exponential_smoothing(dataframe, next_periods, alpha=None):
    # Get size of original dataframe
    size = dataframe.count()

    # Create ahead-day entries in future (pd.date_range takes no format argument)
    date_range = pd.date_range(start=dataframe.index[0], periods=size + next_periods, freq='MS')

    # Create new dataframe for forecasting
    forecast_full_frame = pd.Series(data=[np.nan] * (len(date_range)), index=date_range)

    if alpha is None:
        initial_values = np.array([0.0])
        boundaries = [(0, 1)]

        parameters = fmin_l_bfgs_b(RMSE, x0=initial_values, args=(dataframe, next_periods), bounds=boundaries,
                                   approx_grad=True)
        # fmin_l_bfgs_b returns (x, f, d); x is a 1-element array, take the scalar
        alpha = parameters[0][0]

    # Begin forecasting
    for idx in range(len(forecast_full_frame.index) - 1):
        if idx == 0:
            forecast_full_frame.iloc[idx + 1] = dataframe.iloc[idx]
        elif idx < size:
            forecast_full_frame.iloc[idx + 1] = alpha * dataframe.iloc[idx] + (1 - alpha) * forecast_full_frame.iloc[
                idx]
        else:
            forecast_full_frame.iloc[idx + 1] = forecast_full_frame.iloc[idx]

    # Drop all NaN values
    forecast_full_frame.dropna(inplace=True)

    # Future timeframe only
    forecast_partial_frame = forecast_full_frame[~forecast_full_frame.index.isin(dataframe.index)]

    # Root mean squared error
    rmse = eval_measures.rmse(dataframe[1:size], forecast_full_frame[0:size - 1])

    # Return result
    return forecast_full_frame, forecast_partial_frame, rmse, alpha
Example #27
def get_best_model(train, test):
    # Step 1: specify the form of the model
    model_formula = "total_cases ~ 1 + " \
                    "reanalysis_specific_humidity_g_per_kg + " \
                    "reanalysis_dew_point_temp_k + " \
                    "reanalysis_min_air_temp_k + " \
                    "station_min_temp_c + " \
                    "station_max_temp_c + " \
                    "station_avg_temp_c"

    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.Gaussian())


    fitted_model = model.fit()
    acc = eval_measures.rmse(fitted_model.predict(full_dataset).astype(int), full_dataset.total_cases)

    return fitted_model, acc
Example #28
def rolling_tscv(series, trend, seasonal, seasonal_periods, damped, boxcox,
                 initial_train_window, test_window):
    i = 0
    x = initial_train_window
    t = test_window
    errors_roll = []
    while (i + x + t) < len(series):
        train_ts = series[(i):(i + x)].values
        test_ts = series[(i + x):(i + x + t)].values
        # newer statsmodels takes damped_trend and use_boxcox in the constructor
        model_roll = ExponentialSmoothing(
            train_ts,
            trend=trend,
            seasonal=seasonal,
            seasonal_periods=seasonal_periods,
            damped_trend=damped,
            use_boxcox=boxcox).fit()
        fcast = model_roll.forecast(t)
        error_roll = rmse(test_ts, fcast)
        errors_roll.append(error_roll)
        i = i + 1
    return np.mean(errors_roll).round(1)
Example #29
def eval_metrics(forecast, observed):
    '''Return forecast evaluation metrics.

    Parameters
    ----------
    forecast : pd.Series
        Forecasted values.
    observed : pd.Series
        Observed values.

    Returns
    -------
    mae : float
        Mean Absolute Error metric.
    rmserr : float
        Root Mean Squared Error metric. Named rmserr to avoid
        conflicting with statsmodels rmse function.
    mape : float
        Mean Absolute Percentage Error metric, in percent.
    '''
    return meanabs(forecast, observed), rmse(
        forecast,
        observed), (((forecast - observed).abs() / observed).mean()) * 100
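A hypothetical call with two aligned Series; meanabs and rmse are assumed to be imported from statsmodels.tools.eval_measures:

import pandas as pd

mae, rmserr, mape = eval_metrics(pd.Series([1.0, 2.0, 3.0]),
                                 pd.Series([1.1, 1.9, 3.3]))
print(mae, rmserr, mape)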
Example #30
def RMSE(params, *args):
    data_frame = args[0]
    rmse = 0

    alpha, beta = params

    # Init
    smooth, trend = data_frame.iloc[0], data_frame.iloc[1] - data_frame.iloc[0]
    forecast = pd.Series(data=[np.nan] * data_frame.count(), index=data_frame.index)
    forecast.iloc[0] = data_frame.iloc[0]

    size = data_frame.count()

    for n in range(1, size):
        last_smooth, smooth = smooth, alpha * data_frame.iloc[n] + (1 - alpha) * (smooth + trend)
        trend = beta * (smooth - last_smooth) + (1 - beta) * trend
        forecast.iloc[n] = smooth + trend

    rmse = eval_measures.rmse(data_frame, forecast)

    return rmse
Example #31
    def plotPrediction(self, fit, ax):
        """Plot predicted vs. test"""

        sub = self.sub
        if len(sub) == 0:
            sub = self.X.index
        # .ix was removed from pandas; use .loc with ~ for boolean negation
        Xout = self.X.loc[~self.X.index.isin(sub)]
        yout = self.y.loc[~self.y.index.isin(sub)]
        ypred = fit.predict(Xout)
        ax.scatter(yout, ypred, alpha=0.6, edgecolor='black',
                   color='blue', lw=0.5, label='fit')
        ax.plot(ax.get_xlim(), ax.get_xlim(), ls="--", lw=2, c=".2")
        ax.set_xlabel('test')
        ax.set_ylabel('predicted')
        ax.set_title('predicted vs test data')
        import statsmodels.tools.eval_measures as em
        yt = yout.squeeze().values
        rmse = em.rmse(yt, ypred)
        ax.text(0.9,0.1,'rmse: '+ str(round(rmse,3)),ha='right',
                    va='top', transform=ax.transAxes)
        return
Example #32
def calculate_total_error(actual, predictions, df):
    """
    Calculate root mean square error (RMSE), mean and error as a percentage of mean

    Inputs:
        actual: values of actual data series

        predictions: values of prediction data series

        df: dataframe of all values

    Outputs:
        root mean squared error of the two series

        mean of the actual series

        percent: percentage of rmse of the actual mean


    Means and errors are formatted as integers

    Percent is formatted as one decimal point
    """

    end_date = df.index[-1]


    actual = actual[:end_date]
    predictions = predictions[:end_date]



    error = rmse(actual, predictions)
    print(f'{error:.0f}', 'RMSE')

    CancMean = actual.mean()
    print(f'{CancMean:.0f}', 'Mean')

    percent = error/CancMean*100
    print(f'{percent:.1f}', '% Error')
Example #33
def mlr_array(Y, X, MASK=None, MASKnodata=None, Ynodata=None, Xnodata=None):

    # np.NaN was removed in NumPy 2.0; use np.nan
    if MASK is not None:
        X = [np.where(MASK == MASKnodata, np.nan, x) for x in X]
        Y = np.where(MASK == MASKnodata, np.nan, Y)

    # also mask array's nodata (X's must have same nodata value!):
    if Ynodata is not None:
        Y = np.where(Y == Ynodata, np.nan, Y)
    if Xnodata is not None:
        # compare each array x, not the list X, against the nodata value
        X = [np.where(x == Xnodata, np.nan, x) for x in X]

    # reshape arrays:
    Y = np.reshape(Y, (Y.shape[0] * Y.shape[1]))
    X = [np.reshape(x, (x.shape[0] * x.shape[1])) for x in X]

    # mask NaNs:
    mask = 0
    for x in X:
        mask = np.where(np.isnan(x), 1, mask)

    mask = np.where(np.isnan(Y), 1, mask)

    X = [np.where(mask == 1, np.nan, x) for x in X]
    Y = np.where(mask == 1, np.nan, Y)

    # retrieve valid values:
    X = [x[~np.isnan(x)] for x in X]
    Y = Y[~np.isnan(Y)]

    # prepare predictors
    X = np.array(X).T
    X = sm.add_constant(X)

    model = sm.OLS(Y, X).fit()
    m_predict = model.predict(X)
    print('Model', 'RMSE', 'Predict', 'Y', 'X')
    m_list = [model, rmse(Y, m_predict), m_predict, Y, X]
    return m_list
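A hypothetical usage sketch with two random predictor grids and a response grid; it relies on the same statsmodels (sm) and rmse imports the function above assumes:

import numpy as np

rng = np.random.default_rng(0)
x1, x2 = rng.normal(size=(10, 10)), rng.normal(size=(10, 10))
y = 2.0 * x1 - x2 + rng.normal(scale=0.1, size=(10, 10))
model, err, pred, y_used, x_used = mlr_array(y, [x1, x2])
print(err)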
Example #34
def forecast_arima(df: pd.DataFrame, cols: list, with_graph: bool = True):
    lag = 0
    order = 1
    moving_avg_model = 0
    steps = 50

    for col in cols:
        model = ARIMA(df[col].iloc[:-steps],
                      order=(lag, order, moving_avg_model))
        model_fit = model.fit()

        model_for = model_fit.get_forecast(steps=steps, alpha=0.05)
        print('\t==== Summary of forecast ARIMA(%d, %d, %d) ====\n' %
              (lag, order, moving_avg_model))
        print(model_for.summary_frame(), model_for.conf_int(), sep='\n')
        print('RMSE: %f\nMAE: %f' %
              (rmse(df[col][-50:], model_for.predicted_mean),
               meanabs(df[col][-50:], model_for.predicted_mean)))
        print()

        if with_graph is True:
            plt.figure(figsize=(12, 5))
            plt.xlabel(col)
            plt.title('Forecast for %s using ARIMA(%d, %d, %d)' %
                      (col, lag, order, moving_avg_model))

            # the legend labels were swapped: predicted_mean is the estimate,
            # df[col] holds the actual observations
            ax1 = model_for.predicted_mean.plot(color='blue',
                                                grid=True,
                                                label='Estimated')
            ax2 = df[col][-50:].plot(color='red',
                                     grid=True,
                                     secondary_y=True,
                                     label='Actual')

            h1, l1 = ax1.get_legend_handles_labels()
            h2, l2 = ax2.get_legend_handles_labels()

            plt.legend(h1 + h2, l1 + l2, loc=2)
            plt.show()
Example #35
    def execute(self,filename):
        df1= pd.read_csv('data/'+filename,index_col='Date',parse_dates=True)
        df1.index.freq='MS'
        
        df = pd.read_csv('./data/'+filename)
        df.columns = ['ds','y']
        df['ds'] = pd.to_datetime(df['ds'])
        m = Prophet()
        m.fit(df)
        future = m.make_future_dataframe(periods=24,freq = 'MS')
        forecast = m.predict(future)
        filename=filename[:-4]
        m.plot(forecast).savefig(os.path.join('static/', secure_filename(filename+'_prophetPredict.jpg')))
        # 80% for training
        train = df.iloc[:int(len(df)*0.8)]
        test = df.iloc[len(train):]
        m = Prophet()
        m.fit(train)
        future = m.make_future_dataframe(periods=len(test),freq='MS')
        forecast = m.predict(future)
        #print(forecast.tail())
        ax = forecast.plot(x='ds',y='yhat',label='Predictions',legend=True,figsize=(12,8))
        g=test.plot(x='ds',y='y',label='True Miles',legend=True,ax=ax,xlim=('2018-01-01','2019-01-01'))
        g.figure.savefig(os.path.join('static/', secure_filename(filename+'_prophetCompare.jpg')))
        predictions = forecast.iloc[len(train):]['yhat']
        error=rmse(predictions,test['y'])
        mean=test['y'].mean()
        print('percentage')
        self.accuracy=100-(error/mean*100)
        data=dict()
        data['stationary']=self.adf_test(df1)
        data['accuracy']=str(self.accuracy)
        return data



#a=ProphetModel()
#print(a.execute('BeerWineLiquor.csv'))
Example #36
def double_exponential_smoothing(series, next_periods, alpha=None, beta=None):
    # Get size of original dataframe
    size = series.count()

    # Create ahead-day entries in future (pd.date_range takes no format argument)
    date_range = pd.date_range(start=series.index[0], periods=size + next_periods, freq='MS')

    forecast = pd.Series(data=[np.nan] * len(date_range), index=date_range)
    forecast.iloc[0] = series.iloc[0]

    # Init
    smooth, trend = series.iloc[0], series.iloc[1] - series.iloc[0]

    # Calculate alpha, beta if it's None
    if alpha is None or beta is None:
        initial_values = np.array([0.0, 1.0])
        boundaries = [(0, 1), (0, 1)]

        parameters = fmin_l_bfgs_b(RMSE, x0=initial_values, args=(series, next_periods), bounds=boundaries,
                                   approx_grad=True)
        alpha, beta = parameters[0]

    for n in range(1, size):
        last_smooth, smooth = smooth, alpha * series.iloc[n] + (1 - alpha) * (smooth + trend)
        trend = beta * (smooth - last_smooth) + (1 - beta) * trend
        forecast.iloc[n] = smooth + trend

    for n in range(size, size + next_periods):
        m = n - size + 1
        forecast.iloc[n] = smooth + m * trend

    rmse = eval_measures.rmse(series, forecast[:-next_periods])

    return forecast, rmse, alpha, beta
Example #37
def regframe(X, Y, mod, idx):

    ##rsq,mae,mse,rmse,mape

    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        shuffle=False)

    model = mod.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    k_fold = KFold(n_splits=10, shuffle=False)

    df = pd.Series(
        {
            'rsq_train':
            model.score(x_train, y_train),
            'rsq_test':
            model.score(x_test, y_test),
            'subt_rsq':
            model.score(x_train, y_train) - model.score(x_test, y_test),
            'mae_test':
            mean_absolute_error(y_test, y_pred),
            'mse_test':
            mse(y_test, y_pred),
            'rmse_test':
            rmse(y_test, y_pred),
            'mape_test': (np.mean(np.abs((y_test - y_pred) / y_test)) * 100),
            'cross-score':
            cross_val_score(estimator=mod, X=X, y=Y, cv=k_fold).mean(),
            'cross-train':
            cross_val_score(estimator=mod, X=x_train, y=y_train,
                            cv=k_fold).mean()
        },
        name=idx)
    return df
Example #38
def sm_fit(X, Y, alpha=None, L1_wt=0.0):
    actual_v_predicted_plot = bokeh.plotting.figure(tools=['save'],
                                                    x_axis_type='log',
                                                    y_axis_type='log')
    resid_v_actual_plot = bokeh.plotting.figure(tools=['save'])
    cv_rmse = []

    ts = TimeSeriesSplit(7)
    for train_index, test_index in ts.split(X):
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]
        model = sm.OLS(Y_train, X_train)

        if alpha is None:
            reg_results = model.fit()
        else:
            reg_results = model.fit_regularized(alpha=alpha, L1_wt=L1_wt)
        sm_plot_actual_v_predicted(actual_v_predicted_plot, reg_results,
                                   X_test, Y_test[:, 0])
        sm_plot_resid_v_actual(resid_v_actual_plot, reg_results, X_test,
                               Y_test[:, 0])
        cv_rmse.append(rmse(reg_results.predict(X_test), Y_test[:, 0]))
    cv_rmse = pd.Series(cv_rmse, name='rmse').reset_index()
    return reg_results, resid_v_actual_plot, actual_v_predicted_plot, cv_rmse
Example #39
def calc_nrmse(df_target, df_new):
    """
    Calculates the normalized root mean square error of the target input dataframe and the simulated output dataframe to determine if the rmse decreased with a new mutation or not.
    Input(s):
    df_target is the user-inputted tsv file containing transcript abundances for each gene.
    df_new is the simulator-generated tsv file containng transcript abundances for each gene.
    Output(s):
    RMSE is a floating point number that refers to the root mean square error calculated.
    """

    #Confirms that the dataframes are the same shape
    assert df_target.shape == df_new.shape
    assert all(df_target.columns == df_new.columns)
    assert all(df_target.index == df_new.index)

    norm_errs = []

    #Performs a normalized RMSE to help determine the fitness of the new genome
    for column in df_target.columns:
        nrmse = rmse(df_target[column], df_new[column])/\
                        np.mean(df_target[column])
        norm_errs.append(nrmse)

    return np.mean(norm_errs)
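A hypothetical usage sketch with two small abundance tables of identical shape; rmse is assumed to come from statsmodels.tools.eval_measures:

import pandas as pd

target = pd.DataFrame({'geneA': [10.0, 12.0, 9.0], 'geneB': [5.0, 7.0, 6.0]})
simulated = pd.DataFrame({'geneA': [11.0, 11.0, 9.5], 'geneB': [6.0, 6.5, 5.5]})
print(calc_nrmse(target, simulated))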
Example #40
def sm_forest_fit(X, Y, tuning_parameters=None):
    if tuning_parameters is not None:
        max_depth, min_samples_leaf, n_estimators, max_features = tuning_parameters
        max_depth, min_samples_leaf, n_estimators, max_features = int(
            round(max_depth)), int(round(min_samples_leaf)), int(
                round(n_estimators)), max_features
    else:
        max_depth = 3
        min_samples_leaf = 1
        n_estimators = 10
        max_features = 1.0  # scikit-learn removed 'auto'; 1.0 means use all features

    actual_v_predicted_plot = bokeh.plotting.figure(tools=['save'],
                                                    x_axis_type='log',
                                                    y_axis_type='log')
    resid_v_actual_plot = bokeh.plotting.figure(tools=['save'])
    cv_rmse = []

    ts = TimeSeriesSplit(7)
    for train_index, test_index in ts.split(X):
        X_train, Y_train = X[train_index], Y[train_index]
        X_test, Y_test = X[test_index], Y[test_index]
        reg_results = RandomForestRegressor(max_depth=max_depth,
                                            min_samples_leaf=min_samples_leaf,
                                            n_estimators=n_estimators,
                                            max_features=max_features,
                                            n_jobs=-1)
        # fit on the training fold only; fitting on all of X/Y would leak
        # the test fold into the model
        reg_results.fit(X_train, Y_train[:, 0])
        sm_plot_actual_v_predicted(actual_v_predicted_plot, reg_results,
                                   X_test, Y_test[:, 0])
        sm_plot_resid_v_actual(resid_v_actual_plot, reg_results, X_test,
                               Y_test[:, 0])
        cv_rmse.append(rmse(reg_results.predict(X_test), Y_test[:, 0]))
    cv_rmse = pd.Series(cv_rmse, name='rmse').reset_index()
    return reg_results, resid_v_actual_plot, actual_v_predicted_plot, cv_rmse
Example #41
    lstm_predictions_scaled.append(lstm_pred)
    current_batch = np.append(current_batch[:, 1:, :], [[lstm_pred]], axis=1)

lstm_predictions_scaled

lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)

lstm_predictions

test_data_sa['LSTM_Predictions'] = lstm_predictions
test_data_sa

test_data_sa['New Cases'].plot(figsize=(16, 5), legend=True)
test_data_sa['LSTM_Predictions'].plot(legend=True)

lstm_rmse_error_sa = rmse(test_data_sa['New Cases'],
                          test_data_sa["LSTM_Predictions"])
lstm_mse_error_sa = lstm_rmse_error_sa**2
mean_value = sa['value'].mean()

#%%

scaler = MinMaxScaler()
scaler.fit(train_data_korea)
scaled_train_data = scaler.transform(train_data_korea)
scaled_test_data = scaler.transform(test_data_korea)

n_input = 7
n_features = 1
generator = TimeseriesGenerator(scaled_train_data,
                                scaled_train_data,
                                length=n_input,
Example #42
def regression(dependent_str, model_list, dataset, independents_filter):
    # first we create the model for this dependent variable with the entire dataset
    independents_str = " + ".join(model_list)
    print(independents_str)
    # https://stackoverflow.com/questions/48522609/how-to-retrieve-model-estimates-from-statsmodels

    X = dataset[sorted(independents_filter)]
    y = dataset[dependent_str]
    model = smf.ols(formula=dependent_str + " ~ " + independents_str,
                    data=dataset).fit()

    # then we calculate the average fitness (rmse normalized) using k-fold cross validation
    kf = KFold(n_splits=configuration.kfold, shuffle=True, random_state=1)
    #print("########################################################################")
    fitness_norm = 0
    fitness = 0
    for train, test in kf.split(dataset):
        model_t = smf.ols(formula=dependent_str + " ~ " + independents_str,
                          data=dataset.iloc[train]).fit()

        # filter columns
        X = dataset[sorted(independents_filter)]
        y = dataset[dependent_str]

        # filter rows
        X = X.iloc[test]
        y = y.iloc[test]

        # generate predictions and metric
        ypred = model_t.predict(X)
        r = rmse(y, ypred)
        rmse_norm = round(r / (max(y) - min(y)), 3)
        #print("rmse_norm = ", rmse_norm)
        fitness_norm = fitness_norm + rmse_norm
        fitness = fitness + r

        # to be able to check manually
        #print(y)
        #print(ypred)

    fitness_norm = round(fitness_norm / configuration.kfold, 3)
    fitness = round(fitness / configuration.kfold, 3)
    #print("########################################################################")
    #print(fitness_norm, fitness)
    #print("########################################################################")

    rsquared = round(model.rsquared, 3)
    rsquared_adj = round(model.rsquared_adj, 3)

    X = dataset[sorted(independents_filter)]
    model_y = dataset[dependent_str]
    #model_y_pred = model_t.predict(X)

    # compare with random values
    #df_random = pd.DataFrame(np.random.randint(1,100,size=(len(model_y), 1)))
    randomlist = random.sample(range(1, 100), len(model_y))
    rmse_random = rmse(model_y, randomlist)

    #print("########################################################################")
    #print(model_y)
    #print(model_y_pred)
    #print("########################################################################")
    #print(model_y)
    #print(randomlist)
    #print("########################################################################")
    #print("rmse_random", rmse_random)

    #return (dep + " ~ " + independents, rsquared, rsquared_adj, fitness_norm, fitness, model.summary(), model_y, model_y_pred)
    return (dependent_str + " ~ " + independents_str, rsquared, rsquared_adj,
            fitness_norm, fitness, model.summary(), 0, 0, rmse_random)
Example #43
def hw(x, horizon, params, quantile=None, verbose=True, boxcox=False):
    """
    Holt-Winters point prediction
    :param x: input time series (assume equally spaced)
    :param horizon: number of points to predict
    :param params: dict with keys and values for specified parameters. Keys must be in the set level, trend, season, damped
     All keys must be specified.
     Examples:
     The first component in the tuple prevails
     - params['level'] = None: compute the optimal level alpha
     - params['level'] = <positive_number>: set alpha = <positive_number> (between 0 and 1)
     - params['trend'] = [None, None]: set beta = 0
     - params['trend'] = ['A', None]: use additive trend and compute optimal beta
     - params['trend'] = ['M', <positive_number>]: use multiplicative trend and set beta = <positive_number> (between 0 and 1)
     - params['season'] = [None, <seasonality>, None]: set s_len = 1, gamma = 0 (no seasons). The value of <seasonality> is ignored
     - params['season'] = ['A', None, None]: try season detection to set the 2nd component (season length)
     - params['season'] = ['A', <seasonality>, None]: use additive seasonality, compute optimal gamma assuming season length equals <seasonality>
     - params['season'] = ['M', <seasonality>, <positive_number>]: use multiplicative seasonality, set gamma = <positive_number> and assume season length equals <seasonality>
     - params['damped'] = [None, <anything>]: error: damped can only be True or False
     - params['damped'] = [False, <anything>]: set <anything> = 1. No damping
     - params['damped'] = [True, None]: damping present. Compute optimal value for phi
     - params['damped'] = [True, <positive_number>]:  damping present. Set phi = <positive_number> (between 0 and 1)
    :param quantile: error band to return, when not None
    :param verbose: whether to print some information while executing
    :return: dict with keys df_out, rmse, params, states
             df_out is a DataFrame with columns:
                - y input data
                - yhat (point forecasts and forecasts)
                - ylwr (lower forecast bound on forecasts)
                - yupr (upper forecast bound on forecasts)
                - lj_pval (Ljung-Box independence test p-values on transformed errors. the min of p_values is shown -worst case-)
                - sw_pval (Shapiro-Wilks test p_value on transformed errors)
                - lbda (BoxCox transform parameter used for the forecast bounds)
             rmse is the one-step prediction rmse on the data
             params is the model parameter dictionary; it includes the values for alpha, beta, gamma, phi and s_len that minimize the one step prediction rmse or that were supplied.
             states is a DataFrame with the level, trend and season state series
    """
    if set(params.keys()) != {'level', 'trend', 'season', 'damped'}:
        print('invalid params keys: ' + str(params.keys()))
        return None

    opt_pars = list()  # list of params to optimize

    # damping
    if len(params['damped']) != 2:
        print('invalid damping parameters: ' + str(params['damped']))
        return None
    if params['damped'][0] not in [True, False]:
        print('invalid damped parameter: ' + str(params['damped']))
        return None
    if params['damped'][0] is True:
        if params['damped'][1] is None:
            opt_pars.append('damped')
        else:
            if not(0.0 < params['damped'][1] <= 0.98):   # 0.98 See Hyndman
                print('invalid damped parameter: ' + str(params['damped']))
                return None
    else:
        params['damped'][1] = 1.0

    # seasons
    if len(params['season']) != 3:
        print('invalid seasonality parameters: ' + str(params['season']))
        return None
    if params['season'][0] not in [None, 'A', 'M']:
        print('invalid seasonality parameters: ' + str(params['season']))
        return None
    if params['season'][0] is None:
        params['season'] = [None, 1, 0.0]
    else:
        if params['season'][1] <= 1:
            params['season'][1] = hwu.get_season(x)
            if params['season'][1] <= 1.0:
                print('invalid seasonality parameters: ' + str(params['season']))
                return None
            else:
                if verbose:
                    print('using automated season detection. Seasonality: ' + str(params['season'][1]))
        if params['season'][2] is None:
            opt_pars.append('season')
        else:
            if not(0.0 <= params['season'][2] <= 1.0):
                print('invalid season parameter: ' + str(params['season']))
                return None
        if params['season'][1] > 1 and len(x) < 4 * params['season'][1]:
            print('not enough data for seasonality')
            return None

    # trend
    if len(params['trend']) != 2:
        print('invalid trend parameters: ' + str(params['trend']))
        return None
    if params['trend'][0] not in [None, 'A', 'M']:
        print('invalid trend parameters: ' + str(params['trend']))
        return None
    if params['trend'][0] is None:
        params['trend'][1] = 0.0
    else:
        if params['trend'][1] is None:
            opt_pars.append('trend')
        else:
            if not(0.0 <= params['trend'][1] <= 1.0):
                print('invalid trend parameter: ' + str(params['trend']))
                return None

    # level
    if params['level'] is None:
        opt_pars.append('level')
    else:
        if not(0.0 <= params['level'] <= 1.0):
            print('invalid trend parameter: ' + str(params['level']))
            return None

    # update params to get optimal parameters if needed
    Y = list(x[:])
    if len(opt_pars) > 0:
        get_pars(Y, params, opt_pars)
    alpha = params['level']
    trend, beta = params['trend']
    season, s_len, gamma = params['season']
    phi = params['damped'][1]
    if beta <= EPS:
        trend, beta = None, 0.0
        params['trend'] = [None, 0.0]
    if gamma <= EPS:
        season, s_len, gamma = None, 1, 0.0
        params['season'] = [None, 1, 0.0]
    if phi <= EPS:
        phi = 0.0
        # this branch updates the damping entry, not the season entry
        params['damped'] = [True, 0.0]
    if verbose: print('model parameters: ' + str(params))

    # initialize: set a[0], b[0], s[0]
    a, b, s = hwu.initialize(trend, season, Y, s_len, verbose)

    # HW main iteration
    yhat = list()  # one step point forecast: yhat_{t|t-1}, 1 <= t <= N.
    yint = list()  # list of multi step predictions: yhat_{t+h|t-1}, 0 <= h <= N-t, 1 <= t <= N
    for i in range(len(Y)):  # note that index = i fills position i+1
        # update the HW parameters
        a.append(hwu.level(alpha, phi, Y[i], a[i], b[i], s[i], trend, season))
        b.append(hwu.trend(beta, phi, hwu.get_aval(a[i + 1], a[i], trend), b[i], trend, season))
        s.append(hwu.season(gamma, phi, Y[i], a[i], b[i], s[i], trend, season))

        # one step prediction. s is initialized with s_len values. s[i] is s_len values behind, as it should
        # yhat[i] = y(i+1|Y_0, ..., Y_i), 0 <= i < N
        yhat.append(hwu.one_step_pred(a[i], b[i], s[i], phi, trend, season))

        # multi-step forecasts (used for error bounds): generate all the predictions at each step
        # yint[i][h] = y(i + 1 + h|y_0, .., y_i), 0 <= h < N - i, 0 <= i < N
        if quantile is not None:
            yint.append(hwu.point_fcast(len(Y), phi, s_len, a[i], b[i], s[-s_len:], trend, season))

    # states
    df_states = pd.DataFrame({'level': a, 'trend': b, 'season': s})

    # point predictions outside the data range
    yhat += hwu.point_fcast(horizon, phi, s_len, a[-1], b[-1], s[-s_len:], trend, season)
    rms_err = sm.rmse(Y, yhat[:len(Y)])
    yhat = np.array(yhat)
    df_hat = pd.DataFrame({'yhat': yhat[:len(Y)], 'y': Y})

    # interval predictions (errors)
    if quantile is not None and horizon > 0:
        df_int, df_errs, df_detail = hwu.interval_fcast(np.array(x), np.array(yint), horizon, quantile)
        df_int['yhat'] = yhat[-horizon:]
        df_int['yupr'], df_int['ylwr'] = df_int['yhat'] + df_int['upr'], df_int['yhat'] + df_int['lwr']
        df_hat['yupr'], df_hat['ylwr'] = df_hat['yhat'], df_hat['yhat']
        df_int.drop(['upr', 'lwr'], axis=1, inplace=True)
        df_out = pd.concat([df_hat, df_int], axis=0) if horizon > 0 else df_hat
    else:      # no errors computed
        df_int = pd.DataFrame({'yhat':  yhat[-horizon:], 'yupr':  yhat[-horizon:], 'ylwr':  yhat[-horizon:]})
        df_out = pd.concat([df_hat, df_int], axis=0) if horizon > 0 else df_hat
    df_out.reset_index(inplace=True, drop=True)
    return {'df_out': df_out, 'rmse': rms_err, 'params': params, 'states': df_states}
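A hypothetical call, using the parameter conventions the docstring describes; x stands for an equally spaced 1-D series with monthly seasonality:

params = {'level': None,              # optimize alpha
          'trend': ['A', None],       # additive trend, optimize beta
          'season': ['A', 12, None],  # additive seasonality, s_len 12, optimize gamma
          'damped': [False, None]}    # no damping
out = hw(x, horizon=12, params=params, quantile=None, verbose=False)
print(out['rmse'])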
Example #44
vars.remove('TRAIN')    # this is also useless for prediction
vars.remove('ID')       # as is ID

min_rmse = 100      # initialise with something large
f_val = ""          # variable to hold our formula
resample_count = 3  # number of CV folds
for _ in range(1000):
    
    # generate a random expression
    f = 'DEATHS ~ {0}'.format(rand_expr(vars))
    
    # for 3 re-samples of the training set fit the lm on the fitting set
    # and calculate the rmse on the validation set
    rmse_val = 0
    for _ in range(resample_count):
        sampleIndices   = random.sample(list(train_data.index), int(0.75 * recs))
        fitting_data    = train_data.loc[sampleIndices]
        validation_data = train_data.drop(sampleIndices)

        death_glm = smf.ols(formula = f, data = fitting_data).fit()
        rmse_val += rmse(death_glm.predict(validation_data), validation_data.DEATHS)
    
    rmse_val /= resample_count
    
    if rmse_val < min_rmse:
        print('new minimum: {0}'.format(rmse_val))
        min_rmse = rmse_val
        f_val = f
        
print('BEST EXPRESSION : {0} \n\n {1}'.format(min_rmse, f_val))
Example #45
def rmse(par_vals, *args):
    Y, par_names, param_dict = args[0], args[1], args[2]
    set_dict(par_names, par_vals, param_dict)
    results = hw(Y, 0, param_dict, verbose=False, quantile=None)
    Yhat = results['df_out']['yhat'].values
    return sm.rmse(Y, Yhat)
Example #46
def make_plot(image_dir, run_dirs, run_names=None, cmu0=0.5544):
    """ Make plot of runs against analytical solution.

    params:
    -------
    image_dir - str
      - Directory to save the file
    run_dirs - list
      - Names of the simulation directories
    run_names - str, optional
      - Names of simulation for legend
    cmu0 - float, optional
      - Parameter in GLS to calculate TKE (default 0.5544 Kantha-Clayson)
    """
    names = []
    m_sed = []
    m_vel = []
    m_dif = []
    for r in run_dirs:
        print(r)
        names.append(r)
        fileBase = r+'/Warner/data/profile'
        dataFile = os.path.join(fileBase, 'Warner-1K_trcr_1_0_2012-01-01_2012-01-02.nc')
        m_sed.append(dc.loadFromNetCDF(dataFile))
        dataFile = os.path.join(fileBase, 'Warner-1K_hvel_0_2012-01-01_2012-01-02.nc')
        m_vel.append(dc.loadFromNetCDF(dataFile))
        dataFile = os.path.join(fileBase, 'Warner-1K_tdff_0_2012-01-01_2012-01-02.nc')
        m_dif.append(dc.loadFromNetCDF(dataFile))

    # Calculate analytical values using actual water column depth H
    # z - Height above the bed.
    # H - Water column height
    z = abs(m_sed[0].z[0, -1] - m_sed[0].z[:, -1])
    z[0] = Z0
    depths = m_sed[0].z[:, -1]
    H = z[-1]
    u_star = calcFrictionVelocity(U, H, Z0)
    a_vel = calcVelocity(z, u_star, Z0)
    a_vis = calcEddyViscosity(z, u_star, H)
    a_dif = calcEddyDiffusivity(a_vis)
    a_sed = calcSediment(z)

    # "Analytical" values from imposing parabolic eddy viscosity
    print(' - test_warner_channel_analytical')
    fileBase = 'test_warner_channel_analytical/Warner/data/profile'
    dataFile = os.path.join(fileBase, 'Warner-1K_trcr_1_0_2012-01-01_2012-01-03.nc')
    m_sed.append(dc.loadFromNetCDF(dataFile))
    dataFile = os.path.join(fileBase, 'Warner-1K_hvel_0_2012-01-01_2012-01-03.nc')
    m_vel.append(dc.loadFromNetCDF(dataFile))
    dataFile = os.path.join(fileBase, 'Warner-1K_tdff_0_2012-01-01_2012-01-03.nc')
    m_dif.append(dc.loadFromNetCDF(dataFile))

    # Calculate RMSE
    vel_rmse = []
    dif_rmse = []
    sed_rmse = []
    for i, r in enumerate(run_dirs):
        vel_rmse.append(stats.rmse(m_vel[i].data[:, 0, -1], a_vel))
        dif_rmse.append(stats.rmse(m_dif[i].data[:, 0, -1], a_dif))
        sed_rmse.append(stats.rmse(m_sed[i].data[:, 0, -1], a_sed))

    # Plots
    ticks_font = matplotlib.font_manager.FontProperties(size=6)
    f, ax = plt.subplots(1, 3, sharey=True, figsize=(18, 7))
    f.subplots_adjust(wspace=0.4, top=0.9)
    #plt.rc('axes', color_cycle=['r', 'g', 'b', 'y'])

    for vel in m_vel:
        ax[0].plot(np.squeeze(vel.data[:, 0, -1]), depths, marker='.')
    p2 = ax[0].plot(a_vel, depths, color='k')
    ax[0].set_xlim([0, 1.5])
    ax[0].set_ylim([-10, 0.5])
    ax[0].xaxis.set_ticks([0, 0.5, 1, 1.5])
    ax[0].grid(True)
    ax[0].set_ylabel('Z-coordinate $m$')
    ax[0].set_xlabel('Velocity $m/s$')
    #ax[0].set_title('RMSE: %4.3f' % vel_rmse, fontsize=12)
    if run_names is None:
        legend_str = names
    else:
        legend_str = run_names
    legend_str.append('Analytical')
    ax[0].legend(legend_str, loc='upper left', fontsize=8)
    ax[0].fill_between([0, 1.5], -10, m_sed[0].z[0, -1], facecolor='brown')
    ax[0].fill_between([0, 1.5], -10, m_sed[0].z[-1, -1], facecolor='blue', alpha=0.1)

    for dif in m_dif:
        ax[1].plot(np.squeeze(dif.data[:, :, -1]), depths, marker='.')
    ax[1].plot(a_dif, depths, color='k')
    ax[1].set_xlim([0, 0.1])
    #ax[1].xaxis.set_ticks([0, 002, 0.04, 0.06, 0.08])
    ax[1].grid(True)
    ax[1].set_xlabel('Eddy diffusivity $m^2/s$')
    #ax[1].set_title('RMSE: %4.3f' % dif_rmse, fontsize=12)
    ax[1].fill_between([0, 0.1], -10, m_sed[0].z[0, -1], facecolor='brown')
    ax[1].fill_between([0, 0.1], -10, m_sed[0].z[-1, -1], facecolor='blue', alpha=0.1)

    for sed in m_sed:
        ax[2].plot(np.squeeze(sed.data[:, :, -1]), depths, marker='.')
    ax[2].plot(a_sed, depths, color='k')
#    ax[4].xaxis.set_ticks([0.150, 0.2, 0.25, 0.3, 0.35, 0.4])
    ax[2].set_xlim([0.05, 0.35])
    ax[2].grid(True)
    ax[2].set_xlabel('Sediment $kg/m^3$')
    ax[2].fill_between([0.05, 0.35], -10, m_sed[0].z[0, -1], facecolor='brown')
    ax[2].fill_between([0.05, 0.35], -10, m_sed[0].z[-1, -1], facecolor='blue', alpha=0.1)

    f.suptitle('Warner et al. 2008 open channel test', fontsize=14)

    # Save fig
    print('saving image : warner_comparison.png')
    runName = 'warner_comparison'
    f.savefig(runName, dpi=200)
    plt.close('all')