Code example #1
import math

import numpy as np
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa import ar_model


def grid_search_best_model_timeseries_ar(df, grid, cv):
    """Grid-search AutoReg lag orders with time-series cross-validation and refit on the best one."""
    best_param = None
    best_score = np.inf
    tsp = TimeSeriesSplit(n_splits=cv)

    for param in grid.get('lags'):
        scores = []
        for train_ind, test_ind in tsp.split(df):
            train_data = df.iloc[train_ind]
            test_data = df.iloc[test_ind]
            try:
                #print(train_data, test_data)
                estimator = ar_model.AutoReg(train_data, lags=param)
                res = estimator.fit()
                #print(res.params)
                #get out of sample predictions with test data start and end
                pred = estimator.predict(res.params, test_data.index[0],
                                         test_data.index[-1])
                #print(pred)
                y_pred = pred.values.reshape(-1)
                y_test = test_data.values.reshape(-1)
                score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
                scores.append(score)
            except Exception:
                # skip lag orders that fail to fit or predict on this split
                continue
        #print(scores)
        if len(scores) > 0 and np.mean(scores) < best_score:
            best_score = np.mean(scores)
            best_param = param

    if best_param is not None:
        estimator = ar_model.AutoReg(df, lags=best_param)
        res = estimator.fit()
        print("best parameters:" + str(best_param))
        print("validation rmse:" + str(best_score))
        #get insample predictions with start and end indices
        predictions = estimator.predict(res.params,
                                        start=0,
                                        end=df.shape[0] - 1)
        y_pred = predictions.values.reshape(-1)
        y_train = df.values.reshape(-1)[best_param:]
        train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))
        print("train rmse:" + str(train_rmse))
        return estimator, res
    else:
        return None, None
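
A minimal usage sketch for the function above; the lag grid and the univariate DataFrame df are illustrative assumptions, not part of the original snippet:

grid = {'lags': [1, 2, 3, 6, 12]}  # candidate AR orders to try
estimator, res = grid_search_best_model_timeseries_ar(df, grid, cv=5)
if res is not None:
    print(res.summary())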
Code example #2
# imports used in this snippet
from math import sqrt

from pandas import read_csv
from sklearn.metrics import mean_squared_error
from statsmodels.tsa import ar_model

#read our petrol data in
series = read_csv('petrol_prices.csv',
                  header=0,
                  index_col=0,
                  parse_dates=True,
                  squeeze=True)

# split dataset
X = difference(series.values)
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:]

# train autoregression
window = 10
model = ar_model.AutoReg(train, lags=6)
model_fit = model.fit()
coef = model_fit.params

# walk forward over time steps in test
history = [train[i] for i in range(len(train))]
predictions = list()
for t in range(len(test)):
    yhat = predict(coef, history)
    obs = test[t]
    predictions.append(yhat)
    history.append(obs)
rmse = sqrt(mean_squared_error(test, predictions))

# report the root mean squared error on the test set
print('Test RMSE: %.3f' % rmse)
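
The snippet above relies on two helpers, difference and predict, that are not shown in this excerpt. A minimal sketch of what they plausibly look like, assuming first-order differencing and a manual one-step AR forecast built from the fitted AutoReg coefficients (intercept first, then lag coefficients):

import numpy as np

def difference(dataset):
    # first-order differencing: x[t] - x[t-1]
    return np.array([dataset[i] - dataset[i - 1] for i in range(1, len(dataset))])

def predict(coef, history):
    # coef[0] is the intercept; coef[i] multiplies the i-th most recent observation
    yhat = coef[0]
    for i in range(1, len(coef)):
        yhat += coef[i] * history[-i]
    return yhat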
Code example #3
        prepped_dataf['New Cases'] = prepped_dataf['Log Number Cases Delta']

    prepped_dataf = prepped_dataf[prepped_dataf['New Cases'].notna()]
    prepped_dataf = prepped_dataf.replace([np.inf, -np.inf], np.nan)
    prepped_dataf = prepped_dataf[prepped_dataf['New Cases'].notna()]
    logged_vals = prepped_dataf['Log Number Cases']
    for col in prepped_dataf.keys():
        if re.match(r'Log.*', col):
            print(re.match(r'Log.*', col))
            prepped_dataf = prepped_dataf.drop(col, axis=1)
    with pandas.option_context('display.max_rows', None, 'display.max_columns',
                               None):  # more options can be specified also
        print(prepped_dataf)
    #prepped_dataf.drop(prepped_dataf.tail(3).index,inplace=True) # drop last n rows

    model = ar_model.AutoReg(prepped_dataf, lags=2).fit()
    print(model.summary())
    target_len = len(prepped_dataf['New Cases']) - 1
    prediction = model.predict(start=205, end=205 + target_len)
    predictions = [pred for pred in prediction]
    # invert the log-difference transform: add each predicted delta back onto the logged level
    cases_log_diff_rev = []
    for index, v in enumerate(predictions):
        v_0 = logged_vals[index] + v
        cases_log_diff_rev.append(v_0)

    print(np.exp(cases_log_diff_rev))
    start_date = prepped_dataf.index[0]
    start_date = start_date + datetime.timedelta(days=1)
    date_list = [
Code example #4
import math

import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa import ar_model

energy_test = energy[energy.index >= test_start_dt]

scaler = MinMaxScaler()
energy_train['load_scaled'] = scaler.fit_transform(energy_train[['load']])  # scaler expects a 2-D input
energy_train.head(10)

#plot both
sns.distplot(energy_train['load'])
sns.distplot(energy_train['load_scaled'])

energy_train1 = energy_train.copy()
energy_train1 = energy_train1.drop('load', axis=1)
energy_train1.index.freq = 'H'

#build model
estimator = ar_model.AutoReg(energy_train1, lags=5)
res = estimator.fit()
print(res.params)

energy_validation['load_scaled'] = scaler.transform(energy_validation[['load']])  # reuse the scaler fitted on the training data
energy_validation1 = energy_validation.copy()
energy_validation1 = energy_validation1.drop('load', axis=1)
energy_validation1.index.freq = 'H'

#validation error
pred = estimator.predict(res.params, energy_validation1.index[0],
                         energy_validation1.index[-1])
print(pred)
y_pred = pred.values.reshape(-1)
y_test = energy_validation1.values.reshape(-1)
score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
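
To report the error in the original load units rather than on the min-max scaled series, the scaler can be inverted. A short sketch of that extra step (not part of the original snippet):

y_pred_load = scaler.inverse_transform(y_pred.reshape(-1, 1)).ravel()
y_test_load = scaler.inverse_transform(y_test.reshape(-1, 1)).ravel()
print(math.sqrt(metrics.mean_squared_error(y_test_load, y_pred_load)))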
Code example #5
import math
import os

import pandas as pd
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa import ar_model

path = 'F:/'
df = pd.read_csv(os.path.join(path, 'uk-deaths-from-bronchitis-emphys.csv'))
df.info()

df.columns = ['timestamp', 'y']
df.index = pd.to_datetime(df['timestamp'], format='%Y-%m').copy()
df.index.freq = 'MS'
df.drop('timestamp', axis=1, inplace=True)

#build model
estimator = ar_model.AutoReg(df, lags=5)
res = estimator.fit()
print(res.params)
print(res.model)
print(res.summary())

#using model
predictions = estimator.predict(res.params, start=0, end=df.shape[0] - 1)
print(predictions)
y_pred = predictions.values.reshape(-1)
y_train = df.values.reshape(-1)[5:]
train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))
print(train_rmse)

#evaluate model
tsp = TimeSeriesSplit(n_splits=3)
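
The excerpt ends where the cross-validation loop would begin. A sketch of how the TimeSeriesSplit instance could be used, mirroring the validation loop of code example #1; this continuation is an assumption, not the original file's code:

cv_scores = []
for train_ind, test_ind in tsp.split(df):
    train_data, test_data = df.iloc[train_ind], df.iloc[test_ind]
    est = ar_model.AutoReg(train_data, lags=5)
    fit = est.fit()
    # forecast the held-out block by integer position
    pred = est.predict(fit.params, start=len(train_data),
                       end=len(train_data) + len(test_data) - 1)
    rmse = math.sqrt(metrics.mean_squared_error(test_data.values.reshape(-1),
                                                pred.values.reshape(-1)))
    cv_scores.append(rmse)
print(sum(cv_scores) / len(cv_scores))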
Code example #6
from typing import Any

import numpy as np


def train(
    data: np.ndarray,
    used_model: str = "autoreg",
    p: int = 5,
    d: int = 1,
    q: int = 0,
    cov_type="nonrobust",
    method="cmle",
    trend="nc",
    solver="lbfgs",
    maxlag=13,
    # SARIMAX args
    seasonal=(0, 0, 0, 0),
) -> Any:
    """Autoregressive model from statsmodels library. Only univariate data.

    Args:
        data (np.ndarray): Time series data.
        used_model (str, optional): Used model. Defaults to "autoreg".
        p (int, optional): Autoregressive (AR) order of the ARIMA model. Check statsmodels docs for more. Defaults to 5.
        d (int, optional): Differencing order of the ARIMA model. Defaults to 1.
        q (int, optional): Moving-average (MA) order of the ARIMA model. Defaults to 0.
        cov_type: Parameters of model call or fit function of particular model. Check statsmodels docs for more.
            Defaults to 'nonrobust'.
        method: Parameters of model call or fit function of particular model. Check statsmodels docs for more.
            Defaults to 'cmle'.
        trend: Parameters of model call or fit function of particular model. Check statsmodels docs for more.
            Defaults to 'nc'.
        solver: Parameters of model call or fit function of particular model. Check statsmodels docs for more.
            Defaults to 'lbfgs'.
        maxlag: Parameters of model call or fit function of particular model. Check statsmodels docs for more.
            Defaults to 13.
        seasonal: Parameters of model call or fit function of particular model. Check statsmodels docs for more.
            Defaults to (0, 0, 0, 0).

    Returns:
        statsmodels.model: Trained model.
    """

    import statsmodels.tsa.api as sm
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa import ar_model

    used_model = used_model.lower()

    if used_model == "ar":
        model = sm.AR(data)
        fitted_model = model.fit(method=method, trend=trend, solver=solver, disp=0)

    elif used_model == "arima":
        order = (p, d, q)
        model = ARIMA(data, order=order)
        fitted_model = model.fit()

    elif used_model == "sarimax":
        order = (p, d, q)
        model = SARIMAX(data, order=order, seasonal_order=seasonal)
        fitted_model = model.fit(method=method, trend=trend, solver=solver, disp=0)

    elif used_model == "autoreg":
        auto = ar_model.ar_select_order(data, maxlag=maxlag)
        model = ar_model.AutoReg(
            data,
            lags=auto.ar_lags,
            trend=auto.trend,
            seasonal=auto.seasonal,
            period=auto.period,
        )
        fitted_model = model.fit(cov_type=cov_type)

    else:
        raise ValueError(
            f"Used model has to be one of ['ar', 'arima', 'sarimax', 'autoreg']. You configured: {used_model}"
        )

    setattr(fitted_model, "my_name", used_model)
    setattr(fitted_model, "data_length", len(data))

    return fitted_model
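
A brief usage sketch for train; the synthetic series and the forecast horizon are illustrative assumptions:

import numpy as np

series = np.sin(np.linspace(0, 20, 200)) + np.random.normal(scale=0.1, size=200)
fitted = train(series, used_model="autoreg", maxlag=13)
# the returned object is a fitted statsmodels results instance, so predict() gives forecasts
forecast = fitted.predict(start=len(series), end=len(series) + 9)
print(forecast)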