Example #1
def updated_six_period_plot_forecast_vs_arima(idx):
    '''
    Plots the series history together with a six-period forecast
    and its 95% confidence interval.
    '''
    series = series_lst6[idx]
    # groups 68/69 and short series are fit without the 12-period seasonal term
    if idx in (68, 69) or len(series) <= 24:
        train, test = train_test_split(series, train_size=len(series) - 6)
        model = pm.auto_arima(train, seasonal=True)
    else:
        train, test = train_test_split(series, train_size=len(series) - 6)
        model = pm.auto_arima(train, seasonal=True, m=12)
    forecasts = model.predict(test.shape[0]).tolist()
    
    # prepend the last training value so the forecast line connects to the history
    forecasts = np.insert(np.array(forecasts), 0, train.iloc[-1][0]).tolist()
    # refit the selected orders with statsmodels SARIMAX to obtain confidence intervals
    params = model.get_params()
    SARIMAmodel = SARIMAX(train, order=params['order'], seasonal_order=params['seasonal_order']).fit()
    fcast = SARIMAmodel.get_forecast(6)
    conf_inf = fcast.conf_int()
    print(model)
    fig, ax = plt.subplots(figsize = (18, 12))
    ax.plot(series.index, series, label = 'Actual Sales')
    ax.plot(series[-7:].index, forecasts, label = 'Forecasted Sales')
    ax.fill_between(conf_inf.index, conf_inf['lower TOTAL'].clip(lower=0),
                    conf_inf['upper TOTAL'], color = 'lightgrey',
                    label = '95% Confidence Interval for Forecast')
    ax.axvline(x = series[-6:].index[0], color='k', linestyle='--',
               label = 'End of Historical Sales')
    ax.set_title(f'Comparison of Actual vs Forecasted Sales \n for the '
                 f'{lst_of_stores[idx][0]} Store and {lst_of_stores[idx][1]} Department',
                 fontsize = 20)
    ax.set_xlabel('Time', fontsize = 24)
    ax.set_ylabel('Sales', fontsize = 24)
    ax.tick_params(axis='both', which='major', labelsize=16)
    ax.legend(fontsize = 16)
    #ax.set_ylim([None, 5350]) #for the good
    #ax.set_ylim([None, 21000]) #for the bad
    #ax.set_ylim([None, 5350]) #for the ugly
    plt.grid(c='silver')
    #plt.savefig('../images/the_bad2')
    plt.show()
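
A side note on the SARIMAX refit above: it exists only to recover confidence intervals, which pmdarima can also return directly. A minimal, self-contained sketch using the bundled wineind dataset (the series_lst6 and lst_of_stores globals from the example are not reproduced here):

import pmdarima as pm
from pmdarima.model_selection import train_test_split

y = pm.datasets.load_wineind()
train, test = train_test_split(y, train_size=len(y) - 6)

model = pm.auto_arima(train, seasonal=True, m=12)
# return_conf_int=True yields the point forecasts plus an (n, 2) array of interval bounds
forecasts, conf_int = model.predict(n_periods=test.shape[0],
                                    return_conf_int=True, alpha=0.05)
print(conf_int[:3])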
Example #2
def arima_forecast(store_ids, dept_ids, art_dict, holdout_periods, interval):
    '''
    Estimates ARIMA parameters for all 70 groups and then forecasts

    Args:
        store_ids: store ids for the 10 stores in the dataset
        dept_ids: department ids for the 7 departments in the dataset
        art_dict: dictionary of series whose training timeframes are altered
        holdout_periods: how many periods to forecast and compare against
        interval: resamples daily sales to a different interval, e.g. monthly ('M')

    Returns:
        - A list of forecasts for all 70 groups (a store/department combination)
        - A list of series for all 70 groups
    '''
    lst_of_forecasts = [[] for _ in range(holdout_periods)]
    series_lst = []
    for idx, val in enumerate(series_setup(store_ids, dept_ids)):
        if idx not in art_dict.keys():
            series_lst.append(resample_series(make_series(val[0], val[1]), interval))
        else:
            series_lst.append(art_dict[idx])
    for idx, series in enumerate(series_lst):
        print(idx)
        # groups 68/69 and short series are fit without the 12-month seasonal period
        if idx in (68, 69) or len(series) <= 24:
            train, test = train_test_split(series, train_size=len(series) - holdout_periods)
            model = pm.auto_arima(train, seasonal=True)
        else:
            train, test = train_test_split(series, train_size=len(series) - holdout_periods)
            model = pm.auto_arima(train, seasonal=True, m=12)
        forecasts = model.predict(test.shape[0]).tolist()
        for step, val in enumerate(forecasts):
            lst_of_forecasts[step].append(val)
    return lst_of_forecasts, series_lst
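
For reference, a toy sketch of the bookkeeping pattern used above: lst_of_forecasts is indexed by horizon step, so element h collects the h-step-ahead forecasts of every group. Standard-library Python only:

group_forecasts = [[10, 11], [20, 22], [30, 33]]   # three groups, two holdout periods

holdout_periods = 2
lst_of_forecasts = [[] for _ in range(holdout_periods)]
for path in group_forecasts:
    for step, val in enumerate(path):
        lst_of_forecasts[step].append(val)

print(lst_of_forecasts)   # [[10, 20, 30], [11, 22, 33]] -- one list per horizon step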
Example #3
def load_dataset(index: int) -> tuple:
    assert index < N_FILES, "Index out of range"
    
    path = "../datasets/"+FILES[index]
    time_series = pd.read_csv(path, header=None).values.reshape(-1)
    y_train, y_test = train_test_split(time_series, test_size=TEST_SIZE)
    
    return (y_train, y_test)
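
If the train_test_split here is pmdarima's rather than scikit-learn's, the split is chronological with no shuffling, which is exactly what a forecasting holdout needs. A quick self-contained check:

import numpy as np
from pmdarima.model_selection import train_test_split

y = np.arange(10)
train, test = train_test_split(y, test_size=3)
print(train)   # [0 1 2 3 4 5 6]
print(test)    # [7 8 9]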
Example #4
def forecast(us_counties: pd.DataFrame,
             log_metrics: bool,
             hp: dict,
             metric_threshold: int = 5):
    metrics = {}
    growth_rates = {}
    horizon = hp['horizon']
    metric_skip = 0

    for location in tqdm(us_counties['location'].unique(), unit=' counties'):
        if log_metrics:
            if metric_skip == metric_threshold:
                metric_skip = 0
            else:
                metric_skip += 1
                continue
        y = us_counties[us_counties.location ==
                        location].reset_index()['cases']
        if len(y) < horizon:
            continue
        model = AutoARIMA(**hp)
        with warnings.catch_warnings():
            # When there are no cases, the fit throws a warning
            warnings.filterwarnings("ignore")
            try:
                if log_metrics:
                    y, yv = train_test_split(y, test_size=horizon)
                model.fit(y)
                predictions = model.predict(n_periods=horizon)
            # Value error very rarely with weird/broken time series data
            except (ValueError, IndexError):
                continue
            if log_metrics:
                metrics[location] = np.mean(
                    np.abs(yv - predictions) /
                    (np.abs(yv) + np.abs(predictions)))
            last_forecast = predictions[-1]
            todays_cases = y[len(y) - 1]
            # Places with a very small number of cases are hard to predict
            case_handicap = min(1.0, 0.5 + (todays_cases / 120))
            growth = (last_forecast / todays_cases) * case_handicap
            growth_rates[location] = growth

    final_list = [
        i[0]
        for i in sorted(growth_rates.items(), key=lambda i: i[1], reverse=True)
    ]

    def rank_risk(row) -> int:
        case_growth = growth_rates.get(row.location)
        if not case_growth:
            return 1
        return round(max(0, (case_growth - 1) * 100))

    if not log_metrics:
        us_counties['outbreak_risk'] = us_counties.apply(rank_risk, axis=1)

    return us_counties, final_list, metrics
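
The per-county metric logged above is, up to the conventional factor of 2, the symmetric MAPE. A standalone sketch of the same expression, assuming equal-length numeric arrays:

import numpy as np

def smape_like(actual, predicted):
    # mean of |a - p| / (|a| + |p|), exactly as in the metrics dict above
    actual, predicted = np.asarray(actual), np.asarray(predicted)
    return np.mean(np.abs(actual - predicted) /
                   (np.abs(actual) + np.abs(predicted)))

print(smape_like([100, 110, 120], [90, 115, 125]))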
Example #5
def test_order_does_not_matter_with_date_transformer():
    train_y_dates, test_y_dates, train_X_dates, test_X_dates = \
        train_test_split(y_dates, X_dates, test_size=15)

    pipeline_a = Pipeline([
        ('fourier', FourierFeaturizer(m=3, prefix="FOURIER")),
        ('dates', DateFeaturizer(column_name="date", prefix="DATE")),
        ("arima",
         AutoARIMA(seasonal=False,
                   stepwise=True,
                   suppress_warnings=True,
                   maxiter=3,
                   error_action='ignore'))
    ]).fit(train_y_dates, train_X_dates)
    Xt_a = pipeline_a.transform(exogenous=test_X_dates)
    pred_a = pipeline_a.predict(exogenous=test_X_dates)

    pipeline_b = Pipeline([
        ('dates', DateFeaturizer(column_name="date", prefix="DATE")),
        ('fourier', FourierFeaturizer(m=3, prefix="FOURIER")),
        ("arima",
         AutoARIMA(seasonal=False,
                   stepwise=True,
                   suppress_warnings=True,
                   maxiter=3,
                   error_action='ignore'))
    ]).fit(train_y_dates, train_X_dates)
    Xt_b = pipeline_b.transform(exogenous=test_X_dates)
    pred_b = pipeline_b.predict(exogenous=test_X_dates)

    # feature column order in A should differ from that in B
    assert pipeline_a.x_feats_[0].startswith("FOURIER")
    assert pipeline_a.x_feats_[-1].startswith("DATE")

    assert pipeline_b.x_feats_[0].startswith("DATE")
    assert pipeline_b.x_feats_[-1].startswith("FOURIER")

    # columns should be identical once ordered appropriately
    assert Xt_a.equals(Xt_b[pipeline_a.x_feats_])

    # forecasts should be identical
    assert_array_almost_equal(pred_a, pred_b, decimal=3)
Example #6
def arima_pred(actual, pred_num):
    '''
    --------
    Description:
    actual: the true values over both the train and test spans
    pred_num: the length of the test data
    --------
    Example:
    fit, pred = arima_pred(series, 28)
    '''
    ## data split
    train, test = model_selection.train_test_split(actual, test_size=pred_num)
    ## train model
    arima_model = pm.auto_arima(train,
                                trace=False,
                                stepwise=True,
                                suppress_warnings=True,
                                error_action='ignore')
    ## predict
    pred = arima_model.predict(n_periods=pred_num)
    return [arima_model.predict_in_sample(), pred]
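
A hedged usage sketch for arima_pred, assuming its own imports (pmdarima as pm and pmdarima.model_selection) are in scope; wineind stands in for a real series:

import pmdarima as pm

series = pm.datasets.load_wineind()
fit, pred = arima_pred(series, 28)
print(len(fit), len(pred))   # in-sample fit over the train span, 28 holdout forecasts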
Example #7
def main():
    cfparser = configparser.ConfigParser()
    cfparser.read('config.ini')
    database = cfparser['Server']['database']

    try:
        con = sqlite3.connect(database)
        print('Connected to SQLite')
    except sqlite3.Error as e:
        print('database connection error: ' + str(e))
        sys.exit(-1)

    with con:
        cur = con.cursor()

        cur.execute("SELECT fee FROM gas_fees")
        data = cur.fetchall()

    print('Read data -> %s rows' % (len(data), ))

    print('Sample: %s' % (data[0][0], ))

    y = np.asarray(data[-100:])  # total number of samples to take
    print(y)

    train, test = train_test_split(
        y, train_size=50)  # total number of samples / 2

    # Fit your model
    model = pm.auto_arima(train, seasonal=True, m=7)  # Seasonal = True??

    # make your forecasts
    forecasts = model.predict(test.shape[0])  # predict N steps into the future

    # Visualize the forecasts (blue=train, red=whole dataset, green=forecasts)
    x = np.arange(y.shape[0])
    plt.plot(x, y, c='red')
    plt.plot(x[:50], train, c='blue')  # total number of samples / 2
    plt.plot(x[50:], forecasts, c='green')  # same as above
    plt.show()
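
Hard-coding the split at 50 samples only works for exactly 100 rows. A hedged variant that derives the split from the series length (synthetic data stands in for the gas_fees table):

import numpy as np
import pmdarima as pm
from pmdarima.model_selection import train_test_split
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
y = rng.gamma(2.0, 10.0, size=100)   # synthetic stand-in for the fee series

split = len(y) // 2                  # half train, half test, for any series length
train, test = train_test_split(y, train_size=split)

model = pm.auto_arima(train, seasonal=True, m=7)
forecasts = model.predict(test.shape[0])

x = np.arange(len(y))
plt.plot(x, y, c='red')
plt.plot(x[:split], train, c='blue')
plt.plot(x[split:], forecasts, c='green')
plt.show()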
Example #8
# print day and pm 2.5 values
print(data.head())

# group df by day

# calculate mean value of pm2.5  for every given day
print("MEAN pm25 values by day\n", data.pm25)
data.plot()
plt.title('Initial mean values for November')
plt.show()
# begin training
X = data.values
print("length of input values", len(X))

y_train, y_test = train_test_split(X, test_size=0.3)

print("length of train values", len(y_train))

print("length of test values", len(y_test))
predictions = []

model_ar = AR(y_train)
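# note: statsmodels' AR class is deprecated in recent versions;
# statsmodels.tsa.ar_model.AutoReg is the suggested replacement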
model_ar_fit = model_ar.fit()

predictions = model_ar_fit.predict(start=len(y_train), end=len(data))
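# note: predict(start, end) is end-inclusive, so end=len(data) returns one more
# value than there are test points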
print("length of predictions", len(predictions))

# TODO try all possibilities of (p,d,q)
model_arima = ARIMA(y_train, order=(1, 0, 4))
model_arima_fit = model_arima.fit()
Example #9
def model_train(test=False):

    ## subset the data to enable faster unittests

    #create dataframe for temporary capture of training results
    df_res = pd.DataFrame(columns=["country", "rmse", "mape"])

    df = fetch_data()
    ts = df.groupby("invoice_date")["revenue"].sum().rename("sales")
    y = ts.resample('MS').mean()

    ## start timer for runtime
    time_start = time.time()
    model = pm.auto_arima(y,
                          start_p=1,
                          start_q=1,
                          test='adf',
                          max_p=3,
                          max_q=3,
                          m=12,
                          start_P=0,
                          seasonal=True,
                          d=None,
                          D=1,
                          trace=True,
                          error_action='ignore',
                          suppress_warnings=True,
                          stepwise=True)
    if test:
        saved_model = "sales-arima-{}.joblib".format(
            re.sub("\.", "_", str(MODEL_VERSION)))
        train, test = model_selection.train_test_split(y, test_size=0.1)
        result = model.fit(train)
        joblib.dump(model, os.path.join(MODEL_DIR, saved_model))
        df_res.loc[0] = ["all", "0.2", "0.3"]
        m, s = divmod(time.time() - time_start, 60)
        h, m = divmod(m, 60)
        runtime = "%03d:%02d:%02d" % (h, m, s)
        update_train_log(y.shape[0], {
            'country': "all",
            'rmse': "0.2",
            'mape': "0.3"
        },
                         runtime,
                         MODEL_VERSION,
                         MODEL_VERSION_NOTE,
                         test=True)

    else:
        country = df.index.unique().tolist()
        c_listlen = len(country)
        for i in range(c_listlen):
            cntry = country[i]
            #format the country variable
            # (note: isspace() is True only for all-whitespace strings; a check
            #  for embedded spaces was probably intended)
            if cntry.isspace():
                str_country = re.sub(r"\s+", '-', cntry).lower()
            else:
                str_country = cntry.lower()

            #set country model
            saved_model = str_country + "-" + "sales-arima-{}.joblib".format(
                re.sub(r"\.", "_", str(MODEL_VERSION)))
            #filter data to train based on country
            y = filter_cntry_data(df, cntry)

            # Split data into train / test sets
            train, test = model_selection.train_test_split(y, test_size=0.1)

            #smodel.summary()
            result = model.fit(train)

            #print Autom arima diagnostics
            #results.plot_diagnostics(figsize=(16, 8))
            #plt.show()

            joblib.dump(model, os.path.join(MODEL_DIR, saved_model))
            #result.plot_diagnostics(figsize=(15,12))

            #print( result.summary().tables[1])

            #print("\n Proceed with Auto Arima due to better AIC value\n")
            #forecast

            forecast = model.predict(n_periods=len(test))
            forecast = pd.DataFrame(forecast,
                                    index=test.index,
                                    columns=['predictions'])

            #hide the plots as this will be called via scripts
            #plot
            '''
            plt.plot(train,label='Train')
            plt.plot(test, label='Valid')
            plt.plot(forecast, label ='Prediction')
            plt.legend()
            plt.show()


            '''

            rms = round(sqrt(mean_squared_error(test, forecast)), 2)
            #print("Arima rms \n:{}",model_train ())

            mape_result = round(mean_absolute_percentage_error(test, forecast))

            df_res.loc[i] = [cntry, rms, mape_result]
            m, s = divmod(time.time() - time_start, 60)
            h, m = divmod(m, 60)
            runtime = "%03d:%02d:%02d" % (h, m, s)

            update_train_log(y.shape[0], {
                'country': cntry,
                'rmse': rms,
                'mape': mape_result
            },
                             runtime,
                             MODEL_VERSION,
                             MODEL_VERSION_NOTE,
                             test=False)

    return dict(df_res.to_dict())
Example #10
def draw_(province, isDaily):
    # train the model
    model = arima.AutoARIMA(
        start_p=0,
        max_p=4,
        d=None,
        start_q=0,
        max_q=1,
        start_P=0,
        max_P=1,
        D=None,
        start_Q=0,
        max_Q=1,
        m=7,
        seasonal=True,
        test="kpss",
        trace=True,
        error_action="ignore",
        suppress_warnings=True,
        stepwise=True,
    )
    if isDaily:
        data = df[province].diff().dropna()
        model.fit(data)
    else:
        data = df[province]
        model.fit(data)

    # validate the model
    train, test = train_test_split(data, train_size=0.8)
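    # note: the model was fit on the full series above, so this check is
    # in-sample rather than a true out-of-sample validation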
    pred_test = model.predict_in_sample(start=train.shape[0], dynamic=False)
    validating = pd.Series(pred_test, index=test.index)
    r2 = r2_score(test, pred_test)

    # forecast
    pred, pred_ci = model.predict(n_periods=14, return_conf_int=True)
    idx = pd.date_range(data.index.max() + pd.Timedelta("1D"),
                        periods=14,
                        freq="D")
    forecasting = pd.Series(pred, index=idx)

    # plot the results
    plt.figure(figsize=(24, 6))

    plt.plot(data.index, data, label="Actual Value", color="blue")
    plt.plot(validating.index, validating, label="Check Value", color="orange")
    plt.plot(forecasting.index,
             forecasting,
             label="Predict Value",
             color="red")
    # plt.fill_between(forecasting.index, pred_ci[:, 0], pred_ci[:, 1], color="black", alpha=.25)

    plt.legend()
    plt.ticklabel_format(style="plain", axis="y")
    # plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
    if isDaily:
        plt.title(
            f"Daily Confirmed Cases Forecasting - {province}\nARIMA {model.model_.order}x{model.model_.seasonal_order} (R2 = {r2:.6f})"
        )
        plt.savefig(
            os.path.join("figures",
                         f"covid-{adjust_name(province)}-daily.svg"),
            bbox_inches="tight",
        )
        plt.close()
    else:
        plt.title(
            f"Accumulative Confirmed Cases Forecasting - {province}\nARIMA {model.model_.order}x{model.model_.seasonal_order} (R2 = {r2:.6f})"
        )
        plt.savefig(
            os.path.join("figures", f"covid-{adjust_name(province)}.svg"),
            bbox_inches="tight",
        )
        plt.close()
Example #11
def LSTM_uni_train(
    raw_data,
    use_date_max,
    col,
    com_col,
    scale,
    n_steps,
    n_features,
    test_h,
    model,
    BATCH_SIZE=1,
    BUFFER_SIZE=100,
    EVALUATION_INTERVAL=100,
    EPOCHS=1000,
    optimizer='adam',
    loss='mse',
    metrics=['mse'],
    saveroot='C:/Users/KIMYEONKYOUNG/Desktop/2021 AI 빅데이터팀/메탈 수요예측/code/model_회사별/'
):  #scaleoption : None,standard,minmax,robust # n_features = 1 (univariate)

    #raw data -> time series data
    ts_data = data2tsdata(raw_data, col, use_date_max)

    #scaling
    if scale is None:
        data_t = ts_data   # note: no scaler is created here, so the inverse_transform below assumes scaling was used
    elif scale == 'standard':
        (data_t, scal) = standardscale(ts_data)
    elif scale == 'minmax':
        (data_t, scal) = minmaxscale(ts_data)
    elif scale == 'robust':
        (data_t, scal) = robustscale(ts_data)

    #get train data
    # (this three-argument train_test_split appears to be a project-local helper,
    #  not pmdarima's two-way splitter used further down)
    (df_train, df_test) = train_test_split(data_t, test_h, n_steps)
    # [train] dataframe to ndarray
    nd_train = np.asarray(df_train)
    nd_train = nd_train.reshape(len(df_train), )
    nd_test = np.asarray(df_test)
    nd_test = nd_test.reshape(len(df_test), )

    (train_x, train_y) = split_sequence(nd_train, n_steps)
    train_x = train_x.reshape(train_x.shape[0], n_steps, n_features)

    #train data -> train & val datasets
    train_univariate = tf.data.Dataset.from_tensor_slices((train_x, train_y))
    train_univariate = train_univariate.cache().shuffle(BUFFER_SIZE).batch(
        BATCH_SIZE).repeat()
    val_univariate = tf.data.Dataset.from_tensor_slices((train_x, train_y))
    val_univariate = val_univariate.batch(BATCH_SIZE).repeat()

    # Build EarlyStopping
    path_checkpoint = "lstm_model_checkpoint_try.h5"
    es_callback = tf.keras.callbacks.EarlyStopping(
        monitor="loss", min_delta=0, patience=100, mode='auto'
    )  # mode=auto: for loss, stop after ~100 epochs without a new minimum; for acc, without a new maximum
    modelckpt_callback = tf.keras.callbacks.ModelCheckpoint(
        monitor="loss",
        filepath=path_checkpoint,
        verbose=1,
        save_weights_only=True,
        save_best_only=True,
    )

    model.compile(optimizer, loss, metrics)

    #train model
    history = model.fit(train_univariate,
                        epochs=EPOCHS,
                        validation_data=val_univariate,
                        steps_per_epoch=EVALUATION_INTERVAL,
                        validation_steps=1,
                        verbose=1,
                        callbacks=[es_callback, modelckpt_callback])

    #save model
    file_root = col.replace('_실적', "")
    model.save(saveroot + '/' + str(file_root) + '/' + str(use_date_max) +
               '_' + 'lstm_model_checkpoint_' + str(scale) + '_' + str(col) +
               '.h5')

    #graph
    visualize_loss(history, "Training & Validation Loss", saveroot=saveroot)

    #########################################################################################
    #predict

    if test_h == 1:
        xtt = nd_test.reshape(1, n_steps, n_features)
        #predict
        yhat = model.predict(xtt)
        prediction = pd.DataFrame(yhat)
        prediction.columns = ['yhat']
        prediction.index = df_test[4:].index   # (the hard-coded 4 presumably assumes n_steps == 4)
    else:
        (xt, yt) = split_sequence(nd_test, n_steps)
        xtt = xt.reshape(xt.shape[0], n_steps, n_features)
        #predict
        yhat = model.predict(xtt)
        prediction = pd.DataFrame(yhat)
        prediction.columns = ['yhat']
        prediction.index = df_test[n_steps:].index

    #inverse_scale
    prediction['prediction'] = scal.inverse_transform(prediction)

    train_g, test_g = model_selection.train_test_split(
        ts_data, train_size=len(ts_data) - test_h)
    # output dataframe (shift, actuals, pred)
    outpu = pd.merge(test_g,
                     prediction['prediction'],
                     left_index=True,
                     right_index=True)
    com_ts_data = data2tsdata(raw_data, com_col, use_date_max)
    output = pd.merge(outpu, com_ts_data, left_index=True, right_index=True)

    #timeseries graph(train,test,predict)
    predict_graph(train_g,
                  test_g,
                  prediction['prediction'],
                  output[com_col],
                  saveroot=saveroot)

    # save the output to Excel (model_root, test_h, test/train loss, timeseries graph, output df)
    wb = Workbook()
    result_df = result_input(wb,
                             model_root=saveroot + 'lstm_model_checkpoint_' +
                             str(scale) + '_' + str(col) + '.h5',
                             test_h=test_h,
                             col=col,
                             com_col=com_col,
                             output=output,
                             pre_graph=saveroot + '_timeseries.png',
                             loss_graph=saveroot + 'lossgraph.png')
    wb.save(saveroot + '/' + str(file_root) + '/' + use_date_max + '.xlsx')

    return result_df
Example #12
data = pd.DataFrame(grp_date.mean())
print("MEAN pm25 values by day\n", data.pm25)
data.plot()
plt.title('Initial mean values for November')
plt.show()

# begin training
X = data.values
print("length of input values", len(X))
# ~70% of data->training
# train = X[0:21]  # 21 data as train

y_train, y_test = train_test_split(
    X,
    test_size=0.3,
    # shuffle=False)
)
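# note: if this train_test_split is scikit-learn's, it shuffles by default,
# which breaks a chronological holdout; pmdarima's version never shuffles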
print("y train", y_train)
print("y test", y_test)


print("length of train values", len(y_train))
# print("length of train values", len(train))
# ~30% to test, 9 data as test
# test = X[21:]
# print("length of test values", len(test))
print("length of test values", len(y_test))
predictions = []
Example #13
            if use_diff:
                # turn the target series into a differenced series
                price = price.diff(1)[train_3m.shape[0] - past - pred + 1:]
            else:
                price = price[train_3m.shape[0] - past - pred + 1:]

            prefix = valfiles_oi[ind].split(
                '_')[0] + '-validation-{}d-'.format(pred)
            # sliding window
            for i in range(past + pred - 1, len(price)):
                print(
                    '=========== now training on the {} dataset; target node is {} =================='.format(
                        valfiles_oi[ind].split('_')[0],
                        val_3m.index[(i - (past + pred) + 1)]))
                sample = price[(i - (past + pred) + 1):(i + 1)]
                train, test = train_test_split(sample, train_size=past)
                pipeline = Pipeline([
                    # ('boxcox', BoxCoxEndogTransformer(lmbda2=1e-6)),  # lmbda2 avoids negative values
                    ('arima',
                     pm.AutoARIMA(seasonal=True,
                                  m=1,
                                  suppress_warnings=True,
                                  trace=True,
                                  error_action="ignore"))
                ])

                pipeline.fit(train)
                pred_result = pipeline.predict(pred)
                print('pred_result is : ', pred_result)
                print(
                    '==================== one training round finished =============================\n\n\n')
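
A stripped-down sketch of the windowing arithmetic in the loop above, on a plain list and with arbitrary past=5, pred=2:

past, pred = 5, 2
price = list(range(20))   # stand-in for the price series

for i in range(past + pred - 1, len(price)):
    sample = price[i - (past + pred) + 1:i + 1]    # window of past + pred points
    train, test = sample[:past], sample[past:]     # fit on `past`, score on `pred`
    assert len(train) == past and len(test) == pred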
Example #14
                inplace=True)
series = model_data.iloc[:, 0]

###############################################################################
## model 1: ARIMA
series_diff = series.diff().dropna()
plt.plot(series_diff)
## ADF stationarity test (p-value 6.575445276313532e-27: stationary)
sm.tsa.stattools.adfuller(series_diff)

## Ljung-Box white noise test (p-value close to 0: not white noise)
plt.plot(lb_test(series_diff, lags=None, boxpierce=False)[1])
plt.show()

## data split
train, test = model_selection.train_test_split(series, test_size=28)
## train model
arima_model = pm.auto_arima(train,
                            trace=True,
                            stepwise=True,
                            suppress_warnings=True,
                            error_action='ignore')
arima_model.summary()

## model checking (all p-values > 0.05: residuals are white noise, model is adequate)
plt.plot(lb_test(arima_model.resid(), lags=None, boxpierce=False)[1])
plt.axhline(y=0.05, c="r", ls="--", lw=2)
plt.show()

## predict
preds, conf_int = arima_model.predict(n_periods=test.shape[0],
                                      return_conf_int=True)
Example #15
def model_train():

    filenm = ""
    if request.method == 'POST':
        f = request.files['file']
        if f.filename != '':
            f.save(os.path.join(DATA_DIR, f.filename))
        filenm = f.filename

    print("\nFile name is:\n{}".format(filenm))
    data = fetch_data(filenm)
    
    #create dataframe for temporary capture of training results
    df_res = pd.DataFrame(columns=["country", "rmse", "mape"])
    ts = data.groupby("invoice_date")["revenue"].sum().rename("sales")
    y_all = ts.resample('MS').mean()

    if filenm=="test.txt":
        time_start = time.time()
        model = pm.auto_arima(y_all, start_p=1, start_q=1,
                                 max_p=3, max_q=3, m=12,
                                 start_P=0, seasonal=True,
                                 d=None, D=1, trace=True,
                                 error_action='ignore',  
                                 suppress_warnings=True, 
                                 stepwise=True)

        saved_model ="sales-arima-{}.joblib".format(re.sub("\.", "_", str(MODEL_VERSION)))
        #train, test = model_selection.train_test_split(y_all, test_size=0.1)
        result = model.fit(y_all)
        joblib.dump(model, os.path.join(MODEL_DIR, saved_model))
        df_res.loc[0] = ["all","0.2", "0.3"] 
        m, s = divmod(time.time()-time_start, 60)
        h, m = divmod(m, 60)
        runtime = "%03d:%02d:%02d"%(h, m, s)
        train_shape = str(y_all.shape[0]) + " x 1"
        update_train_log(train_shape, {'country': "all", 'rmse': "0.2", 'mape': "0.3"}, runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=True)
        
    else:
        
        ## input checking

        #get the number of months to forecast
        #select the country model

        try:
            #value = int(data['value'])
            country = data.index.unique().tolist()
        except (KeyError, TypeError, ValueError):
            raise JsonError(description='Invalid value')

        #enforce datetime astype on column invoice_date
        #data["invoice_date"] = pd.to_datetime(data["invoice_date"])

        ## start timer for runtime
        time_start = time.time()
        c_listlen = len(country)



        # Seasonal - fit stepwise auto-ARIMA
        # with ARIMA, due to the size of the data, we shall not use a train split
        # having checked the ARIMA fit model, all countries report the same hyperparameters;
        # training will however be done per country
        model = pm.auto_arima(y_all, start_p=1, start_q=1,
                                 max_p=3, max_q=3, m=12,
                                 start_P=0, seasonal=True,
                                 d=None, D=1, trace=True,
                                 error_action='ignore',  
                                 suppress_warnings=True, 
                                 stepwise=True)

        for i in range(c_listlen):
            cntry = country[i]
            #format the country variable
            # (note: isspace() is True only for all-whitespace strings; a check
            #  for embedded spaces was probably intended)
            if cntry.isspace():
                str_country = re.sub(r"\s+", '-', cntry).lower()
            else:
                str_country = cntry.lower()

            #set country model
            saved_model = str_country + "-" + "sales-arima-{}.joblib".format(re.sub(r"\.", "_", str(MODEL_VERSION)))
            #filter data to train based on country
            y = filter_cntry_data(data, cntry)

            # Split data into train / test sets
            train, test = model_selection.train_test_split(y, test_size=0.1)

            #smodel.summary()
            result = model.fit(train)

            #print auto-ARIMA diagnostics
            #results.plot_diagnostics(figsize=(16, 8))
            #plt.show()

            joblib.dump(model, os.path.join(MODEL_DIR, saved_model))
            #result.plot_diagnostics(figsize=(15,12))

            #print(result.summary().tables[1])

            #print("\n Proceed with Auto Arima due to better AIC value\n")
            #forecast

            forecast = model.predict(n_periods=len(test))
            forecast = pd.DataFrame(forecast, index=test.index, columns=['predictions'])

            #hide the plots as this will be called via scripts
            #plot
            '''
            plt.plot(train,label='Train')
            plt.plot(test, label='Valid')
            plt.plot(forecast, label ='Prediction')
            plt.legend()
            plt.show()


            '''

            rms = round(sqrt(mean_squared_error(test, forecast)), 2)
            #print("Arima rms \n:{}", model_train())

            mape_result = round(mean_absolute_percentage_error(test, forecast))

            df_res.loc[i] = [cntry, rms, mape_result]
            m, s = divmod(time.time() - time_start, 60)
            h, m = divmod(m, 60)
            runtime = "%03d:%02d:%02d" % (h, m, s)
            train_shape = str(y_all.shape[0]) + " x 1"
            update_train_log(train_shape, {'country': cntry, 'rmse': rms, 'mape': mape_result}, runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=False)
    #return json_response(rmse = rms, mape=mape_result)
    return dict(df_res.to_dict())
Example #16
from pmdarima.compat.pytest import pytest_error_str
from pmdarima.model_selection import train_test_split
from pmdarima.pipeline import Pipeline, _warn_for_deprecated
from pmdarima.preprocessing import BoxCoxEndogTransformer, FourierFeaturizer
from pmdarima.arima import ARIMA, AutoARIMA
from pmdarima.datasets import load_wineind
import numpy as np

import pytest

rs = np.random.RandomState(42)
wineind = load_wineind()
xreg = rs.rand(wineind.shape[0], 2)

train, test, x_train, x_test = train_test_split(wineind, xreg, train_size=125)


class TestIllegal:
    def test_non_unique_names(self):
        # Will fail since the same name is repeated twice
        with pytest.raises(ValueError) as ve:
            Pipeline([("stage", BoxCoxEndogTransformer()),
                      ("stage", ARIMA(order=(0, 0, 0)))])

        assert "not unique" in pytest_error_str(ve)

    def test_names_in_params(self):
        # Will fail because 'steps' is a param of Pipeline
        with pytest.raises(ValueError) as ve:
            Pipeline([("steps", BoxCoxEndogTransformer()),
import pmdarima as pm
from pmdarima import arima
from pmdarima import model_selection
from pmdarima import pipeline
from pmdarima import preprocessing
from pmdarima.datasets._base import load_date_example

import numpy as np
from matplotlib import pyplot as plt

print("pmdarima version: %s" % pm.__version__)

# Load the data and split it into separate pieces
y, X = load_date_example()
y_train, y_test, X_train, X_test = \
    model_selection.train_test_split(y, X, test_size=20)

# We can examine traits about the time series:
pm.tsdisplay(y_train, lag_max=10)

# We can see the ACF increases and decreases rather rapidly, which means we may
# need some differencing. There also does not appear to be an obvious seasonal
# trend.
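# ndiffs estimates the smallest number of differences required for stationarity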
n_diffs = arima.ndiffs(y_train, max_d=5)

# Here's what the featurizer will create for us:
date_feat = preprocessing.DateFeaturizer(
    column_name="date",  # the name of the date feature in the exog matrix
    with_day_of_week=True,
    with_day_of_month=True)
Example #18
import pmdarima as pm
from pmdarima.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Load/split your data
y = pm.datasets.load_wineind()
train, test = train_test_split(y, train_size=150)

# Fit your model
model = pm.auto_arima(train, seasonal=True, m=12)

# make your forecasts
forecasts = model.predict(test.shape[0])  # predict N steps into the future

# Visualize the forecasts (blue=train, green=forecasts)
x = np.arange(y.shape[0])
plt.plot(x[:150], train, c='blue')
plt.plot(x[150:], forecasts, c='green')
plt.show()
Example #19
from datetime import datetime
from pathlib import Path

import pandas as pd
from pmdarima import auto_arima
from pmdarima.arima import ndiffs
from pmdarima.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

df = pd.concat(map(pd.read_json,
                   Path("data").glob("forecast_*.json")),
               ignore_index=True)
df["time"] = df.apply(lambda r: datetime.fromtimestamp(r["time"]), axis=1)
df = df.sort_values(by=["time"])

temperature = df["temperature"]
temperature = temperature.fillna(temperature.mean())

train, test = train_test_split(temperature,
                               train_size=temperature.shape[0] - 365)

print(f"training size: {train.shape[0]}")
print(f"testing size: {test.shape[0]}")

# %%
kpss_diffs = ndiffs(train, alpha=0.05, test="kpss", max_d=6)
adf_diffs = ndiffs(train, alpha=0.05, test="adf", max_d=6)
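# take the more conservative (larger) of the two differencing estimates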
n_diffs = max(adf_diffs, kpss_diffs)

print(f"d: {n_diffs}")

# %%
model = auto_arima(
    train,
    d=n_diffs,
Example #20
def test_train_test_split():
    tr, te = train_test_split(y, test_size=10)
    assert te.shape[0] == 10
    assert_array_equal(y, np.concatenate([tr, te]))
Example #21
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from pmdarima.pipeline import Pipeline
from pmdarima.preprocessing import BoxCoxEndogTransformer
import pickle

# Load/split your data
y = pm.datasets.load_sunspots()
train, test = train_test_split(y, train_size=2700)

# Define and fit your pipeline
pipeline = Pipeline([
    ('boxcox',
     BoxCoxEndogTransformer(lmbda2=1e-6)),  # lmbda2 avoids negative values
    ('arima',
     pm.AutoARIMA(seasonal=True, m=12, suppress_warnings=True, trace=True))
])

pipeline.fit(train)

# Serialize your model just like you would in scikit:
with open('model.pkl', 'wb') as pkl:
    pickle.dump(pipeline, pkl)

# Load it and make predictions seamlessly:
with open('model.pkl', 'rb') as pkl:
    mod = pickle.load(pkl)
    print(mod.predict(15))
# [25.20580375 25.05573898 24.4263037  23.56766793 22.67463049 21.82231043
# 21.04061069 20.33693017 19.70906027 19.1509862  18.6555793  18.21577243
# 17.8250318  17.47750614 17.16803394]
Example #22
def sarimax_pmdarima(timeseries, train_length, m):
    """
    Previsioni con il modello SARIMAX e selezione automatica degli ordini

    Parameters
    ----------
    timeseries : Series
        la serie temporale.
    train_length : int
        la lunghezza del set di train (in rapporto alla serie completa).
    m : int
        il periodo stagionale.

    Returns
    -------
    tuple
        (order, seasonal_order)

    """
    # creo i set di train e di test
    train, test = model_selection.train_test_split(timeseries,
                                                   train_size=train_length)

    # choose and fit the model to the data
    model = pm.auto_arima(train,
                          seasonal=True,
                          m=m,
                          suppress_warnings=True,
                          trace=True,
                          start_p=1,
                          start_q=1,
                          max_p=2,
                          max_q=2,
                          start_P=1,
                          start_Q=1,
                          max_P=2,
                          max_Q=2)

    # print the model parameters
    print(model.summary())

    # in-sample predictions
    # http://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.ARIMA.html#pmdarima.arima.ARIMA.predict_in_sample
    preds = model.predict_in_sample(end=len(train) - 1)
    sarimax_dates = pd.date_range(start=timeseries.index[0],
                                  end=timeseries.index[len(train) - 1],
                                  freq='D')
    sarimax_ts = pd.Series(preds, index=sarimax_dates)

    # out-of-sample predictions
    # http://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.ARIMA.html#pmdarima.arima.ARIMA.predict
    fcast, conf_int = model.predict(n_periods=test.shape[0],
                                    return_conf_int=True)
    fcast_dates = pd.date_range(start=timeseries.index[len(train)],
                                periods=len(timeseries) - len(train),
                                freq='D')
    ts_fcast = pd.Series(fcast, index=fcast_dates)
    ts_ci_min = pd.Series(conf_int[:, 0], index=fcast_dates)
    ts_ci_max = pd.Series(conf_int[:, 1], index=fcast_dates)

    print('Test RMSE: %.4f' % np.sqrt(mean_squared_error(test, fcast)))

    # plot of the fitted model
    plt.figure(figsize=(40, 20), dpi=80)
    plt.title('SARIMAX{}x{} model for {}'.format(model.order,
                                                 model.seasonal_order,
                                                 timeseries.name))
    ax = train.plot(label='Train set', color='black')
    sarimax_ts.plot(ax=ax, label='In-sample predictions', color='green')
    plt.legend()
    plt.show()

    # plot of the forecasts
    plt.figure(figsize=(40, 20), dpi=80)
    plt.title('Forecasting with SARIMAX{}x{} for {}'.format(
        model.order, model.seasonal_order, timeseries.name))
    ax = timeseries.plot(label='Observed', color='black')
    ts_fcast.plot(ax=ax,
                  label='Out-of-sample forecasts',
                  alpha=.7,
                  color='red')
    ax.fill_between(fcast_dates, ts_ci_min, ts_ci_max, color='k', alpha=.2)
    plt.legend()
    plt.show()

    # error metrics
    errore = ts_fcast - timeseries
    errore.dropna(inplace=True)
    print('MSE=%.4f' % (errore**2).mean())
    print('MAE=%.4f' % (abs(errore)).mean())

    return (model.order, model.seasonal_order)
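
A hedged usage sketch for sarimax_pmdarima, building a daily Series from pmdarima's wineind data purely so the call is self-contained (the original presumably passes real daily data; the date range here is arbitrary):

import pandas as pd
import pmdarima as pm

y = pm.datasets.load_wineind()
ts = pd.Series(y,
               index=pd.date_range("2000-01-01", periods=len(y), freq="D"),
               name="wineind-as-daily")

order, seasonal_order = sarimax_pmdarima(ts, train_length=0.8, m=7)
print(order, seasonal_order)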
Example #23
plt.plot(np.linspace(0, 9, 10), cresc_p2['Preco'])

# %%
from pmdarima.datasets import load_lynx

# %%
dado_lynx = load_lynx()

# %%
dado_lynx.shape

# %%
from pmdarima import model_selection

# %%
treino, teste = model_selection.train_test_split(dado_lynx, train_size=100)

# %%
teste1 = teste[:10]
teste2 = teste[10:]

# %%
modelo_arima = auto_arima(treino,
                          start_p=1,
                          start_q=1,
                          d=0,
                          max_p=5,
                          max_q=5,
                          suppress_warnings=True,
                          stepwise=True,
                          error_action='ignore')
Example #24
"""
Created on Thu May  7 02:53:11 2020

@author: felip
"""

import pmdarima as pm
from pmdarima import model_selection
import numpy as np
from matplotlib import pyplot as plt

# #############################################################################
# Load the data and split it into separate pieces
# Australian total wine sales by wine makers in bottles
data = pm.datasets.load_wineind()
train, test = model_selection.train_test_split(data, train_size=150)

# Fit a simple auto_arima model
arima = pm.auto_arima(train,
                      error_action='ignore',
                      trace=True,
                      suppress_warnings=True,
                      maxiter=10,
                      seasonal=True,
                      m=12)

# #############################################################################
# Plot actual test vs. forecasts:
x = np.arange(test.shape[0])
plt.scatter(x, test, marker='x')
plt.plot(x, arima.predict(n_periods=test.shape[0]))
Example #25
print(__doc__)

# Author: Taylor Smith <*****@*****.**>

import pmdarima as pm
from pmdarima import model_selection
import joblib  # for persistence
import os

# #############################################################################
# Load the data and split it into separate pieces
y = pm.datasets.load_wineind()
train, test = model_selection.train_test_split(y, train_size=125)

# Fit an ARIMA
arima = pm.ARIMA(order=(1, 1, 2), seasonal_order=(0, 1, 1, 12))
arima.fit(y)

# #############################################################################
# Persist a model and create predictions after re-loading it
pickle_tgt = "arima.pkl"
try:
    # Pickle it
    joblib.dump(arima, pickle_tgt, compress=3)

    # Load the model up, create predictions
    arima_loaded = joblib.load(pickle_tgt)
    preds = arima_loaded.predict(n_periods=test.shape[0])