Example #1
plot_acf(D_TS1_1[:100],ax=ax1).show()     # ACF of the first-order differenced series
plot_pacf(D_TS1_1[:100],ax=ax2).show()    # PACF of the first-order differenced series

print(u'ADF test result for the differenced series:', ADF(D_TS1_1))

# White-noise test
statics, p_value = acorr_ljungbox(D_TS1_1, lags=1)
print(u'White-noise (Ljung-Box) test result for the differenced series: Q-statistic = %s    p-value = %s' % (statics, p_value))  # returns the test statistic and p-value

#ARIMA (autoregressive integrated moving average) model
#Order selection

p = 1
q = 1
arima1_1 = ARIMA(TS1_1,(1,1,1)).fit()  # fit an ARIMA(1,1,1) model (ARIMA(0,1,1) is another candidate)
arima1_1.summary()

# Forecast values, forecast standard errors, forecast confidence intervals
pre_a1_1, pre_b1_1, pre_c1_1 = arima1_1.forecast(365)
plt.plot(pre_a1_1)
pre_b1_1

Date2 = pd.date_range('20020401','20110708')
# pre_a1_1_cumsum = pre_a1_1.cumsum()

pre_D_TS1_1 = arima1_1.fittedvalues
plt.plot(D_TS1_1)
plt.plot(pre_D_TS1_1, color='red')

pre_D_TS1_1_cumsum = pre_D_TS1_1.cumsum()
pre_TS1_1 = pd.Series(TS1_1, index = Date2)
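# Hedged completion (an assumption, not part of the original snippet): add the
# cumulative sum of the fitted differences back onto the first value of TS1_1 to
# recover fitted values on the original scale (assumes TS1_1 is a pandas Series).
pre_TS1_1_fitted = pre_D_TS1_1_cumsum + TS1_1.iloc[0]
plt.plot(TS1_1)
plt.plot(pre_TS1_1_fitted, color='red')
plt.show()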
# %%
fig, (ax5, ax6) = plt.subplots(1,2, figsize=(16, 4))

plot_acf(airpassengers_season_diff_train.dropna(), ax5)
ax5.set_title('ACF of differenced season series')

plot_pacf(airpassengers_season_diff_train.dropna(), ax6)
ax6.set_title('PACF of differenced season series')

plt.show()


# %%
#  Find d parameter for ARIMA
find_d = ARIMA(airpassengers_season_diff_train.dropna(), order=(0,0,0)).fit()
find_d.summary()


# %%
arima = ARIMA(airpassengers_season_diff_train.dropna(), order=(1,0,1)).fit()
arima.summary()

# %% [markdown]
# The values under *coef* are the weights of the respective terms. 
# 
# AIC and BIC indicate how well the model fits and can be used to compare it with other models. The lower the AIC, the better the model.
# 
# 
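# %%
# A hedged sketch (not part of the original notebook): compare AIC/BIC across a few
# candidate orders on the differenced training series; ARIMA is assumed to be
# imported as in the cells above.
for candidate_order in [(1, 0, 0), (0, 0, 1), (1, 0, 1), (2, 0, 1)]:
    candidate = ARIMA(airpassengers_season_diff_train.dropna(), order=candidate_order).fit()
    print(candidate_order, 'AIC:', candidate.aic, 'BIC:', candidate.bic)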
# %% [markdown]
# ## Residuals
# 
Example #3
size = int(len(df_comp) * 0.8)
df = df_comp.iloc[:size]
df_test = df_comp.iloc[size:]




# review ACF and PACF (in practice it is often more efficient to run auto_arima than to inspect ACF/PACF manually, but this is just for the sake of example; a hedged auto_arima sketch follows below)
# ----------
# not done here
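# Hedged sketch (an addition, not in the original): what an automated order search
# could look like with pmdarima's auto_arima, assuming the package is installed and
# df is the training split created above.
import pmdarima as pm

auto_model = pm.auto_arima(df.market_value,
                           start_p=0, start_q=0, max_p=5, max_q=5,
                           d=None,               # let the routine choose d via unit-root tests
                           seasonal=False,
                           stepwise=True,
                           trace=True,
                           error_action='ignore',
                           suppress_warnings=True)
print(auto_model.summary())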



# run ARIMAX model using S&P500 values as an exogenous factor to explain FTSE values
# ----------
model_arimax_111 = ARIMA(df.market_value, order=(1,1,1), exog=df.spx).fit()
print(model_arimax_111.summary())
print('----------')





# analyzing residuals
# ----------
df['residuals_model_arimax_111'] = model_arimax_111.resid.iloc[:]
sgt.plot_acf(df.residuals_model_arimax_111[1:], zero = False, lags = 40)
plt.title("ACF Of Residuals for ARIMAX(1,1,1)",size=20)
plt.show()

Example #4
data = fast_log(data)
test_stationarity(
    differencing(data, 2), window=7,
    cutoff=0.01)  # looks very bad in the beginning, variance is wild
## removing the first 70
data = data[70:]
test_stationarity(differencing(data, 2), window=7,
                  cutoff=0.01)  # Not too too bad, let's try
train_data, test_data = train_test(
    data, 30, 'n')  # we try to predict about a month's data
plot_pacf(differencing(train_data, 2))  ## looks like AR(3)
plot_acf(differencing(test_data, 2))  ## looks like MA(0)
search = pq_search(train_data, 5, 2, 2, 5, 0.05)  #search for optimal pq
search[0]  # looks like pdq=321 is the one with lowest aic
arima_model = ARIMA(train_data, (3, 2, 1)).fit(disp=False)
print(arima_model.summary())

plt.rcParams.update({'font.size': 10})
plt.rc('xtick', labelsize=5)
arima_res_plot(arima_model)  #mean of residual almost 0, std is not bad
plt.savefig('cumulative_cases_arima_321_res.png', format='png', dpi=300)

plt.rc('xtick', labelsize=5)
arima_pred_plot(
    arima_model, test_data, 0.05,
    mode='exp')  # the predictions are not looking too good, sse=4.1E10
plt.title(
    'Predicted vs actual Cumulative Cases \n ARIMA(3,2,1) Residual: 4.1E10')
plt.savefig('cumulative_cases_pred_arima_321.png', format='png', dpi=300)

sm.stats.acorr_ljungbox(arima_model.resid, lags=[10])  # Ljung-Box test on the ARIMA residuals; the arguments here are an assumption (the original call was incomplete)
from statsmodels.datasets.macrodata import load_pandas
from statsmodels.tsa.base.datetools import dates_from_range
from statsmodels.tsa.arima_model import ARIMA
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
plt.interactive(False)

# let's examine an ARIMA model of CPI

cpi = load_pandas().data['cpi']
dates = dates_from_range('1959q1', '2009q3')
cpi.index = dates

res = ARIMA(cpi, (1, 1, 1), freq='Q').fit()
print(res.summary())

# we can look at the series
cpi.diff().plot()

# maybe logs are better
log_cpi = np.log(cpi)

# check the ACF and PCF plots
acf, confint_acf = sm.tsa.acf(log_cpi.diff().values[1:], confint=95)
# center the confidence intervals about zero
#confint_acf -= confint_acf.mean(1)[:, None]
pacf = sm.tsa.pacf(log_cpi.diff().values[1:], method='ols')
# confidence interval is now an option to pacf
from scipy import stats
confint_pacf = stats.norm.ppf(1 - .025) * np.sqrt(1 / 202.)
Example #6
def ts_arima(ts,p,d,q,start,end):
    arima = ARIMA(ts, order=(p,d,q)).fit(disp=-1,maxiter=100)
    print("未来五年:", arima.forecast(5)[0])
    print(arima.summary())
    ts_predict_arima = arima.predict(start,end, dynamic = False)
    return ts_predict_arima
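# Hedged usage sketch (not in the original): call ts_arima on a toy monthly series.
# All names and dates below are illustrative assumptions; the legacy
# statsmodels.tsa.arima_model.ARIMA API used above is assumed to be available.
import numpy as np
import pandas as pd

toy_idx = pd.date_range('2015-01-31', periods=60, freq='M')
toy_ts = pd.Series(np.random.randn(60).cumsum() + 100, index=toy_idx)
toy_pred = ts_arima(toy_ts, p=1, d=1, q=1, start='2018-01-31', end='2019-12-31')
print(toy_pred.tail())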
##Use median value to build ARIMA model and decide the parameters
zillow_cd_median = pd.DataFrame(
    zillow_cd.drop(columns=[
        'RegionID', 'RegionName', 'City', 'State', 'Metro', 'CountyName',
        'SizeRank'
    ]).median())
zillow_cd_median.index = pd.to_datetime(zillow_cd_median.index)

##Check the ACF and PACF, determine that the order (0,1,1) is proper for the model
plot_acf(zillow_cd_median)
plot_pacf(zillow_cd_median)

##Fit the ARIMA model and predict
model1 = ARIMA(zillow_cd_median.values, (0, 1, 1)).fit()
model1.summary()
output1 = pd.DataFrame(model1.forecast(t)[0])
zillow_cd_median = zillow_cd_median.append(output1)
zillow_cd_median.index = pd.date_range('8/1/2010', latest, freq='MS')

##Create the forecasting line trend chart for the median property value
plt.figure(figsize=(15, 4))
plt.margins(x=0)
p2 = plt.plot(zillow_cd_median)
plt.axvline(x=pre, color='r', linestyle='--')
plt.ylim(ymin=0)
plt.show()  # gently increasing trend for the prediction period from 2017 to 2019

##Use the same parameters to predict the property value of specific area
##Here one could build a function for connecting the cleaned useful datasets airbnb_cd and zillow_cd into profit_ny
    return automodel


# In[132]:

forecated_accuracy = pd.DataFrame()
Predicted = pd.DataFrame()
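# Hedged sketch (an assumption, not the original definition): arimamodel() is not
# defined in this excerpt (only its trailing "return automodel" survives above).
# One plausible implementation wraps pmdarima's auto_arima:
import pmdarima as pm

def arimamodel(timeseries):
    automodel = pm.auto_arima(timeseries,
                              start_p=1, start_q=1,
                              test='adf',        # choose d via the ADF test
                              seasonal=False,
                              trace=True,
                              error_action='ignore',
                              suppress_warnings=True)
    return automodel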

model = arimamodel(Train['y'])  #Build Auto ARIMA MODEL
forecast_arima, conf_int = model.predict(n_periods=Test.shape[0] + 10,
                                         return_conf_int=True)
Predicted['preds'] = np.round(forecast_arima, 2)
Predicted = Predicted.reset_index()

# Get the Model Parameters/ Orders used by Auto- Arima
s = model.summary().tables[0].as_text()
start = s.find("Model:")
end = s.find(")")
end += len("end")
Predicted['Parameters'] = s[
    start:end]  #Store the Model Parameters/ Orders in 'Parameters' column
#forecast
fc, conf = model.predict(len(Test), return_conf_int=True)
plt = plotarima(Train['y'], Test['y'], model, "Auto_ARIMA.pdf", fc, conf)

#Test=Test.reset_index()
forecated_accuracy = pd.DataFrame(
    forecast_accuracy(forecast_arima[:len(Test)], Test['y']))

#merge Test,forecasted and accuracy metrics
Test = Test.reset_index(drop=True)
Example #9
def buildARIMA(data, p, q, d):
    model = ARIMA(data, (p, d, q)).fit()
    print(model.summary())
    return model
def def_MA(timeseries, q=1, steps1=1):
    model = ARIMA(timeseries, (0, 0, q)).fit()
    summary = model.summary()
    forecast = model.forecast(steps=steps1)
    return summary, forecast
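# Hedged usage sketch (not in the original): exercise the two helpers above on a
# synthetic series; the legacy statsmodels ARIMA API assumed by the helpers must be available.
import numpy as np
import pandas as pd

toy_series = pd.Series(np.random.randn(120).cumsum() + 50)  # toy random-walk data
arima_fit = buildARIMA(toy_series, p=1, q=1, d=1)
ma_summary, ma_forecast = def_MA(toy_series, q=1, steps1=3)
print(ma_forecast)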
Example #11
class Model(Config):
    def __init__(self, m, station, var, predictives):
        self.var = var
        self.station = station
        self.predictives = predictives
        if var in self.predictives:
            self.predictives.remove(var)
        self.alg = m['alg'](**m["args"])
        self.date = None
        self.param_grid = m['param_grid'] if 'param_grid' in m else None
        self.actual_holdout = None
        self.pred_holdout = None
        self.actual_train = None
        self.actual_test = None
        self.pred = None
        self.whole_pred = None
        self.se = None
        self.conf = None
        self.metrics = {}
        self.holdout_metrics = {}
        self.created = datetime.now()
        self.model_data = None
        self.lstm_history = None
        self.lstm_train_pred = None
        self.lstm_test_pred = None
        self.supervised_learn = None

    def set_props(self, alg, df):
        self.algorithm = alg
        self.start_time = df['Date'].min()
        self.end_time = df['Date'].max()
        self.n_records = df.shape[0]

    def get_meta(self):
        return dict(
            algorithm=self.algorithm,
            supervised_learning=self.supervised_learn,
            predictives=self.predictives,
            start_time=self.start_time,
            end_time=self.end_time,
            n_records=self.n_records,
            metrics=self.metrics,
            created=self.created,
        )

    def dataset_split(
            self,
            data,
            ratio,
            supervised_learning=Config.MODELLING_CONFIG["SUPERVISED_LEARNING"],
            shuffle_data=False):
        ratio = ratio or self.MODELLING_CONFIG["SPLIT_RATIO"]
        if supervised_learning == False:
            self.logger.info(
                "  Initiate forecasting model train-test split ...")
            train_size = int(
                len(data) * Config.MODELLING_CONFIG["SPLIT_RATIO"])
            train, test = data.iloc[0:train_size], data.iloc[
                train_size:len(data)]
            self.logger.info(
                "  Training dataset: {},   Testing dataset: {}".format(
                    train.shape, test.shape))

        elif supervised_learning == True:
            self.logger.info(
                "  Initiate supervised learning model train-test split ...")
            train, test = train_test_split(
                data,
                test_size=ratio,
                shuffle=shuffle_data,
                random_state=self.MODELLING_CONFIG["RANDOM_STATE"],
            )
            self.logger.info(
                "  Training dataset: {},   Testing dataset: {}".format(
                    train.shape, test.shape))
        return train, test

    def dl_univariate_data(self, data, start_index, end_index, history_size,
                           target_size):
        # use separate lists so the `data` argument is not shadowed
        features = []
        labels = []

        start_index = start_index + history_size
        if end_index is None:
            end_index = len(data) - target_size

        for i in range(start_index, end_index):
            indices = range(i - history_size, i)
            # # Reshape data from (history_size,) to (history_size, 1)
            features.append(np.reshape(data[indices], (history_size, 1)))
            labels.append(data[i + target_size])
        return np.array(features), np.array(labels)

    def multivariate_data(self,
                          data,
                          target,
                          start_index,
                          end_index,
                          history_size,
                          target_size,
                          step,
                          single_step=False):
        # use separate lists so the `data` argument is not shadowed
        features = []
        labels = []

        start_index = start_index + history_size
        if end_index is None:
            end_index = len(data) - target_size

        for i in range(start_index, end_index):
            indices = range(i - history_size, i, step)
            features.append(data[indices])
            if single_step:
                labels.append(target[i + target_size])
            else:
                labels.append(target[i:i + target_size])

        return np.array(features), np.array(labels)

    @staticmethod
    def mean_absolute_percentage_error(y_true, y_pred):
        mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-6))) * 100
        if type(mape) == pd.Series: mape = mape[0]
        return mape

    @staticmethod
    def root_mean_square_error(y_true, y_pred):
        rmse = math.sqrt(mean_squared_error(y_true, y_pred))
        return rmse

    @staticmethod
    def mean_absolute_error(y_true, y_pred):
        mae = mean_absolute_error(y_true, y_pred)
        return mae

    def evaluate(self, actual, pred):

        r2score = r2_score(actual, pred)
        MAPE = Model.mean_absolute_percentage_error(actual, pred)
        MAE = mean_absolute_error(actual, pred)
        rmse = Model.root_mean_square_error(actual, pred)
        metrics = dict(MAE=MAE, MAPE=MAPE, RMSE=rmse)  # R2_Score=r2score

        return metrics

    def predict(self, X_test):
        """predict new cases"""

        if all(p in X_test for p in self.predictives):
            X_test = X_test[self.predictives].astype(float)
            X_test.fillna(method=self.MODELLING_CONFIG["STATUS_MISSING_FILL"],
                          inplace=True)

            if any(X_test.isnull().values.all(axis=0)):
                return [np.nan] * X_test.shape[0]

            preds = self.alg.predict(X_test.dropna())
            return preds
        else:
            return [np.nan] * X_test.shape[0]

    def regression_scalar(self, data):
        """Regression using linear algorithms"""

        df = data[self.predictives + [self.var, "Date"]]
        print(self.predictives)
        print(len(self.predictives))

        scaler = MinMaxScaler()
        scaler.fit(df.drop(columns=[self.var, "Date"]).values)

        train, test = self.dataset_split(df)
        self.date = test["Date"]

        X_train = train.drop(columns=[self.var, "Date"])
        X_test = test.drop(columns=[self.var, "Date"])
        y_train = train[[self.var]]
        self.actual = test[[self.var]].values.ravel()
        self.example = X_train.iloc[0].values

        # # scaling
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        self.alg.fit(X_train, y_train.values.ravel())
        self.pred = self.alg.predict(X_test)

        self.metrics = self.evaluate(self.actual, self.pred)

    def regression_tree(self, data, metric_eval, cv_type):
        """regression using tree-based algrithms"""
        df = data[self.predictives + [self.var, "Date"]]

        # Train/Test
        if metric_eval == "test":

            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                n_holdout = max(
                    1,
                    int(df.shape[0] *
                        self.MODELLING_CONFIG["HOLDOUT_PERCENT"]))
                holdout = df.iloc[-n_holdout:, ]
                X_holdout = holdout.drop(columns=[self.var, "Date"])
                self.actual_holdout = holdout[[self.var]].values.ravel()

                df = df.iloc[:-n_holdout, ]

            train, test = self.dataset_split(
                df, ratio=Config.MODELLING_CONFIG["SPLIT_RATIO"])
            self.date = test["Date"]

            X_train = train.drop(columns=[self.var, "Date"])
            X_test = test.drop(columns=[self.var, "Date"])
            y_train = train[[self.var]]
            self.actual = test[[self.var]].values.ravel()
            self.example = X_train.iloc[0].values

            if self.param_grid != None:
                #print("    Running Grid Search...")
                param_grid_1 = {
                    k: v
                    for k, v in self.param_grid.items()
                    if k in ["max_depth", "num_leaves", "n_estimators"]
                }
                n_folds = int(100 /
                              (100 * self.MODELLING_CONFIG["SPLIT_RATIO"])) + 1
                grid_search_rf = GridSearchCV(estimator=self.alg,
                                              param_grid=param_grid_1,
                                              scoring='r2',
                                              cv=n_folds,
                                              n_jobs=8)
                grid_search_rf.fit(X_train, y_train.values.ravel())
                print('      Best Params: ', grid_search_rf.best_params_)
                print('      R2-Score: ', grid_search_rf.best_score_)

                self.alg = self.alg.set_params(**grid_search_rf.best_params_)

            self.alg.fit(X_train, y_train.values.ravel())
            self.pred = self.alg.predict(X_test)
            self.metrics = self.evaluate(self.actual, self.pred)

            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                self.pred_holdout = self.alg.predict(X_holdout)
                self.metrics_holdout = self.evaluate(self.actual_holdout,
                                                     self.pred_holdout)

        # Cross-validation
        elif (metric_eval == "cv"):

            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                n_holdout = int(df.shape[0] *
                                self.MODELLING_CONFIG["HOLDOUT_PERCENT"])
                holdout = df.iloc[-n_holdout:, ]
                X_holdout = holdout.drop(columns=[self.var, "Date"])
                self.actual_holdout = holdout[[self.var]].values.ravel()

                df = df.iloc[:-n_holdout, ]

            X_train = df.drop(columns=[self.var, "Date"])
            y_train = df[[self.var]]
            self.actual = df[[self.var]].values.ravel()
            self.date = df["Date"]
            self.example = X_train.iloc[0].values

            fold = LeaveOneOut() if cv_type == "loo" else int(
                100 / (100 * self.MODELLING_CONFIG["SPLIT_RATIO"]))

            if self.param_grid != None:
                print("    Running Grid Search...")
                param_grid_1 = {
                    k: v
                    for k, v in self.param_grid.items()
                    if k in ["max_depth", "num_leaves", "n_estimators"]
                }
                n_folds = int(100 /
                              (100 * self.MODELLING_CONFIG["SPLIT_RATIO"])) + 1
                grid_search_rf = GridSearchCV(estimator=self.alg,
                                              param_grid=param_grid_1,
                                              scoring='r2',
                                              cv=n_folds,
                                              n_jobs=8)
                grid_search_rf.fit(X_train, y_train.values.ravel())
                print('      Best Params: ', grid_search_rf.best_params_)
                print('      R2-Score: ', grid_search_rf.best_score_)

                self.alg = self.alg.set_params(**grid_search_rf.best_params_)

            self.alg.fit(X_train, y_train.values.ravel())

            self.pred = cross_val_predict(estimator=self.alg,
                                          X=X_train,
                                          y=y_train.values.ravel(),
                                          cv=fold,
                                          n_jobs=-1)
            self.metrics = self.evaluate(self.actual, self.pred)

            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                self.pred_holdout = self.alg.predict(X_holdout)
                self.metrics_holdout = self.evaluate(self.actual_holdout,
                                                     self.pred_holdout)

        elif (metric_eval == "cv"):

            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                self.n_holdout = int(df.shape[0] *
                                     self.MODELLING_CONFIG["HOLDOUT_PERCENT"])
                holdout = df.iloc[-self.n_holdout:, ]
                X_holdout = holdout.drop(columns=[self.var, "Date"])
                self.actual_holdout = holdout[[self.var]].values.ravel()

                df = df.iloc[:-self.n_holdout, ]

            train, test = self.dataset_split(df)
            self.date = test["Date"]

            X_train = train.drop(columns=[self.var, "Date"])
            X_test = test.drop(columns=[self.var, "Date"])
            y_train = train[[self.var]]
            self.actual = test[[self.var]].values.ravel()

            if self.param_grid != None:
                print("    Running Grid Search...")
                param_grid_1 = {
                    k: v
                    for k, v in self.param_grid.items()
                    if k in ["max_depth", "num_leaves", "n_estimators"]
                }
                n_folds = int(100 /
                              (100 * self.MODELLING_CONFIG["SPLIT_RATIO"])) + 1
                grid_search_rf = GridSearchCV(estimator=self.alg,
                                              param_grid=param_grid_1,
                                              scoring='r2',
                                              cv=n_folds,
                                              n_jobs=8)
                grid_search_rf.fit(X_train, y_train.values.ravel())

                ## Second pass for grid search on learning params
                print('      Best Params: ', grid_search_rf.best_params_)
                print('      R2-Score: ', grid_search_rf.best_score_)

                self.alg = self.alg.set_params(**grid_search_rf.best_params_)

            self.alg.fit(X_train, y_train.values.ravel())
            self.pred = self.alg.predict(X_test)
            self.metrics = self.evaluate(self.actual, self.pred)

            if self.MODELLING_CONFIG["HOLDOUT_PERCENT"] != 0:
                self.pred_holdout = self.alg.predict(X_holdout)
                self.metrics_holdout = self.evaluate(self.actual_holdout,
                                                     self.pred_holdout)

    def forecast_model(self,
                       data,
                       seasonal=Config.MODELLING_CONFIG["SEASONAL_OPTION"]):
        df = data[self.predictives]
        train, test = self.dataset_split(
            df,
            self.MODELLING_CONFIG["SPLIT_RATIO"],
            supervised_learning=Config.MODELLING_CONFIG["SUPERVISED_LEARNING"])

        history = [x for x in train]
        prediction_list = list()

        if seasonal == True:
            if self.alg == 'SARIMA':
                train, test = np.log10(train), np.log10(test)
                self.alg = pm.auto_arima(train,
                                         start_p=1,
                                         d=0,
                                         start_q=1,
                                         max_p=5,
                                         max_d=2,
                                         max_q=5,
                                         m=7,
                                         start_P=0,
                                         D=0,
                                         start_Q=0,
                                         max_P=5,
                                         max_D=2,
                                         max_Q=5,
                                         seasonal=True,
                                         trace=True,
                                         error_action='ignore',
                                         suppress_warnings=True,
                                         stepwise=True)
                for data in range(len(test)):
                    self.alg = self.alg.fit(disp=1)
                    self.pred = self.alg.predict(n_periods=1)
                    prediction_list.append(self.pred)
                    self.pred
                    self.actual_test = test[data]
                    history.append(self.actual_test)
            elif self.alg == 'HOLT_WINTER':
                self.alg = self.alg(
                    train,
                    seasonal_periods=Config.
                    MODELLING_CONFIG["HOLT_WINTER_SEASON"],
                    trend=Config.MODELLING_CONFIG["HOLT_WINTER_TREND"],
                    seasonal=Config.MODELLING_CONFIG["HOLT_WINTER_SEASONAL"])
                self.pred = self.alg.forecast(len(test))

        elif seasonal == False:
            for data in range(len(test)):
                self.alg = ARIMA(train,
                                 order=(Config.MODELLING_CONFIG['ARIMA_P'],
                                        Config.MODELLING_CONFIG['ARIMA_D'],
                                        Config.MODELLING_CONFIG['ARIMA_Q']))
                self.alg = self.alg.fit(disp=1)
                self.pred, self.se, self.conf = self.alg.forecast()
                prediction_list.append(self.pred)
                self.actual_test = test[data]
                history.append(self.actual_test)

        self.metrics = self.evaluate(self.actual_test, self.pred)

    def lstm_model(self, data):
        df = data[self.predictives]
        train, test = self.dataset_split(df,
                                         self.MODELLING_CONFIG["SPLIT_RATIO"])

        scaler = MinMaxScaler()
        scaler.fit(train)
        train = scaler.transform(train)
        test = scaler.transform(test)

        X_train, y_train = self.create_dataset(
            train, time_steps=Config.RNN_CONFIG["TIME_STEPS"])
        X_test, y_test = self.create_dataset(
            test, time_steps=Config.RNN_CONFIG["TIME_STEPS"])
        self.actual_train = y_train
        self.actual_test = y_test

        X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
        X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

        self.alg = keras.Sequential()
        self.alg.add(
            LSTM(units=Config.RNN_CONFIG["UNITS"],
                 input_shape=(X_train.shape[1], X_train.shape[2])))
        self.alg.add(Dropout(rate=Config.RNN_CONFIG["DROPOUT_RATE"]))
        self.alg.add(Dense(units=Config.RNN_CONFIG["DENSE_UNIT"]))
        self.alg.compile(loss=Config.RNN_CONFIG["LOSS_FUNC"],
                         optimizer=Config.RNN_CONFIG["OPTIMIZER"])

        self.lstm_history = self.alg.fit(
            X_train,
            y_train,
            epochs=Config.RNN_CONFIG["EPOCHS"],
            batch_size=Config.RNN_CONFIG["BATCH_SIZE"],
            validation_split=Config.RNN_CONFIG["VALIDATION_SPLIT"],
            shuffle=Config.RNN_CONFIG["SHUFFLE"],
            validation_data=(X_test, y_test),
            verbose=1,
        )
        self.logger.info(self.alg.summary())

        self.lstm_train_pred = self.alg.predict(X_train)
        self.lstm_test_pred = self.alg.predict(X_test)

        self.lstm_train_pred = scaler.inverse_transform(self.lstm_train_pred)
        y_train = scaler.inverse_transform([y_train])
        self.lstm_test_pred = scaler.inverse_transform(self.lstm_test_pred)
        y_test = scaler.inverse_transform([y_test])

        self.metrics = self.evaluate(y_test[0], self.lstm_test_pred[:, 0])

    def feature_importance_plot(self):

        fig, ax = plt.subplots(figsize=(10, len(self.predictives) / 2))

        s = pd.Series(self.alg.feature_importances_, index=self.predictives)
        ax = s.sort_values(ascending=False).plot.barh()
        ax.invert_yaxis()

        patches = [
            mpatches.Patch(label="Test Size: {}".format(self.actual.shape[0]),
                           color='none')
        ]
        for alg, val in self.metrics.items():
            patches.append(
                mpatches.Patch(
                    label="{}: {:0.2f}".format(alg, val),
                    color='none',
                ))
        plt.legend(handles=patches, loc='lower right')

        return fig

    def residual_plot(self):

        fig = plt.figure(figsize=(10, 6))
        gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[3, 1])

        ax1 = fig.add_subplot(gs[0])
        residual = self.actual - self.pred
        sns.residplot(x=self.pred, y=residual, ax=ax1)
        ax1.set_ylabel("Residual")
        ax1.set_xlabel("Predict")
        ax1.set_title(self.station)

        ax2 = fig.add_subplot(gs[1], sharey=ax1)
        ax2.hist(residual, orientation="horizontal")
        ax2.set_xlabel('Residual Distribution')

        return fig
def def_ARIMA(timeseries, p, d, q, steps1):
    model = ARIMA(timeseries, (p, d, q)).fit()
    summary = model.summary()
    forecast = model.forecast(steps=steps1)
    return summary, forecast
dta0 = pd.read_csv('data.CSV', header=0)
dta0.index = pd.to_datetime(dta0['date'])

##data = pd.DataFrame()
##data['date'] = ['2008/1/11','2008/2/6','2008/3/17','2008/4/13','2008/5/17','2008/6/15','2008/7/1','2008/7/12','2008/8/10','2008/9/14','2008/10/12','2008/11/16','2008/12/13','2009/1/19','2009/2/16','2009/3/13','2009/4/18','2009/5/16','2009/6/20','2009/7/11','2009/8/15','2009/9/19','2009/10/16','2009/11/14','2009/12/11','2010/1/15','2010/2/20','2010/3/13','2010/4/17','2010/5/15','2010/6/12','2010/7/16','2010/8/14','2010/9/18','2010/10/16','2010/11/19','2010/12/24','2011/1/21','2011/2/18','2011/3/19','2011/4/17','2011/5/15','2011/6/18','2011/7/16','2011/8/20','2011/9/24','2011/10/22','2011/11/19','2011/12/24','2012/1/14']
##data['dy'] = [0.62,1.01,1.78,1.29,0.11,-0.35,-0.44,-0.3,-1.11,-1.78,-1.39,-0.94,-0.36,1.47,1.75,2.04,1.03,0.02,-0.59,-1.35,-2.14,-1.96,-1.46,-0.56,0.04,0.96,1.58,1.43,0.95,0.14,-0.3,-1.35,-1.6,-1.98,-1.58,-0.98,0.56,1.14,1.19,1.18,0.61,0.76,-0.66,-1.14,-1.35,-1.85,-0.95,-0.65,0.44,1.09]
##data.index = pd.to_datetime(data['date'])

p = 0
d = 1
q = 1

arima = ARIMA(dta0['H'].dropna(), (p, d, q)).fit()

print(arima.summary())

dta_pred = arima.predict(typ='levels')  # in-sample prediction on the original (level) scale

### fitted values
##fig1 = plt.figure(figsize=(12,8))
##plt.plot(dta0['dy'], color='green')
##plt.plot(dta_pred, color='yellow')
##fig1.show()

# model forecast
forecast_ts = arima.forecast(10)
fore = pd.DataFrame()
fore['date'] = ['2021-01-24', '2021-01-25', '2021-01-26']
fore['result'] = pd.DataFrame(forecast_ts[0])
fore.index = pd.to_datetime(fore['date'])
from statsmodels.tsa.arima_model import ARIMA

#calculate errors

errors = model_full.predict(bluejet1) - bluejet1.flyers

#ACF plot to look significance of errors
tsa_plots.plot_acf(errors, lags=12)

# from the ACF plot we can see that lagged errors have significant association, hence we can use the errors to forecast errors for the next 12 time periods.

# Autoregression (p): from the ACF, applying the principle of parsimony, we can take p=1

model_AR = ARIMA(errors, order=(1, 0, 0)).fit(disp=0)

model_AR.summary()

pred_data["forecasted_errors"] = pd.Series(model_AR.forecast(12)[0])
pred_data[
    "improved_forecast"] = pred_data.forecasted_flyers + pred_data.forecasted_errors

#decomposition

import pandas as pd
from pandas import read_csv
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose
series = read_csv('bluejet.csv', header=0, index_col=0)
series.reset_index(inplace=True)
series["date"] = pd.to_datetime(series["Month"])
series = series.set_index("date")
Example #15
# ACF -> to identify q
plt.subplot(122)
plt.plot(lags_acf)
plt.axhline(y=0,linestyle="-",color="gray")
plt.axhline(y=-1.96/np.sqrt(len(diff_mystock)),linestyle="--",color='red')
plt.axhline(y=1.96/np.sqrt(len(diff_mystock)),linestyle="--",color='red')
plt.title("ACF")
plt.xlabel("Lags")
plt.ylabel("Correlation")

p=0; q=0; d=0

# Build the ARIMA model
m1 = ARIMA(diff_mystock,order=(p,d,q)).fit(disp=0)
m1.summary()

plt.hist(m1.resid)
plt.title("ARIMA model residuals")
# LJung-Box test to check the model goodness
# H0: residuals are independently distributed
# H1: residuals are not independently distributed
pvalue = sm.stats.acorr_ljungbox(m1.resid,lags=[1])[1]
if pvalue > 0.05:
    print("FTR H0. Residuals are independently distributed")
else:
    print("Reject H0. Residuals are not independently distributed")

# forecast for the next 12 months
f1 = m1.forecast(steps=12)
fig1 = sm.graphics.tsa.plot_acf(trainWTI['WTI'])
ax = fig1.add_subplot(111)
ax.set_xlabel("Lag")
ax.set_ylabel("ACF")
plt.show()

fig2 = sm.graphics.tsa.plot_pacf(trainWTI['WTI'])
ax = fig2.add_subplot(111)
ax.set_xlabel("Lag")
ax.set_ylabel("PACF")
plt.show()


# Parameter freq indicates that monthly statistics is used
arima_mod100 = ARIMA(trainWTI, (2,0,0), freq='M').fit()  # try (1,0,1)
print(arima_mod100.summary())

# Check assumptions:
# 1) The residuals are not correlated serially from one observation to the next.
# The Durbin-Watson Statistic is used to test for the presence of serial correlation among the residuals
# The value of the Durbin-Watson statistic ranges from 0 to 4.
# As a general rule of thumb, the residuals are uncorrelated if the Durbin-Watson statistic is approximately 2.
# A value close to 0 indicates strong positive correlation, while a value of 4 indicates strong negative correlation.
print "==================== Durbin-Watson ====================="
print sm.stats.durbin_watson(arima_mod100.resid.values)
print "========================================================"

fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111)
ax = arima_mod100.resid.plot(ax=ax)
ax.set_title("Residual series")
Example #17
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
from sqlalchemy import create_engine

engine = create_engine(
    'mysql+mysqlconnector://viewer:@dadata.cba.edu:3306/ACS')

SELECT = """SELECT AVG(hhincome) AS hhincome, year,
    statefip
  FROM ACS
  GROUP BY year, statefip ORDER BY year, statefip"""

data = pd.read_sql(SELECT, engine)

reg = sm.ols("np.log(hhincome) ~ year", data=data).fit()
print(reg.summary())


#%%

reg = sm.ols("np.log(hhincome) ~ year + C(statefip)", data=data).fit()
print(reg.summary())

#%%

from statsmodels.tsa.arima_model import ARIMA

y = data.loc[data['statefip']==31, ['hhincome','year']]
y.index=pd.to_datetime(y.year)
reg = ARIMA(y['hhincome'], order=(1,1,0)).fit()
print(reg.summary())
Example #18
# split dataset (on straight data = prices)
# ----------
size = int(len(df_comp) * 0.8)
df = df_comp.iloc[:size]
df_test = df_comp.iloc[size:]
# -- creating returns column from train dataset
df['returns'] = df.market_value.pct_change(1) * 100

# review ACF and PACF (in practice it is often more efficient to run auto_arima than to inspect ACF/PACF manually, but this is just for the sake of example)
# ----------
# not done here

# select ARMA model (by looking to PACF here) and iterating through more models
# ----------
model_arima_111 = ARIMA(df.market_value, order=(1, 1, 1)).fit()
print(model_arima_111.summary())
print('----------')
model_arima_511 = ARIMA(df.market_value, order=(5, 1, 1)).fit()
print(model_arima_511.summary())
print('----------')


# compare LLR results across models to see which model is best
# ----------
def LLR_test(mod_1, mod_2, DF=1):
    L1 = mod_1.fit().llf
    L2 = mod_2.fit().llf
    LR = (2 * (L2 - L1))
    p = chi2.sf(LR, DF).round(3)
    return p
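# Hedged usage sketch (not in the original): compare the two specifications fitted
# above with the LLR test. DF=4 reflects the four extra AR terms in the (5,1,1)
# model; `from scipy.stats import chi2` is assumed, since LLR_test relies on it.
spec_111 = ARIMA(df.market_value, order=(1, 1, 1))
spec_511 = ARIMA(df.market_value, order=(5, 1, 1))
print('LLR test p-value:', LLR_test(spec_111, spec_511, DF=4))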
pmax = int(len(D_arima_data) / 10)  # rule of thumb: the order usually does not exceed length/10
qmax = int(len(D_arima_data) / 10)  # rule of thumb: the order usually does not exceed length/10
bic_matrix = []  # BIC matrix
for p in range(pmax + 1):
    tmp = []
    for q in range(qmax + 1):
        try:  # some (p, q) combinations raise errors, so use try/except to skip them
            tmp.append(ARIMA(arima_data, (p, 1, q)).fit().bic)
        except:
            tmp.append(None)
    bic_matrix.append(tmp)

# In[73]:

bic_matrix = pd.DataFrame(bic_matrix)  # the minimum BIC can then be located in this DataFrame

p, q = bic_matrix.stack().idxmin()  # stack flattens the matrix, then idxmin locates the position of the minimum
print(u'p and q with the smallest BIC: {0}, {1}'.format(p, q))

# In[68]:

model = ARIMA(arima_data, (0, 1, 1)).fit()  # fit an ARIMA(0, 1, 1) model

# In[69]:

model.summary()  # produce a model summary report

# In[70]:

model.forecast(5)  # 5-day-ahead forecast; returns predictions, standard errors, and confidence intervals
Example #20
from statsmodels.datasets.macrodata import load_pandas
from statsmodels.tsa.base.datetools import dates_from_range
from statsmodels.tsa.arima_model import ARIMA
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
plt.interactive(False)

# let's examine an ARIMA model of CPI

cpi = load_pandas().data["cpi"]
dates = dates_from_range("1959q1", "2009q3")
cpi.index = dates

res = ARIMA(cpi, (1,1,1), freq='Q').fit()
print(res.summary())

# we can look at the series
cpi.diff().plot()

# maybe logs are better
log_cpi = np.log(cpi)

# check the ACF and PCF plots
acf, confint_acf = sm.tsa.acf(log_cpi.diff().values[1:], confint=95)
# center the confidence intervals about zero
#confint_acf -= confint_acf.mean(1)[:,None]
pacf = sm.tsa.pacf(log_cpi.diff().values[1:], method='ols')
# confidence interval is now an option to pacf
from scipy import stats
confint_pacf = stats.norm.ppf(1-.025) * np.sqrt(1/202.)
Example #21
from statsmodels.datasets.macrodata import load_pandas
from statsmodels.tsa.base.datetools import dates_from_range
from statsmodels.tsa.arima_model import ARIMA
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import statsmodels.api as sm
plt.interactive(False)

# let's examine an ARIMA model of CPI

cpi = load_pandas().data['cpi']
dates = dates_from_range('1959q1', '2009q3')
cpi.index = dates

res = ARIMA(cpi, (1, 1, 1), freq='Q').fit()
print(res.summary())

# we can look at the series
cpi.diff().plot()

# maybe logs are better
log_cpi = np.log(cpi)

# check the ACF and PCF plots
acf, confint_acf = sm.tsa.acf(log_cpi.diff().values[1:], confint=95)
# center the confidence intervals about zero
# TODO: demean? --> confint_acf -= confint_acf.mean(1)[:, None]
pacf = sm.tsa.pacf(log_cpi.diff().values[1:], method='ols')
# confidence interval is now an option to pacf
confint_pacf = stats.norm.ppf(1 - .025) * np.sqrt(1 / 202.)
# that helps us determine the number of AR terms
plot_acf(hourly_sentiment_series_diff2)
pyplot.show()

plot_pacf(hourly_sentiment_series_diff2)
pyplot.show()

# Depending on ACF and PACF, create ARMA/ARIMA model 
# with AR and MA terms
# This will infer the frequency, so make sure there are 
# no gaps between datetimes
ARMA1model_hourly_sentiment = ARIMA(hourly_sentiment_series, order=(5,2,1)).fit(transparams=False)
# If the p-value for a AR/MA coef is > 0.05, it's not significant
# enough to keep in the model
# Might want to re-model using only significant terms
print(ARMA1model_hourly_sentiment.summary())

# Predict the next 5 hours (5 time steps ahead), 
# which is the test/holdout set
ARMA1predict_5hourly_sentiment = ARMA1model_hourly_sentiment.predict('2/6/2019  7:00:00 PM','2/6/2019  11:00:00 PM', typ='levels')
print('Forecast/predictions for 5 hours ahead ', ARMA1predict_5hourly_sentiment)

# Back transform so we can compare de-diff'd predicted values 
# with the de-diff'd/original actual values
# This is automatically done when predicting (specify typ='levels'), 
# so no need to manually de-diff
# Nevertheless, let's demo how we de-transform 2 rounds of diffs
# using cumulative sum with original data given
#diff2 back to diff1
undiff1 = hourly_sentiment_series_diff2.cumsum().fillna(hourly_sentiment_series_diff2)
#undiff1 back to original data
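# Hedged toy illustration (an addition, not from the original): inverting two rounds
# of differencing with cumulative sums plus the stored initial values.
import pandas as pd

toy = pd.Series([10.0, 12.0, 15.0, 19.0, 24.0])
toy_d1 = toy.diff()
toy_d2 = toy_d1.diff()
toy_d1_back = toy_d2.cumsum() + toy_d1.iloc[1]   # first differences recovered from index 2 onward
toy_back = toy_d1_back.cumsum() + toy.iloc[1]    # original levels recovered from index 2 onward
print(toy_back)                                  # 15.0, 19.0, 24.0 match toy[2:]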
Example #23
plot_acf(dftaxi_day.response_variable, lags=52)

#%% Stationariaty on entire dataset
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA
dftaxi_day = dftaxi_day[['response_variable']].astype(float)
model = ARMA(dftaxi_day, (1, 0)).fit()
model.summary()
#Matches autocorr(1), therefore stationary dataset!
#%%  Residuals for AR(1) using entire dataset
type(model.resid)
print(model.resid.plot())
print(plot_acf(model.resid, lags = 50))
#%% ARIMA
arima_model = ARIMA(dftaxi_day, (28,1, 1)).fit()
arima_model.summary()
print(arima_model.resid.plot())
print(plot_acf(arima_model.resid, lags = 50))
#%% Predict
#arima_model.predict(1,100).plot()

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax = train.plot(ax=ax)

fig = arima_model.plot_predict('2016-10-01',
    '2016-12-31', ax=ax, plot_insample=False)
###############################################################################################################
###############################################################################################################
#%%
Example #24
# then consider adding an MA term to the model. The lag at which the ACF cuts off is the indicated number of MA terms.
fig1 = sm.graphics.tsa.plot_acf(trainWTI['WTI'])
ax = fig1.add_subplot(111)
ax.set_xlabel("Lag")
ax.set_ylabel("ACF")
plt.show()

fig2 = sm.graphics.tsa.plot_pacf(trainWTI['WTI'])
ax = fig2.add_subplot(111)
ax.set_xlabel("Lag")
ax.set_ylabel("PACF")
plt.show()

# Parameter freq indicates that monthly statistics is used
arima_mod100 = ARIMA(trainWTI, (2, 0, 0), freq='M').fit()  # try (1,0,1)
print(arima_mod100.summary())

# Check assumptions:
# 1) The residuals are not correlated serially from one observation to the next.
# The Durbin-Watson Statistic is used to test for the presence of serial correlation among the residuals
# The value of the Durbin-Watson statistic ranges from 0 to 4.
# As a general rule of thumb, the residuals are uncorrelated if the Durbin-Watson statistic is approximately 2.
# A value close to 0 indicates strong positive correlation, while a value of 4 indicates strong negative correlation.
print "==================== Durbin-Watson ====================="
print sm.stats.durbin_watson(arima_mod100.resid.values)
print "========================================================"

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)
ax = arima_mod100.resid.plot(ax=ax)
ax.set_title("Residual series")
import math
from statistics import mean

plt.style.use('fivethirtyeight')

df = pd.read_excel("../00Daily/Australia.xlsx", squeeze=True, parse_dates=True)
df = df[["Date", "LocalTransmission"]]
df.set_index("Date", inplace=True)
df.dropna(inplace=True)
##df['Date'] = pd.to_datetime(df['Date'])
LocalTransmission = df['LocalTransmission'].astype('int32')
#print (df.head())
print(df.index)

result = ARIMA(df, order=(1, 1, 1)).fit(disp=False)
print(result.summary())
#print(result.params)
predictions = result.predict(start="2020-03-01", end="2020-05-01")
#accuracy = result.score()
print(predictions)
##accuracy = result.score()
#print (accuracy)

result.plot_predict(start="2020-03-01", end="2020-05-01")
plt.suptitle(
    'Prediction for positive cases in Australia \n Algorithm used: ARIMA',
    fontsize=12)
plt.show()

##def mean_forecast_error(y, yhat):
##    return y.sub(yhat).mean()
Example #26
plot_acf(D_data).show()  # autocorrelation plot
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()  # partial autocorrelation plot
ADF(D_data[u'销量差分'])  # stationarity (ADF) test on the differenced sales column

# White-noise test
from statsmodels.stats.diagnostic import acorr_ljungbox
acorr_ljungbox(D_data, lags=1)  # returns the test statistic and p-value

from statsmodels.tsa.arima_model import ARIMA

# Order selection
pmax = int(len(D_data)/10)  # rule of thumb: the order usually does not exceed length/10
qmax = int(len(D_data)/10)  # rule of thumb: the order usually does not exceed length/10
bic_matrix = []  # BIC matrix
for p in range(pmax+1):
  tmp = []
  for q in range(qmax+1):
    try:  # some (p, q) combinations raise errors, so use try/except to skip them
      tmp.append(ARIMA(data, (p,1,q)).fit().bic)
    except:
      tmp.append(None)
  bic_matrix.append(tmp)

bic_matrix = pd.DataFrame(bic_matrix)  # the minimum BIC can then be located in this DataFrame

p,q = bic_matrix.stack().idxmin()  # stack flattens the matrix, then idxmin locates the position of the minimum
print(u'p and q with the smallest BIC: %s, %s' % (p,q))
model = ARIMA(data, (0,1,1)).fit()  # fit an ARIMA(0, 1, 1) model
model.summary()  # produce a model summary report
model.forecast(5)  # 5-day-ahead forecast; returns predictions, standard errors, and confidence intervals
    if h > 0:
        print('Model ARIMA(%s,1,%s) fails the white-noise test' % (p, q))
        print('Dropping the [%s,%s] combination from the BIC matrix and recomputing' % (p, q))
        matrix.iloc[p, q] = np.nan
        arimafail = arima
        continue
    else:
        # print(p,q)
        print('Model ARIMA(%s,%s) passes the white-noise test' % (p, q))
        break

# Step 5 -- drive C --------- model forecasting
print('Model report: summary():\n', arima.summary())
forecast_values, forecasts_standard_error, forecast_confidence_interval = arima.forecast(
    5)

pre_data = pd.DataFrame(xtest_value)
pre_data.insert(1, 'CWXT_DB:184:D:\\_predict', forecast_values)
pre_data.rename(columns={
    'CWXT_DB:184:D:\\': '实际值',
    'CWXT_DB:184:D:\\_predict': '预测值'
},
                inplace=True)
result_d = pre_data.applymap(lambda x: '%.2f' % x)
result_d.to_excel('../my_data/pedictdata_D_BIC_ARMA.xlsx')

# Step 5 -- drive D --------- model evaluation
# To evaluate the quality of the time-series forecasts, this chapter uses three statistics that measure prediction accuracy: mean absolute error, root mean squared error, and mean absolute percentage error.
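# Hedged sketch (an addition, not from the original): compute the three statistics
# named above from the actual/predicted columns built earlier; the column names
# follow the rename step above and numpy is assumed to be imported as np.
actual_vals = pre_data['实际值'].astype(float).values
pred_vals = pre_data['预测值'].astype(float).values
mae = np.mean(np.abs(actual_vals - pred_vals))
rmse = np.sqrt(np.mean((actual_vals - pred_vals) ** 2))
mape = np.mean(np.abs((actual_vals - pred_vals) / actual_vals)) * 100
print('MAE = %.4f, RMSE = %.4f, MAPE = %.2f%%' % (mae, rmse, mape))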
Example #28
        print (u'Model ARIMA(%s,1,%s) fails the white-noise test' % (p,q))
        print ('Dropping the [%s,%s] combination from the AIC matrix and recomputing' % (p,q))
        aic_matrix.iloc[p,q] =  np.nan
        arimafail = arima
        continue
    else:
        print (p,q)
        print (u'Model ARIMA(%s,1,%s) passes the white-noise test' % (p,q))
        break
        
        


# In[7]:

arima.summary()  # note: when p and q are both 0, the summary() method raises an error


# In[8]:

forecast_values, forecasts_standard_error, forecast_confidence_interval = arima.forecast(5)
forecast_values
# arimaf = ARIMA(xdata2, (0,1,1)).fit()
# arimaf.forecast(5)[0]


# In[9]:

predictdata = pd.DataFrame(xtest_value)
predictdata.insert(1,'CWXT_DB:184:C:\\_predict',forecast_values)
predictdata.rename(columns={'CWXT_DB:184:C:\\':u'实际值','CWXT_DB:184:C:\\_predict':u'预测值'},inplace=True)
Example #29
from statsmodels.datasets.macrodata import load_pandas
from statsmodels.tsa.base.datetools import dates_from_range
from statsmodels.tsa.arima_model import ARIMA
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import statsmodels.api as sm
plt.interactive(False)

# let's examine an ARIMA model of CPI

cpi = load_pandas().data['cpi']
dates = dates_from_range('1959q1', '2009q3')
cpi.index = dates

res = ARIMA(cpi, (1, 1, 1), freq='Q').fit()
print(res.summary())

# we can plot the series to inspect it
cpi.diff().plot()

# maybe logs are better
log_cpi = np.log(cpi)

# check the ACF and PACF plots
acf, confint_acf = sm.tsa.acf(log_cpi.diff().values[1:], confint=95)
# center the confidence intervals about zero
# TODO: demean? --> confint_acf -= confint_acf.mean(1)[:, None]
pacf = sm.tsa.pacf(log_cpi.diff().values[1:], method='ols')
# the confidence interval is now an option to pacf
confint_pacf = stats.norm.ppf(1 - .025) * np.sqrt(1 / 202.)
#
#     diff(diff(y)) = ARMA(p, q)
#
# The order of differencing is the same as applying the `diff` function _d_ times.

# Compared to an ARMA model, ARIMA models _do not rely on the underlying series being stationary._ The differencing operation can _convert_ the series to one that is stationary.
#
# Since ARIMA models automatically include differencing, we can use this on a broader set of data without assumptions of a constant mean.
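# A quick hedged check of the statement above on a toy series (illustrative only):
# applying .diff() twice gives the same values as a single second-order difference.
import numpy as np
import pandas as pd

toy = pd.Series([1.0, 4.0, 9.0, 16.0, 25.0])
print(toy.diff().diff().dropna().values)  # second differences: [2. 2. 2.]
print(np.diff(toy.values, n=2))           # same values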

# In[163]:

from statsmodels.tsa.arima_model import ARIMA

# We can see that this model in fact simplifies automatically to an ARMA model.
arima101 = ARIMA(store1_sales_data, (1, 0, 1)).fit()
arima101.summary()

# In[168]:

# Let's remove the moving average component since it wasn't particularly useful before.
# Also, let's add the differencing parameter.
# Now this is equivalent to an AR(1) model on the differenced data.
arima110 = ARIMA(store1_sales_data, (1, 1, 0)).fit()

# Note the value of the coefficient.
arima110.summary()

# In[169]:

# We can compute the lag-1 autocorrelation of the differenced series and see if they match!
store1_sales_data.Sales.diff(1).autocorr(1)