Example #1
class ARIMAModelResult:
    def __init__(self, autoregressive_periods, integrated_order, moving_average_model_periods, training_data, test):
        self.autoregressive_periods = autoregressive_periods
        self.integrated_order = integrated_order
        self.moving_average_model_periods = moving_average_model_periods
        self.model = ARIMA(training_data, order=(
            self.autoregressive_periods,
            self.integrated_order,
            self.moving_average_model_periods
        ))
        self.fit = self.model.fit()
        self.aic = self.fit.aic
        self.predictions = self.fit.forecast(steps=len(test))[0]
        self.model_fitness = mean_squared_error(test, self.predictions)

    def __eq__(self, other):
        return self.model_fitness == other.model_fitness

    def __lt__(self, other):
        return self.model_fitness < other.model_fitness

    def __gt__(self, other):
        return self.model_fitness > other.model_fitness

    def __str__(self):
        return "Autoregressive periods: {}\nIntegraded Order: {}\nMoving Average Model Periods: {}\n Predictions: {}\nMSE: {}".format(
            self.autoregressive_periods,
            self.integrated_order,
            self.moving_average_model_periods,
            self.predictions,
            self.model_fitness
        )
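A sketch of how such a comparable result class might drive a small order grid search (the train/test arrays are hypothetical; min() picks the lowest-MSE fit via __lt__):

# Hypothetical usage of ARIMAModelResult in a small grid search.
candidates = []
for p in range(3):
    for q in range(3):
        try:
            candidates.append(ARIMAModelResult(p, 1, q, train, test))
        except Exception:
            continue  # skip orders that fail to fit
best = min(candidates)  # smallest MSE wins
print(best)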
Example #2
 def ARIMA_forcast2(self):
     # this approach forecasts one data point at a time, then appends the next
     # actual observation to the history and repeats (walk-forward validation)
     import warnings
     warnings.filterwarnings('ignore')
     
     # test without taking log of data
     # using rolling avg 
     y = vr_df2_ts.values
     train = vr_df2_ts.values[286:574]
     prediction = list()
     for t in range(288):
         modelY = ARIMA(y, order=(1,1,1))
         results = modelY.fit(disp=-1)
         out = results.forecast()
         yhat = out[0]
         prediction.append(yhat)
         y = np.append(y,train[t])
         
     forecast = pd.Series(prediction,index=pd.date_range(start='2017-02-09 00:00:00', periods=288,freq='5min'))
     exog = vr_df2_ts.iloc[286:574]
     exog.set_index(pd.date_range(start='2017-02-09 00:00:00', periods=288,freq='5min'),inplace=True)
     
     plt.plot(vr_df2_ts)
     plt.plot(exog,'g')
     plt.plot(forecast,'r')
Example #3
def arima_predict(train_dat, n_predictions, p=2, d=0, q=0):
    arima = ARIMA(np.array(train_dat).astype(float), [p, d, q])
    diffed_logged_results = arima.fit(trend='c', disp=False)
    preds = diffed_logged_results.predict(len(train_dat),
                                          len(train_dat) + n_predictions - 1,
                                          exog=None, dynamic=False)
    return preds
Example #4
    def forecast_by_cluster(self, hold_out_n, n_ahead, order, exog):
        dfit = self.ds_agg_by_c
        
        efit = efor = None
        if hold_out_n > 0:
            # hold out validation required
            dfit = dfit[:-hold_out_n]
            if (exog is not None):
                efit = exog[:-hold_out_n]
                efor = exog[-hold_out_n:]
        else:
            if (exog is not None):
                efit = exog[:-n_ahead]
                efor = exog[-n_ahead:]
        ds_c_for = np.zeros((n_ahead, self.n_clusters))

        for c in tqdm(range(self.n_clusters)):
            cdfit = dfit[:,c]
            if sum(cdfit) == 0:
                ds_c_for[:,c] = 0
                continue
            m = ARIMA(cdfit, exog = efit, order = order)
            mf = m.fit()
            f = mf.forecast(n_ahead, exog = efor, alpha = .05)[0]  # alpha=.05 -> 95% interval
            ds_c_for[:,c] = f
        
        self.ds_c_for = ds_c_for
Example #5
def arimamodel(ts):
	ts_log, ts_log_diff = trend(ts)
	model = ARIMA(ts_log, order = (2,1,2))
	result_ARIMA = model.fit(disp = -1)

	m = ARIMA(ts, order = (2,1,2)).fit()

	# m is already a fitted results object, so forecast from it directly
	pre = m.forecast(steps = 60)[0]

	# pre = m.predict('20150901', '20151230', dynamic = True)
	print(pre)

	# prediction back to the original scale
	predictions_ARIMA = backorg(result_ARIMA, ts_log)
	plt.plot(predictions_ARIMA)
	# print (predictions_ARIMA - ts)[40:80]

	plt.plot(ts, color = 'red')

	# plt.plot(ts_log_diff)
	# plt.plot(result_ARIMA.fittedvalues, color = 'red')
	plt.title('RSS: %.4F' % np.sum((result_ARIMA.fittedvalues - ts_log_diff)**2))
	plt.show()
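Examples #5, #7, and #8 all call a trend() helper that is not shown; a minimal sketch consistent with how its two return values are used (the logged series and its first difference):

import numpy as np

def trend(ts):
    # log-transform to stabilize variance, then first-difference to remove trend
    ts_log = np.log(ts)
    ts_log_diff = (ts_log - ts_log.shift(1)).dropna()
    return ts_log, ts_log_diff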
Example #6
    def get_grouped_data(self, forecast=False):
        cdf = self.cumulative_sum()
        gdf = self.group_by('M')

        if cdf.shape[0] > gdf.shape[0]:
            df = cdf.to_frame()
            df.columns = ['cumulative sum']
            df['total added'] = gdf.to_frame()['event']
        else:
            df = gdf.to_frame()
            df.columns = ['total added']
            df['cumulative sum'] = cdf.to_frame()['event']

        if forecast:
            mtotals = pd.to_numeric(df['cumulative sum'], downcast='float')
            model = ARIMA(mtotals, order=(10,1,0))
            model_fit = model.fit(disp=0)
            forecast = model_fit.forecast(steps=12)
            dates = pd.date_range('2017-04-30', '2018-06-01', freq='M')
            records = zip([x.to_pydatetime() for x in dates], forecast[0])
            ndf = pd.DataFrame.from_records(records)
            ndf.columns = ['date', 'forecast']
            ndf.set_index(['date'], inplace=True)
            df = pd.concat([df, ndf], axis=1)

        return df
Example #7
def mamodel(ts):
	ts_log, ts_log_diff = trend(ts)
	model = ARIMA(ts_log, order = (0,1,1))
	result_MA = model.fit(disp = -1)
	plt.plot(ts_log_diff)
	plt.plot(result_MA.fittedvalues, color = 'red')
	plt.title('RSS: %.4F' % np.sum((result_MA.fittedvalues - ts_log_diff)**2))
	plt.show(block = False)
Example #8
def armodel(ts):
	ts_log, ts_log_diff = trend(ts)
	model = ARIMA(ts_log, order = (1,1,0))
	result_AR = model.fit(disp = -1)
	plt.plot(ts_log_diff)
	plt.plot(result_AR.fittedvalues, color = 'red')
	# pdb.set_trace()
	plt.title('RSS: %.4F' % np.sum((result_AR.fittedvalues - ts_log_diff)**2))
	plt.show(block = False)
Example #9
 def ARIMA_fit(self):
     # order=(p,d,q); AR and MA can also be modeled separately by entering 0 for either p or q
     model = ARIMA(ts_log, order=(5,1,5))
     self.results_ARIMA = model.fit(disp=-1)
     
     print(self.results_ARIMA.summary())

     plt.plot(ts_log_diff)
     plt.plot(self.results_ARIMA.fittedvalues, color='r')
     plt.title('RSS: %.4f' % sum((self.results_ARIMA.fittedvalues - ts_log_diff['in_tpkts'])**2))
Example #10
def ARIMA_fun( data ):
    lag_pacf = pacf( data, nlags=20, method='ols' )
    lag_acf, ci2, Q  = acf( data, nlags=20 , qstat=True, unbiased=True)

    model = ARIMA(data, order=(1, 1, int(ci2[0])))
    results_ARIMA = model.fit(disp=-1)
    plt.subplot(121)
    plt.plot( data )
    plt.plot(results_ARIMA.fittedvalues)
    #plt.show()
    return results_ARIMA.fittedvalues
Example #11
 def fit(self):
   if len(self.df) < self.t_window: return None
   model = ARIMA(self.df, order=(2, 1, 1))
   results_ARIMA = model.fit(disp=-1)
   forecast = results_ARIMA.predict(start = self.t_window, end= self.t_window+2, dynamic= True)
   forecast = forecast.cumsum()
   predictions_ARIMA_log = pd.Series(self.df.iloc[self.t_window-1], index=forecast.index)
   predictions_ARIMA_log = predictions_ARIMA_log.add(forecast,fill_value=0)
   predictions_ARIMA = np.exp(predictions_ARIMA_log)
   #print self.df
   return predictions_ARIMA
Example #12
    def objfunc(order, *params):
        series, = params  # unpack the single series passed through *params

        try:
            mod = ARIMA(series, order, exog=None)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                res = mod.fit(disp=0, solver='bfgs', maxiter=5000)
        except Exception:
            return float('inf')
        if math.isnan(res.aic):
            return float('inf')
        return res.aic
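The (order, *params) signature suggests this objective is meant for scipy.optimize.brute; a hedged usage sketch (the grid ranges are illustrative, not from the original):

from scipy.optimize import brute

# search p in 0..2, d in 0..1, q in 0..2; finish=None keeps the result on the grid
grid = (slice(0, 3, 1), slice(0, 2, 1), slice(0, 3, 1))
best_order = brute(objfunc, grid, args=(series,), finish=None)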
Example #13
    def pridictNextNdays(self,train):
        timeSerize = train[self.selected]
        timeSerize = timeSerize[self.start_train:self.end_train]
        model = ARIMA(timeSerize, order=(self.p,self.d,self.q), freq='D') # build a model
        fitting = model.fit(disp=False)
        forecast, fcasterr, conf_int = fitting.forecast(steps=self.next_ndays, alpha=.05)

        # params = fitting.params
        # residuals = fitting.resid
        # p = fitting.k_ar
        # q = fitting.k_ma
        # k_exog = fitting.k_exog
        # k_trend = fitting.k_trend
        # forecast = _arma_predict_out_of_sample(params,self.next_ndays,residuals, p, q, k_trend, k_exog, endog=timeSerize, exog=None, start=len(timeSerize))
        return  forecast
Example #14
 def testArima(self,train):
     realSerize = train[self.selected]
     timeSerize = realSerize[self.start_train:self.end_train]
     realData = train[self.selected][self.end_train:self.next_ndays]
     model = ARIMA(timeSerize, order=(self.p,self.d, self.q)) # build a model
     fitting = model.fit(disp=False)
     forecast, fcasterr, conf_int = fitting.forecast(steps=self.next_ndays, alpha=.05)
     # params = fitting.params
     # residuals = fitting.resid
     # p = fitting.k_ar
     # q = fitting.k_ma
     # k_exog = fitting.k_exog
     # k_trend = fitting.k_trend
     # forecast = _arma_predict_out_of_sample(params,self.next_ndays,residuals, p, q, k_trend, k_exog, endog=timeSerize, exog=None, start=len(timeSerize))
     return  {'real':list(realSerize)[self.end_train:self.end_train+self.next_ndays],'pridiction':forecast}
Example #15
    def predict_arima_next_days(self, item):
        ts = df_train[item]
        ts = ts.sort_index() # sorting index Date
        ts_last_day = ts[self.fc] # real last data
        ts = ts[0:self.fc] # index 0 until last data - 1

        model = ARIMA(ts, order=(self.p, self.d, self.q)) # build a model
        fitting = model.fit(disp=False)

        # n_days forecasting
        forecast, fcasterr, conf_int = fitting.forecast(steps=self.n_days, alpha=.05)
        # ts:          history until 1 day before self.fc
        # ts[self.fc]: last day
        # forecast:    1 day forecast (time equal to ts[self.fc])
        return ts, ts_last_day, forecast
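A hypothetical check of the one-step-ahead setup described in the comments above (the predictor instance and item name are illustrative):

ts_hist, actual, fc = predictor.predict_arima_next_days('item_a')
print('forecast=%.3f actual=%.3f abs_err=%.3f' % (fc[0], actual, abs(fc[0] - actual)))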
Example #16
def arima(ts, forecast_window):
    logger.info(ts)
    start = int(ts.count() - 1)
    end = int(start + forecast_window)

    ts_log = np.log(ts)
    model = ARIMA(ts_log, order=(0, 1, 2))
    results = model.fit(disp=-1)
    prediction = results.predict(start=start, end=end, dynamic=True)
    future = pd.Series(prediction, copy=True)
    cumsum = future.cumsum()
    prediction_future = future.add(ts_log.iloc[-1])
    prediction_future = prediction_future.add(cumsum)
    ts_future = np.exp(prediction_future)

    return ts_future
Example #17
def predictFutureProfit(df, forward):
	results = {}

	for asset in get_assets(df):
		ts = df[asset]
		ts_log = np.log(ts)

		model = ARIMA(ts_log, order=(1, 1, 0))  
		results_ARIMA = model.fit(disp=-1)  
		predictions_diff = results_ARIMA.predict(2, len(ts.index)-1, dynamic=True)
		predictions_diff_cumsum = predictions_diff.cumsum()
		predictions_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
		predictions_log = predictions_log.add(predictions_diff_cumsum,fill_value=0)
		predictions = np.exp(predictions_log)
		results[asset] = predictions[-1]

	return results
Example #18
    def arima(self):
        kl = self.get_kline()
        cp = self.get_close_price(kl)
        date = self.get_date(kl)
        #t = datetime.fromtimestamp(date[-1].timestamp()+24*60*60)
        t = date[-1] + timedelta(days=int(self.day_history/5)) #days seconds ...
        print("predict date:", date[-1],"--->", t)

        dta = pd.Series(cp, index=date)
        print(dta)
        model = ARIMA(dta, order=(4, 1, 3))  # p, d, q
        result = model.fit()
        pred = result.predict(date[-10], t, dynamic=True, typ='levels')
        plt.figure(figsize=(12,8))
        plt.plot(dta, 'ro-')
        plt.xticks(rotation=45)
        plt.plot(pred, 'go-')

        plt.show()
Example #19
def fitArima(ts):
    import statsmodels.api as sm
    logged_ts = np.log(ts)
    diffed_logged_ts = (logged_ts - logged_ts.shift(7))[7:]
    p = 0
    d = 1
    q = 1
    arima = ARIMA(diffed_logged_ts, [p, d, q], exog=None, freq='D', missing='none')
    diffed_logged_results = arima.fit(trend='c', disp=False)
    predicted_diffed_logged = diffed_logged_results.predict(exog=None, dynamic=False)
    #a=pd.date_range(diffed_logged_ts.index[1], periods=90, freq='D')
    predicted_diffed_logged_ts = pd.Series(predicted_diffed_logged, index=diffed_logged_ts.index[d:])
    predicted_diffed_logged_ts = np.exp(logged_ts.shift(7) + diffed_logged_ts.shift(d) + predicted_diffed_logged_ts)
    
    concatenated = pd.concat([ts, predicted_diffed_logged_ts], axis=1, keys=['original', 'predicted'])
    #a= concatenated
    #a.plot()
    #plt.show()
    return concatenated
Example #20
def arima_model(accounts):
    """Fit ARIMA models for each account"""

    # Model each account
    account_models = {}
    for account_type, account in accounts:
        account_data = accounts[(account_type, account)]
        account_data.name = account

        # ARIMA model order is unknown, so find the highest order that can be fit
        order = 0
        modeled = False
        while not modeled and order < len(ARIMA_ORDERS):
            try:
                model = ARIMA(account_data, order=ARIMA_ORDERS[order])
                results = model.fit()
                modeled = True
                account_models[(account_type, account)] = results
            except  (ValueError, np.linalg.LinAlgError):
                order += 1

    return account_models
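ARIMA_ORDERS is referenced but never defined here; it is presumably a list of candidate orders tried from most to least complex, e.g.:

# Hypothetical definition consistent with the fallback loop above.
ARIMA_ORDERS = [(2, 1, 2), (1, 1, 1), (1, 1, 0), (0, 1, 1), (0, 1, 0)]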
Example #21
 def ARIMA_forcast3(self):
     # load dataset
     series = pd.Series(vr_df['ACTIVE_FLOWS'][0:7000])
     # seasonal difference
     X = series.values
     cycle = 288 #2016
     differenced = difference(X, cycle)
     # fit model
     model = ARIMA(differenced, order=(1,1,1))
     model_fit = model.fit(disp=0)
     # multi-step out-of-sample forecast
     forecast = model_fit.forecast(steps=2016)[0]
     # invert the differenced forecast to something usable
     history = [x for x in X]
     step = 1
     forecast_values = []
     for yhat in forecast:
         inverted = inverse_difference(history, yhat, cycle)
         #print('Day %d: %f' % (day, inverted))
         forecast_values.append(inverted)
         history.append(vr_df['ACTIVE_FLOWS'][7000+step-1])
         step += 1
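Examples #21, #22, and #37 rely on difference/inverse_difference helpers that are not shown; a minimal sketch matching how they are called (interval is the seasonal lag):

def difference(dataset, interval=1):
    # seasonal differencing: subtract the value one cycle earlier
    return [dataset[i] - dataset[i - interval] for i in range(interval, len(dataset))]

def inverse_difference(history, yhat, interval=1):
    # invert by adding back the value one cycle earlier
    return yhat + history[-interval]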
Example #22
 def ARIMA_forecast4(self):
     # parameters
     num_train_init = 7318 
     num_forecast = 12 #one day = 288 data points
     cycle = 288 #for a total 288 samples per day
     startdate = vr_df.index[num_train_init]
     field = 'DELETED_FLOWS'
     # array of predicted values
     forecast_values = []
     
     for i in range(0,int(len(vr_df)/num_forecast)):
         # check array for out of bound
         num_train_current = i*num_forecast+num_train_init
         if ((num_train_current) > len(vr_df)):
             break
         # load dataset
         series = pd.Series(vr_df[field][0:num_train_current])
         # Make data stationary: seasonal difference
         X = series.values
         differenced = difference(X, cycle)
         # fit model
         model = ARIMA(differenced, order=(1,1,1))
         model_fit = model.fit(disp=0)
         # multi-step out-of-sample forecast
         forecast = model_fit.forecast(steps=num_forecast)[0]
         # invert the differenced forecast to something usable
         history = [x for x in X]
         step = 1
         for yhat in forecast:
             inverted = inverse_difference(history, yhat, cycle)        
             forecast_values.append(inverted)
             #append actual data
             try:
                 history.append(vr_df[field][num_train_current+step-1])
             except Exception:
                 # reached the end of actual data array, use forecasted values to estimate
                 history.append(inverted)
             step += 1
Example #23
def previsao_matematica(reservatId, data):
    seriesArray = Series(predict_info.getSeries(reservatId, data))
    seriesValues = seriesArray.values

    mathDict = {'calculado': False, 'volumes': [], 'dias': 0}

    #if isNonStationary(seriesValues) == True:
    days_in_year = 1
    differenced = predict_info.difference(seriesValues, days_in_year)
    # fit model
    model = ARIMA(differenced, order=(1,0,1))
    model_fit = model.fit(disp = -1)
    # multi-step out-of-sample forecast
    forecast = model_fit.forecast(steps=180)[0]
    # invert the differenced forecast to something usable
    mathDict['calculado'] = True
    history = [x for x in seriesValues]
    for yhat in forecast:
        inverted = predict_info.inverse_difference(history, yhat, days_in_year)
        history.append(inverted)
        if inverted >= 0.0:
            mathDict['volumes'].append("%.4f" % round((inverted), 4))
            mathDict['dias'] = mathDict['dias'] + 1
    return mathDict
Example #24
	def _set_model_and_fit(self, train_data, order_):
		model = ARIMA(train_data, order=order_)
		model_fit = model.fit(disp=0)
		return model_fit
Example #25
size = int(len(ts_month_log) - 13)
train, test = ts_month_log[0:size], ts_month_log[size:len(ts_month_log)]
history = [x for x in train]
predictions = list()
selisih = list()
mapee = list()
print "Ini Data Test"
print
print test
print
print train
print "Printing Predicted vs Expected Values"
print
for t in range(len(test)):
    model = ARIMA(history, order=(0, 1, 1))
    mode_fit = model.fit(disp=-1)
    output = mode_fit.forecast()
    yhat = output[0]
    predictions.append(float(yhat))
    obs = test[t]
    history.append(obs)
    selisih.append(yhat - obs)
    mapee.append((obs - yhat) / obs)
    print("Predicted=%f, Expected=%f" % (np.exp(yhat), np.exp(obs)))

error = mean_squared_error(test, predictions)
RMSE = sqrt(error)
ME = sum(selisih) / len(test)
MAE = mean_absolute_error(test, predictions)
MPE = sum(mapee) / len(test) * 100
MAPE = sum(abs(x) for x in mapee) / len(test) * 100
Example #26
data = data[:].astype(float)
data.tail()

data.plot()
plt.show()


from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

plot_acf(data)
plot_pacf(data)
plt.show()


from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults

model = ARIMA(data, order = (1, 1, 0))
model_fit = model.fit(trend = 'nc', full_output = True, disp = 1)
print(model_fit.summary())

model_fit.plot_predict()


fore = model_fit.forecast(steps = 1)
print(fore)

# power consumption on Jan 1, 2018: 763473.4587

Example #27
train, test = split_dataset(df_modal_price_supervised)
# evaluate model and get scores
n_input = 5
score, scores = evaluate_model(train, test, n_input)
# summarize scores
summarize_scores('lstm', score, scores)
# plot scores

district_data = df[:140]
district_data.head()

district_data['Date'] = pd.to_datetime(district_data.arrival_date,
                                       dayfirst=True)
district_data['Day'] = district_data.Date.dt.day
district_data['month'] = district_data.Date.dt.month
district_data['year'] = district_data.Date.dt.year
district_data['day_of_week'] = district_data.Date.dt.dayofweek
district_data['weekend'] = district_data.Date.apply(weekend)

district_data.sort_values(by='Date', inplace=True)
district_data.head()

from statsmodels.tsa.arima_model import ARIMA
from matplotlib import pyplot as plt

model = ARIMA(df.modal_price, order=(5, 0, 4))
model_fitted = model.fit(disp=-1)
plt.plot(df.modal_price)
plt.plot(model_fitted.fittedvalues, color='red')
print(model_fitted.summary())
print(f"Coefficients: {model_fit.params}")
predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)
plltt(train['Close_log_diff'],test['Close_log_diff'],predictions,'Auto Regression model')
newall(test['Close_log_diff'],predictions,'AR model')


# ARIMA
p=d=q=range(0,5)
import itertools
val = list(itertools.product(p,d,q))

print("Combinations of p,d,p for ARIMA to get low AIC ")
for param in val:
    try:
        model_arima = ARIMA(test['Close_log_diff'],order = param)
        model_arima_fit = model_arima.fit()
        print(param,model_arima_fit.aic)
    
    except Exception:
        continue

model = ARIMA(test['Close_log_diff'], order=(2,2,0), freq=test['Close_log_diff'].index.inferred_freq)  
results_ARIMA = model.fit(disp=-1)
plt.plot(test['Close_log_diff'])
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.show()
print(results_ARIMA.summary())


# AUTO Arima
stepwise_model = auto_arima(timeseries_dfnew['Close'], start_p=1, start_q=1,
Example #29
from statsmodels.tsa.arima_model import ARIMA

from common_calculations import get_the_stationary_series
from first_part_calculations import show_statistical_data, ma_by_ar_resid

# obtain the stationary time series
stationary_series = get_the_stationary_series()

# ARMA(2)
arks_model = ARIMA(stationary_series, order=(2, 0, 0))
model = arks_model.fit(disp=0)
AR_resid = model.resid
split = len(stationary_series) - int(0.2 * len(stationary_series))
train, test = stationary_series[0:split], stationary_series[split:]
pred = model.predict(len(test))
show_statistical_data(train, model, pred)

ma_by_ar_resid(AR_resid, 2, 0, 4, 'ARMA(2,4)')

arks_n5_PKC = AR_resid.rolling(5).mean().fillna(AR_resid[:5].mean())
ma_by_ar_resid(arks_n5_PKC, 2, 0, 3,
               'ARMA(2,3) using a custom simple moving average, N=5')

arks_n10_PKC = AR_resid.rolling(10).mean().fillna(AR_resid[:10].mean())
ma_by_ar_resid(arks_n10_PKC, 2, 0, 7,
               'ARMA(2,7) using a custom simple moving average, N=10')

arks_n5_EKC = AR_resid.ewm(5).mean()
ma_by_ar_resid(
    arks_n5_EKC, 2, 0, 3,
    'ARMA(2,3) using a custom exponential moving average, N=5')
Example #30
ax1 = fig.add_subplot(211)
fig = plot_acf(df['Seasonal First Difference'].iloc[13:], lags=40, ax=ax1)

#plt.show()

ax2 = fig.add_subplot(212)
fig = plot_pacf(df['Seasonal First Difference'].iloc[13:], lags=40, ax=ax2)

#plt.show()

# For non-seasonal data
#p=1, d=1, q=0 or 1
from statsmodels.tsa.arima_model import ARIMA

model = ARIMA(df['electricity_available'], order=(1, 1, 1))
model_fit = model.fit()

model_fit.summary()

df['forecast'] = model_fit.predict(start=90, end=103, dynamic=True)
df[['electricity_available', 'forecast']].plot(figsize=(12, 8))

#plt.show()

import statsmodels.api as sm

model = sm.tsa.statespace.SARIMAX(df['electricity_available'],
                                  order=(1, 1, 1),
                                  seasonal_order=(1, 1, 1, 12))
results = model.fit()
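# (Illustrative continuation, not in the original: the fitted SARIMAX results
# object can be used the same way as the ARIMA fit above; the column name is hypothetical.)
df['sarimax_forecast'] = results.predict(start=90, end=103, dynamic=True)
df[['electricity_available', 'sarimax_forecast']].plot(figsize=(12, 8))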
Example #31
#data for arima
trainX = bt['price_feature'][bt['price_feature'].index < '2017-09-01']
testX = bt['price_feature'][bt['price_feature'].index >= '2017-09-01']

#ARIMA
print('\n\n Running Model type: ARIMA')
plot_acf(bt['price_feature'].diff().values[1:], lags=50)
plt.show()
plot_pacf(bt['price_feature'].diff().values[1:], lags=50)
plt.show()

predX = list()
history = list(trainX.values)  # seed the rolling history with training data only
model = ARIMA(history, order=(1, 1, 1))
model_fit = model.fit(disp=0)
train_error = math.sqrt(sum(model_fit.resid**2) / model_fit.resid.shape[0])
for t in range(len(testX)):
    model = ARIMA(history, order=(1, 1, 1))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predX.append(yhat)
    obs = testX.iloc[t]
    history.append(obs)
    #print('predicted=%f, expected=%f' % (yhat, obs))
test_error = math.sqrt(mean_squared_error(testX, predX))
print('Train RMSE: %.3f' % train_error)
print('Test RMSE: %.3f' % test_error)

# plot
Example #32
def parser(x):
	return datetime.strptime('190'+x, '%Y-%m')

series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
print(series.head())
series.plot()
pyplot.show()

autocorrelation_plot(series)
pyplot.show()


# fit model
model = ARIMA(series, order=(5,1,0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()

residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())

# http://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.predict.html

X = series.values
size = int(len(X) * 0.66)
Example #33
    strt = "4/1/" + str(i)
    endt = "11/30/" + str(i)
    dat = pd.date_range(start=strt, end=endt, freq="D")
    L = []
    for j in data.x:
        L += [j]
    dailyrain = pd.Series(L, index=dat)
    print(dailyrain)

    #test_stationarity(dailyrain)

    #plt.savefig("DailyGAURainfall" + str(i) + ".png")
    plt.close()

    model = ARIMA(dailyrain, order=orders[i - 2008])
    results_AR = model.fit(disp=-1)
    plt.plot(dailyrain, label="Daily Rainfall Data")
    results = results_AR.fittedvalues.apply(nonZ)
    plt.plot(results, color="red", label="Predicted Daily Rainfall")
    plt.title("RSS: %.4f" % (sum((dailyrain - results)**2)))
    plt.legend(loc="best")
    plt.savefig("DailyGAU" + str(i) + str(orders[i - 2008]) + ".png")
    f.write("Parametersfor the year " + str(i) + "\n")
    ar_coef, ma_coef = results_AR.arparams, results_AR.maparams
    f.write("AR Coefficients: " + str(ar_coef) + "\n")
    f.write("MA Coefficients: " + str(ma_coef) + "\n")
    f.write("\n")

p_values = [0, 1, 2, 3, 4]
q_values = [0, 1, 2, 3, 4]
d_values = [0]
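These grids presumably feed a walk-forward grid search; a sketch under that assumption (evaluate_arima_model is a hypothetical helper returning test MSE for one order):

def evaluate_models(dataset, p_values, d_values, q_values):
    best_score, best_cfg = float('inf'), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                try:
                    mse = evaluate_arima_model(dataset, (p, d, q))  # hypothetical helper
                except Exception:
                    continue
                if mse < best_score:
                    best_score, best_cfg = mse, (p, d, q)
    return best_cfg, best_score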
Example #34
def fit_models(mypath='', js=None):
    '''
    Takes a file path with a file in json format, or a string with json structure
    Returns json (fecha, prediccion, error), error_prom, accuracy
    '''
#%%
    import os
    import time
    import datetime
    import numpy as np
    import pandas as pd
    import json
    from os import listdir
    from os.path import isfile, join
    from objdict import ObjDict
    
#%%
    if mypath != '':
        os.chdir(mypath)

        lista_archivos = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    
        lista_dat = []
        for dat in lista_archivos:
            with open(dat) as json_data:
                lista_dat.append(json.load(json_data))
    
        for i in range(0,len(lista_dat)):
            fechas = []   
            valores = [] 
            for j in lista_dat[i]:
                fechas.append(j['fecha'])
                valores.append(j['valor'])
                
    elif js is not None:
        fechas = []   
        valores = []
        for i in js:
            fechas.append(i['fecha'])
            valores.append(i['valor'])
         
#%%
        
    fechas_list = [fechas[x:x+1] for x in xrange(0, len(fechas), 1)]
    
    fechas_format = []
    for date in fechas:
        #fechas_format.append(time.ctime(date/1000))
        fechas_format.append(datetime.datetime.fromtimestamp(date/1000.0).strftime('%Y-%m-%d-%H'))

        
    # create variables to split the date: year, month, day, hour
    ano,mes,dia,hora = [],[],[],[]
    for date in fechas_format:
        fecha = date.split('-')
        ano.append(int(fecha[0]))
        mes.append(int(fecha[1]))
        dia.append(int(fecha[2]))
        hora.append(int(fecha[3]))
        
                
    # create the day-of-week variable
    dia_semana = []    
    for date in fechas:
        if time.ctime(date/1000).split()[0] == 'Mon':
            dia_semana.append(1)
        elif time.ctime(date/1000).split()[0] == 'Tue':
            dia_semana.append(2)
        elif time.ctime(date/1000).split()[0] == 'Wed':
            dia_semana.append(3)
        elif time.ctime(date/1000).split()[0] == 'Thu':
            dia_semana.append(4)
        elif time.ctime(date/1000).split()[0] == 'Fri':
            dia_semana.append(5)
        elif time.ctime(date/1000).split()[0] == 'Sat':
            dia_semana.append(6)
        elif time.ctime(date/1000).split()[0] == 'Sun':
            dia_semana.append(7)
        else:
            print 'Error'
    
    # build the date vector
    fechas_pandas = pd.to_datetime(fechas_format)
    
    # build the time series
    dframe = pd.Series(valores, index=fechas_pandas)
            
#%%

    import pyflux as pf
    from datetime import datetime
    import matplotlib.pyplot as plt
    #%matplotlib inline 
    
#%%
    # inspect the data
    #plt.plot(dframe)
    
    # remove outliers
            
    dframe = dframe[~((dframe-dframe.mean()).abs()>3*dframe.std())]    
    dframe= dframe[(dframe!=0)]      
            
    # inspect the data
    #plt.plot(dframe)    
            
    # split into train and test
    features_train = dframe[0:int(len(dframe)*.9)]
    features_test = dframe[int(len(dframe)*.9)+1:len(dframe)]
    
    
    # inspect the data
    #plt.plot(features_train)
    #plt.plot(features_test)

#%%
    # test stationarity
    from statsmodels.tsa.stattools import adfuller
    
    def test_stationarity(timeseries, plot=False):
    
        #Determine rolling statistics
        rolmean = timeseries.rolling(window=12).mean()
        rolstd = timeseries.rolling(window=12).std()
    
        #Plot rolling statistics:
        if plot:
            fig = plt.figure(figsize=(12, 8))
            orig = plt.plot(timeseries, color='blue',label='Original')
            mean = plt.plot(rolmean, color='red', label='Rolling Mean')
            std = plt.plot(rolstd, color='black', label = 'Rolling Std')
            plt.legend(loc='best')
            plt.title('Rolling Mean & Standard Deviation')
            plt.show()
            print 'Results of Dickey-Fuller Test:'
        
        #Perform Dickey-Fuller test:
        dftest = adfuller(timeseries, autolag='AIC')
        dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value',
        '#Lags Used','Number of Observations Used'])
        for key,value in dftest[4].items():
            dfoutput['Critical Value (%s)'%key] = value
        if plot:
            print dfoutput
        else:
            return dfoutput
        
#%%
      
    #test_stationarity(features_train)
    '''
    print 'Dickey-Fuller test for original data'
    test_stationarity(features_train, plot=True)
    '''
    p_value = test_stationarity(features_train).iloc[1]
    
#%%

#Estimating & Eliminating Trend    
    #Log Transformation
    features_train_log = np.log(features_train)
    #plt.plot(features_train_log)
    #test_stationarity(features_train_log)
    
    p_value_log = test_stationarity(features_train_log).iloc[1]
    
    #%%
    
    #First Differencing
    features_train_diff = features_train - features_train.shift(1)
    #plt.plot(features_train_diff)
    
        #Visualize the transformation
    #plt.plot(features_train_diff)
    #plt.plot(features_train)
    features_train_diff.dropna(inplace=True)
    #test_stationarity(features_train_diff)
    p_value_diff = test_stationarity(features_train_diff).iloc[1]
    
    #%%
    #Second Differencing
    features_train_diff2 = features_train_diff - features_train_diff.shift(1)
    features_train_diff2.dropna(inplace=True)
    p_value_diff2 = test_stationarity(features_train_diff2).iloc[1]
    
    #%%
    #Differencing + log
    train_log_diff = features_train_log - features_train_log.shift(1)
    #plt.plot(dframe_log_diff)
    train_log_diff.dropna(inplace=True)
    #test_stationarity(train_log_diff)
    p_value_log_diff = test_stationarity(train_log_diff).iloc[1]

    #%%
    #Second Difference + Log
    train_log_diff2 = train_log_diff - train_log_diff.shift(1)
    #plt.plot(train_log_diff2)
    train_log_diff2.dropna(inplace=True)
    #test_stationarity(train_log_diff2)
    p_value_log_diff2 = test_stationarity(train_log_diff2).iloc[1]

    #%%
    #find best transformation
    p_value_list = [p_value, p_value_log, p_value_diff, 
                    p_value_log_diff, p_value_diff2, p_value_log_diff2]
    
    winner_index = p_value_list.index(min(p_value_list))
    if winner_index == 0:
        winner = features_train
    if winner_index == 1:
        winner = features_train_log
    if winner_index == 2:
        winner = features_train_diff
    if winner_index == 3:
        winner = train_log_diff
    if winner_index == 4:
        winner = features_train_diff2
    if winner_index == 5:
        winner = train_log_diff2 
        
    #%%
    #print 'Dickey-Fuller test for best transformation of data', 
    #test_stationarity(winner, plot=True)
    
    #%%
    #Forecasting a Time Series
    #ARIMA - Auto-Regressive Integrated Moving Average.
    
    '''
    Number of AR (Auto-Regressive) terms (p): AR terms are just lags 
        of dependent variable. 
        For instance if p is 5, the predictors for x(t) will be x(t-1)….x(t-5).
    Number of MA (Moving Average) terms (q): MA terms are lagged forecast errors 
        in prediction equation. 
        For instance if q is 5, the predictors for x(t) will be e(t-1)….e(t-5) 
        where e(i) is the difference 
        between the moving average at ith instant and actual value.
    Number of Differences (d): These are the number of nonseasonal differences, 
        i.e. in this case we took 
        the first order difference. So either we can pass that variable and 
        put d=0 or pass the original variable 
        and put d=1. Both will generate same results.
    '''
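    # For example, with (p, d, q) = (2, 1, 0) the model fits the first
    # difference x'(t) = x(t) - x(t-1) as:
    #     x'(t) = c + a1*x'(t-1) + a2*x'(t-2) + e(t)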
    
    #ACF and PACF plots: dframe_diff
    from statsmodels.tsa.stattools import acf, pacf
    lag_acf = acf(winner, nlags=20)
    lag_pacf = pacf(winner, nlags=20, method='ols')
    top_line = 1.96/np.sqrt(len(winner))
    
    #%%
    #Get best q and p. Not optimized
    '''
    q=0    
    for i in lag_acf:
       if i > top_line:
           q+=1
       else:
           break
    
    p=0    
    for i in lag_pacf:
       if i > top_line:
           p+=1
       else:
           break
    '''
    #%%
    '''
    #Plot ACF: 
    plt.subplot(121) 
    plt.plot(lag_acf)
    plt.axhline(y=0,linestyle='--',color='gray')
    plt.axhline(y=-1.96/np.sqrt(len(winner)),linestyle='--',color='gray')
    plt.axhline(y=1.96/np.sqrt(len(winner)),linestyle='--',color='gray')
    plt.title('Autocorrelation Function')
    #Plot PACF:
    plt.subplot(122)
    plt.plot(lag_pacf)
    plt.axhline(y=0,linestyle='--',color='gray')
    plt.axhline(y=-1.96/np.sqrt(len(winner)),linestyle='--',color='gray')
    plt.axhline(y=1.96/np.sqrt(len(winner)),linestyle='--',color='gray')
    plt.title('Partial Autocorrelation Function')
    plt.tight_layout()
    plt.show()
    '''
    
    #%%
    '''
    print('Enter the value of q, corresponding to the ACF graph')
    q = raw_input()
    print('Enter the value of p, corresponding to the PACF graph')
    p = raw_input()
    
    q = int(q)
    p = int(p)
    '''
    
    #In this plot, the two dotted lines on either side of 0 are the
    #confidence intervals. These can be used to determine the 'p' and 'q' values as:
    '''
    p – The lag value where the PACF chart crosses the upper confidence interval for the first time. 
        In this case p=2.
    q – The lag value where the ACF chart crosses the upper confidence interval for the first time. 
        In this case q=6.
    '''
    
    #%%
    #Model  (p,d,q)
    
    #Finding best parameters
    from statsmodels.tsa.arima_model import ARIMA
    acc_list = []
    for d in range(0,3):
        for p in range(0,6):
            for q in range(0,6):
                #print('Model Result')
                try:
                    model_diff = ARIMA(winner, order=(p, d, q))  
                    results_ARIMA_diff = model_diff.fit(disp=-1) 
                    error = np.sqrt((results_ARIMA_diff.fittedvalues-winner[1:])**2)
                    error_prom = error.mean()
                    accuracy = 100-error_prom
                    acc_list.append([p, d, q, accuracy])
                except Exception:
                    continue
                #plt.plot(winner)
                #plt.plot(results_ARIMA_diff.fittedvalues, color='red')
                #plt.title('RSS: %.4f'% sum((results_ARIMA_diff.fittedvalues-winner)**2))
                #plt.show()
        
    from operator import itemgetter
    params = sorted(acc_list, key=itemgetter(3))[len(acc_list)-1]
    p = params[0]
    d = params[1]
    q = params[2]
    
#%%
    #Build model with best params       
    model_diff = ARIMA(winner, order=(p, d, q))  
    results_ARIMA_diff = model_diff.fit(disp=-1)
    
    '''
    plt.plot(winner)
    plt.plot(results_ARIMA_diff.fittedvalues, color='red')
    plt.title('RSS: %.4f'% sum((results_ARIMA_diff.fittedvalues-winner[2:])**2))
    plt.show()
    
    error = np.sqrt((results_ARIMA_diff.fittedvalues-winner)**2)
    error_prom = error.mean()
    accuracy = 100-error_prom
    '''
    
    #%%
    
    #Taking it back to original scale
    #store the predicted results as a separate series and observe it.
    if winner_index == 0:
        predictions_ARIMA = pd.Series(results_ARIMA_diff.fittedvalues, copy=True)
        pred_ARIMA_diff_corrected = predictions_ARIMA

    if winner_index == 1:
        predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True)
        pred_ARIMA_diff_corrected = np.exp(predictions_ARIMA_diff)

    if winner_index == 2:
        predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True)
        pred_ARIMA_diff_corrected = predictions_ARIMA_diff + features_train.shift(1)
        pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[1:]

    if winner_index == 3:
        predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True)
        # add back the shifted log level, then invert the log transform
        pred_ARIMA_diff_corrected = np.exp(predictions_ARIMA_diff + features_train_log.shift(1))
        pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[1:]
    
    if winner_index == 4:
        predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True)
        pred_ARIMA_diff_corrected = predictions_ARIMA_diff + features_train.shift(1)
        pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[2:]
        pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected.shift(-2)
        pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[:len(pred_ARIMA_diff_corrected)-2]
    
    if winner_index == 5:
        predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True)
        # undo the second difference, add back the shifted log level, then invert the log
        pred_ARIMA_diff_corrected = np.exp(predictions_ARIMA_diff + train_log_diff.shift(1) + features_train_log.shift(1))
        pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[2:]
        pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected.shift(-1)
        pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[:len(pred_ARIMA_diff_corrected)-1]                                                         

    #%% 
    #Visualize in-sample predictions                                                             
    '''
    plt.plot(features_train)
    plt.plot(pred_ARIMA_diff_corrected, color='red')
    '''
    #%%
    '''
    plt.plot(pred_ARIMA_diff_corrected.head(100), color='red')
    plt.plot(features_train.head(100))
    '''
    
    #%%
    '''
    plt.plot(pred_ARIMA_diff_corrected.tail(100), color='red')
    plt.plot(features_train.tail(100))
    '''
    
    #%%
    #visualize the error
    '''
    print('Percentage of Errors')
    in_sample_error = np.sqrt((pred_ARIMA_diff_corrected-features_train)**2)    
    in_sample_error_prom = error.mean()
    in_sample_accuracy = 100-error_prom
    
    plt.plot(in_sample_error)
    plt.title('Promedio Error: %.4f'% in_sample_error_prom + '; Precision: %.4f'% in_sample_accuracy)
    plt.show()
    '''
   
    #%%
    #Out of sample predictions
    if winner_index == 0:
        out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test)-2, dynamic=True)
    
    if winner_index == 1:
        out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test)-2, dynamic=True)
        out_of_sample_predictions_ARIMA = np.exp(out_of_sample_predictions_ARIMA)

    if winner_index == 2:
        out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test)-2, dynamic=True)
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[2:len(out_of_sample_predictions_ARIMA)-2]
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA + features_train.tail(len(features_test)).values
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA.shift(-2)
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[:len(out_of_sample_predictions_ARIMA)-2]                                                                    

    if winner_index == 3:
        out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test)-2, dynamic=True)
        out_of_sample_predictions_ARIMA = np.exp(out_of_sample_predictions_ARIMA)
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[2:len(out_of_sample_predictions_ARIMA)-2]
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA + features_train.tail(len(features_test)).values
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA.shift(-2)
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[:len(out_of_sample_predictions_ARIMA)-2]
        
    if winner_index == 4:
        out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test)-2, dynamic=True)
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[2:len(out_of_sample_predictions_ARIMA)-2]
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA + features_train.tail(len(features_test)).values
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA.shift(-5)
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[:len(out_of_sample_predictions_ARIMA)-4]    
        out_of_sample_predictions_ARIMA.index = features_test.head(len(out_of_sample_predictions_ARIMA)).index           
    if winner_index == 5:
        out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test)-2, dynamic=True)
        out_of_sample_predictions_ARIMA = np.exp(out_of_sample_predictions_ARIMA)
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[2:len(out_of_sample_predictions_ARIMA)-2]
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA + features_train.tail(len(features_test)).values
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA.shift(-4)
        out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[:len(out_of_sample_predictions_ARIMA)-4]    


    #%%
    #total error
    '''
    print('Out of sample prediction')
    plt.plot(features_test, color='green')
    plt.plot(out_of_sample_predictions_ARIMA, color='red')
    
    '''
    #visualize the error
    error = np.sqrt((out_of_sample_predictions_ARIMA-features_test)**2).head(len(out_of_sample_predictions_ARIMA))
    error_prom = error.mean()
    accuracy = 100-error_prom

    '''
    plt.plot(error)
    
    plt.title('Promedio Error: %.4f'% error_prom + '; Precision: %.4f'% accuracy)
    plt.show()
    '''
    
    #%%
    #error on the first 50 points
    '''
    print('Out of sample prediction First 50')
    plt.plot(features_test.head(50), color='green')
    plt.plot(out_of_sample_predictions_ARIMA.head(50), color='red')
    
    error_50 = np.sqrt((out_of_sample_predictions_ARIMA.head(50)-features_test.head(50))**2).head(len(out_of_sample_predictions_ARIMA.head(50)))
    error_prom_50 = error_50.mean()
    accuracy_50 = 100-error_prom_50

    plt.plot(error_50)
    plt.title('Promedio Error: %.4f'% error_prom_50 + '; Precision: %.4f'% accuracy_50)
    plt.show()
    '''
    
    #%%
    data = []
    for i in range(0, len(error)):
        entry = ObjDict()
        entry.fecha = str(out_of_sample_predictions_ARIMA.index[i])
        entry.prediccion = out_of_sample_predictions_ARIMA[i]
        entry.error = error[i]
        data.append(entry)
    
        #%%
    #print('data, RMSE, error_prom, accuracy')
    return json.dumps(data), error_prom, accuracy
Example #35
def ARIMAmodel(data, days=0):
    from statsmodels.tsa.arima_model import ARIMA
    model = ARIMA(data, order=(1, 1, 1))
    model_fit = model.fit(disp=False)
    yhat = model_fit.predict(len(data), len(data) + days, typ='levels')
    return (yhat)
Example #36
plt.show()

from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(211)
fig = plot_acf(diffshift, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = plot_pacf(diffshift, lags=40, ax=ax2)
plt.show()
#arima
from statsmodels.tsa.arima_model import ARIMA

model = ARIMA(indxds_logscale, order=(3, 1, 0))
results = model.fit(disp=0)
print(results.summary())
plt.plot(diffshift, color='g')
plt.plot(results.fittedvalues, color='b')
#plt.title('RSS: %.4f'% sum((results.fittedvalues-diffshift['Close'])**2))
####
x = results.forecast(steps=15)[0]
fore = np.exp(x)
z = fore.tolist()
price1 = pd.concat(
    [pd.Series(df['Close']), pd.Series(z)], ignore_index=True, copy=True)
print(price1.tail())
fig, ax = plt.subplots(1, 1)
price1.plot(ax=ax, color='k', label='actual')
price1.iloc[53:].plot(ax=ax, color='r', label='forecasted')
plt.xlabel('index')
Example #37
model_fit = ARIMAResults.load('model.pkl')
bias = numpy.load('model_bias.npy')
months_in_year = 12  # needed by inverse_difference below
# make first prediction
predictions = list()
yhat = float(model_fit.forecast()[0])
yhat = bias + inverse_difference(history, yhat, months_in_year)
predictions.append(yhat)
history.append(y[0])
print('>Predicted=%.3f, Expected=%.3f' % (yhat, y[0]))
# rolling forecasts
for i in range(1, len(y)):
    # difference data
    months_in_year = 12
    diff = difference(history, months_in_year)
    # predict
    model = ARIMA(diff, order=(6, 0, 0))
    model_fit = model.fit(trend='nc', disp=0)
    yhat = model_fit.forecast()[0]
    yhat = bias + inverse_difference(history, yhat, months_in_year)
    predictions.append(yhat)
    # observation
    obs = y[i]
    history.append(obs)
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))
# report performance
mse = mean_squared_error(y, predictions)
rmse = sqrt(mse)
print('RMSE: %.3f' % rmse)
pyplot.plot(y)
pyplot.plot(predictions, color='red')
pyplot.show()
Example #38
 def use_arima(self, training_data, p, d, q):
     model = ARIMA(training_data, order=(p, d, q))
     model_fit = model.fit(disp=False)
     return model_fit.forecast()[0]
Example #39
def global_forcast():
    dataset = pd.read_csv("data_formatted.csv")
    forecasting_dataset = pd.read_csv("forecasting.csv")
    """
    We are creating matrix of independent variable and vector of dependent variable
    """
    X = dataset.iloc[:, 0:1].values
    Y = dataset.iloc[:, 1:2].values
    X1 = forecasting_dataset.iloc[:, 0:1].values
    Y1 = forecasting_dataset.iloc[:, 1:2].values
    # =============================================================================
    # Y = dataset.iloc[:,3].values
    """
    Here we will check for the existence of any missing values and replace them with
    the mean of the column
    """

    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    Y = imputer.fit_transform(Y)
    """
    Here we are going to convert years to some label as numeric calculations can't be
    performed on labels
    """
    #from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    #
    #labelencoder_X = LabelEncoder()
    #X[:,0]=labelencoder_X.fit_transform(X[:,0])
    """
    We don't need hot encoding yet because years do have certain weightage
    """
    """
    This is a crucial data preprocessing step: we split the dataset into training and
    test sets to avoid overfitting, using a fixed random_state (42) for reproducibility.
    """
    #Training our model on training set data

    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=42)
    print("Global Oil consumption forecasting using ARIMA model")
    model = ARIMA(X_train, order=(1, 1, 0))
    model_fit = model.fit(disp=0)
    print(model_fit.summary())
    # plot residual errors
    residuals = DataFrame(model_fit.resid)
    residuals.plot()
    pyplot.show()
    residuals.plot(kind='kde')
    pyplot.show()
    print(residuals.describe())
    #Visualizing the result on test set
    X_test_list = []
    for x in X_test.flat:
        X_test_list.append(x)
    Y_pred = [3320, 3400, 3893, 3895]
    Y_forecast = [4394.190, 4421.864, 4719.0507607, 4628.7790, 5074.080]
    X_forecast = []
    for x in X1.flat:
        X_forecast.append(x)

    plt.scatter(X_test, Y_test, color="red")
    plt.plot(X_test_list, Y_pred, color="blue")
    plt.title("Year vs Consumption (Test set)")
    plt.xlabel("Year of Consumption")
    plt.ylabel("Total Consumption")
    locator = matplotlib.ticker.MultipleLocator(2)
    plt.gca().xaxis.set_major_locator(locator)
    formatter = matplotlib.ticker.StrMethodFormatter("{x:.0f}")
    plt.gca().xaxis.set_major_formatter(formatter)
    plt.show()

    plt.scatter(X_test, Y_test, color="red")
    plt.bar(X_test_list, Y_pred)
    plt.title("Year vs Consumption (Test set)")
    plt.xlabel("Year of Consumption")
    plt.ylabel("Total Consumption")
    locator = matplotlib.ticker.MultipleLocator(1)
    plt.gca().xaxis.set_major_locator(locator)
    formatter = matplotlib.ticker.StrMethodFormatter("{x:.0f}")
    plt.gca().xaxis.set_major_formatter(formatter)
    plt.show()

    #Forecasting the result for next five years

    plt.plot(X_forecast, Y_forecast, color="blue")
    plt.title("Year vs Consumption (Forecasting result)")
    plt.xlabel("Year of Consumption")
    plt.ylabel("Total Consumption")
    locator = matplotlib.ticker.MultipleLocator(2)
    plt.gca().xaxis.set_major_locator(locator)
    formatter = matplotlib.ticker.StrMethodFormatter("{x:.0f}")
    plt.gca().xaxis.set_major_formatter(formatter)
    plt.show()

    plt.bar(X_forecast, Y_forecast)
    plt.title("Year vs Consumption (Forecasting result)")
    plt.xlabel("Year of Consumption")
    plt.ylabel("Total Consumption")
    locator = matplotlib.ticker.MultipleLocator(1)
    plt.gca().xaxis.set_major_locator(locator)
    formatter = matplotlib.ticker.StrMethodFormatter("{x:.0f}")
    plt.gca().xaxis.set_major_formatter(formatter)
    plt.show()
Example #40
    def startARIMAForecasting(dataset, P, D, Q, newdates):
        model = ARIMA(dataset, order=(P, D, Q))

        model_fit = model.fit(disp=0)
        prediction = model_fit.forecast(len(newdates))[0]
        return prediction
Example #41
# Import the ARIMA module from statsmodels
from statsmodels.tsa.arima_model import ARIMA

# Forecast temperatures using an ARIMA(1,1,1) model
mod = ARIMA(temp_NY, order=(1, 1, 1))
res = mod.fit()

# Plot the original series and the forecasted series
res.plot_predict(start='1872-01-01', end='2046-01-01')
plt.show()
Example #42
def run_main():
    k = ts.get_hist_data('600519')  # 600519 is the Moutai stock; the date range can be set here
    # k  = ts.get_hist_data('600519',start='2015-05-04',end='2018-05-02')

    lit = ['open', 'high', 'close', 'low']  # we only keep these four columns
    data = k[lit]

    d_one = data.index  # the following lines convert the object index to datetime
    d_two = []
    d_three = []
    date2 = []
    for i in d_one:
        d_two.append(i)
    for i in range(len(d_two)):
        d_three.append(parse(d_two[i]))
    data2 = pd.DataFrame(data, index=d_three, dtype=np.float64)

    # build a new DataFrame indexed by the converted dates d_three; date_range() could also generate the index

    plt.plot(data2['close'])  # the data is clearly non-stationary, so we need differencing
    plt.title('Daily closing price')
    plt.show()

    data2_w = data2['close'].resample(
        'W-MON').mean()  # the raw data is dense, so resample weekly (taking the mean) for better forecasting
    data2_train = data2_w['2015':'2017']  # train only on the 2015-2017 data
    plt.plot(data2_train)
    plt.title('Weekly resampled data')
    plt.show()

    # first difference: inspect the ACF
    acf = plot_acf(data2_train, lags=20)  # inspect the training data with plot_acf to judge q
    plt.title("ACF of the stock index")
    acf.show()

    # first difference: inspect the PACF
    pacf = plot_pacf(data2_train, lags=20)  # inspect the training data with plot_pacf to judge p
    plt.title("PACF of the stock index")
    pacf.show()

    # make the data stationary
    data2_diff = data2_train.diff(1)  # first-order differencing via pandas diff()
    diff = data2_diff.dropna()
    for i in range(4):  # fifth-order differencing; first or second order is usually enough, this is overkill
        diff = diff.diff(1)
        diff = diff.dropna()
    plt.figure()
    plt.plot(diff)
    plt.title('Fifth-order difference')
    plt.show()

    # ACF of the fifth-order difference
    acf_diff = plot_acf(diff, lags=20)
    plt.title("ACF of the fifth-order difference")  # judge q from the ACF plot
    acf_diff.show()

    # PACF of the fifth-order difference
    pacf_diff = plot_pacf(diff, lags=20)  # judge p from the PACF plot
    plt.title("PACF of the fifth-order difference")
    pacf_diff.show()

    print("train sample data")
    print(data2_train.head())

    # set the order from the ACF, PACF, and differencing, then build the model
    model = ARIMA(data2_train, order=(6, 1, 5), freq='W-MON')  # (p, d, q); weekly frequency

    # fit the model
    arima_result = model.fit()

    # forecast
    pred_vals = arima_result.predict(
        '2017-01-02', dynamic=True,
        typ='levels')  # predict the data from 2017-01-02 onward

    # visualize the forecast
    stock_forcast = pd.concat([data2_w, pred_vals],
                              axis=1,
                              keys=['original',
                                    'predicted'])  # combine the original and predicted data, using keys to label them

    # plot
    plt.figure()
    plt.plot(stock_forcast)
    plt.title('Actual vs. predicted')
    plt.show()
Example #43
class arima():
    """ARIMA class object"""
    def __init__(
            self,
            df,
            col_name,
            latest_date,
            obs_num,
            lstm_pred,
            order=(0, 2, 1),
            pred_num=30,
            train_num=200,
    ):
        self.df = df
        self.col = col_name
        self.series = df[col_name]
        self.date = latest_date
        self.order = order
        self.obs_num = obs_num
        self.pred_num = pred_num
        self.train_num = train_num
        self.lstm_pred = lstm_pred
        best_cfg = load(open('_models/arima_order.pkl', 'rb'))
        try:
            order = best_cfg[col_name]
        except KeyError:
            order = (0, 2, 1)
        self.model = ARIMA(self.series[-1 * train_num:], order=order)
        self.model_fit = self.model.fit(disp=0)

    def quick_fit_plot(self):
        """create streamlit plot object"""
        st.pyplot(self.model_fit.plot_predict(1, self.obs_num + self.pred_num))

    def plot_acf_pacf(self):
        """Auto correlation function plot  on streamlit object"""
        fig, axs = plt.subplots(2)
        plt.subplots_adjust(hspace=0.4)
        plot_acf(self.series[-1 * self.train_num:], ax=axs[0])
        plot_pacf(self.series[-1 * self.train_num:], ax=axs[1])
        st.pyplot(fig)

    def get_pred(self):
        # forecast once: element [0] is the point forecast, [2] the confidence intervals
        trend, _, conf_inv = self.model_fit.forecast(self.pred_num)
        return trend, conf_inv

    def _create_df_plot(self, col_type='Cases', arima_on=True, lstm_on=True):
        """create  plot data frame of prediction and confidence region  for altair plot"""
        localize = lambda x: "{:,}".format(round(x))
        # origin data
        temp = pd.DataFrame(self.series[-1 * self.obs_num:]).rename(
            columns={self.col: 'cases'})
        temp['Date'] = temp.index
        temp['Type'] = col_type
        temp['Cases'] = temp['cases'].apply(localize)
        # predictions
        if arima_on:
            line = pd.DataFrame(self.model_fit.forecast(self.pred_num)[0],
                                index=pd.date_range(start=self.date +
                                                    dt.timedelta(days=1),
                                                    periods=self.pred_num),
                                columns=['cases'])
            line['Date'] = line.index
            line['Type'] = 'ARIMA_pred'
            line['Cases'] = line['cases'].apply(localize)

            temp = pd.concat([temp, line], ignore_index=True)

        # Predictions2
        if lstm_on:
            lstm_line = pd.DataFrame(
                self.lstm_pred[self.col]).rename(columns={self.col: 'cases'})
            lstm_line['Date'] = lstm_line.index
            lstm_line['Type'] = 'LSTM_pred'
            lstm_line['Cases'] = lstm_line['cases'].apply(localize)

            temp = pd.concat([temp, lstm_line], ignore_index=True)

        # confidence region
        cl = pd.DataFrame(self.model_fit.forecast(self.pred_num)[2],
                          index=pd.date_range(start=self.date +
                                              dt.timedelta(days=1),
                                              periods=self.pred_num),
                          columns=['lower', 'upper'])
        cl['Date'] = cl.index

        return temp, cl

    def draw_single_trend(self,
                          return_chart=False,
                          country_name='Cases',
                          arima_on=True,
                          lstm_on=True):

        df_plot, df_cl = self._create_df_plot(country_name, arima_on=arima_on, lstm_on=lstm_on)
        trend_chart = alt.Chart(df_plot).mark_line().encode(
            x=alt.X("Date:T", scale=alt.Scale(zero=False)),
            y=alt.Y("cases:Q", scale=alt.Scale(zero=False)),
            color=alt.Color('Type',
                            sort=[country_name, 'ARIMA_pred', "LSTM_pred"]),
            strokeDash=alt.condition(
                alt.FieldOneOfPredicate(field='Type',
                                        oneOf=['ARIMA_pred', 'LSTM_pred']),
                #((alt.datum.Type == 'ARIMA_pred') or (alt.datum.Type == 'LSTM_pred')),
                alt.value([10,
                           5]),  # dashed line: 10 pixel dashes with 5 pixel gaps
                alt.value([0])),
            tooltip=["Date:T",
                     "Cases:O"]).properties(width=800,
                                            height=300).interactive()

        band = alt.Chart(df_cl).mark_area(opacity=0.5, color='grey').encode(
            x=alt.X("Date:T", scale=alt.Scale(zero=False)),
            y=alt.Y('lower', title='cases'),
            y2=alt.Y2('upper',
                      title='cases')).properties(width=800,
                                                 height=300).interactive()

        if return_chart:
            return band + trend_chart
        else:
            st.altair_chart(band + trend_chart)
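# A hypothetical usage sketch for the class above. The DataFrame, column name,
# LSTM prediction frame, and data values are assumptions, not part of the
# original app:
#
#   model = arima(df=cases_df, col_name='US',
#                 latest_date=cases_df.index[-1], obs_num=90,
#                 lstm_pred=lstm_pred_df)
#   model.plot_acf_pacf()
#   model.draw_single_trend(country_name='US')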
Example #44
from matplotlib import pyplot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

series = [O[i][j]['rainfall'] for j in range(20) for i in range(52)]
X = np.sqrt(series)
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARIMA(history, order=(5, 2, 0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
# plot
pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()
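# The loop above is a walk-forward (rolling-origin) evaluation: refit on all
# history, forecast one step ahead, then append the observed value. A minimal
# reusable version of the same idea, assuming the same old ARIMA API imported
# above:
def walk_forward_arima(train, test, order=(5, 2, 0)):
    history = list(train)
    preds = []
    for obs in test:
        fit = ARIMA(history, order=order).fit(disp=0)
        preds.append(fit.forecast()[0])  # one-step-ahead point forecast
        history.append(obs)              # roll the forecast origin forward
    return preds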
Example #45
0
class Kappa(object):
    def __init__(self, fname, cvparams=True):
        '''
        Input: file path to dataframe to be used for analysis
        Output: Kappa object with cleaned dataframe
        '''
        self.fname = fname
        temp = pickle.load(open(fname, 'rb'))
        self.df = add_ngames_col(nstreams_filter(600,temp))
        self.df.set_index('date', inplace=True)
        self.df['dayofweek'] = self.df.index.dayofweek
        self.df['weekofyear'] = self.df.index.weekofyear
        self.df['year'] = self.df.index.year
        # self.rfr_params = {'n_estimators':300,
        #       'max_features':'sqrt',
        #       'n_jobs':-1}

        if cvparams:
            self.cvparams = pickle.load(open('pickle_pile/cross_val_smorc.pkl','rb'))
        self.cv_pred = {}

    def cfilt(self,channel):
        '''
        Input: channel to filter for
        Output: channel-specific dataframe
        '''
        self.cdf=chan_filter(self.df, channel)
        self.cdf.sort_index(inplace=True)


    def dumb_set(self,dft):
        '''
        Input: Initial dataframe
        Output: X, y for machine learning algorithms with dummified categorical variables.
        '''
        # for i in np.arange(7):
        #     dft['lagday{0}'.format(i+1)] = (dft["AVG CCV's"].shift((i+1))).fillna(0)
        y = dft["AVG CCV's"]
        dc = ['AirTime','Platform', 'tdelta','avg_frequency','Language', 'index','#', "AVG CCV's", "Max CCV's", 'Hours Watched']# 'Hours Watched', 'Channel', 'Main Game'

        X = dft.drop(dc, axis=1)
        X = pd.get_dummies(X, columns = ['Channel', 'Main Game', 'dayofweek'])
        #eventually add Language
        return X, y


    def _make_holdout_split(self, df, leaveout=3): #
        '''
        Input: dataframe and # leaveout weeks.
        Output: X,y training and hold data partitions
        '''
        # self.folds = pd
        lod = leaveout*7
        start, end = df.index.min(), df.index.max()
        self.folds = pd.date_range(start, end, freq='7D')#'{0}D'.format(lod))
        self.Xset, self.yset = self.dumb_set(df)
        lo = self.folds[-leaveout:][0]
        X_trainset = self.Xset.query('date < @lo')
        y_trainset = self.yset.reset_index().query('date < @lo')
        X_holdset = self.Xset.query('date >= @lo')
        y_holdset = self.yset.reset_index().query('date >= @lo')
        self.X_trainset = X_trainset.copy()
        self.y_trainset = y_trainset.copy().set_index('date')
        self.X_holdset  = X_holdset.copy()
        self.y_holdset = y_holdset.copy().set_index('date')
        self.X_train = self.X_trainset.reset_index().copy()
        self.y_train = self.y_trainset.reset_index().copy()
        self.X_hold = self.X_holdset.reset_index().copy()
        self.y_hold = self.y_holdset.reset_index().copy()

    def _fchain_kfold_indicies(self, lag=1, ahead=1):
        '''
        Input: lag weeks, ahead weeks
        Output: forward chain kfold cross validation indices for time series.
        '''
        #currently avoiding dummy problems by dummifying early, need to fix
        # and adapt later down the road. Also add cols
        ld = pd.Timedelta(days=lag*7)
        ad = pd.Timedelta(days=ahead*7)
        kstart, kend = self.X_trainset.index.min(), self.X_trainset.index.max()
        period = lag*7 + ahead*7
        self.kfolds = pd.date_range(kstart,kend, freq='{0}D'.format(period))
        self.train_kfoldi = []
        self.test_kfoldi = []
        self.fkfoldi = []
        # self.kfoldxi = []
        # self.kfoldyi = []
        for i, f in enumerate(self.kfolds):
            j = 1+i
            if f==kstart:
                #For first fold
                udb = self.kfolds[1] - ad
                fold_end = self.kfolds[1]
                train_xset = self.X_train.query('date < @udb')
                train_yset = self.y_train.query('date < @udb')
                test_yset = self.y_train.query('date >= @udb & date < @fold_end')
                test_xset = self.X_train.query('date >= @udb & date < @fold_end')

            elif i == len(self.kfolds)-1:
                #For last fold
                udb = kend - ad
                train_yset = self.y_train.query('date < @udb')
                train_xset = self.X_train.query('date < @udb')
                test_xset = self.X_train.query('date >= @udb')
                test_yset = self.y_train.query('date >= @udb')

            else:
                #middle folds
                udb = self.kfolds[j]-ad
                fold_end = self.kfolds[j]
                train_xset = self.X_train.query('date < @udb')
                train_yset = self.y_train.query('date < @udb')
                test_xset = self.X_train.query('date >= @udb & date < @fold_end')
                test_yset = self.y_train.query('date >= @udb & date < @fold_end')

            self.testx_ind = test_xset.index.values
            self.testy_ind = test_yset.index.values
            self.trainx_ind = train_xset.index.values
            self.trainy_ind = train_yset.index.values
            self.train_kfoldi.append([self.trainx_ind, self.trainy_ind])
            self.test_kfoldi.append([self.testx_ind, self.testy_ind])
            self.fkfoldi.append((self.trainx_ind, self.testx_ind))

        self.Xtrn = self.X_train.drop('date', axis=1)
        self.ytrn = self.y_train.drop('date', axis=1)
        # self.Xhld = self.X_hold.drop('date', axis=1)
        # self.yhld = self.yhld.drop('date', axis=1)



    def run_cvmod(self, channel):
        '''
        DEPRECATED:
        Originally used for testing/debugging
        '''
        self.cfilt(channel)
        # self.cdf.sort_index(inplace=True)
        self._make_holdout_split(self.cdf)
        self._fchain_kfold_indicies()
        # self.Xtrn = self.X_train.drop('date', axis=1)
        # self.ytrn = self.y_train.drop('date', axis=1)
        self.rfrcv = RandomForestRegressor(**self.rfr_params)
        mses = []
        r2s = []
        for j in xrange(len(self.kfolds)):
            print '******'
            print 'Evaluating Fold #{0}'.format(j)
            print '******'
            Xtrain_indices, ytrain_indices = self.train_kfoldi[j]
            Xtest_indices, ytest_indices = self.test_kfoldi[j]
            xtrain = self.Xtrn.iloc[Xtrain_indices]
            ytrain = self.ytrn.iloc[ytrain_indices]
            xtest = self.Xtrn.iloc[Xtest_indices]
            ytest = self.ytrn.iloc[ytest_indices]
            self.rfrcv.fit(xtrain.values, ytrain.values)
            ypred = self.rfrcv.predict(xtest)
            mses.append(mean_squared_error(ytest.values, ypred))
            r2s.append(r2_score(ytest.values, ypred))
        self.rmses, self.r2_scores = np.sqrt(mses), np.array(r2s)

    def test_stationarity(self,channel):
        '''
        Input: channel for testing
        Output: Results of Dickey-Fuller test and plot with data, rolling mean, and rolling std.
        '''
        #requires date index
        ts = chan_filter(self.df, channel)
        ts.sort_index(inplace=True)
        timeseries = ts["AVG CCV's"]
        rolmean = pd.Series.rolling(timeseries, window=7).mean()
        rolstd = pd.Series.rolling(timeseries, window=7).std()
        orig = plt.plot(timeseries, color='blue', label='Original')
        mean = plt.plot(rolmean, color='red', label='Rolling Mean')
        std = plt.plot(rolstd, color = 'black', label='Rolling std')
        plt.legend(loc='best')
        plt.show(block=False)
        print 'Results of Dickey-Fuller Test'
        dftest = adfuller(timeseries, autolag='AIC')
        dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number Observations Used'])
        for key, value in dftest[4].items():
            dfoutput['Critical Value (%s)' % key] = value
        print dfoutput

    def plot_acf_pacf(self, channel, lags=20):
        '''
        Input: channel and #lags to include
        Output: Plots with autocorrelation function and partial autocorrelation function.
        '''
        #set indexto date in input
        ts = chan_filter(self.df, channel)
        ts.sort_index(inplace=True)
        data = ts["AVG CCV's"]
        fig = plt.figure(figsize=(12,8))
        ax1 = fig.add_subplot(211)
        fig = plot_acf(data, lags=lags, ax=ax1)
        ax2 = fig.add_subplot(212)
        fig = plot_pacf(data, lags=lags, ax=ax2)
        plt.show()


    def pltrange(self):#, indx_range=None):
        '''
        DEPRECATED:
        Used for early EDA, data-viz and results analysis
        '''
        ypred = self.rfrcv.predict(self.Xtrn)
        # if indx_range != None:
        #     plt.plot(ypred[indx_range])
        #     plt.plot(self.ytrn.values[indx_range])
        #     plt.show(block=False)
        # else:
        plt.plot(ypred)
        plt.plot(self.ytrn.values)
        plt.show(block=False)

    def run_grid_search(self, estimator):
        '''
        Input: estimator name
        Output: best parameters for a given estimator
        '''

        if estimator.__class__.__name__ == 'RandomForestRegressor':
            self.gridsearch = GridSearchCV(estimator,
                                            self.rfr_gsparams,
                                            n_jobs=-1,
                                            verbose=True,
                                            scoring='mean_squared_error',
                                            cv=self.fkfoldi)


            #have this function return best params, then pass those as arguments to cross_val_score and loop through different channels
        elif estimator.__class__.__name__ == 'GradientBoostingRegressor':
            self.gridsearch = GridSearchCV(estimator,
                                            self.gboostR_gsparams,
                                            n_jobs=-1,
                                            verbose=True,
                                            scoring='mean_squared_error',
                                            cv=self.fkfoldi)
        self.gridsearch.fit(self.Xtrn, self.ytrn)
        print self.gridsearch.best_params_
        print 'for ', estimator.__class__.__name__

    def eval_models(self, channel):#deprecated, cvpredict needs partitions
        '''
        DEPRECATED:
        cvpredict requires full partitions of cross-val indices. Was attempting to simplify the code; however, forward-chain cross validation is not compatible with cvpredict.
        '''
        self.cfilt(channel)
        self._make_holdout_split(self.cdf)
        self._fchain_kfold_indicies()
        lassoCV_params = {'cv': self.fkfoldi,
                            'n_jobs':-1,
                            'alphas':np.logspace(-4,2,100)}
        ridgeCV_params =  {'cv': self.fkfoldi,
                            'alphas':np.logspace(-4,2,100),
                            'scoring':'mean_squared_error'}
        models = [RandomForestRegressor(**self.cvparams[channel]['RandomForestRegressor']['params']),
        GradientBoostingRegressor(**self.cvparams[channel]['GradientBoostingRegressor']['params']),
        LassoCV(**lassoCV_params),
        RidgeCV(**ridgeCV_params)]
        self.cv_pred[channel] = {}
        for mod in models:
            self.cv_pred[channel][mod.__class__.__name__] = cross_val_predict(estimator=mod, X=self.Xtrn, y=self.ytrn, cv=self.fkfoldi, n_jobs=-1)

        pickle.dump(self.cv_pred[channel], open('pickle_pile/{0}_cvpred.pkl'.format(channel),'wb'))

    def linear_kappa_search(self):
        '''
        DEPRECATED:
        more efficient method used elsewhere.
        Input:
        Output:
        '''
        channels = ['lirik', 'summit1g', 'imaqtpie', 'nl_kripp', 'destiny']
        self.lcvp = {}
        for channel in channels:
            self.lcvp[channel] = {}
            self.cfilt(channel)
            self._make_holdout_split(self.cdf)
            self._fchain_kfold_indicies()
            lassoCV_params = {'cv': self.fkfoldi,
                                'n_jobs':-1,
                                'alphas':np.logspace(-4,2,100)}
            ridgeCV_params =  {'cv': self.fkfoldi,
                                'alphas':np.logspace(-4,2,100),
                                'scoring':'mean_squared_error'}
            models = [LassoCV(**lassoCV_params), RidgeCV(**ridgeCV_params)]
            for regression in models:
                reg_name = regression.__class__.__name__
                self.lcvp[channel][reg_name] = {}
                regression.fit(self.Xtrn, self.ytrn)
                self.lcvp[channel][reg_name]['alpha'] = regression.alpha_
                mse_scores = cross_val_score(estimator=regression, X=self.Xtrn, y=self.ytrn, scoring='mean_squared_error', n_jobs=-1)
                self.lcvp[channel][reg_name]['rmse'] = np.sqrt(mse_scores).mean()



    def kappa_search(self, channel, estimator):
        '''
        Input: Channel for which to optimize model, estimator for model.
        Output: Gridsearch on estimator using parameters defined in function.
        '''
        self.cfilt(channel)
        self._make_holdout_split(self.cdf)
        self._fchain_kfold_indicies()
        self.rfr_gsparams = {'n_estimators': [10, 100, 200, 300],
                                'criterion': ['mse'],
                                'min_samples_split': [2,4,6],
                                'min_samples_leaf': [1,2],
                                'max_features': ['sqrt', None, 'log2'],
                                'n_jobs':[-1]
                                }
        self.gboostR_gsparams = {'loss': ['ls','lad','huber'],
                                    'learning_rate': [.001, .01, .1, 1, 2],
                                    'n_estimators': [50, 100, 200],
                                    'max_depth': [2,5,8,10],
                                    'max_features': [None,'sqrt','log2']
                            }
        self.xts = self.X_train.set_index('date')
        self.yts = self.y_train.set_index('date')
        self.arima_params = {'endog': self.yts, 'order': (2,1,2)}
        self.run_grid_search(estimator)

    def load_newh(self):
        '''
        Due to the time elapsed between data collection and analysis, new data had been acquired that could be analyzed.
        Output: New dataframe containing completely unseen data.
        '''
        temp = pickle.load(open('pickle_pile/dfg.pkl', 'rb'))
        dfg = nstreams_filter(600, temp)
        dfg.set_index('date', inplace=True)
        dfg['year']=dfg.index.year
        return dfg

    def _find_holdout_date_thresh(self,channel):
        '''
        Input: channel
        Output: date range of holdout set
        '''
        self.cfilt(channel)
        self._make_holdout_split(self.cdf)
        return self.X_hold['date'].min(), self.cdf.index.max()

    def eval_holdout_data(self, channel):
        '''
        Used after adding freshly collected data.
        Evaluates models using previously gridsearch-optimized estimators. Currently, because of dummy variables, these need to be created early in the process to ensure proper dimensionality of categorical features. This step is not necessary in graphlab due to its superior handling of categorical variables.
        Alternatively, building a dummy dictionary from the training set and then adding dummy columns to the holdout data encoded by that dictionary also works.
        '''
        dhmin, dhmax = self._find_holdout_date_thresh(channel)
        self.dfn = chan_filter(self.load_newh(), channel)
        self.dfn.sort_index(inplace=True)
        dft = self.dfn.query('date > @dhmax')
        self.dfu = pd.concat([self.cdf,dft])
        new_hold_num = self.dfu.query('date >= @dhmin').shape[0]
        lon = new_hold_num/7
        # dd = (self.X_hold.shape[0] + dft.shape[0])/7
        self._make_holdout_split(self.dfu, leaveout=lon)
        self._fchain_kfold_indicies()
        lassoCV_params = {'cv': self.fkfoldi,
                            'n_jobs':-1,
                            'alphas':np.logspace(-4,2,100)}
        ridgeCV_params =  {'cv': self.fkfoldi,
                            'alphas':np.logspace(-4,2,100),
                            'scoring':'mean_squared_error'}
        models = [RandomForestRegressor(**self.cvparams[channel]['RandomForestRegressor']['params']),
        GradientBoostingRegressor(**self.cvparams[channel]['GradientBoostingRegressor']['params']),
        LassoCV(**lassoCV_params),
        RidgeCV(**ridgeCV_params)]
        xh = self.X_hold.drop('date', axis=1)
        yh = self.y_hold.drop('date', axis=1).values
        plt.figure(figsize=(16,10))
        plt.plot(yh, label='True Values', color='black')
        for mod in models:
            mod.fit(self.Xtrn, self.ytrn)
            mod_name = mod.__class__.__name__
            ypred  = mod.predict(xh)
            mses = mean_squared_error(yh, ypred)
            rmse = np.sqrt(mses).mean()
            plt.plot(ypred, label = 'rmse: {0}, for model: {1}'.format(rmse, mod_name))

        plt.legend(loc='best')
        plt.show(block=False)

    def SMOrc_findbest(self):
        '''
        Output: pickled dictionary containing optimized model parameters for a specific channel.
        Runs a gridsearch for optimal parameters for each model, for each channel. This is not ideal, but it is currently the best option due to dramatic differences between individual channels.
        Further work involves the creation/optimization of a general model that is not tailored to a specific channel.

        Fun Fact: Variable name comes from twitch.tv emote that depicts a brutish orc, representing the brute force method of optimization used.
        '''
        self.prep_arima()
        #Has to be done with dummy info so we won't get an error when instantiating in a list with no input arguments
        models = [RandomForestRegressor(), GradientBoostingRegressor(), ARIMA(**self.arima_params)]
        channels = ['lirik', 'summit1g', 'imaqtpie', 'nl_kripp', 'destiny', 'admiral_bahroo']
        self.cvscores = {}
        for channel in channels:
            self.cvscores[channel] = {}
            for model in models:
                print 'Running ',model.__class__.__name__, ' for channel: ', channel
                self.kappa_search(channel, model)
                self.cvscores[channel][model.__class__.__name__] = {}
                if model.__class__.__name__ != 'ARIMA':
                    self.cvscores[channel][model.__class__.__name__]['params'] = self.gridsearch.best_params_
                    # self.mod = model(**self.gridsearch.best_params_)
                    mod = self.gridsearch.best_estimator_
                    self.cvscores[channel][model.__class__.__name__]['scores'] = cross_val_score(estimator=mod, X=self.Xtrn, y=self.ytrn, scoring='mean_squared_error', cv=self.fkfoldi, n_jobs=-1)
                else:
                    pass
                    # self.prep_arima(channel)
                    # mod = model(**self.arima_params)
                    # cvscore[channel][model.__class__.__name__]['scores'] = cross_val_score(estimator=mod, X=self.xts, y=self.yts, scoring='mean_squared_error', cv=self.ffkoldi, n_jobs=-1)
        pickle.dump(self.cvscores, open('pickle_pile/cross_val_SMOrc.pkl', 'wb'))


        # for estimator in models:
        #     self.cvscores[estimator] = {}
        #     for channel in channels:
        #         print 'Running ',estimator.__class__.__name__, ' for channel: ', channel
        #         self.kappa_search(channel, estimator)
        #         self.cvscores[estimator][channel] = {}
        #         if estimator.__class__.__name__ != 'ARIMA':
        #             self.cvscores[estimator][channel][params] = self.gridsearch.best_params_
        #             self.cvscores[estimator][channel][scores] = cross_val_score(estimator(**self.gridsearch.best_params_),self.Xtrn, self.ytrn, scoring='mean_squared_error', cv=self.fkfoldi)
        #         else:
        #             cvscore[estimator][channel][scores] = cross_val_score(estimator(**arima_params), scoring='mean_squared_error', cv=self.fkoldi)
        # pickle.dump(self.cvscores, open('pickle_pile/cross_val_SMOrc.pkl', wb))

    # def DansGame(self, channel):
    #
    #     self.cfilt(channel)
    #     self._make_holdout_split(self.cdf)
    #     self._fchain_kfold_indicies()
    #
    #     self.rfr = RandomForestRegressor(**self.cvparams[channel][RandomForestRegressor])
    #
    #     pass

    def run_arima(self):#use current build
        '''
        DEPRECATED:
        Primarily used for testing/debugging.
        Runs statsmodels ARIMA.
        '''

        self.xts = self.X_train.set_index('date')
        self.yts = self.y_train.set_index('date')
        self.yts.astype('float', inplace=True)
        self.arimod = ARIMA(endog = self.yts, order = (2,1,2))#, exog=self.xts)
        self.aresults = self.arimod.fit()

    def prep_arima(self, channel='lirik'):#use current build
        '''
        DEPRECATED:
        Required to 'prep' due to statsmodels' ARIMA not following the same fit/predict flow as the sklearn models used elsewhere.
        Abandoned in lieu of R arima methods.
        '''
        self.cfilt(channel)
        self._make_holdout_split(self.cdf)
        self._fchain_kfold_indicies()
        self.xts = self.X_train.set_index('date')
        self.yts = self.y_train.set_index('date')
        self.yts.astype('float', inplace=True)
        self.arima_params = {'endog': self.yts, 'order': (2,1,2)}
        # self.arimod = ARIMA(endog = self.yts, order = (2,1,2))#, exog=self.xts)
        # self.aresults = self.arimod.fit()
    #
    # def EleGiggle(self, tname, dname, ivars):
        # self.r = robjects.r("""
        #     tset = read.csv("{0}")
        #     dset = read.csv("{1}")
        #     y = dset["AVG CCV's"]
        #     features = {2}
        #     X = train_set[features]
        #     X_test = test_set[features]
        #     fit = auto.arima(y, xreg=X)
        #     ypred = forecast(fit, xreg=X_test)
        # """.format(train_name, test_name, ivars))
    #     rp = robjects.r("""ypred['mean']""")[0]
    #     ypred = [rp[i] for i in range(len(rp))]
    #     return ypred


    def SeemsGood(self):
        '''
        Input:
        Output: Saves figures comparing model performance for each channel listed below.
        '''
        channels = ['lirik', 'nl_kripp', 'imaqtpie', 'summit1g']
        # channels = ['lirik']
        tscores = {}
        for channel in channels:
            tscores[channel] = {}
            self.cfilt(channel)
            self._make_holdout_split(self.cdf)
            self._fchain_kfold_indicies()
            # self.prep_arima(channel)
            # self.arimod = ARIMA(**self.arima_params)
            self.prep_arima(channel)
            lassoCV_params = {'cv': self.fkfoldi,
                                'n_jobs':-1,
                                'alphas':np.logspace(-4,2,100)}
            ridgeCV_params =  {'cv': self.fkfoldi,
                                'alphas':np.logspace(-4,2,100),
                                'scoring':'mean_squared_error'}
            lassy = LassoCV(**lassoCV_params)
            ridge = RidgeCV(**ridgeCV_params)
            lassy.fit(self.Xtrn,self.ytrn)
            ridge.fit(self.Xtrn, self.ytrn)
            x = {}
            yp = {}
            y = {}
            mses = {}
            # resA = []
            for j in xrange(len(self.kfolds)):
                Xtrain_indices, ytrain_indices = self.train_kfoldi[j]
                Xtest_indices, ytest_indices = self.test_kfoldi[j]
                xtrain = self.Xtrn.iloc[Xtrain_indices]
                ytrain = self.ytrn.iloc[ytrain_indices]
                xtest = self.Xtrn.iloc[Xtest_indices]
                ytest = self.ytrn.iloc[ytest_indices]
                # models = [RandomForestRegressor(**self.cvparams[channel]['RandomForestRegressor']['params'])]


                models = [RandomForestRegressor(**self.cvparams[channel]['RandomForestRegressor']['params']),
                GradientBoostingRegressor(**self.cvparams[channel]['GradientBoostingRegressor']['params']),Ridge(ridge.alpha_)]
                #Lasso(lassy.alpha_)]#, ]
                # models = [ARIMA(endog=yits, order= (2,1,2))] #exog=xits)]
                # models = ['ARIMA']
                for mod in models:
                    mod_name = mod.__class__.__name__
                    print '******'
                    print mod_name
                    print '******'
                    if j==0:
                        x[mod_name], y[mod_name], yp[mod_name], mses[mod_name] = [], [], [], []
                    if mod_name != 'ARIMA':
                        mod.fit(xtrain,ytrain)
                        ypred = mod.predict(xtest)
                        yp[mod_name].append(ypred)
                        y[mod_name].append(ytest.values)
                        x[mod_name].append(xtest)
                        mses[mod_name].append(mean_squared_error(ytest.values,ypred))
                    # elif mod=='ARIMA':
                    #     yits = self.y_train.iloc[ytrain_indices]
                    #     yits.set_index('date',inplace=True)
                    #     yits.astype('float', inplace=True)
                    #     xits = self.X_train.iloc[Xtrain_indices]
                    #     xits.set_index('date',inplace=True)
                    #     xits.astype('float', inplace=True)
                    #
                    #     ydts = self.y_train.iloc[ytest_indices]
                    #     ydts.set_index('date',inplace=True)
                    #     ydts.astype('float', inplace=True)
                    #     xdts = self.X_train.iloc[Xtest_indices]
                    #     xdts.set_index('date',inplace=True)
                    #     xdts.astype('float', inplace=True)
                    #     ivars = 'c({})'.format(u' '.join(list(xits.columns)).encode('utf-8')[1:-1])
                    #     tfold = pd.concat([xits,yits], axis=1)
                    #     tname = 'pickle_pile/tfold{0}.csv'.format(j)
                    #     # tname = 'tfold{0}.csv'.format(j)
                    #
                    #     tname = ''.join(tname).encode('utf-8')
                    #     tfold.to_csv(tname)
                    #
                    #     # tname = 'pickle_pile/tfold.csv'
                    #
                    #     dfold = pd.concat([xdts,ydts], axis=1)
                    #     dname = 'pickle_pile/dfold{0}.csv'.format(j)
                    #     # dname = 'pickle_pile/dfold.csv'
                    #     dfold = to_csv(dname)
                    #     try:
                    #         ypred = self.EleGiggle(tname, dname, ivars)
                    #         yp[mod_name].append(ypred)
                    #         y[mod_name].append(ytest.values)
                    #         x[mod_name].append(xtest)
                            # mses[mod_name].append(mean_squared_error(ytest.values,ypred))
                    #     except:
                    #         print 'Nope'

                    # else:
                    #     ares = mod.fit()
                    #     ypred= ares.predict(start=ydts.index.min(), end=ydts.index.max())
                    #     yp[mod_name].append(ypred)
                    #     y[mod_name].append(ytest.values)
                    #     x[mod_name].append(xtest)
                        # mses[mod_name].append(mean_squared_error(ytest.values,ypred))

            self.xym = {}
            fig = plt.figure(figsize=(16,10))
            plt.tick_params(labelsize=18)
            for mod in models:
                mod_name = mod.__class__.__name__
                self.xym[mod_name] = {}
                self.xym[mod_name]['y'] = list(itertools.chain.from_iterable(y[mod_name]))
                self.xym[mod_name]['x'] = list(itertools.chain.from_iterable(x[mod_name]))
                self.xym[mod_name]['yp'] = list(itertools.chain.from_iterable(yp[mod_name]))
                self.xym[mod_name]['rmse'] = np.average(np.sqrt(mses[mod_name]))
                if mod_name=='RandomForestRegressor':
                    plt.plot(self.xym[mod_name]['yp'], color='blue', linewidth=2, label=mod_name+' rmse: {0}'.format(self.xym[mod_name]['rmse']))
                elif mod_name=='GradientBoostingRegressor':
                    plt.plot(self.xym[mod_name]['yp'], 'g--', linewidth=2, label=mod_name+' rmse: {0}'.format(self.xym[mod_name]['rmse']))
                elif mod_name=='Ridge':
                    plt.plot(self.xym[mod_name]['yp'], 'r--', linewidth=2, label=mod_name+' rmse: {0}'.format(self.xym[mod_name]['rmse']))
            # plt.scatter(self.Xtrn.index[:len(self.xym[mod_name]['y'])], self.xym[mod_name]['y'], color='green', marker='o', label='true')
            plt.plot(self.xym[mod_name]['y'], color='magenta', label='true', linewidth=2)
            plt.legend(loc='best', prop={'size':14})

            # plt.show()
            plt.savefig('figures/cv_{0}_rflmodel.png'.format(channel))
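# The _fchain_kfold_indicies method above implements forward-chaining
# cross-validation: each fold trains only on data before a cutoff and tests on
# the window right after it, so no fold ever sees the future. A compact sketch
# of the same validation scheme using sklearn's TimeSeriesSplit (an assumption:
# the original builds its own indices, but the idea is equivalent):
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

def forward_chain_scores(model, X, y, n_splits=5):
    scores = []
    for train_idx, test_idx in TimeSeriesSplit(n_splits=n_splits).split(X):
        model.fit(X[train_idx], y[train_idx])
        scores.append(mean_squared_error(y[test_idx], model.predict(X[test_idx])))
    return scores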
plt.legend(loc='best')
plt.title('Log - expwighted_avg - moving_avg')
plt.show(block=False)

# In[16]:

ts_log_diff = ts_log - ts_log.shift()
plt.plot(ts_log_diff)
ts_log_diff.dropna(inplace=True)

# In[24]:

from statsmodels.tsa.arima_model import ARIMA

model = ARIMA(ts_log, order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)

predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)

predictions_ARIMA_log = predictions_ARIMA_log.add(
    predictions_ARIMA_diff_cumsum, fill_value=0)

predictions_ARIMA = np.exp(predictions_ARIMA_log)

plt.plot(ts)
plt.plot(predictions_ARIMA)

plt.title('RMSE: %.4f' % np.sqrt(sum((predictions_ARIMA - ts)**2) / len(ts)))
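# A tiny worked check (made-up numbers) of the reconstruction above: the
# cumulative sum of the fitted first differences, added to the first log
# value, recovers the log levels, and np.exp() then recovers the original scale.
import numpy as np

log_vals = np.array([2.0, 2.3, 2.1, 2.6])
diffs = np.diff(log_vals)                                         # [0.3, -0.2, 0.5]
rebuilt = log_vals[0] + np.concatenate([[0.0], np.cumsum(diffs)])
assert np.allclose(rebuilt, log_vals)                             # levels recovered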
from statsmodels.tsa.arima_model import ARIMA
from scipy.stats import gaussian_kde
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
# imports below added so the snippet is self-contained
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot

def norm(x):
    return (x-np.min(x))/(np.max(x)-np.min(x))

dataframe = pd.read_csv('Chaotic_TimeSeries_turkey_elec.csv')
dataframe.head()
plt.plot(dataframe)
autocorrelation_plot(dataframe.ix[:,0])

### EVALUATE V3 LINES
model00 = ARIMA(np.array(dataframe.ix[:,0]), dates=None,order=(2,1,0))
model11 = model00.fit(disp=1)
model11.summary()
model11.forecast()
resid9=model11.resid
np.mean(abs(resid9))/max(np.array(dataframe.ix[:,0]))

x3 = resid9
x3 = x3[np.logical_not(np.isnan(x3))]
dftest13 = adfuller(x3, autolag='AIC')
dfoutput1 = pd.Series(dftest13[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
print('Dickey Fuller Test:\n',dfoutput1)

look_back=200
start=0
end=len(resid9)
lag=look_back
Example #48
ax[1].set_title("First-order differences of DJIA during Jan 2016-Dec 2016")

# plot signal
plotds(first_order_diff, nlag=50)
adf_result = adfuller(first_order_diff)
print("ADF Statistic: %f" % adf_result[0])
print("p-value: %f" % adf_result[1])

# Optimize ARIMA parameters
aicVal = []
for d in range(1, 3):
    for ari in range(0, 3):
        for maj in range(0, 3):
            try:
                arima_obj = ARIMA(djia_df["Close"].tolist(), order=(ari, d, maj))
                arima_obj_fit = arima_obj.fit()
                aicVal.append([ari, d, maj, arima_obj_fit.aic])
            except ValueError:
                pass

# Optimal ARIMA model
arima_obj = ARIMA(djia_df["Close"].tolist(), order=(0, 2, 1))
arima_obj_fit = arima_obj.fit(disp=0)
arima_obj_fit.summary()

# Evaluate prediction
pred = np.append([0, 0], arima_obj_fit.fittedvalues.tolist())  # pad the two observations lost to d=2 differencing
djia_df["ARIMA"] = pred
diffval = np.append([0, 0], arima_obj_fit.resid + arima_obj_fit.fittedvalues)
djia_df["diffval"] = diffval
pplt.autoscale(enable=True, axis='x', tight=None)
pplt.show()
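# The nested loops above are a brute-force AIC search over (p, d, q). A minimal
# reusable version of the same pattern, assuming the same old ARIMA API and
# numpy import already in scope for this snippet:
def best_order_by_aic(series, max_p=3, max_d=3, max_q=3):
    best_order, best_aic = None, np.inf
    for p in range(max_p):
        for d in range(1, max_d):
            for q in range(max_q):
                try:
                    aic = ARIMA(series, order=(p, d, q)).fit(disp=0).aic
                except (ValueError, np.linalg.LinAlgError):
                    continue  # this configuration failed to fit
                if aic < best_aic:
                    best_order, best_aic = (p, d, q), aic
    return best_order, best_aic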


# In[26]:

decomposition = seasonal_decompose(calc_ent2.entropy.values, freq=24)  
fig = plt.figure()  
fig = decomposition.plot()  
fig.set_size_inches(15, 8)


# In[44]:

model=ARIMA(calc_ent2['entropy'],(1,0,0))    ## The endogenous variable needs to be type Float or you get a cast error
model_fit = model.fit()       # fit is a Function
model_fitted = model_fit.fittedvalues    # fittedvalues is a Series
print(model_fit.summary())
print(model_fitted)


# In[29]:

from pprint import pprint               # get a variety of different attributes from the object (including functions)
#pprint (dir(model))
#pprint (dir(model_fit))


# In[30]:

print(model.endog_names)
Example #50
from scipy.stats import gaussian_kde
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
# imports below added so the snippet is self-contained
from statsmodels.tsa.arima_model import ARIMA
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot

def norm(x):
    return (x-np.min(x))/(np.max(x)-np.min(x))

start=0
end=-10
dataframe = pd.read_csv('Apple_Data_300.csv')[start:end]
dataframe.head()
autocorrelation_plot(dataframe.ix[:,4])

### EVALUATE V3 LINES
model00 = ARIMA(np.array(dataframe.ix[:,4]), dates=None,order=(2,1,0))
model11 = model00.fit(disp=1)
model11.summary()
model11.forecast()
resid9=model11.resid
np.mean(abs(resid9))/max(np.array(dataframe.ix[:,4]))

x3 = resid9
x3 = x3[np.logical_not(np.isnan(x3))]
dftest13 = adfuller(x3, autolag='AIC')
dfoutput1 = pd.Series(dftest13[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
print('Dickey Fuller Test:\n',dfoutput1)

look_back=200
start=0
end=len(resid9)
lag=look_back
Example #51
# decide the structure (p,q) of the model ------------------------------------

fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(residual, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(residual, lags=40, ax=ax2)
plt.show()

## decide the parameter of the model ------------------------------------------
from statsmodels.tsa.arima_model import ARIMA

model = ARIMA(residual, order=(3, 0, 2))  

results_ARIMA = model.fit(disp=-1)  

ARIMA_predict=results_ARIMA.predict('1959-07-01','1969-12-01')

ARIMA_all=pd.concat([results_ARIMA.fittedvalues,ARIMA_predict])

plt.plot(residual_1,color='k')

plt.plot(residual)

plt.plot(results_ARIMA.fittedvalues, color='red')

plt.plot(ARIMA_all, color='red')

plt.show()
Example #52
def feature_selection():

    start_date = '2016-06-01'
    end_date = '2016-07-01'
    data_file = "static/data/GBPUSD/DAT_MT_GBPUSD_M1_2016.csv"
    news = ["Brexit", "US presidential election 2012"]
    currency = ["GBP/USD", "EUR/USD"]
    example_number = 0

    #price
    data = read_csv(data_file)
    data['Time'] = data[['Date', 'Time']].apply(lambda x: ' '.join(x), axis=1)
    data['Time'] = data['Time'].apply(
        lambda x: to_datetime(x) - timedelta(hours=2))
    data.index = data.Time
    mask = (data.index > start_date) & (data.index <= end_date)
    data = data.loc[mask]
    series = data["Close"]

    #price and the gradient
    fig = plt.figure()

    ax3 = fig.add_subplot(211)
    ax3.plot(series)
    ax3.set_title(currency[example_number] + ' prices during ' +
                  news[example_number] + ' time period')
    ax3.set_xlabel('Time')
    ax3.set_ylabel('Price')

    np_array_series = np.array(data['Close'])
    np_array_dates = np.array(data.index)
    gradients = np.gradient(np_array_series)

    ax1 = fig.add_subplot(212)
    ax1.set_title('Gradients of the price series')
    ax1.set_xlabel('Time')
    ax1.set_ylabel('Gradient')
    ax1.plot(np_array_dates, gradients)

    fig.savefig("static/anomalies/feature_lection_image1.png")

    price_list = series.values
    ADF_result_price = adfuller(price_list)
    print('ADF Statistic: for series %f' % ADF_result_price[0])
    print('p-value: %f' % ADF_result_price[1])  #p-value: 0.668171
    print('Critical Values:')

    for key, value in ADF_result_price[4].items():
        print('\t%s: %.3f' % (key, value))

    #create log return series
    series_log_ret = np.log(data.Close) - np.log(data.Close.shift(1))
    series_log_ret = series_log_ret.dropna()

    log_return_list = series_log_ret.values
    ADF_result_log_return = adfuller(log_return_list)
    print('ADF Statistic: for series_log_ret %f' % ADF_result_log_return[0])
    print(
        'p-value: %f' % ADF_result_log_return[1]
    )  #p-value: 0.000000 therefore, null hypothesis is rejected. the system is stationary
    print('Critical Values:')

    for key, value in ADF_result_log_return[4].items():
        print('\t%s: %.3f' % (key, value))

    input_series = []
    #testing for stationarity: a p-value below 0.05 rejects the unit-root null
    if ADF_result_price[1] < 0.05:
        input_series = price_list
    else:
        input_series = log_return_list

    #Creating the ARIMA model
    arima_model = ARIMA(series_log_ret, order=(4, 1, 1))
    model_fit = arima_model.fit(disp=0)
    print(model_fit.summary())
    #tsaplots.plot_acf(series_log_ret, lags=30)
    #tsaplots.plot_pacf(series_log_ret, lags=30)

    #Getting the residual series
    residuals = pd.DataFrame(model_fit.resid)
    #np.square(residuals).plot()

    residual_list = residuals.values
    residual_squared = list()

    for x in residual_list:
        residual_squared.append(x[0])

    #checking for stationarity in the residual series
    ADF_result_residual_squared = adfuller(residual_squared)
    print('ADF Statistic: for residuals %f' % ADF_result_residual_squared[0])
    print(
        'p-value: %f' % ADF_result_residual_squared[1]
    )  #p-value: 0.000000 therefore, null hypothesis is rejected. the system is stationary
    print('Critical Values:')
    for key, value in ADF_result_residual_squared[4].items():
        print('\t%s: %.3f' % (key, value))

    #different configurations for GARCH model
    configurations = [[2, 0, 0], [2, 0, 1], [1, 0, 0], [1, 0, 1]]

    opt_model = {}
    opt_configuration = []

    #getting the most suitable configuration (lowest BIC wins; initialise once, outside the loop)
    BIC = np.inf
    for i in range(len(configurations)):
        garch_model = arch_model(series_log_ret,
                                 p=configurations[i][0],
                                 o=configurations[i][1],
                                 q=configurations[i][2])
        model = garch_model.fit(update_freq=5)
        if BIC > model.bic:
            BIC = model.bic
            opt_model = model
            opt_configuration = configurations[i]

    print(opt_model.summary())
    conditional_volatilit = opt_model.conditional_volatility

    #https://plot.ly/matplotlib/subplots/ for four
    #for three
    #ax1 = fig.add_subplot(221)

    fig = plt.figure()

    ax3 = fig.add_subplot(221)
    ax3.plot(series)
    ax3.set_title(currency[example_number] + ' prices during ' +
                  news[example_number] + ' time period')
    ax3.set_xlabel('Time')
    ax3.set_ylabel('Price')

    ax2 = fig.add_subplot(222)
    ax2.plot(conditional_volatilit)
    ax2.set_title('Conditional Volatility')
    ax2.set_xlabel('Time')
    ax2.set_ylabel('Conditional Volatility')

    ax1 = fig.add_subplot(223)
    ax1.plot(np_array_dates, gradients)
    ax1.set_title('Gradients: ' + currency[example_number] +
                  ' prices during ' + news[example_number])
    ax1.set_xlabel('Time')
    ax1.set_ylabel('Gradient')

    np_array_CH = np.array(conditional_volatilit)
    np_array_CH_dates = np.array(conditional_volatilit.index)
    gradients_CH = np.gradient(np_array_CH)

    ax4 = fig.add_subplot(224)
    ax4.plot(np_array_CH_dates, gradients_CH)
    ax4.set_title('Gradients: Conditional Volatility')
    ax4.set_xlabel('Time')
    ax4.set_ylabel('Gradient')

    fig.savefig("static/anomalies/feature_lection_image2.png")

    df_CH = pd.DataFrame()
    df_CH['Index'] = np_array_CH_dates
    df_CH['CH_Gradient'] = gradients_CH
    df_CH.index = df_CH['Index']
    df_CH['CH'] = conditional_volatilit
    df_CH = df_CH.drop(['Index'], axis=1)

    df_price = pd.DataFrame()
    df_price['Index'] = np_array_dates
    df_price['Price_Gradient'] = gradients
    df_price.index = df_price['Index']
    df_price['Price'] = series
    df_price = df_price.drop(['Index'], axis=1)

    features = pd.concat([df_price, df_CH], axis=1)
    features = features.dropna(axis=0)

    print(features)

    features.to_csv('static/anomalies/features.csv')

    return "done"
def arima_models(ts_log, p, d, q):
    model = ARIMA(ts_log, order = (p, d, q))
    results = model.fit(disp = -1)
    return results
from statsmodels.tsa.arima_model import ARIMA
import warnings

cols = train.columns[1:-1]
for key in top_pages:
    data = np.array(train.loc[top_pages[key],cols],'f')
    result = None
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        try:
            arima = ARIMA(data,[2,1,4])
            result = arima.fit(disp=False)
        except Exception:
            try:
                arima = ARIMA(data,[2,1,2])
                result = arima.fit(disp=False)
            except Exception:
                print(train.loc[top_pages[key],'Page'])
                print('\tARIMA failed')
    #print(result.params)
    if result is None:
        continue  # both ARIMA fits failed; skip this page
    pred = result.predict(2,599,typ='levels')
    x = [i for i in range(600)]
    i=0

    plt.plot(x[2:len(data)],data[2:] ,label='Data')
    plt.plot(x[2:],pred,label='ARIMA Model')
    plt.title(train.loc[top_pages[key],'Page'])
for key, value in result2[4].items():
    print('\t%s: %.3f' % (key, value))

## INDPRO first differenced log
result3 = adfuller(d_ln_indpro_temp)
print('ADF Statistic: %f' % result3[0])
print('p-value: %f' % result3[1])
print('Critical Values:')
for key, value in result3[4].items():
    print('\t%s: %.3f' % (key, value))

# ARIMA INDPRO
## fit model ARIMA(3,1,0), differencing done
## by ARIMA
model = ARIMA(indpro, order=(3, 1, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

## fit model ARIMA(3,0,0), differencing done by me
## beforehand. So this is essentially an ARMA(3, 0)
## model on the already differenced data
d_indpro = pd.DataFrame(d_indpro_temp)
model2 = ARIMA(d_indpro, order=(3, 0, 0))
model_fit2 = model2.fit(disp=0)
print(model_fit2.summary())

### model2 is equivalent to model
### hence, my differenced series is differenced
### in the same way as the ARIMA function differences

residuals = pd.DataFrame(model_fit.resid)
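# A quick way to verify the equivalence claimed above: the AR coefficients of
# the two fits should agree to numerical precision. (Sketch; model_fit and
# model_fit2 are the results objects fitted in the lines above.)
#   print(model_fit.arparams)    # ARIMA(3,1,0) fitted on the level series
#   print(model_fit2.arparams)   # ARIMA(3,0,0) fitted on the pre-differenced series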
Example #56
X = dataset.transpose()  # transpose the dataset matrix
plot_acf(X, lags=24)
plot_pacf(X, lags=24)
pyplot.show()


# %%
size = 24 * 7
train, test = X[:size], X[size:len(X)]
forecast = numpy.zeros(len(test))
bound = numpy.zeros((len(test), 2))
step = 4
for t in range(0, len(test), step):
    print(t)
    model = ARIMA(train, order=(7, 0, 0))
    model_fit = model.fit()
    output = model_fit.forecast(step, alpha=.05)
    forecast[t:t + step] = output[0]
    bound[t:t + step, :] = output[2]  # output[2]: the (lower, upper) confidence bounds per step
    train = numpy.append(train, test[t:t + step])
error = mean_absolute_error(test, forecast)
print('Test MAE: %.3f' % error)
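# Why the loop above indexes output[0] and output[2]: in the old ARIMA API,
# forecast(steps, alpha) returns a 3-tuple, which can also be unpacked
# directly. A sketch, reusing the names above:
#   fc, se, conf = model_fit.forecast(step, alpha=.05)
#   lower, upper = conf[:, 0], conf[:, 1]   # 95% interval bounds per step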


# %% plot
#==============================================================================
pyplot.show()
  
timeline = numpy.arange(0, len(test))
baseline = numpy.zeros(len(test))
residual = test - forecast
Example #57
def arima_with_data_transformation():
    # For ARIMA, the data has to be stationary, so we check that first.
    test_stationarity(ts)

    # Generate log series so we'll have stationary data
    log_d = np.log(ts)

    # Show the data after the transformation to log
    plt.plot(log_d, color='red')
    plt.show()

    # Generate shifted series
    log_d_diff = log_d - log_d.shift()
    log_d_diff.dropna(inplace=True)

    # Test if we now have stationary data
    test_stationarity(log_d_diff)

    # Check autocorrelation
    lag_acf = acf(log_d_diff, nlags=20)
    # Check partial autocorrelation (autocorrelation after removing the effect of intermediate lags)
    lag_pacf = pacf(log_d_diff, nlags=20, method='ols')

    #Plot ACF:
    plt.subplot(121)
    plt.plot(lag_acf)
    plt.axhline(y=0, linestyle='--', color='gray')
    plt.axhline(y=-1.96 / np.sqrt(len(log_d_diff)),
                linestyle='--',
                color='gray')
    plt.axhline(y=1.96 / np.sqrt(len(log_d_diff)),
                linestyle='--',
                color='gray')
    plt.title('Autocorrelation Function')
    plt.show()

    #Plot PACF:
    plt.subplot(122)
    plt.plot(lag_pacf)
    plt.axhline(y=0, linestyle='--', color='gray')
    plt.axhline(y=-1.96 / np.sqrt(len(log_d_diff)),
                linestyle='--',
                color='gray')
    plt.axhline(y=1.96 / np.sqrt(len(log_d_diff)),
                linestyle='--',
                color='gray')
    plt.title('Partial Autocorrelation Function')
    plt.tight_layout()
    plt.show()

    # Run Arima model
    model = ARIMA(log_d, order=(2, 1, 0))
    results_ARIMA = model.fit(disp=-1)
    plt.plot(log_d_diff, color='red')
    plt.plot(results_ARIMA.fittedvalues, color='yellow')
    plt.show()

    #scale it back to the original values
    predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
    predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()  # cumulative sum inverts the first difference
    predictions_ARIMA_log = pd.Series(log_d.ix[0], index=log_d.index)
    predictions_ARIMA_log = predictions_ARIMA_log.add(
        predictions_ARIMA_diff_cumsum, fill_value=0)
    predictions_ARIMA = np.exp(predictions_ARIMA_log)

    plt.plot(ts, color='yellow')
    plt.plot(predictions_ARIMA, color='green')
    plt.title('RMSE: %.4f' %
              np.sqrt(sum((predictions_ARIMA - ts)**2) / len(ts)))
    plt.show()
result = pd.DataFrame(columns=['artist_id', 'plays', 'Ds'])

# fit data
for aid in arts['artist_id']:
    one = arts[arts.artist_id == aid]
    one.pop('artist_id')
    ts = pd.Series(data=one['plays'], index=one.index)
    # log
    ts_log = np.log(ts)
    # difference
    ts_log_diff = ts_log - ts_log.shift(7)
    ts_log_diff.dropna(inplace=True)
    # arima
    model = ARIMA(ts_log_diff, order=(7, 0, 0))
    results_ARIMA = model.fit(maxiter=1000000)
    # fit
    predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
    predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
    predictions_ARIMA_log = pd.Series(ts_log.ix[0], index=ts_log.index)
    predictions_ARIMA_log = predictions_ARIMA_log.add(
        predictions_ARIMA_diff_cumsum, fill_value=0)
    predictions_ARIMA = np.exp(predictions_ARIMA_log)
    # predict
    pred = results_ARIMA.predict(start='20150831', end='20151030')
    pred_ARIMA_diff_cumsum = pred.cumsum()
    pred_ARIMA_log = pd.Series(ts_log.ix[len(ts_log) - 1], index=pred.index)
    pred_ARIMA_log = pred_ARIMA_log.add(pred_ARIMA_diff_cumsum, fill_value=0)
    pred_ARIMA = np.exp(pred_ARIMA_log)
    # plot
    fig, ax = plt.subplots(nrows=1, ncols=1)
    mpl.rcParams['axes.unicode_minus'] = False

    x = data['Passengers'].astype(np.float)
    x = np.log(x)
    print x.head(10)

    show = 'prime'   # 'diff', 'ma', 'prime'
    d = 1
    diff = x - x.shift(periods=d)
    ma = x.rolling(window=12).mean()
    xma = x - ma

    p = 2
    q = 2
    model = ARIMA(endog=x, order=(p, d, q))     # autoregressive order p, differencing order d, moving-average order q
    arima = model.fit(disp=-1)                  # disp<0: suppress fitting output
    prediction = arima.fittedvalues
    print type(prediction)
    y = prediction.cumsum() + x[0]
    mse = ((x - y)**2).mean()
    rmse = np.sqrt(mse)

    plt.figure(facecolor='w')
    if show == 'diff':
        plt.plot(x, 'r-', lw=2, label=u'original data')
        plt.plot(diff, 'g-', lw=2, label=u'order-%d difference' % d)
        #plt.plot(prediction, 'r-', lw=2, label=u'predicted data')
        title = u'Passenger counts over time (log scale)'
    elif show == 'ma':
        #plt.plot(x, 'r-', lw=2, label=u'original data')
        #plt.plot(ma, 'g-', lw=2, label=u'moving average')
Example #60
    plot_pacf(xt, lags=50, ax=ax_pacf)
    plt.tight_layout()
    return None


# plotting data
plotds(Nifty_data['Close'], nlag=50)

#plotting QQ plot and probability plot
sm.qqplot(Nifty_data['Close'], line='s')

# Optimize ARIMA parameters
aicVal = []
for d in range(0, 3):
    for ari in range(0, 3):
        for maj in range(0, 3):
            try:
                arima_obj1 = ARIMA(train.tolist(), order=(ari, d, maj))
                arima_obj1_fit = arima_obj1.fit()
                aicVal.append([ari, d, maj, arima_obj1_fit.aic])
            except (ValueError, np.linalg.LinAlgError):
                pass

print(aicVal)

pred = np.append([0, 0], arima_obj1_fit.fittedvalues.tolist())

import sklearn

sklearn.metrics.r2_score(train.tolist(), pred)  # r2_score expects (y_true, y_pred)