def test_ld(self):
    """Check that Levinson-Durbin PACF estimates match Yule-Walker ones.

    Two pairs are compared on ``self.x`` with 40 lags, each to
    ``DECIMAL_8`` places:

    * ``pacf_yw(..., method="mle")`` against ``pacf(..., method="ldb")``
    * ``pacf(..., method="yw")`` against ``pacf(..., method="ldu")``
    """
    n_lags = 40

    # First pair: "mle" Yule-Walker vs. "ldb" Levinson-Durbin.
    yule_walker = pacf_yw(self.x, nlags=n_lags, method="mle")
    levinson_durbin = pacf(self.x, nlags=n_lags, method="ldb")
    assert_almost_equal(yule_walker, levinson_durbin, DECIMAL_8)

    # Second pair: "yw" Yule-Walker vs. "ldu" Levinson-Durbin.
    yule_walker = pacf(self.x, nlags=n_lags, method="yw")
    levinson_durbin = pacf(self.x, nlags=n_lags, method="ldu")
    assert_almost_equal(yule_walker, levinson_durbin, DECIMAL_8)
def get_acf_pacf(self, inputDataSeries, lag = 15):
    """Compute, plot and return the ACF and PACF of a series.

    The ACF is computed twice: by hand (correlating the series with shifted
    copies of itself) and with ``acf``; both are drawn on the upper subplot
    so they can be compared. The PACF from ``pacf`` is drawn on the lower
    subplot.

    Parameters
    ----------
    inputDataSeries : pandas object with an ``index``
        The data to analyse; its first column is used for the manual ACF.
    lag : int
        Number of lags. Only its magnitude is used as the lag count; its
        sign is used as the shift direction solely when the index order
        cannot be determined.

    Returns
    -------
    dict
        ``{'acf': <list of manually computed correlations>,
        'pacf': <result of pacf(...)>}``.
    """
    # Copy the data in input data
    outputData = pandas.DataFrame(inputDataSeries)

    # The shift direction depends on whether the index ascends or descends.
    first_label = inputDataSeries.index[0]
    if min(inputDataSeries.index) == first_label:
        # Ascending
        multiplier = 1
    elif max(inputDataSeries.index) == first_label:
        # Descending
        multiplier = -1
    else:
        print('Cannot determine the order put the lag value manually')
        print('Syntax: calc_returns(inputData, columnName, lag = lag_value)')
        # Fall back on the sign the caller supplied instead of leaving
        # ``multiplier`` undefined (which raised NameError further down).
        multiplier = 1 if lag >= 0 else -1

    # ``nlags`` must be a non-negative count regardless of shift direction.
    n_lags = abs(lag)

    columnName = outputData.columns[0]
    series = outputData[columnName]

    # Manual ACF: lag 0 first, then one correlation per shifted copy.
    acf_values = [series.corr(series)]
    for i in range(1, n_lags + 1):
        acf_values.append(series.corr(series.shift(multiplier * i)))

    library_acf = acf(inputDataSeries, nlags = n_lags)
    pacf_values = pacf(inputDataSeries, nlags = n_lags)
    lags = range(len(acf_values))

    # Two stacked subplots: ACF on top, PACF below.
    fig = plt.figure()
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    ax1.plot(lags, library_acf, lags, acf_values, 'ro')
    ax2.plot(lags, pacf_values, 'g*-')

    # Plot horizontal lines
    ax1.axhline(y = 0.0, color = 'black')
    ax2.axhline(y = 0.0, color = 'black')

    # Axis labels: set through the axes API. The original assigned strings
    # to plt.xlabel / plt.ylabel, which replaced the pyplot functions.
    ax2.set_xlabel('Lags')
    ax1.set_ylabel('Correlation Coefficient')
    ax2.set_ylabel('Correlation Coefficient')

    return {'acf' : list(acf_values),
            'pacf': pacf_values}
def partial_autocorrelation(x, *args, nlags=None, method='ldb', **kwargs):
    """
    Return partial autocorrelation function (PACF) of signal `x`.

    Parameters
    ----------
    x: array_like
        A 1D signal.
    nlags: int
        The number of lags to calculate the correlation for
        (default: min(1000, len(x) - 1)).
    method: str
        Estimation method passed to `statsmodels.tsa.stattools.pacf`
        (default: 'ldb').
    args, kwargs
        As accepted by `statsmodels.tsa.stattools.pacf`.

    Returns
    -------
    The value of ``_significant_acf(result, alpha)``, where ``result`` is
    whatever `statsmodels.tsa.stattools.pacf` returned and ``alpha`` is
    ``kwargs.get('alpha')`` (``None`` when not supplied).
    """
    from statsmodels.tsa.stattools import pacf

    lag_count = min(1000, len(x) - 1) if nlags is None else nlags
    result = pacf(x, *args, nlags=lag_count, method=method, **kwargs)
    alpha = kwargs.get('alpha')
    return _significant_acf(result, alpha)
def test_ols(self):
    """Check OLS PACF values and the half-width of their 95% intervals.

    The estimates past lag 0 must match ``self.pacfols``; the confidence
    intervals, once centred on their own midpoints, must match the
    reference half-width for lags 1 to 40.
    """
    estimates, intervals = pacf(self.x, nlags=40, alpha=.05, method="ols")
    assert_almost_equal(estimates[1:], self.pacfols, DECIMAL_6)

    # Subtract each row's mean so only the interval half-width remains.
    row_means = intervals.mean(1)
    centered = intervals - row_means[:, None]

    # from edited Stata ado file
    half_width = .1375625
    expected = [[-half_width, half_width]] * 40
    assert_almost_equal(centered[1:41], expected, DECIMAL_6)
def ACF_PACF_plot(self):
    """Plot the ACF and PACF of ``self.ts_log_diff`` side by side.

    The plots are used to choose the ARIMA(p, d, q) orders:
    the ACF suggests the MA order q (cut-off after x lags) and
    the PACF suggests the AR order p (cut-off after y lags).

    Dashed lines mark zero and the +/-1.96/sqrt(n) bounds. The figure is
    drawn on the current pyplot figure and is neither shown nor returned.
    """
    # The original read a bare ``ts_log_diff`` for the bounds, which is not
    # defined in this method; the series lives on the instance.
    series = self.ts_log_diff
    lag_acf = acf(series, nlags=20)
    lag_pacf = pacf(series, nlags=20, method='ols')
    bound = 1.96/np.sqrt(len(series))

    #Plot ACF:
    ax = plt.subplot(121)
    plt.plot(lag_acf)
    # Only the first five lags are shown on the ACF panel.
    ax.set_xlim([0,5])
    plt.axhline(y=0,linestyle='--',color='gray')
    plt.axhline(y=-bound,linestyle='--',color='gray')
    plt.axhline(y=bound,linestyle='--',color='gray')
    plt.title('Autocorrelation Function')

    #Plot PACF:
    plt.subplot(122)
    plt.plot(lag_pacf)
    plt.axhline(y=0,linestyle='--',color='gray')
    plt.axhline(y=-bound,linestyle='--',color='gray')
    plt.axhline(y=bound,linestyle='--',color='gray')
    plt.title('Partial Autocorrelation Function')
    plt.tight_layout()
def plotPACF(timeSeries):
    """Draw the 20-lag OLS PACF of `timeSeries` in subplot position 122.

    Dashed gray lines mark zero and the +/-1.96/sqrt(n) bounds. The plot is
    added to the current pyplot figure; nothing is shown or returned.
    """
    pacf_values = pacf(timeSeries, nlags=20, method='ols')

    plt.subplot(122)
    plt.plot(pacf_values)

    # Reference lines: zero first, then the lower and upper bounds.
    bound = 1.96/np.sqrt(len(timeSeries))
    for level in (0, -bound, bound):
        plt.axhline(y=level, linestyle='--', color='gray')

    plt.title('Partial Autocorrelation Function')
    plt.tight_layout()
def ARIMA_fun( data ):
    """Fit an ARIMA(1, 1, q) model and plot its fitted values over `data`.

    ``q`` is ``int()`` of the first element of the second value returned by
    ``acf(data, nlags=20, qstat=True, unbiased=True)``.

    Returns the fitted values of the model.
    """
    # Not used below; the call is kept so behaviour is unchanged.
    lag_pacf = pacf( data, nlags=20, method='ols' )

    # NOTE(review): the original named the second value ``ci2``; with
    # qstat=True and no alpha it is presumably the Q statistic rather than
    # a confidence interval -- confirm against the statsmodels docs.
    lag_acf, second_output, third_output = acf(
        data, nlags=20 , qstat=True, unbiased=True)
    ma_order = int(second_output[0])

    # NOTE(review): the model is built from the outer-scope ``orig_data``,
    # not from the ``data`` argument -- confirm this is intended.
    arima_model = ARIMA(orig_data, order=(1, 1, ma_order))
    fit_result = arima_model.fit(disp=-1)

    plt.subplot(121)
    plt.plot( data )
    plt.plot(fit_result.fittedvalues)
    #plt.show()
    return fit_result.fittedvalues
def FE(self, serie_atual):
    '''
    Extract a feature list from the first difference of a series.

    :param serie_atual: the actual (observed) series
    :return: list holding, in order: the ``acf`` values (nlags=5), the
        ``pacf`` values (nlags=5), the standard deviation, the skewness,
        the kurtosis and the result of ``self.turningpoints`` -- all
        computed on the differenced series
    '''
    # First difference; the leading element is dropped because it has no
    # predecessor.
    original = pd.Series(serie_atual)
    differenced = (original - original.shift())[1:]

    # features 1 and 2: autocorrelation and partial autocorrelation
    features = list(acf(differenced, nlags=5))
    features.extend(pacf(differenced, nlags=5))

    # feature 3: spread of the differenced series (standard deviation)
    features.append(differenced.std())

    # feature 4: skewness
    features.append(differenced.skew())

    # feature 5: kurtosis
    features.append(differenced.kurtosis())

    # feature 6: turning points
    features.append(self.turningpoints(differenced))

    # features 7 and 8: not implemented
    return features
def global_analysis(csv_fname, trajectory_df):
    """Compute the ACF and PACF of every column of a trajectory.

    Parameters
    ----------
    csv_fname
        Not used by this function.
    trajectory_df : pandas.DataFrame
        One column per variable. Each column's positional index selects the
        row of the output arrays it is written to.

    Returns
    -------
    None
        When the trajectory has fewer than ``MIN_TRAJECTORY_LEN`` rows.
    (acf_data, pacf_data) : tuple of numpy.ndarray
        Otherwise; each has shape ``(len(INTERESTED_VALS), 1, LAGS + 1)``.
    """
    # catch small trajectory_dfs
    if len(trajectory_df.index) < MIN_TRAJECTORY_LEN:
        return None

    n_vars = len(INTERESTED_VALS)
    acf_data = np.zeros((n_vars, 1, LAGS+1))
    pacf_data = np.zeros((n_vars, 1, LAGS+1))

    # do analysis variable by variable
    # ``items()`` replaces ``iteritems()``, which pandas 2.0 removed.
    for var_name, var_values in trajectory_df.items():
        # positional index of the column selects the output row
        var_index = trajectory_df.columns.get_loc(var_name)

        # run ACF and PACF for the column; confidence intervals are
        # requested (alpha=.05) but not stored
        col_acf, acf_confint = acf(var_values, nlags=LAGS, alpha=.05)
        acf_data[var_index, 0, :] = col_acf

        col_pacf, pacf_confint = pacf(var_values, nlags=LAGS, method='ywmle', alpha=.05)
        # TODO: check for PACF values above or below +-1
        pacf_data[var_index, 0, :] = col_pacf

    return acf_data, pacf_data
def get_acf_pacf(self, inputDataSeries, lag = 15):
    """Return the ACF and PACF of a series.

    Parameters
    ----------
    inputDataSeries : pandas object with an ``index``
        The data to analyse.
    lag : int
        Number of lags; only its magnitude is used.

    Returns
    -------
    dict
        ``{'acf': acf(...), 'pacf': pacf(...)}``, both computed with
        ``nlags=abs(lag)``.

    A message is printed when the index is neither ascending nor
    descending, judged by its first label.
    """
    first_label = inputDataSeries.index[0]
    is_ascending = min(inputDataSeries.index) == first_label
    if not is_ascending and max(inputDataSeries.index) != first_label:
        print('Cannot determine the order put the lag value manually')
        print('Syntax: calc_returns(inputData, columnName, lag = lag_value)')

    # The original negated ``lag`` for a descending index and passed the
    # negative value as ``nlags``. ``nlags`` is a count, so use the
    # magnitude whatever the index order.
    n_lags = abs(lag)

    return {'acf' : acf(inputDataSeries, nlags = n_lags),
            'pacf': pacf(inputDataSeries, nlags = n_lags)}
def corrfunc(timeseries):
    """Plot the ACF and PACF of `timeseries` and save them to a PNG.

    Two stacked panels (ACF on top, PACF below, 20 lags each) are drawn
    with the statsmodels plotting helpers and written to ``corrfunc.png``
    in the current working directory. Nothing is returned.
    """
    diff_ts = timeseries - timeseries.shift()
    diff_ts.dropna(inplace=True)
    # NOTE(review): these values are computed from the differenced series
    # but never used; the plots below use the undifferenced ``timeseries``.
    # Confirm which series was meant to be plotted.
    ts_acf = acf(diff_ts, nlags=20)
    ts_pacf = pacf(diff_ts, nlags=20, method='ols')

    # Booleans instead of the "on"/"off" strings: current Matplotlib treats
    # any non-empty string as True, so "off" no longer hid the ticks.
    tick_style = dict(axis="both", which="both",
                      bottom=True, top=False, labelbottom=True,
                      left=True, right=False, labelleft=True)

    #Plot ACF and PACF:
    fig = plt.figure(figsize=(12,8))
    ax1 = fig.add_subplot(211)
    plt.tick_params(**tick_style)
    fig = sm.graphics.tsa.plot_acf(timeseries.values.squeeze(), lags=20, ax=ax1)
    plt.title('ACF', fontsize=15)

    ax2 = fig.add_subplot(212)
    fig = sm.graphics.tsa.plot_pacf(timeseries, lags=20, ax=ax2)
    plt.tick_params(**tick_style)
    plt.xlabel("Lags", fontsize=14)
    plt.title('PACF', fontsize=15)
    plt.tight_layout()
    fig.savefig('corrfunc.png', bbox_inches="tight")
def plot_acf_and_pacf(y):
    """Show the 20-lag ACF and OLS PACF of `y` in two side-by-side panels.

    Each panel carries dashed gray lines at zero and at +/-1.96/sqrt(n).
    The figure is shown and then closed; nothing is returned.
    """
    # (subplot position, values to plot, panel title)
    panels = (
        (121, acf(y, nlags=20), 'Autocorrelation Function'),
        (122, pacf(y, nlags=20, method='ols'), 'Partial Autocorrelation Function'),
    )

    for position, values, heading in panels:
        plt.subplot(position)
        plt.plot(values)
        plt.axhline(y=0, linestyle='--', color='gray')
        plt.axhline(y=-1.96/np.sqrt(len(y)), linestyle='--', color='gray')
        plt.axhline(y=1.96/np.sqrt(len(y)), linestyle='--', color='gray')
        plt.title(heading)

    plt.tight_layout()
    plt.show()
    plt.close()
def acf_pacf(ts):
    """Show the ACF and PACF of the differenced series returned by ``trend``.

    ``trend(ts)`` yields two values; the second is analysed with 20 lags
    (PACF via OLS). Both panels get dashed gray lines at zero and at
    +/-1.96/sqrt(n). The figure is shown; nothing is returned.
    """
    ts_log, ts_log_diff = trend(ts)
    lag_acf = acf(ts_log_diff, nlags = 20)
    lag_pacf = pacf(ts_log_diff, nlags = 20, method = 'ols')

    def draw_panel(position, values, heading):
        # One correlogram panel with its three reference lines.
        plt.subplot(position)
        plt.plot(values)
        plt.axhline(y=0, linestyle = '--', color = 'gray')
        plt.axhline(y = -1.96/np.sqrt(len(ts_log_diff)), linestyle = '--', color = 'gray')
        plt.axhline(y = 1.96/np.sqrt(len(ts_log_diff)), linestyle = '--', color = 'gray')
        plt.title(heading)

    draw_panel(121, lag_acf, 'Autocorrelation Function')
    draw_panel(122, lag_pacf, 'Partial Autocorrelation Function')
    plt.tight_layout()
    plt.show()
ts_df_log_rolling = (ts_df_log - ts_df_log_rolling_temp).dropna() plt.figure(figsize=(15, 6)) plt.plot(ts_df_log, label='Log Transformed') plt.plot(ts_df_log_rolling, color='red', label='Log and Rolling Average Transformed') plt.legend(loc='best') plt.show() Dickey_Fuller_test(ts_df_log_rolling.Weighted_Price) return ts_df_log_rolling ts_df_log_rolling = rolling_avg_diff(ts_df_log, ts_df_log_rolling_temp) lag = 20 lag_pacf = pacf(ts_df_log_rolling, nlags=lag, method='ols') lag_acf = acf(ts_df_log_rolling, nlags=lag) lag = 20 lag_pacf = pacf(ts_df_log_rolling, nlags=lag, method='ols') lag_acf = acf(ts_df_log_rolling, nlags=lag) #Plot ACF: plt.figure(figsize=(15, 3)) plt.plot(lag_acf) plt.axhline(y=0, linestyle='--', color='gray') plt.axhline(y=-1.96 / np.sqrt(len(ts_df_log_rolling)), linestyle='--', color='gray') plt.axhline(y=1.96 / np.sqrt(len(ts_df_log_rolling)), linestyle='--',
def model_feature(file_name, df, feature):
    """Run the time-series analysis for one feature and write its report.

    A fresh output directory is created for the feature (any existing one
    is deleted). With that directory as the working directory, the function
    plots the first difference, its ACF and PACF and a seasonal
    decomposition, tries several ARIMA/ARMA models, produces forecasts and
    writes the accumulated text report to ``<feature>.txt``.

    Parameters
    ----------
    file_name : str
        Input file name; its last four characters are stripped to name the
        output directory.
    df : DataFrame-like
        Must provide a ``'Date'`` column and a column named `feature`.
    feature : str
        Name of the column to analyse.

    Returns
    -------
    tuple
        ``(feature, model_names, models, results, MAE, predicted_dates,
        predicted, model_selection_list)``.

    Raises
    ------
    SystemExit
        When the lists returned by the model search differ in length.
    """
    #first create a directory by feature name to store the results
    file_name_wo_extn = file_name[:-4]
    dir_name = os.path.join(os.path.sep, os.getcwd(), OUTPUT_DIR_NAME, file_name_wo_extn, feature)
    if os.path.exists(dir_name):
        logger.info('dir name is ==> ' + dir_name)
        #delete existing directory if any
        shutil.rmtree(dir_name)
    os.makedirs(dir_name)

    #temporarily change to the new feature directory
    curr_dir = os.getcwd()
    os.chdir(dir_name)
    # try/finally restores the working directory even when a step below
    # raises or exits; the original left the process in dir_name.
    try:
        #create a string buffer to store all information about this feature
        #which will then be written to a file at the end
        s = ''
        s = _write_to_string(s, '----------- Time Series Analysis for ' + feature + ' from ' + str(df['Date'][0]) + ' to ' + str(df['Date'][len(df['Date']) - 1]) + '-----------')

        #only look at the feature of interest as a univariate time series
        #x-axis is the time..
        X = np.array(df['Date'], dtype=np.datetime64)

        # first difference of the feature
        y = np.array(df[feature] - df[feature].shift())
        _draw_multiple_line_plot('first_difference.html',
                                 feature,
                                 [X], [y], ['navy'], ['packets percentage delta'],
                                 [None], [1], 'datetime', 'Date', 'Packets Percentage Delta',
                                 y_start=-100, y_end=100)

        #calculate autocorrelation and partial autocorrelation for the first
        #difference; element 0 is skipped because it has no predecessor
        lag_correlations = acf(y[1:])
        lag_partial_correlations = pacf(y[1:])

        logger.info('lag_correlations')
        logger.info(lag_correlations)
        s = _write_to_string(s, 'lag_correlations')
        s = _write_to_string(s, str(lag_correlations))
        y = lag_correlations
        _draw_multiple_line_plot('lag_correlations.html',
                                 'lag_correlations',
                                 [X], [y], ['navy'], ['lag_correlations'],
                                 [None], [1], 'datetime', 'Date', 'lag_correlations',
                                 y_start=-1, y_end=1)

        logger.info('lag_partial_correlations')
        logger.info(lag_partial_correlations)
        s = _write_to_string(s, 'lag_partial_correlations')
        s = _write_to_string(s, str(lag_partial_correlations))
        y = lag_partial_correlations
        _draw_multiple_line_plot('lag_partial_correlations.html',
                                 'lag_partial_correlations',
                                 [X], [y], ['navy'], ['lag_partial_correlations'],
                                 [None], [1], 'datetime', 'Date', 'lag_partial_correlations',
                                 y_start=-1, y_end=1)

        #seasonal decompose to extract seasonal trends
        decomposition = seasonal_decompose(np.array(df[feature]), model='additive', freq=15)
        _draw_decomposition_plot('decomposition.html', X, decomposition, 'seasonal decomposition', 'datetime', 'decomposition', width=600, height=400)

        #run various ARIMA models..and see which fits best...
        s, model_names, models, results, MAE = _try_ARIMA_and_ARMA_models(s, df, feature)

        #check if we got consistent output, all 4 variables returned by the
        #prev function are lists..they should be the same length
        len_list = [len(model_names), len(models), len(results), len(MAE)]
        if len(len_list) == len_list.count(len_list[0]):
            #looks consistent, all lengths are equal
            logger.info('_try_ARIMA_models output looks consistent, returns %d models ' % len(model_names))
        else:
            logger.info('_try_ARIMA_models output IS NOT consistent, returns %d model names ' % len(model_names))
            logger.info(len_list)
            logger.info('EXITING.....')
            sys.exit()

        s, predicted_dates, predicted, model_selection_list = _do_forecasts(df, feature, X, s, model_names, models, results, MAE)

        #write everything to file
        with open(feature + '.txt', "w") as text_file:
            text_file.write(s)
    finally:
        #go back to parent directory
        os.chdir(curr_dir)

    #return the results
    return feature, model_names, models, results, MAE, predicted_dates, predicted, model_selection_list
def run_models():
    """Train and store the MLP regression model and the SARIMA model.

    Part one reads ``assets/predicts.csv``, encodes and scales the
    features, fits a Keras ``Sequential`` network with two hidden layers
    and saves it to ``assets/REG_MLP_model.h5``.

    Part two reads ``assets/Touristarrival_monthly.csv``, fits a SARIMAX
    model on the log of the first 80% of the series, saves it to
    ``assets/REG_SARIMA_model.pickle`` and writes several diagnostic and
    forecast plots under ``PredictionEngine/static/img/``.

    The function takes no arguments and returns nothing; all results are
    written to disk.
    """
    #-------------------------------------Creating and storing MLP model-----------------------------------------------------
    # Importing the dataset and separating dependent/independent variables
    dataset = pd.read_csv("assets/predicts.csv")
    # print(dataset.dtypes)
    # NOTE(review): the value_counts() results below are discarded; they
    # look like leftovers from interactive exploration.
    dataset['Main purpose of visit'].value_counts()
    dataset['Accessibility status'].value_counts()
    dataset['Accomodation status'].value_counts()
    dataset['health services status'].value_counts()
    # Map the ordinal status labels to integers 1..4.
    cleanup_nums = {"Accessibility status":{"Poor": 1, "Fair": 2,"Good":3,"Better":4},
                    "Accomodation status": {"Poor": 1, "Fair": 2,"Good":3,"Better":4},
                    "health services status":{"Poor": 1, "Fair": 2,"Good":3,"Better":4},
                    }
    dataset.replace(cleanup_nums, inplace=True)
    dataset.head(5)
    # print(dataset.head(5))
    # Columns 1..7 are the inputs, column 10 is the target.
    X = dataset.iloc[:,1:8].values
    # print(X[:,3])
    y = dataset.iloc[:,10].values
    # print(y)

    # Encoding categorical data
    labelencoder_X_3 = LabelEncoder()
    X[:, 3] = labelencoder_X_3.fit_transform(X[:, 3])
    list(labelencoder_X_3.inverse_transform([0, 1, 2, 3]))
    X[:, 3]
    X[:,0:4]
    # print(X)
    onehotencoder = OneHotEncoder(categorical_features = [3] )
    X = onehotencoder.fit_transform(X).toarray()
    # Drop the first column of the encoded matrix.
    X = X[:, 1:]
    # print('\n'.join([''.join(['{:9}'.format(item) for item in row])
    #       for row in X]))

    # Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    a=y_test
    b=y_train

    # Feature scaling: the scaler is fitted on the training set only and
    # then applied to the test set.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Part 2 - making the ANN model
    # Importing the Keras libraries and packages
    # Initialising the ANN for regression
    #Creating regression model
    REG = Sequential()

    # Adding the input layer and the first hidden layer with dropout if required
    REG.add(Dense(units=20,input_dim=9 ,kernel_initializer="normal", activation = 'relu'))
    #REG.add(Dropout(p=0.1))
    # Adding the second hidden layer
    REG.add(Dense(units =20,kernel_initializer="normal", activation = 'relu'))
    #REG.add(Dropout(p=0.1))
    # Adding the output layer
    REG.add(Dense(units = 1, kernel_initializer="normal"))

    # Compiling the ANN
    #def root_mean_squared_error(y_true, y_pred):
    #    return K.sqrt(K.mean(K.square(y_pred - y_true)))
    REG.compile(optimizer = 'adam', loss= 'mean_squared_error')

    # Fitting the ANN to the Training set
    REG.fit(X_train, y_train, batch_size = 10, epochs = 200)

    # Part 3 - Making the predictions and evaluating the model
    X_test
    # Predicting the Test set results
    y_pred = REG.predict(X_test)
    REG.save('assets/REG_MLP_model.h5')
    K.clear_session()
    #---------------------------------------------------------------------------------------------------------------------
    #---------------------------------------Creating and storing SARIMA model----------------------------------------------
    #data collecting...converting dataset to html....
    df = pd.read_csv('assets/Touristarrival_monthly.csv')
    df1=df.iloc[:5]
    # NOTE(review): the two HTML tables are built but not used afterwards.
    html_table_template = df1.to_html(index=False)
    html_table=df.to_html(index=False)

    #data observation and log transformation
    df.index=pd.to_datetime(df['Month'])
    df['#Tourists'].plot()
    mpl.pyplot.ylabel("No.of Toursits Arrivals ")
    mpl.pyplot.xlabel("Year")
    # save the plot as a PNG file, then clear the figure
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_input.png', dpi=600,bbox_inches='tight')
    mpl.pyplot.clf()
    series=df['#Tourists']
    logtransformed=np.log(series)
    logtransformed.plot()
    mpl.pyplot.ylabel("log Scale(No.of Toursits Arrivals) ")
    mpl.pyplot.xlabel("Year")
    # save the plot as a PNG file, then clear the figure
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_input_logscaled.png', dpi=600,bbox_inches='tight')
    mpl.pyplot.clf()

    #Train test split: the first 80% of the series is the training part
    percent_training=0.80
    split_point=round(len(series)*percent_training)
    # print(split_point)
    training , testing = series[0:split_point] , series[split_point:]
    training=np.log(training)

    #differencing to achieve stationarity (the leading NaN is dropped)
    training_diff=training.diff(periods=1).values[1:]

    #plot of residual log differenced series
    # NOTE(review): this plot is cleared without being saved or shown.
    mpl.pyplot.plot(training_diff)
    mpl.pyplot.title("Tourist arrivals data log-differenced")
    mpl.pyplot.xlabel("Years")
    mpl.pyplot.ylabel("Toursits arrivals")
    mpl.pyplot.clf()

    #ACF and PACF plots 1(with log differenced training data)
    lag_acf=acf(training_diff,nlags=40)
    lag_pacf=pacf(training_diff,nlags=40,method='ols')

    #plot ACF
    # NOTE(review): the bounds use len(training), one more than
    # len(training_diff), the series actually analysed -- confirm.
    mpl.pyplot.figure(figsize=(15,5))
    mpl.pyplot.subplot(121)
    mpl.pyplot.stem(lag_acf)
    mpl.pyplot.axhline(y=0,linestyle='-',color='black')
    mpl.pyplot.axhline(y=-1.96/np.sqrt(len(training)),linestyle='--',color='gray')
    mpl.pyplot.axhline(y=1.96/np.sqrt(len(training)),linestyle='--',color='gray')
    mpl.pyplot.xlabel('lag')
    mpl.pyplot.ylabel("ACF")
    # save the plot as a PNG file, then clear the figure
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_afc.png', dpi=600,bbox_inches='tight')
    mpl.pyplot.clf()

    #plot PACF
    mpl.pyplot.figure(figsize=(15,5))
    mpl.pyplot.subplot(122)
    mpl.pyplot.stem(lag_pacf)
    mpl.pyplot.axhline(y=0,linestyle='-',color='black')
    mpl.pyplot.axhline(y=-1.96/np.sqrt(len(training)),linestyle='--',color='gray')
    mpl.pyplot.axhline(y=1.96/np.sqrt(len(training)),linestyle='--',color='gray')
    mpl.pyplot.xlabel('lag')
    mpl.pyplot.ylabel("PACF")
    # save the plot as a PNG file, then clear the figure
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_pafc.png', dpi=600,bbox_inches='tight')
    mpl.pyplot.clf()

    #SARIMA Model specification (fitted on the log-transformed training part)
    model=sm.tsa.statespace.SARIMAX(training,order=(2,0,3),seasonal_order=(2,1,0,12),trend='c',enforce_invertibility=False,enforce_stationarity=False)
    # fit model
    model_fit = model.fit()
    model_fit.save("assets/REG_SARIMA_model.pickle")
    # print(model_fit.summary())
    #plot residual errors
    # residuals = pd.DataFrame(model_fit.resid)
    # fig, ax = mpl.pyplot.subplots(1,2)
    # residuals.plot(title="Residuals", ax=ax[0])
    # residuals.plot(kind='kde', title='Density', ax=ax[1])
    # mpl.pyplot.show()
    # print(residuals.describe())

    # Model evaluation and forecast: the model is reloaded from the file
    # that was just written.
    model_fitted=load_pickle("assets/REG_SARIMA_model.pickle")
    # NOTE(review): the horizon is hard-coded as len(df)-250; presumably 250
    # is the training length for the shipped dataset -- confirm.
    forecast=model_fitted.forecast(len(df)-250)
    # print(forecast)
    # undo the log transform
    forecast=np.exp(forecast)
    # print(forecast)

    #plot forecast results and display RMSE
    mpl.pyplot.figure(figsize=(10,5))
    mpl.pyplot.plot(forecast,'r')
    mpl.pyplot.plot(series,'b')
    mpl.pyplot.legend(['Predicted test values','Actual data values'])
    mpl.pyplot.title('RMSE:%.2f'% np.sqrt(sum((forecast-testing)**2)/len(testing)))
    mpl.pyplot.ylabel("No.of Toursits Arrivals Monthly")
    mpl.pyplot.xlabel("Year")
    mpl.pyplot.autoscale(enable='True',axis='x',tight=True)
    # vertical line at the train/test boundary
    mpl.pyplot.axvline(x=series.index[split_point],color='black');
    # save the plot as a PNG file, then clear the figure
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_result.png', dpi=600,bbox_inches='tight')
    mpl.pyplot.clf()

    # NOTE(review): 214 and 62 are hard-coded; the slice keeps only the
    # forecast steps from position 62 onwards -- confirm these match the
    # dataset length.
    forecaste=model_fitted.forecast(len(df)-214)
    forecast_next=forecaste[62:]
    forecast_next=np.exp(forecast_next)
    # print(forecast_next)
    mpl.pyplot.figure(figsize=(10,5))
    mpl.pyplot.plot(forecast_next,'r')
    mpl.pyplot.plot(series,'b')
    mpl.pyplot.legend(['Predicted next steps values'])
    mpl.pyplot.title('Monthly tourist arrivals predictions')
    mpl.pyplot.ylabel("No.of Toursits Arrivals ")
    mpl.pyplot.xlabel("Year")
    mpl.pyplot.autoscale(enable='True',axis='x',tight=True)
    # save the plot as a PNG file, then clear the figure
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_forecast.png', dpi=600,bbox_inches='tight')
    mpl.pyplot.clf()
# * **Moving average (MA) -** incorporates the dependency between an observation and a residual error from a moving average model applied to lagged observations. # # The notation **MA(q)** refers to the moving average model of order q:<br/> # ![image.png](attachment:image.png) # # *Example* — If q is 3 the predictor for X(t) will be # X(t) = µ + εt + θ1.ε(t-1) + θ2.ε(t-2) + θ3.ε(t-3) # Here instead of difference from previous term, we take errer term (ε) obtained from the difference from past term # Now we need to figure out the values of p and q which are parameters of ARIMA model. We use below two methods to figure out these values - # # **Autocorrelation Function (ACF):** It just measures the correlation between two consecutive (lagged version). example at lag 4, ACF will compare series at time instance t1…t2 with series at instance t1–4…t2–4 # # **Partial Autocorrelation Function (PACF):** is used to measure the degree of association between X(t) and X(t-p). acf_lag = acf(train_df.diff().dropna().values, nlags=20) pacf_lag = pacf(train_df.diff().dropna().values, nlags=20, method='ols') model = ARIMA(train_df.values, order=(5, 0, 3)) model_fit = model.fit(disp=0) # Plot residual errors residuals = pd.DataFrame(model_fit.resid) # # Forecast fc, se, conf = model_fit.forecast(480, alpha=0.05) # 95% conf # Make as pandas series fc_series = pd.Series(fc, index=test_df.index) lower_series = pd.Series(conf[:, 0], index=test_df.index)
# Checking out a scatter plot , probably we can try out different lags and check data #sb.jointplot('Logged First Difference','Lag 20',stock_data, kind ='reg', size = 10) #pylab.show () # Probably we can use stat models and check out the lagged data for all and see #if any correlation exits from statsmodels.tsa.stattools import acf from statsmodels.tsa.stattools import pacf #acf is auto correlation fucntion and pacf is partial acf (works only for 1 d array) #iloc is integer location, check pandas lag_corr = acf (stock_data ['Logged First Difference'].iloc [1:]) lag_partial_corr = pacf (stock_data ['Logged First Difference'].iloc [1:]) #fig, ax = plt.subplots (figsize = (16,12)) #ax.plot (lag_corr) #pylab.show () # To extract trends and seasonal patterns for TS analysis from statsmodels.tsa.seasonal import seasonal_decompose #set the frequency value right for monthly set freq = 30 decomposition = seasonal_decompose(stock_data['Natural Log'], model='additive', freq=30) #fig = decomposition.plot() #pylab.show () #lets fit some ARIMA, keep indicator as 1 and rest as zero ie (p,q,r) = (1,0,0)
# Replace infinite values in ``diff`` with finite sentinels.
diff[diff == (-inf)] = -100
diff[diff == (inf)] = 100
# NOTE(review): the replacements above act on ``diff`` but the test below
# and every later line use ``differ`` -- confirm they refer to the same data.
test_stationarity(differ)


# In[34]:

print 'The above differenced series is stationary'


# In[35]:

# Making ACF and PACF plots (20 lags; PACF estimated by OLS)
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA
lag_acf = acf(differ, nlags=20)
lag_pacf = pacf(differ, nlags=20, method='ols')

# Plot ACF (left panel) with dashed lines at zero and +/-1.96/sqrt(n):
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-1.96 / np.sqrt(len(differ)), linestyle='--', color='gray')
plt.axhline(y=1.96 / np.sqrt(len(differ)), linestyle='--', color='gray')
plt.title('Autocorrelation Function')

# Plot PACF (right panel) with the same reference lines:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-1.96 / np.sqrt(len(differ)), linestyle='--', color='gray')
plt.axhline(y=1.96 / np.sqrt(len(differ)), linestyle='--', color='gray')
# look at the noise from the data and check if the noise is stationary or not decomposedlogdata = residual decomposedlogdata.dropna(inplace=True) test_stationarity(decomposedlogdata) print("\nVisually, from the output of the graph, we see that the residuals of the log of the time series is not stationary."\ " That is why we have to have your moving average parameter in place so that it smooths and setup to predict what will happen next.") # Now that we know the value of d, the Integration parameter of the ARIMA model, which is the order of differentiation # But how can you know the values of P and Q, which are the Autoregresive lag correlations and Moving Average, respectively? # To do that we have to plot ACF and PACF plots, which stand for autocorrelation fuction and partial correlation function # To calculate the value of Q, we need the ACF graph, and the value of P, we need the PACF graph lag_acf = acf(datasetlogdiffshifting, nlags=20) lag_pacf = pacf(datasetlogdiffshifting, nlags=20, method='ols') # Plot ACF to determine the Q(Moving Average part of ARIMA) fig, ax = plt.subplots() plt.subplot(121) plt.plot(lag_acf) plt.axhline(y=0, linestyle='--', color='gray') plt.axhline(y=-1.96 / np.sqrt(len(datasetlogdiffshifting)), linestyle='--', color='gray') plt.axhline(y=1.96 / np.sqrt(len(datasetlogdiffshifting)), linestyle='--', color='gray') plt.title('Autocorrelation Function') # Plot PACF to determine the P(Autoregressive part of ARIMA)
def pacf(timeSeries,nlags = 40, alpha = 0.05, method = 'ywunbiased'):
    """Call ``stattools.pacf`` with this module's default settings.

    All arguments are forwarded by keyword (`nlags`, `alpha`, `method`) and
    the result of ``stattools.pacf`` is returned unchanged.
    """
    options = dict(nlags=nlags, alpha=alpha, method=method)
    return stattools.pacf(timeSeries, **options)
# Append the extrapolated trend as a new column indexed 1 .. N + 12.
df_ep = pd.DataFrame({'extrapol': trend_extrapol.ravel()}, index=np.arange(1, N + 13))
df = pd.concat([df, df_ep], axis=1)

### Periodic continuation of seasonal cycle ###
# Rows after N receive the seasonal values of rows 1..12.
df.loc[N + 1:, 'seasonal'] = df.loc[1:12, 'seasonal'].values

### Determine parameters for ARIMA model ###
last_index = ts_residual.index[-1]
lag_acf = acf(ts_residual, nlags=20)
lag_pacf = pacf(ts_residual, nlags=20, method='ols')
confidence_interval = 1.96 / np.sqrt(len(ts_residual))
# p and q are the first lag at which the (P)ACF falls below the bound.
# NOTE(review): the comparison is on the signed value, so a large negative
# correlation also counts as "below" -- confirm abs() was not intended.
p = min(*np.where(lag_pacf < confidence_interval))
q = min(*np.where(lag_acf < confidence_interval))

### Fit ARIMA model and forecast_residuals ###
# Number of steps from the last residual index up to N + 12.
to_go = N - last_index + 12
model = ARIMA(np.asarray(ts_residual), order=(p, 1, q))
results = model.fit(disp=0)
# Element 0 of the forecast result is kept as the predicted residuals.
pred_residuals = results.forecast(steps=to_go)[0]
df_pr = pd.DataFrame({'pred res': pred_residuals}, index=np.arange(last_index + 1, N + 13))
def plot_pacf_bars(ds, rgi_df, xlim=None, nlags=200, slice_start=3000,
                   plot_confint=True):
    """Plot partial autocorrelation bars of glacier length per glacier.

    For every glacier in `rgi_df` one figure is created. For each
    temperature bias in `ds`, the PACF of the length series is drawn as
    bars for the flowline model (``'fl'``) and for the volume/area scaling
    model (``'vas'``) side by side. Each figure is saved as
    ``<glacier name>.pdf`` in a hard-coded output directory.

    Parameters
    ----------
    ds
        Dataset; selected with ``mb_model='random'``, ``normalized=False``,
        the glacier's ``rgi_id`` and each ``temp_bias`` and ``model``.
    rgi_df
        Table iterated with ``iterrows()``; the row label is used as the
        RGI ID and the ``'name'`` entry as the glacier name.
    xlim : sequence, optional
        x-axis limits; defaults to ``[0, nlags]``.
    nlags : int, optional
        Number of lags passed to ``stattools.pacf``.
    slice_start : int, optional
        First time index to include; everything from there on is used.
    plot_confint : bool, optional
        Whether to shade the confidence intervals (``alpha=0.01``).
    """
    # lag axis shared by all bars
    lags = np.arange(0, nlags + 1)
    # define bar width
    width = 0.4
    # (model key, horizontal offset of its bars, colour cycle). The flowline
    # model is drawn first so legend handles alternate fl, vas, fl, vas ...
    model_specs = (('fl', -width / 2, fl_cycle),
                   ('vas', width / 2, vas_cycle))

    # iterate over all above selected glaciers
    for rgi_id, glacier in rgi_df.iterrows():
        name = glacier['name']
        log.info('PACF plots for {} ({})'.format(name, rgi_id))

        # create figure and axes
        fig, ax = plt.subplots(1, 1)

        # select the complete dataset for this glacier
        ds_sel = ds.sel(mb_model='random', normalized=False, rgi_id=rgi_id)
        # select time frame (open-ended)
        ds_sel = ds_sel.isel(time=slice(slice_start, None))

        # plot zero aux line
        ax.axhline(0, c='k', ls=':')

        for i, b in enumerate(np.sort(ds.temp_bias)):
            # get length data
            length = ds_sel.sel(temp_bias=b).length

            for model, offset, cycle in model_specs:
                # compute partial autocorrelation and confidence intervals
                pacf_values, confint = stattools.pacf(
                    length.sel(model=model), nlags=nlags, alpha=0.01,
                    method='ywmle')
                # plot partial autocorrelation function
                ax.bar(lags + offset, pacf_values, width, color=cycle[i],
                       label='{:+.1f} °C'.format(b))
                if plot_confint:
                    # fill confidence interval, centred on zero
                    ax.fill_between(lags[1:],
                                    confint[1:, 0] - pacf_values[1:],
                                    confint[1:, 1] - pacf_values[1:],
                                    color=cycle[i], alpha=0.1)

        # adjust axes
        if not xlim:
            xlim = [0, nlags]
        ax.set_xlim(xlim)
        ax.set_ylim([-1.1, 1.1])
        # add grid
        ax.grid()

        # get legend handles and labels
        handles, labels = ax.get_legend_handles_labels()
        # invisible artist used as a column header in the legend
        title_proxy, = plt.plot(0, marker='None', linestyle='None',
                                label='dummy')

        # create list of handles and labels in correct order:
        # even entries belong to the flowline model, odd ones to VAS
        my_handles = [title_proxy]
        my_handles.extend(handles[::2])
        my_handles.extend([title_proxy])
        my_handles.extend(handles[1::2])
        # raw strings: "\ " is not a valid escape sequence in a normal string
        my_labels = [r"$\bf{Flowline\ model}$"]
        my_labels.extend(labels[::2])
        my_labels.extend([r"$\bf{VAS\ model}$"])
        my_labels.extend(labels[1::2])
        # add single two-column legend
        ax.legend(my_handles, my_labels, ncol=2)

        # labels, title, ...
        ax.set_xlabel('Lag [years]')
        ax.set_ylabel('Correlation coefficient')

        # store plot
        # NOTE(review): the output directory is hard-coded to one user's
        # machine, and figures are not closed after saving.
        dir_path = '/Users/oberrauch/work/master/plots/final_plots/pacf/'
        f_name = '{}.pdf'.format(name.replace(' ', '_'))
        path = os.path.join(dir_path, f_name)
        plt.savefig(path, bbox_inches='tight')
def pacf():
    """Show a bar chart of the PACF of the module-level ``train`` series."""
    # This function is itself named ``pacf``, so a bare ``pacf(train)`` call
    # resolves to this zero-argument function and raises TypeError. Import
    # the estimator under a private alias instead.
    # NOTE(review): assumes the intended estimator is statsmodels' pacf --
    # confirm against this module's imports.
    from statsmodels.tsa.stattools import pacf as _sm_pacf

    pacf1 = _sm_pacf(train)
    # One row of values transposed into a single column, one bar per lag.
    pacf1 = pd.DataFrame([pacf1]).T
    pacf1.plot(kind='bar', figsize=(12, 10))
    plt.show()
def first_diff_pacf():
    """Show a bar chart of the PACF of the module-level ``price_diff``."""
    values = pacf(price_diff)
    # One row of values transposed into a single column, one bar per lag.
    frame = pd.DataFrame([values]).T
    frame.plot(kind='bar', figsize=(12, 10))
    plt.show()
plt.savefig(directory_acf + "/feature" + str(i) + "_acf.png") i += 1 plt.clf() # create pacf graphs for each feature directory_pacf = "pacf/simple" if not os.path.exists(directory_pacf): os.makedirs(directory_pacf) i = 1 pacf_res = [] for group in groups: plt.figure(figsize=(10,latent_dim)) temp = stattools.pacf(df[names[group]]) plt.bar(range(len(temp)), temp, width = 0.1) plt.plot(temp, "ro") plt.xlabel("Lags") plt.ylabel("PACF") plt.title("PACF for feature " + str(group + 1)) plt.axhline(y = 0, linestyle = "--") plt.axhline(y = -1.96/np.sqrt(len(df[names[0]])), linestyle = "--") plt.axhline(y = 1.96/np.sqrt(len(df[names[0]])), linestyle = "--") pacf_res.append(temp) plt.savefig(directory_pacf + "/feature" + str(i) + "_pacf.png") i += 1 plt.clf() directory_hist = "histograms/simple" if not os.path.exists(directory_hist):
def pacf(self):
    """Return ``pacf(self.ts, nlags=20)``.

    Inside a method body the name ``pacf`` refers to the module-level
    function, not to this method.
    """
    partial_autocorrelations = pacf(self.ts, nlags=20)
    return partial_autocorrelations
plt.subplot(414) plt.title("Residual Data") plt.plot(residual, label='Residual') plt.legend(loc='best') plt.tight_layout #get residual data to clean noise decomposedLogData = residual decomposedLogData.dropna(inplace=True) test_staionary(decomposedLogData) #getting an idea for p,q values needed to apply ARIMA model from statsmodels.tsa.stattools import acf, pacf lag_acf = acf(indexedDataset_logscale, nlags=20) lag_pacf = pacf(indexedDataset_logscale, nlags=20, method='ols') #plot Auto Correlation Function : to calc q plt.subplot(121) plt.plot(lag_acf) plt.title('Auto Correlation Function') #plot Partial Auto Correlation Function : to calc p plt.subplot(122) plt.plot(lag_pacf) plt.title('Partial Auto Correlation Function') plt.tight_layout() #checking Auto Regression from statsmodels.tsa.arima_model import ARIMA model = ARIMA(indexedDataset_logscale,
#plt.subplot(414) #plt.plot(residual, label='Residuals') #plt.legend(loc='best') #plt.tight_layout() # #plt.show() # #ts_log_decompose = residual #ts_log_decompose.dropna(inplace=True) #test_sta.test_stationarity(ts_log_decompose) ## decide the structure (p,q) of the model ------------------------------------ from statsmodels.tsa.stattools import acf, pacf lag_acf = acf(ts_log_ewma_diff, nlags=20) lag_pacf = pacf(ts_log_ewma_diff, nlags=20, method='ols') plt.subplot(121) plt.plot(lag_acf) plt.axhline(y=0,linestyle='--',color='gray') plt.axhline(y=-1.96/np.sqrt(len(ts_log_ewma_diff)),linestyle='--',color='gray') plt.axhline(y=1.96/np.sqrt(len(ts_log_ewma_diff)),linestyle='--',color='gray') plt.title('Autocorrelation Function') plt.subplot(122) plt.plot(lag_pacf) plt.axhline(y=0,linestyle='--',color='gray') plt.axhline(y=-1.96/np.sqrt(len(ts_log_ewma_diff)),linestyle='--',color='gray') plt.axhline(y=1.96/np.sqrt(len(ts_log_ewma_diff)),linestyle='--',color='gray') plt.title('Partial Autocorrelation Function') plt.tight_layout() plt.show()
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywunbiased',
              use_vlines=True, title='Partial Autocorrelation', zero=True,
              vlines_kwargs=None, **kwargs):
    """
    Plot the partial autocorrelation function

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    lags : int or array_like, optional
        int or Array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : float, optional
        If a number is given, the confidence intervals for the given level
        are returned. For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        1/sqrt(len(x)).  If None, no confidence intervals are requested.
    method : {'ywunbiased', 'ywmle', 'ols', 'ldunbiased', 'ldbiased'}
        Specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf. Default.
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it
        can be overridden with a ``marker`` kwarg.
    title : str, optional
        Title to place on plot.  Default is 'Partial Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Plots lags on the horizontal and the correlations on vertical axis.
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    kwargs is used to pass matplotlib optional arguments to both the line
    tracing the autocorrelations and for the horizontal line at 0. These
    options must be valid for a Line2D object.

    vlines_kwargs is used to pass additional optional arguments to the
    vertical lines connecting each autocorrelation to the axis.  These
    options must be valid for a LineCollection object.

    Examples
    --------
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm

    >>> dta = sm.datasets.sunspots.load_pandas().data
    >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
    >>> del dta["YEAR"]
    >>> sm.graphics.tsa.plot_pacf(dta.values.squeeze(), lags=40)
    >>> plt.show()

    .. plot:: plots/graphics_tsa_plot_pacf.py
    """
    fig, ax = utils.create_mpl_ax(ax)
    vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs
    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)

    # ``pacf`` returns a bare array when alpha is None and an
    # (array, confint) pair otherwise, so the two cases unpack differently.
    confint = None
    if alpha is None:
        pacf_x = pacf(x, nlags=nlags, alpha=alpha, method=method)
    else:
        pacf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    _plot_corr(ax, title, pacf_x, confint, lags, irregular, use_vlines,
               vlines_kwargs, **kwargs)

    return fig
def forecast(ts, log_series):
    """
    Make a model on the TS after differencing.

    Having performed the trend and seasonality estimation techniques, there
    can be two situations:

    * A strictly stationary series with no dependence among the values. This
      is the easy case wherein we can model the residuals as white noise.
      But this is very rare.
    * A series with significant dependence among values. In this case we
      need to use some statistical models like ARIMA to forecast the data.

    The predictors depend on the parameters (p,d,q) of the ARIMA model:

    * Number of AR (Auto-Regressive) terms (p): AR terms are just lags of
      the dependent variable. For instance if p is 5, the predictors for
      x(t) will be x(t-1)...x(t-5).
    * Number of MA (Moving Average) terms (q): MA terms are lagged forecast
      errors in the prediction equation. For instance if q is 5, the
      predictors for x(t) will be e(t-1)...e(t-5) where e(i) is the
      difference between the moving average at the ith instant and the
      actual value.
    * Number of Differences (d): the number of nonseasonal differences, i.e.
      in this case we took the first order difference. So either we can
      pass that variable and put d=0 or pass the original variable and put
      d=1. Both will generate the same results.

    We use two plots to determine these numbers:

    * Autocorrelation Function (ACF): a measure of the correlation between
      the TS and a lagged version of itself. For instance at lag 5, ACF
      would compare the series at time instants 't1'...'t2' with the series
      at instants 't1-5'...'t2-5' (t1-5 and t2 being end points).
    * Partial Autocorrelation Function (PACF): the correlation between the
      TS and a lagged version of itself after eliminating the variations
      already explained by the intervening comparisons. E.g. at lag 5, it
      will check the correlation but remove the effects already explained
      by lags 1 to 4.

    :param ts: series on the original scale; only used for the final plot,
        where it is drawn next to the exponentiated predictions.
    :param log_series: series that is differenced and modelled (presumably
        the log transform of ``ts`` -- confirm against the caller).
    :return: None. The function only draws figures and prints the head of
        the fitted differences.
    """
    # The body used to read a name ``ts_log`` that is not defined in this
    # function; the ``log_series`` argument is the series it is meant to use.
    ts_log = log_series

    # ACF and PACF plots
    ts_log_diff = ts_log - ts_log.shift()
    ts_log_diff = ts_log_diff.dropna()

    lag_acf = acf(ts_log_diff, nlags=20)
    lag_pacf = pacf(ts_log_diff, nlags=20, method="ols")

    # Lower / upper lines of the confidence interval drawn on both panels.
    conf_bound = 1.96 / np.sqrt(len(ts_log_diff))

    # Plot ACF
    plt.subplot(221)
    plt.plot(lag_acf)
    plt.axhline(y=0, linestyle="--", color="gray")
    plt.axhline(y=-conf_bound, linestyle="--", color="gray")
    plt.axhline(y=conf_bound, linestyle="--", color="gray")
    plt.title('Autocorrelation Function')

    # Plot PACF
    plt.subplot(222)
    plt.plot(lag_pacf)
    plt.axhline(y=0, linestyle="--", color="gray")
    plt.axhline(y=-conf_bound, linestyle="--", color="gray")
    plt.axhline(y=conf_bound, linestyle="--", color="gray")
    plt.title('Partial Autocorrelation Function')
    plt.tight_layout()

    # From these plots, we get p and q:
    # p - The lag value where the PACF chart crosses the upper confidence
    #     interval for the first time. If you notice closely, in this case
    #     p=2.
    # q - The lag value where the ACF chart crosses the upper confidence
    #     interval for the first time. If you notice closely, in this case
    #     q=2.

    # AR model
    res_arima = arima_models(ts_log, 2, 1, 0)
    # print pd.Series(res_arima.fittedvalues)
    plt.subplot(223)
    plt.plot(ts_log_diff)
    plt.plot(res_arima.fittedvalues, color='red')
    # plt.title('AR model--RSS: %.4f' % sum((pd.Series(res_arima.fittedvalues) - ts_log_diff) ** 2))

    # MA model
    res_ma = arima_models(ts_log, 0, 1, 2)
    plt.subplot(224)
    plt.plot(ts_log_diff)
    plt.plot(res_ma.fittedvalues, color='red')
    # plt.title('MA model--RSS: %.4f' % sum((res_ma.fittedvalues - ts_log_diff) ** 2))

    # Combined model
    # NOTE(review): no new subplot is selected here, so the combined fit is
    # drawn into the same panel (224) as the MA model -- confirm intended.
    res = arima_models(ts_log, 2, 1, 2)
    plt.plot(ts_log_diff)
    plt.plot(res.fittedvalues, color='red')
    # plt.title('RSS: %.4f' % sum((res.fittedvalues - ts_log_diff) ** 2))
    plt.show()

    # Here we can see that the AR and MA models have almost the same RSS but
    # combined is significantly better.

    # Predicting
    predictions_diff = pd.Series(res.fittedvalues, copy=True)
    print(predictions_diff.head())

    # Notice that these start from '1949-02-01' and not the first month;
    # because we took a lag by 1 and the first element doesn't have anything
    # before it to subtract from. The way to convert the differencing to log
    # scale is to add these differences consecutively to the base number. An
    # easy way to do it is to first determine the cumulative sum at index
    # and then add it to the base number.
    predictions_diff_cumsum = predictions_diff.cumsum()

    # Now add them to the base number (the first observation, taken by
    # position).
    predictions_arima_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
    predictions_arima_log = predictions_arima_log.add(predictions_diff_cumsum,
                                                      fill_value=0)

    # Now let us take the exponential to regain the original form of the
    # series.
    predictions_ARIMA = np.exp(predictions_arima_log)
    plt.plot(ts)
    plt.plot(predictions_ARIMA)
    # plt.title('RMSE: %.4f' % np.sqrt(sum((predictions_ARIMA - ts) ** 2) / len(ts)))
    plt.show()
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywm',
              use_vlines=True, title='Partial Autocorrelation', zero=True,
              **kwargs):
    """Plot the partial autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    lags : int or array_like, optional
        int or Array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level
        are returned. For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        1/sqrt(len(x)).  If None, no confidence intervals are requested.
    method : 'ywm' (default), 'ywunbiased', 'ols', 'ld' or 'ldb'
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf
        - ywm or ywmle : yule walker without bias correction (the default)
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it
        can be overridden with a ``marker`` kwarg.
    title : str, optional
        Title to place on plot.  Default is 'Partial Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``
    """
    fig, ax = utils.create_mpl_ax(ax)
    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)

    # ``pacf`` returns a bare array when alpha is None and an
    # (array, confint) pair otherwise, so the two cases unpack differently.
    confint = None
    if alpha is None:
        pacf_x = pacf(x, nlags=nlags, alpha=alpha, method=method)
    else:
        pacf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    _plot_corr(ax, title, pacf_x, confint, lags, irregular, use_vlines,
               **kwargs)

    return fig
def queryandinsert():
    """Fetch recent online-user counts, fit an ARMA model and plot a forecast.

    The function reads the last two days of online numbers for JP/DE/TR
    through the REMOTE database connection, accumulates them into the
    module-level ``testingDict``, fits an ARMA(5, 2) model to the log of the
    JP series, prints the fit and forecast values, and writes a Plotly HTML
    chart to a fixed path under the Tomcat webapps directory.

    This is the main routine invoked by ``main`` and it ties several other
    functions together. It relies on module-level globals, so do not call it
    from other packages; doing so may give unexpected results.

    Returns nothing.
    """
    global gtbuDict # gtbuDict, used to store the data queried from the GTBU database
    global omsDict # used to store the data queried from the OMS database
    global presisDict
    global counter
    global testingDict
    # NOTE(review): starttime is assigned but never read in this function.
    starttime = datetime.datetime.now()
    print len(presisDict)
    print "connect to databae!"
    # connect to the databases using the project's own toolkit (getdbinfo)
    querydbinfoOMS = getdbinfo('OMS')
    querydbnameOMS = "wifi_data"
    querydbinfoGTBU = getdbinfo("GTBU")
    querydbnameGTBU = "ucloudplatform"
    insertdbinfo = getdbinfo('REMOTE')
    insertdbname = 'login_history'
    # print the database information for verification
    # NOTE(review): this prints every entry of the OMS settings, including
    # the 'pwd' entry used below -- consider masking it.
    for key, value in querydbinfoOMS.iteritems():
        print key + " : " + str(value)
    # Online numbers of the last two days; this is the only query that is
    # actually executed further down.
    queryStatementRemote = """ SELECT epochTime,visitcountry,onlinenum FROM t_fordemo WHERE butype =2 AND visitcountry IN ('JP','DE','TR') AND epochTime BETWEEN DATE_SUB(NOW(),INTERVAL 2 DAY) AND NOW() ORDER BY epochTime ASC """
    # get the online data which will be used to calculate the daily user number (the daily user number is bigger than the max number...
    # and the max number is actually what is being used in this scenario)
    # NOTE(review): the four statements below are defined but never executed
    # in this function.
    queryStatementTraining = """ SELECT t1,t2,DATEDIFF(t2,t1) AS dif,imei,visitcountry FROM ( SELECT DATE(logindatetime) AS t1,DATE(logoutdatetime) AS t2, imei,visitcountry FROM t_usmguserloginlog WHERE visitcountry IN ('JP','DE','TR') ) AS z GROUP BY t1,t2,imei """
    # (output data) get the max online number for each of these countries every day (this record is incomplete due to the constant network partition,
    # therefore a lot of corresponding work is necessary to align the input and output data by day)
    queryStatementOnline =""" SELECT epochTime,visitcountry,MAX(onlinenum) FROM ( SELECT DATE(epochTime) AS epochTime,visitcountry,onlinenum FROM t_fordemo WHERE butype =2 and visitcountry IN ('JP','DE','TR') ) AS z GROUP BY epochTime,visitcountry """
    # (input data) get the order number information which will be used to calculate the daily maximum number for each country...
    # this number could be ridiculously large with respect to the real number for some specific countries.
    querystatementOMS = """ SELECT DATE(date_goabroad),DATE(date_repatriate),DATEDIFF(date_repatriate,date_goabroad),imei,package_id FROM tbl_order_basic WHERE imei IS NOT NULL AND (DATE(date_repatriate)) > '2016-01-01' AND DATE(date_goabroad) < DATE(NOW()) ORDER BY date_repatriate ASC """
    querystatementOMSCount = """ SELECT date_goabroad,date_repatriate,DATEDIFF(date_repatriate,date_goabroad),t1.package_id,t3.iso2 FROM tbl_order_basic AS t1 LEFT JOIN tbl_package_countries AS t2 ON t1.package_id = t2.package_id LEFT JOIN tbl_country AS t3 ON t2.country_id = t3.pk_global_id WHERE t1.data_status = 0 AND DATE(date_goabroad) BETWEEN DATE(NOW()) AND DATE_ADD(NOW(),INTERVAL 3 MONTH) OR ( DATE(date_repatriate) >= DATE(NOW()) ) """
    # establish connections to the MySQL databases
    # NOTE(review): none of the three connections (or their cursors) is
    # closed in this function, and only insertCur is used afterwards.
    querydbGTBU = MySQLdb.connect(user = querydbinfoGTBU['usr'],
                                  passwd = querydbinfoGTBU['pwd'],
                                  host = querydbinfoGTBU['host'],
                                  port = querydbinfoGTBU['port'],
                                  db = querydbnameGTBU)
    querydbOMS = MySQLdb.connect(user = querydbinfoOMS['usr'],
                                 passwd = querydbinfoOMS['pwd'],
                                 host = querydbinfoOMS['host'],
                                 port = querydbinfoOMS['port'],
                                 db = querydbnameOMS)
    insertdb = MySQLdb.connect(user = insertdbinfo['usr'],
                               passwd = insertdbinfo['pwd'],
                               host = insertdbinfo['host'],
                               port = insertdbinfo['port'],
                               db = insertdbname)
    queryCurGTBU = querydbGTBU.cursor()
    queryCurOMS = querydbOMS.cursor()
    insertCur = insertdb.cursor()
    print "executing query!!! By using generator!!!"
    insertCur.execute(queryStatementRemote)
    remoteGenerator = fetchsome(insertCur,100) # fetchsome is a generator which fetches a fixed number of rows each time.
    for row in remoteGenerator:
        accumulatOnlineNumber(row,testingDict)
    # presumably onlineList is [timestamps, countries, per-country value
    # lists], judging from the indexing below -- verify against
    # getTestingList.
    onlineList = getTestingList(testingDict)
    countryList = onlineList[1]
    jpIndex = countryList.index('JP')
    datalist = onlineList[2][jpIndex]
    timelist = onlineList[0]
    tsJP = Series(datalist,index = timelist)
    df = DataFrame()
    df['JP'] = tsJP
    print df.index
    print df.columns
    print df
    # Model the log of the JP series.
    tsJP_log = np.log(tsJP)
    lag_acf = acf(tsJP_log,nlags=200)
    # NOTE(review): lag_pacf is computed but not used afterwards.
    lag_pacf = pacf(tsJP_log,nlags=200,method='ols')
    # model = ARIMA(tsJP_log,order=(2,1,2))
    model = ARMA(tsJP_log,(5,2))
    res = model.fit(disp=-1)
    print "Here is the fit result"
    print res
    # Pull the fitted pieces needed by the out-of-sample predictor.
    params = res.params
    residuals = res.resid
    p = res.k_ar
    q = res.k_ma
    k_exog = res.k_exog
    k_trend = res.k_trend
    steps = 300
    newP = _arma_predict_out_of_sample(params, steps, residuals, p, q, k_trend, k_exog, endog=tsJP_log, exog=None, start=len(tsJP_log))
    newF,stdF,confiF = res.forecast(steps)
    print newP
    # Back-transform from the log scale.
    newP = np.exp(newP)
    print newP
    print " Forecast below!!"
    print newF
    newF = np.exp(newF)
    print newF
    print stdF
    # NOTE(review): exponentiating the forecast standard error does not
    # give a standard error on the original scale -- confirm what is wanted.
    stdF = np.exp(stdF)
    print stdF
    # NOTE(review): x_axis is sized from lag_acf (requested with nlags=200)
    # but is also used for newP (requested with steps=300); the two traces
    # presumably differ in length -- confirm.
    x_axis = range(len(lag_acf))
    y_axis = lag_acf
    onlineEWMA=go.Scatter( x = x_axis, y = y_axis, mode = 'lines+markers', name = "lag_acf" )
    onlinePre=go.Scatter( x = x_axis, y = newP, mode = 'lines+markers', name = "predictJP" )
    layout = dict(title = 'predicewma', xaxis = dict(title = 'Date'), yaxis = dict(title = 'online Number'), )
    data = [onlineEWMA,onlinePre]
    fig = dict(data=data, layout=layout)
    # Write the chart as a standalone HTML file without opening a browser.
    plot(fig,filename ="/ukl/apache-tomcat-7.0.67/webapps/demoplotly/EWMAprediction.html",auto_open=False)
# Pick whichever extreme of the series lies farther from the average as the
# far end of the colour scale.
# NOTE(review): when the minimum is chosen, ``high`` ends up below ``low``
# -- confirm the mapper is meant to be used that way.
hi, lo = max(tcs_close), min(tcs_close)
h = hi if abs(hi - avg) > abs(lo - avg) else lo

gradplot = figure()
mapper = LinearColorMapper(
    palette=['#084594', '#2171b5', '#4292c6', '#6baed6',
             '#9ecae1', '#c6dbef', '#deebf7', '#f7fbff'],
    low=avg, high=h)
gradplot.scatter(list(range(len(tcs_close))), list(tcs_close),
                 color={'field': 'y', 'transform': mapper})
# ``mapper`` is the LinearColorMapper itself, not a dict, so it is handed to
# the colour bar directly; subscripting it with 'transform' would fail.
color_bar = ColorBar(color_mapper=mapper, width=8, location=(0, 0))
gradplot.add_layout(color_bar, 'right')
show(gradplot)

#5
# Positions where ``pwov`` equals 1 are highlighted on top of the line plot.
pwov_index = [i for i in range(len(pwov)) if pwov[i] == 1]
tcs_plot = figure()
tcs_plot.line(list(range(len(tcs_close))), tcs_close,
              line_width=2, color='blue')
tcs_plot.circle(pwov_index, list(tcs_close[pwov_index]), color='red')
show(tcs_plot)

#6
from bokeh.plotting import figure, show
from statsmodels.tsa.stattools import pacf

# Scatter of the first ten partial autocorrelation values.
vals = pacf(tcs_close)
head_vals = vals[:10]
pacfplot = figure()
pacfplot.scatter(list(range(len(head_vals))), head_vals,
                 line_width=2, color='blue')
show(pacfplot)
index=[ 'Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used' ]) for key, value in dftest[4].items(): dfoutput['Critical Value (%s)' % key] = value print(dfoutput) # Excellent p-value: we have a stationary evolution ! index_date = pd.date_range('1/1/2011', periods=5000, freq='1800s') data = pd.DataFrame( data={"CSPL_RECEIVED_CALLS": data["CSPL_RECEIVED_CALLS"].values}, index=index_date) data['CSPL_RECEIVED_CALLS'] = data['CSPL_RECEIVED_CALLS'].astype('float64') lag_acf = acf(data, nlags=20) lag_pacf = pacf(data, nlags=20, method='ols') plt.subplot(121) plt.plot(lag_acf) plt.axhline(y=0, linestyle='--', color='gray') plt.axhline(y=-1.96 / np.sqrt(5000), linestyle='--', color='gray') plt.axhline(y=1.96 / np.sqrt(5000), linestyle='--', color='gray') plt.title('Autocorrelation Function') plt.subplot(122) plt.plot(lag_pacf) plt.axhline(y=0, linestyle='--', color='gray') plt.axhline(y=-1.96 / np.sqrt(5000), linestyle='--', color='gray') plt.axhline(y=1.96 / np.sqrt(5000), linestyle='--', color='gray') plt.title('Partial Autocorrelation Function')
plt.tight_layout()

# There can be cases where an observation simply consisted of trend &
# seasonality. In that case, there won't be any residual component & that
# would be a null or NaN. Hence, we also remove such cases.
# (This block used to appear twice in a row; the stationarity test is now
# run once.)
decomposedLogData = residual
decomposedLogData.dropna(inplace=True)
test_stationarity(decomposedLogData)

#ACF & PACF plots
lag_acf = acf(datasetLogDiffShifting, nlags=20)
lag_pacf = pacf(datasetLogDiffShifting, nlags=20, method='ols')

# Half-width of the +/- 1.96 / sqrt(N) reference band drawn on both panels.
conf_bound = 1.96 / np.sqrt(len(datasetLogDiffShifting))

#Plot ACF:
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-conf_bound, linestyle='--', color='gray')
plt.axhline(y=conf_bound, linestyle='--', color='gray')
plt.title('Autocorrelation Function')

#Plot PACF
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-conf_bound, linestyle='--', color='gray')
plt.axhline(y=conf_bound, linestyle='--', color='gray')
index=[ 'Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used' ]) for key, value in dftest[4].items(): dfoutput['Critical Value (%s)' % key] = value print(dfoutput) data_shift.dropna(inplace=True) test_stationarity(data_shift) # ACF & PACF plots lag_acf = acf(data_shift, nlags=10) lag_pacf = pacf(data_shift, nlags=10, method='ols') # Plot ACF:s plt.subplot(121) plt.plot(lag_acf) plt.axhline(y=0, linestyle='--', color='gray') plt.axhline(y=-1.96 / np.sqrt(len(data_shift)), linestyle='--', color='gray') plt.axhline(y=1.96 / np.sqrt(len(data_shift)), linestyle='--', color='gray') plt.title('Autocorrelation Function') # Plot PACF plt.subplot(122) plt.plot(lag_pacf) plt.axhline(y=0, linestyle='--', color='gray') plt.axhline(y=-1.96 / np.sqrt(len(data_shift)), linestyle='--', color='gray') plt.axhline(y=1.96 / np.sqrt(len(data_shift)), linestyle='--', color='gray')
# Reference band at +/- 1.96 / sqrt(N), where 1.96 is the two-sided 5%
# critical value of the standard normal distribution.
# NOTE(review): the written conclusions in the cells below should be
# re-checked against the plots drawn with this band.
conf_bound = 1.96 / np.sqrt(len(ts_log_mv_diff))

plt.plot(np.arange(0, 11), acf(ts_log_mv_diff, nlags=10))
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-conf_bound, linestyle='--', color='gray')
plt.axhline(y=conf_bound, linestyle='--', color='gray')
plt.title('Autocorrelation Function')
plt.show()


# ### The ACF curve crosses the upper confidence value when the lag value is between 0 and 1. Thus, optimal value of q in the ARIMA model must be 0 or 1

# In[15]:

plt.plot(np.arange(0, 11), pacf(ts_log_mv_diff, nlags=10))
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-conf_bound, linestyle='--', color='gray')
plt.axhline(y=conf_bound, linestyle='--', color='gray')
plt.title('Partial Autocorrelation Function')
plt.show()


# ### The PACF curve drops to 0 between lag values 1 and 2. Thus, optimal value of p in the ARIMA model is 1 or 2.

# In[16]:

model = ARIMA(ts_log, order=(1, 1, 0))
plt.plot(decompose.seasonal) plt.plot(decompose.trend) #Differencing diff_df = df - df.shift(1) diff_df = diff_df.dropna() plt.plot(diff_df) decompose = seasonal_decompose(diff_df, freq=12) plt.plot(decompose.resid) plt.plot(decompose.seasonal) plt.plot(decompose.trend) from statsmodels.tsa.stattools import acf, pacf lag_acf = acf(df, nlags=20) lag_pacf = pacf(df, nlags=20, method='ols') plt.subplot(121) plt.plot(lag_acf) plt.axhline(y=0, linestyle='--', color='gray') plt.axhline(y=-1.96 / np.sqrt(len(diff_df)), linestyle='--', color='gray') plt.axhline(y=1.96 / np.sqrt(len(diff_df)), linestyle='--', color='gray') plt.title('Autocorrelation Function') plt.subplot(122) plt.plot(lag_pacf) plt.axhline(y=0, linestyle='--', color='gray') plt.axhline(y=-1.96 / np.sqrt(len(diff_df)), linestyle='--', color='gray') plt.axhline(y=1.96 / np.sqrt(len(diff_df)), linestyle='--', color='gray') plt.title('Partial Autocorrelation Function') plt.tight_layout() mod = SARIMAX(df,
# Figure 6: the differenced series itself, followed by its summary stats.
plt.figure(6)
plt.plot(differencing)
plotstats(differencing)
plt.show()

print('ACF and PACF with series stationarized')
# One fresh figure per correlogram, each drawn on that figure's current axes.
for draw_correlogram in (plot_acf, plot_pacf):
    pyplot.figure()
    draw_correlogram(differencing, ax=pyplot.gca(), lags=20)
pyplot.show()

lag_acf = acf(differencing, nlags=20)
lag_pacf = pacf(differencing, nlags=20, method='ols')

#Temporary test ACF and PACF
# NOTE(review): the band is sized from ``series`` while the correlations are
# computed on ``differencing`` -- confirm this is intended.
conf_bound = 1.96 / np.sqrt(len(series))
plt.figure(13)
plt.plot(lag_acf)
for level in (0, -conf_bound, conf_bound):
    plt.axhline(y=level, linestyle='--', color='gray')
plt.title('Autocorrelation function for PETR4 - ARIMA (0,1,1)')
plt.show()

#Plot PACF:
plt.figure(14)
plt.plot(lag_pacf)
plt.plot(seasonal, label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()

# Run the stationarity test on the residual component with its NaNs removed.
ts_log_decompose = residual
ts_log_decompose.dropna(inplace=True)
test_stationarity(ts_log_decompose)

##########################
#Modeling and forecasting#
##########################
data_log_diff.dropna(inplace=True)
lag_acf = acf(data_log_diff, nlags=20)
lag_pacf = pacf(data_log_diff, nlags=20, method='ols')

# Half-width of the +/- 1.96 / sqrt(N) reference band drawn on both panels.
conf_bound = 1.96 / np.sqrt(len(data_log_diff))

# Left panel: ACF, right panel: PACF.  Each gets a zero line plus the band.
for panel, values, panel_title in (
        (121, lag_acf, 'Autocorrelation Function'),
        (122, lag_pacf, 'Partial Autocorrelation Function')):
    plt.subplot(panel)
    plt.plot(values)
    for level in (0, -conf_bound, conf_bound):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(panel_title)
plt.tight_layout()
if __name__ == "__main__":
    # Usage: <script> <csv_path> <d>
    #   csv_path : CSV file with a "value" column
    #   d        : period passed to Series.diff; 0 means "do not difference"
    data = pd.read_csv(sys.argv[1])
    # Command-line arguments are strings; Series.diff needs an integer.
    d = int(sys.argv[2])

    series = data["value"]
    if d != 0:
        series = series.diff(d)
    lag_pacf = pacf(series.dropna(), nlags=25)

    # One PACF value per line; ``with`` closes the file even if a write fails.
    with open("./ts_analysis/pacf.txt", "w") as f:
        f.writelines(str(value) + '\n' for value in lag_pacf)
def main(args):
    """Load drive-test data and plot PACF / ACF curves for selected columns.

    Reads these attributes of ``args``: ``library``, ``list``, ``select``,
    ``static`` and ``ue``.  When ``args.list`` is set, the available data
    files are printed and the process exits with status 0.  Otherwise the
    data is loaded and processed through the ``ntp`` helpers, summary
    statistics are printed, and one figure with two stacked panels is
    shown: PACF of each series (top) and ACF of each first-differenced
    series (bottom).  The function then waits for user input so the
    interactive figure stays open.
    """
    data_file_list = tl.get_data_file_list(args.library)
    if args.list:
        tl.print_list(data_file_list)
        sys.exit(0)
    data = tl.load_data_file(data_file_list, args.select)

    if not args.static:
        logging.debug('Remove zero velocity samples')
        data = ntp.remove_non_positive_velocity_samples(data)

    if args.ue == 'e398':
        # Rename MAC downlink throughput in Application downlink throughput
        # if need be
        ntp.process_data(data, ntp.process_lte_rename_mac_to_app)

    # Get basic data; then spectral efficiency, SNR, RSRP
    for step in (ntp.process_lte_app_bw_prb_util,
                 ntp.process_lte_app_bw_prb_util_bw20,
                 ntp.process_lte_app_bw_prb_util_bw10,
                 ntp.process_lte_app_bw_prb_util_bw15,
                 ntp.process_se_bw_norm,
                 ntp.process_lte_rs_snr,
                 ntp.process_lte_rsrp):
        ntp.process_data(data, step)

    column_list = ['Velocity',
                   'SE', 'SE norm',
                   'RS SNR/Antenna port - 1', 'RS SNR/Antenna port - 2',
                   'RSRP/Antenna port - 1', 'RSRP/Antenna port - 2']

    if args.select is None:
        df = tl.concat_pandas_data([frame[column_list] for frame in data])
    else:
        df = data

    # Remove outliers because of bandwidth normalization issues.  A single
    # ``.loc`` assignment is used because the chained form
    # ``df[col][mask] = value`` may write to a temporary copy.
    df.loc[df['SE norm'] > 7.5, 'SE norm'] = np.nan

    print(df['SE'].dropna().describe())
    print(df['Velocity'].dropna().describe())
    print(df['RS SNR/Antenna port - 1'].dropna().describe())

    # (column, legend name) pairs, listed in the order the curves are drawn.
    series_to_plot = [('SE', 'SE'),
                      ('SE norm', 'SE norm'),
                      ('RS SNR/Antenna port - 1', 'RS SNR AP1'),
                      ('RSRP/Antenna port - 1', 'RSRP AP1'),
                      ('Velocity', 'Velocity')]

    # Confidence intervals are not plotted, so none are requested.
    pacf_curves = [pacf(df[column].dropna(), nlags=10,
                        method='ywunbiased', alpha=None)
                   for column, _ in series_to_plot]

    # Apply diff to ensure zero-mean
    acf_curves = [acf(df[column].dropna().diff().dropna(),
                      unbiased=False, nlags=40, confint=None,
                      qstat=False, fft=True, alpha=None)
                  for column, _ in series_to_plot]

    plt.ion()
    plt.figure()

    plt.subplot2grid((2, 1), (0, 0))
    for (_, legend_name), curve in zip(series_to_plot, pacf_curves):
        plt.plot(curve, lw=2.0, label='PACF ' + legend_name)
    plt.ylim([-0.2, 1.1])
    plt.grid(True)
    plt.legend(loc=0)

    plt.subplot2grid((2, 1), (1, 0))
    for (_, legend_name), curve in zip(series_to_plot, acf_curves):
        plt.plot(curve, lw=2.0, label='ACF ' + legend_name + ' diff')
    plt.ylim([-0.2, 1.1])
    plt.grid(True)
    plt.legend(loc=0)

    plt.tight_layout()

    # Block until the user responds so the interactive figure stays visible.
    input('Press any key')
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywm',
              use_vlines=True, **kwargs):
    """Plot the partial autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    lags : int or array_like, optional
        Either the largest lag to plot (lags ``0..lags`` are drawn) or an
        array of integer lag values to draw.  If not given,
        ``lags=np.arange(len(x))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level
        are computed and drawn as a shaded band. For instance if alpha=.05,
        95 % confidence intervals are returned where the standard deviation
        is computed according to 1/sqrt(len(x)).  If None, no band is drawn.
    method : 'ywm' (default), 'ywunbiased', 'ols', 'ld' or 'ldb'
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf
        - ywm or ywmle : yule walker without bias correction (the default)
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it
        can be overridden with a ``marker`` kwarg.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``
    """
    fig, ax = utils.create_mpl_ax(ax)

    # Normalise ``lags`` into (array of lags to draw, largest lag to compute).
    irregular = False
    if lags is None:
        lags = np.arange(len(x))
        nlags = len(lags) - 1
    elif np.isscalar(lags):
        nlags = lags
        lags = np.arange(lags + 1)  # +1 for zero lag
    else:
        # Explicit lag values: compute up to the largest one and pick the
        # requested entries afterwards.
        lags = np.asarray(lags).astype(int)
        nlags = int(lags.max())
        irregular = True

    # ``pacf`` returns a bare array when alpha is None and an
    # (array, confint) pair otherwise.
    confint = None
    if alpha is None:
        pacf_x = pacf(x, nlags=nlags, alpha=alpha, method=method)
    else:
        pacf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    if irregular:
        pacf_x = pacf_x[lags]
        if confint is not None:
            confint = confint[lags]

    if use_vlines:
        ax.vlines(lags, [0], pacf_x, **kwargs)
        ax.axhline(**kwargs)

    kwargs.setdefault('marker', 'o')
    kwargs.setdefault('markersize', 5)
    kwargs.setdefault('linestyle', 'None')
    ax.margins(.05)
    ax.plot(lags, pacf_x, **kwargs)

    if confint is not None:
        # center the confidence interval TODO: do in acf?
        confint = confint - confint.mean(1)[:, None]
        ax.fill_between(lags, confint[:, 0], confint[:, 1], alpha=.25)

    ax.set_title("Partial Autocorrelation")

    return fig
for the first time. These p lags will act as our features while forecasting the AR time series. """ # pacf plot fancy print('\n*** Partial ACF Plot ***') from statsmodels.graphics.tsaplots import plot_pacf plt.rcParams['figure.figsize'] = (8, 5) plt.figure(figsize=(5, 5)) plot_pacf(df.values.tolist(), lags=50) plt.axhline(y=0, linestyle='--', color='red') #plt.axhline(y=-1.96/np.sqrt(len(df)),linestyle='--',color='red') plt.axhline(y=1.96 / np.sqrt(len(df)), linestyle='--', color='red') plt.title("Partial Auto Corelation Plot") plt.xlabel('Lags') plt.show() # pacf plot simple print('\n*** Partial ACF Plot ***') from statsmodels.tsa.stattools import pacf pacf_50 = pacf(df[pColData], nlags=50) plt.rcParams['figure.figsize'] = (8, 5) plt.figure() plt.ylim(-2, 2) plt.plot(pacf_50, color='b') plt.axhline(y=0, linestyle='--', color='red') plt.axhline(y=-1.96 / np.sqrt(len(df)), linestyle='--', color='red') plt.axhline(y=1.96 / np.sqrt(len(df)), linestyle='--', color='red') plt.title('Partial Autocorrelation Function') plt.show
# In[13]:

#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf


# In[14]:

from statsmodels.tsa.arima_model import ARIMA


# In[15]:

lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')

# Half-width of the +/- 1.96 / sqrt(N) reference band drawn on both panels.
conf_bound = 1.96 / np.sqrt(len(ts_log_diff))
reference_levels = (0, -conf_bound, conf_bound)

#Plot ACF:
plt.subplot(121)
plt.plot(lag_acf)
for level in reference_levels:
    plt.axhline(y=level, linestyle='--', color='gray')
plt.title('Autocorrelation Function')

#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
for level in reference_levels:
    plt.axhline(y=level, linestyle='--', color='gray')
#df.index=df.Date_time
df.head()
sale = df.Sales
#lnsale=np.log(sale)
#lnsale
#plt.plot(lnsale)

# Autocorrelation of the raw series, shown as a bar chart with the index shifted by one.
acf_1 = acf(sale)
test_df = pd.DataFrame(acf_1, columns=['Auto-correlation'])
test_df.index += 1
test_df.plot(kind='bar')
plt.show()

# Partial autocorrelation of the raw series, same presentation.
pacf_1 = pacf(sale)
test_df = pd.DataFrame(pacf_1, columns=['Partial-Autocorrelation'])
test_df.index += 1
test_df.plot(kind='bar')
plt.show()

# Augmented Dickey-Fuller test on the raw series.
result = ts.adfuller(sale)
result

# First difference; the leading NaN produced by the shift is dropped.
sale_diff = sale - sale.shift(1)
diff = sale_diff.dropna()

acf_1_diff = acf(diff)
test_df = pd.DataFrame(acf_1_diff, columns=['First difference Auto-correlation'])
test_df.index += 1
test_df.plot(kind='bar')
plt.plot(seasonal, label='Seasonal')
plt.legend(loc='best')
plt.subplot(414)
# Draw the residual component so the last panel (and its legend) is not empty.
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()

# NOTE: this is an alias, not a copy - dropna(inplace=True) also modifies `residual`.
decomposedLogData = residual
decomposedLogData.dropna(inplace=True)
test_stationarity(decomposedLogData)

## ACF and PACF Plots ##
#from statsmodels.tsa.stattools import acf,pacf
lag_acf = acf(dflogDiffShifting, nlags=20)
lag_pacf = pacf(dflogDiffShifting, nlags=20, method='ols')

# Reference level used for the dashed lines: 1.96 / sqrt(N).
conf_bound = 1.96 / np.sqrt(len(dflogDiffShifting))

## Plot ACF ##
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-conf_bound, linestyle='--', color='gray')
plt.axhline(y=conf_bound, linestyle='--', color='gray')
plt.title('Autocorrelation Function')

## Plot PACF ##
n: length of series Returns: the series """ e = np.random.standard_normal(n) y = np.zeros(n) for t in np.arange(2,n): y[t] = phi1*y[t-1] + phi2*y[t-2] + e[t] return y # examples n = 100 y = ar2(0.7,0.2,n) ncorr = 25 # number of lags to compute for the autocorrelation functions y_acf = acf(y,nlags=ncorr) y_pacf = pacf(y,nlags=ncorr) # plot the series fig,ax = plt.subplots(figsize=(14,4)) ax.plot(y,label=r'$y_t$') ax.legend() ax.set_title(r'$y_t= \phi_1 y_{t-1} + \phi_2 y_{t-2} + \epsilon_t$') plt.show() #plot the acf and pacf fig2,axes = plt.subplots(2) fig2.subplots_adjust(hspace=0.5) axes[0].bar(np.arange(ncorr+1), y_acf) axes[0].set_title("Autocorrelation") axes[1].bar(np.arange(ncorr+1), y_pacf) axes[1].set_title("Partial Autocorrelation")
# Log-transform the series, then difference it against the value 7 steps earlier.
ts_logtransformed = np.log(t_series)
#plt.plot(ts_logtransformed)
#test_stationarity(ts_logtransformed)
ts_diff_logtrans = ts_logtransformed - ts_logtransformed.shift(7)
ts_diff_logtrans.head(10)
#test_stationarity(ts_diff_logtrans)

# The first 7 entries are NaN after the shift; remove them in place.
ts_diff_logtrans.dropna(inplace=True)
#test_stationarity(ts_diff_logtrans)
#plt.plot(ts_diff_logtrans)

# ACF and PACF plots:
lag_acf = acf(ts_diff_logtrans, nlags=30)
lag_pacf = pacf(ts_diff_logtrans, nlags=50, method='ols')

# Plot ACF with dashed reference lines at zero and +/- 1.96 / sqrt(N):
conf_bound = 1.96 / np.sqrt(len(ts_diff_logtrans))
plt.subplot(121)
plt.plot(lag_acf)
for level in (0, -conf_bound, conf_bound):
    plt.axhline(y=level, linestyle='--', color='gray')
plt.title('Autocorrelation Function')

# Plot PACF:
plt.subplot(122)
# First difference of the 'https' column, computed once and reused below.
#df['First Difference'] = df['https'] - df['https'].shift()
first_difference = df['https'] - df['https'].shift()

y = np.array(first_difference)
_draw_multiple_line_plot('protocols_diff.html',
                         'transport layer protocols',
                         [X],
                         [y],
                         ['navy'],
                         ['packets percentage delta'],
                         [None],
                         [1],
                         'datetime',
                         'Date',
                         'Packets Percentage Delta',
                         y_start=-100, y_end=100)

df['first difference'] = first_difference

# Skip the first row: it is NaN because there is no previous value to subtract.
lag_correlations = acf(df['first difference'].iloc[1:])
lag_partial_correlations = pacf(df['first difference'].iloc[1:])

# print() with a single argument behaves the same on Python 2 and Python 3.
print('lag_correlations')
print(lag_correlations)

y = lag_correlations
_draw_multiple_line_plot('lag_correlations.html',
                         'lag_correlations',
                         [X],
                         [y],
                         ['navy'],
                         ['lag_correlations'],
                         [None],
                         [1],
                         'datetime',
                         'Date',
                         'lag_correlations',
                         y_start=-1, y_end=1)
def arima_model(df: pd.DataFrame, cols: list, lag: int, order: int, moving_avg_model: int, with_graph: bool):
    """Fit an ARIMA model to each requested column and print diagnostics.

    For every column this prints the model summary, R-squared and adjusted
    R-squared, a correlogram of the residuals (ACF, PACF and Q statistic),
    an F test on the first AR/MA coefficients (when the model has any) and
    descriptive statistics of the residuals.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding the series to model.
    cols : list
        Names of the columns of ``df`` to fit, one model per column.
    lag : int
        AR order, first element of the ARIMA ``order`` tuple.
    order : int
        Differencing order, second element of the ARIMA ``order`` tuple.
    moving_avg_model : int
        MA order, third element of the ARIMA ``order`` tuple.
    with_graph : bool
        When ``True``, also show PACF and autocorrelation plots of the residuals.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # Number of lags shown in the residual correlogram and PACF plot.
    resid_lags = 15

    for col in cols:
        model = ARIMA(df[col], order=(lag, order, moving_avg_model))
        model_fit = model.fit()
        print('\t==== Summary of ARIMA(%d, %d, %d) model for %s ====\n' % (lag, order, moving_avg_model, col))
        print(model_fit.summary())
        print()

        # R-squared from the total sum of squares and the model's SSE.
        x_mean = df[col].mean()
        sst = df[col].apply(lambda x: (x - x_mean)**2).sum()
        ssr = sst - model_fit.sse
        r_squared = ssr / sst
        print('R-squared: %f\n' % r_squared)

        # k counts the AR and MA roots reported by the fitted model.
        n = len(df[col])
        k = len(model_fit.arroots) + len(model_fit.maroots)
        print('n: %d, k: %d' % (n, k))
        adj_r_sqr = 1 - ((1 - r_squared) * (n - 1)) / (n - k - 1)
        print('Adjusted R-squared: %f' % adj_r_sqr)
        print()

        print('\t==== Correlogram of residuals ====\n')
        # With qstat=True, acf returns (acf, Q statistics, p-values) in that
        # order; the Q statistics are the second element, not the third.
        acf_results, q_stat, _ = acf(model_fit.resid, nlags=resid_lags, qstat=True)
        pacf_results = pacf(model_fit.resid, nlags=resid_lags)
        # Row labels are 1-based row numbers; row index `clag` holds lag `clag`.
        # There is no Q statistic for lag 0, so that row prints '-'.
        for clag in range(resid_lags + 1):
            q_value = '-' if clag == 0 else q_stat[clag - 1]
            print('%d:' % (clag + 1), acf_results[clag], pacf_results[clag], q_value, sep='\t')
        print()

        if lag > 0 or moving_avg_model > 0:
            # Restrict the first AR and/or MA coefficient to zero.
            restrictions = []
            if lag > 0:
                restrictions.append('(ar.L1 = 0)')
            if moving_avg_model > 0:
                restrictions.append('(ma.L1 = 0)')
            r_matrix = ','.join(restrictions)
            f_test = model_fit.f_test(r_matrix)
            print('\t==== F Test ====\n', f_test.summary())
            print()

        print('\t==== Summary of residuals for %s ====\n' % col)
        residuals = pd.DataFrame(model_fit.resid)
        print(residuals.describe())
        print()

        if with_graph is True:
            plot_pacf(residuals, lags=resid_lags,
                      title='ARIMA(%d, %d, %d): PAC plot for residuals of %s' % (lag, order, moving_avg_model, col))
            plt.show()
            #residuals.plot(kind='kde', title='Density of residuals %s' % col)
            #plt.show()
            ax = pd.plotting.autocorrelation_plot(pd.DataFrame(acf_results))
            ax.set_title('ARIMA(%d, %d, %d): AC plot for residuals of %s' % (lag, order, moving_avg_model, col))
            plt.show()
def segment_analysis(csv_fname, trajectory_df):
    """Compute confidence-filtered ACF/PACF values for sliding windows of a trajectory.

    The trajectory is cut into ``len(trajectory_df.index) - WINDOW_LEN``
    overlapping windows of ``WINDOW_LEN`` rows. For every column of every
    window the ACF and PACF (``LAGS`` lags, 95% confidence intervals) are
    computed, and any lag whose confidence interval is at least
    ``CONFINT_THRESH`` wide is set to zero.

    Parameters
    ----------
    csv_fname
        Label stored as the first level ('Trajectory') of the panel's major axis.
    trajectory_df
        DataFrame with one column per analysed variable.

    Returns
    -------
    pd.Panel or None
        ``None`` when the trajectory has fewer than ``MIN_TRAJECTORY_LEN``
        rows; otherwise a panel with ACF items first and PACF items after
        them, (trajectory, segment id) on the major axis and lags on the
        minor axis.

    NOTE(review): relies on ``DataFrame.iteritems`` and ``pd.Panel``, which
    presumably exist only in older pandas releases - confirm the pinned version.
    """
    # Trajectories that are too short are not analysed.
    if len(trajectory_df.index) < MIN_TRAJECTORY_LEN:
        return None

    num_segments = len(trajectory_df.index) - WINDOW_LEN
    n_vars = len(INTERESTED_VALS)

    # Axis 0: ACF rows for every variable, followed by the PACF rows.
    confident_data = np.zeros((2 * n_vars, num_segments, LAGS + 1))

    for segment_i in range(num_segments):
        # Slice the current window out of the trajectory.
        segment = trajectory_df[segment_i:segment_i + WINDOW_LEN]

        # Analyse the window one variable (column) at a time.
        for var_name, var_values in segment.iteritems():
            var_index = segment.columns.get_loc(var_name)

            # ACF: zero out (in place) lags whose confidence interval is too wide.
            col_acf, acf_confint = acf(var_values, nlags=LAGS, alpha=.05)
            acf_too_wide = (acf_confint[:, 1] - acf_confint[:, 0]) >= CONFINT_THRESH
            col_acf[acf_too_wide] = 0.
            confident_data[var_index, segment_i, :] = col_acf

            # PACF: same filtering, stored in the second half of axis 0.
            # TODO: check for PACF values above or below +-1
            col_pacf, pacf_confint = pacf(var_values, nlags=LAGS, method='ywmle', alpha=.05)
            pacf_too_wide = (pacf_confint[:, 1] - pacf_confint[:, 0]) >= CONFINT_THRESH
            col_pacf[pacf_too_wide] = 0.
            confident_data[var_index + n_vars, segment_i, :] = col_pacf

    # Two-level major axis: the file name repeated, and a zero-padded segment id.
    trajectory_labels = np.array([csv_fname] * num_segments)
    segment_labels = np.array(["{index:0>3d}".format(index=segment_i)
                               for segment_i in range(num_segments)])
    major_axis = [trajectory_labels, segment_labels]

    item_names = ['acf_velox', 'acf_veloy', 'acf_veloz', 'acf_curve', 'acf_logcurve',
                  'pacf_velox', 'pacf_veloy', 'pacf_veloz', 'pacf_curve', 'pacf_logcurve']

    # analysis panel
    filtpanel = pd.Panel(confident_data,
                         items=item_names,
                         major_axis=major_axis,
                         minor_axis=np.arange(LAGS + 1))
    filtpanel.major_axis.names = ['Trajectory', 'segment_ID']

    return filtpanel
def test_pacf_nlags_error(reset_randomstate):
    """Requesting 50 lags from a 100-sample series must raise ValueError."""
    noise = np.random.standard_normal(100)
    expected_message = "Can only compute partial"
    with pytest.raises(ValueError, match=expected_message):
        pacf(noise, 50)
print(arma_res.summary())


# In[3]:

# Plot the residuals, skipping the first observation.
arma_res.resid.iloc[1:].plot(figsize=(6,4),color='seagreen')
# Raw string: '\h' is not a valid escape sequence in a normal string literal.
plt.ylabel(r'$\hat{z_t}$')


# In[4]:

from statsmodels.tsa import stattools

# With qstat=True, acf returns (acf, Q statistics, p-values).
acf,q,pvalue = stattools.acf(arma_res.resid,nlags=5,qstat=True)
# With alpha set, pacf returns (pacf, confidence intervals).
pacf,confint = stattools.pacf(arma_res.resid,nlags=5,alpha=0.05)
print("自己相関係数:",acf)
print("p値:",pvalue)
print("偏自己相関:",pacf)
print("95%信頼区間:",confint)


# In[5]:

p=sm.tsa.adfuller(arma_res.resid,regression='nc')[1]  # element [1] is the p-value of the test
p1=sm.tsa.adfuller(arma_res.resid,regression='c')[1]  # element [1] is the p-value of the test
print("ドリフト無しランダムウォーク p値:",p)
print("ドリフト付きランダムウォーク p値:",p1)
def print_figures(self, directory, prefix=""):
    """Save diagnostic figures and a parameter dump for this time series.

    Files written into ``directory`` (each name prefixed with ``prefix``):

    * ``rainfall.pdf`` - precipitation time series
    * ``cdf.pdf`` - cumulative frequency of precipitation
    * ``acf.pdf`` / ``pacf.pdf`` - sample (partial) autocorrelation
    * ``poisson_rate.pdf``, ``gamma_mean.pdf``, ``gamma_dispersion.pdf``,
      ``z.pdf`` - poisson rate, gamma mean, gamma dispersion and latent
      variable Z over time
    * ``parameter.txt`` - ``str(self)``

    Args:
        directory: where to save the figures
        prefix: what to name the figures
    """
    # required when plotting times on an axis
    pandas.plotting.register_matplotlib_converters()

    colours = matplotlib.rcParams['axes.prop_cycle'].by_key()['color']
    cycle = cycler.cycler(color=[colours[0]], linewidth=[1])

    def start_figure():
        # Open a new figure whose axes use the single-colour cycle.
        plt.figure()
        plt.gca().set_prop_cycle(cycle)

    def finish_figure(x_label, y_label, file_name):
        # Label the current figure, save it under directory/prefix and close it.
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.savefig(path.join(directory, prefix + file_name))
        plt.close()

    def save_time_plot(values, y_label, file_name):
        # Plot `values` against the time axis and save the figure.
        start_figure()
        plt.plot(self.time_array, values)
        finish_figure("time", y_label, file_name)

    def save_correlation_bars(values, y_label, file_name):
        # Bar chart of correlations with dashed lines at +/- 1/sqrt(len(self)).
        start_figure()
        plt.bar(np.arange(values.size), values)
        bound = 1 / math.sqrt(len(self))
        plt.axhline(bound, linestyle='--', linewidth=1)
        plt.axhline(-bound, linestyle='--', linewidth=1)
        finish_figure("time (day)", y_label, file_name)

    # get autocorrelations
    n_lags = 20
    acf = stattools.acf(self.y_array, nlags=n_lags, fft=True)
    try:
        pacf = stattools.pacf(self.y_array, nlags=n_lags)
    except np.linalg.LinAlgError:
        # Fall back to NaNs (lag 0 included) so the PACF figure is still produced.
        pacf = np.full(n_lags + 1, np.nan)

    # print precipitation time series
    save_time_plot(self.y_array, "rainfall (mm)", "rainfall.pdf")

    # print precipitation cumulative frequency
    # draw dot for mass at 0 mm
    start_figure()
    rain_sorted = np.sort(self.y_array)
    cdf = np.arange(len(self))
    plt.plot(rain_sorted, cdf)
    if np.any(rain_sorted == 0):
        nonzero_positions = rain_sorted.nonzero()[0]
        if nonzero_positions.size > 0:
            # the entry just before the first non-zero value
            dot_index = nonzero_positions[0] - 1
        else:
            # every value is zero
            dot_index = len(cdf) - 1
        plt.scatter(0, cdf[dot_index])
    finish_figure("rainfall (mm)", "cumulative frequency", "cdf.pdf")

    # plot sample autocorrelation and partial autocorrelation
    save_correlation_bars(acf, "autocorrelation", "acf.pdf")
    save_correlation_bars(pacf, "partial autocorrelation", "pacf.pdf")

    # plot the model parameters and the latent variable z over time
    save_time_plot(self.poisson_rate.value_array, "poisson rate",
                   "poisson_rate.pdf")
    save_time_plot(self.gamma_mean.value_array, "gamma mean (mm)",
                   "gamma_mean.pdf")
    save_time_plot(self.gamma_dispersion.value_array, "gamma dispersion",
                   "gamma_dispersion.pdf")
    save_time_plot(self.z_array, "Z", "z.pdf")

    # print the parameters in text; the context manager closes the file even
    # if str(self) or the write raises
    with open(path.join(directory, prefix + "parameter.txt"), "w") as parameter_file:
        parameter_file.write(str(self))
color='lightgray', ax=ax1) ax2 = fig.add_subplot(1, 2, 2) fig = sm.graphics.tsa.plot_pacf(lnn225.squeeze(), lags=40, color='lightgray', ax=ax2) #fig.show() arma_mod = sm.tsa.ARMA(lnn225, order=(1, 0)) arma_res = arma_mod.fit(trend='c', disp=-1) print(arma_res.summary()) acf, q, pvalue = stattools.acf(arma_res.resid, nlags=5, qstat=True) pacf, confint = stattools.pacf(arma_res.resid, nlags=5, alpha=0.05) print("自己相関係数:", acf) print("p値:", pvalue) print("偏自己相関:", pacf) print("95%信頼区間:", confint) p = sm.tsa.adfuller(arma_res.resid, regression='nc')[1] #[1]はp値の検定結果 p1 = sm.tsa.adfuller(arma_res.resid, regression='c')[1] #[1]はp値の検定結果 print("ドリフト無しランダムウォーク p値:", p) print("ドリフト付きランダムウォーク p値:", p1) from scipy.stats import t resid = arma_res.resid.iloc[1:] m = resid.mean() v = resid.std() resid_max = pd.Series.rolling(arma_res.resid, window=250).mean().max()
# -*- coding: utf-8 -*-
"""Plot a random walk with rolling means and save its ACF / PACF bar charts."""
import numpy as np
from pandas import *
from statsmodels.tsa import stattools
import matplotlib.pyplot as plt

randn = np.random.randn

# NOTE(review): DateRange and rolling_mean presumably exist only in very old
# pandas releases - confirm the pandas version this script targets.
ts = Series(randn(1000), index=DateRange('2000/1/1', periods=1000))
ts = ts.cumsum()  # random walk

ts.plot(style='<--')
rolling_mean(ts, 60).plot(style='--', c='r')
rolling_mean(ts, 180).plot(style='--', c='b')

# nlags must be passed by keyword: the second positional parameter of
# stattools.acf is the bias-adjustment flag, not the number of lags.
acf = stattools.acf(np.array(ts), nlags=50)
plt.bar(range(len(acf)), acf, width=0.01)
plt.savefig("image.png")

pcf = stattools.pacf(np.array(ts), nlags=50)
plt.bar(range(len(pcf)), pcf, width=0.01)
# Save before show(): once the window is closed the figure may be empty.
plt.savefig("image2.png")
plt.show()
# y(t-1),y(t-2),y(t-3). # q : This is the number of MA (Moving-Average) terms . Example — if p is 3 the predictor for y(t) will be # y(t-1),y(t-2),y(t-3). # d :This is the number of differences or the number of non-seasonal differences . #FIND VALUES OF p AND q: # Autocorrelation Function (ACF): It just measures the correlation between two consecutive (lagged version). # example at lag 4, ACF will compare series at time instance t1…t2 with series at instance t1–4…t2–4 # Partial Autocorrelation Function (PACF): is used to measure the degree of association between y(t) and y(t-p). from statsmodels.tsa.arima_model import ARIMA #ACF and PACF plots: from statsmodels.tsa.stattools import acf, pacf lag_acf = acf(ts_log_diff, nlags=20) lag_pacf = pacf(ts_log_diff, nlags=20, method='ols') #plot ACF: # plt.subplot(121) # plt.plot(lag_acf) # plt.axhline(y=0, linestyle='--', color='gray') # plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)), linestyle='--', color='gray') # plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)), linestyle='--', color='gray') # plt.title('Autocorrelation Function') #Plot PACF: # plt.subplot(122) # plt.plot(lag_pacf) # plt.axhline(y=0, linestyle='--', color='gray') # plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)), linestyle='--', color='gray') # plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)), linestyle='--', color='gray')