def csv_file_pick_rtt_series():
    """Run the analysis selected by ``ACTION`` on every RTT series in the CSV.

    Reads ``JSON2CSV_FILE_ALL``, a ``;``-separated file. The first line is
    skipped (presumably a header row). On every following line, field 1 is
    the destination, field 2 is the probe and fields 3.. are RTT samples.
    Samples equal to ``-1`` are dropped before the series is analysed
    (presumably they mark missing measurements -- confirm with the producer
    of the file).

    Depending on the module-level ``ACTION`` the series is passed to
    ``plot_fft_autocorr`` ("periodicity"), ``rtt_statistics``
    ("rtt_statistics") or shown as an autocorrelation plot
    ("autocorr_plot"). Any other ``ACTION`` value silently does nothing.
    Lines without a usable sample are reported on stdout.
    """
    with open(JSON2CSV_FILE_ALL) as f_handler:
        # Skip the first line; the default avoids StopIteration on an empty file.
        next(f_handler, None)
        for line in f_handler:
            # Split once and convert each sample once.
            fields = line.split(';')
            dest = fields[1]
            probe = fields[2]
            samples = [float(field) for field in fields[3:]]
            rtt_series_one_line = [rtt for rtt in samples if rtt != -1]

            if not rtt_series_one_line:
                # Single pre-formatted string prints the same on Python 2 and 3.
                print("%s %s %s is an empty list" % (probe, GENERATE_TYPE, dest))
                continue

            if ACTION == "periodicity":
                plot_fft_autocorr(rtt_series_one_line, dest, probe)
            elif ACTION == "rtt_statistics":
                rtt_statistics(rtt_series_one_line, dest, probe)
            elif ACTION == "autocorr_plot":
                autocorrelation_plot(pd.Series(rtt_series_one_line))
                plt.show()
def autocorr_plot(clusters, data, savedir=None):
    """Save autocorrelation plots for all data and for each cluster.

    Two PNG files are written: ``all_autocorr.png`` (autocorrelation of
    ``data.T``) and ``incluster_autocorr.png`` (one subplot per cluster on a
    shared-axis grid).

    Parameters
    ----------
    clusters : mapping
        Maps a cluster id to the row labels of ``data`` in that cluster.
    data : pandas.DataFrame
        Rows are selected with ``data.loc[members, :]``.
    savedir : str, optional
        Output directory. When omitted the module-level ``directory`` is
        used, which is what the function always wrote to before.
        NOTE(review): ``savedir`` was previously accepted but ignored;
        confirm no caller passes a value it does not want honoured.
    """
    out_dir = savedir if savedir is not None else directory
    n = len(clusters)

    plt.figure()
    autocorrelation_plot(data.T)
    plt.savefig(out_dir + '/all_autocorr.png')
    plt.close()

    if n == 0:
        # Nothing to lay out; plt.subplots(0, 1) would raise.
        return

    # Smallest grid with int(sqrt(n)) rows that holds all n clusters.
    rows = int(math.sqrt(n))
    cols = 1
    while rows * cols < n:
        cols += 1

    # squeeze=False keeps axarr 2-D even for a single row or column,
    # so axarr[i, j] is always valid.
    fig, axarr = plt.subplots(rows, cols, sharex=True, sharey=True,
                              squeeze=False)
    i = 0
    j = 0
    # .items() works for dicts on Python 2 and 3.
    for cid, cluster in clusters.items():
        autocorrelation_plot(data.loc[cluster, :].T, ax=axarr[i, j])
        # Fill the grid column by column.
        i = (i + 1) % rows
        if i == 0:
            j = (j + 1) % cols

    plt.savefig(out_dir + '/incluster_autocorr.png')
    plt.close()
def autocorrelation(array, name):
    """Save an autocorrelation plot of *array* as ``Election_13/stats/<name>.png``.

    Parameters
    ----------
    array : series-like
        Passed straight to ``autocorrelation_plot``.
    name : str
        Used both as the legend entry and as the output file stem.
    """
    fig = plt.figure()
    autocorrelation_plot(array)
    plt.legend([name], loc='upper left')
    fig.savefig("Election_13/stats/" + name + ".png", bbox_inches='tight')
    # Close rather than merely clear: a cleared figure stays registered with
    # pyplot, so repeated calls would accumulate open figures.
    plt.close(fig)
def ts_plots(rets, figsize=(12, 10)):
    """Draw a 2x2 diagnostic panel for the series *rets*.

    The four axes hold, in row-major order: a line plot, a 50-bin histogram,
    a lag-1 lag plot and an autocorrelation plot.

    Parameters
    ----------
    rets : pandas.Series
        Series to plot; must support ``.plot(kind=...)``.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``.
    """
    import matplotlib.pyplot as plt

    fig, axarr = plt.subplots(2, 2, sharex=False, sharey=False,
                              figsize=figsize)
    # Builtin next() works on Python 2.6+ and 3; the .next() method is
    # Python 2 only.
    axes = iter(np.array(axarr).ravel())
    rets.plot(kind='line', ax=next(axes))
    rets.plot(kind='hist', bins=50, ax=next(axes))
    lag_plot(rets, lag=1, ax=next(axes))
    autocorrelation_plot(rets, ax=next(axes))
def process_trace(connection, name):
    """Download a demand trace and show its autocorrelation and FFT.

    Parameters
    ----------
    connection : object
        Must provide ``demand(name)`` returning an object whose ``elements``
        each carry a ``value`` attribute.
    name : str
        Identifier of the trace to download.

    Prints the number of samples and the lag-1 autocorrelation, draws the
    autocorrelation plot on the current axes and the real and imaginary
    parts of the FFT on a new figure, then blocks in ``plt.show()``.
    """
    # Reference for the analysis steps:
    # http://www.simafore.com/blog/bid/105815/Time-series-analysis-using-R-for-cost-forecasting-models-in-8-steps
    try:
        from pandas.plotting import autocorrelation_plot
    except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
        from pandas.tools.plotting import autocorrelation_plot
    from pandas import Series

    print('Downloading...')
    timeSeries = connection.demand(name)
    print('complete')

    demand = np.array([element.value for element in timeSeries.elements],
                      dtype=float)
    print(len(demand))

    spectrum = np.fft.fft(demand)
    freq = np.fft.fftfreq(len(demand))

    # Drawn before the FFT figure is created, so it lands on its own axes.
    autocorrelation_plot(demand)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(freq, spectrum.real, freq, spectrum.imag)

    print('pandas')
    s = Series(demand, index=range(0, len(demand)))
    corr = s.autocorr()
    print(corr)
    plt.show()
def plot_autocorrelation(self):
    """
    Overlay the autocorrelation of every power series on one axis.

    Each series yielded by ``self.power_series()`` is drawn onto the same
    freshly created axis.

    Reference:
        http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm

    Returns
    -------
    matplotlib.axis
    """
    _, shared_axis = plt.subplots()
    for series in self.power_series():
        autocorrelation_plot(series, ax=shared_axis)
    return shared_axis
def test_autocorrelation_plot(self):
    """Check autocorrelation_plot on a Series and an ndarray, and its legend label."""
    from pandas.tools.plotting import autocorrelation_plot

    # Both the Series and its raw values must be plottable.
    for candidate in (self.ts, self.ts.values):
        _check_plot_works(autocorrelation_plot, series=candidate)

    # A label passed through must end up as the single legend entry.
    labelled_axes = autocorrelation_plot(self.ts, label='Test')
    self._check_legend_labels(labelled_axes, labels=['Test'])
def test_autocorrelation_plot(self):
    """Check autocorrelation_plot on a Series and an ndarray, and its legend text."""
    from pandas.tools.plotting import autocorrelation_plot

    # Both the Series and its raw values must be plottable.
    for candidate in (self.ts, self.ts.values):
        _check_plot_works(autocorrelation_plot, candidate)

    # The label passed in must be the text of the first legend entry.
    labelled_axes = autocorrelation_plot(self.ts, label='Test')
    legend_entries = labelled_axes.get_legend().get_texts()
    self.assertEqual(legend_entries[0].get_text(), 'Test')
def plotAutocorrelation(pth, bucketName):
    """Show autocorrelation plots of the level-1 ask/bid price and volume.

    Parameters
    ----------
    pth : str
        Prefix concatenated directly with *bucketName* to form the HDF5 file
        path (no separator is inserted).
    bucketName : str
        File name of the HDF5 store; the ``'capitalKDF'`` key is read.

    The frame is indexed with three-part column keys such as
    ``('A', 'p', '1')``. Four panels are drawn on a 2x2 grid and
    ``plt.show()`` is called.
    """
    df = pd.read_hdf(pth + bucketName, 'capitalKDF')

    panels = [
        (('A', 'p', '1'), "best ask price"),
        (('A', 'v', '1'), "best ask volume"),
        (('B', 'p', '1'), "best bid price"),
        (('B', 'v', '1'), "best bid volume"),
    ]
    for position, (column, description) in enumerate(panels, 1):
        autocorrelation_plot(df[column], plt.subplot(2, 2, position))
        # These are autocorrelation plots; the titles previously said
        # "Lag plot", which mislabelled them.
        plt.title("Autocorrelation plot for " + description)
    plt.show()
def _plot(self):
    """Draw the autocorrelation of ``self.data`` on the current axes of ``self.pl``.

    Applies the configured title, axis labels and optional axis limits,
    installs a custom x tick formatter when ``self.xScale`` is not 1.0,
    and turns the grid on.
    """
    pl = self.pl
    axes = pl.gca()
    series = pd.Series(np.asarray(self.data))
    pdPlot.autocorrelation_plot(series, ax=axes)

    # Only rescale tick labels when a non-trivial scale is configured.
    if self.xScale != 1.0:
        axes.get_xaxis().set_major_formatter(
            FuncFormatter(self._scaleTickMark))

    pl.title(self.title)
    pl.xlabel(self.xLabel)
    pl.ylabel(self.yLabel)

    # Limits are optional; a falsy value leaves matplotlib's autoscaling.
    if self.xLimits:
        pl.xlim(*self.xLimits)
    if self.yLimits:
        pl.ylim(*self.yLimits)

    pl.grid(True)
def pandas_autocorr(chain, labels, plt_label):
    """Save an autocorrelation plot with one curve per chain parameter.

    Parameters
    ----------
    chain : 2-D array
        Indexed as ``chain[nburn:, i]``; one column per parameter. Rows
        before the module-level ``nburn`` are skipped.
    labels : sequence of str
        Legend label for each parameter column.
    plt_label : str
        Suffix of the output file
        ``pandas_autocorrelation_<plt_label>.png``.
    """
    try:
        from pandas.plotting import autocorrelation_plot
    except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
        from pandas.tools.plotting import autocorrelation_plot

    npars = chain.shape[1]
    cmap = get_cmap(npars)

    plt.figure(figsize=(16, 6))
    # A plain loop: the calls are made for their drawing side effect, and
    # range() works on both Python 2 and 3 (xrange is Python 2 only).
    for i in range(npars):
        autocorrelation_plot(chain[nburn:, i], color=cmap(i), lw=5,
                             alpha=0.8, label=labels[i])

    plt.legend(loc=1, fontsize=14, ncol=3, numpoints=1, markerscale=0.7)
    plt.tight_layout()
    plt.savefig('pandas_autocorrelation_' + plt_label + '.png', dpi=150)
    plt.close()
# Cumulative feature sets: each "chapter" list is the union of the feature
# families introduced up to that chapter.
# NOTE(review): the lists are built from a set, so their element order is
# arbitrary -- confirm nothing downstream depends on column order.
features_after_chapter_3 = list(set().union(basic_features, pca_features))
features_after_chapter_4 = list(set().union(basic_features, pca_features, time_features, freq_features))
features_after_chapter_5 = list(set().union(basic_features, pca_features, time_features, freq_features, cluster_features))
# Hand-picked subset of features.
selected_features = ['temp_pattern_labelOnTable','labelOnTable', 'temp_pattern_labelOnTable(b)labelOnTable', 'cluster',
                     'pca_1_temp_mean_ws_120','pca_2_temp_mean_ws_120','pca_2','acc_watch_y_temp_mean_ws_120','gyr_watch_y_pse',
                     'gyr_watch_x_pse']
# possible_feature_sets and feature_names are parallel lists (same order).
possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features]
feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']

# Let us first study whether the time series is stationary and what the autocorrelations are.
# Augmented Dickey-Fuller test with the lag length chosen by AIC.
dftest = adfuller(dataset['hr_watch_rate'], autolag='AIC')
print dftest
autocorrelation_plot(dataset['hr_watch_rate'])
plot.show()

# Now let us focus on the learning part.
learner = TemporalRegressionAlgorithms()
# NOTE: this name shadows the builtin eval() for the rest of the module.
eval = RegressionEvaluation()

# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.
repeats = 5

# we set a washout time to give the NN's the time to stabilize. We do not compute the error during the washout time.
washout_time = 10
# Final prediction is the sum of the registered and casual predictions,
# clipped to the range of counts seen in training.
ftest['count'] = reg + cas
ftest['count'] = ftest['count'].clip(0, np.max(train['count']))
ftest[['count']].to_csv('submission-02.csv')


# ## Exploring Autocorrelation

# In[73]:

import matplotlib.pyplot as plt
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
    from pandas.tools.plotting import autocorrelation_plot

# Registered & Casual on Workingdays
# One subplot per hour of the day on an 8x3 grid; registered in cyan,
# casual in magenta.
fig, axes = plt.subplots(ncols=3, nrows=8, figsize=(16, 16))
for h in range(24):
    # Combine both conditions into one aligned mask instead of chaining
    # two boolean selections (the second mask would not match the index
    # of the already-filtered series).
    hour_mask = (train.hour == h) & (train.workingday == 1)
    hour_axes = axes[h // 3][h % 3]
    autocorrelation_plot(train.registered[hour_mask], ax=hour_axes, color='c')
    autocorrelation_plot(train.casual[hour_mask], ax=hour_axes, color='m')


# In[72]:

import matplotlib.pyplot as plt
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
    from pandas.tools.plotting import autocorrelation_plot

# Registered & Casual on Non-Workingdays
fig, axes = plt.subplots(ncols=3, nrows=8, figsize=(16, 16))
for h in range(24):
    hour_mask = (train.hour == h) & (train.workingday == 0)
    hour_axes = axes[h // 3][h % 3]
    autocorrelation_plot(train.registered[hour_mask], ax=hour_axes, color='c')
    autocorrelation_plot(train.casual[hour_mask], ax=hour_axes, color='m')
def gen_cluster_plots(cluster_directory_root, depth): # load data gc, mt, track = load_data(None, 0) data = pd.concat([gc.data, mt.data]) labels = data.index.values pos_labels = labels + '+' neg_labels = labels + '-' pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels, columns=data.columns.values) neg_data = pd.DataFrame(data=data.as_matrix(), index=neg_labels, columns=data.columns.values) data = pd.concat([data, pos_data, neg_data]) generic_dir = cluster_directory_root.split('/') + (['*'] * depth) generic_dir = ('/').join(generic_dir) cluster_directories = \ glob.glob(generic_dir) clusterings = {} clusterings_models = {} for cluster_dir in cluster_directories: try: clustering_id = cluster_dir.split('/')[-1:][0] # read final clusters clusters = {} filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt']) lines = (open(filepath, 'r').read().splitlines()) l = 0 while l < len(lines): cluster_name = lines[l] cluster_members = lines[l + 1].split('\t') clusters[cluster_name] = cluster_members l += 4 clusterings[clustering_id] = clusters # load models models = {} model_files = glob.glob(cluster_dir + '/*') for model_file in model_files: try: model_id = model_file.split('/')[-1:][0] json = open(model_file).read() models[model_id] = HiddenMarkovModel.from_json(json) print 'model loaded from: ', model_file except: pass clusterings_models[clustering_id] = models except: pass background = set() for clustering in clusterings.itervalues(): for cid, members in clustering.iteritems(): background.update(set(members)) background = list(background) # data = data.loc[background, :] # generate ranomd clusterings of the same size k as our models for clustering_id, clustering in clusterings.iteritems(): for model_id, members in clustering.iteritems(): sequences = data.loc[members, :] pltdir = '/'.join(cluster_directory_root.split('/') + ['plots']) # make line plots directory if not os.path.isdir(pltdir + '/line'): print "Creating directory...", pltdir os.mkdir(pltdir 
+ '/line') savename = pltdir + '/line/' + model_id + '_lineplot' plt_title = model_id + ' Line Plot' ax = sequences.T.plot(legend=False, rot=2) ax.set_title(plt_title) ax.set_xlabel('Timepoint') ax.set_ylabel('Normalized Expression') print 'Saving: ', savename fig = ax.get_figure() fig.savefig(savename) fig.clear() # make autocorr plots directory if not os.path.isdir(pltdir + '/autocorr'): print "Creating directory...", pltdir os.mkdir(pltdir + '/autocorr') savename = pltdir + '/autocorr/' + model_id + '_autocorr' plt_title = model_id + ' Autocorr Plot' for seq in sequences.index: ax = autocorrelation_plot(sequences.loc[seq]) ax.set_title(plt_title) print 'Saving: ', savename fig = ax.get_figure() fig.savefig(savename) fig.clear() # make lag plots directory if not os.path.isdir(pltdir + '/lag'): print "Creating directory...", pltdir os.mkdir(pltdir + '/lag') from pylab import * NUM_COLORS = len(members) cm = get_cmap('gist_rainbow') colors = [] for i in range(NUM_COLORS): colors.append(cm(1.*i/NUM_COLORS)) savename = pltdir + '/lag/' + model_id + '_lagplot' plt_title = model_id + ' Lag Plot' for i, seq in enumerate(sequences.index): ax = lag_plot(sequences.loc[seq], c=colors[i]) ax.set_title(plt_title) print 'Saving: ', savename fig = ax.get_figure() fig.savefig(savename) fig.clear() """
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from scipy.interpolate import spline
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
    from pandas.tools.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from scipy.stats import gaussian_kde
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose


def norm(x):
    """Min-max scale *x*: ``(x - min) / (max - min)``.

    A constant input makes the denominator zero.
    """
    lowest = np.min(x)
    return (x - lowest) / (np.max(x) - lowest)


dataframe = pd.read_csv('Chaotic_TimeSeries_turkey_elec.csv')
dataframe.head()
plt.plot(dataframe)
# .iloc selects the first column by position; the .ix indexer used before
# has been removed from pandas.
autocorrelation_plot(dataframe.iloc[:, 0])

### EVALUATE V3 LINES
# Fit ARIMA(2, 1, 0) on the first column.
model00 = ARIMA(np.array(dataframe.iloc[:, 0]), dates=None, order=(2, 1, 0))
model11 = model00.fit(disp=1)
# The bare expressions below only display a value in an interactive session.
model11.summary()
model11.forecast()
resid9 = model11.resid
# Mean absolute residual relative to the series maximum.
np.mean(abs(resid9)) / max(np.array(dataframe.iloc[:, 0]))

# Drop NaN residuals, then test the rest for stationarity.
x3 = resid9
x3 = x3[numpy.logical_not(numpy.isnan(x3))]
dftest13 = adfuller(x3, autolag='AIC')
dfoutput1 = pd.Series(dftest13[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
print('Dickey Fuller Test:\n',dfoutput1)
from pandas import DataFrame
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

# Read in the data
data = pd.read_csv("Budget_test.csv", index_col=0)
# print(data.head())
# Parse the index as dates and give the single column a readable name.
data.index = pd.to_datetime(data.index)
data.columns = ['WRVU Production']
plt.plot(data)
plt.ylabel('wrvus')
plt.show()
# NOTE(review): both `plt` and `pyplot` are used below; presumably both are
# aliases of matplotlib.pyplot imported above this fragment -- confirm.
autocorrelation_plot(data)
pyplot.show()
# Fit ARIMA(5, 1, 0) with convergence output suppressed (disp=0).
model = ARIMA(data, order=(5, 1, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())
# Inspect the residuals: line plot, kernel density estimate, summary stats.
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())
# size is half the number of observations.
x = data.values
size = int(len(x) * 0.5)
import numpy as np
import pandas as pd
import sys
from datetime import datetime as dt
import matplotlib.pyplot as plt
try:
    from pandas.plotting import lag_plot
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helpers under pandas.tools
    from pandas.tools.plotting import lag_plot
    from pandas.tools.plotting import autocorrelation_plot


def to_date(x):
    """Convert a ``YYYYMMDD`` string to a proleptic Gregorian ordinal."""
    return dt.strptime(x, "%Y%m%d").toordinal()


# Column 1 holds the date, column 11 the average temperature.
dates, avg_temp = np.loadtxt(sys.argv[1], delimiter=',', usecols=(1, 11),
                             unpack=True, converters={1: to_date})
dtidx = pd.DatetimeIndex([dt.fromordinal(int(date)) for date in dates])
# The raw values are scaled by 0.1 (presumably stored in tenths of a
# degree -- confirm against the data source).
data = pd.Series(avg_temp * .1, index=dtidx)

fig = plt.figure()
fig.add_subplot(211)
lag_plot(data)

plt.figure()
autocorrelation_plot(data)

plt.figure()
resampled = data.resample('A')
if not isinstance(resampled, pd.Series):
    # pandas >= 0.18 returns a lazy Resampler that needs an explicit
    # aggregation; older versions already returned the annual mean.
    resampled = resampled.mean()
resampled.plot()
plt.show()
# Plot the first 336 samples of the target (the title calls this the first
# 14 days, i.e. 24 samples per day).
line = np.linspace(0, 336, 336)
plt.plot(line, label[0:336])
plt.xlabel('Hour')
plt.ylabel('Power Demand')
plt.title('Power Demand of first 14 days')
plt.show()
lag_plot(label)  # Plotting the lag plot of target feature
plt.title('Lag plot of Power Demand')
plt.xlabel('P(t)')
plt.ylabel('P(t+1)')
plt.show()
# Plotting auto-correlation
autocorrelation_plot(label[0:1000])
plt.show()
# Splitting train and test data
train_data, test_data = data[0:119832], data[119832:]
train_label, test_label = label[0:119832], label[119832:]
# Implementing Persistence Model
# The predictor column is the target shifted by 48 samples.
# NOTE(review): the column is named 't-1' although the shift is 48, and the
# first 48 rows of that column are NaN because of the shift.
df = pd.concat([label.shift(48), label], axis=1)
df.columns = ['t-1', 't+1']
X = df.values
# NOTE(review): the test slice starts at 127656 while training ends at
# 119832, so rows in between are used by neither -- confirm this gap is
# intended (the split above uses 119832 for both).
train, test = X[0:119832], X[127656:]
train_X, train_y = train[:, 0], train[:, 1]
test_X, test_y = test[:, 0], test[:, 1]
import matplotlib.pyplot as plt
import statsmodels.api as sm
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
    from pandas.tools.plotting import autocorrelation_plot
from mpi4py import MPI
# Aliased so it cannot clobber a `datetime` name imported elsewhere.
from datetime import datetime as _datetime


def dateparse(x):
    """Parse a ``YYYY-MM-DD`` string (pd.datetime no longer exists)."""
    return _datetime.strptime(x, '%Y-%m-%d')


# Usage: <script> <csv file> <column name>
file = sys.argv[1]
jenis = sys.argv[2]
data = pd.read_csv(file, index_col='tanggal', date_parser=dateparse)

# Work on the log of the selected column.
dt = np.log(data[jenis])
dt.plot(label='Data '+jenis+' Pengamatan')
plt.savefig('grafik_'+jenis+'.png', transparent=False)

# Start a new figure: autocorrelation_plot draws on the current axes and
# would otherwise be overlaid on the data plot saved above.
plt.figure()
autocorrelation_plot(dt)
plt.savefig('grafik_autocorelation_'+jenis+'.png', transparent=False)

# plot_acf / plot_pacf are called without an axes argument, so savefig
# writes whatever figure is current after each call.
sm.graphics.tsa.plot_acf(dt, lags=40)
plt.savefig('grafik_acf_'+jenis+'.png', transparent=False)
sm.graphics.tsa.plot_pacf(dt, lags=40)
plt.savefig('grafik_pacf_'+jenis+'.png', transparent=False)
#ts.adfuller(dt, 1)

# Only MPI rank 0 fits the model and prints the results.
if MPI.COMM_WORLD.Get_rank()==0:
    arima_mod1 = sm.tsa.ARIMA(dt, (3,0,2)).fit(trend='nc' , disp = False)
    print(arima_mod1.params)
    # NOTE: the Durbin-Watson statistic is computed but its value is discarded.
    sm.stats.durbin_watson(arima_mod1.resid.values)
    #ws.to_csv("Arima_resid"+jenis+".csv")
    print(arima_mod1.aic)
    print(arima_mod1.bic)
    #print("HQIC: "+ arima_mod1.hqic)
誤差項に系列相関が残っている場合、トレンドも含めて、モデルに含まれていない要因が大きい影響を持っている可能性がありますので、思い当たる説明変数を加えてみたり、タイム・トレンドやラグ項を足したり、変分を取るなりして、コントロールしたほうがよいでしょう。 このような系列相関のチェックには、ADF検定によって誤差項の定常性を確認するのも有効だと思います。 """ # ADF test, H0: Non-stationary tsa.adfuller(rlt.resid,regression='nc') # Autocorrel plot of resid autocorrelation_plot(rlt.resid) # Show ACF of residuals ACF_resid=tsa.acf(rlt.resid) # Keep ACF of residuals """ 誤差項が定常であれば、モデル内の説明変数と被説明変数との間に安定した(一時的に外れても帰ってくるような)関係があることが保証されます。また、多くの経済変数はそもそも非定常ですので、残差が定常の場合、重要な要因がモデルから脱落している可能性も低くなります。 系列相関以外に大切なのは、多重共線性(マルチコリニアリティ)のチェックでしょう。これは、説明変数の間に強い相関がある場合に生じるもので、推定される係数の符号が反転してしまったりしますので厄介です。 以下のようにVIF統計量を計算して、10を大きく上回っていなければ、ひとまず安心と考えます。また、VIFを参照して機械的に判定しなくても、想定される符号と逆の符号を持った説明変数が現れれば、経験的にマルチコに気づくと思います。もっとも、マルチコの解決策は強相関している説明変数のどれかを取り除くくらいしか解決策がありません。 リッジ回帰など、パラメター空間を制約するやり方はそもそもパラメターの不偏性を犠牲にする上に、必ずしもマルチコを解消させる保障がないため、歪めますので、計量経済学では推奨されていません。 """ # Checking Multicolinearity by VIF
] possible_feature_sets = [ basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features ] feature_names = [ 'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features' ] # Let us first study whether the time series is stationary and what the autocorrelations are. dftest = adfuller(dataset['acc_phone_x'], autolag='AIC') print dftest autocorrelation_plot(dataset['acc_phone_x']) plot.show() # Now let us focus on the learning part. learner = TemporalRegressionAlgorithms() eval = RegressionEvaluation() # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random. repeats = 5 # we set a washout time to give the NN's the time to stabilize. We do not compute the error during the washout time. washout_time = 10
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error


def parser(x):
    # Prefix '190' to the raw value and parse the result as year-month.
    return datetime.strptime('190' + x, '%Y-%m')


# Load the first column as a parsed date index; squeeze=True asks for a
# Series rather than a one-column DataFrame.
series = read_csv('/Home/Downloads/sales-of-shampoo-over-a-three-ye.csv', header=0, parse_dates=[0], index_col=0,
                  squeeze=True, date_parser=parser)
# NOTE: no show() follows, so this plot only appears at the next pyplot.show().
autocorrelation_plot(series)
# fit model
model = ARIMA(series, order=(5, 1, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())
# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())
# Split the values: the first 66% for training, the rest for testing.
X = series.values
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:len(X)]
loan_count_summary = year_month_summary['issue_d']
loan_count_summary.to_csv("LoanStatsGrouped.csv", index=True)

# We can read later more rapidly
#loan_count_summary = pd.read_csv('LoanStatsGrouped.csv')

# What do we really care? Just the y values
y = loan_count_summary.values
plt.plot(y)
plt.suptitle("the values")
plt.show()


# Test if the time series is stationary
# Here I make an autocorrelation plot of the data. The decay with the lag indicates that the TS is not stationary
import statsmodels.api as sm
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
    from pandas.tools.plotting import autocorrelation_plot
autocorrelation_plot(y)
plt.suptitle("Original series")
plt.savefig("TS.pdf")
#plt.show()


# #### Dickey Fuller test
# I can also perform a Dickey Fuller test for presence of unit roots
test = sm.tsa.adfuller(y)
# Each message is one pre-formatted string so the output is identical on
# Python 2 and Python 3.
print('adf:  %s' % (test[0],))
print('p-value:  %s' % (test[1],))
print('Critical values:  %s' % (test[4],))
# A statistic above the 10% critical value means the unit-root null
# hypothesis cannot be rejected.
if test[0] > test[4]['10%']:
    print('has unit roots , the series is not stationary')
else:
    print('has no unit roots , the series is stationary')
# Daily means of the two series, one panel each.
f, axarr = plt.subplots(2)
axarr[0].plot(nord16.resample('D').mean(), lw = 2)
axarr[1].plot(nord15.resample('D').mean(), color = 'red', lw = 2)
# Daily standard deviations of the two series, one panel each.
f, axarr = plt.subplots(2)
axarr[0].plot(nord16.resample('D').std(), lw = 2)
axarr[1].plot(nord15.resample('D').std(), color = 'red', lw = 2)
# NOTE: despite the `var_` prefix these hold daily standard deviations.
var_nord16 = np.array(nord16.resample('D').std()).ravel()
var_nord15 = np.array(nord15.resample('D').std()).ravel()
plt.figure()
plt.hist(np.array(var_nord16), bins = 20)
plt.figure()
plotting.autocorrelation_plot(pd.Series(var_nord16))
# Same plot for uniform random samples of the same length, for comparison.
plt.figure()
plotting.autocorrelation_plot(pd.Series(np.random.sample(size = len(var_nord16))))
d16nord = DistBetweenZeroVarDays(var_nord16)
d15nord = DistBetweenZeroVarDays(var_nord15)
plt.figure()
plt.hist(np.array(d16nord))
plt.figure()
plt.hist(np.array(d15nord))
# The bare expressions below only display a value in an interactive session.
np.mean(d16nord)
np.mean(d15nord)
np.std(d16nord)
np.std(d15nord)
# converts string to datetime object in pandas:
df['issue_d_format'] = pd.to_datetime(df['issue_d'])
dfts = df.set_index('issue_d_format')
# Group by a YYYYMM integer key and count the rows in each month.
year_month_summary = dfts.groupby(lambda x : x.year * 100 + x.month).count()
loan_count_summary = year_month_summary['issue_d']


# #### Test if the time series is stationary
# Here I make an autocorrelation plot of the data. The decay with the lag indicates that the TS is not stationary

# In[24]:

import statsmodels.api as sm
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
    from pandas.tools.plotting import autocorrelation_plot
autocorrelation_plot(loan_count_summary)


# #### Dickey Fuller test
# I can also perform a Dickey Fuller test for presence of unit roots

# In[25]:

test = sm.tsa.adfuller(loan_count_summary.values)
# Each message is one pre-formatted string so the output is identical on
# Python 2 and Python 3.
print('adf:  %s' % (test[0],))
print('p-value:  %s' % (test[1],))
print('Critical values:  %s' % (test[4],))
# A statistic above the 10% critical value means the unit-root null
# hypothesis cannot be rejected.
if test[0] > test[4]['10%']:
    print('has unit roots , the series is not stationary')
else:
    print('has no unit roots , the series is stationary')
data_bit = [] for i in range(tsbtc.size-1): xy = np.array([tsbtc.ix[i],tsbtc.ix[i+1]]) data_bit.append(xy) dataset = np.array(data_bit) H, xedges, yedges = np.histogram2d(dataset[:,0], dataset[:,1], bins = 20, normed = True) plt.figure() im = plt.imshow(H, interpolation='nearest', origin='low',extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]]) plt.figure() plt.plot(statsmodels.api.tsa.acf(tsbtc)) plt.figure() plotting.autocorrelation_plot(tsbtc.ix[1340:]) reversed_arr = np.fliplr([np.array(tsbtc)])[0] plt.figure() plotting.autocorrelation_plot(reversed_arr) #################################################################################################### def get_dataset(ts): data_bit = [] for i in range(ts.size-1): xy = np.array([ts.ix[i],ts.ix[i+1]]) data_bit.append(xy) dataset = np.array(data_bit) return dataset #################################################################################################### def find_closest_index(edges, x): diffs = np.abs(edges - x)
pun.append(data3['PUN [€/MWH]'].dropna().values.ravel())
# Flatten the per-file arrays into one list and index it hourly from
# 2014-01-01, truncating the date range to the number of values.
unlisted = [item for sublist in pun for item in sublist]
df = pd.DataFrame(unlisted)
df = df.set_index(pd.date_range('2014-01-01', '2016-12-14', freq = 'H')[:df.shape[0]])

# Raw, daily-mean and monthly-mean series.
df.plot()
df.resample('D').mean().plot()
df.resample('M').mean().plot()

plt.figure()
plotting.lag_plot(df.resample('M').mean())

plt.figure()
plotting.autocorrelation_plot(df)

plt.figure()
plotting.autocorrelation_plot(df.resample('D').mean())

# Daily-mean autocorrelation, one figure per year. Boolean masks are
# applied with .loc; the .ix indexer has been removed from pandas.
for year in (2014, 2015, 2016):
    plt.figure()
    plotting.autocorrelation_plot(df.loc[df.index.year == year].resample('D').mean())

plt.figure()
plotting.lag_plot(df.loc[df.index.year == 2014])

plt.figure()
plotting.lag_plot(df.loc[df.index.year == 2015], color = 'red')

plt.figure()
pyplot.show()
# Closing price over time.
pyplot.plot(merged_dataframe['Date'], merged_dataframe['Close'])
pyplot.xlabel('Years')
pyplot.ylabel('Stock Closing Prices')
pyplot.show()
merged_dataframe.info()
# From here on 'Date' holds 'YYYY-MM' strings rather than datetimes.
merged_dataframe['Date'] = merged_dataframe['Date'].dt.strftime('%Y-%m')
# NOTE: despite the `df_` prefix this is a Series of closing prices indexed
# by the formatted date strings.
df_dateclose = pd.Series(merged_dataframe['Close'].values, index=merged_dataframe['Date'])
print(df_dateclose.index)
print(df_dateclose.head())
autocorrelation_plot(df_dateclose)
pyplot.show()
# The bare expressions in this fragment only display a value in an
# interactive session.
merged_dataframe['Close'].head()
# ACF / PACF with default lags and with 50 lags.
plot_acf(df_dateclose)
plot_pacf(df_dateclose)
plot_pacf(df_dateclose, lags=50)
plot_acf(df_dateclose)
plot_acf(df_dateclose, lags=50)
# NOTE(review): passing an existing DataFrame together with index=
# re-indexes its rows to the given labels instead of relabelling them; if
# merged_dataframe is not already indexed by these date strings the result
# is presumably all NaN -- confirm whether set_index('Date') was intended.
arima_df = pd.DataFrame(merged_dataframe, index=merged_dataframe['Date'])
arima_df.index
import Fourier

# Compare the series with its Fourier reconstruction.
reconstructed = Fourier.fourierExtrapolation(dpun, 0, 16)
plt.figure()
plt.plot(dpun)
plt.plot(reconstructed, color = 'red')
# The bare expressions in this fragment only display a value in an
# interactive session.
np.mean(dpun - reconstructed)
np.std(dpun - reconstructed)

try:
    from pandas import plotting
except ImportError:  # pandas < 0.20 kept the module under pandas.tools
    from pandas.tools import plotting

plt.figure()
plotting.lag_plot(pd.DataFrame(dpun))

plt.figure()
plt.plot(statsmodels.api.tsa.acf(dpun))

# Pair every value with its successor and correlate the two columns.
lags = []
for i in range(dpun.size - 1):
    lags.append(np.array([dpun[i], dpun[i+1]]))
lags = pd.DataFrame(lags)
lags.corr()

plt.figure()
plotting.lag_plot(pd.DataFrame(dpun), lag = 7)

plt.figure()
plotting.autocorrelation_plot(pd.DataFrame(dpun))
def autocorrelation(building, floor, group=None, start=_start, end=_end):
    """Draw the autocorrelation plot of one floor's series for *building*.

    ``group``, ``start`` and ``end`` are forwarded unchanged to
    ``get_series``; the entry stored under *floor* in its result is plotted.
    """
    series_by_floor = get_series(building, group=group, start=start, end=end)
    autocorrelation_plot(series_by_floor[floor])
# NOTE: each title string below is also used verbatim as the file name
# passed to savefig.
name = '$R_0$ development through time for windowsize '+str(ws)
plt.title(name)
plt.savefig(name)
plt.clf()
# Box plot of `m`.
ax = plt.subplot()
m.boxplot(ax=ax, rot=90)
name = 'Spread of estimated parameters $p_ij$ for windowsize '+str(ws)
plt.autoscale(tight=True)
plt.title(name)
plt.savefig(name)
plt.clf()
# Autocorrelation of the first 36 rows of `m`, overlaid on one axis.
ax = plt.subplot()
for i in range(36):
    autocorrelation_plot(m.iloc[i,:], ax=ax)#,label=str(m.columns[i]))
name = 'Autocorrelation plot of the parameter estimates for window size '+str(ws)
plt.title(name)
plt.savefig(name)
#plt.legend()
plt.clf()
# Autocorrelation of the first 6 rows of `r`, overlaid on one axis.
ax = plt.subplot()
for i in range(6):
    autocorrelation_plot(r.iloc[i,:], ax=ax)#,label=str(m.columns[i]))
name = 'Autocorrelation plot of the $R_0$ for window size '+str(ws)
plt.title(name)
plt.savefig(name)
plt.clf()
diffs = diff(r)
plt.plot(res_ma.window_size, res_ma.MAPE)
plt.show()

# EWMA
fig= plt.figure(figsize=(12,9))
plt.plot(ts.index, ts.rings, label = 'Ring size', alpha = 0.5)
# One smoothed curve per alpha, each offset by alpha * 10.
for i in np.linspace(1, 0.0001,10):
    plt.plot(ts.index, ts.rings.ewm(alpha = i).mean() + i* 10 , label = 'EWMA: %s'% i)
# Positional limits: the xmin=/xmax= keywords are no longer accepted by
# current matplotlib.
plt.xlim(min(ts.index)-1, max(ts.index) +1)
plt.legend(loc='best')

# Autocorrelation
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
    from pandas.tools.plotting import autocorrelation_plot
# Each autocorrelation plot gets its own figure; autocorrelation_plot draws
# on the current axes and would otherwise land on the EWMA chart.
plt.figure()
autocorrelation_plot(ts.rings)

# Load dow jones
ts = pd.read_csv('../data/Dow-Jones.csv', parse_dates=['Date'], index_col='Date', infer_datetime_format = True)
# Keep rows up to 2010-01-01.
ts = ts[:'2010-01-01']
plt.figure()
autocorrelation_plot(ts.Value)

# Avg temp
ts = pd.read_csv('../data/mean-daily-temperature.csv', parse_dates=['date'], index_col='date', infer_datetime_format = True)
plt.figure()
autocorrelation_plot(ts.temp)

# PACF
import statsmodels.api as sm
def _select_arma_order(values, max_p=5, max_q=5):
    """Return ``(aic, order, fitted_model)`` of the lowest-AIC ARMA(p, q) fit.

    Orders that cannot be fitted are skipped. When no order can be fitted
    the result is ``(inf, None, None)``.
    """
    best_aic = np.inf
    best_order = None
    best_mdl = None
    for p in range(max_p):
        for q in range(max_q):
            try:
                candidate = ARMA(values, order=(p, q)).fit(method='mle',
                                                           trend='nc')
                candidate_aic = candidate.aic
            except Exception:
                # Best-effort grid search: an order that fails to fit is
                # skipped. Unlike a bare except this lets KeyboardInterrupt
                # and SystemExit through.
                continue
            if candidate_aic < best_aic:
                best_aic = candidate_aic
                best_order = (p, q)
                best_mdl = candidate
    return best_aic, best_order, best_mdl


def predict_arma(ad_group, pred_date):
    """Fit an ARMA model to one ad group's ``shown`` series and print a forecast.

    Parameters
    ----------
    ad_group : str
        Value of the ``ad`` column in ``data/ad_table.csv`` to model.
    pred_date : str or datetime-like
        Date passed to ``pd.to_datetime`` to derive the forecast horizon.

    The order is chosen by AIC over p, q in 0..4. The function reports
    whether the residuals look normal (Jarque-Bera at the 10% level), shows
    trend, lag, autocorrelation and ACF plots, then refits the chosen order
    and prints a forecast. All results go to stdout and figures; nothing is
    returned.
    """
    warnings.filterwarnings("ignore")
    ads_file = 'data/ad_table.csv'
    df = pd.read_csv(ads_file, header=0, sep=',')
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)

    if ad_group not in df['ad'].unique():
        print("Ad group does not exist")
        return

    max_lag = 30
    df_ad_group_train = df[df['ad'] == ad_group].reset_index()
    df_arma_train = df_ad_group_train[['shown', 'date']]
    series_train = pd.Series(df_arma_train['shown'],
                             index=df_arma_train.index)

    best_aic, best_order, best_mdl = _select_arma_order(series_train.values)
    if best_mdl is None:
        # Every candidate order failed; previously this crashed with an
        # AttributeError on None.
        print('This data is not suitable for ARMA')
        return

    score, pvalue, _, _ = jarque_bera(best_mdl.resid)
    if pvalue < 0.10:
        print('The residuals may not be normally distributed.')
    else:
        print('The residuals seem normally distributed.')
    print('Ad_group: {} aic: {:6.2f} | best order: {}'.format(
        ad_group, best_aic, best_order))

    # Days elapsed since the first observation, used as the x axis.
    df_ad_group_train['time_period'] = (
        df_ad_group_train['date'] - df_ad_group_train['date'][0]).dt.days
    X = df_ad_group_train[['time_period']].values
    y = df_ad_group_train['shown'].values

    series_train.plot(title='Shown values trend', color='C1')
    plt.ylabel('shown values')
    plt.xlabel('Days gap from 2015-10-01')
    plt.scatter(X, y, facecolor='gray', edgecolors='none')
    plt.show()

    # check for auto correlation
    lag_plot(series_train)
    plt.show()
    autocorrelation_plot(series_train)
    plt.show()
    plot_acf(series_train.values, lags=max_lag)
    plt.show()

    data = series_train.values.astype('float32')
    model = ARMA(data, order=best_order)
    try:
        model_fit = model.fit(transparams=False)
        model_fit.plot_predict(plot_insample=True)
        plt.scatter(X, y, color='gray')
        plt.title('ARMA')
        plt.show()
        # NOTE(review): the horizon is counted from the FIRST training date
        # and the value printed is the FIRST forecast step, not the step
        # that falls on pred_date. Behaviour is kept as-is; confirm whether
        # the gap should be measured from the last training date and the
        # last forecast element printed instead.
        days_gap = (pd.to_datetime(pred_date)
                    - df_arma_train['date'][0]).days
        forecast = model_fit.forecast(steps=days_gap)
        print('Prediction of shown value for', pred_date, '=')
        print(forecast[0][0])
    except ValueError:
        print('This data is not suitable for ARMA')
### for bootstrap: do I sample only from the past year, same month? for i in range(12): for j in range(5): print("difference between {} and {} for month {}".format(bymonth2.columns[j+1], bymonth2.columns[j], bymonth2.index[i])) #print(bymonth2.apply(lambda j: bymonth2[bymonth2.columns[j+1]].ix[i] - bymonth2[bymonth2.columns[j]].ix[i],axis=1)) print(bymonth2[bymonth2.columns[j+1]].ix[i] - bymonth2[bymonth2.columns[j]].ix[i]) from pandas.tools.plotting import autocorrelation_plot for i in range(12): plt.figure() autocorrelation_plot(bymonth2.ix[i]) ### hour-by-hour letters = 'abcdefghijklmnopqrstuvwxyz' lets = [] for i in range(rng.size): lets.append(letters[rng[i].hour]) letters_dict = {'Letters': lets, 'csud': pun} hdf = pd.DataFrame(letters_dict).set_index(rng) hourwise= {} for i in range(24): letter = letters[i] hour = hdf.ix[hdf['Letters'] == letter]
ax.set_ylabel("Y Label")
ax.set_zlabel("Z Label")
# Tick positions are set from the formatted date strings.
ax.set_xticks(df.finalDate[x].apply(lambda d: d.strftime("%Y-%m-%d")).values)
plt.show()

#################Several plots in 2D
# A tour of pandas plotting helpers applied to the same frame.
df.plot(subplots=True)
df.plot(x="finalDate", y="finalBalance")
df.hist(by=["windowSize", "trainingSize"])
df.boxplot("finalBalance", by=["windowSize", "trainingSize"])
scatter_matrix(df, alpha=0.2, diagonal="kde")
df.plot(x="finalDate", y="finalBalance", kind="kde")
parallel_coordinates(df, "windowSize")
autocorrelation_plot(df.finalBalance)
radviz(df, "finalBalance")
df.plot(colormap="jet")

#################More specific plots in 2D
# 2x3 grid: ax1 is the top row, ax2 the bottom row.
f, (ax1, ax2) = plt.subplots(2, 3)
# Top row shows the mean of finalBalance per group, bottom row the sum.
ax1[0].plot(df.groupby(["windowSize"]).mean()["finalBalance"])
ax1[0].set_title("Window Size Mean")
ax1[0].set_ylim((5000, 15000))
ax2[0].plot(df.groupby(["windowSize"]).sum()["finalBalance"])
ax2[0].set_title("Window Size Sum")
ax1[1].plot(df.groupby(["trainingSize"]).mean()["finalBalance"])
ax1[1].set_title("Training Size Mean")
ax1[1].set_ylim((5000, 15000))
from matplotlib import pyplot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
try:
    from pandas.plotting import autocorrelation_plot
except ImportError:  # pandas < 0.20 kept the helper under pandas.tools
    from pandas.tools.plotting import autocorrelation_plot


def parser(x):
    """Prefix '190' to the raw value and parse the result as year-month."""
    return datetime.strptime('190'+x, '%Y-%m')


# Load the first column as a parsed date index; squeeze=True asks for a
# Series rather than a one-column DataFrame.
series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
print(series.head())
series.plot()
pyplot.show()
autocorrelation_plot(series)
pyplot.show()

# fit model
model = ARIMA(series, order=(5,1,0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
residuals.plot(kind='kde')
pyplot.show()