def lagPlot(ySeries, plotName="plot"):
    plt.figure()
    plt.title(plotName)
    data = pandas.Series(ySeries)
    lag_plot(data, marker='2', color='green')
    plt.savefig("output.png")
    plt.show()
def ts_plots(rets, figsize=(12, 10)):
    import matplotlib.pyplot as plt
    fig, axarr = plt.subplots(2, 2, sharex=False, sharey=False, figsize=figsize)
    axgen = (e for e in np.array(axarr).ravel())
    rets.plot(kind='line', ax=next(axgen))           # .set_title("data")
    rets.plot(kind='hist', bins=50, ax=next(axgen))  # .set_title("histogram")
    # rets.plot(kind='density', ax=next(axgen)).set_title("density")
    lag_plot(rets, lag=1, ax=next(axgen))            # .set_title("")
    autocorrelation_plot(rets, ax=next(axgen))
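# A minimal usage sketch for ts_plots (an assumption, not from the original source):
# the function expects a pandas Series plus module-level np, lag_plot, and
# autocorrelation_plot, so a self-contained call looks like this.
import numpy as np
import pandas as pd
from pandas.plotting import lag_plot, autocorrelation_plot

prices = pd.Series(np.random.randn(500)).cumsum() + 100  # synthetic price path
rets = prices.pct_change().dropna()                      # daily returns
ts_plots(rets)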
def TestIndependence(st, zona):
    cnlist = (st[['CODICE RUC']].values == 'UC_DP1608_' + zona).ravel().tolist()
    cnor = st.loc[cnlist]
    cnor = cnor.reset_index(drop=True)
    d = np.random.randint(cnor.shape[0], size=50)
    rem = set(range(cnor.shape[0])).difference(d)
    sam1 = cnor[['SEGNO SBILANCIAMENTO AGGREGATO ZONALE']].iloc[d]
    sam2 = cnor[['SEGNO SBILANCIAMENTO AGGREGATO ZONALE']].iloc[np.random.choice(list(rem), size=50)]
    plt.figure()
    plt.hist(sam1.values.ravel())
    plt.title('sample 1')
    plt.figure()
    plt.hist(sam2.values.ravel())
    plt.title('sample 2')
    si = []
    for i in range(cnor.shape[0]):
        si.append(dateutil.parser.parse(cnor[cnor.columns[1]].iloc[i]))
    CN = cnor.set_index(pd.to_datetime(si))
    sm = CN[['SEGNO SBILANCIAMENTO AGGREGATO ZONALE']].resample('D').mean()
    d = np.random.randint(sm.shape[0], size=50)
    rem = set(range(sm.shape[0])).difference(d)
    sam1 = sm.iloc[d].dropna()
    sam2 = sm.iloc[np.random.choice(list(rem), size=50)].dropna()
    plt.figure()
    plt.hist(sam1.values.ravel())
    plt.title('sample 1')
    plt.figure()
    plt.hist(sam2.values.ravel())
    plt.title('sample 2')
    for lag in [1, 2, 5, 10, 30]:
        plt.figure()
        plotting.lag_plot(sm, lag=lag)
        plt.title('lag = %d' % lag)
    return 0
def plot_lag(self, lag=1, ax=None):
    """Plot a lag plot of power data.

    http://www.itl.nist.gov/div898/handbook/eda/section3/lagplot.htm

    Returns
    -------
    matplotlib.axis
    """
    if ax is None:
        ax = plt.gca()
    for power in self.power_series():
        lag_plot(power, lag, ax=ax)
    return ax
def my_lag_plot(data, lag=1, filename=None, title=None, xlabel="Lag time(s)", ylabel="Time(s)"):
    """Generate a lag plot.

    Arguments:
    data -- list of data points

    Keyword arguments:
    lag -- which lag to plot
    filename -- filename to write graph to (None plots to screen)
    title -- graph title (if None, then "Lag %d plot" % lag is used)
    xlabel -- label on x-axis
    ylabel -- label on y-axis
    """
    if title is None:
        title = "Lag %d plot" % lag
    plt.cla()
    lag_plot(data, lag=lag)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    if filename is not None:
        plt.savefig(filename)
    else:
        plt.show()
def autocorr():
    from pandas import plotting as ptp
    from statsmodels.graphics.tsaplots import plot_acf
    from statsmodels.tsa.ar_model import AR

    qdl = Quandl()
    start, end = "2017-01-01", "2018-01-01"
    es = qdl.get_data("ES", start=start, end=end)
    print(es.head())
    xs = es['Settle']
    print(type(xs.index))
    ptp.lag_plot(xs)
    #plt.show()
    ptp.autocorrelation_plot(xs)
    #plt.show()
    plot_acf(xs, lags=7)
    #plt.show()
    train, test = xs[1:len(xs) - 7], xs[len(xs) - 7:]
    model = AR(train.values)
    ar_fit = model.fit()
    print('Lag: %s' % ar_fit.k_ar)
    print('Coefficients: %s' % ar_fit.params)
    # predict the held-out week by integer position; passing raw values as
    # start/end was the cause of the 'unknown string format' error
    ar_predicts = ar_fit.predict(start=len(train), end=len(train) + len(test) - 1, dynamic=False)
    for x in range(len(ar_predicts)):
        print('predicted: %f vs. expected: %f' % (ar_predicts[x], test.values[x]))
    print(len(test), len(ar_predicts))
    error = mean_squared_error(test, ar_predicts)
    print('Test MSE: %.3f' % error)
    plt.plot(test.values)
    plt.plot(ar_predicts, color='red')
    plt.show()
def plotLag(pth, bucketName):
    df = pd.read_hdf(pth + bucketName, 'capitalKDF')
    plt.subplot(2, 2, 1)
    lag_plot(df['A', 'p', '1'])
    plt.title("Lag plot for best ask price")
    plt.subplot(2, 2, 2)
    lag_plot(df['A', 'v', '1'])
    plt.title("Lag plot for best ask volume")
    plt.subplot(2, 2, 3)
    lag_plot(df['B', 'p', '1'])
    plt.title("Lag plot for best bid price")
    plt.subplot(2, 2, 4)
    lag_plot(df['B', 'v', '1'])
    plt.title("Lag plot for best bid volume")
    plt.show()
def createPlot(self, data, cols, plotType, msg, pdf):
    plt.figure()
    if plotType in ['hist']:
        ax = data[cols].hist()
    elif plotType in ['pie']:
        ax = data[cols].value_counts().plot.pie()
    elif plotType in ['kde']:
        ax = data[cols].plot.kde()
    elif plotType in ['lag']:
        from pandas.plotting import lag_plot
        ax = lag_plot(data[cols])
    elif plotType in ['autocorrelation']:
        from pandas.plotting import autocorrelation_plot
        ax = autocorrelation_plot(data[cols])
    elif plotType in ['plots']:
        ax = data[cols].plot(x_compat=True)
    else:
        ax = data[cols].value_counts().plot(kind=plotType)
    if plotType in ['bar', 'hist', 'kde', 'lag', 'autocorrelation', 'plots']:
        ax.set_ylabel(cols)
    ax.set_title(msg)
    pdf.savefig(ax.get_figure())
pun = []
pun.append(data1['PUN'].values.ravel())
pun.append(data2['PUN [€/MWH]'].values.ravel())
pun.append(data3['PUN [€/MWH]'].dropna().values.ravel())
unlisted = [item for sublist in pun for item in sublist]
df = pd.DataFrame(unlisted)
df = df.set_index(pd.date_range('2014-01-01', '2016-12-14', freq='H')[:df.shape[0]])
df.plot()
df.resample('D').mean().plot()
df.resample('M').mean().plot()
plt.figure()
plotting.lag_plot(df.resample('M').mean())
plt.figure()
plotting.autocorrelation_plot(df)
plt.figure()
plotting.autocorrelation_plot(df.resample('D').mean())
for year in (2014, 2015, 2016):
    plt.figure()
    plotting.autocorrelation_plot(df.loc[df.index.year == year].resample('D').mean())
plt.figure()
plotting.lag_plot(df.loc[df.index.year == 2014])
def gen_cluster_plots(cluster_directory_root, depth):
    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])
    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.values, index=pos_labels, columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.values, index=neg_labels, columns=data.columns.values)
    data = pd.concat([data, pos_data, neg_data])

    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = '/'.join(generic_dir)
    cluster_directories = glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1]
            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = open(filepath, 'r').read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4
            clusterings[clustering_id] = clusters
            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print('model loaded from:', model_file)
                except Exception:
                    pass
            clusterings_models[clustering_id] = models
        except Exception:
            pass

    background = set()
    for clustering in clusterings.values():
        for cid, members in clustering.items():
            background.update(set(members))
    background = list(background)
    # data = data.loc[background, :]

    # generate random clusterings of the same size k as our models
    for clustering_id, clustering in clusterings.items():
        for model_id, members in clustering.items():
            sequences = data.loc[members, :]
            pltdir = '/'.join(cluster_directory_root.split('/') + ['plots'])

            # make line plots directory
            if not os.path.isdir(pltdir + '/line'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/line')
            savename = pltdir + '/line/' + model_id + '_lineplot'
            ax = sequences.T.plot(legend=False, rot=2)
            ax.set_title(model_id + ' Line Plot')
            ax.set_xlabel('Timepoint')
            ax.set_ylabel('Normalized Expression')
            print('Saving:', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make autocorr plots directory
            if not os.path.isdir(pltdir + '/autocorr'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/autocorr')
            savename = pltdir + '/autocorr/' + model_id + '_autocorr'
            for seq in sequences.index:
                ax = autocorrelation_plot(sequences.loc[seq])
                ax.set_title(model_id + ' Autocorr Plot')
            print('Saving:', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make lag plots directory, one colour per member sequence
            if not os.path.isdir(pltdir + '/lag'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/lag')
            num_colors = len(members)
            cmap = plt.get_cmap('gist_rainbow')
            colors = [cmap(1.0 * i / num_colors) for i in range(num_colors)]
            savename = pltdir + '/lag/' + model_id + '_lagplot'
            for i, seq in enumerate(sequences.index):
                ax = lag_plot(sequences.loc[seq], c=colors[i])
                ax.set_title(model_id + ' Lag Plot')
            print('Saving:', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()
mask = ((pun_g - np.mean(pun_g)) / np.std(pun_g) <= 3).values  # drop outliers beyond 3 sigma
pun_in = pun_g[mask]
bil_in6 = bil6[mask]
stats.linregress(bil_in6, pun_in)

#########################

diff_pf = fsm['pun'] - fsm['francia']
plt.figure()
plt.plot(diff_pf)
plt.plot(pun)

from pandas import plotting
plt.figure()
plotting.lag_plot(fsm['pun'])
plt.figure()
plotting.lag_plot(diff_pf)
diff_pf.corr(pun)
plt.figure()
plt.scatter(np.array(diff_pf), np.array(pun[1:281]))

##################################

plt.figure()
plt.scatter(np.array(fsm['francia']), np.array(fsm['pun']))
fplm = linear_model.LinearRegression(fit_intercept=True).fit(
    np.array(fsm['francia']).reshape(-1, 1), np.array(fsm['pun']))
import random as rnd
import pandas
from pandas.plotting import lag_plot
import matplotlib.pyplot as plt

s = pandas.Series([rnd.random() for i in range(10000)])
plt.figure()
lag_plot(s, marker='o', color='grey')
plt.xlabel('Random Number - s[i]')
plt.ylabel('Lag1(Random Number) - s[i+1]')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import lag_plot

df = pd.read_csv('transcount.csv')
df = df.groupby('year').aggregate(np.mean)
gpu = pd.read_csv('gpu_transcount.csv')
gpu = gpu.groupby('year').aggregate(np.mean)
df = pd.merge(df, gpu, how='outer', left_index=True, right_index=True)
df = df.replace(np.nan, 0)
lag_plot(np.log(df['trans_count']))
plt.show()
series = Middle[Topics[0]]
groups = series.groupby(pd.Grouper(freq='A'))  # group the series by year
years = pd.DataFrame()
for name, group in groups:
    years[name.year] = group.values
years.boxplot()

# In[493]:

type(groups)

# In[455]:

from pandas.plotting import lag_plot
lag_plot(Middle[Topics[7]], color='blue')

# In[425]:

plt.figure(figsize=(14, 8))
m = 0
for i in Topics:
    plt.plot(Middle[i].rolling(window=2).mean(), lw=5, color=color[m], marker=markers[m], ms=15)
    m += 1
plt.xticks(year, rotation='vertical', fontsize=20)
plt.yticks(fontsize=20)
plt.xlabel('Year', fontsize=20)
plt.plot(np.array(Diff / pun))

spark_ts = pd.Series(spark['spread'].resample('D').mean(), dtype='float64')
pun_ts = pd.Series(pun, dtype='float64')
spark_ts.corr(pun_ts)

DS = spark['spread'].resample('D').mean() / pun
plt.figure()
plt.plot(DS)
DS.corr(pun_ts)
plt.figure()
plotting.lag_plot(DS)
plt.figure()
plt.plot(statsmodels.api.tsa.acf(np.array(DS)))
plt.plot(DS.resample('M').mean())
plt.figure()
plt.plot(statsmodels.api.tsa.periodogram(np.array(DS)))

###############################################################################

def fourierExtrapolation(x, n_predict, n_harmonics=0):
    x = np.array(x)
    n = x.size
    if n_harmonics == 0:
        n_harm = 100  # number of harmonics in model
    else:
        n_harm = n_harmonics
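# The function above is cut off by the excerpt; a plausible completion — a sketch
# following the widely circulated FFT extrapolation recipe (linear detrend, keep the
# lowest-frequency harmonics, resynthesize over the extended horizon), not necessarily
# the author's original body — is:
def fourierExtrapolation_sketch(x, n_predict, n_harmonics=0):
    x = np.array(x)
    n = x.size
    n_harm = 100 if n_harmonics == 0 else n_harmonics
    t = np.arange(0, n)
    p = np.polyfit(t, x, 1)            # fit a linear trend
    x_notrend = x - p[0] * t           # detrended series
    x_freqdom = np.fft.fft(x_notrend)  # series in the frequency domain
    f = np.fft.fftfreq(n)
    indexes = sorted(range(n), key=lambda i: np.absolute(f[i]))  # lowest frequencies first
    t = np.arange(0, n + n_predict)
    restored_sig = np.zeros(t.size)
    for i in indexes[:1 + n_harm * 2]:  # DC component plus n_harm conjugate pairs
        ampli = np.absolute(x_freqdom[i]) / n
        phase = np.angle(x_freqdom[i])
        restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase)
    return restored_sig + p[0] * t      # re-apply the trend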
# explore the model a little further
print("R-squared:")
regr.score(yearsTest, sunspTest)
# result of -0.12, which means a flat line is a better estimator than this model

# reshape time series. index is years between 1700 and 2008
dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
del dta['YEAR']
dta.plot(figsize=(12, 3))

import pandas.plotting as pdplot

# first do a lag plot, which shows the relationship between this period's value and last period's
plt.figure(figsize=(12, 12))
pdplot.lag_plot(dta)
plt.title("Sunspots this year vs. last year\n")

# second do lag plots for 1-4 periods
plt.figure(figsize=(12, 12))
Lags = [1, 2, 3, 4]
plt.subplot(221)
pdplot.lag_plot(dta, lag=Lags[0])
plt.title("Lag = " + str(Lags[0]))
plt.subplot(222)
pdplot.lag_plot(dta, lag=Lags[1])
plt.title("Lag = " + str(Lags[1]))
    Nine = pd.DataFrame(nine)
    return Nine

##########################################################################

#for i in range(19, 22, 1):
#    D = Extract_Hour(i)
#    D.plot()
#    plt.figure()
#    lag_plot(D)

D = Extract_Hour(21)
D.plot()
lag_plot(D)

x21 = data3["PUN"].loc[data3[data3.columns[1]] == 21]
for x, i in enumerate(x21):
    print(x)
    print(i)
    if i == max(x21):
        break

###################################################################################
###################################################################################
### hourwise patterns ###

names = ['data', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7']
d = {}
d2 = {}
pun.append(data2['PUN [€/MWH]'].values.ravel())
pun.append(data3['PUN [€/MWH]'].dropna().values.ravel())
pun.append(data4['PUN [€/MWH]'].dropna().values.ravel())
unlisted = [item for sublist in pun for item in sublist]
df = pd.DataFrame(unlisted)

######### to: 2 DAYS AHEAD OF LAST PUN
df = df.set_index(pd.date_range('2014-01-01', '2018-01-02', freq='H')[:df.shape[0]])
df.plot()
df.resample('D').mean().plot()
df.resample('M').mean().plot()
plt.figure()
plotting.lag_plot(df.resample('M').mean())
plt.figure()
plotting.autocorrelation_plot(df)
plt.figure()
plotting.autocorrelation_plot(df.resample('D').mean())
for year in (2014, 2015, 2016):
    plt.figure()
    plotting.autocorrelation_plot(df.loc[df.index.year == year].resample('D').mean())
plt.figure()
plotting.lag_plot(df.loc[df.index.year == 2014])
                   aspect=2.5, palette='BuGn_r')

# Oh man. This doesn't bode well, as most of the crimes were not resolved. This means there are still quite a lot of outstanding crime cases pending.

# Anyway, we have now reached the last two columns of the dataset, X and Y. These columns are coordinates relating to the "address" column. For the purposes of this notebook, I will plot these coordinates as lag plots to visually investigate whether the data is random.
#
# Refer to the pandas visualisation webpage for a more detailed explanation: http://pandas.pydata.org/pandas-docs/version/0.18.1/visualization.html

# In[ ]:

# Importing the lag_plot plotting function
from pandas.plotting import lag_plot

# Lag plot for the X coordinate
plt.figure()
lag_plot(crime.X)

# In[ ]:

lag_plot(crime.Y, c='goldenrod')

# And finally let us look at the autocorrelation plot of the X and Y data to check for randomness over time. If the data is non-random, then one or more of the autocorrelations will be significantly non-zero, taking into account the confidence bands (dashed and solid lines).

# In[ ]:

from pandas.plotting import autocorrelation_plot
autocorrelation_plot(crime.X, color='k', marker='.', linewidth=0.25)
autocorrelation_plot(crime.Y, color='goldenrod', marker='.', linewidth=0.15)
plt.ylim(-0.15, 0.15)

# Seems pretty random for this time-series data. Anyway, this notebook is a work in progress.
####################
# check autocorrelation
from matplotlib import pyplot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

#series = Series.from_csv('daily-minimum-temperatures.csv', header=0)
plot_acf(values, lags=50)
pyplot.show()

####################
# check lag correlation
from matplotlib import pyplot
from pandas.plotting import lag_plot

lag_plot(market_data_droppped['Close'])
pyplot.show()

####################
# determine best lag
from pandas import Series
from matplotlib import pyplot
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error

# split dataset
X = values.values
train, test = X[1:len(X) - 5], X[len(X) - 5:]
# train autoregression
model = AR(train)
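# The excerpt stops right after constructing the AR model; a hedged completion of the
# fit/predict/score steps (the original continuation is not shown) might be:
model_fit = model.fit()
print('Lag: %s' % model_fit.k_ar)
print('Coefficients: %s' % model_fit.params)
# forecast the 5 held-out observations by integer position
predictions = model_fit.predict(start=len(train), end=len(train) + len(test) - 1, dynamic=False)
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()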
pun = {"PUN": pun1} rng2 = pd.date_range(start="01-01-2010", periods = pun1.size,freq = 'H') ixx = np.arange(56951) rng = pd.DataFrame([rng2,ixx]) rng.columns = [["rng", "ixx"]] rng.columns = ["rng"] dd = {"ixx": ixx, "pun": pun1} ap = pd.DataFrame(dd).set_index(rng2) ap2 = pd.DataFrame(pun1) ap2.columns = ["pun"] months = np.unique(rng.month) jan = ap2.ix[ap['ixx'].ix[ap.index.month == 1].tolist()] lag_plot(jan) for i in range(1,13,1): print i plt.figure() lag_plot(ap2.ix[ap['ixx'].ix[ap.index.month == i].tolist()]) ##### example of kernel density estimation #### kde_jan = KernelDensity(kernel='gaussian', bandwidth = 4).fit(np.array(jan['pun']).reshape(-1,1)) xplot = np.linspace(0,180,100) yplot = np.exp(kde_jan.score_samples(xplot.reshape(-1,1))) plt.figure() plt.plot(yplot) #########################################################
from math import log

time_interval = int(input('Give me a time_interval (10,15,20,30)? '))
print('You said: ' + str(time_interval))
forecasting_horizon = int(input('Give me the forecasting horizon in hours? '))
print('You said: ' + str(forecasting_horizon))
steps_ahead = (60.0 / time_interval) * forecasting_horizon
print('You want: ' + str(steps_ahead))

# Quick check for autocorrelation
series = pd.read_csv(
    '/home/aris/Desktop/Short-Term-Electric-Load-Forecasting-CSL-master/price.csv',
    header=0, index_col=0, parse_dates=True).squeeze("columns")
lag_plot(series, lag=300)
pyplot.show()

############################################
plot_acf(series, lags=300)
pyplot.show()

##############################################
series.hist()
pyplot.show()

###############################################
X = series.values
pyplot.plot(X)
pyplot.show()
result = adfuller(X)
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
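# A common continuation (an assumption, not part of the original script): compare the
# ADF statistic against the critical values that adfuller also returns.
for key, value in result[4].items():
    print('Critical value (%s): %.3f' % (key, value))
# if the ADF statistic is below a critical value, the unit-root null is rejected at
# that level and the series can be treated as stationary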
df = df.replace(np.nan, 0)
df.plot(logy=True)
df[df['trans_count'] > 0].plot(kind='scatter', x='trans_count', y='gpu_trans_count_x', loglog=True)
plt.show()

from pandas.plotting import lag_plot

df = pd.read_csv('gpu.csv')
df = df.groupby('year').aggregate(np.mean)
gpu = pd.read_csv('gpu_transcount.csv')
gpu = gpu.groupby('year').aggregate(np.mean)
df = pd.merge(df, gpu, how='outer', left_index=True, right_index=True)
df = df.replace(np.nan, 0)
lag_plot(np.log(df['trans_count']))
plt.show()

from pandas.plotting import autocorrelation_plot

df = pd.read_csv('gpu.csv')
df = df.groupby('year').aggregate(np.mean)
gpu = pd.read_csv('gpu_transcount.csv')
gpu = gpu.groupby('year').aggregate(np.mean)
df = pd.merge(df, gpu, how='outer', left_index=True, right_index=True)
df = df.replace(np.nan, 0)
autocorrelation_plot(np.log(df['trans_count']))  # draw the autocorrelation plot with the autocorrelation function
plt.show()

# cross-validation
import pandas as pd
import numpy as np

df = pd.read_csv('wdbc.csv', header=None)
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot
from scipy.stats import pearsonr
from math import sqrt
from sklearn.metrics import mean_squared_error

# Exploring the data
# Descriptive stats
isig.describe()
isig.plot()
pyplot.show()

# Skewed toward the higher side
isig.hist()
pyplot.show()

# Significantly autocorrelated, as expected
autocorrelation_plot(isig)
pyplot.show()

# Check for autocorrelation. As expected, isig is directly related to previous values: high autocorrelation.
lag_plot(isig)
pyplot.show()

# Check for seasonality
# seasonal = seasonal_decompose(series['isig'], model='additive', frequency=10)
# seasonal.plot()
# pyplot.show()
# seasonal = seasonal_decompose(series['glucose'], model='additive', frequency=10)
# seasonal.plot()
# pyplot.show()
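# pearsonr is imported above but unused in this excerpt; as a sketch (assuming isig is
# a pandas Series), it can quantify the lag-1 relationship the lag plot shows:
r, p = pearsonr(isig.values[:-1], isig.values[1:])
print('lag-1 Pearson r = %.3f (p = %.3g)' % (r, p))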
pun = {"PUN": pun1} rng2 = pd.date_range(start="01-01-2010", periods=pun1.size, freq='H') ixx = np.arange(56951) rng = pd.DataFrame([rng2, ixx]) rng.columns = [["rng", "ixx"]] rng.columns = ["rng"] dd = {"ixx": ixx, "pun": pun1} ap = pd.DataFrame(dd).set_index(rng2) ap2 = pd.DataFrame(pun1) ap2.columns = ["pun"] months = np.unique(rng.month) jan = ap2.ix[ap['ixx'].ix[ap.index.month == 1].tolist()] lag_plot(jan) for i in range(1, 13, 1): print i plt.figure() lag_plot(ap2.ix[ap['ixx'].ix[ap.index.month == i].tolist()]) ##### example of kernel density estimation #### kde_jan = KernelDensity(kernel='gaussian', bandwidth=4).fit(np.array(jan['pun']).reshape(-1, 1)) xplot = np.linspace(0, 180, 100) yplot = np.exp(kde_jan.score_samples(xplot.reshape(-1, 1))) plt.figure() plt.plot(yplot)
sns.set_style('ticks')
with sns.color_palette("Reds_r"):
    # plot densities of log-transformed data
    plt.figure(figsize=(8, 4))
    for col in data.columns:
        sns.kdeplot(rets[col], shade=True)
    plt.legend(loc=2)
    print('Log-transformed feature distributions')

#==============================================================================
# Lag plot
from pandas.plotting import lag_plot

for s in list(data.columns):
    plt.figure()
    lag_plot(data.loc[:, s])

#==============================================================================
# Autocorrelation
from pandas.plotting import autocorrelation_plot

for s in list(data.columns):
    plt.figure()
    autocorrelation_plot(data.loc[:, s])

#==============================================================================
#http://stackoverflow.com/questions/22179119/normality-test-of-a-distribution-in-python
# array = np.random.randn(10000)
from matplotlib import pyplot as plt
import matplotlib.mlab as mlab

for s in list(data.columns):
    plt.figure()
# Load dataset
data_set = data_preparation.read_data('./data_set/HourlyDemands_2002-2016.csv')
data, label = data_preparation.split_label(data_set, 'Ontario Demand')
print('Data set Loaded!')
print(data.shape)
print(label.shape)

# Plot 2 weeks of data points
line = np.linspace(0, 336, 336)
plt.plot(line, label[0:336])
plt.xlabel('Hour')
plt.ylabel('Power Demand')
plt.title('Power Demand of first 14 days')
plt.show()

# Plotting the lag plot of the target feature
lag_plot(label)
plt.title('Lag plot of Power Demand')
plt.xlabel('P(t)')
plt.ylabel('P(t+1)')
plt.show()

# Plotting auto-correlation
autocorrelation_plot(label[0:1000])
plt.show()

# Splitting train and test data
train_data, test_data = data[0:119832], data[119832:]
train_label, test_label = label[0:119832], label[119832:]
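# Sketch (an assumption, not part of the original script): the strong lag-1 structure
# seen in the lag plot makes a naive persistence forecast, P(t+1) = P(t), a useful
# baseline to beat before training a model.
import numpy as np

persist_true = np.asarray(test_label, dtype=float)[1:]
persist_pred = np.asarray(test_label, dtype=float)[:-1]
print('Persistence baseline MSE: %.3f' % np.mean((persist_true - persist_pred) ** 2))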
tsc = pd.Series(change.loc[list(cd)])
dec = statsmodels.api.tsa.seasonal_decompose(tsbtc, freq=52)
dec.plot()
plt.figure()
tsbtc.plot()
plt.axhline(y=200)
plt.axhline(y=500)
plt.figure()
tsc.plot()
tsbtc.corr(tsc)
plt.figure()
plotting.lag_plot(tsbtc)
### surprising!!! I think the reticular squared structure highlights the
### particular pattern that I've noticed.
### N.B.: dates from 2013
data_bit = []
for i in range(tsbtc.size - 1):
    xy = np.array([tsbtc.iloc[i], tsbtc.iloc[i + 1]])
    data_bit.append(xy)
dataset = np.array(data_bit)
H, xedges, yedges = np.histogram2d(dataset[:, 0], dataset[:, 1], bins=20, density=True)
plt.figure()
im = plt.imshow(H, interpolation='nearest', origin='lower',
                extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
def create_lag_plot(self, filename):
    fig, ax = plt.subplots()
    lag_plot(self.raw_time_series)
    plt.savefig('{}{}_lag_plot.png'.format(self.figure_output_dir, filename))
data = data.set_index(data["Date"]) pun = data["PUN [€/MWH]"].dropna().resample("D").mean() nord = data["MGP NORD [€/MWh]"].dropna().resample("D").mean() plt.figure() plt.scatter(np.array(DF[DF.columns[0]]), np.array(pun)) plt.figure() plt.scatter(np.array(DF[DF.columns[1]]), np.array(pun), color="black") np.corrcoef(np.array(DF[DF.columns[0]]), np.array(pun)) np.corrcoef(np.array(DF[DF.columns[1]]), np.array(pun)) from pandas.tools import plotting plt.figure() plotting.lag_plot(DF["nord-fran"]) perc = [] signed_perc = [] for i in range(DF["nord-fran"].size - 1): perc.append(np.abs((DF["nord-fran"].ix[i + 1] - DF["nord-fran"].ix[i]) / DF["nord-fran"].ix[i])) signed_perc.append((DF["nord-fran"].ix[i + 1] - DF["nord-fran"].ix[i]) / DF["nord-fran"].ix[i]) plt.figure() plt.plot(np.array(signed_perc)) #################### NORD ###################### plt.figure() plt.scatter(np.array(DF[DF.columns[0]]), np.array(nord)) plt.figure()
df2 = df2.dropna()  # drop rows with null values
df2.head()
df2.count()  # count non-null values in each column
df2.corr()   # check the correlation of each column
# change in inventory is poorly correlated; remove it from the model

df2['Gross domestic product'].plot()  # time series plot
plt.show()
df2['Gross domestic product'].hist()  # histogram
plt.show()
df2['Gross domestic product'].plot(kind='kde')  # density plot
plt.show()
# data is skewed to the left

lag_plot(df2['Gross domestic product'])  # positive correlation relationship
autocorrelation_plot(df2['Gross domestic product'])
# lines above the dotted line are statistically significant
# we can see trend in the data but not seasonality

##############################################################
# Multivariate regression
##############################################################
# Train-test split
y = df2['Gross domestic product']
x = df2.drop(['Gross domestic product'], axis=1)
train_size = int(len(x) * 0.70)
train_x, test_x = x[0:train_size], x[train_size:len(x)]
train_y, test_y = y[0:train_size], y[train_size:len(x)]
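# A hedged sketch of the regression step this split sets up (the original fitting code
# is not shown; sklearn's LinearRegression is one plausible choice):
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(train_x, train_y)
pred_y = model.predict(test_x)
print('Test MSE: %.3f' % mean_squared_error(test_y, pred_y))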
plt.figure()
plotting.autocorrelation_plot(np.array(rsigma[1:]), color='palevioletred')

ratio = []
for i in range(1, len(rsigma)):
    ratio.append(float(np.where(np.array(rsigma[:i]) <= 0)[0].size)
                 / float(np.array(rsigma[:i]).size))
plt.figure()
plt.plot(np.array(ratio))

#### Try fitting an Ornstein–Uhlenbeck process
plt.figure()
plotting.autocorrelation_plot(ger2015['CAL'].values.ravel())
plt.figure()
plotting.lag_plot(ger2015['CAL'])

X = ger2015['CAL'].values.ravel()[:-1]
y = ger2015['CAL'].values.ravel()[1:]
lm = LinearRegression(fit_intercept=True)
lm.fit(X.reshape(-1, 1), y)
a = lm.coef_[0]
b = lm.intercept_
Sxy = np.sum(X * y)
Sx = np.sum(X)
Sy = np.sum(y)
Sxx = np.sum(X**2)
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.975, 0.98, 0.99
])
np.where(MAE >= scipy.stats.mstats.mquantiles(MAE, prob=0.99))[0].size / MAE.size

plt.figure()
plotting.autocorrelation_plot(Err)
plt.figure()
plotting.autocorrelation_plot(Y)
plt.figure()
plotting.autocorrelation_plot(YH, color='green')

import statsmodels.graphics
statsmodels.graphics.tsaplots.plot_acf(Y, lags=30 * 24)
plotting.lag_plot(Y, lag=30 * 24)
plt.figure()
plotting.autocorrelation_plot(YH, color='green')

col1 = []
col2 = []
i = 0
j = 30 * 24
while j < DTFC.shape[0]:
    col1.append(DTFC[DTFC.columns[31]].iloc[i])
    col2.append(DTFC[DTFC.columns[31]].iloc[j])
    i += 1
    j += 1
plt.figure()
plt.plot(np.array(col1))
This dataset describes the minimum daily temperatures over 10 years (1981-1990) in Melbourne, Australia.
"""

# Importing data
import pandas as pd
import matplotlib.pyplot as plt

series = pd.read_csv('/Users/ashutosh/Documents/analytics/Projects/TimeSeries/daily-minimum-temperatures.csv',
                     parse_dates=['Date'])
print(series.head())
series['temp'].plot()
plt.show()

## Checking for autocorrelation
from pandas.plotting import lag_plot
lag_plot(series['temp'])
plt.show()

## Calculating the Pearson coefficient between neighbouring values
values = pd.DataFrame(series['temp'].values)
dataframe = pd.concat([values.shift(1), values], axis=1)
dataframe.columns = ['t-1', 't+1']
result = dataframe.corr()
print(result)

## Autocorrelation plot
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(series['temp'])
plt.show()
# | Hexbin | Drop NaNs |
# | Pie    | Fill 0’s  |
#
# If any of these defaults are not what you want, or if you want to be explicit about how missing values are handled, consider using fillna() or dropna() before plotting.

# ### density plot

# In[51]:

ser = pd.Series(np.random.randn(1000))
ser.plot.kde()
plt.show()

# ### lag plot
# Lag plots are used to check if a data set or time series is random. Random data should not exhibit any structure in the lag plot. Non-random structure implies that the underlying data are not random.

# In[52]:

from pandas.plotting import lag_plot

plt.figure()
data = pd.Series(0.1 * np.random.rand(1000) +
                 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000)))
lag_plot(data)
plt.show()

# ### matplotlib gallery
# documentation: http://matplotlib.org/gallery.html
import Fourier
reconstructed = Fourier.fourierExtrapolation(dpun, 0, 16)
plt.figure()
plt.plot(dpun)
plt.plot(reconstructed, color='red')
np.mean(dpun - reconstructed)
np.std(dpun - reconstructed)

from pandas import plotting
plt.figure()
plotting.lag_plot(pd.DataFrame(dpun))
plt.figure()
plt.plot(statsmodels.api.tsa.acf(dpun))

lags = []
for i in range(dpun.size - 1):
    lags.append(np.array([dpun[i], dpun[i + 1]]))
lags = pd.DataFrame(lags)
lags.corr()

plt.figure()
plotting.lag_plot(pd.DataFrame(dpun), lag=7)
plt.figure()
plotting.autocorrelation_plot(pd.DataFrame(dpun))
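# Equivalent one-liner (a sketch): pandas computes the same lag correlations directly,
# without building the lag-pair DataFrame by hand.
print(pd.Series(dpun).autocorr(lag=1))  # matches the lags.corr() result above
print(pd.Series(dpun).autocorr(lag=7))  # counterpart of the lag-7 plot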
#################################################################
yspper10ydwted = pywt.dwt(yspper10y["Value"].values, "haar", mode="cpd")
# try different levels
yspper10dwtedcoeffs = pywt.wavedec(yspper10y["Value"].values, "haar", level=3)
# try maximum wavelet decomposition
yspper10dwtedcoeffs = pywt.wavedec(yspper10y["Value"].values, "haar")

#################################################################
## fun with lag_plots
#################################################################
lag_plot(yspper10y)
plt.title("Lag plot of 1-year lag of " + yaleds[12].name, fontsize=12)

################################################################################
##### Is the U.S. stock market overvalued?
################################################################################
################################################################################
### Is the U.S. S&P 500 Price-to-Earnings (PE) ratio too expensive right now?
################################################################################

# EY : 20150910 in pandas, a DataFrame's .plot() method serves as a "wrapper" for matplotlib's plt; see the pandas tutorial on plotting
fig = yspper10y.plot().figure
fig.suptitle(yaleds[12].name, fontsize=14, fontweight="bold")
ax = fig.add_subplot(111)
ax.set_ylabel(yaleds[12].colname.split(",")[1] + " ; P/E ratio")
#data = read_csv('data/iris.data')

# <markdowncell>

# ## Lag Plot

# <markdowncell>

# Check whether the data is random.

# <codecell>

from pandas.plotting import lag_plot

plt.figure()
data = pd.Series(0.1 * np.random.rand(1000) +
                 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000)))
lag_plot(data)

# <rawcell>

# Test with uniformly distributed data

# <codecell>

import numpy as np
from pandas import DataFrame
import matplotlib.pyplot as plt

Index = ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
df = DataFrame(abs(np.random.randn(5, 4)), index=Index, columns=Cols)
# from pandas import Series
# from pandas import DataFrame
# from pandas import concat
# from matplotlib import pyplot
# series = Series.from_csv('daily-minimum-temperatures-in-me.csv', header=0)
# print(series.head())
#
# dataframe = concat([values.shift(1), values], axis=1)
# series.plot()
# pyplot.show()

import pandas as pd
from matplotlib import pyplot
from pandas.plotting import lag_plot

series = pd.read_csv('wc98_workload_hour.csv', header=0, index_col=0, parse_dates=True).squeeze("columns")
lag_plot(series)
pyplot.show()