Example #1
def csv_file_pick_rtt_series():
    with open(JSON2CSV_FILE_ALL) as f_handler:
        next(f_handler)  # skip the header line
        for line in f_handler:
            fields = line.split(';')
            dest = fields[1]
            probe = fields[2]
            # Keep only valid samples; -1 marks a failed RTT measurement.
            rtt_series_one_line = [float(rtt) for rtt in fields[3:] if float(rtt) != -1]

            if len(rtt_series_one_line) != 0:
                if ACTION == "periodicity":
                    plot_fft_autocorr(rtt_series_one_line, dest, probe)
                elif ACTION == "rtt_statistics":
                    rtt_statistics(rtt_series_one_line, dest, probe)
                elif ACTION == "autocorr_plot":
                    autocorrelation_plot(pd.Series(rtt_series_one_line))
                    plt.show()
            else:
                print(probe, GENERATE_TYPE, dest, "is an empty list")
def autocorr_plot(clusters, data, savedir=None):
    # Choose a near-square subplot grid large enough for every cluster.
    n = len(clusters)
    x = int(math.sqrt(n))
    y = 1
    while x * y < n:
        y += 1

    # Autocorrelation of all series on a single axis.
    plt.figure()
    autocorrelation_plot(data.T)
    savepath = '/'.join(savedir.split('/') + ['all_autocorr.png'])
    plt.savefig(savepath)
    plt.close()

    # One subplot per cluster, axes shared across the grid.
    fig, axarr = plt.subplots(x, y, sharex=True, sharey=True)
    i = 0
    j = 0
    for cid, cluster in clusters.items():
        ax = axarr[i, j]
        autocorrelation_plot(data.loc[cluster, :].T, ax=ax)
        i = (i + 1) % x
        if i == 0:
            j = (j + 1) % y

    savepath = '/'.join(savedir.split('/') + ['incluster_autocorr.png'])
    plt.savefig(savepath)
    plt.close()
Example #3
def autocorrelation(array, name):
    fig = plt.figure()
    autocorrelation_plot(array)
    plt.legend([name], loc='upper left')
    # plt.show()
    fig.savefig("Election_13/stats/" + name + ".png", bbox_inches='tight')
    plt.clf()
    fig.clf()
Example #4
def ts_plots(rets, figsize=(12, 10)):
    import matplotlib.pyplot as plt
    fig, axarr = plt.subplots(2, 2, sharex=False, sharey=False,
                              figsize=figsize)
    axgen = (ax for ax in np.array(axarr).ravel())

    rets.plot(kind='line', ax=next(axgen))           # raw series
    rets.plot(kind='hist', bins=50, ax=next(axgen))  # histogram
    lag_plot(rets, lag=1, ax=next(axgen))            # lag-1 scatter
    autocorrelation_plot(rets, ax=next(axgen))       # autocorrelation
Example #5
def process_trace(connection, name):
    print('Downloading...')
    timeSeries = connection.demand(name)
    print('complete')

    # Unpack the trace into parallel time/value arrays.
    time = np.zeros(len(timeSeries.elements))
    demand = np.zeros(len(timeSeries.elements))
    for i in range(len(timeSeries.elements)):
        time[i] = timeSeries.elements[i].timestamp
        demand[i] = timeSeries.elements[i].value

    print(len(demand))

    # Reference:
    # http://www.simafore.com/blog/bid/105815/Time-series-analysis-using-R-for-cost-forecasting-models-in-8-steps

    # Spectrum of the demand signal.
    t = np.arange(len(demand))
    sp = np.fft.fft(demand)
    freq = np.fft.fftfreq(t.shape[-1])

    from pandas.tools.plotting import autocorrelation_plot
    autocorrelation_plot(demand)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(freq, sp.real, freq, sp.imag)

    print('pandas')
    from pandas import Series
    s = Series(demand, index=range(len(demand)))
    corr = s.autocorr()  # lag-1 autocorrelation
    print(corr)

    plt.show()
Example #6
    def plot_autocorrelation(self):
        """
        Plots autocorrelation of power data.
        Reference:
        http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm

        Returns
        -------
        matplotlib.axes.Axes
        """
        fig, ax = plt.subplots()
        for power in self.power_series():
            autocorrelation_plot(power, ax=ax)
        return ax
    def test_autocorrelation_plot(self):
        from pandas.tools.plotting import autocorrelation_plot
        _check_plot_works(autocorrelation_plot, series=self.ts)
        _check_plot_works(autocorrelation_plot, series=self.ts.values)

        ax = autocorrelation_plot(self.ts, label='Test')
        self._check_legend_labels(ax, labels=['Test'])
    def test_autocorrelation_plot(self):
        from pandas.tools.plotting import autocorrelation_plot
        _check_plot_works(autocorrelation_plot, self.ts)
        _check_plot_works(autocorrelation_plot, self.ts.values)

        ax = autocorrelation_plot(self.ts, label='Test')
        t = ax.get_legend().get_texts()[0].get_text()
        self.assertEqual(t, 'Test')
Example #9
def plotAutocorrelation(pth, bucketName):
    df = pd.read_hdf(pth + bucketName, 'capitalKDF')
    autocorrelation_plot(df['A', 'p', '1'], ax=plt.subplot(2, 2, 1))
    plt.title("Autocorrelation plot for best ask price")
    autocorrelation_plot(df['A', 'v', '1'], ax=plt.subplot(2, 2, 2))
    plt.title("Autocorrelation plot for best ask volume")
    autocorrelation_plot(df['B', 'p', '1'], ax=plt.subplot(2, 2, 3))
    plt.title("Autocorrelation plot for best bid price")
    autocorrelation_plot(df['B', 'v', '1'], ax=plt.subplot(2, 2, 4))
    plt.title("Autocorrelation plot for best bid volume")
    plt.show()
Example #10
    def _plot(self):
        """_plot doc..."""

        data = pd.Series(np.asarray(self.data))

        pl = self.pl
        ax = pl.gca()
        pdPlot.autocorrelation_plot(data, ax=ax)

        if self.xScale != 1.0:
            formatter = FuncFormatter(self._scaleTickMark)
            ax.get_xaxis().set_major_formatter(formatter)

        pl.title(self.title)
        pl.xlabel(self.xLabel)
        pl.ylabel(self.yLabel)
        if self.xLimits:
            pl.xlim(*self.xLimits)
        if self.yLimits:
            pl.ylim(*self.yLimits)
        pl.grid(True)
def pandas_autocorr(chain, labels, plt_label, nburn=0):
    # nburn: number of burn-in samples to discard before plotting.

    from pandas.tools.plotting import autocorrelation_plot

    npars = chain.shape[1]
    cmap = get_cmap(npars)

    # plot autocorrelation lag for every parameter, one color each
    plt.figure(figsize=(16, 6))
    h = [autocorrelation_plot(chain[nburn:, i], color=cmap(i), lw=5, alpha=0.8, label=labels[i])
         for i in range(npars)]

    plt.legend(loc=1, fontsize=14, ncol=3, numpoints=1, markerscale=0.7)
    plt.tight_layout()
    plt.savefig('pandas_autocorrelation_' + plt_label + '.png', dpi=150)
    plt.close()
features_after_chapter_3 = list(set().union(basic_features, pca_features))
features_after_chapter_4 = list(set().union(basic_features, pca_features, time_features, freq_features))
features_after_chapter_5 = list(set().union(basic_features, pca_features, time_features, freq_features, cluster_features))

selected_features = ['temp_pattern_labelOnTable','labelOnTable', 'temp_pattern_labelOnTable(b)labelOnTable', 'cluster',
                     'pca_1_temp_mean_ws_120','pca_2_temp_mean_ws_120','pca_2','acc_watch_y_temp_mean_ws_120','gyr_watch_y_pse',
                     'gyr_watch_x_pse']
possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features]
feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']

# Let us first study whether the time series is stationary and what the autocorrelations are.

dftest = adfuller(dataset['hr_watch_rate'], autolag='AIC')
print(dftest)

autocorrelation_plot(dataset['hr_watch_rate'])
plot.show()
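
# adfuller returns an unlabeled tuple; a small added sketch (field order per the
# statsmodels docs, pandas assumed imported as pd) to print the result readably:
stat, pvalue, usedlag, nobs, crit, icbest = dftest
print(pd.Series({'Test Statistic': stat, 'p-value': pvalue,
                 '#Lags Used': usedlag, 'Observations Used': nobs}))
for level, value in crit.items():
    print('Critical Value (%s): %.3f' % (level, value))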

# Now let us focus on the learning part.

learner = TemporalRegressionAlgorithms()
eval = RegressionEvaluation()

# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.

repeats = 5

# We set a washout time to give the NNs time to stabilize. We do not compute the error during the washout time.

washout_time = 10
Example #13
ftest['count'] = reg + cas
ftest['count'] = ftest['count'].clip(0, np.max(train['count']))
ftest[['count']].to_csv('submission-02.csv')


# ## Exploring Autocorrelation

# In[73]:

import matplotlib.pyplot as plt
from pandas.tools.plotting import autocorrelation_plot

# Registered & Casual on Workingdays
fig, axes = plt.subplots(ncols=3, nrows=8, figsize=(16, 16))
for h in range(24):
    autocorrelation_plot(train.registered[(train.hour == h) & (train.workingday == 1)], ax=axes[int(h / 3.0)][h % 3], color='c')
    autocorrelation_plot(train.casual[(train.hour == h) & (train.workingday == 1)], ax=axes[int(h / 3.0)][h % 3], color='m')


# In[72]:

import matplotlib.pyplot as plt
from pandas.tools.plotting import autocorrelation_plot

# Registered & Casual on Non-Workingdays
fig, axes = plt.subplots(ncols=3, nrows=8, figsize=(16, 16))
for h in range(24):
    autocorrelation_plot(train.registered[(train.hour == h) & (train.workingday == 0)], ax=axes[int(h / 3.0)][h % 3], color='c')
    autocorrelation_plot(train.casual[(train.hour == h) & (train.workingday == 0)], ax=axes[int(h / 3.0)][h % 3], color='m')

def gen_cluster_plots(cluster_directory_root, depth):
    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])

    # Duplicate every series under '+' and '-' suffixed labels so that signed
    # cluster members can be looked up directly.
    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.values, index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.values, index=neg_labels,
                            columns=data.columns.values)

    data = pd.concat([data, pos_data, neg_data])

    # Find every clustering directory `depth` levels below the root.
    generic_dir = '/'.join(cluster_directory_root.split('/') + (['*'] * depth))
    cluster_directories = glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1]
            # read final clusters; assignments.txt stores 4-line records:
            # cluster name, tab-separated members, then two lines that are skipped
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = open(filepath, 'r').read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4

            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1]
                    model_json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(model_json)
                    print('model loaded from: ', model_file)
                except Exception:
                    pass
            clusterings_models[clustering_id] = models
        except Exception:
            pass

    # Union of all cluster members across clusterings.
    background = set()
    for clustering in clusterings.values():
        for cid, members in clustering.items():
            background.update(set(members))

    background = list(background)
    # data = data.loc[background, :]

    # Generate line, autocorrelation and lag plots for each cluster.
    for clustering_id, clustering in clusterings.items():
        for model_id, members in clustering.items():
            sequences = data.loc[members, :]
            pltdir = '/'.join(cluster_directory_root.split('/') + ['plots'])

            # make line plots directory
            if not os.path.isdir(pltdir + '/line'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/line')

            savename = pltdir + '/line/' + model_id + '_lineplot'

            plt_title = model_id + ' Line Plot'
            ax = sequences.T.plot(legend=False, rot=2)
            ax.set_title(plt_title)
            ax.set_xlabel('Timepoint')
            ax.set_ylabel('Normalized Expression')

            print('Saving: ', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make autocorr plots directory
            if not os.path.isdir(pltdir + '/autocorr'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/autocorr')

            savename = pltdir + '/autocorr/' + model_id + '_autocorr'

            plt_title = model_id + ' Autocorr Plot'
            for seq in sequences.index:
                ax = autocorrelation_plot(sequences.loc[seq])
            ax.set_title(plt_title)

            print('Saving: ', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make lag plots directory
            if not os.path.isdir(pltdir + '/lag'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/lag')

            # One distinct color per cluster member.
            from pylab import get_cmap
            NUM_COLORS = len(members)
            cm = get_cmap('gist_rainbow')
            colors = [cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]

            savename = pltdir + '/lag/' + model_id + '_lagplot'

            plt_title = model_id + ' Lag Plot'
            for i, seq in enumerate(sequences.index):
                ax = lag_plot(sequences.loc[seq], c=colors[i])
            ax.set_title(plt_title)

            print('Saving: ', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from scipy.interpolate import spline
from pandas.tools.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from scipy.stats import gaussian_kde
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

def norm(x):
    return (x-np.min(x))/(np.max(x)-np.min(x))

dataframe = pd.read_csv('Chaotic_TimeSeries_turkey_elec.csv')
dataframe.head()
plt.plot(dataframe)
autocorrelation_plot(dataframe.iloc[:, 0])

### EVALUATE V3 LINES
model00 = ARIMA(np.array(dataframe.iloc[:, 0]), dates=None, order=(2, 1, 0))
model11 = model00.fit(disp=1)
model11.summary()
model11.forecast()
resid9 = model11.resid
np.mean(abs(resid9)) / max(np.array(dataframe.iloc[:, 0]))

x3 = resid9
x3 = x3[np.logical_not(np.isnan(x3))]
dftest13 = adfuller(x3, autolag='AIC')
dfoutput1 = pd.Series(dftest13[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
print('Dickey Fuller Test:\n', dfoutput1)
Example #16
from pandas import DataFrame
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

# Read in the data
data = pd.read_csv("Budget_test.csv", index_col=0)

# print(data.head())
data.index = pd.to_datetime(data.index)
data.columns = ['WRVU Production']

plt.plot(data)
plt.ylabel('wrvus')
plt.show()

autocorrelation_plot(data)
plt.show()

model = ARIMA(data, order=(5, 1, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

residuals = DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
print(residuals.describe())

x = data.values
size = int(len(x) * 0.5)
import numpy as np
import pandas as pd
import sys
from datetime import datetime as dt
import matplotlib.pyplot as plt
from pandas.tools.plotting import lag_plot
from pandas.tools.plotting import autocorrelation_plot
 
to_date = lambda x: dt.strptime(x, "%Y%m%d").toordinal()
 
dates, avg_temp = np.loadtxt(sys.argv[1], delimiter=',', usecols=(1, 11), unpack=True, converters={1: to_date})
dtidx = pd.DatetimeIndex([dt.fromordinal(int(date)) for date in dates])
data = pd.Series(avg_temp * .1, index=dtidx)
 
fig = plt.figure()
fig.add_subplot(211)
lag_plot(data)
 
plt.figure()
autocorrelation_plot(data)
 
plt.figure()
resampled = data.resample('A').mean()  # annual means
resampled.plot()
plt.show()
line = np.linspace(0, 336, 336)
plt.plot(line, label[0:336])
plt.xlabel('Hour')
plt.ylabel('Power Demand')
plt.title('Power Demand of first 14 days')
plt.show()
lag_plot(label)

# Plotting the lag plot of target feature
plt.title('Lag plot of Power Demand')
plt.xlabel('P(t)')
plt.ylabel('P(t+1)')
plt.show()

# Plotting auto-correlation
autocorrelation_plot(label[0:1000])
plt.show()

# Splitting train and test data
train_data, test_data = data[0:119832], data[119832:]
train_label, test_label = label[0:119832], label[119832:]

# Implementing Persistence Model
df = pd.concat([label.shift(48), label], axis=1)
df.columns = ['t-1', 't+1']
X = df.values
train, test = X[0:119832], X[127656:]
train_X, train_y = train[:, 0], train[:, 1]
test_X, test_y = test[:, 0], test[:, 1]
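
# The example stops before the persistence forecast itself; a minimal sketch of
# the usual next step (assumes the train/test arrays built above).
import numpy as np
from sklearn.metrics import mean_squared_error

predictions = test_X  # persistence: predict t+1 as the value seen 48 steps back
rmse = np.sqrt(mean_squared_error(test_y, predictions))
print('Persistence RMSE: %.3f' % rmse)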

Example #19
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import datetime
from pandas.tools.plotting import autocorrelation_plot
from mpi4py import MPI

dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d')

file = sys.argv[1]
jenis = sys.argv[2]
data = pd.read_csv(file, index_col='tanggal', date_parser=dateparse)

dt = np.log(data[jenis])

dt.plot(label='Observed ' + jenis + ' data')
plt.savefig('grafik_' + jenis + '.png', transparent=False)
autocorrelation_plot(dt)
plt.savefig('grafik_autocorelation_' + jenis + '.png', transparent=False)
sm.graphics.tsa.plot_acf(dt, lags=40)
plt.savefig('grafik_acf_' + jenis + '.png', transparent=False)
sm.graphics.tsa.plot_pacf(dt, lags=40)
plt.savefig('grafik_pacf_' + jenis + '.png', transparent=False)
#ts.adfuller(dt, 1)

# Fit and report on the root rank only; arima_mod1 exists only there.
if MPI.COMM_WORLD.Get_rank() == 0:
    arima_mod1 = sm.tsa.ARIMA(dt, (3, 0, 2)).fit(trend='nc', disp=False)
    print(arima_mod1.params)
    sm.stats.durbin_watson(arima_mod1.resid.values)
    #ws.to_csv("Arima_resid"+jenis+".csv")
    print(arima_mod1.aic)
    print(arima_mod1.bic)
    #print("HQIC: " + arima_mod1.hqic)
Example #20
If serial correlation remains in the error term, factors not captured by the model, including trends, may be exerting a large influence, so it is better to control for them by adding plausible explanatory variables, adding a time trend or lag terms, or taking differences.

For checking this kind of serial correlation, it is also effective to confirm the stationarity of the error term with an ADF test.

"""




# ADF test, H0: Non-stationary
tsa.adfuller(rlt.resid, regression='nc')


# Autocorrelation plot of residuals
autocorrelation_plot(rlt.resid)  # show the ACF of the residuals
ACF_resid = tsa.acf(rlt.resid)   # keep the ACF of the residuals

"""
If the error term is stationary, you are guaranteed a stable relationship between the explanatory variables and the dependent variable (one that returns even after temporary deviations). Moreover, since many economic variables are non-stationary to begin with, stationary residuals also make it less likely that an important factor has been omitted from the model.

Besides serial correlation, the other important check is for multicollinearity. It arises when the explanatory variables are strongly correlated with one another, and it is troublesome because it can flip the signs of the estimated coefficients.

Compute the VIF statistics as below; if they do not greatly exceed 10, you can consider the model safe for now. Even without judging mechanically from the VIF, you will notice multicollinearity empirically when an explanatory variable shows up with the opposite of its expected sign. That said, about the only remedy is to remove one of the strongly correlated explanatory variables.

Approaches that constrain the parameter space, such as ridge regression, sacrifice the unbiasedness of the parameters to begin with and come with no guarantee of resolving multicollinearity, so they are not recommended in econometrics.

"""


# Checking multicollinearity by VIF
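# The snippet ends before the computation itself; a minimal sketch of the VIF
# check described above (assumes the regression design matrix is available as a
# pandas DataFrame X; the name X is illustrative).
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_table(X):
    # One VIF per explanatory variable; values well above 10 flag multicollinearity.
    exog = np.asarray(X, dtype=float)
    return pd.Series([variance_inflation_factor(exog, i) for i in range(exog.shape[1])],
                     index=X.columns, name='VIF')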
Example #21
]

possible_feature_sets = [
    basic_features, features_after_chapter_3, features_after_chapter_4,
    features_after_chapter_5, selected_features
]
feature_names = [
    'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features'
]

# Let us first study whether the time series is stationary and what the autocorrelations are.

dftest = adfuller(dataset['acc_phone_x'], autolag='AIC')
print(dftest)

autocorrelation_plot(dataset['acc_phone_x'])
plot.show()

# Now let us focus on the learning part.

learner = TemporalRegressionAlgorithms()
eval = RegressionEvaluation()

# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.

repeats = 5

# We set a washout time to give the NNs time to stabilize. We do not compute the error during the washout time.

washout_time = 10
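
# A minimal added sketch of scoring only after the washout period (y_true and
# y_pred are illustrative names for aligned target and prediction arrays):
import numpy as np

def mse_after_washout(y_true, y_pred, washout_time):
    # Skip the first washout_time samples while the network stabilizes.
    err = np.asarray(y_true)[washout_time:] - np.asarray(y_pred)[washout_time:]
    return float(np.mean(err ** 2))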
Example #22
from datetime import datetime
from pandas import read_csv, DataFrame
from matplotlib import pyplot
from pandas.tools.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error


def parser(x):
    return datetime.strptime('190' + x, '%Y-%m')


series = read_csv('/Home/Downloads/sales-of-shampoo-over-a-three-ye.csv',
                  header=0,
                  parse_dates=[0],
                  index_col=0,
                  squeeze=True,
                  date_parser=parser)

autocorrelation_plot(series)
# fit model
model = ARIMA(series, order=(5, 1, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())
# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())

X = series.values
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:len(X)]
loan_count_summary = year_month_summary['issue_d']
loan_count_summary.to_csv("LoanStatsGrouped.csv", index=True)  # save so we can reload it more quickly later
#loan_count_summary = pd.read_csv('LoanStatsGrouped.csv')

# What do we really care about? Just the y values
y = loan_count_summary.values

plt.plot(y)
plt.suptitle("the values")
plt.show()

# Test if the time series is stationary
# Here I make an autocorrelation plot of the data. The decay with the lag indicates that the TS is not stationary.
import statsmodels.api as sm
from pandas.tools.plotting import autocorrelation_plot
autocorrelation_plot(y)
plt.suptitle("Original series")
plt.savefig("TS.pdf")
#plt.show()

# #### Dickey Fuller test
# I can also perform a Dickey Fuller test for presence of unit roots
test = sm.tsa.adfuller(y)
print('adf: ', test[0])
print('p-value: ', test[1])
print('Critical values: ', test[4])
if test[0] > test[4]['10%']:
    print('has unit roots, the series is not stationary')
else:
    print('has no unit roots, the series is stationary')
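
# If the series is non-stationary, first differencing is the usual remedy; a
# minimal added sketch (pandas assumed imported as pd, as in the surrounding code):
y_diff = pd.Series(y).diff().dropna()
test_diff = sm.tsa.adfuller(y_diff.values)
print('adf after differencing: ', test_diff[0])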
f, axarr = plt.subplots(2)
axarr[0].plot(nord16.resample('D').mean(), lw = 2)
axarr[1].plot(nord15.resample('D').mean(), color = 'red', lw = 2)

f, axarr = plt.subplots(2)
axarr[0].plot(nord16.resample('D').std(), lw = 2)
axarr[1].plot(nord15.resample('D').std(), color = 'red', lw = 2)

var_nord16 = np.array(nord16.resample('D').std()).ravel()
var_nord15 = np.array(nord15.resample('D').std()).ravel()

plt.figure()
plt.hist(np.array(var_nord16), bins = 20)
plt.figure()
plotting.autocorrelation_plot(pd.Series(var_nord16))
plt.figure()
plotting.autocorrelation_plot(pd.Series(np.random.sample(size = len(var_nord16))))

d16nord = DistBetweenZeroVarDays(var_nord16)
d15nord = DistBetweenZeroVarDays(var_nord15)

plt.figure()
plt.hist(np.array(d16nord))
plt.figure()
plt.hist(np.array(d15nord))

np.mean(d16nord)
np.mean(d15nord)
np.std(d16nord)
np.std(d15nord)
# converts string to datetime object in pandas:
df['issue_d_format'] = pd.to_datetime(df['issue_d']) 
dfts = df.set_index('issue_d_format') 
year_month_summary = dfts.groupby(lambda x : x.year * 100 + x.month).count()
loan_count_summary = year_month_summary['issue_d']


# #### Test if the time series is stationary
# Here I make an autocorrelation plot of the data. The decay with the lag indicates that the TS is not stationary.

# In[24]:

import statsmodels.api as sm
from pandas.tools.plotting import autocorrelation_plot
autocorrelation_plot(loan_count_summary)


# #### Dickey Fuller test
# I can also perform a Dickey Fuller test for presence of unit roots

# In[25]:

test = sm.tsa.adfuller(loan_count_summary.values)
print('adf: ', test[0])
print('p-value: ', test[1])
print('Critical values: ', test[4])
if test[0] > test[4]['10%']:
    print('has unit roots, the series is not stationary')
else:
    print('has no unit roots, the series is stationary')
Example #26
data_bit = []
for i in range(tsbtc.size - 1):
    xy = np.array([tsbtc.iloc[i], tsbtc.iloc[i + 1]])
    data_bit.append(xy)

dataset = np.array(data_bit)

H, xedges, yedges = np.histogram2d(dataset[:, 0], dataset[:, 1], bins=20, density=True)

plt.figure()
im = plt.imshow(H, interpolation='nearest', origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

plt.figure()
plt.plot(statsmodels.api.tsa.acf(tsbtc))
plt.figure()
plotting.autocorrelation_plot(tsbtc.iloc[1340:])

reversed_arr = np.fliplr([np.array(tsbtc)])[0]
plt.figure()
plotting.autocorrelation_plot(reversed_arr)
####################################################################################################
def get_dataset(ts):
    data_bit = []
    for i in range(ts.size - 1):
        xy = np.array([ts.iloc[i], ts.iloc[i + 1]])
        data_bit.append(xy)
    dataset = np.array(data_bit)
    return dataset
####################################################################################################
def find_closest_index(edges, x):
    diffs = np.abs(edges - x)
    return int(np.argmin(diffs))  # index of the edge nearest to x
Example #27
pun.append(data3['PUN [€/MWH]'].dropna().values.ravel())

unlisted =  [item for sublist in pun for item in sublist]

df = pd.DataFrame(unlisted)
df = df.set_index(pd.date_range('2014-01-01', '2016-12-14', freq = 'H')[:df.shape[0]])

df.plot()
df.resample('D').mean().plot()
df.resample('M').mean().plot()

plt.figure()
plotting.lag_plot(df.resample('M').mean())

plt.figure()
plotting.autocorrelation_plot(df)
plt.figure()
plotting.autocorrelation_plot(df.resample('D').mean())

plt.figure()
plotting.autocorrelation_plot(df.loc[df.index.year == 2014].resample('D').mean())
plt.figure()
plotting.autocorrelation_plot(df.loc[df.index.year == 2015].resample('D').mean())
plt.figure()
plotting.autocorrelation_plot(df.loc[df.index.year == 2016].resample('D').mean())

plt.figure()
plotting.lag_plot(df.loc[df.index.year == 2014])
plt.figure()
plotting.lag_plot(df.loc[df.index.year == 2015], color='red')
plt.figure()
Example #28
pyplot.show()

pyplot.plot(merged_dataframe['Date'], merged_dataframe['Close'])
pyplot.xlabel('Years')
pyplot.ylabel('Stock Closing Prices')
pyplot.show()

merged_dataframe.info()

merged_dataframe['Date'] = merged_dataframe['Date'].dt.strftime('%Y-%m')

df_dateclose = pd.Series(merged_dataframe['Close'].values,
                         index=merged_dataframe['Date'])
print(df_dateclose.index)
print(df_dateclose.head())
autocorrelation_plot(df_dateclose)
pyplot.show()

merged_dataframe['Close'].head()

plot_acf(df_dateclose)
plot_pacf(df_dateclose)

plot_pacf(df_dateclose, lags=50)

plot_acf(df_dateclose)
plot_acf(df_dateclose, lags=50)

arima_df = pd.DataFrame(merged_dataframe, index=merged_dataframe['Date'])

arima_df.index
Example #29
import Fourier

reconstructed = Fourier.fourierExtrapolation(dpun, 0, 16)

plt.figure()
plt.plot(dpun)
plt.plot(reconstructed, color = 'red')

np.mean(dpun - reconstructed)
np.std(dpun - reconstructed)

from pandas.tools import plotting

plt.figure()
plotting.lag_plot(pd.DataFrame(dpun))

plt.figure()
plt.plot(statsmodels.api.tsa.acf(dpun))

lags = []
for i in range(dpun.size - 1):
    lags.append(np.array([dpun[i], dpun[i+1]]))
    
lags = pd.DataFrame(lags)
lags.corr()

plt.figure()
plotting.lag_plot(pd.DataFrame(dpun), lag = 7)
plt.figure()
plotting.autocorrelation_plot(pd.DataFrame(dpun))
def autocorrelation(building, floor, group=None, start=_start, end=_end):
    floor_data = get_series(building, group=group, start=start, end=end)
    series = floor_data[floor]
    autocorrelation_plot(series)
name = '$R_0$ development through time for windowsize '+str(ws)
plt.title(name)
plt.savefig(name)  

plt.clf()
ax = plt.subplot()
m.boxplot(ax=ax, rot=90)
name = 'Spread of estimated parameters $p_{ij}$ for windowsize '+str(ws)
plt.autoscale(tight=True)
plt.title(name)
plt.savefig(name)

plt.clf()
ax = plt.subplot()
for i in range(36):
    autocorrelation_plot(m.iloc[i,:], ax=ax)#,label=str(m.columns[i]))
name = 'Autocorrelation plot of the parameter estimates for window size '+str(ws)
plt.title(name)
plt.savefig(name)
#plt.legend()

plt.clf()
ax = plt.subplot()
for i in range(6):
    autocorrelation_plot(r.iloc[i,:], ax=ax)#,label=str(m.columns[i]))
name = 'Autocorrelation plot of the $R_0$ for window size '+str(ws)
plt.title(name)
plt.savefig(name)

plt.clf()
diffs = diff(r)
plt.plot(res_ma.window_size, res_ma.MAPE)
plt.show()

# EWMA

fig= plt.figure(figsize=(12,9))
plt.plot(ts.index, ts.rings, label = 'Ring size', alpha = 0.5)
for i in  np.linspace(1, 0.0001,10):
    plt.plot(ts.index, ts.rings.ewm(alpha = i).mean() + i* 10 , label = 'EWMA: %s'% i)
plt.xlim(xmin = min(ts.index)-1, xmax = max(ts.index) +1 )
plt.legend(loc='best')

# Autocorrelation
from pandas.tools.plotting import autocorrelation_plot

autocorrelation_plot(ts.rings)

# Load dow jones
ts = pd.read_csv('../data/Dow-Jones.csv', parse_dates=['Date'], index_col='Date', infer_datetime_format = True)
ts = ts[:'2010-01-01']

autocorrelation_plot(ts.Value)

# Avg temp
ts = pd.read_csv('../data/mean-daily-temperature.csv', parse_dates=['date'], index_col='date', infer_datetime_format = True)

autocorrelation_plot(ts.temp)

# PACF
import statsmodels.api as sm
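
# The snippet cuts off after the PACF heading; a sketch of the usual
# continuation with statsmodels (assumes the temperature series ts from above):
import matplotlib.pyplot as plt
sm.graphics.tsa.plot_pacf(ts.temp, lags=40)
plt.show()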
def predict_arma(ad_group, pred_date):
    warnings.filterwarnings("ignore")
    ads_file = 'data/ad_table.csv'
    df = pd.read_csv(ads_file, header=0, sep=',')
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
    best_aic = np.inf
    best_order = None
    best_mdl = None
    max_lag = 30
    tuning_result = {}
    #     list_ad_group = set(df['ad'].values)
    if (ad_group in df['ad'].unique()):
        df_ad_group_train = df[df['ad'] == ad_group]
        df_ad_group_train = df_ad_group_train.reset_index()
        df_arma_train = df_ad_group_train[['shown', 'date']]
        series_train = pd.Series(df_arma_train['shown'],
                                 index=df_arma_train.index)
        for alpha in range(5):
            for beta in range(5):
                try:
                    tmp_mdl = ARMA(series_train.values,
                                   order=(alpha, beta)).fit(method='mle',
                                                            trend='nc')
                    tmp_aic = tmp_mdl.aic
                    if tmp_aic < best_aic:
                        best_aic = tmp_aic
                        best_order = (alpha, beta)
                        best_mdl = tmp_mdl
                except:
                    continue
        score, pvalue, _, _ = jarque_bera(best_mdl.resid)

        if pvalue < 0.10:
            print('The residuals may not be normally distributed.')
        else:
            print('The residuals seem normally distributed.')
        tuning_result = (best_aic, best_order)
        print('Ad_group: {} aic: {:6.2f} | best order: {}'.format(
            ad_group, best_aic, best_order))

        df_ad_group_train['time_period'] = (
            df_ad_group_train['date'] - df_ad_group_train['date'][0]).dt.days
        X = df_ad_group_train[['time_period']].values
        y = df_ad_group_train['shown'].values
        series_train.plot(title='Shown values trend', color='C1')
        plt.ylabel('shown values')
        plt.xlabel('Days gap from 2015-10-01')
        plt.scatter(X, y, facecolor='gray', edgecolors='none')
        plt.show()
        #check for auto correlation
        lag_plot(series_train)
        plt.show()
        autocorrelation_plot(series_train)
        plt.show()
        plot_acf(series_train.values, lags=max_lag)
        plt.show()

        data = series_train.values
        data = data.astype('float32')
        model = ARMA(data, order=best_order)
        #         model_fit = model.fit(transparams=False)
        try:
            model_fit = model.fit(transparams=False)
            model_fit.plot_predict(plot_insample=True)
            plt.scatter(X, y, color='gray')
            plt.title('ARMA')
            plt.show()
            days_gap = (pd.to_datetime(pred_date) -
                        df_arma_train['date'][0]).days
            forecast = model_fit.forecast(steps=days_gap)

            print('Prediction of shown value for', pred_date, '=')
            print(forecast[0][0])
        except ValueError:
            print('This data is not suitable for ARMA')
    else:
        print("Ad group does not exist")
### for bootstrap: do I sample only from the past year, same month?


for i in range(12):
    for j in range(5):
        print("difference between {} and {} for month {}".format(bymonth2.columns[j+1], bymonth2.columns[j],
              bymonth2.index[i]))
        print(bymonth2[bymonth2.columns[j+1]].iloc[i] - bymonth2[bymonth2.columns[j]].iloc[i])


from pandas.tools.plotting import autocorrelation_plot

for i in range(12):
    plt.figure()
    autocorrelation_plot(bymonth2.iloc[i])

### hour-by-hour
letters = 'abcdefghijklmnopqrstuvwxyz'
lets = []
for i in range(rng.size):
    lets.append(letters[rng[i].hour])

letters_dict = {'Letters': lets, 'csud': pun}
hdf = pd.DataFrame(letters_dict).set_index(rng)

hourwise= {}

for i in range(24):
    letter = letters[i]
    hour = hdf.loc[hdf['Letters'] == letter]
ax.set_ylabel("Y Label")
ax.set_zlabel("Z Label")
ax.set_xticks(df.finalDate[x].apply(lambda d: d.strftime("%Y-%m-%d")).values)
plt.show()


#################Several plots in 2D
df.plot(subplots=True)
df.plot(x="finalDate", y="finalBalance")
df.hist(by=["windowSize", "trainingSize"])
df.boxplot("finalBalance", by=["windowSize", "trainingSize"])
scatter_matrix(df, alpha=0.2, diagonal="kde")
df.plot(x="finalDate", y="finalBalance", kind="kde")

parallel_coordinates(df, "windowSize")
autocorrelation_plot(df.finalBalance)
radviz(df, "finalBalance")
df.plot(colormap="jet")

#################More specific plots in 2D
f, (ax1, ax2) = plt.subplots(2, 3)
ax1[0].plot(df.groupby(["windowSize"]).mean()["finalBalance"])
ax1[0].set_title("Window Size Mean")
ax1[0].set_ylim((5000, 15000))

ax2[0].plot(df.groupby(["windowSize"]).sum()["finalBalance"])
ax2[0].set_title("Window Size Sum")

ax1[1].plot(df.groupby(["trainingSize"]).mean()["finalBalance"])
ax1[1].set_title("Training Size Mean")
ax1[1].set_ylim((5000, 15000))
from matplotlib import pyplot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error


from pandas.tools.plotting import autocorrelation_plot

def parser(x):
	return datetime.strptime('190'+x, '%Y-%m')

series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
print(series.head())
series.plot()
pyplot.show()

autocorrelation_plot(series)
pyplot.show()


# fit model
model = ARIMA(series, order=(5,1,0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()

residuals.plot(kind='kde')
pyplot.show()