args = cmd_parser.parse_args()

print "Slurping data from %s, writing to %s" % (args.datadir,
    args.outfile)
files = sorted(os.listdir(args.datadir))

print "Iterating over datasets. This might take a while."
ensemble_all = np.empty([datatool.secs_per_day()])
counter_all = np.empty([datatool.secs_per_day()])
ensemble_weekend = np.empty([datatool.secs_per_day()])
counter_weekend = np.empty([datatool.secs_per_day()])
ensemble_weekday = np.empty([datatool.secs_per_day()])
counter_weekday = np.empty([datatool.secs_per_day()])
with pb.ProgressBar(maxval=len(files)) as progress:
  for idx, file in enumerate(files):
    freqdata = datatool.load_data_as_dataframe(os.path.join(args.datadir, file))
    weekend = False
    if freqdata.iloc[0].weekday in set([5, 6]):
      weekend = True
    for i, row in freqdata.iterrows():
      ensemble_all[row.s_since_midnight] += row.freq
      counter_all[row.s_since_midnight] += 1
      if weekend:
        ensemble_weekend[row.s_since_midnight] += row.freq
        counter_weekend[row.s_since_midnight] += 1
      else:
        ensemble_weekday[row.s_since_midnight] += row.freq
        counter_weekday[row.s_since_midnight] += 1
    progress.update(idx)

ensemble_all = ensemble_all / counter_all
Esempio n. 2
0
# vim:fileencoding=utf-8
import matplotlib.pyplot as plt
import numpy as np
import freqanalysis.ecdf as ecdf
import freqanalysis.datatools as datatool
from scipy.stats import ks_2samp



datasetfile = "datasets/20140904-export.txt"
print "loading ", datasetfile
df = datatool.load_data_as_dataframe(datasetfile)
print "Calculating ECDF of all values"
all_series, yvals = ecdf.get_ecdf(df['freq'])
print "Plotting graph"
ecdf.plot_ecdf_curve(all_series, yvals, color="b", label="Alle Werte")

df['minute'] = df.time.apply(lambda x: x.minute)

hour_df = df[(df.minute >= 58) | (df.minute <= 5)]
hour_series, yvals = ecdf.get_ecdf(hour_df['freq'])
ecdf.plot_ecdf_curve(hour_series, yvals, color="r",
    label="Stundenwechsel")

not_hour_df = df[(df.minute < 58 ) & (df.minute > 5)]
not_hour_series, yvals = ecdf.get_ecdf(not_hour_df['freq'])
ecdf.plot_ecdf_curve(not_hour_series, yvals, color="y", linestyle="-",
    label="unter der Stunde")
print "Null hypothesis: the two samples are drawn from the same continuous distribution."

D, p_value = ks_2samp(all_series, hour_series)
Esempio n. 3
0
# vim:fileencoding=utf-8
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from scipy.stats import ks_2samp
from scipy.stats.mstats import normaltest
import freqanalysis.datatools as datatool
import freqanalysis.normdist as nd

datasetfile = "datasets/20140904-export.txt"
print "loading ", datasetfile
df = datatool.load_data_as_dataframe(datasetfile)
df['minute'] = df.time.apply(lambda x: x.minute)
hour_df = df[(df.minute >= 58) | (df.minute <= 5)]
not_hour_df = df[(df.minute < 58) & (df.minute > 5)]

f, (ax1, ax2, ax3) = plt.subplots(3, sharex=True, sharey=True)
# pandas/matplotlib incompatibility: http://stackoverflow.com/a/22764377
nd.plot_fit(df['freq'].values, ax1, "Alle\ Werte")
nd.plot_fit(hour_df['freq'].values, ax2, "Nur\ Stundenwechsel")
nd.plot_fit(not_hour_df['freq'].values, ax3, "Kein\ Stundenwechsel")

f.savefig("images/normdistrib.png", bbox_inches='tight')

print
print "Executing KS-Test: is the data normally distributed?"
mu, std = norm.fit(df['freq'])
refdist = np.random.normal(mu, std, 100000)
D, p_value = ks_2samp(df['freq'].values, refdist)
if p_value < 0.01:
    print "Rejecting null hypothesis - the two distributions differ significantly. p = %.4f" % p_value
Esempio n. 4
0
import argparse
import os
import sys as sys

import numpy as np
import pandas as pd

import freqanalysis.datatools as datatool

cmd_parser = argparse.ArgumentParser()
cmd_parser.add_argument("datafile", help="the csv containing the frequency measurements")
cmd_parser.add_argument("outfile", help="HDF+ file to create")
args = cmd_parser.parse_args()

print "Slurping the CSV-file %s, writing to %s" % (args.datafile,
    args.outfile)

print "Loading datasets. This might take a while."
alldata = datatool.load_data_as_dataframe(args.datafile)

print "Selecting all friday data for comparison."
# select the friday 8:00 to 11:00 UTC datasets from the alldata frame
fridays = alldata[alldata.weekday == 4]
fridaydata = fridays[(fridays.hour > 7) & (fridays.hour < 11)]

print "Selecting eclipse data"
eclipsedata = alldata[(alldata.unix >= 1426838400) & (alldata.unix < 1426849200)]

with pd.get_store(args.outfile) as store:
  store['eclipsedata'] = eclipsedata
  store['fridaydata'] = fridaydata