def test_histogram2d(self):
     """Smoke-test a CSVLoader -> Min/Max -> Histogram2D -> Heatmap pipeline.

     The test makes no assertion on the histogram itself: it starts the
     scheduler, waits for it with ``join()``, and then calls
     ``trace_stats()`` on the histogram module.
     """
     s = self.scheduler()
     csv = CSVLoader(get_dataset('bigfile'),
                     index_col=False,
                     header=None,
                     scheduler=s)
     min_ = Min(scheduler=s)
     min_.input.table = csv.output.table
     max_ = Max(scheduler=s)
     max_.input.table = csv.output.table
     histogram2d = Histogram2D(1, 2, xbins=100, ybins=100,
                               scheduler=s)  # columns are called 1..30
     # The histogram reads the loaded table plus the Min and Max outputs.
     histogram2d.input.table = csv.output.table
     histogram2d.input.min = min_.output.table
     histogram2d.input.max = max_.output.table
     heatmap = Heatmap(filename='histo_%03d.png', scheduler=s)
     heatmap.input.array = histogram2d.output.table
     # Every is attached to the loader output, not to the heatmap.
     pr = Every(proc=self.terse, scheduler=s)
     pr.input.df = csv.output.table
     csv.scheduler().start()
     s.join()
     # The stats are discarded explicitly instead of being bound to ``s``,
     # which holds the scheduler.
     _ = histogram2d.trace_stats()
 def t_histogram2d_impl(self, **kw: Any) -> None:
     """Compare Histogram2D's last result with ``fh.histogram2d``.

     Builds RandomTable -> Stirrer -> Min/Max -> Histogram2D -> Heatmap,
     runs the scheduler, then recomputes the histogram from the stirrer's
     table using the bounds stored in the last histogram row.

     Args:
         **kw: Extra keyword arguments forwarded to ``Stirrer``.
     """
     sched = self.scheduler()
     source = RandomTable(3, rows=100000, scheduler=sched)
     stirrer = Stirrer(
         update_column="_2",
         fixed_step_size=1000,
         scheduler=sched,
         **kw
     )
     stirrer.input[0] = source.output.result
     lower = Min(scheduler=sched)
     lower.input[0] = stirrer.output.result
     upper = Max(scheduler=sched)
     upper.input[0] = stirrer.output.result
     hist = Histogram2D(0, 1, xbins=100, ybins=100, scheduler=sched)
     hist.input[0] = stirrer.output.result
     hist.input.min = lower.output.result
     hist.input.max = upper.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=sched)
     heatmap.input.array = hist.output.result
     printer = Every(proc=self.terse, scheduler=sched)
     printer.input[0] = heatmap.output.result
     aio.run(sched.start())

     # Last row of the histogram table: the array and the bounds it used.
     final_row = notNone(hist.table.last()).to_dict()
     progressive = final_row["array"]
     extent = [
         [final_row["ymin"], final_row["ymax"]],
         [final_row["xmin"], final_row["xmax"]],
     ]

     # Reference histogram over the same two columns, y first then x.
     stirred = stirrer.table.loc[:, ["_1", "_2"]]
     assert stirred is not None
     points = stirred.to_array()
     shape = [hist.params.ybins, hist.params.xbins]
     reference = fh.histogram2d(
         points[:, 1], points[:, 0], bins=shape, range=extent
     )
     reference = np.flip(reference, axis=0)  # type: ignore

     self.assertEqual(np.sum(progressive), np.sum(reference))
     self.assertListEqual(
         progressive.reshape(-1).tolist(), reference.reshape(-1).tolist()
     )
 def test_histogram2d1(self) -> None:
     """Check Histogram2D on the 'bigfile' dataset against ``fh.histogram2d``.

     Runs CSVLoader -> Min/Max -> Histogram2D -> Heatmap, then rereads
     columns 1 and 2 of the same file with pandas and recomputes the
     histogram using the bounds stored in the last histogram row.
     """
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset("bigfile"),
         index_col=False,
         header=None,
         scheduler=sched,
     )
     lower = Min(scheduler=sched)
     lower.input[0] = loader.output.result
     upper = Max(scheduler=sched)
     upper.input[0] = loader.output.result
     hist = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=sched)
     hist.input[0] = loader.output.result
     hist.input.min = lower.output.result
     hist.input.max = upper.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=sched)
     heatmap.input.array = hist.output.result
     printer = Every(proc=self.terse, scheduler=sched)
     printer.input[0] = heatmap.output.result
     aio.run(loader.scheduler().start())

     # Last row of the histogram table: the array and the bounds it used.
     final_row = notNone(hist.table.last()).to_dict()
     progressive = final_row["array"]
     extent = [
         [final_row["ymin"], final_row["ymax"]],
         [final_row["xmin"], final_row["xmax"]],
     ]

     # Reference histogram from a plain pandas read of the same columns.
     frame = pd.read_csv(
         get_dataset("bigfile"), header=None, usecols=[1, 2]  # type: ignore
     )
     points = frame.to_numpy()
     shape = [hist.params.ybins, hist.params.xbins]
     reference = fh.histogram2d(
         points[:, 1], points[:, 0], bins=shape, range=extent
     )
     reference = np.flip(reference, axis=0)  # type: ignore
     self.assertTrue(np.allclose(progressive, reference))
 def test_histogram2d(self) -> None:
     """Smoke-test CSVLoader -> Min/Max -> Histogram2D -> Heatmap.

     No assertion is made on the histogram; the test runs the scheduler
     and then calls ``trace_stats()`` on the histogram module.
     """
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset("bigfile"),
         index_col=False,
         header=None,
         scheduler=sched,
     )
     lower = Min(scheduler=sched)
     lower.input[0] = loader.output.result
     upper = Max(scheduler=sched)
     upper.input[0] = loader.output.result
     hist = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=sched)
     hist.input[0] = loader.output.result
     hist.input.min = lower.output.result
     hist.input.max = upper.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=sched)
     heatmap.input.array = hist.output.result
     printer = Every(proc=self.terse, scheduler=sched)
     printer.input[0] = heatmap.output.result
     aio.run(loader.scheduler().start())
     # The returned stats are intentionally discarded.
     _ = hist.trace_stats()
Beispiel #5
0
# Pipeline: filenames -> CSVLoader -> Min/Max -> Histogram2D -> Heatmap
# over the drop-off coordinates of the yellow-taxi trip files.
# `s`, `filter_` and `RESOLUTION` are defined outside this section.

# Alternative: SUFFIX = '' (presumably for uncompressed copies of the files).
PREFIX = '../nyc-taxi/'
SUFFIX = '.bz2'

URLS = [
    PREFIX + 'yellow_tripdata_2015-01.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-02.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-03.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-04.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-05.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX,
]

# The file list is handed to the loader through a Constant module.
filenames = pd.DataFrame({'filename': URLS})
cst = Constant(Table('filenames', data=filenames), scheduler=s)
csv = CSVLoader(index_col=False,
                skipinitialspace=True,
                usecols=['dropoff_longitude', 'dropoff_latitude'],
                filter_=filter_,
                scheduler=s)
csv.input.filenames = cst.output.table

# Named min_/max_ so the builtins min() and max() are not shadowed.
min_ = Min(scheduler=s)
min_.input.table = csv.output.table
max_ = Max(scheduler=s)
max_.input.table = csv.output.table

histogram2d = Histogram2D('dropoff_longitude',
                          'dropoff_latitude',
                          xbins=RESOLUTION,
                          ybins=RESOLUTION,
                          scheduler=s)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min_.output.table
histogram2d.input.max = max_.output.table

heatmap = Heatmap(filename='nyc_dropoff_yellow%d.png', history=5, scheduler=s)
heatmap.input.array = histogram2d.output.table

if __name__ == '__main__':
    s.start()
Beispiel #6
0
# Pipeline: filenames -> CSVLoader -> Min/Max -> Histogram2D -> Heatmap
# over the pick-up coordinates of the green-taxi trip files.
# `s`, `filter_` and `RESOLUTION` are defined outside this section.

# Alternative: SUFFIX = '' (presumably for uncompressed copies of the files).
PREFIX = '../nyc-taxi/'
SUFFIX = '.bz2'

URLS = [
    PREFIX + 'green_tripdata_2015-01.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-02.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-03.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-04.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-05.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-06.csv' + SUFFIX,
]

# The file list is handed to the loader through a Constant module.
filenames = pd.DataFrame({'filename': URLS})
cst = Constant(Table('filenames', data=filenames), scheduler=s)
csv = CSVLoader(index_col=False,
                skipinitialspace=True,
                usecols=['Pickup_longitude', 'Pickup_latitude'],
                filter_=filter_,
                scheduler=s)
csv.input.filenames = cst.output.table

# Named min_/max_ so the builtins min() and max() are not shadowed.
min_ = Min(scheduler=s)
min_.input.table = csv.output.table
max_ = Max(scheduler=s)
max_.input.table = csv.output.table

histogram2d = Histogram2D('Pickup_longitude',
                          'Pickup_latitude',
                          xbins=RESOLUTION,
                          ybins=RESOLUTION,
                          scheduler=s)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min_.output.table
histogram2d.input.max = max_.output.table

heatmap = Heatmap(filename='nyc_pickup_green%d.png', history=5, scheduler=s)
heatmap.input.array = histogram2d.output.table

if __name__ == '__main__':
    s.start()
# Pipeline: filenames -> CSVLoader -> Min/Max -> Histogram2D -> Heatmap
# over the drop-off coordinates of the green-taxi trip files.
# `PREFIX`, `s`, `filter_` and `RESOLUTION` are defined outside this section.
SUFFIX = '.bz2'

URLS = [
    PREFIX + 'green_tripdata_2015-01.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-02.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-03.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-04.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-05.csv' + SUFFIX,
    PREFIX + 'green_tripdata_2015-06.csv' + SUFFIX,
]

# The file list is wrapped in a Table before being handed to Constant.
filenames = pd.DataFrame({'filename': URLS})
cst = Constant(Table('filenames', data=filenames), scheduler=s)
csv = CSVLoader(index_col=False,
                skipinitialspace=True,
                usecols=['Dropoff_longitude', 'Dropoff_latitude'],
                filter_=filter_,
                scheduler=s)
csv.input.filenames = cst.output.table

# Named min_/max_ so the builtins min() and max() are not shadowed.
min_ = Min(scheduler=s)
min_.input.table = csv.output.table
max_ = Max(scheduler=s)
max_.input.table = csv.output.table

histogram2d = Histogram2D('Dropoff_longitude',
                          'Dropoff_latitude',
                          xbins=RESOLUTION,
                          ybins=RESOLUTION,
                          scheduler=s)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min_.output.table
histogram2d.input.max = max_.output.table

# No `history` argument is passed here (an earlier variant used history=5).
heatmap = Heatmap(filename='nyc_Dropoff_green%d.png', scheduler=s)
heatmap.input.array = histogram2d.output.table

if __name__ == '__main__':
    s.start()
Beispiel #8
0
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset
from progressivis.vis import Heatmap

# Announce the script and print the type of the default scheduler object.
print("Loading test_histogram2d")
print("Type of default_scheduler is %s" % type(Scheduler.default))

# None of the modules below is given a `scheduler=` argument.
# NOTE(review): presumably they attach to Scheduler.default -- confirm.
csv = CSVLoader(get_dataset('bigfile'),
                index_col=False,
                header=None,
                engine='c')
# First consumer of the loader output; `pr` is rebound to a Print module
# further down, so this Every instance is no longer reachable by name.
pr = Every()
pr.input.df = csv.output.table
# Min and Max both read the loaded table and feed the histogram.
min_ = Min()
min_.input.table = csv.output.table
max_ = Max()
max_.input.table = csv.output.table
histogram2d = Histogram2D('_1', '_2', xbins=128, ybins=128)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min_.output.table
histogram2d.input.max = max_.output.table
# The heatmap takes the histogram output as its `array` input.
heatmap = Heatmap(filename='histo_%03d.png')
heatmap.input.array = histogram2d.output.table
# Second consumer of the loader output, explicitly named 'print'.
pr = Print(name='print')
pr.input.df = csv.output.table

# Start from the loader only when executed as a script.
if __name__ == '__main__':
    csv.start()
# Pipeline: filenames -> CSVLoader -> Histogram2D -> Heatmap over the
# pick-up coordinates, with fixed histogram bounds.
# `URLS`, `s`, `filter_`, `RESOLUTION`, `bounds_min` and `bounds_max` are
# defined outside this section.
filenames = pd.DataFrame({'filename': URLS})
cst = Constant(Table('filenames', data=filenames), scheduler=s)
csv = CSVLoader(index_col=False,
                skipinitialspace=True,
                usecols=['pickup_longitude', 'pickup_latitude'],
                filter_=filter_,
                scheduler=s)
csv.input.filenames = cst.output.table

# The bounds come from Constant modules wrapping one-row tables instead of
# from Min/Max modules. The names min_/max_ avoid shadowing the builtins
# min() and max().
min_ = Constant(table=Table('bounds_min', data=pd.DataFrame([bounds_min])),
                scheduler=s)
# This table was previously also named 'bounds_min' (copy-paste slip), so
# the two bounds tables shared one name.
max_ = Constant(table=Table('bounds_max', data=pd.DataFrame([bounds_max])),
                scheduler=s)

histogram2d = Histogram2D('pickup_longitude',
                          'pickup_latitude',
                          xbins=RESOLUTION,
                          ybins=RESOLUTION,
                          scheduler=s)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min_.output.table
histogram2d.input.max = max_.output.table

heatmap = Heatmap(filename='nyc_pickup_yellow%d.png', history=5, scheduler=s)
heatmap.input.array = histogram2d.output.table

if __name__ == '__main__':
    s.start()