def test_dataset():
    """Check that a CachedDataset mirrors its namespace and records results.

    The dataset must discover to the same datashape as the plain namespace
    it wraps, computing against it must give the expected value, and the
    user-supplied ``cache`` dict must then hold that value keyed by the
    computed expression.
    """
    ns = {'t': df, 'x': 10}
    cache = dict()
    d = CachedDataset(ns, cache=cache)

    assert discover(d) == discover(ns)

    s = symbol('s', discover(d))
    # Bind the expression once so the computed expression and the expected
    # cache key are the same object.
    expr = s.x * 2

    # These comparisons were previously bare expressions whose results were
    # discarded, so they could never fail the test; assert them explicitly.
    assert compute(expr, d) == 20
    assert cache == {expr: 20}
def test_streaming():
    """Check the cache entry for a sequence-backed expression.

    After computing ``s.t.x * 2`` over a list of records, the value stored
    in ``d.cache`` for that expression must not be an ``Iterator`` and must
    convert to the list ``[2, 2]``.
    """
    records = [{'name': 'Alice', 'x': 1}, {'name': 'Bob', 'x': 1}]
    namespace = {'t': records, 'x': 10}
    store = dict()
    d = CachedDataset(namespace, cache=store)

    root = symbol('s', discover(d))
    expr = root.t.x * 2

    # The return value is not inspected; the assertions below look only at
    # what the computation left behind in ``d.cache``.
    compute(expr, d)

    cached = d.cache[expr]
    assert not isinstance(cached, Iterator)
    assert into(list, d.cache[expr]) == [2, 2]
def test_pre_compute_on_multiple_datasets_is_selective():
    """Compute one column from a CachedDataset holding two sources.

    The dataset wraps an in-memory DataFrame and the ``iris.csv`` example
    file; computing ``amount`` from the DataFrame member must render the
    same string as the DataFrame's own ``amount`` column.
    NOTE(review): per the test name, this presumably guards against
    pre-computing the unrelated CSV source -- confirm against the
    implementation.
    """
    from odo import CSV
    from blaze import Data
    from blaze.cached import CachedDataset

    rows = [
        [1, 'Alice', 100],
        [2, 'Bob', -200],
        [3, 'Charlie', 300],
        [4, 'Denis', 400],
        [5, 'Edith', -500],
    ]
    header = ['id', 'name', 'amount']
    frame = pd.DataFrame(rows, columns=header)
    iris_csv = CSV(example('iris.csv'))

    # The 'df' key is what makes ``.df`` resolvable on the Data object below.
    sources = {'df': frame, 'iris': iris_csv}
    data = Data(CachedDataset(sources))

    expected = str(frame.amount)
    assert str(compute(data.df.amount)) == expected