def test_by():
    """Group by name and check the per-group sum and count of amount."""
    totals = compute(by(t, t.name, t.amount.sum()), c)
    assert set(totals) == {
        ('Alice', -200),
        ('Bob', 200),
        ('Charlie', 400),
        ('Edith', 200),
    }

    counts = compute(by(t, t.name, t.amount.count()), c)
    assert set(counts) == {
        ('Alice', 2),
        ('Bob', 1),
        ('Charlie', 1),
        ('Edith', 1),
    }
def test_sample():
    """Sampling by count or by the matching fraction yields the same length.

    Asking for twice as many rows as exist returns only the available rows.
    """
    total = len(databig)
    for size in range(1, total + 1):
        by_count = compute(tbig.sample(n=size), databig)
        by_frac = compute(tbig.sample(frac=float(size) / total), databig)
        assert len(by_count) == len(by_frac) == size

    oversampled = compute(tbig.sample(n=total * 2), databig)
    assert len(oversampled) == total
def test_unicode_field_names():
    """Unicode and native-string field names select the same ctable columns."""
    records = np.array(
        [(1, 1., 10.), (2, 2., 20.), (3, 3., 30.)],
        dtype=[('a', 'i8'), ('b', 'f8'), ('c', 'f8')],
    )
    b = bcolz.ctable(records)
    s = symbol('s', discover(b))

    # Single field lookup.
    from_unicode = compute(s[u'a'], b)[:]
    from_native = compute(s['a'], b)[:]
    assert eq(from_unicode, from_native)

    # Multi-field projection.
    from_unicode = compute(s[[u'a', u'c']], b)[:]
    from_native = compute(s[['a', 'c']], b)[:]
    assert eq(from_unicode, from_native)
def test_coalesce():
    """coalesce takes values from the second operand where the first is NULL.

    Covers a scalar fallback, a NULL scalar fallback, an array fallback and
    an array fallback that itself contains a NULL.
    """
    data = pd.Series([0, None, 1, None, 2, None], dtype=object)
    s = symbol('s', 'var * ?int')
    t = symbol('t', 'int')
    u = symbol('u', '?int')
    v = symbol('v', 'var * int')
    w = symbol('w', 'var * ?int')

    # array to scalar
    result = compute(coalesce(s, t), {s: data, t: -1})
    expected = pd.Series([0, -1, 1, -1, 2, -1], dtype=object)
    tm.assert_series_equal(result, expected)

    # array to scalar with NULL
    result = compute(coalesce(s, u), {s: data, u: None})
    expected = pd.Series([0, None, 1, None, 2, None], dtype=object)
    tm.assert_series_equal(result, expected)

    # array to array
    fallback = np.array([-1, -2, -3, -4, -5, -6])
    result = compute(coalesce(s, v), {s: data, v: fallback})
    expected = pd.Series([0, -2, 1, -4, 2, -6], dtype=object)
    tm.assert_series_equal(result, expected)

    # array to array with NULL
    fallback = np.array([-1, None, -3, -4, -5, -6])
    result = compute(coalesce(s, w), {s: data, w: fallback})
    expected = pd.Series([0, None, 1, -4, 2, -6], dtype=object)
    tm.assert_series_equal(result, expected)
def test_outer_join():
    """Default, left, right and outer joins on id return the expected row sets."""
    left = [(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)]
    right = [('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)]

    L = TableSymbol('L', '{id: int, name: string, amount: real}')
    R = TableSymbol('R', '{city: string, id: int}')

    def joined(**kwargs):
        # Build a fresh scope per call, exactly as each assertion needs it.
        return set(compute(join(L, R, **kwargs), {L: left, R: right}))

    # Rows present on both sides.
    matched = {
        (1, 'Alice', 100, 'NYC'),
        (1, 'Alice', 100, 'Boston'),
        (4, 'Dennis', 400, 'Moscow'),
    }
    left_only = (2, 'Bob', 200, None)
    right_only = (3, None, None, 'LA')

    assert joined() == matched
    assert joined(how='left') == matched | {left_only}
    assert joined(how='right') == matched | {right_only}
    assert joined(how='outer') == matched | {left_only, right_only}
def test_arithmetic():
    """String renderings of computed column arithmetic match pandas."""
    added = compute(t['amount'] + t['id'], df)
    assert str(added) == str(df.amount + df.id)

    multiplied = compute(t['amount'] * t['id'], df)
    assert str(multiplied) == str(df.amount * df.id)

    remainder = compute(t['amount'] % t['id'], df)
    assert str(remainder) == str(df.amount % df.id)
def test_arithmetic():
    """Column arithmetic over row tuples matches element-wise Python."""
    added = list(compute(t['amount'] + t['id'], data))
    assert added == [amount + ident for _, amount, ident in data]

    multiplied = list(compute(t['amount'] * t['id'], data))
    assert multiplied == [amount * ident for _, amount, ident in data]

    remainder = list(compute(t['amount'] % t['id'], data))
    assert remainder == [amount % ident for _, amount, ident in data]
def test_by_multi_column_grouper():
    """Counting z grouped by the (x, y) pair gives one row per distinct pair."""
    t = TableSymbol('t', '{x: int, y: int, z: int}')
    expr = by(t[['x', 'y']], t['z'].count())
    data = [(1, 2, 0), (1, 2, 0), (1, 1, 0)]

    # Evaluate once and assert on the result; no output is written.
    result = set(compute(expr, data))
    assert result == {(1, 2, 2), (1, 1, 1)}
def test_arithmetic():
    """Computed column arithmetic equals the pandas Series arithmetic."""
    added = compute(t['amount'] + t['id'], df)
    assert_series_equal(added, df.amount + df.id)

    multiplied = compute(t['amount'] * t['id'], df)
    assert_series_equal(multiplied, df.amount * df.id)

    remainder = compute(t['amount'] % t['id'], df)
    assert_series_equal(remainder, df.amount % df.id)
def test_arithmetic():
    """Computed column arithmetic equals the same arithmetic on x's fields."""
    amount = x['amount']
    ident = x['id']

    added = compute(t['amount'] + t['id'], x)
    assert eq(added, amount + ident)

    multiplied = compute(t['amount'] * t['id'], x)
    assert eq(multiplied, amount * ident)

    remainder = compute(t['amount'] % t['id'], x)
    assert eq(remainder, amount % ident)
def test_union():
    """union stacks its operands end to end, for whole tables and columns."""
    stacked = compute(union(t, t), x)
    assert stacked.shape == (2 * x.shape[0],)
    # Both halves reproduce the input.
    assert eq(stacked[:5], x)
    assert eq(stacked[5:], x)

    stacked_ids = compute(union(t.id, t.id), x)
    expected_ids = np.array([1, 2, 3, 4, 5] * 2)
    assert eq(stacked_ids, expected_ids)
def test_summary_on_ndarray():
    """summary over an ndarray returns (min, total), with and without keepdims."""
    collapsed = compute(summary(total=a.sum(), min=a.min()), ax)
    assert collapsed == (ax.min(), ax.sum())

    result = compute(summary(total=a.sum(), min=a.min(), keepdims=True), ax)
    record_dtype = [("min", "float32"), ("total", "float64")]
    expected = np.array([(ax.min(), ax.sum())], dtype=record_dtype)

    # keepdims preserves the number of dimensions of the input.
    assert result.ndim == ax.ndim
    assert eq(expected, result)
def test_timedelta_arith():
    """Adding and subtracting a timedelta64 matches numpy's own arithmetic."""
    dates = np.arange("2014-01-01", "2014-02-01", dtype="datetime64")
    delta = np.timedelta64(1, "D")
    sym = symbol("s", discover(dates))

    shifted_forward = compute(sym + delta, dates)
    assert (shifted_forward == dates + delta).all()

    shifted_back = compute(sym - delta, dates)
    assert (shifted_back == dates - delta).all()

    # Date minus date gives the delta back.
    difference = compute(sym - (sym - delta), dates)
    assert (difference == dates - (dates - delta)).all()
def test_scalar_ops(data):
    """Binary operators between columns and scalars match the same ops on x.

    Each operator is checked as column-scalar, column-column, and
    scalar-column with both a float and an int on the left.
    """
    import operator

    binary_ops = (operator.add, operator.sub, operator.mul, operator.truediv)
    for binop in binary_ops:
        result = compute(binop(t.amount, 10), data)
        assert eq(result, binop(x['amount'], 10))

        result = compute(binop(t.amount, t.id), data)
        assert eq(result, binop(x['amount'], x['id']))

        result = compute(binop(10.0, t.amount), data)
        assert eq(result, binop(10.0, x['amount']))

        result = compute(binop(10, t.amount), data)
        assert eq(result, binop(10, x['amount']))
def test_apply_column():
    """Applying np.sum or the builtin sum to a column matches a direct call."""
    for reducer in (np.sum, builtins.sum):
        result = compute(t.amount.apply(reducer, 'real'), df)
        expected = reducer(df['amount'])
        assert result == expected
def test_arithmetic():
    """Column arithmetic on the context-managed data matches arithmetic on x."""
    with data() as d:
        added = compute(t['amount'] + t['id'], d)
        assert eq(added, x['amount'] + x['id'])

        multiplied = compute(t['amount'] * t['id'], d)
        assert eq(multiplied, x['amount'] * x['id'])

        remainder = compute(t['amount'] % t['id'], d)
        assert eq(remainder, x['amount'] % x['id'])
def test_summary_on_series():
    """summary of max/min over a Series gives (3, 1), wrapped in a list with keepdims."""
    ser = Series([1, 2, 3])
    s = symbol('s', '3 * int')

    collapsed = summary(max=s.max(), min=s.min())
    assert compute(collapsed, ser) == (3, 1)

    kept = summary(max=s.max(), min=s.min(), keepdims=True)
    assert compute(kept, ser) == [(3, 1)]
def test_nelements_array():
    """nelements over chosen axes of a 5 x 4 x 3 array counts along those axes."""
    t = symbol('t', '5 * 4 * 3 * float64')
    x = np.random.randn(*t.shape)

    # Reducing over axes 0 and 1 leaves three entries of 5 * 4 = 20.
    over_first_two = compute(t.nelements(axis=(0, 1)), x)
    np.testing.assert_array_equal(over_first_two, np.array([20, 20, 20]))

    # Reducing over axis 1 leaves a (5, 3) array of fours.
    over_middle = compute(t.nelements(axis=1), x)
    np.testing.assert_array_equal(over_middle, np.ones((5, 3)) * 4)
def test_by_with_single_row():
    """A grouped sum over a selection is the same with and without optimize."""
    ct = bcolz.ctable([[1, 1, 3, 3], [1, 2, 3, 4]], names=['a', 'b'])
    t = symbol('t', discover(ct))

    threes = t[t.a == 3]
    grouped = by(threes.a, b_sum=threes.b.sum())

    optimized = compute(grouped, ct)
    unoptimized = compute(grouped, ct, optimize=False)
    tm.assert_frame_equal(optimized, unoptimized)
def test_std():
    """std and var, biased and unbiased, agree with numpy on the amounts."""
    amounts = [row[1] for row in data]

    # Standard deviation.
    biased_std = compute(t.amount.std(), data)
    assert np.allclose(biased_std, np.std(amounts))
    unbiased_std = compute(t.amount.std(unbiased=True), data)
    assert np.allclose(unbiased_std, np.std(amounts, ddof=1))

    # Variance.
    biased_var = compute(t.amount.var(), data)
    assert np.allclose(biased_var, np.var(amounts))
    unbiased_var = compute(t.amount.var(unbiased=True), data)
    assert np.allclose(unbiased_var, np.var(amounts, ddof=1))
def test_sort():
    """Sorting a table or a single column matches np.sort."""
    by_amount = np.sort(x, order="amount")

    ascending = compute(t.sort("amount"), x)
    assert eq(ascending, by_amount)

    descending = compute(t.sort("amount", ascending=False), x)
    assert eq(descending, by_amount[::-1])

    two_keys = compute(t.sort(["amount", "id"]), x)
    assert eq(two_keys, np.sort(x, order=["amount", "id"]))

    column_only = compute(t.amount.sort(), x)
    assert eq(column_only, np.sort(x["amount"]))
def test_sort():
    """Sorting a table by one or two keys matches np.sort with ``order``."""
    expected_by_amount = np.sort(x, order='amount')

    result = compute(t.sort('amount'), x)
    assert eq(result, expected_by_amount)

    result = compute(t.sort('amount', ascending=False), x)
    assert eq(result, expected_by_amount[::-1])

    result = compute(t.sort(['amount', 'id']), x)
    assert eq(result, np.sort(x, order=['amount', 'id']))
def test_Distinct():
    """distinct on a column or on the whole table matches np.unique."""
    rows = [("Alice", 100), ("Alice", -200), ("Bob", 100), ("Bob", 100)]
    x = np.array(rows, dtype=[("name", "S5"), ("amount", "i8")])
    t = symbol("t", "var * {name: string, amount: int64}")

    distinct_names = compute(t["name"].distinct(), x)
    assert eq(distinct_names, np.unique(x["name"]))

    distinct_rows = compute(t.distinct(), x)
    assert eq(distinct_rows, np.unique(x))
def test_concat_mat():
    """concat of two 5 x 3 matrices stacks along axis 0 by default, or axis 1."""
    s_data = np.arange(15).reshape(5, 3)
    t_data = np.arange(15, 30).reshape(5, 3)
    s = symbol("s", discover(s_data))
    t = symbol("t", discover(t_data))

    stacked = compute(concat(s, t), {s: s_data, t: t_data})
    assert (stacked == np.arange(30).reshape(10, 3)).all()

    side_by_side = compute(concat(s, t, axis=1), {s: s_data, t: t_data})
    assert (side_by_side == np.concatenate((s_data, t_data), axis=1)).all()
def test_multi_dataset_broadcast():
    """Element-wise expressions over two symbols bound to separate lists."""
    x = symbol('x', '3 * int')
    y = symbol('y', '3 * int')
    a = [1, 2, 3]
    b = [10, 20, 30]

    def evaluate(expr):
        # A fresh scope dict per call, binding each symbol to its list.
        return list(compute(expr, {x: a, y: b}))

    assert evaluate(x + y) == [11, 22, 33]
    assert evaluate(2*x + (y + 1)) == [13, 25, 37]
def test_compound(self):
    """cos(m)**2 + sin(m)**2 of the mean amount equals the math-module value."""
    mean_expr = t.amount.mean()
    mean_value = compute(mean_expr, data)
    assert isinstance(mean_value, float)

    identity = cos(mean_expr) ** 2 + sin(mean_expr) ** 2
    result = compute(identity, data)
    expected = math.cos(mean_value) ** 2 + math.sin(mean_value) ** 2
    assert result == expected
def test_summary_on_ndarray():
    """summary over an ndarray yields (min, total); keepdims keeps ax's ndim."""
    plain = compute(summary(total=a.sum(), min=a.min()), ax)
    assert plain == (ax.min(), ax.sum())

    result = compute(summary(total=a.sum(), min=a.min(), keepdims=True), ax)
    fields = [('min', 'float32'), ('total', 'float64')]
    expected = np.array([(ax.min(), ax.sum())], dtype=fields)

    assert result.ndim == ax.ndim
    assert eq(expected, result)
def test_sort():
    """Sorting by one or several columns matches pandas' own sorting.

    The expected frames are built with ``DataFrame.sort_values``;
    ``DataFrame.sort`` was deprecated in pandas 0.17 and removed in 0.20.
    """
    tm.assert_frame_equal(compute(t.sort('amount'), df),
                          df.sort_values(by='amount'))

    tm.assert_frame_equal(compute(t.sort('amount', ascending=True), df),
                          df.sort_values(by='amount', ascending=True))

    tm.assert_frame_equal(compute(t.sort(['amount', 'id']), df),
                          df.sort_values(by=['amount', 'id']))
def test_truncate_datetime():
    """Truncating to 2 days works on a scalar datetime and on a collection."""
    moment = datetime(2002, 1, 3, 12, 30)
    truncated = date(2002, 1, 2)

    scalar = symbol('x', 'datetime')
    assert compute(scalar.truncate(2, 'days'), moment) == truncated

    collection = symbol('x', 'var * datetime')
    result = compute(collection.truncate(2, 'days'), [moment])
    assert list(result) == [truncated]
def test_sort():
    """Sorting row tuples by one or two columns matches builtin sorted()."""
    by_amount = sorted(data, key=lambda row: row[1])

    result = list(compute(t.sort('amount'), data))
    assert result == by_amount

    result = list(compute(t.sort('amount', ascending=True), data))
    assert result == by_amount

    result = list(compute(t.sort(['amount', 'id']), data))
    assert result == sorted(data, key=lambda row: (row[1], row[2]))
def test_columns_series():
    """A column and a comparison on a column both compute to a Series."""
    column = compute(t['amount'], df)
    assert isinstance(column, Series)

    mask = compute(t['amount'] > 150, df)
    assert isinstance(mask, Series)
def test_reductions_on_dataframes():
    """count over the frame is 3; with keepdims the result has shape (1,)."""
    row_count = compute(count(t), df)
    assert row_count == 3

    kept = compute(count(t, keepdims=True), df)
    assert shape(kept) == (1, )
def test_notnull():
    """notnull on the name column matches pandas' notnull element-wise."""
    result = compute(nt.name.notnull(), ndf)
    expected = ndf.name.notnull()
    assert (result == expected).all()
def test_str_concat():
    """Adding a string literal to a string column appends it to every element."""
    a = Series(('a', 'b', 'c'))
    s = symbol('s', "3 * string[1, 'U32']")

    result = compute(s + 'a', a)
    assert (result == (a + 'a')).all()
def test_neg():
    """Unary minus on a column equals unary minus on the pandas column."""
    negated = compute(-t['amount'], df)
    assert_series_equal(negated, -df['amount'])
def test_summary_keepdims():
    """summary with keepdims=True computes to a one-row DataFrame."""
    expr = summary(count=t.id.count(), sum=t.amount.sum(), keepdims=True)
    result = compute(expr, df)
    expected = DataFrame([[3, 350]], columns=['count', 'sum'])
    tm.assert_frame_equal(result, expected)
def test_field_on_series():
    """Accessing the field named like the symbol returns the Series itself."""
    expr = symbol('s', 'var * int')
    data = Series([1, 2, 3, 4], name='s')

    result = compute(expr.s, data)
    assert_series_equal(result, data)
def test_by_one():
    """Summing amount by name matches a pandas groupby with renamed columns."""
    grouped = by(t['name'], total=t['amount'].sum())
    result = compute(grouped, df)

    expected = df.groupby('name')['amount'].sum().reset_index()
    expected.columns = ['name', 'total']

    tm.assert_frame_equal(result, expected)
def test_1d_reductions_keepdims():
    """Every listed reduction with keepdims=True returns the input's type."""
    series = df['amount']
    reductions = [sum, min, max, nunique, count, std, var]
    for reduction in reductions:
        expr = reduction(t.amount, keepdims=True)
        result = compute(expr, {t.amount: series})
        assert type(result) == type(series)
def test_unary_op():
    """exp of a column matches np.exp of the pandas column element-wise."""
    result = compute(exp(t['amount']), df)
    expected = np.exp(df['amount'])
    assert (result == expected).all()
def test_count_keepdims_frame():
    """count(keepdims=True) on a frame gives a one-element 's_count' Series.

    The expected value is the frame's row count, NaN row included.
    """
    df = pd.DataFrame({'a': [1, 2, 3, np.nan]})
    s = symbol('s', discover(df))

    result = compute(s.count(keepdims=True), df)
    expected = pd.Series([df.shape[0]], name='s_count')
    assert_series_equal(result, expected)
def test_abs():
    """abs of a column matches abs of the pandas column element-wise."""
    result = compute(abs(t['amount']), df)
    expected = abs(df['amount'])
    assert (result == expected).all()
def test_reductions():
    """Scalar reductions and single-element indexing over pandas columns."""
    amount = t['amount']

    # Basic aggregates.
    assert compute(mean(amount), df) == 350 / 3
    assert compute(count(amount), df) == 3
    assert compute(sum(amount), df) == 100 + 200 + 50
    assert compute(min(amount), df) == 50
    assert compute(max(amount), df) == 200

    # Distinct counts.
    assert compute(nunique(amount), df) == 3
    assert compute(nunique(t['name']), df) == 2

    # Boolean reductions must come back as real bools.
    assert compute(any(amount > 150), df) is True
    assert compute(any(amount > 250), df) is False

    # Population (default) versus sample (unbiased=True) spread.
    assert compute(var(amount), df) == df.amount.var(ddof=0)
    assert compute(var(amount, unbiased=True), df) == df.amount.var()
    assert compute(std(amount), df) == df.amount.std(ddof=0)
    assert compute(std(amount, unbiased=True), df) == df.amount.std()

    # First and last element by position.
    assert compute(t.amount[0], df) == df.amount.iloc[0]
    assert compute(t.amount[-1], df) == df.amount.iloc[-1]
def test_by_with_complex_summary():
    """A by() whose summary combines reductions keeps field order and totals."""
    combined_total = t.amount.sum() + t.id.sum() - 1
    expr = by(t.name, total=combined_total, a=t.id.min())

    result = compute(expr, df)

    assert list(result.columns) == expr.fields
    assert list(result.total) == [150 + 4 - 1, 200 + 2 - 1]
def test_coerce_series():
    """Coercing a Series of digit strings to int64 yields the integers."""
    s = pd.Series(['1', '2', '3'], name='a')
    t = symbol('t', discover(s))

    coerced = compute(t.coerce(to='int64'), s)
    assert_series_equal(coerced, pd.Series([1, 2, 3], name=s.name))
def test_datetime_truncation_same_as_python():
    """Truncating 2000-01-01T12:10 to two weeks gives the date 1999-12-26."""
    data = Series(['2000-01-01T12:10:00Z', '2000-06-25T12:35:12Z'],
                  dtype='M8[ns]')
    s = symbol('s', 'var * datetime')

    first_moment = data[0].to_pydatetime()
    result = compute(s.truncate(weeks=2), first_moment)
    assert result == datetime(1999, 12, 26).date()
def test_eq():
    """Comparing a column to a constant yields the same mask as pandas."""
    mask = compute(t['amount'] == 100, df)
    assert_series_equal(mask, df['amount'] == 100)
def test_nelements():
    """nelements() and nrows both compute to the frame's length."""
    row_total = len(df)
    assert compute(t.nelements(), df) == row_total
    assert compute(t.nrows, df) == row_total
def test_str_interp():
    """interp on a column of format strings matches the Series ``%`` operator."""
    a = Series(('%s', '%s', '%s'))
    s = symbol('s', "3 * string[1, 'U32']")

    result = compute(s.interp(1), a)
    assert (result == (a % 1)).all()
def test_series_slice():
    """Integer and slice indexing of a column matches ``Series.iloc``."""
    # Single positions compare as scalars.
    for position in (0, 2):
        assert compute(t.amount[position], df) == df.amount.iloc[position]

    # Slices ([:2], [1:3], [1::2]) compare as Series.
    windows = (slice(None, 2), slice(1, 3), slice(1, None, 2))
    for window in windows:
        assert_series_equal(compute(t.amount[window], df),
                            df.amount.iloc[window])
def test_isnan():
    """isnan on the amount column matches pandas' isnull element-wise."""
    result = compute(nt.amount.isnan(), ndf)
    expected = ndf.amount.isnull()
    assert (result == expected).all()
def test_timedelta_arith():
    """Adding or subtracting a one-day timedelta matches pandas arithmetic."""
    series = Series(pd.date_range('2014-01-01', '2014-02-01'))
    sym = symbol('s', discover(series))
    delta = timedelta(days=1)

    shifted_forward = compute(sym + delta, series)
    assert (shifted_forward == series + delta).all()

    shifted_back = compute(sym - delta, series)
    assert (shifted_back == series - delta).all()
def test_neg_projection():
    """Negating a two-column projection matches negating the pandas selection.

    ``df[['amount', 'id']]`` is a DataFrame, so the comparison uses the
    frame assertion rather than the Series one.
    """
    result = compute(-t[['amount', 'id']], df)
    expected = -df[['amount', 'id']]
    tm.assert_frame_equal(result, expected)
def test_selection():
    """Filtering rows by a predicate matches pandas boolean indexing."""
    zero_rows = compute(t[t['amount'] == 0], df)
    tm.assert_frame_equal(zero_rows, df[df['amount'] == 0])

    large_rows = compute(t[t['amount'] > 150], df)
    tm.assert_frame_equal(large_rows, df[df['amount'] > 150])
def test_nunique_table():
    """nunique on the whole table equals the number of de-duplicated rows."""
    distinct_rows = compute(t.nunique(), df)
    assert distinct_rows == len(df.drop_duplicates())
def test_strlen():
    """strlen on the name column gives lengths 5, 3, 5 once the index is reset."""
    lengths = compute(t.name.strlen(), df)
    result = lengths.reset_index(drop=True)
    expected = pd.Series([5, 3, 5], name='name')
    assert_series_equal(expected, result)
def test_isin(keys):
    """Selecting rows whose id is in ``keys`` matches pandas' isin filter."""
    selection = t[t.id.isin(keys)]
    result = compute(selection, df)

    mask = df.id.isin(keys)
    expected = df.loc[mask]
    tm.assert_frame_equal(result, expected)
def test_projection():
    """Projecting two columns matches pandas column selection."""
    columns = ['name', 'id']
    projected = compute(t[['name', 'id']], df)
    tm.assert_frame_equal(projected, df[columns])
def test_str_repeat():
    """repeat(3) on a string column matches multiplying the Series by 3."""
    a = Series(('a', 'b', 'c'))
    s = symbol('s', "3 * string[1, 'U32']")

    result = compute(s.repeat(3), a)
    assert (result == (a * 3)).all()