def test_by_columns():
    t = TableSymbol("t", "{name: string, amount: int32, id: int32}")

    assert len(by(t["id"], total=t["amount"].sum()).fields) == 2
    assert len(by(t["id"], count=t["id"].count()).fields) == 2
    print(by(t, count=t.count()).fields)
    assert len(by(t, count=t.count()).fields) == 4

def test_by_raises_informative_error_on_old_syntax():
    s = symbol('t', 'var * {name: string, amount: int}')
    try:
        by(s.name, s.amount.sum())
        assert False
    except ValueError as e:
        assert 'please' in str(e).lower()

def test_by_count():
    (chunk, chunk_expr), (agg, agg_expr) = \
        split(t, by(t.name, total=t.amount.count()))

    assert chunk_expr.isidentical(by(chunk.name, total=chunk.amount.count()))
    assert agg_expr.isidentical(by(agg.name, total=agg.total.sum()))

def test_by_columns():
    t = symbol('t', 'var * {name: string, amount: int32, id: int32}')

    assert len(by(t['id'], total=t['amount'].sum()).fields) == 2
    assert len(by(t['id'], count=t['id'].count()).fields) == 2
    print(by(t, count=t.count()).fields)
    assert len(by(t, count=t.count()).fields) == 4

def test_lean_by_with_summary():
    assert lean_projection(by(t.x, total=t.y.sum()))._child.isidentical(
        t[["x", "y"]])

    tt = t[["x", "y"]]
    result = lean_projection(by(t.x, a=t.y.sum(), b=t.z.sum())[["x", "a"]])
    expected = Projection(By(Field(tt, "x"),
                             summary(a=sum(Field(tt, "y")))),
                          ("x", "a"))
    assert result.isidentical(expected)

def test_by_columns():
    t = TableSymbol('t', '{name: string, amount: int32, id: int32}')

    assert len(by(t['id'], t['amount'].sum()).fields) == 2
    assert len(by(t['id'], t['id'].count()).fields) == 2
    print(by(t, t.count()).fields)
    assert len(by(t, t.count()).fields) == 4

def test_count_values():
    t = symbol('t', 'var * {name: string, amount: int, city: string}')
    assert t.name.count_values(sort=False).isidentical(
        by(t.name, count=t.name.count()),
    )
    assert t.name.count_values(sort=True).isidentical(
        by(t.name, count=t.name.count()).sort('count', ascending=False),
    )

def test_by():
    assert set(compute(by(t.name, sum=t.amount.sum()), c)) == \
        set([('Alice', -200), ('Bob', 200), ('Charlie', 400), ('Edith', 200)])
    assert set(compute(by(t.name, count=t.amount.count()), c)) == \
        set([('Alice', 2), ('Bob', 1), ('Charlie', 1), ('Edith', 1)])

def test_by(bank):
    assert set(compute(by(t.name, total=t.amount.sum()), bank)) == \
        set([('Alice', 300), ('Bob', 600)])
    assert set(compute(by(t.name, min=t.amount.min()), bank)) == \
        set([('Alice', 100), ('Bob', 100)])
    assert set(compute(by(t.name, max=t.amount.max()), bank)) == \
        set([('Alice', 200), ('Bob', 300)])
    assert set(compute(by(t.name, count=t.name.count()), bank)) == \
        set([('Alice', 2), ('Bob', 3)])

def test_by(bank):
    assert set(compute(by(t.name, t.amount.sum()), bank)) == \
        set([('Alice', 300), ('Bob', 600)])
    assert set(compute(by(t.name, t.amount.min()), bank)) == \
        set([('Alice', 100), ('Bob', 100)])
    assert set(compute(by(t.name, t.amount.max()), bank)) == \
        set([('Alice', 200), ('Bob', 300)])
    assert set(compute(by(t.name, t.name.count()), bank)) == \
        set([('Alice', 2), ('Bob', 3)])

def test_by_sum():
    (chunk, chunk_expr), (agg, agg_expr) = \
        split(t, by(t.name, total=t.amount.sum()))

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(by(chunk.name, total=chunk.amount.sum()))

    assert not isscalar(agg.dshape.measure)
    assert agg_expr.isidentical(by(agg.name, total=agg.total.sum()))

def test_by_with_single_field_child():
    x = symbol('x', 'var * int')
    (chunk, chunk_expr), (agg, agg_expr) = split(x, by(x, total=x.sum()))

    assert chunk_expr.isidentical(by(chunk, total=chunk.sum()))
    assert agg_expr.isidentical(
        by(agg[agg.fields[0]],
           total=agg.total.sum()).relabel({agg.fields[0]: 'x'}))

def test_summary_by():
    expr = by(t.name, summary(count=t.id.count(), sum=t.amount.sum()))
    assert str(compute(expr, df)) == \
        str(DataFrame([['Alice', 2, 150],
                       ['Bob', 1, 200]],
                      columns=['name', 'count', 'sum']))

    expr = by(t.name, summary(count=t.id.count(), sum=(t.amount + 1).sum()))
    assert str(compute(expr, df)) == \
        str(DataFrame([['Alice', 2, 152],
                       ['Bob', 1, 201]],
                      columns=['name', 'count', 'sum']))

def test_by():
    with collection(bank) as coll:
        assert set(compute(by(t, t.name, t.amount.sum()), coll)) == \
            set([('Alice', 300), ('Bob', 600)])
        assert set(compute(by(t, t.name, t.amount.min()), coll)) == \
            set([('Alice', 100), ('Bob', 100)])
        assert set(compute(by(t, t.name, t.amount.max()), coll)) == \
            set([('Alice', 200), ('Bob', 300)])
        assert set(compute(by(t, t.name, t.name.count()), coll)) == \
            set([('Alice', 2), ('Bob', 3)])

def test_by_mean():
    (chunk, chunk_expr), (agg, agg_expr) = \
        split(t, by(t.name, avg=t.amount.mean()))

    assert chunk_expr.isidentical(by(chunk.name,
                                     avg_total=chunk.amount.sum(),
                                     avg_count=chunk.amount.count()))
    assert agg_expr.isidentical(by(agg.name,
                                   avg=(agg.avg_total.sum() /
                                        agg.avg_count.sum())))

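# A minimal sketch (not from the test suite) of the decomposition ``split``
# performs for ``mean`` above: chunks emit partial (sum, count) pairs per
# key, and the aggregate step divides only once at the end.  The helper
# names ``chunk_mean_parts`` and ``combine_mean_parts`` are hypothetical,
# written in plain Python purely for illustration.
from collections import defaultdict


def chunk_mean_parts(rows):
    """Per-chunk pass: accumulate [avg_total, avg_count] per key."""
    acc = defaultdict(lambda: [0, 0])
    for key, value in rows:
        acc[key][0] += value
        acc[key][1] += 1
    return dict(acc)


def combine_mean_parts(parts):
    """Aggregate pass: sum the partial sums and counts, then divide."""
    totals = defaultdict(lambda: [0, 0])
    for part in parts:
        for key, (s, c) in part.items():
            totals[key][0] += s
            totals[key][1] += c
    return {key: s / c for key, (s, c) in totals.items()}


assert combine_mean_parts([
    chunk_mean_parts([('Alice', 100), ('Bob', 200)]),
    chunk_mean_parts([('Alice', 300)]),
]) == {'Alice': 200.0, 'Bob': 200.0}
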
def test_summary_by():
    expr = by(t.name, summary(count=t.id.count(), sum=t.amount.sum()))
    result = compute(expr, df)
    expected = DataFrame([['Alice', 2, 150],
                          ['Bob', 1, 200]],
                         columns=['name', 'count', 'sum'])
    tm.assert_frame_equal(result, expected)

    expr = by(t.name, summary(count=t.id.count(), sum=(t.amount + 1).sum()))
    result = compute(expr, df)
    expected = DataFrame([['Alice', 2, 152],
                          ['Bob', 1, 201]],
                         columns=['name', 'count', 'sum'])
    tm.assert_frame_equal(result, expected)

def test_summary_by():
    expr = by(t.name, summary(count=t.id.count(), sum=t.amount.sum()))
    assert set(compute(expr, data)) == set([('Alice', 2, 150),
                                            ('Bob', 1, 200)])

    expr = by(t.name, summary(count=t.id.count(), sum=(t.amount + 1).sum()))
    assert set(compute(expr, data)) == set([('Alice', 2, 152),
                                            ('Bob', 1, 201)])

    expr = by(t.name, summary(count=t.id.count(), sum=t.amount.sum() + 1))
    assert set(compute(expr, data)) == set([('Alice', 2, 151),
                                            ('Bob', 1, 201)])

def test_aliased_views_with_two_group_bys():
    expr = by(bank.name, total=bank.amount.sum())
    expr2 = by(expr.total, count=expr.name.count())

    result = compute(expr2, {bank: sql_bank, cities: sql_cities})

    assert normalize(str(result)) == normalize("""
    SELECT alias.total, count(alias.name) as count
    FROM (SELECT bank.name AS name, sum(bank.amount) AS total
          FROM bank
          GROUP BY bank.name) as alias
    GROUP BY alias.total
    """)

def test_aliased_views_more():
    metadata = sa.MetaData()
    lhs = sa.Table('aaa', metadata,
                   sa.Column('x', sa.Integer),
                   sa.Column('y', sa.Integer),
                   sa.Column('z', sa.Integer))
    rhs = sa.Table('bbb', metadata,
                   sa.Column('w', sa.Integer),
                   sa.Column('x', sa.Integer),
                   sa.Column('y', sa.Integer))

    L = symbol('L', 'var * {x: int, y: int, z: int}')
    R = symbol('R', 'var * {w: int, x: int, y: int}')

    expr = join(by(L.x, y_total=L.y.sum()), R)
    result = compute(expr, {L: lhs, R: rhs})

    assert normalize(str(result)) == normalize("""
    SELECT alias.x, alias.y_total, bbb.w, bbb.y
    FROM (SELECT aaa.x as x, sum(aaa.y) as y_total
          FROM aaa
          GROUP BY aaa.x) AS alias
    JOIN bbb ON alias.x = bbb.x
    """)

    expr2 = by(expr.w, count=expr.x.count(), total2=expr.y_total.sum())
    result2 = compute(expr2, {L: lhs, R: rhs})

    assert (
        normalize(str(result2)) == normalize("""
        SELECT alias_2.w, count(alias_2.x) as count, sum(alias_2.y_total) as total2
        FROM (SELECT alias.x, alias.y_total, bbb.w, bbb.y
              FROM (SELECT aaa.x as x, sum(aaa.y) as y_total
                    FROM aaa
                    GROUP BY aaa.x) AS alias
              JOIN bbb ON alias.x = bbb.x) AS alias_2
        GROUP BY alias_2.w""") or
        normalize(str(result2)) == normalize("""
        SELECT bbb.w, count(alias.x) as count, sum(alias.y_total) as total2
        FROM (SELECT aaa.x as x, sum(aaa.y) as y_total
              FROM aaa
              GROUP BY aaa.x) as alias
        JOIN bbb ON alias.x = bbb.x
        GROUP BY bbb.w"""))

def test_aliased_views_with_join():
    joined = join(bank, cities)
    expr = by(joined.city, total=joined.amount.sum())
    expr2 = by(expr.total, count=expr.city.nunique())

    result = compute(expr2, {bank: sql_bank, cities: sql_cities})

    assert normalize(str(result)) == normalize("""
    SELECT alias.total, count(DISTINCT alias.city) AS count
    FROM (SELECT cities.city AS city, sum(bank.amount) AS total
          FROM bank
          JOIN cities ON bank.name = cities.name
          GROUP BY cities.city) as alias
    GROUP BY alias.total
    """)

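# The SQL assertions above compare generated statements through
# ``normalize``.  Below is a minimal sketch of the behaviour they rely on,
# namely collapsing whitespace and case so that equivalent SELECTs compare
# equal; the project's real ``normalize`` also canonicalizes generated alias
# names, which this hypothetical ``normalize_sketch`` deliberately omits.
import re


def normalize_sketch(query):
    """Collapse runs of whitespace and lowercase the statement."""
    return re.sub(r'\s+', ' ', query).strip().lower()


assert normalize_sketch('SELECT x\n  FROM t') == normalize_sketch('select x from t')
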
def test_by_multi_column_grouper():
    t = TableSymbol('t', '{x: int, y: int, z: int}')
    expr = by(t[['x', 'y']], t['z'].count())
    data = [(1, 2, 0), (1, 2, 0), (1, 1, 0)]

    print(set(compute(expr, data)))
    assert set(compute(expr, data)) == set([(1, 2, 2), (1, 1, 1)])

def compute_up(t, rdd, **kwargs):
    grouper = optimize(t.grouper, rdd)
    apply = optimize(t.apply, rdd)
    t = by(grouper, apply)
    if ((isinstance(t.apply, Reduction) and type(t.apply) in binops) or
        (isinstance(t.apply, Summary) and
         builtins.all(type(val) in binops for val in t.apply.values))):
        grouper, binop, combiner, initial = reduce_by_funcs(t)

        if isscalar(t.grouper.dshape.measure):
            keyfunc = lambda x: (x,)
        else:
            keyfunc = identity
        if isscalar(t.apply.dshape.measure):
            valfunc = lambda x: (x,)
        else:
            valfunc = identity
        unpack = lambda kv: keyfunc(kv[0]) + valfunc(kv[1])

        create = lambda v: binop(initial, v)

        return (rdd.keyBy(grouper)
                   .combineByKey(create, binop, combiner)
                   .map(unpack))
    else:
        raise NotImplementedError("By only implemented for common reductions."
                                  "\nGot %s" % type(t.apply))

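# A sketch (illustration only, no Spark needed) of the create/binop/combiner
# protocol that ``reduce_by_funcs`` hands to ``combineByKey`` above: each
# partition folds its own values per key, then the per-partition accumulators
# are merged.  ``simulate_combine_by_key`` is a hypothetical stand-in for the
# RDD machinery, not part of Blaze or PySpark.
def simulate_combine_by_key(pairs, create, binop, combiner):
    # Phase 1: fold within each "partition", as an executor would.
    mid = len(pairs) // 2
    partials = []
    for partition in (pairs[:mid], pairs[mid:]):
        acc = {}
        for key, value in partition:
            acc[key] = binop(acc[key], value) if key in acc else create(value)
        partials.append(acc)
    # Phase 2: merge the per-partition accumulators, as the shuffle would.
    merged = {}
    for acc in partials:
        for key, value in acc.items():
            merged[key] = combiner(merged[key], value) if key in merged else value
    return merged


# Grouped sum: ``initial`` is 0, so ``create`` folds it into the first value.
initial = 0
result = simulate_combine_by_key(
    [('a', 1), ('b', 2), ('a', 3), ('b', 4)],
    create=lambda v: initial + v,
    binop=lambda acc, v: acc + v,
    combiner=lambda x, y: x + y,
)
assert result == {'a': 4, 'b': 6}
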
def test_join_by_arcs():
    df_idx = DataFrame([['A', 1], ['B', 2], ['C', 3]],
                       columns=['name', 'node_id'])
    df_arc = DataFrame([[1, 3], [2, 3], [3, 1]],
                       columns=['node_out', 'node_id'])

    t_idx = Symbol('t_idx', 'var * {name: string, node_id: int32}')
    t_arc = Symbol('t_arc', 'var * {node_out: int32, node_id: int32}')

    joined = join(t_arc, t_idx, "node_id")

    want = by(joined['name'], joined['node_id'].count())
    result = compute(want, {t_arc: df_arc, t_idx: df_idx})

    result_pandas = pd.merge(df_arc, df_idx, on='node_id')
    expected = result_pandas.groupby('name')['node_id'].count().reset_index()
    assert str(result.values) == str(expected.values)
    assert list(result.columns) == ['name', 'node_id_count']

def test_by_multi_column_grouper():
    t = Symbol('t', 'var * {x: int, y: int, z: int}')
    expr = by(t[['x', 'y']], t['z'].count())
    data = [(1, 2, 0), (1, 2, 0), (1, 1, 0)]

    print(set(compute(expr, data)))
    assert set(compute(expr, data)) == set([(1, 2, 2), (1, 1, 1)])

def test_by_two():
    result = compute(by(tbig[['name', 'sex']], tbig['amount'].sum()),
                     databig)
    expected = [('Alice', 'F', 200), ('Drew', 'F', 100), ('Drew', 'M', 300)]

    print(set(result))
    assert set(result) == set(expected)

def test_join_by_arcs():
    df_idx = DataFrame([['A', 1], ['B', 2], ['C', 3]],
                       columns=['name', 'node_id'])
    df_arc = DataFrame([[1, 3], [2, 3], [3, 1]],
                       columns=['node_out', 'node_id'])

    t_idx = TableSymbol('t_idx', '{name: string, node_id: int32}')
    t_arc = TableSymbol('t_arc', '{node_out: int32, node_id: int32}')

    joined = join(t_arc, t_idx, "node_id")

    want = by(joined['name'], joined['node_id'].count())
    result = compute(want, {t_arc: df_arc, t_idx: df_idx})

    result_pandas = pd.merge(df_arc, df_idx, on='node_id')
    expected = result_pandas.groupby('name')['node_id'].count().reset_index()
    assert str(result.values) == str(expected.values)
    assert list(result.columns) == ['name', 'node_id_count']

def test_by():
    t = symbol('t', 'var * {name: string, amount: int32, id: int32}')
    r = by(t['name'], total=sum(t['amount']))

    print(r.schema)
    assert isinstance(r.schema[0], Record)
    assert str(r.schema[0]['name']) == 'string'

def test_by():
    t = TableSymbol("t", "{name: string, amount: int32, id: int32}")
    r = by(t["name"], total=sum(t["amount"]))

    print(r.schema)
    assert isinstance(r.schema[0], Record)
    assert str(r.schema[0]["name"]) == "string"

def test_path_split():
    expr = t.amount.sum() + 1
    assert path_split(t, expr).isidentical(t.amount.sum())

    expr = t.amount.distinct().sort()
    assert path_split(t, expr).isidentical(t.amount.distinct())

    t2 = transform(t, id=t.id * 2)
    expr = by(t2.id, amount=t2.amount.sum()).amount + 1
    assert path_split(t, expr).isidentical(by(t2.id, amount=t2.amount.sum()))

    expr = count(t.amount.distinct())
    assert path_split(t, expr).isidentical(t.amount.distinct())

    expr = summary(total=t.amount.sum())
    assert path_split(t, expr).isidentical(expr)

def test_by():
    t = TableSymbol('t', '{name: string, amount: int32, id: int32}')
    r = by(t['name'], sum(t['amount']))

    print(r.schema)
    assert isinstance(r.schema[0], Record)
    assert str(r.schema[0]['name']) == 'string'

def test_join_by_arcs():
    df_idx = DataFrame([['A', 1], ['B', 2], ['C', 3]],
                       columns=['name', 'node_id'])
    df_arc = DataFrame([[1, 3], [2, 3], [3, 1]],
                       columns=['node_out', 'node_id'])

    t_idx = symbol('t_idx', 'var * {name: string, node_id: int32}')
    t_arc = symbol('t_arc', 'var * {node_out: int32, node_id: int32}')

    joined = join(t_arc, t_idx, "node_id")

    want = by(joined['name'], count=joined['node_id'].count())
    result = compute(want, {t_arc: df_arc, t_idx: df_idx})

    result_pandas = pd.merge(df_arc, df_idx, on='node_id')
    gb = result_pandas.groupby('name')
    expected = gb.node_id.count().reset_index().rename(
        columns={'node_id': 'count'})

    tm.assert_frame_equal(result, expected)
    assert list(result.columns) == ['name', 'count']

def test_complex_group_by():
    expr = by(merge(tbig.amount // 10, tbig.id % 2), count=tbig.name.count())
    result = compute(expr, dfbig)
    # can we do this? yes we can!
    expected = dfbig.groupby([dfbig.amount // 10,
                              dfbig.id % 2])['name'].count().reset_index()
    expected = expected.rename(columns={'name': 'count'})
    tm.assert_frame_equal(result, expected)

def test_agg_shape_in_tabular_case_with_explicit_chunk():
    t = symbol('t', '1000 * {name: string, amount: int, id: int}')
    c = symbol('chunk', 100 * t.schema)

    expr = by(t.name, total=t.amount.sum())
    (chunk, chunk_expr), (agg, agg_expr) = split(t, expr, chunk=c)

    assert agg.dshape == dshape('var * {name: string, total: int64}')

def test_by_four():
    t = tbig[['sex', 'amount']]
    expr = by(t['sex'], max=t['amount'].max())
    result = compute(expr, dfbig)

    expected = DataFrame([['F', 100], ['M', 200]], columns=['sex', 'max'])
    tm.assert_frame_equal(result, expected)

def test_aliased_views_with_computation():
    engine = sa.create_engine('sqlite:///:memory:')

    df_aaa = DataFrame({'x': [1, 2, 3, 2, 3],
                        'y': [2, 1, 2, 3, 1],
                        'z': [3, 3, 3, 1, 2]})
    df_bbb = DataFrame({'w': [1, 2, 3, 2, 3],
                        'x': [2, 1, 2, 3, 1],
                        'y': [3, 3, 3, 1, 2]})

    df_aaa.to_sql('aaa', engine)
    df_bbb.to_sql('bbb', engine)

    metadata = sa.MetaData(engine)
    metadata.reflect()

    sql_aaa = metadata.tables['aaa']
    sql_bbb = metadata.tables['bbb']

    L = symbol('aaa', discover(df_aaa))
    R = symbol('bbb', discover(df_bbb))

    expr = join(by(L.x, y_total=L.y.sum()), R)
    a = compute(expr, {L: df_aaa, R: df_bbb})
    b = compute(expr, {L: sql_aaa, R: sql_bbb})
    assert into(set, a) == into(set, b)

    expr2 = by(expr.w, count=expr.x.count(), total2=expr.y_total.sum())
    a = compute(expr2, {L: df_aaa, R: df_bbb})
    b = compute(expr2, {L: sql_aaa, R: sql_bbb})
    assert into(set, a) == into(set, b)

    expr3 = by(expr.x, count=expr.y_total.count())
    a = compute(expr3, {L: df_aaa, R: df_bbb})
    b = compute(expr3, {L: sql_aaa, R: sql_bbb})
    assert into(set, a) == into(set, b)

    expr4 = join(expr2, R)
    a = compute(expr4, {L: df_aaa, R: df_bbb})
    b = compute(expr4, {L: sql_aaa, R: sql_bbb})
    assert into(set, a) == into(set, b)

def test_by():
    expr = by(t['name'], total=t['amount'].sum())
    result = compute(expr, s)
    expected = sa.select([s.c.name,
                          sa.sql.functions.sum(s.c.amount).label('total')]
                         ).group_by(s.c.name)

    assert str(result) == str(expected)

def test_by_four():
    t = tbig[['sex', 'amount']]
    expr = by(t['sex'], t['amount'].max())
    result = compute(expr, dfbig)

    expected = DataFrame([['F', 100], ['M', 200]],
                         columns=['sex', 'amount_max'])
    assert str(result) == str(expected)

def test_by_on_same_column():
    df = pd.DataFrame([[1, 2], [1, 4], [2, 9]], columns=['id', 'value'])
    t = symbol('data', 'var * {id: int, value: int}')

    gby = by(t['id'], count=t['id'].count())

    expected = DataFrame([[1, 2], [2, 1]], columns=['id', 'count'])
    result = compute(gby, {t: df})
    tm.assert_frame_equal(result, expected)

def test_by_three():
    result = compute(by(tbig[['name', 'sex']],
                        (tbig['id'] + tbig['amount']).sum()),
                     databig)

    expected = [('Alice', 'F', 204), ('Drew', 'F', 104), ('Drew', 'M', 310)]
    print(result)
    assert set(result) == set(expected)

def test_by_on_count():
    expr = by(t.name, count=t.count())
    result = compute(expr, s)

    assert normalize(str(result)) == normalize("""
    SELECT accounts.name, count(accounts.id) AS count
    FROM accounts
    GROUP BY accounts.name
    """)

def test_by_two():
    result = compute(by(tbig[['name', 'sex']], total=sum(tbig['amount'])),
                     dfbig)
    expected = DataFrame([['Alice', 'F', 200],
                          ['Drew', 'F', 100],
                          ['Drew', 'M', 300]],
                         columns=['name', 'sex', 'total'])
    tm.assert_frame_equal(result, expected)

def test_summary_by():
    expr = by(t.name, summary(a=t.amount.sum(), b=t.id.count()))
    result = str(compute(expr, s))

    assert 'sum(accounts.amount) as a' in result.lower()
    assert 'count(accounts.id) as b' in result.lower()
    assert 'group by accounts.name' in result.lower()

def test_by_two():
    expr = by(tbig[['name', 'sex']], total=tbig['amount'].sum())
    result = compute(expr, sbig)
    expected = (sa.select([sbig.c.name,
                           sbig.c.sex,
                           sa.sql.functions.sum(sbig.c.amount).label('total')])
                .group_by(sbig.c.name, sbig.c.sex))

    assert str(result) == str(expected)

def test_reduce_does_not_compose():
    expr = by(t.name, counts=t.count()).counts.max()
    result = str(compute(expr, s))
    expected = """WITH alias AS
    (SELECT count(accounts.id) AS counts
     FROM accounts
     GROUP BY accounts.name)
    SELECT max(alias.counts) AS counts_max
    FROM alias"""
    assert normalize(result) == normalize(expected)