def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]

    L = Symbol('L', 'var * {id: int, name: string, amount: real}')
    R = Symbol('R', 'var * {city: string, id: int}')

    assert set(compute(join(L, R), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='left'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='right'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='outer'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]

    L = TableSymbol('L', '{id: int, name: string, amount: real}')
    R = TableSymbol('R', '{city: string, id: int}')

    assert set(compute(join(L, R), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='left'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='right'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='outer'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])
def test_graph_double_join():
    idx = [['A', 1],
           ['B', 2],
           ['C', 3],
           ['D', 4],
           ['E', 5],
           ['F', 6]]
    arc = [[1, 3], [2, 3], [4, 3], [5, 3], [3, 1],
           [2, 1], [5, 1], [1, 6], [2, 6], [4, 6]]
    wanted = [['A'], ['F']]

    t_idx = symbol('t_idx', 'var * {name: string, b: int32}')
    t_arc = symbol('t_arc', 'var * {a: int32, b: int32}')
    t_wanted = symbol('t_wanted', 'var * {name: string}')

    # >>> compute(join(t_idx, t_arc, 'b'), {t_idx: idx, t_arc: arc})
    # [[1, A, 3],
    #  [1, A, 2],
    #  [1, A, 5],
    #  [3, C, 1],
    #  [3, C, 2],
    #  [3, C, 4],
    #  [3, C, 5],
    #  [6, F, 1],
    #  [6, F, 2],
    #  [6, F, 4]]

    j = join(join(t_idx, t_arc, 'b'), t_wanted, 'name')[['name', 'b', 'a']]

    result = compute(j, {t_idx: idx, t_arc: arc, t_wanted: wanted})
    result = sorted(map(tuple, result))
    expected = sorted([('A', 1, 3), ('A', 1, 2), ('A', 1, 5),
                       ('F', 6, 1), ('F', 6, 2), ('F', 6, 4)])

    assert result == expected
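# Illustrative cross-check (not part of the original suite): the double join
# above can be reproduced with plain pandas merges. A minimal sketch assuming
# pandas is available as ``pd``; the helper name is hypothetical.
def _double_join_pandas_sketch():
    import pandas as pd

    idx = pd.DataFrame([['A', 1], ['B', 2], ['C', 3],
                        ['D', 4], ['E', 5], ['F', 6]],
                       columns=['name', 'b'])
    arc = pd.DataFrame([[1, 3], [2, 3], [4, 3], [5, 3], [3, 1],
                        [2, 1], [5, 1], [1, 6], [2, 6], [4, 6]],
                       columns=['a', 'b'])
    wanted = pd.DataFrame([['A'], ['F']], columns=['name'])

    # First join on 'b', then restrict to the wanted names, then project.
    result = idx.merge(arc, on='b').merge(wanted, on='name')[['name', 'b', 'a']]
    assert sorted(map(tuple, result.itertuples(index=False))) == sorted(
        [('A', 1, 3), ('A', 1, 2), ('A', 1, 5),
         ('F', 6, 1), ('F', 6, 2), ('F', 6, 4)])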
def test_join():
    t = TableSymbol('t', '{name: string, amount: int}')
    s = TableSymbol('t', '{name: string, id: int}')
    j = join(t, s, 'name', 'name')

    assert j.schema == dshape('{name: string, amount: int, id: int}')
    assert join(t, s, 'name') == join(t, s, 'name')
def test_leaves(): t = TableSymbol("t", "{id: int32, name: string}") v = TableSymbol("v", "{id: int32, city: string}") x = symbol("x", "int32") assert t._leaves() == [t] assert t.id._leaves() == [t] assert by(t.name, count=t.id.nunique())._leaves() == [t] assert join(t, v)._leaves() == [t, v] assert join(v, t)._leaves() == [v, t] assert (x + 1)._leaves() == [x]
def test_leaves():
    t = TableSymbol('t', '{id: int32, name: string}')
    v = TableSymbol('v', '{id: int32, city: string}')
    x = Symbol('x', 'int32')

    assert t._leaves() == [t]
    assert t.id._leaves() == [t]
    assert by(t.name, t.id.nunique())._leaves() == [t]
    assert join(t, v)._leaves() == [t, v]
    assert join(v, t)._leaves() == [v, t]
    assert (x + 1)._leaves() == [x]
def test_leaves():
    t = symbol('t', 'var * {id: int32, name: string}')
    v = symbol('v', 'var * {id: int32, city: string}')
    x = symbol('x', 'int32')

    assert t._leaves() == [t]
    assert t.id._leaves() == [t]
    assert by(t.name, count=t.id.nunique())._leaves() == [t]
    assert join(t, v)._leaves() == [t, v]
    assert join(v, t)._leaves() == [v, t]
    assert (x + 1)._leaves() == [x]
def test_join_on_single_column():
    expr = join(cities[['name']], bank)
    result = compute(expr, {bank: sql_bank, cities: sql_cities})

    assert normalize(str(result)) == """
    SELECT bank.id, bank.name, bank.amount
    FROM bank join cities ON bank.name = cities.name"""

    expr = join(bank, cities.name)
    result = compute(expr, {bank: sql_bank, cities: sql_cities})

    assert normalize(str(result)) == """
def test_clean_join():
    metadata = sa.MetaData()
    name = sa.Table('name', metadata,
                    sa.Column('id', sa.Integer),
                    sa.Column('name', sa.String),
                    )
    city = sa.Table('place', metadata,
                    sa.Column('id', sa.Integer),
                    sa.Column('city', sa.String),
                    sa.Column('country', sa.String),
                    )
    friends = sa.Table('friends', metadata,
                       sa.Column('a', sa.Integer),
                       sa.Column('b', sa.Integer),
                       )

    tcity = symbol('city', discover(city))
    tfriends = symbol('friends', discover(friends))
    tname = symbol('name', discover(name))

    ns = {tname: name, tfriends: friends, tcity: city}

    expr = join(tfriends, tname, 'a', 'id')
    assert normalize(str(compute(expr, ns))) == normalize("""
        SELECT friends.a, friends.b, name.name
        FROM friends JOIN name on friends.a = name.id""")

    expr = join(join(tfriends, tname, 'a', 'id'), tcity, 'a', 'id')
    result = compute(expr, ns)

    expected1 = """
        SELECT friends.a, friends.b, name.name, place.city, place.country
        FROM friends
            JOIN name ON friends.a = name.id
            JOIN place ON friends.a = place.id
    """

    expected2 = """
        SELECT alias.a, alias.b, alias.name, place.city, place.country
        FROM (SELECT friends.a AS a, friends.b AS b, name.name AS name
              FROM friends JOIN name ON friends.a = name.id) AS alias
        JOIN place ON alias.a = place.id
    """

    assert (normalize(str(result)) == normalize(expected1) or
            normalize(str(result)) == normalize(expected2))
def test_path():
    from blaze.expr import TableSymbol, join
    t = TableSymbol('t', '{name: string, amount: int, id: int}')
    v = TableSymbol('v', '{city: string, id: int}')

    expr = t['amount'].sum()
    assert list(path(expr, t)) == [t.amount.sum(), t.amount, t]
    assert list(path(expr, t.amount)) == [t.amount.sum(), t.amount]
    assert list(path(expr, t.amount)) == [t.amount.sum(), t.amount]

    expr = join(t, v).amount
    assert list(path(expr, t)) == [join(t, v).amount, join(t, v), t]
    assert list(path(expr, v)) == [join(t, v).amount, join(t, v), v]
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    lsym = symbol('lsym', 'var * {id: int, name: string, amount: real}')
    rsym = symbol('rsym', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert (convert(compute(join(lsym, rsym), {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (4, 'Dennis', 400, 'Moscow')]))

    assert (convert(compute(join(lsym, rsym, how='left'),
                            {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (2, 'Bob', 200, np.nan),
                 (4, 'Dennis', 400, 'Moscow')]))

    df = compute(join(lsym, rsym, how='right'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = pdsort(df, 'id').to_records(index=False)
    expected = pdsort(expected, 'id').to_records(index=False)
    np.array_equal(result, expected)

    df = compute(join(lsym, rsym, how='outer'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (2., 'Bob', 200., np.nan),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = pdsort(df, 'id').to_records(index=False)
    expected = pdsort(expected, 'id').to_records(index=False)
    np.array_equal(result, expected)
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    lsym = symbol('lsym', 'var * {id: int, name: string, amount: real}')
    rsym = symbol('rsym', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert (convert(compute(join(lsym, rsym), {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (4, 'Dennis', 400, 'Moscow')]))

    assert (convert(compute(join(lsym, rsym, how='left'),
                            {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (2, 'Bob', 200, np.nan),
                 (4, 'Dennis', 400, 'Moscow')]))

    df = compute(join(lsym, rsym, how='right'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    np.array_equal(result, expected)

    df = compute(join(lsym, rsym, how='outer'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (2., 'Bob', 200., np.nan),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = df.sort('id').to_records(index=False)
    expected = expected.sort('id').to_records(index=False)
    np.array_equal(result, expected)
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert convert(compute(join(L, R), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (4, 'Dennis', 400, 'Moscow')])

    assert convert(compute(join(L, R, how='left'), {L: left, R: right})) == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, np.nan),
         (4, 'Dennis', 400, 'Moscow')])

    df = compute(join(L, R, how='right'), {L: left, R: right})
    expected = DataFrame(
        [(1., 'Alice', 100., 'NYC'),
         (1., 'Alice', 100., 'Boston'),
         (3., np.nan, np.nan, 'LA'),
         (4., 'Dennis', 400., 'Moscow')],
        columns=['id', 'name', 'amount', 'city'])

    assert (str(df.sort('id').to_records(index=False)) ==
            str(expected.sort('id').to_records(index=False)))

    df = compute(join(L, R, how='outer'), {L: left, R: right})
    expected = DataFrame(
        [(1., 'Alice', 100., 'NYC'),
         (1., 'Alice', 100., 'Boston'),
         (2., 'Bob', 200., np.nan),
         (3., np.nan, np.nan, 'LA'),
         (4., 'Dennis', 400., 'Moscow')],
        columns=['id', 'name', 'amount', 'city'])

    assert (str(df.sort('id').to_records(index=False)) ==
            str(expected.sort('id').to_records(index=False)))
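# For reference (not part of the original suite): the expected frames in the
# outer-join tests above mirror what pandas itself produces for the different
# ``how=`` modes. A minimal sketch assuming only pandas; the helper name is
# hypothetical.
def _outer_join_semantics_sketch():
    import pandas as pd

    left = pd.DataFrame([(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)],
                        columns=['id', 'name', 'amount'])
    right = pd.DataFrame([('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)],
                         columns=['city', 'id'])

    inner = pd.merge(left, right, on='id', how='inner')   # only ids 1 and 4 survive
    left_ = pd.merge(left, right, on='id', how='left')    # Bob kept, city becomes NaN
    right_ = pd.merge(left, right, on='id', how='right')  # LA kept, name/amount become NaN
    outer = pd.merge(left, right, on='id', how='outer')   # union of left_ and right_

    assert set(inner['id']) == {1, 4}
    assert left_.loc[left_['id'] == 2, 'city'].isnull().all()
    assert right_.loc[right_['city'] == 'LA', 'name'].isnull().all()
    assert set(outer['id']) == {1, 2, 3, 4}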
def test_multi_column_join():
    left = [(1, 2, 3),
            (2, 3, 4),
            (1, 3, 5)]
    left = DataFrame(left, columns=['x', 'y', 'z'])
    right = [(1, 2, 30),
             (1, 3, 50),
             (1, 3, 150)]
    right = DataFrame(right, columns=['x', 'y', 'w'])

    lsym = symbol('lsym', 'var * {x: int, y: int, z: int}')
    rsym = symbol('rsym', 'var * {x: int, y: int, w: int}')

    j = join(lsym, rsym, ['x', 'y'])

    expected = [(1, 2, 3, 30),
                (1, 3, 5, 50),
                (1, 3, 5, 150)]
    expected = DataFrame(expected, columns=['x', 'y', 'z', 'w'])

    result = compute(j, {lsym: left, rsym: right})

    print(result)

    tm.assert_frame_equal(result, expected)
    assert list(result.columns) == list(j.fields)
def test_join_by_arcs():
    df_idx = DataFrame([['A', 1], ['B', 2], ['C', 3]],
                       columns=['name', 'node_id'])
    df_arc = DataFrame([[1, 3], [2, 3], [3, 1]],
                       columns=['node_out', 'node_id'])

    t_idx = symbol('t_idx', 'var * {name: string, node_id: int32}')
    t_arc = symbol('t_arc', 'var * {node_out: int32, node_id: int32}')

    joined = join(t_arc, t_idx, "node_id")

    want = by(joined['name'], count=joined['node_id'].count())
    result = compute(want, {t_arc: df_arc, t_idx: df_idx})

    result_pandas = pd.merge(df_arc, df_idx, on='node_id')
    gb = result_pandas.groupby('name')
    expected = gb.node_id.count().reset_index().rename(
        columns={'node_id': 'count'})

    tm.assert_frame_equal(result, expected)
    assert list(result.columns) == ['name', 'count']
def test_join_complex_clean():
    metadata = sa.MetaData()
    name = sa.Table('name', metadata,
                    sa.Column('id', sa.Integer),
                    sa.Column('name', sa.String),
                    )
    city = sa.Table('place', metadata,
                    sa.Column('id', sa.Integer),
                    sa.Column('city', sa.String),
                    sa.Column('country', sa.String),
                    )

    tname = symbol('name', discover(name))
    tcity = symbol('city', discover(city))

    ns = {tname: name, tcity: city}

    expr = join(tname[tname.id > 0], tcity, 'id')
    result = compute(expr, ns)

    expected1 = """
        SELECT name.id, name.name, place.city, place.country
        FROM name JOIN place ON name.id = place.id
        WHERE name.id > :id_1"""

    expected2 = """
        SELECT alias.id, alias.name, place.city, place.country
        FROM (SELECT name.id as id, name.name AS name
              FROM name
              WHERE name.id > :id_1) AS alias
        JOIN place ON alias.id = place.id"""

    assert (normalize(str(result)) == normalize(expected1) or
            normalize(str(result)) == normalize(expected2))
def test_relabel_join():
    names = TableSymbol('names', '{first: string, last: string}')

    siblings = join(names.relabel({'last': 'left'}),
                    names.relabel({'last': 'right'}), 'first')

    assert siblings.fields == ['first', 'left', 'right']
def test_join_count():
    ds = datashape.dshape(
        '{t1: var * {x: int, y: int}, t2: var * {a: int, b: int}}')
    engine = resource('sqlite:///:memory:', dshape=ds)
    db = symbol('db', ds)

    expr = join(db.t1[db.t1.x > -1], db.t2, 'x', 'a').count()

    result = compute(expr, {db: engine}, post_compute=False)

    expected1 = """
    SELECT count(alias.x) as count
    FROM (SELECT t1.x AS x, t1.y AS y, t2.b AS b
          FROM t1 JOIN t2 ON t1.x = t2.a
          WHERE t1.x > ?) as alias
    """

    expected2 = """
    SELECT count(alias2.x) AS count
    FROM (SELECT alias1.x AS x, alias1.y AS y, t2.b AS b
          FROM (SELECT t1.x AS x, t1.y AS y
                FROM t1
                WHERE t1.x > ?) AS alias1
          JOIN t2 ON alias1.x = t2.a) AS alias2"""

    assert (normalize(str(result)) == normalize(expected1) or
            normalize(str(result)) == normalize(expected2))
def test_clean_complex_join():
    metadata = sa.MetaData()
    lhs = sa.Table('amounts', metadata,
                   sa.Column('name', sa.String),
                   sa.Column('amount', sa.Integer))
    rhs = sa.Table('ids', metadata,
                   sa.Column('name', sa.String),
                   sa.Column('id', sa.Integer))

    L = symbol('L', 'var * {name: string, amount: int}')
    R = symbol('R', 'var * {name: string, id: int}')

    joined = join(L[L.amount > 0], R, 'name')

    result = compute(joined, {L: lhs, R: rhs})

    expected1 = """
        SELECT amounts.name, amounts.amount, ids.id
        FROM amounts JOIN ids ON amounts.name = ids.name
        WHERE amounts.amount > :amount_1"""

    expected2 = """
        SELECT alias.name, alias.amount, ids.id
        FROM (SELECT amounts.name AS name, amounts.amount AS amount
              FROM amounts
              WHERE amounts.amount > :amount_1) AS alias
        JOIN ids ON alias.name = ids.name"""

    assert (normalize(str(result)) == normalize(expected1) or
            normalize(str(result)) == normalize(expected2))
def test_selection_of_join():
    metadata = sa.MetaData()
    name = sa.Table('name', metadata,
                    sa.Column('id', sa.Integer),
                    sa.Column('name', sa.String),
                    )
    city = sa.Table('place', metadata,
                    sa.Column('id', sa.Integer),
                    sa.Column('city', sa.String),
                    sa.Column('country', sa.String),
                    )

    tname = symbol('name', discover(name))
    tcity = symbol('city', discover(city))

    ns = {tname: name, tcity: city}

    j = join(tname, tcity, 'id')
    expr = j[j.city == 'NYC'].name
    result = compute(expr, ns)

    assert normalize(str(result)) == normalize("""
        SELECT name.name
        FROM name JOIN place ON name.id = place.id
        WHERE place.city = :city_1""")
def test_multi_column_join():
    metadata = sa.MetaData()
    lhs = sa.Table('aaa', metadata,
                   sa.Column('x', sa.Integer),
                   sa.Column('y', sa.Integer),
                   sa.Column('z', sa.Integer))
    rhs = sa.Table('bbb', metadata,
                   sa.Column('w', sa.Integer),
                   sa.Column('x', sa.Integer),
                   sa.Column('y', sa.Integer))

    L = symbol('L', 'var * {x: int, y: int, z: int}')
    R = symbol('R', 'var * {w: int, x: int, y: int}')

    joined = join(L, R, ['x', 'y'])

    expected = lhs.join(rhs, (lhs.c.x == rhs.c.x) & (lhs.c.y == rhs.c.y))
    expected = select(list(unique(expected.columns,
                                  key=lambda c: c.name))).select_from(expected)

    result = compute(joined, {L: lhs, R: rhs})

    assert str(result) == str(expected)
    assert str(select(result)) == str(select(expected))

    # Schemas match
    print(result.c.keys())
    print(joined.fields)
    assert list(result.c.keys()) == list(joined.fields)
def test_multi_column_join():
    left = [(1, 2, 3),
            (2, 3, 4),
            (1, 3, 5)]
    left = DataFrame(left, columns=['x', 'y', 'z'])
    right = [(1, 2, 30),
             (1, 3, 50),
             (1, 3, 150)]
    right = DataFrame(right, columns=['x', 'y', 'w'])

    L = symbol('L', 'var * {x: int, y: int, z: int}')
    R = symbol('R', 'var * {x: int, y: int, w: int}')

    j = join(L, R, ['x', 'y'])

    expected = [(1, 2, 3, 30),
                (1, 3, 5, 50),
                (1, 3, 5, 150)]
    expected = DataFrame(expected, columns=['x', 'y', 'z', 'w'])

    result = compute(j, {L: left, R: right})

    print(result)

    assert str(result) == str(expected)
    assert list(result.columns) == list(j.fields)
def test_relabel_join():
    names = symbol('names', 'var * {first: string, last: string}')

    siblings = join(names.relabel({'last': 'left'}),
                    names.relabel({'last': 'right'}), 'first')

    assert siblings.fields == ['first', 'left', 'right']
def test_join_by_arcs():
    df_idx = DataFrame([['A', 1], ['B', 2], ['C', 3]],
                       columns=['name', 'node_id'])
    df_arc = DataFrame([[1, 3], [2, 3], [3, 1]],
                       columns=['node_out', 'node_id'])

    t_idx = TableSymbol('t_idx', '{name: string, node_id: int32}')
    t_arc = TableSymbol('t_arc', '{node_out: int32, node_id: int32}')

    joined = join(t_arc, t_idx, "node_id")

    want = by(joined['name'], joined['node_id'].count())
    result = compute(want, {t_arc: df_arc, t_idx: df_idx})

    result_pandas = pd.merge(df_arc, df_idx, on='node_id')
    expected = result_pandas.groupby('name')['node_id'].count().reset_index()

    assert str(result.values) == str(expected.values)
    assert list(result.columns) == ['name', 'node_id_count']
def test_join_by_arcs():
    df_idx = DataFrame([['A', 1], ['B', 2], ['C', 3]],
                       columns=['name', 'node_id'])
    df_arc = DataFrame([[1, 3], [2, 3], [3, 1]],
                       columns=['node_out', 'node_id'])

    t_idx = Symbol('t_idx', 'var * {name: string, node_id: int32}')
    t_arc = Symbol('t_arc', 'var * {node_out: int32, node_id: int32}')

    joined = join(t_arc, t_idx, "node_id")

    want = by(joined['name'], joined['node_id'].count())
    result = compute(want, {t_arc: df_arc, t_idx: df_idx})

    result_pandas = pd.merge(df_arc, df_idx, on='node_id')
    expected = result_pandas.groupby('name')['node_id'].count().reset_index()

    assert str(result.values) == str(expected.values)
    assert list(result.columns) == ['name', 'node_id_count']
def test_aliased_views_with_computation():
    engine = sa.create_engine('sqlite:///:memory:')

    df_aaa = DataFrame({'x': [1, 2, 3, 2, 3],
                        'y': [2, 1, 2, 3, 1],
                        'z': [3, 3, 3, 1, 2]})
    df_bbb = DataFrame({'w': [1, 2, 3, 2, 3],
                        'x': [2, 1, 2, 3, 1],
                        'y': [3, 3, 3, 1, 2]})

    df_aaa.to_sql('aaa', engine)
    df_bbb.to_sql('bbb', engine)

    metadata = sa.MetaData(engine)
    metadata.reflect()

    sql_aaa = metadata.tables['aaa']
    sql_bbb = metadata.tables['bbb']

    L = symbol('aaa', discover(df_aaa))
    R = symbol('bbb', discover(df_bbb))

    expr = join(by(L.x, y_total=L.y.sum()), R)
    a = compute(expr, {L: df_aaa, R: df_bbb})
    b = compute(expr, {L: sql_aaa, R: sql_bbb})
    assert into(set, a) == into(set, b)

    expr2 = by(expr.w, count=expr.x.count(), total2=expr.y_total.sum())
    a = compute(expr2, {L: df_aaa, R: df_bbb})
    b = compute(expr2, {L: sql_aaa, R: sql_bbb})
    assert into(set, a) == into(set, b)

    expr3 = by(expr.x, count=expr.y_total.count())
    a = compute(expr3, {L: df_aaa, R: df_bbb})
    b = compute(expr3, {L: sql_aaa, R: sql_bbb})
    assert into(set, a) == into(set, b)

    expr4 = join(expr2, R)
    a = compute(expr4, {L: df_aaa, R: df_bbb})
    b = compute(expr4, {L: sql_aaa, R: sql_bbb})
    assert into(set, a) == into(set, b)
def test_join(): cities = np.array([("Alice", "NYC"), ("Alice", "LA"), ("Bob", "Chicago")], dtype=[("name", "S7"), ("city", "O")]) c = symbol("cities", discover(cities)) expr = join(t, c, "name") result = compute(expr, {t: x, c: cities}) assert (b"Alice", 1, 100, "LA") in into(list, result)
def test_join():
    cities = np.array([('Alice', 'NYC'), ('Alice', 'LA'), ('Bob', 'Chicago')],
                      dtype=[('name', 'S7'), ('city', 'O')])
    c = symbol('cities', discover(cities))

    expr = join(t, c, 'name')
    result = compute(expr, {t: x, c: cities})

    assert (b'Alice', 1, 100, 'LA') in into(list, result)
def test_join_on_same_table():
    metadata = sa.MetaData()
    T = sa.Table('tab', metadata,
                 sa.Column('a', sa.Integer),
                 sa.Column('b', sa.Integer),
                 )

    t = symbol('tab', discover(T))

    expr = join(t, t, 'a')
    result = compute(expr, {t: T})

    assert normalize(str(result)) == normalize("""
    SELECT tab_left.a, tab_left.b, tab_right.b
    FROM tab AS tab_left JOIN tab AS tab_right
    ON tab_left.a = tab_right.a
    """)

    expr = join(t, t, 'a').b_left.sum()
    result = compute(expr, {t: T})

    assert normalize(str(result)) == normalize("""
    with alias as
        (select tab_left.b as b
         from tab as tab_left
         join tab as tab_right
         on tab_left.a = tab_right.a)
    select sum(alias.b) as b_left_sum from alias""")

    expr = join(t, t, 'a')
    expr = summary(total=expr.a.sum(), smallest=expr.b_right.min())
    result = compute(expr, {t: T})

    assert normalize(str(result)) == normalize("""
    SELECT min(tab_right.b) as smallest, sum(tab_left.a) as total
    FROM tab AS tab_left JOIN tab AS tab_right
    ON tab_left.a = tab_right.a
    """)
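# Aside (illustrative, not from the original suite): the ``b_left``/``b_right``
# field names asserted above follow the same convention pandas uses for
# overlapping non-key columns in a self-merge. A minimal sketch assuming
# pandas as ``pd``; the helper name is hypothetical.
def _self_join_suffixes_sketch():
    import pandas as pd

    tab = pd.DataFrame({'a': [1, 1, 2], 'b': [10, 20, 30]})

    # Self-join on 'a'; the overlapping column 'b' is disambiguated by suffixes.
    selfjoin = pd.merge(tab, tab, on='a', suffixes=('_left', '_right'))
    assert list(selfjoin.columns) == ['a', 'b_left', 'b_right']

    # Analogue of join(t, t, 'a').b_left.sum() in the SQL test above.
    assert selfjoin['b_left'].sum() == 90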
def test_multi_column_join(): a = TableSymbol("a", "{x: int, y: int, z: int}") b = TableSymbol("b", "{w: int, x: int, y: int}") j = join(a, b, ["x", "y"]) assert set(j.fields) == set("wxyz") assert j.on_left == j.on_right == ["x", "y"] assert hash(j) assert j.fields == ["x", "y", "z", "w"]
def test_join(): t = TableSymbol("t", "{name: string, amount: int}") s = TableSymbol("t", "{name: string, id: int}") r = TableSymbol("r", "{name: string, amount: int}") q = TableSymbol("q", "{name: int}") j = join(t, s, "name", "name") assert j.schema == dshape("{name: string, amount: int, id: int}") assert join(t, s, "name") == join(t, s, "name") assert join(t, s, "name").on_left == "name" assert join(t, s, "name").on_right == "name" assert join(t, r, ("name", "amount")).on_left == ["name", "amount"] with pytest.raises(TypeError): join(t, q, "name") with pytest.raises(ValueError): join(t, s, how="upside_down")
def test_multi_column_join():
    a = TableSymbol('a', '{x: int, y: int, z: int}')
    b = TableSymbol('b', '{w: int, x: int, y: int}')
    j = join(a, b, ['x', 'y'])

    assert set(j.fields) == set('wxyz')
    assert j.on_left == j.on_right == ['x', 'y']
    assert hash(j)

    assert j.fields == ['x', 'y', 'z', 'w']
def test_multi_column_join():
    a = symbol('a', 'var * {x: int, y: int, z: int}')
    b = symbol('b', 'var * {w: int, x: int, y: int}')
    j = join(a, b, ['x', 'y'])

    assert set(j.fields) == set('wxyz')
    assert j.on_left == j.on_right == ['x', 'y']
    assert hash(j)

    assert j.fields == ['x', 'y', 'z', 'w']
def test_join():
    t = TableSymbol('t', '{name: string, amount: int}')
    s = TableSymbol('t', '{name: string, id: int}')
    r = TableSymbol('r', '{name: string, amount: int}')
    q = TableSymbol('q', '{name: int}')

    j = join(t, s, 'name', 'name')

    assert j.schema == dshape('{name: string, amount: int, id: int}')

    assert join(t, s, 'name') == join(t, s, 'name')
    assert join(t, s, 'name').on_left == 'name'
    assert join(t, s, 'name').on_right == 'name'

    assert join(t, r, ('name', 'amount')).on_left == ['name', 'amount']

    with pytest.raises(TypeError):
        join(t, q, 'name')

    with pytest.raises(ValueError):
        join(t, s, how='upside_down')
def test_join():
    t = symbol('t', 'var * {name: string, amount: int}')
    s = symbol('t', 'var * {name: string, id: int}')
    r = symbol('r', 'var * {name: string, amount: int}')
    q = symbol('q', 'var * {name: int}')

    j = join(t, s, 'name', 'name')

    assert j.schema == dshape('{name: string, amount: int, id: int}')

    assert join(t, s, 'name') == join(t, s, 'name')
    assert join(t, s, 'name').on_left == 'name'
    assert join(t, s, 'name').on_right == 'name'

    assert join(t, r, ('name', 'amount')).on_left == ['name', 'amount']

    with pytest.raises(TypeError):
        join(t, q, 'name')

    with pytest.raises(ValueError):
        join(t, s, how='upside_down')
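# For orientation (illustrative, not from the original suite): the metadata
# asserted in the expression-level tests can also be inspected directly. A
# minimal sketch using the same blaze names (symbol, join) as above; the
# helper name is hypothetical.
def _join_expression_metadata_sketch():
    from blaze import symbol, join

    t = symbol('t', 'var * {name: string, amount: int}')
    s = symbol('s', 'var * {name: string, id: int}')

    j = join(t, s, 'name')

    # The join key appears once; remaining fields follow left-then-right order.
    assert j.fields == ['name', 'amount', 'id']
    assert j.on_left == j.on_right == 'name'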
def test_serializable():
    t = TableSymbol('t', '{id: int, name: string, amount: int}')
    import pickle
    t2 = pickle.loads(pickle.dumps(t))

    assert t.isidentical(t2)

    s = TableSymbol('t', '{id: int, city: string}')
    expr = join(t[t.amount < 0], s).sort('id').city.head()
    expr2 = pickle.loads(pickle.dumps(expr))

    assert expr.isidentical(expr2)
def test_relabel_join():
    names = Symbol('names', 'var * {first: string, last: string}')
    siblings = join(names.relabel({'first': 'left'}),
                    names.relabel({'first': 'right'}),
                    'last')[['left', 'right']]

    data = [('Alice', 'Smith'), ('Bob', 'Jones'), ('Charlie', 'Smith')]

    print(set(compute(siblings, {names: data})))
    assert ('Alice', 'Charlie') in set(compute(siblings, {names: data}))
    assert ('Alice', 'Bob') not in set(compute(siblings, {names: data}))
def test_join_promotion():
    a_data = pd.DataFrame([[0.0, 1.5], [1.0, 2.5]], columns=list('ab'))
    b_data = pd.DataFrame([[0, 1], [1, 2]], columns=list('ac'))
    a = symbol('a', discover(a_data))
    b = symbol('b', discover(b_data))

    joined = join(a, b, 'a')
    assert joined.dshape == dshape('var * {a: float64, b: float64, c: int64}')

    expected = pd.merge(a_data, b_data, on='a')
    result = compute(joined, {a: a_data, b: b_data})
    tm.assert_frame_equal(result, expected)
def test_join_suffixes():
    df = pd.DataFrame(
        list(dict((k, n) for k in ascii_lowercase[:5]) for n in range(5)),
    )
    a = symbol('a', discover(df))
    b = symbol('b', discover(df))

    suffixes = '_x', '_y'
    joined = join(a, b, 'a', suffixes=suffixes)

    expected = pd.merge(df, df, on='a', suffixes=suffixes)
    result = compute(joined, {a: df, b: df})
    tm.assert_frame_equal(result, expected)
def test_serializable():
    t = symbol('t', 'var * {id: int, name: string, amount: int}')
    import pickle
    t2 = pickle.loads(pickle.dumps(t, protocol=pickle.HIGHEST_PROTOCOL))

    assert t.isidentical(t2)

    s = symbol('t', 'var * {id: int, city: string}')
    expr = join(t[t.amount < 0], s).sort('id').city.head()
    expr2 = pickle.loads(pickle.dumps(expr, protocol=pickle.HIGHEST_PROTOCOL))

    assert expr.isidentical(expr2)
def test_join():
    cities = Symbol('cities', dshape='var * {id: int, city: string}')
    j = join(t, cities, 'id')

    city_data = [[1, 'NYC'], [1, 'Chicago'], [5, 'Paris']]

    assert (set(concat(compute(j[['name', 'city']],
                               {t: c, cities: city_data}))) ==
            set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris'))))

    assert (set(concat(compute(j[['name', 'city']],
                               {t: c, cities: city_data}))) ==
            set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris'))))
def test_multi_column_join():
    left = [(1, 2, 3),
            (2, 3, 4),
            (1, 3, 5)]
    right = [(1, 2, 30),
             (1, 3, 50),
             (1, 3, 150)]

    L = Symbol('L', 'var * {x: int, y: int, z: int}')
    R = Symbol('R', 'var * {x: int, y: int, w: int}')

    j = join(L, R, ['x', 'y'])

    print(list(compute(j, {L: left, R: right})))
    assert list(compute(j, {L: left, R: right})) == [(1, 2, 3, 30),
                                                     (1, 3, 5, 50),
                                                     (1, 3, 5, 150)]
def test_aliased_views_with_join():
    joined = join(bank, cities)
    expr = by(joined.city, total=joined.amount.sum())
    expr2 = by(expr.total, count=expr.city.nunique())

    result = compute(expr2, {bank: sql_bank, cities: sql_cities})

    assert normalize(str(result)) == normalize("""
    SELECT alias.total, count(DISTINCT alias.city) AS count
    FROM (SELECT cities.city AS city, sum(bank.amount) AS total
          FROM bank JOIN cities ON bank.name = cities.name
          GROUP BY cities.city) as alias
    GROUP BY alias.total
    """)