def test_path():
    """Check the node chains that ``path`` yields between an expression and a leaf.

    Each assertion materialises ``path(expr, leaf)`` as a list and compares it
    with the expected sequence, which starts at ``expr`` and ends at ``leaf``.
    Both a reduction over a single column and a column taken from a join of
    two tables are covered.
    """
    from blaze import TableSymbol, join

    t = TableSymbol('t', '{name: string, amount: int, id: int}')
    v = TableSymbol('v', '{city: string, id: int}')

    # Reduction over one column: walk from the sum down to the column / table.
    expr = t['amount'].sum()
    assert list(path(expr, t)) == [t.amount.sum(), t.amount, t]
    assert list(path(expr, t.amount)) == [t.amount.sum(), t.amount]

    # Column of a join: either side of the join can be the end of the path.
    expr = join(t, v).amount
    assert list(path(expr, t)) == [join(t, v).amount, join(t, v), t]
    assert list(path(expr, v)) == [join(t, v).amount, join(t, v), v]
def test_join_diff_contexts(db, ctx, cities):
    """Join ``db.t`` and ``db.s`` on ``name`` with each side bound separately.

    The ``t`` side is read from ``ctx`` directly; the ``s`` side is the
    ``cities`` fixture loaded into ``ctx`` via ``into``. The computed rows
    are compared with those obtained from ``df`` and ``cities_df`` (names
    defined outside this function).
    """
    joined = join(db.t, db.s, "name")

    people_table = ctx.table("t")
    cities_schema = discover(ctx.table("s"))
    cities_table = into(ctx, cities, dshape=cities_schema)

    actual = compute(joined, {db: {"t": people_table, "s": cities_table}})
    reference = compute(joined, {db: {"t": df, "s": cities_df}})

    # Each row becomes a frozenset, so the comparison ignores the order of
    # rows and the order of values within a row.
    actual_rows = set(map(frozenset, odo(actual, set)))
    reference_rows = set(map(frozenset, odo(reference, set)))
    assert actual_rows == reference_rows
def test_join(db, ctx):
    """Compute ``join(db.t, db.s)`` against ``ctx`` and validate the result.

    The result must be a ``SparkDataFrame``, contain the same set of rows as
    the computation over ``df`` / ``cities_df`` (defined outside this
    function), and have a discovered datashape equal to the expression's.
    """
    joined = join(db.t, db.s)

    actual = compute(joined, ctx)
    reference = compute(joined, {db: {"t": df, "s": cities_df}})

    assert isinstance(actual, SparkDataFrame)
    assert into(set, actual) == into(set, reference)
    assert discover(actual) == joined.dshape
def test_join(db, ctx):
    """Compute ``join(db.t, db.s)`` against ``ctx`` and validate the result.

    The result may be either a ``SparkDataFrame`` or a ``SchemaRDD``. Its rows
    (as a set) must match the computation over ``df`` / ``cities_df``
    (defined outside this function), and its discovered datashape must equal
    the expression's datashape.
    """
    joined = join(db.t, db.s)

    actual = compute(joined, ctx)
    reference = compute(joined, {db: {'t': df, 's': cities_df}})

    accepted_types = (SparkDataFrame, SchemaRDD)
    assert isinstance(actual, accepted_types)
    assert into(set, actual) == into(set, reference)
    assert discover(actual) == joined.dshape
def test_join(rdd, rdd2):
    """Join ``t`` and ``t2`` on ``name`` and check the collected rows.

    ``t`` and ``t2`` are expression symbols defined outside this function;
    they are bound to the ``rdd`` and ``rdd2`` fixtures for the computation.
    """
    expr = join(t, t2, 'name')
    allowed_rows = [
        ('Alice', 100, 1, 'Austin'),
        ('Bob', 200, 2, 'Boston'),
        ('Alice', 50, 3, 'Austin'),
    ]

    rows = compute(expr, {t: rdd, t2: rdd2}).collect()

    # One-directional check: every produced row must be an allowed row. It
    # does not require every allowed row to appear, and an empty result passes.
    unexpected = [row for row in rows if row not in allowed_rows]
    assert not unexpected
def test_join(db, ctx):
    """Compute ``join(db.t, db.s)`` with ``return_type='native'`` and validate it.

    Both the result under test and the reference (over ``df`` / ``cities_df``,
    defined outside this function) are requested with
    ``return_type='native'``. The result must be a ``SparkDataFrame``, match
    the reference row set, and have the expression's datashape.
    """
    joined = join(db.t, db.s)

    actual = compute(joined, ctx, return_type='native')
    reference_scope = {db: {'t': df, 's': cities_df}}
    reference = compute(joined, reference_scope, return_type='native')

    assert isinstance(actual, SparkDataFrame)
    assert into(set, actual) == into(set, reference)
    assert discover(actual) == joined.dshape
def test_join_diff_contexts(db, ctx, cities):
    """Join ``db.t`` with ``db.s`` on ``name`` using separately sourced tables.

    ``t`` comes straight from ``ctx.table``; ``s`` is the ``cities`` fixture
    pushed into ``ctx`` with the datashape discovered from ``ctx.table('s')``.
    The outcome is compared with the same expression computed over ``df`` and
    ``cities_df`` (names defined outside this function).
    """
    def row_sets(computed):
        # Rows become frozensets, so neither row order nor the order of
        # values inside a row affects the comparison.
        return set(map(frozenset, odo(computed, set)))

    expr = join(db.t, db.s, 'name')

    people_src = ctx.table('t')
    cities_src = into(ctx, cities, dshape=discover(ctx.table('s')))

    got = compute(expr, {db: {'t': people_src, 's': cities_src}})
    want = compute(expr, {db: {'t': df, 's': cities_df}})

    assert row_sets(got) == row_sets(want)
def test_groupby(sc):
    """Group a join of ``t_arc`` and ``t_idx`` by ``name`` and count ``node_id``.

    ``data_idx``, ``data_arc``, ``t_arc`` and ``t_idx`` are defined outside
    this function. The collected (name, count) pairs are turned into a dict
    and compared with the expected counts.
    """
    idx_rdd = sc.parallelize(data_idx)
    arc_rdd = sc.parallelize(data_arc)

    joined = join(t_arc, t_idx, "node_id")
    grouped = by(joined['name'], count=joined['node_id'].count())

    computed = compute(grouped, {t_arc: arc_rdd, t_idx: idx_rdd})
    counts_by_name = dict(computed.collect())

    assert counts_by_name == {'A': 1, 'C': 2}
def test_csv_join():
    """Join two small CSV files on column ``c`` and compare with a DataFrame.

    The files are created by ``filetexts`` for the duration of the ``with``
    block, opened through ``resource``, and joined via symbols whose
    datashapes are discovered from those resources.
    """
    files = {"a.csv": "a,b,c\n0,1,2\n3,4,5", "b.csv": "c,d,e\n2,3,4\n5,6,7"}

    with filetexts(files):
        resource_a = resource("a.csv")
        resource_b = resource("b.csv")

        a = symbol("a", discover(resource_a))
        b = symbol("b", discover(resource_b))

        joined = compute(join(a, b, "c"), {a: resource_a, b: resource_b})
        actual = odo(joined, pd.DataFrame)

        # windows needs explicit int64 construction b/c default is int32
        expected = pd.DataFrame(
            np.array([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]], dtype="int64"),
            columns=list("cabde"),
        )

        tm.assert_frame_equal(actual, expected)
def test_outer_join(sc):
    """Check the row sets produced by default, left, right and outer joins.

    Two small collections are parallelized through ``sc`` and joined via the
    symbols ``L`` and ``R``. For each ``how`` variant the collected rows are
    compared as a set with the expected rows; the unmatched side is filled
    with ``None`` in the expected data.
    """
    left_rdd = sc.parallelize([(1, 'Alice', 100),
                               (2, 'Bob', 200),
                               (4, 'Dennis', 400)])
    right_rdd = sc.parallelize([('NYC', 1),
                                ('Boston', 1),
                                ('LA', 3),
                                ('Moscow', 4)])

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    def joined_rows(**join_kwargs):
        # A fresh scope dict is built on every call, as in each original
        # assertion.
        expr = join(L, R, **join_kwargs)
        return set(compute(expr, {L: left_rdd, R: right_rdd}).collect())

    # Rows present on both sides, plus the rows only one side can supply.
    matched = {(1, 'Alice', 100, 'NYC'),
               (1, 'Alice', 100, 'Boston'),
               (4, 'Dennis', 400, 'Moscow')}
    left_only = {(2, 'Bob', 200, None)}
    right_only = {(3, None, None, 'LA')}

    assert joined_rows() == matched
    assert joined_rows(how='left') == matched | left_only
    assert joined_rows(how='right') == matched | right_only

    # NOTE(review): this case was annotated "Full outer join not yet
    # supported", but the assertion is active code - confirm which is current.
    assert joined_rows(how='outer') == matched | left_only | right_only
def test_multi_expression_compute(app_context):
    """POST a join expression to ``/compute.json`` and compare with local compute.

    ``data`` and ``test`` are defined outside this function. The response
    body is decoded as UTF-8 before JSON parsing, and its ``data`` rows are
    converted to tuples for comparison with ``compute`` run locally.
    """
    s = symbol('s', discover(data))
    expr = join(s.accounts, s.cities)

    payload = json.dumps({'expr': to_tree(expr)})
    resp = test.post('/compute.json',
                     data=payload,
                     content_type='application/json')
    assert 'OK' in resp.status

    body = json.loads(resp.data.decode('utf-8'))
    returned_rows = [tuple(row) for row in body['data']]

    expected = compute(expr, {s: data})
    assert returned_rows == into(list, expected)
def test_multi_expression_compute():
    """POST a join of two symbols to ``/compute.json`` and compare with local compute.

    ``accounts``, ``cities`` and ``test`` are defined outside this function.
    The ``data`` rows from the JSON response are converted to tuples and
    compared with ``compute`` run locally over both datasets.
    """
    a = Symbol('accounts', discover(accounts))
    c = Symbol('cities', discover(cities))
    expr = join(a, c)

    payload = json.dumps({'expr': to_tree(expr)})
    resp = test.post('/compute.json',
                     data=payload,
                     content_type='application/json')
    assert 'OK' in resp.status

    returned_rows = [tuple(row) for row in json.loads(resp.data)['data']]

    expected = compute(expr, {a: accounts, c: cities})
    assert returned_rows == into(list, expected)
def test_multi_expression_compute(serial):
    """POST a join expression to ``/compute.<serial.name>`` and compare results.

    The request body is produced with ``serial.dumps`` and the response is
    parsed with ``serial.loads``. ``data`` and ``test`` are defined outside
    this function.
    """
    s = symbol('s', discover(data))
    expr = join(s.accounts, s.cities)

    endpoint = '/compute.{name}'.format(name=serial.name)
    payload = serial.dumps({'expr': to_tree(expr)})
    resp = test.post(endpoint, data=payload)
    assert 'OK' in resp.status

    returned_rows = [tuple(row) for row in serial.loads(resp.data)['data']]

    expected = compute(expr, {s: data})
    assert returned_rows == into(list, expected)
def test_multi_expression_compute():
    """Send a two-symbol join to ``/compute.json`` and check the returned rows.

    Symbols are built from the datashapes discovered for ``accounts`` and
    ``cities`` (both, like ``test``, defined outside this function). The
    server's ``data`` rows must equal the locally computed result as a list
    of tuples.
    """
    a = Symbol('accounts', discover(accounts))
    c = Symbol('cities', discover(cities))
    expr = join(a, c)

    request_body = json.dumps({'expr': to_tree(expr)})
    resp = test.post('/compute.json',
                     data=request_body,
                     content_type='application/json')
    assert 'OK' in resp.status

    decoded = json.loads(resp.data)
    result = decoded['data']
    expected = compute(expr, {a: accounts, c: cities})

    assert list(map(tuple, result)) == into(list, expected)
def test_multi_expression_compute(test, serial):
    """POST a join expression to ``/compute`` and check rows and field names.

    The payload is serialized with ``serial.dumps`` and sent with the headers
    from ``mimetype(serial)``. The response's ``data`` entry is decoded with
    ``serial.data_loads`` and compared with a local ``compute`` over
    ``tdata`` (defined outside this function); its ``names`` entry must match
    the expression's fields.
    """
    s = symbol('s', discover(tdata))
    expr = join(s.accounts, s.cities)

    payload = serial.dumps({'expr': to_tree(expr)})
    resp = test.post('/compute', data=payload, headers=mimetype(serial))
    assert 'OK' in resp.status

    respdata = serial.loads(resp.data)
    decoded = serial.data_loads(respdata['data'])
    returned_rows = [tuple(row) for row in odo(decoded, list)]

    expected = compute(expr, {s: tdata})
    assert returned_rows == into(list, expected)
    assert list(respdata['names']) == expr.fields
def test_multi_expression_compute(test, serial):
    """POST a join expression to ``/compute.<serial.name>``; check rows and names.

    The response is parsed with ``serial.loads``. Its ``data`` rows are
    compared (as tuples) with a local ``compute`` over ``data`` (defined
    outside this function), and its ``names`` entry must equal the
    expression's fields.
    """
    s = symbol('s', discover(data))
    expr = join(s.accounts, s.cities)

    endpoint = '/compute.{name}'.format(name=serial.name)
    payload = serial.dumps({'expr': to_tree(expr)})
    resp = test.post(endpoint, data=payload)
    assert 'OK' in resp.status

    respdata = serial.loads(resp.data)
    returned_rows = [tuple(row) for row in respdata['data']]

    expected = compute(expr, {s: data})
    assert returned_rows == into(list, expected)
    assert respdata['names'] == expr.fields
def test_csv_join():
    """Join two small CSV files on column ``c`` and compare with a DataFrame.

    The files exist for the duration of the ``filetexts`` block, are opened
    through ``data``, and are joined via symbols whose datashapes are
    discovered from the opened objects.
    """
    files = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
             'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(files):
        data_a = data('a.csv')
        data_b = data('b.csv')

        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))

        joined = compute(join(a, b, 'c'), {a: data_a, b: data_b})
        actual = odo(joined, pd.DataFrame)

        # windows needs explicit int64 construction b/c default is int32
        expected_values = np.array([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]],
                                   dtype='int64')
        expected = pd.DataFrame(expected_values, columns=list('cabde'))

        tm.assert_frame_equal(actual, expected)
def test_multi_column_join(sc):
    """Join two parallelized collections on both ``x`` and ``y``.

    The collected result is compared as a set with the expected rows, so row
    order and duplicate rows do not matter.
    """
    left_rdd = sc.parallelize([(1, 2, 3), (2, 3, 4), (1, 3, 5)])
    right_rdd = sc.parallelize([(1, 2, 30), (1, 3, 50), (1, 3, 150)])

    L = symbol('L', 'var * {x: int, y: int, z: int}')
    R = symbol('R', 'var * {x: int, y: int, w: int}')

    joined = join(L, R, ['x', 'y'])
    computed = compute(joined, {L: left_rdd, R: right_rdd})

    expected_rows = {(1, 2, 3, 30), (1, 3, 5, 50), (1, 3, 5, 150)}
    assert set(computed.collect()) == expected_rows
def test_join_type_promotion(sqla, sqlb):
    """Inner-join the ``sqla`` and ``sqlb`` fixtures on ``B`` and check the rows.

    The expression is computed with ``return_type='native'``; the returned
    object is executed and all rows are fetched, then compared as a set of
    tuples with the expected rows.
    """
    t = symbol(sqla.name, discover(sqla))
    s = symbol(sqlb.name, discover(sqlb))
    expr = join(t, s, 'B', how='inner')

    query = compute(expr, {t: sqla, s: sqlb}, return_type='native')
    fetched = query.execute().fetchall()

    result = {tuple(row) for row in fetched}
    assert result == {(1, 'a', 'a'), (1, None, 'a')}
def test_join_type_promotion(sqla, sqlb):
    """Inner-join the ``sqla`` and ``sqlb`` fixtures on ``B`` and check the rows.

    The object returned by ``compute`` is executed and all rows are fetched,
    then compared as a set of tuples with the expected rows.
    """
    t = symbol(sqla.name, discover(sqla))
    s = symbol(sqlb.name, discover(sqlb))
    expr = join(t, s, 'B', how='inner')

    query = compute(expr, {t: sqla, s: sqlb})
    fetched = query.execute().fetchall()

    result = {tuple(row) for row in fetched}
    assert result == {(1, 'a', 'a'), (1, None, 'a')}
def test_join_foreign_key():
    """Check the datashape of a join whose left key is a ``map`` typed column.

    ``a.pkid`` is declared as ``map[int32, {...}]`` while ``b.pkid`` is a
    plain ``int32``. The joined datashape is expected to expose ``pkid`` as
    ``int32`` followed by the remaining fields of both sides.
    """
    a = symbol('a', "var * {timestamp: string, pkid: map[int32, {pkid: int32, label: ?string}]}")
    # The second table gets its own name; it was previously also called 'a',
    # which gave two different tables the same symbol name.
    b = symbol('b', "var * {pkid: int32, label: ?string}")

    expected = dshape("var * {pkid: int32, timestamp: string, label: ?string}")
    assert join(a, b, 'pkid', 'pkid').dshape == expected