Example #1
0
def test_path():
    """Check the node sequences that ``path`` yields between two expressions.

    Covers a reduction over a single column and a column selected from a
    join of two tables.
    """
    from blaze import TableSymbol, join
    t = TableSymbol('t', '{name: string, amount: int, id: int}')
    v = TableSymbol('v', '{city: string, id: int}')

    total = t['amount'].sum()
    assert list(path(total, t)) == [t.amount.sum(), t.amount, t]
    # Targeting an intermediate node ends the path at that node.
    assert list(path(total, t.amount)) == [t.amount.sum(), t.amount]

    joined_amount = join(t, v).amount
    # Either side of the join can be reached from the joined column.
    assert list(path(joined_amount, t)) == [join(t, v).amount, join(t, v), t]
    assert list(path(joined_amount, v)) == [join(t, v).amount, join(t, v), v]
Example #2
0
def test_join_diff_contexts(db, ctx, cities):
    """Join a table fetched from ``ctx`` with one loaded into it via ``into``.

    The rows are compared, as a set of frozensets, against the same join
    computed over the ``df`` and ``cities_df`` objects defined outside this
    test.
    """
    expr = join(db.t, db.s, "name")
    people = ctx.table("t")
    cities_in_ctx = into(ctx, cities, dshape=discover(ctx.table("s")))

    result = compute(expr, {db: {"t": people, "s": cities_in_ctx}})
    expected = compute(expr, {db: {"t": df, "s": cities_df}})

    def as_row_sets(rows):
        # Each row becomes a frozenset so rows can be compared as a set.
        return set(map(frozenset, odo(rows, set)))

    assert as_row_sets(result) == as_row_sets(expected)
Example #3
0
def test_join(db, ctx):
    """Compute ``join(db.t, db.s)`` through ``ctx`` and validate the result.

    Checks the result type, its rows against a reference computation over
    ``df`` and ``cities_df``, and its discovered datashape.
    """
    joined = join(db.t, db.s)

    result = compute(joined, ctx)
    reference_scope = {db: {"t": df, "s": cities_df}}
    expected = compute(joined, reference_scope)

    assert isinstance(result, SparkDataFrame)
    assert into(set, result) == into(set, expected)
    assert discover(result) == joined.dshape
Example #4
0
def test_join(db, ctx):
    """Compute ``join(db.t, db.s)`` through ``ctx`` and validate the result.

    The result may be either a ``SparkDataFrame`` or a ``SchemaRDD``; its
    rows and discovered datashape are checked as well.
    """
    accepted_types = (SparkDataFrame, SchemaRDD)
    joined = join(db.t, db.s)

    result = compute(joined, ctx)
    expected = compute(joined, {db: {'t': df, 's': cities_df}})

    assert isinstance(result, accepted_types)
    assert into(set, result) == into(set, expected)
    assert discover(result) == joined.dshape
Example #5
0
def test_join(rdd, rdd2):
    """Join ``t`` and ``t2`` on ``name`` over two RDDs and check the rows.

    Every collected row must be one of the expected rows, and at least one
    row must be produced.
    """
    joined = join(t, t2, 'name')
    expected = [('Alice', 100, 1, 'Austin'),
                ('Bob', 200, 2, 'Boston'),
                ('Alice', 50, 3, 'Austin')]
    result = compute(joined, {t: rdd, t2: rdd2}).collect()
    # all() is True for an empty iterable, so without this guard a join
    # that produced no rows would pass the membership check below.
    assert result
    assert all(i in expected for i in result)
Example #6
0
def test_join(db, ctx):
    """Compute ``join(db.t, db.s)`` with ``return_type='native'`` and validate it.

    Checks the result type, its rows against a reference computation over
    ``df`` and ``cities_df``, and its discovered datashape.
    """
    joined = join(db.t, db.s)

    result = compute(joined, ctx, return_type='native')
    reference_scope = {db: {'t': df, 's': cities_df}}
    expected = compute(joined, reference_scope, return_type='native')

    assert isinstance(result, SparkDataFrame)
    assert into(set, result) == into(set, expected)
    assert discover(result) == joined.dshape
Example #7
0
def test_join_diff_contexts(db, ctx, cities):
    """Join on ``name`` between a ``ctx`` table and data loaded via ``into``.

    Both the result and the reference computation are reduced to sets of
    frozensets before being compared.
    """
    expr = join(db.t, db.s, 'name')
    people = ctx.table('t')
    loaded_cities = into(ctx, cities, dshape=discover(ctx.table('s')))

    result = compute(expr, {db: {'t': people, 's': loaded_cities}})
    expected = compute(expr, {db: {'t': df, 's': cities_df}})

    result_rows = set(map(frozenset, odo(result, set)))
    expected_rows = set(map(frozenset, odo(expected, set)))
    assert result_rows == expected_rows
Example #8
0
def test_groupby(sc):
    """Group a join of ``t_arc`` and ``t_idx`` by ``name`` and count ``node_id``."""
    idx_rdd = sc.parallelize(data_idx)
    arc_rdd = sc.parallelize(data_arc)

    joined = join(t_arc, t_idx, "node_id")
    counts_expr = by(joined['name'], count=joined['node_id'].count())

    counts = compute(counts_expr, {t_arc: arc_rdd, t_idx: idx_rdd})
    # The collected pairs are turned into a mapping from name to count.
    assert dict(counts.collect()) == {'A': 1, 'C': 2}
Example #9
0
def test_csv_join():
    """Join two temporary CSV files on column ``c`` and compare as a DataFrame."""
    files = {
        "a.csv": "a,b,c\n0,1,2\n3,4,5",
        "b.csv": "c,d,e\n2,3,4\n5,6,7",
    }
    # windows needs explicit int64 construction b/c default is int32
    expected = pd.DataFrame(
        np.array([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]], dtype="int64"),
        columns=list("cabde"),
    )

    with filetexts(files):
        resource_a = resource("a.csv")
        resource_b = resource("b.csv")
        a = symbol("a", discover(resource_a))
        b = symbol("b", discover(resource_b))

        joined = compute(join(a, b, "c"), {a: resource_a, b: resource_b})
        tm.assert_frame_equal(odo(joined, pd.DataFrame), expected)
Example #10
0
def test_outer_join(sc):
    """Check the default, left, right and outer joins of two small RDDs."""
    left = sc.parallelize([(1, 'Alice', 100),
                           (2, 'Bob', 200),
                           (4, 'Dennis', 400)])
    right = sc.parallelize([('NYC', 1),
                            ('Boston', 1),
                            ('LA', 3),
                            ('Moscow', 4)])

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    def joined_rows(**options):
        # Run the join with the given keyword options and collect it as a set.
        return set(compute(join(L, R, **options), {L: left, R: right}).collect())

    # Default join: only ids present on both sides.
    assert joined_rows() == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (4, 'Dennis', 400, 'Moscow')])

    # Left join: the unmatched left row (Bob) is kept with a None city.
    assert joined_rows(how='left') == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (4, 'Dennis', 400, 'Moscow')])

    # Right join: the unmatched right row (LA) is kept with None name/amount.
    assert joined_rows(how='right') == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])

    # Outer join: unmatched rows from both sides are kept.
    assert joined_rows(how='outer') == set(
        [(1, 'Alice', 100, 'NYC'),
         (1, 'Alice', 100, 'Boston'),
         (2, 'Bob', 200, None),
         (3, None, None, 'LA'),
         (4, 'Dennis', 400, 'Moscow')])
Example #11
0
def test_multi_expression_compute(app_context):
    """POST a join of ``s.accounts`` and ``s.cities`` to ``/compute.json``.

    The rows in the JSON response are compared with a local ``compute`` of
    the same expression over ``data``.
    """
    s = symbol('s', discover(data))
    expr = join(s.accounts, s.cities)

    payload = json.dumps({'expr': to_tree(expr)})
    resp = test.post('/compute.json',
                     data=payload,
                     content_type='application/json')
    assert 'OK' in resp.status

    body = json.loads(resp.data.decode('utf-8'))
    rows = [tuple(row) for row in body['data']]
    expected = compute(expr, {s: data})

    assert rows == into(list, expected)
Example #12
0
def test_multi_expression_compute():
    """POST a join of two symbols to ``/compute.json`` and check the rows.

    The rows in the JSON response are compared with a local ``compute`` of
    the same expression over ``accounts`` and ``cities``.
    """
    a = Symbol('accounts', discover(accounts))
    c = Symbol('cities', discover(cities))

    expr = join(a, c)

    resp = test.post('/compute.json',
                     data=json.dumps({'expr': to_tree(expr)}),
                     content_type='application/json')

    assert 'OK' in resp.status
    # Decode the response body before parsing: json.loads only accepts
    # bytes from Python 3.6 onwards, and decoding is harmless elsewhere.
    result = json.loads(resp.data.decode('utf-8'))['data']
    expected = compute(expr, {a: accounts, c: cities})

    assert list(map(tuple, result)) == into(list, expected)
Example #13
0
def test_multi_expression_compute(serial):
    """POST a join to the ``serial``-specific compute route and check the rows.

    The request body is produced with ``serial.dumps`` and the response is
    parsed with ``serial.loads``.
    """
    s = symbol('s', discover(data))
    expr = join(s.accounts, s.cities)

    route = '/compute.{name}'.format(name=serial.name)
    payload = serial.dumps({'expr': to_tree(expr)})
    resp = test.post(route, data=payload)
    assert 'OK' in resp.status

    rows = [tuple(row) for row in serial.loads(resp.data)['data']]
    expected = compute(expr, {s: data})

    assert rows == into(list, expected)
Example #14
0
def test_multi_expression_compute():
    """POST a join of two symbols to ``/compute.json`` and check the rows.

    The rows in the JSON response are compared with a local ``compute`` of
    the same expression over ``accounts`` and ``cities``.
    """
    a = Symbol('accounts', discover(accounts))
    c = Symbol('cities', discover(cities))

    expr = join(a, c)

    resp = test.post('/compute.json',
                     data=json.dumps({'expr': to_tree(expr)}),
                     content_type='application/json')

    assert 'OK' in resp.status
    # Decode the response body before parsing: json.loads only accepts
    # bytes from Python 3.6 onwards, and decoding is harmless elsewhere.
    result = json.loads(resp.data.decode('utf-8'))['data']
    expected = compute(expr, {a: accounts, c: cities})

    assert list(map(tuple, result)) == into(list, expected)
Example #15
0
def test_multi_expression_compute(test, serial):
    """POST a join to ``/compute`` and check the returned rows and names.

    The request is serialized with ``serial`` and sent with the headers
    from ``mimetype(serial)``; the response's ``data`` and ``names``
    entries are compared with a local ``compute`` and ``expr.fields``.
    """
    s = symbol('s', discover(tdata))
    expr = join(s.accounts, s.cities)

    resp = test.post(
        '/compute',
        data=serial.dumps({'expr': to_tree(expr)}),
        headers=mimetype(serial),
    )
    assert 'OK' in resp.status

    respdata = serial.loads(resp.data)
    payload = serial.data_loads(respdata['data'])
    expected = compute(expr, {s: tdata})

    rows = [tuple(row) for row in odo(payload, list)]
    assert rows == into(list, expected)
    assert list(respdata['names']) == expr.fields
Example #16
0
def test_multi_expression_compute(test, serial):
    """Round-trip a join expression through the ``/compute`` route.

    Verifies the response status, the decoded rows against a local
    ``compute`` over ``tdata``, and the returned names against
    ``expr.fields``.
    """
    s = symbol('s', discover(tdata))
    expr = join(s.accounts, s.cities)

    request_body = serial.dumps({'expr': to_tree(expr)})
    resp = test.post('/compute',
                     data=request_body,
                     headers=mimetype(serial))
    assert 'OK' in resp.status

    respdata = serial.loads(resp.data)
    decoded = serial.data_loads(respdata['data'])
    expected = compute(expr, {s: tdata})

    received_rows = [tuple(row) for row in odo(decoded, list)]
    assert received_rows == into(list, expected)
    assert list(respdata['names']) == expr.fields
Example #17
0
def test_multi_expression_compute(test, serial):
    """POST a join to the ``serial``-specific compute route.

    Checks the response status, the returned rows against a local
    ``compute`` over ``data``, and the returned names against
    ``expr.fields``.
    """
    s = symbol('s', discover(data))
    expr = join(s.accounts, s.cities)

    route = '/compute.{name}'.format(name=serial.name)
    payload = serial.dumps({'expr': to_tree(expr)})
    resp = test.post(route, data=payload)
    assert 'OK' in resp.status

    respdata = serial.loads(resp.data)
    rows = [tuple(row) for row in respdata['data']]
    expected = compute(expr, {s: data})

    assert rows == into(list, expected)
    assert respdata['names'] == expr.fields
Example #18
0
def test_csv_join():
    """Join two temporary CSV files on column ``c`` and compare as a DataFrame."""
    files = {
        'a.csv': 'a,b,c\n0,1,2\n3,4,5',
        'b.csv': 'c,d,e\n2,3,4\n5,6,7',
    }
    # windows needs explicit int64 construction b/c default is int32
    expected_values = np.array([[2, 0, 1, 3, 4],
                                [5, 3, 4, 6, 7]], dtype='int64')
    expected = pd.DataFrame(expected_values, columns=list('cabde'))

    with filetexts(files):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))

        joined = compute(join(a, b, 'c'), {a: data_a, b: data_b})
        tm.assert_frame_equal(odo(joined, pd.DataFrame), expected)
Example #19
0
def test_csv_join():
    """Check that joining ``a.csv`` and ``b.csv`` on ``c`` gives the expected frame."""
    csv_texts = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
                 'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(csv_texts):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))

        scope = {a: data_a, b: data_b}
        actual = odo(compute(join(a, b, 'c'), scope), pd.DataFrame)

        # windows needs explicit int64 construction b/c default is int32
        rows = np.array([[2, 0, 1, 3, 4],
                         [5, 3, 4, 6, 7]], dtype='int64')
        expected = pd.DataFrame(rows, columns=list('cabde'))

        tm.assert_frame_equal(actual, expected)
Example #20
0
def test_multi_column_join(sc):
    """Join two RDDs on the column pair ``['x', 'y']`` and check the rows."""
    rleft = sc.parallelize([(1, 2, 3),
                            (2, 3, 4),
                            (1, 3, 5)])
    rright = sc.parallelize([(1, 2, 30),
                             (1, 3, 50),
                             (1, 3, 150)])

    L = symbol('L', 'var * {x: int, y: int, z: int}')
    R = symbol('R', 'var * {x: int, y: int, w: int}')

    joined = join(L, R, ['x', 'y'])
    result = compute(joined, {L: rleft, R: rright})

    # Only rows whose (x, y) pair appears on both sides are expected.
    expected = set([(1, 2, 3, 30),
                    (1, 3, 5, 50),
                    (1, 3, 5, 150)])
    assert set(result.collect()) == expected
Example #21
0
def test_join_type_promotion(sqla, sqlb):
    """Inner-join ``sqla`` and ``sqlb`` on ``B`` and compare the fetched rows."""
    t = symbol(sqla.name, discover(sqla))
    s = symbol(sqlb.name, discover(sqlb))
    expr = join(t, s, 'B', how='inner')

    query = compute(expr, {t: sqla, s: sqlb}, return_type='native')
    rows = query.execute().fetchall()

    assert set(map(tuple, rows)) == set([(1, 'a', 'a'), (1, None, 'a')])
Example #22
0
def test_join_type_promotion(sqla, sqlb):
    """Inner-join ``sqla`` and ``sqlb`` on ``B`` and compare the fetched rows."""
    t = symbol(sqla.name, discover(sqla))
    s = symbol(sqlb.name, discover(sqlb))
    expr = join(t, s, 'B', how='inner')

    fetched = compute(expr, {t: sqla, s: sqlb}).execute().fetchall()
    result = set(tuple(row) for row in fetched)

    assert result == set([(1, 'a', 'a'), (1, None, 'a')])
Example #23
0
def test_join_foreign_key():
    """Check the datashape of a join whose left key column is a ``map`` type.

    The expected datashape has a plain ``int32`` ``pkid`` followed by the
    remaining fields from both sides.
    """
    a = symbol('a', "var * {timestamp: string, pkid: map[int32, {pkid: int32, label: ?string}]}")
    # Give the right-hand symbol its own name; it was previously also
    # called 'a', which looks like a copy-paste slip.
    b = symbol('b', "var * {pkid: int32, label: ?string}")
    assert join(a, b, 'pkid', 'pkid').dshape == dshape("var * {pkid: int32, timestamp: string, label: ?string}")