Example #1
def test_spark_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = sc.parallelize(left)
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = sc.parallelize(right)

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

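    # join(L, R) with no explicit key joins on the column the tables share, here 'id'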
    assert set(compute(join(L, R), {L: left, R: right}).collect()) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='left'), {L: left, R: right}).collect()) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (2, 'Bob', 200, None),
             (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='right'), {L: left, R: right}).collect()) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (3, None, None, 'LA'),
             (4, 'Dennis', 400, 'Moscow')])

    # Full outer join not yet supported
    """
Example #2
def test_sparksql_with_literals():
    srdd = into(sqlContext, data, schema=t.schema)
    expr = t[t.amount >= 100]
    result = compute(expr, srdd)
    assert isinstance(result, SchemaRDD)
    assert set(map(tuple, result.collect())) == \
            set(map(tuple, compute(expr, data)))
Example #3
def test_sparksql_with_literals():
    srdd = into(sqlContext, data, schema=t.schema)
    expr = t[t.amount >= 100]
    result = compute(expr, srdd)
    assert isinstance(result, SchemaRDD)
    assert set(map(tuple, result.collect())) == \
            set(map(tuple, compute(expr, data)))
Example #4
def test_spark_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = sc.parallelize(left)
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = sc.parallelize(right)

    L = TableSymbol('L', '{id: int, name: string, amount: real}')
    R = TableSymbol('R', '{city: string, id: int}')

    assert set(compute(join(L, R), {L: left, R: right}).collect()) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='left'), {L: left, R: right}).collect()) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (2, 'Bob', 200, None),
             (4, 'Dennis', 400, 'Moscow')])

    assert set(compute(join(L, R, how='right'), {L: left, R: right}).collect()) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (3, None, None, 'LA'),
             (4, 'Dennis', 400, 'Moscow')])

    # Full outer join not yet supported
    """
Example #5
def test_base():
    for expr, exclusions in expressions.items():
        if iscollection(expr.dshape):
            model = into(
                DataFrame,
                into(np.ndarray, expr._subs({t: Data(base, t.dshape)})))
        else:
            model = compute(expr._subs({t: Data(base, t.dshape)}))
        print('\nexpr: %s\n' % expr)
        for source in sources:
            if id(source) in map(id, exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            T = Data(source)
            if iscollection(expr.dshape):
                result = into(type(model), expr._subs({t: T}))
                if isscalar(expr.dshape.measure):
                    assert set(into(list, result)) == set(into(list, model))
                else:
                    assert df_eq(result, model)
            elif isrecord(expr.dshape):
                result = compute(expr._subs({t: T}))
                assert into(tuple, result) == into(tuple, model)
            else:
                result = compute(expr._subs({t: T}))
                try:
                    result = result.scalar()
                except AttributeError:
                    pass
                assert result == model
Example #6
def test_base():
    for expr, exclusions in expressions.items():
        if iscollection(expr.dshape):
            model = into(DataFrame, into(np.ndarray, expr._subs({t: Data(base, t.dshape)})))
        else:
            model = compute(expr._subs({t: Data(base, t.dshape)}))
        print('\nexpr: %s\n' % expr)
        for source in sources:
            if id(source) in map(id, exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            T = Data(source)
            if iscollection(expr.dshape):
                result = into(type(model), expr._subs({t: T}))
                if isscalar(expr.dshape.measure):
                    assert set(into(list, result)) == set(into(list, model))
                else:
                    assert df_eq(result, model)
            elif isrecord(expr.dshape):
                result = compute(expr._subs({t: T}))
                assert into(tuple, result) == into(tuple, model)
            else:
                result = compute(expr._subs({t: T}))
                try:
                    result = result.scalar()
                except AttributeError:
                    pass
                assert result == model
Example #7
def test_spark_groupby():
    rddidx = sc.parallelize(data_idx)
    rddarc = sc.parallelize(data_arc)

    joined = join(t_arc, t_idx, "node_id")

    result_blaze = compute(joined, {t_arc: rddarc, t_idx:rddidx})
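    # group the joined table by name and count node_id per group (each node's in-degree)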
    t = by(joined['name'], joined['node_id'].count())
    a = compute(t, {t_arc: rddarc, t_idx:rddidx})
    in_degree = dict(a.collect())
    assert in_degree == {'A': 1, 'C': 2}
Example #8
def test_spark_groupby():
    rddidx = sc.parallelize(data_idx)
    rddarc = sc.parallelize(data_arc)

    joined = join(t_arc, t_idx, "node_id")

    result_blaze = compute(joined, {t_arc: rddarc, t_idx:rddidx})
    t = by(joined['name'], count=joined['node_id'].count())
    a = compute(t, {t_arc: rddarc, t_idx:rddidx})
    in_degree = dict(a.collect())
    assert in_degree == {'A': 1, 'C': 2}
Example #9
def test_spark_reductions():
    for expr in reduction_exprs:
        result = compute(expr, rdd)
        expected = compute(expr, data)
        if not result == expected:
            print(result)
            print(expected)
            if isinstance(result, float):
                assert abs(result - expected) < 0.001
            else:
                assert result == expected
Example #10
def test_spark_reductions():
    for expr in reduction_exprs:
        result = compute(expr, rdd)
        expected = compute(expr, data)
        if not result == expected:
            print(result)
            print(expected)
            if isinstance(result, float):
                assert abs(result - expected) < 0.001
            else:
                assert result == expected
Example #11
def check_exprs_against_python(exprs, data, rdd):
    any_bad = False
    for expr in exprs:
        result = compute(expr, rdd).collect()
        expected = list(compute(expr, data))
        if not result == expected:
            any_bad = True
            print("Expression:", expr)
            print("Spark:", result)
            print("Python:", expected)

    assert not any_bad
Example #12
def check_exprs_against_python(exprs, data, rdd):
    any_bad = False
    for expr in exprs:
        result = compute(expr, rdd).collect()
        expected = list(compute(expr, data))
        if not result == expected:
            any_bad = True
            print("Expression:", expr)
            print("Spark:", result)
            print("Python:", expected)

    assert not any_bad
Example #13
def test_sparksql_compute():
    srdd = into(sqlContext, data, schema=t.schema)
    assert compute_up(t, srdd).context == sqlContext
    assert discover(compute_up(t, srdd).query).subshape[0] == \
            dshape('{name: string, amount: int64, id: int64}')

    assert isinstance(compute(t[['name', 'amount']], srdd), SchemaRDD)

    assert sorted(compute(t.name, srdd).collect()) == ['Alice', 'Alice', 'Bob']

    assert isinstance(compute(t[['name', 'amount']].head(2), srdd),
                      (tuple, list))
Example #14
def test_sparksql_compute():
    srdd = into(sqlContext, data, schema=t.schema)
    assert compute_one(t, srdd).context == sqlContext
    assert discover(compute_one(t, srdd).query).subshape[0] == \
            dshape('{name: string, amount: int64, id: int64}')

    assert isinstance(compute(t[['name', 'amount']], srdd),
                      SchemaRDD)

    assert sorted(compute(t.name, srdd).collect()) == ['Alice', 'Alice', 'Bob']

    assert isinstance(compute(t[['name', 'amount']].head(2), srdd),
                     (tuple, list))
Example #15
def test_spark_join():

    joined = join(t, t2, 'name')
    expected = [('Alice', 100, 1, 'Austin'), ('Bob', 200, 2, 'Boston'),
                ('Alice', 50, 3, 'Austin')]
    result = compute(joined, {t: rdd, t2: rdd2}).collect()
    assert all(i in expected for i in result)
Example #16
def test_sparksql_by_summary():
    t = symbol('t', 'var * {name: string, amount: int64, id: int64}')
    srdd = into(sqlContext, data, schema=t.schema)
    expr = by(t.name, mymin=t.amount.min(), mymax=t.amount.max())
    result = compute(expr, srdd)
    assert result.collect()
    assert (str(discover(result)).replace('?', '')
         == str(expr.dshape).replace('?', ''))
Example #17
def test_spark_multicols_projection():
    result = compute(t[["amount", "name"]], rdd).collect()
    expected = [(100, "Alice"), (200, "Bob"), (50, "Alice")]

    print(result)
    print(expected)

    assert result == expected
Example #18
def test_sparksql_by_summary():
    t = TableSymbol('t', '{name: string, amount: int64, id: int64}')
    srdd = into(sqlContext, data, schema=t.schema)
    expr = by(t.name, mymin=t.amount.min(), mymax=t.amount.max())
    result = compute(expr, srdd)
    assert result.collect()
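    # strip '?' so optional (nullable) measures compare equal to their non-optional dshape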
    assert (str(discover(result)).replace('?', '')
         == str(expr.dshape).replace('?', ''))
Example #19
def test_spark_join():

    joined = join(t, t2, 'name')
    expected = [['Alice', 100, 1, 'Austin'],
                ['Bob', 200, 2, 'Boston'],
                ['Alice', 50, 3, 'Austin']]
    result = compute(joined, {t: rdd, t2: rdd2}).collect()
    assert all(i in expected for i in result)
Example #20
def test_spark_multicols_projection():
    result = compute(t[['amount', 'name']], rdd).collect()
    expected = [(100, 'Alice'), (200, 'Bob'), (50, 'Alice')]

    print(result)
    print(expected)

    assert result == expected
Example #21
def test_spark_multicols_projection():
    result = compute(t[['amount', 'name']], rdd).collect()
    expected = [(100, 'Alice'), (200, 'Bob'), (50, 'Alice')]

    print(result)
    print(expected)

    assert result == expected
Example #22
def test_comprehensive():
    L = [[100, 1, 'Alice'],
         [200, 2, 'Bob'],
         [300, 3, 'Charlie'],
         [400, 4, 'Dan'],
         [500, 5, 'Edith']]

    df = DataFrame(L, columns=['amount', 'id', 'name'])

    rdd = into(sc, df)
    srdd = into(sqlContext, df)

    t = TableSymbol('t', '{amount: int64, id: int64, name: string}')

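    # each expression maps to the backends that can't compute it yet; those are skipped below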
    expressions = {
            t: [],
            t['id']: [],
            t.id.max(): [],
            t.amount.sum(): [],
            t.amount + 1: [],
            sin(t.amount): [srdd], # sparksql without hiveql doesn't support math
            exp(t.amount): [srdd], # sparksql without hiveql doesn't support math
            t.amount > 50: [],
            t[t.amount > 50]: [],
            t.sort('name'): [],
            t.sort('name', ascending=False): [],
            t.head(3): [],
            t.name.distinct(): [],
            t[t.amount > 50]['name']: [],
            t.id.map(lambda x: x + 1, '{id: int}'): [srdd], # no udfs yet
            t[t.amount > 50]['name']: [],
            by(t.name, t.amount.sum()): [],
            by(t.id, t.id.count()): [],
            by(t[['id', 'amount']], t.id.count()): [],
            by(t[['id', 'amount']], (t.amount + 1).sum()): [],
            by(t[['id', 'amount']], t.name.nunique()): [rdd, srdd],
            by(t.id, t.amount.count()): [],
            by(t.id, t.id.nunique()): [rdd, srdd],
            # by(t, t.count()): [],
            # by(t.id, t.count()): [df],
            t[['amount', 'id']]: [],
            t[['id', 'amount']]: [],
            }

    for e, exclusions in expressions.items():
        if rdd not in exclusions:
            if isinstance(e, TableExpr):
                assert into(set, compute(e, rdd)) == into(set, compute(e, df))
            else:
                assert compute(e, rdd) == compute(e, df)
        if srdd not in exclusions:
            if isinstance(e, TableExpr):
                assert into(set, compute(e, srdd)) == into(set, compute(e, df))
            else:
                assert compute(e, srdd) == compute(e, df)
Example #23
def test_comprehensive():
    L = [[100, 1, 'Alice'],
         [200, 2, 'Bob'],
         [300, 3, 'Charlie'],
         [400, 4, 'Dan'],
         [500, 5, 'Edith']]

    df = DataFrame(L, columns=['amount', 'id', 'name'])

    rdd = into(sc, df)
    srdd = into(sqlContext, df)

    t = symbol('t', 'var * {amount: int64, id: int64, name: string}')

    expressions = {
            t: [],
            t['id']: [],
            t.id.max(): [],
            t.amount.sum(): [],
            t.amount + 1: [],
            sin(t.amount): [srdd], # sparksql without hiveql doesn't support math
            exp(t.amount): [srdd], # sparksql without hiveql doesn't support math
            t.amount > 50: [],
            t[t.amount > 50]: [],
            t.sort('name'): [],
            t.sort('name', ascending=False): [],
            t.head(3): [],
            t.name.distinct(): [],
            t[t.amount > 50]['name']: [],
            t.id.map(lambda x: x + 1, 'int'): [srdd], # no udfs yet
            t[t.amount > 50]['name']: [],
            by(t.name, total=t.amount.sum()): [],
            by(t.id, total=t.id.count()): [],
            by(t[['id', 'amount']], total=t.id.count()): [],
            by(t[['id', 'amount']], total=(t.amount + 1).sum()): [],
            by(t[['id', 'amount']], total=t.name.nunique()): [rdd, srdd],
            by(t.id, total=t.amount.count()): [],
            by(t.id, total=t.id.nunique()): [rdd, srdd],
            # by(t, t.count()): [],
            # by(t.id, t.count()): [df],
            t[['amount', 'id']]: [],
            t[['id', 'amount']]: [],
            }

    for e, exclusions in expressions.items():
        if rdd not in exclusions:
            if iscollection(e.dshape):
                assert into(set, compute(e, rdd)) == into(set, compute(e, df))
            else:
                assert compute(e, rdd) == compute(e, df)
        if srdd not in exclusions:
            if iscollection(e.dshape):
                assert into(set, compute(e, srdd)) == into(set, compute(e, df))
            else:
                assert compute(e, srdd) == compute(e, df)
Example #24
def test_table_resource():
    with tmpfile('csv') as filename:
        ds = dshape('var * {a: int, b: int}')
        csv = CSV(filename)
        append(csv, [[1, 2], [10, 20]], dshape=ds)

        t = data(filename)
        assert isinstance(t.data, CSV)
        assert into(list, compute(t)) == into(list, csv)
Example #25
def test_base():
    for expr, exclusions in expressions.items():
        model = compute(expr._subs({t: Table(base, t.schema)}))
        print('\nexpr: %s\n' % expr)
        for source in sources:
            if id(source) in map(id, exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            T = Table(source)
            if iscollection(expr.dshape):
                result = into(model, expr._subs({t: T}))
                if isscalar(expr.dshape.measure):
                    assert set(into([], result)) == set(into([], model))
                else:
                    assert df_eq(result, model)
            elif isrecord(expr.dshape):
                result = compute(expr._subs({t: T}))
                assert into(tuple, result) == into(tuple, model)
            else:
                result = compute(expr._subs({t: T}))
                assert result == model
Example #26
def test_spark_outer_join():
    left = [(1, "Alice", 100), (2, "Bob", 200), (4, "Dennis", 400)]
    left = sc.parallelize(left)
    right = [("NYC", 1), ("Boston", 1), ("LA", 3), ("Moscow", 4)]
    right = sc.parallelize(right)

    L = TableSymbol("L", "{id: int, name: string, amount: real}")
    R = TableSymbol("R", "{city: string, id: int}")

    assert set(compute(join(L, R), {L: left, R: right}).collect()) == set(
        [(1, "Alice", 100, "NYC"), (1, "Alice", 100, "Boston"), (4, "Dennis", 400, "Moscow")]
    )

    assert set(compute(join(L, R, how="left"), {L: left, R: right}).collect()) == set(
        [(1, "Alice", 100, "NYC"), (1, "Alice", 100, "Boston"), (2, "Bob", 200, None), (4, "Dennis", 400, "Moscow")]
    )

    assert set(compute(join(L, R, how="right"), {L: left, R: right}).collect()) == set(
        [(1, "Alice", 100, "NYC"), (1, "Alice", 100, "Boston"), (3, None, None, "LA"), (4, "Dennis", 400, "Moscow")]
    )

    # Full outer join not yet supported
    """
Example #27
def test_base():
    for expr, exclusions in expressions.items():
        model = compute(expr.subs({t: Table(base, t.schema)}))
        for source in sources:
            if id(source) in map(id, exclusions):
                continue
            T = Table(source)
            result = into(model, expr.subs({t: T}))
            if isinstance(expr, TableExpr):
                if expr.iscolumn:
                    assert set(into([], result)) == set(into([], model))
                else:
                    assert df_eq(result, model)
            else:
                assert result == model
Example #28
def test_spark_multi_column_join():
    left = [(1, 2, 3), (2, 3, 4), (1, 3, 5)]
    right = [(1, 2, 30), (1, 3, 50), (1, 3, 150)]
    rleft = sc.parallelize(left)
    rright = sc.parallelize(right)

    L = TableSymbol("L", "{x: int, y: int, z: int}")
    R = TableSymbol("R", "{x: int, y: int, w: int}")

    j = join(L, R, ["x", "y"])

    result = compute(j, {L: rleft, R: rright})
    expected = [(1, 2, 3, 30), (1, 3, 5, 50), (1, 3, 5, 150)]

    print(result.collect())
    assert result.collect() == expected
Example #29
def test_spark_multi_column_join():
    left = [(1, 2, 3), (2, 3, 4), (1, 3, 5)]
    right = [(1, 2, 30), (1, 3, 50), (1, 3, 150)]
    rleft = sc.parallelize(left)
    rright = sc.parallelize(right)

    L = Symbol('L', 'var * {x: int, y: int, z: int}')
    R = Symbol('R', 'var * {x: int, y: int, w: int}')

    j = join(L, R, ['x', 'y'])

    result = compute(j, {L: rleft, R: rright})
    expected = [(1, 2, 3, 30), (1, 3, 5, 50), (1, 3, 5, 150)]

    print(result.collect())
    assert set(result.collect()) == set(expected)
Example #30
def test_union():
    L1 = [["Alice", 100, 1], ["Bob", 200, 2], ["Alice", 50, 3]]
    L2 = [["Alice", 100, 4], ["Bob", 200, 5], ["Alice", 50, 6]]
    L3 = [["Alice", 100, 7], ["Bob", 200, 8], ["Alice", 50, 9]]
    r1 = sc.parallelize(L1)
    r2 = sc.parallelize(L2)
    r3 = sc.parallelize(L3)

    t1 = TableSymbol("t1", "{name: string, amount: int, id: int}")
    t2 = TableSymbol("t2", "{name: string, amount: int, id: int}")
    t3 = TableSymbol("t3", "{name: string, amount: int, id: int}")

    expr = union(t1, t2, t3)

    result = compute(expr, {t1: r1, t2: r2, t3: r3}).collect()

    assert set(map(tuple, result)) == set(map(tuple, L1 + L2 + L3))
Example #31
def test_union():
    L1 = [['Alice', 100, 1], ['Bob', 200, 2], ['Alice', 50, 3]]
    L2 = [['Alice', 100, 4], ['Bob', 200, 5], ['Alice', 50, 6]]
    L3 = [['Alice', 100, 7], ['Bob', 200, 8], ['Alice', 50, 9]]
    r1 = sc.parallelize(L1)
    r2 = sc.parallelize(L2)
    r3 = sc.parallelize(L3)

    t1 = Symbol('t1', 'var * {name: string, amount: int, id: int}')
    t2 = Symbol('t2', 'var * {name: string, amount: int, id: int}')
    t3 = Symbol('t3', 'var * {name: string, amount: int, id: int}')

    expr = union(t1, t2, t3)

    result = compute(expr, {t1: r1, t2: r2, t3: r3}).collect()

    assert set(map(tuple, result)) == set(map(tuple, L1 + L2 + L3))
Example #32
def test_sparksql_join():
    accounts = TableSymbol('accounts', '{name: string, amount: int64, id: int64}')
    accounts_rdd = into(sqlContext, data, schema=accounts.schema)

    cities = TableSymbol('cities', '{name: string, city: string}')
    cities_data = [('Alice', 'NYC'), ('Bob', 'LA')]
    cities_rdd = into(sqlContext,
                      cities_data,
                      schema='{name: string, city: string}')

    expr = join(accounts, cities)

    result = compute(expr, {cities: cities_rdd, accounts: accounts_rdd})

    assert isinstance(result, SchemaRDD)

    assert (str(discover(result)).replace('?', '') ==
            str(expr.dshape))
Example #33
def test_sparksql_join():
    accounts = symbol('accounts', 'var * {name: string, amount: int64, id: int64}')
    accounts_rdd = into(sqlContext, data, schema=accounts.schema)

    cities = symbol('cities', 'var * {name: string, city: string}')
    cities_data = [('Alice', 'NYC'), ('Bob', 'LA')]
    cities_rdd = into(sqlContext,
                      cities_data,
                      schema='{name: string, city: string}')

    expr = join(accounts, cities)

    result = compute(expr, {cities: cities_rdd, accounts: accounts_rdd})

    assert isinstance(result, SchemaRDD)

    assert (str(discover(result)).replace('?', '') ==
            str(expr.dshape))
Example #34
def test_spark_multi_column_join():
    left = [(1, 2, 3),
            (2, 3, 4),
            (1, 3, 5)]
    right = [(1, 2, 30),
             (1, 3, 50),
             (1, 3, 150)]
    rleft = sc.parallelize(left)
    rright = sc.parallelize(right)

    L = TableSymbol('L', '{x: int, y: int, z: int}')
    R = TableSymbol('R', '{x: int, y: int, w: int}')

    j = join(L, R, ['x', 'y'])

    result = compute(j, {L: rleft, R: rright})
    expected = [(1, 2, 3, 30),
                (1, 3, 5, 50),
                (1, 3, 5, 150)]

    print(result.collect())
    assert set(result.collect()) == set(expected)
Example #35
def test_union():
    L1 = [['Alice', 100, 1],
          ['Bob', 200, 2],
          ['Alice', 50, 3]]
    L2 = [['Alice', 100, 4],
          ['Bob', 200, 5],
          ['Alice', 50, 6]]
    L3 = [['Alice', 100, 7],
          ['Bob', 200, 8],
          ['Alice', 50, 9]]
    r1 = sc.parallelize(L1)
    r2 = sc.parallelize(L2)
    r3 = sc.parallelize(L3)

    t1 = TableSymbol('t1', '{name: string, amount: int, id: int}')
    t2 = TableSymbol('t2', '{name: string, amount: int, id: int}')
    t3 = TableSymbol('t3', '{name: string, amount: int, id: int}')

    expr = union(t1, t2, t3)

    result = compute(expr, {t1: r1, t2: r2, t3: r3}).collect()

    assert set(map(tuple, result)) == set(map(tuple, L1 + L2 + L3))
Example #36
def test_spark_multi_level_rowfunc_works():
    expr = t['amount'].map(lambda x: x + 1, 'int')
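    # the second argument ('int') declares the output type of the mapped values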

    assert compute(expr, rdd).collect() == [x[1] + 1 for x in data]
Example #37
def test_spark_merge():
    col = (t['amount'] * 2).label('new')
    expr = merge(t['name'], col)

    assert compute(expr, rdd).collect() == [(row[0], row[1] * 2) for row in data]
Example #38
def test_spark_recursive_rowfunc_is_used():
    expr = by(t['name'], (2 * (t['amount'] + t['id'])).sum())
    expected = [('Alice', 2*(101 + 53)),
                ('Bob', 2*(202))]
    assert set(compute(expr, rdd).collect()) == set(expected)
Example #39
def test_spark_table():
    assert compute(t, rdd) == rdd
Example #40
def test_spark_symbol():
    assert compute(t, rdd) == rdd
Example #41
def test_into_nd_array_column_failure():
    tble = data(L, fields=['id', 'name', 'balance'])
    expr = tble[tble['balance'] < 0]
    colarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(colarray)
Example #42
def test_into_np_ndarray_column():
    t = data(L, fields=['id', 'name', 'balance'])
    expr = t[t.balance < 0].name
    colarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(colarray)
Example #43
def test_compute_on_Data_gives_back_data():
    assert compute(data([1, 2, 3])) == [1, 2, 3]
Example #44
def test_spark_projection():
    assert compute(t['name'], rdd).collect() == [row[0] for row in data]
Example #45
def test_spark_symbol():
    assert compute(t, rdd) == rdd
Example #46
def test_spark_head():
    assert list(compute(t.head(1), rdd)) == list(compute(t.head(1), data))
Example #47
def test_spark_recursive_rowfunc_is_used():
    expr = by(t['name'], total=(2 * (t['amount'] + t['id'])).sum())
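    # per name, sums 2 * (amount + id); with the shared data fixture that is 2*154 for Alice and 2*202 for Bob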
    expected = [('Alice', 2*(101 + 53)),
                ('Bob', 2*(202))]
    assert set(compute(expr, rdd).collect()) == set(expected)
Example #48
def test_spark_distinct():
    assert set(compute(t['name'].distinct(), rdd).collect()) == \
            set(['Alice', 'Bob'])
Example #49
def test_into_nd_array_selection():
    t = data(L, fields=['id', 'name', 'balance'])
    expr = t[t['balance'] < 0]
    selarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(selarray)
Example #50
def test_resources_fail():
    t = symbol('t', 'var * {x: int, y: int}')
    d = t[t['x'] > 100]
    with pytest.raises(ValueError):
        compute(d)
Example #51
def test_spark_merge():
    col = (t['amount'] * 2).label('new')
    expr = merge(t['name'], col)

    assert compute(expr, rdd).collect() == [(row[0], row[1] * 2) for row in data]
Example #52
def test_compute_on_literal_gives_back_data():
    assert compute(literal([1, 2, 3])) == [1, 2, 3]
Example #53
def test_spark_selection_out_of_order():
    expr = t['name'][t['amount'] < 100]

    assert compute(expr, rdd).collect() == ['Alice']
Example #54
def test_spark_projection():
    assert compute(t['name'], rdd).collect() == [row[0] for row in data]
Example #55
def test_Data_on_json_is_concrete():
    d = data(example('accounts-streaming.json'))

    assert compute(d.amount.sum()) == 100 - 200 + 300 + 400 - 500
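    # computing twice checks that the streaming JSON source was made concrete (re-iterable)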
    assert compute(d.amount.sum()) == 100 - 200 + 300 + 400 - 500
Example #56
def test_spark_selection_out_of_order():
    expr = t['name'][t['amount'] < 100]

    assert compute(expr, rdd).collect() == ['Alice']
Example #57
def test_spark_multi_level_rowfunc_works():
    expr = t['amount'].map(lambda x: x + 1)

    assert compute(expr, rdd).collect() == [x[1] + 1 for x in data]
Example #58
def test_spark_distinct():
    assert set(compute(t['name'].distinct(), rdd).collect()) == \
            set(['Alice', 'Bob'])
Example #59
def test_compute():
    assert list(compute(t['amount'] + 1)) == [101, 201]