Example #1
0
def test_dir():
    """Scalar addition builds an ``Add`` on ints but raises on datetimes."""
    int_col = symbol('i', '10 * int')
    dt_col = symbol('d', '10 * datetime')

    # Integer columns accept ``+ 1`` and yield an ``Add`` expression.
    summed = int_col + 1
    assert isinstance(summed, Add)

    # The same operation on a datetime column is expected to raise.
    with pytest.raises(Exception):
        dt_col + 1
Example #2
0
def test_outer_join():
    """Join two in-memory tables under every ``how`` mode and compare rows."""
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    def joined_rows(expr):
        # Row order is not checked, so results are compared as sets.
        return set(compute(expr, {L: left, R: right}))

    # Rows whose id appears on both sides.
    matched = {(1, 'Alice', 100, 'NYC'),
               (1, 'Alice', 100, 'Boston'),
               (4, 'Dennis', 400, 'Moscow')}
    # Rows present on one side only; the missing side is filled with None.
    left_only = {(2, 'Bob', 200, None)}
    right_only = {(3, None, None, 'LA')}

    assert joined_rows(join(L, R)) == matched
    assert joined_rows(join(L, R, how='left')) == matched | left_only
    assert joined_rows(join(L, R, how='right')) == matched | right_only
    assert (joined_rows(join(L, R, how='outer')) ==
            matched | left_only | right_only)
Example #3
0
def test_multi_dataset_broadcast_with_Record_types():
    """Add two record fields to a second dataset, both fed as iterators."""
    x = symbol('x', '3 * {p: int, q: int}')
    y = symbol('y', '3 * int')

    records = [(1, 1), (2, 2), (3, 3)]
    offsets = [10, 20, 30]

    expr = x.p + x.q + y
    result = compute(expr, {x: iter(records), y: iter(offsets)})
    assert list(result) == [12, 24, 36]
Example #4
0
def test_compute_signature():
    """Check the signatures derived for arithmetic and datetime expressions."""
    int_sym = symbol('s', 'int64')
    float_sym = symbol('t', 'float32')
    dt_sym = symbol('d', 'datetime')

    mixed_sig = compute_signature(int_sym + float_sym)
    assert mixed_sig == float64(int64, float32)

    truncated_sig = compute_signature(dt_sym.truncate(days=1))
    assert truncated_sig == datetime64('D')(datetime64('us'))

    day_sig = compute_signature(dt_sym.day + 1)
    assert day_sig == int64(datetime64('us'))
Example #5
0
def test_multi_dataset_broadcast():
    """Element-wise arithmetic across two separately bound datasets."""
    x = symbol('x', '3 * int')
    y = symbol('y', '3 * int')

    scope = {x: [1, 2, 3], y: [10, 20, 30]}

    plain_sum = compute(x + y, scope)
    assert list(plain_sum) == [11, 22, 33]

    scaled_sum = compute(2*x + (y + 1), scope)
    assert list(scaled_sum) == [13, 25, 37]
Example #6
0
def test_truncate_datetime():
    """Truncate a datetime to two-day units, as a scalar and as a column."""
    stamp = datetime(2002, 1, 3, 12, 30)
    truncated = date(2002, 1, 2)

    scalar = symbol('x', 'datetime')
    assert compute(scalar.truncate(2, 'days'), stamp) == truncated

    column = symbol('x', 'var * datetime')
    result = compute(column.truncate(2, 'days'), [stamp])
    assert list(result) == [truncated]
Example #7
0
def test_selection_inner_inputs():
    """Select rows of one dask dataframe with a predicate spanning two inputs."""
    s_data = pd.DataFrame({'a': np.arange(5)})
    t_data = pd.DataFrame({'a': np.arange(5)})

    s_dd = dd.from_pandas(s_data, npartitions=2)
    t_dd = dd.from_pandas(t_data, npartitions=2)

    s = symbol('s', 'var * {a: int64}')
    t = symbol('t', 'var * {a: int64}')

    selected = compute(s[s.a == t.a], {s: s_dd, t: t_dd})
    # NOTE(review): the return value of ``eq`` is not asserted; presumably
    # ``eq`` raises on mismatch -- confirm against its definition.
    eq(selected, s_data)
def test_concat(sql_two_tables):
    """Concatenate two SQL tables, sort, and compare as a DataFrame."""
    t_table, u_table = sql_two_tables
    t_data = pd.DataFrame(np.arange(5), columns=['a'])
    u_data = pd.DataFrame(np.arange(5, 10), columns=['a'])
    # Load the frames into the backing tables.
    odo(t_data, t_table)
    odo(u_data, u_table)

    t = symbol('t', discover(t_data))
    u = symbol('u', discover(u_data))

    combined = concat(t, u).sort('a')
    result = compute(combined, {t: t_table, u: u_table},
                     return_type=pd.DataFrame)
    expected = pd.DataFrame(np.arange(10), columns=['a'])
    tm.assert_frame_equal(result, expected)
Example #9
0
def test_concat(sql_two_tables):
    """Concatenate two SQL tables, sort, and convert the result with odo."""
    t_table, u_table = sql_two_tables
    t_data = pd.DataFrame(np.arange(5), columns=["a"])
    u_data = pd.DataFrame(np.arange(5, 10), columns=["a"])
    # Load the frames into the backing tables.
    odo(t_data, t_table)
    odo(u_data, u_table)

    t = symbol("t", discover(t_data))
    u = symbol("u", discover(u_data))

    computed = compute(concat(t, u).sort("a"), {t: t_table, u: u_table})
    result = odo(computed, pd.DataFrame)
    expected = pd.DataFrame(np.arange(10), columns=["a"])
    tm.assert_frame_equal(result, expected)
Example #10
0
def test_str_cat_bcast(sql_with_null):
    """Concatenate a column with a scalar string symbol and compare to pandas."""
    t = symbol('t', discover(sql_with_null))
    lit_sym = symbol('s', 'string')
    subset = t[t.amount <= 200]

    result = compute(subset.comment.str_cat(lit_sym, sep=' '),
                     {t: sql_with_null, lit_sym: '!!'},
                     return_type=pd.Series)

    df = compute(subset, sql_with_null, return_type=pd.DataFrame)
    suffixes = ['!!'] * len(df.comment)
    expected = df.comment.str.cat(suffixes, sep=' ')

    expected_null = expected.isnull()
    result_null = result.isnull()

    # Non-null values must match, and nulls must sit at the same index labels.
    assert all(expected[~expected_null] == result[~result_null])
    assert all(expected[expected_null].index == result[result_null].index)
Example #11
0
def test_graph_double_join():
    """Join node names to arcs, then restrict to a wanted set of names."""
    idx = [['A', 1],
           ['B', 2],
           ['C', 3],
           ['D', 4],
           ['E', 5],
           ['F', 6]]

    arc = [[1, 3],
           [2, 3],
           [4, 3],
           [5, 3],
           [3, 1],
           [2, 1],
           [5, 1],
           [1, 6],
           [2, 6],
           [4, 6]]

    wanted = [['A'],
              ['F']]

    t_idx = symbol('t_idx', 'var * {name: string, b: int32}')
    t_arc = symbol('t_arc', 'var * {a: int32, b: int32}')
    t_wanted = symbol('t_wanted', 'var * {name: string}')

    # >>> compute(join(t_idx, t_arc, 'b'), {t_idx: idx, t_arc: arc})
    # [[1, A, 3],
    #  [1, A, 2],
    #  [1, A, 5],
    #  [3, C, 1],
    #  [3, C, 2],
    #  [3, C, 4],
    #  [3, C, 5],
    #  [6, F, 1],
    #  [6, F, 2],
    #  [6, F, 4]]

    named_arcs = join(t_idx, t_arc, 'b')
    j = join(named_arcs, t_wanted, 'name')[['name', 'b', 'a']]

    scope = {t_idx: idx, t_arc: arc, t_wanted: wanted}
    # Sort both sides so the comparison ignores row order.
    result = sorted(map(tuple, compute(j, scope)))
    expected = sorted([('A', 1, 3),
                       ('A', 1, 2),
                       ('A', 1, 5),
                       ('F', 6, 1),
                       ('F', 6, 2),
                       ('F', 6, 4)])

    assert result == expected
Example #12
0
def test_compute_with_variable_in_namespace(iris_server):
    """POST an expression whose free variable is supplied via ``namespace``."""
    t = symbol('t', discover(iris))
    pl = symbol('pl', 'float32')
    expr = t[t.petal_length > pl].species

    payload = {
        'expr': to_tree(expr, {pl: 'pl'}),
        'namespace': {'pl': 5},
    }
    resp = iris_server.post('/compute.json',
                            data=json.dumps(payload),
                            content_type='application/json')

    assert 'OK' in resp.status
    body = json.loads(resp.data.decode('utf-8'))
    # The server result must match substituting the value locally.
    expected = list(compute(expr._subs({pl: 5}), {t: iris}))
    assert body['data'] == expected
Example #13
0
def strcat_sym():
    """Return the blaze symbol used to test exceptions raised by cat()."""
    shape = dshape('3 * {name: string, comment: string, num: int32}')
    return symbol('s', dshape=shape)
Example #14
0
def test_join():
    """Join two tables on ``name`` and check both schema and rows."""
    left = [['Alice', 100], ['Bob', 200]]
    right = [['Alice', 1], ['Bob', 2]]

    L = symbol('L', 'var * {name: string, amount: int}')
    R = symbol('R', 'var * {name: string, id: int}')
    joined = join(L, R, 'name')

    actual_schema = dshape(joined.schema)
    assert actual_schema == dshape('{name: string, amount: int, id: int}')

    rows = list(compute(joined, {L: left, R: right}))
    assert rows == [('Alice', 100, 1), ('Bob', 200, 2)]
Example #15
0
def test_slicing_with_lists():
    """List-based indexing gives the same result on ``dx`` as on numpy."""
    nx = np.arange(20).reshape((4, 5))
    dx = from_array(nx, (2, 2))
    sx = symbol('x', discover(dx))

    def matches_numpy(expr):
        # Evaluate against both backends and compare via ``eq``.
        return eq(np.array(compute(expr, dx)), compute(expr, nx))

    assert matches_numpy(sx[[2, 0, 3]])
    assert matches_numpy(sx[::2, [2, 0, 3]])
    assert matches_numpy(sx[1, [2, 0, 3]])
    assert matches_numpy(sx[[2, 0, 3], -2])

    # A full slice should leave the ``dask`` attribute unchanged.
    full = sx[:, :]
    assert compute(full, dx).dask == dx.dask

    assert matches_numpy(sx[0])
    assert matches_numpy(sx[0, [3, 1, 4]])
Example #16
0
def test_coerce_series():
    """Coerce a series of digit strings to ``int64`` through dask."""
    source = pd.Series(list('1234'), name='a')
    dask_series = dd.from_pandas(source, npartitions=2)
    t = symbol('t', discover(source))

    coerced = compute(t.coerce(to='int64'), dask_series)
    eq(coerced, pd.Series([1, 2, 3, 4], name=source.name))
Example #17
0
def test_str_namespace():
    """``bzs`` functions build the same expressions as the ``.str`` accessor."""
    t = symbol('t', 'var * {name: string}')
    col = t.name

    assert bzs.upper(col).isidentical(col.str.upper())
    assert bzs.lower(col).isidentical(col.str.lower())

    nested = bzs.lower(bzs.upper(col))
    assert nested.isidentical(col.str.upper().str.lower())

    assert bzs.len(col).isidentical(col.str.len())
    assert bzs.like(col, '*a').isidentical(col.str.like('*a'))

    cat_by_function = bzs.cat(bzs.cat(col, col, sep=' ++ '), col)
    cat_by_accessor = col.str.cat(col, sep=' ++ ').str.cat(col)
    assert cat_by_function.isidentical(cat_by_accessor)

    # Zero-argument predicates: same name in the module and on the accessor.
    for predicate in ('isalnum', 'isalpha', 'isdecimal', 'isdigit',
                      'islower', 'isnumeric', 'isspace', 'istitle',
                      'isupper'):
        from_module = getattr(bzs, predicate)(col)
        from_accessor = getattr(col.str, predicate)()
        assert from_module.isidentical(from_accessor)

    assert bzs.replace(col, 'A', 'a').isidentical(col.str.replace('A', 'a'))

    # Zero-argument transformations.
    for method in ('capitalize', 'strip', 'lstrip', 'rstrip'):
        from_module = getattr(bzs, method)(col)
        from_accessor = getattr(col.str, method)()
        assert from_module.isidentical(from_accessor)

    assert bzs.pad(col, 5).isidentical(col.str.pad(5))

    replaced = bzs.slice_replace(col, 1, 3, 'foo')
    assert replaced.isidentical(col.str.slice_replace(1, 3, 'foo'))
Example #18
0
def test_by_multi_column_grouper():
    """Group by two columns and count a third."""
    t = symbol('t', 'var * {x: int, y: int, z: int}')
    expr = by(t[['x', 'y']], total=t['z'].count())
    data = [(1, 2, 0), (1, 2, 0), (1, 1, 0)]

    # Compute once; no debug output is printed during the test run.
    result = set(compute(expr, data))
    assert result == {(1, 2, 2), (1, 1, 1)}
Example #19
0
def test_like(ds):
    """``like`` keeps its pattern and yields a bool measure."""
    t = symbol('t', ds)
    # Use the ``name`` attribute when present, otherwise the symbol itself.
    target = getattr(t, 'name', t)
    expr = target.like('Alice*')
    assert expr.pattern == 'Alice*'

    # The result is optional exactly when the datashape string contains '?'.
    option_prefix = '?' if '?' in ds else ''
    expected_measure = dshape('%sbool' % option_prefix).measure
    assert expr.schema.measure == expected_measure
Example #20
0
def test_upper_schema(ds):
    """``upper`` and ``lower`` both produce a string measure."""
    t = symbol('t', ds)
    upper_measure = getattr(t, 'name', t).str.upper().schema.measure
    lower_measure = getattr(t, 'name', t).str.lower().schema.measure

    # The result is optional exactly when the datashape string contains '?'.
    option_prefix = '?' if '?' in ds else ''
    expected_measure = dshape('%sstring' % option_prefix).measure
    assert upper_measure == lower_measure == expected_measure
Example #21
0
def test_pre_compute():
    """``pre_compute`` accepts lists and iterators of tuples or dicts."""
    s = symbol('s', 'var * {a: int, b: int}')

    # A list input compares equal to itself.
    assert pre_compute(s, [(1, 2)]) == [(1, 2)]

    # Iterator inputs, with one and two rows.
    single = pre_compute(s, iter([(1, 2)]))
    assert list(single) == [(1, 2)]
    double = pre_compute(s, iter([(1, 2), (3, 4)]))
    assert list(double) == [(1, 2), (3, 4)]

    # Dict rows come back as tuples.
    dict_rows = iter([{'a': 1, 'b': 2},
                      {'a': 3, 'b': 4}])
    assert list(pre_compute(s, dict_rows)) == [(1, 2), (3, 4)]
Example #22
0
def test_dist(nyc):
    """Max of a derived distance column agrees between backend and pandas."""
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        # NOTE(review): ``cos`` is applied to the raw latitudes while the
        # deltas go through ``radians`` -- confirm this is intended.
        delta_lon = radians(lon2 - lon1)
        delta_lat = radians(lat2 - lat1)
        lat_term = sin(delta_lat / 2.0) ** 2
        lon_term = cos(lat1) * cos(lat2) * sin(delta_lon / 2.0) ** 2
        a = lat_term + lon_term
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))

    # Bounding box applied to both pickup and dropoff coordinates.
    lat_min, lat_max = 40.477399, 40.917577
    lon_min, lon_max = -74.259090, -73.700272

    in_bounds = (
        (t.pickup_latitude >= lat_min) &
        (t.pickup_latitude <= lat_max) &
        (t.dropoff_latitude >= lat_min) &
        (t.dropoff_latitude <= lat_max) &
        (t.pickup_longitude >= lon_min) &
        (t.pickup_longitude <= lon_max) &
        (t.dropoff_longitude >= lon_min) &
        (t.dropoff_longitude <= lon_max) &
        (t.passenger_count < 6)
    )
    filtered = t[in_bounds]

    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)

    backend_max = compute(transformed.dist.max(), nyc, return_type=float)
    pandas_max = compute(transformed.dist, nyc, return_type=pd.Series).max()
    assert backend_max == pandas_max
Example #23
0
def test_from_tree_is_robust_to_unnecessary_namespace():
    """``from_tree`` tolerates a namespace the tree was not built with."""
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t.amount + 1

    # Serialise without a namespace ...
    tree = to_tree(expr)

    # ... but pass one when deserialising.
    rebuilt = from_tree(tree, {'t': t})
    assert rebuilt.isidentical(expr)
Example #24
0
def test_shift_arithmetic(sql, n):
    """A column minus its shifted self matches the pandas equivalent."""
    t = symbol('t', discover(sql))
    difference = t.B - t.B.shift(n)
    result = compute(difference, sql, return_type=pd.Series)

    frame = odo(sql, pd.DataFrame)
    expected = frame.B - frame.B.shift(n)
    tm.assert_series_equal(result, expected)
Example #25
0
def test_sample(big_sql):
    """Sampling by ``n`` and by ``frac=0.5`` gives results of equal length."""
    nn = symbol('nn', discover(big_sql))
    nrows = odo(compute(nn.nrows, big_sql), int)
    half = nrows // 2

    by_count = compute(nn.sample(n=half), big_sql, return_type=pd.DataFrame)
    assert len(by_count) == nrows // 2

    by_fraction = compute(nn.sample(frac=0.5), big_sql,
                          return_type=pd.DataFrame)
    assert len(by_count) == len(by_fraction)
Example #26
0
def test_to_tree():
    """``to_tree`` serialises a field sum into nested op/args dicts."""
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t.amount.sum()

    # Build the expected tree from the leaf outwards.
    symbol_node = {
        'op': 'Symbol',
        'args': ['t', 'var * {name: string, amount: int32}', 0],
    }
    field_node = {'op': 'Field', 'args': [symbol_node, 'amount']}
    expected = {'op': 'sum', 'args': [field_node, [0], False]}

    assert to_tree(expr) == expected
Example #27
0
def test_coerce_bool_and_sum(sql):
    """Summing a boolean coerced to int32 matches counting with pandas."""
    t = symbol(sql.name, discover(sql))
    above_one = (t.B > 1.0).coerce(to='int32')
    result = compute(above_one.sum(), sql).scalar()

    column = odo(compute(t.B, sql), pd.Series)
    expected = column.gt(1).sum()
    assert result == expected
Example #28
0
def test_shift_on_column(n, column, sql):
    """Shifting a single column matches ``Series.shift`` in pandas."""
    table = sql.data
    t = symbol('t', discover(table))
    shifted = t[column].shift(n)
    result = compute(shifted, table, return_type=pd.Series)

    frame = odo(table, pd.DataFrame)
    expected = frame[column].shift(n)
    tm.assert_series_equal(result, expected)
Example #29
0
def test_isin_selectable(sql):
    """``isin`` can be computed against a select wrapping the resource."""
    s = symbol('s', discover(sql))
    membership = s.B.isin({1, 3})

    # wrap the resource in a select
    selectable = sa.select(sql._resources()[sql].columns)

    result = compute(membership, selectable, return_type=list)
    assert result == [(True,), (False,)]
Example #30
0
def test_str_slice(slc, sql_with_null):
    """String slicing on the SQL backend matches pandas ``.str[slc]``."""
    names = ['Alice', None, 'Drew', 'Bob', 'Drew', 'first', None]
    name_series = pd.Series(names, name='substring_1')

    t = symbol('t', discover(sql_with_null))
    sliced = compute(t.name.str[slc], sql_with_null, return_type=pd.Series)

    # Map both nulls and empty strings to one sentinel before comparing.
    result = sliced.fillna('zzz')
    result[result == ''] = 'zzz'
    expected = name_series.str[slc].fillna('zzz')
    tm.assert_series_equal(result, expected)
Example #31
0
def test_sample(big_sql):
    """Sampling by ``n`` and by ``frac=0.5`` gives results of equal length."""
    nn = symbol('nn', discover(big_sql))
    nrows = odo(compute(nn.nrows, big_sql), int)

    counted = nn.sample(n=nrows // 2)
    result = compute(counted, big_sql, return_type=pd.DataFrame)
    assert len(result) == nrows // 2

    fractional = nn.sample(frac=0.5)
    result2 = compute(fractional, big_sql, return_type=pd.DataFrame)
    assert len(result) == len(result2)
Example #32
0
def test_to_from_tree_namespace():
    """Round-trip an expression through a tree using a named symbol."""
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t.name

    # With ``names`` given, the symbol is serialised as its plain name.
    tree = to_tree(expr, names={t: 't'})
    expected_tree = {'op': 'Field', 'args': ['t', 'name']}
    assert tree == expected_tree

    rebuilt = from_tree(tree, namespace={'t': t})
    assert rebuilt.isidentical(expr)
Example #33
0
def test_foreign_key_isin(fkey):
    """``isin`` on a foreign-key column compiles to an ``IN`` clause."""
    t = symbol('fkey', discover(fkey))
    membership = t.sym_id.isin([1, 2])
    native = compute(membership, fkey, return_type='native')
    expected = """SELECT
        fkey.sym_id IN (%(sym_id_1)s, %(sym_id_2)s) AS anon_1
    FROM fkey
    """
    actual_sql = normalize(str(native))
    assert actual_sql == normalize(expected)
Example #34
0
def test_timedelta_stat_reduction(sql_with_timedeltas, func):
    """A statistical reduction over timedeltas matches a pandas computation."""
    sym = symbol('s', discover(sql_with_timedeltas))
    expr = getattr(sym.N, func)()

    deltas = pd.Series([timedelta(seconds=n) for n in range(10)])
    # Divide the int64 values by 1e9 before applying the reduction.
    as_seconds = deltas.astype('int64') / 1e9
    reduced = getattr(as_seconds, func)(ddof=expr.unbiased)
    expected = timedelta(seconds=reduced)

    result = compute(expr, sql_with_timedeltas, return_type=timedelta)
    assert result == expected
Example #35
0
def test_distinct_on(sql):
    """``distinct('A')`` after a sort compiles to ``DISTINCT ON``."""
    t = symbol('t', discover(sql))
    projected = t[['A', 'B']].sort('A')
    computation = compute(projected.distinct('A'), sql)

    expected_sql = """
    SELECT DISTINCT ON (anon_1."A") anon_1."A", anon_1."B"
    FROM (SELECT {tbl}."A" AS "A", {tbl}."B" AS "B"
    FROM {tbl}) AS anon_1 ORDER BY anon_1."A" ASC
    """.format(tbl=sql.name)
    assert normalize(str(computation)) == normalize(expected_sql)

    rows = odo(computation, tuple)
    assert rows == (('a', 1), ('b', 2))
Example #36
0
def test_dataset():
    """A ``CachedDataset`` discovers like its data and records results."""
    ns = {'t': df, 'x': 10}
    cache = dict()
    d = CachedDataset(ns, cache=cache)

    assert discover(d) == discover(ns)

    s = symbol('s', discover(d))
    # A bare ``a == b`` statement discards its result, so these comparisons
    # must be asserted for the test to check anything.
    assert compute(s.x * 2, d) == 20
    assert cache == {s.x * 2: 20}
Example #37
0
def test_foreign_key_chain(fkey):
    """Following two chained foreign keys compiles to a three-table query."""
    t = symbol('t', discover(fkey))
    chained = t.sym_id.main.data
    query = compute(chained, fkey)
    expected = """SELECT
        main.data
    FROM main, fkey, pkey
    WHERE fkey.sym_id = pkey.id and pkey.main = main.id
    """
    actual_sql = normalize(str(query))
    assert actual_sql == normalize(expected)
Example #38
0
def test_auto_join_field(orders):
    """Accessing a field through a foreign key compiles to a joined query."""
    t = symbol('t', discover(orders))
    color = t.product_id.color
    query = compute(color, orders)
    expected = """SELECT
        products.color
    FROM products, orders
    WHERE orders.product_id = products.product_id
    """
    actual_sql = normalize(str(query))
    assert actual_sql == normalize(expected)
Example #39
0
def test_map_datetime():
    """Map integer timestamps to datetimes with ``utcfromtimestamp``."""
    from datetime import datetime
    data = [['A', 0], ['B', 1]]
    t = symbol('t', 'var * {foo: string, datetime: int64}')

    mapped = t['datetime'].map(datetime.utcfromtimestamp, 'datetime')
    result = list(compute(mapped, data))

    epoch = datetime(1970, 1, 1, 0, 0, 0)
    one_second_later = datetime(1970, 1, 1, 0, 0, 1)
    assert result == [epoch, one_second_later]
Example #40
0
def test_timedelta_arith(sql_with_dts):
    """Adding or subtracting a timedelta matches pandas arithmetic."""
    delta = timedelta(days=1)
    dates = pd.Series(pd.date_range('2014-01-01', '2014-02-01'))
    sym = symbol('s', discover(dates))

    added = odo(compute(sym + delta, sql_with_dts), pd.Series)
    assert (added == dates + delta).all()

    subtracted = odo(compute(sym - delta, sql_with_dts), pd.Series)
    assert (subtracted == dates - delta).all()
Example #41
0
def test_str_slice(slc, sql_with_null):
    """String slicing on the SQL backend matches pandas ``.str[slc]``."""
    names = ['Alice', None, 'Drew', 'Bob', 'Drew', 'first', None]
    name_series = pd.Series(names, name='substring_1')

    t = symbol('t', discover(sql_with_null))
    sliced = compute(t.name.str[slc], sql_with_null, return_type=pd.Series)

    # Map both nulls and empty strings to one sentinel before comparing.
    result = sliced.fillna('zzz')
    result[result == ''] = 'zzz'
    expected = name_series.str[slc].fillna('zzz')
    tm.assert_series_equal(result, expected)
def test_coalesce(sqla):
    """``coalesce`` substitutes the fallback value; results are flat lists."""
    t = symbol('t', discover(sqla))

    numeric = compute(coalesce(t.B, -1), {t: sqla}, return_type=list)
    assert numeric == [1, 1, -1]

    textual = compute(coalesce(t.A, 'z'), {t: sqla}, return_type=list)
    assert textual == ['a', 'z', 'c']
Example #43
0
def test_scalar():
    """Compute fields of a single record holding a nested collection."""
    s = symbol('s', '{name: string, id: int32, payments: var * {amount: int32, when: datetime}}')
    payments = ((100, datetime(2000, 1, 1, 1, 1, 1)),
                (200, datetime(2000, 2, 2, 2, 2, 2)),
                (300, datetime(2000, 3, 3, 3, 3, 3)))
    data = ('Alice', 1, payments)

    # Top-level scalar fields.
    assert compute(s.name, data) == 'Alice'
    assert compute(s.id + 1, data) == 2

    # Fields of the nested ``payments`` collection.
    amounts = compute(s.payments.amount, data)
    assert tuple(amounts) == (100, 200, 300)
    bumped = compute(s.payments.amount + 1, data)
    assert tuple(bumped) == (101, 201, 301)
Example #44
0
def test_like():
    """Filter rows with glob patterns on one or more columns."""
    t = symbol('t', 'var * {name: string, city: string}')
    alice_smith = ('Alice Smith', 'New York')
    bob_smith = ('Bob Smith', 'Chicago')
    alice_walker = ('Alice Walker', 'LA')
    data = [alice_smith, bob_smith, alice_walker]

    def matching(**patterns):
        return list(compute(t.like(**patterns), data))

    assert matching(name='Alice*') == [alice_smith, alice_walker]
    # Without a leading wildcard, 'lice*' matches no row.
    assert matching(name='lice*') == []
    assert matching(name='*Smith*') == [alice_smith, bob_smith]
    assert matching(name='*Smith*', city='New York') == [alice_smith]
Example #45
0
def test_coalesce(sqla):
    """``coalesce`` substitutes the fallback value; results are 1-tuples."""
    t = symbol('t', discover(sqla))

    numeric = compute(coalesce(t.B, -1), {t: sqla}, return_type=list)
    assert numeric == [(1,), (1,), (-1,)]

    textual = compute(coalesce(t.A, 'z'), {t: sqla}, return_type=list)
    assert textual == [('a',), ('z',), ('c',)]
Example #46
0
def test_nested():
    """Access top-level and nested fields of the ordered payments data."""
    t = symbol('t', payment_dshape)

    names = list(compute(t.name, payments_ordered))
    assert names == ['Alice', 'Bob']

    # The payments column is the second element of every record.
    nested = list(compute(t.payments, payments_ordered))
    assert nested == [record[1] for record in payments_ordered]

    amounts = list(compute(t.payments.amount, payments_ordered))
    assert amounts == [(100, 200), (300, -400, 500)]

    bumped = list(compute(t.payments.amount + 1, payments_ordered))
    assert bumped == [(101, 201), (301, -399, 501)]
Example #47
0
def test_concat(sql_two_tables):
    """Concatenate two SQL tables, sort, and convert the result with odo."""
    t_table, u_table = sql_two_tables
    t_data = pd.DataFrame(np.arange(5), columns=['a'])
    u_data = pd.DataFrame(np.arange(5, 10), columns=['a'])
    # Load the frames into the backing tables.
    odo(t_data, t_table)
    odo(u_data, u_table)

    t = symbol('t', discover(t_data))
    u = symbol('u', discover(u_data))

    scope = {t: t_table, u: u_table}
    computed = compute(concat(t, u).sort('a'), scope)
    result = odo(computed, pd.DataFrame)
    expected = pd.DataFrame(np.arange(10), columns=['a'])
    tm.assert_frame_equal(result, expected)
Example #48
0
def test_compute_with_variable_in_namespace(iris_server, serial):
    """POST a serialised expression whose free variable is in ``namespace``."""
    t = symbol('t', discover(iris))
    pl = symbol('pl', 'float32')
    expr = t[t.petal_length > pl].species

    payload = {
        'expr': to_tree(expr, {pl: 'pl'}),
        'namespace': {'pl': 5},
    }
    blob = serial.dumps(payload)
    endpoint = '/compute.{name}'.format(name=serial.name)
    resp = iris_server.post(endpoint, data=blob)

    assert 'OK' in resp.status
    data = serial.loads(resp.data)
    # The server result must match substituting the value locally.
    expected = list(compute(expr._subs({pl: 5}), {t: iris}))
    assert data['data'] == expected
    assert data['names'] == ['species']
Example #49
0
def test_datetime_access():
    """Read year, second and date from a datetime column."""
    stamp = datetime(2000, 1, 1, 1, 1, 1)
    data = [['Alice', 100, 1, stamp],
            ['Bob', 200, 2, stamp],
            ['Alice', 50, 3, stamp]]

    t = symbol('t',
            'var * {amount: float64, id: int64, name: string, when: datetime}')

    years = list(compute(t.when.year, data))
    assert years == [2000, 2000, 2000]

    seconds = list(compute(t.when.second, data))
    assert seconds == [1, 1, 1]

    dates = list(compute(t.when.date, data))
    assert dates == [date(2000, 1, 1)] * 3
Example #50
0
def test_by_groupby_deep():
    """Group a filtered and merged projection by name, averaging ``x``."""
    data = [(1, 2, 'Alice'), (1, 3, 'Bob'), (2, 4, 'Alice'), (2, 4, '')]

    schema = '{x: int, y: int, name: string}'
    t = symbol('t', datashape.var * schema)

    # Drop rows with an empty name, then keep only the x and name columns.
    named = t[t['name'] != '']
    projected = merge(named.x, named.name)
    expr = by(projected.name, avg=projected.x.mean())

    result = set(compute(expr, data))
    assert result == {('Alice', 1.5), ('Bob', 1.0)}
Example #51
0
def test_auto_join_projection(orders):
    """Projecting columns through a foreign key compiles to a joined query."""
    t = symbol('t', discover(orders))
    projection = t.product_id[['color', 'price']]
    query = compute(projection, orders)
    expected = """SELECT
        products.color,
        products.price
    FROM products, orders
    WHERE orders.product_id = products.product_id
    """
    actual_sql = normalize(str(query))
    assert actual_sql == normalize(expected)
def test_datetime_access(attr, dtype, sql_with_dts):
    """A ``.dt`` accessor method matches the pandas ``.dt`` attribute."""
    s = symbol('s', discover(sql_with_dts))
    accessor_expr = getattr(s.A.dt, attr)()
    result = compute(accessor_expr, sql_with_dts, return_type=pd.Series)
    assert result.dtype == dtype

    # Compare against the same attribute read from the raw column in pandas;
    # names and dtypes are excluded from this comparison.
    raw_column = compute(s.A, sql_with_dts, return_type=pd.Series)
    expected = getattr(raw_column.dt, attr)
    assert_series_equal(
        result,
        expected,
        check_names=False,
        check_dtype=False,
    )
Example #53
0
def test_group_by_map(fkey, grouper):
    """Grouping by a foreign-key column compiles to a plain ``GROUP BY``."""
    t = symbol('fkey', discover(fkey))
    grouped = by(t[grouper], id_count=t.size.count())
    query = compute(grouped, fkey)
    expected = """SELECT
        fkey.sym_id,
        count(fkey.size) AS id_count
    FROM fkey
    GROUP BY fkey.sym_id
    """
    actual_sql = normalize(str(query))
    assert actual_sql == normalize(expected)
def test_str_cat_with_null(sql_with_null, sep):
    """Concatenating two columns yields None whenever either input is None."""
    t = symbol('t', discover(sql_with_null))
    concatenated = compute(t.name.str_cat(t.sex, sep=sep), sql_with_null,
                           return_type=list)
    inputs = compute(t[['name', 'sex']], sql_with_null, return_type=list)

    for actual, (name, sex) in zip(concatenated, inputs):
        if name is None or sex is None:
            assert actual is None
            continue
        # Without a separator the two values are joined directly.
        expected = name + sex if sep is None else name + sep + sex
        assert actual == expected
Example #55
0
def test_outer_join(sc):
    """Join two parallelized collections under every ``how`` mode."""
    left = sc.parallelize([(1, 'Alice', 100),
                           (2, 'Bob', 200),
                           (4, 'Dennis', 400)])
    right = sc.parallelize([('NYC', 1),
                            ('Boston', 1),
                            ('LA', 3),
                            ('Moscow', 4)])

    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    def collected(expr):
        # Row order is not checked, so results are compared as sets.
        return set(compute(expr, {L: left, R: right}).collect())

    # Rows whose id appears on both sides.
    matched = {(1, 'Alice', 100, 'NYC'),
               (1, 'Alice', 100, 'Boston'),
               (4, 'Dennis', 400, 'Moscow')}
    # Rows present on one side only; the missing side is filled with None.
    left_only = {(2, 'Bob', 200, None)}
    right_only = {(3, None, None, 'LA')}

    assert collected(join(L, R)) == matched
    assert collected(join(L, R, how='left')) == matched | left_only
    assert collected(join(L, R, how='right')) == matched | right_only

    # NOTE(review): an earlier comment here said full outer join was not yet
    # supported, yet the result is asserted -- confirm which is current.
    assert (collected(join(L, R, how='outer')) ==
            matched | left_only | right_only)
Example #56
0
def test_multi_column_join(sc):
    """Join two parallelized collections on the pair of columns x and y."""
    left = [(1, 2, 3),
            (2, 3, 4),
            (1, 3, 5)]
    right = [(1, 2, 30),
             (1, 3, 50),
             (1, 3, 150)]
    rleft = sc.parallelize(left)
    rright = sc.parallelize(right)

    L = symbol('L', 'var * {x: int, y: int, z: int}')
    R = symbol('R', 'var * {x: int, y: int, w: int}')

    joined = join(L, R, ['x', 'y'])
    result = compute(joined, {L: rleft, R: rright})

    # Row order is not checked, so rows are compared as sets.
    expected = {(1, 2, 3, 30),
                (1, 3, 5, 50),
                (1, 3, 5, 150)}
    assert set(result.collect()) == expected
Example #57
0
def test_groups():
    """Discover and compute through a nested group in an HDF store."""
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')

        hdf = resource('hdfstore://%s' % fn)
        try:
            assert discover(hdf) == discover({'data': {'fixed': df}})

            s = symbol('s', discover(hdf))

            assert list(compute(s.data.fixed, hdf).a) == [1, 2, 3, 4]
        finally:
            # Close the store even when an assertion fails, so the handle
            # is not left open when ``tmpfile`` exits.
            hdf.close()
Example #58
0
def test_hdfstore():
    """Check result types for fixed and appendable nodes of an HDF store."""
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/appendable', format='table')
        df.to_hdf(fn, '/fixed')

        # Every handle opened below is recorded here so all of them are
        # closed, in opening order, even when an assertion fails.
        opened = []
        try:
            hdf = resource('hdfstore://%s' % fn)
            opened.append(hdf)
            s = symbol('s', discover(hdf))

            assert isinstance(compute(s.fixed, hdf),
                              (pd.DataFrame, pd.io.pytables.Fixed))
            assert isinstance(compute(s.appendable, hdf),
                              (pd.io.pytables.AppendableFrameTable, Chunks))

            s = symbol('s', discover(df))
            f = resource('hdfstore://%s::/fixed' % fn)
            opened.append(f.parent)
            a = resource('hdfstore://%s::/appendable' % fn)
            opened.append(a.parent)
            assert isinstance(pre_compute(s, a), Chunks)
        finally:
            for store in opened:
                store.close()
Example #59
0
def test_foreign_key_group_by(fkey, grouper):
    """Grouping through a foreign key compiles to a joined ``GROUP BY``."""
    t = symbol('fkey', discover(fkey))
    grouped = by(t.sym_id[grouper], avg_price=t.sym_id.price.mean())
    query = compute(grouped, fkey)
    expected = """SELECT
        pkey.sym,
        avg(pkey.price) AS avg_price
    FROM pkey, fkey
    WHERE fkey.sym_id = pkey.id
    GROUP BY pkey.sym
    """
    actual_sql = normalize(str(query))
    assert actual_sql == normalize(expected)
Example #60
0
def test_builtin_501_exception(iris_server, serial):
    """Posting ``map``/``apply`` expressions yields a 501 status."""
    t = symbol('t', discover(iris))
    expected_status = '501 Not Implemented'.lower()

    for name in ('map', 'apply'):
        method = getattr(t.species, name)
        query = {'expr': to_tree(method(copy, 'int'))}
        response = iris_server.post('/compute',
                                    data=serial.dumps(query),
                                    headers=mimetype(serial))

        # Status strings are compared case-insensitively.
        assert expected_status in response.status.lower()