Beispiel #1
0
def test_discovery_numeric_column():
    """String types discover to datashape.string and NUMERIC columns are discoverable."""
    assert discover(sa.String()) == datashape.string

    meta = sa.MetaData()
    table = sa.Table('name', meta,
                     sa.Column('name', sa.types.NUMERIC),)

    # discovery of a NUMERIC column must succeed (truthy datashape)
    assert discover(table)
def test_concat_arr():
    """concat of two series symbols computes to the concatenated Series."""
    left_data = Series(np.arange(15))
    right_data = Series(np.arange(15, 30))

    left = symbol('s', discover(left_data))
    right = symbol('t', discover(right_data))

    combined = compute(concat(left, right), {left: left_data, right: right_data})
    assert (combined == Series(np.arange(30))).all()
Beispiel #3
0
def test_dataset():
    """CachedDataset discovers like its namespace and caches computed exprs."""
    ns = {'t': df, 'x': 10}
    cache = dict()
    d = CachedDataset(ns, cache=cache)

    assert discover(d) == discover(ns)

    s = symbol('s', discover(d))
    # BUG FIX: these two comparisons were bare expressions and never
    # asserted anything -- the test passed regardless of their truth.
    assert compute(s.x * 2, d) == 20
    assert cache == {s.x * 2: 20}
Beispiel #4
0
def test_discovery():
    """A SQLAlchemy table discovers to the expected record datashape."""
    assert discover(sa.String()) == datashape.string

    meta = sa.MetaData()
    accounts = sa.Table('accounts', meta,
                        sa.Column('name', sa.String),
                        sa.Column('amount', sa.Integer),
                        sa.Column('timestamp', sa.DateTime, primary_key=True))

    expected = dshape(
        'var * {name: ?string, amount: ?int32, timestamp: datetime}'
    )
    assert discover(accounts) == expected
def test_concat_mat():
    """concat of two frame symbols stacks the frames vertically."""
    top = DataFrame(np.arange(15).reshape(5, 3), columns=list('abc'))
    bottom = DataFrame(np.arange(15, 30).reshape(5, 3), columns=list('abc'))

    s = symbol('s', discover(top))
    t = symbol('t', discover(bottom))

    result = compute(concat(s, t), {s: top, t: bottom})
    expected = pd.DataFrame(np.arange(30).reshape(10, 3), columns=list('abc'))
    tm.assert_frame_equal(result, expected)
Beispiel #6
0
def test_client():
    # Integration test: requires a blaze server listening on localhost:6363
    # whose datasets match the module-level `tdata` fixture.
    c = Client('localhost:6363')
    assert str(discover(c)) == str(discover(tdata))

    t = symbol('t', discover(c))
    expr = t.accounts.amount.sum()

    # 300 is the expected sum for the fixture's accounts -- depends on tdata
    assert compute(expr, c) == 300
    assert 'name' in t.accounts.fields
    assert isinstance(t.accounts.name, Field)
    assert compute(t.accounts.name, c) == ['Alice', 'Bob']
def test_join_promotion():
    """Joining an int key against a float key promotes the key to float64."""
    a_data = pd.DataFrame([[0.0, 1.5], [1.0, 2.5]], columns=list('ab'))
    b_data = pd.DataFrame([[0, 1], [1, 2]], columns=list('ac'))

    a = symbol('a', discover(a_data))
    b = symbol('b', discover(b_data))
    joined = join(a, b, 'a')

    assert joined.dshape == dshape('var * {a: float64, b: float64, c: int64}')
    tm.assert_frame_equal(
        compute(joined, {a: a_data, b: b_data}),
        pd.merge(a_data, b_data, on='a'),
    )
Beispiel #8
0
def test_groups():
    """HDFStore groups discover as nested records and support traversal."""
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')

        hdf = resource('hdfstore://%s' % fn)
        try:
            assert discover(hdf) == discover({'data': {'fixed': df}})

            s = symbol('s', discover(hdf))

            assert list(compute(s.data.fixed, hdf).a) == [1, 2, 3, 4]
        finally:
            # BUG FIX: close the store even when an assertion fails, so the
            # temp file can be removed (open HDF5 handles block deletion on
            # some platforms).
            hdf.close()
Beispiel #9
0
def test_csv_join():
    """Joining two CSV-backed resources on their shared column."""
    files = {"a.csv": "a,b,c\n0,1,2\n3,4,5", "b.csv": "c,d,e\n2,3,4\n5,6,7"}

    with filetexts(files):
        resource_a = resource("a.csv")
        resource_b = resource("b.csv")
        a = symbol("a", discover(resource_a))
        b = symbol("b", discover(resource_b))

        joined = compute(join(a, b, "c"), {a: resource_a, b: resource_b})
        # windows needs explicit int64 construction b/c default is int32
        expected = pd.DataFrame(
            np.array([[2, 0, 1, 3, 4], [5, 3, 4, 6, 7]], dtype="int64"),
            columns=list("cabde"),
        )
        tm.assert_frame_equal(odo(joined, pd.DataFrame), expected)
def test_join_suffixes():
    """Custom suffixes propagate to overlapping column names on join."""
    frame = pd.DataFrame(
        list(dict((k, n) for k in ascii_lowercase[:5]) for n in range(5)),
    )
    a = symbol('a', discover(frame))
    b = symbol('b', discover(frame))

    suffixes = '_x', '_y'
    tm.assert_frame_equal(
        compute(join(a, b, 'a', suffixes=suffixes), {a: frame, b: frame}),
        pd.merge(frame, frame, on='a', suffixes=suffixes),
    )
Beispiel #11
0
def test_concat():
    """concat of two CSV resources stacks their rows in order."""
    files = {"a.csv": "a,b\n1,2\n3,4", "b.csv": "a,b\n5,6\n7,8"}

    with filetexts(files):
        a_rsc = resource("a.csv")
        b_rsc = resource("b.csv")

        a = symbol("a", discover(a_rsc))
        b = symbol("b", discover(b_rsc))

        result = odo(compute(concat(a, b), {a: a_rsc, b: b_rsc}), pd.DataFrame)
        # windows needs explicit int64 construction b/c default is int32
        expected = pd.DataFrame(np.arange(1, 9, dtype="int64").reshape(4, 2),
                                columns=list("ab"))
        tm.assert_frame_equal(result, expected)
Beispiel #12
0
def test_multi_expression_compute():
    """POST /compute.json evaluates a join over two server datasets."""
    a = Symbol('accounts', discover(accounts))
    c = Symbol('cities', discover(cities))

    expr = join(a, c)

    resp = test.post('/compute.json',
                     data=json.dumps({'expr': to_tree(expr)}),
                     content_type='application/json')
    assert 'OK' in resp.status

    result = json.loads(resp.data)['data']
    expected = compute(expr, {a: accounts, c: cities})
    assert list(map(tuple, result)) == into(list, expected)
Beispiel #13
0
def test_str_predicates(what, expected):
    # Parametrized: `what` names a pandas str-predicate suffix (e.g. 'alpha'
    # for Series.str.isalpha) and `expected` is the boolean that predicate
    # should yield for every row of df.name -- presumably pytest.mark.parametrize
    # supplies the pairs; confirm against the surrounding file.
    predicate = 'is' + what
    expr = getattr(t.name.str, predicate)()
    expected = pd.Series([expected, expected, expected], name='name')
    result = compute(expr, df).reset_index(drop=True)
    assert_series_equal(expected, result)
    # the computed result's measure must match the expression's declared dshape
    assert discover(result).measure == expr.dshape.measure
Beispiel #14
0
def test_coerce_series_string_datetime(d, tp, ptp):
    # Parametrized: `d` is raw series data, `tp` the datashape type string to
    # coerce to, and `ptp` the equivalent pandas/numpy dtype used to build
    # the expected result via astype.
    s = pd.Series(d, name='a')
    e = symbol('t', discover(s)).coerce(to=tp)
    # the coerced expression's schema reflects the target type
    assert e.schema == dshape(tp)
    result = compute(e, s)
    expected = s.astype(ptp)
    assert_series_equal(result, expected)
Beispiel #15
0
def test_coerce_series():
    """coerce(to='int64') on a dask-backed series casts string digits."""
    pandas_series = pd.Series(list('1234'), name='a')
    dask_series = dd.from_pandas(pandas_series, npartitions=2)
    t = symbol('t', discover(pandas_series))
    eq(compute(t.coerce(to='int64'), dask_series),
       pd.Series([1, 2, 3, 4], name=pandas_series.name))
def test_time_field():
    """The .time accessor extracts the time-of-day component per row."""
    data = pd.Series(pd.date_range(start='20120101', end='20120102', freq='H'))
    s = symbol('s', discover(data))
    expected = data.dt.time
    expected.name = 's_time'
    assert_series_equal(compute(s.time, data), expected)
Beispiel #17
0
def test_add_data_to_empty_server(empty_server, serial):
    # Round-trip against a fresh server: POST a dataset to /add, confirm
    # /datashape reflects it, then compute an expression on the added data.
    # NOTE(review): the `test` bound by temp_server() is never used in the
    # body -- presumably only the context manager's setup/teardown matters;
    # confirm against the fixture's definition.
    # add data
    with temp_server() as test:
        iris_path = example('iris.csv')
        blob = serial.dumps({'iris': iris_path})
        response1 = empty_server.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        response2 = empty_server.get('/datashape')
        expected2 = str(discover({'iris': resource(iris_path)}))
        assert response2.data.decode('utf-8') == expected2

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()

        response3 = empty_server.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial)
        )

        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3
Beispiel #18
0
def test_odo_kwargs(test, serial):
    # DumbResource requires `return_df` to be forwarded through odo; verify
    # the server 500s without it and succeeds when odo_kwargs supplies it.
    expr = t.dumb
    bad_query = {'expr': to_tree(expr)}

    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(bad_query),
    )
    assert result.status_code == 500
    assert b'return_df must be passed' in result.data

    good_query = {
        'expr': to_tree(expr),
        'odo_kwargs': {
            'return_df': odo(DumbResource.df, list),
        },
    }
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(good_query)
    )
    assert result.status_code == 200
    data = serial.loads(result.data)
    dshape = discover(DumbResource.df)
    # response datashape and payload must both round-trip to the fixture
    assert_dshape_equal(
        datashape.dshape(data['datashape']),
        dshape,
    )
    assert_frame_equal(
        odo(data['data'], DataFrame, dshape=dshape),
        DumbResource.df,
    )
Beispiel #19
0
def test_add_data_to_server(temp_add_server, serial):
    # POST a dataset path to /add, then verify /datashape and a computation
    # routed through a blaze Client pointed at the temp server.
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    # NOTE(review): rebinding client.requests points bz.Client's transport
    # at the test server -- a module-level patch; confirm it is reset by the
    # fixture so other tests are unaffected.
    client.requests = temp_add_server
    t = data(bz.Client('localhost:6363'))
    expr = t.iris.petal_length.sum()

    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr':
                                                        to_tree(expr)}),
                                     headers=mimetype(serial))

    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path)})
    assert result3 == expected3
Beispiel #20
0
def test_add_expanded_payload_has_effect(temp_add_server, serial):
    # Ensure that the expanded payload format actually passes the arguments
    # through to the resource constructor
    iris_path = example('iris-latin1.tsv')
    # non-default delimiter + encoding: only honored if kwargs flow through
    csv_kwargs = {'delimiter': '\t', 'encoding': 'iso-8859-1'}
    blob = serial.dumps({'iris': {'source': iris_path,
                                  'kwargs': csv_kwargs}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path, **csv_kwargs)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path, **csv_kwargs)})
    expr = t.iris.petal_length.sum()

    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))

    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path, **csv_kwargs)})
    assert result3 == expected3
def test_compute_on_db(bank, points):
    """Summing a Mongo column matches a manual sum over find()."""
    assert bank.database == points.database
    db = bank.database

    d = symbol(db.name, discover(db))
    expected = sum(x['x'] for x in db.points.find())
    assert compute(d.points.x.sum(), db) == expected
Beispiel #22
0
def test_add_data_to_server(temp_add_server, serial):
    # Variant of the client-routed version above: computes through a local
    # data() wrapper instead of a bz.Client transport.
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path)})
    expr = t.iris.petal_length.sum()

    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))

    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path)})
    assert result3 == expected3
Beispiel #23
0
def test_time_field():
    """s.time on a datetime series yields the series' .dt.time values."""
    data = pd.Series(pd.date_range(start='20120101', end='20120102', freq='H'))
    sym = symbol('s', discover(data))
    result = compute(sym.time, data)
    want = data.dt.time
    want.name = 's_time'
    assert_series_equal(result, want)
Beispiel #24
0
def test_compute_on_db(bank, points):
    """d.points.x.sum() over the Mongo database equals a hand-built sum."""
    assert bank.database == points.database
    database = bank.database

    d = symbol(database.name, discover(database))
    manual_total = sum(doc['x'] for doc in database.points.find())
    assert compute(d.points.x.sum(), database) == manual_total
Beispiel #25
0
def test_pre_compute_with_projection_projects_on_data_frames():
    """pre_compute narrows CSV chunks to just the projected columns."""
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    expr = s[['sepal_length', 'sepal_width']].distinct()
    chunks = pre_compute(expr, csv, comfortable_memory=10)
    assert set(first(chunks).columns) == {'sepal_length', 'sepal_width'}
Beispiel #26
0
def test_create_from_datashape():
    """Tables created from a record datashape discover back to it."""
    engine = sa.create_engine('sqlite:///:memory:')
    ds = dshape('''{bank: var * {name: string, amount: int},
                    points: var * {x: int, y: int}}''')
    engine = create_from_datashape(engine, ds)
    assert discover(engine) == ds
Beispiel #27
0
def test_compute_kwargs(test, serial):
    # Like test_odo_kwargs but forwards `return_df` through compute_kwargs
    # instead: the server 500s without it and succeeds when supplied.
    expr = t.dumb.sort()
    bad_query = {'expr': to_tree(expr)}

    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(bad_query),
    )
    assert result.status_code == 500
    assert b'return_df must be passed' in result.data

    good_query = {
        'expr': to_tree(expr),
        'compute_kwargs': {
            'return_df': odo(DumbResource.df, list),
        },
    }
    result = test.post(
        '/compute',
        headers=mimetype(serial),
        data=serial.dumps(good_query)
    )
    assert result.status_code == 200
    data = serial.loads(result.data)
    dshape = discover(DumbResource.df)
    # both the reported datashape and the payload round-trip to the fixture
    assert_dshape_equal(
        datashape.dshape(data['datashape']),
        dshape,
    )
    assert_frame_equal(
        odo(data['data'], DataFrame, dshape=dshape),
        DumbResource.df,
    )
Beispiel #28
0
def test_pre_compute_calls_lean_projection():
    """Sorting by one column then selecting another keeps both columns."""
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    chunks = pre_compute(s.sort('sepal_length').species,
                         csv, comfortable_memory=10)
    assert set(first(chunks).columns) == {'sepal_length', 'species'}
Beispiel #29
0
def test_unused_datetime_columns():
    """Computing one column succeeds even when a datetime column goes unused."""
    # BUG FIX: a local `ds = dshape('2 * {val: string, when: datetime}')`
    # was assigned but never used; dropped the dead local.
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        csv = CSV(fn, has_header=True)

        s = symbol('s', discover(csv))
        assert into(list, compute(s.val, csv)) == ['a', 'b']
Beispiel #30
0
def test_datasets():
    # /datasets.json must list every registered dataset with its datashape
    # string -- fixtures (accounts, df, cities, pairs, times) come from the
    # surrounding module.
    response = test.get('/datasets.json')
    assert json.loads(response.data) == {'accounts': str(accounts.dshape),
                                         'accounts_df': str(discover(df)),
                                         'cities': str(cities.dshape),
                                         'pairs': str(pairs.dshape),
                                         'times': str(times.dshape)}
Beispiel #31
0
def test_coerce_series():
    """int64 coercion of a string series computed over dask partitions."""
    source = pd.Series(list('1234'), name='a')
    partitioned = dd.from_pandas(source, npartitions=2)
    sym = symbol('t', discover(source))
    eq(compute(sym.coerce(to='int64'), partitioned),
       pd.Series([1, 2, 3, 4], name=source.name))
Beispiel #32
0
def test_compute_kwargs(test, serial):
    # Same contract as the 500/200 variant above, but using the RC.* status
    # constants and serial.data_loads for the payload.
    expr = t.dumb.sort()
    bad_query = {'expr': to_tree(expr)}

    result = test.post('/compute',
                       headers=mimetype(serial),
                       data=serial.dumps(bad_query))
    assert result.status_code == RC.INTERNAL_SERVER_ERROR
    assert b'return_df must be passed' in result.data

    good_query = {
        'expr': to_tree(expr),
        'compute_kwargs': {
            'return_df': odo(DumbResource.df, list)
        }
    }
    result = test.post('/compute',
                       headers=mimetype(serial),
                       data=serial.dumps(good_query))
    assert result.status_code == RC.OK
    tdata = serial.loads(result.data)
    dshape = discover(DumbResource.df)
    assert_dshape_equal(datashape.dshape(tdata['datashape']), dshape)
    assert_frame_equal(
        odo(serial.data_loads(tdata['data']), DataFrame, dshape=dshape),
        DumbResource.df)
Beispiel #33
0
def test_add_expanded_payload_has_effect(temp_add_server, serial):
    # Ensure that the expanded payload format actually passes the arguments
    # through to the resource constructor
    iris_path = example('iris-latin1.tsv')
    # non-default delimiter + encoding: only honored if kwargs flow through
    csv_kwargs = {'delimiter': '\t', 'encoding': 'iso-8859-1'}
    blob = serial.dumps({'iris': {'source': iris_path, 'kwargs': csv_kwargs}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path, **csv_kwargs)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    # NOTE(review): module-level patch of client.requests -- confirm the
    # fixture restores it so other tests are unaffected.
    client.requests = temp_add_server
    t = data(bz.Client('localhost:6363'))
    expr = t.iris.petal_length.sum()

    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr':
                                                        to_tree(expr)}),
                                     headers=mimetype(serial))

    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path, **csv_kwargs)})
    assert result3 == expected3
Beispiel #34
0
def test_add_data_to_empty_server(empty_server, serial):
    # Round-trip against a fresh server: POST a dataset to /add, confirm
    # /datashape reflects it, then compute an expression on the added data.
    # NOTE(review): the `test` bound by temp_server() is never used in the
    # body -- presumably only the context manager's setup/teardown matters;
    # confirm against the fixture's definition.
    # add data
    with temp_server() as test:
        iris_path = example('iris.csv')
        blob = serial.dumps({'iris': iris_path})
        response1 = empty_server.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        response2 = empty_server.get('/datashape')
        expected2 = str(discover({'iris': resource(iris_path)}))
        assert response2.data.decode('utf-8') == expected2

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()

        response3 = empty_server.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial)
        )

        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3
Beispiel #35
0
def test_unused_datetime_columns():
    """Computing one column succeeds even when a datetime column goes unused."""
    # BUG FIX: a local `ds = dshape('2 * {val: string, when: datetime}')`
    # was assigned but never used; dropped the dead local.
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        csv = CSV(fn, has_header=True)

        s = symbol('s', discover(csv))
        assert into(list, compute(s.val, csv)) == ['a', 'b']
Beispiel #36
0
def test_timedelta_arith():
    """timedelta add/subtract on a symbol matches pandas series arithmetic."""
    series = Series(pd.date_range('2014-01-01', '2014-02-01'))
    sym = symbol('s', discover(series))
    delta = timedelta(days=1)

    assert (compute(sym + delta, series) == series + delta).all()
    assert (compute(sym - delta, series) == series - delta).all()
    # nested expression: subtracting a shifted copy of itself
    shifted = sym - delta
    assert (compute(sym - shifted, series) == series - (series - delta)).all()
Beispiel #37
0
    def test_strings(self):
        # Extending an HDF5 data descriptor with (int, str) tuples and then
        # reading the raw h5py dataset back must discover the materialized
        # fixed-length shape ('2 * ' prefix for the two appended rows).
        schema = '{x: int32, y: string}'
        dd = HDF5(self.filename, 'data', schema=schema)
        dd.extend([(1, 'Hello'), (2, 'World!')])

        with h5py.File(dd.path) as f:
            d = f.get(dd.datapath)
            self.assertEqual(discover(d), dshape('2 * ' + schema))
Beispiel #38
0
def test_map_called_on_data_star():
    """The map= passed to compute over a glob resource is actually invoked."""
    r = data(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    default_count = compute(s.count(), r)
    mapped_count = compute(s.count(), r, map=mymap)
    # same answer either way, and mymap must have flipped the flag
    assert default_count == mapped_count
    assert flag[0]
Beispiel #39
0
def test_map_called_on_resource_star():
    """The map= passed to compute over a glob resource() is actually invoked."""
    r = resource(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    plain = compute(s.count(), r)
    with_map = compute(s.count(), r, map=mymap)
    # same answer either way, and mymap must have flipped the flag
    assert plain == with_map
    assert flag[0]
Beispiel #40
0
def test_join_diff_contexts(db, ctx, cities):
    # Join where one side lives in the Spark SQL context and the other is
    # converted into a SchemaRDD using the context table's datashape; the
    # result must match the same join computed on plain pandas frames.
    expr = join(db.t, db.s, 'name')
    people = ctx.table('t')
    cities = into(SchemaRDD, cities, dshape=discover(ctx.table('s')))
    scope = {db: {'t': people, 's': cities}}
    result = compute(expr, scope)
    expected = compute(expr, {db: {'t': df, 's': cities_df}})
    # compare as sets: row order is not guaranteed across backends
    assert into(set, result) == into(set, expected)
Beispiel #41
0
def test_join(db, ctx):
    """A natural join computed in the Spark context matches pandas."""
    expr = join(db.t, db.s)
    spark_result = compute(expr, ctx)
    pandas_result = compute(expr, {db: {'t': df, 's': cities_df}})

    assert isinstance(spark_result, SchemaRDD)
    # compare as sets: row order is backend-dependent
    assert into(set, spark_result) == into(set, pandas_result)
    assert discover(spark_result) == expr.dshape
Beispiel #42
0
 def test_discovery(self):
     # Writing three 2-int rows through the HDF5 descriptor and reading the
     # raw h5py dataset back must discover the materialized '3 * 2 * int32'.
     # NOTE(review): this method's 1-space class indent differs from the
     # 4-space indent used by the other HDF5 test method -- likely a paste
     # artifact; confirm against the enclosing class before reformatting.
     dd = HDF5(self.filename, 'data',
               schema='2 * int32')
     dd.extend([(1, 2), (2, 3), (4, 5)])
     with h5py.File(dd.path) as f:
         d = f.get(dd.datapath)
         self.assertEqual(discover(d),
                          dshape('3 * 2 * int32'))
Beispiel #43
0
def test_csv_with_trailing_commas():
    """A trailing separator in the header yields an empty or synthetic name."""
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            # note the trailing space in the header
            f.write('a,b,c, \n1, 2, 3, ')
        csv = CSV(fn)
        assert expr_repr(data(fn))
        assert discover(csv).measure.names == ['a', 'b', 'c', '']

    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            f.write('a,b,c,\n1, 2, 3, ')  # NO trailing space in the header
        csv = CSV(fn)
        assert expr_repr(data(fn))
        assert discover(csv).measure.names == ['a', 'b', 'c', 'Unnamed: 3']
def test_dplyr_transform():
    """transform() appends a derived column like dplyr's mutate."""
    frame = DataFrame({'timestamp': pd.date_range('now', periods=5)})
    t = symbol('t', discover(frame))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    actual = compute(expr, frame)
    date_col = Series(frame.timestamp.map(lambda x: x.date()), name='date')
    wanted = pd.concat([frame, date_col.to_frame()], axis=1)
    tm.assert_frame_equal(actual, wanted)
Beispiel #45
0
def test_csv_with_trailing_commas():
    """Trailing header comma produces an empty/'Unnamed: 3' column name."""
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            # note the trailing space in the header
            f.write('a,b,c, \n1, 2, 3, ')
        csv = CSV(fn)
        assert repr(Data(fn))
        assert discover(csv).measure.names == ['a', 'b', 'c', '']

    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            f.write('a,b,c,\n1, 2, 3, ')  # NO trailing space in the header
        csv = CSV(fn)
        assert repr(Data(fn))
        assert discover(csv).measure.names == ['a', 'b', 'c', 'Unnamed: 3']
Beispiel #46
0
def test_dplyr_transform():
    """transform() result matches a manual concat (compared via str())."""
    frame = DataFrame({'timestamp': pd.date_range('now', periods=5)})
    t = symbol('t', discover(frame))
    expr = transform(t, date=t.timestamp.map(lambda x: x.date(),
                                             schema='datetime'))
    computed = compute(expr, frame)
    manual = pd.concat([frame,
                        Series(frame.timestamp.map(lambda x: x.date()),
                               name='date').to_frame()], axis=1)
    assert str(computed) == str(manual)
Beispiel #47
0
def test_spider(data):
    # spider() walks the fixture directory and discovers a nested datashape
    # keyed by the directory's basename (hence the %r substitution).
    result = spider(str(data))
    ss = """{
    %r: {
        'foo.csv': var * {a: int64, b: int64},
        'foo.hdf5': {fooh5: 10 * 2 * float64},
        sub: {'foo.json': 2 * {a: int64, b: float64, c: ?datetime, d: ?string}}
    }
}""" % os.path.basename(str(data))
    assert dshape(discover(result)) == dshape(ss)
Beispiel #48
0
def test_spider_cycle(data_with_cycle):
    # With followlinks=True a symlink cycle makes the walk diverge from the
    # acyclic layout, so the discovered shape must NOT equal the plain one.
    result = spider(str(data_with_cycle), followlinks=True)
    ss = """{
    %r: {
        'foo.csv': var * {a: int64, b: int64},
        'foo.hdf5': {fooh5: 10 * 2 * float64},
        sub: {'foo.json': 2 * {a: int64, b: float64, c: ?datetime, d: ?string}}
    }
}""" % os.path.basename(str(data_with_cycle))
    assert dshape(discover(result)) != dshape(ss)
Beispiel #49
0
def test_merge_with_common_subexpression():
    """merge() evaluates a shared subexpression (t.a % 3) consistently."""
    frame = DataFrame(np.random.rand(5, 2), columns=list('ab'))
    t = symbol('t', discover(frame))
    expr = merge((t.a - t.a % 3).label('a'), (t.a % 3).label('b'))
    expected = pd.concat(
        [pd.Series(frame.a - frame.a % 3, name='a'),
         pd.Series(frame.a % 3, name='b')],
        axis=1)
    tm.assert_frame_equal(compute(expr, {t: frame}), expected)
Beispiel #50
0
def test_concat():
    """concat over two data()-backed CSV files yields all rows in order."""
    files = {'a.csv': 'a,b\n1,2\n3,4',
             'b.csv': 'a,b\n5,6\n7,8'}

    with filetexts(files):
        a_rsc = data('a.csv')
        b_rsc = data('b.csv')

        a = symbol('a', discover(a_rsc))
        b = symbol('b', discover(b_rsc))

        result = odo(compute(concat(a, b), {a: a_rsc, b: b_rsc}),
                     pd.DataFrame)
        # windows needs explicit int64 construction b/c default is int32
        expected = pd.DataFrame(np.arange(1, 9, dtype='int64').reshape(4, 2),
                                columns=list('ab'))
        tm.assert_frame_equal(result, expected)
Beispiel #51
0
def test_csv_join():
    """Joining two data()-backed CSVs on their shared column."""
    files = {'a.csv': 'a,b,c\n0,1,2\n3,4,5',
             'b.csv': 'c,d,e\n2,3,4\n5,6,7'}

    with filetexts(files):
        data_a = data('a.csv')
        data_b = data('b.csv')
        a = symbol('a', discover(data_a))
        b = symbol('b', discover(data_b))

        joined = compute(join(a, b, 'c'), {a: data_a, b: data_b})
        # windows needs explicit int64 construction b/c default is int32
        expected = pd.DataFrame(np.array([[2, 0, 1, 3, 4],
                                          [5, 3, 4, 6, 7]], dtype='int64'),
                                columns=list('cabde'))
        tm.assert_frame_equal(odo(joined, pd.DataFrame), expected)
Beispiel #52
0
def test_transform_with_common_subexpression():
    """transform() appends derived columns sharing the t.a % 3 subterm."""
    frame = DataFrame(np.random.rand(5, 2), columns=list('ab'))
    t = symbol('t', discover(frame))
    expr = transform(t, c=t.a - t.a % 3, d=t.a % 3)

    originals = [frame[col] for col in frame.columns]
    derived = [pd.Series(frame.a - frame.a % 3, name='c'),
               pd.Series(frame.a % 3, name='d')]
    tm.assert_frame_equal(compute(expr, frame),
                          pd.concat(originals + derived, axis=1))
Beispiel #53
0
def test_datetime_access():
    """Datetime component accessors extract the matching field per row."""
    frame = DataFrame({'name': ['Alice', 'Bob', 'Joe'],
                       'when': [datetime(2010, 1, 1, 1, 1, 1)] * 3,
                       'amount': [100, 200, 300],
                       'id': [1, 2, 3]})

    t = symbol('t', discover(frame))

    # every component of 2010-01-01 01:01:01 checked here equals 1
    ones = Series([1, 1, 1])
    for attr in ['day', 'month', 'minute', 'second']:
        assert (compute(getattr(t.when, attr), frame) == ones).all()
Beispiel #54
0
def test_hdfstore():
    """Fixed vs appendable HDFStore nodes compute to the right backends."""
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/appendable', format='table')
        df.to_hdf(fn, '/fixed')

        hdf = resource('hdfstore://%s' % fn)
        try:
            s = symbol('s', discover(hdf))

            assert isinstance(compute(s.fixed, hdf),
                              (pd.DataFrame, pd.io.pytables.Fixed))
            assert isinstance(compute(s.appendable, hdf),
                              (pd.io.pytables.AppendableFrameTable, Chunks))

            s = symbol('s', discover(df))
            f = resource('hdfstore://%s::/fixed' % fn)
            a = resource('hdfstore://%s::/appendable' % fn)
            try:
                assert isinstance(pre_compute(s, a), Chunks)
            finally:
                f.parent.close()
                a.parent.close()
        finally:
            # BUG FIX: close the stores even when an assertion fails, so the
            # temp file can be deleted when tmpfile() exits.
            hdf.close()
Beispiel #55
0
def test_builtin_501_exception(iris_server, serial):
    # map/apply with an arbitrary callable (here: copy) are deliberately not
    # executed server-side; the server must answer 501 rather than run them.
    t = symbol('t', discover(iris))

    for name in ('map', 'apply'):
        func = getattr(t.species, name)
        expr = func(copy, 'int')
        query = {'expr': to_tree(expr)}
        response = iris_server.post('/compute',
                                    data=serial.dumps(query),
                                    headers=mimetype(serial))

        assert '501 Not Implemented'.lower() in response.status.lower()
Beispiel #56
0
def test_streaming():
    """Cached results of iterator-producing expressions get materialized."""
    seq = [{'name': 'Alice', 'x': 1}, {'name': 'Bob', 'x': 1}]
    namespace = {'t': seq, 'x': 10}
    store = dict()
    dataset = CachedDataset(namespace, cache=store)

    s = symbol('s', discover(dataset))
    expr = s.t.x * 2
    result = compute(expr, dataset)

    # the cache must hold a concrete collection, not a one-shot iterator
    assert not isinstance(dataset.cache[expr], Iterator)
    assert into(list, dataset.cache[expr]) == [2, 2]