def test_can_trivially_create_sqlite_table():
    pytest.importorskip('sqlalchemy')
    Data('sqlite:///' + example('iris.db') + '::iris')

    # in context
    with Data('sqlite:///' + example('iris.db') + '::iris') as d:
        assert d is not None

def test_client_add_dataset():
    client.requests = test_add  # OMG more monkey patching
    ec = Client('localhost:6363')
    ec.add('iris', example('iris.csv'))
    assert 'iris' in ec.dshape.measure.dict
    iris_data = bz_data(example('iris.csv'))
    assert ec.dshape.measure.dict['iris'] == iris_data.dshape

def test_can_trivially_create_sqlite_table(): pytest.importorskip("sqlalchemy") Data("sqlite:///" + example("iris.db") + "::iris") # in context with Data("sqlite:///" + example("iris.db") + "::iris") as d: assert d is not None
def test_client_add_dataset_with_args():
    client.requests = test_add  # OMG more monkey patching
    ec = Client('localhost:6363')
    ec.add('teams', 'sqlite:///' + example('teams.db'), 'teams',
           primary_key='teamID')
    assert 'teams' in ec.dshape.measure.dict
    teams_data = bz_data('sqlite:///' + example('teams.db') + '::teams')
    assert ec.dshape.measure.dict['teams'] == teams_data.dshape

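# NOTE (assumption): ``test_add``, used by the two client tests above, is not
# defined in this excerpt. Those tests monkeypatch the module-level
# ``requests`` object in ``blaze.server.client`` so the Client talks to an
# in-process Flask test client instead of a live server. A minimal sketch of
# how such a client could be built, reusing the ``Server`` and
# ``all_formats`` imported near the bottom of this excerpt; the helper name
# and the ``allow_add`` flag are assumptions suggested by
# test_add_default_not_allowed below:

def _make_test_add_client():
    s = Server({}, all_formats, allow_add=True)
    s.app.testing = True
    return s.app.test_client()  # e.g. test_add = _make_test_add_client()
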
def test_add_data_to_empty_server(empty_server, serial):
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = empty_server.post(
        '/add',
        headers=mimetype(serial),
        data=blob,
    )
    assert 'OK' in response1.status
    assert response1.status_code == 200

    # check for expected server datashape
    response2 = empty_server.get('/datashape')
    expected2 = str(discover({'iris': resource(iris_path)}))
    assert response2.data.decode('utf-8') == expected2

    # compute on added data
    t = Data({'iris': resource(iris_path)})
    expr = t.iris.petal_length.sum()
    response3 = empty_server.post(
        '/compute',
        data=serial.dumps({'expr': to_tree(expr)}),
        headers=mimetype(serial),
    )
    result3 = serial.loads(response3.data)['data']
    expected3 = compute(expr, {'iris': resource(iris_path)})
    assert result3 == expected3

def test_add_data_to_server(temp_add_server, serial):
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path)})
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path)})
    assert result3 == expected3

def test_pre_compute_calls_lean_projection():
    csv = CSV(example('iris.csv'))
    s = symbol('s', csv.dshape)
    result = pre_compute(s.sort('sepal_length').species, csv,
                         comfortable_memory=10)
    assert set(result.get_chunk().columns) == \
        set(['sepal_length', 'species'])

def test_add_expanded_payload_has_effect(temp_add_server, serial):
    # Ensure that the expanded payload format actually passes the arguments
    # through to the resource constructor.
    iris_path = example('iris-latin1.tsv')
    csv_kwargs = {'delimiter': '\t', 'encoding': 'iso-8859-1'}
    blob = serial.dumps({'iris': {'source': iris_path,
                                  'kwargs': csv_kwargs}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path, **csv_kwargs)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    t = data({'iris': data(iris_path, **csv_kwargs)})
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path, **csv_kwargs)})
    assert result3 == expected3

def test_add_data_to_server(temp_add_server, serial):
    # add data
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    client.requests = temp_add_server
    t = data(bz.Client('localhost:6363'))
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path)})
    assert result3 == expected3

def test_into_ColumnDataSource_pytables():
    pytest.importorskip('bokeh')
    from bokeh.objects import ColumnDataSource

    pyt = PyTables(example('accounts.h5'), '/accounts')
    cds = into(ColumnDataSource, pyt)
    assert all(name in cds.column_names
               for name in ('balance', 'id', 'name'))

def test_add_expanded_payload_has_effect(temp_add_server, serial):
    # Ensure that the expanded payload format actually passes the arguments
    # through to the resource constructor.
    iris_path = example('iris-latin1.tsv')
    csv_kwargs = {'delimiter': '\t', 'encoding': 'iso-8859-1'}
    blob = serial.dumps({'iris': {'source': iris_path,
                                  'kwargs': csv_kwargs}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

    # check for expected server datashape
    response2 = temp_add_server.get('/datashape')
    expected2 = discover({'iris': data(iris_path, **csv_kwargs)})
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(response_dshape.measure.dict['iris'],
                        expected2.measure.dict['iris'])

    # compute on added data
    client.requests = temp_add_server
    t = data(bz.Client('localhost:6363'))
    expr = t.iris.petal_length.sum()
    response3 = temp_add_server.post('/compute',
                                     data=serial.dumps({'expr': to_tree(expr)}),
                                     headers=mimetype(serial))
    result3 = serial.data_loads(serial.loads(response3.data)['data'])
    expected3 = compute(expr, {'iris': data(iris_path, **csv_kwargs)})
    assert result3 == expected3

def test_pre_compute_calls_lean_projection():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    result = pre_compute(s.sort('sepal_length').species, csv,
                         comfortable_memory=10)
    assert set(first(result).columns) == \
        set(['sepal_length', 'species'])

def test_pre_compute_with_projection_projects_on_data_frames():
    csv = CSV(example('iris.csv'))
    s = symbol('s', csv.dshape)
    result = pre_compute(s[['sepal_length', 'sepal_width']].distinct(), csv,
                         comfortable_memory=10)
    assert set(result.get_chunk().columns) == \
        set(['sepal_length', 'sepal_width'])

def test_cant_add_data_to_server(iris_server, serial):
    # try adding more data to the server
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = iris_server.post('/add',
                                 headers=mimetype(serial),
                                 data=blob)
    assert response1.status_code == RC.UNPROCESSABLE_ENTITY

def test_add_default_not_allowed(temp_server, serial):
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': iris_path})
    response1 = temp_server.post('/add',
                                 headers=mimetype(serial),
                                 data=blob)
    assert 'NOT FOUND' in response1.status
    assert response1.status_code == RC.NOT_FOUND

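# NOTE (assumption): ``RC`` is compared against throughout these server tests
# but never defined in this excerpt. It is presumably the HTTP status-code
# registry from the ``requests`` package:
#
#     from requests import codes as RC
#
# so that RC.OK == 200, RC.CREATED == 201, RC.NOT_FOUND == 404,
# RC.CONFLICT == 409, and RC.UNPROCESSABLE_ENTITY == 422.
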
def test_repr_hmda():
    csv = CSV(example('hmda-small.csv'))
    t = TableSymbol('hmda', csv.schema)
    assert compute(t.head(), csv)
    columns = ['action_taken_name', 'agency_abbr',
               'applicant_ethnicity_name']
    assert compute(t[columns].head(), csv)

def test_pre_compute_with_projection_projects_on_data_frames():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    result = pre_compute(s[['sepal_length', 'sepal_width']].distinct(), csv,
                         comfortable_memory=10)
    assert set(first(result).columns) == \
        set(['sepal_length', 'sepal_width'])

def test_map_called_on_data_star():
    r = data(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]

def test_map_called_on_resource_star():
    r = resource(example('accounts_*.csv'))
    s = symbol('s', discover(r))
    flag[0] = False
    a = compute(s.count(), r)
    b = compute(s.count(), r, map=mymap)
    assert a == b
    assert flag[0]

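# ``flag`` and ``mymap``, used by the two map tests above, are elided from
# this excerpt. A minimal sketch of what they must look like for the
# assertions to pass: a mutable cell that an instrumented ``map`` flips,
# proving that ``compute(..., map=...)`` really called the supplied function
# (illustrative, not necessarily the original definitions):

flag = [False]


def mymap(func, *args):
    flag[0] = True  # record that the custom map was actually used
    return map(func, *args)
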
def test_add_expanded_payload(temp_add_server, serial):
    # Ensure that the expanded payload format is accepted by the server.
    iris_path = example('iris.csv')
    blob = serial.dumps({'iris': {'source': iris_path,
                                  'kwargs': {'delimiter': ','}}})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert 'CREATED' in response1.status
    assert response1.status_code == RC.CREATED

def data():
    try:
        t = odo(example('nyc.csv'),
                'mysql+pymysql://%s@localhost/test::nyc' % getuser())
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield t.bind
        finally:
            drop(t)

def nyc():
    try:
        t = odo(example('nyc.csv'),
                'postgresql://postgres@localhost/test::nyc')
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield t
        finally:
            drop(t)

def test_into_ctable_pytables():
    from bcolz import ctable

    tble = PyTables(example('accounts.h5'), datapath='/accounts')
    ct = into(ctable, tble)
    ctn = len(ct)
    tbn = len(tble)
    ctf, ctl = ct[0], ct[-1]
    tbf, tbl = tble[0], tble[-1]
    tble._v_file.close()
    assert ctn == tbn
    assert ctf == tbf
    assert ctl == tbl

def nyc_csv(pg_ip):
    try:
        t = odo(
            example('nyc.csv'),
            'postgresql://postgres@{}/test::nyc'.format(pg_ip),
        )
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield t
        finally:
            drop(t)

def nyc(pg_ip):
    # odoing csv -> pandas -> postgres is more robust, as it doesn't require
    # the postgres server to be on the same filesystem as the csv file.
    nyc_pd = odo(example('nyc.csv'), pd.DataFrame)
    try:
        t = odo(nyc_pd, 'postgresql://postgres@{}/test::nyc'.format(pg_ip))
    except sa.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        try:
            yield t
        finally:
            drop(t)

def test_pre_compute_on_multiple_datasets_is_selective():
    from odo import CSV
    from blaze import Data
    from blaze.cached import CachedDataset

    df = pd.DataFrame([[1, 'Alice', 100],
                       [2, 'Bob', -200],
                       [3, 'Charlie', 300],
                       [4, 'Denis', 400],
                       [5, 'Edith', -500]],
                      columns=['id', 'name', 'amount'])
    iris = CSV(example('iris.csv'))
    dset = CachedDataset({'df': df, 'iris': iris})
    d = Data(dset)
    assert str(compute(d.df.amount)) == str(df.amount)

def test_pandas_dynd(data, schema):
    arr = nd.array(data, dtype=schema)
    result = into(DataFrame, arr)
    expected = DataFrame(data, columns=['name', 'amount'])
    assert str(result) == str(expected)

    nda = nd.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    csv = CSV(example('accounts.csv'))
    df_csv = into(DataFrame, csv)
    df_nd = into(df_csv, nda)
    df_no_names = into(DataFrame, nda)
    assert list(df_nd.columns) == list(df_csv.columns)
    assert list(df_no_names.columns) == [0, 1, 2]

def test_pandas_dynd():
    arr = nd.array(data, dtype=schema)
    result = into(DataFrame, arr)
    expected = DataFrame(data, columns=['name', 'amount'])
    assert str(result) == str(expected)

    nda = nd.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    csv = CSV(example('accounts.csv'))
    df_csv = into(DataFrame, csv)
    df_nd = into(df_csv, nda)
    df_no_names = into(DataFrame, nda)
    assert list(df_nd.columns) == list(df_csv.columns)
    assert list(df_no_names.columns) == [0, 1, 2]

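# ``test_pandas_dynd`` appears in two variants: one taking ``data`` and
# ``schema`` as pytest fixtures, one reading module globals of the same
# names. Neither definition survives in this excerpt. Plausible stand-ins
# consistent with DataFrame(data, columns=['name', 'amount']) are sketched
# below (hypothetical values, wrapped in a helper so they do not shadow the
# other ``data`` definitions in this excerpt):

def _example_pandas_dynd_inputs():
    data = [('Alice', 100), ('Bob', 200)]       # rows of (name, amount)
    schema = '{name: string, amount: int32}'    # datashape record schema
    return data, schema
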
def nyc():
    with open(example('nyc.csv'), 'rb') as f:
        raw = f.read()

    with tmpfile('.csv') as name:
        with open(name, 'wb') as g:
            g.write(raw)

        try:
            t = odo(name, 'postgresql://postgres@localhost/test::nyc')
        except sa.exc.OperationalError as e:
            pytest.skip(str(e))
        else:
            try:
                yield t
            finally:
                drop(t)

def test_add_errors(temp_add_server, serial):
    pre_datashape = datashape.dshape(
        temp_add_server.get('/datashape').data.decode('utf-8'))
    bunk_path = example('bunk.csv')
    blob = serial.dumps({'bunk': bunk_path})
    response1 = temp_add_server.post('/add',
                                     headers=mimetype(serial),
                                     data=blob)
    assert response1.status_code == RC.UNPROCESSABLE_ENTITY

    # Test that the datashape of the server is accessible and unchanged after
    # trying to add a non-existent dataset.
    response2 = temp_add_server.get('/datashape')
    assert response2.status_code == RC.OK
    response_dshape = datashape.dshape(response2.data.decode('utf-8'))
    assert_dshape_equal(pre_datashape, response_dshape)

def test_add_data_twice_error(temp_server, serial):
    # add iris
    iris_path = example('iris.csv')
    payload = serial.dumps({'iris': iris_path})
    temp_server.post('/add', headers=mimetype(serial), data=payload)

    # Try to add to the existing 'iris'.
    resp = temp_server.post('/add', headers=mimetype(serial), data=payload)
    assert resp.status_code == RC.CONFLICT

    # Verify the server still serves the original 'iris'.
    ds = datashape.dshape(temp_server.get('/datashape').data.decode('utf-8'))
    t = symbol('t', ds)
    query = {'expr': to_tree(t.iris)}
    resp = temp_server.post('/compute',
                            data=serial.dumps(query),
                            headers=mimetype(serial))
    assert resp.status_code == RC.OK

def test_add_data_to_server(serial):
    with temp_server(data) as test:
        # add data
        initial_datashape = test.get('/datashape').data.decode('utf-8')
        iris_path = example('iris.csv')
        blob = serial.dumps({'iris': iris_path})
        response1 = test.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        new_datashape = test.get('/datashape').data.decode('utf-8')
        data2 = data.copy()
        data2.update({'iris': resource(iris_path)})
        expected2 = str(discover(data2))
        assert new_datashape == expected2
        assert new_datashape != initial_datashape

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()
        response3 = test.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial),
        )
        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3

def test_add_data_twice_error(temp_add_server, serial):
    # add iris
    iris_path = example('iris.csv')
    payload = serial.dumps({'iris': iris_path})
    temp_add_server.post('/add', headers=mimetype(serial), data=payload)

    # Try to add to the existing 'iris'.
    resp = temp_add_server.post('/add',
                                headers=mimetype(serial),
                                data=payload)
    assert resp.status_code == RC.CONFLICT

    # Verify the server still serves the original 'iris'.
    response_ds = temp_add_server.get('/datashape').data.decode('utf-8')
    ds = datashape.dshape(response_ds)
    t = symbol('t', ds)
    query = {'expr': to_tree(t.iris)}
    resp = temp_add_server.post('/compute',
                                data=serial.dumps(query),
                                headers=mimetype(serial))
    assert resp.status_code == RC.OK

def test_add_data_to_server(serial):
    with temp_server(data) as test:
        # add data
        initial_datashape = datashape.dshape(
            test.get('/datashape').data.decode('utf-8'))
        iris_path = example('iris.csv')
        blob = serial.dumps({'iris': iris_path})
        response1 = test.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        new_datashape = datashape.dshape(
            test.get('/datashape').data.decode('utf-8'))
        data2 = data.copy()
        data2.update({'iris': resource(iris_path)})
        expected2 = datashape.dshape(discover(data2))
        assert_dshape_equal(new_datashape, expected2)
        assert new_datashape.measure.fields != initial_datashape.measure.fields

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()
        response3 = test.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial),
        )
        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3

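# ``temp_server`` is used above both as a context manager and (with
# ``temp_add_server``) as a fixture, but its definition is elided from this
# excerpt. A minimal sketch consistent with how it is called, built on the
# ``Server(data, all_formats)`` constructor imported later in this excerpt;
# the ``allow_add`` flag is an assumption suggested by
# test_add_default_not_allowed:

from contextlib import contextmanager


@contextmanager
def temp_server(data=None, allow_add=False):
    # Serve a copy of ``data`` so tests that add datasets cannot leak
    # state into each other.
    s = Server(dict(data or {}), all_formats, allow_add=allow_add)
    s.app.testing = True
    with s.app.test_client() as c:
        yield c
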
def test_compute_chunks_on_single_csv():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    expr = s.sepal_length.max()
    assert compute(expr, {s: csv}, comfortable_memory=10, chunksize=50) == 7.9

def test_pre_compute_with_head_on_large_csv_yields_iterator():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    assert isinstance(pre_compute(s.species.head(), csv,
                                  comfortable_memory=10),
                      Iterator)

def test_pre_compute_on_large_csv_gives_chunked_reader():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    assert isinstance(pre_compute(s.species, csv, comfortable_memory=10),
                      (chunks(pd.DataFrame), pd.io.parsers.TextFileReader))

def test_pre_compute_on_small_csv_gives_dataframe():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    assert isinstance(pre_compute(s.species, csv), (Series, DataFrame))

def test_Data_on_json_is_concrete():
    d = Data(example('accounts-streaming.json'))
    assert compute(d.amount.sum()) == 100 - 200 + 300 + 400 - 500
    # Computing again must give the same answer: the streaming JSON source
    # should have been made concrete rather than consumed.
    assert compute(d.amount.sum()) == 100 - 200 + 300 + 400 - 500

def test_data_passes_kwargs_to_resource():
    assert Data(example('iris.csv'), encoding='ascii').data.encoding == 'ascii'

from blaze.server.client import mimetype
from blaze.server.server import Server, to_tree, from_tree
from blaze.server.serialization import all_formats


accounts = DataFrame([['Alice', 100], ['Bob', 200]],
                     columns=['name', 'amount'])

cities = DataFrame([['Alice', 'NYC'], ['Bob', 'LA']],
                   columns=['name', 'city'])

events = DataFrame([[1, datetime(2000, 1, 1, 12, 0, 0)],
                    [2, datetime(2000, 1, 2, 12, 0, 0)]],
                   columns=['value', 'when'])

db = resource('sqlite:///' + example('iris.db'))

data = {'accounts': accounts,
        'cities': cities,
        'events': events,
        'db': db}


@pytest.fixture(scope='module')
def server():
    s = Server(data, all_formats)
    s.app.testing = True
    return s


@pytest.yield_fixture
def iris_server():
    iris = CSV(example('iris.csv'))
    s = Server(iris, all_formats)
    s.app.testing = True
    with s.app.test_client() as c:
        yield c

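# ``mimetype`` (imported above from blaze.server.client) builds the
# Content-Type header for a serialization format. A sketch of the header the
# server tests rely on, assuming each format object exposes a ``name``
# attribute ('json', 'pickle', ...); this mirrors the usage in this file
# rather than quoting blaze's source:

def _mimetype_sketch(serial):
    # e.g. {'Content-Type': 'application/vnd.blaze+json'}
    return {'Content-Type': 'application/vnd.blaze+' + serial.name}
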
def test_pre_compute_on_small_csv_gives_dataframe():
    csv = CSV(example('iris.csv'))
    s = symbol('s', csv.dshape)
    assert isinstance(pre_compute(s.species, csv), DataFrame)

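# The pre_compute/compute tests above pin down the out-of-core CSV path:
# below ``comfortable_memory`` the CSV loads eagerly as a DataFrame, above it
# pre_compute hands back a chunked reader or iterator and compute folds the
# expression across chunks. A usage sketch assembled only from APIs already
# exercised in these tests:

def _example_chunked_max():
    csv = CSV(example('iris.csv'))
    s = symbol('s', discover(csv))
    # A tiny memory budget and chunksize force the chunked code path.
    return compute(s.sepal_length.max(), {s: csv},
                   comfortable_memory=10, chunksize=50)  # == 7.9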