def test_column(sql):
    t = Data(sql)
    r = compute(t['x'])
    assert r == [1, 10, 100]
    assert compute(t[['x']]) == [(1,), (10,), (100,)]
    assert compute(t.count()) == 3

def test_column(sql):
    t = Data(sql)
    r = list(t['x'])
    assert r == [1, 10, 100]
    assert list(t[['x']]) == [(1,), (10,), (100,)]
    assert int(t.count()) == 3

def test_swap_resources_into_scope():
    from blaze import Data
    t = Data([1, 2, 3], dshape='3 * int', name='t')
    expr, scope = swap_resources_into_scope(t.head(2), {t: t.data})
    assert t._resources()
    assert not expr._resources()
    assert t not in scope

def test_add_data_to_empty_server(empty_server, serial):
    # add data
    with temp_server() as test:
        iris_path = example('iris.csv')
        blob = serial.dumps({'iris': iris_path})
        response1 = empty_server.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        response2 = empty_server.get('/datashape')
        expected2 = str(discover({'iris': resource(iris_path)}))
        assert response2.data.decode('utf-8') == expected2

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()
        response3 = empty_server.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial),
        )
        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3

def test_interactive_dshape_works():
    try:
        d = Data('mongodb://localhost:27017/test_db::bank',
                 dshape='var * {name: string, amount: int64}')
    except pymongo.errors.ConnectionFailure:
        pytest.skip('No mongo server running')
    assert d.dshape == dshape('var * {name: string, amount: int64}')

def test_into_cds_mixed():
    pytest.importorskip('bokeh')
    from bokeh.objects import ColumnDataSource
    n = 25
    ddict = {
        'first': np.random.choice(list('abc'), size=n),
        'second': np.random.choice(['cachaça', 'tres leches', 'pizza'], size=n),
        'third': list(range(n)),
    }
    df = pd.DataFrame(ddict)
    with tmpfile('.csv') as fn:
        df.to_csv(fn, header=None, index=False, encoding='utf8')
        csv = CSV(fn, columns=['first', 'second', 'third'], encoding='utf8')
        t = Data(csv)

        cds = into(ColumnDataSource, t)
        assert isinstance(cds, ColumnDataSource)
        expected = dict(
            (k, into(list, csv[:, k])) for k in ['first', 'second', 'third'])
        assert cds.data == expected

        cds = into(ColumnDataSource, t[['first', 'second']])
        assert isinstance(cds, ColumnDataSource)
        expected = dict(
            (k, into(list, csv[:, k])) for k in ['first', 'second'])
        assert cds.data == expected

        cds = into(ColumnDataSource, t['first'])
        assert isinstance(cds, ColumnDataSource)
        assert cds.data == {'first': into(list, csv[:, 'first'])}

def test_interactive_dshape_works(mongo_host_port):
    try:
        d = Data('mongodb://{}:{}/test_db::bank'.format(*mongo_host_port),
                 dshape='var * {name: string, amount: int64}')
    except pymongo.errors.ConnectionFailure:
        pytest.skip('No mongo server running')
    assert d.dshape == dshape('var * {name: string, amount: int64}')

def test_url_csv_data(iris_local):
    iris_remote = Data(iris_url)
    assert isinstance(iris_remote.data, URL(CSV))
    iris_remote_df = compute(iris_remote)
    assert isinstance(iris_remote_df, pd.DataFrame)
    iris_local_df = compute(iris_local)
    tm.assert_frame_equal(iris_remote_df, iris_local_df)

def test_series_single_column_projection():
    data = [('Alice', -200.0, 1), ('Bob', -300.0, 2)]
    t = Data(data, '2 * {name: string, amount: float64, id: int64}')
    df = into(pd.Series, t[['name']])
    assert isinstance(df, pd.Series)
    expected = pd.DataFrame(data, columns=t.schema.measure.names).name
    assert str(df) == str(expected)

def test_detect_types(self):
    for dataset in config['datasets']:
        for uri in dataset['uris']:
            data = Data(uri)
            result = al.types(data)
            self.check_type(result, dataset['types'], dataset['table'])
            print('for %s on %s' % (dataset['table'], getengine(dataset['uris'])))

def test_numeric_cols(self):
    for dataset in config['datasets']:
        for uri in dataset['uris']:
            data = Data(uri)
            result = al.get_numeric_cols(data.dshape)
            eq_(set(result), set(dataset['types']['numbers']))
            print('for %s on %s' % (dataset['table'], getengine(dataset['uris'])))

def test_gains(self):
    for dataset in config['datasets']:
        for uri in dataset['uris']:
            data = Data(uri)
            types = al.types(data)
            result = al.groupmeans(data, types['groups'], types['numbers'])
            self.check_gain(result, dataset['groupmeans']['gain'], uri)
            print('for %s on %s' % (dataset['table'], getengine(dataset['uris'])))

def to_blaze(self):
    from blaze.server.client import Client
    from blaze.server import from_tree
    from blaze import Data
    # hacky - blaze urls have `compute.json` in it, but we need to strip it off
    # to feed it into the blaze client lib
    c = Client(self.data_url.rsplit('compute.json', 1)[0])
    d = Data(c)
    return from_tree(self.expr, {':leaf': d})

def test_stats(self):
    for dataset in config['datasets']:
        for uri in dataset['uris']:
            data = Data(uri)
            groups = dataset['types']['groups']
            result = al.crosstabs(data, groups, details=False)
            expected = pd.DataFrame(dataset['crosstabs'])
            self.check_stats(result, expected, uri)
            print('for %s on %s' % (dataset['table'], getengine(dataset['uris'])))

def test_expr_client_interactive():
    c = Client('localhost:6363')
    t = Data(c)

    assert compute(t.accounts.name) == ['Alice', 'Bob']
    assert (into(set,
                 compute(by(t.accounts.name,
                            min=t.accounts.amount.min(),
                            max=t.accounts.amount.max()))) ==
            set([('Alice', 100, 100), ('Bob', 200, 200)]))

def test_gains_changed_types(self):
    # Issue #24
    for dataset in config['datasets']:
        if 'changedtypes' not in dataset:
            continue
        for uri in dataset['uris']:
            data = Data(uri)
            types = dataset['changedtypes']
            result = al.groupmeans(data, types['groups'], types['numbers'])
            self.check_gain(result, types['groupmeans']['gain'], uri)
            print('for %s on %s' % (dataset['table'], getengine(dataset['uris'])))

def test_expressions():
    sources = [v for k, v in data if k not in [nd.array, CSV, Data]]
    targets = [
        v for k, v in no_date
        if k not in [Data, CSV, Collection, nd.array, PyTable, SQL]
    ]
    for a in sources:
        for b in targets:
            c = Data(
                a,
                "var * {amount: int64, id: int64, name: string, timestamp: datetime}",
            )[['amount', 'id', 'name']]
            assert normalize(into(type(b), c)) == normalize(b)

def test_pre_compute_on_multiple_datasets_is_selective():
    from odo import CSV
    from blaze import Data
    from blaze.cached import CachedDataset

    df = pd.DataFrame([[1, 'Alice', 100],
                       [2, 'Bob', -200],
                       [3, 'Charlie', 300],
                       [4, 'Denis', 400],
                       [5, 'Edith', -500]],
                      columns=['id', 'name', 'amount'])
    iris = CSV(example('iris.csv'))
    dset = CachedDataset({'df': df, 'iris': iris})

    d = Data(dset)
    assert str(compute(d.df.amount)) == str(df.amount)

def setUpModule():
    "Download test data files into data/ target folder"
    # Set root logger logging level to INFO
    logging.basicConfig(level=logging.INFO)

    # Download datasets
    for dataset in config['datasets']:
        dataset['path'] = os.path.join(DATA_DIR, dataset['table'] + '.csv')
        dataset['uris'] = [dataset['path']]
        if not os.path.exists(dataset['path']):
            logging.info('Downloading %s', dataset['table'])
            pd.read_csv(dataset['url'], encoding='cp1252').to_csv(
                dataset['path'], index=False, encoding='cp1252')

    # Create autolysis databases (sqlite3 data directory is DATA_DIR)
    os.chdir(DATA_DIR)
    dburl = {}
    for db, url in config['databases'].items():
        if not server_exists(url):
            continue
        dburl[db] = url

    # Load datasets into databases
    for dataset in config['datasets']:
        for db in dataset.get('databases', []):
            if db not in dburl:
                logging.warning('%s cannot use unconfigured DB %s',
                                dataset['table'], db)
                continue
            url = dburl[db]
            # Don't load data if a non-empty table already exists
            target = dburl[db] + '::' + dataset['table']
            dataset['uris'].append(target)
            engine = sa.create_engine(url, encoding='utf-8')
            if engine.dialect.has_table(engine.connect(), dataset['table']):
                if Data(target).count() > 0:
                    continue
            logging.info('Creating table %s on %s', dataset['table'], db)
            try:
                odo(dataset['path'], target)
            except sa.exc.InternalError:
                logging.warning('Loading %s into %s failed: %s',
                                dataset['table'], db, traceback.format_exc(0))

def test_add_data_to_server(serial):
    with temp_server(data) as test:
        # add data
        initial_datashape = datashape.dshape(
            test.get('/datashape').data.decode('utf-8'))
        iris_path = example('iris.csv')
        blob = serial.dumps({'iris': iris_path})
        response1 = test.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        new_datashape = datashape.dshape(
            test.get('/datashape').data.decode('utf-8'))
        data2 = data.copy()
        data2.update({'iris': resource(iris_path)})
        expected2 = datashape.dshape(discover(data2))
        assert_dshape_equal(new_datashape, expected2)
        assert new_datashape.measure.fields != initial_datashape.measure.fields

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()
        response3 = test.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial),
        )
        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3

import numpy as np  # used below but missing from the original snippet

from bokeh.plotting import figure, show, output_server
from bokeh.transforms import image_downsample
from blaze.server.client import Client
from blaze import Data

N = 1000

x = np.linspace(0, 10, N)
y = np.linspace(0, 10, N)
xx, yy = np.meshgrid(x, y)
d = np.sin(xx) * np.cos(yy)

output_server("remote_image")

c = Client('http://localhost:5006')
d = Data(c)

source = image_downsample.source()
source.from_blaze(d.array, local=True)

plot = figure(
    x_range=[0, 10],
    y_range=[0, 10],
)
plot.image(source=source,
           image="image", x="x", y="y", dw="dw", dh="dh",
           palette="Spectral11",
           tools="pan,wheel_zoom,box_zoom,reset,previewsave")

def test_interactive_len(sql):
    t = Data(sql)
    assert len(t) == int(t.count())

def test_Data_construct_with_table(bank):
    d = Data('mongodb://localhost/test_db::bank')
    assert set(d.fields) == set(('name', 'amount'))
    assert int(d.count()) == 5

def test_Data_construct(bank, points):
    d = Data('mongodb://localhost/test_db')
    assert 'bank' in d.fields
    assert 'points' in d.fields
    assert isinstance(d.dshape.measure, Record)

def test_Data_construct_with_table(bank, mongo_host_port):
    d = Data('mongodb://{}:{}/test_db::bank'.format(*mongo_host_port))
    assert set(d.fields) == set(('name', 'amount'))
    assert int(d.count()) == 5

def test_Data_construct(bank, points, mongo_host_port):
    d = Data('mongodb://{}:{}/test_db'.format(*mongo_host_port))
    assert 'bank' in d.fields
    assert 'points' in d.fields
    assert isinstance(d.dshape.measure, Record)

import logging
import os

import pandas as pd
from blaze import Data
from odo import odo

import IO
import IO_twitter  # assumed local helper module providing TwitterAPI; not imported in the original snippet

logger = logging.getLogger(__name__)  # `logger` is assumed to be configured elsewhere

dataPath = os.getcwd() + '/data'
twts_data = IO.IO_json(dataPath, 'twitterData').load()
logger.debug('Type of loaded json is %s' % str(type(twts_data)))

twts_read = IO_twitter.TwitterAPI().parseTweets(twts_data)
fields = ['id', 'created_at', 'user_id', 'user_name', 'tweet_text', 'url']
twts_pd_df = pd.DataFrame(twts_read, columns=fields)
twts_pd_df.head()
twts_pd_df.describe()

twts_bz_df = Data(twts_pd_df)
twts_bz_df.schema
twts_bz_df.dshape
twts_bz_df.data

tweet_text_distinct = twts_bz_df.tweet_text.distinct()
tweet_text_distinct
twts_bz_df[['id', 'user_name', 'tweet_text']].distinct()

twts_odo_df = Data(twts_pd_df)
print(twts_odo_df.dshape)

odo(twts_bz_df, 'jsonlines://{}/{}.json'.format(dataPath, 'jsonData'))

def test_into_list_Column():
    with filetext('Alice,1\nBob,2') as fn:
        csv = CSV(fn, columns=['name', 'id'])
        t = Data(csv)
        assert into(list, t.name) == ['Alice', 'Bob']

def test_into_numpy_from_tableexpr_with_option_types():
    t = Data([[1, 'Alice'], [2, 'Bob']],
             '2 * {id: ?int32, name: string[5, "ascii"]}')
    assert into(np.ndarray, t).dtype == np.dtype([('id', 'i4'), ('name', 'S5')])

def test_into_csv_blaze_table(good_csv):
    t = Data(CSV(good_csv))
    df = into(pd.DataFrame, t[['userid', 'text']])
    assert list(df.columns) == ['userid', 'text']