def test_csv_into_mongodb_columns(empty_collec, file_name):
    csv = CSV(file_name, schema='{x: int, y: int}')
    coll = empty_collec
    lhs = into(list, csv)
    assert lhs == into(list, into(coll, csv))
def test_literals(db, ctx):
    expr = db.t[db.t.amount >= 100]
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert list(map(set, into(list, result))) == list(
        map(set, into(list, expected))
    )
def test_sort(ctx, db, field, ascending):
    expr = db.t.sort(field, ascending=ascending)
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert list(map(set, into(list, result))) == list(
        map(set, into(list, expected))
    )
def test_multikey_by(ctx, db, reducer, reduction):
    t = db.t
    expr = by(t[['id', 'amount']], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
def test_selection(ctx, db):
    expr = db.t[db.t.amount > 50]
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df}}, return_type='native')
    assert list(map(set, into(list, result))) == list(
        map(set, into(list, expected))
    )
def test_by(ctx, db, grouper, reducer, reduction):
    t = db.t
    expr = by(t[grouper], total=getattr(t[reducer], reduction)())
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df}})
    assert (set(map(frozenset, into(list, result))) ==
            set(map(frozenset, into(list, expected))))
def test_csv_into_mongodb_colon_del(empty_collec, file_name_colon):
    csv = CSV(file_name_colon)
    coll = empty_collec
    lhs = into(list, csv)
    newcoll = into(coll, csv)
    rhs = into(list, newcoll)
    assert lhs == rhs
def test_failing_argument():
    tbl = 'testtable_into_2'
    csv = CSV(file_name, columns=['a', 'b'])
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    # failing call
    into(sql, csv, if_exists="replace", skipinitialspace="alpha")
def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df, 's': cities_df}})
    assert isinstance(result, (SparkDataFrame, SchemaRDD))
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape
def test_repr_hdma():
    csv = CSV(example('hmda-small.csv'))
    t = TableSymbol('hmda', csv.schema)
    assert len(into(list, compute(t.head(), csv))) == 10
    columns = ['action_taken_name', 'agency_abbr', 'applicant_ethnicity_name']
    assert into(list, compute(t[columns].head(), csv))
def test_csv_into_mongodb(empty_collec):
    csv = CSV(file_name)
    coll = empty_collec
    into(coll, csv)
    mongo_data = list(coll.find({}, {'_0': 1, '_id': 0}))
    assert list(csv[:, '_0']) == [i['_0'] for i in mongo_data]
def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx, return_type='native')
    expected = compute(expr, {db: {'t': df, 's': cities_df}},
                       return_type='native')
    assert isinstance(result, SparkDataFrame)
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape
def test_join_diff_contexts(db, ctx, cities):
    expr = join(db.t, db.s, 'name')
    people = ctx.table('t')
    cities = into(SchemaRDD, cities, dshape=discover(ctx.table('s')))
    scope = {db: {'t': people, 's': cities}}
    result = compute(expr, scope)
    expected = compute(expr, {db: {'t': df, 's': cities_df}})
    assert into(set, result) == into(set, expected)
def test_sql_new_schema():
    with non_existing_schema('myschema2'):
        sql = SQL(url, 'accounts', schema_name='myschema2',
                  schema='{name: string, value: int}')
        into(sql, data)
        assert engine.has_table('accounts', schema='myschema2')
        sql2 = SQL(url, 'accounts', schema_name='myschema2')
        assert list(sql2) == data
def test_join(db, ctx): expr = join(db.t, db.s) result = compute(expr, ctx) expected = compute(expr, {db: {"t": df, "s": cities_df}}) assert isinstance(result, SparkDataFrame) assert into(set, result) == into(set, expected) assert discover(result) == expr.dshape
def test_chunks_compute():
    exprs = [s, s + 1, s.max(), s.mean() + 1, s.head()]
    for e in exprs:
        result = compute(e, {s: cL})
        expected = compute(e, {s: L})
        if iscollection(e.dshape):
            result = into(list, result)
            expected = into(list, expected)
        assert result == expected
def test_get_datetimes():
    response = test.post('/compute.json',
                         data=json.dumps({'expr': 'events'}),
                         content_type='application/json')
    assert 'OK' in response.status
    data = json.loads(response.data)
    result = nd.array(data['data'], type=data['datashape'])
    assert into(list, result) == into(list, events)
def test_no_header_no_columns():
    tbl = 'testtable_into_2'
    csv = CSV(file_name)
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace")
    assert into(list, sql) == [(1, 2), (10, 20), (100, 200)]
def test_csv_json_chunked(self):
    with filetext('1,1\n2,2\n') as csv_fn:
        with filetext('') as json_fn:
            schema = '{a: int32, b: int32}'
            csv = CSV(csv_fn, schema=schema)
            json = JSON_Streaming(json_fn, mode='r+', schema=schema)
            into(json, csv)
            self.assertEquals(tuplify(tuple(json)), ((1, 1), (2, 2)))
def test_simple_into():
    tbl = 'testtable_into_2'
    csv = CSV(file_name, columns=['a', 'b'])
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace")
    assert into(list, sql) == [(1, 2), (10, 20), (100, 200)]
def test_simple_float_into():
    tbl = 'testtable_into_float'
    csv = CSV(file_name_floats, columns=['a', 'b'])
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace")
    assert into(list, sql) == \
        [(1.02, 2.02), (102.02, 202.02), (1002.02, 2002.02)]
def test_tryexcept_into():
    tbl = 'testtable_into_2'
    csv = CSV(file_name, columns=['a', 'b'])
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    # uses multi-byte character and fails over to using sql.extend()
    into(sql, csv, if_exists="replace", QUOTE="alpha", FORMAT="csv")
    assert into(list, sql) == [(1, 2), (10, 20), (100, 200)]
def test_csv_hdf5(self):
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            csv = CSV(csv_fn, schema='{a: int32, b: int32}')
            hdf5 = resource(hdf5_fn + '::/data',
                            dshape='var * {a: int32, b: int32}')
            into(hdf5, csv)
            self.assertEquals(hdf5[:].tolist(), [(1, 1), (2, 2)])
def test_sql_schema_behavior():
    with existing_schema('myschema'):
        sql = SQL(url, 'accounts', schema_name='myschema',
                  schema='{name: string, value: int}')
        into(sql, data)
        assert engine.has_table('accounts', schema='myschema')

        sql2 = SQL(url, 'accounts', schema_name='myschema')
        assert list(sql2) == data

        # schema may also be given implicitly in the table name
        sql3 = SQL(url, 'myschema.accounts')
        assert list(sql3) == data
def test_leaf_symbol(test, serial):
    query = {'expr': {'op': 'Field', 'args': [':leaf', 'cities']}}
    resp = test.post('/compute', data=serial.dumps(query),
                     headers=mimetype(serial))
    tdata = serial.loads(resp.data)
    a = serial.data_loads(tdata['data'])
    b = into(list, cities)
    assert list(map(tuple, into(list, a))) == b
    assert list(tdata['names']) == cities.columns.tolist()
def test_hour():
    dts = [datetime(2000, 6, 20, 1, 00, 00),
           datetime(2000, 6, 20, 12, 59, 59),
           datetime(2000, 6, 20, 12, 00, 00),
           datetime(2000, 6, 20, 11, 59, 59)]
    dts = into(np.ndarray, dts)
    assert eq(compute(s.truncate(1, 'hour'), dts),
              into(np.ndarray, [datetime(2000, 6, 20, 1, 0),
                                datetime(2000, 6, 20, 12, 0),
                                datetime(2000, 6, 20, 12, 0),
                                datetime(2000, 6, 20, 11, 0)]))
def test_month():
    dts = [datetime(2000, 7, 1), datetime(2000, 6, 30),
           datetime(2000, 6, 1), datetime(2000, 5, 31)]
    dts = into(np.ndarray, dts)
    assert eq(compute(s.truncate(1, 'month'), dts),
              into(np.ndarray, [date(2000, 7, 1), date(2000, 6, 1),
                                date(2000, 6, 1), date(2000, 5, 1)]))
def test_ndarray_into_table(self, dt_tb, dt_data):
    dtype = ds.from_numpy(dt_data.shape, dt_data.dtype)
    t = PyTables(dt_tb, '/out', dtype)
    try:
        res = into(np.ndarray, into(t, dt_data, filename=dt_tb,
                                    datapath='/out'))
        for k in res.dtype.fields:
            lhs, rhs = res[k], dt_data[k]
            if (issubclass(np.datetime64, lhs.dtype.type) and
                    issubclass(np.datetime64, rhs.dtype.type)):
                lhs, rhs = lhs.astype('M8[us]'), rhs.astype('M8[us]')
            assert np.array_equal(lhs, rhs)
    finally:
        t._v_file.close()
def test_get_datetimes(app_context):
    expr = t.events
    query = {'expr': to_tree(expr)}
    response = test.post('/compute.json', data=json.dumps(query),
                         content_type='application/json')
    assert 'OK' in response.status
    data = json.loads(response.data.decode('utf-8'))
    ds = datashape.dshape(data['datashape'])
    result = into(np.ndarray, data['data'], dshape=ds)
    assert into(list, result) == into(list, events)
def test_get_datetimes():
    expr = t.events
    query = {'expr': to_tree(expr)}
    response = test.post('/compute.json', data=json.dumps(query),
                         content_type='application/json')
    assert 'OK' in response.status
    data = json.loads(response.data.decode('utf-8'))
    ds = datashape.dshape(data['datashape'])
    result = into(np.ndarray, data['data'], dshape=ds)
    assert into(list, result) == into(list, events)
def test_get_datetimes(serial):
    expr = t.events
    query = {'expr': to_tree(expr)}
    response = test.post(
        '/compute.{name}'.format(name=serial.name),
        data=serial.dumps(query),
    )
    assert 'OK' in response.status
    data = serial.loads(response.data)
    ds = datashape.dshape(data['datashape'])
    result = into(np.ndarray, data['data'], dshape=ds)
    assert into(list, result) == into(list, events)
def test_get_datetimes(test, serial):
    expr = t.events
    query = {'expr': to_tree(expr)}
    response = test.post('/compute', data=serial.dumps(query),
                         headers=mimetype(serial))
    assert 'OK' in response.status
    tdata = serial.loads(response.data)
    ds = datashape.dshape(tdata['datashape'])
    result = into(np.ndarray, serial.data_loads(tdata['data']), dshape=ds)
    assert into(list, result) == into(list, events)
    assert list(tdata['names']) == events.columns.tolist()
def test_multiple_csv_files(): d = {"mult1.csv": "name,val\nAlice,1\nBob,2", "mult2.csv": "name,val\nAlice,3\nCharlie,4"} data = [("Alice", 1), ("Bob", 2), ("Alice", 3), ("Charlie", 4)] with filetexts(d) as fns: r = resource("mult*.csv") s = symbol("s", discover(r)) for e in [s, s.name, s.name.nunique(), s.name.count_values(), s.val.mean()]: a = compute(e, {s: r}) b = compute(e, {s: data}) if iscollection(e.dshape): a, b = into(set, a), into(set, b) assert a == b
def test_get_datetimes(test, serial):
    expr = t.events
    query = {'expr': to_tree(expr)}
    response = test.post(
        '/compute.{name}'.format(name=serial.name),
        data=serial.dumps(query),
    )
    assert 'OK' in response.status
    data = serial.loads(response.data)
    ds = datashape.dshape(data['datashape'])
    result = into(np.ndarray, data['data'], dshape=ds)
    assert into(list, result) == into(list, events)
    assert data['names'] == events.columns.tolist()
def test_compute_column_wise(iris_server, serial):
    test = iris_server
    t = symbol('t', discover(iris))
    subexpr = ((t.petal_width / 2 > 0.5) &
               (t.petal_length / 2 > 0.5))
    expr = t[subexpr]
    tree = to_tree(expr)
    blob = serial.dumps({'expr': tree})
    resp = test.post('/compute', data=blob, headers=mimetype(serial))
    assert 'OK' in resp.status
    tdata = serial.loads(resp.data)
    result = serial.data_loads(tdata['data'])
    expected = compute(expr, iris)
    assert list(map(tuple, into(list, result))) == into(list, expected)
    assert list(tdata['names']) == t.fields
def test_unused_datetime_columns():
    ds = dshape('2 * {val: string, when: datetime}')
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        csv = CSV(fn, has_header=True)
        s = symbol('s', discover(csv))
        assert into(list, compute(s.val, csv)) == ['a', 'b']
def test_multiple_csv_files():
    d = {'mult1.csv': 'name,val\nAlice,1\nBob,2',
         'mult2.csv': 'name,val\nAlice,3\nCharlie,4'}
    dta = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]
    with filetexts(d) as fns:
        r = data('mult*.csv')
        s = symbol('s', discover(r))
        for e in [s, s.name, s.name.nunique(), s.name.count_values(),
                  s.val.mean()]:
            a = compute(e, {s: r})
            b = compute(e, {s: dta})
            if iscollection(e.dshape):
                a, b = into(set, a), into(set, b)
            assert a == b
def test_sparksql_with_literals():
    srdd = into(sqlContext, data, schema=t.schema)
    expr = t[t.amount >= 100]
    result = compute(expr, srdd)
    assert isinstance(result, SchemaRDD)
    assert (set(map(tuple, result.collect())) ==
            set(map(tuple, compute(expr, data))))
def update_source(self):
    # Relevant
    df = pd.read_csv(self.relevant_data, delimiter='\t', header=None,
                     names=['url', 'timestamp'])
    df['domain'] = df['url'].apply(partial(get_tld, fail_silently=True))
    df1 = df.groupby(['domain']).size()

    # Crawled
    df = pd.read_csv(self.crawled_data, delimiter='\t', header=None,
                     names=['url', 'timestamp'])
    df['domain'] = df['url'].apply(partial(get_tld, fail_silently=True))
    df2 = df.groupby(['domain']).size()

    # Frontier
    df = pd.read_csv(self.frontier_data, delimiter='\t', header=None,
                     names=['url'])
    df['domain'] = df['url'].apply(partial(get_tld, fail_silently=True))
    df3 = df.groupby(['domain']).size()

    df = pd.concat((df1, df2, df3), axis=1)
    df.columns = ['relevant', 'crawled', 'frontier']
    df = df.sort(self.sort, ascending=False).head(25).fillna(value=0)
    for col in df.columns:
        df['%s_half' % col] = df[col] / 2
    df.reset_index(inplace=True)

    source = into(ColumnDataSource, df)
    return source
def bank(db):
    coll = db.bank
    coll = into(coll, bank_raw)
    try:
        yield coll
    finally:
        coll.drop()
def test_field_access(db, ctx):
    for field in db.t.fields:
        expr = getattr(db.t, field)
        result = into(pd.Series, compute(expr, ctx))
        expected = compute(expr, {db: {'t': df}})
        assert result.name == expected.name
        np.testing.assert_array_equal(result.values, expected.values)
def comp(datasets, name):
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)
    try:
        data = json.loads(request.data)
    except ValueError:
        return ("Bad JSON. Got %s " % request.data, 404)

    try:
        dset = datasets[name]
    except KeyError:
        return ("Dataset %s not found" % name, 404)

    t = Symbol(name, discover(dset))
    namespace = data.get('namespace', dict())
    namespace[name] = t

    expr = from_tree(data['expr'], namespace=namespace)

    result = compute(expr, dset)
    if iscollection(expr.dshape):
        result = into(list, result)

    return jsonify({'name': name,
                    'datashape': str(expr.dshape),
                    'data': result})
def big_bank(db):
    data = [{'name': 'Alice', 'amount': 100, 'city': 'New York City'},
            {'name': 'Alice', 'amount': 200, 'city': 'Austin'},
            {'name': 'Bob', 'amount': 100, 'city': 'New York City'},
            {'name': 'Bob', 'amount': 200, 'city': 'New York City'},
            {'name': 'Bob', 'amount': 300, 'city': 'San Francisco'}]
    coll = db.bigbank
    coll = into(coll, data)
    try:
        yield coll
    finally:
        coll.drop()
def test_csv_into_mongodb(empty_collec, file_name):
    csv = CSV(file_name)
    coll = empty_collec
    res = into(coll, csv)
    mongo_data = list(res.find({}, {'_0': 1, '_id': 0}))
    assert list(csv[:, '_0']) == [i['_0'] for i in mongo_data]
def test_jsonarray_into_mongodb(empty_collec):
    filename = tempfile.mktemp(".json")
    with open(filename, "w") as f:
        json.dump(data, f)

    dd = JSON(filename,
              schema="3 * { id : string, name : string, "
                     "posts : var * { content : string, title : string },"
                     " tv_show : string }")
    coll = empty_collec
    into(coll, dd, json_array=True)

    mongo_data = list(coll.find({}, {'_id': 0}))
    assert mongo_data[0] == data[0]
def compserver(dataset):
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)
    try:
        payload = json.loads(request.data.decode('utf-8'))
    except ValueError:
        return ("Bad JSON. Got %s " % request.data, 404)

    ns = payload.get('namespace', dict())
    ns[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(payload['expr'], namespace=ns)
    assert len(expr._leaves()) == 1
    leaf = expr._leaves()[0]

    try:
        result = compute(expr, {leaf: dataset})
    except Exception as e:
        return ("Computation failed with message:\n%s" % e, 500)

    if iscollection(expr.dshape):
        result = into(list, result)

    return json.dumps({'datashape': str(expr.dshape),
                       'data': result}, default=json_dumps)
def test_string_dataset(tmpcsv):
    raw = 'a,b,2.0\nc,1999,3.0\nd,3.0,4.0'
    with open(tmpcsv, mode='w') as f:
        f.write(raw)
    csv = CSV(tmpcsv, columns=list('xyz'))
    t = Table(csv)
    x = into(list, t)
    assert x == [('a', 'b', 2.0), ('c', '1999', 3.0), ('d', '3.0', 4.0)]
def test_expr_client_interactive():
    ec = Client('localhost:6363', 'accounts')
    t = Table(ec)

    assert compute(t.name) == ['Alice', 'Bob']
    assert (into(set, compute(by(t.name, min=t.amount.min(),
                                 max=t.amount.max()))) ==
            set([('Alice', 100, 100), ('Bob', 200, 200)]))