def test_sql_to_csv(sql, csv, tmpdir):
    sql, bind = sql
    sql = odo(csv, sql, bind=bind)
    with tmpfile('.csv', dir=tmpdir) as fn:
        csv = odo(sql, fn, bind=bind)
        assert odo(csv, list) == data
        assert discover(csv).measure.names == discover(sql).measure.names

def test_temp_ssh_files():
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        scsv = into(Temp(SSH(CSV)), csv, hostname='localhost')
        assert discover(csv) == discover(scsv)
        assert isinstance(scsv, _Temp)

def correct_commodities():
    src_dir = path.join(data_dir, 'agmarknet/by_commodity')
    init_dir = os.getcwd()
    os.chdir(src_dir)
    folders = glob.glob('*')
    csv_dir = os.getcwd()
    for folder in folders:
        os.chdir(path.join(csv_dir, folder))
        files = glob.glob('*_all.csv')
        for file in files:
            # Have to use resource to discover URIs
            csvr = odo.resource(path.join(csv_dir, folder, file))
            num_col = len(odo.discover(csvr)[1].types)
            if num_col == 9:
                ds = bz.dshape(
                    'var * {date: datetime, state: ?string, market: ?string, '
                    'commodity: ?string, variety: ?string, arrival: ?string, '
                    'min: ?string, max: ?string, modal: ?string}')
            elif num_col == 10:
                ds = bz.dshape(
                    'var * {date: datetime, state: ?string, market: ?string, '
                    'commodity: ?string, variety: ?string, arrival: ?string, '
                    'grade: ?string, min: ?string, max: ?string, modal: ?string}')
            else:
                ds = odo.discover(csvr)
            d = bz.Data(path.join(csv_dir, folder, file), dshape=ds)
            if num_col == 10:
                d = bz.transform(d, grade=d.grade.map(lambda x: x.strip(),
                                                      'string'))
            d = bz.transform(d, commodity=d.commodity.map(lambda x: x.strip(),
                                                          'string'))
            d = bz.transform(d, commodity=d.commodity.map(
                lambda x: commodity_corrections.get(x, x), 'string'))
            d = bz.transform(d, state=d.state.map(lambda x: x.strip(),
                                                  'string'))
            d = bz.transform(d, state=d.state.map(
                lambda x: state_corrections.get(x, x), 'string'))
            d = bz.transform(d, market=d.market.map(lambda x: x.strip(),
                                                    'string'))
            # NB: blaze transforms build lazy expressions; nothing is written
            # back here -- ``d`` is rebuilt for each file and then dropped.
    os.chdir(init_dir)  # restore the original working directory

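# ``correct_commodities`` assumes a few module-level names defined before it in
# the real module: ``data_dir`` plus the ``commodity_corrections`` and
# ``state_corrections`` mappings. A hypothetical sketch of their shape --
# plain dicts from raw spellings found in the scraped CSVs to canonical ones
# (the actual entries live elsewhere in this project):

data_dir = 'data'  # hypothetical location of the scraped agmarknet files

commodity_corrections = {
    # hypothetical example entry: raw spelling -> canonical spelling
    'Paddy(Dhan)': 'Paddy',
}

state_corrections = {
    # hypothetical example entry
    'Chattisgarh': 'Chhattisgarh',
}
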
def test_s3_to_ssh():
    pytest.importorskip('boto')
    tips_uri = 's3://nyqpug/tips.csv'
    with tmpfile('.csv') as fn:
        result = into(Temp(SSH(CSV))(fn, hostname='localhost'), tips_uri)
        assert into(list, result) == into(list, tips_uri)
        assert discover(result) == discover(resource(tips_uri))

def test_sql_to_csv(sql, csv):
    sql = odo(csv, sql)
    with tmpfile('.csv') as fn:
        csv = odo(sql, fn)
        assert odo(csv, list) == data

        # explicitly test that we do NOT preserve the header here
        assert discover(csv).measure.names != discover(sql).measure.names

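# Both ``test_sql_to_csv`` variants above assume ``sql``/``csv`` fixtures and a
# module-level ``data`` list (the first variant also unpacks a ``bind`` engine
# from its fixture). A minimal sketch of what such fixtures might look like,
# using odo's sqlite URI support -- an assumption; the suite's real fixtures
# may differ:

import pytest

data = [('Alice', 100), ('Bob', 200)]


@pytest.fixture
def csv(tmpdir):
    # write the reference rows to a temporary csv with a header
    fn = str(tmpdir.join('input.csv'))
    with open(fn, 'w') as f:
        f.write('name,amount\n')
        f.writelines('%s,%d\n' % row for row in data)
    return CSV(fn)


@pytest.fixture
def sql(tmpdir):
    # an empty sqlite-backed table for odo to append into (hypothetical URI)
    uri = 'sqlite:///%s::names' % tmpdir.join('db.db')
    return resource(uri, dshape='var * {name: string, amount: int64}')
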
def test_append_other():
    with tmpfile('.hdf5') as fn:
        x = into(np.ndarray, df)
        dset = into('hdfstore://' + fn + '::/data', x)
        try:
            assert discover(dset) == discover(df)
        finally:
            dset.parent.close()

def test_into_resource():
    with tmpfile('.hdf5') as fn:
        d = into('hdfstore://' + fn + '::/x', df)
        try:
            assert discover(d) == discover(df)
            assert eq(into(pd.DataFrame, d), df)
        finally:
            d.parent.close()

def test_jsonlines_to_s3():
    with tmpfile('.json') as fn:
        with open(fn, mode='w') as f:
            for row in js:
                f.write(pd.io.json.dumps(row))
                f.write(os.linesep)
        with s3_bucket('.json') as b:
            result = into(b, resource(fn))
            assert discover(result) == discover(js)

def discover_bson(b, n=10, **kwargs):
    # peek at the first ``n`` documents to infer a datashape
    with bson_lines(b.path) as lines:
        data = list(take(n, lines))
    if len(data) < n:
        ds = discover(data)
    else:
        ds = var * discover(data).subshape[0]
    return ds

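# ``discover_bson`` relies on a ``bson_lines`` context manager plus datashape's
# ``var``/``discover`` and toolz's ``take``. A minimal sketch of the helper,
# assuming pymongo's ``bson`` module (the real implementation in odo may
# differ):

from contextlib import contextmanager

import bson  # ships with pymongo


@contextmanager
def bson_lines(path):
    # stream decoded documents out of a .bson file one at a time
    with open(path, 'rb') as f:
        yield bson.decode_file_iter(f)
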
def test_ssh_csv_to_s3_csv():
    # for some reason this can only be run in the same file as other ssh tests
    # and must be a Temp(SSH(CSV)) otherwise tests above this one fail
    s3_bucket = pytest.importorskip('odo.backends.tests.test_aws').s3_bucket
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        remote = into(Temp(SSH(CSV)), CSV(fn), hostname='localhost')
        with s3_bucket('.csv') as b:
            result = into(b, remote)
            assert discover(result) == discover(resource(b))

def test_float_dtype(sql_with_floats):
    sql_with_floats, bind = sql_with_floats
    expected = dshape("var * {a: float64, b: ?float64}")
    assert_dshape_equal(discover(sql_with_floats), expected)

    # Also check that reflection from the database returns the expected dshape.
    assert_dshape_equal(discover(bind).subshape[sql_with_floats.name], expected)

def test_discover():
    df = pd.DataFrame(
        {"x": list("a" * 5 + "b" * 5 + "c" * 5),
         "y": np.arange(15, dtype=np.int64),
         "z": list(map(float, range(15)))},
        columns=["x", "y", "z"],
    )
    df.x = df.x.astype("category")
    ddf = dd.from_pandas(df, npartitions=2)
    assert_dshape_equal(
        discover(ddf),
        var * Record([("x", Categorical(["a", "b", "c"])),
                      ("y", int64),
                      ("z", float64)]),
    )
    assert_dshape_equal(discover(ddf.x), var * Categorical(["a", "b", "c"]))

def test_join_type_promotion(sqla, sqlb):
    t = symbol(sqla.name, discover(sqla))
    s = symbol(sqlb.name, discover(sqlb))
    expr = join(t, s, 'B', how='inner')
    result = set(map(tuple,
                     compute(expr, {t: sqla, s: sqlb}).execute().fetchall()))
    expected = {(1, 'a', 'a'), (1, None, 'a')}
    assert result == expected

def test_concat(sql_two_tables):
    t_table, u_table = sql_two_tables
    t_data = pd.DataFrame(np.arange(5), columns=['a'])
    u_data = pd.DataFrame(np.arange(5, 10), columns=['a'])
    odo(t_data, t_table)
    odo(u_data, u_table)

    t = symbol('t', discover(t_data))
    u = symbol('u', discover(u_data))
    tm.assert_frame_equal(
        compute(concat(t, u).sort('a'), {t: t_table, u: u_table},
                return_type=pd.DataFrame),
        pd.DataFrame(np.arange(10), columns=['a']),
    )

def test_discover_nested():
    with tmpfile('hdf5') as fn:
        df.to_hdf(fn, '/a/b/data')
        df.to_hdf(fn, '/a/b/data2')
        df.to_hdf(fn, '/a/data')
        hdf = pd.HDFStore(fn)
        try:
            assert discover(hdf) == discover(
                {'a': {'b': {'data': df, 'data2': df}, 'data': df}}
            )
        finally:
            hdf.close()

def test_url_to_hdfs():
    from .test_hdfs import tmpfile_hdfs, hdfs, HDFS

    with tmpfile_hdfs() as target:
        # build temp csv for assertion check
        url_csv = resource(iris_url)
        csv = convert(Temp(CSV), url_csv)

        # test against url
        scsv = HDFS(CSV)(target, hdfs=hdfs)
        odo(iris_url, scsv)
        assert discover(scsv) == discover(csv)

def test_copy_remote_csv():
    with tmpfile('csv') as target:
        with filetext('name,balance\nAlice,100\nBob,200',
                      extension='csv') as fn:
            csv = resource(fn)
            uri = 'ssh://localhost:%s.csv' % target
            scsv = into(uri, csv)
            assert isinstance(scsv, SSH(CSV))
            assert discover(scsv) == discover(csv)

            # Round trip
            csv2 = into(target, scsv)
            assert into(list, csv) == into(list, csv2)

def test_sample(big_sql):
    nn = symbol('nn', discover(big_sql))
    nrows = odo(compute(nn.nrows, big_sql), int)
    result = compute(nn.sample(n=nrows // 2), big_sql,
                     return_type=pd.DataFrame)
    assert len(result) == nrows // 2
    result2 = compute(nn.sample(frac=0.5), big_sql,
                      return_type=pd.DataFrame)
    assert len(result) == len(result2)

def test_isin_selectable(sql):
    s = symbol('s', discover(sql))
    # wrap the resource in a select
    assert compute(s.B.isin({1, 3}),
                   sa.select(sql._resources()[sql].columns),
                   return_type=list) == [(True,), (False,)]

def test_shift_on_column(n, column, sql):
    sql = sql.data
    t = symbol('t', discover(sql))
    expr = t[column].shift(n)
    result = compute(expr, sql, return_type=pd.Series)
    expected = odo(sql, pd.DataFrame)[column].shift(n)
    tm.assert_series_equal(result, expected)

def test_relabel_columns_over_selection(big_sql):
    t = symbol('t', discover(big_sql))
    result = compute(t[t['B'] == 2].relabel(B=u'b'),
                     big_sql, return_type=pd.DataFrame)
    expected = pd.DataFrame([['a', 2]], columns=[u'A', u'b'])
    tm.assert_frame_equal(result, expected)

def test_slicing_with_lists():
    nx = np.arange(20).reshape((4, 5))
    dx = from_array(nx, (2, 2))
    sx = symbol('x', discover(dx))

    expr = sx[[2, 0, 3]]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[::2, [2, 0, 3]]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[1, [2, 0, 3]]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[[2, 0, 3], -2]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[:, :]
    assert compute(expr, dx).dask == dx.dask

    expr = sx[0]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

    expr = sx[0, [3, 1, 4]]
    assert eq(np.array(compute(expr, dx)), compute(expr, nx))

def test_append_convert(empty_bank, raw_bank):
    ds = discover(raw_bank)
    assert set(ds.measure.names) == {'name', 'amount'}

    append(empty_bank, raw_bank, dshape=ds)
    assert odo(empty_bank, list, dshape=ds) == list(
        pluck(ds.measure.names, raw_bank)
    )

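# ``pluck`` above is toolz.pluck: given a list of keys it turns a sequence of
# mappings into a sequence of tuples, matching the list-of-tuples form that
# ``odo(empty_bank, list)`` produces. A quick illustration with made-up rows:

from toolz import pluck

rows = [{'name': 'Alice', 'amount': 100}, {'name': 'Bob', 'amount': 200}]
assert list(pluck(['name', 'amount'], rows)) == [('Alice', 100), ('Bob', 200)]
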
def test_discover_with_dotted_names():
    with tmpfile('.csv') as fn:
        with open(fn, 'w') as f:
            f.write('a.b,c.d\n1,2\n3,4')
        dshape = discover(resource(fn))
        assert dshape == datashape.dshape('var * {"a.b": int64, "c.d": int64}')
        assert dshape.measure.names == [u'a.b', u'c.d']

def test_complex_into(complex_csv, complex_sql):
    complex_sql, bind = complex_sql
    # data from: http://dummydata.me/generate
    into(complex_sql, complex_csv, dshape=discover(complex_sql), bind=bind)
    assert_allclose(into(list, complex_sql, bind=bind),
                    into(list, complex_csv))

def test_multiple_object_ids():
    data = [{'x': 1, 'y': 2, 'other': ObjectId('1' * 24)},
            {'x': 3, 'y': 4, 'other': ObjectId('2' * 24)}]
    with coll(data) as c:
        assert discover(c) == dshape('2 * {x: int64, y: int64}')
        assert convert(list, c) == [(1, 2), (3, 4)]

def test_coerce_bool_and_sum(sql):
    n = sql.name
    t = symbol(n, discover(sql))
    expr = (t.B > 1.0).coerce(to='int32').sum()
    result = compute(expr, sql).scalar()
    expected = odo(compute(t.B, sql), pd.Series).gt(1).sum()
    assert result == expected

def test_s3_jsonlines_discover():
    json_dshape = discover(resource('s3://nyqpug/tips.json'))
    names = list(map(str, sorted(json_dshape.measure.names)))
    assert names == ['day', 'sex', 'size', 'smoker', 'time', 'tip',
                     'total_bill']
    types = [json_dshape.measure[name] for name in names]
    assert types == [string, string, int64, string, string, float64, float64]

def test_header_mix_str_digits():
    ds = datashape.dshape('''var * {"On- or Off- Budget": ?string,
                                    "1990": ?string}''')
    with filetext('On- or Off- Budget,1990\nOn Budget,-628\n'
                  'Off budget,"5,962"\n') as fn:
        csv = CSV(fn, has_header=True)
        # the frame itself is unused; converting exercises the parse path
        df = convert(pd.DataFrame, csv)
        assert discover(csv).measure == ds.measure

def test_dist(nyc):
    def distance(lat1, lon1, lat2, lon2, R=3959):
        # http://andrew.hedges.name/experiments/haversine/
        # note: cos() receives latitudes in degrees here; both sides of the
        # assertion below share this formula, so the comparison stays consistent
        dlon = radians(lon2 - lon1)
        dlat = radians(lat2 - lat1)
        a = sin(dlat / 2.0) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0) ** 2
        return R * 2 * atan2(sqrt(a), sqrt(1 - a))

    t = symbol('t', discover(nyc))
    filtered = t[
        (t.pickup_latitude >= 40.477399) &
        (t.pickup_latitude <= 40.917577) &
        (t.dropoff_latitude >= 40.477399) &
        (t.dropoff_latitude <= 40.917577) &
        (t.pickup_longitude >= -74.259090) &
        (t.pickup_longitude <= -73.700272) &
        (t.dropoff_longitude >= -74.259090) &
        (t.dropoff_longitude <= -73.700272) &
        (t.passenger_count < 6)
    ]
    dist = distance(filtered.pickup_latitude, filtered.pickup_longitude,
                    filtered.dropoff_latitude, filtered.dropoff_longitude)
    transformed = transform(filtered, dist=dist)
    assert (
        compute(transformed.dist.max(), nyc, return_type=float) ==
        compute(transformed.dist, nyc, return_type=pd.Series).max()
    )

def test_shift_arithmetic(sql, n):
    t = symbol('t', discover(sql))
    expr = t.B - t.B.shift(n)
    result = compute(expr, sql, return_type=pd.Series)
    df = odo(sql, pd.DataFrame)
    expected = df.B - df.B.shift(n)
    tm.assert_series_equal(result, expected)

def test_s3_to_sqlite():
    with tmpfile('.db') as fn:
        tb = into('sqlite:///%s::tips' % fn, tips_uri,
                  dshape=discover(resource(tips_uri)))
        lhs = into(list, tb)
        assert lhs == into(list, tips_uri)

def test_pandas_loads_in_datetimes_naively():
    with filetext('name,when\nAlice,2014-01-01\nBob,2014-02-02') as fn:
        csv = CSV(fn, has_header=True)
        ds = datashape.dshape('var * {name: ?string, when: ?datetime}')
        assert discover(csv) == ds

        df = convert(pd.DataFrame, csv)
        assert df.dtypes['when'] == 'M8[ns]'

def test_join(db, ctx):
    expr = join(db.t, db.s)
    result = compute(expr, ctx)
    expected = compute(expr, {db: {'t': df, 's': cities_df}})
    assert isinstance(result, (SparkDataFrame, SchemaRDD))
    assert into(set, result) == into(set, expected)
    assert discover(result) == expr.dshape