def test_convert_logfiles_to_bag():
    with filetexts({'a1.log': 'Hello\nWorld', 'a2.log': 'Hola\nMundo'}) as fns:
        logs = chunks(TextFile)(list(map(TextFile, fns)))
        b = convert(Bag, logs)
        assert isinstance(b, Bag)
        assert 'a1.log' in str(b.dask.values())
        assert convert(list, b) == convert(list, logs)


def test_small_chunk_size():
    normal = convert(Temp(CSV), resource(iris_url))
    small_chunk = convert(Temp(CSV), resource(iris_url, chunk_size=1))
    with open(normal.path, 'rb') as fn:
        normal_data = fn.read()
    with open(small_chunk.path, 'rb') as fn:
        small_chunk_data = fn.read()
    assert normal_data == small_chunk_data


def test_append_sas_to_sqlite_round_trip():
    expected = convert(set, sasfile)
    with tmpfile('db') as fn:
        r = resource('sqlite:///%s::SAS' % fn, dshape=discover(sasfile))
        append(r, sasfile)
        result = convert(set, r)
    assert expected == result


def test_empty_line():
    text = '{"a": 1}\n{"a": 2}\n\n'  # extra endline
    with tmpfile('.json') as fn:
        with open(fn, 'w') as f:
            f.write(text)
        j = JSONLines(fn)
        assert len(convert(list, j)) == 2


def test_multiple_object_ids():
    data = [{'x': 1, 'y': 2, 'other': ObjectId('1' * 24)},
            {'x': 3, 'y': 4, 'other': ObjectId('2' * 24)}]
    with coll(data) as c:
        assert discover(c) == dshape('2 * {x: int64, y: int64}')
        assert convert(list, c) == [(1, 2), (3, 4)]


def test_select_to_iterator():
    engine, t = single_table_engine()
    append(t, [('Alice', 100), ('Bob', 200)])

    sel = sa.select([t.c.amount + 1])
    assert convert(list, sel) == [(101,), (201,)]
    assert convert(list, sel, dshape=dshape('var * int')) == [101, 201]

    sel2 = sa.select([sa.sql.func.sum(t.c.amount)])
    assert convert(int, sel2, dshape=dshape('int')) == 300

    sel3 = sa.select([t])
    result = convert(list, sel3, dshape=discover(t))
    assert type(result[0]) is tuple


def test_read_gzip():
    with tmpfile('json.gz') as fn:
        f = gzip.open(fn, 'wb')
        s = json.dumps(dat).encode('utf-8')
        f.write(s)
        f.close()
        js = JSON(fn)
        assert convert(list, js) == dat


def test_fixed_convert():
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, 'foo')
        r = resource('hdfstore://' + fn + '::/foo')
        try:
            assert eq(convert(pd.DataFrame, r), df)
        finally:
            r.parent.close()


def test_convert_sas_to_dataframe():
    df = convert(pd.DataFrame, sasfile)
    assert isinstance(df, pd.DataFrame)
    # pandas doesn't support date
    expected = str(ds.measure).replace('date', 'datetime')
    assert str(discover(df).measure).replace('?', '') == expected


def test_read_gzip():
    with tmpfile('.bson.gz') as fn:
        f = gzip.open(fn, 'wb')
        for item in dat:
            f.write(bson.BSON.encode(item))
        f.close()
        b = BSON(fn)
        assert convert(list, b) == dat


def test_into_table_iterator():
    engine = sa.create_engine('sqlite:///:memory:')
    metadata = sa.MetaData(engine)
    t = dshape_to_table('points', '{x: int, y: int}', metadata=metadata)
    t.create()

    data = [(1, 1), (2, 4), (3, 9)]
    append(t, data)
    assert convert(list, t) == data
    assert isinstance(convert(list, t)[0], tuple)

    t2 = dshape_to_table('points2', '{x: int, y: int}', metadata=metadata)
    t2.create()
    data2 = [{'x': 1, 'y': 1}, {'x': 2, 'y': 4}, {'x': 3, 'y': 9}]
    append(t2, data2)
    assert convert(list, t2) == data


def test_insert_to_ooc():
    x = np.arange(600).reshape((20, 30))
    y = np.empty(shape=x.shape, dtype=x.dtype)
    a = convert(Array, x, blockshape=(4, 5))
    dsk = insert_to_ooc(y, a)
    core.get(merge(dsk, a.dask), list(dsk.keys()))
    assert eq(y, x)


def test_insert_to_ooc():
    x = np.arange(600).reshape((20, 30))
    y = np.empty(shape=x.shape, dtype=x.dtype)
    a = convert(Array, x, chunks=(4, 5))
    dsk = insert_to_ooc(y, a)
    core.get(merge(dsk, a.dask), list(dsk.keys()))
    assert eq(y, x)


def test_append_and_convert_round_trip():
    engine = sa.create_engine('sqlite:///:memory:')
    metadata = sa.MetaData(engine)
    t = sa.Table('bank', metadata,
                 sa.Column('name', sa.String, primary_key=True),
                 sa.Column('balance', sa.Integer))
    t.create()

    data = [('Alice', 1), ('Bob', 2)]
    append(t, data)
    assert convert(list, t) == data


def test_read_gzip_lines():
    with tmpfile('json.gz') as fn:
        f = gzip.open(fn, 'wb')
        for item in dat:
            s = json.dumps(item).encode('utf-8')
            f.write(s)
            f.write(b'\n')
        f.close()
        js = JSONLines(fn)
        assert convert(list, js) == dat


def test_url_to_hdfs():
    from .test_hdfs import tmpfile_hdfs, hdfs, HDFS

    with tmpfile_hdfs() as target:
        # build temp csv for assertion check
        url_csv = resource(iris_url)
        csv = convert(Temp(CSV), url_csv)

        # test against url
        scsv = HDFS(CSV)(target, hdfs=hdfs)
        odo(iris_url, scsv)

        assert discover(scsv) == discover(csv)


def test_multiple_jsonlines():
    a, b = '_test_a1.json', '_test_a2.json'
    try:
        with ignoring(OSError):
            os.remove(a)
        with ignoring(OSError):
            os.remove(b)
        with open(a, 'w') as f:
            json.dump(dat, f)
        with open(b, 'w') as f:
            json.dump(dat, f)
        r = resource('_test_a*.json')
        result = convert(list, r)
        assert len(result) == len(dat) * 2
    finally:
        with ignoring(OSError):
            os.remove(a)
        with ignoring(OSError):
            os.remove(b)


def test_convert():
    x = np.arange(600).reshape((20, 30))
    d = convert(Array, x, chunks=(4, 5))
    assert isinstance(d, Array)


def test_convert_local_file_to_temp_ssh_file():
    with filetext('name,balance\nAlice,100\nBob,200', extension='csv') as fn:
        csv = CSV(fn)
        scsv = convert(Temp(SSH(CSV)), csv, hostname='localhost')
        assert into(list, csv) == into(list, scsv)


def test_chunks():
    with file(df) as (fn, f, dset):
        c = convert(chunks(pd.DataFrame), dset)
        assert eq(convert(np.ndarray, c), df)


def test_convert_to_numpy_array():
    x = np.arange(600).reshape((20, 30))
    d = convert(Array, x, chunks=(4, 5))
    x2 = convert(np.ndarray, d)
    assert eq(x, x2)


def test_write_gzip():
    with tmpfile('.bson.gz') as fn:
        b = BSON(fn)
        append(b, dat)
        assert convert(list, b) == dat


def test_convert_to_temp_bson():
    bs = convert(Temp(BSON), dat)
    assert isinstance(bs, BSON)
    assert isinstance(bs, _Temp)
    assert convert(list, bs) == dat


def pre_compute(expr, data, **kwargs):
    # Stream the data as an iterator only when every expression on the path
    # from the leaf is cheap to evaluate; otherwise fall through to the next
    # dispatched implementation.
    leaf = expr._leaves()[0]
    if all(isinstance(e, Cheap) for e in path(expr, leaf)):
        return convert(Iterator, data)
    else:
        raise MDNotImplementedError()


def test_convert():
    x = pd.DataFrame(np.arange(50).reshape(10, 5), columns=list('abcde'))
    d = convert(dd.DataFrame, x, npartitions=2)
    assert isinstance(d, dd.DataFrame)


def test_convert_to_pandas_series():
    x = pd.DataFrame(np.arange(50).reshape(10, 5), columns=list('abcde'))
    d = convert(dd.DataFrame, x, npartitions=2)
    a = convert(pd.Series, d.a)
    tm.assert_series_equal(a, x.a)


def test_convert_to_pandas_dataframe():
    x = pd.DataFrame(np.arange(50).reshape(10, 5), columns=list('abcde'))
    d = convert(dd.DataFrame, x, npartitions=2)
    x2 = convert(pd.DataFrame, d)
    tm.assert_frame_equal(x2, x)


def test_convert_pandas():
    with file(df) as (fn, f, dset):
        assert eq(convert(pd.DataFrame, dset), df)


def test_extend_empty():
    engine, t = single_table_engine()
    assert not convert(list, t)
    append(t, [])
    assert not convert(list, t)


def test_convert_chunks():
    with file(df) as (fn, f, dset):
        c = convert(chunks(pd.DataFrame), dset, chunksize=len(df) / 2)
        assert len(list(c)) == 2
        assert eq(convert(pd.DataFrame, c), df)


def test_sql_field_names_disagree_on_order():
    r = resource('sqlite:///:memory:::tb', dshape=dshape('{x: int, y: int}'))
    append(r, [(1, 2), (10, 20)], dshape=dshape('{y: int, x: int}'))
    assert convert(set, r) == set([(2, 1), (20, 10)])


def test_convert_bson_list():
    with bson_file(dat) as fn:
        b = BSON(fn)
        assert convert(list, b) == dat


def test_convert():
    url_csv = resource(iris_url)
    t_csv = convert(Temp(CSV), url_csv)
    assert discover(url_csv) == discover(t_csv)
    assert isinstance(t_csv, _Temp)


def test_append_bson():
    with tmpfile('.bson') as fn:
        b = BSON(fn)
        append(b, dat)
        assert convert(list, b) == dat


def test_array_interface():
    x = np.arange(600).reshape((20, 30))
    d = convert(Array, x, chunks=(4, 5))
    assert eq(x, np.array(d))