def test_tmpfile():
    """tmpfile hands out distinct paths and removes the file on exit."""
    with tmpfile() as first:
        with open(first, 'w') as handle:
            handle.write('')
        with tmpfile() as second:
            # two simultaneously-open contexts must yield different paths
            assert first != second
    # the file written inside the context is gone once it closes
    assert not os.path.exists(first)
def test_resource():
    """resource() builds a sqlalchemy Table from a sqlite URI."""
    ds = 'var * {x: int, y: int}'
    with tmpfile('.db') as fn:
        # table name passed as a separate positional argument
        tbl = resource('sqlite:///' + fn, 'foo', dshape=ds)
        assert isinstance(tbl, sa.Table)
    with tmpfile('.db') as fn:
        # table name embedded in the URI via the '::' separator
        tbl = resource('sqlite:///' + fn + '::' + 'foo', dshape=ds)
        assert isinstance(tbl, sa.Table)
def test_into(self):
    """into() copies every record of a CSV resource into a fresh one."""
    with filetext('1,1\n2,2', extension='.csv') as src:
        with tmpfile(extension='.csv') as dst:
            source = resource(src, schema='2 * int')
            target = resource(dst, schema='2 * int', mode='a')
            target = into(target, source)
            assert tuplify(list(target)) == ((1, 1), (2, 2))
def data():
    # Fixture: a PyTables Table in a temporary HDF5 file, populated from
    # the module-level array ``x``.
    with tmpfile('.h5') as filename:
        f = tb.open_file(filename, mode='w')
        d = f.create_table('/', 'title', x)
        yield d
        # teardown: close table and file before tmpfile unlinks the path
        d.close()
        f.close()
def data(): with tmpfile(".h5") as filename: f = tb.open_file(filename, mode="w") d = f.create_table("/", "title", x) yield d d.close() f.close()
def test_into_filename_filename():
    """into() can copy directly between two CSV filenames."""
    with filetext('1,2\n3,4', extension='csv') as src:
        with tmpfile('csv') as dst:
            into(dst, src)
            assert into(list, CSV(dst)) == [(1, 2), (3, 4)]
def test_into_cds_mixed():
    # Skip cleanly when bokeh is not installed.
    pytest.importorskip('bokeh')
    from bokeh.objects import ColumnDataSource
    n = 25
    # mixed column types: strings (including non-ASCII) and ints
    ddict = {'first': np.random.choice(list('abc'), size=n),
             'second': np.random.choice(['cachaça', 'tres leches', 'pizza'],
                                        size=n),
             'third': list(range(n))}
    df = pd.DataFrame(ddict)
    with tmpfile('.csv') as fn:
        # round-trip through a CSV file so the Table is file-backed
        df.to_csv(fn, header=None, index=False, encoding='utf8')
        csv = CSV(fn, columns=['first', 'second', 'third'], encoding='utf8')
        t = Table(csv)

        # full table -> ColumnDataSource
        cds = into(ColumnDataSource, t)
        assert isinstance(cds, ColumnDataSource)
        expected = dict((k, into(list, csv[:, k]))
                        for k in ['first', 'second', 'third'])
        assert cds.data == expected

        # projection of two columns
        cds = into(ColumnDataSource, t[['first', 'second']])
        assert isinstance(cds, ColumnDataSource)
        expected = dict((k, into(list, csv[:, k]))
                        for k in ['first', 'second'])
        assert cds.data == expected

        # single column
        cds = into(ColumnDataSource, t['first'])
        assert isinstance(cds, ColumnDataSource)
        assert cds.data == {'first': into(list, csv[:, 'first'])}
def test_into(self):
    """into() preserves record values when copying CSV -> CSV."""
    with filetext('1,1\n2,2', extension='.csv') as src:
        with tmpfile(extension='.csv') as dst:
            schema = '{x: int, y: int}'
            source = resource(src, schema=schema)
            target = resource(dst, schema=schema, mode='a')
            target = into(target, source)
            assert into(list, target) == [(1, 1), (2, 2)]
def test_copy(self):
    """copy() appends every row of one CSV resource to another."""
    with filetext('1,1\n2,2', extension='.csv') as src:
        with tmpfile(extension='.csv') as dst:
            source = resource(src, schema='2 * int')
            target = resource(dst, schema='2 * int', mode='a')
            copy(source, target)
            assert list(target) == [[1, 1], [2, 2]]
def engine():
    # Fixture: a sqlite engine plus an empty table 'testtable' created via
    # the resource() URI syntax.  The engine is not disposed here; tmpfile
    # removes the backing file when the fixture finalizes.
    tbl = 'testtable'
    with tmpfile('db') as filename:
        engine = sqlalchemy.create_engine('sqlite:///' + filename)
        t = resource('sqlite:///' + filename + '::' + tbl,
                     dshape='var * {a: int32, b: int32}')
        yield engine, t
def PyTables(path, datapath, dshape=None, **kwargs):
    """Create or open a ``tables.Table`` object.

    Parameters
    ----------
    path : str
        Path to a PyTables HDF5 file.
    datapath : str
        The name of the node in the ``tables.File``.
    dshape : str or datashape.DataShape
        DataShape to use to create the ``Table``.

    Returns
    -------
    t : tables.Table

    Examples
    --------
    >>> from blaze.utils import tmpfile
    >>> # create from scratch
    >>> with tmpfile('.h5') as f:
    ...     t = PyTables(f, '/bar',
    ...                  dshape='var * {volume: float64, planet: string[10, "A"]}')
    ...     data = [(100.3, 'mars'), (100.42, 'jupyter')]
    ...     t.append(data)
    ...     t[:]  # doctest: +SKIP
    ...
    array([(100.3, b'mars'), (100.42, b'jupyter')],
          dtype=[('volume', '<f8'), ('planet', 'S10')])
    """
    def possibly_create_table(filename, dtype):
        # Open in append mode so an existing file is preserved.
        f = tb.open_file(filename, mode='a')
        try:
            if datapath not in f:
                if dtype is None:
                    raise ValueError('dshape cannot be None and datapath not'
                                     ' in file')
                else:
                    f.create_table('/', datapath.lstrip('/'),
                                   description=dtype)
        finally:
            f.close()

    if dshape:
        if isinstance(dshape, str):
            dshape = datashape.dshape(dshape)
        # Strip a leading ``var`` dimension; rows are the table's records.
        if dshape[0] == datashape.var:
            dshape = dshape.subshape[0]
        dtype = dtype_to_pytables(datashape.to_numpy_dtype(dshape))
    else:
        dtype = None

    if os.path.exists(path):
        possibly_create_table(path, dtype)
    else:
        # Build in a temporary location first so a failed create does not
        # leave a partial file at ``path``.
        with tmpfile('.h5') as filename:
            possibly_create_table(filename, dtype)
            shutil.copyfile(filename, path)
    return tb.open_file(path, mode='a').get_node(datapath)
def test_into_filename():
    """Writing a DataFrame to a CSV filename round-trips its rows."""
    with tmpfile('csv') as fn:
        frame = DataFrame([['Alice', 100], ['Bob', 200]],
                          columns=['name', 'amount'])
        into(fn, frame)
        assert into(list, CSV(fn)) == into(list, frame)
def csv():
    # Fixture: a CSV file pre-populated with three rows, then reopened
    # (default mode) for the tests to consume.
    data = [(1, 2), (10, 20), (100, 200)]
    with tmpfile('csv') as filename:
        csv = CSV(filename, 'w', schema='{a: int32, b: int32}')
        csv.extend(data)
        # reopen without write mode so tests see a fresh descriptor
        csv = CSV(filename, schema='{a: int32, b: int32}')
        yield csv
def h():
    """Yield an open h5py File containing a resizable dataset '/x'.

    Fixes over the previous version: the temporary *filename* is no longer
    shadowed by the ``h5py.File`` object, and the file handle is closed on
    teardown (it previously leaked; sibling fixtures close theirs).
    """
    with tmpfile('.hdf5') as filename:
        f = h5py.File(filename)
        fx = f.create_dataset('/x', shape=x.shape, dtype=x.dtype,
                              chunks=True, maxshape=(None,))
        fx[:] = x
        yield f
        # teardown: close the handle before tmpfile unlinks the path
        f.close()
def file():
    """Yield an open h5py File holding a chunked dataset '/x'."""
    with tmpfile('.h5') as path:
        handle = h5py.File(path)
        dset = handle.create_dataset('/x', shape=x.shape, dtype=x.dtype,
                                     fillvalue=0.0, chunks=(4, 6))
        dset[:] = x
        yield handle
        handle.close()
def data():
    """Yield the '/x' dataset of a temporary h5py file."""
    with tmpfile('.h5') as path:
        handle = h5py.File(path)
        dset = handle.create_dataset('/x', shape=x.shape, dtype=x.dtype,
                                     fillvalue=0.0, chunks=(4, 6))
        dset[:] = x
        yield dset
        handle.close()
def test_table_resource():
    """Data() on a CSV filename wraps it in a CSV data descriptor."""
    with tmpfile('csv') as fn:
        source = CSV(fn, 'w', schema='{x: int, y: int}')
        source.extend([[1, 2], [10, 20]])
        t = Data(fn)
        assert isinstance(t.data, CSV)
        assert list(compute(t)) == list(source)
def date_data():
    # Fixture: CSV with an optional datetime column, including a missing
    # (None) date, written once then reopened read-only.
    data = [('Alice', 100.0, datetime(2014, 9, 11, 0, 0, 0, 0)),
            ('Alice', -200.0, datetime(2014, 9, 10, 0, 0, 0, 0)),
            ('Bob', 300.0, None)]
    schema = dshape('{name: string, amount: float32, date: ?datetime}')
    with tmpfile('.csv') as f:
        csv = CSV(f, schema=schema, mode='w')
        csv.extend(data)
        yield CSV(f, schema=schema, mode='r')
def good_csv(): with tmpfile(".csv") as filename: with open(filename, mode='w') as f: # Insert a new record f.write("userid,text,country\n") f.write("1,Alice,az\n") f.write("2,Bob,bl\n") f.write("3,Charlie,cz\n") yield filename
def idx_data():
    """Yield a PyTables Table with regular indexes on amount and id."""
    with tmpfile('.h5') as path:
        handle = tb.open_file(path, mode='w')
        table = handle.create_table('/', 'title', x)
        for col in (table.cols.amount, table.cols.id):
            col.create_index()
        yield table
        table.close()
        handle.close()
def csi_data():
    """Yield a PyTables Table with completely-sorted indexes."""
    with tmpfile('.h5') as path:
        handle = tb.open_file(path, mode='w')
        table = handle.create_table('/', 'title', x)
        for col in (table.cols.amount, table.cols.id):
            col.create_csindex()
        yield table
        table.close()
        handle.close()
def test_create_index_uri():
    """create_index() on a URI adds an index to the backing table.

    Cleanup: removed the unused ``from blaze.data.csv import drop`` and the
    first ``sql =`` assignment that was immediately overwritten.
    """
    with tmpfile(extension='.db') as fn:
        uri = 'sqlite:///%s::table' % fn
        resource(uri, schema='{x: int, y: int}')
        create_index(uri, 'x', name='x_index')
        # re-resolve the resource to observe the freshly created index
        sql = resource(uri, schema='{x: int, y: int}')
        assert list(list(sql.table.indexes)[0].columns)[0].name == 'x'
def test_csv_with_trailing_commas():
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            # note the trailing space in the header
            f.write('a,b,c, \n1, 2, 3, ')
        csv = CSV(fn)
        assert expr_repr(data(fn))
        # header column that is just a space discovers with an empty name
        assert discover(csv).measure.names == ['a', 'b', 'c', '']
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            f.write('a,b,c,\n1, 2, 3, ')  # NO trailing space in the header
        csv = CSV(fn)
        assert expr_repr(data(fn))
        # fully anonymous trailing column gets the 'Unnamed: 3' placeholder
        assert discover(csv).measure.names == ['a', 'b', 'c', 'Unnamed: 3']
def idx_data(): with tmpfile(".h5") as fn: f = tb.open_file(fn, mode="w") d = f.create_table("/", "title", x) d.cols.amount.create_index() d.cols.id.create_index() yield d d.close() f.close()
def csi_data(): with tmpfile(".h5") as filename: f = tb.open_file(filename, mode="w") d = f.create_table("/", "title", x) d.cols.amount.create_csindex() d.cols.id.create_csindex() yield d d.close() f.close()
def test_table_resource():
    """Data() over an appended-to CSV exposes a CSV data descriptor."""
    with tmpfile('csv') as fn:
        ds = dshape('var * {a: int, b: int}')
        source = CSV(fn)
        append(source, [[1, 2], [10, 20]], dshape=ds)
        t = Data(fn)
        assert isinstance(t.data, CSV)
        assert into(list, compute(t)) == into(list, source)
def test_csv_with_trailing_commas():
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            # note the trailing space in the header
            f.write('a,b,c, \n1, 2, 3, ')
        csv = CSV(fn)
        assert repr(Data(fn))
        # header column that is just a space discovers with an empty name
        assert discover(csv).measure.names == ['a', 'b', 'c', '']
    with tmpfile('.csv') as fn:
        with open(fn, 'wt') as f:
            f.write('a,b,c,\n1, 2, 3, ')  # NO trailing space in the header
        csv = CSV(fn)
        assert repr(Data(fn))
        # fully anonymous trailing column gets the 'Unnamed: 3' placeholder
        assert discover(csv).measure.names == ['a', 'b', 'c', 'Unnamed: 3']
def test_hdf5_from_datashape():
    """hdf5_from_datashape builds nested groups/datasets; calling it
    twice on the same file is harmless (idempotent)."""
    ds = '{x: int32, y: {z: 3 * int32}}'
    with tmpfile('.hdf5') as fn:
        f = hdf5_from_datashape(fn, ds)
        assert isinstance(f, h5py.File)
        assert 'x' in f
        assert f['y/z'].shape == (3,)
        assert f['y/z'].dtype == 'i4'
        # ensure idempotence
        f = hdf5_from_datashape(fn, ds)
def recdata():
    # Fixture: a chunked h5py dataset with a compound (record) dtype,
    # filled field-by-field from the module-level array ``rec``.
    with tmpfile('.h5') as filename:
        f = h5py.File(filename)
        d = f.create_dataset('/x', shape=rec.shape, dtype=rec.dtype,
                             chunks=(4, 6))
        d['x'] = rec['x']
        d['y'] = rec['y']
        yield d
        # teardown: close the file before tmpfile unlinks the path
        f.close()
def test_computation_on_engine():
    """Expressions compute against a bare sqlite engine resource."""
    with tmpfile('.db') as fn:
        uri = 'sqlite:///' + fn
        table = resource(uri, 'foo', dshape='var * {x: int, y: int}')
        into(table, [(1, 2), (10, 20)])
        engine = resource(uri)
        s = symbol('s', discover(engine))
        assert compute(s.foo.x.max(), engine) == 10
def test_inconsistent_schemas():
    """Reading with a narrower schema coerces previously-stored strings."""
    with tmpfile('.db') as fn:
        uri = 'sqlite:///' + fn + '::badtable'
        first = resource(uri, dshape='var * {name: string, amount: string}')
        into(first, [('Alice', '100'), ('Bob', '200')])
        second = resource(uri, dshape='var * {name: string, amount: int}')
        assert into(list, second) == [('Alice', 100), ('Bob', 200)]
def bad_csv_df(): with tmpfile(".csv") as filename: with open(filename, mode='w') as badfile: # Insert a new record badfile.write("userid,text,country\n") badfile.write("1,Alice,az\n") badfile.write("2,Bob,bl\n") for i in range(100): badfile.write("%d,badguy,zz\n" % i) badfile.write("4,Dan,gb,extra,extra\n") yield filename
def test_csv_hdf5(self):
    """copy() moves CSV rows into an HDF5 dataset intact."""
    import h5py  # required by the HDF5 backend; import errors surface here
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            csv = CSV(csv_fn, schema='2 * int')
            hdf5 = HDF5(hdf5_fn, '/data', mode='a', schema='2 * int')
            copy(csv, hdf5)
            # assertEquals is a deprecated alias of assertEqual
            self.assertEqual(nd.as_py(hdf5.as_dynd()), [[1, 1], [2, 2]])
def test_gzopen_csv():
    """CSV(open=gzip.open) transparently reads gzip-compressed data."""
    with tmpfile('.csv.gz') as filename:
        # 'wt' (text mode) is required to write str on Python 3; plain
        # 'w' opens the gzip stream in binary mode (matches the style of
        # the other gzip test in this file)
        with gzip.open(filename, 'wt') as f:
            f.write('1,1\n2,2')
        # Not a valid CSV file when read without decompression
        assert raises(Exception,
                      lambda: list(CSV(filename, schema='2 * int')))
        dd = CSV(filename, schema='2 * int', open=gzip.open)
        assert list(dd) == [[1, 1], [2, 2]]
def test_gzopen_json():
    """JSON(open=gzip.open) transparently reads gzip-compressed data."""
    with tmpfile('.json.gz') as filename:
        # 'wt' so str can be written on Python 3 ('w' is binary mode)
        with gzip.open(filename, 'wt') as f:
            f.write('[[1, 1], [2, 2]]')
        # Not a valid JSON file when read without decompression
        assert raises(Exception,
                      lambda: list(JSON(filename, schema='2 * int')))
        dd = JSON(filename, schema='2 * int', open=gzip.open)
        assert list(dd) == [[1, 1], [2, 2]]
def test_resource_works_with_empty_file():
    """resource() on a missing .bcolz path yields an empty ctable."""
    path = None
    with tmpfile('.bcolz') as filename:
        path = filename
        bc = resource(path, dshape=dshape('{a: int32, b: float64}'))
        assert len(bc) == 0
        assert discover(bc).measure == dshape('{a: int32, b: float64}').measure
    # bcolz writes a directory; best-effort cleanup of anything left over
    try:
        os.remove(path)
    except OSError:
        pass
def test_groups():
    """hdfstore:// resources discover and compute over nested groups."""
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')
        store = resource('hdfstore://%s' % fn)
        assert discover(store) == discover({'data': {'fixed': df}})
        s = symbol('s', discover(store))
        assert list(compute(s.data.fixed, store).a) == [1, 2, 3, 4]
        store.close()
def test_gzopen_json():
    """JSON descriptor reads gzipped files when given open=gzip.open."""
    with tmpfile('.json.gz') as filename:
        with gzip.open(filename, 'wt') as out:
            out.write('[[1, 1], [2, 2]]')
        # without decompression the raw bytes are not valid JSON
        assert raises(Exception,
                      lambda: list(JSON(filename, schema='2 * int')))
        dd = JSON(filename, schema='2 * int', open=gzip.open)
        assert tuplify(list(dd)) == ((1, 1), (2, 2))
def test_register(sql):
    """resource() dispatches sqlite URIs (with or without '::') to SQL."""
    schema = '{x: int, y: int}'
    with tmpfile('.db') as fn:
        uri = 'sqlite:///' + fn
        sql = SQL(uri, 'foo', schema=schema)
        assert isinstance(resource(uri, 'foo'), SQL)
        assert isinstance(resource(uri + '::foo'), SQL)
    sql = SQL('sqlite:///:memory:', 'foo', schema=schema)
    assert isinstance(resource('sqlite:///:memory:', 'foo', schema=schema),
                      SQL)
    assert isinstance(resource('sqlite:///:memory:::foo', schema=schema),
                      SQL)
def test_merge_compute():
    data = [(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)]
    ds = datashape.dshape('var * {id: int, name: string, amount: real}')
    s = symbol('s', ds)
    with tmpfile('db') as fn:
        uri = 'sqlite:///' + fn
        # load the rows into a sqlite table via into(); the compute()
        # below binds s to the raw python data, not to this table
        into(uri + '::table', data, dshape=ds)
        # transform() appends a derived column to each record
        expr = transform(s, amount10=s.amount * 10)
        result = into(list, compute(expr, {s: data}))
        assert result == [(1, 'Alice', 100, 1000),
                          (2, 'Bob', 200, 2000),
                          (4, 'Dennis', 400, 4000)]
def test_all_string_infer_header():
    """has_header=True is honoured even when every column is a string."""
    sdata = """x,tl,z
Be careful driving.,hy,en
Be careful.,hy,en
Can you translate this for me?,hy,en
Chicago is very different from Boston.,hy,en
Don't worry.,hy,en"""
    with tmpfile('.csv') as fn:
        with open(fn, 'w') as out:
            out.write(sdata)
        tdata = data(fn, has_header=True)
        assert tdata.data.has_header
        assert tdata.fields == ['x', 'tl', 'z']
def test_hdf5_csv(self):
    """copy() flattens a 2-d HDF5 dataset into CSV rows."""
    import h5py
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('') as csv_fn:
            # build a 3x3 dataset of ones to copy out
            with h5py.File(hdf5_fn, 'w') as f:
                d = f.create_dataset('data', (3, 3), dtype='i8')
                d[:] = 1
            csv = CSV(csv_fn, mode='r+', schema='3 * int')
            hdf5 = HDF5(hdf5_fn, '/data')
            copy(hdf5, csv)
            # assertEquals is a deprecated alias of assertEqual
            self.assertEqual(list(csv),
                             [[1, 1, 1], [1, 1, 1], [1, 1, 1]])
def test_resource():
    """resource() opens an existing bcolz directory as a ctable."""
    path = None
    with tmpfile('.bcolz') as filename:
        path = filename
        bcolz.ctable(rootdir=path,
                     columns=[[1, 2, 3], [1., 2., 3.]],
                     names=['a', 'b'])
        assert isinstance(resource(path), bcolz.ctable)
    # best-effort cleanup of whatever bcolz left on disk
    try:
        os.remove(path)
    except OSError:
        pass
def test_csv_hdf5(self):
    """into() maps CSV rows onto HDF5 records by field name."""
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            schema = '{a: int32, b: int32}'
            csv = CSV(csv_fn, schema=schema)
            hdf5 = HDF5(hdf5_fn, '/data', schema=schema)
            into(hdf5, csv)
            # assertEquals is a deprecated alias of assertEqual
            self.assertEqual(nd.as_py(hdf5.as_dynd()),
                             [{'a': 1, 'b': 1}, {'a': 2, 'b': 2}])
def test_explicit_override_dshape():
    """A user-supplied dshape overrides discovery on a CSV source."""
    ds = dshape("""var * {a: ?float64, b: ?string, c: ?float32}""")
    # If not overridden, the dshape discovery will return:
    # var * {a: int64, b: string, c: int64}.
    contents = textwrap.dedent("""\
        a,b,c
        1,x,3
        2,y,4
        3,z,5
        """)
    with tmpfile('.csv') as fn:
        with open(fn, 'w') as out:
            out.write(contents)
        bdf = data(fn, dshape=ds)
        assert bdf.dshape == ds
def test_hdf5_csv(self):
    """into() copies a compound-dtype HDF5 table into a CSV."""
    import h5py
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('') as csv_fn:
            # three records of the compound dtype {a,b,c: int32}, all ones
            with h5py.File(hdf5_fn, 'w') as f:
                d = f.create_dataset('data', (3, ),
                                     dtype=np.dtype([(c, 'i4')
                                                     for c in 'abc']))
                d[:] = np.array(1)
            csv = CSV(csv_fn, mode='r+',
                      schema='{a: int32, b: int32, c: int32}')
            hdf5 = HDF5(hdf5_fn, '/data', schema=csv.schema)
            into(csv, hdf5)
            # assertEquals is a deprecated alias of assertEqual
            self.assertEqual(tuple(map(tuple, csv)),
                             ((1, 1, 1), (1, 1, 1), (1, 1, 1)))
def test_csv_sql_json(self):
    """Data flows CSV -> SQL -> streaming JSON without loss."""
    rows = [('Alice', 100), ('Bob', 200)]
    text = '\n'.join(','.join(map(str, row)) for row in rows)
    schema = '{name: string, amount: int}'
    with filetext(text) as csv_fn:
        with filetext('') as json_fn:
            with tmpfile('db') as sqldb:
                csv = CSV(csv_fn, mode='r', schema=schema)
                sql = SQL('sqlite:///' + sqldb, 'testtable', schema=schema)
                json = JSON_Streaming(json_fn, mode='r+', schema=schema)
                into(sql, csv)
                self.assertEqual(into(list, sql), rows)
                into(json, sql)
                with open(json_fn) as handle:
                    assert 'Alice' in handle.read()