def test_csv_json(self):
    """Extending a streaming-JSON store from a CSV copies all rows."""
    with filetext('1,1\n2,2\n') as csv_fn:
        with filetext('') as json_fn:
            schema = '2 * int'
            csv = CSV(csv_fn, schema=schema)
            # mode='r+' so the JSON target is writable for extend()
            json_dd = JSON_Streaming(json_fn, mode='r+', schema=schema)
            json_dd.extend(csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(list(json_dd), [[1, 1], [2, 2]])
def test_csv_json_chunked(self):
    """into() copies a CSV into a streaming-JSON store chunk-wise."""
    with filetext('1,1\n2,2\n') as csv_fn:
        with filetext('') as json_fn:
            schema = '{a: int32, b: int32}'
            csv = CSV(csv_fn, schema=schema)
            # renamed from `json` to avoid shadowing the json module
            json_dd = JSON_Streaming(json_fn, mode='r+', schema=schema)
            into(json_dd, csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(tuplify(tuple(json_dd)), ((1, 1), (2, 2)))
def test_csv_json(self):
    """Extending a streaming-JSON store from a CSV preserves all rows."""
    with filetext('1,1\n2,2\n') as csv_fn:
        with filetext('') as json_fn:
            schema = '2 * int'
            csv = CSV(csv_fn, schema=schema)
            # renamed from `json` to avoid shadowing the json module
            json_dd = JSON_Streaming(json_fn, mode='r+', schema=schema)
            json_dd.extend(csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(tuple(map(tuple, json_dd)), ((1, 1), (2, 2)))
def test_json_csv_chunked(self):
    """copy() moves streaming-JSON records into a CSV target."""
    data = [{'x': 1, 'y': 1}, {'x': 2, 'y': 2}]
    text = '\n'.join(map(json.dumps, data))
    schema = '{x: int, y: int}'
    with filetext(text) as json_fn:
        with filetext('') as csv_fn:
            js = JSON_Streaming(json_fn, schema=schema)
            csv = CSV(csv_fn, mode='r+', schema=schema)
            copy(js, csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(list(csv), data)
def test_json_csv_structured(self):
    """extend() pulls streaming-JSON records into a structured CSV."""
    data = [{'x': 1, 'y': 1}, {'x': 2, 'y': 2}]
    text = '\n'.join(map(json.dumps, data))
    schema = '{x: int, y: int}'
    with filetext(text) as json_fn:
        with filetext('') as csv_fn:
            js = JSON_Streaming(json_fn, schema=schema)
            csv = CSV(csv_fn, mode='r+', schema=schema)
            csv.extend(js)
            # assertEquals is a deprecated alias removed in Python 3.12;
            # also dropped redundant parentheses around `csv`
            self.assertEqual(tuple(map(tuple, csv)), ((1, 1), (2, 2)))
def test_json_csv_chunked(self):
    """into() moves streaming-JSON records into a CSV target."""
    data = [{'x': 1, 'y': 1}, {'x': 2, 'y': 2}]
    tuples = ((1, 1), (2, 2))
    text = '\n'.join(map(json.dumps, data))
    schema = '{x: int, y: int}'
    with filetext(text) as json_fn:
        with filetext('') as csv_fn:
            js = JSON_Streaming(json_fn, schema=schema)
            csv = CSV(csv_fn, mode='r+', schema=schema)
            into(csv, js)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(tuple(csv), tuples)
def test_unused_datetime_columns():
    """Computing on one column works even when a datetime column is unused."""
    # removed unused local `ds = dshape(...)` -- it was never referenced
    with filetext("val,when\na,2000-01-01\nb,2000-02-02") as fn:
        csv = CSV(fn, has_header=True)
        s = symbol('s', discover(csv))
        assert into(list, compute(s.val, csv)) == ['a', 'b']
def test_init(self):
    """A streaming-JSON store yields its tuples and a plausible dshape."""
    with filetext(self.text) as fn:
        dd = JSON_Streaming(fn, schema=self.schema)
        # assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(tuple(dd), self.tuples)
        # Length may or may not be known yet, so accept either dshape.
        assert dd.dshape in {
            datashape.dshape('var * {name: string, amount: int32}'),
            datashape.dshape('5 * {name: string, amount: int32}'),
        }
def test_chunks(self):
    """chunks() yields dynd arrays of at most blen records each."""
    with filetext(self.text) as fn:
        dd = JSON_Streaming(fn, schema=self.schema)
        chunks = list(dd.chunks(blen=2))
        assert isinstance(chunks[0], nd.array)
        # assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(len(chunks), 3)
        self.assertEqual(nd.as_py(chunks[0]), self.dicts[:2])
def test_init(self):
    """A streaming-JSON store yields its data and a plausible dshape."""
    with filetext(self.text) as fn:
        dd = JSON_Streaming(fn, schema=self.schema)
        # assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(list(dd), self.data)
        # Length may or may not be known yet, so accept either dshape.
        assert dd.dshape in {
            datashape.dshape('var * {name: string, amount: int32}'),
            datashape.dshape('5 * {name: string, amount: int32}'),
        }
def test_resource_gz(self):
    """resource() on a .csv.gz path yields a gzip-backed CSV."""
    schema = '{x: int, y: int}'
    with filetext(b'1,1\n2,2\n', extension='.csv.gz',
                  open=gzip.open, mode='wb') as fn:
        dd = resource(fn, schema=schema)
        assert isinstance(dd, CSV)
        assert dd.open == gzip.open
        assert into(list, dd) == [(1, 1), (2, 2)]
def test_into(self):
    """into() appends one CSV resource's rows into another."""
    schema = '{x: int, y: int}'
    with filetext('1,1\n2,2', extension='.csv') as a:
        with tmpfile(extension='.csv') as b:
            src = resource(a, schema=schema)
            dst = resource(b, schema=schema, mode='a')
            dst = into(dst, src)
            assert into(list, dst) == [(1, 1), (2, 2)]
def test_into_filename_filename():
    """into() accepts plain filenames for both source and target."""
    with filetext('1,2\n3,4', extension='csv') as source_fn:
        with tmpfile('csv') as target_fn:
            into(target_fn, source_fn)
            result = CSV(target_fn)
            assert into(list, result) == [(1, 2), (3, 4)]
def test_chunks(self):
    """chunks() yields dynd arrays of at most blen records each."""
    with filetext(self.text) as fn:
        dd = JSON_Streaming(fn, schema=self.schema)
        chunks = list(dd.chunks(blen=2))
        assert isinstance(chunks[0], nd.array)
        # assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(len(chunks), 3)
        self.assertEqual(nd.as_py(chunks[0]), self.data[:2])
def test_into(self):
    """into() appends a uniformly-typed CSV resource into another."""
    schema = '2 * int'
    with filetext('1,1\n2,2', extension='.csv') as a:
        with tmpfile(extension='.csv') as b:
            src = resource(a, schema=schema)
            dst = resource(b, schema=schema, mode='a')
            dst = into(dst, src)
            assert tuplify(list(dst)) == ((1, 1), (2, 2))
def test_copy(self):
    """copy() moves rows from one CSV resource into another."""
    schema = '2 * int'
    with filetext('1,1\n2,2', extension='.csv') as a:
        with tmpfile(extension='.csv') as b:
            src = resource(a, schema=schema)
            dst = resource(b, schema=schema, mode='a')
            copy(src, dst)
            assert list(dst) == [[1, 1], [2, 2]]
def test_delayed_bad_datashape():
    """A type error past the discovery window surfaces only on iteration."""
    # 20 clean integer rows followed by one float row past the 2-row window
    rows = ['1,2'] * 20
    text = 'a,b\n' + '\n'.join(rows) + '\n1,3.14'
    with filetext(text) as fn:
        csv = CSV(fn, nrows_discovery=2)
        assert csv.schema == dshape('{a: int64, b: int64}')
        with pytest.raises(ValueError):
            list(csv)
def test_csv_gzip_into_sql():
    """A gzip-compressed CSV round-trips into a SQL table via into()."""
    from blaze.data.csv import CSV
    engine, sql = single_table_engine()
    with filetext(b'Alice,2\nBob,4', extension='csv.gz',
                  open=gzip.open, mode='wb') as fn:
        csv = CSV(fn, schema=sql.schema)
        into(sql, csv)
        assert into(list, sql) == into(list, csv)
def test_delayed_bad_datashape():
    """Iterating past the discovery window raises on a type mismatch."""
    # clean rows within the discovery window, then a float that breaks int64
    body = '\n'.join(['1,2'] * 20)
    text = 'a,b\n' + body + '\n1,3.14'
    with filetext(text) as fn:
        csv = CSV(fn, nrows_discovery=2)
        assert csv.schema == dshape('{a: int64, b: int64}')
        with pytest.raises(ValueError):
            list(csv)
def test_extend_structured_many_newlines(self):
    """Blank CSV lines read back as NaN rows after an extend."""
    # integer representation of NaN after an int32 cast
    inan = np.array([np.nan]).astype('int32').item()
    with filetext('1,1.0\n2,2.0\n\n\n\n') as fn:
        csv = CSV(fn, 'r+', schema='{x: int32, y: float32}', delimiter=',')
        csv.extend([(3, 3)])
        result = tuplify(tuple(csv))
        expected = ((1, 1.0), (2, 2.0),
                    (inan, np.nan), (inan, np.nan), (inan, np.nan),
                    (3, 3.0))
        # equal_nan=True so the NaN entries compare equal positionally
        assert np.isclose(result, expected, equal_nan=True).all()
def test_a_mode():
    """A CSV opened in append mode accepts and exposes new rows."""
    text = ("id, name, balance\n1, Alice, 100\n2, Bob, 200\n"
            "3, Charlie, 300\n4, Denis, 400\n5, Edith, 500")
    with filetext(text) as fn:
        csv = CSV(fn, 'a')
        csv.extend([(6, 'Frank', 600), (7, 'Georgina', 700)])
        names = set(csv[:, 'name'])
        assert 'Georgina' in names
def test_extend_structured(self):
    """extend() appends a coerced row; rows may read as lists or dicts."""
    with filetext('1,1.0\n2,2.0\n') as fn:
        csv = CSV(fn, 'r+', schema='{x: int32, y: float32}', delimiter=',')
        csv.extend([(3, 3)])
        as_lists = [[1, 1.0], [2, 2.0], [3, 3.0]]
        as_dicts = [{'x': 1, 'y': 1.0},
                    {'x': 2, 'y': 2.0},
                    {'x': 3, 'y': 3.0}]
        assert list(csv) == as_lists or list(csv) == as_dicts
def test_re_dialect(self):
    """Copying between CSVs rewrites the data in the target's dialect."""
    dialect1 = {'delimiter': ',', 'lineterminator': '\n'}
    dialect2 = {'delimiter': ';', 'lineterminator': '--'}
    text = '1,1\n2,2\n'
    schema = '2 * int32'
    with filetext(text) as source_fn:
        with filetext('') as dest_fn:
            src = CSV(source_fn, schema=schema, **dialect1)
            dst = CSV(dest_fn, mode='w', schema=schema, **dialect2)
            # Perform copy
            dst.extend(src)
            with open(dest_fn) as f:
                # assertEquals is a deprecated alias removed in Python 3.12
                self.assertEqual(f.read(), '1;1--2;2--')
def test_a_mode(self):
    """Rows appended in 'a' mode are visible via the .py accessor."""
    text = ("id, name, balance\n1, Alice, 100\n2, Bob, 200\n"
            "3, Charlie, 300\n4, Denis, 400\n5, Edith, 500")
    with filetext(text) as fn:
        csv = CSV(fn, 'a')
        csv.extend([(6, 'Frank', 600), (7, 'Georgina', 700)])
        names = set(csv.py[:, 'name'])
        assert 'Georgina' in names
def test_DataFrame_CSV():
    """into(DataFrame, csv) preserves values and per-column dtypes."""
    with filetext('1,2\n3,4\n') as fn:
        csv = CSV(fn, schema='{a: int64, b: float64}')
        df = into(DataFrame, csv)
        expected = DataFrame([[1, 2.0], [3, 4.0]], columns=['a', 'b'])
        # string comparison sidesteps index/metadata differences
        assert str(df) == str(expected)
        assert list(df.dtypes) == [np.int64, np.float64]
def test_tuple_types():
    """CSVs with uniform types still create record types with names."""
    with filetext('1,1\n2,2\n') as fn:
        csv = CSV(fn, 'r+', delimiter=',')
        assert csv[0] == (1, 1)
        record = csv.schema[0]
        assert isinstance(record, Record)
        assert len(record.types) == 2
        # both columns discovered as the same type
        assert len(set(record.types)) == 1
def test_append(self):
    """extend() writes a record to disk and rejects ill-typed payloads."""
    with filetext('') as fn:
        dd = JSON_Streaming(fn, mode='w', schema=self.schema)
        dd.extend([self.tuples[0]])
        with open(fn) as f:
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(json.loads(f.read().strip()), self.dicts[0])
        # Non-record and wrongly-typed payloads must raise.
        self.assertRaises(ValueError, lambda: dd.extend([5.5]))
        self.assertRaises(ValueError,
                          lambda: dd.extend([{'name': 5, 'amount': 1.3}]))
def test_append(self):
    """extend() writes a record to disk and rejects ill-typed payloads."""
    with filetext('') as fn:
        dd = JSON_Streaming(fn, mode='w', schema=self.schema)
        dd.extend([self.data[0]])
        with open(fn) as f:
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(json.loads(f.read().strip()), self.data[0])
        # Non-record and wrongly-typed payloads must raise.
        self.assertRaises(ValueError, lambda: dd.extend([5.5]))
        self.assertRaises(ValueError,
                          lambda: dd.extend([{'name': 5, 'amount': 1.3}]))
def test_csv_sql_json(self):
    """Data survives a CSV -> SQL -> streaming-JSON pipeline."""
    data = [('Alice', 100), ('Bob', 200)]
    text = '\n'.join(','.join(map(str, row)) for row in data)
    schema = '{name: string, amount: int}'
    engine = create_engine('sqlite:///:memory:')
    with filetext(text) as csv_fn:
        with filetext('') as json_fn:
            csv = CSV(csv_fn, mode='r', schema=schema)
            sql = SQL(engine, 'testtable', schema=schema)
            # renamed from `json` to avoid shadowing the json module
            json_dd = JSON_Streaming(json_fn, mode='r+', schema=schema)
            copy(csv, sql)
            self.assertEqual(list(sql), data)
            copy(sql, json_dd)
            with open(json_fn) as f:
                assert 'Alice' in f.read()
def test_csv_sql_json(self):
    """Data survives a CSV -> SQLite-file -> streaming-JSON pipeline."""
    data = [('Alice', 100), ('Bob', 200)]
    text = '\n'.join(','.join(map(str, row)) for row in data)
    schema = '{name: string, amount: int}'
    with filetext(text) as csv_fn:
        with filetext('') as json_fn:
            with tmpfile('db') as sqldb:
                csv = CSV(csv_fn, mode='r', schema=schema)
                sql = SQL('sqlite:///' + sqldb, 'testtable', schema=schema)
                # renamed from `json` to avoid shadowing the json module
                json_dd = JSON_Streaming(json_fn, mode='r+', schema=schema)
                into(sql, csv)
                self.assertEqual(into(list, sql), data)
                into(json_dd, sql)
                with open(json_fn) as f:
                    assert 'Alice' in f.read()
def test_csv_hdf5(self):
    """copy() loads CSV rows into an appendable HDF5 dataset."""
    import h5py  # NOTE(review): unused directly; possibly an availability guard
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            csv = CSV(csv_fn, schema='2 * int')
            hdf5 = HDF5(hdf5_fn, '/data', mode='a', schema='2 * int')
            copy(csv, hdf5)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(nd.as_py(hdf5.as_dynd()), [[1, 1], [2, 2]])
def test_csv_hdf5(self):
    """into() loads CSV rows into a resource()-created HDF5 dataset."""
    from dynd import nd  # NOTE(review): unused; possibly an availability guard
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            csv = CSV(csv_fn, schema='{a: int32, b: int32}')
            hdf5 = resource(hdf5_fn + '::/data',
                            dshape='var * {a: int32, b: int32}')
            into(hdf5, csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(hdf5[:].tolist(), [(1, 1), (2, 2)])
def test_csv_hdf5(self):
    """copy() loads CSV rows into an HDF5 dataset."""
    import h5py  # NOTE(review): unused directly; possibly an availability guard
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            csv = CSV(csv_fn, schema='2 * int')
            hdf5 = HDF5(hdf5_fn, '/data', schema='2 * int')
            copy(csv, hdf5)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(nd.as_py(hdf5.as_dynd()), [[1, 1], [2, 2]])
def test_csv_hdf5(self):
    """into() loads a structured CSV into an HDF5 dataset as records."""
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            csv = CSV(csv_fn, schema='{a: int32, b: int32}')
            hdf5 = HDF5(hdf5_fn, '/data', schema='{a: int32, b: int32}')
            into(hdf5, csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(nd.as_py(hdf5.as_dynd()),
                             [{'a': 1, 'b': 1}, {'a': 2, 'b': 2}])
def test_csv_gzip_into_sql():
    """A gzip-compressed CSV copies into an in-memory SQLite table."""
    from blaze.data.csv import CSV
    from blaze.data.sql import into
    engine = sa.create_engine('sqlite:///:memory:')
    sql = SQL(engine, 'accounts', schema='{name: string, amount: int32}')
    with filetext(b'Alice,2\nBob,4', extension='csv.gz',
                  open=gzip.open, mode='wb') as fn:
        csv = CSV(fn, schema=sql.schema)
        into(sql, csv)
        assert list(sql) == list(csv)
def test_schema_detection_modifiers():
    """columns=, types= and typehints= each override schema discovery."""
    text = "name amount date\nAlice 100 20120101\nBob 200 20120102"
    with filetext(text) as fn:
        # Default discovery: nullable int64 for numeric-looking columns.
        assert (CSV(fn).schema ==
                dshape('{name: string, amount: ?int64, date: ?int64}'))
        # columns= renames fields without changing discovered types.
        assert (CSV(fn, columns=['NAME', 'AMOUNT', 'DATE']).schema ==
                dshape('{NAME: string, AMOUNT: ?int64, DATE: ?int64}'))
        # types= replaces all discovered types.
        assert (str(CSV(fn, types=['string', 'int32', 'date']).schema) ==
                str(dshape('{name: string, amount: int32, date: date}')))
        # typehints= overrides only the named columns.
        hinted = CSV(fn, typehints={'date': 'date'}).schema
        expected = dshape('{name: string, amount: ?int64, date: date}')
        assert str(hinted) == str(expected)
def test_schema_detection_modifiers():
    """Schema discovery respects columns=, types= and typehints= overrides."""
    text = "name amount date\nAlice 100 20120101\nBob 200 20120102"
    with filetext(text) as fn:
        default_schema = CSV(fn).schema
        assert default_schema == dshape(
            '{name: string, amount: ?int64, date: ?int64}')
        renamed = CSV(fn, columns=['NAME', 'AMOUNT', 'DATE']).schema
        assert renamed == dshape(
            '{NAME: string, AMOUNT: ?int64, DATE: ?int64}')
        typed = CSV(fn, types=['string', 'int32', 'date']).schema
        assert str(typed) == str(
            dshape('{name: string, amount: int32, date: date}'))
        a = CSV(fn, typehints={'date': 'date'}).schema
        b = dshape('{name: string, amount: ?int64, date: date}')
        assert str(a) == str(b)
def test_hdf5_csv(self):
    """copy() dumps an HDF5 dataset's rows into a CSV."""
    import h5py
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('') as csv_fn:
            # Build a 3x3 all-ones integer dataset to copy out.
            with h5py.File(hdf5_fn, 'w') as f:
                d = f.create_dataset('data', (3, 3), dtype='i8')
                d[:] = 1
            csv = CSV(csv_fn, mode='r+', schema='3 * int')
            hdf5 = HDF5(hdf5_fn, '/data')
            copy(hdf5, csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(list(csv),
                             [[1, 1, 1], [1, 1, 1], [1, 1, 1]])
def test_extend_structured(self):
    """extend() appends a coerced row; rows may read as lists or dicts."""
    with filetext('1,1.0\n2,2.0\n') as fn:
        csv = CSV(fn, 'r+', schema='{x: int32, y: float32}', delimiter=',')
        csv.extend([(3, 3)])
        as_lists = [[1, 1.0], [2, 2.0], [3, 3.0]]
        as_dicts = [{'x': 1, 'y': 1.0},
                    {'x': 2, 'y': 2.0},
                    {'x': 3, 'y': 3.0}]
        assert list(csv) == as_lists or list(csv) == as_dicts
def test_hdf5_csv(self):
    """copy() dumps an HDF5 dataset's rows into a CSV (tuple view)."""
    import h5py
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('') as csv_fn:
            # Build a 3x3 all-ones integer dataset to copy out.
            with h5py.File(hdf5_fn, 'w') as f:
                d = f.create_dataset('data', (3, 3), dtype='i8')
                d[:] = 1
            csv = CSV(csv_fn, mode='r+', schema='3 * int')
            hdf5 = HDF5(hdf5_fn, '/data')
            copy(hdf5, csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(tuple(map(tuple, csv)),
                             ((1, 1, 1), (1, 1, 1), (1, 1, 1)))
def test_csv_hdf5(self):
    """into() loads a structured CSV into an HDF5 dataset as records."""
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            csv = CSV(csv_fn, schema='{a: int32, b: int32}')
            hdf5 = HDF5(hdf5_fn, '/data', schema='{a: int32, b: int32}')
            into(hdf5, csv)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(nd.as_py(hdf5.as_dynd()),
                             [{'a': 1, 'b': 1}, {'a': 2, 'b': 2}])
def test_hdf5_csv(self):
    """into() dumps a compound-dtype HDF5 dataset into a CSV."""
    import h5py
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('') as csv_fn:
            # Structured dataset: three int32 fields a, b, c, all ones.
            with h5py.File(hdf5_fn, 'w') as f:
                dtype = np.dtype([(c, 'i4') for c in 'abc'])
                d = f.create_dataset('data', (3,), dtype=dtype)
                d[:] = np.array(1)
            csv = CSV(csv_fn, mode='r+',
                      schema='{a: int32, b: int32, c: int32}')
            hdf5 = HDF5(hdf5_fn, '/data', schema=csv.schema)
            into(csv, hdf5)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(tuple(map(tuple, csv)),
                             ((1, 1, 1), (1, 1, 1), (1, 1, 1)))
def test_json_into_mongodb(empty_collec):
    """A JSON document loads into MongoDB with node order preserved."""
    with filetext(json.dumps(les_mis_data)) as filename:
        dd = JSON(filename)
        coll = empty_collec
        into(coll, dd)
        mongo_data = list(coll.find())
        nodes = mongo_data[0]['nodes']
        first = (nodes[0]['group'], nodes[0]['name'])
        last = (nodes[-1]['group'], nodes[-1]['name'])
        assert dd.as_py()[1][-1] == last
        assert dd.as_py()[1][0] == first
def test_hdf5_csv(self):
    """into() dumps a compound-dtype HDF5 dataset into a CSV."""
    import h5py
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('') as csv_fn:
            # Structured dataset: three int32 fields a, b, c, all ones.
            with h5py.File(hdf5_fn, 'w') as f:
                dtype = np.dtype([(c, 'i4') for c in 'abc'])
                d = f.create_dataset('data', (3,), dtype=dtype)
                d[:] = np.array(1)
            csv = CSV(csv_fn, mode='r+',
                      schema='{a: int32, b: int32, c: int32}')
            hdf5 = HDF5(hdf5_fn, '/data', schema=csv.schema)
            into(csv, hdf5)
            # assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(tuple(map(tuple, csv)),
                             ((1, 1, 1), (1, 1, 1), (1, 1, 1)))
def test_into_list_Column():
    """into(list, column_expr) extracts a single column's values."""
    with filetext('Alice,1\nBob,2') as fn:
        csv = CSV(fn, columns=['name', 'id'])
        t = Data(csv)
        names = into(list, t.name)
        assert names == ['Alice', 'Bob']
def test_as_py(self):
    """as_py() returns the store's contents as plain Python data."""
    with filetext(self.text) as fn:
        dd = JSON_Streaming(fn, mode='r', schema=self.schema)
        result = dd.as_py()
        assert result == self.data