# Assumed imports for this test module; the blaze-era module paths below
# are assumptions and may differ in the original source.
import gzip
import os
import subprocess
import tempfile
from collections import Iterator
from datetime import datetime

import numpy as np
import pandas as pd
import pytest
from pandas import DataFrame

from datashape import Record, dshape
from dynd import nd

from blaze import (CSV, Concat, Data, HDF5, JSON_Streaming, Stack, Table,
                   TableSymbol, compute, copy, into)
from blaze.utils import example, filetext, filetexts, raises, tmpfile, tuplify


@pytest.fixture  # decorator assumed; older code may have used @pytest.yield_fixture
def date_data():
    data = [('Alice', 100.0, datetime(2014, 9, 11, 0, 0, 0, 0)),
            ('Alice', -200.0, datetime(2014, 9, 10, 0, 0, 0, 0)),
            ('Bob', 300.0, None)]
    schema = dshape('{name: string, amount: float32, date: ?datetime}')
    with tmpfile('.csv') as f:
        csv = CSV(f, schema=schema, mode='w')
        csv.extend(data)
        yield CSV(f, schema=schema, mode='r')

def test_gzopen_csv():
    with tmpfile('.csv.gz') as filename:
        with gzip.open(filename, 'w') as f:
            f.write('1,1\n2,2')
        # The raw gzip bytes are not a valid CSV file
        assert raises(Exception, lambda: list(CSV(filename, schema='2 * int')))
        dd = CSV(filename, schema='2 * int', open=gzip.open)
        assert list(dd) == [[1, 1], [2, 2]]

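# Note: gzip.open(filename, 'w') opens in binary mode on Python 3, so writing
# a str there would require mode 'wt'; the 'w' above reflects the Python 2-era
# API this suite was written against.
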
def test_schema_detection_modifiers():
    text = "name amount date\nAlice 100 20120101\nBob 200 20120102"
    with filetext(text) as fn:
        assert (CSV(fn).schema ==
                dshape('{name: string, amount: ?int64, date: ?int64}'))
        assert (CSV(fn, columns=['NAME', 'AMOUNT', 'DATE']).schema ==
                dshape('{NAME: string, AMOUNT: ?int64, DATE: ?int64}'))
        assert (str(CSV(fn, types=['string', 'int32', 'date']).schema) ==
                str(dshape('{name: string, amount: int32, date: date}')))
        a = CSV(fn, typehints={'date': 'date'}).schema
        b = dshape('{name: string, amount: ?int64, date: date}')
        assert str(a) == str(b)

def test_csv_into_mongodb_colon_del(empty_collec, file_name_colon):
    csv = CSV(file_name_colon)
    coll = empty_collec
    lhs = into(list, csv)
    newcoll = into(coll, csv)
    rhs = into(list, newcoll)
    assert lhs == rhs

def test_csv_into_mongodb(empty_collec, file_name):
    csv = CSV(file_name)
    coll = empty_collec
    res = into(coll, csv)
    mongo_data = list(res.find({}, {'_0': 1, '_id': 0}))
    assert list(csv[:, '_0']) == [i['_0'] for i in mongo_data]

def test_csv_into_mongodb_columns(empty_collec, file_name):
    csv = CSV(file_name, schema='{x: int, y: int}')
    coll = empty_collec
    lhs = into(list, csv)
    assert lhs == into(list, into(coll, csv))

def test_into_cds_mixed():
    pytest.importorskip('bokeh')
    from bokeh.objects import ColumnDataSource
    n = 25
    ddict = {'first': np.random.choice(list('abc'), size=n),
             'second': np.random.choice(['cachaça', 'tres leches', 'pizza'],
                                        size=n),
             'third': list(range(n))}
    df = pd.DataFrame(ddict)
    with tmpfile('.csv') as fn:
        df.to_csv(fn, header=None, index=False, encoding='utf8')
        csv = CSV(fn, columns=['first', 'second', 'third'], encoding='utf8')
        t = Data(csv)

        cds = into(ColumnDataSource, t)
        assert isinstance(cds, ColumnDataSource)
        expected = dict((k, into(list, csv[:, k]))
                        for k in ['first', 'second', 'third'])
        assert cds.data == expected

        cds = into(ColumnDataSource, t[['first', 'second']])
        assert isinstance(cds, ColumnDataSource)
        expected = dict((k, into(list, csv[:, k]))
                        for k in ['first', 'second'])
        assert cds.data == expected

        cds = into(ColumnDataSource, t['first'])
        assert isinstance(cds, ColumnDataSource)
        assert cds.data == {'first': into(list, csv[:, 'first'])}

def test_stack(stack_data):
    descriptors = [CSV(fn, schema='2 * int32') for fn in sorted(stack_data)]
    dd = Stack(descriptors)
    assert dd.dshape == 3 * descriptors[0].dshape

    expected = (((1, 1), (2, 2)),
                ((3, 3), (4, 4)),
                ((5, 5), (6, 6)))
    assert tuplify(tuple(dd.as_py())) == expected

    result = dd.as_dynd()
    expected2 = nd.array(expected, dtype='int32')
    assert nd.as_py(result) == nd.as_py(expected2)

    assert tuplify(tuple(dd)) == expected
    assert tuplify(tuple(dd)) == expected  # not single-use; iterating twice works

    chunks = dd.chunks()
    assert all(isinstance(chunk, nd.array) for chunk in chunks)

    assert tuple(dd[[0, 2], 0, 0]) == (1, 5)
    assert tuplify(tuple(dd[0])) == ((1, 1), (2, 2))
    res = dd[0, :, [1]]
    x = tuple(res)
    assert tuplify(x) == ((1,), (2,))
    assert tuplify(tuple(dd[0])) == expected[0]

    assert isinstance(dd[:, 0], Iterator)
    assert isinstance(dd[:], Iterator)

@pytest.fixture  # decorator assumed; older code may have used @pytest.yield_fixture
def file_data():
    data = {'a.csv': '1,1\n2,2',
            'b.csv': '3,3\n4,4\n5,5',
            'c.csv': '6,6\n7,7'}
    with filetexts(data) as filenames:
        descriptors = [CSV(fn, schema='{a: int32, b: int32}')
                       for fn in sorted(filenames)]
        yield Concat(descriptors)

def test_into_filename_filename():
    with filetext('1,2\n3,4', extension='csv') as source_fn:
        with tmpfile('csv') as target_fn:
            into(target_fn, source_fn)
            csv = CSV(target_fn)
            assert into(list, csv) == [(1, 2), (3, 4)]

def test_append(self):
    # Use a private file so as not to clobber the original one
    csv_file = tempfile.mktemp(".csv")
    with open(csv_file, "w") as f:
        f.write(self.buf)
    dd = CSV(csv_file, schema=self.schema, mode='r+')
    dd.extend([["k4", "v4", 4, True]])
    vals = [nd.as_py(v) for v in dd.chunks(blen=2)]
    self.assertEqual(vals,
                     [[{u'f0': u'k1', u'f1': u'v1', u'f2': 1, u'f3': False},
                       {u'f0': u'k2', u'f1': u'v2', u'f2': 2, u'f3': True}],
                      [{u'f0': u'k3', u'f1': u'v3', u'f2': 3, u'f3': False},
                       {u'f0': u'k4', u'f1': u'v4', u'f2': 4, u'f3': True}]])
    self.assertRaises(ValueError, lambda: dd.extend([3.3]))
    os.remove(csv_file)

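# Note: the unittest-style methods in this suite (test_append, test_getitem_stop,
# test_getitem_start_step, test_re_dialect, setUp, test_csv_json_chunked,
# test_csv_json, test_csv_hdf5) reference class attributes such as self.buf,
# self.schema, and self.csv_file that are defined on their TestCase classes
# elsewhere in the original file and are not shown in this excerpt.
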
def test_csv_mongodb_load(db, file_name, empty_collec):
    csv = CSV(file_name)  # CSV file without a header row
    # Load via the mongoimport CLI, equivalent to e.g.:
    #   mongoimport -d test_db -c testcollection --type csv \
    #       --file /Users/quasiben/test.csv --fields alpha,beta
    coll = empty_collec
    copy_info = {'dbname': db.name,
                 'coll': coll.name,
                 'abspath': csv._abspath,
                 'column_names': ','.join(csv.columns)}
    copy_cmd = ("mongoimport -d {dbname} -c {coll} --type csv "
                "--file {abspath} --fields {column_names}")
    copy_cmd = copy_cmd.format(**copy_info)
    ps = subprocess.Popen(copy_cmd, shell=os.name != 'nt',
                          stdout=subprocess.PIPE)
    output = ps.stdout.read()  # block until mongoimport finishes
    mongo_data = list(coll.find({}, {'_0': 1, '_id': 0}))
    assert list(csv[:, '_0']) == [i['_0'] for i in mongo_data]

def test_string_dataset(tmpcsv):
    raw = 'a,b,2.0\nc,1999,3.0\nd,3.0,4.0'
    with open(tmpcsv, mode='w') as f:
        f.write(raw)
    csv = CSV(tmpcsv, columns=list('xyz'))
    t = Table(csv)
    x = into(list, t)
    assert x == [('a', 'b', 2.0),
                 ('c', '1999', 3.0),
                 ('d', '3.0', 4.0)]

def test_getitem_stop(self):
    dd = CSV(self.csv_file, schema=self.schema)
    self.assertEqual(dd[:1], [{u'f0': u'k1', u'f1': u'v1',
                               u'f2': 1, u'f3': False}])

def test_repr_hdma():
    csv = CSV(example('hmda-small.csv'))
    t = TableSymbol('hmda', csv.schema)
    assert compute(t.head(), csv)
    columns = ['action_taken_name', 'agency_abbr', 'applicant_ethnicity_name']
    assert compute(t[columns].head(), csv)

def test_getitem_start_step(self):
    dd = CSV(self.csv_file, schema=self.schema)
    self.assertEqual(dd[1::2], [{u'f0': u'k2', u'f1': u'v2',
                                 u'f2': 2, u'f3': True}])

def test_delayed_bad_datashape():
    text = 'a,b\n' + '\n'.join(['1,2'] * 20) + '\n1,3.14'
    with filetext(text) as fn:
        csv = CSV(fn, nrows_discovery=2)
        assert csv.schema == dshape('{a: int64, b: int64}')
        with pytest.raises(ValueError):
            list(csv)

@pytest.fixture  # decorator assumed; older code may have used @pytest.yield_fixture
def csv(schema):
    csv = CSV('test.csv', schema=schema, mode='w')
    csv.extend(data)
    yield csv
    try:
        os.remove(csv.path)
    except OSError:
        pass

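# The csv fixture above references a module-level `data` sequence (and a
# `schema` fixture) that this excerpt does not include. A minimal, purely
# hypothetical stand-in consistent with a two-column record schema such as
# '{x: int, y: string}' would be:
#
#   data = [(1, 'Alice'), (2, 'Bob')]
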
def test_into_DataFrame_concat():
    csv = CSV(os.path.join(os.path.dirname(__file__), 'accounts.csv'))
    df = into(pd.DataFrame, Concat([csv, csv]))
    csv_df = csv.pandas_read_csv()
    assert df.index.tolist() == list(range(len(df)))
    assert (df.values.tolist() ==
            csv_df.values.tolist() + csv_df.values.tolist())
    assert df.columns.tolist() == csv_df.columns.tolist()

def test_into_filename():
    with tmpfile('csv') as filename:
        df = DataFrame([['Alice', 100], ['Bob', 200]],
                       columns=['name', 'amount'])
        into(filename, df)
        csv = CSV(filename)
        assert into(list, csv) == into(list, df)

def test_table_resource():
    with tmpfile('csv') as filename:
        csv = CSV(filename, 'w', schema='{x: int, y: int}')
        csv.extend([[1, 2], [10, 20]])
        t = Data(filename)
        assert isinstance(t.data, CSV)
        assert list(compute(t)) == list(csv)

def test_re_dialect(self):
    dialect1 = {'delimiter': ',', 'lineterminator': '\n'}
    dialect2 = {'delimiter': ';', 'lineterminator': '--'}
    text = '1,1\n2,2\n'
    schema = '2 * int32'
    with filetext(text) as source_fn:
        with filetext('') as dest_fn:
            src = CSV(source_fn, schema=schema, **dialect1)
            dst = CSV(dest_fn, mode='w', schema=schema, **dialect2)
            # Perform the copy
            dst.extend(src)
            with open(dest_fn) as f:
                self.assertEqual(f.read(), '1;1--2;2--')

def setUp(self):
    self.csv_file = tempfile.mktemp(".csv")
    with open(self.csv_file, "w") as f:
        f.write(self.buf)
    self.dd = CSV(self.csv_file, dialect='excel', schema=self.schema,
                  delimiter=' ', mode='r+')

def test_DataFrame_CSV():
    with filetext('1,2\n3,4\n') as fn:
        csv = CSV(fn, schema='{a: int64, b: float64}')
        df = into(DataFrame, csv)
        expected = DataFrame([[1, 2.0], [3, 4.0]], columns=['a', 'b'])
        assert str(df) == str(expected)
        assert list(df.dtypes) == [np.int64, np.float64]

def test_a_mode():
    text = ("id, name, balance\n1, Alice, 100\n2, Bob, 200\n"
            "3, Charlie, 300\n4, Denis, 400\n5, Edith, 500")
    with filetext(text) as fn:
        csv = CSV(fn, 'a')
        csv.extend([(6, 'Frank', 600),
                    (7, 'Georgina', 700)])
        result = set(csv[:, 'name'])
        assert 'Georgina' in result

def test_datetime_csv_reader_same_as_into_types():
    csv = CSV(os.path.join(os.path.dirname(__file__), 'accounts.csv'))
    rhs = csv.pandas_read_csv().dtypes
    df = into(pd.DataFrame, csv)
    dtypes = df.dtypes
    expected = pd.Series([np.dtype(x) for x in
                          ['i8', 'i8', 'O', 'datetime64[ns]']],
                         index=csv.columns)
    assert dtypes.index.tolist() == expected.index.tolist()
    assert dtypes.tolist() == expected.tolist()

def test_csv_json_chunked(self):
    with filetext('1,1\n2,2\n') as csv_fn:
        with filetext('') as json_fn:
            schema = '{a: int32, b: int32}'
            csv = CSV(csv_fn, schema=schema)
            json = JSON_Streaming(json_fn, mode='r+', schema=schema)
            into(json, csv)
            self.assertEqual(tuplify(tuple(json)), ((1, 1), (2, 2)))

def test_csv_json(self):
    with filetext('1,1\n2,2\n') as csv_fn:
        with filetext('') as json_fn:
            schema = '2 * int'
            csv = CSV(csv_fn, schema=schema)
            json = JSON_Streaming(json_fn, mode='r+', schema=schema)
            json.extend(csv)
            self.assertEqual(list(json), [[1, 1], [2, 2]])

def test_tuple_types():
    """CSVs with uniform types still create record types with names"""
    with filetext('1,1\n2,2\n') as fn:
        csv = CSV(fn, 'r+', delimiter=',')
        assert csv[0] == (1, 1)
        assert isinstance(csv.schema[0], Record)
        assert len(csv.schema[0].types) == 2
        assert len(set(csv.schema[0].types)) == 1

def test_csv_hdf5(self):
    import h5py
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            csv = CSV(csv_fn, schema='2 * int')
            hdf5 = HDF5(hdf5_fn, '/data', mode='a', schema='2 * int')
            copy(csv, hdf5)
            self.assertEqual(nd.as_py(hdf5.as_dynd()), [[1, 1], [2, 2]])