def test_into_filename_filename():
    # Copying between two filename strings should round-trip the CSV rows.
    with filetext('1,2\n3,4', extension='csv') as source_fn:
        with tmpfile('csv') as target_fn:
            into(target_fn, source_fn)
            target = CSV(target_fn)
            assert into(list, target) == [(1, 2), (3, 4)]
def test_complex_into():
    # data from: http://dummydata.me/generate
    # Load a CSV with mixed column types into SQL and check each column
    # against both the CSV source and a pandas parse of the same file.
    here = os.path.dirname(__file__)
    path = os.path.join(here, 'dummydata.csv')
    table_name = 'testtable_into_complex'
    csv = CSV(
        path,
        schema='{Name: string, RegistrationDate: date, ZipCode: int32, Consts: float64}')
    sql = SQL(url, table_name, schema=csv.schema)
    into(sql, csv, if_exists="replace")
    frame = pd.read_csv(path, parse_dates=['RegistrationDate'])
    assert_allclose([sql[0]], [csv[0]])
    for col in sql.columns:
        from_sql = list(sql[:, col])
        from_csv = list(csv[:, col])
        if col == "RegistrationDate":
            # need to convert to python datetime
            expected = list(
                frame['RegistrationDate'].map(lambda x: x.date()).values)
            assert from_sql == from_csv == expected
        elif col == 'Consts':
            # floats: compare with a tolerance rather than exact equality
            assert np.allclose(from_sql, frame[col].values)
            assert np.allclose(from_sql, from_csv)
        else:
            assert from_sql == from_csv == list(frame[col].values)
def test_into(self):
    # into() from one CSV resource into an appendable one copies all rows.
    with filetext('1,1\n2,2', extension='.csv') as a:
        with tmpfile(extension='.csv') as b:
            source = resource(a, schema='{x: int, y: int}')
            target = resource(b, schema='{x: int, y: int}', mode='a')
            target = into(target, source)
            assert into(list, target) == [(1, 1), (2, 2)]
def test_dynd(self):
    # dynd arrays convert to and from plain Python containers via into().
    expected = nd.as_py(nd.array([1, 2, 3]))
    self.assertEqual(nd.as_py(into(nd.array(), (1, 2, 3))), expected)
    self.assertEqual(into([], nd.array([1, 2])), [1, 2])
    self.assertEqual(into([], nd.array([[1, 2], [3, 4]])),
                     [[1, 2], [3, 4]])
def test_complex_into():
    # data from: http://dummydata.me/generate
    # Round-trip a CSV with mixed types through SQL and compare each column
    # with the CSV source and a pandas parse of the same file.
    # Cleanup: dropped a leftover debug print() and commented-out dead code;
    # float comparison now uses np.allclose instead of brittle round(val, 6)
    # equality, matching the sibling test_complex_into.
    this_dir = os.path.dirname(__file__)
    file_name = os.path.join(this_dir, 'dummydata.csv')
    tbl = 'testtable_into_complex'
    csv = CSV(file_name, schema='{Name: string, RegistrationDate: date, ZipCode: int32, Consts: float64}')
    sql = SQL(url, tbl, schema=csv.schema)
    into(sql, csv, if_exists="replace")
    df = pd.read_csv(file_name, parse_dates=['RegistrationDate'])
    assert sql[0] == csv[0]
    for col in sql.columns:
        if col == "RegistrationDate":
            # need to convert to python datetime
            py_dates = list(df['RegistrationDate'].astype(object).values)
            py_dates = [dt.date(d.year, d.month, d.day) for d in py_dates]
            assert list(sql[:, col]) == list(csv[:, col]) == py_dates
        elif col == 'Consts':
            # handle floating point precision with a tolerance-based compare
            l, r = list(sql[:, col]), list(csv[:, col])
            assert np.allclose(l, df[col].values)
            assert np.allclose(l, r)
        else:
            assert list(sql[:, col]) == list(csv[:, col]) == list(df[col].values)
def test_pandas_seq():
    # Lists and tuples convert to DataFrames; a pre-columned empty
    # DataFrame acts as a typed target.
    assert str(into(DataFrame, [1, 2])) == str(DataFrame([1, 2]))
    assert str(into(DataFrame, (1, 2))) == str(DataFrame([1, 2]))
    lhs = into(DataFrame(columns=['a', 'b']), [(1, 2), (3, 4)])
    rhs = DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
    assert str(lhs) == str(rhs)
def test_complex_into():
    # data from: http://dummydata.me/generate
    # Verify SQL loaded from a CSV matches both the CSV and a pandas read.
    dirname = os.path.dirname(__file__)
    fn = os.path.join(dirname, 'dummydata.csv')
    csv = CSV(fn, schema='{Name: string, RegistrationDate: date, ZipCode: int32, Consts: float64}')
    sql = SQL(url, 'testtable_into_complex', schema=csv.schema)
    into(sql, csv, if_exists="replace")
    df = pd.read_csv(fn, parse_dates=['RegistrationDate'])
    assert_allclose([sql[0]], [csv[0]])
    for col in sql.columns:
        if col == "RegistrationDate":
            # need to convert to python datetime
            dates = list(df['RegistrationDate'].map(lambda x: x.date()).values)
            assert list(sql[:, col]) == list(csv[:, col]) == dates
        elif col == 'Consts':
            left = list(sql[:, col])
            right = list(csv[:, col])
            assert np.allclose(left, df[col].values)
            assert np.allclose(left, right)
        else:
            assert list(sql[:, col]) == list(csv[:, col]) == list(df[col].values)
def test_into_filename():
    # A DataFrame written to a filename becomes a readable CSV.
    with tmpfile('csv') as filename:
        frame = DataFrame([['Alice', 100], ['Bob', 200]],
                          columns=['name', 'amount'])
        into(filename, frame)
        assert into(list, CSV(filename)) == into(list, frame)
def test_failing_argument():
    # Pass an invalid skipinitialspace value through into().
    # NOTE(review): there is no assertion here — presumably the point is
    # that the call completes without raising; confirm intent.
    csv = CSV(file_name, columns=['a', 'b'])
    sql = resource(url, 'testtable_into_2', dshape=csv.dshape)
    into(sql, csv, if_exists="replace", skipinitialspace="alpha")  # failing call
def test_failing_argument():
    # Feed a bogus skipinitialspace value through into(); the test passes
    # if nothing raises (no explicit assertion in the original).
    csv = CSV(file_name, columns=['a', 'b'])
    sql = SQL(url, 'testtable_into_2', schema=csv.schema)
    into(sql, csv, if_exists="replace", skipinitialspace="alpha")  # failing call
def test_numpy_list():
    # A list of tuples infers a structured dtype and round-trips.
    records = [('Alice', 100), ('Bob', 200)]
    arr = into(np.ndarray, records)
    assert np.issubdtype(arr.dtype[0], str)
    assert np.issubdtype(arr.dtype[1], int)
    assert into([], arr) == records
def test_series_single_column():
    # A single column converts to a pd.Series whose name survives a further
    # conversion. Fix: the old `data` parameter was immediately shadowed by
    # the local assignment below, so pytest was forced to resolve a needless
    # `data` fixture — removed it.
    data = [('Alice', -200.0, 1), ('Bob', -300.0, 2)]
    t = Table(data, '{name: string, amount: float64, id: int64}')
    df = into(pd.Series, t['name'])
    out_df = into(df, into(DataFrame, t['amount']))
    assert isinstance(df, pd.Series)
    expected = pd.DataFrame(data, columns=t.schema.measure.names).name
    assert str(df) == str(expected)
    assert df.name == out_df.name
def test_csv_json_chunked(self):
    # Streaming a CSV into a JSON-lines file preserves the tuples.
    with filetext('1,1\n2,2\n') as csv_fn:
        with filetext('') as json_fn:
            schema = '{a: int32, b: int32}'
            source = CSV(csv_fn, schema=schema)
            sink = JSON_Streaming(json_fn, mode='r+', schema=schema)
            into(sink, source)
            self.assertEquals(tuplify(tuple(sink)), ((1, 1), (2, 2)))
def test_no_header_no_columns():
    # A CSV with neither header nor explicit columns still loads into SQL
    # via its inferred dshape.
    csv = CSV(file_name)
    sql = resource(url, 'testtable_into_2', dshape=csv.dshape)
    into(sql, csv, if_exists="replace")
    assert into(list, sql) == [(1, 2), (10, 20), (100, 200)]
def test_simple_into():
    # Basic CSV -> SQL copy via resource(); verify the loaded rows.
    csv = CSV(file_name, columns=['a', 'b'])
    target = resource(url, 'testtable_into_2', dshape=csv.dshape)
    into(target, csv, if_exists="replace")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
def test_series_single_column():
    # Projecting one column into a pd.Series keeps the column name, even
    # after converting through another frame.
    rows = [('Alice', -200.0, 1), ('Bob', -300.0, 2)]
    t = Data(rows, '2 * {name: string, amount: float64, id: int64}')
    series = into(pd.Series, t['name'])
    out = into(series, into(DataFrame, t['amount']))
    assert isinstance(series, pd.Series)
    expected = pd.DataFrame(rows, columns=t.schema.measure.names).name
    assert str(series) == str(expected)
    assert series.name == out.name
def test_into_tables_path(good_csv, out_hdf5, out_hdf5_alt):
    # Loading the same CSV into two HDF5 files yields tables of equal,
    # expected length.
    import tables as tb
    first = into(tb.Table, good_csv, filename=out_hdf5, datapath='/foo')
    second = into(tb.Table, good_csv, filename=out_hdf5_alt, datapath='/foo',
                  output_path=out_hdf5_alt)
    len_first = len(first)
    len_second = len(second)
    first._v_file.close()
    assert len_first == len_second
    assert len_first == 3
def test_containers(self):
    # into() converts between the core Python container types.
    cases = [
        (([], (1, 2, 3)), [1, 2, 3]),
        (((), (1, 2, 3)), (1, 2, 3)),
        (({}, [(1, 2), (3, 4)]), {1: 2, 3: 4}),
        (((), {1: 2, 3: 4}), ((1, 2), (3, 4))),
        (((), {'cat': 2, 'dog': 4}), (('cat', 2), ('dog', 4))),
    ]
    for (target, source), expected in cases:
        self.assertEqual(into(target, source), expected)
def test_no_header_no_columns():
    # A header-less CSV loads when the SQL schema names the fields.
    csv = CSV(file_name)
    sql = SQL(url, 'testtable_into_2', schema='{x: int, y: int}')
    into(sql, csv, if_exists="replace")
    assert list(sql[:, 'x']) == [1, 10, 100]
    assert list(sql[:, 'y']) == [2, 20, 200]
def test_simple_into():
    # CSV -> SQL copy; both columns arrive intact.
    source = CSV(file_name, columns=['a', 'b'])
    target = SQL(url, 'testtable_into_2', schema=source.schema)
    into(target, source, if_exists="replace")
    assert list(target[:, 'a']) == [1, 10, 100]
    assert list(target[:, 'b']) == [2, 20, 200]
def test_tryexcept_into():
    # Multi-byte QUOTE/FORMAT arguments make the fast path fail, which
    # falls back to sql.extend(); the rows still arrive.
    csv = CSV(file_name, columns=['a', 'b'])
    sql = resource(url, 'testtable_into_2', dshape=csv.dshape)
    into(sql, csv, if_exists="replace", QUOTE="alpha", FORMAT="csv")
    assert into(list, sql) == [(1, 2), (10, 20), (100, 200)]
def test_simple_float_into():
    # Float columns survive the CSV -> SQL copy.
    csv = CSV(file_name_floats, columns=['a', 'b'])
    sql = SQL(url, 'testtable_into_float', schema=csv.schema)
    into(sql, csv, if_exists="replace")
    assert list(sql[:, 'a']) == [1.02, 102.02, 1002.02]
    assert list(sql[:, 'b']) == [2.02, 202.02, 2002.02]
def test_simple_into():
    # CSV with explicit columns copies into a SQL table of the same schema.
    tbl = 'testtable_into_2'
    src = CSV(file_name, columns=['a', 'b'])
    dest = SQL(url, tbl, schema=src.schema)
    into(dest, src, if_exists="replace")
    expected = {'a': [1, 10, 100], 'b': [2, 20, 200]}
    for col, values in expected.items():
        assert list(dest[:, col]) == values
def test_pandas_numpy(data):
    # A structured numpy array converts into both a fresh DataFrame and
    # one whose columns are already set.
    arr = np.array(data, dtype=[('name', 'O'), ('amount', int)])
    expected = DataFrame(data, columns=['name', 'amount'])
    assert str(into(DataFrame(), arr)) == str(expected)
    assert str(into(DataFrame(columns=['name', 'amount']), arr)) == str(expected)
def test_pandas_numpy():
    # A record array built from the module-level `data` converts into both
    # an empty DataFrame and one with pre-set columns.
    arr = np.array(data, dtype=[('name', 'O'), ('amount', int)])
    want = DataFrame(data, columns=['name', 'amount'])
    assert str(into(DataFrame(), arr)) == str(want)
    assert str(into(DataFrame(columns=['name', 'amount']), arr)) == str(want)
def test_csv_hdf5(self):
    # CSV records land in an HDF5 dataset with the same record schema.
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            source = CSV(csv_fn, schema='{a: int32, b: int32}')
            dest = HDF5(hdf5_fn, '/data', schema='{a: int32, b: int32}')
            into(dest, source)
            self.assertEquals(nd.as_py(dest.as_dynd()),
                              [{'a': 1, 'b': 1}, {'a': 2, 'b': 2}])
def test_csv_hdf5(self):
    # Fixed-length rows ('2 * int') copy from CSV into HDF5.
    import h5py
    from dynd import nd
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            source = CSV(csv_fn, schema='2 * int')
            dest = HDF5(hdf5_fn, '/data', schema='2 * int')
            into(dest, source)
            self.assertEquals(nd.as_py(dest.as_dynd()), [[1, 1], [2, 2]])
def test_tryexcept_into():
    # Multi-byte QUOTE/FORMAT values force the fallback sql.extend() path;
    # the column contents are unchanged either way.
    csv = CSV(file_name, columns=['a', 'b'])
    sql = SQL(url, 'testtable_into_2', schema=csv.schema)
    into(sql, csv, if_exists="replace", QUOTE="alpha", FORMAT="csv")
    assert list(sql[:, 'a']) == [1, 10, 100]
    assert list(sql[:, 'b']) == [2, 20, 200]
def test_into_tables_path_bad_csv(bad_csv_df, out_hdf5):
    # A CSV with bad lines loads into PyTables the same way pandas loads
    # it when error_bad_lines=False.
    import tables as tb
    tbl = into(tb.Table, bad_csv_df, filename=out_hdf5, datapath='/foo',
               error_bad_lines=False)
    from_tbl = into(DataFrame, tbl)
    tbl._v_file.close()
    # Check that it's the same as straight from the CSV
    from_csv = into(DataFrame, bad_csv_df, error_bad_lines=False)
    assert len(from_csv) == len(from_tbl)
    assert list(from_csv.columns) == list(from_tbl.columns)
    assert (from_csv == from_tbl).all().all()
def test_json_csv_chunked(self):
    # Streaming JSON records into a CSV yields the same tuples.
    records = [{'x': 1, 'y': 1}, {'x': 2, 'y': 2}]
    expected = ((1, 1), (2, 2))
    schema = '{x: int, y: int}'
    with filetext('\n'.join(map(json.dumps, records))) as json_fn:
        with filetext('') as csv_fn:
            source = JSON_Streaming(json_fn, schema=schema)
            sink = CSV(csv_fn, mode='r+', schema=schema)
            into(sink, source)
            self.assertEquals(tuple(sink), expected)
def test_into_PyTables(a, h5tmp):
    # Round-trip through a PyTables table: values and field names match
    # the byte-encoded expectation built from the module-level `x`.
    dshape = 'var * {amount: int64, id: int64, name: string[7, "A"], timestamp: datetime}'
    tbl = into(tables.Table, a, dshape=dshape, filename=h5tmp,
               datapath='/data')
    result = into(np.ndarray, tbl)
    expected = numpy_ensure_bytes(x)
    assert into(list, result) == into(list, expected)
    assert result.dtype.names == expected.dtype.names
    # Ideally we would be doing this. Sadly there is a float/int discrepancy
    # np.testing.assert_array_equal(into(np.ndarray, tbl),
    #                               numpy_ensure_bytes(x))
    tbl._v_file.close()
def test_hdf5_csv(self):
    # An h5py dataset filled with ones copies into a CSV row-for-row.
    import h5py
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('') as csv_fn:
            with h5py.File(hdf5_fn, 'w') as f:
                ones = f.create_dataset('data', (3, 3), dtype='i8')
                ones[:] = 1
            sink = CSV(csv_fn, mode='r+', schema='3 * int')
            into(sink, HDF5(hdf5_fn, '/data'))
            self.assertEquals(tuple(map(tuple, sink)),
                              ((1, 1, 1), (1, 1, 1), (1, 1, 1)))
def test_pandas_dynd():
    # dynd arrays convert to DataFrames; column names come from an existing
    # frame when one is supplied, otherwise default to integers.
    result = into(DataFrame, nd.array(data, dtype=schema))
    assert str(result) == str(DataFrame(data, columns=['name', 'amount']))
    nda = nd.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    df_csv = into(DataFrame, CSV(example('accounts.csv')))
    df_nd = into(df_csv, nda)
    df_no_names = into(DataFrame, nda)
    assert list(df_nd.columns) == list(df_csv.columns)
    assert list(df_no_names.columns) == [0, 1, 2]
def test_pandas_dynd(data, schema):
    # Fixture-driven variant: dynd -> DataFrame conversions keep or
    # default the column labels.
    arr = nd.array(data, dtype=schema)
    expected = DataFrame(data, columns=['name', 'amount'])
    assert str(into(DataFrame, arr)) == str(expected)
    matrix = nd.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    csv_frame = into(DataFrame, CSV(example('accounts.csv')))
    named = into(csv_frame, matrix)
    unnamed = into(DataFrame, matrix)
    assert list(named.columns) == list(csv_frame.columns)
    assert list(unnamed.columns) == [0, 1, 2]
def test_pandas_numpy():
    # Structured numpy arrays convert into DataFrames whether or not the
    # target frame already has columns.
    # Cleanup: removed leftover debug print() calls.
    data = [('Alice', 100), ('Bob', 200)]
    dtype = [('name', 'O'), ('amount', int)]
    x = np.array(data, dtype=dtype)
    expected = DataFrame(data, columns=['name', 'amount'])
    result = into(DataFrame(), x)
    assert str(result) == str(expected)
    result = into(DataFrame(columns=['name', 'amount']), x)
    assert str(result) == str(expected)
def test_csv_hdf5(self):
    from dynd import nd
    # Record-schema CSV data copies into HDF5 and reads back as dicts.
    with tmpfile('hdf5') as hdf5_fn:
        with filetext('1,1\n2,2\n') as csv_fn:
            schema = '{a: int32, b: int32}'
            csv = CSV(csv_fn, schema=schema)
            hdf5 = HDF5(hdf5_fn, '/data', schema=schema)
            into(hdf5, csv)
            expected = [{'a': 1, 'b': 1}, {'a': 2, 'b': 2}]
            self.assertEquals(nd.as_py(hdf5.as_dynd()), expected)
def test_into_ColumnDataSource_pytables():
    # A PyTables resource converts into a bokeh ColumnDataSource carrying
    # the expected columns.
    pytest.importorskip('bokeh')
    from bokeh.objects import ColumnDataSource
    pyt = PyTables(example('accounts.h5'), '/accounts')
    cds = into(ColumnDataSource, pyt)
    # Fix: the old `'balance' and 'id' and 'name' in cds.column_names`
    # only tested 'name' — the other two string literals are simply truthy.
    assert all(col in cds.column_names for col in ('balance', 'id', 'name'))
def test_pandas_pandas():
    # into(DataFrame, df) copies: equal contents, distinct object.
    original = DataFrame(data, columns=['name', 'balance'])
    copy = into(DataFrame, original)
    # Data must be the same
    assert np.all(copy == original)
    # the result should be a copy of the input, not the same object
    assert copy is not original
def test_into_DataFrame_concat():
    # Concat of two CSVs stacks the rows and reindexes from zero.
    csv = CSV(os.path.join(os.path.dirname(__file__), 'accounts.csv'))
    df = into(pd.DataFrame, Concat([csv, csv]))
    single = csv.pandas_read_csv()
    assert df.index.tolist() == list(range(len(df)))
    assert df.values.tolist() == single.values.tolist() + single.values.tolist()
    assert df.columns.tolist() == single.columns.tolist()
def test_data_frame_single_column_projection():
    # Projecting a one-column list keeps DataFrame type and contents.
    rows = [('Alice', -200.0, 1), ('Bob', -300.0, 2)]
    t = Data(rows, '2 * {name: string, amount: float64, id: int64}')
    result = into(pd.DataFrame, t[['name']])
    assert isinstance(result, pd.DataFrame)
    expected = pd.DataFrame(rows, columns=t.schema.measure.names)[['name']]
    assert str(result) == str(expected)
def test_into_DataFrame_Excel_xlsx_format():
    # An .xlsx file converts straight into the expected DataFrame.
    pytest.importorskip('xlrd')
    fn = os.path.join(os.path.dirname(__file__), 'accounts_1.xlsx')
    expected = DataFrame([[1, "Alice", 100], [2, "Bob", 200]],
                         columns=["id", "name", "amount"])
    result = into(DataFrame, fn)
    assert (result == expected).all().all()
def test_Column_data_source():
    # data_table converts into a ColumnDataSource exposing all its fields.
    pytest.importorskip('bokeh')
    from bokeh.objects import ColumnDataSource
    source = into(ColumnDataSource(), data_table)
    assert isinstance(source, ColumnDataSource)
    assert set(source.column_names) == set(data_table.fields)
def test_resource_gz(self):
    # A .csv.gz filename resolves to a gzip-aware CSV resource.
    with filetext(b'1,1\n2,2\n', extension='.csv.gz', open=gzip.open,
                  mode='wb') as fn:
        gz = resource(fn, schema='{x: int, y: int}')
        assert isinstance(gz, CSV)
        assert gz.open == gzip.open
        assert into(list, gz) == [(1, 1), (2, 2)]
def test_base():
    """ Test all pairs of base in-memory data structures """
    # Build source/target pools from the module-level `data` pairs,
    # excluding the kinds filtered below, then check every combination.
    sources = [v for k, v in data if k not in [list]]
    targets = [v for k, v in data
               if k not in [Data, Collection, CSV, nd.array, SQL]]
    for src in sources:
        for tgt in targets:
            assert normalize(into(type(tgt), src)) == normalize(tgt)
def test_simple_into(engine, csv):
    # After into(), the table exists in sqlite and holds the CSV's rows.
    tbl = 'testtable_into_2'
    sql = SQL(engine, tbl, schema=csv.schema)
    into(sql, csv, if_exists="replace")
    conn = sql.engine.raw_connection()
    cursor = conn.cursor()
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' and name='{0}';".format(tbl))
    names = cursor.fetchall()
    assert names[0][0] == tbl
    assert list(sql[:, 'a']) == [1, 10, 100]
    assert list(sql[:, 'b']) == [2, 20, 200]
def test_DataFrame_CSV():
    # A typed CSV converts to a DataFrame with matching values and dtypes.
    with filetext('1,2\n3,4\n') as fn:
        frame = into(DataFrame, CSV(fn, schema='{a: int64, b: float64}'))
        expected = DataFrame([[1, 2.0], [3, 4.0]], columns=['a', 'b'])
        assert str(frame) == str(expected)
        assert list(frame.dtypes) == [np.int64, np.float64]
def test_datetime_csv_reader_same_as_into_types():
    # into() must infer the expected column dtypes, including
    # datetime64[ns] for the timestamp column.
    # Cleanup: removed an unused local (`rhs = csv.pandas_read_csv().dtypes`)
    # that re-read the file for no purpose.
    csv = CSV(os.path.join(os.path.dirname(__file__), 'accounts.csv'))
    df = into(pd.DataFrame, csv)
    dtypes = df.dtypes
    expected = pd.Series(
        [np.dtype(x) for x in ['i8', 'i8', 'O', 'datetime64[ns]']],
        index=csv.columns)
    assert dtypes.index.tolist() == expected.index.tolist()
    assert dtypes.tolist() == expected.tolist()
def test_into_DataFrame_Excel_xls_format():
    # An .xls file converts into the expected three-row DataFrame.
    pytest.importorskip('xlrd')
    fn = os.path.join(os.path.dirname(__file__), 'accounts.xls')
    expected = DataFrame([[100, 1, "Alice", "2000-12-25T00:00:01"],
                          [200, 2, "Bob", "2001-12-25T00:00:01"],
                          [300, 3, "Charlie", "2002-12-25T00:00:01"]],
                         columns=["amount", "id", "name", "timestamp"])
    result = into(DataFrame, fn)
    assert (result == expected).all().all()