def test_engine_metadata_caching():
    # Two tables created against the same sqlite file must share a single
    # MetaData instance and the same cached Engine.
    with tmpfile('db') as fn:
        uri = 'sqlite:///' + fn
        engine = resource(uri)
        a = resource(uri + '::a', dshape=dshape('var * {x: int}'))
        b = resource(uri + '::b', dshape=dshape('var * {y: int}'))
        assert a.metadata is b.metadata
        assert engine is a.bind is b.bind
def test_resource_on_file():
    # Table name passed as a separate positional argument.
    with tmpfile('.db') as fn:
        sql = resource('sqlite:///' + fn, 'foo',
                       dshape='var * {x: int, y: int}')
        assert isinstance(sql, sa.Table)
    # Table name embedded in the URI after '::'.
    with tmpfile('.db') as fn:
        sql = resource('sqlite:///' + fn + '::' + 'foo',
                       dshape='var * {x: int, y: int}')
        assert isinstance(sql, sa.Table)
def test_no_header_no_columns(tbl):
    # Loading a headerless CSV with an explicit dshape round-trips.
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    into(target, source, dshape=ds)
    assert into(list, target) == data
def test_simple_into(tbl):
    # Basic CSV -> SQL load with an explicit dshape.
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    into(target, source, dshape=ds)
    assert into(list, target) == data
def test_fixed_shape():
    # A fixed-format node exposes its shape as a list, and discover()
    # reports one row per frame row.
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, 'foo')
        node = resource('hdfstore://' + fn + '::/foo')
        assert isinstance(node.shape, list)
        assert discover(node).shape == (len(df),)
        node.parent.close()
def test_resource():
    # '::' form on an in-memory sqlite URI yields a bound sa.Table.
    table = resource('sqlite:///:memory:::mytable',
                     dshape='var * {x: int, y: int}')
    assert isinstance(table, sa.Table)
    assert table.name == 'mytable'
    assert isinstance(table.bind, sa.engine.base.Engine)
    assert {c.name for c in table.c} == {'x', 'y'}
def test_failing_argument():
    table_name = 'testtable_into_2'
    source = CSV(file_name)
    target = resource(url, table_name, dshape=ds)
    # failing call: invalid kwarg must be tolerated by into()
    into(target, source, skipinitialspace="alpha")
def test_failing_argument():
    table_name = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + table_name, dshape=source.dshape)
    # failing call: invalid kwarg must be tolerated by into()
    into(target, source, if_exists="replace", skipinitialspace="alpha")
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    """Build an InteractiveSymbol wrapping *data*.

    *data* may be a live object or a URI string; strings are resolved via
    ``resource`` and may carry a ``'::'``-separated sub-path that is indexed
    into after construction.
    """
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            # Separate the resource URI from its sub-path (e.g.
            # 'file.h5::/group/table').
            # NOTE(review): split() without maxsplit raises if the URI
            # contains more than one '::' — confirm intended.
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        # Materialize one-shot iterators so they can be scanned repeatedly.
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        # No shape information supplied: infer it from the data itself.
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        # Tuple measure: pair the given field names with the tuple's types.
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema, )))
    elif isscalar(dshape.measure) and fields:
        # Scalar measure: the innermost dimension becomes the record width.
        types = (dshape.measure, ) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
    elif isrecord(dshape.measure) and fields:
        # Record measure: keep the types, rename the fields.
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema, )))
    ds = datashape.dshape(dshape)
    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)
    if sub_uri:
        # Drill into the requested sub-path one field at a time.
        for field in sub_uri.split('/'):
            if field:
                result = result[field]
    return result
def test_no_header_no_columns():
    table_name = 'testtable_into_2'
    source = CSV(file_name)
    target = resource(url + '::' + table_name, dshape=source.dshape)
    into(target, source, if_exists="replace")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
def test_simple_into():
    table_name = 'testtable_into_2'
    source = CSV(file_name)
    target = resource(url, table_name, dshape=ds)
    into(target, source, dshape=ds)
    assert into(list, target) == data
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    """Build an InteractiveSymbol wrapping *data*.

    *data* may be a live object or a URI string; strings are resolved via
    ``resource`` and may carry a ``'::'``-separated sub-path that is indexed
    into after construction.  Raises TypeError when the data's declared
    schema contradicts the resolved dshape.
    """
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            # Separate the resource URI from its sub-path (e.g.
            # 'file.h5::/group/table').
            # NOTE(review): split() without maxsplit raises if the URI
            # contains more than one '::' — confirm intended.
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        # Materialize one-shot iterators so they can be scanned repeatedly.
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        # No shape information supplied: infer it from the data itself.
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        # Tuple measure: pair the given field names with the tuple's types.
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        # Scalar measure: the innermost dimension becomes the record width.
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        # Record measure: keep the types, rename the fields.
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    ds = datashape.dshape(dshape)
    # Reject an explicit schema on the data object that disagrees with the
    # dshape we just resolved.  (`unicode` implies Python 2 compatibility.)
    if (hasattr(data, 'schema') and
            isinstance(data.schema, (DataShape, str, unicode)) and
            ds.measure != data.dshape.measure):
        raise TypeError('%s schema %s does not match schema %s' %
                        (type(data).__name__, data.schema, ds.measure))
    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)
    if sub_uri:
        # Drill into the requested sub-path one field at a time.
        for field in sub_uri.split('/'):
            if field:
                result = result[field]
    return result
def test_append(tbl):
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    # First load populates the table ...
    into(target, source)
    assert into(list, target) == data
    # ... and a second load appends rather than replaces.
    into(target, source)
    assert into(list, target) == data + data
def test_resource_to_engine_to_create_tables():
    # A record-of-tables dshape on a bare sqlite URI yields an Engine with
    # the tables created eagerly.
    with tmpfile('.db') as fn:
        schema = datashape.dshape('{mytable: var * {name: string, amt: int}}')
        engine = resource('sqlite:///' + fn, dshape=schema)
        assert isinstance(engine, sa.engine.Engine)
        assert engine.dialect.name == 'sqlite'
        assert discover(engine) == schema
def test_tryexcept_into():
    table_name = 'testtable_into_2'
    source = CSV(file_name)
    target = resource(url, table_name, dshape=ds)
    # uses multi-byte character and fails over to using sql.extend()
    into(target, source, quotechar="alpha")
    assert into(list, target) == data
def test_simple_into():
    table_name = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + table_name, dshape=source.dshape)
    into(target, source, if_exists="replace")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
def test_simple_float_into():
    table_name = 'testtable_into_float'
    source = CSV(file_name_floats, columns=['a', 'b'])
    target = resource(url + '::' + table_name, dshape=source.dshape)
    into(target, source, if_exists="replace")
    expected = [(1.02, 2.02), (102.02, 202.02), (1002.02, 2002.02)]
    assert into(list, target) == expected
def test_append_sas_to_sqlite_round_trip():
    # SAS data appended into sqlite must read back identically.
    expected = convert(set, sasfile)
    with tmpfile('db') as fn:
        target = resource('sqlite:///%s::SAS' % fn, dshape=discover(sasfile))
        append(target, sasfile)
        assert convert(set, target) == expected
def test_simple_float_into():
    name = 'testtable_into_float'
    src = CSV(file_name_floats, columns=['a', 'b'])
    dest = resource(url + '::' + name, dshape=src.dshape)
    into(dest, src, if_exists="replace")
    assert into(list, dest) == [(1.02, 2.02),
                                (102.02, 202.02),
                                (1002.02, 2002.02)]
def test_append():
    table_name = 'testtable_into_append'
    source = CSV(file_name)
    target = resource(url, table_name, dshape=ds)
    # First load populates the table ...
    into(target, source)
    assert into(list, target) == data
    # ... and a second load appends rather than replaces.
    into(target, source)
    assert into(list, target) == data + data
def test_into_sqlite():
    rows = [('Alice', 100), ('Bob', 200)]
    shape = datashape.dshape('var * {name: string, amount: int}')
    with tmpfile('.db') as dbpath:
        with tmpfile('.csv') as csvpath:
            # Write the rows to a headerless CSV, then bulk-load it.
            csv = into(csvpath, rows, dshape=shape, has_header=False)
            sql = resource('sqlite:///%s::mytable' % dbpath, dshape=shape)
            append_csv_to_sql_table(sql, csv)
            assert into(list, sql) == rows
def test_tryexcept_into():
    table_name = 'testtable_into_2'
    source = CSV(file_name, columns=['a', 'b'])
    target = resource(url + '::' + table_name, dshape=source.dshape)
    # uses multi-byte character and fails over to using sql.extend()
    into(target, source, if_exists="replace", QUOTE="alpha", FORMAT="csv")
    assert into(list, target) == [(1, 2), (10, 20), (100, 200)]
def test_hdfstore():
    """Exercise hdfstore:// resources over fixed and appendable nodes."""
    with tmpfile('.hdf5') as fn:
        # One appendable (table-format) node and one fixed-format node.
        df.to_hdf(fn, '/appendable', format='table')
        df.to_hdf(fn, '/fixed')
        hdf = resource('hdfstore://%s' % fn)
        s = symbol('s', discover(hdf))
        # Computing against the whole store dispatches per node kind.
        assert isinstance(compute(s.fixed, hdf),
                          (pd.DataFrame, pd.io.pytables.Fixed))
        assert isinstance(compute(s.appendable, hdf),
                          (pd.io.pytables.AppendableFrameTable, Chunks))
        # Addressing a node directly via '::' also works; appendable nodes
        # pre-compute to a Chunks iterator.
        s = symbol('s', discover(df))
        f = resource('hdfstore://%s::/fixed' % fn)
        a = resource('hdfstore://%s::/appendable' % fn)
        assert isinstance(pre_compute(s, a), Chunks)
        # Close every handle opened above before the tmpfile is removed.
        hdf.close()
        f.parent.close()
        a.parent.close()
def test_groups():
    # Nested HDF groups are discovered as nested records and are reachable
    # through attribute access on the symbol.
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, '/data/fixed')
        store = resource('hdfstore://%s' % fn)
        assert discover(store) == discover({'data': {'fixed': df}})
        sym = symbol('s', discover(store))
        assert list(compute(sym.data.fixed, store).a) == [1, 2, 3, 4]
        store.close()
def test_complex_into():
    # data from: http://dummydata.me/generate
    here = os.path.dirname(__file__)
    path = os.path.join(here, 'dummydata.csv')
    table_name = 'testtable_into_complex'
    shape = dshape('var * {Name: string, RegistrationDate: date, ZipCode: int32, Consts: float64}')
    source = CSV(path, has_header=True)
    target = resource(url, table_name, dshape=shape)
    into(target, source)
    assert_allclose(into(list, target), into(list, source))
def test_join_count():
    """A filtered join followed by count() compiles to the expected SQL."""
    ds = datashape.dshape('{t1: var * {x: int, y: int}, t2: var * {a: int, b: int}}')
    engine = resource('sqlite:///:memory:', dshape=ds)
    db = symbol('db', ds)
    # Filter t1, join on x == a, then count the joined rows.
    expr = join(db.t1[db.t1.x > -1], db.t2, 'x', 'a').count()
    result = compute(expr, {db: engine}, post_compute=False)
    # Compare generated SQL modulo whitespace/case via normalize().
    assert normalize(str(result)) == normalize(""" SELECT count(alias.x) as count FROM (SELECT t1.x AS x, t1.y AS y, t2.b AS b FROM t1 JOIN t2 ON t1.x = t2.a WHERE t1.x > ?) as alias """)
def test_complex_into(tbl):
    # data from: http://dummydata.me/generate
    here = os.path.dirname(__file__)
    path = os.path.join(here, 'dummydata.csv')
    shape = dshape(""" var * { Name: string, RegistrationDate: date, ZipCode: int32, Consts: float64 }""")
    source = CSV(path, has_header=True)
    target = resource(url, tbl, dshape=shape)
    into(target, source)
    assert_allclose(into(list, target), into(list, source))
def test_complex_into():
    # data from: http://dummydata.me/generate
    here = os.path.dirname(__file__)
    path = os.path.join(here, 'dummydata.csv')
    table_name = 'testtable_into_complex'
    source = CSV(path, schema='{Name: string, RegistrationDate: date, ZipCode: int64, Consts: float64}')
    target = resource(url + '::' + table_name, dshape=source.dshape)
    into(target, source, if_exists="replace")
    # Parse with pandas as well (result unused beyond exercising the parser).
    frame = pd.read_csv(path, parse_dates=['RegistrationDate'])
    assert into(list, target) == into(list, source)
def test_simple_into(csv):
    """Load a CSV into a sqlite table and verify the table exists.

    Fix: the raw DB-API connection (and its cursor) was never closed,
    leaking a handle on the temporary database file.
    """
    tbl = 'testtable'
    with tmpfile('db') as filename:
        engine = sqlalchemy.create_engine('sqlite:///' + filename)
        t = resource('sqlite:///' + filename + '::' + tbl, dshape=ds)
        into(t, csv, dshape=ds)
        # Check directly against sqlite's catalog that the table was created.
        conn = engine.raw_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' and name='{0}';".format(tbl))
            sqlite_tbl_names = cursor.fetchall()
            cursor.close()
        finally:
            conn.close()
        assert sqlite_tbl_names[0][0] == tbl
        assert into(list, t) == data
def test_simple_into(csv):
    """Load a CSV into a sqlite table and verify the table exists.

    Fix: the raw DB-API connection (and its cursor) was never closed,
    leaking a handle on the temporary database file.
    """
    tbl = 'testtable'
    with tmpfile('db') as filename:
        engine = sqlalchemy.create_engine('sqlite:///' + filename)
        t = resource('sqlite:///' + filename + '::' + tbl, dshape=ds)
        into(t, csv, dshape=ds)
        # Check directly against sqlite's catalog that the table was created.
        conn = engine.raw_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table' and name='{0}';"
                .format(tbl))
            sqlite_tbl_names = cursor.fetchall()
            cursor.close()
        finally:
            conn.close()
        assert sqlite_tbl_names[0][0] == tbl
        assert into(list, t) == data
def test_complex_into():
    # data from: http://dummydata.me/generate
    directory = os.path.dirname(__file__)
    csv_path = os.path.join(directory, 'dummydata.csv')
    table = 'testtable_into_complex'
    source = CSV(
        csv_path,
        schema=
        '{Name: string, RegistrationDate: date, ZipCode: int64, Consts: float64}'
    )
    dest = resource(url + '::' + table, dshape=source.dshape)
    into(dest, source, if_exists="replace")
    # Parse with pandas as well (result unused beyond exercising the parser).
    frame = pd.read_csv(csv_path, parse_dates=['RegistrationDate'])
    assert into(list, dest) == into(list, source)
def test_csv_postgres_load():
    """Bulk-load a CSV through the server-side LOAD command.

    NOTE(review): despite the test's name, ``LOAD DATA INFILE`` is MySQL
    syntax, not PostgreSQL — confirm which backend ``url`` points at.
    NOTE(review): ``conn``/``cursor`` are never closed here.
    """
    tbl = 'testtable'
    engine = sqlalchemy.create_engine(url)
    if engine.has_table(tbl):
        # Drop any leftover table from a previous run.
        metadata = sqlalchemy.MetaData()
        metadata.reflect(engine)
        t = metadata.tables[tbl]
        t.drop(engine)
    csv = CSV(file_name)
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    engine = sql.bind
    conn = engine.raw_connection()
    cursor = conn.cursor()
    # The server loads the file directly, so it needs an absolute path.
    full_path = os.path.abspath(file_name)
    load = '''LOAD DATA INFILE '{0}' INTO TABLE {1} FIELDS TERMINATED BY ',' lines terminated by '\n' '''.format(full_path, tbl)
    cursor.execute(load)
    conn.commit()
def test_outer_join():
    """Inner, left, and right joins computed through SQLAlchemy.

    id 3 exists only in R and id 2 only in L, so each join flavour yields a
    different row set.
    """
    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')
    with tmpfile('db') as fn:
        uri = 'sqlite:///' + fn
        engine = resource(uri)
        _left = [(1, 'Alice', 100), (2, 'Bob', 200), (4, 'Dennis', 400)]
        left = resource(uri, 'left', dshape=L.dshape)
        into(left, _left)
        _right = [('NYC', 1), ('Boston', 1), ('LA', 3), ('Moscow', 4)]
        right = resource(uri, 'right', dshape=R.dshape)
        into(right, _right)
        conn = engine.connect()
        # Inner join: only ids present on both sides (1 and 4).
        query = compute(join(L, R, how='inner'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))
        assert set(result) == set([(1, 'Alice', 100, 'NYC'),
                                   (1, 'Alice', 100, 'Boston'),
                                   (4, 'Dennis', 400, 'Moscow')])
        # Left join: Bob (id 2) kept with a NULL city.
        query = compute(join(L, R, how='left'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))
        assert set(result) == set([(1, 'Alice', 100, 'NYC'),
                                   (1, 'Alice', 100, 'Boston'),
                                   (2, 'Bob', 200, None),
                                   (4, 'Dennis', 400, 'Moscow')])
        # Right join: LA (id 3) kept with NULL name/amount.
        query = compute(join(L, R, how='right'),
                        {L: left, R: right},
                        post_compute=False)
        print(query)
        result = list(map(tuple, conn.execute(query).fetchall()))
        print(result)
        assert set(result) == set([(1, 'Alice', 100, 'NYC'),
                                   (1, 'Alice', 100, 'Boston'),
                                   (3, None, None, 'LA'),
                                   (4, 'Dennis', 400, 'Moscow')])
        # SQLAlchemy doesn't support full outer join
        # (kept disabled as a string literal below)
        """ query = compute(join(L, R, how='outer'), {L: left, R: right}, post_compute=False) result = list(map(tuple, conn.execute(query).fetchall())) assert set(result) == set( [(1, 'Alice', 100, 'NYC'), (1, 'Alice', 100, 'Boston'), (2, 'Bob', 200, None), (3, None, None, 'LA'), (4, 'Dennis', 400, 'Moscow')]) """
        conn.close()
def create_index(uri, column_name_or_names, name=None, **kwargs):
    """Create an index on the resource identified by *uri*.

    Resolves the URI, dispatches to the data-object overload of
    ``create_index``, and returns the resolved data object.
    """
    target = resource(uri, **kwargs)
    create_index(target, column_name_or_names, name=name)
    return target
def test_resource_sas7bdat():
    # A .sas7bdat path resolves to a SAS7BDAT handle.
    handle = resource(test_path)
    assert isinstance(handle, SAS7BDAT)
def test_fixed_convert():
    # A fixed-format hdfstore node converts back to the original frame.
    with tmpfile('.hdf5') as fn:
        df.to_hdf(fn, 'foo')
        node = resource('hdfstore://' + fn + '::/foo')
        assert eq(convert(pd.DataFrame, node), df)
        node.parent.close()
from into.backends.sql_csv import *
from into import resource, into
import datashape
from into.utils import tmpfile


def normalize(s):
    # Collapse whitespace, lowercase, and strip underscores so generated
    # SQL can be compared modulo formatting.
    s2 = ' '.join(s.strip().split()).lower().replace('_', '')
    return s2


# Shared fixtures for the copy-command tests below.
csv = CSV('/var/tmp/myfile.csv', delimiter=',', has_header=True)
ds = datashape.dshape('var * {name: string, amount: int}')
tbl = resource('sqlite:///:memory:::my_table', dshape=ds)


def test_postgres_load():
    # Raw string: the ESCAPE clause contains a literal backslash.
    assert normalize(copy_command('postgresql', tbl, csv)) == normalize(r""" COPY my_table from '/var/tmp/myfile.csv' (FORMAT csv, DELIMITER E',', NULL '', QUOTE '"', ESCAPE '\', HEADER True, ENCODING 'utf-8'); """)


def test_sqlite_load():
    assert normalize(copy_command('sqlite', tbl, csv)) == normalize("""
def test_resource_of_dataset():
    # Creating a dataset node inside a fresh store succeeds.
    with tmpfile('.hdf5') as fn:
        shape = datashape.dshape('{x: int32, y: 3 * int32}')
        node = resource('hdfstore://' + fn + '::/x', dshape=shape)
        assert node
        node.parent.close()
def test_tryexcept_into(tbl):
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    into(target, source, quotechar="alpha")  # uses multi-byte character
    assert into(list, target) == data
def test_failing_argument(tbl):
    # this will start to fail if we ever restrict kwargs
    source = CSV(file_name)
    target = resource(url, tbl, dshape=ds)
    into(target, source, skipinitialspace="alpha")  # failing call
def test_resource():
    # A mongodb URI with '::' resolves to the named collection.
    coll = resource('mongodb://localhost:27017/db::mycoll')
    database = coll.database
    assert coll.name == 'mycoll'
    assert database.name == 'db'
    assert database.connection.host == 'localhost'
    assert database.connection.port == 27017
L = [[100, 1, 'Alice'], [200, 2, 'Bob'], [300, 3, 'Charlie'], [400, 4, 'Dan'], [500, 5, 'Edith']] df = DataFrame(L, columns=['amount', 'id', 'name']) x = into(np.ndarray, df) sources = [df, x] try: import sqlalchemcy sql = resource('sqlite:///:memory:::accounts', dshape=t.dshape) into(sql, L) sources.append(sql) except: sql = None try: import bcolz bc = into(bcolz.ctable, df) sources.append(bc) except ImportError: bc = None try: import pymongo
sources = [] t = symbol('t', 'var * {amount: int64, id: int64, name: string}') L = [[100, 1, 'Alice'], [200, 2, 'Bob'], [300, 3, 'Charlie'], [-400, 4, 'Dan'], [500, 5, 'Edith']] df = DataFrame(L, columns=['amount', 'id', 'name']) x = into(np.ndarray, df) sources = [df, x] try: import sqlalchemcy sql = resource('sqlite:///:memory:::accounts', dshape=t.dshape) into(sql, L) sources.append(sql) except: sql = None try: import bcolz bc = into(bcolz.ctable, df) sources.append(bc) except ImportError: bc = None try: import pymongo except ImportError:
def test_resource_no_info():
    # A bare hdfstore URI (no '::' path) yields the HDFStore itself.
    with tmpfile('.hdf5') as fn:
        store = resource('hdfstore://' + fn)
        assert isinstance(store, pd.HDFStore)
        store.close()