def test_csv_import_dupcols():
    # Duplicate column names must be rejected, including duplicates
    # that differ only in case.
    with bayesdb_csv_stream('foo,foo\n0,1\n') as (bdb, f):
        with pytest.raises(IOError):
            bayeslite.bayesdb_read_csv(bdb, 'bad', f, header=True, create=True)
    with bayesdb_csv_stream('foo,FOO\n0,1\n') as (bdb, f):
        with pytest.raises(IOError):
            bayeslite.bayesdb_read_csv(bdb, 'bad', f, header=True, create=True)
def test_csv_import():
    # Basic smoke test: a well-formed CSV with a header imports cleanly.
    with bayesdb_csv_stream(csv_data) as (bdb, f):
        bayeslite.bayesdb_read_csv(bdb, 'employees', f, header=True,
            create=True)
def dot_csv(self, line):
    '''create table from CSV file
    <table> </path/to/data.csv>

    Create a SQL table named <table> from the data in
    </path/to/data.csv>.
    '''
    # XXX Lousy, lousy tokenizer.
    tokens = line.split()
    if len(tokens) != 2:
        self.stdout.write('Usage: .csv <table> </path/to/data.csv>\n')
        return
    table, pathname = tokens
    try:
        # NOTE(review): 'rU' (universal newlines) is a Python-2-era mode;
        # confirm before running under modern Python 3.
        with open(pathname, 'rU') as f:
            bayeslite.bayesdb_read_csv(self._bdb, table, f,
                header=True, create=True, ifnotexists=False)
    except IOError as e:
        # Expected user-level failure (bad path, bad CSV): report briefly.
        self.stdout.write('%s\n' % (e,))
    except Exception:
        # Anything else is a bug: show the full traceback.
        self.stdout.write(traceback.format_exc())
def test_csv_import_empty():
    # An entirely empty stream is rejected.
    with bayesdb_csv_stream('') as (bdb, f):
        with pytest.raises(IOError):
            bayeslite.bayesdb_read_csv(bdb, 'empty', f, header=True,
                create=True)
def test_csv_import_onecol_key():
    with bayesdb_csv_stream('foo\n0\none\n2\n') as (bdb, f):
        # foo will be a key column, hence no columns to model.
        bayeslite.bayesdb_read_csv(bdb, 'onecol_key', f, header=True,
            create=True)
        with pytest.raises(ValueError):
            bayeslite.guess.bayesdb_guess_generator(bdb, 'onecol_key_cc',
                'onecol_key', 'crosscat')
def test_csv_import_nocols():
    with bayesdb_csv_stream('\n') as (bdb, f):
        # CSV import rejects no columns.
        with pytest.raises(IOError):
            bayeslite.bayesdb_read_csv(bdb, 'nocols', f, header=True,
                create=True)
def test_insert():
    # End-to-end: import, guess a generator, model, then insert a row.
    with test_csv.bayesdb_csv_stream(test_csv.csv_data) as (bdb, f):
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=True)
        guess.bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat')
        bdb.execute('initialize 2 models for t_cc')
        bdb.execute('analyze t_cc for 1 iteration wait')
        generator_id = core.bayesdb_get_generator(bdb, 't_cc')
        row = (41, 'F', 96000, 73, 'data science', 2)
        bqlfn.bayesdb_insert(bdb, generator_id, row)
def test_insert():
    # End-to-end: import, guess a generator, model, then insert a row.
    with test_csv.bayesdb_csv_stream(test_csv.csv_data) as (bdb, f):
        bayeslite.bayesdb_read_csv(bdb, "t", f, header=True, create=True)
        guess.bayesdb_guess_generator(bdb, "t_cc", "t", "crosscat")
        bdb.execute("initialize 2 models for t_cc")
        bdb.execute("analyze t_cc for 1 iteration wait")
        generator_id = core.bayesdb_get_generator(bdb, "t_cc")
        row = (41, "F", 96000, 73, "data science", 2)
        bqlfn.bayesdb_insert(bdb, generator_id, row)
def test_csv_import_onecol_key():
    with bayesdb_csv_stream('foo\n0\none\n2\n') as (bdb, f):
        # foo will be a key column, hence no columns to model.
        bayeslite.bayesdb_read_csv(bdb, 'onecol_key', f, header=True,
            create=True)
        with pytest.raises(ValueError):
            bayeslite.guess.bayesdb_guess_population(bdb, 'p_onecol_key',
                'onecol_key')
def test_codebook_value_map():
    '''
    A categorical column in crosscat can only take on a fixed number of
    values v1, v2, ..., v3. In this test, we have a categorical column
    called `city` which takes on values `RIO, LA, SF, DC` as specified
    in the codebook value map.

    INITIALIZE dummy table with only RIO and SF appearing in dataset
    ANALYZE dummy_cc
    INSERT rows with `city` names `LA` and `DC`
    ANALYZE dummy_cc
    SIMULATE specifying `city` = `LA` (throws KeyError)
    '''
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)
        bayeslite.bayesdb_read_csv(bdb, 'dummy', dummy_data,
            header=True, create=True)
        # Load the codebook (with the value map) from a temporary file.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as tempbook:
            with open(tempbook.name, 'w') as f:
                f.write(dummy_codebook)
            bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dummy',
                tempbook.name)
        bdb.execute('''
            CREATE GENERATOR dummy_cc FOR dummy
                USING crosscat(
                    GUESS(*),
                    kerberos IGNORE,
                    age NUMERICAL,
                    city CATEGORICAL
                )
        ''')
        bdb.execute('INITIALIZE 10 MODELS FOR dummy_cc')
        bdb.execute('ANALYZE dummy_cc FOR 20 ITERATIONS WAIT')
        bdb.execute('SIMULATE age FROM dummy_cc GIVEN city = RIO LIMIT 5')
        # Insert rows whose city values were in the value map but absent
        # from the original data.
        bdb.sql_execute('''
            INSERT INTO dummy (kerberos, age, city) VALUES
                ('jackie', 18, 'LA'),
                ('rocker', 22, 'DC')
        ''')
        bdb.execute('ANALYZE dummy_cc FOR 20 ITERATIONS WAIT')
        c = bdb.sql_execute('SELECT * FROM dummy')
        with pytest.raises(KeyError):
            bdb.execute('SIMULATE age FROM dummy_cc GIVEN city = LA LIMIT 5')
def test_codebook_value_map():
    """
    A categorical column in crosscat can only take on a fixed number of
    values v1, v2, ..., v3. In this test, we have a categorical column
    called `city` which takes on values `RIO, LA, SF, DC` as specified
    in the codebook value map.

    INITIALIZE dummy table with only RIO and SF appearing in dataset
    ANALYZE dummy_cc
    INSERT rows with `city` names `LA` and `DC`
    ANALYZE dummy_cc
    SIMULATE specifying `city` = `LA` (throws KeyError)
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)
        bayeslite.bayesdb_read_csv(bdb, "dummy", dummy_data,
            header=True, create=True)
        # Load the codebook (with the value map) from a temporary file.
        with tempfile.NamedTemporaryFile(prefix="bayeslite") as tempbook:
            with open(tempbook.name, "w") as f:
                f.write(dummy_codebook)
            bayeslite.bayesdb_load_codebook_csv_file(bdb, "dummy",
                tempbook.name)
        bdb.execute(
            """
            CREATE GENERATOR dummy_cc FOR dummy
                USING crosscat(
                    GUESS(*),
                    kerberos IGNORE,
                    age NUMERICAL,
                    city CATEGORICAL
                )
            """
        )
        bdb.execute("INITIALIZE 10 MODELS FOR dummy_cc")
        bdb.execute("ANALYZE dummy_cc FOR 20 ITERATIONS WAIT")
        bdb.execute("SIMULATE age FROM dummy_cc GIVEN city = RIO LIMIT 5")
        # Insert rows whose city values were in the value map but absent
        # from the original data.
        bdb.sql_execute(
            """
            INSERT INTO dummy (kerberos, age, city) VALUES
                ('jackie', 18, 'LA'),
                ('rocker', 22, 'DC')
            """
        )
        bdb.execute("ANALYZE dummy_cc FOR 20 ITERATIONS WAIT")
        c = bdb.sql_execute("SELECT * FROM dummy")
        with pytest.raises(KeyError):
            bdb.execute("SIMULATE age FROM dummy_cc GIVEN city = LA LIMIT 5")
def test_csv_import_badschema0():
    # The salary column is commented out of the schema (SQL `--`), so
    # the table has fewer columns than the CSV and import must fail.
    with bayesdb_csv_stream(csv_data) as (bdb, f):
        bdb.sql_execute('''
            CREATE TABLE emPloyEES(
                AGE INTeger,
                geNder Text,
                -- saLAry REal,
                heighT inteGER,
                DIVision TEXt,
                rank INTEGER
            )
        ''')
        with pytest.raises(IOError):
            bayeslite.bayesdb_read_csv(bdb, 'employees', f,
                header=True, create=False)
def test_csv_import_badschema1():
    # Schema column names do not match the CSV header, so import with
    # create=False must fail.
    with bayesdb_csv_stream(csv_data) as (bdb, f):
        bdb.sql_execute('''
            CREATE TABLE employees(
                age INTEGER,
                zorblaxianism TEXT,
                salary INTEGER,
                height INTEGER NOT NULL PRIMARY KEY,
                division TEXT,
                rank CATEGORICAL
            )
        ''')
        with pytest.raises(IOError):
            bayeslite.bayesdb_read_csv(bdb, 'employees', f,
                header=True, create=False)
def test_csv_import_schema_case():
    # Column-name matching between CSV header and schema is
    # case-insensitive.
    with bayesdb_csv_stream(csv_data) as (bdb, f):
        bdb.sql_execute('''
            CREATE TABLE emPloyEES(
                AGE INTeger,
                geNder Text,
                saLAry REal,
                heighT inteGER,
                DIVision TEXt,
                rank INTEGER
            )
        ''')
        bayeslite.bayesdb_read_csv(bdb, 'employees', f,
            header=True, create=False)
        bayeslite.guess.bayesdb_guess_generator(bdb, 'employees_cc',
            'employees', 'crosscat')
def test_engine_stamp_two_clients():
    """Confirm analysis by one worker makes cache in other worker stale."""
    with tempfile.NamedTemporaryFile(prefix='bayeslite') as f:
        with bayeslite.bayesdb_open(f.name) as bdb0:
            bayeslite.bayesdb_read_csv(bdb0, 't',
                StringIO(test_csv.csv_data), header=True, create=True)
            bdb0.execute('''
                CREATE POPULATION p FOR t (
                    age NUMERICAL;
                    gender CATEGORICAL;
                    salary NUMERICAL;
                    height IGNORE;
                    division CATEGORICAL;
                    rank CATEGORICAL
                )
            ''')
            bdb0.execute('CREATE METAMODEL m FOR p WITH BASELINE crosscat;')
            cgpm_metamodel = bdb0.metamodels['cgpm']
            population_id = bayeslite.core.bayesdb_get_population(bdb0, 'p')
            generator_id = bayeslite.core.bayesdb_get_generator(
                bdb0, population_id, 'm')
            # No models yet, so the stamp starts at zero.
            assert cgpm_metamodel._engine_stamp(bdb0, generator_id) == 0
            # A second client initializing models bumps the stamp for both.
            with bayeslite.bayesdb_open(f.name) as bdb1:
                bdb1.execute('INITIALIZE 1 MODEL FOR m')
                assert cgpm_metamodel._engine_stamp(bdb0, generator_id) == 1
                assert cgpm_metamodel._engine_stamp(bdb1, generator_id) == 1
            bdb0.execute('ANALYZE m FOR 1 ITERATION WAIT')
            assert cgpm_metamodel._engine_stamp(bdb0, generator_id) == 2
            assert cgpm_metamodel._get_cache_entry(
                bdb0, generator_id, 'engine') is not None
            with bayeslite.bayesdb_open(f.name) as bdb2:
                bdb2.execute('ANALYZE m FOR 1 ITERATION WAIT')
                assert cgpm_metamodel._engine_stamp(bdb2, generator_id) == 3
                assert cgpm_metamodel._engine_stamp(bdb0, generator_id) == 3
                # Engine in cache of bdb0 should be stale, since bdb2 analyzed.
                assert cgpm_metamodel._engine_latest(
                    bdb0, generator_id) is None
def test_csv_import_schema():
    with bayesdb_csv_stream(csv_data) as (bdb, f):
        bdb.sql_execute('''
            CREATE TABLE employees(
                age INTEGER,
                gender TEXT,
                salary REAL,
                height INTEGER,
                division TEXT,
                rank INTEGER
            )
        ''')
        bayeslite.bayesdb_read_csv(bdb, 'employees', f,
            header=True, create=False)
        bdb.execute('select height from employees').fetchall()
        # XXX Currently this test fails because we compile the query
        # into `SELECT "idontexist" FROM "employees"', and for
        # compatibility with MySQL idiocy or something, SQLite treats
        # double-quotes as single-quotes if the alternative would be
        # an error.
        with pytest.raises(apsw.SQLError):
            bdb.execute('select idontexist from employees')
            # Raise explicitly so pytest.raises is satisfied despite the
            # SQLite quirk above.
            raise apsw.SQLError('BQL compiler is broken;'
                ' a.k.a. sqlite3 is stupid.')
        bdb.execute('''
            CREATE POPULATION p_employees FOR employees (
                height IGNORE;
                age NUMERICAL;
                gender NOMINAL;
                salary CYCLIC;
                division NOMINAL;
                rank NOMINAL
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR p_employees_cc for p_employees USING cgpm;
        ''')
        bdb.execute('estimate height from p_employees').fetchall()
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('estimate predict height with confidence 0.9'
                ' from p_employees')
def test_engine_increment_stamp():
    """Confirm the engine stamp is incremented appropriately."""
    with bayeslite.bayesdb_open(':memory:') as bdb:
        bayeslite.bayesdb_read_csv(bdb, 't', StringIO(test_csv.csv_data),
            header=True, create=True)
        bdb.execute('''
            CREATE POPULATION p FOR t (
                age NUMERICAL;
                gender CATEGORICAL;
                salary NUMERICAL;
                height IGNORE;
                division CATEGORICAL;
                rank CATEGORICAL
            )
        ''')
        bdb.execute('CREATE METAMODEL m FOR p WITH BASELINE crosscat;')
        cgpm_metamodel = bdb.metamodels['cgpm']
        population_id = bayeslite.core.bayesdb_get_population(bdb, 'p')
        generator_id = bayeslite.core.bayesdb_get_generator(
            bdb, population_id, 'm')
        # The engine stamp should be at zero without models.
        assert cgpm_metamodel._engine_stamp(bdb, generator_id) == 0
        # The engine stamp should equal after initializing models.
        bdb.execute('INITIALIZE 2 MODELS FOR m;')
        assert cgpm_metamodel._engine_stamp(bdb, generator_id) == 1
        # No caching on initialize.
        assert cgpm_metamodel._get_cache_entry(bdb, generator_id, 'engine') \
            is None
        # The engine stamp should increment after analysis.
        bdb.execute('ANALYZE m FOR 1 ITERATIONS WAIT;')
        assert cgpm_metamodel._engine_stamp(bdb, generator_id) == 2
        # Caching on analyze.
        assert cgpm_metamodel._get_cache_entry(bdb, generator_id, 'engine') \
            is not None
        # Wipe the cache, run a simulation, and confirm the caching.
        cgpm_metamodel._del_cache_entry(bdb, generator_id, 'engine')
        assert cgpm_metamodel._get_cache_entry(bdb, generator_id, 'engine') \
            is None
        bdb.execute('SIMULATE age FROM p LIMIT 1;').fetchall()
        assert cgpm_metamodel._get_cache_entry(bdb, generator_id, 'engine') \
            is not None
def test_csv_missing():
    with bayesdb_csv_stream(csv_data_missing) as (bdb, f):
        # XXX Test the automatic column type guessing too.
        bdb.sql_execute('CREATE TABLE t(a REAL, b REAL, c REAL)')
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=False)
        # Normalize missing markers ('' and 'NaN') to SQL NULL.
        def clean(column_name):
            qcn = bql_quote_name(column_name)
            sql = "UPDATE t SET %s = NULL WHERE %s = '' OR %s LIKE 'NaN'" % \
                (qcn, qcn, qcn)
            bdb.sql_execute(sql)
        for column in ('a', 'b', 'c'):
            clean(column)
        assert bdb.execute('select * from t').fetchall() == [
            (1.0, 2.0, 3.0),
            (10.0, None, 30.0),
            (100.0, 200.0, None),
            (4.0, 5.0, 6.0),
        ]
def test_csv_import_schema_case():
    # Column-name matching between CSV header and schema is
    # case-insensitive.
    with bayesdb_csv_stream(csv_data) as (bdb, f):
        bdb.sql_execute('''
            CREATE TABLE emPloyEES(
                AGE INTeger,
                geNder Text,
                saLAry REal,
                heighT inteGER,
                DIVision TEXt,
                rank INTEGER
            )
        ''')
        bayeslite.bayesdb_read_csv(bdb, 'employees', f,
            header=True, create=False)
        bayeslite.guess.bayesdb_guess_population(bdb, 'p_employees',
            'employees')
def test_engine_stamp_two_clients():
    """Confirm analysis by one worker makes cache in other worker stale."""
    with tempfile.NamedTemporaryFile(prefix='bayeslite') as f:
        with bayeslite.bayesdb_open(f.name) as bdb0:
            bayeslite.bayesdb_read_csv(bdb0, 't',
                StringIO(test_csv.csv_data), header=True, create=True)
            bdb0.execute('''
                CREATE POPULATION p FOR t (
                    age NUMERICAL;
                    gender NOMINAL;
                    salary NUMERICAL;
                    height IGNORE;
                    division NOMINAL;
                    rank NOMINAL;
                )
            ''')
            bdb0.execute('CREATE GENERATOR m FOR p;')
            cgpm_backend = bdb0.backends['cgpm']
            population_id = bayeslite.core.bayesdb_get_population(bdb0, 'p')
            generator_id = bayeslite.core.bayesdb_get_generator(
                bdb0, population_id, 'm')
            # No models yet, so the stamp starts at zero.
            assert cgpm_backend._engine_stamp(bdb0, generator_id) == 0
            # A second client initializing models bumps the stamp for both.
            with bayeslite.bayesdb_open(f.name) as bdb1:
                bdb1.execute('INITIALIZE 1 MODEL FOR m')
                assert cgpm_backend._engine_stamp(bdb0, generator_id) == 1
                assert cgpm_backend._engine_stamp(bdb1, generator_id) == 1
            bdb0.execute('ANALYZE m FOR 1 ITERATION')
            assert cgpm_backend._engine_stamp(bdb0, generator_id) == 2
            assert cgpm_backend._get_cache_entry(
                bdb0, generator_id, 'engine') is not None
            with bayeslite.bayesdb_open(f.name) as bdb2:
                bdb2.execute('ANALYZE m FOR 1 ITERATION')
                assert cgpm_backend._engine_stamp(bdb2, generator_id) == 3
                assert cgpm_backend._engine_stamp(bdb0, generator_id) == 3
                # Engine in cache of bdb0 should be stale, since bdb2 analyzed.
                assert cgpm_backend._engine_latest(bdb0, generator_id) is None
def test_engine_increment_stamp():
    """Confirm the engine stamp is incremented appropriately."""
    with bayeslite.bayesdb_open(':memory:') as bdb:
        bayeslite.bayesdb_read_csv(bdb, 't', StringIO(test_csv.csv_data),
            header=True, create=True)
        bdb.execute('''
            CREATE POPULATION p FOR t (
                age NUMERICAL;
                gender NOMINAL;
                salary NUMERICAL;
                height IGNORE;
                division NOMINAL;
                rank NOMINAL;
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR p;')
        cgpm_backend = bdb.backends['cgpm']
        population_id = bayeslite.core.bayesdb_get_population(bdb, 'p')
        generator_id = bayeslite.core.bayesdb_get_generator(
            bdb, population_id, 'm')
        # The engine stamp should be at zero without models.
        assert cgpm_backend._engine_stamp(bdb, generator_id) == 0
        # The engine stamp should equal after initializing models.
        bdb.execute('INITIALIZE 2 MODELS FOR m;')
        assert cgpm_backend._engine_stamp(bdb, generator_id) == 1
        # No caching on initialize.
        assert cgpm_backend._get_cache_entry(bdb, generator_id, 'engine') \
            is None
        # The engine stamp should increment after analysis.
        bdb.execute('ANALYZE m FOR 1 ITERATIONS')
        assert cgpm_backend._engine_stamp(bdb, generator_id) == 2
        # Caching on analyze.
        assert cgpm_backend._get_cache_entry(bdb, generator_id, 'engine') \
            is not None
        # Wipe the cache, run a simulation, and confirm the caching.
        cgpm_backend._del_cache_entry(bdb, generator_id, 'engine')
        assert cgpm_backend._get_cache_entry(bdb, generator_id, 'engine') \
            is None
        bdb.execute('SIMULATE age FROM p LIMIT 1;').fetchall()
        assert cgpm_backend._get_cache_entry(bdb, generator_id, 'engine') \
            is not None
def dot_csv(self, line):
    """create table from CSV file
    <table> </path/to/data.csv>

    Create a SQL table named <table> from the data in
    </path/to/data.csv>.
    """
    # XXX Lousy, lousy tokenizer.
    tokens = line.split()
    if len(tokens) != 2:
        self.stdout.write("Usage: .csv <table> </path/to/data.csv>\n")
        return
    table, pathname = tokens
    try:
        # NOTE(review): 'rU' (universal newlines) is a Python-2-era mode;
        # confirm before running under modern Python 3.
        with open(pathname, "rU") as f:
            bayeslite.bayesdb_read_csv(self._bdb, table, f,
                header=True, create=True, ifnotexists=False)
    except IOError as e:
        # Expected user-level failure (bad path, bad CSV): report briefly.
        self.stdout.write("%s\n" % (e,))
    except Exception:
        # Anything else is a bug: show the full traceback.
        self.stdout.write(traceback.format_exc())
def test_csv_import_toomanycols():
    # A data row with more fields than the header has columns is rejected.
    with bayesdb_csv_stream('foo,bar\n0,1\n0,1,2\n') as (bdb, f):
        with pytest.raises(IOError):
            bayeslite.bayesdb_read_csv(bdb, 'bad', f, header=True,
                create=True)
def test_csv_import_onecol():
    # A single-column CSV with repeated values imports cleanly.
    with bayesdb_csv_stream('foo\n0\none\n2\n0\n') as (bdb, f):
        bayeslite.bayesdb_read_csv(bdb, 'onecol', f, header=True,
            create=True)
def test_read_csv():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        with pytest.raises(IOError):
            # Table must have no empty values in header.
            csv_hdrdata_prime = csv_hdrdata[1:]
            f = StringIO.StringIO(csv_hdrdata_prime)
            with bdb.savepoint():
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        # A correct import: create the table from header + data.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1, 2, 3, 'foo', 'bar', u'nan', u'', u'quagga'),
            (4, 5, 6, 'baz', 'quux', 42.0, u'', u'eland'),
            (7, 8, 6, 'zot', 'mumble', 87.0, u'zoot', u'caribou'),
        ]
        # Header-only stream with ifnotexists=True leaves the data alone.
        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'
        # Headerless append into the existing table.
        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data
        # Headered append into the existing table.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        # Append from an on-disk file via bayesdb_read_csv_file.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name,
                header=True, create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data
        # Test the BQL CREATE TABLE FROM <csv-file> syntax.
        f = StringIO.StringIO(csv_hdrdata)
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bdb.execute('CREATE TABLE t2 FROM \'%s\'' % (temp.name,))
            assert bdb.sql_execute('SELECT * FROM t2').fetchall() == data
        # Trying to read a csv with an empty column name should fail.
        csv_header_corrupt = csv_hdr.replace('a,b', ',')
        csv_hdrdata_corrupt = csv_header_corrupt + csv_data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata_corrupt)
            with pytest.raises(IOError):
                bayeslite.bayesdb_read_csv_file(bdb, 't3', temp.name,
                    header=True, create=True)
def test_add_variable():
    with bayesdb_open() as bdb:
        bayesdb_read_csv(bdb, 't', StringIO.StringIO(test_csv.csv_data),
            header=True, create=True)
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                age numerical;
                gender nominal;
                salary numerical;
                height ignore;
                division ignore;
                rank ignore;
            )
        ''')
        bdb.metamodels['cgpm'].set_multiprocess(False)
        bdb.execute('CREATE METAMODEL m0 FOR p WITH BASELINE crosscat;')
        bdb.execute('INITIALIZE 1 MODELS FOR m0;')
        bdb.execute('ANALYZE m0 FOR 5 ITERATION WAIT;')
        # Run some queries on the new variable in a metamodel or aggregated.
        def run_queries(target, m):
            extra = 'MODELED BY %s' % (m,) if m is not None else ''
            bdb.execute('''
                ESTIMATE PROBABILITY DENSITY OF %s = 1 BY p %s
            ''' % (target, extra,)).fetchall()
            for other in ['age', 'gender', 'salary']:
                cursor = bdb.execute('''
                    ESTIMATE DEPENDENCE PROBABILITY OF %s WITH %s BY p %s
                ''' % (target, other, extra))
                assert cursor_value(cursor) >= 0
            bdb.execute('''
                ESTIMATE SIMILARITY IN THE CONTEXT OF %s FROM PAIRWISE p %s;
            ''' % (target, extra,)).fetchall()
        # Fail to run queries on height, does not exist yet.
        with pytest.raises(BQLError):
            run_queries('height', 'm0')
        # Add the height variable.
        bdb.execute('ALTER POPULATION p ADD VARIABLE height numerical;')
        # Run targeted analysis on the newly included height variable.
        bdb.execute('ANALYZE m0 FOR 5 ITERATION WAIT;')
        bdb.execute('ANALYZE m0 FOR 5 ITERATION WAIT (VARIABLES height);')
        # Queries should now be successful.
        run_queries('height', 'm0')
        # Create a new metamodel, and create a custom category model for
        # the new variable `height`.
        bdb.execute('''
            CREATE METAMODEL m1 FOR p WITH BASELINE crosscat(
                SET CATEGORY MODEL FOR age TO exponential;
                SET CATEGORY MODEL FOR height TO lognormal;
            )
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m1')
        bdb.execute('ANALYZE m1 FOR 2 ITERATION WAIT;')
        # Run height queries on m1.
        run_queries('height', 'm1')
        # Run height queries on population, aggregating m0 and m1.
        run_queries('height', None)
        # Add a third variable rank.
        bdb.execute('ALTER POPULATION p ADD VARIABLE rank numerical;')
        # Analyze rank on m0.
        bdb.execute('''
            ANALYZE m0 FOR 2 ITERATION WAIT (OPTIMIZED; VARIABLES rank);
        ''')
        # Analyze all except rank on m0.
        bdb.execute('''
            ANALYZE m0 FOR 2 ITERATION WAIT (OPTIMIZED; SKIP rank);
        ''')
        # Fail on m1 with OPTIMIZED, since non-standard category models.
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE m1 FOR 2 ITERATION WAIT (OPTIMIZED; VARIABLES rank);
            ''')
        # Succeed analysis on non-optimized analysis.
        bdb.execute('ANALYZE m1 FOR 2 ITERATION WAIT;')
        # Run queries on the new variable.
        run_queries('rank', 'm0')
        run_queries('rank', 'm1')
        run_queries('rank', None)
def test_add_variable():
    with bayesdb_open() as bdb:
        bayesdb_read_csv(
            bdb, 't', StringIO.StringIO(test_csv.csv_data),
            header=True, create=True)
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                age numerical;
                gender nominal;
                salary numerical;
                height ignore;
                division ignore;
                rank ignore;
            )
        ''')
        bdb.backends['cgpm'].set_multiprocess(False)
        bdb.execute('CREATE GENERATOR m0 FOR p;')
        bdb.execute('INITIALIZE 1 MODELS FOR m0;')
        bdb.execute('ANALYZE m0 FOR 5 ITERATION')
        # Run some queries on the new variable in the generator or aggregated.
        def run_queries(target, m):
            extra = 'MODELED BY %s' % (m,) if m is not None else ''
            bdb.execute('''
                ESTIMATE PROBABILITY DENSITY OF %s = 1 BY p %s
            ''' % (target, extra,)).fetchall()
            for other in ['age', 'gender', 'salary']:
                cursor = bdb.execute('''
                    ESTIMATE DEPENDENCE PROBABILITY OF %s WITH %s BY p %s
                ''' % (target, other, extra))
                assert cursor_value(cursor) >= 0
            bdb.execute('''
                ESTIMATE SIMILARITY IN THE CONTEXT OF %s FROM PAIRWISE p %s;
            ''' % (target, extra,)).fetchall()
        # Fail to run queries on height, does not exist yet.
        with pytest.raises(BQLError):
            run_queries('height', 'm0')
        # Add the height variable.
        bdb.execute('ALTER POPULATION p ADD VARIABLE height numerical;')
        # Run targeted analysis on the newly included height variable.
        bdb.execute('ANALYZE m0 FOR 5 ITERATION')
        bdb.execute('ANALYZE m0 FOR 5 ITERATION (VARIABLES height);')
        # Queries should now be successful.
        run_queries('height', 'm0')
        # Create a new generator, and create a custom category model for
        # the new variable `height`.
        bdb.execute('''
            CREATE GENERATOR m1 FOR p(
                SET CATEGORY MODEL FOR age TO exponential;
                SET CATEGORY MODEL FOR height TO lognormal;
            )
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m1')
        bdb.execute('ANALYZE m1 FOR 2 ITERATION')
        # Run height queries on m1.
        run_queries('height', 'm1')
        # Run height queries on population, aggregating m0 and m1.
        run_queries('height', None)
        # Add a third variable rank.
        bdb.execute('ALTER POPULATION p ADD VARIABLE rank numerical;')
        # Analyze rank on m0.
        bdb.execute('''
            ANALYZE m0 FOR 2 ITERATION (OPTIMIZED; VARIABLES rank);
        ''')
        # Analyze all except rank on m0.
        bdb.execute('''
            ANALYZE m0 FOR 2 ITERATION (OPTIMIZED; SKIP rank);
        ''')
        # Fail on m1 with OPTIMIZED, since non-standard category models.
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE m1 FOR 2 ITERATION (OPTIMIZED; VARIABLES rank);
            ''')
        # Succeed analysis on non-optimized analysis.
        bdb.execute('ANALYZE m1 FOR 2 ITERATION')
        # Run queries on the new variable.
        run_queries('rank', 'm0')
        run_queries('rank', 'm1')
        run_queries('rank', None)
def test_read_csv():
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        with pytest.raises(IOError):
            # Table must have no empty values in header.
            csv_hdrdata_prime = csv_hdrdata[1:]
            f = StringIO.StringIO(csv_hdrdata_prime)
            with bdb.savepoint():
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        # A correct import: create the table from header + data.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1,2,3,'foo','bar',u'nan',u'',u'quagga'),
            (4,5,6,'baz','quux',42.0,u'',u'eland'),
            (7,8,6,'zot','mumble',87.0,u'zoot',u'caribou'),
        ]
        # Header-only stream with ifnotexists=True leaves the data alone.
        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'
        # Headerless append into the existing table.
        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data
        # Headered append into the existing table.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        # Append from an on-disk file via bayesdb_read_csv_file.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name,
                header=True, create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data
        # Test the BQL CREATE TABLE FROM <csv-file> syntax.
        f = StringIO.StringIO(csv_hdrdata)
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bdb.execute('CREATE TABLE t2 FROM \'%s\'' % (temp.name,))
            assert bdb.sql_execute('SELECT * FROM t2').fetchall() == data
        # Trying to read a csv with an empty column name should fail.
        csv_header_corrupt = csv_hdr.replace('a,b',',')
        csv_hdrdata_corrupt = csv_header_corrupt + csv_data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata_corrupt)
            with pytest.raises(IOError):
                bayeslite.bayesdb_read_csv_file(
                    bdb, 't3', temp.name, header=True, create=True)
def test_read_csv():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
                create=True, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=False)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                create=False, ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        # A correct import: create the table from header + data.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1,2,3,'foo','bar',u'nan',u'',u'quagga'),
            (4,5,6,'baz','quux',42.0,u'',u'eland'),
            (7,8,6,'zot','mumble',87.0,u'zoot',u'caribou'),
        ]
        # Header-only stream with ifnotexists=True leaves the data alone.
        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=True, ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'
        # Headerless append into the existing table.
        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=False,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data
        # Headered append into the existing table.
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
            create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        # Append from an on-disk file via bayesdb_read_csv_file.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name,
                header=True, create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data