def test_schema_compatible(): for i, old_version in enumerate(USABLE_VERSIONS[:-1]): for new_version in USABLE_VERSIONS[i+1:]: with tempfile.NamedTemporaryFile(prefix='bayeslite') as f: with bayesdb_open(pathname=f.name, version=old_version) as bdb: for same_or_older_version in USABLE_VERSIONS[:i+1]: bayesdb_schema_required( bdb, same_or_older_version, 'has%s needs% ok' % (old_version, same_or_older_version)) msg = 'has%s needs%s should fail' % ( old_version, new_version) try: with pytest.raises(BayesDBException): bayesdb_schema_required(bdb, new_version, msg) except: print msg raise # Now open it in compatible mode. Nothing should change. with bayesdb_open(pathname=f.name, compatible=True) as bdb: bayesdb_schema_required(bdb, old_version, 'opened compatible, old still ok') with pytest.raises(BayesDBException): bayesdb_schema_required( bdb, new_version, 'opened compatible, needs%s still fails' % ( new_version,)) # Now explicitly upgrade. Then everything should be okay. bayesdb_upgrade_schema(bdb) with bayesdb_open(pathname=f.name, compatible=True) as bdb: for v in USABLE_VERSIONS: bayesdb_schema_required( bdb, v, 'after explicit upgrade, needs%s ok' % (v,))
def test_schema_upgrade_on_open():
    # Opening with compatible=True must leave an old-version database
    # untouched; reopening with compatible=False upgrades the schema so
    # every usable version requirement is then satisfied.
    for old_version in USABLE_VERSIONS[:-1]:
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as f:
            try:
                with bayesdb_open(pathname=f.name, version=old_version,
                        compatible=True) as bdb:
                    for needs_version in USABLE_VERSIONS:
                        case = 'has%s needs%s' % (old_version, needs_version)
                        if needs_version <= old_version:
                            bayesdb_schema_required(
                                bdb, needs_version, case + ' ok')
                        else:
                            try:
                                with pytest.raises(BayesDBException):
                                    bayesdb_schema_required(
                                        bdb, needs_version, case + ' fail')
                            except:
                                print case, "should fail"
                                raise
                    # Exercise the database a little at the old version.
                    test_core.t1_schema(bdb)
                    test_core.t1_data(bdb)
                # Reopen without compatible mode: the schema is upgraded.
                with bayesdb_open(pathname=f.name, compatible=False) as bdb:
                    for needs_version in USABLE_VERSIONS:
                        bayesdb_schema_required(
                            bdb, needs_version,
                            'needs%s after upgrade' % (needs_version,))
                    with pytest.raises(BayesDBException):
                        # Nobody'll ever bump the schema version this many
                        # times, right?
                        bayesdb_schema_required(bdb, 1000000, 'a gazillion')
            except:
                # Report which version/file failed before re-raising.
                print "old_version =", old_version, "file =", f.name
                raise
def test_schema_compatible(): for i, old_version in enumerate(USABLE_VERSIONS[:-1]): for new_version in USABLE_VERSIONS[i + 1:]: with tempfile.NamedTemporaryFile(prefix='bayeslite') as f: with bayesdb_open(pathname=f.name, version=old_version) as bdb: for same_or_older_version in USABLE_VERSIONS[:i + 1]: bayesdb_schema_required( bdb, same_or_older_version, 'has%s needs% ok' % (old_version, same_or_older_version)) msg = 'has%s needs%s should fail' % (old_version, new_version) try: with pytest.raises(BayesDBException): bayesdb_schema_required(bdb, new_version, msg) except: print msg raise # Now open it in compatible mode. Nothing should change. with bayesdb_open(pathname=f.name, compatible=True) as bdb: bayesdb_schema_required(bdb, old_version, 'opened compatible, old still ok') with pytest.raises(BayesDBException): bayesdb_schema_required( bdb, new_version, 'opened compatible, needs%s still fails' % (new_version, )) # Now explicitly upgrade. Then everything should be okay. bayesdb_upgrade_schema(bdb) with bayesdb_open(pathname=f.name, compatible=True) as bdb: for v in USABLE_VERSIONS: bayesdb_schema_required( bdb, v, 'after explicit upgrade, needs%s ok' % (v, ))
def test_schema_upgrade_on_open(): for old_version in USABLE_VERSIONS[:-1]: with tempfile.NamedTemporaryFile(prefix='bayeslite') as f: try: with bayesdb_open(pathname=f.name, version=old_version, compatible=True) as bdb: for needs_version in USABLE_VERSIONS: case = 'has%s needs%s' % (old_version, needs_version) if needs_version <= old_version: bayesdb_schema_required(bdb, needs_version, case + ' ok') else: try: with pytest.raises(BayesDBException): bayesdb_schema_required( bdb, needs_version, case + ' fail') except: print case, "should fail" raise test_core.t1_schema(bdb) test_core.t1_data(bdb) with bayesdb_open(pathname=f.name, compatible=False) as bdb: for needs_version in USABLE_VERSIONS: bayesdb_schema_required( bdb, needs_version, 'needs%s after upgrade' % (needs_version, )) with pytest.raises(BayesDBException): # Nobody'll ever bump the schema version this many # times, right? bayesdb_schema_required(bdb, 1000000, 'a gazillion') except:
def test_example(persist, exname):
    """Run the example test, optionally persisting to a temporary file."""
    if not persist:
        # In-memory database: run the example once, no reopen step.
        with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
            _test_example(bdb, exname)
        return
    # On-disk database: run the example, then reopen the same file and
    # re-test to exercise persistence.
    with tempfile.NamedTemporaryFile(prefix="bayeslite") as f:
        with bayeslite.bayesdb_open(pathname=f.name,
                builtin_metamodels=False) as bdb:
            _test_example(bdb, exname)
        with bayeslite.bayesdb_open(pathname=f.name,
                builtin_metamodels=False) as bdb:
            _retest_example(bdb, exname)
def test_example(persist, exname):
    """Run the example test, optionally persisting to a temporary file."""
    if not persist:
        # In-memory database: run the example once, no reopen step.
        with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
            _test_example(bdb, exname)
        return
    # On-disk database: run the example, then reopen the same file and
    # re-test to exercise persistence.
    with tempfile.NamedTemporaryFile(prefix='bayeslite') as f:
        with bayeslite.bayesdb_open(pathname=f.name,
                builtin_backends=False) as bdb:
            _test_example(bdb, exname)
        with bayeslite.bayesdb_open(pathname=f.name,
                builtin_backends=False) as bdb:
            _retest_example(bdb, exname)
def test_schema_incompatible():
    """A version-6 database rejects version-7 requirements until reopened
    (default open) and explicitly upgraded; afterwards both succeed."""
    with tempfile.NamedTemporaryFile(prefix='bayeslite') as f:
        # Create a fresh database pinned at schema version 6.
        with bayesdb_open(pathname=f.name, version=6) as bdb:
            bayesdb_schema_required(bdb, 6, 'test incompatible 0/6')
            with pytest.raises(BayesDBException):
                bayesdb_schema_required(bdb, 7, 'test incompatible 0/7')
        # Reopen with default settings; both requirements now pass, and
        # explicitly upgrade the schema on disk.
        with bayesdb_open(pathname=f.name) as bdb:
            for version in (6, 7):
                bayesdb_schema_required(
                    bdb, version, 'test incompatible 1/%d' % (version,))
            bayesdb_upgrade_schema(bdb)
        # After the upgrade, both versions remain satisfied on reopen.
        with bayesdb_open(pathname=f.name) as bdb:
            for version in (6, 7):
                bayesdb_schema_required(
                    bdb, version, 'test incompatible 2/%d' % (version,))
def test_mutinf__ci_slow(seed):
    # x and y are sampled jointly, z independently, so ordering variables
    # by mutual information or by dependence probability relative to x
    # should give x, y, z.
    with bayesdb_open(':memory:', seed=seed) as bdb:
        npr = bdb.np_prng
        bdb.sql_execute('create table t(x, y, z)')
        # NOTE(review): these covariance arguments are not symmetric
        # positive-definite; numpy still samples (with a warning) — confirm
        # this is intentional.
        D0_XY = npr.multivariate_normal([10,10], [[0,1],[2,0]], size=50)
        D1_XY = npr.multivariate_normal([0,0], [[0,-1],[2,0]], size=50)
        D_XY = np.concatenate([D0_XY, D1_XY])
        D_Z = npr.multivariate_normal([5], [[0.5]], size=100)
        D = np.hstack([D_XY, D_Z])
        for d in D:
            bdb.sql_execute('INSERT INTO t VALUES(?,?,?)', d)
        bdb.execute(
            'create population p for t(x numerical; y numerical; z numerical)')
        bdb.execute('create generator m for p')
        bdb.execute('initialize 10 models for m')
        bdb.execute('analyze m for 10 iterations (optimized; quiet)')
        vars_by_mutinf = bdb.execute('''
            estimate * from variables of p
            order by probability of (mutual information with x > 0.1) desc
        ''').fetchall()
        vars_by_depprob = bdb.execute('''
            estimate * from variables of p
            order by dependence probability with x desc
        ''').fetchall()
        assert vars_by_mutinf == [('x',), ('y',), ('z',)]
        assert vars_by_depprob == [('x',), ('y',), ('z',)]
def test_get_metadata():
    """crosscat_utils.get_metadata returns an X_D/X_L dict once models
    exist, and fails before initialization or for unknown generators."""
    import os
    os.environ["BAYESDB_WIZARD_MODE"] = "1"
    pandas_df = get_test_df()
    table_name = "tmp_table"
    generator_name = "tmp_cc"
    with bayeslite.bayesdb_open() as bdb:
        bayesdb_read_pandas_df(bdb, table_name, pandas_df, create=True)
        bdb.execute(
            """
            create generator {} for {} using crosscat(guess(*))
            """.format(
                generator_name, table_name
            )
        )
        # No models initialized yet: metadata lookup must fail.
        with pytest.raises(BLE):
            crosscat_utils.get_metadata(bdb, generator_name, 0)
        bdb.execute("INITIALIZE 2 MODELS FOR {}".format(generator_name))
        # Unknown generator name must also fail.
        with pytest.raises(ValueError):
            # XXX from BayesLite: should be a BLE?
            crosscat_utils.get_metadata(bdb, "Peter_Gabriel", 0)
        metadata = crosscat_utils.get_metadata(bdb, generator_name, 0)
        assert isinstance(metadata, dict)
        assert "X_D" in metadata
        assert "X_L" in metadata
def test_simulate_drawconstraint_error__ci_slow():
    # Simulating a variable that also appears as a GIVEN constraint is an
    # error; simulating only the unconstrained variable succeeds.
    with bayeslite.bayesdb_open() as bdb:
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bdb.backends['cgpm'].set_multiprocess(False)
        bayesdb_guess_population(bdb, 'hospital', 'dha',
            overrides=[('name', 'key')])
        bdb.execute('CREATE GENERATOR hospital_cc FOR hospital USING cgpm;')
        bdb.execute('INITIALIZE 1 MODEL FOR hospital_cc')
        bdb.execute('ANALYZE hospital_cc FOR 1 ITERATION (OPTIMIZED);')
        with pytest.raises(ValueError):
            # Raises a ValueError since the condition variables and query
            # variables both ttl_mdcr_spnd. ValueError is returned since the
            # CGPM runtime, not cgpm_backend, captures the error.
            bdb.execute('''
                SIMULATE ttl_mdcr_spnd, n_death_ill FROM hospital
                GIVEN ttl_mdcr_spnd = 40000 LIMIT 100
            ''').fetchall()
        samples = bdb.execute('''
            SIMULATE n_death_ill FROM hospital
            GIVEN ttl_mdcr_spnd = 40000 LIMIT 100
        ''').fetchall()
        assert len(samples) == 100
        assert all(len(s) == 1 for s in samples)
def test_register():
    # Registering a Composer metamodel must record it in bayesdb_metamodel
    # and create the composer's auxiliary tables and triggers.
    bdb = bayeslite.bayesdb_open()
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Check if globally registered.  (.next() raises StopIteration when the
    # cursor is empty, i.e. no matching row — Python 2 iterator protocol.)
    try:
        bdb.sql_execute('''
            SELECT * FROM bayesdb_metamodel WHERE name={}
        '''.format(quote(composer.name()))).next()
    except StopIteration:
        pytest.fail('Composer not registered in bayesdb_metamodel.')
    # Check all tables/triggers.
    schema = [('table', 'bayesdb_composer_cc_id'),
        ('table', 'bayesdb_composer_column_owner'),
        ('table', 'bayesdb_composer_column_toposort'),
        ('trigger', 'bayesdb_composer_column_toposort_check'),
        ('table', 'bayesdb_composer_column_parents'),
        ('table', 'bayesdb_composer_column_foreign_predictor'),
        ('trigger', 'bayesdb_composer_column_foreign_predictor_check')]
    for kind, name in schema:
        try:
            bdb.sql_execute('''
                SELECT * FROM sqlite_master WHERE type={} AND name={}
            '''.format(quote(kind), quote(name))).next()
        except StopIteration:
            pytest.fail('Missing from Composer schema: {}'.format(
                (kind, name)))
    bdb.close()
def _query_into_queue(query_string, queue, bdb_file):
    """Execute ``query_string`` against ``bdb_file`` and enqueue the result.

    Runs in a multiprocessing worker: it must be a picklable toplevel
    function, and it opens its own BayesDB handle because a handle cannot
    be shared safely across processes/threads.

    Parameters
    ----------
    query_string : str
        Query to execute, determined by estimate_similarity_mp.
    queue : multiprocessing.Manager.Queue
        Queue that receives the result as a DataFrame.
    bdb_file : str
        Path of the BayesDB database file to open.
    """
    handle = bayesdb_open(pathname=bdb_file)
    cursor = handle.execute(query_string)
    queue.put(cursor_to_df(cursor))
def cgpm_smoke_bdb():
    # Fixture: an in-memory bdb with a CGPM metamodel (PieceWise registered)
    # and a table t holding a 3x3x3 grid of rows, where every other value
    # along each axis is NULL to exercise missing-data handling.
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        registry = {
            'piecewise': PieceWise,
        }
        bayesdb_register_metamodel(bdb,
            CGPM_Metamodel(registry, multiprocess=0))
        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    output = i + j / (k + 1)
                    cat = -1 if (i + j * k) % 2 else +1
                    input = (i * j - k)**2
                    # Punch holes in the data on odd indices.
                    if i % 2:
                        output = None
                    if j % 2:
                        cat = None
                    if k % 2:
                        input = None
                    bdb.sql_execute('''
                        INSERT INTO t (output, cat, input) VALUES (?, ?, ?)
                    ''', (output, cat, input))
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                MODEL output, input AS NUMERICAL;
                MODEL cat AS CATEGORICAL
            )
        ''')
        yield bdb
def test_guess_generator():
    # bayesdb_guess_generator guesses stattypes for t's columns and
    # refuses both an all-ignored schema and a duplicate generator.
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    bdb.sql_execute('CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)')
    a_z = range(ord('a'), ord('z') + 1)
    aa_zz = ((c, d) for c in a_z for d in a_z)
    # x is a two-letter string, y alternates 0/1, z is continuous.
    data = ((chr(c) + chr(d), (c + d) % 2, math.sqrt(c + d))
        for c, d in aa_zz)
    for row in data:
        bdb.sql_execute('INSERT INTO t (x, y, z) VALUES (?, ?, ?)', row)
    cc = crosscat.LocalEngine.LocalEngine(seed=0)
    metamodel = CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    with pytest.raises(ValueError):
        # No modelled columns.  (x is key.)
        bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat',
            overrides=[('y', 'ignore'), ('z', 'ignore')])
    bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat')
    with pytest.raises(ValueError):
        # Generator already exists.
        bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat')
    # y was guessed categorical, z numerical (x became the key).
    assert bdb.sql_execute('SELECT *'
            ' FROM bayesdb_generator_column').fetchall() == [
        (1, 1, 'categorical'),
        (1, 2, 'numerical'),
    ]
def test_subsample():
    # A SUBSAMPLE 100 generator models only 100 of the rows; BQL queries
    # against rows both inside and outside the subsample must still work.
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb, 'hospitals_full', 'dha',
            overrides=[('name', 'key')])
        bayesdb_guess_population(bdb, 'hospitals_sub', 'dha',
            overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        # Each query below mixes a row inside (rowid 1) and a row outside
        # (rowid 101) the 100-row subsample.
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF N_DEATH_ILL
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101)
                AND (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''
        # The full generator's first 100 cgpm rowids map to table rows
        # 1..100; the subsampled generator's do not.
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
def test_conditional_probability_pathologies():
    # Conditioning on a value pins its probability: P(foo=x) < 1
    # unconditionally, but P(foo=x | foo=x) == 1 and P(foo=x | foo=y) == 0.
    data = [
        ['x', 'a'],
        ['x', 'a'],
        ['x', 'a'],
        ['y', 'b'],
        ['y', 'b'],
        ['y', 'b'],
    ]
    with bayeslite.bayesdb_open() as bdb:
        bdb.sql_execute('create table t(foo, bar)')
        for row in data:
            bdb.sql_execute('insert into t values (?, ?)', row)
        bdb.execute('''
            create generator t_cc for t using crosscat(
                foo categorical,
                bar categorical
            )
        ''')
        bdb.execute('initialize 1 models for t_cc')
        bdb.execute('analyze t_cc for 1 iterations wait')
        assert bdb.execute('''
            estimate probability of foo = 'x' by t_cc
        ''').fetchvalue() < 1
        assert bdb.execute('''
            estimate probability of foo = 'x' given (foo = 'x') by t_cc
        ''').fetchvalue() == 1
        assert bdb.execute('''
            estimate probability of value 'x' given (foo = 'x')
                from columns of t_cc where c.name = 'foo'
        ''').fetchvalue() == 1
        assert bdb.execute('''
            estimate probability of foo = 'x' given (foo = 'y') by t_cc
        ''').fetchvalue() == 0
def test_mix_ratio(seed):
    # Simulate from a 70/30 mixture of two well-separated Gaussians and
    # check the simulated cluster proportions are within 0.1 of the truth.
    means = ((0,20), (20,0))
    sample_size = 100
    mix_ratio = [0.7, 0.3]
    table = 'data'
    with bayeslite.bayesdb_open(seed=seed) as bdb:
        sample_gaussians = axis_aligned_gaussians(means, sample_size,
            bdb._np_prng)
        samples = mix(sample_gaussians, mix_ratio, bdb._np_prng)
        register_loom(bdb)
        prepare_bdb(bdb, samples, table)
        cursor = bdb.execute('''
            SIMULATE "0", "1" FROM data LIMIT ?
        ''', (sample_size,))
        simulated_samples = [sample for sample in cursor]
        # Assign each simulated point to its nearest mixture component.
        counts = collections.Counter(
            (0 if distance((x,y), means[0]) < distance((x,y), means[1])
                else 1
                for x, y in simulated_samples))
        simulated_mix_ratio = [counts[key] / float(len(simulated_samples))
            for key in counts]
        for i in xrange(len(means)):
            difference = abs(mix_ratio[i] - simulated_mix_ratio[i])
            assert difference < 0.1
def test_table_from_url():
    """Loading a table from a URL yields the expected first rows."""
    with bayeslite.bayesdb_open(pathname=db_pathname) as bdb:
        table_from_url(bdb, table_name, url)
        cursor = bdb.execute('SELECT * FROM testTable LIMIT 3;')
        # Each fetched row's string form must match the expected output.
        for index, row in enumerate(cursor):
            assert str(row) == output1[index]
        # Clean up so the test is rerunnable against the same database.
        bdb.execute("DROP TABLE IF EXISTS " + table_name)
def _query_into_queue(query_string, params, queue, bdb_file):
    """Execute a parameterized query against ``bdb_file``; enqueue the result.

    Runs in a multiprocessing worker: it must be a picklable toplevel
    function, and it opens its own BayesDB handle because a handle cannot
    be shared safely across processes/threads.

    Parameters
    ----------
    query_string : str
        Query to execute, determined by estimate_similarity_mp.
    params : tuple
        Bindings substituted into ``query_string``.
    queue : multiprocessing.Manager.Queue
        Queue that receives the result as a DataFrame.
    bdb_file : str
        Path of the BayesDB database file to open.
    """
    handle = bayesdb_open(pathname=bdb_file)
    cursor = handle.execute(query_string, params)
    queue.put(cursor_to_df(cursor))
def test_impossible_duplicate_dependency():
    # Throw exception when two columns X and Y are both dependent and
    # independent.
    data = [(0, 1, 0, 0), (1, 0, 0, 1)]
    # Create the database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)
        # Read the dataset.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for row in data:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', row)
        # Create schema, we will force DEP(a c) and IND(a c).
        bql = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                a CATEGORICAL,
                b CATEGORICAL,
                c CATEGORICAL,
                INDEPENDENT(a,b,c),
                DEPENDENT(a,c),
            );
        '''
        # An error should be thrown about impossible schema.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(bql)
def test_nullify():
    """bayesdb_nullify replaces a given string value with NULL, optionally
    restricted to specific columns, and returns the number of changes."""
    with bayesdb_open(':memory:') as bdb:
        bdb.sql_execute('create table t(x,y)')
        initial = [
            ('1', ''),
            ('nan', 'foo'),
            ('2', 'nan'),
            ('2', '""'),
            ('', ''),
        ]
        for pair in initial:
            bdb.sql_execute('insert into t values(?,?)', list(pair))

        def contents():
            # Current table contents, in insertion order.
            return bdb.execute('select * from t').fetchall()

        assert contents() == list(initial)
        # Nullify '' everywhere: three cells change.
        assert bayesdb_nullify(bdb, 't', '') == 3
        assert contents() == [
            ('1', None),
            ('nan', 'foo'),
            ('2', 'nan'),
            ('2', '""'),
            (None, None),
        ]
        # Nullify 'nan' in column x only: one cell changes; y untouched.
        assert bayesdb_nullify(bdb, 't', 'nan', columns=['x']) == 1
        assert contents() == [
            ('1', None),
            (None, 'foo'),
            ('2', 'nan'),
            ('2', '""'),
            (None, None),
        ]
        # Value not present anywhere: zero changes.
        assert bayesdb_nullify(bdb, 't', 'fnord') == 0
def cgpm_dummy_satellites_bdb():
    # Fixture: in-memory bdb with a synthetic satellites_ucs table of 100
    # rows (4 orbit-class groups x a 5x5 grid) with random country and
    # launch mass, and deterministic apogee/perigee/period.
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )''')
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
            (None, lambda x, y: x + y**2),
            (None, lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(5):
                for y in xrange(5):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute('''
                        INSERT INTO satellites_ucs
                            (country_of_operator, launch_mass, class_of_orbit,
                                apogee, perigee, period)
                            VALUES (?,?,?,?,?,?)
                    ''', (country, mass, l, x, y, f(x, y)))
        yield bdb
def cgpm_smoke_bdb():
    # Fixture: an in-memory bdb with a CGPM backend (PieceWise registered)
    # and a table t holding a 3x3x3 grid of rows, where every other value
    # along each axis is NULL to exercise missing-data handling.
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        registry = {
            'piecewise': PieceWise,
        }
        bayesdb_register_backend(
            bdb, CGPM_Backend(registry, multiprocess=0))
        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    output = i + j/(k + 1)
                    cat = -1 if (i + j*k) % 2 else +1
                    input = (i*j - k)**2
                    # Punch holes in the data on odd indices.
                    if i % 2:
                        output = None
                    if j % 2:
                        cat = None
                    if k % 2:
                        input = None
                    bdb.sql_execute('''
                        INSERT INTO t (output, cat, input) VALUES (?, ?, ?)
                    ''', (output, cat, input))
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                output NUMERICAL;
                input NUMERICAL;
                cat NOMINAL;
            )
        ''')
        yield bdb
def cgpm_dummy_satellites_bdb():
    # Fixture: in-memory bdb with a synthetic satellites_ucs table of 100
    # rows (4 orbit-class groups x a 5x5 grid) with random country and
    # launch mass, and deterministic apogee/perigee/period.
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )''')
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
            (None, lambda x, y: x + y**2),
            (None, lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(5):
                for y in xrange(5):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(0, len(countries))]
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute('''
                        INSERT INTO satellites_ucs
                            (country_of_operator, launch_mass, class_of_orbit,
                                apogee, perigee, period)
                        VALUES (?,?,?,?,?,?)
                    ''', (country, mass, l, x, y, f(x, y)))
        yield bdb
def test_mix_ratio(seed):
    # Simulate from a 70/30 mixture of two well-separated Gaussians and
    # check the simulated cluster proportions are within 0.1 of the truth.
    means = ((0, 20), (20, 0))
    sample_size = 100
    mix_ratio = [0.7, 0.3]
    table = 'data'
    with bayeslite.bayesdb_open(seed=seed) as bdb:
        sample_gaussians = axis_aligned_gaussians(means, sample_size,
            bdb._np_prng)
        samples = mix(sample_gaussians, mix_ratio, bdb._np_prng)
        register_loom(bdb)
        prepare_bdb(bdb, samples, table)
        cursor = bdb.execute(
            '''
            SIMULATE "0", "1" FROM data LIMIT ?
        ''', (sample_size, ))
        simulated_samples = [sample for sample in cursor]
        # Assign each simulated point to its nearest mixture component.
        counts = collections.Counter((0 if distance((x, y), means[0]) < distance(
            (x, y), means[1]) else 1 for x, y in simulated_samples))
        simulated_mix_ratio = [
            counts[key] / float(len(simulated_samples)) for key in counts
        ]
        for i in xrange(len(means)):
            difference = abs(mix_ratio[i] - simulated_mix_ratio[i])
            assert difference < 0.1
def test_guess_population():
    # bayesdb_guess_population guesses stattypes for t's columns and
    # refuses both an all-ignored schema and a duplicate population.
    with bayeslite.bayesdb_open() as bdb:
        bdb.sql_execute('CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)')
        a_z = range(ord('a'), ord('z') + 1)
        aa_zz = ((c, d) for c in a_z for d in a_z)
        # x is a two-letter string, y alternates 0/1, z is continuous.
        data = ((chr(c) + chr(d), (c + d) % 2, math.sqrt(c + d))
            for c, d in aa_zz)
        for row in data:
            bdb.sql_execute('INSERT INTO t (x, y, z) VALUES (?, ?, ?)', row)
        with pytest.raises(ValueError):
            # No modeled columns. (x is key.)
            bayesdb_guess_population(bdb, 'p', 't',
                overrides=[('y', 'ignore'), ('z', 'ignore')])
        bayesdb_guess_population(bdb, 'p', 't')
        with pytest.raises(ValueError):
            # Population already exists.
            bayesdb_guess_population(bdb, 'p', 't')
        # y was guessed nominal, z numerical (x became the key).
        assert bdb.sql_execute(
                'SELECT * FROM bayesdb_variable').fetchall() == [
            (1, None, 1, 'y', 'nominal'),
            (1, None, 2, 'z', 'numerical'),
        ]
def bdb_for_checking_cmi(backend, iterations, seed):
    # Fixture: a bdb with v-structured nominal data (a, b, c) analyzed by
    # either the loom or cgpm backend, for conditional-MI checks.
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:', seed=seed) as bdb:
            bdb.sql_execute('CREATE TABLE t (a, b, c)')
            for row in generate_v_structured_data(1000, bdb.np_prng):
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c) VALUES (?, ?, ?)
                ''', row)
            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c TO NOMINAL;
                )
            ''')
            if backend == 'loom':
                # Loom is optional; skip (not fail) if unavailable.
                try:
                    from bayeslite.backends.loom_backend import LoomBackend
                except ImportError:
                    pytest.skip('Failed to import Loom.')
                bayesdb_register_backend(
                    bdb, LoomBackend(loom_store_path=loom_store_path))
                bdb.execute('CREATE GENERATOR m FOR p using loom')
            elif backend == 'cgpm':
                bdb.execute('CREATE GENERATOR m FOR p using cgpm')
                bdb.backends['cgpm'].set_multiprocess('on')
            else:
                raise ValueError('Backend %s unknown' % (backend, ))
            # XXX we may want to downscale this eventually.
            bdb.execute('INITIALIZE 10 MODELS FOR m;')
            bdb.execute('ANALYZE m FOR %d ITERATIONS;' % (iterations, ))
            if backend == 'cgpm':
                bdb.backends['cgpm'].set_multiprocess('off')
            yield bdb
def test_hackbackend():
    # Registering/deregistering a custom backend controls which USING
    # clauses are accepted; duplicate generators raise BQLError.
    bdb = bayeslite.bayesdb_open(builtin_backends=False)
    bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
    bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
    bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
    bdb.execute('CREATE POPULATION p FOR t(b IGNORE; a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        # cgpm is not registered (builtin_backends=False above).
        bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
    with pytest.raises(bayeslite.BQLError):
        # dotdog is not registered yet either.
        bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog;')
    # Register, deregister, and re-register: ends in a registered state.
    dotdog_backend = DotdogBackend()
    bayeslite.bayesdb_register_backend(bdb, dotdog_backend)
    bayeslite.bayesdb_deregister_backend(bdb, dotdog_backend)
    bayeslite.bayesdb_register_backend(bdb, dotdog_backend)
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
    bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        # Generator p_dd already exists.
        bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
    # XXX Rest of test originally exercised default backend, but
    # syntax doesn't support that now.  Not clear that's wrong either.
    bdb.execute('CREATE GENERATOR q_dd FOR p USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR q_dd FOR p USING dotdog(a NUMERICAL)')
def test_loom_guess_schema_nominal():
    """Test to make sure that LoomBackend handles the case where the user
    provides a nominal variable with more than 256 distinct values.  In this
    case, Loom automatically specifies the unbounded_nominal type.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t (v)')
            # Generate 300 random 20-letter words (>256 distinct values).
            vals_to_insert = []
            for i in xrange(300):
                word = ""
                for _j in xrange(20):
                    letter_index = bdb._prng.weakrandom_uniform(
                        len(string.letters))
                    word += string.letters[letter_index]
                vals_to_insert.append(word)
            for i in xrange(len(vals_to_insert)):
                bdb.sql_execute('''
                    insert into t (v) values (?)
                ''', (vals_to_insert[i], ))
            bdb.execute('create population p for t (v nominal)')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 1 model for g')
            bdb.execute('analyze g for 50 iterations')
            # Tear everything down again.
            bdb.execute('drop models from g')
            bdb.execute('drop generator g')
            bdb.execute('drop population p')
            bdb.execute('drop table t')
def test_nig_normal_latent_numbering():
    # Latent variables introduced by a generator get negative variable
    # numbers, visible only when scoped to that generator.
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x * x - 100))
        bdb.execute('''
            create population p for t(id ignore; model x,y as numerical)
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        bdb.execute('create generator g0 for p using nig_normal')
        # g1 adds a latent deviation variable xe for x.
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        # Population-scoped and g0-scoped views see only variables 1, 2.
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g0) == [1, 2]
        # The latent variable appears as number -1 when scoped to g1.
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g1) == [-1, 1, 2]
def test_conditional_probability_pathologies():
    # Conditioning on a value pins its probability: P(foo=x) < 1
    # unconditionally, but P(foo=x | foo=x) == 1 and P(foo=x | foo=y) == 0.
    data = [
        ['x', 'a'],
        ['x', 'a'],
        ['x', 'a'],
        ['y', 'b'],
        ['y', 'b'],
        ['y', 'b'],
    ]
    with bayeslite.bayesdb_open() as bdb:
        bdb.sql_execute('create table t(foo, bar)')
        for row in data:
            bdb.sql_execute('insert into t values (?, ?)', row)
        bdb.execute('''
            create population p for t (
                model foo, bar as categorical
            )
        ''')
        bdb.execute('create generator p_cc for p using crosscat()')
        bdb.execute('initialize 1 models for p_cc')
        bdb.execute('analyze p_cc for 1 iterations wait')
        assert bdb.execute('''
            estimate probability of foo = 'x' by p
        ''').fetchvalue() < 1
        assert bdb.execute('''
            estimate probability of foo = 'x' given (foo = 'x') by p
        ''').fetchvalue() == 1
        assert bdb.execute('''
            estimate probability of value 'x' given (foo = 'x')
                from columns of p where c.name = 'foo'
        ''').fetchvalue() == 1
        assert bdb.execute('''
            estimate probability of foo = 'x' given (foo = 'y') by p
        ''').fetchvalue() == 0
def test_hackmetamodel():
    # Registering/deregistering a custom metamodel controls which USING
    # clauses are accepted; duplicate generators raise BQLError.
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
    bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
    bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
    with pytest.raises(bayeslite.BQLError):
        # crosscat is not registered (builtin_metamodels=False above).
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        # dotdog is not registered yet either.
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    # Register, deregister, and re-register: ends in a registered state.
    dotdog_metamodel = DotdogMetamodel()
    bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
    bayeslite.bayesdb_deregister_metamodel(bdb, dotdog_metamodel)
    bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        # Generator t_dd already exists.
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    # XXX Rest of test originally exercised default metamodel, but
    # syntax doesn't support that now.  Not clear that's wrong either.
    bdb.execute('CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)')
def test_mutinf__ci_slow(seed):
    # x and y are sampled jointly, z independently, so ordering variables
    # by mutual information or by dependence probability relative to x
    # should give x, y, z.
    with bayesdb_open(':memory:', seed=seed) as bdb:
        npr = bdb.np_prng
        bdb.sql_execute('create table t(x, y, z)')
        # NOTE(review): these covariance arguments are not symmetric
        # positive-definite; numpy still samples (with a warning) — confirm
        # this is intentional.
        D0_XY = npr.multivariate_normal([10, 10], [[0, 1], [2, 0]], size=50)
        D1_XY = npr.multivariate_normal([0, 0], [[0, -1], [2, 0]], size=50)
        D_XY = np.concatenate([D0_XY, D1_XY])
        D_Z = npr.multivariate_normal([5], [[0.5]], size=100)
        D = np.hstack([D_XY, D_Z])
        for d in D:
            bdb.sql_execute('INSERT INTO t VALUES(?,?,?)', d)
        bdb.execute(
            'create population p for t(x numerical; y numerical; z numerical)')
        bdb.execute('create generator m for p')
        bdb.execute('initialize 10 models for m')
        bdb.execute('analyze m for 10 iterations (optimized; quiet)')
        vars_by_mutinf = bdb.execute('''
            estimate * from variables of p
            order by probability of (mutual information with x > 0.1) desc
        ''').fetchall()
        vars_by_depprob = bdb.execute('''
            estimate * from variables of p
            order by dependence probability with x desc
        ''').fetchall()
        assert vars_by_mutinf == [('x', ), ('y', ), ('z', )]
        assert vars_by_depprob == [('x', ), ('y', ), ('z', )]
def initialize(self):
    # Idempotent setup: open the bdb, load the data source into a table if
    # one is not already present, and ensure at least one crosscat
    # generator exists.  Safe to call repeatedly (returns early if open).
    if self.bdb:
        self.check_representation()
        return
    self.bdb = bayeslite.bayesdb_open(self.bdb_path)
    if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
        # Prefer an in-memory DataFrame, then a CSV path; otherwise the
        # bdb itself must already contain a matching table.
        if self.df is not None:
            bayeslite.read_pandas.bayesdb_read_pandas_df(
                self.bdb, self.name, self.df, create=True,
                ifnotexists=True)
        elif self.csv_path:
            bayeslite.bayesdb_read_csv_file(
                self.bdb, self.name, self.csv_path,
                header=True, create=True, ifnotexists=True)
        else:
            tables = self.list_tables()
            metamodels = self.list_metamodels()
            if len(tables) + len(metamodels) == 0:
                raise BLE(ValueError("No data sources specified, and an empty bdb."))
            else:
                raise BLE(ValueError("The name of the population must be the same"
                    " as a table in the bdb, one of: " +
                    ", ".join(tables) +
                    "\nNote also that the bdb has the following"
                    " metamodels defined: " + ", ".join(metamodels)))
    self.generators = self.query('''SELECT * FROM bayesdb_generator''')
    if len(self.generators) == 0:
        # No generator yet: create a default crosscat generator, but only
        # for a non-empty table.
        size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
        assert 0 < size
        self.query('''
            CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat(
                GUESS(*)
            )''')
    self.check_representation()
def test_simulate_drawconstraint():
    """SIMULATE with a GIVEN on a queried column must echo the given value
    in every sample."""
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))
        with open(dha_csv, "rU") as f:
            read_csv.bayesdb_read_csv(bdb, "dha", f, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR dha_cc FOR dha USING crosscat (
                GUESS(*),
                name KEY
            )
        ''')
        bdb.execute("INITIALIZE 1 MODEL FOR dha_cc")
        bdb.execute("ANALYZE dha_cc FOR 1 ITERATION WAIT")
        samples = bdb.execute('''
            SIMULATE ttl_mdcr_spnd, n_death_ill FROM dha_cc
                GIVEN TTL_MDCR_SPND = 40000
                LIMIT 100
        ''').fetchall()
        # The constrained column is drawn exactly at its given value.
        assert [sample[0] for sample in samples] == [40000] * 100
def test_simulate_drawconstraint_error__ci_slow():
    """SIMULATE must reject a GIVEN on a column that is also queried, but
    accept the same GIVEN when the column is not in the query list."""
    with bayeslite.bayesdb_open() as bdb:
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bdb.backends['cgpm'].set_multiprocess(False)
        bayesdb_guess_population(
            bdb, 'hospital', 'dha', overrides=[('name', 'key')])
        bdb.execute('CREATE GENERATOR hospital_cc FOR hospital USING cgpm;')
        bdb.execute('INITIALIZE 1 MODEL FOR hospital_cc')
        bdb.execute('ANALYZE hospital_cc FOR 1 ITERATION (OPTIMIZED);')
        # Raises a ValueError since the condition variables and query
        # variables both include ttl_mdcr_spnd.  ValueError is raised by
        # the CGPM runtime, not cgpm_backend, which captures the error.
        with pytest.raises(ValueError):
            bdb.execute('''
                SIMULATE ttl_mdcr_spnd, n_death_ill FROM hospital
                GIVEN ttl_mdcr_spnd = 40000 LIMIT 100
            ''').fetchall()
        samples = bdb.execute('''
            SIMULATE n_death_ill FROM hospital
            GIVEN ttl_mdcr_spnd = 40000 LIMIT 100
        ''').fetchall()
        assert len(samples) == 100
        assert all(len(sample) == 1 for sample in samples)
def test_math_func_one_param(name, probe):
    """Python's math function and its SQL counterpart must agree on `probe`:
    either both succeed with matching results, or both fail with the same
    class of error (domain vs. arity)."""
    # Evaluate via Python, recording which failure class (if any) occurred.
    py_value_err = py_type_err = False
    try:
        result_python = get_python_math_call(name, probe)
    except ValueError:
        py_value_err = True
    except TypeError:
        py_type_err = True
    # Evaluate via SQL.  apsw surfaces arity problems as SQLError.
    sql_value_err = sql_type_err = False
    try:
        with bayesdb_open(':memory:') as bdb:
            cursor = bdb.execute(get_sql_math_call(name, probe))
            result_sql = cursor_value(cursor)
    except ValueError:
        sql_value_err = True
    except (TypeError, apsw.SQLError):
        sql_type_err = True
    if py_value_err or sql_value_err:
        # Domain error on both.
        assert py_value_err and sql_value_err
    elif py_type_err or sql_type_err:
        # Arity error on both.
        assert py_type_err and sql_type_err
    else:
        # Both invocations succeeded; results must match numerically.
        assert abserr(result_python, result_sql) < 1e-4
def test_simulate_conflict():
    """Cannot override existing value in table using GIVEN in SIMULATE."""
    with bayeslite.bayesdb_open() as bdb:
        bdb.sql_execute('''
            CREATE TABLE data (
                "0" NUMERIC PRIMARY KEY,
                "1" NUMERIC
            );
        ''')
        insert_row(bdb, 'data', 1, 1)
        bdb.execute('''
            CREATE POPULATION FOR data WITH SCHEMA (
                "0" NUMERICAL;
                "1" NUMERICAL;
            );
        ''')
        bdb.execute('CREATE GENERATOR FOR data USING cgpm;')
        bdb.execute('INITIALIZE 1 MODELS FOR data;')
        target_rowid = insert_row(bdb, 'data', 0, None)
        # GIVEN "0" conflicts with the value already stored at that rowid.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('''
                SIMULATE "0" FROM data
                GIVEN rowid=?, "0"= 0, "1"=0 LIMIT 1;
            ''', (target_rowid,))
def bdb_for_checking_cmi(backend, iterations, seed):
    """Yield an in-memory bdb with v-structured data in t(a, b, c), a
    population p, and an analyzed generator m on the requested backend.

    Args:
        backend: 'loom' or 'cgpm'; any other value raises ValueError.
        iterations: number of ANALYZE iterations to run.
        seed: bdb PRNG seed.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:', seed=seed) as bdb:
            bdb.sql_execute('CREATE TABLE t (a, b, c)')
            for row in generate_v_structured_data(1000, bdb.np_prng):
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c) VALUES (?, ?, ?)
                ''', row)
            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c TO NOMINAL;
                )
            ''')
            if backend == 'loom':
                try:
                    from bayeslite.backends.loom_backend import LoomBackend
                except ImportError:
                    pytest.skip('Failed to import Loom.')
                bayesdb_register_backend(
                    bdb, LoomBackend(loom_store_path=loom_store_path))
                bdb.execute('CREATE GENERATOR m FOR p using loom')
            elif backend == 'cgpm':
                bdb.execute('CREATE GENERATOR m FOR p using cgpm')
                # Bug fix: this previously passed the strings 'on'/'off'.
                # Any non-empty string is truthy, so 'off' never actually
                # disabled multiprocessing.  Use explicit booleans, as the
                # sibling tests do with set_multiprocess(False).
                bdb.backends['cgpm'].set_multiprocess(True)
            else:
                raise ValueError('Backend %s unknown' % (backend,))
            # XXX we may want to downscale this eventually.
            bdb.execute('INITIALIZE 10 MODELS FOR m;')
            bdb.execute('ANALYZE m FOR %d ITERATIONS;' % (iterations,))
            if backend == 'cgpm':
                bdb.backends['cgpm'].set_multiprocess(False)
            yield bdb
def smoke_loom():
    """Yield an in-memory bdb with table t, population p, and an initialized
    (but not analyzed) loom generator m; skip if Loom is unavailable."""
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            try:
                from bayeslite.backends.loom_backend import LoomBackend
            except ImportError:
                pytest.skip('Failed to import Loom.')
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('CREATE TABLE t (a, b, c, d, e)')
            # Full cross product: four binary columns and one nominal one.
            # XXX Insert synthetic data generator here.
            column_values = [range(2)] * 4 + [['x', 'y']]
            for values in itertools.product(*column_values):
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c, d, e) VALUES (?, ?, ?, ?, ?)
                ''', values)
            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c, d TO NUMERICAL;
                    SET STATTYPES OF e TO NOMINAL
                )
            ''')
            bdb.execute('CREATE GENERATOR m FOR p using loom;')
            bdb.execute('INITIALIZE 1 MODELS FOR m;')
            yield bdb
def smoke_loom():
    """Yield an in-memory bdb with table t, population p, and an initialized
    (but not analyzed) loom generator m; skip if Loom is unavailable.

    NOTE(review): this is a duplicate of an identical smoke_loom defined
    earlier in this file; at module level the later definition shadows the
    earlier one — confirm whether one of them should be removed.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            try:
                from bayeslite.backends.loom_backend import LoomBackend
            except ImportError:
                pytest.skip('Failed to import Loom.')
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('CREATE TABLE t (a, b, c, d, e)')
            # Full cross product: four binary columns and one nominal one.
            # XXX Insert synthetic data generator here.
            column_values = [range(2)] * 4 + [['x', 'y']]
            for values in itertools.product(*column_values):
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c, d, e) VALUES (?, ?, ?, ?, ?)
                ''', values)
            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c, d TO NUMERICAL;
                    SET STATTYPES OF e TO NOMINAL
                )
            ''')
            bdb.execute('CREATE GENERATOR m FOR p using loom;')
            bdb.execute('INITIALIZE 1 MODELS FOR m;')
            yield bdb
def test_nig_normal_latent_numbering():
    """Latent variables introduced by a generator are numbered negatively;
    observed variables keep their positive population numbers."""
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x*x - 100))
        bdb.execute('''
            create population p for t(
                id ignore;
                set stattypes of x,y to numerical;
            )
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        # Before any generators exist: just the two observed variables.
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        # g0 adds no latents; g1's deviation variable shows up as -1.
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
def test_hackmetamodel():
    """Exercise registering, deregistering, and re-registering a custom
    metamodel: generators can only be created for registered metamodels,
    and duplicate generator names are rejected."""
    # Fix: the original leaked the bdb handle; a context manager closes it
    # even when an assertion fails.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
        bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
        bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
        # No metamodels registered yet: both creations must fail.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
        crosscat = local_crosscat()
        # Constructed but deliberately never registered in this test.
        crosscat_metamodel = CrosscatMetamodel(crosscat)
        dotdog_metamodel = DotdogMetamodel()
        # Register/deregister/register must leave dotdog usable.
        bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
        bayeslite.bayesdb_deregister_metamodel(bdb, dotdog_metamodel)
        bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
        # crosscat is still unregistered, so its generator still fails.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
        # Duplicate generator names are rejected.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
        # XXX Rest of test originally exercised default metamodel, but
        # syntax doesn't support that now.  Not clear that's wrong either.
        bdb.execute('CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(
                'CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)')
def test_nonintegral_noindex():
    """Reading a pandas DataFrame whose index is non-default must be
    rejected when no index handling is requested."""
    with bayesdb_open() as bdb:
        frame = pandas.DataFrame(
            [(1, 2, 'foo'), (4, 5, 6), (7, 8, 9), (10, 11, 12)],
            index=[42, 78, 62, 43])
        with pytest.raises(ValueError):
            bayesdb_read_pandas_df(bdb, 't', frame)
def main(): with bayeslite.bayesdb_open(pathname=db_handle) as bdb: with open(csv_handle, 'r') as f: table_from_csv(bdb, table_name, f) ignore = ["date", "time"] numerical = [ "distance_to_nearest_traffic_light", "speed_limit", "estimated_speed_of_collision" ] nominal = [ "vehicle_type", "second_vehicle_type", "road_has_pavement", "distance_to_nearest_traffic_light", "seat_belt_used", "injury_sustained", "lethal", "land_use", "city" ] schema = \ "(" + \ "; ".join("ignore "+i for i in ignore) + "; " + \ "; ".join(i + " numerical " for i in numerical) + "; " + \ "; ".join(i + " nominal" for i in nominal) + \ ")" print "Creating population..." bdb.execute("CREATE POPULATION FOR " + table_name + " " + schema) print "Creating generator..." bdb.execute("CREATE GENERATOR crimegen FOR " + table_name) print "Initialising model..." bdb.execute("INITIALIZE 1 MODEL FOR crimegen")
def do_POST(self):
    # Handle an HTTP POST whose body is utf8 text: either the sentinel
    # request 'xxxGET POPULATION COLUMNS', or a db name plus one or more
    # BQL/SQL queries parsed out by RequestHandler.text_to_queries.
    self.send_response(200)
    # Comma-separated column names for the population-columns sentinel.
    # NOTE(review): unused within this excerpt, and 'traffic_tight' looks
    # like a typo for 'traffic_light' — confirm against the consumer.
    popcolumns = "vehicle_type,second_vehicle_type,road_has_pavement,distance_to_nearest_traffic_tight,speed_limit,estimated_speed_of_collision,seat_belt_used,injury_sustained,lethal,land_use,city"
    # sentinel == 1 means "population columns" was requested, so the body
    # is not parsed as queries.
    sentinel = 0
    try:
        content_length = int(self.headers['Content-Length'])
        text = self.rfile.read(content_length).decode("utf8")
        if 'xxxGET POPULATION COLUMNS' in text:
            sentinel = 1
            print("SET SENTINEL")
        if sentinel == 0:
            # text_to_queries asserts there is at least one query; that
            # AssertionError is translated to a client error below.
            db_name, queries = RequestHandler.text_to_queries(text)
    except UnicodeDecodeError:
        self.send_err("Error: Couldn't decode request, make sure it's utf8")
        return
    except AssertionError:
        self.send_err("Error: You must give at least 1 query.")
        return
    if sentinel != 1:
        print("Running queries.")
        results = []
        with bayeslite.bayesdb_open(pathname=db_name) as bdb:
            for query in queries:
                try:
                    # Queries prefixed 'SQL ' go through sql_execute with
                    # the prefix stripped; everything else is BQL.
                    if query[0:3].upper() != "SQL":
                        results.append(
                            conv_cursor_to_json(bdb.execute(query)))
                    else:
                        results.append(
                            conv_cursor_to_json(bdb.sql_execute(query[4:])))
                except (BQLError, BQLParseError, BayesDBException), e:
                    # Python 2 except syntax; first failing query aborts
                    # the whole request.
                    self.send_err(e)
                    return
        # NOTE(review): no success response is written within this span —
        # presumably handled after this excerpt; confirm.
def test_register():
    """Registering the Composer metamodel must record it in
    bayesdb_metamodel and create every Composer table and trigger."""
    bdb = bayeslite.bayesdb_open()
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Check if globally registered.
    try:
        bdb.sql_execute('''
            SELECT * FROM bayesdb_metamodel WHERE name={}
        '''.format(quote(composer.name()))).next()
    except StopIteration:
        pytest.fail('Composer not registered in bayesdb_metamodel.')
    # Check all tables/triggers.
    expected_schema = [
        ('table', 'bayesdb_composer_cc_id'),
        ('table', 'bayesdb_composer_column_owner'),
        ('table', 'bayesdb_composer_column_toposort'),
        ('trigger', 'bayesdb_composer_column_toposort_check'),
        ('table', 'bayesdb_composer_column_parents'),
        ('table', 'bayesdb_composer_column_foreign_predictor'),
        ('trigger', 'bayesdb_composer_column_foreign_predictor_check'),
    ]
    for kind, name in expected_schema:
        try:
            bdb.sql_execute('''
                SELECT * FROM sqlite_master WHERE type={} AND name={}
            '''.format(quote(kind), quote(name))).next()
        except StopIteration:
            pytest.fail('Missing from Composer schema: {}'.format((kind,name)))
    bdb.close()
def test_geweke_iid_gaussian():
    """Geweke KL of the std-normal metamodel against itself is exactly
    zero (with zero error estimate) after two chains."""
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bayeslite.bayesdb_register_metamodel(bdb, gauss.StdNormalMetamodel())
        estimate = geweke.geweke_kl(
            bdb, "std_normal", [['column', 'numerical']], ['column'],
            [(1, 0), (2, 0)], 2, 2, 2, 2)
        assert estimate == (2, 0, 0)
def smoke_bdb():
    """Yield an in-memory bdb with table t, population p, and two
    initialized generators m1 and m2."""
    with bayesdb_open(':memory:') as bdb:
        bdb.sql_execute('CREATE TABLE t (a, b, c, d, e)')
        # Full cross product: four binary columns and one nominal one.
        # XXX Insert synthetic data generator here.
        column_values = [range(2)] * 4 + [['x', 'y']]
        for values in itertools.product(*column_values):
            bdb.sql_execute('''
                INSERT INTO t (a, b, c, d, e) VALUES (?, ?, ?, ?, ?)
            ''', values)
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA (
                SET STATTYPES OF a, b, c, d TO NUMERICAL;
                SET STATTYPES OF e TO NOMINAL
            )
        ''')
        bdb.execute('CREATE GENERATOR m1 FOR p;')
        bdb.execute('INITIALIZE 10 MODELS FOR m1;')
        bdb.execute('CREATE GENERATOR m2 FOR p;')
        bdb.execute('INITIALIZE 10 MODELS FOR m2;')
        yield bdb
def test_legacy_models_slow():
    """Loading legacy crosscat models requires the table to exist first;
    once loaded, similarity and predictive-probability queries over the
    legacy generator return stable, known orderings."""
    # Fix: the original leaked the bdb handle; the context manager closes
    # it even when an assertion fails.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        # Table 'dha' does not exist yet, so loading must fail.
        with pytest.raises(ValueError):
            bayeslite.bayesdb_load_legacy_models(bdb, 'dha_cc', 'dha',
                'crosscat', dha_models, create=True)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayeslite.bayesdb_load_legacy_models(bdb, 'dha_cc', 'dha', 'crosscat',
            dha_models, create=True)
        # Make sure guessing also works.
        bdb.execute(
            'create generator dha_cc0 for dha using crosscat(guess(*))')
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        # Need to be able to overwrite existing codebook.
        #
        # XXX Not sure this is the right API.  What if overwrite is a
        # mistake?
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        bql = '''
            ESTIMATE name FROM dha_cc
                ORDER BY SIMILARITY TO (name = ?) DESC
                LIMIT 10
        '''
        with bdb.savepoint():
            assert bdb.execute(bql, ('Albany NY',)).fetchall() == [
                ('Albany NY',),
                ('Scranton PA',),
                ('United States US',),
                ('Norfolk VA',),
                ('Reading PA',),
                ('Salisbury MD',),
                ('Louisville KY',),
                ('Cleveland OH',),
                ('Covington KY',),
                ('Akron OH',),
            ]
        # Tickles an issue in case-folding of column names.
        bql = '''
            ESTIMATE name FROM dha_cc
                ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
                LIMIT 10
        '''
        with bdb.savepoint():
            assert bdb.execute(bql).fetchall() == [
                ('McAllen TX',),
                ('Worcester MA',),
                ('Beaumont TX',),
                ('Temple TX',),
                ('Corpus Christi TX',),
                ('Takoma Park MD',),
                ('Kingsport TN',),
                ('Bangor ME',),
                ('Lebanon NH',),
                ('Panama City FL',),
            ]
def test_engine_stamp_two_clients():
    """Confirm analysis by one worker makes cache in other worker stale."""
    with tempfile.NamedTemporaryFile(prefix='bayeslite') as f:
        with bayeslite.bayesdb_open(f.name) as bdb0:
            bayeslite.bayesdb_read_csv(bdb0, 't', StringIO(test_csv.csv_data),
                header=True, create=True)
            bdb0.execute('''
                CREATE POPULATION p FOR t (
                    age NUMERICAL;
                    gender CATEGORICAL;
                    salary NUMERICAL;
                    height IGNORE;
                    division CATEGORICAL;
                    rank CATEGORICAL
                )
            ''')
            bdb0.execute('CREATE METAMODEL m FOR p WITH BASELINE crosscat;')
            cgpm_metamodel = bdb0.metamodels['cgpm']
            population_id = bayeslite.core.bayesdb_get_population(bdb0, 'p')
            generator_id = bayeslite.core.bayesdb_get_generator(
                bdb0, population_id, 'm')
            # Fresh generator: the engine stamp starts at 0.
            assert cgpm_metamodel._engine_stamp(bdb0, generator_id) == 0
            # Second client on the same file.
            with bayeslite.bayesdb_open(f.name) as bdb1:
                bdb1.execute('INITIALIZE 1 MODEL FOR m')
                # INITIALIZE bumps the stamp; both clients observe it.
                assert cgpm_metamodel._engine_stamp(bdb0, generator_id) == 1
                assert cgpm_metamodel._engine_stamp(bdb1, generator_id) == 1
                bdb0.execute('ANALYZE m FOR 1 ITERATION WAIT')
                # Analysis by bdb0 bumps the stamp again and populates
                # bdb0's engine cache.
                assert cgpm_metamodel._engine_stamp(bdb0, generator_id) == 2
                assert cgpm_metamodel._get_cache_entry(bdb0, generator_id,
                    'engine') is not None
                # Third client analyzes; stamp visible to everyone.
                with bayeslite.bayesdb_open(f.name) as bdb2:
                    bdb2.execute('ANALYZE m FOR 1 ITERATION WAIT')
                    assert cgpm_metamodel._engine_stamp(
                        bdb2, generator_id) == 3
                    assert cgpm_metamodel._engine_stamp(
                        bdb0, generator_id) == 3
                    # Engine in cache of bdb0 should be stale, since bdb2
                    # analyzed.
                    assert cgpm_metamodel._engine_latest(
                        bdb0, generator_id) is None
def bayesdb(backend=None, **kwargs):
    """Yield a bdb with `backend` registered (default: a single-process
    CGPM backend with an empty registry); the bdb is always closed on
    exit, even if the body raises."""
    chosen = (CGPM_Backend(cgpm_registry={}, multiprocess=False)
        if backend is None else backend)
    bdb = bayeslite.bayesdb_open(builtin_backends=False, **kwargs)
    bayeslite.bayesdb_register_backend(bdb, chosen)
    try:
        yield bdb
    finally:
        bdb.close()
def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.
    """
    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    # NOTE(review): the wizard-mode variable is never restored, so later
    # tests in the same process will see it set — confirm that is intended.
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        # Fix: the original leaked the bdb handle; close it via the
        # context manager even on assertion failure.
        with bayeslite.bayesdb_open(bdb_file.name) as bdb:
            with tempfile.NamedTemporaryFile() as temp:
                # n = 40 -> 40**2 -> 1600 rows total
                temp.write(_bigger_csv_data(40))
                temp.seek(0)
                bayeslite.bayesdb_read_csv_file(
                    bdb, 't', temp.name, header=True, create=True)
            bdb.execute('''
                CREATE GENERATOR t_cc FOR t USING crosscat (
                    GUESS(*),
                    id IGNORE
                )
            ''')
            bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
            bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')
            # test N = 0
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=0)
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')
            ).shape == (0, 0)
            # test other values of N
            for N in [1, 2, 10, 20, 40]:
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc', N=N, overwrite=True)
                assert cursor_to_df(
                    bdb.execute('SELECT * FROM t_similarity')
                ).shape == (N**2, 3)
            # N too high should fail
            with pytest.raises(BLE):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc', N=41, overwrite=True)
            # The parallel result, sorted, must match the plain ESTIMATE.
            parallel_sim = cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')
            ).sort_values(by=['rowid0', 'rowid1'])
            parallel_sim.index = range(parallel_sim.shape[0])
            std_sim = cursor_to_df(
                bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
            assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
def test_legacy_models__ci_slow():
    """Loading legacy crosscat models requires the table to exist first;
    once loaded, similarity and predictive-probability queries over the
    legacy generator return stable, known orderings."""
    # Fix: the original leaked the bdb handle; the context manager closes
    # it even when an assertion fails.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        # Table 'dha' does not exist yet, so loading must fail.
        with pytest.raises(ValueError):
            bayeslite.bayesdb_load_legacy_models(bdb, "dha_cc", "dha",
                "crosscat", dha_models, create=True)
        with open(dha_csv, "rU") as f:
            read_csv.bayesdb_read_csv(bdb, "dha", f, header=True, create=True)
        bayeslite.bayesdb_load_legacy_models(bdb, "dha_cc", "dha", "crosscat",
            dha_models, create=True)
        # Make sure guessing also works.
        bdb.execute(
            "create generator dha_cc0 for dha using crosscat(guess(*))")
        bayeslite.bayesdb_load_codebook_csv_file(bdb, "dha", dha_codebook)
        # Need to be able to overwrite existing codebook.
        #
        # XXX Not sure this is the right API.  What if overwrite is a
        # mistake?
        bayeslite.bayesdb_load_codebook_csv_file(bdb, "dha", dha_codebook)
        bql = """
            ESTIMATE name FROM dha_cc
                ORDER BY SIMILARITY TO (name = ?) DESC
                LIMIT 10
        """
        with bdb.savepoint():
            assert bdb.execute(bql, ("Albany NY",)).fetchall() == [
                ("Albany NY",),
                ("Scranton PA",),
                ("United States US",),
                ("Norfolk VA",),
                ("Reading PA",),
                ("Salisbury MD",),
                ("Louisville KY",),
                ("Cleveland OH",),
                ("Covington KY",),
                ("Akron OH",),
            ]
        # Tickles an issue in case-folding of column names.
        bql = """
            ESTIMATE name FROM dha_cc
                ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
                LIMIT 10
        """
        with bdb.savepoint():
            assert bdb.execute(bql).fetchall() == [
                ("McAllen TX",),
                ("Worcester MA",),
                ("Beaumont TX",),
                ("Temple TX",),
                ("Corpus Christi TX",),
                ("Takoma Park MD",),
                ("Kingsport TN",),
                ("Bangor ME",),
                ("Lebanon NH",),
                ("Panama City FL",),
            ]
def test_geweke_nig_normal():
    """Geweke KL for the NIG-normal metamodel is positive and bounded,
    as is its error estimate."""
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        nig = normal.NIGNormalMetamodel(seed=1)
        bayeslite.bayesdb_register_metamodel(bdb, nig)
        ct, kl, error = geweke.geweke_kl(
            bdb, "nig_normal", [['column', 'numerical']], ['column'],
            [(1, 0), (2, 0)], 2, 2, 2, 2)
        assert ct == 2
        assert 0 < kl < 10     # KL should be positive
        assert 0 < error < 10  # KL error estimate too