def bdb_for_checking_cmi(backend, iterations, seed):
    """Yield an in-memory bdb with a v-structured 3-variable population
    analyzed by `backend` ('loom' or 'cgpm') for `iterations` iterations.

    Generator fixture: the bdb is only valid inside the with-blocks, so
    callers must consume it before the generator is closed.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:', seed=seed) as bdb:
            bdb.sql_execute('CREATE TABLE t (a, b, c)')
            # 1000 rows sampled from a v-structure, seeded by the bdb's
            # numpy prng for reproducibility.
            for row in generate_v_structured_data(1000, bdb.np_prng):
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c) VALUES (?, ?, ?)
                ''', row)
            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c TO NOMINAL;
                )
            ''')
            if backend == 'loom':
                # Loom is an optional dependency; skip rather than fail.
                try:
                    from bayeslite.backends.loom_backend import LoomBackend
                except ImportError:
                    pytest.skip('Failed to import Loom.')
                bayesdb_register_backend(
                    bdb, LoomBackend(loom_store_path=loom_store_path))
                bdb.execute('CREATE GENERATOR m FOR p using loom')
            elif backend == 'cgpm':
                bdb.execute('CREATE GENERATOR m FOR p using cgpm')
                bdb.backends['cgpm'].set_multiprocess('on')
            else:
                raise ValueError('Backend %s unknown' % (backend,))
            # XXX we may want to downscale this eventually.
            bdb.execute('INITIALIZE 10 MODELS FOR m;')
            bdb.execute('ANALYZE m FOR %d ITERATIONS;' % (iterations,))
            if backend == 'cgpm':
                bdb.backends['cgpm'].set_multiprocess('off')
            yield bdb
def smoke_loom():
    """Yield an in-memory bdb with a 5-column table `t`, population `p`,
    and a loom generator `m` initialized with one model."""
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            # Loom is an optional dependency; skip rather than fail.
            try:
                from bayeslite.backends.loom_backend import LoomBackend
            except ImportError:
                pytest.skip('Failed to import Loom.')
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('CREATE TABLE t (a, b, c, d, e)')
            # Full cross product: 2^4 binary columns x {'x','y'} = 32 rows.
            for a, b, c, d, e in itertools.product(
                    *([range(2)]*4 + [['x', 'y']])):
                # XXX Insert synthetic data generator here.
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c, d, e) VALUES (?, ?, ?, ?, ?)
                ''', (a, b, c, d, e))
            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c, d TO NUMERICAL;
                    SET STATTYPES OF e TO NOMINAL
                )
            ''')
            bdb.execute('CREATE GENERATOR m FOR p using loom;')
            bdb.execute('INITIALIZE 1 MODELS FOR m;')
            yield bdb
def test_nig_normal_latent_numbering():
    """Check variable numbering: population variables get 1, 2 and a
    generator-local latent variable is numbered -1."""
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x * x - 100))
        bdb.execute('''
            create population p for t(
                id ignore;
                set stattypes of x,y to numerical;
            )
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        bdb.execute('create generator g0 for p using nig_normal')
        # g1 adds a latent deviation variable `xe` for x.
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        # The latent xe exists only in g1, numbered -1.
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
def test_loom_guess_schema_nominal():
    """Test to make sure that LoomBackend handles the case where the user
    provides a nominal variable with more than 256 distinct values.  In
    this case, Loom automatically specifies the unbounded_nominal type.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t (v)')
            # Generate 300 random 20-letter words: comfortably more than
            # the 256 distinct values at which Loom switches types.
            vals_to_insert = []
            for _ in xrange(300):
                word = ""
                for _j in xrange(20):
                    letter_index = bdb._prng.weakrandom_uniform(
                        len(string.letters))
                    word += string.letters[letter_index]
                vals_to_insert.append(word)
            # Iterate the values directly instead of indexing by position.
            for val in vals_to_insert:
                bdb.sql_execute('''
                    insert into t (v) values (?)
                ''', (val,))
            bdb.execute('create population p for t (v nominal)')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 1 model for g')
            bdb.execute('analyze g for 50 iterations')
            bdb.execute('drop models from g')
            bdb.execute('drop generator g')
            bdb.execute('drop population p')
            bdb.execute('drop table t')
def cgpm_smoke_bdb():
    """Yield an in-memory bdb with a small 3x3x3 table `t` (with NULL
    holes on odd indices) and population `p`, backed by a CGPM backend
    whose registry maps 'piecewise' to PieceWise.
    """
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        registry = {
            'piecewise': PieceWise,
        }
        bayesdb_register_backend(
            bdb, CGPM_Backend(registry, multiprocess=0))
        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    # NOTE(review): under Python 2, j/(k + 1) is integer
                    # division — presumably intentional; confirm.
                    output = i + j/(k + 1)
                    cat = -1 if (i + j*k) % 2 else +1
                    # Renamed from `input`, which shadowed the builtin.
                    input_value = (i*j - k)**2
                    # Punch NULL holes so missing data is exercised.
                    if i % 2:
                        output = None
                    if j % 2:
                        cat = None
                    if k % 2:
                        input_value = None
                    bdb.sql_execute('''
                        INSERT INTO t (output, cat, input) VALUES (?, ?, ?)
                    ''', (output, cat, input_value))
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                output NUMERICAL;
                input NUMERICAL;
                cat NOMINAL;
            )
        ''')
        yield bdb
def bdb_for_checking_cmi(backend, iterations, seed):
    """Yield an in-memory bdb with a v-structured 3-variable population
    analyzed by `backend` ('loom' or 'cgpm') for `iterations` iterations.

    Generator fixture: the bdb is only valid inside the with-blocks.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:', seed=seed) as bdb:
            bdb.sql_execute('CREATE TABLE t (a, b, c)')
            # 1000 rows sampled from a v-structure, seeded by the bdb's
            # numpy prng for reproducibility.
            for row in generate_v_structured_data(1000, bdb.np_prng):
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c) VALUES (?, ?, ?)
                ''', row)
            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c TO NOMINAL;
                )
            ''')
            if backend == 'loom':
                # Loom is an optional dependency; skip rather than fail.
                try:
                    from bayeslite.backends.loom_backend import LoomBackend
                except ImportError:
                    pytest.skip('Failed to import Loom.')
                bayesdb_register_backend(
                    bdb, LoomBackend(loom_store_path=loom_store_path))
                bdb.execute('CREATE GENERATOR m FOR p using loom')
            elif backend == 'cgpm':
                bdb.execute('CREATE GENERATOR m FOR p using cgpm')
                bdb.backends['cgpm'].set_multiprocess('on')
            else:
                raise ValueError('Backend %s unknown' % (backend, ))
            # XXX we may want to downscale this eventually.
            bdb.execute('INITIALIZE 10 MODELS FOR m;')
            bdb.execute('ANALYZE m FOR %d ITERATIONS;' % (iterations, ))
            if backend == 'cgpm':
                bdb.backends['cgpm'].set_multiprocess('off')
            yield bdb
def test_nig_normal_latent_numbering():
    """Check variable numbering: population variables get 1, 2 and a
    generator-local latent variable is numbered -1."""
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x*x - 100))
        bdb.execute('''
            create population p for t(
                id ignore;
                set stattypes of x,y to numerical;
            )
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        bdb.execute('create generator g0 for p using nig_normal')
        # g1 adds a latent deviation variable `xe` for x.
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        # The latent xe exists only in g1, numbered -1.
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
def smoke_loom():
    """Yield an in-memory bdb with a 5-column table `t`, population `p`,
    and a loom generator `m` initialized with one model."""
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            # Loom is an optional dependency; skip rather than fail.
            try:
                from bayeslite.backends.loom_backend import LoomBackend
            except ImportError:
                pytest.skip('Failed to import Loom.')
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('CREATE TABLE t (a, b, c, d, e)')
            # Full cross product: 2^4 binary columns x {'x','y'} = 32 rows.
            for a, b, c, d, e in itertools.product(
                    *([range(2)] * 4 + [['x', 'y']])):
                # XXX Insert synthetic data generator here.
                bdb.sql_execute('''
                    INSERT INTO t (a, b, c, d, e) VALUES (?, ?, ?, ?, ?)
                ''', (a, b, c, d, e))
            bdb.execute('''
                CREATE POPULATION p FOR t WITH SCHEMA (
                    SET STATTYPES OF a, b, c, d TO NUMERICAL;
                    SET STATTYPES OF e TO NOMINAL
                )
            ''')
            bdb.execute('CREATE GENERATOR m FOR p using loom;')
            bdb.execute('INITIALIZE 1 MODELS FOR m;')
            yield bdb
def test_hackbackend():
    """Registering, deregistering, and re-registering a custom backend:
    generator creation must fail for unregistered backends and for
    duplicate generator names, and succeed for the registered dotdog."""
    # Use a with-block so the handle is closed even if an assertion fails
    # (the original opened the bdb and never closed it).
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
        bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
        bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
        bdb.execute('CREATE POPULATION p FOR t(b IGNORE; a NUMERICAL)')
        # No backends registered yet: both must fail.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog;')
        # Register, deregister, and re-register dotdog.
        dotdog_backend = DotdogBackend()
        bayeslite.bayesdb_register_backend(bdb, dotdog_backend)
        bayeslite.bayesdb_deregister_backend(bdb, dotdog_backend)
        bayeslite.bayesdb_register_backend(bdb, dotdog_backend)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
        bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        # Duplicate generator name must fail.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_cc FOR p USING cgpm;')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR p_dd FOR p USING dotdog(a NUMERICAL)')
        # XXX Rest of test originally exercised default backend, but
        # syntax doesn't support that now.  Not clear that's wrong either.
        bdb.execute('CREATE GENERATOR q_dd FOR p USING dotdog(a NUMERICAL)')
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('CREATE GENERATOR q_dd FOR p USING dotdog(a NUMERICAL)')
def test_subsample():
    """SUBSAMPLE: a subsampled generator incorporates only 100 rows, yet
    queries referencing rows outside the subsample still work."""
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        # Two populations over the same table: one full, one subsampled.
        bayesdb_guess_population(bdb, 'hospitals_full', 'dha',
            overrides=[('name', 'key')])
        bayesdb_guess_population(bdb, 'hospitals_sub', 'dha',
            overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        # Queries mixing in-subsample and out-of-subsample rowids
        # (1 and 101) must all succeed.
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF N_DEATH_ILL
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101)
                AND (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
            ORDER BY cgpm_rowid ASC
            LIMIT 100
        '''
        # The full generator maps the first 100 table rows in order...
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        # ...while the subsampled one maps some other subset.
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
def bayesdb(backend=None, **kwargs):
    """Yield a fresh bdb with `backend` registered, closing it afterwards.

    Defaults to a single-process CGPM_Backend with an empty registry.
    Extra keyword arguments are forwarded to bayesdb_open.
    """
    if backend is None:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
    bdb = bayeslite.bayesdb_open(builtin_backends=False, **kwargs)
    bayeslite.bayesdb_register_backend(bdb, backend)
    try:
        yield bdb
    finally:
        # Always close, even if the consuming test raises.
        bdb.close()
def test_population_two_generators():
    """A single population can host two loom generators simultaneously."""
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            loom = LoomBackend(loom_store_path=loom_store_path)
            bayesdb_register_backend(bdb, loom)
            bdb.sql_execute('create table t (x)')
            for value in xrange(10):
                bdb.sql_execute('insert into t (x) values (?)', (value,))
            bdb.execute('create population p for t (x numerical)')
            # Both generators attach to the same population p.
            bdb.execute('create generator g0 for p using loom')
            bdb.execute('create generator g1 for p using loom')
def test_stattypes():
    """Test of the LoomBackend on a table with all possible data types.

    Only checks for errors from Loom.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t (u, co, b, ca, cy, nu, no)')
            # 10 rows of pseudo-random values, one column per stattype.
            for _x in xrange(10):
                cat_dict = ['a', 'b', 'c']
                bdb.sql_execute('''
                    insert into t (u, co, b, ca, cy, nu, no)
                    values (?, ?, ?, ?, ?, ?, ?)''',
                    (cat_dict[bdb._prng.weakrandom_uniform(3)],
                        bdb._prng.weakrandom_uniform(200),
                        bdb._prng.weakrandom_uniform(2),
                        cat_dict[bdb._prng.weakrandom_uniform(3)],
                        bdb._prng.weakrandom_uniform(1000) / 4.0,
                        bdb._prng.weakrandom_uniform(1000) / 4.0 - 100.0,
                        bdb._prng.weakrandom_uniform(1000) / 4.0))
            bdb.execute('''create population p for t(
                u unbounded_nominal;
                co counts;
                b boolean;
                ca nominal;
                cy cyclic;
                nu numerical;
                no nominal)
            ''')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 1 model for g')
            bdb.execute('analyze g for 50 iterations')
            bdb.execute('''estimate probability density of
                (co=2, nu=50, u='a') by p''').fetchall()
            bdb.execute('''estimate probability density of
                (nu = 50, u='a') given (co=2) by p''').fetchall()
            with pytest.raises(Exception):
                # There seems to be an issue with encoding boolean variables
                # in LoomBackend.simulate_joint, although using b=1 in the
                # condition for simulate results in no error.
                bdb.execute('''estimate probability density of
                    (b=0) by p''').fetchall()
            bdb.execute('''simulate u, co, b, ca, cy, nu, no from p
                limit 1''').fetchall()
            bdb.execute('''simulate u, b, ca, no from p
                given nu=3, co=2, b=1 limit 1''').fetchall()
            bdb.execute('drop models from g')
            bdb.execute('drop generator g')
            bdb.execute('drop population p')
            bdb.execute('drop table t')
def cgpm_dummy_satellites_pop_bdb():
    """Yield the dummy satellites bdb with population `satellites`
    created and a single-process CGPM backend registered."""
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            create population satellites for satellites_ucs with schema(
                apogee numerical;
                class_of_orbit nominal;
                country_of_operator nominal;
                launch_mass numerical;
                perigee numerical;
                period numerical
            )
        ''')
        backend = CGPM_Backend(dict(), multiprocess=0)
        bayesdb_register_backend(bdb, backend)
        yield bdb
def test_nig_normal_smoke():
    """End-to-end smoke test of NIGNormalBackend: build, analyze, query
    density and simulation, then drop everything."""
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for value in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (value,))
        # Build the population/generator and run a short analysis.
        for statement in (
                'create population p for t(x numerical)',
                'create generator g for p using nig_normal',
                'initialize 1 model for g',
                'analyze g for 1 iteration'):
            bdb.execute(statement)
        bdb.execute('estimate probability density of x = 50 from p').fetchall()
        bdb.execute('simulate x from p limit 1').fetchall()
        # Tear down in dependency order.
        for statement in (
                'drop models from g',
                'drop generator g',
                'drop population p',
                'drop table t'):
            bdb.execute(statement)
def test_nig_normal_smoke():
    """End-to-end smoke test of NIGNormalBackend: create population and
    generator, analyze, query density and simulate, then drop all."""
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (x,))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g for p using nig_normal')
        bdb.execute('initialize 1 model for g')
        bdb.execute('analyze g for 1 iteration')
        bdb.execute('estimate probability density of x = 50 from p').fetchall()
        bdb.execute('simulate x from p limit 1').fetchall()
        bdb.execute('drop models from g')
        bdb.execute('drop generator g')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_stattypes():
    """Test of the LoomBackend on a table with all possible data types.

    Only checks for errors from Loom.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t (u, co, b, ca, cy, nu, no)')
            # 10 rows of pseudo-random values, one column per stattype.
            for _x in xrange(10):
                cat_dict = ['a', 'b', 'c']
                bdb.sql_execute('''
                    insert into t (u, co, b, ca, cy, nu, no)
                    values (?, ?, ?, ?, ?, ?, ?)''',
                    (cat_dict[bdb._prng.weakrandom_uniform(3)],
                        bdb._prng.weakrandom_uniform(200),
                        bdb._prng.weakrandom_uniform(2),
                        cat_dict[bdb._prng.weakrandom_uniform(3)],
                        bdb._prng.weakrandom_uniform(1000) / 4.0,
                        bdb._prng.weakrandom_uniform(1000) / 4.0 - 100.0,
                        bdb._prng.weakrandom_uniform(1000) / 4.0))
            bdb.execute('''create population p for t(
                u unbounded_nominal;
                co counts;
                b boolean;
                ca nominal;
                cy cyclic;
                nu numerical;
                no nominal)
            ''')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 1 model for g')
            bdb.execute('analyze g for 50 iterations')
            bdb.execute('''estimate probability density of
                nu = 50, u='a' from p''').fetchall()
            bdb.execute('''simulate u, co, b, ca, cy, nu, no from p
                limit 1''').fetchall()
            bdb.execute('drop models from g')
            bdb.execute('drop generator g')
            bdb.execute('drop population p')
            bdb.execute('drop table t')
def register(self, line, cell=None):
    """Magic command: register a backend on the session's bdb.

    Usage: ``%register loom <path>``.  Only the loom backend is
    currently supported; anything else raises ValueError.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('backend', help='Name of backend to register.')
    parser.add_argument('args', type=str, default=[], nargs='*',
        help='List of arguments to provide the initialization.')
    args = parser.parse_args(shlex.split(line))
    if args.backend == 'loom':
        try:
            from bayeslite.backends.loom_backend import LoomBackend
        except ImportError:
            raise ValueError('Failed to import loom backend.')
        # Loom requires a store path as its single positional argument.
        if len(args.args) == 0:
            raise ValueError('Specify <path> for loom.')
        loom_store_path = args.args[0]
        bayesdb_register_backend(
            self._bdb, LoomBackend(loom_store_path=loom_store_path))
    else:
        raise ValueError('Unknown backend: %s' % (args.backend,))
def get_bdb(cfg, logger):
    """Open cfg.bdb_file and return the bdb.

    For the loom backend, also re-point the stored loom_store_path and
    run one warm-up ANALYZE iteration on cfg.population_name.
    """
    logger.info("Using bdb file: {}".format(cfg.bdb_file))
    bdb = bayeslite.bayesdb_open(pathname=cfg.bdb_file)
    if cfg.backend == 'loom':
        bayesdb_register_backend(bdb, get_backend_object(cfg))
        # These are hacks that are necessary because bayeslite currently
        # assumes that `.bdb` file creation and querying will happen in the
        # same Python process.
        logger.info(
            'Backend is set to {}. Manually setting loom_store_path to {}'.
            format(cfg.backend, cfg.loom_path))
        bdb.sql_execute(
            'UPDATE bayesdb_loom_generator SET loom_store_path = ?',
            (cfg.loom_path,))
        # NOTE(review): the warm-up ANALYZE appears to belong to the loom
        # branch (its log message refers to the backend) — confirm.
        logger.info('Backend is set to {}. Analyzing for 1 iterations.'.format(
            cfg.backend))
        bdb.execute('ANALYZE {} FOR 1 ITERATIONS;'.format(cfg.population_name))
    logger.info("Backend registered")
    return bdb
def test_loom_one_numeric():
    """Simple test of the LoomBackend on a one variable table.

    Only checks for errors from the Loom system.
    """
    # Removed an unused `from datetime import datetime` left in the
    # original body.
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t(x)')
            for x in xrange(10):
                bdb.sql_execute('insert into t (x) values (?)', (x,))
            bdb.execute('create population p for t (x numerical)')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 1 models for g')
            bdb.execute('analyze g for 10 iterations')
            bdb.execute('''
                estimate probability density of x = 50 from p
            ''').fetchall()
            bdb.execute('simulate x from p limit 1').fetchall()
            bdb.execute('drop models from g')
            bdb.execute('drop generator g')
            bdb.execute('drop population p')
            bdb.execute('drop table t')
def _retest_example(bdb, exname):
    """Re-check example `exname` against a bdb that already holds its
    table, population, generator, and models 0 and 1; then re-analyze."""
    (be, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qg = bql_quote_name(g)
    backend = be()
    bayesdb_register_backend = bayeslite.bayesdb_register_backend
    bayesdb_register_backend(bdb, backend)
    p_id = core.bayesdb_get_population(bdb, p)
    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)
    bdb.execute('ANALYZE %s FOR 1 ITERATION' % (qg,))
    try:
        # Test analyzing models.
        bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION' % (qg,))
        bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION' % (qg,))
    except bayeslite.BQLError, e:
        # loom does not allow model numbers to be specified in analyze models
        assert exname == 'loom'
def _retest_example(bdb, exname): (mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1, cleanup) = examples[exname] qg = bql_quote_name(g) metamodel = mm() bayeslite.bayesdb_register_backend(bdb, mm()) p_id = core.bayesdb_get_population(bdb, p) assert core.bayesdb_has_table(bdb, t) assert core.bayesdb_has_generator(bdb, p_id, g) gid = core.bayesdb_get_generator(bdb, p_id, g) assert core.bayesdb_generator_has_model(bdb, gid, 0) assert core.bayesdb_generator_has_model(bdb, gid, 1) bdb.execute('ANALYZE %s FOR 1 ITERATION' % (qg, )) try: # Test analyzing models. bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION' % (qg, )) bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION' % (qg, )) except bayeslite.BQLError, e: # loom does not allow model numbers to be specified in analyze models assert exname == 'loom'
def bayesdb(self, line, cell=None):
    """Magic command: open (or reopen) a bdb file and register a CGPM
    backend with a custom cgpm registry.

    Flags: -s <int> seed, -j enable multiprocessing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path', help='Path of bdb file.')
    parser.add_argument('-s', type=int, default=0, help='Seed.')
    parser.add_argument('-j', action='store_true', help='Multiprocessing.')
    args = parser.parse_args(line.split())
    # Close any previously opened bdb before replacing it.
    if self._bdb is not None:
        self._bdb.close()
        self._bdb = None
    self._path = args.path
    # 32-byte little-endian seed with the user seed in the last word.
    seed = struct.pack('<QQQQ', 0, 0, 0, args.s)
    self._bdb = bayesdb_open(pathname=args.path, seed=seed,
        builtin_backends=False)
    # Small hack for the VsCGpm, which takes in the venturescript source
    # from %venturescript cells!
    def _VsCGpm(outputs, inputs, rng, *args, **kwds):
        if 'source' not in kwds:
            kwds['source'] = '\n'.join(self._venturescript)
        return VsCGpm(outputs, inputs, rng, *args, **kwds)
    # Register cgpm backend.
    cgpm_registry = {
        'factor_analysis': FactorAnalysis,
        'inline_venturescript': InlineVsCGpm,
        'linear_regression': LinearRegression,
        'multivariate_kde': MultivariateKde,
        'multivariate_knn': MultivariateKnn,
        'ordinary_least_squares': OrdinaryLeastSquares,
        'random_forest': RandomForest,
        'venturescript': _VsCGpm,
    }
    mm = CGPM_Backend(cgpm_registry, multiprocess=args.j)
    bayesdb_register_backend(self._bdb, mm)
    return 'Loaded: %s' % (self._path)
def test_bad_analyze_vars():
    """ANALYZE with an unknown variable in VARIABLES or SKIP must raise
    BQLError; well-formed ANALYZE must succeed."""
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        ''')
        registry = {
            'kepler': Kepler,
            'linreg': LinearRegression,
        }
        bayesdb_register_backend(bdb, CGPM_Backend(registry))
        bdb.execute('''
            CREATE GENERATOR satellites_cgpm FOR satellites USING cgpm
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_cgpm')
        # Both empty-clause and no-clause forms are legal.
        bdb.execute('ANALYZE satellites_cgpm FOR 1 ITERATION ()')
        bdb.execute('ANALYZE satellites_cgpm FOR 1 ITERATION')
        with pytest.raises(BQLError):
            # Unknown variable `perige'.
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION (
                    VARIABLES period, perige
                )
            ''')
        with pytest.raises(BQLError):
            # Unknown variable `perige'.
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION (
                    SKIP period, perige
                )
            ''')
def run(stdin, stdout, stderr, argv):
    """Entry point for the interactive bayeslite shell.

    Returns a process exit code: 1 on argument errors, otherwise 0.
    """
    args = parse_args(argv[1:])
    progname = argv[0]
    # Strip the directory part of argv[0] for error messages.
    # (rfind returns -1 when there is no slash; then [slash+1:] == [0:]
    # leaves progname unchanged.)
    slash = progname.rfind('/')
    if slash:
        progname = progname[slash + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname,))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname,))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
        builtin_backends=False)
    # -j 1 means single-process; any other job count enables
    # multiprocessing in the CGPM backend.
    multiprocess = args.jobs != 1
    backend = CGPM_Backend(cgpm_registry={}, multiprocess=multiprocess)
    bayeslite.bayesdb_register_backend(bdb, backend)
    bdbshell = shell.Shell(bdb, 'cgpm', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.join(os.path.expanduser('~/.bayesliterc'))
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)
        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    bdbshell.stdout.write('%s is not a file. Aborting.\n' %
                        (str(path),))
                    break
        if not args.batch:
            bdbshell.cmdloop()
    return 0
def test_nig_normal_latent_conditional_smoke():
    """Density queries involving a latent variable: only the generator
    that defines the latent (g1) accepts them; the population and g0
    reject them with BQLError."""
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (x,))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        # g1 defines the latent deviation variable xe.
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')
        # observed given observed
        bdb.execute('''
            estimate probability density of x = 50 given (x = 50)
            within p
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (x = 50)
            within p modeled by g0
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (x = 50)
            within p modeled by g1
        ''').fetchall()
        # observed given latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 50)
                within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 50)
                within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (xe = 50)
            within p modeled by g1
        ''').fetchall()
        # latent given observed
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 50 given (x = 50)
                within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 50 given (x = 50)
                within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 50 given (x = 50)
            within p modeled by g1
        ''').fetchall()
        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def register_loom(bdb):
    """Register a LoomBackend, backed by a fresh temporary store, on bdb."""
    store = temp_file_path('.bdb')
    bayeslite.bayesdb_register_backend(bdb, LoomBackend(loom_store_path=store))
def test_nig_normal_latent_2var_smoke():
    """Pairwise-variable estimates and simulation with a latent variable:
    g1's latent xe enlarges the pairwise grid (2x2 -> 3x3) and can only
    be simulated through g1."""
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x * x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')
        # CORRELATION, CORRELATION PVALUE, without generators.
        assert 4 == len(bdb.execute('''
            estimate correlation, correlation pvalue
            from pairwise variables of p
        ''').fetchall())
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')
        # CORRELATION, CORRELATION PVALUE, with generators.
        assert 4 == len(bdb.execute('''
            estimate correlation, correlation pvalue
            from pairwise variables of p
        ''').fetchall())
        assert 4 == len(bdb.execute('''
            estimate correlation, correlation pvalue
            from pairwise variables of p modeled by g0
        ''').fetchall())
        with pytest.raises(BQLError):
            # g1 has a latent variable xe.
            assert 4 == len(bdb.execute('''
                estimate correlation, correlation pvalue
                from pairwise variables of p modeled by g1
            ''').fetchall())
        # DEPENDENCE PROBABILITY, MUTUAL INFORMATION
        assert 4 == len(bdb.execute('''
            estimate dependence probability, mutual information
            from pairwise variables of p
        ''').fetchall())
        assert 4 == len(bdb.execute('''
            estimate dependence probability, mutual information
            from pairwise variables of p modeled by g0
        ''').fetchall())
        # 3x3 = 9 pairs once g1's latent xe is included.
        assert 9 == len(bdb.execute('''
            estimate dependence probability, mutual information
            from pairwise variables of p modeled by g1
        ''').fetchall())
        # SIMULATE LATENT VARIABLE
        assert 10 == len(bdb.execute('''
            simulate xe from p modeled by g1 limit 10;
        ''').fetchall())
        assert 10 == len(bdb.execute('''
            simulate y, xe from p modeled by g1 limit 10;
        ''').fetchall())
        # Cannot simulate the latent xe from the population p.
        with pytest.raises(BQLError):
            assert 10 == len(bdb.execute('''
                simulate xe from p limit 10;
            ''').fetchall())
        # Cannot simulate the latent xe from the generator g0.
        with pytest.raises(BQLError):
            assert 10 == len(bdb.execute('''
                simulate xe from p modeled by g0 limit 10;
            ''').fetchall())
        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def _test_example(bdb, exname):
    """Exercise the full lifecycle of example `exname`: table creation,
    data load, population, generator (with savepoint rollback semantics),
    model initialization, and dropping of tables/generators/models."""
    (be, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)
    backend = be()
    bayeslite.bayesdb_register_backend(bdb, backend)
    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    # Rollback undid the creation; re-create for real.
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)
    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == n
    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)
    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id+1, g)
    # Duplicate generator creation must fail.
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg,))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
        assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg,))
    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt,))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt,))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)
    # Test dropping models.
    with bdb.savepoint_rollback():
        try:
            bdb.execute('DROP MODEL 1 FROM %s' % (qg,))
            assert core.bayesdb_generator_has_model(bdb, gid, 0)
            assert not core.bayesdb_generator_has_model(bdb, gid, 1)
            assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
        except bayeslite.BQLError, e:
            # loom does not allow model numbers to be specified in drop models
            assert exname == 'loom'
def _test_example(bdb, exname):
    # Exercise the generic metamodel lifecycle for the example `exname`:
    # table creation, data insertion, population/generator creation,
    # model initialization, and dropping, checking catalog state after
    # each step and after savepoint rollbacks.
    #
    # NOTE(review): this redefines `_test_example` declared earlier in
    # this file; at import time this later definition shadows the first.
    # Confirm which copy is intended and delete the other.
    # NOTE(review): `cleanup` is unpacked but never used here.
    (mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    metamodel = mm()
    bayeslite.bayesdb_register_backend(bdb, metamodel)

    # Create a table.  Creation inside a rolled-back savepoint must not
    # persist.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == n

    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)

    # Create a generator.  Make sure savepoints work for this, and that
    # both known-bad generator schemas are rejected without side effects.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id + 1, g)
    # Creating the same generator twice must fail and leave it intact.
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)

    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg, ))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg, ))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
        # NOTE: relies on Python 2 where range() returns a list.
        assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg, ))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt, ))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt, ))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        try:
            bdb.execute('DROP MODEL 1 FROM %s' % (qg, ))
            assert core.bayesdb_generator_has_model(bdb, gid, 0)
            assert not core.bayesdb_generator_has_model(bdb, gid, 1)
            assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
        # NOTE: Python 2-only except syntax; `e` is unused.
        except bayeslite.BQLError, e:
            # loom does not allow model numbers to be specified in drop models
            assert exname == 'loom'
def test_nig_normal_latent_2var_conditional_smoke():
    # Smoke-test conditional density and simulation queries involving the
    # latent variable `xe`, which exists only in generator g1: queries on
    # g1 succeed, while population-wide queries and queries on g0 (which
    # has no latent) raise BQLError.
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x*x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')

        # CORRELATION, CORRELATION PVALUE, without generators.
        # 2 variables -> 4 ordered pairs.
        assert 4 == len(bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''').fetchall())

        bdb.execute('create generator g0 for p using nig_normal')
        # g1 exposes the latent variable xe.
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')

        # observed given other observed
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49) within p
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49) within p
                modeled by g0
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49) within p
                modeled by g1
        ''').fetchall()
        bdb.execute('simulate x from p given y = 49 limit 1').fetchall()
        bdb.execute('''
            simulate x from p modeled by g0 given y = 49 limit 1
        ''').fetchall()
        bdb.execute('''
            simulate x from p modeled by g1 given y = 49 limit 1
        ''').fetchall()

        # observed given related latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 1)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 1)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (xe = 1)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate x from p given xe = 1 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate x from p modeled by g0 given xe = 1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate x from p modeled by g1 given xe = 1 limit 1
        ''').fetchall()

        # observed given unrelated latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of y = 50 given (xe = 1)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of y = 50 given (xe = 1)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of y = 50 given (xe = 1)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate y from p given xe = 1 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate y from p modeled by g0 given xe = 1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate y from p modeled by g1 given xe = 1 limit 1
        ''').fetchall()

        # latent given related observed
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (x = 50)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (x = 50)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (x = 50)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate xe from p given x = 50 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modeled by g0 given x = 50 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modeled by g1 given x = 50 limit 1
        ''').fetchall()

        # latent given unrelated observed
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (y = 50)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (y = 50)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (y = 50)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate xe from p given y = 50 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modeled by g0 given y = 50 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modeled by g1 given y = 50 limit 1
        ''').fetchall()

        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_nig_normal_latent_2var2lat_conditional_smoke():
    # Smoke-test queries relating the two latent variables xe and ye,
    # which exist only in generator g1: density/simulate/dependence/
    # mutual-information queries succeed on g1 and raise BQLError on the
    # population or on g0 (which has no latents).
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x*x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        # g1 exposes latents xe and ye.
        bdb.execute('''
            create generator g1 for p using nig_normal(
                xe deviation(x),
                ye deviation(y)
            )
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')

        # latent given latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (ye = -1)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (ye = -1)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (ye = -1)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p given ye = -1 limit 1
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modeled by g0 given ye = -1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modeled by g1 given ye = -1 limit 1
        ''').fetchall()

        # Dependence probability between latents.
        with pytest.raises(BQLError):
            bdb.execute(
                'estimate dependence probability of xe with ye within p')
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate dependence probability of xe with ye within p
                    modeled by g0
            ''')
        bdb.execute('''
            estimate dependence probability of xe with ye within p
                modeled by g1
        ''')

        # Mutual information between latents.
        with pytest.raises(BQLError):
            bdb.execute(
                'estimate mutual information of xe with ye within p')
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate mutual information of xe with ye within p
                    modeled by g0
            ''')
        bdb.execute('''
            estimate mutual information of xe with ye within p
                modeled by g1
        ''')

        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_initialize_with_all_nulls():
    # This test ensures that trying to initialize a generator with any
    # (manifest) column of all null variables will crash.
    # Initializing an overriden column with all null variables should not
    # be a problem in general, so we test this case as well.
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        registry = {
            'barebones': BareBonesCGpm,
        }
        bayesdb_register_backend(
            bdb, CGPM_Backend(registry, multiprocess=0))
        # Create table with all missing values for a.
        bdb.sql_execute('''
            CREATE TABLE t (a REAL, b REAL, c REAL);
        ''')
        bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', (None, None, 3))
        bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', (None, None, 1))
        bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', (None, None, 1))
        bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', (None, -2, 1))
        bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', (None, -5, 1))
        bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', (None, 2, 3))
        # Fail when a is numerical and modeled by crosscat.
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                SET STATTYPES OF a, b, c TO NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR m FOR p;
        ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                INITIALIZE 2 MODELS FOR m;
            ''')
        # Fail when a is nominal and modeled by crosscat.
        bdb.execute('''
            CREATE POPULATION p2 FOR t WITH SCHEMA(
                SET STATTYPES OF a TO NOMINAL;
                SET STATTYPES OF b, c TO NUMERICAL
            )
        ''')
        bdb.execute('CREATE GENERATOR m2 FOR p2;')
        with pytest.raises(BQLError):
            bdb.execute('INITIALIZE 2 MODELS FOR m2;')
        # Succeed when a is ignored.
        bdb.execute('''
            CREATE POPULATION p3 FOR t WITH SCHEMA(
                IGNORE a;
                SET STATTYPES OF b, c TO NUMERICAL
            )
        ''')
        bdb.execute('CREATE GENERATOR m3 FOR p3;')
        bdb.execute('INITIALIZE 2 MODELS FOR m3;')
        # Succeed when a is numerical overriden using a dummy CGPM.
        bdb.execute('''
            CREATE GENERATOR m4 FOR p(
                OVERRIDE MODEL FOR a GIVEN b USING barebones
            )
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m4;')
        bdb.execute('ANALYZE m4 FOR 1 ITERATION')
def test_output_stattypes():
    # Check stattype handling for latent/exposed outputs of an overriding
    # CGPM (factor analysis): nominal manifest or latent outputs fail at
    # INITIALIZE with ValueError, duplicate latent declarations fail with
    # BQLError, and all-numerical setups succeed and support dependence
    # probability and mutual information over the latents.
    with cgpm_dummy_satellites_bdb() as bdb:
        # Missing policy for class_of_orbit, perigee, period
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs
                WITH SCHEMA(
                    SET STATTYPES OF apogee, launch_mass TO NUMERICAL;
                    SET STATTYPES OF country_of_operator TO NOMINAL
                )
            ''')
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                IGNORE class_of_orbit, perigee, period;
                SET STATTYPES OF apogee, launch_mass TO NUMERICAL;
                SET STATTYPES OF country_of_operator TO NOMINAL
            )
        ''')
        registry = {
            'factor_analysis': FactorAnalysis,
        }
        bayesdb_register_backend(bdb, CGPM_Backend(registry))
        # Creating factor analysis with nominal manifest should crash.
        bdb.execute('''
            CREATE GENERATOR satellites_g0 FOR satellites(
                OVERRIDE MODEL FOR apogee, country_of_operator
                AND EXPOSE pc_1 NUMERICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g0')
        with pytest.raises(BQLError):
            # Duplicate pc_2 in LATENT and EXPOSE.
            bdb.execute('''
                CREATE GENERATOR satellites_g1 FOR satellites(
                    LATENT pc_2 NOMINAL,
                    OVERRIDE GENERATIVE MODEL FOR apogee, launch_mass
                    AND EXPOSE pc_2 NOMINAL
                    USING factor_analysis(L=1)
                )
            ''')
        # Creating factor analysis with nominal latent should crash.
        bdb.execute('''
            CREATE GENERATOR satellites_g1 FOR satellites(
                OVERRIDE GENERATIVE MODEL FOR apogee, launch_mass
                AND EXPOSE pc_2 NOMINAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g1')
        # Creating factor analysis with all numerical should be ok.
        bdb.execute('''
            CREATE GENERATOR satellites_g2 FOR satellites USING cgpm(
                LATENT pc_3 NUMERICAL;
                OVERRIDE MODEL FOR apogee, launch_mass, pc_3, pc_4
                    USING factor_analysis(L=2);
                LATENT pc_4 NUMERICAL
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_g2')
        bdb.execute('ANALYZE satellites_g2 FOR 2 ITERATION')
        # Cannot transition baseline and foreign using timed analysis.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_g2 FOR 2 SECONDS (
                    VARIABLES country_of_operator, apogee, launch_mass,
                        pc_3);
            ''')
        bdb.execute('''
            ANALYZE satellites_g2 FOR 1 ITERATION (
                VARIABLES apogee, launch_mass);
        ''')
        # Dependence probability of manifest with latent.
        cursor = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF apogee WITH pc_3
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        assert cursor[0][0] == 1.
        # Dependence probability of latent with latent.
        cursor = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF pc_3 WITH pc_4
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        assert cursor[0][0] == 1.
        # Mutual information of latent with manifest.
        cursor = bdb.execute('''
            ESTIMATE MUTUAL INFORMATION OF apogee WITH pc_4
            USING 1 SAMPLES BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        # Mutual information of latent with latent.
        cursor = bdb.execute('''
            ESTIMATE MUTUAL INFORMATION OF pc_3 WITH pc_4
            USING 1 SAMPLES BY satellites MODELED BY satellites_g2;
        ''').fetchall()
def test_cgpm_extravaganza__ci_slow():
    # End-to-end CGPM exercise: synthetic satellites data, a generator
    # mixing venturescript/linreg/forest override models with latent
    # variables, analysis subprogram variants, and a battery of BQL
    # estimate/infer/simulate queries.
    #
    # NOTE(review): an identical `test_cgpm_extravaganza__ci_slow` is
    # defined again later in this file; the later definition shadows this
    # one at import time — confirm which copy is intended.
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        # NOTE(review): unreachable — pytest.skip raises.
        return
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        # Two orbit classes with different deterministic period functions.
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[
                        bdb._np_prng.randint(0, len(countries))]
                    name = 'sat-%s-%d' % (
                        country, bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute('''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')
        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)
        # Misspelled variable name in category model must be rejected.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        # Misspelled variable name in override inputs must be rejected.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        # A latent may not shadow a manifest variable.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;
                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;
                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                    GIVEN apogee, perigee
                    USING venturescript (source = "{}");
                OVERRIDE MODEL FOR perigee GIVEN apogee USING linreg;
                OVERRIDE MODEL FOR class_of_orbit
                    GIVEN apogee, period, perigee, kepler_noise
                    USING forest (k = 4);
                SUBSAMPLE 100,
            )
        '''.format(kepler_source))
        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        # Latents get negative variable numbers.
        assert core.bayesdb_variable_numbers(
                bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]
        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period
                WITHIN satellites MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_cluster_id
                CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise
                CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()
        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
def test_regress_bonanza__ci_integration():
    """Exercise REGRESS over numerical, nominal, *, and subquery selectors.

    Verifies the returned (variable, coefficient) rows cover exactly the
    numerical variables and dummy-coded nominal variables requested, and
    that invalid selections raise.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL;
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR m FOR satellites;
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            # Each result row is (variable, coefficient); every variable
            # must be unique and be either a numerical or a dummy-coded
            # level ("<nominal>_dum_<k>") of a nominal.
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (country_of_operator) USING 12 SAMPLES
            BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept'], ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.
        # Bug fix: the result was previously discarded, so the stale
        # `results` from the prior query was being re-checked.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee, country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept', 'perigee'], ['country_of_operator'])

        # Regression on all variables.
        # Bug fix: this query has no `?` placeholder, so no bindings may
        # be supplied (the stray (3,) belonged to the subquery case below).
        results = bdb.execute('''
            REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results,
            ['intercept', 'perigee', 'launch_mass', 'period',],
            ['country_of_operator', 'class_of_orbit',],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute('''
            REGRESS apogee GIVEN (
                satellites.(
                    ESTIMATE * FROM VARIABLES OF satellites
                    ORDER BY dependence probability with apogee DESC
                    LIMIT ?
                )
            )
            USING 12 SAMPLES BY satellites MODELED BY m USING MODEL 1;
        ''', (3,)).fetchall()
        # Recompute the selector's variables to know what to expect.
        cursor = bdb.execute('''
            ESTIMATE * FROM VARIABLES OF satellites
            ORDER BY dependence probability with apogee DESC
            LIMIT ?
        ''', (3,)).fetchall()
        top_variables = [c[0] for c in cursor]
        nominals = [
            var for var in top_variables
            if var in ['country_of_operator', 'class_of_orbit',]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(
            results, numericals + ['intercept'], nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                REGRESS apogee GIVEN (*, class_of_orbit) USING 1 SAMPLES
                BY satellites;
            ''').fetchall()

        # Not enough data for regression, 1 unique nominal variable.
        with pytest.raises(ValueError):
            bdb.execute('''
                REGRESS apogee GIVEN (class_of_orbit) USING 1 SAMPLES
                BY satellites;
            ''').fetchall()
def test_cgpm_extravaganza__ci_slow():
    # End-to-end CGPM exercise: synthetic satellites data, a generator
    # mixing venturescript/linreg/forest override models with latent
    # variables, analysis subprogram variants, and a battery of BQL
    # estimate/infer/simulate queries.
    #
    # NOTE(review): this is a near-verbatim duplicate of the earlier
    # `test_cgpm_extravaganza__ci_slow` in this file; this later
    # definition shadows the earlier one at import time — confirm which
    # copy is intended and delete the other.
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        # NOTE(review): unreachable — pytest.skip raises.
        return
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        # Two orbit classes with different deterministic period functions.
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (country,
                        bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute(
                        '''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')
        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)
        # Misspelled variable name in category model must be rejected.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        # Misspelled variable name in override inputs must be rejected.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        # A latent may not shadow a manifest variable.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;
                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;
                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                    GIVEN apogee, perigee
                    USING venturescript (source = "{}");
                OVERRIDE MODEL FOR perigee GIVEN apogee USING linreg;
                OVERRIDE MODEL FOR class_of_orbit
                    GIVEN apogee, period, perigee, kepler_noise
                    USING forest (k = 4);
                SUBSAMPLE 100,
            )
        '''.format(kepler_source))
        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        # Latents get negative variable numbers.
        assert core.bayesdb_variable_numbers(
                bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]
        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period
                WITHIN satellites MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_cluster_id
                CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise
                CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()
        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
def test_predictive_relevance():
    # Exercise ESTIMATE PREDICTIVE RELEVANCE: self-similarity, EXISTING
    # and HYPOTHETICAL row clauses, error cases (no matching rows,
    # unknown context variable, unknown nominal value), and behavior for
    # rows inserted after analysis (not yet incorporated).
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(bdb, CGPM_Backend(cgpm_registry=dict()))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA (
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR satellites;')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')
        bdb.execute('ANALYZE m FOR 25 ITERATION (OPTIMIZED);')

        # Check self-similarites, and also provide coverage of bindings.
        # NOTE(review): the loop variable `rowid` is unused and the
        # bindings are hardcoded to (1, 1) — presumably (rowid, rowid)
        # was intended; confirm.
        rowids = bdb.execute('SELECT OID from satellites_ucs;').fetchall()
        for rowid in rowids[:4]:
            cursor = bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                TO EXISTING ROWS (rowid = ?)
                IN THE CONTEXT OF "period"
                FROM satellites
                WHERE rowid = ?
            ''', (1, 1,))
            assert next(cursor)[0] == 1.

        # A full extravaganza query, using FROM (as a 1-row).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
            TO EXISTING ROWS (country_of_operator = 'Russia'
                AND period < 0)
            AND HYPOTHETICAL ROWS WITH VALUES (
                (perigee=1.0, launch_mass=120),
                (country_of_operator='Bulgaria', perigee=2.0))
            IN THE CONTEXT OF "country_of_operator"
            FROM satellites
            LIMIT 5
        ''').fetchall()
        assert len(cursor) == 5
        assert all(0 <= c[0] <= 1 for c in cursor)

        # A full extravaganza query, using BY (as a constant).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
            OF (rowid = 1)
            TO EXISTING ROWS (country_of_operator = 'Russia'
                AND period < 0)
            AND HYPOTHETICAL ROWS WITH VALUES (
                (country_of_operator='China', perigee=1.0),
                (country_of_operator='Bulgaria'))
            IN THE CONTEXT OF "country_of_operator"
            BY satellites
        ''').fetchall()
        assert len(cursor) == 1
        assert all(0 <= c[0] <= 1 for c in cursor)

        # Hypothetical satellite with negative perigee should not be
        # similar, and use a binding to just ensure that they work.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
            TO HYPOTHETICAL ROWS WITH VALUES (
                (perigee = ?))
            IN THE CONTEXT OF "perigee"
            FROM satellites
            LIMIT 5
        ''', (-10000,)).fetchall()
        assert len(cursor) == 5
        assert all(np.allclose(c[0], 0) for c in cursor)

        # No matching target OF row.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid < 0)
                TO EXISTING ROWS (rowid = 10)
                IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')
        # Unknown CONTEXT variable "banana".
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (rowid = 2)
                IN THE CONTEXT OF "banana"
                BY satellites
            ''')
        # No matching EXISTING ROW.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 10)
                TO EXISTING ROWS (rowid < 0)
                IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')
        # Unknown nominal values 'Mongolia' in HYPOTHETICAL ROWS.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 10)
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='Mongolia'),
                    (country_of_operator='Bulgaria', perigee=2.0))
                IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Create a new row.
        bdb.sql_execute('''
            INSERT INTO satellites_ucs (apogee, launch_mass)
            VALUES (12.128, 12.128)
        ''')
        # TARGET ROW not yet incorporated should return nan.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
            OF (apogee = 12.128)
            TO HYPOTHETICAL ROWS WITH VALUES (
                (country_of_operator='China', perigee=1.0))
            IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None
        # EXISTING ROW not yet incorporated should return nan, since there
        # is no hypothetical.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
            OF (rowid = 1)
            TO EXISTING ROWS (apogee = 12.128)
            IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None
        # Although apogee = 12.128 is EXISTING but not incorporated, there
        # are other EXISTING ROWS with apogee > 0, so we should still get
        # a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
            OF (rowid = 1)
            TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
            IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None
        # Although apogee = 12.128 is EXISTING but not incorporated, there
        # are other HYPOTHETICAL ROWS, so we should still get a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
            OF (rowid = 1)
            TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
            AND HYPOTHETICAL ROWS WITH VALUES (
                (country_of_operator='China', perigee=1.0),
                (country_of_operator='Bulgaria'))
            IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None
def test_analysis_subproblems_basic():
    """Smoke-test the ANALYZE subproblem syntax of the cgpm backend.

    Runs each subproblem individually (with and without OPTIMIZED),
    exercises the VARIABLES and ROWS clauses, and confirms the error
    cases the backend must reject.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        ''')
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 4 MODELS FOR g0')
        # Test each subproblem individually except for variable hyperparameters.
        for optimized in [
            '',
            'OPTIMIZED;',
        ]:
            for subproblem in [
                'variable clustering',
                'variable clustering concentration',
                'row clustering',
                'row clustering concentration',
            ]:
                bdb.execute('''
                    ANALYZE g0 MODELS 0,1 FOR 4 ITERATION(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % (subproblem, optimized))
        # Test variable hyperparameters.
        bdb.execute('''
            ANALYZE g0 FOR 1 ITERATION (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        ''')
        with pytest.raises(BQLError):
            # OPTIMIZED backend does not support variable hyperparameters.
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')
        # Test rows.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        cursor = bdb.execute('''
            SELECT table_rowid FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id,))
        subsample_rows = [c[0] for c in cursor]
        # Row ids not captured by SUBSAMPLE 10; analyzing them must fail.
        bad_rows = [i for i in xrange(20) if i not in subsample_rows]
        for optimized in ['', 'OPTIMIZED;']:
            bdb.execute('''
                ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                    VARIABLES class_of_orbit;
                    ROWS %s;
                    SUBPROBLEMS (
                        row clustering,
                        row clustering concentration
                    );
                    %s
                )
            ''' % (','.join(map(str, subsample_rows)), optimized))
            with pytest.raises(BQLError):
                # Fail on rows not in the population or subsample.
                bdb.execute('''
                    ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                        VARIABLES class_of_orbit;
                        ROWS %s;
                        SUBPROBLEMS (
                            row clustering,
                            row clustering concentration
                        );
                        %s
                    )
                ''' % (','.join(map(str, bad_rows)), optimized))
def test_nig_normal_latent_2var_smoke():
    """Smoke test for latent-variable queries on a two-variable population.

    Two nig_normal generators model p: g0 with no latents, g1 declaring
    a latent deviation variable xe.  Checks which BQL estimands succeed
    or raise BQLError, and the row counts of the pairwise estimates.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for i in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (i, i*i - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')

        def rowcount(bql):
            # Number of result rows the query yields.
            return len(bdb.execute(bql).fetchall())

        # CORRELATION, CORRELATION PVALUE, without generators.
        assert rowcount('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''') == 4
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')
        # CORRELATION, CORRELATION PVALUE, with generators.
        assert rowcount('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''') == 4
        assert rowcount('''
            estimate correlation, correlation pvalue
                from pairwise variables of p modeled by g0
        ''') == 4
        with pytest.raises(BQLError):
            # g1 has a latent variable xe.
            rowcount('''
                estimate correlation, correlation pvalue
                    from pairwise variables of p modeled by g1
            ''')
        # DEPENDENCE PROBABILITY, MUTUAL INFORMATION
        assert rowcount('''
            estimate dependence probability, mutual information
                from pairwise variables of p
        ''') == 4
        assert rowcount('''
            estimate dependence probability, mutual information
                from pairwise variables of p modeled by g0
        ''') == 4
        # g1 adds the latent xe, so the pairwise grid is 3x3 = 9 rows.
        assert rowcount('''
            estimate dependence probability, mutual information
                from pairwise variables of p modeled by g1
        ''') == 9
        # SIMULATE LATENT VARIABLE
        assert rowcount('''
            simulate xe from p modeled by g1 limit 10;
        ''') == 10
        assert rowcount('''
            simulate y, xe from p modeled by g1 limit 10;
        ''') == 10
        # Cannot simulate the latent xe from the population p.
        with pytest.raises(BQLError):
            rowcount('''
                simulate xe from p limit 10;
            ''')
        # Cannot simulate the latent xe from the generator g0.
        with pytest.raises(BQLError):
            rowcount('''
                simulate xe from p modeled by g0 limit 10;
            ''')
        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_loom_four_var():
    """Test Loom on a four variable table.

    Table consists of:
    * x - a random int between 0 and 200
    * y - a random int between 0 and 100
    * xx - just 2*x
    * z - a nominal variable that has an even chance of being 'a' or 'b'

    Queries run and tested include: estimate similarity, estimate
    probability density, simulate, estimate mutual information, estimate
    dependence probability, infer explicit predict
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t(x, xx, y, z)')
            # Two identical rows up front, so rowid 1 has an exact twin.
            bdb.sql_execute('''
                insert into t (x, xx, y, z) values (100, 200, 50, 'a')''')
            bdb.sql_execute('''
                insert into t (x, xx, y, z) values (100, 200, 50, 'a')''')
            for _index in xrange(100):
                x = bdb._prng.weakrandom_uniform(X_MAX)
                bdb.sql_execute('''
                    insert into t(x, xx, y, z) values(?, ?, ?, ?)
                ''', (x, x * 2, int(bdb._prng.weakrandom_uniform(Y_MAX)),
                    'a' if bdb._prng.weakrandom_uniform(2) == 1 else 'b'))
            bdb.execute('''
                create population p for t(x numerical; xx numerical;
                    y numerical; z nominal)''')
            bdb.execute('create generator g for p using loom')
            bdb.execute('initialize 10 model for g')
            bdb.execute('analyze g for 20 iterations')
            # Hypothetical-row relevance is rejected here with BQLError.
            with pytest.raises(BQLError):
                relevance = bdb.execute('''
                    estimate predictive relevance
                    to hypothetical rows with values ((x=50, xx=100))
                    in the context of "x"
                    from p where rowid = 1
                ''').fetchall()
            # A row is maximally relevant to itself.
            relevance = bdb.execute('''
                estimate predictive relevance
                to existing rows (rowid = 1)
                in the context of "x"
                from p where rowid = 1
            ''').fetchall()
            assert relevance[0][0] == 1
            # Rows 1 and 2 are identical, so their similarity values
            # should be (near) equal.
            similarities = bdb.execute('''estimate similarity
                in the context of x
                from pairwise p limit 2''').fetchall()
            assert similarities[0][2] <= 1
            assert similarities[1][2] <= 1
            assert abs(similarities[0][2] - similarities[1][2]) < 0.005
            # Density far outside the observed range should be near zero.
            impossible_density = bdb.execute(
                'estimate probability density of x = %d by p'
                % (X_MAX * 2.5,)).fetchall()
            assert impossible_density[0][0] < 0.0001
            possible_density = bdb.execute(
                'estimate probability density of x = %d by p'
                % ((X_MAX - X_MIN) / 2,)).fetchall()
            assert possible_density[0][0] > 0.001
            # z is split roughly evenly between 'a' and 'b'.
            nominal_density = bdb.execute('''
                estimate probability density of z = 'a' by p
            ''').fetchall()
            assert abs(nominal_density[0][0] - .5) < 0.2
            # Mutual information should order as (x,y) < (x,xx) < (x,x),
            # since xx is deterministic in x.
            mutual_info = bdb.execute('''
                estimate mutual information as mutinf
                from pairwise columns of p
                order by mutinf
            ''').fetchall()
            _, a, b, c = zip(*mutual_info)
            mutual_info_dict = dict(zip(zip(a, b), c))
            assert mutual_info_dict[('x', 'y')] < mutual_info_dict[(
                'x', 'xx')] < mutual_info_dict[('x', 'x')]
            # Simulated means should land near the middle of each range.
            simulated_data = bdb.execute('simulate x, y from p limit %d'
                % (PREDICT_RUNS,)).fetchall()
            xs, ys = zip(*simulated_data)
            assert abs((sum(xs)/len(xs)) - (X_MAX-X_MIN)/2) < \
                (X_MAX-X_MIN)/5
            assert abs((sum(ys)/len(ys)) - (Y_MAX-Y_MIN)/2) < \
                (Y_MAX-Y_MIN)/5
            # NOTE(review): the lower bound below uses Y_MIN for the x
            # samples -- presumably X_MIN == Y_MIN == 0 so it is harmless;
            # verify whether X_MIN was intended.
            assert sum([1 if (x < Y_MIN or x > X_MAX) else 0
                for x in xs]) < .5 * PREDICT_RUNS
            assert sum([1 if (y < Y_MIN or y > Y_MAX) else 0
                for y in ys]) < .5 * PREDICT_RUNS
            # x and xx are strongly dependent; other pairs are not.
            dependence = bdb.execute('''estimate dependence probability
                from pairwise variables of p''').fetchall()
            for (_, col1, col2, d_val) in dependence:
                if col1 == col2:
                    assert d_val == 1
                elif col1 in ['xx', 'x'] and col2 in ['xx', 'x']:
                    assert d_val > 0.80
                else:
                    assert d_val < 0.20
            # Predictions should stay within range on average; the
            # reported confidences for x are all zero here.
            predict_confidence = bdb.execute(
                'infer explicit predict x confidence x_c FROM p').fetchall()
            predictions, confidences = zip(*predict_confidence)
            assert abs((sum(predictions) / len(predictions))
                - (X_MAX - X_MIN) / 2) < (X_MAX - X_MIN) / 5
            assert sum(
                [1 if (p < X_MIN or p > X_MAX) else 0
                    for p in predictions]) < .5 * PREDICT_RUNS
            assert all([c == 0 for c in confidences])
def test_using_modelnos():
    """Test BQL queries restricted to model subsets via USING MODEL(s).

    Covers SIMULATE, INFER, dependence probability, mutual information,
    per-model ANALYZE (checking per-state diagnostics), and errors for
    model numbers that do not exist.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        ''')
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR g0')
        # Crash test simulate.
        bdb.execute('''
            SIMULATE apogee, class_of_orbit FROM satellites MODELED BY g0
            USING MODEL 0-1 LIMIT 10
        ''')
        # Crash test infer explicit.
        bdb.execute('''
            INFER EXPLICIT PREDICT period, perigee FROM satellites
            MODELED BY g0 USING MODEL 0 LIMIT 2
        ''')
        # Crash test dependence probability BY.
        c = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF launch_mass WITH period
            BY satellites MODELED BY g0 USING MODEL 0
        ''')
        # With a single model the estimate is an indicator, 0 or 1.
        assert cursor_value(c) in [0, 1]
        # Crash test dependence probability pairwise.
        cursor = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
            FROM PAIRWISE VARIABLES OF satellites
            MODELED BY g0 USING MODEL 1
        ''')
        for d in cursor:
            assert d[0] in [0, 1]
        # Crash test mutual information 1row.
        bdb.execute('''
            ESTIMATE MUTUAL INFORMATION WITH (period) USING 1 SAMPLES
            FROM VARIABLES OF satellites USING MODEL 0
        ''').fetchall()
        # Test analyze on per-model basis.
        bdb.execute('''
            ANALYZE g0 MODEL 0 FOR 1 ITERATION CHECKPOINT 1 ITERATION
        ''')
        engine = bdb.backends['cgpm']._engine(bdb, 1)
        # Only model 0 was analyzed, so only state 0 gained a logscore.
        assert len(engine.states[0].diagnostics['logscore']) == 1
        assert len(engine.states[1].diagnostics['logscore']) == 0
        bdb.execute('''
            ANALYZE g0 MODEL 1 FOR 4 ITERATION CHECKPOINT 1 ITERATION (
                OPTIMIZED
            );
        ''')
        # Now state 1 has 4 checkpointed entries; state 0 is unchanged.
        assert len(engine.states[0].diagnostics['logscore']) == 1
        assert len(engine.states[1].diagnostics['logscore']) == 4
        # Some errors with bad modelnos.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 MODEL 0-3 FOR 4 ITERATION
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                SIMULATE apogee FROM satellites USING MODEL 25 LIMIT 10;
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
                USING MODELS 0-8 LIMIT 2;
            ''')
def test_nig_normal_latent_2var2lat_conditional_smoke():
    """Smoke test for conditioning one latent variable on another.

    g1 declares latent deviation variables xe and ye.  Latent-given-
    latent queries must succeed under g1 and raise BQLError at the
    population level or under g0, which has no latents.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x * x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(
                xe deviation(x),
                ye deviation(y)
            )
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')
        # latent given latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (ye = -1)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (ye = -1)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (ye = -1)
                within p modeled by g1
        ''').fetchall()
        # simulate latent given latent
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p given ye = -1 limit 1
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modeled by g0 given ye = -1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modeled by g1 given ye = -1 limit 1
        ''').fetchall()
        # dependence probability between latents
        with pytest.raises(BQLError):
            bdb.execute(
                'estimate dependence probability of xe with ye within p')
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate dependence probability of xe with ye
                    within p modeled by g0
            ''')
        bdb.execute('''
            estimate dependence probability of xe with ye
                within p modeled by g1
        ''')
        # mutual information between latents
        with pytest.raises(BQLError):
            bdb.execute('estimate mutual information of xe with ye within p')
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate mutual information of xe with ye
                    within p modeled by g0
            ''')
        bdb.execute('''
            estimate mutual information of xe with ye
                within p modeled by g1
        ''')
        # Clean teardown: models, generators, population, table.
        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_add_drop_models():
    """Test bayesdb<->cgpm model-number bookkeeping across INITIALIZE/DROP.

    Dropping models leaves bayesdb model numbers sparse while cgpm
    renumbers its models contiguously; the bayesdb_cgpm_modelno mapping
    must always preserve the strict ordering.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION p FOR satellites_ucs WITH SCHEMA(
                GUESS STATTYPES OF (*);
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR p (SUBSAMPLE 10);')
        # Retrieve id for testing.
        population_id = bayesdb_get_population(bdb, 'p')
        generator_id = bayesdb_get_generator(bdb, population_id, 'm')
        def check_modelno_mapping(lookup):
            # Assert the stored bayesdb->cgpm modelno mapping matches
            # `lookup` exactly: every stored pair is expected, and no
            # expected pair is missing (lookup is consumed).
            pairs = bdb.sql_execute('''
                SELECT modelno, cgpm_modelno FROM bayesdb_cgpm_modelno
                WHERE generator_id = ?
            ''', (generator_id,))
            for pair in pairs:
                assert lookup[pair[0]] == pair[1]
                del lookup[pair[0]]
            assert len(lookup) == 0
        # Initialize some models.
        bdb.execute('INITIALIZE 16 MODELS FOR m')
        # Assert identity mapping initially.
        check_modelno_mapping({i:i for i in xrange(16)})
        bdb.execute('ANALYZE m FOR 1 ITERATION (QUIET);')
        # Drop some models.
        bdb.execute('DROP MODELS 1, 8-12, 14 FROM m')
        # Assert cgpm models are contiguous while bayesdb models are not, with
        # the mapping preserving the strict order.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
        })
        # Run some analysis again.
        bdb.execute('ANALYZE m FOR 1 ITERATION (OPTIMIZED; QUIET);')
        # Initialize 14 models if not existing.
        bdb.execute('INITIALIZE 14 MODELS IF NOT EXISTS FOR m')
        # Assert cgpm models are 0-14, while bayesdb are 0-15 excluding 14. Note
        # that INITIALIZE 14 MODELS IF NOT EXISTS does not guarantee that 14
        # MODELS in total will exist after the query, rather it will initialize
        # any non-existing modelnos with index 0-13, and any modelnos > 14
        # (modelno 15 in this test case) are untouched.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
            # Recreated models.
            1: 9,
            8: 10,
            9: 11,
            10: 12,
            11: 13,
            12: 14,
        })
        # Drop some more models, add them back with some more, and confirm
        # arithmetic and ordering remains correct.
        bdb.execute('DROP MODELS 0-1 FROM m')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
        })
        bdb.execute('INITIALIZE 20 MODELS IF NOT EXISTS FOR m;')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
            # Re-recreated models.
            0: 13,
            1: 14,
            # New models.
            14: 15,
            16: 16,
            17: 17,
            18: 18,
            19: 19,
        })
        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODELS 20-50 FROM m')
        # Drop all models.
        bdb.execute('DROP MODELS FROM m;')
        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODEL 0 FROM m')
        # Assert cgpm mapping is cleared.
        cursor = bdb.sql_execute('''
            SELECT COUNT(*) FROM bayesdb_cgpm_modelno
            WHERE generator_id = ?
        ''', (generator_id,))
        assert cursor_value(cursor) == 0
def test_analysis_subproblems_basic():
    """Smoke-test the ANALYZE subproblem syntax of the cgpm backend.

    NOTE(review): this is a duplicate of a test with the same name
    defined earlier in this file; under pytest this later definition
    shadows the earlier one, so only one copy actually runs.  Consider
    removing one of the two.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        ''')
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 4 MODELS FOR g0')
        # Test each subproblem individually except for variable hyperparameters.
        for optimized in ['', 'OPTIMIZED;',]:
            for subproblem in [
                'variable clustering',
                'variable clustering concentration',
                'row clustering',
                'row clustering concentration',
            ]:
                bdb.execute('''
                    ANALYZE g0 MODELS 0,1 FOR 4 ITERATION(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % (subproblem, optimized))
        # Test variable hyperparameters.
        bdb.execute('''
            ANALYZE g0 FOR 1 ITERATION (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        ''')
        with pytest.raises(BQLError):
            # OPTIMIZED backend does not support variable hyperparameters.
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')
        # Test rows.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        cursor = bdb.execute('''
            SELECT table_rowid FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id,))
        subsample_rows = [c[0] for c in cursor]
        # Row ids not captured by SUBSAMPLE 10; analyzing them must fail.
        bad_rows = [i for i in xrange(20) if i not in subsample_rows]
        for optimized in ['', 'OPTIMIZED;']:
            bdb.execute('''
                ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                    VARIABLES class_of_orbit;
                    ROWS %s;
                    SUBPROBLEMS (
                        row clustering,
                        row clustering concentration
                    );
                    %s
                )
            ''' % (','.join(map(str, subsample_rows)), optimized))
            with pytest.raises(BQLError):
                # Fail on rows not in the population or subsample.
                bdb.execute('''
                    ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                        VARIABLES class_of_orbit;
                        ROWS %s;
                        SUBPROBLEMS (
                            row clustering,
                            row clustering concentration
                        );
                        %s
                    )
                ''' % (','.join(map(str, bad_rows)), optimized))
def test_nig_normal_latent_smoke():
    """Smoke test for a single latent deviation variable under nig_normal.

    g0 has no latents; g1 declares xe deviation(x).  Queries touching xe
    must raise BQLError at the population level and under g0, and
    succeed under g1.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (x,))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')
        # PROBABILITY DENSITY OF x = v
        bdb.execute('estimate probability density of x = 50 within p') \
            .fetchall()
        with pytest.raises(BQLError):
            bdb.execute('estimate probability density of xe = 1 within p') \
                .fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 within p
                    modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 within p modeled by g1
        ''').fetchall()
        # PREDICTIVE PROBABILITY OF x
        bdb.execute('estimate predictive probability of x from p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'estimate predictive probability of xe from p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate predictive probability of xe from p modeled by g0
            ''').fetchall()
        # Predictive probability of the latent xe comes back NULL for
        # every row.
        for r, p_xe in bdb.execute('''
            estimate rowid, predictive probability of xe from p modeled by g1
        '''):
            assert p_xe is None, 'rowid %r p(xe) %r' % (r, p_xe)
        # INFER/PREDICT
        bdb.execute(
            'INFER EXPLICIT PREDICT x CONFIDENCE x_c FROM p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p MODELED BY g0
            ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p MODELED BY g1
        ''').fetchall()
        # SIMULATE x
        bdb.execute('simulate x from p limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate x, xe from p limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'simulate x, xe from p modeled by g0 limit 1').fetchall()
        bdb.execute('simulate x, xe from p modeled by g1 limit 1').fetchall()
        # SIMILARITY, pairwise, capped by LIMIT.
        assert 100 == len(bdb.execute('''
            estimate similarity in the context of x
            from pairwise p limit 100
        ''').fetchall())
        assert 1 == len(bdb.execute('''
            estimate similarity in the context of x
            from pairwise p modeled by g0 limit 1
        ''').fetchall())
        # No such column xe in g0.
        with pytest.raises(BQLError):
            assert 1 == len(bdb.execute('''
                estimate similarity in the context of xe
                from pairwise p modeled by g0 limit 1
            ''').fetchall())
        # Column xe exists in g1.
        assert 1 == len(bdb.execute('''
            estimate similarity in the context of xe
            from pairwise p modeled by g1 limit 1
        ''').fetchall())
        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_loom_complex_add_analyze_drop_sequence():
    """Walk a loom generator through add/analyze/drop-model sequences.

    Checks that loom's recorded model count tracks INITIALIZE and DROP
    MODELS correctly, and that re-analyzing from scratch reproduces
    similar density estimates.
    """
    with tempdir('bayeslite-loom') as loom_store_path:
        with bayesdb_open(':memory:') as bdb:
            bayesdb_register_backend(
                bdb, LoomBackend(loom_store_path=loom_store_path))
            bdb.sql_execute('create table t (x)')
            for value in xrange(10):
                bdb.sql_execute('insert into t (x) values (?)', (value,))
            bdb.execute('create population p for t (x numerical)')
            bdb.execute('create generator g for p using loom')

            def loom_model_count():
                # Number of models loom currently records for g.
                pid = bayesdb_get_population(bdb, 'p')
                gid = bayesdb_get_generator(bdb, pid, 'g')
                cursor = bdb.sql_execute('''
                    SELECT num_models FROM bayesdb_loom_generator_model_info
                    WHERE generator_id=?;
                ''', (gid,))
                return cursor.fetchall()[0][0]

            def densities_at_50():
                # Per-row density estimates of x = 50, as a flat list.
                cursor = bdb.execute('''
                    estimate probability density of x = 50 from p''')
                return [row[0] for row in cursor.fetchall()]

            bdb.execute('initialize 2 models for g')
            bdb.execute('initialize 3 models if not exists for g')
            # Make sure that the total number of models is
            # 3 and not 2 + 3 = 5.
            assert loom_model_count() == 3
            bdb.execute('analyze g for 10 iterations')
            bdb.execute('estimate probability density of x = 50 from p')
            # Dropping a single model is rejected with BQLError.
            with pytest.raises(BQLError):
                bdb.execute('drop model 1 from g')
            bdb.execute('drop models from g')
            bdb.execute('initialize 1 models for g')
            # Make sure that the number of models was reset after dropping.
            assert loom_model_count() == 1
            bdb.execute('analyze g for 50 iterations')
            first_densities = densities_at_50()
            bdb.execute('simulate x from p limit 1').fetchall()
            bdb.execute('drop models from g')
            bdb.execute('initialize 1 model for g')
            bdb.execute('analyze g for 50 iterations')
            second_densities = densities_at_50()
            # Check that the analysis started fresh after dropping models
            # and that it produces similar results the second time.
            for first, second in zip(first_densities, second_densities):
                assert abs(first - second) < .01
            bdb.execute('drop models from g')
            bdb.execute('drop generator g')
            bdb.execute('drop population p')
            bdb.execute('drop table t')
def test_unknown_stattype():
    """Test populations and generators involving a user-invented stattype.

    A statistical type unknown to bayesdb (QUAGGA) must be rejected
    until registered in bayesdb_stattype, and even then cgpm can only
    model it with an explicitly chosen distribution family.
    """
    from cgpm.regressions.linreg import LinearRegression
    with cgpm_dummy_satellites_bdb() as bdb:
        # Add a column called relaunches, sum of apogee and perigee.
        bdb.sql_execute('ALTER TABLE satellites_ucs ADD COLUMN relaunches')
        # Python 2 cursor protocol: .next() pulls the first result row.
        n_rows = bdb.sql_execute('''
            SELECT COUNT(*) FROM satellites_ucs
        ''').next()[0]
        for rowid in xrange(n_rows):
            bdb.sql_execute('''
                UPDATE satellites_ucs
                SET relaunches = (SELECT apogee + perigee)
                WHERE _rowid_ = ?
            ''', (rowid+1,))
        # Nobody will ever create a QUAGGA statistical type!
        with pytest.raises(BQLError):
            # No such statistical type at the moment.
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    SET STATTYPES OF apogee, perigee, launch_mass, period
                        TO NUMERICAL;
                    SET STATTYPE OF class_of_orbit, country_of_operator
                        TO NOMINAL;
                    SET STATTYPE OF relaunches TO QUAGGA
                )
            ''')
        # Invent the statistical type.
        bdb.sql_execute('INSERT INTO bayesdb_stattype VALUES (?)', ('quagga',))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPES OF apogee, perigee, launch_mass, period
                    TO NUMERICAL;
                SET STATTYPES OF class_of_orbit, country_of_operator
                    TO NOMINAL;
                SET STATTYPES OF relaunches TO QUAGGA
            )
        ''')
        registry = {
            'kepler': Kepler,
            'linreg': LinearRegression,
        }
        bayesdb_register_backend(bdb, CGPM_Backend(registry))
        with pytest.raises(BQLError):
            # Can't model QUAGGA by default.
            bdb.execute('CREATE GENERATOR g0 FOR satellites USING cgpm')
        with pytest.raises(BQLError):
            # Can't model QUAGGA as input.
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR relaunches GIVEN apogee USING linreg;
                    OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
                )
            ''')
        # Can model QUAGGA with an explicit distribution family.
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON;
                OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
            )
        ''')
def test_nig_normal_latent_smoke():
    """Smoke test for a single latent deviation variable under nig_normal.

    NOTE(review): this is a near-verbatim duplicate of a test with the
    same name defined earlier in this file; under pytest this later
    definition shadows the earlier one, so only one copy runs.  Consider
    removing one of the two.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (x,))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')
        # PROBABILITY DENSITY OF x = v
        bdb.execute('estimate probability density of x = 50 within p') \
            .fetchall()
        with pytest.raises(BQLError):
            bdb.execute('estimate probability density of xe = 1 within p') \
                .fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 within p
                    modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 within p modeled by g1
        ''').fetchall()
        # PREDICTIVE PROBABILITY OF x
        bdb.execute('estimate predictive probability of x from p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'estimate predictive probability of xe from p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate predictive probability of xe from p modeled by g0
            ''').fetchall()
        # Predictive probability of the latent xe comes back NULL for
        # every row.
        for r, p_xe in bdb.execute('''
            estimate rowid, predictive probability of xe from p modeled by g1
        '''):
            assert p_xe is None, 'rowid %r p(xe) %r' % (r, p_xe)
        # INFER/PREDICT
        bdb.execute(
            'INFER EXPLICIT PREDICT x CONFIDENCE x_c FROM p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p MODELED BY g0
            ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p MODELED BY g1
        ''').fetchall()
        # SIMULATE x
        bdb.execute('simulate x from p limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate x, xe from p limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'simulate x, xe from p modeled by g0 limit 1').fetchall()
        bdb.execute('simulate x, xe from p modeled by g1 limit 1').fetchall()
        # SIMILARITY, pairwise, capped by LIMIT.
        assert 100 == len(
            bdb.execute('''
                estimate similarity in the context of x
                from pairwise p limit 100
            ''').fetchall())
        assert 1 == len(
            bdb.execute('''
                estimate similarity in the context of x
                from pairwise p modeled by g0 limit 1
            ''').fetchall())
        # No such column xe in g0.
        with pytest.raises(BQLError):
            assert 1 == len(
                bdb.execute('''
                    estimate similarity in the context of xe
                    from pairwise p modeled by g0 limit 1
                ''').fetchall())
        # Column xe exists in g1.
        assert 1 == len(
            bdb.execute('''
                estimate similarity in the context of xe
                from pairwise p modeled by g1 limit 1
            ''').fetchall())
        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_cgpm_kepler():
    """Test the cgpm backend with foreign models (kepler, linreg).

    Exercises generator creation with OVERRIDE/SET CATEGORY MODEL
    clauses, SUBSAMPLE individual bookkeeping, ANALYZE restrictions on
    mixed foreign/baseline variables, and a battery of BQL estimands.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                apogee NUMERICAL;
                launch_mass NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')
        bdb.execute('''
            ESTIMATE CORRELATION from PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        registry = {
            'kepler': Kepler,
            'linreg': LinearRegression,
        }
        bayesdb_register_backend(
            bdb, CGPM_Backend(registry, multiprocess=0))
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                OVERRIDE GENERATIVE MODEL FOR period
                GIVEN apogee, perigee
                USING linreg
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR g0')
        c = bdb.execute('SELECT COUNT(*) FROM bayesdb_cgpm_individual')
        n = c.fetchvalue()
        # Another generator: exponential launch mass instead of normal.
        bdb.execute('''
            CREATE GENERATOR g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR launch_mass TO EXPONENTIAL;
                OVERRIDE MODEL FOR period
                GIVEN apogee, perigee
                USING kepler(quagga = eland);
                SUBSAMPLE 20
            )
        ''')
        # g1 subsamples 20 rows, so exactly 20 new individuals appear.
        c_ = bdb.execute('SELECT COUNT(*) FROM bayesdb_cgpm_individual')
        n_ = c_.fetchvalue()
        assert n_ - n == 20
        bdb.execute('INITIALIZE 1 MODEL IF NOT EXISTS FOR g1')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION (VARIABLES period)')
        bdb.execute('ANALYZE g1 FOR 1 ITERATION')
        bdb.execute('ANALYZE g1 FOR 1 ITERATION (VARIABLES period)')
        # OPTIMIZED is ignored because period is a foreign variable.
        bdb.execute('''
            ANALYZE g1 FOR 1 ITERATION (OPTIMIZED; VARIABLES period)
        ''')
        # This should fail since we have a SET CATEGORY MODEL which is not
        # compatible with lovecat. The ValueError is from cgpm not bayeslite.
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE g1 FOR 1 ITERATION (OPTIMIZED; VARIABLES launch_mass)
            ''')
        # Cannot use timed analysis with mixed variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS (VARIABLES period, apogee)
            ''')
        # Cannot use timed analysis with mixed variables (period by SKIP).
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS (SKIP apogee)
            ''')
        # OK to use iteration analysis with mixed values.
        bdb.execute('''
            ANALYZE g1 FOR 1 ITERATION (VARIABLES period, apogee)
        ''')
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
            FROM PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                GIVEN (apogee = 8 AND perigee = 7)
            BY satellites
        ''').fetchall()
        bdb.execute('''
            SIMULATE apogee, perigee, period FROM satellites LIMIT 100
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_confidence
            USING 5 SAMPLES FROM satellites LIMIT 2
        ''').fetchall()
        results = bdb.execute('''
            INFER EXPLICIT PREDICT class_of_orbit
            CONFIDENCE class_of_orbit_confidence
            FROM satellites LIMIT 2
        ''').fetchall()
        # Nominal predictions come back as (value, confidence) pairs.
        assert len(results[0]) == 2
        assert isinstance(results[0][0], unicode)
        assert isinstance(results[0][1], float)
        # No CONFIDENCE specified.
        results = bdb.execute('''
            INFER EXPLICIT PREDICT class_of_orbit
            USING 2 SAMPLES
            FROM satellites LIMIT 2
        ''').fetchall()
        assert len(results[0]) == 1
        assert isinstance(results[0][0], unicode)
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP GENERATOR g1')
def test_nig_normal_latent_2var_conditional_smoke():
    """Conditional queries mixing observed and latent variables.

    Generator g1 declares a latent variable `xe` (deviation of x); g0
    does not.  Any query mentioning `xe` must raise BQLError unless
    explicitly MODELED BY g1 — whether `xe` appears as the target or in
    the condition, and whether the other variable involved is related
    (x) or unrelated (y) to it.
    """
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(x, y)')
        # y is a deterministic function of x: y = x^2 - 100.
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                (x, x * x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')
        # CORRELATION, CORRELATION PVALUE, without generators.
        assert 4 == len(bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''').fetchall())
        bdb.execute('create generator g0 for p using nig_normal')
        # g1 additionally models xe, a latent deviation of x.
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration')
        # observed given other observed
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49)
                within p
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49)
                within p modeled by g0
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49)
                within p modeled by g1
        ''').fetchall()
        bdb.execute('simulate x from p given y = 49 limit 1').fetchall()
        bdb.execute('''
            simulate x from p modeled by g0 given y = 49 limit 1
        ''').fetchall()
        bdb.execute('''
            simulate x from p modeled by g1 given y = 49 limit 1
        ''').fetchall()
        # observed given related latent
        with pytest.raises(BQLError):
            # xe is unknown at the population level.
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 1)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            # g0 declares no latent xe.
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 1)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (xe = 1)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate x from p given xe = 1 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate x from p modeled by g0 given xe = 1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate x from p modeled by g1 given xe = 1 limit 1
        ''').fetchall()
        # observed given unrelated latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of y = 50 given (xe = 1)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of y = 50 given (xe = 1)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of y = 50 given (xe = 1)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate y from p given xe = 1 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate y from p modeled by g0 given xe = 1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate y from p modeled by g1 given xe = 1 limit 1
        ''').fetchall()
        # latent given related observed
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (x = 50)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (x = 50)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (x = 50)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate xe from p given x = 50 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modeled by g0 given x = 50 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modeled by g1 given x = 50 limit 1
        ''').fetchall()
        # latent given unrelated observed
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (y = 50)
                    within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (y = 50)
                    within p modeled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (y = 50)
                within p modeled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate xe from p given y = 50 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modeled by g0 given y = 50 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modeled by g1 given y = 50 limit 1
        ''').fetchall()
        # Tear everything down in dependency order.
        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
def test_nig_normal_latent_conditional_smoke():
    """Conditional density queries involving a single latent variable.

    Only generator g1 declares the latent `xe`; a conditional density
    query that mentions `xe` must succeed when MODELED BY g1 and raise
    BQLError at the population level or when MODELED BY g0.
    """
    with bayesdb_open(':memory:') as db:
        bayesdb_register_backend(db, NIGNormalBackend())
        db.sql_execute('create table t(x)')
        for value in xrange(100):
            db.sql_execute('insert into t(x) values(?)', (value,))
        db.execute('create population p for t(x numerical)')
        db.execute('create generator g0 for p using nig_normal')
        db.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        for gen in ('g0', 'g1'):
            db.execute('initialize 1 model for %s' % (gen,))
            db.execute('analyze %s for 1 iteration' % (gen,))
        # Observed given observed: fine for the population and both
        # generators.
        for suffix in ('', ' modeled by g0', ' modeled by g1'):
            db.execute(
                'estimate probability density of x = 50'
                ' given (x = 50) within p%s' % (suffix,)).fetchall()
        # Observed given latent: only g1 knows about xe.
        for suffix in ('', ' modeled by g0'):
            with pytest.raises(BQLError):
                db.execute(
                    'estimate probability density of x = 50'
                    ' given (xe = 50) within p%s' % (suffix,)).fetchall()
        db.execute(
            'estimate probability density of x = 50'
            ' given (xe = 50) within p modeled by g1').fetchall()
        # Latent given observed: same rule, xe as the target.
        for suffix in ('', ' modeled by g0'):
            with pytest.raises(BQLError):
                db.execute(
                    'estimate probability density of xe = 50'
                    ' given (x = 50) within p%s' % (suffix,)).fetchall()
        db.execute(
            'estimate probability density of xe = 50'
            ' given (x = 50) within p modeled by g1').fetchall()
        # Tear down: models, generators, population, table.
        for gen in ('g0', 'g1'):
            db.execute('drop models from %s' % (gen,))
            db.execute('drop generator %s' % (gen,))
        db.execute('drop population p')
        db.execute('drop table t')