def cgpm_smoke_bdb():
    """Yield an in-memory bdb with table ``t`` and population ``p``.

    A CGPM backend whose registry contains only ``piecewise`` is
    registered, table ``t`` is filled with one row per (i, j, k) in
    0..2 x 0..2 x 0..2, and population ``p`` is created over it.  The
    bdb is yielded from inside the ``with`` block that opened it.
    """
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        backend = CGPM_Backend({'piecewise': PieceWise}, multiprocess=0)
        bayesdb_register_backend(bdb, backend)

        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        insert_sql = ''' INSERT INTO t (output, cat, input) VALUES (?, ?, ?) '''
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    # An odd i, j or k nulls out the output, cat or input
                    # cell of the row, respectively.
                    out_cell = None if i % 2 else i + j/(k + 1)
                    cat_cell = None if j % 2 else (-1 if (i + j*k) % 2 else +1)
                    in_cell = None if k % 2 else (i*j - k)**2
                    bdb.sql_execute(insert_sql, (out_cell, cat_cell, in_cell))

        bdb.execute(''' CREATE POPULATION p FOR t WITH SCHEMA( output NUMERICAL; input NUMERICAL; cat NOMINAL; ) ''')

        yield bdb
def get_backend_object(cfg):
    """Build the backend object selected by ``cfg.backend``.

    :param cfg: configuration object; ``cfg.backend`` is read, and
        ``cfg.loom_path`` is read only when the backend is ``'loom'``.
    :return: ``CGPM_Backend({}, multiprocess=False)`` for ``'cgpm'``, or
        ``LoomBackend(cfg.loom_path)`` for ``'loom'``.
    :raises RuntimeError: if ``cfg.backend`` is ``None`` or names a
        backend other than ``'cgpm'`` / ``'loom'``.
    """
    if cfg.backend is None:
        raise RuntimeError('BACKEND was not set in config file')
    if cfg.backend == 'cgpm':
        return CGPM_Backend({}, multiprocess=False)
    if cfg.backend == 'loom':
        return LoomBackend(cfg.loom_path)
    # Fail loudly here rather than handing the caller an implicit None
    # that only blows up later, far from the misconfiguration.
    raise RuntimeError(
        'Unknown BACKEND in config file: %r' % (cfg.backend,))
def bayesdb(backend=None, **kwargs):
    """Yield an open bdb with a single backend registered, then close it.

    :param backend: backend to register; when ``None`` a
        ``CGPM_Backend(cgpm_registry={}, multiprocess=False)`` is used.
    :param kwargs: forwarded to ``bayeslite.bayesdb_open`` (which is
        always called with ``builtin_backends=False``).

    This is a generator that yields exactly once; the bdb is closed when
    the generator is resumed or finalized.
    """
    if backend is None:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
    bdb = bayeslite.bayesdb_open(builtin_backends=False, **kwargs)
    try:
        # Registration happens inside the try so that a failure to
        # register still closes the freshly opened bdb.
        bayeslite.bayesdb_register_backend(bdb, backend)
        yield bdb
    finally:
        bdb.close()
def t1_mp():
    """Return the ``t1``/``p1``/``p1_cc`` population setup on a
    multiprocess CGPM backend.

    The result is whatever ``bayesdb_population`` returns for the given
    table, population and generator names.
    """
    mp_backend = CGPM_Backend(cgpm_registry={}, multiprocess=True)
    bdb_source = bayesdb(backend=mp_backend)
    column_specs = [
        'id IGNORE',
        'label NOMINAL',
        'age NUMERICAL',
        'weight NUMERICAL',
    ]
    return bayesdb_population(
        bdb_source, 't1', 'p1', 'p1_cc', t1_schema, t1_data,
        columns=column_specs)
def cgpm_dummy_satellites_pop_bdb():
    """Yield the dummy satellites bdb with population ``satellites``.

    The population is created first; a CGPM backend with an empty
    registry is registered afterwards, and the bdb is yielded from
    inside the ``cgpm_dummy_satellites_bdb`` context.
    """
    population_bql = ''' create population satellites for satellites_ucs with schema( apogee numerical; class_of_orbit nominal; country_of_operator nominal; launch_mass numerical; perigee numerical; period numerical ) '''
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute(population_bql)
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        yield bdb
def run(stdin, stdout, stderr, argv):
    """Run the shell described by ``argv`` and return an exit status.

    :param stdin: input stream handed to the shell.
    :param stdout: output stream handed to the shell.
    :param stderr: error stream; usage errors are written here.
    :param argv: full argument vector; ``argv[0]`` is the program name
        and ``argv[1:]`` is parsed by ``parse_args``.
    :return: 1 if no database was selected (no path and no
        ``-m/--memory``) or the path is ``'-'``; 0 otherwise.
    """
    args = parse_args(argv[1:])
    # Keep only the last '/'-separated component of the program name.
    # Splitting (instead of testing the truthiness of rfind's result)
    # also strips a slash at index 0, e.g. '/bayeslite' -> 'bayeslite'.
    progname = argv[0].rsplit('/', 1)[-1]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname,))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname,))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
        builtin_backends=False)

    # Any job count other than 1 turns multiprocessing on.
    multiprocess = args.jobs != 1
    backend = CGPM_Backend(cgpm_registry={}, multiprocess=multiprocess)
    bayeslite.bayesdb_register_backend(bdb, backend)
    bdbshell = shell.Shell(bdb, 'cgpm', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.expanduser('~/.bayesliterc')
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)

        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    # Stop at the first missing file; later files are
                    # not read.
                    bdbshell.stdout.write('%s is not a file. Aborting.\n' %
                        (str(path),))
                    break

        if not args.batch:
            bdbshell.cmdloop()
    return 0
def bayesdb(self, line, cell=None):
    """Open the bdb named on ``line`` and register the CGPM backend.

    ``line`` is split on whitespace and parsed as ``path [-s SEED] [-j]``.
    Any bdb already held in ``self._bdb`` is closed first.  Returns the
    string ``'Loaded: <path>'``.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument('path', help='Path of bdb file.')
    argparser.add_argument('-s', type=int, default=0, help='Seed.')
    argparser.add_argument('-j', action='store_true', help='Multiprocessing.')
    options = argparser.parse_args(line.split())

    # Drop any previously opened database before opening the new one.
    if self._bdb is not None:
        self._bdb.close()
        self._bdb = None

    self._path = options.path
    packed_seed = struct.pack('<QQQQ', 0, 0, 0, options.s)
    self._bdb = bayesdb_open(
        pathname=options.path, seed=packed_seed, builtin_backends=False)

    # Small hack for the VsCGpm, which takes in the venturescript source
    # from %venturescript cells!
    def _VsCGpm(outputs, inputs, rng, *args, **kwds):
        if 'source' not in kwds:
            kwds['source'] = '\n'.join(self._venturescript)
        return VsCGpm(outputs, inputs, rng, *args, **kwds)

    # Register cgpm backend.
    registry = {
        'factor_analysis': FactorAnalysis,
        'inline_venturescript': InlineVsCGpm,
        'linear_regression': LinearRegression,
        'multivariate_kde': MultivariateKde,
        'multivariate_knn': MultivariateKnn,
        'ordinary_least_squares': OrdinaryLeastSquares,
        'random_forest': RandomForest,
        'venturescript': _VsCGpm,
    }
    backend = CGPM_Backend(registry, multiprocess=options.j)
    bayesdb_register_backend(self._bdb, backend)
    return 'Loaded: %s' % (self._path)
def test_bad_analyze_vars():
    """ANALYZE must reject an unknown variable in VARIABLES or SKIP."""
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute(''' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA( SET STATTYPE OF apogee TO NUMERICAL; SET STATTYPE OF class_of_orbit TO NOMINAL; SET STATTYPE OF country_of_operator TO NOMINAL; SET STATTYPE OF launch_mass TO NUMERICAL; SET STATTYPE OF perigee TO NUMERICAL; SET STATTYPE OF period TO NUMERICAL ) ''')
        backend = CGPM_Backend({
            'kepler': Kepler,
            'linreg': LinearRegression,
        })
        bayesdb_register_backend(bdb, backend)

        # Set-up phrases that are all expected to succeed, in order.
        for phrase in (
            ''' CREATE GENERATOR satellites_cgpm FOR satellites USING cgpm ''',
            'INITIALIZE 1 MODEL FOR satellites_cgpm',
            'ANALYZE satellites_cgpm FOR 1 ITERATION ()',
            'ANALYZE satellites_cgpm FOR 1 ITERATION',
        ):
            bdb.execute(phrase)

        # Both phrases name the unknown variable `perige'.
        for bad_phrase in (
            ''' ANALYZE satellites_cgpm FOR 1 ITERATION ( VARIABLES period, perige ) ''',
            ''' ANALYZE satellites_cgpm FOR 1 ITERATION ( SKIP period, perige ) ''',
        ):
            with pytest.raises(BQLError):
                bdb.execute(bad_phrase)
def test_predictive_relevance():
    """Exercise ESTIMATE PREDICTIVE RELEVANCE on the dummy satellites.

    Covers self-relevance of existing rows, FROM and BY forms with
    existing and hypothetical rows, parameter bindings, error cases
    (BQLError), and rows inserted after analysis, for which the
    estimate is asserted to be None.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(bdb, CGPM_Backend(cgpm_registry=dict()))
        bdb.execute(''' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA ( apogee NUMERICAL; class_of_orbit NOMINAL; country_of_operator NOMINAL; launch_mass NUMERICAL; perigee NUMERICAL; period NUMERICAL ) ''')
        bdb.execute('CREATE GENERATOR m FOR satellites;')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')
        bdb.execute('ANALYZE m FOR 25 ITERATION (OPTIMIZED);')

        # Check self-similarites, and also provide coverage of bindings.
        # Each of the first four rows is bound in turn, so the loop
        # checks four different rows rather than repeating rowid 1.
        rowids = bdb.execute('SELECT OID from satellites_ucs;').fetchall()
        for row in rowids[:4]:
            rowid = row[0]
            cursor = bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE TO EXISTING ROWS (rowid = ?) IN THE CONTEXT OF "period" FROM satellites WHERE rowid = ? ''', (rowid, rowid,))
            assert next(cursor)[0] == 1.

        # A full extravaganza query, using FROM (as a 1-row).
        cursor = bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE TO EXISTING ROWS (country_of_operator = 'Russia' AND period < 0) AND HYPOTHETICAL ROWS WITH VALUES ( (perigee=1.0, launch_mass=120), (country_of_operator='Bulgaria', perigee=2.0)) IN THE CONTEXT OF "country_of_operator" FROM satellites LIMIT 5 ''').fetchall()
        assert len(cursor) == 5
        assert all(0 <= c[0] <= 1 for c in cursor)

        # A full extravaganza query, using BY (as a constant).
        cursor = bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (rowid = 1) TO EXISTING ROWS (country_of_operator = 'Russia' AND period < 0) AND HYPOTHETICAL ROWS WITH VALUES ( (country_of_operator='China', perigee=1.0), (country_of_operator='Bulgaria')) IN THE CONTEXT OF "country_of_operator" BY satellites ''').fetchall()
        assert len(cursor) == 1
        assert all(0 <= c[0] <= 1 for c in cursor)

        # Hypothetical satellite with negative perigee should not be similar,
        # and use a binding to just ensure that they work.
        cursor = bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE TO HYPOTHETICAL ROWS WITH VALUES ( (perigee = ?)) IN THE CONTEXT OF "perigee" FROM satellites LIMIT 5 ''' , (-10000,)).fetchall()
        assert len(cursor) == 5
        assert all(np.allclose(c[0], 0) for c in cursor)

        # No matching target OF row.
        with pytest.raises(BQLError):
            bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (rowid < 0) TO EXISTING ROWS (rowid = 10) IN THE CONTEXT OF "launch_mass" BY satellites ''')

        # Unknown CONTEXT variable "banana".
        with pytest.raises(BQLError):
            bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (rowid = 1) TO EXISTING ROWS (rowid = 2) IN THE CONTEXT OF "banana" BY satellites ''')

        # No matching EXISTING ROW.
        with pytest.raises(BQLError):
            bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (rowid = 10) TO EXISTING ROWS (rowid < 0) IN THE CONTEXT OF "launch_mass" BY satellites ''')

        # Unknown nominal values 'Mongolia' in HYPOTHETICAL ROWS.
        with pytest.raises(BQLError):
            bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (rowid = 10) TO HYPOTHETICAL ROWS WITH VALUES ( (country_of_operator='Mongolia'), (country_of_operator='Bulgaria', perigee=2.0)) IN THE CONTEXT OF "launch_mass" BY satellites ''')

        # Create a new row.
        bdb.sql_execute(''' INSERT INTO satellites_ucs (apogee, launch_mass) VALUES (12.128, 12.128) ''')

        # TARGET ROW not yet incorporated: the estimate comes back as None.
        cursor = bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (apogee = 12.128) TO HYPOTHETICAL ROWS WITH VALUES ( (country_of_operator='China', perigee=1.0)) IN THE CONTEXT OF "launch_mass" BY satellites ''')
        result = cursor_value(cursor)
        assert result is None

        # EXISTING ROW not yet incorporated: the estimate comes back as
        # None, since there is no hypothetical.
        cursor = bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (rowid = 1) TO EXISTING ROWS (apogee = 12.128) IN THE CONTEXT OF "launch_mass" BY satellites ''')
        result = cursor_value(cursor)
        assert result is None

        # Although apogee = 12.128 is EXISTING but not incorporated, there are
        # other EXISTING ROWS with apogee > 0, so we should still get a result.
        cursor = bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (rowid = 1) TO EXISTING ROWS (apogee = 12.128 OR apogee > 0) IN THE CONTEXT OF "launch_mass" BY satellites ''')
        result = cursor_value(cursor)
        assert result is not None

        # Although apogee = 12.128 is EXISTING but not incorporated, there are
        # other HYPOTHETICAL ROWS, so we should still get a result.
        cursor = bdb.execute(''' ESTIMATE PREDICTIVE RELEVANCE OF (rowid = 1) TO EXISTING ROWS (apogee = 12.128 OR apogee > 0) AND HYPOTHETICAL ROWS WITH VALUES ( (country_of_operator='China', perigee=1.0), (country_of_operator='Bulgaria')) IN THE CONTEXT OF "launch_mass" BY satellites ''')
        result = cursor_value(cursor)
        assert result is not None
def test_add_drop_models():
    """Check the bayesdb-to-cgpm model number mapping across INITIALIZE
    and DROP MODELS.

    After each change the rows of ``bayesdb_cgpm_modelno`` for generator
    ``m`` are compared with an explicit expected mapping.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute(''' CREATE POPULATION p FOR satellites_ucs WITH SCHEMA( GUESS STATTYPES OF (*); ) ''')
        bdb.execute('CREATE GENERATOR m FOR p (SUBSAMPLE 10);')

        # Retrieve id for testing.
        population_id = bayesdb_get_population(bdb, 'p')
        generator_id = bayesdb_get_generator(bdb, population_id, 'm')

        def check_modelno_mapping(lookup):
            # Assert that the stored (modelno, cgpm_modelno) pairs are
            # exactly those in `lookup`.  Matched entries are deleted,
            # so the dict passed in is consumed by the check.
            pairs = bdb.sql_execute(''' SELECT modelno, cgpm_modelno FROM bayesdb_cgpm_modelno WHERE generator_id = ? ''', (generator_id,))
            for pair in pairs:
                assert lookup[pair[0]] == pair[1]
                del lookup[pair[0]]
            assert len(lookup) == 0

        # Initialize some models.
        bdb.execute('INITIALIZE 16 MODELS FOR m')
        # Assert identity mapping initially.
        check_modelno_mapping({i:i for i in xrange(16)})

        bdb.execute('ANALYZE m FOR 1 ITERATION (QUIET);')

        # Drop some models.
        bdb.execute('DROP MODELS 1, 8-12, 14 FROM m')
        # Assert cgpm models are contiguous while bayesdb models are not, with
        # the mapping preserving the strict order.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
        })

        # Run some analysis again.
        bdb.execute('ANALYZE m FOR 1 ITERATION (OPTIMIZED; QUIET);')

        # Initialize 14 models if not existing.
        bdb.execute('INITIALIZE 14 MODELS IF NOT EXISTS FOR m')
        # Assert cgpm models are 0-14, while bayesdb are 0-15 excluding 14. Note
        # that INITIALIZE 14 MODELS IF NOT EXISTS does not guarantee that 14
        # MODELS in total will exist after the query, rather it will initialize
        # any non-existing modelnos with index 0-13, and any modelnos > 14
        # (modelno 15 in this test case) are untouched.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
            # Recreated models.
            1: 9,
            8: 10,
            9: 11,
            10: 12,
            11: 13,
            12: 14,
        })

        # Drop some more models, add them back with some more, and confirm
        # arithmetic and ordering remains correct.
        bdb.execute('DROP MODELS 0-1 FROM m')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
        })
        bdb.execute('INITIALIZE 20 MODELS IF NOT EXISTS FOR m;')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
            # Re-recreated models.
            0: 13,
            1: 14,
            # New models.
            14: 15,
            16: 16,
            17: 17,
            18: 18,
            19: 19,
        })

        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODELS 20-50 FROM m')
        # Drop all models.
        bdb.execute('DROP MODELS FROM m;')
        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODEL 0 FROM m')
        # Assert cgpm mapping is cleared.
        cursor = bdb.sql_execute(''' SELECT COUNT(*) FROM bayesdb_cgpm_modelno WHERE generator_id = ? ''', (generator_id,))
        assert cursor_value(cursor) == 0
import pytest import shutil import tempfile import bayeslite import bayeslite.core as core from bayeslite import bql_quote_name from bayeslite.backends.cgpm_backend import CGPM_Backend from bayeslite.backends.iid_gaussian import StdNormalBackend examples = { 'cgpm': ( lambda: CGPM_Backend(cgpm_registry={}, multiprocess=False), 't', 'CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)', 'INSERT INTO t (x, y, z) VALUES (?, ?, ?)', [ (0, 1.57, 'foo'), (1.83, 3.141, 'bar'), (1.82, 3.140, 'bar'), (-1, 6.28, 'foo'), ], 'p', 'p_cc', 'CREATE POPULATION p FOR t' '(x NUMERICAL; y NUMERICAL; z NOMINAL)', 'CREATE GENERATOR p_cc FOR p USING cgpm()', 'CREATE GENERATOR p_cc FOR p USING crosscat',
def test_cgpm_extravaganza__ci_slow():
    """End-to-end crash test of a composite CGPM generator.

    Builds a synthetic ``satellites_ucs`` table, checks that malformed
    generator schemas raise BQLError, then creates generator ``g0``
    combining venturescript, linreg and forest overrides and runs
    ANALYZE, ESTIMATE, INFER and SIMULATE queries against it.  The test
    is skipped when the optional cgpm modules cannot be imported.
    """
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        # pytest.skip raises, so nothing after it in this branch runs.
        pytest.skip('no sklearn or venturescript')

    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute(''' CREATE TABLE satellites_ucs ( name, apogee, class_of_orbit, country_of_operator, launch_mass, perigee, period ) ''')
        # Each orbit class has its own function from (apogee, perigee)
        # to period.
        for orbit_class, period_fn in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (country,
                        bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute(
                        ''' INSERT INTO satellites_ucs (name, country_of_operator, launch_mass, class_of_orbit, apogee, perigee, period) VALUES (?,?,?,?,?,?,?) 
''', (name, country, mass, orbit_class, x, y, period_fn(x, y)))

        bdb.execute(''' CREATE POPULATION satellites FOR satellites_ucs ( name IGNORE; apogee NUMERICAL; class_of_orbit NOMINAL; country_of_operator NOMINAL; launch_mass NUMERICAL; perigee NUMERICAL; period NUMERICAL ) ''')

        bdb.execute(''' ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)

        # Misspelled variable `apoge' in SET CATEGORY MODEL.
        with pytest.raises(BQLError):
            bdb.execute(''' CREATE GENERATOR g0 FOR satellites USING cgpm ( SET CATEGORY MODEL FOR apoge TO NORMAL ) ''')
        # Misspelled variable `apoge' as an override input.
        with pytest.raises(BQLError):
            bdb.execute(''' CREATE GENERATOR g0 FOR satellites USING cgpm ( OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg ) ''')
        # LATENT names a variable the population already has.
        with pytest.raises(BQLError):
            bdb.execute(''' CREATE GENERATOR g0 FOR satellites USING cgpm ( LATENT apogee NUMERICAL ) ''')

        bdb.execute(''' CREATE GENERATOR g0 FOR satellites USING cgpm ( SET CATEGORY MODEL FOR apogee TO NORMAL; LATENT kepler_cluster_id NUMERICAL; LATENT kepler_noise NUMERICAL; OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period GIVEN apogee, perigee USING venturescript (source = "{}"); OVERRIDE MODEL FOR perigee GIVEN apogee USING linreg; OVERRIDE MODEL FOR class_of_orbit GIVEN apogee, period, perigee, kepler_noise USING forest (k = 4); SUBSAMPLE 100, ) '''.format(kepler_source))

        # The two latents show up as variable numbers -2 and -1, and
        # only when the generator is given.
        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute(''' ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id) ''')
        bdb.execute(''' 
ANALYZE g0 FOR 1 iteration ( SKIP kepler_cluster_id, kepler_noise, period; ) ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute(''' ANALYZE g0 FOR 1 ITERATION ( SKIP kepler_cluster_id; VARIABLES apogee, perigee; ) ''')
        bdb.execute(''' ANALYZE g0 FOR 1 iteration ( SKIP kepler_cluster_id, kepler_noise, period; ) ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')

        # Crash tests: results are fetched but not inspected.
        bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF kepler_cluster_id WITH period WITHIN satellites MODELED BY g0 ''').fetchall()
        bdb.execute(''' ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1 ''').fetchall()
        bdb.execute(''' ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id FROM satellites MODELED BY g0 LIMIT 1 ''').fetchall()
        bdb.execute(''' ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise FROM satellites MODELED BY g0 LIMIT 1 ''').fetchall()
        bdb.execute(''' ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites LIMIT 1 ''').fetchall()
        bdb.execute(''' INFER EXPLICIT PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf FROM satellites MODELED BY g0 LIMIT 2; ''').fetchall()
        bdb.execute(''' INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf FROM satellites MODELED BY g0 LIMIT 2; ''').fetchall()
        bdb.execute(''' INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf FROM satellites MODELED BY g0 LIMIT 1; ''').fetchall()
        bdb.execute(''' ESTIMATE PROBABILITY DENSITY OF period = 42 GIVEN (apogee = 8 AND perigee = 7) BY satellites ''').fetchall()

        bdb.execute(''' SIMULATE kepler_cluster_id, apogee, perigee, period FROM satellites MODELED BY g0 LIMIT 4 ''').fetchall()

        # Tear down in dependency order: models, generator, population,
        # table.
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
def test_using_modelnos():
    """Exercise the USING MODEL / MODEL clauses on a two-model generator.

    Queries restricted to specific models are run as crash tests,
    per-model ANALYZE is verified through the engine's ``logscore``
    diagnostics, and out-of-range model numbers must raise BQLError.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute(''' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA( SET STATTYPE OF apogee TO NUMERICAL; SET STATTYPE OF class_of_orbit TO NOMINAL; SET STATTYPE OF country_of_operator TO NOMINAL; SET STATTYPE OF launch_mass TO NUMERICAL; SET STATTYPE OF perigee TO NUMERICAL; SET STATTYPE OF period TO NUMERICAL ) ''')
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute(''' CREATE GENERATOR g0 FOR satellites USING cgpm( SUBSAMPLE 10 ); ''')
        bdb.execute('INITIALIZE 2 MODELS FOR g0')
        # Crash test simulate.
        bdb.execute(''' SIMULATE apogee, class_of_orbit FROM satellites MODELED BY g0 USING MODEL 0-1 LIMIT 10 ''')
        # Crash test infer explicit.
        bdb.execute(''' INFER EXPLICIT PREDICT period, perigee FROM satellites MODELED BY g0 USING MODEL 0 LIMIT 2 ''')
        # Crash test dependence probability BY.
        c = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF launch_mass WITH period BY satellites MODELED BY g0 USING MODEL 0 ''')
        # With a single model the estimate is expected to be exactly 0 or 1.
        assert cursor_value(c) in [0, 1]
        # Crash test dependence probability pairwise.
        cursor = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE VARIABLES OF satellites MODELED BY g0 USING MODEL 1 ''')
        for d in cursor:
            assert d[0] in [0, 1]
        # Crash test mutual information 1row.
        bdb.execute(''' ESTIMATE MUTUAL INFORMATION WITH (period) USING 1 SAMPLES FROM VARIABLES OF satellites USING MODEL 0 ''').fetchall()
        # Test analyze on per-model basis.
        bdb.execute(''' ANALYZE g0 MODEL 0 FOR 1 ITERATION CHECKPOINT 1 ITERATION ''')
        # NOTE(review): the literal 1 is presumably the generator id of
        # g0 (the only generator created here) -- confirm against
        # CGPM_Backend._engine.
        engine = bdb.backends['cgpm']._engine(bdb, 1)
        # Only model 0 has been analyzed so far.
        assert len(engine.states[0].diagnostics['logscore']) == 1
        assert len(engine.states[1].diagnostics['logscore']) == 0
        bdb.execute(''' ANALYZE g0 MODEL 1 FOR 4 ITERATION CHECKPOINT 1 ITERATION ( OPTIMIZED ); ''')
        # Model 0 is unchanged; model 1 gained four entries.
        assert len(engine.states[0].diagnostics['logscore']) == 1
        assert len(engine.states[1].diagnostics['logscore']) == 4
        # Some errors with bad modelnos.
        with pytest.raises(BQLError):
            bdb.execute(''' ANALYZE g0 MODEL 0-3 FOR 4 ITERATION ''')
        with pytest.raises(BQLError):
            bdb.execute(''' SIMULATE apogee FROM satellites USING MODEL 25 LIMIT 10; ''')
        with pytest.raises(BQLError):
            bdb.execute(''' ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites USING MODELS 0-8 LIMIT 2; ''')
def test_cgpm_kepler():
    """Exercise generators that override ``period`` with foreign CGPMs.

    ``g0`` overrides period with linreg; ``g1`` overrides it with kepler,
    sets an exponential category model for launch_mass and subsamples 20
    rows.  The test checks the subsample bookkeeping, the allowed and
    rejected ANALYZE forms, and the result shapes of INFER EXPLICIT.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute(''' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA( apogee NUMERICAL; launch_mass NUMERICAL; class_of_orbit NOMINAL; country_of_operator NOMINAL; perigee NUMERICAL; period NUMERICAL ) ''')
        bdb.execute(''' ESTIMATE CORRELATION from PAIRWISE VARIABLES OF satellites ''').fetchall()
        registry = {
            'kepler': Kepler,
            'linreg': LinearRegression,
        }
        bayesdb_register_backend(
            bdb, CGPM_Backend(registry, multiprocess=0))
        bdb.execute(''' CREATE GENERATOR g0 FOR satellites USING cgpm ( OVERRIDE GENERATIVE MODEL FOR period GIVEN apogee, perigee USING linreg ) ''')
        bdb.execute('INITIALIZE 1 MODEL FOR g0')
        # Row count of bayesdb_cgpm_individual before g1 exists.
        c = bdb.execute('SELECT COUNT(*) FROM bayesdb_cgpm_individual')
        n = c.fetchvalue()
        # Another generator: exponential launch mass instead of normal.
        bdb.execute(''' CREATE GENERATOR g1 FOR satellites USING cgpm ( SET CATEGORY MODEL FOR launch_mass TO EXPONENTIAL; OVERRIDE MODEL FOR period GIVEN apogee, perigee USING kepler(quagga = eland); SUBSAMPLE 20 ) ''')
        # Creating g1 with SUBSAMPLE 20 must add exactly 20 rows.
        c_ = bdb.execute('SELECT COUNT(*) FROM bayesdb_cgpm_individual')
        n_ = c_.fetchvalue()
        assert n_ - n == 20
        bdb.execute('INITIALIZE 1 MODEL IF NOT EXISTS FOR g1')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION (VARIABLES period)')
        bdb.execute('ANALYZE g1 FOR 1 ITERATION')
        bdb.execute('ANALYZE g1 FOR 1 ITERATION (VARIABLES period)')
        # OPTIMIZED is ignored because period is a foreign variable.
        bdb.execute(''' ANALYZE g1 FOR 1 ITERATION (OPTIMIZED; VARIABLES period) ''')
        # This should fail since we have a SET CATEGORY MODEL which is not
        # compatible with lovecat. The ValueError is from cgpm not bayeslite.
        with pytest.raises(ValueError):
            bdb.execute(''' ANALYZE g1 FOR 1 ITERATION (OPTIMIZED; VARIABLES launch_mass) ''')
        # Cannot use timed analysis with mixed variables.
        with pytest.raises(BQLError):
            bdb.execute(''' ANALYZE g1 FOR 5 SECONDS (VARIABLES period, apogee) ''')
        # Cannot use timed analysis with mixed variables (period by SKIP).
        with pytest.raises(BQLError):
            bdb.execute(''' ANALYZE g1 FOR 5 SECONDS (SKIP apogee) ''')
        # OK to use iteration analysis with mixed values.
        bdb.execute(''' ANALYZE g1 FOR 1 ITERATION (VARIABLES period, apogee) ''')
        # Crash tests: results are fetched but not inspected.
        bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE VARIABLES OF satellites ''').fetchall()
        bdb.execute(''' ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites ''').fetchall()
        bdb.execute(''' ESTIMATE PROBABILITY DENSITY OF period = 42 GIVEN (apogee = 8 AND perigee = 7) BY satellites ''').fetchall()
        bdb.execute(''' SIMULATE apogee, perigee, period FROM satellites LIMIT 100 ''').fetchall()
        bdb.execute(''' INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_confidence USING 5 SAMPLES FROM satellites LIMIT 2 ''').fetchall()
        # With CONFIDENCE, each row is a (prediction, confidence) pair.
        results = bdb.execute(''' INFER EXPLICIT PREDICT class_of_orbit CONFIDENCE class_of_orbit_confidence FROM satellites LIMIT 2 ''').fetchall()
        assert len(results[0]) == 2
        assert isinstance(results[0][0], unicode)
        assert isinstance(results[0][1], float)
        # No CONFIDENCE specified.
        results = bdb.execute(''' INFER EXPLICIT PREDICT class_of_orbit USING 2 SAMPLES FROM satellites LIMIT 2 ''').fetchall()
        assert len(results[0]) == 1
        assert isinstance(results[0][0], unicode)
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP GENERATOR g1')
def test_analysis_subproblems_basic():
    """Exercise the SUBPROBLEM(S) and ROWS clauses of ANALYZE.

    Each subproblem is run with and without OPTIMIZED; variable
    hyperparameters are tested separately; ROWS is tested with rows
    inside the subsample (must succeed) and outside it (must raise
    BQLError).
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute(''' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA( SET STATTYPE OF apogee TO NUMERICAL; SET STATTYPE OF class_of_orbit TO NOMINAL; SET STATTYPE OF country_of_operator TO NOMINAL; SET STATTYPE OF launch_mass TO NUMERICAL; SET STATTYPE OF perigee TO NUMERICAL; SET STATTYPE OF period TO NUMERICAL ) ''')
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute(''' CREATE GENERATOR g0 FOR satellites USING cgpm( SUBSAMPLE 10 ); ''')
        bdb.execute('INITIALIZE 4 MODELS FOR g0')

        # Test each subproblem individually except for variable hyperparameters.
        for optimized in [
            '',
            'OPTIMIZED;',
        ]:
            for subproblem in [
                'variable clustering',
                'variable clustering concentration',
                'row clustering',
                'row clustering concentration',
            ]:
                bdb.execute(''' ANALYZE g0 MODELS 0,1 FOR 4 ITERATION( SUBPROBLEM %s; %s ); ''' % (subproblem, optimized))

        # Test variable hyperparameters.
        bdb.execute(''' ANALYZE g0 FOR 1 ITERATION ( VARIABLES period, launch_mass; SUBPROBLEM variable hyperparameters; ) ''')
        with pytest.raises(BQLError):
            # OPTIMIZED backend does not support variable hyperparameters.
            bdb.execute(''' ANALYZE g0 FOR 1 SECONDS ( SUBPROBLEM variable hyperparameters; OPTIMIZED; ) ''')

        # Test rows.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        cursor = bdb.execute(
            ''' SELECT table_rowid FROM bayesdb_cgpm_individual WHERE generator_id = ? ''',
            (generator_id, ))
        # Table rowids that belong to g0's subsample, and candidates
        # from 0..19 that do not.
        subsample_rows = [c[0] for c in cursor]
        bad_rows = [i for i in xrange(20) if i not in subsample_rows]
        for optimized in ['', 'OPTIMIZED;']:
            bdb.execute(''' ANALYZE g0 MODEL 3 FOR 1 ITERATION ( VARIABLES class_of_orbit; ROWS %s; SUBPROBLEMS ( row clustering, row clustering concentration ); %s ) ''' % (','.join(map(str, subsample_rows)), optimized))
            with pytest.raises(BQLError):
                # Fail on rows not in the population or subsample.
                bdb.execute(''' ANALYZE g0 MODEL 3 FOR 1 ITERATION ( VARIABLES class_of_orbit; ROWS %s; SUBPROBLEMS ( row clustering, row clustering concentration ); %s ) ''' % (','.join(map(str, bad_rows)), optimized))
def test_subsample():
    """Check that SUBSAMPLE picks rows other than simply the first 100.

    Two populations are guessed from the ``dha`` table; one generator
    uses every row and the other uses ``SUBSAMPLE 100``.  Queries that
    touch rows 1, 2, 101 and 102 are run as crash tests, and the row
    ids recorded for each generator are then compared.
    """
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb, 'hospitals_full', 'dha',
            overrides=[('name', 'key')])
        bayesdb_guess_population(bdb, 'hospitals_sub', 'dha',
            overrides=[('name', 'key')])
        bdb.execute(''' CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm; ''')
        bdb.execute(''' CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm( SUBSAMPLE 100 ) ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        # Crash tests: results are fetched but not inspected.
        bdb.execute(''' ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101 ''').fetchall()
        bdb.execute(''' ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF N_DEATH_ILL FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101 ''').fetchall()
        bdb.execute(''' ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101 ''').fetchall()
        bdb.execute(''' ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE FROM PAIRWISE hospitals_sub WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND (r1._rowid_ = 1 OR r1._rowid_ = 101) ''').fetchall()
        bdb.execute(''' INFER mdcr_spnd_amblnc FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101 ''').fetchall()
        # First 100 table rowids recorded for a generator, in cgpm_rowid
        # order.
        sql = ''' SELECT table_rowid FROM bayesdb_cgpm_individual WHERE generator_id = ? 
ORDER BY cgpm_rowid ASC LIMIT 100 '''
        # The full generator records table rows 1..100 in order.
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full, ))
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        # The subsampled generator must record something other than
        # rows 1..100 in order.
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid, ))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
from bayeslite.quote import bql_quote_name
from bayeslite.read_csv import bayesdb_read_csv
from bayeslite.read_csv import bayesdb_read_csv_file
from bayeslite.schema import bayesdb_upgrade_schema
from bayeslite.txn import BayesDBTxnError
from bayeslite.version import __version__

# XXX This is not a good place for me.  Find me a better home, please!

# Public names exported by the package.
__all__ = [
    'BQLError',
    'BQLParseError',
    'BayesDB',
    'BayesDBException',
    'BayesDBTxnError',
    'bayesdb_deregister_backend',
    'bayesdb_nullify',
    'bayesdb_open',
    'bayesdb_read_csv',
    'bayesdb_read_csv_file',
    'bayesdb_register_backend',
    'bayesdb_upgrade_schema',
    'bql_quote_name',
    'BayesDB_Backend',
    'IBayesDBTracer',
]

# Register cgpm as a builtin backend.
# This runs at import time and uses an empty registry with
# multiprocessing turned on.
# NOTE(review): bayesdb_builtin_backend is not imported in the visible
# part of this module; presumably it is imported further up -- confirm.
from bayeslite.backends.cgpm_backend import CGPM_Backend
bayesdb_builtin_backend(CGPM_Backend({}, multiprocess=True))
def test_regress_bonanza__ci_integration():
    """Exercise REGRESS with numerical, nominal, ``*`` and subquery
    GIVEN clauses.

    For each successful regression the returned (variable, coefficient)
    rows are checked: every variable must be unique and must be either
    an expected numerical name or a ``<nominal>_dum_`` dummy of an
    expected nominal.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute(''' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA( apogee NUMERICAL; class_of_orbit NOMINAL; country_of_operator NOMINAL; launch_mass NUMERICAL; perigee NUMERICAL; period NUMERICAL; ) ''')
        bdb.execute(''' CREATE GENERATOR m FOR satellites; ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            # Every row is a 2-tuple whose first item is a distinct
            # variable name drawn from `numericals` or prefixed by
            # '<nominal>_dum_' for some nominal in `nominals`.
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute(''' REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites; ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute(''' REGRESS apogee GIVEN (country_of_operator) USING 12 SAMPLES BY satellites; ''').fetchall()
        check_regression_variables(
            results, ['intercept'], ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.  The result
        # is bound to `results` so the check below inspects this
        # regression rather than the previous one.
        results = bdb.execute(''' REGRESS apogee GIVEN (perigee, country_of_operator) USING 12 SAMPLES BY satellites; ''').fetchall()
        check_regression_variables(
            results, ['intercept', 'perigee'], ['country_of_operator'])

        # Regression on all variables.
        results = bdb.execute(
            ''' REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites; ''',
            (3, )).fetchall()
        check_regression_variables(
            results,
            [
                'intercept',
                'perigee',
                'launch_mass',
                'period',
            ],
            [
                'country_of_operator',
                'class_of_orbit',
            ],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute(
            ''' REGRESS apogee GIVEN ( satellites.( ESTIMATE * FROM VARIABLES OF satellites ORDER BY dependence probability with apogee DESC LIMIT ? 
) ) USING 12 SAMPLES BY satellites MODELED BY m USING MODEL 1; ''',
            (3, )).fetchall()

        # Recompute the same top-3 variables directly to build the
        # expectation for the regression above.
        cursor = bdb.execute(
            ''' ESTIMATE * FROM VARIABLES OF satellites ORDER BY dependence probability with apogee DESC LIMIT ? ''',
            (3, )).fetchall()
        top_variables = [c[0] for c in cursor]
        nominals = [
            var for var in top_variables
            if var in [
                'country_of_operator',
                'class_of_orbit',
            ]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(
            results, numericals + ['intercept'], nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute(''' REGRESS apogee GIVEN (*, class_of_orbit) USING 1 SAMPLES BY satellites; ''').fetchall()

        # Not enough data for regression, 1 unique nominal variable.
        with pytest.raises(ValueError):
            bdb.execute(''' REGRESS apogee GIVEN (class_of_orbit) USING 1 SAMPLES BY satellites; ''').fetchall()
def test_initialize_with_all_nulls():
    """Initializing a generator over an all-null manifest column fails.

    Column ``a`` of table ``t`` is null in every row.  INITIALIZE must
    raise BQLError while ``a`` is modeled by the default model (as
    numerical or nominal), and must succeed when ``a`` is ignored or
    overridden by the ``barebones`` CGPM.
    """
    # (a, b, c) rows; a is missing everywhere.
    table_rows = [
        (None, None, 3),
        (None, None, 1),
        (None, None, 1),
        (None, -2, 1),
        (None, -5, 1),
        (None, 2, 3),
    ]
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        backend = CGPM_Backend({'barebones': BareBonesCGpm}, multiprocess=0)
        bayesdb_register_backend(bdb, backend)

        # Create table with all missing values for a.
        bdb.sql_execute(''' CREATE TABLE t (a REAL, b REAL, c REAL); ''')
        for table_row in table_rows:
            bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', table_row)

        # Fail when a is numerical and modeled by crosscat.
        bdb.execute(''' CREATE POPULATION p FOR t WITH SCHEMA( SET STATTYPES OF a, b, c TO NUMERICAL ) ''')
        bdb.execute(''' CREATE GENERATOR m FOR p; ''')
        with pytest.raises(BQLError):
            bdb.execute(''' INITIALIZE 2 MODELS FOR m; ''')

        # Fail when a is nominal and modeled by crosscat.
        bdb.execute(''' CREATE POPULATION p2 FOR t WITH SCHEMA( SET STATTYPES OF a TO NOMINAL; SET STATTYPES OF b, c TO NUMERICAL ) ''')
        bdb.execute('CREATE GENERATOR m2 FOR p2;')
        with pytest.raises(BQLError):
            bdb.execute('INITIALIZE 2 MODELS FOR m2;')

        # Succeed when a is ignored.
        bdb.execute(''' CREATE POPULATION p3 FOR t WITH SCHEMA( IGNORE a; SET STATTYPES OF b, c TO NUMERICAL ) ''')
        for phrase in (
            'CREATE GENERATOR m3 FOR p3;',
            'INITIALIZE 2 MODELS FOR m3;',
        ):
            bdb.execute(phrase)

        # Succeed when a is numerical overriden using a dummy CGPM.
        bdb.execute(''' CREATE GENERATOR m4 FOR p( OVERRIDE MODEL FOR a GIVEN b USING barebones ) ''')
        for phrase in (
            'INITIALIZE 2 MODELS FOR m4;',
            'ANALYZE m4 FOR 1 ITERATION',
        ):
            bdb.execute(phrase)
def test_output_stattypes():
    """Check stattype handling for factor-analysis overrides and latents.

    Covers: a population schema that leaves columns without a policy,
    factor analysis over a nominal manifest or nominal latent variable
    (INITIALIZE raises ValueError), a latent name declared twice
    (BQLError), and finally an all-numerical generator on which analysis,
    dependence probability and mutual information queries are run.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        # No policy given for class_of_orbit, perigee, period.
        with pytest.raises(BQLError):
            bdb.execute(
                ' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA('
                ' SET STATTYPES OF apogee, launch_mass TO NUMERICAL;'
                ' SET STATTYPES OF country_of_operator TO NOMINAL'
                ' ) ')
        # Same schema, with the three leftover columns explicitly ignored.
        bdb.execute(
            ' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA('
            ' IGNORE class_of_orbit, perigee, period;'
            ' SET STATTYPES OF apogee, launch_mass TO NUMERICAL;'
            ' SET STATTYPES OF country_of_operator TO NOMINAL'
            ' ) ')

        backend = CGPM_Backend({'factor_analysis': FactorAnalysis})
        bayesdb_register_backend(bdb, backend)

        # Factor analysis over a nominal manifest variable: the generator
        # can be created, but initializing it should crash.
        bdb.execute(
            ' CREATE GENERATOR satellites_g0 FOR satellites('
            ' OVERRIDE MODEL FOR apogee, country_of_operator'
            ' AND EXPOSE pc_1 NUMERICAL'
            ' USING factor_analysis(L=1)'
            ' ) ')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g0')

        # pc_2 appears both in LATENT and in EXPOSE.
        with pytest.raises(BQLError):
            bdb.execute(
                ' CREATE GENERATOR satellites_g1 FOR satellites('
                ' LATENT pc_2 NOMINAL,'
                ' OVERRIDE GENERATIVE MODEL FOR apogee, launch_mass'
                ' AND EXPOSE pc_2 NOMINAL'
                ' USING factor_analysis(L=1)'
                ' ) ')

        # Factor analysis exposing a nominal latent: initializing it
        # should crash.
        bdb.execute(
            ' CREATE GENERATOR satellites_g1 FOR satellites('
            ' OVERRIDE GENERATIVE MODEL FOR apogee, launch_mass'
            ' AND EXPOSE pc_2 NOMINAL'
            ' USING factor_analysis(L=1)'
            ' ) ')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g1')

        # Factor analysis where everything is numerical should be ok.
        bdb.execute(
            ' CREATE GENERATOR satellites_g2 FOR satellites USING cgpm('
            ' LATENT pc_3 NUMERICAL;'
            ' OVERRIDE MODEL FOR apogee, launch_mass, pc_3, pc_4'
            ' USING factor_analysis(L=2);'
            ' LATENT pc_4 NUMERICAL'
            ' ) ')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_g2')
        bdb.execute('ANALYZE satellites_g2 FOR 2 ITERATION')

        # Timed analysis cannot transition baseline and foreign variables.
        with pytest.raises(BQLError):
            bdb.execute(
                ' ANALYZE satellites_g2 FOR 2 SECONDS ('
                ' VARIABLES country_of_operator, apogee, launch_mass, pc_3); ')
        bdb.execute(
            ' ANALYZE satellites_g2 FOR 1 ITERATION ('
            ' VARIABLES apogee, launch_mass); ')

        # Dependence probability: manifest with latent.
        manifest_latent = bdb.execute(
            ' ESTIMATE DEPENDENCE PROBABILITY OF apogee WITH pc_3'
            ' BY satellites MODELED BY satellites_g2; ').fetchall()
        assert manifest_latent[0][0] == 1.

        # Dependence probability: latent with latent.
        latent_latent = bdb.execute(
            ' ESTIMATE DEPENDENCE PROBABILITY OF pc_3 WITH pc_4'
            ' BY satellites MODELED BY satellites_g2; ').fetchall()
        assert latent_latent[0][0] == 1.

        # Mutual information queries only need to run to completion; their
        # values are not inspected.
        # Latent with manifest.
        bdb.execute(
            ' ESTIMATE MUTUAL INFORMATION OF apogee WITH pc_4 USING 1 SAMPLES'
            ' BY satellites MODELED BY satellites_g2; ').fetchall()
        # Latent with latent.
        bdb.execute(
            ' ESTIMATE MUTUAL INFORMATION OF pc_3 WITH pc_4 USING 1 SAMPLES'
            ' BY satellites MODELED BY satellites_g2; ').fetchall()
def test_unknown_stattype():
    """Check how an unrecognized statistical type (QUAGGA) is handled.

    A population using QUAGGA is rejected until the type is inserted into
    bayesdb_stattype.  After that, a generator can only be created when the
    QUAGGA column is given an explicit category model; default modeling and
    using the column in linreg overrides raise BQLError.
    """
    from cgpm.regressions.linreg import LinearRegression

    # Fills relaunches for a single row, addressed by its rowid.
    fill_relaunches_sql = (
        ' UPDATE satellites_ucs'
        ' SET relaunches = (SELECT apogee + perigee)'
        ' WHERE _rowid_ = ? ')

    with cgpm_dummy_satellites_bdb() as bdb:
        # New column relaunches, holding the sum of apogee and perigee.
        bdb.sql_execute('ALTER TABLE satellites_ucs ADD COLUMN relaunches')
        count_cursor = bdb.sql_execute(' SELECT COUNT(*) FROM satellites_ucs ')
        n_rows = count_cursor.next()[0]
        # Rowids are addressed as 1..n_rows.
        for rowid in xrange(1, n_rows + 1):
            bdb.sql_execute(fill_relaunches_sql, (rowid,))

        # Nobody will ever create a QUAGGA statistical type, so at this
        # point the schema must be rejected.
        with pytest.raises(BQLError):
            bdb.execute(
                ' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA('
                ' SET STATTYPES OF apogee, perigee, launch_mass, period'
                ' TO NUMERICAL;'
                ' SET STATTYPE OF class_of_orbit, country_of_operator TO NOMINAL;'
                ' SET STATTYPE OF relaunches TO QUAGGA'
                ' ) ')

        # Invent the statistical type, after which the schema is accepted.
        bdb.sql_execute('INSERT INTO bayesdb_stattype VALUES (?)', ('quagga',))
        bdb.execute(
            ' CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA('
            ' SET STATTYPES OF apogee, perigee, launch_mass, period'
            ' TO NUMERICAL;'
            ' SET STATTYPES OF class_of_orbit, country_of_operator TO NOMINAL;'
            ' SET STATTYPES OF relaunches TO QUAGGA'
            ' ) ')

        backend = CGPM_Backend({
            'kepler': Kepler,
            'linreg': LinearRegression,
        })
        bayesdb_register_backend(bdb, backend)

        # QUAGGA has no default model.
        with pytest.raises(BQLError):
            bdb.execute('CREATE GENERATOR g0 FOR satellites USING cgpm')

        # QUAGGA cannot be used as an input.
        with pytest.raises(BQLError):
            bdb.execute(
                ' CREATE GENERATOR g0 FOR satellites USING cgpm ('
                ' OVERRIDE MODEL FOR relaunches GIVEN apogee USING linreg;'
                ' OVERRIDE MODEL FOR period GIVEN relaunches USING linreg'
                ' ) ')

        # QUAGGA can be modeled once an explicit distribution family is
        # chosen for it.
        bdb.execute(
            ' CREATE GENERATOR g0 FOR satellites USING cgpm ('
            ' SET CATEGORY MODEL FOR relaunches TO POISSON'
            ' ) ')
        bdb.execute(
            ' CREATE GENERATOR g1 FOR satellites USING cgpm ('
            ' SET CATEGORY MODEL FOR relaunches TO POISSON;'
            ' OVERRIDE MODEL FOR period GIVEN relaunches USING linreg'
            ' ) ')