def t1_mp():
    '''Build the t1/p1/p1_cc fixture on a multiprocessing crosscat engine.

    The engine from multiprocessing_crosscat() is wrapped in a
    CrosscatMetamodel and handed to bayesdb(); that object is then passed
    to bayesdb_population() together with the t1 schema and data helpers.
    Returns whatever bayesdb_population() returns.
    '''
    engine = multiprocessing_crosscat()
    bdb_source = bayesdb(metamodel=CrosscatMetamodel(engine))
    # The id column is ignored; the remaining three columns are modelled.
    column_specs = [
        'id IGNORE',
        'label CATEGORICAL',
        'age NUMERICAL',
        'weight NUMERICAL',
    ]
    return bayesdb_population(
        bdb_source, 't1', 'p1', 'p1_cc', t1_schema, t1_data,
        columns=column_specs)
def test_hackmetamodel():
    '''Exercise generator creation while metamodels are (de)registered.

    Only the dotdog metamodel is ever registered with the database, so
    every attempt to create a crosscat generator must raise BQLError.
    Creating a dotdog generator succeeds once per name and raises
    BQLError on any repeat.
    '''
    create_t_cc = 'CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)'
    create_t_dd = 'CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)'
    create_u_dd = 'CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)'
    # Use the context-manager form so the database is closed even when an
    # assertion fails part-way through the test.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
        bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
        bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
        # Nothing is registered yet: both metamodel names are rejected.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_cc)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_dd)
        # NOTE(review): the crosscat metamodel is constructed but never
        # registered in this test -- confirm that is intentional.
        crosscat = local_crosscat()
        crosscat_metamodel = CrosscatMetamodel(crosscat)
        dotdog_metamodel = DotdogMetamodel()
        # Register, deregister, and register again.
        bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
        bayeslite.bayesdb_deregister_metamodel(bdb, dotdog_metamodel)
        bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_cc)
        # First creation of t_dd succeeds; every later attempt must fail.
        bdb.execute(create_t_dd)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_dd)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_cc)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_t_dd)
        # XXX Rest of test originally exercised default metamodel, but
        # syntax doesn't support that now.  Not clear that's wrong either.
        bdb.execute(create_u_dd)
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(create_u_dd)
def test_guess_population():
    '''Check bayesdb_guess_population on a synthetic three-column table.

    Guessing must raise ValueError when y and z are overridden to
    'ignore', and again when the population name already exists.  After
    the one successful guess, bayesdb_variable must hold exactly y
    (nominal) and z (numerical).
    '''
    # Use the context-manager form so the database is closed even when an
    # assertion fails part-way through the test.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bdb.sql_execute('CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)')
        # One row per ordered pair of lowercase letters: x is the
        # two-letter string, y the parity of the summed code points, and
        # z the square root of that sum.
        a_z = range(ord('a'), ord('z') + 1)
        aa_zz = ((c, d) for c in a_z for d in a_z)
        data = (
            (chr(c) + chr(d), (c + d) % 2, math.sqrt(c + d))
            for c, d in aa_zz
        )
        for row in data:
            bdb.sql_execute('INSERT INTO t (x, y, z) VALUES (?, ?, ?)', row)
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with pytest.raises(ValueError):
            # No modelled columns.  (x is key.)
            bayesdb_guess_population(
                bdb, 'p', 't', overrides=[('y', 'ignore'), ('z', 'ignore')])
        bayesdb_guess_population(bdb, 'p', 't')
        with pytest.raises(ValueError):
            # Population already exists.
            bayesdb_guess_population(bdb, 'p', 't')
        variables = bdb.sql_execute('SELECT * FROM bayesdb_variable').fetchall()
        assert variables == [
            (1, None, 1, 'y', 'nominal'),
            (1, None, 2, 'z', 'numerical'),
        ]
def test_impossible_duplicate_dependency():
    '''A schema declaring a pair both dependent and independent is rejected.

    The generator schema lists INDEPENDENT(a,b,c) together with
    DEPENDENT(a,c); executing it must raise BQLError.
    '''
    rows = [(0, 1, 0, 0), (1, 0, 0, 1)]
    # Schema forcing both DEP(a c) and IND(a c).
    contradictory_schema = ''' CREATE GENERATOR bar FOR foo USING crosscat( GUESS(*), id IGNORE, a CATEGORICAL, b CATEGORICAL, c CATEGORICAL, INDEPENDENT(a,b,c), DEPENDENT(a,c), ); '''
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        # Register a seeded local crosscat engine.
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))
        # Load the two-row dataset.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for values in rows:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', values)
        # The impossible schema must be reported as an error.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(contradictory_schema)
def test_crosscat_constraints():
    '''Check the Y constraints that BQL queries pass to the crosscat engine.

    A LocalEngine subclass records the Y argument of each overridden
    engine call in _last_Y.  After each query the recorded value is
    compared with the expected list of triples.
    '''
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        # Every override stores Y on the instance and then defers to the
        # parent engine with keyword arguments.

        def predictive_probability_multistate(
                self, M_c, X_L_list, X_D_list, Y, Q):
            self._last_Y = Y
            # Forwards to the parent's simple_* variant of this method.
            parent = super(FakeEngine, self)
            return parent.simple_predictive_probability_multistate(
                M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)

        def simple_predictive_sample(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            parent = super(FakeEngine, self)
            return parent.simple_predictive_sample(
                seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

        def impute_and_confidence(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            parent = super(FakeEngine, self)
            return parent.impute_and_confidence(
                seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

    engine = FakeEngine(seed=0)
    with bayesdb(metamodel=CrosscatMetamodel(engine)) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        bdb.execute(''' CREATE GENERATOR t1_cc FOR t1 USING crosscat( label CATEGORICAL, age NUMERICAL, weight NUMERICAL ) ''')
        gid = core.bayesdb_get_generator(bdb, 't1_cc')
        # Generator column numbers start at 1 for the modelled columns.
        for expected_colno, name in enumerate(('label', 'age', 'weight'), 1):
            observed = core.bayesdb_generator_column_number(bdb, gid, name)
            assert observed == expected_colno
        from bayeslite.metamodels.crosscat import crosscat_cc_colno
        # The crosscat column numbers are one less than the generator's.
        for colno in (1, 2, 3):
            assert crosscat_cc_colno(bdb, gid, colno) == colno - 1
        bdb.execute('INITIALIZE 1 MODEL FOR t1_cc')
        bdb.execute('ANALYZE t1_cc FOR 1 ITERATION WAIT')
        probability_cursor = bdb.execute(
            'ESTIMATE PROBABILITY OF age = 8 GIVEN (weight = 16)'
            ' BY t1_cc')
        probability_cursor.next()
        assert engine._last_Y == [(28, 2, 16)]
        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM t1_cc WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]
        simulate_cursor = bdb.execute(
            'SIMULATE weight FROM t1_cc GIVEN age = 8 LIMIT 1')
        simulate_cursor.next()
        assert engine._last_Y == [(28, 1, 8)]
def test_crosscat_constraints():
    '''Check the Y constraints that population queries pass to crosscat.

    A LocalEngine subclass records the Y argument of each overridden
    engine call in _last_Y.  After each query against population p1 the
    recorded value is compared with the expected list of triples.  A
    final SIMULATE conditioned on an unknown label must raise BQLError.
    '''
    class FakeEngine(crosscat.LocalEngine.LocalEngine):
        # Every override stores Y on the instance and then defers to the
        # parent engine with keyword arguments.

        def predictive_probability_multistate(
                self, M_c, X_L_list, X_D_list, Y, Q):
            self._last_Y = Y
            # Forwards to the parent's simple_* variant of this method.
            parent = super(FakeEngine, self)
            return parent.simple_predictive_probability_multistate(
                M_c=M_c, X_L_list=X_L_list, X_D_list=X_D_list, Y=Y, Q=Q)

        def simple_predictive_sample(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            parent = super(FakeEngine, self)
            return parent.simple_predictive_sample(
                seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

        def impute_and_confidence(self, seed, M_c, X_L, X_D, Y, Q, n):
            self._last_Y = Y
            parent = super(FakeEngine, self)
            return parent.impute_and_confidence(
                seed=seed, M_c=M_c, X_L=X_L, X_D=X_D, Y=Y, Q=Q, n=n)

    engine = FakeEngine(seed=0)
    with bayesdb(metamodel=CrosscatMetamodel(engine)) as bdb:
        t1_schema(bdb)
        t1_data(bdb)
        bdb.execute(''' CREATE POPULATION p1 FOR t1 ( id IGNORE; label CATEGORICAL; age NUMERICAL; weight NUMERICAL ) ''')
        bdb.execute(''' CREATE GENERATOR p1_cc FOR p1 USING crosscat( label CATEGORICAL, age NUMERICAL, weight NUMERICAL ) ''')
        pid = core.bayesdb_get_population(bdb, 'p1')
        # Population variable numbers start at 1 for the modelled columns.
        for expected_varno, name in enumerate(('label', 'age', 'weight'), 1):
            observed = core.bayesdb_variable_number(bdb, pid, None, name)
            assert observed == expected_varno
        gid = core.bayesdb_get_generator(bdb, pid, 'p1_cc')
        from bayeslite.metamodels.crosscat import crosscat_cc_colno
        # The crosscat column numbers are one less than the variable numbers.
        for colno in (1, 2, 3):
            assert crosscat_cc_colno(bdb, gid, colno) == colno - 1
        bdb.execute('INITIALIZE 1 MODEL FOR p1_cc')
        bdb.execute('ANALYZE p1_cc FOR 1 ITERATION WAIT')
        density_cursor = bdb.execute(
            'ESTIMATE PROBABILITY DENSITY OF age = 8'
            ' GIVEN (weight = 16)'
            ' BY p1')
        density_cursor.next()
        assert engine._last_Y == [(28, 2, 16)]
        bdb.execute("SELECT age FROM t1 WHERE label = 'baz'").next()
        bdb.execute("INFER age FROM p1 WHERE label = 'baz'").next()
        assert engine._last_Y == [(3, 0, 1), (3, 2, 32)]
        simulate_cursor = bdb.execute(
            'SIMULATE weight FROM p1 GIVEN age = 8 LIMIT 1')
        simulate_cursor.next()
        assert engine._last_Y == [(28, 1, 8)]
        # Simulate with an unknown nominal value should throw an error.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute('SIMULATE weight FROM p1 GIVEN label = \'q\' LIMIT 1;')
def t1_mp():
    '''Build the t1/t1_cc generator fixture on a multiprocessing engine.

    The engine from multiprocessing_crosscat() is wrapped in a
    CrosscatMetamodel and handed to bayesdb(); that object is then passed
    to bayesdb_generator() together with the t1 schema and data helpers.
    Returns whatever bayesdb_generator() returns.
    '''
    engine = multiprocessing_crosscat()
    bdb_source = bayesdb(metamodel=CrosscatMetamodel(engine))
    column_specs = [
        'label CATEGORICAL',
        'age NUMERICAL',
        'weight NUMERICAL',
    ]
    return bayesdb_generator(
        bdb_source, 't1', 't1_cc', t1_schema, t1_data,
        columns=column_specs)
def bayesdb(metamodel=None, **kwargs):
    '''Yield an open BayesDB with one metamodel registered; close it after.

    metamodel: metamodel to register.  When None, a CrosscatMetamodel
        wrapping local_crosscat() is used.
    kwargs: forwarded to bayeslite.bayesdb_open(), which is always called
        with builtin_metamodels=False.

    This is a generator that yields the database exactly once and closes
    it in a finally clause.
    NOTE(review): presumably wrapped by a context-manager decorator above
    this definition, since callers use it in a with statement -- confirm.
    '''
    if metamodel is None:
        metamodel = CrosscatMetamodel(local_crosscat())
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False, **kwargs)
    try:
        # Registration happens inside the try so that a failure here still
        # closes the freshly opened database instead of leaking it.
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        yield bdb
    finally:
        bdb.close()
def test_subsample():
    '''Exercise crosscat generators with SUBSAMPLE(OFF) and SUBSAMPLE(100).

    Two populations are guessed from the dha table.  Several queries are
    run against the subsampled population, including rows 101 and 102.
    The bayesdb_crosscat_subsample table must list SQL rowids 1..100 in
    order for the full generator and something else for the subsampled
    one.  Generators and populations are dropped at the end.
    '''
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))
        with open(dha_csv, 'rU') as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', csv_file, header=True, create=True)
        # Same table, same override, two populations.
        for population in ('hospitals_full', 'hospitals_sub'):
            bayesdb_guess_population(
                bdb, population, 'dha', overrides=[('name', 'key')])
        bdb.execute(''' CREATE GENERATOR hosp_full_cc FOR hospitals_full USING crosscat ( SUBSAMPLE(OFF) ) ''')
        bdb.execute(''' CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING crosscat ( SUBSAMPLE(100) ) ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION WAIT')
        # Each query only has to run to completion; results are discarded.
        smoke_queries = (
            'ESTIMATE SIMILARITY TO (_rowid_=2) FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101',
            'ESTIMATE SIMILARITY TO (_rowid_=102) FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101',
            'ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
            ' FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101',
            'ESTIMATE SIMILARITY FROM PAIRWISE hospitals_sub'
            ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
            ' (r1._rowid_ = 1 OR r1._rowid_ = 101)',
            'INFER mdcr_spnd_amblnc FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101',
        )
        for query in smoke_queries:
            bdb.execute(query).fetchall()
        subsample_sql = ''' SELECT sql_rowid FROM bayesdb_crosscat_subsample WHERE generator_id = ? 
ORDER BY cc_row_id ASC LIMIT 100 '''
        # NOTE(review): the comparisons below rely on range() returning a
        # list, as it does under Python 2.
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        full_rowids = [
            row[0] for row in bdb.sql_execute(subsample_sql, (gid_full, ))
        ]
        assert full_rowids == range(1, 100 + 1)
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        sub_rowids = [
            row[0] for row in bdb.sql_execute(subsample_sql, (gid, ))
        ]
        assert sub_rowids != range(1, 100 + 1)
        for drop_statement in (
                'DROP GENERATOR hosp_sub_cc',
                'DROP GENERATOR hosp_full_cc',
                'DROP POPULATION hospitals_sub',
                'DROP POPULATION hospitals_full'):
            bdb.execute(drop_statement)
def test_codebook_value_map():
    '''
    A categorical column in crosscat can only take on a fixed number of
    values v1, v2, ..., vN.  Here the categorical column `city` takes the
    values `RIO, LA, SF, DC`, as specified in the codebook value map.

        INITIALIZE dummy table with only RIO and SF appearing in dataset
        ANALYZE dummy_cc
        INSERT rows with `city` names `LA` and `DC`
        ANALYZE dummy_cc
        SIMULATE specifying `city` = `LA` (throws KeyError)
    '''
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))
        bayeslite.bayesdb_read_csv(
            bdb, 'dummy', dummy_data, header=True, create=True)
        # The codebook loader takes a path, so write the codebook text to
        # a named temporary file and load it before the file goes away.
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as codebook_tmp:
            codebook_path = codebook_tmp.name
            with open(codebook_path, 'w') as codebook_out:
                codebook_out.write(dummy_codebook)
            bayeslite.bayesdb_load_codebook_csv_file(
                bdb, 'dummy', codebook_path)
        bdb.execute(''' CREATE GENERATOR dummy_cc FOR dummy USING crosscat( GUESS(*), kerberos IGNORE, age NUMERICAL, city CATEGORICAL ) ''')
        bdb.execute('INITIALIZE 10 MODELS FOR dummy_cc')
        bdb.execute('ANALYZE dummy_cc FOR 20 ITERATIONS WAIT')
        bdb.execute('SIMULATE age FROM dummy_cc GIVEN city = RIO LIMIT 5')
        bdb.sql_execute(''' INSERT INTO dummy (kerberos, age, city) VALUES ('jackie', 18, 'LA'), ('rocker', 22, 'DC') ''')
        bdb.execute('ANALYZE dummy_cc FOR 20 ITERATIONS WAIT')
        # The query is issued as in the original; its cursor is never read.
        _unread_cursor = bdb.sql_execute('SELECT * FROM dummy')
        with pytest.raises(KeyError):
            bdb.execute('SIMULATE age FROM dummy_cc GIVEN city = LA LIMIT 5')
def test_impossible_nontransitive_dependency():
    '''Non-transitive dependency constraints fail at INITIALIZE time.

    Dependence is not transitive in general, but crosscat assumes
    transitive closure under dependency constraints, so the test is valid
    for a crosscat local engine.  Transitivity under independence is not
    forced by crosscat.  If CrossCat changes how it handles impossible
    constraints (such as random dropout), this test needs updating.
    '''
    rows = [(0, 1, 0, 0), (1, 0, 0, 1)]
    # Schema forcing DEP(a b), DEP(b c), and IND(a c), which is
    # non-transitive.
    nontransitive_schema = ''' CREATE GENERATOR bar FOR foo USING crosscat( GUESS(*), id IGNORE, a CATEGORICAL, b CATEGORICAL, c CATEGORICAL, DEPENDENT(a,b), DEPENDENT(b,c), INDEPENDENT(a,c) ); '''
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        # Register a seeded local crosscat engine.
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))
        # Load the two-row dataset.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for values in rows:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', values)
        # Creating the generator should succeed.
        bdb.execute(nontransitive_schema)
        # Error thrown when initializing since no initial state exists.
        # XXX Currently CrossCat throws a RuntimeError, we should fix
        # the CrossCat exception hierarchy.
        with pytest.raises(RuntimeError):
            bdb.execute('INITIALIZE 10 MODELS FOR bar')
def run(stdin, stdout, stderr, argv):
    '''Run the bayeslite shell with a crosscat metamodel registered.

    stdin, stdout, stderr: streams handed to the shell; stderr also
        receives usage errors.
    argv: argument vector.  argv[0] is the program name used in error
        messages and argv[1:] is parsed by parse_args().

    Returns 1 after a usage error (no database path and no memory flag,
    or a database path of '-'), and 0 otherwise.
    '''
    args = parse_args(argv[1:])
    # Strip any directory prefix from the program name.  rfind() returns
    # -1 when there is no slash, so slicing from rfind() + 1 keeps the
    # whole name in that case and also handles a slash at index 0, which
    # the previous truthiness test on the index skipped.
    progname = argv[0]
    progname = progname[progname.rfind('/') + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname, ))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname, ))
        return 1
    bdb = bayeslite.bayesdb_open(
        pathname=args.bdbpath, builtin_metamodels=False)
    # Pick the engine: anything other than exactly one job uses the
    # multiprocessing engine, with a non-positive job count passed on as
    # cpu_count=None.
    if args.jobs != 1:
        import crosscat.MultiprocessingEngine as ccme
        jobs = args.jobs if args.jobs > 0 else None
        engine = ccme.MultiprocessingEngine(seed=args.seed, cpu_count=jobs)
    else:
        import crosscat.LocalEngine as ccle
        engine = ccle.LocalEngine(seed=args.seed)
    metamodel = CrosscatMetamodel(engine)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    bdbshell = shell.Shell(bdb, 'crosscat', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        # Read the per-user init file first, unless disabled.
        if not args.no_init_file:
            init_file = os.path.expanduser('~/.bayesliterc')
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)
        # Then read each file named on the command line, stopping at the
        # first path that is not a regular file.
        if args.file is not None:
            for path in args.file:
                if not os.path.isfile(path):
                    bdbshell.stdout.write(
                        '%s is not a file. Aborting.\n' % (str(path), ))
                    break
                bdbshell.dot_read(path)
        # Enter the interactive loop unless running in batch mode.
        if not args.batch:
            bdbshell.cmdloop()
    return 0
def test_simulate_drawconstraint():
    '''A SIMULATE that also draws its GIVEN column returns the given value.

    ttl_mdcr_spnd is both simulated and constrained to 40000, so the
    first element of each of the 100 simulated rows must be 40000.
    '''
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))
        with open(dha_csv, 'rU') as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', csv_file, header=True, create=True)
        bdb.execute(''' CREATE GENERATOR dha_cc FOR dha USING crosscat ( GUESS(*), name KEY ) ''')
        bdb.execute('INITIALIZE 1 MODEL FOR dha_cc')
        bdb.execute('ANALYZE dha_cc FOR 1 ITERATION WAIT')
        simulate_cursor = bdb.execute(''' SIMULATE ttl_mdcr_spnd, n_death_ill FROM dha_cc GIVEN TTL_MDCR_SPND = 40000 LIMIT 100 ''')
        samples = simulate_cursor.fetchall()
        constrained_draws = [sample[0] for sample in samples]
        assert constrained_draws == [40000] * 100
def test_legacy_models__ci_slow():
    '''Load legacy crosscat models for the dha table and query them.

    Loading the models before the dha table exists must raise ValueError.
    After the CSV is read, loading succeeds, a guessed generator can be
    created alongside it, and the codebook can be loaded twice.  Two
    ESTIMATE queries must then return the exact expected names in order.
    '''
    # Use the context-manager form so the database is closed even when an
    # assertion fails part-way through the test.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        # The dha table has not been created yet.
        with pytest.raises(ValueError):
            bayeslite.bayesdb_load_legacy_models(
                bdb, 'dha_cc', 'dha', 'crosscat', dha_models, create=True)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayeslite.bayesdb_load_legacy_models(
            bdb, 'dha_cc', 'dha', 'crosscat', dha_models, create=True)
        # Make sure guessing also works.
        bdb.execute('create generator dha_cc0 for dha using crosscat(guess(*))')
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        # Need to be able to overwrite existing codebook.
        #
        # XXX Not sure this is the right API.  What if overwrite is a
        # mistake?
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        bql = ''' ESTIMATE name FROM dha_cc ORDER BY SIMILARITY TO (name = ?) DESC LIMIT 10 '''
        with bdb.savepoint():
            assert bdb.execute(bql, ('Albany NY', )).fetchall() == [
                ('Albany NY', ),
                ('Scranton PA', ),
                ('United States US', ),
                ('Norfolk VA', ),
                ('Reading PA', ),
                ('Salisbury MD', ),
                ('Louisville KY', ),
                ('Cleveland OH', ),
                ('Covington KY', ),
                ('Akron OH', ),
            ]
        # Tickles an issue in case-folding of column names.
        bql = ''' ESTIMATE name FROM dha_cc ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC LIMIT 10 '''
        with bdb.savepoint():
            assert bdb.execute(bql).fetchall() == [
                ('McAllen TX', ),
                ('Worcester MA', ),
                ('Beaumont TX', ),
                ('Temple TX', ),
                ('Corpus Christi TX', ),
                ('Takoma Park MD', ),
                ('Kingsport TN', ),
                ('Bangor ME', ),
                ('Lebanon NH', ),
                ('Panama City FL', ),
            ]
import pytest import tempfile import crosscat.LocalEngine import bayeslite import bayeslite.core as core from bayeslite import bql_quote_name from bayeslite.metamodels.crosscat import CrosscatMetamodel from bayeslite.metamodels.iid_gaussian import StdNormalMetamodel examples = { 'crosscat': ( lambda: CrosscatMetamodel(crosscat.LocalEngine.LocalEngine(seed=0)), 't', 'CREATE TABLE t(x NUMERIC, y CYCLIC, z CATEGORICAL)', 'INSERT INTO t (x, y, z) VALUES (?, ?, ?)', [ (0, 1.57, 'foo'), (1.83, 3.141, 'bar'), (1.82, 3.140, 'bar'), (-1, 6.28, 'foo'), ], 'p', 'p_cc', 'CREATE POPULATION p FOR t' '(x NUMERICAL; y CYCLIC; z CATEGORICAL)', 'CREATE GENERATOR p_cc FOR p USING crosscat()', 'CREATE GENERATOR p_cc FOR p USING crosscat(DEPENDENT)',
def test_complex_dependencies__ci_slow():
    '''Forced (in)dependence constraints hold before and after analysis.

    A synthetic table is generated with numerical columns x, y, z and
    categorical columns v (derived from x) and w (drawn independently).
    The schema forces IND(x y), IND(x v) and DEP(z v w).  The pairwise
    dependence probabilities for those pairs must be exactly 0 or 1 both
    right after INITIALIZE and after ANALYZE.

    The data comes from the unseeded numpy global random state, so the
    rows differ between runs.
    '''
    # Parameterize number of rows in synthetic dataset.
    n_rows = 250
    # Add an id column to ensure generator and cc colnos are different.
    ids = np.arange(n_rows)
    # Create real-valued data, such that DEP(x,y), DEP(y,z), and IND(x,z)
    mean = [4, -2, -11]
    cov = [[3.0, 0.7, 0.0], [0.7, 4.0, 0.6], [0.0, 0.6, 2.0]]
    numerical_data = np.random.multivariate_normal(mean, cov, size=n_rows)
    x, y, z = numerical_data[:, 0], numerical_data[:, 1], numerical_data[:, 2]
    # Create categorical data v, highly dependent on x.
    bins = [np.percentile(x, p) for p in range(0, 101, 10)]
    v = np.digitize(x, bins)
    # Create categorical data, independent of all other columns.
    w = np.random.choice(range(8), size=n_rows)
    # Stack in the same order as the table columns (id,x,y,z,v,w).  The
    # previous order put w before v, so each of the two categorical
    # columns was loaded with the other's data.
    data = np.vstack((ids, x, y, z, v, w)).T
    # Dependence probability expected for each forced pair; a pair is
    # looked up in either column order.
    forced_dependence = {
        ('x', 'y'): 0,  # IND(x y)
        ('x', 'v'): 0,  # IND(x v)
        ('z', 'v'): 1,  # DEP(z v)
        ('z', 'w'): 1,  # DEP(z w)
    }
    # Create the database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)
        # Read the dataset.
        bdb.sql_execute('CREATE TABLE foo(id,x,y,z,v,w)')
        for row in data:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?,?,?)', row)
        # Create schema, we will force IND(x y), IND(x v), and DEP(z v w).
        bql = ''' CREATE GENERATOR bar FOR foo USING crosscat( GUESS(*), id IGNORE, x NUMERICAL, y NUMERICAL, z NUMERICAL, v CATEGORICAL, w CATEGORICAL, INDEPENDENT(x, y), INDEPENDENT(x, v), DEPENDENT(z, v, w) ); '''
        bdb.execute(bql)

        # Prepare the checker function.
        def check_dependencies():
            bql = ''' ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE COLUMNS OF bar '''
            for _id, col1, col2, dep in bdb.execute(bql):
                expected = forced_dependence.get(
                    (col1, col2), forced_dependence.get((col2, col1)))
                # Pairs without a forced constraint are not checked.
                if expected is not None:
                    assert dep == expected

        # Test dependency pre-analysis.
        bdb.execute('INITIALIZE 10 MODELS FOR bar')
        check_dependencies()
        # Test dependency post-analysis.
        bdb.execute('ANALYZE bar for 10 ITERATION WAIT')
        check_dependencies()
def new_cc_metamodel(prng):
    '''Return a CrosscatMetamodel on a fresh, randomly seeded local engine.

    prng: object whose randint(0, 2**31) call supplies the engine seed.
    '''
    seed = prng.randint(0, 2**31)
    engine = CrosscatLocalEngine(seed=seed)
    return CrosscatMetamodel(engine)
'BayesDB', 'BayesDBException', 'BayesDBTxnError', 'bayesdb_deregister_metamodel', 'bayesdb_load_codebook_csv_file', 'bayesdb_load_legacy_models', 'bayesdb_open', 'bayesdb_read_csv', 'bayesdb_read_csv_file', 'bayesdb_register_metamodel', 'bql_quote_name', 'IBayesDBMetamodel', 'IBayesDBTracer', '__version__', ] from bayeslite.metamodels.crosscat import CrosscatMetamodel from crosscat.LocalEngine import LocalEngine as CrosscatLocalEngine bayesdb_builtin_metamodel(CrosscatMetamodel(CrosscatLocalEngine(seed=0))) import bayeslite.remote import os if not 'BAYESDB_DISABLE_VERSION_CHECK' in os.environ: bayeslite.remote.version_check() # Notebooks should contain comment lines documenting this behavior and # offering a solution, like so: # Please keep BayesDB up to date. To disable remote version checking: # import os; os.environ['BAYESDB_DISABLE_VERSION_CHECK'] = '1'
def test_correlation():
    '''Check ESTIMATE CORRELATION and CORRELATION PVALUE over column pairs.

    For the empty table u, every pair must report None for both values.
    For table t (three distinct rows repeated ten times) the pairwise
    results are compared row by row with a fixed expected list.  Names
    and generator ids must match exactly; the numbers must be equal or
    within a relative error of 1e-10 (correlation) or 1e-1 (p-value).
    '''
    pairwise_query = (
        'ESTIMATE CORRELATION, CORRELATION PVALUE'
        ' FROM PAIRWISE COLUMNS OF %s'
        ' WHERE name0 < name1'
        ' ORDER BY name0, name1'
    )
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        # Empty table: no correlation can be computed for any pair.
        bdb.sql_execute('CREATE TABLE u(id, c0, c1, n0, n1, r0, r1)')
        bdb.execute(''' CREATE GENERATOR u_cc FOR u USING crosscat ( c0 CATEGORICAL, c1 CATEGORICAL, n0 NUMERICAL, n1 NUMERICAL, r0 CYCLIC, r1 CYCLIC, ) ''')
        assert bdb.execute(pairwise_query % ('u_cc', )).fetchall() == [
            (1, 'c0', 'c1', None, None),
            (1, 'c0', 'n0', None, None),
            (1, 'c0', 'n1', None, None),
            (1, 'c0', 'r0', None, None),
            (1, 'c0', 'r1', None, None),
            (1, 'c1', 'n0', None, None),
            (1, 'c1', 'n1', None, None),
            (1, 'c1', 'r0', None, None),
            (1, 'c1', 'r1', None, None),
            (1, 'n0', 'n1', None, None),
            (1, 'n0', 'r0', None, None),
            (1, 'n0', 'r1', None, None),
            (1, 'n1', 'r0', None, None),
            (1, 'n1', 'r1', None, None),
            (1, 'r0', 'r1', None, None),
        ]

        # Populated table: ten copies of three rows, with a 1-based id
        # prepended to each.
        bdb.sql_execute('CREATE TABLE t'
            '(id, c0, c1, cx, cy, n0, n1, nc, nl, nx, ny)')
        data = [
            ('foo', 'quagga', 'x', 'y', 0, -1, +1, 1, 0, 13),
            ('bar', 'eland', 'x', 'y', 87, -2, -1, 2, 0, 13),
            ('baz', 'caribou', 'x', 'y', 92.1, -3, +1, 3, 0, 13),
        ] * 10
        for i, row in enumerate(data):
            row = (i + 1, ) + row
            bdb.sql_execute('INSERT INTO t VALUES (?,?,?,?,?,?,?,?,?,?,?)',
                row)
        bdb.execute(''' CREATE GENERATOR t_cc FOR t USING crosscat ( c0 CATEGORICAL, c1 CATEGORICAL, cx CATEGORICAL, cy CATEGORICAL, n0 NUMERICAL, n1 NUMERICAL, nc NUMERICAL, nl NUMERICAL, nx NUMERICAL, ny NUMERICAL ) ''')
        result = bdb.execute(pairwise_query % ('t_cc', )).fetchall()
        expected = [
            (2, 'c0', 'c1', 1., 2.900863120340436e-12),
            (2, 'c0', 'cx', None, None),
            (2, 'c0', 'cy', None, None),
            (2, 'c0', 'n0', 1., 0.),
            (2, 'c0', 'n1', 1., 0.),
            (2, 'c0', 'nc', 1., 0.),
            (2, 'c0', 'nl', 1., 0.),
            (2, 'c0', 'nx', None, None),
            (2, 'c0', 'ny', None, None),
            (2, 'c1', 'cx', None, None),
            (2, 'c1', 'cy', None, None),
            (2, 'c1', 'n0', 1., 0.),
            (2, 'c1', 'n1', 1., 0.),
            (2, 'c1', 'nc', 1., 0.),
            (2, 'c1', 'nl', 1., 0.),
            (2, 'c1', 'nx', None, None),
            (2, 'c1', 'ny', None, None),
            (2, 'cx', 'cy', None, None),
            (2, 'cx', 'n0', None, None),
            (2, 'cx', 'n1', None, None),
            (2, 'cx', 'nc', None, None),
            (2, 'cx', 'nl', None, None),
            (2, 'cx', 'nx', None, None),
            (2, 'cx', 'ny', None, None),
            (2, 'cy', 'n0', None, None),
            (2, 'cy', 'n1', None, None),
            (2, 'cy', 'nc', None, None),
            (2, 'cy', 'nl', None, None),
            (2, 'cy', 'nx', None, None),
            (2, 'cy', 'ny', None, None),
            (2, 'n0', 'n1', 0.7913965673596881, 0.),
            (2, 'n0', 'nc', 0.20860343264031175, 0.0111758925135),
            (2, 'n0', 'nl', 0.7913965673596881, 0.),
            (2, 'n0', 'nx', None, None),
            (2, 'n0', 'ny', None, None),
            (2, 'n1', 'nc', 0., 1.),
            (2, 'n1', 'nl', 1., 0.),
            (2, 'n1', 'nx', None, None),
            (2, 'n1', 'ny', None, None),
            (2, 'nc', 'nl', 0., 1.),
            (2, 'nc', 'nx', None, None),
            (2, 'nc', 'ny', None, None),
            (2, 'nl', 'nx', None, None),
            (2, 'nl', 'ny', None, None),
            (2, 'nx', 'ny', None, None),
        ]
        # zip() stops at the shorter sequence, so check the row count
        # first; otherwise missing or extra result rows would go unnoticed.
        assert len(result) == len(expected)
        for expected_item, observed_item in zip(expected, result):
            (xpd_genid, xpd_name0, xpd_name1, xpd_corr, xpd_corr_p) = \
                expected_item
            (obs_genid, obs_name0, obs_name1, obs_corr, obs_corr_p) = \
                observed_item
            assert xpd_genid == obs_genid
            assert xpd_name0 == obs_name0
            assert xpd_name1 == obs_name1
            # Exact equality is tried first, which also covers the None
            # entries; otherwise a relative-error bound applies.
            assert xpd_corr == obs_corr or relerr(xpd_corr, obs_corr) < 1e-10
            assert (xpd_corr_p == obs_corr_p or
                    relerr(xpd_corr_p, obs_corr_p) < 1e-1)