def test_subsample():
    """Exercise the SUBSAMPLE option of the cgpm backend.

    Two populations are guessed from the same `dha` table.  One generator
    is created without subsampling and one with `SUBSAMPLE 100`.  A handful
    of ESTIMATE/INFER queries touching rowids 1, 2, 101 and 102 are run
    against the subsampled population purely to check that they complete;
    their results are discarded.  Finally the first 100 rows recorded in
    `bayesdb_cgpm_individual` are compared with rowids 1..100: the full
    generator must match them exactly, the subsampled one must not.
    """
    # Materialize the expected rowids as a list.  On Python 3 `range()`
    # returns a lazy range object that never compares equal to a list, which
    # would make the `==` check below always fail and the `!=` check pass
    # vacuously.  `list(range(...))` behaves the same on Python 2.
    first_hundred = list(range(1, 100 + 1))
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        # NOTE(review): the 'U' mode flag was removed in Python 3.11; it is
        # kept here so behavior on Python 2 is unchanged.
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb, 'hospitals_full', 'dha',
            overrides=[('name', 'key')])
        bayesdb_guess_population(bdb, 'hospitals_sub', 'dha',
            overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        # Smoke-test queries: only successful completion matters.
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF
            N_DEATH_ILL FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND
                (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''
        # Without subsampling the first 100 recorded rows are rowids 1..100.
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == first_hundred
        # With SUBSAMPLE 100 the recorded rows must differ from 1..100.
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != first_hundred
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
def test_simulate_drawconstraint_error__ci_slow():
    """SIMULATE rejects a variable that is both drawn and constrained.

    After building a one-model cgpm generator over the `dha` table, a
    SIMULATE that lists `ttl_mdcr_spnd` both as a target and in GIVEN must
    raise ValueError, while the same query without the overlap must return
    100 single-column rows.
    """
    model_setup = (
        'CREATE GENERATOR hospital_cc FOR hospital USING cgpm;',
        'INITIALIZE 1 MODEL FOR hospital_cc',
        'ANALYZE hospital_cc FOR 1 ITERATION (OPTIMIZED);',
    )
    overlapping_query = (
        ' SIMULATE ttl_mdcr_spnd, n_death_ill FROM hospital'
        ' GIVEN ttl_mdcr_spnd = 40000 LIMIT 100 '
    )
    disjoint_query = (
        ' SIMULATE n_death_ill FROM hospital'
        ' GIVEN ttl_mdcr_spnd = 40000 LIMIT 100 '
    )
    with bayeslite.bayesdb_open() as bdb:
        with open(dha_csv, 'rU') as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', csv_file, header=True, create=True)
        bdb.backends['cgpm'].set_multiprocess(False)
        bayesdb_guess_population(
            bdb, 'hospital', 'dha', overrides=[('name', 'key')])
        for statement in model_setup:
            bdb.execute(statement)
        # ttl_mdcr_spnd appears among both the query and the condition
        # variables.  The error surfaces as a plain ValueError because it
        # is the CGPM runtime, not cgpm_backend, that catches the problem.
        with pytest.raises(ValueError):
            bdb.execute(overlapping_query).fetchall()
        rows = bdb.execute(disjoint_query).fetchall()
        assert len(rows) == 100
        assert all(len(row) == 1 for row in rows)
def test_simulate_drawconstraint_error__ci_slow():
    """Simulating a variable that is also a GIVEN constraint is an error.

    Builds a one-model cgpm generator over the `dha` table, then checks
    that SIMULATE raises ValueError when `ttl_mdcr_spnd` is both a target
    and a constraint, and yields 100 one-column rows when it is only a
    constraint.
    """
    def simulate(bdb, targets):
        # Run SIMULATE for `targets` under the fixed ttl_mdcr_spnd constraint
        # and return all resulting rows.
        query = (' SIMULATE ' + targets + ' FROM hospital'
                 ' GIVEN ttl_mdcr_spnd = 40000 LIMIT 100 ')
        return bdb.execute(query).fetchall()

    with bayeslite.bayesdb_open() as bdb:
        with open(dha_csv, 'rU') as handle:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', handle, header=True, create=True)
        cgpm_backend = bdb.backends['cgpm']
        cgpm_backend.set_multiprocess(False)
        key_override = [('name', 'key')]
        bayesdb_guess_population(
            bdb, 'hospital', 'dha', overrides=key_override)
        bdb.execute('CREATE GENERATOR hospital_cc FOR hospital USING cgpm;')
        bdb.execute('INITIALIZE 1 MODEL FOR hospital_cc')
        bdb.execute('ANALYZE hospital_cc FOR 1 ITERATION (OPTIMIZED);')

        # The condition variables and the query variables both include
        # ttl_mdcr_spnd.  A ValueError is expected since the CGPM runtime,
        # not cgpm_backend, is what detects the conflict.
        with pytest.raises(ValueError):
            simulate(bdb, 'ttl_mdcr_spnd, n_death_ill')

        draws = simulate(bdb, 'n_death_ill')
        assert len(draws) == 100
        assert all(len(draw) == 1 for draw in draws)
def test_subsample():
    """Exercise the SUBSAMPLE option of the crosscat metamodel.

    Two populations are guessed from the same `dha` table.  One generator
    is created with `SUBSAMPLE(OFF)` and one with `SUBSAMPLE(100)`.  A
    handful of ESTIMATE/INFER queries touching rowids 1, 2, 101 and 102
    are run against the subsampled population purely to check that they
    complete; their results are discarded.  Finally the first 100 rows
    recorded in `bayesdb_crosscat_subsample` are compared with rowids
    1..100: the full generator must match them exactly, the subsampled
    one must not.
    """
    # Materialize the expected rowids as a list.  On Python 3 `range()`
    # returns a lazy range object that never compares equal to a list, which
    # would make the `==` check below always fail and the `!=` check pass
    # vacuously.  `list(range(...))` behaves the same on Python 2.
    first_hundred = list(range(1, 100 + 1))
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        # NOTE(review): the 'U' mode flag was removed in Python 3.11; it is
        # kept here so behavior on Python 2 is unchanged.
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb, 'hospitals_full', 'dha',
            overrides=[('name', 'key')])
        bayesdb_guess_population(bdb, 'hospitals_sub', 'dha',
            overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full
                USING crosscat (
                    SUBSAMPLE(OFF)
                )
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub
                USING crosscat (
                    SUBSAMPLE(100)
                )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION WAIT')
        # Smoke-test queries: only successful completion matters.
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=2) FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=102) FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
            ' FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE hospitals_sub'
            ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
            ' (r1._rowid_ = 1 OR r1._rowid_ = 101)').fetchall()
        bdb.execute('INFER mdcr_spnd_amblnc FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        sql = '''
            SELECT sql_rowid FROM bayesdb_crosscat_subsample
                WHERE generator_id = ?
                ORDER BY cc_row_id ASC
                LIMIT 100
        '''
        # With subsampling off the first 100 recorded rows are rowids 1..100.
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == first_hundred
        # With SUBSAMPLE(100) the recorded rows must differ from 1..100.
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != first_hundred
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
def test_simulate_drawconstraint():
    """Simulating a constrained variable echoes the constrained value.

    With a one-model crosscat metamodel over the `dha` table, SIMULATE of
    `ttl_mdcr_spnd` GIVEN that same variable equal to 40000 must return
    40000 in the first column of each of the 100 rows.
    """
    simulate_query = (
        ' SIMULATE ttl_mdcr_spnd, n_death_ill FROM hospital'
        ' GIVEN TTL_MDCR_SPND = 40000 LIMIT 100 '
    )
    with bayeslite.bayesdb_open() as bdb:
        with open(dha_csv, 'rU') as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', csv_file, header=True, create=True)
        bayesdb_guess_population(
            bdb, 'hospital', 'dha', overrides=[('name', 'key')])
        for statement in (
                'CREATE METAMODEL hospital_cc FOR hospital USING crosscat()',
                'INITIALIZE 1 MODEL FOR hospital_cc',
                'ANALYZE hospital_cc FOR 1 ITERATION WAIT'):
            bdb.execute(statement)
        rows = bdb.execute(simulate_query).fetchall()
        # The first column is the constrained variable itself.
        constrained_column = [row[0] for row in rows]
        expected = [40000] * 100
        assert constrained_column == expected
def dot_guess(self, line):
    '''guess population schema
    <population> <table>

    Create a population named <population> with variables
    corresponding to columns in table <table>, heuristically guessing
    their statistical types.
    '''
    # The docstring wording is kept as-is; it may double as user-facing
    # help text -- TODO confirm against the enclosing shell class.
    # XXX Whitespace splitting is a crude tokenizer.
    words = line.split()
    if len(words) == 2:
        population, table = words
        try:
            guess.bayesdb_guess_population(self._bdb, population, table)
        except Exception:
            # Report the failure on the shell's output instead of
            # propagating it to the caller.
            self.stdout.write(traceback.format_exc())
    else:
        self.stdout.write('Usage: .guess <population> <table>\n')
def test_guess_population():
    """Check bayesdb_guess_population on a small synthetic table.

    The table holds one row per ordered pair of lowercase letters: `x` is
    the two-letter string, `y` the parity of the summed character codes and
    `z` the square root of that sum.  Guessing must fail when both `y` and
    `z` are ignored, succeed once, fail on a second attempt with the same
    population name, and leave exactly the two expected rows in
    `bayesdb_variable`.
    """
    insert_sql = 'INSERT INTO t (x, y, z) VALUES (?, ?, ?)'
    with bayeslite.bayesdb_open() as bdb:
        bdb.sql_execute('CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)')
        letter_codes = range(ord('a'), ord('z') + 1)
        for first in letter_codes:
            for second in letter_codes:
                code_sum = first + second
                values = (
                    chr(first) + chr(second),
                    code_sum % 2,
                    math.sqrt(code_sum),
                )
                bdb.sql_execute(insert_sql, values)
        ignore_rest = [('y', 'ignore'), ('z', 'ignore')]
        with pytest.raises(ValueError):
            # No modeled columns.  (x is key.)
            bayesdb_guess_population(bdb, 'p', 't', overrides=ignore_rest)
        bayesdb_guess_population(bdb, 'p', 't')
        with pytest.raises(ValueError):
            # Population already exists.
            bayesdb_guess_population(bdb, 'p', 't')
        expected_variables = [
            (1, None, 1, 'y', 'nominal'),
            (1, None, 2, 'z', 'numerical'),
        ]
        recorded = bdb.sql_execute('SELECT * FROM bayesdb_variable').fetchall()
        assert recorded == expected_variables
def test_guess_population():
    """Verify population guessing, its error cases, and the stored variables.

    Fills table `t` with one row for every ordered pair of lowercase
    letters, then checks that bayesdb_guess_population raises ValueError
    when every non-key column is ignored, succeeds on a fresh name, raises
    ValueError when the population already exists, and records `y` and `z`
    in `bayesdb_variable` with the expected statistical types.
    """
    with bayeslite.bayesdb_open() as bdb:
        bdb.sql_execute('CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)')
        alphabet = range(ord('a'), ord('z') + 1)
        rows = [
            (chr(hi) + chr(lo), (hi + lo) % 2, math.sqrt(hi + lo))
            for hi in alphabet
            for lo in alphabet
        ]
        for record in rows:
            bdb.sql_execute('INSERT INTO t (x, y, z) VALUES (?, ?, ?)', record)

        # No modeled columns.  (x is key.)
        with pytest.raises(ValueError):
            bayesdb_guess_population(
                bdb, 'p', 't',
                overrides=[('y', 'ignore'), ('z', 'ignore')])

        bayesdb_guess_population(bdb, 'p', 't')

        # Population already exists.
        with pytest.raises(ValueError):
            bayesdb_guess_population(bdb, 'p', 't')

        cursor = bdb.sql_execute('SELECT * FROM bayesdb_variable')
        assert cursor.fetchall() == [
            (1, None, 1, 'y', 'nominal'),
            (1, None, 2, 'z', 'numerical'),
        ]
def test_subsample():
    """Exercise the SUBSAMPLE option of the cgpm backend.

    Two populations are guessed from the same `dha` table.  One generator
    is created without subsampling and one with `SUBSAMPLE 100`.  A handful
    of ESTIMATE/INFER queries touching rowids 1, 2, 101 and 102 are run
    against the subsampled population purely to check that they complete;
    their results are discarded.  Finally the first 100 rows recorded in
    `bayesdb_cgpm_individual` are compared with rowids 1..100: the full
    generator must match them exactly, the subsampled one must not.
    """
    # Materialize the expected rowids as a list.  On Python 3 `range()`
    # returns a lazy range object that never compares equal to a list, which
    # would make the `==` check below always fail and the `!=` check pass
    # vacuously.  `list(range(...))` behaves the same on Python 2.
    expected_rowids = list(range(1, 100 + 1))
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        # NOTE(review): the 'U' mode flag was removed in Python 3.11; it is
        # kept here so behavior on Python 2 is unchanged.
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb, 'hospitals_full', 'dha',
            overrides=[('name', 'key')])
        bayesdb_guess_population(bdb, 'hospitals_sub', 'dha',
            overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        # Smoke-test queries: only successful completion matters.
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF
            N_DEATH_ILL FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND
                (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''
        # Without subsampling the first 100 recorded rows are rowids 1..100.
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == expected_rowids
        # With SUBSAMPLE 100 the recorded rows must differ from 1..100.
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != expected_rowids
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')