def test_simulate_drawconstraint():
    """Simulating a constrained column echoes the constraint value.

    Builds a one-model crosscat generator over the ``dha`` table, simulates
    ``ttl_mdcr_spnd`` while conditioning on ``TTL_MDCR_SPND = 40000`` (note
    the differing case), and checks that the first column of all 100 draws
    equals 40000.
    """
    create_generator_bql = """
        CREATE GENERATOR dha_cc FOR dha USING crosscat (
            GUESS(*),
            name KEY
        )
    """
    simulate_bql = """
        SIMULATE ttl_mdcr_spnd, n_death_ill FROM dha_cc
            GIVEN TTL_MDCR_SPND = 40000
            LIMIT 100
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        # Register crosscat by hand with a fixed seed.
        engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(bdb, CrosscatMetamodel(engine))
        with open(dha_csv, "rU") as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, "dha", csv_file, header=True, create=True)
        bdb.execute(create_generator_bql)
        bdb.execute("INITIALIZE 1 MODEL FOR dha_cc")
        bdb.execute("ANALYZE dha_cc FOR 1 ITERATION WAIT")
        draws = bdb.execute(simulate_bql).fetchall()
        constrained_column = [draw[0] for draw in draws]
        assert constrained_column == [40000] * 100
def test_simulate_drawconstraint_error__ci_slow():
    """Simulating a variable that is also a GIVEN constraint raises.

    With the cgpm backend, asking SIMULATE for ``ttl_mdcr_spnd`` while also
    conditioning on it must raise ``ValueError``; simulating only
    ``n_death_ill`` under the same constraint yields 100 one-column rows.
    """
    overlapping_bql = '''
        SIMULATE ttl_mdcr_spnd, n_death_ill FROM hospital
            GIVEN ttl_mdcr_spnd = 40000
            LIMIT 100
    '''
    disjoint_bql = '''
        SIMULATE n_death_ill FROM hospital
            GIVEN ttl_mdcr_spnd = 40000
            LIMIT 100
    '''
    with bayeslite.bayesdb_open() as bdb:
        with open(dha_csv, 'rU') as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', csv_file, header=True, create=True)
        bdb.backends['cgpm'].set_multiprocess(False)
        bayesdb_guess_population(
            bdb, 'hospital', 'dha', overrides=[('name', 'key')])
        bdb.execute('CREATE GENERATOR hospital_cc FOR hospital USING cgpm;')
        bdb.execute('INITIALIZE 1 MODEL FOR hospital_cc')
        bdb.execute('ANALYZE hospital_cc FOR 1 ITERATION (OPTIMIZED);')
        with pytest.raises(ValueError):
            # The condition variables and the query variables both contain
            # ttl_mdcr_spnd.  A ValueError (rather than a BQL-level error)
            # is expected because the CGPM runtime, not cgpm_backend,
            # catches the overlap.
            bdb.execute(overlapping_bql).fetchall()
        draws = bdb.execute(disjoint_bql).fetchall()
        assert len(draws) == 100
        assert all(len(draw) == 1 for draw in draws)
def test_subsample():
    """Exercise a cgpm generator created with ``SUBSAMPLE 100``.

    Two populations are guessed from the same ``dha`` table; one gets a
    plain cgpm generator and the other a subsampled one.  Several ESTIMATE
    and INFER queries are run against the subsampled population purely to
    check that they execute, and the ``bayesdb_cgpm_individual`` table is
    then inspected: the full generator must map its first 100 cgpm rows to
    table rowids 1..100, and the subsampled generator must not.
    """
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(
            bdb, 'hospitals_full', 'dha', overrides=[('name', 'key')])
        bayesdb_guess_population(
            bdb, 'hospitals_sub', 'dha', overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        # Smoke-test queries: results are fetched but not inspected.
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF
            N_DEATH_ILL FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND
                (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''
        # Materialize the expected ids as a list: on Python 3 a list never
        # compares equal to a range object, which would make the `==` check
        # fail and the `!=` check pass vacuously.
        first_hundred = list(range(1, 100 + 1))
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == first_hundred
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != first_hundred
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
def test_simulate_drawconstraint_error__ci_slow():
    """SIMULATE rejects a target that is also a GIVEN constraint (cgpm).

    Requesting ``ttl_mdcr_spnd`` while conditioning on it must raise
    ``ValueError``.  Dropping it from the targets and keeping the constraint
    returns 100 rows of exactly one column each.
    """
    with bayeslite.bayesdb_open() as bdb:
        with open(dha_csv, 'rU') as source:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', source, header=True, create=True)
        bdb.backends['cgpm'].set_multiprocess(False)
        bayesdb_guess_population(
            bdb, 'hospital', 'dha', overrides=[('name', 'key')])
        for statement in (
                'CREATE GENERATOR hospital_cc FOR hospital USING cgpm;',
                'INITIALIZE 1 MODEL FOR hospital_cc',
                'ANALYZE hospital_cc FOR 1 ITERATION (OPTIMIZED);',
        ):
            bdb.execute(statement)

        # ttl_mdcr_spnd appears both as a query variable and as a condition
        # variable.  The error surfaces as ValueError because it is detected
        # by the CGPM runtime rather than by cgpm_backend.
        with pytest.raises(ValueError):
            bdb.execute('''
                SIMULATE ttl_mdcr_spnd, n_death_ill FROM hospital
                    GIVEN ttl_mdcr_spnd = 40000
                    LIMIT 100
            ''').fetchall()

        simulated = bdb.execute('''
            SIMULATE n_death_ill FROM hospital
                GIVEN ttl_mdcr_spnd = 40000
                LIMIT 100
        ''').fetchall()
        assert len(simulated) == 100
        assert all(len(row) == 1 for row in simulated)
def table_from_csv(bdb, table_name, f):
    """Load the CSV stream ``f`` into the table ``table_name`` of ``bdb``.

    Thin wrapper that forwards to ``read_csv.bayesdb_read_csv`` with
    ``header=True``, ``create=True`` and ``ifnotexists=True``.  Returns
    nothing.
    """
    csv_options = {'header': True, 'create': True, 'ifnotexists': True}
    read_csv.bayesdb_read_csv(bdb, table_name, f, **csv_options)
def test_legacy_models_slow():
    """Load legacy crosscat models and check two ranking queries.

    Loading legacy models before the ``dha`` table exists must raise
    ``ValueError``.  After the table is read from CSV the models load, a
    second generator is created with ``guess(*)``, the codebook is loaded
    twice (the second load overwrites the first), and two ``ESTIMATE ...
    ORDER BY`` queries are compared against fixed top-10 lists.
    """
    # Open the database in a context manager so the handle is closed even
    # when an assertion below fails, as the other tests in this file do.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        # The table does not exist yet, so loading must fail.
        with pytest.raises(ValueError):
            bayeslite.bayesdb_load_legacy_models(
                bdb, 'dha_cc', 'dha', 'crosscat', dha_models, create=True)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayeslite.bayesdb_load_legacy_models(
            bdb, 'dha_cc', 'dha', 'crosscat', dha_models, create=True)
        # Make sure guessing also works.
        bdb.execute(
            'create generator dha_cc0 for dha using crosscat(guess(*))')
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        # Need to be able to overwrite existing codebook.
        #
        # XXX Not sure this is the right API.  What if overwrite is a
        # mistake?
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        bql = '''
            ESTIMATE name FROM dha_cc
                ORDER BY SIMILARITY TO (name = ?) DESC
                LIMIT 10
        '''
        with bdb.savepoint():
            assert bdb.execute(bql, ('Albany NY',)).fetchall() == [
                ('Albany NY',),
                ('Scranton PA',),
                ('United States US',),
                ('Norfolk VA',),
                ('Reading PA',),
                ('Salisbury MD',),
                ('Louisville KY',),
                ('Cleveland OH',),
                ('Covington KY',),
                ('Akron OH',),
            ]
        # Tickles an issue in case-folding of column names.
        bql = '''
            ESTIMATE name
                FROM dha_cc
                ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
                LIMIT 10
        '''
        with bdb.savepoint():
            assert bdb.execute(bql).fetchall() == [
                ('McAllen TX',),
                ('Worcester MA',),
                ('Beaumont TX',),
                ('Temple TX',),
                ('Corpus Christi TX',),
                ('Takoma Park MD',),
                ('Kingsport TN',),
                ('Bangor ME',),
                ('Lebanon NH',),
                ('Panama City FL',),
            ]
def test_legacy_models__ci_slow():
    """Load legacy crosscat models and check two ranking queries.

    Loading legacy models before the ``dha`` table exists must raise
    ``ValueError``.  Once the table has been read from CSV the models load,
    a second generator is created with ``guess(*)``, the codebook is loaded
    twice (the second load overwrites the first), and two ``ESTIMATE ...
    ORDER BY`` queries are compared against fixed top-10 lists.
    """
    # Use a context manager so the database handle is released even if an
    # assertion fails; the other tests in this file open bdb the same way.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        # No "dha" table yet: the loader is expected to refuse.
        with pytest.raises(ValueError):
            bayeslite.bayesdb_load_legacy_models(
                bdb, "dha_cc", "dha", "crosscat", dha_models, create=True)
        with open(dha_csv, "rU") as f:
            read_csv.bayesdb_read_csv(bdb, "dha", f, header=True, create=True)
        bayeslite.bayesdb_load_legacy_models(
            bdb, "dha_cc", "dha", "crosscat", dha_models, create=True)
        # Make sure guessing also works.
        bdb.execute(
            "create generator dha_cc0 for dha using crosscat(guess(*))")
        bayeslite.bayesdb_load_codebook_csv_file(bdb, "dha", dha_codebook)
        # Need to be able to overwrite existing codebook.
        #
        # XXX Not sure this is the right API.  What if overwrite is a
        # mistake?
        bayeslite.bayesdb_load_codebook_csv_file(bdb, "dha", dha_codebook)
        bql = """
            ESTIMATE name FROM dha_cc
                ORDER BY SIMILARITY TO (name = ?) DESC
                LIMIT 10
        """
        with bdb.savepoint():
            assert bdb.execute(bql, ("Albany NY",)).fetchall() == [
                ("Albany NY",),
                ("Scranton PA",),
                ("United States US",),
                ("Norfolk VA",),
                ("Reading PA",),
                ("Salisbury MD",),
                ("Louisville KY",),
                ("Cleveland OH",),
                ("Covington KY",),
                ("Akron OH",),
            ]
        # Tickles an issue in case-folding of column names.
        bql = """
            ESTIMATE name
                FROM dha_cc
                ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
                LIMIT 10
        """
        with bdb.savepoint():
            assert bdb.execute(bql).fetchall() == [
                ("McAllen TX",),
                ("Worcester MA",),
                ("Beaumont TX",),
                ("Temple TX",),
                ("Corpus Christi TX",),
                ("Takoma Park MD",),
                ("Kingsport TN",),
                ("Bangor ME",),
                ("Lebanon NH",),
                ("Panama City FL",),
            ]
def test_subsample():
    """Exercise a crosscat generator created with ``SUBSAMPLE(100)``.

    Two populations are guessed from the same ``dha`` table; one gets a
    generator with ``SUBSAMPLE(OFF)`` and the other ``SUBSAMPLE(100)``.
    Several ESTIMATE and INFER queries are run against the subsampled
    population only to check that they execute.  The
    ``bayesdb_crosscat_subsample`` table is then inspected: the full
    generator must map its first 100 crosscat rows to SQL rowids 1..100,
    and the subsampled generator must not.
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(
            bdb, 'hospitals_full', 'dha', overrides=[('name', 'key')])
        bayesdb_guess_population(
            bdb, 'hospitals_sub', 'dha', overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING crosscat (
                SUBSAMPLE(OFF)
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING crosscat (
                SUBSAMPLE(100)
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION WAIT')
        # Smoke-test queries: results are fetched but not inspected.
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=2) FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=102) FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
            ' FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE hospitals_sub'
            ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
            ' (r1._rowid_ = 1 OR r1._rowid_ = 101)').fetchall()
        bdb.execute('INFER mdcr_spnd_amblnc FROM hospitals_sub'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        sql = '''
            SELECT sql_rowid FROM bayesdb_crosscat_subsample
                WHERE generator_id = ?
                ORDER BY cc_row_id ASC
                LIMIT 100
        '''
        # Compare against a real list: on Python 3 a list is never equal to
        # a range object, so the original `==` would always fail and the
        # `!=` would always pass regardless of the data.
        first_hundred = list(range(1, 100 + 1))
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == first_hundred
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != first_hundred
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
def test_subsample():
    """Exercise a crosscat generator created with ``SUBSAMPLE(100)``.

    Two generators are created directly over the ``dha`` table, one with
    ``SUBSAMPLE(OFF)`` and one with ``SUBSAMPLE(100)``.  Several ESTIMATE
    and INFER queries are run against the subsampled generator only to
    check that they execute.  The ``bayesdb_crosscat_subsample`` table is
    then inspected: the full generator must map its first 100 crosscat rows
    to SQL rowids 1..100, and the subsampled generator must not.
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR dhacc_full FOR dha USING crosscat (
                SUBSAMPLE(OFF),
                GUESS(*),
                name KEY
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR dhacc FOR dha USING crosscat (
                SUBSAMPLE(100),
                GUESS(*),
                name KEY
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR dhacc')
        bdb.execute('ANALYZE dhacc FOR 1 ITERATION WAIT')
        # Smoke-test queries: results are fetched but not inspected.
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=2) FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=102) FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
            ' FROM dhacc WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE dhacc'
            ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
            ' (r1._rowid_ = 1 OR r1._rowid_ = 101)').fetchall()
        bdb.execute('INFER mdcr_spnd_amblnc FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        sql = '''
            SELECT sql_rowid FROM bayesdb_crosscat_subsample
                WHERE generator_id = ?
                ORDER BY cc_row_id ASC
                LIMIT 100
        '''
        # Build the expected ids as a list so the comparisons stay
        # meaningful on Python 3, where a list never equals a range object.
        first_hundred = list(range(1, 100 + 1))
        gid_full = bayesdb_get_generator(bdb, 'dhacc_full')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == first_hundred
        gid = bayesdb_get_generator(bdb, 'dhacc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != first_hundred
        bdb.execute('DROP GENERATOR dhacc')
        bdb.execute('DROP GENERATOR dhacc_full')
def test_simulate_drawconstraint():
    """Simulating a constrained column echoes the constraint value.

    Guesses a ``hospital`` population from the ``dha`` table, attaches a
    one-model crosscat metamodel, and simulates ``ttl_mdcr_spnd`` while
    conditioning on ``TTL_MDCR_SPND = 40000``.  The first column of all 100
    draws must equal 40000.
    """
    simulate_bql = '''
        SIMULATE ttl_mdcr_spnd, n_death_ill FROM hospital
            GIVEN TTL_MDCR_SPND = 40000
            LIMIT 100
    '''
    with bayeslite.bayesdb_open() as bdb:
        with open(dha_csv, 'rU') as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', csv_file, header=True, create=True)
        bayesdb_guess_population(
            bdb, 'hospital', 'dha', overrides=[('name', 'key')])
        for statement in (
                'CREATE METAMODEL hospital_cc FOR hospital USING crosscat()',
                'INITIALIZE 1 MODEL FOR hospital_cc',
                'ANALYZE hospital_cc FOR 1 ITERATION WAIT',
        ):
            bdb.execute(statement)
        draws = bdb.execute(simulate_bql).fetchall()
        constrained_column = [draw[0] for draw in draws]
        assert constrained_column == [40000] * 100
def prepare():
    """Build the ``plottest`` database and return ``(df, bdb)``.

    Sets ``BAYESDB_WIZARD_MODE`` to ``'1'`` in the process environment,
    loads the CSV text from ``dataset()`` into a ``plottest`` table, and
    creates a crosscat generator over it.  ``df`` is the result of
    ``cursor_to_df`` on ``SELECT * FROM plottest``; ``bdb`` is the open
    database handle, which is left open for the caller.
    """
    # Only the CSV text is used; the frame returned by dataset() is
    # replaced below by one read back out of the database.
    _, csv_str = dataset()
    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    bdb = bayeslite.bayesdb_open()
    # XXX Do we not have a bayesdb_read_df ?
    bayesdb_read_csv(
        bdb, 'plottest', flush(csv_str), header=True, create=True)
    bdb.execute('''
        create generator plottest_cc for plottest using crosscat(guess(*))
    ''')

    # do a plot where some sub-violins are removed
    # NOTE(review): this statement is only assigned here, never executed.
    _remove_violin_bql = """
        DELETE FROM plottest
            WHERE categorical_1 = "B"
                AND (few_ints_3 = 2 OR few_ints_3 = 1);
    """
    table_frame = cursor_to_df(bdb.execute('SELECT * FROM plottest'))
    return (table_frame, bdb)
def prepare():
    """Create the ``plottest`` fixture database and return ``(df, bdb)``.

    Side effect: sets the ``BAYESDB_WIZARD_MODE`` environment variable to
    ``'1'``.  The CSV text from ``dataset()`` is loaded into a ``plottest``
    table and a crosscat generator is created for it.  The returned ``df``
    comes from ``cursor_to_df`` applied to ``SELECT * FROM plottest``; the
    returned ``bdb`` handle is still open.
    """
    csv_str = dataset()[1]
    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    bdb = bayeslite.bayesdb_open()
    # XXX Do we not have a bayesdb_read_df ?
    csv_stream = flush(csv_str)
    bayesdb_read_csv(bdb, 'plottest', csv_stream, header=True, create=True)
    bdb.execute('''
        create generator plottest_cc for plottest using crosscat(guess(*))
    ''')

    # do a plot where some sub-violins are removed
    # NOTE(review): the DELETE below is defined but never run in this
    # function.
    _remove_violin_bql = """
        DELETE FROM plottest
            WHERE categorical_1 = "B"
                AND (few_ints_3 = 2 OR few_ints_3 = 1);
    """
    all_rows = bdb.execute('SELECT * FROM plottest')
    df = cursor_to_df(all_rows)
    return (df, bdb)
def test_simulate_drawconstraint():
    """A simulated column that is also constrained returns the constraint.

    Registers a seeded crosscat metamodel, builds a one-model generator over
    the ``dha`` table, and simulates ``ttl_mdcr_spnd`` given
    ``TTL_MDCR_SPND = 40000``.  Every one of the 100 rows must carry 40000
    in its first column.
    """
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        local_engine = crosscat.LocalEngine.LocalEngine(seed=0)
        bayeslite.bayesdb_register_metamodel(
            bdb, CrosscatMetamodel(local_engine))
        with open(dha_csv, 'rU') as source:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', source, header=True, create=True)
        setup_statements = (
            '''
            CREATE GENERATOR dha_cc FOR dha USING crosscat (
                GUESS(*),
                name KEY
            )
            ''',
            'INITIALIZE 1 MODEL FOR dha_cc',
            'ANALYZE dha_cc FOR 1 ITERATION WAIT',
        )
        for statement in setup_statements:
            bdb.execute(statement)
        simulated = bdb.execute('''
            SIMULATE ttl_mdcr_spnd, n_death_ill FROM dha_cc
                GIVEN TTL_MDCR_SPND = 40000
                LIMIT 100
        ''').fetchall()
        spend_values = [row[0] for row in simulated]
        assert spend_values == [40000] * 100
def test_legacy_models__ci_slow():
    """Load legacy crosscat models and check two ranking queries.

    Loading legacy models before the ``dha`` table exists must raise
    ``ValueError``.  After the CSV has been read the models load, a second
    generator is created with ``guess(*)``, the codebook is loaded twice
    (the second load overwrites the first), and two ``ESTIMATE ... ORDER
    BY`` queries are compared against fixed top-10 lists.
    """
    # Opened via a context manager so the handle is closed on every exit
    # path, matching how the other tests in this file open their database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        # The "dha" table is missing at this point, so this must fail.
        with pytest.raises(ValueError):
            bayeslite.bayesdb_load_legacy_models(
                bdb, 'dha_cc', 'dha', 'crosscat', dha_models, create=True)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayeslite.bayesdb_load_legacy_models(
            bdb, 'dha_cc', 'dha', 'crosscat', dha_models, create=True)
        # Make sure guessing also works.
        bdb.execute(
            'create generator dha_cc0 for dha using crosscat(guess(*))')
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        # Need to be able to overwrite existing codebook.
        #
        # XXX Not sure this is the right API.  What if overwrite is a
        # mistake?
        bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
        bql = '''
            ESTIMATE name FROM dha_cc
                ORDER BY SIMILARITY TO (name = ?) DESC
                LIMIT 10
        '''
        with bdb.savepoint():
            assert bdb.execute(bql, ('Albany NY', )).fetchall() == [
                ('Albany NY', ),
                ('Scranton PA', ),
                ('United States US', ),
                ('Norfolk VA', ),
                ('Reading PA', ),
                ('Salisbury MD', ),
                ('Louisville KY', ),
                ('Cleveland OH', ),
                ('Covington KY', ),
                ('Akron OH', ),
            ]
        # Tickles an issue in case-folding of column names.
        bql = '''
            ESTIMATE name
                FROM dha_cc
                ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
                LIMIT 10
        '''
        with bdb.savepoint():
            assert bdb.execute(bql).fetchall() == [
                ('McAllen TX', ),
                ('Worcester MA', ),
                ('Beaumont TX', ),
                ('Temple TX', ),
                ('Corpus Christi TX', ),
                ('Takoma Park MD', ),
                ('Kingsport TN', ),
                ('Bangor ME', ),
                ('Lebanon NH', ),
                ('Panama City FL', ),
            ]
def test_subsample():
    """Exercise a cgpm generator created with ``SUBSAMPLE 100``.

    Two populations are guessed from the same ``dha`` table; one gets a
    plain cgpm generator and the other a subsampled one.  Several ESTIMATE
    and INFER queries are run against the subsampled population only to
    check that they execute.  The ``bayesdb_cgpm_individual`` table is then
    inspected: the full generator must map its first 100 cgpm rows to table
    rowids 1..100, and the subsampled generator must not.
    """
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
        bayeslite.bayesdb_register_backend(bdb, backend)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(
            bdb, 'hospitals_full', 'dha', overrides=[('name', 'key')])
        bayesdb_guess_population(
            bdb, 'hospitals_sub', 'dha', overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        # Smoke-test queries: results are fetched but not inspected.
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF
            N_DEATH_ILL FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND
                (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''
        # A list is never equal to a range object on Python 3, which would
        # turn the `==` check into a guaranteed failure and the `!=` check
        # into a no-op; compare list against list instead.
        first_hundred = list(range(1, 100 + 1))
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full, ))
        assert [row[0] for row in cursor] == first_hundred
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid, ))
        assert [row[0] for row in cursor] != first_hundred
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')