Example #1
def test_hackmetamodel():
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
    bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
    bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    dotdog_metamodel = DotdogMetamodel()
    bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
    bayeslite.bayesdb_deregister_metamodel(bdb, dotdog_metamodel)
    bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
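    # After deregistering and re-registering, dotdog should be usable again
    # below; crosscat was never registered, so CREATE GENERATOR ... USING
    # crosscat still fails.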
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    # XXX Rest of test originally exercised default metamodel, but
    # syntax doesn't support that now.  Not clear that's wrong either.
    bdb.execute('CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)')
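
Example #1 above shows the core pattern these examples follow: open a BayesDB handle, register a metamodel instance with bayesdb_register_metamodel, then refer to it by name in CREATE GENERATOR ... USING. Below is a minimal sketch of that pattern; MyMetamodel and my_metamodel are hypothetical stand-ins for any class implementing bayeslite's metamodel interface (DotdogMetamodel plays that role above).

import bayeslite

# MyMetamodel is a hypothetical metamodel class; substitute your own
# implementation of bayeslite's metamodel interface.
bdb = bayeslite.bayesdb_open(':memory:', builtin_metamodels=False)
bayeslite.bayesdb_register_metamodel(bdb, MyMetamodel())
bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
# The name after USING must match what the metamodel's name() returns.
bdb.execute('CREATE GENERATOR t_mm FOR t USING my_metamodel(a NUMERICAL)')
bdb.close()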
Example #2
def test_nig_normal_latent_numbering():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                            (x, x * x - 100))
        bdb.execute('''
            create population p for t(id ignore; model x,y as numerical)
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')

        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g0) == [1, 2]
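        # g1 declares a latent deviation variable for x, which shows up as a
        # negative variable number alongside the manifest variables 1 and 2.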
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g1) == [-1, 1, 2]
Example #3
def test_simulate_drawconstraint():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, "rU") as f:
            read_csv.bayesdb_read_csv(bdb, "dha", f, header=True, create=True)
        bdb.execute(
            """
            CREATE GENERATOR dha_cc FOR dha USING crosscat (
                GUESS(*),
                name KEY
            )
        """
        )
        bdb.execute("INITIALIZE 1 MODEL FOR dha_cc")
        bdb.execute("ANALYZE dha_cc FOR 1 ITERATION WAIT")
        samples = bdb.execute(
            """
            SIMULATE ttl_mdcr_spnd, n_death_ill FROM dha_cc
                GIVEN TTL_MDCR_SPND = 40000
                LIMIT 100
        """
        ).fetchall()
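        # Every simulated row must honor the GIVEN constraint on ttl_mdcr_spnd.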
        assert [s[0] for s in samples] == [40000] * 100
Example #4
def test_geweke_iid_gaussian():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bayeslite.bayesdb_register_metamodel(bdb, gauss.StdNormalMetamodel())
        kl_est = geweke.geweke_kl(bdb, "std_normal",
            [['column', 'numerical']], ['column'],
            [(1,0), (2,0)], 2, 2, 2, 2)
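        # For the trivial std_normal metamodel the Geweke forward and backward
        # samples should agree exactly, hence a KL estimate and error of zero.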
        assert kl_est == (2, 0, 0)
Example #5
def cgpm_smoke_bdb():
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        registry = {
            'piecewise': PieceWise,
        }
        bayesdb_register_metamodel(bdb, CGPM_Metamodel(registry,
                                                       multiprocess=0))

        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    output = i + j / (k + 1)
                    cat = -1 if (i + j * k) % 2 else +1
                    input = (i * j - k)**2
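                    # Blank out some cells so the smoke test also covers
                    # missing data.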
                    if i % 2:
                        output = None
                    if j % 2:
                        cat = None
                    if k % 2:
                        input = None
                    bdb.sql_execute(
                        '''
                        INSERT INTO t (output, cat, input) VALUES (?, ?, ?)
                    ''', (output, cat, input))

        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                MODEL output, input AS NUMERICAL;
                MODEL cat AS CATEGORICAL
            )
        ''')

        yield bdb
Example #6
def test_impossible_duplicate_dependency():
    # Throw exception when two columns X and Y are both dependent and
    # independent.

    data = [(0, 1, 0, 0), (1, 0, 0, 1)]

    # Create the database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        # Read the dataset.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for row in data:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', row)

        # Create schema, we will force DEP(a c) and IND(a c).
        bql = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                a CATEGORICAL,
                b CATEGORICAL,
                c CATEGORICAL,
                INDEPENDENT(a,b,c),
                DEPENDENT(a,c),
            );
        '''

        # An error should be thrown about impossible schema.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(bql)
Example #7
def test_impossible_duplicate_dependency():
    # Throw exception when two columns X and Y are both dependent and
    # independent.

    data = [(0, 1, 0, 0), (1, 0, 0, 1)]

    # Create the database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        # Read the dataset.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for row in data:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', row)

        # Create schema, we will force DEP(a c) and IND(a c).
        bql = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                a CATEGORICAL,
                b CATEGORICAL,
                c CATEGORICAL,
                INDEPENDENT(a,b,c),
                DEPENDENT(a,c),
            );
        '''

        # An error should be thrown about impossible schema.
        with pytest.raises(bayeslite.BQLError):
            bdb.execute(bql)
Example #8
def test_register():
    bdb = bayeslite.bayesdb_open()
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Check if globally registered.
    try:
        bdb.sql_execute('''
            SELECT * FROM bayesdb_metamodel WHERE name={}
        '''.format(quote(composer.name()))).next()
    except StopIteration:
        pytest.fail('Composer not registered in bayesdb_metamodel.')
    # Check all tables/triggers.
    schema = [('table', 'bayesdb_composer_cc_id'),
              ('table', 'bayesdb_composer_column_owner'),
              ('table', 'bayesdb_composer_column_toposort'),
              ('trigger', 'bayesdb_composer_column_toposort_check'),
              ('table', 'bayesdb_composer_column_parents'),
              ('table', 'bayesdb_composer_column_foreign_predictor'),
              ('trigger', 'bayesdb_composer_column_foreign_predictor_check')]
    for kind, name in schema:
        try:
            bdb.sql_execute('''
                SELECT * FROM sqlite_master WHERE type={} AND name={}
            '''.format(quote(kind), quote(name))).next()
        except StopIteration:
            pytest.fail('Missing from Composer schema: {}'.format(
                (kind, name)))
    bdb.close()
Example #9
def test_register():
    bdb = bayeslite.bayesdb_open()
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Check if globally registered.
    try:
        bdb.sql_execute('''
            SELECT * FROM bayesdb_metamodel WHERE name={}
        '''.format(quote(composer.name()))).next()
    except StopIteration:
        pytest.fail('Composer not registered in bayesdb_metamodel.')
    # Check all tables/triggers.
    schema = [
        ('table', 'bayesdb_composer_cc_id'),
        ('table', 'bayesdb_composer_column_owner'),
        ('table', 'bayesdb_composer_column_toposort'),
        ('trigger', 'bayesdb_composer_column_toposort_check'),
        ('table', 'bayesdb_composer_column_parents'),
        ('table', 'bayesdb_composer_column_foreign_predictor'),
        ('trigger', 'bayesdb_composer_column_foreign_predictor_check')
    ]
    for kind, name in schema:
        try:
            bdb.sql_execute('''
                SELECT * FROM sqlite_master WHERE type={} AND name={}
            '''.format(quote(kind), quote(name))).next()
        except StopIteration:
            pytest.fail('Missing from Composer schema: {}'.format((kind,name)))
    bdb.close()
Example #10
def test_hackmetamodel():
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    bdb.sql_execute('CREATE TABLE t(a INTEGER, b TEXT)')
    bdb.sql_execute("INSERT INTO t (a, b) VALUES (42, 'fnord')")
    bdb.sql_execute('CREATE TABLE u AS SELECT * FROM t')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    crosscat = local_crosscat()
    crosscat_metamodel = CrosscatMetamodel(crosscat)
    dotdog_metamodel = DotdogMetamodel()
    bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
    bayeslite.bayesdb_deregister_metamodel(bdb, dotdog_metamodel)
    bayeslite.bayesdb_register_metamodel(bdb, dotdog_metamodel)
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR t_dd FOR t USING dotdog(a NUMERICAL)')
    # XXX Rest of test originally exercised default metamodel, but
    # syntax doesn't support that now.  Not clear that's wrong either.
    bdb.execute('CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)')
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('CREATE GENERATOR u_dd FOR u USING dotdog(a NUMERICAL)')
Example #11
def test_guess_population():
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    bdb.sql_execute('CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)')
    a_z = range(ord('a'), ord('z') + 1)
    aa_zz = ((c, d) for c in a_z for d in a_z)
    data = ((chr(c) + chr(d), (c + d) % 2, math.sqrt(c + d)) for c, d in aa_zz)
    for row in data:
        bdb.sql_execute('INSERT INTO t (x, y, z) VALUES (?, ?, ?)', row)
    cc = crosscat.LocalEngine.LocalEngine(seed=0)
    metamodel = CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    with pytest.raises(ValueError):
        # No modelled columns.  (x is key.)
        bayesdb_guess_population(bdb,
                                 'p',
                                 't',
                                 overrides=[('y', 'ignore'), ('z', 'ignore')])
    bayesdb_guess_population(bdb, 'p', 't')
    with pytest.raises(ValueError):
        # Population already exists.
        bayesdb_guess_population(bdb, 'p', 't')
    assert bdb.sql_execute('SELECT * FROM bayesdb_variable').fetchall() == [
        (1, None, 1, 'y', 'nominal'),
        (1, None, 2, 'z', 'numerical'),
    ]
Example #12
def test_legacy_models_slow():
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    cc = crosscat.LocalEngine.LocalEngine(seed=0)
    metamodel = CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
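    # Loading legacy models is expected to fail here, before the dha table
    # has been created.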
    with pytest.raises(ValueError):
        bayeslite.bayesdb_load_legacy_models(bdb, 'dha_cc', 'dha', 'crosscat',
            dha_models, create=True)
    with open(dha_csv, 'rU') as f:
        read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
    bayeslite.bayesdb_load_legacy_models(bdb, 'dha_cc', 'dha', 'crosscat',
        dha_models, create=True)
    # Make sure guessing also works.
    bdb.execute('create generator dha_cc0 for dha using crosscat(guess(*))')
    bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
    # Need to be able to overwrite existing codebook.
    #
    # XXX Not sure this is the right API.  What if overwrite is a
    # mistake?
    bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
    bql = '''
        ESTIMATE name FROM dha_cc
            ORDER BY SIMILARITY TO (name = ?) DESC
            LIMIT 10
    '''
    with bdb.savepoint():
        assert bdb.execute(bql, ('Albany NY',)).fetchall() == [
            ('Albany NY',),
            ('Scranton PA',),
            ('United States US',),
            ('Norfolk VA',),
            ('Reading PA',),
            ('Salisbury MD',),
            ('Louisville KY',),
            ('Cleveland OH',),
            ('Covington KY',),
            ('Akron OH',),
        ]
    # Tickles an issue in case-folding of column names.
    bql = '''
        ESTIMATE name
            FROM dha_cc
            ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
            LIMIT 10
    '''
    with bdb.savepoint():
        assert bdb.execute(bql).fetchall() == [
            ('McAllen TX',),
            ('Worcester MA',),
            ('Beaumont TX',),
            ('Temple TX',),
            ('Corpus Christi TX',),
            ('Takoma Park MD',),
            ('Kingsport TN',),
            ('Bangor ME',),
            ('Lebanon NH',),
            ('Panama City FL',),
        ]
Example #13
def test_legacy_models__ci_slow():
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    cc = crosscat.LocalEngine.LocalEngine(seed=0)
    metamodel = CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    with pytest.raises(ValueError):
        bayeslite.bayesdb_load_legacy_models(bdb, "dha_cc", "dha", "crosscat", dha_models, create=True)
    with open(dha_csv, "rU") as f:
        read_csv.bayesdb_read_csv(bdb, "dha", f, header=True, create=True)
    bayeslite.bayesdb_load_legacy_models(bdb, "dha_cc", "dha", "crosscat", dha_models, create=True)
    # Make sure guessing also works.
    bdb.execute("create generator dha_cc0 for dha using crosscat(guess(*))")
    bayeslite.bayesdb_load_codebook_csv_file(bdb, "dha", dha_codebook)
    # Need to be able to overwrite existing codebook.
    #
    # XXX Not sure this is the right API.  What if overwrite is a
    # mistake?
    bayeslite.bayesdb_load_codebook_csv_file(bdb, "dha", dha_codebook)
    bql = """
        ESTIMATE name FROM dha_cc
            ORDER BY SIMILARITY TO (name = ?) DESC
            LIMIT 10
    """
    with bdb.savepoint():
        assert bdb.execute(bql, ("Albany NY",)).fetchall() == [
            ("Albany NY",),
            ("Scranton PA",),
            ("United States US",),
            ("Norfolk VA",),
            ("Reading PA",),
            ("Salisbury MD",),
            ("Louisville KY",),
            ("Cleveland OH",),
            ("Covington KY",),
            ("Akron OH",),
        ]
    # Tickles an issue in case-folding of column names.
    bql = """
        ESTIMATE name
            FROM dha_cc
            ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
            LIMIT 10
    """
    with bdb.savepoint():
        assert bdb.execute(bql).fetchall() == [
            ("McAllen TX",),
            ("Worcester MA",),
            ("Beaumont TX",),
            ("Temple TX",),
            ("Corpus Christi TX",),
            ("Takoma Park MD",),
            ("Kingsport TN",),
            ("Bangor ME",),
            ("Lebanon NH",),
            ("Panama City FL",),
        ]
Example #14
def test_geweke_nig_normal():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        nig = normal.NIGNormalMetamodel(seed=1)
        bayeslite.bayesdb_register_metamodel(bdb, nig)
        (ct, kl, error) = geweke.geweke_kl(bdb, "nig_normal",
            [['column', 'numerical']], ['column'],
            [(1,0), (2,0)], 2, 2, 2, 2)
        assert ct == 2
        assert 0 < kl and kl < 10 # KL should be positive
        assert 0 < error and error < 10 # KL error estimate too
Example #15
def bayesdb(metamodel=None, **kwargs):
    if metamodel is None:
        crosscat = local_crosscat()
        metamodel = CrosscatMetamodel(crosscat)
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False, **kwargs)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    try:
        yield bdb
    finally:
        bdb.close()
Example #16
def bayesdb(metamodel=None, **kwargs):
    if metamodel is None:
        crosscat = local_crosscat()
        metamodel = CrosscatMetamodel(crosscat)
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False, **kwargs)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    try:
        yield bdb
    finally:
        bdb.close()
Example #17
def test_subsample():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bayesdb_guess_population(bdb,
                                 'hospitals_full',
                                 'dha',
                                 overrides=[('name', 'key')])
        bayesdb_guess_population(bdb,
                                 'hospitals_sub',
                                 'dha',
                                 overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING crosscat (
                SUBSAMPLE(OFF)
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING crosscat (
                SUBSAMPLE(100)
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION WAIT')
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=2) FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=102) FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
                    ' FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE hospitals_sub'
                    ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
                    ' (r1._rowid_ = 1 OR r1._rowid_ = 101)').fetchall()
        bdb.execute('INFER mdcr_spnd_amblnc FROM hospitals_sub'
                    ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        sql = '''
            SELECT sql_rowid FROM bayesdb_crosscat_subsample
                WHERE generator_id = ?
                ORDER BY cc_row_id ASC
                LIMIT 100
        '''
        gid_full = bayesdb_get_generator(bdb, None, 'hosp_full_cc')
        cursor = bdb.sql_execute(sql, (gid_full, ))
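        # With SUBSAMPLE(OFF) the generator incorporates rows in order, so its
        # first 100 subsample entries are exactly sql rowids 1..100, whereas
        # the SUBSAMPLE(100) generator ends up with a different set of rows.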
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        gid = bayesdb_get_generator(bdb, None, 'hosp_sub_cc')
        cursor = bdb.sql_execute(sql, (gid, ))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR hosp_sub_cc')
        bdb.execute('DROP GENERATOR hosp_full_cc')
        bdb.execute('DROP POPULATION hospitals_sub')
        bdb.execute('DROP POPULATION hospitals_full')
Example #18
def test_geweke_nig_normal():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        nig = normal.NIGNormalMetamodel(seed=1)
        bayeslite.bayesdb_register_metamodel(bdb, nig)
        (ct, kl, error) = geweke.geweke_kl(bdb, "nig_normal",
                                           [['column', 'numerical']],
                                           ['column'], [(1, 0),
                                                        (2, 0)], 2, 2, 2, 2)
        assert ct == 2
        assert 0 < kl and kl < 10  # KL should be positive
        assert 0 < error and error < 10  # KL error estimate too
Example #19
def do_probe_fileset(files, generator, probes, specs, seed):
    # Keys are (file_name, model_ct, name); values are aggregated results
    results = {}
    prng = random.Random(seed)
    for fname in files:
        log("processing file %s" % fname)
        with bayeslite.bayesdb_open(fname, builtin_metamodels=False) as bdb:
            bayeslite.bayesdb_register_metamodel(bdb, new_cc_metamodel(prng))
            res = [((fname, model_ct, name), ress)
                   for ((model_ct, name), ress)
                   in run_probes(bdb, generator, probes, specs).iteritems()]
            incorporate(results, res)
    return results
Example #20
def do_probe_fileset(files, generator, probes, specs, seed):
    # Keys are (file_name, model_ct, name); values are aggregated results
    results = {}
    prng = random.Random(seed)
    for fname in files:
        log("processing file %s" % fname)
        with bayeslite.bayesdb_open(fname, builtin_metamodels=False) as bdb:
            bayeslite.bayesdb_register_metamodel(bdb, new_cc_metamodel(prng))
            res = [((fname, model_ct, name), ress) for (
                (model_ct, name),
                ress) in run_probes(bdb, generator, probes, specs).iteritems()]
            incorporate(results, res)
    return results
Example #21
def test_codebook_value_map():
    '''
    A categorical column in crosscat can only take on a fixed number of values
    v1, v2, ..., vn.  In this test, we have a categorical column called
    `city` which takes on values `RIO, LA, SF, DC` as specified in the codebook
    value map.

        INITIALIZE dummy table with only RIO and SF appearing in dataset
        ANALYZE dummy_cc
        INSERT rows with `city` names `LA` and `DC`
        ANALYZE dummy_cc
        SIMULATE specifying `city` = `LA` (throws KeyError)
    '''

    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        bayeslite.bayesdb_read_csv(bdb,
                                   'dummy',
                                   dummy_data,
                                   header=True,
                                   create=True)

        with tempfile.NamedTemporaryFile(prefix='bayeslite') as tempbook:
            with open(tempbook.name, 'w') as f:
                f.write(dummy_codebook)
            bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dummy',
                                                     tempbook.name)

        bdb.execute('''
            CREATE GENERATOR dummy_cc FOR dummy
                USING crosscat(
                    GUESS(*),
                    kerberos IGNORE,
                    age NUMERICAL,
                    city CATEGORICAL
                )
        ''')
        bdb.execute('INITIALIZE 10 MODELS FOR dummy_cc')
        bdb.execute('ANALYZE dummy_cc FOR 20 ITERATIONS WAIT')
        bdb.execute('SIMULATE age FROM dummy_cc GIVEN city = RIO LIMIT 5')
        bdb.sql_execute('''
            INSERT INTO dummy (kerberos, age, city) VALUES
                ('jackie', 18, 'LA'), ('rocker', 22, 'DC')
        ''')
        bdb.execute('ANALYZE dummy_cc FOR 20 ITERATIONS WAIT')
        c = bdb.sql_execute('SELECT * FROM dummy')
        with pytest.raises(KeyError):
            bdb.execute('SIMULATE age FROM dummy_cc GIVEN city = LA LIMIT 5')
Example #22
def test_codebook_value_map():
    """
    A categorical column in crosscat can only take on a fixed number of values
    v1, v2, ..., vn.  In this test, we have a categorical column called
    `city` which takes on values `RIO, LA, SF, DC` as specified in the codebook
    value map.

        INITIALIZE dummy table with only RIO and SF appearing in dataset
        ANALYZE dummy_cc
        INSERT rows with `city` names `LA` and `DC`
        ANALYZE dummy_cc
        SIMULATE specifying `city` = `LA` (throws KeyError)
    """

    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        bayeslite.bayesdb_read_csv(bdb, "dummy", dummy_data, header=True, create=True)

        with tempfile.NamedTemporaryFile(prefix="bayeslite") as tempbook:
            with open(tempbook.name, "w") as f:
                f.write(dummy_codebook)
            bayeslite.bayesdb_load_codebook_csv_file(bdb, "dummy", tempbook.name)

        bdb.execute(
            """
            CREATE GENERATOR dummy_cc FOR dummy
                USING crosscat(
                    GUESS(*),
                    kerberos IGNORE,
                    age NUMERICAL,
                    city CATEGORICAL
                )
        """
        )
        bdb.execute("INITIALIZE 10 MODELS FOR dummy_cc")
        bdb.execute("ANALYZE dummy_cc FOR 20 ITERATIONS WAIT")
        bdb.execute("SIMULATE age FROM dummy_cc GIVEN city = RIO LIMIT 5")
        bdb.sql_execute(
            """
            INSERT INTO dummy (kerberos, age, city) VALUES
                ('jackie', 18, 'LA'), ('rocker', 22, 'DC')
        """
        )
        bdb.execute("ANALYZE dummy_cc FOR 20 ITERATIONS WAIT")
        c = bdb.sql_execute("SELECT * FROM dummy")
        with pytest.raises(KeyError):
            bdb.execute("SIMULATE age FROM dummy_cc GIVEN city = LA LIMIT 5")
Example #23
def _retest_example(bdb, exname):
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, g)
    gid = core.bayesdb_get_generator(bdb, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)
    bdb.execute("ANALYZE %s FOR 1 ITERATION WAIT" % (qg,))
    bdb.execute("ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT" % (qg,))
    bdb.execute("ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT" % (qg,))
Example #24
def test_sessions_error_metamodel():
    with test_core.t1() as (bdb, _generator_id):
        bayeslite.bayesdb_register_metamodel(bdb, ErroneousMetamodel())
        bdb.execute('''
            CREATE GENERATOR t1_err FOR t1
                USING erroneous(age NUMERICAL)
        ''')
        tr = sescap.SessionOrchestrator(bdb)
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF age FROM t1_err
        ''')
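        # The erroneous metamodel raises Boom at fetch time; the session
        # orchestrator should have recorded at least one error entry.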
        with pytest.raises(Boom):
            cursor.fetchall()
        #tr._start_new_session()
        assert tr._check_error_entries(tr.session_id) > 0
Example #25
def cgpm_dummy_satellites_pop_bdb():
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            create population satellites for satellites_ucs with schema(
                model apogee as numerical;
                model class_of_orbit as categorical;
                model country_of_operator as categorical;
                model launch_mass as numerical;
                model perigee as numerical;
                model period as numerical
            )
        ''')
        metamodel = CGPM_Metamodel(dict(), multiprocess=0)
        bayesdb_register_metamodel(bdb, metamodel)
        yield bdb
Example #26
def test_sessions_error_metamodel():
    with test_core.t1() as (bdb, _generator_id):
        bayeslite.bayesdb_register_metamodel(bdb, ErroneousMetamodel())
        bdb.execute('''
            CREATE GENERATOR t1_err FOR t1
                USING erroneous(age NUMERICAL)
        ''')
        tr = sescap.SessionOrchestrator(bdb)
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF age FROM t1_err
        ''')
        with pytest.raises(Boom):
            cursor.fetchall()
        #tr._start_new_session()
        assert tr._check_error_entries(tr.session_id) > 0
Example #27
def test_subsample():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR dhacc_full FOR dha USING crosscat (
                SUBSAMPLE(OFF),
                GUESS(*),
                name KEY
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR dhacc FOR dha USING crosscat (
                SUBSAMPLE(100),
                GUESS(*),
                name KEY
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR dhacc')
        bdb.execute('ANALYZE dhacc FOR 1 ITERATION WAIT')
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=2) FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY TO (_rowid_=102) FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc'
            ' FROM dhacc WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE dhacc'
            ' WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND'
                ' (r1._rowid_ = 1 OR r1._rowid_ = 101)').fetchall()
        bdb.execute('INFER mdcr_spnd_amblnc FROM dhacc'
            ' WHERE _rowid_ = 1 OR _rowid_ = 101').fetchall()
        sql = '''
            SELECT sql_rowid FROM bayesdb_crosscat_subsample
                WHERE generator_id = ?
                ORDER BY cc_row_id ASC
                LIMIT 100
        '''
        gid_full = bayesdb_get_generator(bdb, 'dhacc_full')
        cursor = bdb.sql_execute(sql, (gid_full,))
        assert [row[0] for row in cursor] == range(1, 100 + 1)
        gid = bayesdb_get_generator(bdb, 'dhacc')
        cursor = bdb.sql_execute(sql, (gid,))
        assert [row[0] for row in cursor] != range(1, 100 + 1)
        bdb.execute('DROP GENERATOR dhacc')
        bdb.execute('DROP GENERATOR dhacc_full')
Example #28
def test_geweke_catches_nig_normal_bug__ci_slow():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bayeslite.bayesdb_register_metamodel(bdb, DoctoredNIGNormal(seed=1))
        cells = [(i,0) for i in range(4)]
        for chain_ct in (0, 1, 5):
            (ct, kl, error) = geweke.geweke_kl(bdb, "nig_normal",
                [['column', 'numerical']], ['column'], cells,
                200, 200, chain_ct, 3000)
            if chain_ct == 0:
                assert ct == 3000
                assert 0 < kl and kl < 0.1
                assert 0 < error and error < 0.05
            else:
                assert ct == 3000
                assert kl > 5
                assert 0 < error and error < 4
Example #29
def test_nig_normal_smoke():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
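        # Exercise the full lifecycle: table, population, generator, model,
        # analysis, BQL queries, then teardown in reverse order.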
        bdb.sql_execute('create table t(x)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (x, ))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g for p using nig_normal')
        bdb.execute('initialize 1 model for g')
        bdb.execute('analyze g for 1 iteration wait')
        bdb.execute('estimate probability density of x = 50 from p').fetchall()
        bdb.execute('simulate x from p limit 1').fetchall()
        bdb.execute('drop models from g')
        bdb.execute('drop generator g')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
Example #30
def _retest_example(bdb, exname):
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, g)
    gid = core.bayesdb_get_generator(bdb, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qg, ))
Example #31
def test_geweke_catches_nig_normal_bug__ci_slow():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bayeslite.bayesdb_register_metamodel(bdb, DoctoredNIGNormal(seed=1))
        cells = [(i, 0) for i in range(4)]
        for chain_ct in (0, 1, 5):
            (ct, kl, error) = geweke.geweke_kl(bdb, "nig_normal",
                                               [['column', 'numerical']],
                                               ['column'], cells, 200, 200,
                                               chain_ct, 3000)
            if chain_ct == 0:
                assert ct == 3000
                assert 0 < kl and kl < 0.1
                assert 0 < error and error < 0.05
            else:
                assert ct == 3000
                assert kl > 5
                assert 0 < error and error < 4
Example #32
def test_impossible_nontransitive_dependency():
    # Test impossibility of non-transitive dependencies. While in the
    # general case, dependence is not transitive, crosscat assumes
    # transitive closure under dependency constraints.  The test is
    # valid since we are using a crosscat local engine.  Note that
    # transitivity under independence is not forced by crosscat.
    # Changing the behavior of CrossCat to deal with impossible
    # constraints (such as random dropout) will require updating this
    # test.
    data = [(0, 1, 0, 0), (1, 0, 0, 1)]

    # Create the database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        # Read the dataset.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for row in data:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', row)

        # Create schema, we will force DEP(a b), DEP(b c), and IND(a c) which
        # is non-transitive.
        bql = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                a CATEGORICAL,
                b CATEGORICAL,
                c CATEGORICAL,
                DEPENDENT(a,b),
                DEPENDENT(b,c),
                INDEPENDENT(a,c)
            );
        '''

        # Creating the generator should succeed.
        bdb.execute(bql)

        # Error thrown when initializing since no initial state exists.
        # XXX Currently CrossCat throws a RuntimeError, we should fix
        # the CrossCat exception hierarchy.
        with pytest.raises(RuntimeError):
            bdb.execute('INITIALIZE 10 MODELS FOR bar')
Example #33
def test_impossible_nontransitive_dependency():
    # Test impossibility of non-transitive dependencies. While in the
    # general case, dependence is not transitive, crosscat assumes
    # transitive closure under dependency constraints.  The test is
    # valid since we are using a crosscat local engine.  Note that
    # transitivity under independence is not forced by crosscat.
    # Changing the behavior of CrossCat to deal with impossible
    # constraints (such as random dropout) will require updating this
    # test.
    data = [(0, 1, 0, 0), (1, 0, 0, 1)]

    # Create the database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        # Read the dataset.
        bdb.sql_execute('CREATE TABLE foo(id,a,b,c)')
        for row in data:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?)', row)

        # Create schema, we will force DEP(a b), DEP(b c), and IND(a c) which
        # is non-transitive.
        bql = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                a CATEGORICAL,
                b CATEGORICAL,
                c CATEGORICAL,
                DEPENDENT(a,b),
                DEPENDENT(b,c),
                INDEPENDENT(a,c)
            );
        '''

        # Creating the generator should succeed.
        bdb.execute(bql)

        # Error thrown when initializing since no initial state exists.
        # XXX Currently CrossCat throws a RuntimeError, we should fix
        # the CrossCat exception hierarchy.
        with pytest.raises(RuntimeError):
            bdb.execute('INITIALIZE 10 MODELS FOR bar')
Example #34
def test_geweke_nig_normal_seriously__ci_slow():
    # Note: The actual assertions in this test and the next one were
    # derived heuristically by inspecting a fuller (and costlier to
    # compute) tableau of values of geweke.geweke_kl and deciding the
    # aggregate impression was "probably no bug" (resp. "definitely
    # bug").  The assertions constitute an attempt to capture the most
    # salient features that give that impression.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        nig = normal.NIGNormalMetamodel(seed=1)
        bayeslite.bayesdb_register_metamodel(bdb, nig)
        cells = [(i,0) for i in range(4)]
        for chain_ct in (0, 1, 5):
            (ct, kl, error) = geweke.geweke_kl(bdb, "nig_normal",
                [['column', 'numerical']], ['column'], cells,
                200, 200, chain_ct, 3000)
            assert ct == 3000
            assert 0 < kl and kl < 0.1
            assert 0 < error and error < 0.05
Example #35
def run(stdin, stdout, stderr, argv):
    args = parse_args(argv[1:])
    progname = argv[0]
    slash = progname.rfind('/')
    if slash:
        progname = progname[slash + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname, ))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname, ))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
                                 builtin_metamodels=False)

    if args.jobs != 1:
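        # Any --jobs value other than 1 selects the multiprocessing Crosscat
        # engine; a single job uses the in-process LocalEngine below.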
        import crosscat.MultiprocessingEngine as ccme
        jobs = args.jobs if args.jobs > 0 else None
        crosscat = ccme.MultiprocessingEngine(seed=args.seed, cpu_count=jobs)
    else:
        import crosscat.LocalEngine as ccle
        crosscat = ccle.LocalEngine(seed=args.seed)
    metamodel = CrosscatMetamodel(crosscat)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    bdbshell = shell.Shell(bdb, 'crosscat', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.join(os.path.expanduser('~/.bayesliterc'))
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)

        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    bdbshell.stdout.write('%s is not a file.  Aborting.\n' %
                                          (str(path), ))
                    break

        if not args.batch:
            bdbshell.cmdloop()
    return 0
Example #36
def test_geweke_nig_normal_seriously__ci_slow():
    # Note: The actual assertions in this test and the next one were
    # derived heuristically by inspecting a fuller (and costlier to
    # compute) tableau of values of geweke.geweke_kl and deciding the
    # aggregate impression was "probably no bug" (resp. "definitely
    # bug").  The assertions constitute an attempt to capture the most
    # salient features that give that impression.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        nig = normal.NIGNormalMetamodel(seed=1)
        bayeslite.bayesdb_register_metamodel(bdb, nig)
        cells = [(i, 0) for i in range(4)]
        for chain_ct in (0, 1, 5):
            (ct, kl, error) = geweke.geweke_kl(bdb, "nig_normal",
                                               [['column', 'numerical']],
                                               ['column'], cells, 200, 200,
                                               chain_ct, 3000)
            assert ct == 3000
            assert 0 < kl and kl < 0.1
            assert 0 < error and error < 0.05
Example #37
def run(stdin, stdout, stderr, argv):
    args = parse_args(argv[1:])
    progname = argv[0]
    slash = progname.rfind('/')
    if slash:
        progname = progname[slash + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname,))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname,))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
        builtin_metamodels=False)

    if args.jobs != 1:
        import crosscat.MultiprocessingEngine as ccme
        jobs = args.jobs if args.jobs > 0 else None
        crosscat = ccme.MultiprocessingEngine(seed=args.seed, cpu_count=jobs)
    else:
        import crosscat.LocalEngine as ccle
        crosscat = ccle.LocalEngine(seed=args.seed)
    metamodel = CrosscatMetamodel(crosscat)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    bdbshell = shell.Shell(bdb, 'crosscat', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.join(os.path.expanduser('~/.bayesliterc'))
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)

        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    bdbshell.stdout.write('%s is not a file.  Aborting.\n' %
                        (str(path),))
                    break

        if not args.batch:
            bdbshell.cmdloop()
    return 0
Example #38
def test_bad_analyze_vars():
    try:
        from cgpm.regressions.linreg import LinearRegression
    except ImportError:
        pytest.skip('no sklearn')
        return
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        registry = {
            'kepler': Kepler,
            'linreg': LinearRegression,
        }
        bayesdb_register_metamodel(bdb, CGPM_Metamodel(registry))
        bdb.execute('''
            CREATE METAMODEL satellites_cgpm FOR satellites USING cgpm
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_cgpm')
        bdb.execute('ANALYZE satellites_cgpm FOR 1 ITERATION WAIT ()')
        bdb.execute('ANALYZE satellites_cgpm FOR 1 ITERATION WAIT')
        with pytest.raises(BQLError):
            # Unknown variable `perige'.
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION WAIT (
                    VARIABLES period, perige
                )
            ''')
        with pytest.raises(BQLError):
            # Unknown variable `perige'.
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION WAIT (
                    SKIP period, perige
                )
            ''')
Example #39
def test_register_foreign_predictor():
    bdb = bayeslite.bayesdb_open()
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Register valid predictors.
    composer.register_foreign_predictor(random_forest.RandomForest)
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    # Register duplicates.
    with pytest.raises(BLE):
        composer.register_foreign_predictor(keplers_law.KeplersLaw)
    with pytest.raises(BLE):
        composer.register_foreign_predictor(
            multiple_regression.MultipleRegression)
    with pytest.raises(BLE):
        composer.register_foreign_predictor(random_forest.RandomForest)
    # Register invalid predictors.
    with pytest.raises(AssertionError):
        composer.register_foreign_predictor(None)
    with pytest.raises(AssertionError):
        composer.register_foreign_predictor('bans')
Example #40
def test_simulate_drawconstraint():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        metamodel = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, metamodel)
        with open(dha_csv, 'rU') as f:
            read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR dha_cc FOR dha USING crosscat (
                GUESS(*),
                name KEY
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR dha_cc')
        bdb.execute('ANALYZE dha_cc FOR 1 ITERATION WAIT')
        samples = bdb.execute('''
            SIMULATE ttl_mdcr_spnd, n_death_ill FROM dha_cc
                GIVEN TTL_MDCR_SPND = 40000
                LIMIT 100
        ''').fetchall()
        assert [s[0] for s in samples] == [40000] * 100
Example #41
def test_register_foreign_predictor():
    bdb = bayeslite.bayesdb_open()
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Register valid predictors.
    composer.register_foreign_predictor(random_forest.RandomForest)
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    # Register duplicates.
    with pytest.raises(BLE):
        composer.register_foreign_predictor(keplers_law.KeplersLaw)
    with pytest.raises(BLE):
        composer.register_foreign_predictor(
            multiple_regression.MultipleRegression)
    with pytest.raises(BLE):
        composer.register_foreign_predictor(random_forest.RandomForest)
    # Register invalid predictors.
    with pytest.raises(AssertionError):
        composer.register_foreign_predictor(None)
    with pytest.raises(AssertionError):
        composer.register_foreign_predictor('bans')
Example #42
def test_guess_generator():
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    bdb.sql_execute('CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)')
    a_z = range(ord('a'), ord('z') + 1)
    aa_zz = ((c, d) for c in a_z for d in a_z)
    data = ((chr(c) + chr(d), (c + d) % 2, math.sqrt(c + d)) for c, d in aa_zz)
    for row in data:
        bdb.sql_execute('INSERT INTO t (x, y, z) VALUES (?, ?, ?)', row)
    cc = crosscat.LocalEngine.LocalEngine(seed=0)
    metamodel = CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    with pytest.raises(ValueError):
        # No modelled columns.  (x is key.)
        bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat',
            overrides=[('y', 'ignore'), ('z', 'ignore')])
    bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat')
    with pytest.raises(ValueError):
        # Generator already exists.
        bayesdb_guess_generator(bdb, 't_cc', 't', 'crosscat')
    assert bdb.sql_execute('SELECT *'
            ' FROM bayesdb_generator_column').fetchall() == [
        (1, 1, 'categorical'),
        (1, 2, 'numerical'),
    ]
Example #43
def _test_example(bdb, exname):
    mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == n

    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id + 1, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)

    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg, ))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg, ))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg, ))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt, ))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt, ))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        bdb.execute('DROP MODEL 1 FROM %s' % (qg, ))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert not core.bayesdb_generator_has_model(bdb, gid, 1)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)

    # Test analyzing models.
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qg, ))
Example #44
def test_nig_normal_latent_smoke():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(x)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (x, ))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration wait')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration wait')
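        # The latent variable xe exists only in g1, so BQL that mentions xe
        # must be MODELLED BY g1; unqualified queries or MODELLED BY g0 raise
        # BQLError.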

        # PROBABILITY DENSITY OF x = v
        bdb.execute('estimate probability density of x = 50 within p') \
            .fetchall()
        with pytest.raises(BQLError):
            bdb.execute('estimate probability density of xe = 1 within p') \
                .fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 within p modelled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 within p modelled by g1
        ''').fetchall()

        # PREDICTIVE PROBABILITY OF x
        bdb.execute('estimate predictive probability of x from p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'estimate predictive probability of xe from p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate predictive probability of xe from p modelled by g0
            ''').fetchall()
        for r, p_xe in bdb.execute('''
            estimate rowid, predictive probability of xe from p modelled by g1
        '''):
            assert p_xe is None, 'rowid %r p(xe) %r' % (r, p_xe)

        # INFER/PREDICT
        bdb.execute(
            'INFER EXPLICIT PREDICT x CONFIDENCE x_c FROM p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p
                    MODELLED BY g0
            ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT xe CONFIDENCE xe_c FROM p
                MODELLED BY g1
        ''').fetchall()

        # SIMULATE x
        bdb.execute('simulate x from p limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate x, xe from p limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute(
                'simulate x, xe from p modelled by g0 limit 1').fetchall()
        bdb.execute('simulate x, xe from p modelled by g1 limit 1').fetchall()

        assert 100 == len(
            bdb.execute('''
            estimate similarity in the context of x from pairwise p limit 100
        ''').fetchall())
        assert 1 == len(
            bdb.execute('''
            estimate similarity in the context of x
            from pairwise p modelled by g0 limit 1
        ''').fetchall())
        # No such column xe in g0.
        with pytest.raises(BQLError):
            assert 1 == len(
                bdb.execute('''
                estimate similarity in the context of xe
                    from pairwise p modelled by g0 limit 1
            ''').fetchall())
        # Column xe exists in g1.
        assert 1 == len(
            bdb.execute('''
            estimate similarity in the context of xe
                from pairwise p modelled by g1 limit 1
        ''').fetchall())

        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
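
A note on the pattern above: each latent-variable query fails at the population level and for a generator that does not declare the latent variable, and succeeds only when MODELLED BY the declaring generator. A minimal helper sketch of that pattern (not part of the original test; the helper name is hypothetical, and pytest/BQLError are as imported for these tests):
def assert_latent_only(bdb, population_bql, other_generator_bql,
        declaring_generator_bql):
    # The population-level query must be rejected.
    with pytest.raises(BQLError):
        bdb.execute(population_bql).fetchall()
    # A generator that does not declare the latent variable must reject it too.
    with pytest.raises(BQLError):
        bdb.execute(other_generator_bql).fetchall()
    # The generator that declares the latent variable must accept it.
    bdb.execute(declaring_generator_bql).fetchall()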
Exemple #45
0
def test_nig_normal_latent_2var2lat_conditional_smoke():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                            (x, x * x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(
                xe deviation(x),
                ye deviation(y)
            )
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration wait')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration wait')

        # latent given latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (ye = -1) within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (ye = -1) within p
                     modelled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (ye = -1) within p
                 modelled by g1
        ''').fetchall()

        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p given ye = -1 limit 1
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modelled by g0 given ye = -1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modelled by g1 given ye = -1 limit 1
        ''').fetchall()

        with pytest.raises(BQLError):
            bdb.execute(
                'estimate dependence probability of xe with ye within p')
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate dependence probability of xe with ye within p
                    modelled by g0
            ''')
        bdb.execute('''
            estimate dependence probability of xe with ye within p
                modelled by g1
        ''')

        with pytest.raises(BQLError):
            bdb.execute('estimate mutual information of xe with ye within p')
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate mutual information of xe with ye within p
                    modelled by g0
            ''')
        bdb.execute('''
            estimate mutual information of xe with ye within p
                modelled by g1
        ''')

        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
Exemple #46
0
def test_nig_normal_latent_2var_smoke():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                            (x, x * x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')

        # CORRELATION, CORRELATION PVALUE, without generators.
        assert 4 == len(
            bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''').fetchall())

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration wait')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration wait')

        # CORRELATION, CORRELATION PVALUE, with generators.
        assert 4 == len(
            bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''').fetchall())
        assert 4 == len(
            bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p modelled by g0
        ''').fetchall())
        with pytest.raises(BQLError):
            assert 4 == len(
                bdb.execute('''
                estimate correlation, correlation pvalue
                    from pairwise variables of p modelled by g1
            ''').fetchall())

        # DEPENDENCE PROBABILITY, MUTUAL INFORMATION
        assert 4 == len(
            bdb.execute('''
            estimate dependence probability, mutual information
                from pairwise variables of p
        ''').fetchall())
        assert 4 == len(
            bdb.execute('''
            estimate dependence probability, mutual information
                from pairwise variables of p modelled by g0
        ''').fetchall())
        assert 9 == len(
            bdb.execute('''
            estimate dependence probability, mutual information
                from pairwise variables of p modelled by g1
        ''').fetchall())

        # SIMULATE LATENT VARIABLE
        assert 10 == len(
            bdb.execute('''
            simulate xe from p modeled by g1 limit 10;
        ''').fetchall())
        assert 10 == len(
            bdb.execute('''
            simulate y, xe from p modeled by g1 limit 10;
        ''').fetchall())
        # Cannot simulate the latent xe from the population p.
        with pytest.raises(BQLError):
            assert 10 == len(
                bdb.execute('''
                simulate xe from p limit 10;
            ''').fetchall())
        # Cannot simulate the latent xe from the generator g0.
        with pytest.raises(BQLError):
            assert 10 == len(
                bdb.execute('''
                simulate xe from p modeled by g0 limit 10;
            ''').fetchall())

        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
Exemple #47
0
def test_nig_normal_latent_conditional_smoke():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(x)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x) values(?)', (x, ))
        bdb.execute('create population p for t(x numerical)')
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration wait')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration wait')

        # observed given observed
        bdb.execute('''
            estimate probability density of x = 50 given (x = 50) within p
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (x = 50) within p
                modelled by g0
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (x = 50) within p
                modelled by g1
        ''').fetchall()

        # observed given latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 50) within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 50) within p
                    modelled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (xe = 50) within p
                modelled by g1
        ''').fetchall()

        # latent given observed
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 50 given (x = 50) within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 50 given (x = 50) within p
                    modelled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 50 given (x = 50) within p
                modelled by g1
        ''').fetchall()

        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
Exemple #48
0
def test_drop_generator():
    bdb = bayeslite.bayesdb_open()
    # Initialize the database
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    composer.register_foreign_predictor(random_forest.RandomForest)
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                    Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')
    generator_id = bayeslite.core.bayesdb_get_generator(bdb, 't1')
    schema = [
        ('table', 'bayesdb_composer_cc_id'),
        ('table', 'bayesdb_composer_column_owner'),
        ('table', 'bayesdb_composer_column_toposort'),
        ('table', 'bayesdb_composer_column_parents'),
        ('table', 'bayesdb_composer_column_foreign_predictor'),
    ]
    # Iterate through tables before dropping.
    for _, name in schema:
        bdb.sql_execute('''
            SELECT * FROM {} WHERE generator_id=?
        '''.format(quote(name)), (generator_id,)).next()
    # Drop generator and ensure table lookups with generator_id throw error.
    bdb.execute('DROP GENERATOR t1')
    for _, name in schema:
        with pytest.raises(StopIteration):
            bdb.sql_execute('''
                SELECT * FROM {} WHERE generator_id=?
            '''.format(quote(name)), (generator_id,)).next()
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1')
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    bdb.close()
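
The quote helper used above is not defined in this snippet; it is assumed to be an SQL identifier quoter, roughly equivalent to:
# Assumed import for the quote helper (not shown in the original snippet).
from bayeslite import bql_quote_name as quote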
Exemple #49
0
def test_composer_integration__ci_slow():
    # It is currently difficult to separate this into smaller tests because
    # of their sequential nature.  We still exercise all internal functions
    # under different regimes of operation.

    # SETUP
    # -----
    # Dataset.
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')
    # Composer.
    composer = Composer(n_samples=5)
    composer.register_foreign_predictor(
        multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    composer.register_foreign_predictor(random_forest.RandomForest)
    # Use complex generator for interesting test cases.
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                    Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')


    # ----------------------
    # TEST INITIALIZE MODELS
    # ----------------------

    bdb.execute('INITIALIZE 2 MODELS FOR t1')
    # Check number of models.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    assert len(df) == 2
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    assert len(df) == 2

    # -------------------
    # TEST ANALYZE MODELS
    # -------------------

    bdb.execute('ANALYZE t1 FOR 2 ITERATIONS WAIT;')
    # Check number of iterations of composer.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # Check number of iterations of composer_cc.
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2

    # ----------------------------------
    # TEST COLUMN DEPENDENCE PROBABILITY
    # ----------------------------------

    # Special 0/1 regimes.
    # Local with an INDEPENDENT local should be 0.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Date_of_Launch
            WITH Country_of_Operator FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 0
    # Local with a DEPENDENT local should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Perigee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Apogee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    # Foreign with a local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Apogee_km
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH Power_watts
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Type_of_Orbit WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a local non-parent that is DEPENDENT with a local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Eccentricity FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common direct ancestor should be 1.
    # Launch_Mass_kg is the common parent.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common DEPENDENT ancestor should be 1.
    # Eccentricity is a parent of Type_of_Orbit, and is dependent
    # with Period_minutes through DEPENDENT(Apogee_km, Perigee_km, Eccentricity).
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Column with itself should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.

    # Unknown [0,1] regimes.
    # Foreign with a local whose relation to its parents is unknown.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    # Foreign with a foreign whose ancestry relation is unknown.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Period_minutes FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.

    # ----------------------------------
    # TEST SIMULATE
    # ----------------------------------

    # Crash tests for various code paths. Quality of simulations ignored.
    # Joint local.
    curs = bdb.execute('''
        SIMULATE Power_watts, Launch_Mass_kg FROM t1 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes FROM t1 GIVEN Apogee_km = 1000, Perigee_km = 980
            LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign with missing parents.
    curs = bdb.execute('''
        SIMULATE Anticipated_Lifetime FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with parents, and missing parents.
    curs = bdb.execute('''
        SIMULATE Type_of_Orbit, Eccentricity FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with non-parents.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Eccentricity FROM t1 GIVEN Apogee_km = 38000
            LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint local conditioned on two foreigns.
    curs = bdb.execute('''
        SIMULATE Country_of_Operator, Inclination_radians FROM t1
            GIVEN Period_minutes = 1432, Anticipated_Lifetime = 5 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint foreign conditioned on third foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Anticipated_Lifetime FROM t1
            GIVEN Type_of_Orbit = 'Deep Highly Eccentric' LIMIT 2
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate foreign conditioned on itself.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Apogee_km FROM t1
            GIVEN Period_minutes = 102 LIMIT 2
    ''')
    assert [s[0] for s in curs] == [102] * 2

    # -----------------------------
    # TEST COLUMN VALUE PROBABILITY
    # -----------------------------

    # Crash tests for various code paths. Quality of logpdf ignored.
    # Conditional local.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Power_watts = 800 GIVEN (Perigee_km = 980,
            Launch_Mass_kg = 890) FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Unconditional foreign
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Conditional foreign on parent and non-parents.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 GIVEN
            (Apogee_km = 38000, Eccentricity = 0.03) FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[0]
    # Conditional foreign on foreign.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Anticipated_Lifetime = 4.09 GIVEN
            (Class_of_Orbit = 'LEO', Purpose='Astrophysics',
                Period_minutes = 1436) FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Probability of a categorical foreign value should be at most 1.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Type_of_Orbit = 'Polar' FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] <= 1.
    # Query inconsistent with evidence should be 0.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF "Type_of_Orbit" = 'Polar'
            GIVEN ("Type_of_Orbit" = 'Deep Highly Eccentric') FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] == 0.
    # In theory, query consistent with evidence should be 1, but this is very
    # hard to ensure due to stochastic sampling giving different estimates of
    # P(Y), once in joint and once in marginal Monte Carlo estimation.

    # -----------------------
    # TEST MUTUAL INFORMATION
    # -----------------------

    # Two local columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Country_of_Contractor WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX Small sample sizes non-deterministically produce negative MI
    assert -1 <= curs.next()[0]
    # One local and one foreign column.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]
    # Two foreign columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            Anticipated_Lifetime USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]

    # -----------------------
    # TEST PREDICT CONFIDENCE
    # -----------------------

    # Continuous local column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Dry_Mass_kg CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete local column with no children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Purpose CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Discrete local column with children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Contractor CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Continuous foreign columns.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Period_minutes CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete foreign column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Type_of_Orbit CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1

    bdb.close()
def _test_example(bdb, exname):
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == n

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert not core.bayesdb_has_generator(bdb, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)

    gid = core.bayesdb_get_generator(bdb, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg,))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg,))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt,))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt,))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    assert gid == core.bayesdb_get_generator(bdb, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        bdb.execute('DROP MODEL 1 FROM %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert not core.bayesdb_generator_has_model(bdb, gid, 1)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)

    # Test analyzing models.
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qg,))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qg,))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qg,))
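
For context, _test_example unpacks entries of an examples dict that is not shown in this snippet. A purely hypothetical entry (metamodel class, table name and SQL, sample rows, generator name, one valid and two failing CREATE GENERATOR statements) might look like this; all names and SQL below are illustrative only:
examples = {
    'some_example': (
        SomeMetamodel,                      # metamodel class (stand-in name)
        't',                                # table name
        'CREATE TABLE t(a INTEGER, b TEXT)',
        'INSERT INTO t (a, b) VALUES (?, ?)',
        [(1, 'one'), (2, 'two')],
        't_g',                              # generator name
        'CREATE GENERATOR t_g FOR t USING some_mm(a NUMERICAL)',
        # Two statements expected to fail, e.g. a bogus stattype and a
        # nonexistent column:
        'CREATE GENERATOR t_g FOR t USING some_mm(a BOGUS)',
        'CREATE GENERATOR t_g FOR t USING some_mm(c NUMERICAL)',
    ),
}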
def test_geweke_troll():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        bayeslite.bayesdb_register_metamodel(bdb, troll.TrollMetamodel())
        kl_est = geweke.geweke_kl(bdb, "troll_rng", [['column', 'numerical']],
            ['column'], [(1,0)], 2, 2, 2, 2)
        assert kl_est == (2, 0, 0)
def test_complex_dependencies__ci_slow():
    # Parameterize number of rows in synthetic dataset.
    n_rows = 250

    # Add an id column to ensure generator and cc colnos are different.
    ids = np.arange(n_rows)

    # Create real-valued data, such that DEP(x,y), DEP(y,z), and IND(x,z)
    mean = [4, -2, -11]
    cov = [[3.0, 0.7, 0.0],
           [0.7, 4.0, 0.6],
           [0.0, 0.6, 2.0]]
    numerical_data = np.random.multivariate_normal(mean, cov, size=n_rows)
    x, y, z = numerical_data[:,0], numerical_data[:,1], numerical_data[:,2]

    # Create categorical data v, highly dependent on x.
    bins = [np.percentile(x,p) for p in xrange(0,101,10)]
    v = np.digitize(x, bins)

    # Create categorical data, independent of all other columns.
    w = np.random.choice(range(8), size=n_rows)

    data = np.vstack((ids,x,y,z,w,v)).T

    # Create the database.
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)

        # Read the dataset.
        bdb.sql_execute('CREATE TABLE foo(id,x,y,z,v,w)')
        for row in data:
            bdb.sql_execute('INSERT INTO foo VALUES(?,?,?,?,?,?)', row)

        # Create the schema; we will force IND(x, y), IND(x, v), and DEP(z, v, w).
        bql = '''
            CREATE GENERATOR bar FOR foo USING crosscat(
                GUESS(*),
                id IGNORE,
                x NUMERICAL,
                y NUMERICAL,
                z NUMERICAL,
                v CATEGORICAL,
                w CATEGORICAL,
                INDEPENDENT(x, y),
                INDEPENDENT(x, v),
                DEPENDENT(z, v, w)
            );
        '''
        bdb.execute(bql)

        # Prepare the checker function.
        def check_dependencies():
            bql = '''
                ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE COLUMNS OF bar
            '''
            for _id, col1, col2, dep in bdb.execute(bql):
                # test IND(x y)
                if (col1, col2) in [('x','y'), ('y','x')]:
                    assert dep == 0
                    continue
                # test IND(x v)
                if (col1, col2) in [('x','v'), ('v','x')]:
                    assert dep == 0
                    continue
                # test DEP(z v)
                if (col1, col2) in [('z','v'), ('v','z')]:
                    assert dep == 1
                    continue
                # test DEP(z w)
                if (col1, col2) in [('z', 'w'), ('w', 'z')]:
                    assert dep == 1
                    continue

        # Test dependency pre-analysis.
        bdb.execute('INITIALIZE 10 MODELS FOR bar')
        check_dependencies()

        # Test dependency post-analysis.
        bdb.execute('ANALYZE bar for 10 ITERATION WAIT')
        check_dependencies()
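
A standalone sanity sketch (not part of the test) for the synthetic scheme above: binning x into deciles yields a v that tracks x closely, while the independently drawn w does not. It assumes numpy imported as np, as in the test.
def _synthetic_dependence_sanity_sketch(n_rows=250, seed=0):
    rng = np.random.RandomState(seed)
    # Draw x with the same marginal variance as the test's covariance matrix.
    x = rng.normal(4, np.sqrt(3.0), size=n_rows)
    bins = [np.percentile(x, p) for p in xrange(0, 101, 10)]
    v = np.digitize(x, bins)
    w = rng.choice(range(8), size=n_rows)
    # v is almost perfectly rank-correlated with x, while w should be
    # essentially uncorrelated with x, so this holds with high probability.
    assert abs(np.corrcoef(x, v)[0, 1]) > abs(np.corrcoef(x, w)[0, 1])
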
def test_correlation():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        cc = crosscat.LocalEngine.LocalEngine(seed=0)
        ccme = CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccme)
        bdb.sql_execute('CREATE TABLE u(id, c0, c1, n0, n1, r0, r1)')
        bdb.execute('''
            CREATE GENERATOR u_cc FOR u USING crosscat (
                c0 CATEGORICAL,
                c1 CATEGORICAL,
                n0 NUMERICAL,
                n1 NUMERICAL,
                r0 CYCLIC,
                r1 CYCLIC,
            )
        ''')
        assert bdb.execute('ESTIMATE CORRELATION, CORRELATION PVALUE'
                ' FROM PAIRWISE COLUMNS OF u_cc'
                ' WHERE name0 < name1'
                ' ORDER BY name0, name1').fetchall() == \
            [
                (1, 'c0', 'c1', None, None),
                (1, 'c0', 'n0', None, None),
                (1, 'c0', 'n1', None, None),
                (1, 'c0', 'r0', None, None),
                (1, 'c0', 'r1', None, None),
                (1, 'c1', 'n0', None, None),
                (1, 'c1', 'n1', None, None),
                (1, 'c1', 'r0', None, None),
                (1, 'c1', 'r1', None, None),
                (1, 'n0', 'n1', None, None),
                (1, 'n0', 'r0', None, None),
                (1, 'n0', 'r1', None, None),
                (1, 'n1', 'r0', None, None),
                (1, 'n1', 'r1', None, None),
                (1, 'r0', 'r1', None, None),
            ]
        bdb.sql_execute('CREATE TABLE t'
            '(id, c0, c1, cx, cy, n0, n1, nc, nl, nx, ny)')
        data = [
            ('foo', 'quagga', 'x', 'y', 0, -1, +1, 1, 0, 13),
            ('bar', 'eland', 'x', 'y', 87, -2, -1, 2, 0, 13),
            ('baz', 'caribou', 'x', 'y', 92.1, -3, +1, 3, 0, 13),
        ] * 10
        for i, row in enumerate(data):
            row = (i + 1,) + row
            bdb.sql_execute('INSERT INTO t VALUES (?,?,?,?,?,?,?,?,?,?,?)',
                row)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                c0 CATEGORICAL,
                c1 CATEGORICAL,
                cx CATEGORICAL,
                cy CATEGORICAL,
                n0 NUMERICAL,
                n1 NUMERICAL,
                nc NUMERICAL,
                nl NUMERICAL,
                nx NUMERICAL,
                ny NUMERICAL
            )
        ''')
        result = bdb.execute('ESTIMATE CORRELATION, CORRELATION PVALUE'
            ' FROM PAIRWISE COLUMNS OF t_cc'
            ' WHERE name0 < name1'
            ' ORDER BY name0, name1').fetchall()
        expected = [
                (2, 'c0', 'c1', 1., 2.900863120340436e-12),
                (2, 'c0', 'cx', None, None),
                (2, 'c0', 'cy', None, None),
                (2, 'c0', 'n0', 1., 0.),
                (2, 'c0', 'n1', 1., 0.),
                (2, 'c0', 'nc', 1., 0.),
                (2, 'c0', 'nl', 1., 0.),
                (2, 'c0', 'nx', None, None),
                (2, 'c0', 'ny', None, None),
                (2, 'c1', 'cx', None, None),
                (2, 'c1', 'cy', None, None),
                (2, 'c1', 'n0', 1., 0.),
                (2, 'c1', 'n1', 1., 0.),
                (2, 'c1', 'nc', 1., 0.),
                (2, 'c1', 'nl', 1., 0.),
                (2, 'c1', 'nx', None, None),
                (2, 'c1', 'ny', None, None),
                (2, 'cx', 'cy', None, None),
                (2, 'cx', 'n0', None, None),
                (2, 'cx', 'n1', None, None),
                (2, 'cx', 'nc', None, None),
                (2, 'cx', 'nl', None, None),
                (2, 'cx', 'nx', None, None),
                (2, 'cx', 'ny', None, None),
                (2, 'cy', 'n0', None, None),
                (2, 'cy', 'n1', None, None),
                (2, 'cy', 'nc', None, None),
                (2, 'cy', 'nl', None, None),
                (2, 'cy', 'nx', None, None),
                (2, 'cy', 'ny', None, None),
                (2, 'n0', 'n1', 0.7913965673596881, 0.),
                (2, 'n0', 'nc', 0.20860343264031175, 0.0111758925135),
                (2, 'n0', 'nl', 0.7913965673596881, 0.),
                (2, 'n0', 'nx', None, None),
                (2, 'n0', 'ny', None, None),
                (2, 'n1', 'nc', 0., 1.),
                (2, 'n1', 'nl', 1., 0.),
                (2, 'n1', 'nx', None, None),
                (2, 'n1', 'ny', None, None),
                (2, 'nc', 'nl', 0., 1.),
                (2, 'nc', 'nx', None, None),
                (2, 'nc', 'ny', None, None),
                (2, 'nl', 'nx', None, None),
                (2, 'nl', 'ny', None, None),
                (2, 'nx', 'ny', None, None),
            ]
    for expected_item, observed_item in zip(expected, result):
        (xpd_genid, xpd_name0, xpd_name1, xpd_corr, xpd_corr_p) = expected_item
        (obs_genid, obs_name0, obs_name1, obs_corr, obs_corr_p) = observed_item
        assert xpd_genid == obs_genid
        assert xpd_name0 == obs_name0
        assert xpd_name1 == obs_name1
        assert xpd_corr == obs_corr or relerr(xpd_corr, obs_corr) < 1e-10
        assert (xpd_corr_p == obs_corr_p or
                relerr(xpd_corr_p, obs_corr_p) < 1e-1)
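
The relerr helper used in the assertions is not defined in this snippet; a minimal sketch of the relative-error function it is assumed to be:
def relerr(expected, observed):
    # Relative error of observed with respect to expected, falling back to
    # absolute error when expected is zero.
    if expected == 0:
        return abs(observed - expected)
    return abs((observed - expected) / expected)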
Exemple #54
0
def test_nig_normal_latent_2var_conditional_smoke():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                            (x, x * x - 100))
        bdb.execute('create population p for t(x numerical; y numerical)')

        # CORRELATION, CORRELATION PVALUE, without generators.
        assert 4 == len(
            bdb.execute('''
            estimate correlation, correlation pvalue
                from pairwise variables of p
        ''').fetchall())

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        bdb.execute('initialize 1 model for g0')
        bdb.execute('analyze g0 for 1 iteration wait')
        bdb.execute('initialize 1 model for g1')
        bdb.execute('analyze g1 for 1 iteration wait')

        # observed given other observed
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49) within p
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49) within p
                modelled by g0
        ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (y = 49) within p
                modelled by g1
        ''').fetchall()
        bdb.execute('simulate x from p given y = 49 limit 1').fetchall()
        bdb.execute('''
            simulate x from p modelled by g0 given y = 49 limit 1
        ''').fetchall()
        bdb.execute('''
            simulate x from p modelled by g1 given y = 49 limit 1
        ''').fetchall()

        # observed given related latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 1) within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of x = 50 given (xe = 1) within p
                    modelled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of x = 50 given (xe = 1) within p
                modelled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate x from p given xe = 1 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate x from p modelled by g0 given xe = 1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate x from p modelled by g1 given xe = 1 limit 1
        ''').fetchall()

        # observed given unrelated latent
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of y = 50 given (xe = 1) within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of y = 50 given (xe = 1) within p
                    modelled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of y = 50 given (xe = 1) within p
                modelled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate y from p given xe = 1 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate y from p modelled by g0 given xe = 1 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate y from p modelled by g1 given xe = 1 limit 1
        ''').fetchall()

        # latent given related observed
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (x = 50) within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (x = 50) within p
                    modelled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (x = 50) within p
                modelled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate xe from p given x = 50 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modelled by g0 given x = 50 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modelled by g1 given x = 50 limit 1
        ''').fetchall()

        # latent given unrelated observed
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (y = 50) within p
            ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                estimate probability density of xe = 1 given (y = 50) within p
                    modelled by g0
            ''').fetchall()
        bdb.execute('''
            estimate probability density of xe = 1 given (y = 50) within p
                modelled by g1
        ''').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('simulate xe from p given y = 50 limit 1').fetchall()
        with pytest.raises(BQLError):
            bdb.execute('''
                simulate xe from p modelled by g0 given y = 50 limit 1
            ''').fetchall()
        bdb.execute('''
            simulate xe from p modelled by g1 given y = 50 limit 1
        ''').fetchall()

        bdb.execute('drop models from g0')
        bdb.execute('drop generator g0')
        bdb.execute('drop models from g1')
        bdb.execute('drop generator g1')
        bdb.execute('drop population p')
        bdb.execute('drop table t')
Exemple #55
0
def test_regress_bonanza__ci_integration():
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_metamodel(
            bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS NOMINAL;
                MODEL country_of_operator AS NOMINAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE METAMODEL m FOR satellites WITH BASELINE crosscat;
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept'], ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee, country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept', 'perigee'], ['country_of_operator'])

        # Regression on all variables.
        results = bdb.execute('''
            REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results,
            ['intercept', 'perigee', 'launch_mass', 'period',],
            ['country_of_operator', 'class_of_orbit',],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute('''
            REGRESS apogee GIVEN (
                satellites.(
                    ESTIMATE * FROM VARIABLES OF satellites
                    ORDER BY dependence probability with apogee DESC
                    LIMIT ?
                )
            )
            USING 12 SAMPLES BY satellites MODELLED BY m USING MODEL 1;
        ''', (3,)).fetchall()

        cursor = bdb.execute('''
            ESTIMATE * FROM VARIABLES OF satellites
                ORDER BY dependence probability with apogee DESC
                LIMIT ?
        ''', (3,)).fetchall()
        top_variables = [c[0] for c in cursor]
        nominals = [
            var for var in top_variables
            if var in ['country_of_operator', 'class_of_orbit',]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(
            results, numericals + ['intercept'], nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                REGRESS apogee GIVEN (*, class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()

        # Not enough data for regression (1 unique nominal variable).
        with pytest.raises(ValueError):
            bdb.execute('''
                REGRESS apogee GIVEN (class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()
Exemple #56
0
def test_create_generator_schema():
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Using crosscat and default to specify models should work.
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL
            ),
            crosscat (
                Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL
            )
        );''')
    assert bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    # IGNORE and GUESS(*) are forbidden and should crash.
    with pytest.raises(AttributeError):
        bdb.execute('''
            CREATE GENERATOR t2 FOR satellites USING composer(
                default (
                    GUESS(*), Country_of_Operator IGNORE,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                )
            );''')
    # Test unregistered foreign predictor.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t3 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Unregistered foreign predictor should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t4 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Registered foreign predictor should work.
    composer.register_foreign_predictor(random_forest.RandomForest)
    bdb.execute('''
        CREATE GENERATOR t5 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Eccentricity NUMERICAL
            ),
            random_forest (
                Apogee_km NUMERICAL GIVEN Operator_Owner
            )
        );''')
    # Wrong stattype in predictor should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t6 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km RADIAL GIVEN Operator_Owner
                )
            );''')
    # Missing GIVEN keyword should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t6 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL, Operator_Owner
                )
            );''')
    # GIVEN conditions naming columns not declared elsewhere should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t7 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Test duplicate declarations.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t7 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Class_of_orbit CATEGORICAL GIVEN Operator_Owner
                )
            );''')
    # Arbitrary DAG with foreign predictors.
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    bdb.execute('''
        CREATE GENERATOR t8 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
            ),
            random_forest (
                Apogee_km NUMERICAL GIVEN Operator_Owner, Users
            ),
            multiple_regression (
                Eccentricity NUMERICAL GIVEN Apogee_km, Users, Perigee_km
            )
        );''')
    # Duplicate declarations in foreign predictors should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t9 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Perigee_km NUMERICAL GIVEN Purpose
                ),
                multiple_regression (
                    Perigee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # MML for default models should work.
    bdb.execute('''
        CREATE GENERATOR t10 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Apogee_km NUMERICAL
            )
            random_forest (
                Perigee_km NUMERICAL GIVEN Purpose
            )
            multiple_regression (
                Eccentricity NUMERICAL GIVEN Operator_Owner, Class_of_orbit
            )
            DEPENDENT(Apogee_km, Perigee_km, Purpose),
            INDEPENDENT(Country_of_Operator, Purpose)
        );''')
    # MML for foreign predictors should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t11 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Apogee_km NUMERICAL
                ),
                random_forest (
                    Perigee_km NUMERICAL GIVEN Purpose
                ),
                multiple_regression (
                    Eccentricity NUMERICAL GIVEN Operator_Owner, Class_of_orbit
                )
                DEPENDENT(Apogee_km, Eccentricity, Country_of_Operator),
                INDEPENDENT(Perigee_km, Purpose)
            );''')
    # Test full generator.
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    bdb.execute('''
        CREATE GENERATOR t12 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg, Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            INDEPENDENT(Country_of_Operator, longitude_radians_of_geo)
        );''')
    bdb.close()

def make_bdb():
    crosscat = test_core.local_crosscat()
    metamodel = test_core.CrosscatMetamodel(crosscat)
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    return bdb
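
# A minimal usage sketch (an assumption, not part of the original suite): callers
# of make_bdb() get a fresh in-memory BayesDB with only the crosscat metamodel
# registered, and are responsible for closing it.  The table and generator names
# below are hypothetical.
def _example_make_bdb_usage():
    bdb = make_bdb()
    try:
        bdb.sql_execute('CREATE TABLE t(x NUMERIC)')
        bdb.sql_execute('INSERT INTO t (x) VALUES (1)')
        bdb.sql_execute('INSERT INTO t (x) VALUES (2)')
        bdb.execute('CREATE GENERATOR t_cc FOR t USING crosscat(x NUMERICAL)')
    finally:
        bdb.close()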
Exemple #58
0
# Find the satellites file.
PATH_KEPLER = os.path.dirname(os.path.abspath(__file__))
PATH_EXAMPLES = os.path.dirname(PATH_KEPLER)
PATH_SATELLITES = os.path.join(PATH_EXAMPLES, 'satellites')
PATH_SATELLITES_CSV = os.path.join(PATH_SATELLITES, 'satellites.csv')

composer = Composer()
composer.register_foreign_predictor(keplers_law.KeplersLaw)
composer.register_foreign_predictor(random_forest.RandomForest)

if os.path.exists(os.path.join(outdir, 'kepler.bdb')):
    os.remove(os.path.join(outdir, 'kepler.bdb'))

bdb = bayeslite.bayesdb_open(os.path.join(outdir, 'kepler.bdb'))
bayeslite.bayesdb_register_metamodel(bdb, composer)
bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
    header=True, create=True)

bdbcontrib.query(bdb, '''
    CREATE GENERATOR sat_kepler FOR satellites USING composer(
        default (
            Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
            Users CATEGORICAL, Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL, Perigee_km NUMERICAL,
            Apogee_km NUMERICAL, Eccentricity NUMERICAL,
            Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
Exemple #59
0
def test_legacy_models__ci_slow():
    bdb = bayeslite.bayesdb_open(builtin_metamodels=False)
    cc = crosscat.LocalEngine.LocalEngine(seed=0)
    metamodel = CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    with pytest.raises(ValueError):
        bayeslite.bayesdb_load_legacy_models(bdb,
                                             'dha_cc',
                                             'dha',
                                             'crosscat',
                                             dha_models,
                                             create=True)
    with open(dha_csv, 'rU') as f:
        read_csv.bayesdb_read_csv(bdb, 'dha', f, header=True, create=True)
    bayeslite.bayesdb_load_legacy_models(bdb,
                                         'dha_cc',
                                         'dha',
                                         'crosscat',
                                         dha_models,
                                         create=True)
    # Make sure guessing also works.
    bdb.execute('create generator dha_cc0 for dha using crosscat(guess(*))')
    bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
    # Need to be able to overwrite existing codebook.
    #
    # XXX Not sure this is the right API.  What if overwrite is a
    # mistake?
    bayeslite.bayesdb_load_codebook_csv_file(bdb, 'dha', dha_codebook)
    bql = '''
        ESTIMATE name FROM dha_cc
            ORDER BY SIMILARITY TO (name = ?) DESC
            LIMIT 10
    '''
    with bdb.savepoint():
        assert bdb.execute(bql, ('Albany NY', )).fetchall() == [
            ('Albany NY', ),
            ('Scranton PA', ),
            ('United States US', ),
            ('Norfolk VA', ),
            ('Reading PA', ),
            ('Salisbury MD', ),
            ('Louisville KY', ),
            ('Cleveland OH', ),
            ('Covington KY', ),
            ('Akron OH', ),
        ]
    # Tickles an issue in case-folding of column names.
    bql = '''
        ESTIMATE name
            FROM dha_cc
            ORDER BY PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc ASC
            LIMIT 10
    '''
    with bdb.savepoint():
        assert bdb.execute(bql).fetchall() == [
            ('McAllen TX', ),
            ('Worcester MA', ),
            ('Beaumont TX', ),
            ('Temple TX', ),
            ('Corpus Christi TX', ),
            ('Takoma Park MD', ),
            ('Kingsport TN', ),
            ('Bangor ME', ),
            ('Lebanon NH', ),
            ('Panama City FL', ),
        ]
Exemple #60
0
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    then = time.time()

    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user
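    # Name output files '<out_dir>/<base>-<YYYY-MM-DD>-<user><ext>'.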
    def out_file_name(base, ext):
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')

    # enable wizard mode so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        print 'Error: File', bdb_file, 'already exists. Please remove it.'
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
            header=True, create=True, ifnotexists=True)

    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
          AND type_of_orbit = 'NaN'
    ''')

    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')

    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)

    # create the crosscat generator with an explicit schema for every column
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models,))

    cur_iter_ct = 0

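    # Checkpoint helper: vacuum the database, copy the on-disk .bdb to a
    # per-checkpoint file, and write a metadata report alongside it.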
    def snapshot():
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        os.system("cp %s %s" % (bdb_file, save_file_name))
        report(save_file_name, meta_file_name)

    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n"
                % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
                % (bayeslite.__version__, crosscat.__version__, bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

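    # Record provenance for a saved .bdb: its SHA-256, build details, timing,
    # and a copy of this script appended to the metadata file.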
    def report(saved_file_name, metadata_file, echo=False, plot_file_name=None):
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            for chunk in iter(lambda: fd.read(65536), ''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name,
                            sha_sum, total_time, plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            fd.flush()
            os.system("cat %s >> %s" % (__file__, metadata_file))

        if echo:
            record_metadata(sys.stdout, saved_file_name,
                            sha_sum, total_time, plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.plot_crosscat_chain_diagnostics(bdb, 'logscore',
                                                          'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file,
               echo=True, plot_file_name=plot_file_name)

    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT' % checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()

    final_report()

    log('closing bdb %s' % bdb_file)
    bdb.close()
    os.system("cd %s && ln -s satellites%s.bdb satellites.bdb" % (out_dir, filestamp))