Esempio n. 1
0
def cgpm_smoke_bdb():
    """Yield an in-memory bdb holding a small table `t` and population `p`.

    A CGPM backend exposing `PieceWise` under the name 'piecewise' is
    registered, `t` is filled with one row per (i, j, k) in {0, 1, 2}^3,
    and the population `p` is created over the three columns.
    """
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        backend = CGPM_Backend({'piecewise': PieceWise}, multiprocess=0)
        bayesdb_register_backend(bdb, backend)

        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    # An odd index replaces the matching cell with None:
                    # i -> output, j -> cat, k -> input.
                    output_value = None if i % 2 else i + j/(k + 1)
                    cat_value = None if j % 2 else (
                        -1 if (i + j*k) % 2 else +1)
                    input_value = None if k % 2 else (i*j - k)**2
                    bdb.sql_execute('''
                        INSERT INTO t (output, cat, input) VALUES (?, ?, ?)
                    ''', (output_value, cat_value, input_value))

        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                output  NUMERICAL;
                input   NUMERICAL;
                cat     NOMINAL;
            )
        ''')

        yield bdb
Esempio n. 2
0
def get_backend_object(cfg):
    """Build the backend object selected by `cfg.backend`.

    Parameters:
        cfg: configuration object with a `backend` attribute ('cgpm' or
            'loom') and, for the 'loom' backend, a `loom_path` attribute.

    Returns:
        A `CGPM_Backend` with an empty registry and multiprocessing disabled
        for 'cgpm', or a `LoomBackend` built from `cfg.loom_path` for 'loom'.

    Raises:
        RuntimeError: if `cfg.backend` is None or names an unknown backend.
    """
    if cfg.backend is None:
        raise RuntimeError('BACKEND was not set in config file')

    if cfg.backend == 'cgpm':
        return CGPM_Backend({}, multiprocess=False)
    if cfg.backend == 'loom':
        return LoomBackend(cfg.loom_path)

    # Fail loudly rather than handing None to the caller, which would only
    # surface later as an unrelated error at backend registration time.
    raise RuntimeError(
        'Unknown BACKEND in config file: %r' % (cfg.backend,))
Esempio n. 3
0
def bayesdb(backend=None, **kwargs):
    """Yield an open bdb with `backend` registered, closing it afterwards.

    Parameters:
        backend: backend object to register; when None, a `CGPM_Backend`
            with an empty registry and multiprocessing disabled is used.
        **kwargs: forwarded to `bayeslite.bayesdb_open`.

    Yields:
        The opened bdb, opened with `builtin_backends=False`.

    This is a generator; presumably it is wrapped as a context manager or
    fixture by a decorator not visible here (TODO confirm).
    """
    if backend is None:
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=False)
    bdb = bayeslite.bayesdb_open(builtin_backends=False, **kwargs)
    try:
        # Register inside the try so that a failing registration still
        # closes the freshly opened database instead of leaking it.
        bayeslite.bayesdb_register_backend(bdb, backend)
        yield bdb
    finally:
        bdb.close()
Esempio n. 4
0
def t1_mp():
    """Return the t1/p1/p1_cc population built on a multiprocess CGPM backend."""
    multiprocess_backend = CGPM_Backend(cgpm_registry={}, multiprocess=True)
    column_specs = [
        'id IGNORE',
        'label NOMINAL',
        'age NUMERICAL',
        'weight NUMERICAL',
    ]
    return bayesdb_population(
        bayesdb(backend=multiprocess_backend),
        't1', 'p1', 'p1_cc',
        t1_schema, t1_data,
        columns=column_specs)
Esempio n. 5
0
def cgpm_dummy_satellites_pop_bdb():
    """Yield the dummy satellites bdb with a population and CGPM backend.

    The `satellites` population is created over `satellites_ucs` first, then
    a CGPM backend with an empty registry is registered.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        create_population = '''
            create population satellites for satellites_ucs with schema(
                apogee numerical;
                class_of_orbit nominal;
                country_of_operator nominal;
                launch_mass numerical;
                perigee numerical;
                period numerical
            )
        '''
        bdb.execute(create_population)
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        yield bdb
Esempio n. 6
0
def run(stdin, stdout, stderr, argv):
    """Run the bayeslite shell with a CGPM backend and return an exit status.

    Parameters:
        stdin, stdout, stderr: streams handed to the shell; usage errors are
            written to `stderr`.
        argv: full argument vector; `argv[0]` is the program name and the
            remainder is passed to `parse_args`.

    Returns:
        1 if neither a bdb path nor -m/--memory was given, or if the path is
        '-'; 0 otherwise.  A script path that is not a file only stops the
        processing of the remaining script paths.
    """
    args = parse_args(argv[1:])
    # Keep only the last path component of the program name.  rfind returns
    # -1 when there is no slash, so slicing from `index + 1` keeps the whole
    # name in that case and also strips a slash at index 0 (e.g. '/prog').
    progname = argv[0]
    progname = progname[progname.rfind('/') + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname,))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname,))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
        builtin_backends=False)
    try:
        # Any job count other than 1 enables multiprocessing in the backend.
        multiprocess = args.jobs != 1
        backend = CGPM_Backend(cgpm_registry={}, multiprocess=multiprocess)
        bayeslite.bayesdb_register_backend(bdb, backend)
        bdbshell = shell.Shell(bdb, 'cgpm', stdin, stdout, stderr)
        with hook.set_current_shell(bdbshell):
            if not args.no_init_file:
                init_file = os.path.expanduser('~/.bayesliterc')
                if os.path.isfile(init_file):
                    bdbshell.dot_read(init_file)

            if args.file is not None:
                for path in args.file:
                    if os.path.isfile(path):
                        bdbshell.dot_read(path)
                    else:
                        bdbshell.stdout.write(
                            '%s is not a file.  Aborting.\n' % (str(path),))
                        break

            if not args.batch:
                bdbshell.cmdloop()
    finally:
        # Release the database handle however the shell session ends.
        bdb.close()
    return 0
Esempio n. 7
0
    def bayesdb(self, line, cell=None):
        """Open the bdb file named in `line` and register a CGPM backend.

        `line` is split on whitespace and parsed as a positional path, an
        integer seed `-s` (default 0) and a flag `-j` that turns on
        multiprocessing.  A bdb already held by this object is closed first.
        `cell` is not used.  Returns the string 'Loaded: <path>'.
        """
        option_parser = argparse.ArgumentParser()
        option_parser.add_argument('path', help='Path of bdb file.')
        option_parser.add_argument('-s', type=int, default=0, help='Seed.')
        option_parser.add_argument(
            '-j', action='store_true', help='Multiprocessing.')
        options = option_parser.parse_args(line.split())

        # Release the previously opened database, if there is one.
        if self._bdb is not None:
            self._bdb.close()
            self._bdb = None

        self._path = options.path
        # The integer seed goes in the last of four little-endian 64-bit
        # words; the other three are zero.
        self._bdb = bayesdb_open(
            pathname=options.path,
            seed=struct.pack('<QQQQ', 0, 0, 0, options.s),
            builtin_backends=False)

        # Small hack for the VsCGpm: when no explicit source is given, use
        # the venturescript source collected from %venturescript cells.
        def _VsCGpm(outputs, inputs, rng, *args, **kwds):
            if 'source' not in kwds:
                kwds['source'] = '\n'.join(self._venturescript)
            return VsCGpm(outputs, inputs, rng, *args, **kwds)

        # Build and register the cgpm backend.
        registry = {
            'factor_analysis': FactorAnalysis,
            'inline_venturescript': InlineVsCGpm,
            'linear_regression': LinearRegression,
            'multivariate_kde': MultivariateKde,
            'multivariate_knn': MultivariateKnn,
            'ordinary_least_squares': OrdinaryLeastSquares,
            'random_forest': RandomForest,
            'venturescript': _VsCGpm,
        }
        backend = CGPM_Backend(registry, multiprocess=options.j)
        bayesdb_register_backend(self._bdb, backend)
        return 'Loaded: %s' % self._path
Esempio n. 8
0
def test_bad_analyze_vars():
    """ANALYZE must reject VARIABLES/SKIP clauses naming an unknown variable."""
    with cgpm_dummy_satellites_bdb() as bdb:
        create_population = '''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        '''
        bdb.execute(create_population)
        backend = CGPM_Backend({
            'kepler': Kepler,
            'linreg': LinearRegression,
        })
        bayesdb_register_backend(bdb, backend)
        bdb.execute('''
            CREATE GENERATOR satellites_cgpm FOR satellites USING cgpm
        ''')
        # Well-formed commands, with and without an empty option list.
        for command in (
                'INITIALIZE 1 MODEL FOR satellites_cgpm',
                'ANALYZE satellites_cgpm FOR 1 ITERATION ()',
                'ANALYZE satellites_cgpm FOR 1 ITERATION',
        ):
            bdb.execute(command)
        # `perige' is a misspelling of perigee, so VARIABLES must fail.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION (
                    VARIABLES period, perige
                )
            ''')
        # The same unknown variable must be rejected in a SKIP clause.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_cgpm FOR 1 ITERATION (
                    SKIP period, perige
                )
            ''')
Esempio n. 9
0
def test_predictive_relevance():
    """Exercise ESTIMATE PREDICTIVE RELEVANCE on the dummy satellites bdb.

    Covers EXISTING and HYPOTHETICAL query rows, FROM and BY forms, bound
    parameters, the error cases that must raise BQLError, and the results
    obtained when a row was inserted after the models were analyzed.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(bdb, CGPM_Backend(cgpm_registry=dict()))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA (
                apogee                  NUMERICAL;
                class_of_orbit          NOMINAL;
                country_of_operator     NOMINAL;
                launch_mass             NUMERICAL;
                perigee                 NUMERICAL;
                period                  NUMERICAL
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR satellites;')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')
        bdb.execute('ANALYZE m FOR 25 ITERATION (OPTIMIZED);')

        # Check self-similarities, and also provide coverage of bindings.
        # NOTE(review): the loop variable `rowid` is never used; both
        # bindings are the constant 1, so every iteration repeats the same
        # query for rowid 1 -- confirm whether rowid[0] was intended.
        rowids = bdb.execute('SELECT OID from satellites_ucs;').fetchall()
        for rowid in rowids[:4]:
            cursor = bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    TO EXISTING ROWS (rowid = ?)
                    IN THE CONTEXT OF "period"
                FROM satellites
                WHERE rowid = ?
            ''', (1, 1,))
            assert next(cursor)[0] == 1.

        # A full extravaganza query, using FROM (as a 1-row).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                TO EXISTING ROWS
                    (country_of_operator = 'Russia' AND period < 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (perigee=1.0, launch_mass=120),
                    (country_of_operator='Bulgaria', perigee=2.0))
                IN THE CONTEXT OF "country_of_operator"
            FROM satellites
            LIMIT 5
        ''').fetchall()
        # One result per row (LIMIT 5), each a value in [0, 1].
        assert len(cursor) == 5
        assert all(0 <= c[0] <= 1 for c in cursor)

        # A full extravaganza query, using BY (as a constant).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS
                    (country_of_operator = 'Russia' AND period < 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0),
                    (country_of_operator='Bulgaria'))
                IN THE CONTEXT OF "country_of_operator"
            BY satellites
        ''').fetchall()
        # The BY form yields a single row.
        assert len(cursor) == 1
        assert all(0 <= c[0] <= 1 for c in cursor)

        # Hypothetical satellite with negative perigee should not be similar,
        # and use a binding to just ensure that they work.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (perigee = ?))
                IN THE CONTEXT OF "perigee"
            FROM satellites
            LIMIT 5
        ''' , (-10000,)).fetchall()
        assert len(cursor) == 5
        assert all(np.allclose(c[0], 0) for c in cursor)

        # No matching target OF row.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid < 0) TO EXISTING ROWS (rowid = 10)
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Unknown CONTEXT variable "banana".
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 1) TO EXISTING ROWS (rowid = 2)
                    IN THE CONTEXT OF "banana"
                BY satellites
            ''')

        # No matching EXISTING ROW.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 10) TO EXISTING ROWS (rowid < 0)
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Unknown nominal value 'Mongolia' in HYPOTHETICAL ROWS.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 10)
                    TO HYPOTHETICAL ROWS WITH VALUES (
                        (country_of_operator='Mongolia'),
                        (country_of_operator='Bulgaria', perigee=2.0))
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Create a new row, inserted after the models were analyzed.
        bdb.sql_execute('''
            INSERT INTO satellites_ucs
            (apogee, launch_mass) VALUES (12.128, 12.128)
        ''')

        # TARGET ROW not yet incorporated: the query value is expected to
        # be None.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (apogee = 12.128)
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0))
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None

        # EXISTING ROW not yet incorporated: the value is expected to be
        # None as well, since there is no hypothetical row either.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128)
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None

        # Although apogee = 12.128 is EXISTING but not incorporated, there are
        # other EXISTING ROWS with apogee > 0, so we should still get a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None

        # Although apogee = 12.128 is EXISTING but not incorporated, there are
        # other HYPOTHETICAL ROWS, so we should still get a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0),
                    (country_of_operator='Bulgaria'))
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None
Esempio n. 10
0
def test_add_drop_models():
    """Check the bayesdb-to-cgpm model number mapping across INITIALIZE/DROP.

    After each INITIALIZE MODELS or DROP MODELS command the contents of the
    `bayesdb_cgpm_modelno` table for generator `m` are compared against the
    expected {modelno: cgpm_modelno} mapping.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION p FOR satellites_ucs WITH SCHEMA(
                GUESS STATTYPES OF (*);
            )
        ''')
        bdb.execute('CREATE GENERATOR m FOR p (SUBSAMPLE 10);')

        # Retrieve id for testing.
        population_id = bayesdb_get_population(bdb, 'p')
        generator_id = bayesdb_get_generator(bdb, population_id, 'm')

        def check_modelno_mapping(lookup):
            # Assert that the stored (modelno, cgpm_modelno) pairs for the
            # generator match `lookup` exactly.  Matched entries are deleted
            # from `lookup`, so the dict passed in is consumed; anything
            # left over at the end is an expected pair missing from the
            # table.
            pairs = bdb.sql_execute('''
                SELECT modelno, cgpm_modelno FROM bayesdb_cgpm_modelno
                WHERE generator_id = ?
            ''', (generator_id,))
            for pair in pairs:
                assert lookup[pair[0]] == pair[1]
                del lookup[pair[0]]
            assert len(lookup) == 0

        # Initialize some models.
        bdb.execute('INITIALIZE 16 MODELS FOR m')
        # Assert identity mapping initially.
        check_modelno_mapping({i:i for i in xrange(16)})

        bdb.execute('ANALYZE m FOR 1 ITERATION (QUIET);')

        # Drop some models.
        bdb.execute('DROP MODELS 1, 8-12, 14 FROM m')
        # Assert cgpm models are contiguous while bayesdb models are not, with
        # the mapping preserving the strict order.
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
        })

        # Run some analysis again.
        bdb.execute('ANALYZE m FOR 1 ITERATION (OPTIMIZED; QUIET);')

        # Initialize 14 models if not existing.
        bdb.execute('INITIALIZE 14 MODELS IF NOT EXISTS FOR m')
        # Assert cgpm models are 0-14, while bayesdb are 0-15 excluding 14. Note
        # that INITIALIZE 14 MODELS IF NOT EXISTS does not guarantee that 14
        # MODELS in total will exist after the query, rather it will initialize
        # any non-existing modelnos with index 0-13, and any modelnos >= 14
        # are untouched (modelno 15 is kept and modelno 14 stays absent in
        # this test case).
        check_modelno_mapping({
            0: 0,
            2: 1,
            3: 2,
            4: 3,
            5: 4,
            6: 5,
            7: 6,
            13: 7,
            15: 8,
            # Recreated models.
            1: 9,
            8: 10,
            9: 11,
            10: 12,
            11: 13,
            12: 14,
        })

        # Drop some more models, add them back with some more, and confirm
        # arithmetic and ordering remains correct.
        bdb.execute('DROP MODELS 0-1 FROM m')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
        })
        bdb.execute('INITIALIZE 20 MODELS IF NOT EXISTS FOR m;')
        check_modelno_mapping({
            2: 0,
            3: 1,
            4: 2,
            5: 3,
            6: 4,
            7: 5,
            13: 6,
            15: 7,
            # Recreated models.
            8: 8,
            9: 9,
            10: 10,
            11: 11,
            12: 12,
            # Re-recreated models.
            0: 13,
            1: 14,
            # New models.
            14: 15,
            16: 16,
            17: 17,
            18: 18,
            19: 19,
        })

        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODELS 20-50 FROM m')
        # Drop all models.
        bdb.execute('DROP MODELS FROM m;')
        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODEL 0 FROM m')
        # Assert cgpm mapping is cleared.
        cursor = bdb.sql_execute('''
            SELECT COUNT(*) FROM bayesdb_cgpm_modelno
            WHERE generator_id = ?
        ''', (generator_id,))
        assert cursor_value(cursor) == 0
Esempio n. 11
0
import pytest
import shutil
import tempfile

import bayeslite

import bayeslite.core as core

from bayeslite import bql_quote_name
from bayeslite.backends.cgpm_backend import CGPM_Backend
from bayeslite.backends.iid_gaussian import StdNormalBackend

examples = {
    'cgpm': (
        lambda: CGPM_Backend(cgpm_registry={}, multiprocess=False),
        't',
        'CREATE TABLE t(x NUMERIC, y NUMERIC, z NUMERIC)',
        'INSERT INTO t (x, y, z) VALUES (?, ?, ?)',
        [
            (0, 1.57, 'foo'),
            (1.83, 3.141, 'bar'),
            (1.82, 3.140, 'bar'),
            (-1, 6.28, 'foo'),
        ],
        'p',
        'p_cc',
        'CREATE POPULATION p FOR t'
        '(x NUMERICAL; y NUMERICAL; z NOMINAL)',
        'CREATE GENERATOR p_cc FOR p USING cgpm()',
        'CREATE GENERATOR p_cc FOR p USING crosscat',
Esempio n. 12
0
def test_cgpm_extravaganza__ci_slow():
    """End-to-end run of a CGPM generator with overridden component models.

    Builds a synthetic satellites table, creates a generator that uses the
    venturescript, linreg and forest models from the registry, and then runs
    ANALYZE, ESTIMATE, INFER and SIMULATE queries against it.  The test is
    skipped when the optional cgpm modules cannot be imported.
    """
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        # NOTE(review): pytest.skip presumably raises, which would make this
        # return unreachable -- confirm before removing it.
        return
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        # One batch of rows per orbit class label `l`; `f` computes the
        # period column from the (apogee, perigee) pair (x, y).
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    # Country, name suffix and mass are drawn from the
                    # bdb's own numpy PRNG.
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (country,
                                          bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute(
                        '''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        # Run a correlation query before any cgpm backend is registered.
        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)

        # Misspelled variable `apoge' in SET CATEGORY MODEL.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        # Misspelled variable `apoge' in an OVERRIDE MODEL input.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        # LATENT names `apogee', which is already a population variable.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        # NOTE(review): the schema below ends with `SUBSAMPLE 100,` (a
        # comma rather than a semicolon) -- confirm this is intentional.
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        # The population has variables 1-6; the generator additionally
        # reports the two latent variables as -2 and -1.
        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')

        # Crash tests: the queries below only need to run to completion;
        # their results are fetched and discarded.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()

        # Tear everything down again: models, then generator, population
        # and table.
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
Esempio n. 13
0
def test_using_modelnos():
    """Run queries and per-model ANALYZE restricted with USING MODEL(S)."""
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee              TO NUMERICAL;
                SET STATTYPE OF class_of_orbit      TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass         TO NUMERICAL;
                SET STATTYPE OF perigee             TO NUMERICAL;
                SET STATTYPE OF period              TO NUMERICAL
            )
        ''')
        backend = CGPM_Backend(dict(), multiprocess=0)
        bayesdb_register_backend(bdb, backend)
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR g0')

        # Crash test simulate.
        bdb.execute('''
            SIMULATE apogee, class_of_orbit
            FROM satellites
            MODELED BY g0
            USING MODEL 0-1
            LIMIT 10
        ''')
        # Crash test infer explicit.
        bdb.execute('''
            INFER EXPLICIT PREDICT period, perigee
            FROM satellites
            MODELED BY g0
            USING MODEL 0
            LIMIT 2
        ''')
        # Crash test dependence probability BY; a single model can only
        # answer 0 or 1.
        depprob_cursor = bdb.execute('''
            ESTIMATE
                DEPENDENCE PROBABILITY OF launch_mass WITH period
            BY satellites
            MODELED BY g0
            USING MODEL 0
        ''')
        assert cursor_value(depprob_cursor) in [0, 1]
        # Crash test dependence probability pairwise.
        pairwise_cursor = bdb.execute('''
            ESTIMATE
                DEPENDENCE PROBABILITY
            FROM PAIRWISE VARIABLES OF satellites
            MODELED BY g0
            USING MODEL 1
        ''')
        for row in pairwise_cursor:
            assert row[0] in [0, 1]
        # Crash test mutual information 1row.
        bdb.execute('''
            ESTIMATE
                MUTUAL INFORMATION WITH (period) USING 1 SAMPLES
            FROM VARIABLES OF satellites
            USING MODEL 0
        ''').fetchall()
        # Test analyze on per-model basis: only the analyzed model should
        # accumulate logscore diagnostics.
        bdb.execute('''
            ANALYZE g0 MODEL 0 FOR 1 ITERATION CHECKPOINT 1 ITERATION
        ''')
        engine = bdb.backends['cgpm']._engine(bdb, 1)

        def logscore_count(modelno):
            # Number of logscore diagnostics recorded for one model's state.
            return len(engine.states[modelno].diagnostics['logscore'])

        assert logscore_count(0) == 1
        assert logscore_count(1) == 0
        bdb.execute('''
            ANALYZE g0 MODEL 1 FOR 4 ITERATION CHECKPOINT 1 ITERATION (
                OPTIMIZED
            );
        ''')
        assert logscore_count(0) == 1
        assert logscore_count(1) == 4
        # Some errors with bad modelnos: only models 0 and 1 exist.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 MODEL 0-3 FOR 4 ITERATION
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                SIMULATE apogee FROM satellites USING MODEL 25 LIMIT 10;
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
                USING MODELS 0-8 LIMIT 2;
            ''')
Esempio n. 14
0
def test_cgpm_kepler():
    """Exercise cgpm generators that override the model for period.

    Builds a population over satellites_ucs, creates generator g0 (period
    overridden with linreg) and generator g1 (period overridden with kepler,
    exponential launch_mass, 20-row subsample), then runs a series of ANALYZE
    variants and ESTIMATE / SIMULATE / INFER queries against them.
    """
    with cgpm_dummy_satellites_bdb() as bdb:

        def count_individuals():
            # Current number of rows in the bayesdb_cgpm_individual table.
            counter = bdb.execute(
                'SELECT COUNT(*) FROM bayesdb_cgpm_individual')
            return counter.fetchvalue()

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                apogee                  NUMERICAL;
                launch_mass             NUMERICAL;
                class_of_orbit          NOMINAL;
                country_of_operator     NOMINAL;
                perigee                 NUMERICAL;
                period                  NUMERICAL
            )
        ''')
        bdb.execute('''
            ESTIMATE CORRELATION from PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        # Make the foreign models available under the names used below.
        foreign_models = dict(kepler=Kepler, linreg=LinearRegression)
        bayesdb_register_backend(
            bdb, CGPM_Backend(foreign_models, multiprocess=0))
        # First generator: period is modeled by a linear regression.
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                OVERRIDE GENERATIVE MODEL FOR period
                GIVEN apogee, perigee
                USING linreg
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR g0')
        individuals_before = count_individuals()
        # Second generator: exponential launch mass instead of normal.
        bdb.execute('''
            CREATE GENERATOR g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR launch_mass TO EXPONENTIAL;
                OVERRIDE MODEL FOR period GIVEN apogee, perigee
                    USING kepler(quagga = eland);
                SUBSAMPLE 20
            )
        ''')
        # Creating g1 with SUBSAMPLE 20 must add exactly 20 individuals.
        individuals_after = count_individuals()
        assert individuals_after - individuals_before == 20
        bdb.execute('INITIALIZE 1 MODEL IF NOT EXISTS FOR g1')
        for statement in (
                'ANALYZE g0 FOR 1 ITERATION',
                'ANALYZE g0 FOR 1 ITERATION (VARIABLES period)',
                'ANALYZE g1 FOR 1 ITERATION',
                'ANALYZE g1 FOR 1 ITERATION (VARIABLES period)',
        ):
            bdb.execute(statement)
        # period is a foreign variable, so OPTIMIZED is ignored here.
        bdb.execute('''
            ANALYZE g1 FOR 1 ITERATION (OPTIMIZED; VARIABLES period)
        ''')
        # launch_mass has a SET CATEGORY MODEL, which is incompatible with
        # lovecat, so this must fail.  The ValueError comes from cgpm, not
        # from bayeslite.
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE g1 FOR 1 ITERATION
                    (OPTIMIZED; VARIABLES launch_mass)
            ''')
        # Timed analysis is rejected for mixed variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS (VARIABLES period, apogee)
            ''')
        # Same rejection when the mix arises through SKIP (period remains).
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS (SKIP apogee)
            ''')
        # Iteration-based analysis accepts mixed variables.
        bdb.execute('''
                ANALYZE g1 FOR 1 ITERATION (VARIABLES period, apogee)
            ''')
        # Crash tests: each query only has to run to completion.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                FROM PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()
        bdb.execute('''
            SIMULATE apogee, perigee, period FROM satellites LIMIT 100
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                PREDICT apogee
                    CONFIDENCE apogee_confidence
                    USING 5 SAMPLES
            FROM satellites LIMIT 2
        ''').fetchall()
        # With CONFIDENCE: each row is (unicode prediction, float confidence).
        with_confidence = bdb.execute('''
            INFER EXPLICIT
                PREDICT class_of_orbit
                    CONFIDENCE class_of_orbit_confidence
            FROM satellites LIMIT 2
        ''').fetchall()
        first_row = with_confidence[0]
        assert len(first_row) == 2
        assert isinstance(first_row[0], unicode)
        assert isinstance(first_row[1], float)
        # Without CONFIDENCE: each row holds only the unicode prediction.
        without_confidence = bdb.execute('''
            INFER EXPLICIT PREDICT class_of_orbit USING 2 SAMPLES
            FROM satellites LIMIT 2
        ''').fetchall()
        first_row = without_confidence[0]
        assert len(first_row) == 1
        assert isinstance(first_row[0], unicode)
        # Tear down the models and both generators.
        for statement in (
                'DROP MODELS FROM g0',
                'DROP GENERATOR g0',
                'DROP GENERATOR g1',
        ):
            bdb.execute(statement)
Esempio n. 15
0
def test_analysis_subproblems_basic():
    """Run ANALYZE with each SUBPROBLEM, with and without OPTIMIZED.

    Uses a 10-row subsampled generator with 4 models.  Also checks that
    OPTIMIZED combined with the variable hyperparameters subproblem is
    rejected, and that a ROWS clause naming rows outside the subsample
    raises BQLError.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPE OF apogee TO NUMERICAL;
                SET STATTYPE OF class_of_orbit TO NOMINAL;
                SET STATTYPE OF country_of_operator TO NOMINAL;
                SET STATTYPE OF launch_mass TO NUMERICAL;
                SET STATTYPE OF perigee TO NUMERICAL;
                SET STATTYPE OF period TO NUMERICAL
            )
        ''')
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 4 MODELS FOR g0')

        # Every subproblem except variable hyperparameters, first plain and
        # then OPTIMIZED; pairs are ordered to match the format arguments.
        engine_flags = ('', 'OPTIMIZED;')
        subproblems = (
            'variable clustering',
            'variable clustering concentration',
            'row clustering',
            'row clustering concentration',
        )
        combinations = [
            (subproblem, flag)
            for flag in engine_flags
            for subproblem in subproblems
        ]
        for combination in combinations:
            bdb.execute('''
                    ANALYZE g0 MODELS 0,1 FOR 4 ITERATION(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % combination)

        # Variable hyperparameters work without OPTIMIZED...
        bdb.execute('''
            ANALYZE g0 FOR 1 ITERATION (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        ''')
        # ...but the OPTIMIZED backend does not support them.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')

        # Row restrictions: collect the table rowids in g0's subsample and
        # the rowids in 0..19 that are not part of it.
        g0_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        sampled_rowids = [
            record[0] for record in bdb.execute(
                '''
            SELECT table_rowid FROM  bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (g0_id, ))
        ]
        excluded_rowids = [
            rowid for rowid in xrange(20) if rowid not in sampled_rowids
        ]
        sampled_csv = ','.join(str(rowid) for rowid in sampled_rowids)
        excluded_csv = ','.join(str(rowid) for rowid in excluded_rowids)
        for flag in engine_flags:
            # Rows inside the subsample are accepted.
            bdb.execute('''
                ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                    VARIABLES class_of_orbit;
                    ROWS %s;
                    SUBPROBLEMS (
                        row clustering,
                        row clustering concentration
                    );
                    %s
            )
            ''' % (sampled_csv, flag))
            # Rows not in the population or subsample are rejected.
            with pytest.raises(BQLError):
                bdb.execute('''
                    ANALYZE g0 MODEL 3 FOR 1 ITERATION (
                        VARIABLES class_of_orbit;
                        ROWS %s;
                        SUBPROBLEMS (
                            row clustering,
                            row clustering concentration
                        );
                        %s
                )
                ''' % (excluded_csv, flag))
Esempio n. 16
0
def test_subsample():
    """Compare a full generator with a 100-row subsampled one on dha.

    Runs several queries against the subsampled population, then checks
    that the first 100 individuals of the full generator are table rows
    1..100 while those of the subsampled generator are not.
    """
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:
        bayeslite.bayesdb_register_backend(
            bdb, CGPM_Backend(cgpm_registry={}, multiprocess=False))
        with open(dha_csv, 'rU') as csv_file:
            read_csv.bayesdb_read_csv(
                bdb, 'dha', csv_file, header=True, create=True)
        # Two populations guessed from the same table with the same
        # overrides.
        for population in ('hospitals_full', 'hospitals_sub'):
            bayesdb_guess_population(
                bdb, population, 'dha', overrides=[('name', 'key')])
        bdb.execute('''
            CREATE GENERATOR hosp_full_cc FOR hospitals_full USING cgpm;
        ''')
        bdb.execute('''
            CREATE GENERATOR hosp_sub_cc FOR hospitals_sub USING cgpm(
                SUBSAMPLE 100
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR hosp_sub_cc')
        bdb.execute('ANALYZE hosp_sub_cc FOR 1 ITERATION (OPTIMIZED)')
        # Crash tests: the queries below only have to run to completion.
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=2) IN THE CONTEXT OF PNEUM_SCORE
            FROM hospitals_sub WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY TO (_rowid_=102) IN THE CONTEXT OF
            N_DEATH_ILL FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF mdcr_spnd_amblnc
            FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        bdb.execute('''
            ESTIMATE SIMILARITY IN THE CONTEXT OF PNEUM_SCORE
            FROM PAIRWISE hospitals_sub
            WHERE (r0._rowid_ = 1 OR r0._rowid_ = 101) AND
            (r1._rowid_ = 1 OR r1._rowid_ = 101)
        ''').fetchall()
        bdb.execute('''
            INFER mdcr_spnd_amblnc FROM hospitals_sub
            WHERE _rowid_ = 1 OR _rowid_ = 101
        ''').fetchall()
        individuals_sql = '''
            SELECT table_rowid FROM bayesdb_cgpm_individual
                WHERE generator_id = ?
                ORDER BY cgpm_rowid ASC
                LIMIT 100
        '''

        def leading_table_rowids(generator_name):
            # table_rowid of the first 100 individuals, ordered by
            # cgpm_rowid, for the named generator.
            generator_id = bayesdb_get_generator(bdb, None, generator_name)
            return [
                record[0]
                for record in bdb.sql_execute(
                    individuals_sql, (generator_id, ))
            ]

        first_hundred = range(1, 100 + 1)
        assert leading_table_rowids('hosp_full_cc') == first_hundred
        assert leading_table_rowids('hosp_sub_cc') != first_hundred
        # Tear down generators first, then the populations.
        for statement in (
                'DROP GENERATOR hosp_sub_cc',
                'DROP GENERATOR hosp_full_cc',
                'DROP POPULATION hospitals_sub',
                'DROP POPULATION hospitals_full',
        ):
            bdb.execute(statement)
Esempio n. 17
0
from bayeslite.quote import bql_quote_name
from bayeslite.read_csv import bayesdb_read_csv
from bayeslite.read_csv import bayesdb_read_csv_file
from bayeslite.schema import bayesdb_upgrade_schema
from bayeslite.txn import BayesDBTxnError
from bayeslite.version import __version__

# XXX This is not a good place for me.  Find me a better home, please!
# NOTE(review): the definition this XXX comment was attached to is not
# visible in this chunk -- confirm whether the comment is still needed.

# Names exported by a star-import of this module.
__all__ = [
    'BQLError',
    'BQLParseError',
    'BayesDB',
    'BayesDBException',
    'BayesDBTxnError',
    'bayesdb_deregister_backend',
    'bayesdb_nullify',
    'bayesdb_open',
    'bayesdb_read_csv',
    'bayesdb_read_csv_file',
    'bayesdb_register_backend',
    'bayesdb_upgrade_schema',
    'bql_quote_name',
    'BayesDB_Backend',
    'IBayesDBTracer',
]

# Register cgpm as a builtin backend.
# This runs at import time: a CGPM_Backend with an empty registry and
# multiprocess=True is handed to bayesdb_builtin_backend.
# NOTE(review): the import sits below __all__ instead of with the other
# imports; presumably deliberate -- confirm before moving it.
from bayeslite.backends.cgpm_backend import CGPM_Backend
bayesdb_builtin_backend(CGPM_Backend({}, multiprocess=True))
Esempio n. 18
0
def test_regress_bonanza__ci_integration():
    """Check the variable names returned by REGRESS for several GIVEN forms.

    Covers a single numerical regressor, a single nominal regressor, a mix
    of both, `*`, and a column-selector subexpression with a binding.  Also
    checks that `*` cannot be mixed with named variables (BQLError) and that
    regressing on class_of_orbit alone raises ValueError.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_backend(bdb, CGPM_Backend(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                apogee                  NUMERICAL;
                class_of_orbit          NOMINAL;
                country_of_operator     NOMINAL;
                launch_mass             NUMERICAL;
                perigee                 NUMERICAL;
                period                  NUMERICAL;
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR m FOR satellites;
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            """Validate the variable column of a REGRESS result.

            Each row must have two fields.  The first field must be unique
            across rows and be either one of `numericals` or start with
            '<nominal>_dum_' for one of `nominals`.
            """
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(results, ['intercept'],
                                   ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.
        # The result must be captured here; otherwise the check below would
        # silently re-validate the rows of the previous query.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee, country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(results, ['intercept', 'perigee'],
                                   ['country_of_operator'])

        # Regression on all variables.  The query has no placeholder, so no
        # bindings are passed.
        results = bdb.execute('''
            REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results,
            [
                'intercept',
                'perigee',
                'launch_mass',
                'period',
            ],
            [
                'country_of_operator',
                'class_of_orbit',
            ],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute(
            '''
            REGRESS apogee GIVEN (
                satellites.(
                    ESTIMATE * FROM VARIABLES OF satellites
                    ORDER BY dependence probability with apogee DESC
                    LIMIT ?
                )
            )
            USING 12 SAMPLES BY satellites MODELED BY m USING MODEL 1;
        ''', (3, )).fetchall()

        # Run the same subquery on its own to learn which variables the
        # regression above was given, then split them by statistical type.
        top_rows = bdb.execute(
            '''
            ESTIMATE * FROM VARIABLES OF satellites
                ORDER BY dependence probability with apogee DESC
                LIMIT ?
        ''', (3, )).fetchall()
        top_variables = [row[0] for row in top_rows]
        nominals = [
            var for var in top_variables if var in [
                'country_of_operator',
                'class_of_orbit',
            ]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(results, numericals + ['intercept'],
                                   nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                REGRESS apogee GIVEN (*, class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()

        # Not enough data for regression, 1 unique nominal variable.
        with pytest.raises(ValueError):
            bdb.execute('''
                REGRESS apogee GIVEN (class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()
Esempio n. 19
0
def test_initialize_with_all_nulls():
    """Check INITIALIZE behavior when column a contains only NULLs.

    Initialization must raise BQLError when a is modeled by crosscat
    (whether numerical or nominal), and must succeed when a is ignored or
    when its model is overridden by the barebones cgpm.
    """
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(barebones=BareBonesCGpm), multiprocess=0))
        bdb.sql_execute('''
            CREATE TABLE t (a REAL, b REAL, c REAL);
        ''')
        # Every row leaves a missing; b is missing only in the first three.
        table_rows = (
            (None, None, 3),
            (None, None, 1),
            (None, None, 1),
            (None, -2, 1),
            (None, -5, 1),
            (None, 2, 3),
        )
        for table_row in table_rows:
            bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', table_row)

        # Case 1: a is NUMERICAL and modeled by crosscat -> must fail.
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                SET STATTYPES OF a, b, c TO NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE GENERATOR m FOR p;
        ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                INITIALIZE 2 MODELS FOR m;
            ''')

        # Case 2: a is NOMINAL and modeled by crosscat -> must fail.
        bdb.execute('''
            CREATE POPULATION p2 FOR t WITH SCHEMA(
                SET STATTYPES OF a TO NOMINAL;
                SET STATTYPES OF b, c TO NUMERICAL
            )
        ''')
        bdb.execute('CREATE GENERATOR m2 FOR p2;')
        with pytest.raises(BQLError):
            bdb.execute('INITIALIZE 2 MODELS FOR m2;')

        # Case 3: a is ignored by the population -> must succeed.
        bdb.execute('''
            CREATE POPULATION p3 FOR t WITH SCHEMA(
                IGNORE a;
                SET STATTYPES OF b, c TO NUMERICAL
            )
        ''')
        for statement in (
                'CREATE GENERATOR m3 FOR p3;',
                'INITIALIZE 2 MODELS FOR m3;',
        ):
            bdb.execute(statement)

        # Case 4: a is NUMERICAL but overridden with a dummy cgpm -> must
        # succeed, including one iteration of analysis.
        bdb.execute('''
            CREATE GENERATOR m4 FOR p(
                OVERRIDE MODEL FOR a GIVEN b USING barebones
            )
        ''')
        for statement in (
                'INITIALIZE 2 MODELS FOR m4;',
                'ANALYZE m4 FOR 1 ITERATION',
        ):
            bdb.execute(statement)
Esempio n. 20
0
def test_output_stattypes():
    """Check stattype handling for factor_analysis outputs and latents.

    Covers: a schema missing a policy for some columns, factor analysis
    over a nominal manifest or nominal latent (both fail at INITIALIZE), a
    latent declared twice, and an all-numerical generator on which ANALYZE,
    dependence probability and mutual information are exercised.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        # The schema gives no policy for class_of_orbit, perigee, period.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    SET STATTYPES OF apogee, launch_mass TO NUMERICAL;
                    SET STATTYPES OF country_of_operator TO NOMINAL
                )
            ''')
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                IGNORE class_of_orbit, perigee, period;
                SET STATTYPES OF apogee, launch_mass TO NUMERICAL;
                SET STATTYPES OF country_of_operator TO NOMINAL
            )
        ''')
        bayesdb_register_backend(
            bdb, CGPM_Backend(dict(factor_analysis=FactorAnalysis)))
        # Nominal manifest (country_of_operator): the generator can be
        # created, but initializing a model raises ValueError.
        bdb.execute('''
            CREATE GENERATOR satellites_g0 FOR satellites(
                OVERRIDE MODEL FOR apogee, country_of_operator
                AND EXPOSE pc_1 NUMERICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g0')
        # pc_2 appears in both LATENT and EXPOSE: rejected at creation.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR satellites_g1 FOR satellites(
                    LATENT pc_2 NOMINAL,
                    OVERRIDE GENERATIVE MODEL FOR
                        apogee, launch_mass
                    AND EXPOSE pc_2 NOMINAL
                    USING factor_analysis(L=1)
                )
            ''')
        # Nominal latent (pc_2): creation works, initialization raises
        # ValueError.
        bdb.execute('''
            CREATE GENERATOR satellites_g1 FOR satellites(
                OVERRIDE GENERATIVE MODEL FOR
                    apogee, launch_mass
                AND EXPOSE pc_2 NOMINAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g1')
        # All-numerical factor analysis is accepted.
        bdb.execute('''
            CREATE GENERATOR satellites_g2 FOR satellites USING cgpm(
                LATENT pc_3 NUMERICAL;

                OVERRIDE MODEL FOR apogee, launch_mass, pc_3, pc_4
                USING factor_analysis(L=2);

                LATENT pc_4 NUMERICAL
            )
        ''')
        for statement in (
                'INITIALIZE 1 MODEL FOR satellites_g2',
                'ANALYZE satellites_g2 FOR 2 ITERATION',
        ):
            bdb.execute(statement)
        # Timed analysis cannot transition baseline and foreign variables
        # together.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_g2 FOR 2 SECONDS (
                    VARIABLES country_of_operator, apogee, launch_mass, pc_3);
            ''')
        bdb.execute('''
            ANALYZE satellites_g2 FOR 1 ITERATION (
                VARIABLES apogee, launch_mass);
        ''')
        # Dependence probability, manifest with latent: expected to be 1.
        manifest_latent = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF apogee WITH pc_3
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        assert manifest_latent[0][0] == 1.
        # Dependence probability, latent with latent: expected to be 1.
        latent_latent = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF pc_3 WITH pc_4
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        assert latent_latent[0][0] == 1.
        # Mutual information, latent with manifest: crash test only.
        bdb.execute('''
            ESTIMATE MUTUAL INFORMATION OF apogee WITH pc_4 USING 1 SAMPLES
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
        # Mutual information, latent with latent: crash test only.
        bdb.execute('''
            ESTIMATE MUTUAL INFORMATION OF pc_3 WITH pc_4 USING 1 SAMPLES
            BY satellites MODELED BY satellites_g2;
        ''').fetchall()
Esempio n. 21
0
def test_unknown_stattype():
    """Check how a made-up statistical type (QUAGGA) is handled.

    Adds a relaunches column, verifies that a population using the unknown
    QUAGGA stattype is rejected until the type is inserted into
    bayesdb_stattype, and then checks which cgpm generators can and cannot
    be created over a QUAGGA variable.
    """
    from cgpm.regressions.linreg import LinearRegression
    with cgpm_dummy_satellites_bdb() as bdb:
        # Add a column called relaunches, sum of apogee and perigee.
        bdb.sql_execute('ALTER TABLE satellites_ucs ADD COLUMN relaunches')
        n_rows = bdb.sql_execute('''
            SELECT COUNT(*) FROM satellites_ucs
        ''').next()[0]
        # Fill the new column one row at a time; rowids are addressed as
        # 1..n_rows, hence the rowid+1 below.
        for rowid in xrange(n_rows):
            bdb.sql_execute('''
                UPDATE satellites_ucs
                    SET relaunches = (SELECT apogee + perigee)
                    WHERE _rowid_ = ?
            ''', (rowid+1,))
        # Nobody will ever create a QUAGGA statistical type!
        with pytest.raises(BQLError):
            # No such statistical type at the moment.
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    SET STATTYPES OF apogee, perigee, launch_mass, period
                        TO NUMERICAL;

                    SET STATTYPE OF class_of_orbit, country_of_operator
                        TO NOMINAL;

                    SET STATTYPE OF relaunches
                        TO QUAGGA
                )
            ''')
        # Invent the statistical type.
        bdb.sql_execute('INSERT INTO bayesdb_stattype VALUES (?)', ('quagga',))
        # With the stattype row in place, the same schema is now accepted.
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                SET STATTYPES OF apogee, perigee, launch_mass, period
                    TO NUMERICAL;

                SET STATTYPES OF class_of_orbit, country_of_operator
                TO NOMINAL;

                SET STATTYPES OF relaunches
                TO QUAGGA
            )
        ''')
        # Foreign models that the generators below refer to by name.
        registry = {
            'kepler': Kepler,
            'linreg': LinearRegression,
        }
        bayesdb_register_backend(bdb, CGPM_Backend(registry))
        with pytest.raises(BQLError):
            # Can't model QUAGGA by default.
            bdb.execute('CREATE GENERATOR g0 FOR satellites USING cgpm')
        with pytest.raises(BQLError):
            # Can't model QUAGGA as input.
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR relaunches GIVEN apogee USING linreg;
                    OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
                )
            ''')
        # Can model QUAGGA with an explicit distribution family.
        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON
            )
        ''')
        # Once relaunches has an explicit family, using it as an input to
        # linreg for period is accepted.
        bdb.execute('''
            CREATE GENERATOR g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON;
                OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
            )
        ''')