Example #1
def test_nig_normal_latent_numbering():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                            (x, x * x - 100))
        bdb.execute('''
            create population p for t(id ignore; model x,y as numerical)
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')

        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g0) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g1) == [-1, 1, 2]
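To run this snippet on its own, roughly the following imports are needed (module paths assumed from bayeslite's layout; treat this as a sketch rather than the test file's actual header):

import bayeslite.core as core

from bayeslite import bayesdb_open
from bayeslite import bayesdb_register_metamodel
from bayeslite.metamodels.nig_normal import NIGNormalMetamodel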
Example #2
def bql_row_similarity(bdb, generator_id, modelno, rowid, target_rowid,
                       *colnos):
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    if len(colnos) == 0:
        colnos = core.bayesdb_generator_column_numbers(bdb, generator_id)
    return metamodel.row_similarity(bdb, generator_id, modelno, rowid,
                                    target_rowid, colnos)
Example #3
def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
        numsamples=None):
    # Predicts a value for the cell [rowid, colno] with a confidence metric.
    # XXX Prefer accuracy over speed for imputation.
    if numsamples is None:
        numsamples = self.n_samples
    colnos = core.bayesdb_generator_column_numbers(bdb, genid)
    colnames = core.bayesdb_generator_column_names(bdb, genid)
    row = core.bayesdb_generator_row_values(bdb, genid, rowid)
    # Account for multiple imputations if imputing parents.
    parent_conf = 1
    # Predicting lcol.
    if colno in self.lcols(bdb, genid):
        # Delegate to CC IFF
        # (lcol has no children OR all its children are None).
        children = [f for f in self.fcols(bdb, genid) if colno in
                self.pcols(bdb, genid, f)]
        if len(children) == 0 or \
                all(row[i] is None for i in xrange(len(row)) if i+1
                    in children):
            return self.cc(bdb, genid).predict_confidence(bdb,
                    self.cc_id(bdb, genid), modelno,
                    self.cc_colno(bdb, genid, colno), rowid)
        else:
            # Obtain likelihood-weighted samples from the posterior.
            Q = [(rowid, colno)]
            Y = [(rowid, c, v) for c, v in zip(colnos, row)
                 if c != colno and v is not None]
            samples = self.simulate(bdb, genid, modelno, Q, Y,
                numpredictions=numsamples)
            samples = [s[0] for s in samples]
    # Predicting fcol.
    else:
        conditions = {c: v for c, v in zip(colnames, row) if
            core.bayesdb_generator_column_number(bdb, genid, c) in
            self.pcols(bdb, genid, colno)}
        for colname, val in conditions.iteritems():
            # Impute all missing parents.
            if val is None:
                imp_col = core.bayesdb_generator_column_number(bdb, genid,
                    colname)
                imp_val, imp_conf = self.predict_confidence(bdb, genid,
                    modelno, imp_col, rowid, numsamples=numsamples)
                # XXX If imputing several parents, take the overall conf
                # as the min conf.  If we define imp_conf as
                # P[imp_val = correct] then we might choose to multiply
                # the imp_confs, but we cannot assert that the imp_confs
                # are independent, so multiplying is extremely conservative.
                parent_conf = min(parent_conf, imp_conf)
                conditions[colname] = imp_val
        assert all(v is not None for c, v in conditions.iteritems())
        predictor = self.predictor(bdb, genid, colno)
        samples = predictor.simulate(numsamples, conditions)
    # Since the foreign predictor does not know how to impute, imputation
    # happens here in the composer via simulate/logpdf calls.
    stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
    if stattype == 'categorical':
        # Impute the most frequent sampled value; imp_conf is its
        # estimated probability.
        imp_val = max(((val, samples.count(val)) for val in set(samples)),
            key=lambda v: v[1])[0]
        if colno in self.fcols(bdb, genid):
            imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
        else:
            imp_conf = sum(np.array(samples) == imp_val) / len(samples)
    elif stattype == 'numerical':
        # XXX The definition of confidence is P[k=1], where k is the
        # number of mixture components (we need a distribution over GPMM
        # to answer this question).  The confidence is instead implemented
        # as \max_i{p_i} where p_i are the weights of a fitted DPGMM.
        imp_val = np.mean(samples)
        imp_conf = su.continuous_imputation_confidence(samples, None, None,
            n_steps=1000)
    else:
        raise ValueError('Unknown stattype "{}" for a foreign predictor '
            'column encountered in predict_confidence.'.format(stattype))
    return imp_val, imp_conf * parent_conf
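The categorical branch above boils down to a simple rule: impute the most frequent simulated value and, when the column is not handled by a foreign predictor, report its relative frequency among the samples as the confidence. A minimal standalone sketch of that rule (illustrative only, not the composer's API):

def impute_categorical(samples):
    # The most frequent simulated value wins.
    imp_val = max(set(samples), key=samples.count)
    # Confidence: the winner's relative frequency among the samples.
    imp_conf = samples.count(imp_val) / float(len(samples))
    return imp_val, imp_conf

assert impute_categorical(['leo', 'geo', 'leo', 'leo']) == ('leo', 0.75)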
Example #4
def test_cgpm_extravaganza__ci_slow():
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        return
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (country,
                                          bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute(
                        '''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit CATEGORICAL;
                country_of_operator CATEGORICAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Metamodel(cgpm_registry)
        bayesdb_register_metamodel(bdb, cgpmt)

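        # These malformed schemas must be rejected: 'apoge' (a deliberate
        # typo) is not a population variable, and 'apogee' already exists,
        # so it cannot be redeclared as LATENT.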
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        bdb.execute('''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_generator_column_numbers(bdb, generator_id) == \
            [-2, -1, 1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, None) == \
            [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(
                bdb, population_id, generator_id) == \
            [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration WAIT (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration WAIT (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION WAIT (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION WAIT')

        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELLED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELLED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELLED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELLED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELLED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELLED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELLED BY g0 LIMIT 4
        ''').fetchall()

        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP METAMODEL g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')
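The column-number asserts here and in Example #1 reflect the same convention: variables declared by the population get positive numbers, while generator-local latent variables get negative ones. A hypothetical helper (not part of bayeslite) that makes the split explicit:

def split_latent(colnos):
    # Latent (generator-local) variables carry negative column numbers,
    # population variables carry positive ones.
    latent = [c for c in colnos if c < 0]
    observed = [c for c in colnos if c > 0]
    return latent, observed

assert split_latent([-2, -1, 1, 2, 3, 4, 5, 6]) == ([-2, -1], [1, 2, 3, 4, 5, 6])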