def test_nig_normal_latent_numbering():
    # Check how variables are numbered when a generator declares a
    # latent variable: latent columns show up with negative numbers in
    # the generator's view, while manifest population variables keep
    # their positive numbers.
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        # Load a small deterministic dataset.
        for i in xrange(100):
            values = (i, i * i - 100)
            bdb.sql_execute('insert into t(x, y) values(?, ?)', values)
        bdb.execute('''
            create population p for t(id ignore; model x,y as numerical)
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        population = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, population, None) == [1, 2]
        # g0 declares no latent variables; g1 declares one latent
        # deviation variable for x.
        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')
        assert core.bayesdb_has_generator(bdb, population, 'g0')
        gen0 = core.bayesdb_get_generator(bdb, population, 'g0')
        assert core.bayesdb_has_generator(bdb, population, 'g1')
        gen1 = core.bayesdb_get_generator(bdb, population, 'g1')
        # Creating generators does not change the population's own
        # variable numbering.
        assert core.bayesdb_variable_numbers(bdb, population, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, population, gen0) == [1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, gen0) == [1, 2]
        # g1's latent variable appears with number -1 alongside the two
        # manifest variables.
        assert core.bayesdb_variable_numbers(bdb, population, gen1) == \
            [-1, 1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, gen1) == \
            [-1, 1, 2]
def bql_row_similarity(bdb, generator_id, modelno, rowid, target_rowid,
        *colnos):
    """Return the similarity of row `rowid` to row `target_rowid`.

    Delegates the computation to the generator's metamodel.  If no
    `colnos` are supplied, similarity is taken with respect to all of
    the generator's columns.

    Raises `BQLError` when `target_rowid` is None, i.e. the target row
    named in the SIMILARITY expression does not exist.
    """
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    # Default to every column of the generator when the caller names
    # none.  (`not colnos` is the idiomatic emptiness test for the
    # varargs tuple.)
    if not colnos:
        colnos = core.bayesdb_generator_column_numbers(bdb, generator_id)
    return metamodel.row_similarity(bdb, generator_id, modelno, rowid,
        target_rowid, colnos)
def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
        numsamples=None):
    # Predicts a value for the cell [rowid, colno] with a confidence metric.
    # XXX Prefer accuracy over speed for imputation.
    #
    # Two regimes, selected by whether colno is a "local" column (lcol,
    # modeled by the underlying crosscat instance) or a "foreign" column
    # (fcol, modeled by a registered foreign predictor):
    #   - lcol: either delegate entirely to crosscat, or simulate from
    #     the composer's posterior;
    #   - fcol: impute any missing parent values recursively, then
    #     simulate from the foreign predictor.
    # In both regimes the final (value, confidence) is derived from the
    # collected `samples` below, and the confidence is discounted by
    # `parent_conf` when parents had to be imputed.
    if numsamples is None:
        numsamples = self.n_samples
    colnos = core.bayesdb_generator_column_numbers(bdb, genid)
    colnames = core.bayesdb_generator_column_names(bdb, genid)
    row = core.bayesdb_generator_row_values(bdb, genid, rowid)
    # Account for multiple imputations if imputing parents.
    parent_conf = 1
    # Predicting lcol.
    if colno in self.lcols(bdb, genid):
        # Delegate to CC IFF
        # (lcol has no children OR all its children are None).
        # `children` are the fcols that list colno among their parents.
        children = [f for f in self.fcols(bdb, genid) if colno in
            self.pcols(bdb, genid, f)]
        # NOTE(review): the `i+1 in children` test assumes row values
        # are positionally aligned with 1-based column numbers — verify
        # against bayesdb_generator_row_values' ordering.
        if len(children) == 0 or \
                all(row[i] is None for i in xrange(len(row))
                    if i+1 in children):
            return self.cc(bdb, genid).predict_confidence(bdb,
                self.cc_id(bdb, genid), modelno,
                self.cc_colno(bdb, genid, colno), rowid)
        else:
            # Obtain likelihood weighted samples from posterior.
            # Condition on every other observed cell in the row.
            Q = [(rowid, colno)]
            Y = [(rowid, c, v) for c, v in zip(colnos, row)
                if c != colno and v is not None]
            samples = self.simulate(bdb, genid, modelno, Q, Y,
                numpredictions=numsamples)
            samples = [s[0] for s in samples]
    # Predicting fcol.
    else:
        # Conditioning values for the foreign predictor: the row's
        # values for colno's parent columns, keyed by column name.
        conditions = {c: v for c, v in zip(colnames, row) if
            core.bayesdb_generator_column_number(bdb, genid, c) in
            self.pcols(bdb, genid, colno)}
        for colname, val in conditions.iteritems():
            # Impute all missing parents.
            # (Mutating `conditions` here only rebinds existing keys,
            # so iterating with iteritems() is safe in Python 2.)
            if val is None:
                imp_col = core.bayesdb_generator_column_number(bdb, genid,
                    colname)
                imp_val, imp_conf = self.predict_confidence(bdb, genid,
                    modelno, imp_col, rowid, numsamples=numsamples)
                # XXX If imputing several parents, take the overall
                # overall conf as min conf. If we define imp_conf as
                # P[imp_val = correct] then we might choose to multiply
                # the imp_confs, but we cannot assert that the imp_confs
                # are independent so multiplying is extremely conservative.
                parent_conf = min(parent_conf, imp_conf)
                conditions[colname] = imp_val
        assert all(v is not None for c, v in conditions.iteritems())
        predictor = self.predictor(bdb, genid, colno)
        samples = predictor.simulate(numsamples, conditions)
    # Since foreign predictor does not know how to impute, imputation
    # shall occur here in the composer by simulate/logpdf calls.
    stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
    if stattype == 'categorical':
        # imp_conf is most frequent.
        imp_val = max(((val, samples.count(val)) for val in set(samples)),
            key=lambda v: v[1])[0]
        if colno in self.fcols(bdb, genid):
            imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
        else:
            # NOTE(review): under Python 2, sum(...) yields an integer
            # (numpy int) and len(samples) is an int, so this division
            # looks like it truncates to 0 unless every sample equals
            # imp_val — confirm whether true division was intended.
            imp_conf = sum(np.array(samples)==imp_val) / len(samples)
    elif stattype == 'numerical':
        # XXX The definition of confidence is P[k=1] where
        # k=1 is the number of mixture componets (we need a distribution
        # over GPMM to answer this question). The confidence is instead
        # implemented as \max_i{p_i} where p_i are the weights of a
        # fitted DPGMM.
        imp_val = np.mean(samples)
        imp_conf = su.continuous_imputation_confidence(samples, None, None,
            n_steps=1000)
    else:
        raise ValueError('Unknown stattype "{}" for a foreign predictor '
            'column encountered in predict_confidence.'.format(stattype))
    # Overall confidence is the sample-based confidence discounted by
    # the weakest imputed-parent confidence (1 when no parent imputed).
    return imp_val, imp_conf * parent_conf
def test_cgpm_extravaganza__ci_slow():
    """Broad integration test of the CGPM metamodel: population and
    metamodel creation (including latent variables and model overrides),
    analysis variants, and a battery of BQL queries."""
    # Skip when the optional cgpm extras are unavailable.
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        return
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        # Synthesize 2 * 1000 * 10 rows: period is a deterministic
        # function of (apogee, perigee) per orbit class; the rest is
        # drawn from bdb's numpy PRNG for reproducibility.
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (
                        country, bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute('''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit CATEGORICAL;
                country_of_operator CATEGORICAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')
        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Metamodel(cgpm_registry)
        bayesdb_register_metamodel(bdb, cgpmt)
        # Misspelled variable name ('apoge') must be rejected.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        # Misspelled variable in a GIVEN clause must be rejected too.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        # A LATENT variable may not shadow an existing manifest one.
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')
        # The real metamodel: two latent variables and three overrides
        # (venturescript kepler model, linreg, random forest), on a
        # 100-row subsample.
        bdb.execute('''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;
                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;
                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                    GIVEN apogee, perigee
                    USING venturescript (source = "{}");
                OVERRIDE MODEL FOR perigee GIVEN apogee USING linreg;
                OVERRIDE MODEL FOR class_of_orbit
                    GIVEN apogee, period, perigee, kepler_noise
                    USING forest (k = 4);
                SUBSAMPLE 100,
            )
        '''.format(kepler_source))
        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        # Latent variables get negative numbers in the generator's view
        # but are invisible to the bare population.
        assert core.bayesdb_generator_column_numbers(bdb, generator_id) == \
            [-2, -1, 1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, None) == \
            [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(
                bdb, population_id, generator_id) == \
            [-2, -1, 1, 2, 3, 4, 5, 6]
        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        # Analysis variants: empty clause list, VARIABLES, SKIP.
        bdb.execute('ANALYZE g0 FOR 1 iteration WAIT (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration WAIT (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION WAIT (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION WAIT')
        # Query battery: dependence, predictive probability (manifest
        # and latent variables), imputation, density, and simulation.
        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF kepler_cluster_id WITH period
                WITHIN satellites MODELLED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELLED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELLED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_cluster_id
                CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELLED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise
                CONFIDENCE kepler_noise_conf
                FROM satellites MODELLED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELLED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY OF period = 42
                GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()
        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELLED BY g0 LIMIT 4
        ''').fetchall()
        # Teardown in dependency order.
        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP METAMODEL g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')