def bql_column_stattypes_and_data(bdb, generator_id, colno0, colno1):
    """Look up the stattypes of two generator columns and fetch their data.

    Only rows in which *both* columns are non-NULL are fetched, so the
    two returned data lists always have the same length and are aligned
    row by row.

    :param bdb: BayesDB handle used for the metadata lookups and the query.
    :param generator_id: id of the generator that owns both columns.
    :param colno0: column number of the first column.
    :param colno1: column number of the second column.
    :return: tuple ``(st0, st1, data0, data1)`` -- the two stattypes
        followed by the two lists of values.
    """
    st0 = core.bayesdb_generator_column_stattype(bdb, generator_id, colno0)
    st1 = core.bayesdb_generator_column_stattype(bdb, generator_id, colno1)

    # Identifiers cannot be bound as SQL parameters, so the table and
    # column names are quoted and spliced into the statement text.
    quoted_table = sqlite3_quote_name(
        core.bayesdb_generator_table(bdb, generator_id))
    name0 = core.bayesdb_generator_column_name(bdb, generator_id, colno0)
    name1 = core.bayesdb_generator_column_name(bdb, generator_id, colno1)
    quoted0 = sqlite3_quote_name(name0)
    quoted1 = sqlite3_quote_name(name1)

    query = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (quoted0, quoted1, quoted_table, quoted0, quoted1)
    rows = bdb.sql_execute(query).fetchall()

    # Split the two-column result set into one list per column.
    data0 = []
    data1 = []
    for record in rows:
        data0.append(record[0])
        data1.append(record[1])
    return (st0, st1, data0, data1)
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Map each column name of a generator to its statistical type.

    Both the keys (column names) and the values (stattypes) are passed
    through ``casefold`` so that lookups can ignore case.

    :param bdb: BayesDB handle.
    :param generator_id: id of the generator whose columns are listed.
    :return: dict of casefolded column name -> casefolded stattype.
    """
    names = core.bayesdb_generator_column_names(bdb, generator_id)
    return {
        casefold(name): casefold(
            core.bayesdb_generator_column_stattype(bdb, generator_id, name))
        for name in names
    }
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Return a dict from casefolded column name to casefolded stattype.

    One entry is produced for every column name reported for the
    generator; the stattype of each column is looked up individually.

    :param bdb: BayesDB handle.
    :param generator_id: id of the generator to describe.
    :return: dict mapping ``casefold(name)`` to ``casefold(stattype)``.
    """
    def folded_entry(colname):
        # Build one (key, value) pair of the resulting dictionary.
        stattype = core.bayesdb_generator_column_stattype(
            bdb, generator_id, colname)
        return (casefold(colname), casefold(stattype))

    colnames = core.bayesdb_generator_column_names(bdb, generator_id)
    return dict(folded_entry(colname) for colname in colnames)
def initialize_models(self, bdb, genid, modelnos, model_config):
    """Initialize the internal crosscat models and all foreign predictors.

    First issues a BQL ``INITIALIZE`` on the internal crosscat generator,
    then, for every foreign column, builds its predictor, serializes it
    and stores the binary in ``bayesdb_composer_column_foreign_predictor``.

    :param bdb: BayesDB handle.
    :param genid: id of the composer generator.
    :param modelnos: model numbers to initialize; must be non-empty
        because ``max(modelnos)`` is taken.
    :param model_config: accepted for interface compatibility; not used
        by this method.
    """
    # Initialize internal crosscat, maintaining equality of model numbers.
    # INITIALIZE guarantees the existence of a sequence of models up to
    # the requested count, and BayesDB computes which numbers need to be
    # filled in. The inverse of that computation is max(modelnos)+1.
    cc_name = quote(core.bayesdb_generator_name(bdb, self.cc_id(bdb, genid)))
    n_models = max(modelnos) + 1
    bdb.execute('INITIALIZE {} MODELS FOR {};'.format(n_models, cc_name))

    def name_and_stattype(colno):
        # Convert a column number into its (name, stattype) pair.
        return (core.bayesdb_generator_column_name(bdb, genid, colno),
            core.bayesdb_generator_column_stattype(bdb, genid, colno))

    # Initialize the foreign predictors.
    for fcol in self.fcols(bdb, genid):
        targets = [name_and_stattype(fcol)]
        conditions = [
            name_and_stattype(pcol) for pcol in self.pcols(bdb, genid, fcol)
        ]
        # Build the foreign predictor for this column.
        table_name = core.bayesdb_generator_table(bdb, genid)
        predictor_name = self.predictor_name(bdb, genid, fcol)
        builder = self.predictor_builder[predictor_name]
        predictor = builder.create(bdb, table_name, targets, conditions)
        # Persist the serialized predictor in the database.
        with bdb.savepoint():
            update_sql = '''
                UPDATE bayesdb_composer_column_foreign_predictor SET
                    predictor_binary = :predictor_binary
                    WHERE generator_id = :genid AND colno = :colno
            '''
            serialized = builder.serialize(bdb, predictor)
            bdb.sql_execute(update_sql, {
                'genid': genid,
                'predictor_binary': sqlite3.Binary(serialized),
                'colno': fcol,
            })
def _from_numeric(self, bdb, generator_id, colno, value):
    """Translate a value produced by cgpm into its bayeslite form.

    :param bdb: BayesDB handle.
    :param generator_id: id of the generator owning the column.
    :param colno: column number; negative numbers denote latent variables.
    :param value: numeric value as returned by cgpm.
    :return: ``value`` unchanged for latent and non-categorical columns,
        ``None`` when ``value`` is NaN, or the category stored for the
        code ``value`` in ``bayesdb_cgpm_category``.
    :raises BQLError: if the column is categorical and no category is
        recorded for ``value``.
    """
    # XXX Latent variables have no entry in bayesdb_cgpm_category, so
    # whatever cgpm returned is handed back untouched.
    if colno < 0:
        return value
    # NaN is cgpm's encoding of a missing value.
    if math.isnan(value):
        return None
    stattype = core.bayesdb_generator_column_stattype(
        bdb, generator_id, colno)
    if not _is_categorical(stattype):
        return value
    # Categorical: map the integer code back to the stored category.
    lookup = bdb.sql_execute('''
        SELECT value FROM bayesdb_cgpm_category
            WHERE generator_id = ? AND colno = ? AND code = ?
    ''', (generator_id, colno, value))
    category = cursor_value(lookup, nullok=True)
    if category is None:
        raise BQLError('Invalid category: %r' % (value,))
    return category
def _to_numeric(self, bdb, generator_id, colno, value):
    """Translate a bayeslite value into the numeric form used by cgpm.

    :param bdb: BayesDB handle.
    :param generator_id: id of the generator owning the column.
    :param colno: column number; negative numbers denote latent variables.
    :param value: value as stored in / supplied to bayeslite.
    :return: NaN for ``None``; ``float(value)`` for latent variables; the
        code recorded in ``bayesdb_cgpm_category`` for a categorical
        value (NaN when no code is recorded); otherwise ``value`` itself.
    """
    # Missing values are represented as NaN.
    if value is None:
        return float('NaN')
    # XXX Latent variables have no entry in bayesdb_cgpm_category, so the
    # user-supplied value is simply passed through as a float.
    if colno < 0:
        return float(value)
    stattype = core.bayesdb_generator_column_stattype(
        bdb, generator_id, colno)
    if not _is_categorical(stattype):
        return value
    # Categorical: look up the integer code of the category.
    lookup = bdb.sql_execute('''
        SELECT code FROM bayesdb_cgpm_category
            WHERE generator_id = ? AND colno = ? AND value = ?
    ''', (generator_id, colno, value))
    code = cursor_value(lookup, nullok=True)
    if code is None:
        # An unknown category is mapped to NaN rather than raising
        # BQLError('Invalid category: ...').
        return float('NaN')
    return code
def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
        numsamples=None):
    """Predict a value for the cell [rowid, colno] with a confidence.

    For a local column with no foreign children, or whose children are
    all missing in this row, the call is delegated to the internal
    crosscat. Otherwise samples are drawn (via ``self.simulate`` for a
    local column, via the foreign predictor for a foreign column) and the
    imputed value and confidence are derived from those samples.

    :param bdb: BayesDB handle.
    :param genid: id of the composer generator.
    :param modelno: model number forwarded to the underlying calls.
    :param colno: column number of the cell to predict.
    :param rowid: row id of the cell to predict.
    :param numsamples: number of samples to draw; defaults to
        ``self.n_samples`` when ``None``.
    :return: tuple ``(imp_val, imp_conf)``. When missing parents of a
        foreign column had to be imputed first, the confidence is scaled
        by the smallest confidence among those imputations.
    :raises ValueError: if sampling is required and the stattype of the
        column is neither ``'categorical'`` nor ``'numerical'``.
    """
    # XXX Prefer accuracy over speed for imputation.
    if numsamples is None:
        numsamples = self.n_samples
    colnos = core.bayesdb_generator_column_numbers(bdb, genid)
    colnames = core.bayesdb_generator_column_names(bdb, genid)
    row = core.bayesdb_generator_row_values(bdb, genid, rowid)
    # Account for multiple imputations if imputing parents.
    parent_conf = 1
    # Predicting lcol.
    if colno in self.lcols(bdb, genid):
        # Delegate to CC IFF
        # (lcol has no children OR all its children are None).
        children = [f for f in self.fcols(bdb, genid)
            if colno in self.pcols(bdb, genid, f)]
        # Pair row values with their column numbers through colnos (the
        # same pairing used for Y below) instead of assuming that the
        # value at position i belongs to column i+1.
        if not children or all(
                v is None for c, v in zip(colnos, row) if c in children):
            return self.cc(bdb, genid).predict_confidence(bdb,
                self.cc_id(bdb, genid), modelno,
                self.cc_colno(bdb, genid, colno), rowid)
        else:
            # Obtain likelihood weighted samples from posterior.
            Q = [(rowid, colno)]
            Y = [(rowid, c, v) for c, v in zip(colnos, row)
                if c != colno and v is not None]
            samples = self.simulate(bdb, genid, modelno, Q, Y,
                numpredictions=numsamples)
            samples = [s[0] for s in samples]
    # Predicting fcol.
    else:
        # The parent set does not depend on the column being inspected,
        # so compute it once rather than once per column name.
        parents = self.pcols(bdb, genid, colno)
        conditions = {c: v for c, v in zip(colnames, row)
            if core.bayesdb_generator_column_number(bdb, genid, c)
                in parents}
        # Iterate over a snapshot, since entries are overwritten below.
        for colname, val in list(conditions.items()):
            # Impute all missing parents.
            if val is None:
                imp_col = core.bayesdb_generator_column_number(bdb, genid,
                    colname)
                imp_val, imp_conf = self.predict_confidence(bdb, genid,
                    modelno, imp_col, rowid, numsamples=numsamples)
                # XXX If imputing several parents, take the overall
                # conf as min conf. If we define imp_conf as
                # P[imp_val = correct] then we might choose to multiply
                # the imp_confs, but we cannot assert that the imp_confs
                # are independent so multiplying is extremely conservative.
                parent_conf = min(parent_conf, imp_conf)
                conditions[colname] = imp_val
        assert all(v is not None for v in conditions.values())
        predictor = self.predictor(bdb, genid, colno)
        samples = predictor.simulate(numsamples, conditions)
    # Since foreign predictor does not know how to impute, imputation
    # shall occur here in the composer by simulate/logpdf calls.
    stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
    if stattype == 'categorical':
        # imp_val is the most frequent sample.
        imp_val = max(set(samples), key=samples.count)
        if colno in self.fcols(bdb, genid):
            imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
        else:
            # Fraction of samples equal to imp_val. Force float division:
            # under Python 2 an int/int quotient truncates to 0 unless
            # every sample agrees.
            imp_conf = samples.count(imp_val) / float(len(samples))
    elif stattype == 'numerical':
        # XXX The definition of confidence is P[k=1] where
        # k=1 is the number of mixture components (we need a distribution
        # over GPMM to answer this question). The confidence is instead
        # implemented as \max_i{p_i} where p_i are the weights of a
        # fitted DPGMM.
        imp_val = np.mean(samples)
        imp_conf = su.continuous_imputation_confidence(samples, None, None,
            n_steps=1000)
    else:
        raise ValueError('Unknown stattype "{}" for a foreign predictor '
            'column encountered in predict_confidence.'.format(stattype))
    return imp_val, imp_conf * parent_conf