def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Return a dict mapping casefolded column name -> casefolded stattype
    for every column of the given generator."""
    names = core.bayesdb_generator_column_names(bdb, generator_id)
    return dict(
        (casefold(name),
            casefold(core.bayesdb_generator_column_stattype(
                bdb, generator_id, name)))
        for name in names)
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Map each of the generator's column names to its statistical type.

    Both keys and values are casefolded.
    """
    stattype_by_name = {}
    for column in core.bayesdb_generator_column_names(bdb, generator_id):
        stype = core.bayesdb_generator_column_stattype(
            bdb, generator_id, column)
        stattype_by_name[casefold(column)] = casefold(stype)
    return stattype_by_name
def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
        numsamples=None):
    """Predict a value for the cell [rowid, colno] with a confidence metric.

    Returns a pair (imp_val, confidence).  The confidence is scaled by
    the minimum confidence of any parent value that itself had to be
    imputed, so a prediction resting on shaky imputed parents is
    reported as correspondingly shaky.
    """
    # XXX Prefer accuracy over speed for imputation.
    if numsamples is None:
        numsamples = self.n_samples
    colnos = core.bayesdb_generator_column_numbers(bdb, genid)
    colnames = core.bayesdb_generator_column_names(bdb, genid)
    row = core.bayesdb_generator_row_values(bdb, genid, rowid)
    # Account for multiple imputations if imputing parents.
    parent_conf = 1
    # Predicting lcol.
    if colno in self.lcols(bdb, genid):
        # Delegate to CC IFF
        # (lcol has no children OR all its children are None).
        children = [f for f in self.fcols(bdb, genid)
            if colno in self.pcols(bdb, genid, f)]
        # NOTE(review): row index i is mapped to column number i+1 below,
        # which assumes column numbers are contiguous and 1-based --
        # confirm against bayesdb_generator_row_values ordering.
        if len(children) == 0 or \
                all(row[i] is None for i in xrange(len(row))
                    if i+1 in children):
            return self.cc(bdb, genid).predict_confidence(bdb,
                self.cc_id(bdb, genid), modelno,
                self.cc_colno(bdb, genid, colno), rowid)
        else:
            # Obtain likelihood weighted samples from posterior.
            Q = [(rowid, colno)]
            Y = [(rowid, c, v) for c, v in zip(colnos, row)
                if c != colno and v is not None]
            samples = self.simulate(bdb, genid, modelno, Q, Y,
                numpredictions=numsamples)
            samples = [s[0] for s in samples]
    # Predicting fcol.
    else:
        # Observed values of colno's parent columns, keyed by name.
        conditions = {c: v for c, v in zip(colnames, row)
            if core.bayesdb_generator_column_number(bdb, genid, c)
                in self.pcols(bdb, genid, colno)}
        # Iterate over a snapshot of the items: the loop assigns back
        # into `conditions`.  (Assigning to existing keys during
        # iteritems() happens to be legal in Python 2, but a snapshot
        # makes the intent explicit and robust.)
        for colname, val in list(conditions.iteritems()):
            # Impute all missing parents.
            if val is None:
                imp_col = core.bayesdb_generator_column_number(bdb, genid,
                    colname)
                imp_val, imp_conf = self.predict_confidence(bdb, genid,
                    modelno, imp_col, rowid, numsamples=numsamples)
                # XXX If imputing several parents, take the overall
                # overall conf as min conf. If we define imp_conf as
                # P[imp_val = correct] then we might choose to multiply
                # the imp_confs, but we cannot assert that the imp_confs
                # are independent so multiplying is extremely conservative.
                parent_conf = min(parent_conf, imp_conf)
                conditions[colname] = imp_val
        assert all(v is not None for c, v in conditions.iteritems())
        predictor = self.predictor(bdb, genid, colno)
        samples = predictor.simulate(numsamples, conditions)
    # Since foreign predictor does not know how to impute, imputation
    # shall occur here in the composer by simulate/logpdf calls.
    stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
    if stattype == 'categorical':
        # imp_conf is most frequent.
        imp_val = max(((val, samples.count(val)) for val in set(samples)),
            key=lambda v: v[1])[0]
        if colno in self.fcols(bdb, genid):
            imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
        else:
            # BUG FIX: force float division.  Under Python 2, the numpy
            # integer count divided by the int length floor-divides,
            # collapsing the confidence to 0 or 1.
            imp_conf = sum(np.array(samples) == imp_val) \
                / float(len(samples))
    elif stattype == 'numerical':
        # XXX The definition of confidence is P[k=1] where
        # k=1 is the number of mixture componets (we need a distribution
        # over GPMM to answer this question). The confidence is instead
        # implemented as \max_i{p_i} where p_i are the weights of a
        # fitted DPGMM.
        imp_val = np.mean(samples)
        imp_conf = su.continuous_imputation_confidence(samples, None, None,
            n_steps=1000)
    else:
        raise ValueError('Unknown stattype "{}" for a foreign predictor '
            'column encountered in predict_confidence.'.format(stattype))
    return imp_val, imp_conf * parent_conf