Example no. 1
0
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Return a dict mapping each of the generator's column names to its
    statistical type, with both names and stattypes casefolded."""
    return dict(
        (casefold(column), casefold(
            core.bayesdb_generator_column_stattype(bdb, generator_id, column)))
        for column in core.bayesdb_generator_column_names(bdb, generator_id))
Example no. 2
0
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Return a dict mapping each of the generator's column names to its
    statistical type; keys and values are both casefolded."""
    pairs = []
    for column in core.bayesdb_generator_column_names(bdb, generator_id):
        # Look up the stattype recorded for this column of the generator.
        kind = core.bayesdb_generator_column_stattype(bdb, generator_id,
            column)
        pairs.append((casefold(column), casefold(kind)))
    return dict(pairs)
Example no. 3
0
 def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
         numsamples=None):
     """Predict a value for the cell [rowid, colno] with a confidence metric.

     Returns a pair (imp_val, imp_conf * parent_conf). parent_conf starts
     at 1 and is discounted to the minimum confidence among any parent
     values that had to be imputed recursively, so the final confidence
     is conservatively bounded by the least-confident imputed parent.
     """
     # XXX Prefer accuracy over speed for imputation.
     if numsamples is None:
         numsamples = self.n_samples
     colnos = core.bayesdb_generator_column_numbers(bdb, genid)
     colnames = core.bayesdb_generator_column_names(bdb, genid)
     row = core.bayesdb_generator_row_values(bdb, genid, rowid)
     # Account for multiple imputations if imputing parents.
     parent_conf = 1
     # Predicting lcol (a column modeled by the underlying CC engine).
     if colno in self.lcols(bdb, genid):
         # Delegate to CC IFF
         # (lcol has no children OR all its children are None).
         children = [f for f in self.fcols(bdb, genid) if colno in
                 self.pcols(bdb, genid, f)]
         if len(children) == 0 or \
                 all(row[i] is None for i in xrange(len(row)) if i+1
                     in children):
             # Early return: CC supplies both the value and the confidence.
             return self.cc(bdb, genid).predict_confidence(bdb,
                     self.cc_id(bdb, genid), modelno,
                     self.cc_colno(bdb, genid, colno), rowid)
         else:
             # Obtain likelihood weighted samples from posterior,
             # conditioning on every other observed cell in this row.
             Q = [(rowid, colno)]
             Y = [(rowid, c, v) for c,v in zip(colnos, row)
                  if c != colno and v is not None]
             samples = self.simulate(bdb, genid, modelno, Q, Y,
                 numpredictions=numsamples)
             # simulate returns one-element rows for the single query cell.
             samples = [s[0] for s in samples]
     # Predicting fcol (a column owned by a foreign predictor).
     else:
         # Values of this fcol's parent columns, keyed by column name.
         conditions = {c:v for c,v in zip(colnames, row) if
             core.bayesdb_generator_column_number(bdb, genid, c) in
             self.pcols(bdb, genid, colno)}
         for colname, val in conditions.iteritems():
             # Impute all missing parents.
             if val is None:
                 imp_col = core.bayesdb_generator_column_number(bdb, genid,
                     colname)
                 # Recursive call: a parent may itself need imputation.
                 imp_val, imp_conf = self.predict_confidence(bdb, genid,
                     modelno, imp_col, rowid, numsamples=numsamples)
                 # XXX If imputing several parents, take the overall
                 # overall conf as min conf. If we define imp_conf as
                 # P[imp_val = correct] then we might choose to multiply
                 # the imp_confs, but we cannot assert that the imp_confs
                 # are independent so multiplying is extremely conservative.
                 parent_conf = min(parent_conf, imp_conf)
                 conditions[colname] = imp_val
         assert all(v is not None for c,v in conditions.iteritems())
         predictor = self.predictor(bdb, genid, colno)
         samples = predictor.simulate(numsamples, conditions)
     # Since foreign predictor does not know how to impute, imputation
     # shall occur here in the composer by simulate/logpdf calls.
     stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
     if stattype == 'categorical':
         # imp_val is the modal sample; imp_conf is its probability mass.
         imp_val =  max(((val, samples.count(val)) for val in set(samples)),
             key=lambda v: v[1])[0]
         if colno in self.fcols(bdb, genid):
             imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
         else:
             # NOTE(review): on Python 2 this divides a numpy integer by an
             # int with floor division, truncating the ratio to 0 or 1 --
             # confirm a true (float) division was intended here.
             imp_conf = sum(np.array(samples)==imp_val) / len(samples)
     elif stattype == 'numerical':
         # XXX The definition of confidence is P[k=1] where
         # k=1 is the number of mixture componets (we need a distribution
         # over GPMM to answer this question). The confidence is instead
         # implemented as \max_i{p_i} where p_i are the weights of a
         # fitted DPGMM.
         imp_val = np.mean(samples)
         imp_conf = su.continuous_imputation_confidence(samples, None, None,
             n_steps=1000)
     else:
         raise ValueError('Unknown stattype "{}" for a foreign predictor '
             'column encountered in predict_confidence.'.format(stattype))
     return imp_val, imp_conf * parent_conf