Ejemplo n.º 1
0
def bql_column_stattypes_and_data(bdb, generator_id, colno0, colno1):
    """Return the stattypes and jointly non-NULL data of two columns.

    The statistical type and name of each column are looked up through
    ``core``; the generator's table is then queried for every row in
    which both columns are non-NULL.

    Returns a 4-tuple ``(st0, st1, data0, data1)`` where ``data0`` and
    ``data1`` are equal-length lists holding the paired values of the
    first and second column.
    """
    column_numbers = (colno0, colno1)
    stattypes = [
        core.bayesdb_generator_column_stattype(bdb, generator_id, colno)
        for colno in column_numbers
    ]
    quoted_table = sqlite3_quote_name(
        core.bayesdb_generator_table(bdb, generator_id))
    column_names = [
        core.bayesdb_generator_column_name(bdb, generator_id, colno)
        for colno in column_numbers
    ]
    quoted_first, quoted_second = [
        sqlite3_quote_name(name) for name in column_names
    ]
    # Identifiers cannot be bound as SQL parameters, so the quoted names
    # are interpolated into the statement text.
    query = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (quoted_first, quoted_second, quoted_table,
        quoted_first, quoted_second)
    first_values = []
    second_values = []
    for row in bdb.sql_execute(query).fetchall():
        first_values.append(row[0])
        second_values.append(row[1])
    return (stattypes[0], stattypes[1], first_values, second_values)
Ejemplo n.º 2
0
def bql_column_stattypes_and_data(bdb, generator_id, colno0, colno1):
    """Collect stattypes and paired values for two generator columns.

    Only rows of the generator's table in which *both* columns are
    non-NULL contribute, so the two returned data lists always have the
    same length and line up row by row.

    Returns ``(st0, st1, data0, data1)``.
    """
    def lookup(getter, colno):
        # Every per-column core accessor used here shares this signature.
        return getter(bdb, generator_id, colno)

    first_stattype = lookup(core.bayesdb_generator_column_stattype, colno0)
    second_stattype = lookup(core.bayesdb_generator_column_stattype, colno1)
    quoted_table = sqlite3_quote_name(
        core.bayesdb_generator_table(bdb, generator_id))
    first_name = lookup(core.bayesdb_generator_column_name, colno0)
    second_name = lookup(core.bayesdb_generator_column_name, colno1)
    quoted_first = sqlite3_quote_name(first_name)
    quoted_second = sqlite3_quote_name(second_name)
    # The same two quoted column names appear in the SELECT list and in
    # the WHERE clause, hence the repeated substitution arguments.
    substitutions = (
        quoted_first, quoted_second, quoted_table,
        quoted_first, quoted_second,
    )
    statement = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % substitutions
    rows = bdb.sql_execute(statement).fetchall()
    return (
        first_stattype,
        second_stattype,
        [record[0] for record in rows],
        [record[1] for record in rows],
    )
Ejemplo n.º 3
0
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Map each casefolded column name of a generator to its stattype.

    Both the keys (column names) and the values (stattypes) are passed
    through ``casefold``.

    NOTE(review): the column *name* is handed to
    ``core.bayesdb_generator_column_stattype`` as its third argument;
    confirm that this accessor accepts a name rather than a column number.
    """
    names = core.bayesdb_generator_column_names(bdb, generator_id)
    return {
        casefold(column): casefold(
            core.bayesdb_generator_column_stattype(
                bdb, generator_id, column))
        for column in names
    }
Ejemplo n.º 4
0
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Build a {casefolded column name: casefolded stattype} dictionary.

    Iterates over the names reported by
    ``core.bayesdb_generator_column_names`` and looks up the stattype of
    each one.

    NOTE(review): the stattype lookup is keyed by column name here;
    verify that ``core.bayesdb_generator_column_stattype`` supports that.
    """
    def folded_stattype(column):
        # Stattype of one column, normalised with casefold.
        raw = core.bayesdb_generator_column_stattype(
            bdb, generator_id, column)
        return casefold(raw)

    columns = core.bayesdb_generator_column_names(bdb, generator_id)
    return dict(
        (casefold(column), folded_stattype(column)) for column in columns)
Ejemplo n.º 5
0
 def initialize_models(self, bdb, genid, modelnos, model_config):
     """Initialize the internal crosscat models and every foreign predictor.

     First issues a BQL ``INITIALIZE`` for the internal crosscat generator,
     then, for each foreign column, builds a predictor from the generator's
     table and stores its serialized form in
     ``bayesdb_composer_column_foreign_predictor``.

     ``model_config`` is accepted but not read by this method.
     """
     def name_and_stattype(colno):
         # Convert a column number to its (name, stattype) pair.
         name = core.bayesdb_generator_column_name(bdb, genid, colno)
         stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
         return (name, stattype)

     # Keep the crosscat model numbers equal to ours.  INITIALIZE
     # guarantees the existence of a sequence of models up to the requested
     # count, and BayesDB computes which numbers must be filled in; the
     # inverse of that computation is max(modelnos)+1.
     cc_name = core.bayesdb_generator_name(bdb, self.cc_id(bdb, genid))
     quoted_cc_name = quote(cc_name)
     model_count = max(modelnos)+1
     bdb.execute(
         'INITIALIZE {} MODELS FOR {};'.format(model_count, quoted_cc_name))

     # Initialize the foreign predictors, one per foreign column.
     for foreign_col in self.fcols(bdb, genid):
         targets = [name_and_stattype(foreign_col)]
         conditions = [
             name_and_stattype(parent_col)
             for parent_col in self.pcols(bdb, genid, foreign_col)
         ]
         table_name = core.bayesdb_generator_table(bdb, genid)
         builder = self.predictor_builder[
             self.predictor_name(bdb, genid, foreign_col)]
         predictor = builder.create(bdb, table_name, targets, conditions)
         # Persist the serialized predictor inside a savepoint.
         with bdb.savepoint():
             update_sql = '''
                 UPDATE bayesdb_composer_column_foreign_predictor SET
                     predictor_binary = :predictor_binary
                     WHERE generator_id = :genid AND colno = :colno
             '''
             serialized = builder.serialize(bdb, predictor)
             bindings = {
                 'genid': genid,
                 'predictor_binary': sqlite3.Binary(serialized),
                 'colno': foreign_col
             }
             bdb.sql_execute(update_sql, bindings)
Ejemplo n.º 6
0
 def _from_numeric(self, bdb, generator_id, colno, value):
     """Convert value in cgpm to equivalent bayeslite format."""
     # XXX Latent variables are not associated with an entry in
     # bayesdb_cgpm_category, so just pass through whatever value cgpm
     # returns.
     if colno < 0:
         return value
     if math.isnan(value):
         return None
     stattype = core.bayesdb_generator_column_stattype(
         bdb, generator_id, colno)
     if _is_categorical(stattype):
         cursor = bdb.sql_execute(
             '''
             SELECT value FROM bayesdb_cgpm_category
                 WHERE generator_id = ? AND colno = ? AND code = ?
         ''', (generator_id, colno, value))
         text = cursor_value(cursor, nullok=True)
         if text is None:
             raise BQLError('Invalid category: %r' % (value, ))
         return text
     else:
         return value
Ejemplo n.º 7
0
 def _to_numeric(self, bdb, generator_id, colno, value):
     """Convert value in bayeslite to equivalent cgpm format."""
     if value is None:
         return float('NaN')
     # XXX Latent variables are not associated with an entry in
     # bayesdb_cgpm_category, so just pass through whatever value
     # the user supplied, as a float.
     if colno < 0:
         return float(value)
     stattype = core.bayesdb_generator_column_stattype(
         bdb, generator_id, colno)
     if _is_categorical(stattype):
         cursor = bdb.sql_execute(
             '''
             SELECT code FROM bayesdb_cgpm_category
                 WHERE generator_id = ? AND colno = ? AND value = ?
         ''', (generator_id, colno, value))
         integer = cursor_value(cursor, nullok=True)
         if integer is None:
             return float('NaN')
             # raise BQLError('Invalid category: %r' % (value,))
         return integer
     else:
         return value
Ejemplo n.º 8
0
 def _predict_confidence(self, bdb, genid, modelno, colno, rowid,
         numsamples=None):
     """Predict a value for the cell [rowid, colno] with a confidence metric.

     For a local (crosscat) column with no foreign children, or whose
     children are all missing in this row, the prediction is delegated to
     the internal crosscat metamodel.  Otherwise samples are drawn, either
     through ``self.simulate`` (local column) or through the column's
     foreign predictor after imputing any missing parents, and summarised
     according to the column's stattype.

     Parameters:
         bdb, genid, modelno: database handle, generator id and model
             number, forwarded to the helpers called below.
         colno: column number of the cell to predict.
         rowid: row id of the cell to predict.
         numsamples: number of samples to draw; defaults to
             ``self.n_samples`` when None.

     Returns:
         A pair ``(imp_val, conf)``.  When parents had to be imputed, the
         confidence is scaled by the smallest parent confidence.

     Raises:
         ValueError: if the stattype is neither 'categorical' nor
             'numerical'.
     """
     # XXX Prefer accuracy over speed for imputation.
     if numsamples is None:
         numsamples = self.n_samples
     colnos = core.bayesdb_generator_column_numbers(bdb, genid)
     colnames = core.bayesdb_generator_column_names(bdb, genid)
     row = core.bayesdb_generator_row_values(bdb, genid, rowid)
     # Account for multiple imputations if imputing parents.
     parent_conf = 1
     # Predicting lcol.
     if colno in self.lcols(bdb, genid):
         # Delegate to CC IFF
         # (lcol has no children OR all its children are None).
         children = [f for f in self.fcols(bdb, genid) if colno in
                 self.pcols(bdb, genid, f)]
         # Pair each value with its actual column number (as done for Y
         # below) instead of assuming that colno == row index + 1.
         if not children or \
                 all(v is None for c, v in zip(colnos, row)
                     if c in children):
             return self.cc(bdb, genid).predict_confidence(bdb,
                     self.cc_id(bdb, genid), modelno,
                     self.cc_colno(bdb, genid, colno), rowid)
         else:
             # Obtain likelihood weighted samples from posterior.
             Q = [(rowid, colno)]
             Y = [(rowid, c, v) for c,v in zip(colnos, row)
                  if c != colno and v is not None]
             samples = self.simulate(bdb, genid, modelno, Q, Y,
                 numpredictions=numsamples)
             samples = [s[0] for s in samples]
     # Predicting fcol.
     else:
         conditions = {c:v for c,v in zip(colnames, row) if
             core.bayesdb_generator_column_number(bdb, genid, c) in
             self.pcols(bdb, genid, colno)}
         # Iterate over a snapshot: values of `conditions` are replaced
         # inside the loop.  items() also works on Python 2 and 3 alike.
         for colname, val in list(conditions.items()):
             # Impute all missing parents.
             if val is None:
                 imp_col = core.bayesdb_generator_column_number(bdb, genid,
                     colname)
                 imp_val, imp_conf = self.predict_confidence(bdb, genid,
                     modelno, imp_col, rowid, numsamples=numsamples)
                 # XXX If imputing several parents, take the overall
                 # conf as min conf. If we define imp_conf as
                 # P[imp_val = correct] then we might choose to multiply
                 # the imp_confs, but we cannot assert that the imp_confs
                 # are independent so multiplying is extremely conservative.
                 parent_conf = min(parent_conf, imp_conf)
                 conditions[colname] = imp_val
         assert all(v is not None for v in conditions.values())
         predictor = self.predictor(bdb, genid, colno)
         samples = predictor.simulate(numsamples, conditions)
     # Since foreign predictor does not know how to impute, imputation
     # shall occur here in the composer by simulate/logpdf calls.
     stattype = core.bayesdb_generator_column_stattype(bdb, genid, colno)
     if stattype == 'categorical':
         # imp_val is the most frequent sample.
         imp_val =  max(((val, samples.count(val)) for val in set(samples)),
             key=lambda v: v[1])[0]
         if colno in self.fcols(bdb, genid):
             imp_conf = np.exp(predictor.logpdf(imp_val, conditions))
         else:
             # Divide by a float: under Python 2 an int/int division would
             # truncate the empirical frequency to 0 or 1.
             imp_conf = sum(np.array(samples)==imp_val) / \
                 float(len(samples))
     elif stattype == 'numerical':
         # XXX The definition of confidence is P[k=1] where
         # k=1 is the number of mixture components (we need a distribution
         # over GPMM to answer this question). The confidence is instead
         # implemented as \max_i{p_i} where p_i are the weights of a
         # fitted DPGMM.
         imp_val = np.mean(samples)
         imp_conf = su.continuous_imputation_confidence(samples, None, None,
             n_steps=1000)
     else:
         raise ValueError('Unknown stattype "{}" for a foreign predictor '
             'column encountered in predict_confidence.'.format(stattype))
     return imp_val, imp_conf * parent_conf