def bql_column_stattypes_and_data(bdb, generator_id, colno0, colno1):
    """Return the stattypes and paired non-NULL data of two columns.

    Looks up the statistical type of columns `colno0` and `colno1` of
    the generator, then selects both columns from the generator's table,
    keeping only the rows in which neither value is NULL.

    Returns a tuple `(st0, st1, data0, data1)` where `data0` and `data1`
    are parallel lists holding the values of the two columns.
    """
    stattype_a = core.bayesdb_generator_column_stattype(
        bdb, generator_id, colno0)
    stattype_b = core.bayesdb_generator_column_stattype(
        bdb, generator_id, colno1)

    # Identifiers are quoted before being spliced into the SQL text.
    quoted_table = sqlite3_quote_name(
        core.bayesdb_generator_table(bdb, generator_id))
    name_a = core.bayesdb_generator_column_name(bdb, generator_id, colno0)
    name_b = core.bayesdb_generator_column_name(bdb, generator_id, colno1)
    quoted_a = sqlite3_quote_name(name_a)
    quoted_b = sqlite3_quote_name(name_b)

    query = ''' SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL ''' % (
        quoted_a, quoted_b, quoted_table, quoted_a, quoted_b)
    rows = bdb.sql_execute(query).fetchall()

    return (
        stattype_a,
        stattype_b,
        [row[0] for row in rows],
        [row[1] for row in rows],
    )
def bql_column_stattypes_and_data(bdb, generator_id, colno0, colno1):
    """Collect stattypes and jointly observed values for a column pair.

    The result is `(st0, st1, data0, data1)`: the statistical types of
    the two generator columns followed by two equally long lists of
    their values, taken from the rows of the generator's table where
    both columns are non-NULL.
    """
    colnos = (colno0, colno1)
    st0, st1 = [
        core.bayesdb_generator_column_stattype(bdb, generator_id, colno)
        for colno in colnos
    ]
    qt = sqlite3_quote_name(core.bayesdb_generator_table(bdb, generator_id))
    colnames = [
        core.bayesdb_generator_column_name(bdb, generator_id, colno)
        for colno in colnos
    ]
    qcn0, qcn1 = [sqlite3_quote_name(colname) for colname in colnames]

    # The quoted column names appear twice: in the projection and in
    # the NULL filter.
    data_sql = ''' SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL ''' % (
        qcn0, qcn1, qt, qcn0, qcn1)

    data0 = []
    data1 = []
    for record in bdb.sql_execute(data_sql).fetchall():
        data0.append(record[0])
        data1.append(record[1])
    return (st0, st1, data0, data1)
def bayesdb_generator_cell_value(bdb, generator_id, colno, rowid):
    """Return the value stored at (`rowid`, `colno`) of a generator's table.

    The table and column are resolved from `generator_id` and `colno`,
    and the single cell is read with a query on `_rowid_`.

    Raises AssertionError if no row with the given `rowid` exists.
    """
    table_name = core.bayesdb_generator_table(bdb, generator_id)
    qt = bql_quote_name(table_name)
    colname = core.bayesdb_generator_column_name(bdb, generator_id, colno)
    qcn = bql_quote_name(colname)
    sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qcn, qt)
    cursor = bdb.sql_execute(sql, (rowid,))
    try:
        # The next() builtin works wherever the cursor follows the
        # iterator protocol, unlike the Python-2-only .next() method.
        row = next(cursor)
    except StopIteration:
        # Raise explicitly: an `assert False` statement is removed under
        # `python -O`, which would make this function silently return
        # None for a missing row.
        raise AssertionError('Missing row at %d!' % (rowid,))
    return row[0]
def bayesdb_generator_cell_value(bdb, generator_id, colno, rowid):
    """Look up one cell of the table underlying a generator.

    `colno` selects the column (resolved to its name through the
    generator) and `rowid` selects the row via `_rowid_`.

    Returns the cell's value.  Raises AssertionError when the table has
    no row with that `rowid`.
    """
    qt = bql_quote_name(core.bayesdb_generator_table(bdb, generator_id))
    qcn = bql_quote_name(
        core.bayesdb_generator_column_name(bdb, generator_id, colno))
    sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qcn, qt)
    cursor = bdb.sql_execute(sql, (rowid,))
    try:
        # Builtin next() instead of the Python-2-only cursor.next().
        row = next(cursor)
    except StopIteration:
        # An explicit raise survives `python -O`; the former
        # `assert False, ...` would be compiled away there and the
        # function would fall through returning None.
        raise AssertionError('Missing row at %d!' % (rowid,))
    return row[0]
def initialize_models(self, bdb, genid, modelnos, model_config):
    """Initialize the internal crosscat models and the foreign predictors.

    First issues an INITIALIZE statement against the internal crosscat
    generator, then, for every foreign column, builds its predictor and
    stores the serialized predictor in
    `bayesdb_composer_column_foreign_predictor`.

    `model_config` is accepted but not used here.
    """
    # Keep the crosscat model numbers equal to ours.  INITIALIZE
    # guarantees a sequence of models up to the requested count, and
    # BayesDB works out which numbers need filling in; the inverse of
    # that computation is max(modelnos)+1.
    cc_name = core.bayesdb_generator_name(bdb, self.cc_id(bdb, genid))
    qg = quote(cc_name)
    bdb.execute('INITIALIZE {} MODELS FOR {};'.format(max(modelnos)+1, qg))

    def name_and_stattype(colno):
        # Translate a column number into a (name, stattype) pair.
        return (core.bayesdb_generator_column_name(bdb, genid, colno),
                core.bayesdb_generator_column_stattype(bdb, genid, colno))

    store_sql = ''' UPDATE bayesdb_composer_column_foreign_predictor SET predictor_binary = :predictor_binary WHERE generator_id = :genid AND colno = :colno '''

    # Initialize the foreign predictors, one per foreign column.
    for fcol in self.fcols(bdb, genid):
        targets = [name_and_stattype(fcol)]
        conditions = [
            name_and_stattype(pcol) for pcol in self.pcols(bdb, genid, fcol)
        ]

        # Build the predictor with the builder registered under its name.
        table_name = core.bayesdb_generator_table(bdb, genid)
        builder = self.predictor_builder[
            self.predictor_name(bdb, genid, fcol)]
        predictor = builder.create(bdb, table_name, targets, conditions)

        # Persist the serialized predictor in the database.
        with bdb.savepoint():
            predictor_binary = builder.serialize(bdb, predictor)
            bdb.sql_execute(store_sql, {
                'genid': genid,
                'predictor_binary': sqlite3.Binary(predictor_binary),
                'colno': fcol,
            })
def test_t1_column_value_probability(colno, rowid):
    """Smoke-test column value probability on the analyzed t1 generator.

    Evaluates the probability of the value found at (`rowid`, `colno`)
    both through the Python function and through the SQL function of
    the same name.  A `rowid` of 0 stands for the maximum rowid.
    """
    with analyzed_bayesdb_generator(t1(), 1, 1) as (bdb, generator_id):
        if rowid == 0:
            rowid = bayesdb_maxrowid(bdb, generator_id)

        # Direct Python call.
        cell = bayesdb_generator_cell_value(bdb, generator_id, colno, rowid)
        bqlfn.bql_column_value_probability(
            bdb, generator_id, None, colno, cell)

        # Same computation via SQL, with the value fetched by a subquery.
        table_name = core.bayesdb_generator_table(bdb, generator_id)
        colname = core.bayesdb_generator_column_name(
            bdb, generator_id, colno)
        quoted_table = bql_quote_name(table_name)
        quoted_column = bql_quote_name(colname)
        query = ''' select bql_column_value_probability(?, NULL, ?, (select %s from %s where rowid = ?)) ''' % (
            quoted_column, quoted_table)
        bdb.sql_execute(query, (generator_id, colno, rowid)).fetchall()
def test_t1_column_value_probability(colno, rowid):
    """Exercise bql_column_value_probability from Python and from SQL.

    Runs against the analyzed t1 generator.  When `rowid` is 0 the
    largest rowid of the generator is used instead.  The test only
    checks that both invocations complete without raising.
    """
    with analyzed_bayesdb_generator(t1(), 1, 1) as (bdb, generator_id):
        target_rowid = rowid
        if target_rowid == 0:
            target_rowid = bayesdb_maxrowid(bdb, generator_id)

        value = bayesdb_generator_cell_value(
            bdb, generator_id, colno, target_rowid)
        bqlfn.bql_column_value_probability(
            bdb, generator_id, None, colno, value)

        qt = bql_quote_name(core.bayesdb_generator_table(bdb, generator_id))
        qc = bql_quote_name(
            core.bayesdb_generator_column_name(bdb, generator_id, colno))
        sql = ''' select bql_column_value_probability(?, NULL, ?, (select %s from %s where rowid = ?)) ''' % (qc, qt)
        params = (generator_id, colno, target_rowid)
        bdb.sql_execute(sql, params).fetchall()
def predictor(self, bdb, genid, fcol):
    """Return the foreign predictor for column `fcol` of generator `genid`.

    Predictors are memoized per `(genid, fcol)` in the mapping returned
    by `self._predictor_cache(bdb)`.  On a cache miss the predictor's
    name and serialized form are read from
    `bayesdb_composer_column_foreign_predictor` and deserialized with
    the builder registered under that name in `self.predictor_builder`.

    Raises LookupError if no builder is registered for the stored
    predictor name.  If the table has no row for `(genid, fcol)`, the
    `[0]` index on the fetched rows raises IndexError.
    """
    cache = self._predictor_cache(bdb)
    key = (genid, fcol)
    if key not in cache:
        cursor = bdb.sql_execute(''' SELECT predictor_name, predictor_binary FROM bayesdb_composer_column_foreign_predictor WHERE generator_id = ? AND colno = ? ''', (genid, fcol))
        name, binary = cursor.fetchall()[0]
        builder = self.predictor_builder.get(name, None)
        if builder is None:
            # The message names the column first and the unregistered
            # predictor second; the arguments were previously passed in
            # the opposite order.
            raise LookupError('Foreign predictor for column "{}" '
                'not registered: "{}".'.format(
                    core.bayesdb_generator_column_name(bdb, genid, fcol),
                    name))
        cache[key] = builder.deserialize(bdb, binary)
    return cache[key]
def _weighted_sample(self, bdb, genid, modelno, row_id, Y, n_samples=None):
    """Draw weighted samples of all network nodes for one row.

    Returns a pair of parallel lists `([sample ...], [weight ...])`.
    Each `sample` is a dict `{col: v}` of values for the nodes of the
    network for row `row_id`.  `Y` specifies evidence nodes as
    `(row, col, value)` triples; every returned sample carries the
    evidence values given for `row_id`.  Each `weight` accumulates the
    `logpdf` terms of the evidence: the joint term from the internal
    crosscat generator plus one term per foreign column that is
    evidence.

    `n_samples` defaults to `self.n_samples` when None.
    """
    if n_samples is None:
        n_samples = self.n_samples

    # Look the local columns up once; they were previously re-queried
    # for every element of the comprehensions below.
    lcols = self.lcols(bdb, genid)

    # Evidence on the requested row seeds every sample.  Building it
    # once (and copying) also avoids indexing samples[0], which does not
    # exist when n_samples is 0.
    evidence = {c: v for r, c, v in Y if r == row_id}
    samples = [dict(evidence) for _ in xrange(n_samples)]
    weights = []
    w0 = 0

    # Assess likelihood of evidence at root.
    Y_cc = [(r, c, v) for r, c, v in Y if c in lcols]
    if Y_cc:
        w0 += self.cc(bdb, genid).logpdf_joint(
            bdb, self.cc_id(bdb, genid), Y_cc, [], modelno)

    # Simulate unobserved ccs.
    Q_cc = [(row_id, c) for c in lcols if c not in evidence]
    V_cc = self.cc(bdb, genid).simulate_joint(
        bdb, self.cc_id(bdb, genid), Q_cc, Y_cc, modelno,
        num_predictions=n_samples)

    for k in xrange(n_samples):
        w = w0
        sample = samples[k]
        # Add simulated Q_cc.
        sample.update({c: v for (_, c), v in zip(Q_cc, V_cc[k])})
        for fcol in self.topo(bdb, genid):
            pcols = self.pcols(bdb, genid, fcol)
            predictor = self.predictor(bdb, genid, fcol)
            # All parents of FP known (evidence or simulated)?
            assert pcols.issubset(set(sample))
            conditions = {
                core.bayesdb_generator_column_name(bdb, genid, c): v
                for c, v in sample.iteritems() if c in pcols
            }
            if fcol in sample:
                # f is evidence: compute likelihood weight.
                w += predictor.logpdf(sample[fcol], conditions)
            else:
                # f is latent: simulate from conditional distribution.
                sample[fcol] = predictor.simulate(1, conditions)[0]
        weights.append(w)
    return samples, weights
def create_generator(self, bdb, table, schema, instantiate):
    """Create this generator and its internal crosscat generator.

    Parses `schema`, instantiates this generator through the
    `instantiate` callback, creates a crosscat generator over the local
    columns named after this generator with a `_cc` suffix, and records
    the composer bookkeeping in the `bayesdb_composer_*` tables:
    crosscat generator id, column ownership, parents of foreign
    columns, topological order, and foreign predictor names.
    """
    # Parse the schema.
    (columns, lcols, _fcols, fcol_to_pcols, fcol_to_fpred,
        dependencies) = self.parse(schema)

    # Instantiate **this** generator.
    genid, bdbcolumns = instantiate(columns.items())

    # The internal crosscat generator shares this generator's name,
    # with a _cc suffix appended.
    SUFFIX = '_cc'
    cc_name = bayeslite.core.bayesdb_generator_name(bdb, genid) + SUFFIX

    # Assemble the pieces of the crosscat schema.
    column_clauses = [
        '{} {}'.format(quote(c), quote(columns[c])) for c in lcols
    ]
    cc_cols = ','.join(column_clauses)
    cc_dep = [
        ('DEPENDENT({})' if dep else 'INDEPENDENT({})').format(
            ','.join(map(quote, colnames)))
        for dep, colnames in dependencies
    ]
    bql = """ CREATE GENERATOR {} FOR {} USING crosscat( {}, {} ); """.format(
        quote(cc_name), quote(table), cc_cols, ','.join(cc_dep))
    bdb.execute(bql)

    # Translate foreign/parent column names into column numbers.
    fcolno_to_pcolnos = {}
    for fname, pnames in fcol_to_pcols.items():
        fcolno = core.bayesdb_generator_column_number(bdb, genid, fname)
        fcolno_to_pcolnos[fcolno] = [
            core.bayesdb_generator_column_number(bdb, genid, pname)
            for pname in pnames
        ]

    with bdb.savepoint():
        # Record the id of the internal crosscat generator.
        cc_genid = core.bayesdb_get_generator(bdb, cc_name)
        bdb.sql_execute(''' INSERT INTO bayesdb_composer_cc_id (generator_id, crosscat_generator_id) VALUES (?,?) ''', (genid, cc_genid,))

        # Record which columns are local: those that are not foreign.
        for colno, _, _ in bdbcolumns:
            is_local = int(colno not in fcolno_to_pcolnos)
            bdb.sql_execute(''' INSERT INTO bayesdb_composer_column_owner (generator_id, colno, local) VALUES (?,?,?) ''', (genid, colno, is_local,))

        # Record the parents of each foreign column.
        for fcolno, pcolnos in fcolno_to_pcolnos.items():
            for pcolno in pcolnos:
                bdb.sql_execute(''' INSERT INTO bayesdb_composer_column_parents (generator_id, fcolno, pcolno) VALUES (?,?,?) ''', (genid, fcolno, pcolno,))

        # Record the topological order.
        ordering = self.topological_sort(fcolno_to_pcolnos)
        for position, (colno, _) in enumerate(ordering):
            bdb.sql_execute(''' INSERT INTO bayesdb_composer_column_toposort (generator_id, colno, position) VALUES (?,?,?) ''', (genid, colno, position,))

        # Record the predictor name of each foreign column.
        for fcolno in fcolno_to_pcolnos:
            fcolname = core.bayesdb_generator_column_name(bdb, genid, fcolno)
            fp_name = fcol_to_fpred[casefold(fcolname)]
            bdb.sql_execute(''' INSERT INTO bayesdb_composer_column_foreign_predictor (generator_id, colno, predictor_name) VALUES (?,?,?) ''', (genid, fcolno, casefold(fp_name)))