def bql_row_similarity(bdb, population_id, generator_id, modelnos, rowid,
        target_rowid, colno):
    """Return the mean similarity of `rowid` to `target_rowid` in the
    context of variable `colno`, averaged over the population's
    generators (restricted to `modelnos` if given).

    Raises `BQLError` if `target_rowid` is None.
    """
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    modelnos = _retrieve_modelnos(modelnos)
    gen_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    similarities = []
    for gen_id in gen_ids:
        metamodel = core.bayesdb_generator_metamodel(bdb, gen_id)
        # XXX Change [colno] to colno by updating IBayesDBMetamodel.
        similarities.append(metamodel.row_similarity(
            bdb, gen_id, modelnos, rowid, target_rowid, [colno]))
    return stats.arithmetic_mean(similarities)
def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
        *colnos):
    """Return the mean similarity of `rowid` to `target_rowid` over the
    population's generators, in the context of `colnos` (defaulting to
    every variable of the population when no columns are given).

    Raises `BQLError` if `target_rowid` is None.
    """
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if not colnos:
        # No context given: use all variables of the population.
        colnos = core.bayesdb_variable_numbers(
            bdb, population_id, generator_id)
    if generator_id is not None:
        gen_ids = [generator_id]
    else:
        gen_ids = core.bayesdb_population_generators(bdb, population_id)
    similarities = []
    for gen_id in gen_ids:
        metamodel = core.bayesdb_generator_metamodel(bdb, gen_id)
        similarities.append(metamodel.row_similarity(
            bdb, gen_id, None, rowid, target_rowid, colnos))
    return stats.arithmetic_mean(similarities)
def _retrieve_rowid_constraints(bdb, population_id, constraints):
    """Split any user-specified rowid out of `constraints`.

    Returns a pair (rowid, constraints): the rowid named among the
    constraints, or a fresh row id if none was given, together with the
    remaining non-rowid constraints.

    Raises `BQLError` if more than one rowid is specified.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints:
        given_rowids = [
            value for column, value in constraints
            if column in core.bayesdb_rowid_tokens(bdb)
        ]
        if len(given_rowids) == 1:
            rowid = given_rowids[0]
        elif len(given_rowids) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
        # Keep only the constraints that do not name a rowid token.
        constraints = [
            (column, value) for column, value in constraints
            if column not in core.bayesdb_rowid_tokens(bdb)
        ]
    return rowid, constraints
def bayesdb_generator_cell_value(bdb, generator_id, rowid, colno):
    """Return the value stored at (`rowid`, `colno`) in the generator's
    underlying table.

    Raises `BQLError` if the table has no row with the given rowid.
    """
    table = bayesdb_generator_table(bdb, generator_id)
    column = bayesdb_generator_column_name(bdb, generator_id, colno)
    sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (
        sqlite3_quote_name(column), sqlite3_quote_name(table))
    cursor = bdb.sql_execute(sql, (rowid,))
    try:
        row = cursor.next()
    except StopIteration:
        generator = bayesdb_generator_name(bdb, generator_id)
        raise BQLError(bdb, 'No such row in %s: %d' %
            (repr(generator), rowid))
    assert len(row) == 1
    return row[0]
def _check_loom_initialized(self, bdb, generator_id):
    """Raise `BQLError` unless ANALYZE has populated the Loom row and
    column partition tables for `generator_id`.

    Not invoked on a per-query basis due to high overhead.
    """
    cursor = bdb.sql_execute('''
        SELECT COUNT(*) FROM bayesdb_loom_row_kind_partition
        WHERE generator_id = ?
    ''', (generator_id,))
    count_row = cursor.fetchall()
    # Fixed: this second check previously re-queried the *row* partition
    # table, so an unpopulated column partition table went undetected.
    # It now counts the column partition table, matching `count_col`.
    cursor = bdb.sql_execute('''
        SELECT COUNT(*) FROM bayesdb_loom_column_kind_partition
        WHERE generator_id = ?
    ''', (generator_id,))
    count_col = cursor.fetchall()
    if count_row[0][0] == 0 or count_col[0][0] == 0:
        raise BQLError(bdb, 'Analyze must be run before any BQL'
            ' queries when using loom.')
def predictive_relevance(self, bdb, generator_id, modelnos, rowid_target,
        rowid_queries, hypotheticals, colno):
    """For each row in `rowid_queries`, return the fraction of models in
    which it lies in the same partition as `rowid_target`, within the
    view containing `colno`.

    Raises `BQLError` for hypothetical rows, which Loom cannot handle.
    """
    if len(hypotheticals) > 0:
        raise BQLError(bdb, 'Loom cannot handle hypothetical rows'
            ' because it is unable to insert rows into CrossCat')
    if modelnos is None:
        modelnos = range(self._get_num_models(bdb, generator_id))
    counts = [0] * len(rowid_queries)
    for model in modelnos:
        kind_id = self._get_kind_id(bdb, generator_id, model, colno)
        target_partition = self._get_partition_id(
            bdb, generator_id, model, kind_id, rowid_target)
        for i, query_rowid in enumerate(rowid_queries):
            query_partition = self._get_partition_id(
                bdb, generator_id, model, kind_id, query_rowid)
            if query_partition == target_partition:
                counts[i] += 1
    # XXX This procedure appears to be computing the wrong thing.
    n_models = float(len(modelnos))
    return [count / n_models for count in counts]
def register(self, bdb):
    """Install the IID-Gaussian metamodel schema into `bdb` if it is not
    already present, upgrading from an empty installation to version 1.

    Raises `BQLError` if an unknown schema version is already installed.
    """
    with bdb.savepoint():
        # Look up any previously-installed schema version.
        cursor = bdb.sql_execute(
            'SELECT version FROM bayesdb_metamodel WHERE name = ?',
            (self.name(),))
        try:
            version = cursor.next()[0]
        except StopIteration:
            # No record at all: treat as not installed.
            version = 0
        assert version is not None
        if version == 0:
            # XXX WHATTAKLUDGE!
            for stmt in std_normal_schema_1.split(';'):
                bdb.sql_execute(stmt)
            version = 1
        if version != 1:
            raise BQLError(bdb, 'IID-Gaussian already installed'
                ' with unknown schema version: %d' % (version,))
def bayesdb_population_cell_value(bdb, population_id, rowid, colno):
    """Return the value stored at (`rowid`, `colno`) in the population's
    underlying table, or None for a latent variable (negative `colno`).

    Raises `BQLError` if the table has no row with the given rowid.
    """
    if colno < 0:
        # Latent variables do not appear in the table.
        return None
    table_name = bayesdb_population_table(bdb, population_id)
    var = bayesdb_variable_name(bdb, population_id, colno)
    qt = sqlite3_quote_name(table_name)
    qv = sqlite3_quote_name(var)
    value_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qv, qt)
    value_cursor = bdb.sql_execute(value_sql, (rowid,))
    value = None
    try:
        row = value_cursor.next()
    except StopIteration:
        population = bayesdb_population_name(bdb, population_id)
        # Fixed typo in the error message: "invidual" -> "individual".
        raise BQLError(
            bdb,
            'No such individual in population %r: %d' % (population, rowid))
    else:
        assert len(row) == 1
        value = row[0]
    return value
def _initialize_engine(self, bdb, generator_id, n, variables):
    """Construct a cgpm `Engine` with `n` states over `variables`.

    `variables` is a list of (name, stattype, cctype, distargs) tuples
    for the columns modeled by the gpmcc.  Raises `BQLError` if any
    modeled column contains only null values.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    def map_var(var):
        # Translate a variable name to its population column number.
        return core.bayesdb_variable_number(
            bdb, population_id, generator_id, var)
    # If no variables in the population modeled by the gpmcc, then create 1
    # dummy variable with one measurement. The design space for how to
    # refactor cgpm.crosscat.State to initialize without any variables is
    # not simple, so we will live with this workaround for now.
    if not variables:
        (outputs, cctypes, distargs, gpmcc_data) = \
            [7**10], ['bernoulli'], [None], [[0]]
    else:
        outputs = [map_var(var) for var, _st, _cct, _da in variables]
        cctypes = [cctype for _n, _st, cctype, _da in variables]
        distargs = [distargs for _n, _st, _cct, distargs in variables]
        gpmcc_vars = [var for var, _stattype, _dist, _params in variables]
        gpmcc_data = self._data(bdb, generator_id, gpmcc_vars)
        # If gpmcc_data has any column which is all null, then crash early
        # and notify the user of all offending column names.
        n_rows = len(gpmcc_data[0])
        nulls = [
            v for i, v in enumerate(gpmcc_vars)
            if all(math.isnan(gpmcc_data[r][i]) for r in xrange(n_rows))
        ]
        if nulls:
            raise BQLError(bdb, 'Failed to initialize, '
                'columns have all null values: %s' % repr(nulls))
    return Engine(
        gpmcc_data, num_states=n, rng=bdb.np_prng, multiprocess=self._ncpu,
        outputs=outputs, cctypes=cctypes, distargs=distargs)
def _from_numeric(self, bdb, generator_id, colno, value): """Convert value in cgpm to equivalent bayeslite format.""" # XXX Latent variables are not associated with an entry in # bayesdb_cgpm_category, so just pass through whatever value cgpm # returns. if colno < 0: return value if math.isnan(value): return None stattype = core.bayesdb_generator_column_stattype( bdb, generator_id, colno) if _is_categorical(stattype): cursor = bdb.sql_execute( ''' SELECT value FROM bayesdb_cgpm_category WHERE generator_id = ? AND colno = ? AND code = ? ''', (generator_id, colno, value)) text = cursor_value(cursor, nullok=True) if text is None: raise BQLError('Invalid category: %r' % (value, )) return text else: return value
def create_generator(self, bdb, table, schema, instantiate):
    """Create a NIG-Normal generator, recording per-column sufficient
    statistics (count, sum, sum of squares) for each modeled column.

    The schema is the column list.  May want to change this later to
    make room for specifying the hyperparameters, etc.

    Raises `BQLError` if any modeled column is not numerical.
    """
    insert_column_sql = '''
        INSERT INTO bayesdb_nig_normal_column
            (generator_id, colno, count, sum, sumsq)
            VALUES (:generator_id, :colno, :count, :sum, :sumsq)
    '''
    with bdb.savepoint():
        generator_id, column_list = instantiate(schema)
        for colno, column_name, stattype in column_list:
            # Only numerical columns have Normal sufficient statistics.
            if stattype != 'numerical':
                raise BQLError(bdb, 'NIG-Normal only supports'
                    ' numerical columns, but %s is %s' %
                    (repr(column_name), repr(stattype)))
            count, total, sumsq = data_suff_stats(bdb, table, column_name)
            bdb.sql_execute(insert_column_sql, {
                'generator_id': generator_id,
                'colno': colno,
                'count': count,
                'sum': total,
                'sumsq': sumsq,
            })
def _initialize_cgpm(self, bdb, generator_id, cgpm_ext):
    """Instantiate the registered CGPM described by `cgpm_ext` and
    incorporate the generator's data into it.

    `cgpm_ext` is a dict with keys 'name', 'outputs', 'inputs' and
    optionally 'args'/'kwds'; output and input variables are given by
    name and translated to population column numbers.  Raises `BQLError`
    if the named CGPM is not in the registry.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    def map_var(var):
        # Translate a variable name to its population column number.
        return core.bayesdb_variable_number(
            bdb, population_id, generator_id, var)
    name = cgpm_ext['name']
    outputs = map(map_var, cgpm_ext['outputs'])
    inputs = map(map_var, cgpm_ext['inputs'])
    args = cgpm_ext.get('args', ())
    kwds = cgpm_ext.get('kwds', {})
    if name not in self._cgpm_registry:
        raise BQLError(bdb, 'Unknown CGPM: %s' % (repr(name),))
    cls = self._cgpm_registry[name]
    # Data columns are fetched in output-then-input order, so row[i] is
    # the i-th output and row[len(outputs) + i] the i-th input.
    cgpm_vars = cgpm_ext['outputs'] + cgpm_ext['inputs']
    cgpm_data = self._data(bdb, generator_id, cgpm_vars)
    cgpm = cls(outputs, inputs, rng=bdb.np_prng, *args, **kwds)
    for cgpm_rowid, row in enumerate(cgpm_data):
        # CGPMs do not uniformly handle null values or missing
        # values sensibly yet, so until we have that sorted
        # out we both (a) omit nulls and (b) ignore errors in
        # incorporate.
        query = {
            colno: row[i]
            for i, colno in enumerate(outputs)
            if not math.isnan(row[i])
        }
        n = len(outputs)
        evidence = {
            colno: row[n + i]
            for i, colno in enumerate(inputs)
            if not math.isnan(row[n + i])
        }
        try:
            cgpm.incorporate(cgpm_rowid, query, evidence)
        except Exception:
            # Deliberate best-effort: see the comment above.
            pass
    return cgpm
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
        default=None):
    """Create the bayesdb_generator record for `gen_name` over `table`.

    `columns` is a list of (name, stattype) pairs of modeled columns.
    Returns (generator_id, column_list), where column_list is a list of
    (colno, name, stattype) triples sorted by column number.

    Raises `BQLError` on a name conflict with an existing table, and on
    duplicate, missing, or invalidly-typed columns.
    """
    if default is None:
        default = False
    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, 'Name already defined as table: %s' %
            (repr(gen_name),))
    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)
    generator_already_existed = False
    if core.bayesdb_has_generator(bdb, gen_name):
        generator_already_existed = True
    else:
        # Create the generator record.
        generator_sql = '''
            INSERT INTO bayesdb_generator (name, tabname, metamodel, defaultp)
                VALUES (:name, :table, :metamodel, :defaultp)
        '''
        cursor = bdb.sql_execute(generator_sql, {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': default,
        })
    generator_id = core.bayesdb_get_generator(bdb, gen_name)
    assert generator_id
    assert 0 < generator_id
    # Get a map from column name to colno. Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            # Column not present in the underlying table.
            missing.add(name)
            continue
        else:
            colno = row[0]
            assert isinstance(colno, int)
            cursor = bdb.sql_execute(stattype_sql, {
                'stattype': stattype,
            })
            if cursor_value(cursor) == 0:
                invalid.add(stattype)
                continue
            column_map[casefold(name)] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %s' %
            (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %s' %
            (repr(list(invalid)),))
    if not generator_already_existed:
        # Insert column records.
        column_sql = '''
            INSERT INTO bayesdb_generator_column
                (generator_id, colno, stattype)
                VALUES (:generator_id, :colno, :stattype)
        '''
        for name, stattype in columns:
            colno = column_map[casefold(name)]
            stattype = casefold(stattype)
            bdb.sql_execute(column_sql, {
                'generator_id': generator_id,
                'colno': colno,
                'stattype': stattype,
            })
    column_list = sorted(
        (column_map[casefold(name)], name, stattype)
        for name, stattype in columns)
    return generator_id, column_list
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        # Unwrap a parametrized phrase, keeping its parameter
        # bookkeeping for the compiler.
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings. XXX Bad idea?
    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists. Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())
    # Transaction control phrases.
    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateTabAs):
        # CREATE TABLE ... AS <query>: compile the query and wrap it.
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateTabSim):
        # CREATE TABLE ... AS SIMULATE: materialize simulated rows.
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, 'Name already defined as generator: %s' %
                    (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, 'Name already defined as table: %s' %
                    (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb,
                    phrase.simulation.generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            # Discover the SQL types of the table's columns so the new
            # table can mirror them.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            # Validate the requested columns and constraint columns.
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            # Evaluate nsamples, modelno, and the constraint expressions
            # with one SQL round trip.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.nsamples, out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                    out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = \
                [(core.bayesdb_generator_column_number(bdb, generator_id,
                        name), value)
                    for (name, _expression), value
                        in zip(phrase.simulation.constraints, cursor[0][2:])]
            colnos = \
                [core.bayesdb_generator_column_number(bdb, generator_id, name)
                    for name in column_names]
            # Create the target table and fill it with simulated rows.
            bdb.sql_execute('CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '',
                 qn,
                 ','.join('%s %s' %
                        (qcn, column_sqltypes[casefold(column_name)])
                    for qcn, column_name in zip(qcns, column_names))))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in bqlfn.bayesdb_simulate(bdb, generator_id, constraints,
                    colnos, modelno=modelno, numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators? Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, 'Table still in use by generators: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))
    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename. Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                                core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table'
                                ': %s' % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # NOTE(review): everything below this raise is
                    # unreachable dead code, presumably retained for when
                    # column renaming is implemented -- confirm intended.
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column. Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(
                                bdb, generator_id, old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, 'No such generator: %s' %
                            (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(
                        bdb, cmd.generator)
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    # Clear any existing default, then set the new one.
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb._sqlite3.totalchanges() - total_changes \
                        in (0, 1)
                    set_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb._sqlite3.totalchanges() - total_changes \
                        in (0, 1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, 'No such metamodel: %s' %
                (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]
        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(bdb,
                        'Name already defined as generator: %s' %
                        (repr(phrase.name),))
            else:
                def instantiate(columns):
                    return instantiate_generator(
                        bdb, phrase.name, phrase.table, metamodel, columns,
                        default=phrase.default)
                metamodel.create_generator(
                    bdb, phrase.table, phrase.schema, instantiate)
        # All done. Nothing to return.
        return empty_cursor(bdb)
    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)
            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)
    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table'
                                ': %s' % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name),))
                    # Update bayesdb_generator. Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None         # XXX For now.
        with bdb.savepoint():
            # Find the model numbers. Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(bdb,
                        'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))
            # Stop now if there's nothing to initialize.
            # NOTE(review): this bare return yields None rather than an
            # empty cursor, unlike every other path -- confirm callers
            # tolerate it.
            if len(modelnos) == 0:
                return
            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'iterations': 0,
                })
            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(
                bdb, generator_id, modelnos, model_config)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb, generator_id, modelnos=phrase.modelnos,
            iterations=phrase.iterations, max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                # Verify every requested model exists before dropping any.
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                            AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                            AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)
    assert False                # XXX
def simulate_models_rows(bdb, simulation):
    """Evaluate a SIMULATE ... FROM MODELS phrase and return its rows.

    Each simulated column is evaluated per model; only MUTUAL INFORMATION
    expressions are currently supported.  Returns rows obtained by
    transposing the per-column result lists.
    """
    assert all(isinstance(c, ast.SimCol) for c in simulation.columns)
    population_id = core.bayesdb_get_population(bdb, simulation.population)
    generator_id = None
    if simulation.generator is not None:
        if not core.bayesdb_has_generator(
                bdb, population_id, simulation.generator):
            raise BQLError(bdb, 'No such generator: %r' %
                (simulation.generator,))
        generator_id = core.bayesdb_get_generator(
            bdb, population_id, simulation.generator)
    def retrieve_literal(expression):
        # Convert an AST literal expression to its Python value.
        assert isinstance(expression, ast.ExpLit)
        lit = expression.value
        if isinstance(lit, ast.LitNull):
            return None
        elif isinstance(lit, ast.LitInt):
            return lit.value
        elif isinstance(lit, ast.LitFloat):
            return lit.value
        elif isinstance(lit, ast.LitString):
            return lit.value
        else:
            assert False
    def retrieve_variable(var):
        # Convert a variable name to its population column number.
        if not core.bayesdb_has_variable(bdb, population_id, generator_id,
                var):
            raise BQLError(bdb, 'No such population variable: %s' % (var,))
        return core.bayesdb_variable_number(
            bdb, population_id, generator_id, var)
    def simulate_column(phrase):
        # Compute the per-model values for one simulated column.
        if isinstance(phrase, ast.ExpBQLDepProb):
            raise BQLError(bdb,
                'DEPENDENCE PROBABILITY simulation still unsupported.')
        elif isinstance(phrase, ast.ExpBQLProb):
            raise BQLError(bdb,
                'PROBABILITY OF simulation still unsupported.')
        elif isinstance(phrase, ast.ExpBQLMutInf):
            colno0 = retrieve_variable(phrase.column0)
            colno1 = retrieve_variable(phrase.column1)
            constraint_args = ()
            if phrase.constraints is not None:
                # Flatten constraints into alternating colno/value args.
                constraint_args = tuple(itertools.chain.from_iterable([
                    [retrieve_variable(colname), retrieve_literal(expr)]
                    for colname, expr in phrase.constraints
                ]))
            nsamples = phrase.nsamples and retrieve_literal(phrase.nsamples)
            # One mi_list per generator of the population.
            mi_lists = bqlfn._bql_column_mutual_information(
                bdb, population_id, generator_id, colno0, colno1, nsamples,
                *constraint_args)
            return list(itertools.chain.from_iterable(mi_lists))
        else:
            raise BQLError(bdb,
                'Only constants can be simulated: %s.' % (simulation,))
    columns = [simulate_column(c.col) for c in simulation.columns]
    # All queries must return the same number of rows, equal to the number of
    # models of all generators implied by the query.
    assert all(len(column) == len(columns[0]) for column in columns)
    # Convert the columns into rows.
    return zip(*columns)
def bayesdb_simulate(bdb, population_id, constraints, colnos,
        generator_id=None, numpredictions=1, accuracy=None):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, with a value for each
    column specified in the list `colnos`, conditioned on the
    constraints in the list `constraints` of tuples ``(colno, value)``.

    If `generator_id` is None and the population has several generators,
    the requested predictions are allocated among the generators in
    proportion to the likelihood each assigns to the constraints.

    The results are simulated from the predictive distribution on fresh
    rows.
    """
    rowid, constraints = _retrieve_rowid_constraints(
        bdb, population_id, constraints)
    def loglikelihood(generator_id, metamodel):
        # Log-likelihood of the constraints under one generator.
        if not constraints:
            return 0
        return metamodel.logpdf_joint(
            bdb, generator_id, rowid, constraints, [], None)
    def simulate(generator_id, metamodel, n):
        # Draw n samples of the target columns from one generator.
        return metamodel.simulate_joint(
            bdb, generator_id, rowid, colnos, constraints, None,
            num_samples=n, accuracy=accuracy)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    metamodels = [
        core.bayesdb_generator_metamodel(bdb, generator_id)
        for generator_id in generator_ids
    ]
    if len(generator_ids) > 1:
        # Weight generators by the likelihood of the constraints, then
        # split numpredictions among them with a multinomial draw.
        loglikelihoods = map(loglikelihood, generator_ids, metamodels)
        likelihoods = map(math.exp, loglikelihoods)
        total_likelihood = sum(likelihoods)
        if total_likelihood == 0:
            # XXX Show the constraints with symbolic names.
            raise BQLError(bdb,
                'Impossible constraints: %r' % (constraints,))
        probabilities = [
            likelihood / total_likelihood for likelihood in likelihoods
        ]
        countses = bdb.np_prng.multinomial(
            numpredictions, probabilities, size=1)
        counts = countses[0]
    else:
        counts = [numpredictions]
    rowses = map(simulate, generator_ids, metamodels, counts)
    all_rows = [row for rows in rowses for row in rows]
    assert all(isinstance(row, (tuple, list)) for row in all_rows)
    return all_rows
def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
        constraints, num_samples=1, accuracy=None):
    """Simulate the `targets` columns jointly from the Loom server.

    `targets` is a list of column numbers; `constraints` is a list of
    (colno, value) pairs.  Returns `num_samples` rows, each a list of
    values aligned with `targets`.  Raises BQLError if a constraint
    overlaps an observed cell of an existing row.

    `modelnos` and `accuracy` are accepted for interface compatibility;
    they are not forwarded to Loom.
    """
    # Retrieve the population id.
    population_id = bayesdb_generator_population(bdb, generator_id)
    # If rowid refers to an existing row, fold its observed cells into
    # the constraints.  (Fixed: fresh-row lookup takes the population
    # id, not the generator id.)
    if rowid != bayesdb_population_fresh_row_id(bdb, population_id):
        row_values_raw = bayesdb_population_row_values(
            bdb, population_id, rowid)
        row_values = [
            str(a) if isinstance(a, unicode) else a for a in row_values_raw
        ]
        # Keep only the observed cells as (colno, value) pairs.
        row = [
            entry for entry in enumerate(row_values) if entry[1] is not None
        ]
        constraints_colnos = [c[0] for c in constraints]
        row_colnos = [r[0] for r in row]
        if any([colno in constraints_colnos for colno in row_colnos]):
            # Fixed: missing space between the concatenated literals
            # previously produced "...constraints andtarget row...".
            raise BQLError(bdb, 'Overlap between constraints and '
                'target row in simulate.')
        # Build a new list rather than mutating the caller's
        # constraints in place.
        constraints = constraints + row
    # Prepare the query row to provide to Loom: targets get an empty
    # marker, constrained columns get their fixed value.
    row = {}
    target_num_to_name = {}
    for colno in targets:
        name = bayesdb_variable_name(bdb, generator_id, None, colno)
        target_num_to_name[colno] = name
        row[name] = ''
    for (colno, value) in constraints:
        name = bayesdb_variable_name(bdb, generator_id, None, colno)
        row[name] = value
    # Fetch the server.
    server = self._get_cache_entry(bdb, generator_id, 'preql_server')
    # Prepare the csv header.  Lowercase the headers in place so they
    # stay aligned with csv_values; taking lower_to_upper.keys() would
    # scramble the order (Python 2 dict order is arbitrary).
    csv_headers, csv_values = zip(*row.iteritems())
    lower_to_upper = {str(a).lower(): str(a) for a in csv_headers}
    csv_headers = [str(a).lower() for a in csv_headers]
    csv_values = [str(a) for a in csv_values]
    # Retrieve the samples from the server.
    outfile = StringIO()
    writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
    reader = iter([csv_headers] + [csv_values])
    server._predict(reader, num_samples, writer, False)
    output = writer.result()
    # Parse output: first CRLF-separated line is the header, the rest
    # are sample rows; map headers back to their original casing.
    returned_headers = [
        lower_to_upper[a]
        for a in output.strip().split('\r\n')[0].split(CSV_DELIMITER)
    ]
    loom_output = [
        zip(returned_headers, a.split(CSV_DELIMITER))
        for a in output.strip().split('\r\n')[1:]
    ]
    return_list = []
    for row in loom_output:
        # Prepare the row, coercing non-nominal values to float.
        row_values = []
        row_dict = dict(row)
        for colno in targets:
            colname = target_num_to_name[colno]
            value = row_dict[colname]
            stattype = bayesdb_variable_stattype(
                bdb, population_id, None, colno)
            if not _is_nominal(stattype):
                value = float(value)
            row_values.append(value)
        # Add this row to the return list.
        return_list.append(row_values)
    return return_list
def grouped_schema():
    """Render the guessed stattypes as a MODEL/IGNORE schema string.

    Groups the variables in `guesses` into NOMINAL, NUMERICAL, and
    IGNORE sections, appending each guess's reason as a quoted comment.
    Raises BQLError on an empty variable name.
    """
    schema = ''
    nominal = []
    numerical = []
    ignore = []
    # Bucket each guessed variable by its (lowercased) stattype,
    # carrying along the textual reason for the guess.
    for var in guesses.keys():
        if len(var) > 0:
            guessed_type_reason = guesses[var]
            guessed_type = guessed_type_reason[0].lower()
            guessed_reason = guessed_type_reason[1]
            if guessed_type == 'nominal':
                nominal.append([var, guessed_reason])
            elif guessed_type == 'numerical':
                numerical.append([var, guessed_reason])
            elif guessed_type == 'ignore':
                ignore.append([var, guessed_reason])
            elif guessed_type == 'key':
                # Keys are not modeled; ignore them, with a default
                # reason if the guesser supplied none.
                if len(guessed_reason) > 0:
                    ignore.append([var, guessed_reason])
                else:
                    ignore.append([var, 'This variable is a key.'])
        else:
            raise BQLError(bdb,
                'Empty column name(s) in table %s' % (tablename,))
    stattype_var_list_pairs = [
        ['NOMINAL', nominal], ['NUMERICAL', numerical], ['IGNORE', ignore]
    ]
    for stattype, var_list in stattype_var_list_pairs:
        # Remove any empty-string variable names.
        # NOTE(review): this filter is a no-op -- var_list holds
        # [var, reason] pairs, which are always truthy; empty names
        # were already rejected above.
        var_list = filter(None, var_list)
        if len(var_list) > 0:
            if stattype == 'IGNORE':
                schema += 'IGNORE '
            else:
                schema += 'MODEL %s ' % (os.linesep,)
            for i in xrange(len(var_list)):
                # List of variable and reason it was classified as such.
                var_reason = var_list[i]
                var = var_reason[0]
                reason = var_reason[1]
                schema += '\t %s' % (var,)
                # Don't append a comma for last item in list.
                if i != len(var_list) - 1:
                    schema += ','
                # Add a space between the last variable and 'AS' for
                # proper parsing.
                else:
                    schema += ' '
                if len(reason) > 0:
                    # Add reason as a comment.
                    schema += " '''# %s" % (reason,)
                # Each variable (and reason) on a separate line.
                schema += os.linesep
                # If reason was commented on previous line, need triple
                # quote to re-enter schema string.
                if len(reason) > 0:
                    schema += "'''"
            if stattype != 'IGNORE':
                schema += 'AS %s \t %s' % (os.linesep, stattype,)
            schema += ';%s' % (os.linesep,)
    # Strip last semicolon and newline - not needed at end of schema.
    # NOTE(review): assumes os.linesep is one character ('\n'); on a
    # platform where it is '\r\n' this strips one character too few.
    schema = schema[:-2]
    return schema
def _create_population(bdb, phrase):
    """Create a population record from a CREATE POPULATION phrase.

    Resolves each base-table column to a policy (MODEL with a stattype,
    IGNORE, or GUESS), validates names and stattypes, and inserts the
    bayesdb_population and bayesdb_variable records.  Raises BQLError on
    duplicate/unknown columns, invalid stattypes, or columns with no
    policy.
    """
    if core.bayesdb_has_population(bdb, phrase.name):
        if phrase.ifnotexists:
            return
        else:
            raise BQLError(bdb,
                'Name already defined as population: %r' % (phrase.name,))
    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)
    # Retrieve all columns from the base table. The user is required to
    # provide a strategy for each single variable, either MODEL, IGNORE,
    # or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)
    seen_columns = []  # NOTE(review): never used below.
    # Create the population record and get the assigned id.
    bdb.sql_execute('''
        INSERT INTO bayesdb_population (name, tabname) VALUES (?, ?)
    ''', (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)
    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable([
        [(name, s.stattype) for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopModelVars)
    ]))
    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable([
        [(name, 'ignore') for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)
    ]))
    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable([
        s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)
    ]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(bdb,
                'Cannot use wildcard GUESS with variables names: %r'
                % (pop_guess,))
        # Retrieve all variables in the base table, minus those already
        # covered by an explicit MODEL or IGNORE clause.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]
    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we
        # will add to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        pop_guess_vars = zip(pop_guess, pop_guess_stattypes)
        # Move guessed keys over to the ignored set.
        migrate = [(col, st) for col, st in pop_guess_vars if st == 'key']
        for col, st in migrate:
            pop_guess_vars.remove((col, st))
            pop_ignore_vars.append((col, 'ignore'))
    else:
        pop_guess_vars = []
    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars
    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = [casefold(t[0]) for t in pop_all_vars]
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(bdb,
            'Cannot determine a modeling policy for variables: %r'
            % (not_found,))
    # Get a map from variable name to colno. Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    variable_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in variable_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': phrase.table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        else:
            colno = row[0]
            assert isinstance(colno, int)
            # 'ignore' is a policy, not a stattype, so it is exempt
            # from the bayesdb_stattype existence check.
            cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
            if cursor_value(cursor) == 0 and stattype != 'ignore':
                invalid.add(stattype)
                continue
            variable_map[name] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb,
            'No such columns in table %r: %r'
            % (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb,
            'Duplicate column names: %r' % (list(duplicates),))
    if invalid:
        raise BQLError(bdb,
            'Invalid statistical types: %r' % (list(invalid),))
    # Insert variable records.  Ignored variables get no record.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        colno = variable_map[name]
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        bdb.sql_execute('''
            INSERT INTO bayesdb_variable
                (population_id, name, colno, stattype)
                VALUES (?, ?, ?, ?)
        ''', (population_id, name, colno, stattype))
def execute_phrase(bdb, phrase, bindings=()): """Execute the BQL AST phrase `phrase` and return a cursor of results.""" if isinstance(phrase, ast.Parametrized): n_numpar = phrase.n_numpar nampar_map = phrase.nampar_map phrase = phrase.phrase assert 0 < n_numpar else: n_numpar = 0 nampar_map = None # Ignore extraneous bindings. XXX Bad idea? if ast.is_query(phrase): # Compile the query in the transaction in case we need to # execute subqueries to determine column lists. Compiling is # a quick tree descent, so this should be fast. out = compiler.Output(n_numpar, nampar_map, bindings) with bdb.savepoint(): compiler.compile_query(bdb, phrase, out) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.Begin): txn.bayesdb_begin_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Rollback): txn.bayesdb_rollback_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Commit): txn.bayesdb_commit_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabAs): assert ast.is_query(phrase.query) with bdb.savepoint(): out = compiler.Output(n_numpar, nampar_map, bindings) qt = sqlite3_quote_name(phrase.name) temp = 'TEMP ' if phrase.temp else '' ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else '' out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt)) compiler.compile_query(bdb, phrase.query, out) winders, unwinders = out.getwindings() with compiler.bayesdb_wind(bdb, winders, unwinders): bdb.sql_execute(out.getvalue(), out.getbindings()) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabCsv): with bdb.savepoint(): table_exists = core.bayesdb_has_table(bdb, phrase.name) if table_exists: if phrase.ifnotexists: return empty_cursor(bdb) else: raise BQLError( bdb, 'Table already exists: %s' % (repr(phrase.name), )) bayesdb_read_csv_file(bdb, phrase.name, phrase.csv, header=True, create=True) return empty_cursor(bdb) if 
isinstance(phrase, ast.CreateTabSim): assert isinstance(phrase.simulation, ast.Simulate) with bdb.savepoint(): if core.bayesdb_has_table(bdb, phrase.name): if phrase.ifnotexists: return empty_cursor(bdb) else: raise BQLError( bdb, 'Name already defined as table: %s' % (repr(phrase.name), )) if not core.bayesdb_has_population(bdb, phrase.simulation.population): raise BQLError( bdb, 'No such population: %s' % (phrase.simulation.population, )) population_id = core.bayesdb_get_population( bdb, phrase.simulation.population) generator_id = None if phrase.simulation.generator is not None: if not core.bayesdb_has_generator(bdb, population_id, phrase.simulation.generator): raise BQLError( bdb, 'No such generator: %r' % (phrase.simulation.generator, )) generator_id = core.bayesdb_get_generator( bdb, population_id, phrase.simulation.generator) table = core.bayesdb_population_table(bdb, population_id) qn = sqlite3_quote_name(phrase.name) qt = sqlite3_quote_name(table) column_names = phrase.simulation.columns qcns = map(sqlite3_quote_name, column_names) cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, )) column_sqltypes = {} for _colno, name, sqltype, _nonnull, _default, _primary in cursor: assert casefold(name) not in column_sqltypes column_sqltypes[casefold(name)] = sqltype assert 0 < len(column_sqltypes) for column_name in column_names: if casefold(column_name) not in column_sqltypes: raise BQLError( bdb, 'No such variable' ' in population %r: %s' % (phrase.simulation.population, column_name)) for column_name, _expression in phrase.simulation.constraints: cn = casefold(column_name) if (cn not in column_sqltypes and cn not in core.bayesdb_rowid_tokens(bdb)): raise BQLError( bdb, 'No such variable in population %s: %s' % (phrase.simulation.population, column_name)) # XXX Move to compiler.py. # XXX Copypasta of this in compile_simulate! 
out = compiler.Output(n_numpar, nampar_map, bindings) out.write('SELECT ') with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'): compiler.compile_nobql_expression(bdb, phrase.simulation.nsamples, out) for _column_name, expression in phrase.simulation.constraints: out.write(', ') compiler.compile_nobql_expression(bdb, expression, out) winders, unwinders = out.getwindings() with compiler.bayesdb_wind(bdb, winders, unwinders): cursor = bdb.sql_execute(out.getvalue(), out.getbindings()).fetchall() assert len(cursor) == 1 nsamples = cursor[0][0] assert isinstance(nsamples, int) def map_var(var): if casefold(var) not in core.bayesdb_rowid_tokens(bdb): return core.bayesdb_variable_number( bdb, population_id, generator_id, var) else: return casefold(var) def map_constraint(((var, _expression), value)): return (map_var(var), value) constraints = map( map_constraint, zip(phrase.simulation.constraints, cursor[0][1:])) colnos = map(map_var, column_names) schema = ','.join('%s %s' % (qcn, column_sqltypes[casefold(column_name)]) for qcn, column_name in zip(qcns, column_names)) bdb.sql_execute( 'CREATE %sTABLE %s%s (%s)' % ('TEMP ' if phrase.temp else '', 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, schema)) insert_sql = ''' INSERT INTO %s (%s) VALUES (%s) ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns)) for row in bqlfn.bayesdb_simulate( bdb, population_id, constraints, colnos, generator_id=generator_id, numpredictions=nsamples, accuracy=phrase.simulation.accuracy): bdb.sql_execute(insert_sql, row) return empty_cursor(bdb)
population_id, constraints, colnos, generator_id=generator_id, numpredictions=nsamples, accuracy=phrase.simulation.accuracy): bdb.sql_execute(insert_sql, row) return empty_cursor(bdb) if isinstance(phrase, ast.DropTab): with bdb.savepoint(): sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?' cursor = bdb.sql_execute(sql, (phrase.name, )) if 0 < cursor_value(cursor): raise BQLError( bdb, 'Table still in use by populations: %s' % (repr(phrase.name), )) bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?', (phrase.name, )) ifexists = 'IF EXISTS ' if phrase.ifexists else '' qt = sqlite3_quote_name(phrase.name) return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt)) if isinstance(phrase, ast.AlterTab): with bdb.savepoint(): table = phrase.table if not core.bayesdb_has_table(bdb, table): raise BQLError(bdb, 'No such table: %s' % (repr(table), )) for cmd in phrase.commands: if isinstance(cmd, ast.AlterTabRenameTab): # If the names differ only in case, we have to do
def simulate_models_rows(bdb, simulation):
    """Evaluate a SIMULATE ... FROM MODELS OF query.

    Each requested column expression is simulated per model; returns the
    results as rows (one per model), via zip over the per-column lists.
    """
    assert all(isinstance(c, ast.SelColExp) for c in simulation.columns)
    population_id = core.bayesdb_get_population(bdb, simulation.population)
    generator_id = None
    if simulation.generator is not None:
        if not core.bayesdb_has_generator(bdb, population_id,
                simulation.generator):
            raise BQLError(bdb,
                'No such generator: %r' % (simulation.generator,))
        generator_id = core.bayesdb_get_generator(bdb, population_id,
            simulation.generator)
    # Extract the Python value of a literal AST node.
    def retrieve_literal(expression):
        assert isinstance(expression, ast.ExpLit)
        lit = expression.value
        if isinstance(lit, ast.LitNull):
            return None
        elif isinstance(lit, ast.LitInt):
            return lit.value
        elif isinstance(lit, ast.LitFloat):
            return lit.value
        elif isinstance(lit, ast.LitString):
            return lit.value
        else:
            assert False
    # Map a variable name to its column number, or raise BQLError.
    def retrieve_variable(var):
        if not core.bayesdb_has_variable(bdb, population_id, generator_id,
                var):
            raise BQLError(bdb,
                'No such population variable: %s' % (var,))
        return core.bayesdb_variable_number(bdb, population_id, generator_id,
            var)
    # Simulate one column expression, returning a list with one value
    # per model.
    def simulate_column(exp):
        if isinstance(exp, ast.ExpCol):
            # XXX This is wrong -- it returns independent samples from
            # the marginals of each variable, not one sample from the
            # joint on all variables.
            # NOTE(review): this branch is disabled (`if False`), so
            # conditional simulation is off and constraints is always
            # the empty list.
            if False:
                raise BQLError(bdb,
                    'SIMULATE FROM MODELS OF can\'t sample conditional')
                # XXX Gotta weight each model by probability of
                # constraints.
                constraints = [(retrieve_variable(v), retrieve_literal(e))
                    for v, e in simulation.constraints]
            else:
                constraints = []
            colnos = [retrieve_variable(exp.column)]
            accuracy = 1  # XXX Allow nontrivial accuracy?
            samples = bqlfn.bayesdb_simulate(bdb, population_id, constraints,
                colnos, generator_id=generator_id, numpredictions=1,
                accuracy=accuracy)
            return [sample[0] for sample in samples]
        elif isinstance(exp, ast.ExpBQLDepProb):
            raise BQLError(bdb,
                'DEPENDENCE PROBABILITY simulation still unsupported.')
        elif isinstance(exp, ast.ExpBQLProbDensity):
            raise BQLError(bdb,
                'PROBABILITY DENSITY OF simulation still unsupported.')
        elif isinstance(exp, ast.ExpBQLMutInf):
            colnos0 = [retrieve_variable(c) for c in exp.columns0]
            colnos1 = [retrieve_variable(c) for c in exp.columns1]
            # Flatten the (colno, value) constraint pairs into the
            # alternating argument list _bql_column_mutual_information
            # expects.
            constraint_args = ()
            if exp.constraints is not None:
                constraint_args = tuple(itertools.chain.from_iterable([
                    [retrieve_variable(colname), retrieve_literal(expr)]
                    for colname, expr in exp.constraints
                ]))
            nsamples = exp.nsamples and retrieve_literal(exp.nsamples)
            # One mi_list per generator of the population.
            #
            # XXX fsaad@20170625: Setting modelnos = None arbitrarily,
            # figure out how to set the modelnos argument.
            mi_lists = bqlfn._bql_column_mutual_information(
                bdb, population_id, generator_id, None, colnos0, colnos1,
                nsamples, *constraint_args)
            return list(itertools.chain.from_iterable(mi_lists))
        else:
            raise BQLError(bdb,
                'Only constants can be simulated: %s.' % (simulation,))
    columns = [simulate_column(c.expression) for c in simulation.columns]
    # All queries must return the same number of rows, equal to the
    # number of models of all generators implied by the query.
    assert all(len(column) == len(columns[0]) for column in columns)
    # Convert the columns into rows.
    return zip(*columns)
def retrieve_variable(var):
    """Resolve the name `var` to its column number in the population.

    Raises BQLError when the population has no such variable.
    """
    # Guard clause: reject unknown variables up front.
    if not core.bayesdb_has_variable(
            bdb, population_id, generator_id, var):
        raise BQLError(bdb, 'No such population variable: %s' % (var,))
    return core.bayesdb_variable_number(
        bdb, population_id, generator_id, var)
def execute_phrase(bdb, phrase, bindings=()): """Execute the BQL AST phrase `phrase` and return a cursor of results.""" if isinstance(phrase, ast.Parametrized): n_numpar = phrase.n_numpar nampar_map = phrase.nampar_map phrase = phrase.phrase assert 0 < n_numpar else: n_numpar = 0 nampar_map = None # Ignore extraneous bindings. XXX Bad idea? if ast.is_query(phrase): # Compile the query in the transaction in case we need to # execute subqueries to determine column lists. Compiling is # a quick tree descent, so this should be fast. out = compiler.Output(n_numpar, nampar_map, bindings) with bdb.savepoint(): compiler.compile_query(bdb, phrase, out) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.Begin): txn.bayesdb_begin_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Rollback): txn.bayesdb_rollback_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Commit): txn.bayesdb_commit_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabAs): assert ast.is_query(phrase.query) with bdb.savepoint(): if core.bayesdb_has_table(bdb, phrase.name): if phrase.ifnotexists: return empty_cursor(bdb) else: raise BQLError( bdb, 'Name already defined as table: %s' % (repr(phrase.name), )) out = compiler.Output(n_numpar, nampar_map, bindings) qt = sqlite3_quote_name(phrase.name) temp = 'TEMP ' if phrase.temp else '' ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else '' out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt)) compiler.compile_query(bdb, phrase.query, out) winders, unwinders = out.getwindings() with compiler.bayesdb_wind(bdb, winders, unwinders): bdb.sql_execute(out.getvalue(), out.getbindings()) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabCsv): with bdb.savepoint(): table_exists = core.bayesdb_has_table(bdb, phrase.name) if table_exists: if phrase.ifnotexists: return empty_cursor(bdb) else: 
raise BQLError( bdb, 'Table already exists: %s' % (repr(phrase.name), )) bayesdb_read_csv_file(bdb, phrase.name, phrase.csv, header=True, create=True) return empty_cursor(bdb) if isinstance(phrase, ast.DropTab): with bdb.savepoint(): sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?' cursor = bdb.sql_execute(sql, (phrase.name, )) if 0 < cursor_value(cursor): raise BQLError( bdb, 'Table still in use by populations: %s' % (repr(phrase.name), )) bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?', (phrase.name, )) ifexists = 'IF EXISTS ' if phrase.ifexists else '' qt = sqlite3_quote_name(phrase.name) return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt)) if isinstance(phrase, ast.AlterTab): with bdb.savepoint(): table = phrase.table if not core.bayesdb_has_table(bdb, table): raise BQLError(bdb, 'No such table: %s' % (repr(table), )) for cmd in phrase.commands: if isinstance(cmd, ast.AlterTabRenameTab): # If the names differ only in case, we have to do # some extra work because SQLite will reject the # table rename. Note that we may even have table # == cmd.name here, but if the stored table name # differs in case from cmd.name, we want to update # it anyway. if casefold(table) == casefold(cmd.name): # Go via a temporary table. temp = table + '_temp' while core.bayesdb_has_table(bdb, temp): temp += '_temp' rename_table(bdb, table, temp) rename_table(bdb, temp, cmd.name) else: # Make sure nothing else has this name and # rename it. if core.bayesdb_has_table(bdb, cmd.name): raise BQLError( bdb, 'Name already defined as table' ': %s' % (repr(cmd.name), )) rename_table(bdb, table, cmd.name) # Remember the new name for subsequent commands. table = cmd.name elif isinstance(cmd, ast.AlterTabRenameCol): # XXX Need to deal with this in the compiler. raise NotImplementedError('Renaming columns' ' not yet implemented.') # Make sure the old name exist and the new name does not. 
old_folded = casefold(cmd.old) new_folded = casefold(cmd.new) if old_folded != new_folded: if not core.bayesdb_table_has_column( bdb, table, cmd.old): raise BQLError( bdb, 'No such column in table %s' ': %s' % (repr(table), repr(cmd.old))) if core.bayesdb_table_has_column(bdb, table, cmd.new): raise BQLError( bdb, 'Column already exists' ' in table %s: %s' % (repr(table), repr(cmd.new))) # Update bayesdb_column. Everything else refers # to columns by (tabname, colno) pairs rather than # by names. update_column_sql = ''' UPDATE bayesdb_column SET name = :new WHERE tabname = :table AND name = :old ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_column_sql, { 'table': table, 'old': cmd.old, 'new': cmd.new, }) assert bdb._sqlite3.totalchanges() - total_changes == 1 # ...except metamodels may have the (case-folded) # name cached. if old_folded != new_folded: generators_sql = ''' SELECT id FROM bayesdb_generator WHERE tabname = ? ''' cursor = bdb.sql_execute(generators_sql, (table, )) for (generator_id, ) in cursor: metamodel = core.bayesdb_generator_metamodel( bdb, generator_id) metamodel.rename_column(bdb, generator_id, old_folded, new_folded) else: assert False, 'Invalid alter table command: %s' % \ (cmd,) return empty_cursor(bdb) if isinstance(phrase, ast.GuessSchema): if not core.bayesdb_has_table(bdb, phrase.table): raise BQLError(bdb, 'No such table : %s' % phrase.table) out = compiler.Output(0, {}, {}) with bdb.savepoint(): qt = sqlite3_quote_name(phrase.table) temptable = bdb.temp_table_name() qtt = sqlite3_quote_name(temptable) cursor = bdb.sql_execute('SELECT * FROM %s' % (qt, )) column_names = [d[0] for d in cursor.description] rows = cursor.fetchall() stattypes = bayesdb_guess_stattypes(column_names, rows) distinct_value_counts = [ len(set([row[i] for row in rows])) for i in range(len(column_names)) ] out.winder( ''' CREATE TEMP TABLE %s (column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT) ''' % (qtt), ()) for cn, st, ct in 
zip(column_names, stattypes, distinct_value_counts): out.winder( ''' INSERT INTO %s VALUES (?, ?, ?, ?) ''' % (qtt), (cn, st[0], ct, st[1])) out.write('SELECT * FROM %s' % (qtt, )) out.unwinder('DROP TABLE %s' % (qtt, ), ()) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.CreatePop): with bdb.savepoint(): _create_population(bdb, phrase) return empty_cursor(bdb) if isinstance(phrase, ast.DropPop): with bdb.savepoint(): if not core.bayesdb_has_population(bdb, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such population: %r' % (phrase.name, )) population_id = core.bayesdb_get_population(bdb, phrase.name) generator_ids = core.bayesdb_population_generators( bdb, population_id) if generator_ids: generators = [ core.bayesdb_generator_name(bdb, gid) for gid in generator_ids ] raise BQLError( bdb, 'Population %r still has metamodels: %r' % (phrase.name, generators)) # XXX helpful error checking if generators still exist # XXX check change counts bdb.sql_execute( ''' DELETE FROM bayesdb_variable WHERE population_id = ? ''', (population_id, )) bdb.sql_execute( ''' DELETE FROM bayesdb_population WHERE id = ? ''', (population_id, )) return empty_cursor(bdb) if isinstance(phrase, ast.AlterPop): with bdb.savepoint(): population = phrase.population if not core.bayesdb_has_population(bdb, population): raise BQLError(bdb, 'No such population: %s' % (repr(population), )) population_id = core.bayesdb_get_population(bdb, population) for cmd in phrase.commands: if isinstance(cmd, ast.AlterPopAddVar): # Ensure column exists in base table. table = core.bayesdb_population_table(bdb, population_id) if not core.bayesdb_table_has_column(bdb, table, cmd.name): raise BQLError( bdb, 'No such variable in base table: %s' % (cmd.name)) # Ensure variable not already in population. 
if core.bayesdb_has_variable(bdb, population_id, None, cmd.name): raise BQLError( bdb, 'Variable already in population: %s' % (cmd.name)) # Ensure there is at least observation in the column. qt = sqlite3_quote_name(table) qc = sqlite3_quote_name(cmd.name) cursor = bdb.sql_execute( 'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' % (qt, qc)) if cursor_value(cursor) == 0: raise BQLError( bdb, 'Cannot add variable without any values: %s' % (cmd.name)) # If stattype is None, guess. if cmd.stattype is None: cursor = bdb.sql_execute('SELECT %s FROM %s' % (qc, qt)) rows = cursor.fetchall() [stattype, reason] = bayesdb_guess_stattypes([cmd.name], rows)[0] # Fail if trying to model a key. if stattype == 'key': raise BQLError( bdb, 'Values in column %s appear to be keys.' % (cmd.name, )) # Fail if cannot determine a stattype. elif stattype == 'ignore': raise BQLError( bdb, 'Failed to determine a stattype for %s, ' 'please specify one manually.' % (cmd.name, )) # If user specified stattype, ensure it exists. elif not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError(bdb, 'Invalid stattype: %s' % (cmd.stattype)) else: stattype = cmd.stattype # Check that strings are not being modeled as numerical. if stattype == 'numerical' \ and _column_contains_string(bdb, table, cmd.name): raise BQLError( bdb, 'Numerical column contains string values: %r ' % (qc, )) with bdb.savepoint(): # Add the variable to the population. core.bayesdb_add_variable(bdb, population_id, cmd.name, stattype) colno = core.bayesdb_variable_number( bdb, population_id, None, cmd.name) # Add the variable to each (initialized) metamodel in # the population. generator_ids = filter( lambda g: core.bayesdb_generator_modelnos(bdb, g), core.bayesdb_population_generators( bdb, population_id), ) for generator_id in generator_ids: # XXX Omit needless bayesdb_generator_column table # Github issue #441. 
bdb.sql_execute( ''' INSERT INTO bayesdb_generator_column VALUES (:generator_id, :colno, :stattype) ''', { 'generator_id': generator_id, 'colno': colno, 'stattype': stattype, }) metamodel = core.bayesdb_generator_metamodel( bdb, generator_id) metamodel.add_column(bdb, generator_id, colno) elif isinstance(cmd, ast.AlterPopStatType): # Check the no metamodels are defined for this population. generators = core.bayesdb_population_generators( bdb, population_id) if generators: raise BQLError( bdb, 'Cannot update statistical types for population ' '%s, it has metamodels: %s' % ( repr(population), repr(generators), )) # Check all the variables are in the population. unknown = [ c for c in cmd.names if not core.bayesdb_has_variable( bdb, population_id, None, c) ] if unknown: raise BQLError( bdb, 'No such variables in population: %s' % (repr(unknown))) # Check the statistical type is valid. if not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError( bdb, 'Invalid statistical type: %r' % (repr(cmd.stattype), )) # Check that strings are not being modeled as numerical. if cmd.stattype == 'numerical': table = core.bayesdb_population_table( bdb, population_id) numerical_string_vars = [ col for col in cmd.names if _column_contains_string(bdb, table, col) ] if numerical_string_vars: raise BQLError( bdb, 'Columns with string values modeled as ' 'numerical: %r' % (numerical_string_vars, )) # Perform the stattype update. colnos = [ core.bayesdb_variable_number(bdb, population_id, None, c) for c in cmd.names ] qcolnos = ','.join('%d' % (colno, ) for colno in colnos) update_stattype_sql = ''' UPDATE bayesdb_variable SET stattype = ? WHERE population_id = ? AND colno IN (%s) ''' % (qcolnos, ) bdb.sql_execute(update_stattype_sql, ( casefold(cmd.stattype), population_id, )) else: assert False, 'Invalid ALTER POPULATION command: %s' % \ (repr(cmd),) return empty_cursor(bdb) if isinstance(phrase, ast.CreateGen): # Find the population. 
if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population, )) population_id = core.bayesdb_get_population(bdb, phrase.population) table = core.bayesdb_population_table(bdb, population_id) # Find the metamodel, or use the default. metamodel_name = phrase.metamodel if phrase.metamodel is None: metamodel_name = 'cgpm' if metamodel_name not in bdb.metamodels: raise BQLError(bdb, 'No such metamodel: %s' % (repr(metamodel_name), )) metamodel = bdb.metamodels[metamodel_name] with bdb.savepoint(): if core.bayesdb_has_generator(bdb, population_id, phrase.name): if not phrase.ifnotexists: raise BQLError( bdb, 'Name already defined as generator: %s' % (repr(phrase.name), )) else: # Insert a record into bayesdb_generator and get the # assigned id. bdb.sql_execute( ''' INSERT INTO bayesdb_generator (name, tabname, population_id, metamodel) VALUES (?, ?, ?, ?) ''', (phrase.name, table, population_id, metamodel.name())) generator_id = core.bayesdb_get_generator( bdb, population_id, phrase.name) # Populate bayesdb_generator_column. # # XXX Omit needless bayesdb_generator_column table -- # Github issue #441. bdb.sql_execute( ''' INSERT INTO bayesdb_generator_column (generator_id, colno, stattype) SELECT :generator_id, colno, stattype FROM bayesdb_variable WHERE population_id = :population_id AND generator_id IS NULL ''', { 'generator_id': generator_id, 'population_id': population_id, }) # Do any metamodel-specific initialization. metamodel.create_generator(bdb, generator_id, phrase.schema, baseline=phrase.baseline) # Populate bayesdb_generator_column with any latent # variables that metamodel.create_generator has added # with bayesdb_add_latent. 
bdb.sql_execute( ''' INSERT INTO bayesdb_generator_column (generator_id, colno, stattype) SELECT :generator_id, colno, stattype FROM bayesdb_variable WHERE population_id = :population_id AND generator_id = :generator_id ''', { 'generator_id': generator_id, 'population_id': population_id, }) # All done. Nothing to return. return empty_cursor(bdb) if isinstance(phrase, ast.DropGen): with bdb.savepoint(): if not core.bayesdb_has_generator(bdb, None, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such generator: %s' % (repr(phrase.name), )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.name) metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) # Metamodel-specific destruction. metamodel.drop_generator(bdb, generator_id) # Drop the columns, models, and, finally, generator. drop_columns_sql = ''' DELETE FROM bayesdb_generator_column WHERE generator_id = ? ''' bdb.sql_execute(drop_columns_sql, (generator_id, )) drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? ''' bdb.sql_execute(drop_model_sql, (generator_id, )) drop_generator_sql = ''' DELETE FROM bayesdb_generator WHERE id = ? ''' bdb.sql_execute(drop_generator_sql, (generator_id, )) return empty_cursor(bdb) if isinstance(phrase, ast.AlterGen): with bdb.savepoint(): generator = phrase.generator if not core.bayesdb_has_generator(bdb, None, generator): raise BQLError(bdb, 'No such generator: %s' % (repr(generator), )) generator_id = core.bayesdb_get_generator(bdb, None, generator) cmds_generic = [] for cmd in phrase.commands: if isinstance(cmd, ast.AlterGenRenameGen): # Disable modelnos with AlterGenRenameGen. if phrase.modelnos is not None: raise BQLError(bdb, 'Cannot specify models for RENAME') # Make sure nothing else has this name. 
if casefold(generator) != casefold(cmd.name): if core.bayesdb_has_table(bdb, cmd.name): raise BQLError( bdb, 'Name already defined as table' ': %s' % (repr(cmd.name), )) if core.bayesdb_has_generator(bdb, None, cmd.name): raise BQLError( bdb, 'Name already defined' ' as generator: %s' % (repr(cmd.name), )) # Update bayesdb_generator. Everything else # refers to it by id. update_generator_sql = ''' UPDATE bayesdb_generator SET name = ? WHERE id = ? ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_generator_sql, (cmd.name, generator_id)) assert bdb._sqlite3.totalchanges() - total_changes == 1 # Remember the new name for subsequent commands. generator = cmd.name elif isinstance(cmd, ast.AlterGenGeneric): cmds_generic.append(cmd.command) else: assert False, 'Invalid ALTER GENERATOR command: %s' % \ (repr(cmd),) if cmds_generic: modelnos = phrase.modelnos modelnos_invalid = None if modelnos is None else [ modelno for modelno in modelnos if not core.bayesdb_generator_has_model( bdb, generator_id, modelno) ] if modelnos_invalid: raise BQLError( bdb, 'No such models in generator %s: %s' % (repr(phrase.generator), repr(modelnos))) # Call generic alternations on the metamodel. metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) metamodel.alter(bdb, generator_id, modelnos, cmds_generic) return empty_cursor(bdb) if isinstance(phrase, ast.InitModels): if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) modelnos = range(phrase.nmodels) with bdb.savepoint(): # Find the model numbers. Omit existing ones for # ifnotexists; reject existing ones otherwise. 
if phrase.ifnotexists: modelnos = set(modelno for modelno in modelnos if not core.bayesdb_generator_has_model( bdb, generator_id, modelno)) else: existing = set(modelno for modelno in modelnos if core.bayesdb_generator_has_model( bdb, generator_id, modelno)) if 0 < len(existing): raise BQLError( bdb, 'Generator %s already has models: %s' % (repr(phrase.generator), sorted(existing))) # Stop now if there's nothing to initialize. if len(modelnos) == 0: return # Create the bayesdb_generator_model records. modelnos = sorted(modelnos) insert_model_sql = ''' INSERT INTO bayesdb_generator_model (generator_id, modelno, iterations) VALUES (:generator_id, :modelno, :iterations) ''' for modelno in modelnos: bdb.sql_execute( insert_model_sql, { 'generator_id': generator_id, 'modelno': modelno, 'iterations': 0, }) # Do metamodel-specific initialization. metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) metamodel.initialize_models(bdb, generator_id, modelnos) return empty_cursor(bdb) if isinstance(phrase, ast.AnalyzeModels): if not phrase.wait: raise NotImplementedError('No background analysis -- use WAIT.') # WARNING: It is the metamodel's responsibility to work in a # transaction. # # WARNING: It is the metamodel's responsibility to update the # iteration count in bayesdb_generator_model records. # # We do this so that the metamodel can save incremental # progress in case of ^C in the middle. # # XXX Put these warning somewhere more appropriate. if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) # XXX Should allow parameters for iterations and ckpt/iter. 
metamodel.analyze_models(bdb, generator_id, modelnos=phrase.modelnos, iterations=phrase.iterations, max_seconds=phrase.seconds, ckpt_iterations=phrase.ckpt_iterations, ckpt_seconds=phrase.ckpt_seconds, program=phrase.program) return empty_cursor(bdb) if isinstance(phrase, ast.DropModels): with bdb.savepoint(): generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) modelnos = None if phrase.modelnos is not None: lookup_model_sql = ''' SELECT COUNT(*) FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' modelnos = sorted(list(phrase.modelnos)) for modelno in modelnos: cursor = bdb.sql_execute(lookup_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) if cursor_value(cursor) == 0: raise BQLError( bdb, 'No such model' ' in generator %s: %s' % (repr(phrase.generator), repr(modelno))) metamodel.drop_models(bdb, generator_id, modelnos=modelnos) if modelnos is None: drop_models_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? ''' bdb.sql_execute(drop_models_sql, (generator_id, )) else: drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' for modelno in modelnos: bdb.sql_execute(drop_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) return empty_cursor(bdb) if isinstance(phrase, ast.Regress): # Retrieve the population. if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population, )) population_id = core.bayesdb_get_population(bdb, phrase.population) # Retrieve the metamodel. generator_id = None if phrase.metamodel: if not core.bayesdb_has_generator(bdb, population_id, phrase.metamodel): raise BQLError(bdb, 'No such metamodel: %r' % (phrase.population, )) generator_id = core.bayesdb_get_generator(bdb, population_id, phrase.metamodel) # Retrieve the target variable. 
if not core.bayesdb_has_variable(bdb, population_id, None, phrase.target): raise BQLError(bdb, 'No such variable: %r' % (phrase.target, )) colno_target = core.bayesdb_variable_number(bdb, population_id, None, phrase.target) if core.bayesdb_variable_stattype(bdb, population_id, colno_target) != \ 'numerical': raise BQLError( bdb, 'Target variable is not numerical: %r' % (phrase.target, )) # Build the given variables. if any(isinstance(col, ast.SelColAll) for col in phrase.givens): # Using * is not allowed to be mixed with other variables. if len(phrase.givens) > 1: raise BQLError(bdb, 'Cannot use (*) with other givens.') colno_givens = core.bayesdb_variable_numbers( bdb, population_id, None) else: if any(isinstance(col, ast.SelColSub) for col in phrase.givens): # Subexpression needs special compiling. out = compiler.Output(n_numpar, nampar_map, bindings) bql_compiler = compiler.BQLCompiler_None() givens = compiler.expand_select_columns( bdb, phrase.givens, True, bql_compiler, out) else: givens = phrase.givens colno_givens = [ core.bayesdb_variable_number(bdb, population_id, None, given.expression.column) for given in givens ] # Build the arguments to bqlfn.bayesdb_simulate. colno_givens_unique = set(colno for colno in colno_givens if colno != colno_target) if len(colno_givens_unique) == 0: raise BQLError(bdb, 'No matching given columns.') constraints = [] colnos = [colno_target] + list(colno_givens_unique) nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value modelnos = None if phrase.modelnos is None else str(phrase.modelnos) rows = bqlfn.bayesdb_simulate(bdb, population_id, generator_id, modelnos, constraints, colnos, numpredictions=nsamp) # Retrieve the stattypes. stattypes = [ core.bayesdb_variable_stattype(bdb, population_id, colno_given) for colno_given in colno_givens_unique ] # Separate the target values from the given values. 
target_values = [row[0] for row in rows] given_values = [row[1:] for row in rows] given_names = [ core.bayesdb_variable_name(bdb, population_id, given) for given in colno_givens_unique ] # Compute the coefficients. The import to regress_ols is here since the # feature depends on pandas + sklearn, so avoid module-wide import. from bayeslite.regress import regress_ols coefficients = regress_ols(target_values, given_values, given_names, stattypes) # Store the results in a winder. temptable = bdb.temp_table_name() qtt = sqlite3_quote_name(temptable) out = compiler.Output(0, {}, {}) out.winder( ''' CREATE TEMP TABLE %s (variable TEXT, coefficient REAL); ''' % (qtt, ), ()) for variable, coef in coefficients: out.winder( ''' INSERT INTO %s VALUES (?, ?) ''' % (qtt), ( variable, coef, )) out.write('SELECT * FROM %s ORDER BY variable' % (qtt, )) out.unwinder('DROP TABLE %s' % (qtt, ), ()) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) assert False # XXX
def retrieve_analyze_variables(ast):
    """Return (varnos, optimized?) for the variables ANALYZE targets.

    By default every population variable is transitioned; at most one
    VARIABLES or SKIP clause may narrow the selection.  Uses `bdb`,
    `population_id`, and `generator_id` from the enclosing scope.
    """
    # `None' means `transition all variables' by default.
    chosen = None
    # Exactly 1 VARIABLES or SKIP clause supported for simplicity.
    have_variables = False
    have_skip = False
    have_optimized = False
    for clause in ast:
        if isinstance(clause, cgpm_analyze.parse.Variables):
            # Transition only the user-specified variables.
            if have_variables or have_skip:
                raise BQLError(
                    bdb,
                    'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            have_variables = True
            requested = set(clause.vars)
            missing = set(
                var for var in requested
                if not core.bayesdb_has_variable(
                    bdb, population_id, generator_id, var))
            if missing:
                raise BQLError(
                    bdb,
                    'Unknown variables in ANALYZE: %r' % (sorted(missing), ))
            chosen = sorted(requested)
        elif isinstance(clause, cgpm_analyze.parse.Skip):
            # Transition every variable except the user-specified skips.
            if have_variables or have_skip:
                raise BQLError(
                    bdb,
                    'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            have_skip = True
            dropped = set(clause.vars)
            missing = set(
                var for var in dropped
                if not core.bayesdb_has_variable(
                    bdb, population_id, generator_id, var))
            if missing:
                raise BQLError(
                    bdb,
                    'Unknown variables in ANALYZE: %r' % (sorted(missing), ))
            everything = core.bayesdb_variable_names(
                bdb, population_id, generator_id)
            chosen = sorted(set(everything) - dropped)
        elif isinstance(clause, cgpm_analyze.parse.Optimized):
            have_optimized = True
        else:
            # Unknown/impossible clause.
            raise ValueError('Unknown clause in ANALYZE: %s.' % ast)
    if chosen is None:
        chosen = core.bayesdb_variable_names(
            bdb, population_id, generator_id)
    varnos = [
        core.bayesdb_variable_number(bdb, population_id, generator_id, v)
        for v in chosen
    ]
    # TODO Perform error checking if the OPTIMIZED clause is used.  In
    # particular, the variables in OPTIMIZED must correspond EXACTLY to
    # the variables modeled by the CrossCat baseline.  That check is
    # avoided for now since the nature of a variable is not stored in
    # the bdb; for now, just reject an explicit VARIABLES clause.
    if have_optimized and have_variables:
        raise BQLError(bdb, 'OPTIMIZED incompatible with VARIABLES')
    # TODO Check if varnos are exactly the CrossCat variables.
    return varnos, have_optimized
def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
        constraints, num_samples=1, accuracy=None):
    """Simulate `targets` jointly, given `constraints`, via a Loom server.

    Returns a list of `num_samples` rows, each a list of simulated values
    in the same order as `targets` (column numbers).  Values for nominal
    stattypes are returned as-is; all others are coerced to float.

    NOTE(review): `modelnos` and `accuracy` are accepted but never used
    in this body -- presumably for interface compatibility; confirm.
    """
    # Retrieve the population id.
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # Prepare list of full constraints, potentially adding data from table.
    constraints_full = constraints
    # If rowid exist in base table, retrieve conditioning data.
    # Conditioning values are fetched for any rowid that exists in the base
    # table irrespective of whether the rowid is incorporated in the Loom
    # model or whether it was added after creation.
    if bayesdb_table_has_rowid(bdb, table, rowid):
        # Fetch population column numbers and row values.
        colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
        rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
        # Treat every non-null cell that is not itself a simulation
        # target as an implicit constraint.
        observations = [
            (colno, rowval)
            for colno, rowval in zip(colnos, rowvals)
            if rowval is not None and colno not in targets
        ]
        # Raise error if a constraint overrides an observed cell.
        colnos_constrained = [constraint[0] for constraint in constraints]
        colnos_observed = [observation[0] for observation in observations]
        if set.intersection(set(colnos_constrained), set(colnos_observed)):
            raise BQLError(
                bdb, 'Overlap between constraints and'
                ' target row in simulate.')
        # Update the constraints.
        constraints_full = constraints + observations
    # Store mapping from target column name to column number and stattype.
    # NOTE(review): `bayesdb_variable_name` is called here with
    # (bdb, generator_id, None, colno) while `bayesdb_variable_stattype`
    # below gets (bdb, population_id, None, colno) -- the inconsistent
    # first argument looks suspicious; verify the expected signatures.
    target_colno_to_name = {
        colno: bayesdb_variable_name(bdb, generator_id, None, colno)
        for colno in targets
    }
    target_colno_to_stattype = {
        colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
        for colno in targets
    }
    # Construct the CSV row for targets.  Target cells are left blank
    # ('') so that the Loom server fills them in.
    row_targets = {target_colno_to_name[colno]: '' for colno in targets}
    row_constraints = {
        bayesdb_variable_name(bdb, generator_id, None, colno): value
        for colno, value in constraints_full
    }
    row = dict(
        itertools.chain(row_targets.iteritems(),
                        row_constraints.iteritems()))
    # Fetch the server.
    server = self._get_preql_server(bdb, generator_id)
    # Prepare the csv header and values.
    csv_headers = map(str, row.iterkeys())
    csv_values = map(str, row.itervalues())
    # Prepare streams for the server.
    outfile = StringIO()
    writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
    reader = iter([csv_headers] + [csv_values])
    # Obtain the prediction.
    server._predict(reader, num_samples, writer, False)
    # Parse the CSV output.
    output_csv = writer.result()
    # The Loom server emits CRLF-terminated CSV lines.
    output_rows = output_csv.strip().split('\r\n')
    # Extract the header of the CSV file.
    header = output_rows[0].split(CSV_DELIMITER)
    # Extract list of simulated rows.  Each simulated row is represented
    # as a dictionary mapping column name to its simulated value.
    simulated_rows = [
        dict(zip(header, row.split(CSV_DELIMITER)))
        for row in output_rows[1:]
    ]
    # Prepare the return list of simulated_rows.
    def _extract_simulated_value(row, colno):
        # Look up the simulated value by column name; convert to float
        # unless the target's stattype is nominal.
        colname = target_colno_to_name[colno]
        stattype = target_colno_to_stattype[colno]
        value = row[colname]
        return value if _is_nominal(stattype) else float(value)
    # Return the list of samples.
    return [[_extract_simulated_value(row, colno) for colno in targets]
        for row in simulated_rows]
def _retrieve_analyze_variables(bdb, generator_id, ast):
    """Resolve which variables an ANALYZE program should transition.

    Returns a pair ``(variable_numbers, seen_optimized)``:
    ``variable_numbers`` is a list of column numbers to transition, or
    ``None`` when all variables should be transitioned (the default);
    ``seen_optimized`` reports whether an OPTIMIZED clause appeared.

    Raises `BQLError` if more than one VARIABLES/SKIP clause is given,
    if any named variable is unknown, or if OPTIMIZED is combined with
    VARIABLES or SKIP.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)

    def _known_variables(names):
        # Shared validation for VARIABLES and SKIP: reject any variable
        # not known to the population/generator, and return the names
        # as a set.
        names = set(names)
        unknown = set(
            var for var in names
            if not core.bayesdb_has_variable(bdb, population_id,
                generator_id, var))
        if unknown:
            raise BQLError(
                bdb,
                'Unknown variables in ANALYZE: %r' % (sorted(unknown), ))
        return names

    # Transitions all variables by default.
    variables = None
    # Exactly 1 VARIABLES or SKIP clause supported for simplicity.
    seen_variables, seen_skip, seen_optimized = False, False, False
    for clause in ast:
        # Transition user specified variables only.
        if isinstance(clause, cgpm_analyze.parse.Variables):
            if seen_variables or seen_skip:
                raise BQLError(
                    bdb,
                    'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            seen_variables = True
            variables = sorted(_known_variables(clause.vars))
        # Transition all variables except user specified skip.
        elif isinstance(clause, cgpm_analyze.parse.Skip):
            if seen_variables or seen_skip:
                raise BQLError(
                    bdb,
                    'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            seen_skip = True
            excluded = _known_variables(clause.vars)
            all_vars = core.bayesdb_variable_names(bdb, population_id,
                generator_id)
            variables = sorted(set(all_vars) - excluded)
        # OPTIMIZED is incompatible with any other clause.
        elif isinstance(clause, cgpm_analyze.parse.Optimized):
            seen_optimized = True
        # Unknown/impossible clause.
        else:
            raise BQLError(bdb, 'Unknown clause in ANALYZE: %s.' % (ast, ))
    # OPTIMIZED is incompatible with any other clause.
    if seen_optimized:
        if seen_variables or seen_skip:
            raise BQLError(bdb, 'OPTIMIZED incompatible with other clauses.')
    variable_numbers = [
        core.bayesdb_variable_number(bdb, population_id, generator_id, v)
        for v in variables
    ] if variables else None
    return (variable_numbers, seen_optimized)
def analyze_models(self, bdb, generator_id, modelnos=None, iterations=None,
        max_seconds=None, ckpt_iterations=None, ckpt_seconds=None,
        program=None):
    """Run analysis (inference transitions) for a CGPM generator.

    `program` is a raw ANALYZE sub-program (parsed by
    `cgpm_analyze.parse.parse`); `iterations`/`max_seconds` bound the
    amount of work.  `modelnos` and checkpointing are rejected as
    unsupported.  The mutated engine is re-serialized into
    `bayesdb_cgpm_generator` at the end.
    """
    # Not sure why model-based analysis is useful.
    if modelnos:
        raise NotImplementedError('CGpm analysis by models not supported.')
    # XXX https://github.com/probcomp/cgpm/issues/167
    if ckpt_iterations is not None or ckpt_seconds is not None:
        raise NotImplementedError(
            'CGpm analysis checkpoint not supported.')
    if program is None:
        program = []
    # Retrieve the engine.
    engine = self._engine(bdb, generator_id)
    # Retrieve user-specified target variables to transition.
    analyze_ast = cgpm_analyze.parse.parse(program)
    vars_user, optimized = _retrieve_analyze_variables(
        bdb, generator_id, analyze_ast)
    # Transitions all baseline variables only using lovecat.
    if optimized:
        engine.transition_lovecat(N=iterations, S=max_seconds,
            multiprocess=self._multiprocess)
    # More complex possibilities if using cgpm.
    else:
        # XXX Retrieve all, baseline, and foreign variable indices.
        # NOTE(review): assumes all engine states share the same
        # baseline/foreign variable partition, since only states[0] is
        # inspected -- confirm.
        state = engine.states[0]
        vars_baseline = state.outputs
        vars_foreign = list(
            itertools.chain.from_iterable([
                cgpm.outputs for cgpm in state.hooked_cgpms.itervalues()
            ]))
        # By default transition all baseline variables only.
        vars_target_baseline = vars_baseline
        vars_target_foreign = None
        # Partition user-specified variables into baseline and foreign.
        if vars_user:
            intersection = lambda a, b: [x for x in a if x in b]
            vars_target_baseline = intersection(vars_user, vars_baseline)
            vars_target_foreign = intersection(vars_user, vars_foreign)
            # Every user variable must land in one of the two buckets.
            assert vars_target_baseline or vars_target_foreign
        # Timed analysis is incompatible with mixed baseline and foreign.
        if max_seconds and (vars_target_baseline and vars_target_foreign):
            raise BQLError(
                bdb,
                'Timed analysis accepts foreign xor baseline variables.')
        # Run transitions on baseline variables.
        if vars_target_baseline:
            engine.transition(N=iterations, S=max_seconds,
                cols=vars_target_baseline,
                multiprocess=self._multiprocess)
        # Run transitions on foreign variables.
        if vars_target_foreign:
            engine.transition_foreign(N=iterations, S=max_seconds,
                cols=vars_target_foreign,
                multiprocess=self._multiprocess)
    # Serialize the engine.
    engine_json = json_dumps(engine.to_metadata())
    # Update the engine.
    bdb.sql_execute(
        '''
        UPDATE bayesdb_cgpm_generator
            SET engine_json = :engine_json
            WHERE generator_id = :generator_id
    ''', {
            'generator_id': generator_id,
            'engine_json': engine_json
        })
def _create_schema(bdb, generator_id, schema_ast, **kwargs):
    """Compile a CGPM generator schema AST into a schema dict.

    Processes Basic, Latent, Foreign, and Subsample clauses, resolving
    each variable's statistical type and distribution, and returns a
    dict with keys 'variables', 'cgpm_composition', 'subsample', and
    'latents'.  Raises `BQLError` on duplicate, unknown, or
    unmodellable variables and on unknown statistical types.

    NOTE(review): mutates `schema_ast` in place (appends synthesized
    Latent/Foreign clauses) -- callers should not reuse the AST.
    """
    # Get some parameters.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    # NOTE(review): `table` appears unused in this body -- confirm.
    table = core.bayesdb_population_table(bdb, population_id)

    # State.
    variables = []            # [var, stattype, dist, params] records
    variable_dist = {}        # var -> (stattype, dist, params)
    latents = {}              # latent var -> stattype
    cgpm_composition = []     # foreign-model composition records
    modelled = set()          # vars modelled by anything
    default_modelled = set()  # vars modelled by the default Crosscat model
    subsample = None
    # Slots to be filled in once all variable stattypes are known; the
    # stored lists are shared (aliased) with `distargs` below.
    deferred_input = defaultdict(lambda: [])
    deferred_output = dict()

    # Error-reporting state.
    duplicate = set()
    unknown = set()
    needed = set()
    existing_latent = set()
    must_exist = []
    unknown_stattype = {}

    # XXX Convert all Foreign.exposed lists to Latent clauses.
    # Retrieve Foreign clauses with exposed variables.
    foreign_clauses = [
        c for c in schema_ast
        if isinstance(c, cgpm_schema.parse.Foreign) and len(c.exposed) > 0
    ]
    # Add the exposed variables to Foreign.outputs
    # Note that this assumes if there are K exposed variables, then they are
    # necessarily the last K outputs of the fc.outputs.
    for fc in foreign_clauses:
        fc.outputs.extend([e[0] for e in fc.exposed])

    # Convert exposed entries into Latent clauses.
    latent_vars = list(
        itertools.chain.from_iterable(c.exposed for c in foreign_clauses))
    latent_clauses = [cgpm_schema.parse.Latent(v, s) for (v, s) in latent_vars]
    # Append the Latent clauses to the ast.
    schema_ast.extend(latent_clauses)

    # XXX Convert the baseline to a Foreign clause.
    # Currently the baselines do not accept a schema, and will fail if
    # `schema_ast` has any entries.
    baseline = kwargs.get('baseline', None)
    if baseline is not None and casefold(baseline.name) != 'crosscat':
        if schema_ast:
            raise BQLError(
                bdb, 'Cannot accept schema with baseline: %s.' % schema_ast)
        # Retrieve all variable names in the population
        outputs = core.bayesdb_variable_names(bdb, population_id, None)
        # Convert the LITERAL namedtuples to their raw values.
        ps, vs = zip(*baseline.params)
        vs_new = [v.value for v in vs]
        params = zip(ps, vs_new)
        # Create the clause.
        clause = cgpm_schema.parse.Foreign(outputs, [], [], baseline.name,
            params)
        # And add append it to the schema_ast.
        schema_ast.append(clause)

    # Process each clause one by one.
    for clause in schema_ast:
        if isinstance(clause, cgpm_schema.parse.Basic):
            # Basic Crosscat component model: one variable to be put
            # into Crosscat views.
            var = clause.var
            dist = clause.dist
            params = dict(clause.params)  # XXX error checking
            # Reject if the variable does not exist.
            if not core.bayesdb_has_variable(bdb, population_id, None, var):
                unknown.add(var)
                continue
            # Reject if the variable has already been modelled.
            if var in modelled:
                duplicate.add(var)
                continue
            # Reject if the variable is latent.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue
            # Get the column number.
            colno = core.bayesdb_variable_number(bdb, population_id, None,
                var)
            assert 0 <= colno
            # Add it to the list and mark it modelled by default.
            stattype = core.bayesdb_variable_stattype(bdb, population_id,
                colno)
            variables.append([var, stattype, dist, params])
            assert var not in variable_dist
            variable_dist[var] = (stattype, dist, params)
            modelled.add(var)
            default_modelled.add(var)
        elif isinstance(clause, cgpm_schema.parse.Latent):
            var = clause.name
            stattype = clause.stattype
            # Reject if the variable has already been modelled by the
            # default model.
            if var in default_modelled:
                duplicate.add(var)
                continue
            # Reject if the variable even *exists* in the population
            # at all yet.
            if core.bayesdb_has_variable(bdb, population_id, None, var):
                duplicate.add(var)
                continue
            # Reject if the variable is already latent, from another
            # generator.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue
            # Reject if we've already processed it.
            if var in latents:
                duplicate.add(var)
                continue
            # Add it to the set of latent variables.
            latents[var] = stattype
        elif isinstance(clause, cgpm_schema.parse.Foreign):
            # Foreign model: some set of output variables is to be
            # modelled by foreign logic, possibly conditional on some
            # set of input variables.
            #
            # Gather up the state for a cgpm_composition record, which
            # we may have to do incrementally because it must refer to
            # the distribution types of variables we may not have
            # seen.
            name = clause.name
            outputs = clause.outputs
            inputs = clause.inputs
            # These lists are filled in later via deferred_input /
            # deferred_output, which alias them.
            output_stattypes = []
            output_statargs = []
            input_stattypes = []
            input_statargs = []
            distargs = {
                'inputs': {
                    'stattypes': input_stattypes,
                    'statargs': input_statargs
                },
                'outputs': {
                    'stattypes': output_stattypes,
                    'statargs': output_statargs,
                }
            }
            kwds = {'distargs': distargs}
            kwds.update(clause.params)
            # First make sure all the output variables exist and have
            # not yet been modelled.
            for var in outputs:
                must_exist.append(var)
                if var in modelled:
                    duplicate.add(var)
                    continue
                modelled.add(var)
                # Add the output statistical type and its parameters.
                i = len(output_stattypes)
                assert i == len(output_statargs)
                output_stattypes.append(None)
                output_statargs.append(None)
                deferred_output[var] = (output_stattypes, output_statargs, i)
            # Next make sure all the input variables exist, mark them
            # needed, and record where to put their distribution type
            # and parameters.
            for var in inputs:
                must_exist.append(var)
                needed.add(var)
                i = len(input_stattypes)
                assert i == len(input_statargs)
                input_stattypes.append(None)
                input_statargs.append(None)
                deferred_input[var].append(
                    (input_stattypes, input_statargs, i))
            # Finally, add a cgpm_composition record.
            cgpm_composition.append({
                'name': name,
                'inputs': inputs,
                'outputs': outputs,
                'kwds': kwds,
            })
        elif isinstance(clause, cgpm_schema.parse.Subsample):
            if subsample is not None:
                raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n, ))
            subsample = clause.n
        else:
            raise BQLError(bdb, 'Unknown clause: %r' % (clause, ))

    # Make sure all the outputs and inputs exist, either in the
    # population or as latents in this generator.
    for var in must_exist:
        if core.bayesdb_has_variable(bdb, population_id, None, var):
            continue
        if var in latents:
            continue
        unknown.add(var)

    # Raise an exception if there were duplicates or unknown
    # variables.
    if duplicate:
        raise BQLError(bdb, 'Duplicate model variables: %r' %
            (sorted(duplicate), ))
    if existing_latent:
        raise BQLError(
            bdb, 'Latent variables already defined: %r' %
            (sorted(existing_latent), ))
    if unknown:
        raise BQLError(bdb, 'Unknown model variables: %r' %
            (sorted(unknown), ))

    def default_dist(var, stattype):
        # Return (dist, params) for `stattype`'s default distribution,
        # or None (recording the stattype in `unknown_stattype`) when
        # the stattype has no registered default.
        stattype = casefold(stattype)
        if stattype not in _DEFAULT_DIST:
            if var in unknown_stattype:
                assert unknown_stattype[var] == stattype
            else:
                unknown_stattype[var] = stattype
            return None
        dist, params = _DEFAULT_DIST[stattype](bdb, generator_id, var)
        return dist, params

    # Use the default distribution for any variables that remain to be
    # modelled, excluding any that are latent or that have statistical
    # types we don't know about.
    for var in core.bayesdb_variable_names(bdb, population_id, None):
        if var in modelled:
            continue
        colno = core.bayesdb_variable_number(bdb, population_id, None, var)
        assert 0 <= colno
        stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
        distparams = default_dist(var, stattype)
        if distparams is None:
            continue
        dist, params = distparams
        variables.append([var, stattype, dist, params])
        assert var not in variable_dist
        variable_dist[var] = (stattype, dist, params)
        modelled.add(var)

    # Fill in the deferred_input statistical type assignments.
    for var in sorted(deferred_input.iterkeys()):
        # Check whether the variable is modelled.  If not, skip -- we
        # will fail later because this variable is guaranteed to also
        # be in needed.
        if var not in modelled:
            assert var in needed
            continue
        # Determine (possibly fictitious) distribution and parameters.
        if var in default_modelled:
            # Manifest variable modelled by default Crosscat model.
            assert var in variable_dist
            stattype, dist, params = variable_dist[var]
        else:
            # Modelled by a foreign model.  Assign a fictitious
            # default distribution because the 27B/6 of CGPM requires
            # this.
            if var in latents:
                # Latent variable modelled by a foreign model.  Use
                # the statistical type specified for it.
                stattype = latents[var]
            else:
                # Manifest variable modelled by a foreign model.  Use
                # the statistical type in the population.
                assert core.bayesdb_has_variable(bdb, population_id, None,
                    var)
                colno = core.bayesdb_variable_number(bdb, population_id,
                    None, var)
                stattype = core.bayesdb_variable_stattype(
                    bdb, population_id, colno)
            distparams = default_dist(var, stattype)
            if distparams is None:
                continue
            dist, params = distparams
        # Assign the distribution and parameters.
        for cctypes, ccargs, i in deferred_input[var]:
            assert cctypes[i] is None
            assert ccargs[i] is None
            cctypes[i] = dist
            ccargs[i] = params

    # Fill in the deferred_output statistical type assignments.  These
    # need to be in the form NUMERICAL or CATEGORICAL.
    for var in deferred_output:
        if var in latents:
            # Latent variable modelled by a foreign model.  Use
            # the statistical type specified for it.
            var_stattype = casefold(latents[var])
            if var_stattype not in _DEFAULT_DIST:
                if var in unknown_stattype:
                    assert unknown_stattype[var] == var_stattype
                else:
                    unknown_stattype[var] = var_stattype
            # XXX Cannot specify statargs for a latent variable.  Trying
            # to using default_dist might lookup the counts for unique
            # values of the categorical in the base table causing a
            # failure.
            var_statargs = {}
        else:
            # Manifest variable modelled by a foreign model.  Use
            # the statistical type and arguments from the population.
            assert core.bayesdb_has_variable(bdb, population_id, None, var)
            colno = core.bayesdb_variable_number(bdb, population_id, None,
                var)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            distparams = default_dist(var, var_stattype)
            if distparams is None:
                continue
            _, var_statargs = distparams
        stattypes, statargs, i = deferred_output[var]
        assert stattypes[i] is None
        assert statargs[i] is None
        stattypes[i] = var_stattype
        statargs[i] = var_statargs

    if unknown_stattype:
        raise BQLError(
            bdb, 'Unknown statistical types for variables: %r' %
            (sorted(unknown_stattype.iteritems(), )))

    # If there remain any variables that we needed to model, because
    # others are conditional on them, fail.
    needed -= modelled
    if needed:
        raise BQLError(bdb, 'Unmodellable variables: %r' % (needed, ))

    # Finally, create a CGPM schema.
    return {
        'variables': variables,
        'cgpm_composition': cgpm_composition,
        'subsample': subsample,
        'latents': latents,
    }
def execute_phrase(bdb, phrase, bindings=()): """Execute the BQL AST phrase `phrase` and return a cursor of results.""" if isinstance(phrase, ast.Parametrized): n_numpar = phrase.n_numpar nampar_map = phrase.nampar_map phrase = phrase.phrase assert 0 < n_numpar else: n_numpar = 0 nampar_map = None # Ignore extraneous bindings. XXX Bad idea? if ast.is_query(phrase): # Compile the query in the transaction in case we need to # execute subqueries to determine column lists. Compiling is # a quick tree descent, so this should be fast. out = compiler.Output(n_numpar, nampar_map, bindings) with bdb.savepoint(): compiler.compile_query(bdb, phrase, out) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.Begin): txn.bayesdb_begin_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Rollback): txn.bayesdb_rollback_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Commit): txn.bayesdb_commit_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabAs): assert ast.is_query(phrase.query) with bdb.savepoint(): if core.bayesdb_has_table(bdb, phrase.name): if phrase.ifnotexists: return empty_cursor(bdb) else: raise BQLError( bdb, 'Name already defined as table: %s' % (repr(phrase.name), )) out = compiler.Output(n_numpar, nampar_map, bindings) qt = sqlite3_quote_name(phrase.name) temp = 'TEMP ' if phrase.temp else '' ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else '' out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt)) compiler.compile_query(bdb, phrase.query, out) winders, unwinders = out.getwindings() with compiler.bayesdb_wind(bdb, winders, unwinders): bdb.sql_execute(out.getvalue(), out.getbindings()) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabCsv): with bdb.savepoint(): table_exists = core.bayesdb_has_table(bdb, phrase.name) if table_exists: if phrase.ifnotexists: return empty_cursor(bdb) else: 
raise BQLError( bdb, 'Table already exists: %s' % (repr(phrase.name), )) bayesdb_read_csv_file(bdb, phrase.name, phrase.csv, header=True, create=True) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabSimModels): assert isinstance(phrase.simulation, ast.SimulateModels) with bdb.savepoint(): # Check if table exists. if core.bayesdb_has_table(bdb, phrase.name): if phrase.ifnotexists: return empty_cursor(bdb) raise BQLError( bdb, 'Name already defined as table: %s' % (phrase.name), ) # Set up schema and create the new table. qn = sqlite3_quote_name(phrase.name) qcns = map(sqlite3_quote_name, [ simcol.name if simcol.name is not None else str(simcol.col) for simcol in phrase.simulation.columns ]) temp = '' if phrase.temp is None else 'TEMP' bdb.sql_execute(''' CREATE %s TABLE %s (%s) ''' % (temp, qn, str.join(',', qcns))) # Retrieve the rows. rows = simulate_models_rows(bdb, phrase.simulation) # Insert the rows into the table. insert_sql = ''' INSERT INTO %s (%s) VALUES (%s) ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns)) for row in rows: bdb.sql_execute(insert_sql, row) return empty_cursor(bdb) if isinstance(phrase, ast.DropTab): with bdb.savepoint(): sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?' 
cursor = bdb.sql_execute(sql, (phrase.name, )) if 0 < cursor_value(cursor): raise BQLError( bdb, 'Table still in use by populations: %s' % (repr(phrase.name), )) bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?', (phrase.name, )) ifexists = 'IF EXISTS ' if phrase.ifexists else '' qt = sqlite3_quote_name(phrase.name) return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt)) if isinstance(phrase, ast.AlterTab): with bdb.savepoint(): table = phrase.table if not core.bayesdb_has_table(bdb, table): raise BQLError(bdb, 'No such table: %s' % (repr(table), )) for cmd in phrase.commands: if isinstance(cmd, ast.AlterTabRenameTab): # If the names differ only in case, we have to do # some extra work because SQLite will reject the # table rename. Note that we may even have table # == cmd.name here, but if the stored table name # differs in case from cmd.name, we want to update # it anyway. if casefold(table) == casefold(cmd.name): # Go via a temporary table. temp = table + '_temp' while core.bayesdb_has_table(bdb, temp): temp += '_temp' rename_table(bdb, table, temp) rename_table(bdb, temp, cmd.name) else: # Make sure nothing else has this name and # rename it. if core.bayesdb_has_table(bdb, cmd.name): raise BQLError( bdb, 'Name already defined as table' ': %s' % (repr(cmd.name), )) rename_table(bdb, table, cmd.name) # Remember the new name for subsequent commands. table = cmd.name elif isinstance(cmd, ast.AlterTabRenameCol): # XXX Need to deal with this in the compiler. raise NotImplementedError('Renaming columns' ' not yet implemented.') # Make sure the old name exist and the new name does not. 
old_folded = casefold(cmd.old) new_folded = casefold(cmd.new) if old_folded != new_folded: if not core.bayesdb_table_has_column( bdb, table, cmd.old): raise BQLError( bdb, 'No such column in table %s' ': %s' % (repr(table), repr(cmd.old))) if core.bayesdb_table_has_column(bdb, table, cmd.new): raise BQLError( bdb, 'Column already exists' ' in table %s: %s' % (repr(table), repr(cmd.new))) # Update bayesdb_column. Everything else refers # to columns by (tabname, colno) pairs rather than # by names. update_column_sql = ''' UPDATE bayesdb_column SET name = :new WHERE tabname = :table AND name = :old ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_column_sql, { 'table': table, 'old': cmd.old, 'new': cmd.new, }) assert bdb._sqlite3.totalchanges() - total_changes == 1 # ...except metamodels may have the (case-folded) # name cached. if old_folded != new_folded: generators_sql = ''' SELECT id FROM bayesdb_generator WHERE tabname = ? ''' cursor = bdb.sql_execute(generators_sql, (table, )) for (generator_id, ) in cursor: metamodel = core.bayesdb_generator_metamodel( bdb, generator_id) metamodel.rename_column(bdb, generator_id, old_folded, new_folded) else: assert False, 'Invalid alter table command: %s' % \ (cmd,) return empty_cursor(bdb) if isinstance(phrase, ast.GuessSchema): if not core.bayesdb_has_table(bdb, phrase.table): raise BQLError(bdb, 'No such table : %s' % phrase.table) schema = guess.guess_to_schema(guess.bayesdb_guess_stattypes, bdb, phrase.table) # Print schema to console, so user can edit it and/or copy/paste it into # the schema definition when creating a population. 
print schema return empty_cursor(bdb) if isinstance(phrase, ast.CreatePop): with bdb.savepoint(): _create_population(bdb, phrase) return empty_cursor(bdb) if isinstance(phrase, ast.DropPop): with bdb.savepoint(): if not core.bayesdb_has_population(bdb, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such population: %r' % (phrase.name, )) population_id = core.bayesdb_get_population(bdb, phrase.name) if core.bayesdb_population_generators(bdb, population_id): raise BQLError( bdb, 'Population still has generators: %r' % (phrase.name, )) # XXX helpful error checking if generators still exist # XXX check change counts bdb.sql_execute( ''' DELETE FROM bayesdb_variable WHERE population_id = ? ''', (population_id, )) bdb.sql_execute( ''' DELETE FROM bayesdb_population WHERE id = ? ''', (population_id, )) return empty_cursor(bdb) if isinstance(phrase, ast.AlterPop): with bdb.savepoint(): population = phrase.population if not core.bayesdb_has_population(bdb, population): raise BQLError(bdb, 'No such population: %s' % (repr(population), )) population_id = core.bayesdb_get_population(bdb, population) for cmd in phrase.commands: if isinstance(cmd, ast.AlterPopStatType): # Check the no metamodels are defined for this population. generators = core.bayesdb_population_generators( bdb, population_id) if generators: raise BQLError( bdb, 'Cannot update statistical types ' 'for population %s, it has metamodels: %s' % ( repr(population), repr(generators), )) # Check all the variables are in the population. unknown = [ c for c in cmd.names if not core.bayesdb_has_variable( bdb, population_id, None, c) ] if unknown: raise BQLError( bdb, 'No such variables in population' ': %s' % (repr(unknown))) # Check the statistical type is valid. if not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError( bdb, 'Invalid statistical type' ': %r' % (repr(cmd.stattype), )) # Perform the stattype update. 
colnos = [ core.bayesdb_variable_number(bdb, population_id, None, c) for c in cmd.names ] qcolnos = ','.join('%d' % (colno, ) for colno in colnos) update_stattype_sql = ''' UPDATE bayesdb_variable SET stattype = ? WHERE population_id = ? AND colno IN (%s) ''' % (qcolnos, ) bdb.sql_execute(update_stattype_sql, ( casefold(cmd.stattype), population_id, )) else: assert False, 'Invalid ALTER POPULATION command: %s' % \ (repr(cmd),) return empty_cursor(bdb) if isinstance(phrase, ast.CreateGen): # Find the population. if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population, )) population_id = core.bayesdb_get_population(bdb, phrase.population) table = core.bayesdb_population_table(bdb, population_id) # Find the metamodel, or use the default. metamodel_name = phrase.metamodel if phrase.metamodel is None: metamodel_name = 'cgpm' if metamodel_name not in bdb.metamodels: raise BQLError(bdb, 'No such metamodel: %s' % (repr(metamodel_name), )) metamodel = bdb.metamodels[metamodel_name] with bdb.savepoint(): if core.bayesdb_has_generator(bdb, population_id, phrase.name): if not phrase.ifnotexists: raise BQLError( bdb, 'Name already defined as generator: %s' % (repr(phrase.name), )) else: # Insert a record into bayesdb_generator and get the # assigned id. bdb.sql_execute( ''' INSERT INTO bayesdb_generator (name, tabname, population_id, metamodel) VALUES (?, ?, ?, ?) ''', (phrase.name, table, population_id, metamodel.name())) generator_id = core.bayesdb_get_generator( bdb, population_id, phrase.name) # Populate bayesdb_generator_column. # # XXX Omit needless bayesdb_generator_column table -- # Github issue #441. 
bdb.sql_execute( ''' INSERT INTO bayesdb_generator_column (generator_id, colno, stattype) SELECT :generator_id, colno, stattype FROM bayesdb_variable WHERE population_id = :population_id AND generator_id IS NULL ''', { 'generator_id': generator_id, 'population_id': population_id, }) # Do any metamodel-specific initialization. metamodel.create_generator(bdb, generator_id, phrase.schema, baseline=phrase.baseline) # Populate bayesdb_generator_column with any latent # variables that metamodel.create_generator has added # with bayesdb_add_latent. bdb.sql_execute( ''' INSERT INTO bayesdb_generator_column (generator_id, colno, stattype) SELECT :generator_id, colno, stattype FROM bayesdb_variable WHERE population_id = :population_id AND generator_id = :generator_id ''', { 'generator_id': generator_id, 'population_id': population_id, }) # All done. Nothing to return. return empty_cursor(bdb) if isinstance(phrase, ast.DropGen): with bdb.savepoint(): if not core.bayesdb_has_generator(bdb, None, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such generator: %s' % (repr(phrase.name), )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.name) metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) # Metamodel-specific destruction. metamodel.drop_generator(bdb, generator_id) # Drop the columns, models, and, finally, generator. drop_columns_sql = ''' DELETE FROM bayesdb_generator_column WHERE generator_id = ? ''' bdb.sql_execute(drop_columns_sql, (generator_id, )) drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? ''' bdb.sql_execute(drop_model_sql, (generator_id, )) drop_generator_sql = ''' DELETE FROM bayesdb_generator WHERE id = ? 
''' bdb.sql_execute(drop_generator_sql, (generator_id, )) return empty_cursor(bdb) if isinstance(phrase, ast.AlterGen): with bdb.savepoint(): generator = phrase.generator if not core.bayesdb_has_generator(bdb, None, generator): raise BQLError(bdb, 'No such generator: %s' % (repr(generator), )) generator_id = core.bayesdb_get_generator(bdb, None, generator) for cmd in phrase.commands: if isinstance(cmd, ast.AlterGenRenameGen): # Make sure nothing else has this name. if casefold(generator) != casefold(cmd.name): if core.bayesdb_has_table(bdb, cmd.name): raise BQLError( bdb, 'Name already defined as table' ': %s' % (repr(cmd.name), )) if core.bayesdb_has_generator(bdb, None, cmd.name): raise BQLError( bdb, 'Name already defined' ' as generator: %s' % (repr(cmd.name), )) # Update bayesdb_generator. Everything else # refers to it by id. update_generator_sql = ''' UPDATE bayesdb_generator SET name = ? WHERE id = ? ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_generator_sql, (cmd.name, generator_id)) assert bdb._sqlite3.totalchanges() - total_changes == 1 # Remember the new name for subsequent commands. generator = cmd.name else: assert False, 'Invalid ALTER GENERATOR command: %s' % \ (repr(cmd),) return empty_cursor(bdb) if isinstance(phrase, ast.InitModels): if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) modelnos = range(phrase.nmodels) with bdb.savepoint(): # Find the model numbers. Omit existing ones for # ifnotexists; reject existing ones otherwise. 
if phrase.ifnotexists: modelnos = set(modelno for modelno in modelnos if not core.bayesdb_generator_has_model( bdb, generator_id, modelno)) else: existing = set(modelno for modelno in modelnos if core.bayesdb_generator_has_model( bdb, generator_id, modelno)) if 0 < len(existing): raise BQLError( bdb, 'Generator %s already has models: %s' % (repr(phrase.generator), sorted(existing))) # Stop now if there's nothing to initialize. if len(modelnos) == 0: return # Create the bayesdb_generator_model records. modelnos = sorted(modelnos) insert_model_sql = ''' INSERT INTO bayesdb_generator_model (generator_id, modelno, iterations) VALUES (:generator_id, :modelno, :iterations) ''' for modelno in modelnos: bdb.sql_execute( insert_model_sql, { 'generator_id': generator_id, 'modelno': modelno, 'iterations': 0, }) # Do metamodel-specific initialization. metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) metamodel.initialize_models(bdb, generator_id, modelnos) return empty_cursor(bdb) if isinstance(phrase, ast.AnalyzeModels): if not phrase.wait: raise NotImplementedError('No background analysis -- use WAIT.') # WARNING: It is the metamodel's responsibility to work in a # transaction. # # WARNING: It is the metamodel's responsibility to update the # iteration count in bayesdb_generator_model records. # # We do this so that the metamodel can save incremental # progress in case of ^C in the middle. # # XXX Put these warning somewhere more appropriate. if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) # XXX Should allow parameters for iterations and ckpt/iter. 
metamodel.analyze_models(bdb, generator_id, modelnos=phrase.modelnos, iterations=phrase.iterations, max_seconds=phrase.seconds, ckpt_iterations=phrase.ckpt_iterations, ckpt_seconds=phrase.ckpt_seconds, program=phrase.program) return empty_cursor(bdb) if isinstance(phrase, ast.DropModels): with bdb.savepoint(): generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) metamodel = core.bayesdb_generator_metamodel(bdb, generator_id) modelnos = None if phrase.modelnos is not None: lookup_model_sql = ''' SELECT COUNT(*) FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' modelnos = sorted(list(phrase.modelnos)) for modelno in modelnos: cursor = bdb.sql_execute(lookup_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) if cursor_value(cursor) == 0: raise BQLError( bdb, 'No such model' ' in generator %s: %s' % (repr(phrase.generator), repr(modelno))) metamodel.drop_models(bdb, generator_id, modelnos=modelnos) if modelnos is None: drop_models_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? ''' bdb.sql_execute(drop_models_sql, (generator_id, )) else: drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' for modelno in modelnos: bdb.sql_execute(drop_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) return empty_cursor(bdb) assert False # XXX