def _default_categorical(bdb, generator_id, var):
    """Guess a categorical stattype for `var`.

    The `k` parameter is the number of distinct values currently
    observed for `var` in the generator's backing table.
    """
    table = core.bayesdb_generator_table(bdb, generator_id)
    quoted_table = sqlite3_quote_name(table)
    quoted_var = sqlite3_quote_name(var)
    count_sql = 'SELECT COUNT(DISTINCT %s) FROM %s' % \
        (quoted_var, quoted_table)
    distinct_count = cursor_value(bdb.sql_execute(count_sql))
    return 'categorical', {'k': distinct_count}
def bayesdb_generator_cell_value(bdb, generator_id, colno, rowid):
    """Return the value at (`rowid`, `colno`) in the generator's table.

    `colno` is a generator column number; it is translated to a column
    name before querying.  The row is assumed to exist -- a missing row
    is an internal invariant violation, hence the assertion.
    """
    table_name = core.bayesdb_generator_table(bdb, generator_id)
    qt = bql_quote_name(table_name)
    colname = core.bayesdb_generator_column_name(bdb, generator_id, colno)
    qcn = bql_quote_name(colname)
    sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qcn, qt)
    cursor = bdb.sql_execute(sql, (rowid,))
    try:
        # Use the builtin next() rather than cursor.next(): the
        # .next() method is Python-2-only, while next(cursor) works on
        # both Python 2 and Python 3.
        row = next(cursor)
    except StopIteration:
        assert False, 'Missing row at %d!' % (rowid,)
    else:
        return row[0]
def bayesdb_generator_cell_value(bdb, generator_id, colno, rowid):
    """Return the value at (`rowid`, `colno`) in the generator's table.

    `colno` is a generator column number; it is translated to a column
    name before querying.  The row is assumed to exist -- a missing row
    is an internal invariant violation, hence the assertion.
    """
    table_name = core.bayesdb_generator_table(bdb, generator_id)
    qt = bql_quote_name(table_name)
    colname = core.bayesdb_generator_column_name(bdb, generator_id, colno)
    qcn = bql_quote_name(colname)
    sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qcn, qt)
    cursor = bdb.sql_execute(sql, (rowid,))
    try:
        # Use the builtin next() rather than cursor.next(): the
        # .next() method is Python-2-only, while next(cursor) works on
        # both Python 2 and Python 3.
        row = next(cursor)
    except StopIteration:
        assert False, 'Missing row at %d!' % (rowid,)
    else:
        return row[0]
def test_t1_column_value_probability(colno, rowid):
    """Exercise column value probability via the Python API and via SQL
    on the analyzed t1 fixture.  A rowid of 0 means "the last row"."""
    with analyzed_bayesdb_generator(t1(), 1, 1) as (bdb, generator_id):
        if rowid == 0:
            rowid = bayesdb_maxrowid(bdb, generator_id)
        value = bayesdb_generator_cell_value(bdb, generator_id, colno, rowid)
        # Python entry point.
        bqlfn.bql_column_value_probability(bdb, generator_id, None, colno,
            value)
        # SQL entry point, pulling the same value back out of the table.
        table_name = core.bayesdb_generator_table(bdb, generator_id)
        colname = core.bayesdb_generator_column_name(bdb, generator_id, colno)
        quoted_table = bql_quote_name(table_name)
        quoted_column = bql_quote_name(colname)
        sql = '''
            select bql_column_value_probability(?, NULL, ?,
                (select %s from %s where rowid = ?))
        ''' % (quoted_column, quoted_table)
        bdb.sql_execute(sql, (generator_id, colno, rowid)).fetchall()
def bql_column_stattypes_and_data(bdb, generator_id, colno0, colno1):
    """Return stattypes and pairwise-complete data for two columns.

    Returns a tuple (stattype0, stattype1, data0, data1), where data0
    and data1 are parallel lists drawn from rows of the generator's
    table in which both columns are non-null.
    """
    stattype0 = core.bayesdb_generator_column_stattype(bdb, generator_id,
        colno0)
    stattype1 = core.bayesdb_generator_column_stattype(bdb, generator_id,
        colno1)
    quoted_table = sqlite3_quote_name(
        core.bayesdb_generator_table(bdb, generator_id))
    quoted0 = sqlite3_quote_name(
        core.bayesdb_generator_column_name(bdb, generator_id, colno0))
    quoted1 = sqlite3_quote_name(
        core.bayesdb_generator_column_name(bdb, generator_id, colno1))
    data_sql = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (quoted0, quoted1, quoted_table, quoted0, quoted1)
    rows = bdb.sql_execute(data_sql).fetchall()
    return (
        stattype0,
        stattype1,
        [row[0] for row in rows],
        [row[1] for row in rows],
    )
def _data(self, bdb, generator_id, vars):
    """Fetch the data for `vars` from the generator's backing table.

    Each value is cast to the SQL affinity of its variable's stattype,
    then mapped to its numeric code via self._to_numeric.  Rows are
    returned in _rowid_ order, restricted to rowids registered in
    bayesdb_cgpm_individual for this generator.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    # Resolve variable names to column numbers and statistical types.
    colnos = [
        core.bayesdb_variable_number(bdb, population_id, generator_id, var)
        for var in vars
    ]
    stattypes = [
        core.bayesdb_variable_stattype(bdb, population_id, colno)
        for colno in colnos
    ]
    qt = sqlite3_quote_name(core.bayesdb_generator_table(bdb, generator_id))
    # SQL expression casting one variable to its stattype's affinity;
    # a negative colno stands for a variable with no backing column.
    def cast(var, colno, stattype):
        if colno < 0:
            return 'NULL'
        qv = sqlite3_quote_name(var)
        affinity = core.bayesdb_stattype_affinity(bdb, stattype)
        qa = sqlite3_quote_name(affinity)
        return 'CAST(t.%s AS %s)' % (qv, qa)
    qexpressions = ','.join(
        cast(var, colno, stattype)
        for var, colno, stattype in zip(vars, colnos, stattypes))
    cursor = bdb.sql_execute('''
        SELECT %s FROM %s AS t, bayesdb_cgpm_individual AS ci
            WHERE ci.generator_id = ?
                AND ci.table_rowid = t._rowid_
        ORDER BY t._rowid_ ASC
    ''' % (qexpressions, qt), (generator_id,))
    # Map each raw value to its numeric code, one tuple per row.
    return [
        tuple(
            self._to_numeric(bdb, generator_id, colno, value)
            for colno, value in zip(colnos, row))
        for row in cursor
    ]
def initialize_models(self, bdb, genid, modelnos, model_config):
    """Initialize models for the composed generator.

    Brings the internal crosscat generator up to max(modelnos)+1 models
    (INITIALIZE guarantees a contiguous sequence of model numbers, so
    this covers every requested number), then builds and serializes one
    foreign predictor per foreign column.
    """
    # Initialize internal crosscat, maintaining equality of model numbers.
    qg = quote(core.bayesdb_generator_name(bdb, self.cc_id(bdb, genid)))
    bdb.execute('INITIALIZE {} MODELS FOR {};'.format(max(modelnos)+1, qg))
    # Initialize the foreign predictors.
    for fcol in self.fcols(bdb, genid):
        # Convert column numbers to (name, stattype) pairs.
        targets = [
            (core.bayesdb_generator_column_name(bdb, genid, fcol),
             core.bayesdb_generator_column_stattype(bdb, genid, fcol))
        ]
        conditions = [
            (core.bayesdb_generator_column_name(bdb, genid, pcol),
             core.bayesdb_generator_column_stattype(bdb, genid, pcol))
            for pcol in self.pcols(bdb, genid, fcol)
        ]
        # Build the foreign predictor from the table data.
        table_name = core.bayesdb_generator_table(bdb, genid)
        predictor_name = self.predictor_name(bdb, genid, fcol)
        builder = self.predictor_builder[predictor_name]
        predictor = builder.create(bdb, table_name, targets, conditions)
        # Serialize the predictor and store it in the database.
        with bdb.savepoint():
            sql = '''
                UPDATE bayesdb_composer_column_foreign_predictor
                    SET predictor_binary = :predictor_binary
                    WHERE generator_id = :genid AND colno = :colno
            '''
            predictor_binary = builder.serialize(bdb, predictor)
            bdb.sql_execute(sql, {
                'genid': genid,
                'predictor_binary': sqlite3.Binary(predictor_binary),
                'colno': fcol,
            })
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    Dispatches on the AST node type: queries are compiled and executed;
    DDL-like phrases (CREATE/DROP/ALTER of tables, generators, models)
    are performed inside savepoints and return an empty cursor.
    """
    if isinstance(phrase, ast.Parametrized):
        # Unwrap the parametrized phrase, keeping its parameter counts/map.
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings. XXX Bad idea?
    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists. Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())
    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = "TEMP " if phrase.temp else ""
            ifnotexists = "IF NOT EXISTS " if phrase.ifnotexists else ""
            out.write("CREATE %sTABLE %s%s AS " % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            # The new table's name must not collide with a generator or
            # an existing table.
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as generator: %s" %
                    (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as table: %s" %
                    (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb,
                    phrase.simulation.generator):
                raise BQLError(bdb, "No such generator: %s" %
                    (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            # NOTE(review): qcns is iterated twice below (CREATE TABLE and
            # INSERT); this relies on Python-2 map() returning a list --
            # on Python 3 the second iteration would see it exhausted.
            qcns = map(sqlite3_quote_name, column_names)
            # Collect the declared SQL types of the source table's columns.
            cursor = bdb.sql_execute("PRAGMA table_info(%s)" % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s" %
                        (repr(phrase.simulation.generator), repr(table),
                            repr(column_name)),
                    )
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s" %
                        (repr(phrase.simulation.generator), repr(table),
                            repr(column_name)),
                    )
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            # Evaluate nsamples, modelno, and the constraint values by
            # compiling them into one SELECT and executing it.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write("SELECT ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.nsamples, out)
            out.write(", ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(", ")
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                    out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = [
                (core.bayesdb_generator_column_number(bdb, generator_id,
                        name),
                    value)
                for (name, _expression), value in
                    zip(phrase.simulation.constraints, cursor[0][2:])
            ]
            colnos = [core.bayesdb_generator_column_number(bdb, generator_id,
                    name)
                for name in column_names]
            # Create the destination table and fill it with simulated rows.
            bdb.sql_execute(
                "CREATE %sTABLE %s%s (%s)" % (
                    "TEMP " if phrase.temp else "",
                    "IF NOT EXISTS " if phrase.ifnotexists else "",
                    qn,
                    ",".join(
                        "%s %s" % (qcn, column_sqltypes[casefold(column_name)])
                        for qcn, column_name in zip(qcns, column_names)
                    ),
                )
            )
            insert_sql = """
                INSERT INTO %s (%s) VALUES (%s)
            """ % (
                qn,
                ",".join(qcns),
                ",".join("?" for qcn in qcns),
            )
            for row in bqlfn.bayesdb_simulate(
                bdb, generator_id, constraints, colnos, modelno=modelno,
                numpredictions=nsamples
            ):
                bdb.sql_execute(insert_sql, row)
            return empty_cursor(bdb)
    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            # Refuse to drop a table that any generator still models.
            sql = "SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?"
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators? Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, "Table still in use by generators: %s" %
                    (repr(phrase.name),))
            bdb.sql_execute("DELETE FROM bayesdb_column WHERE tabname = ?",
                (phrase.name,))
            ifexists = "IF EXISTS " if phrase.ifexists else ""
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute("DROP TABLE %s%s" % (ifexists, qt))
    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, "No such table: %s" % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename. Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + "_temp"
                        while core.bayesdb_has_table(bdb, temp) or \
                                core.bayesdb_has_generator(bdb, temp):
                            temp += "_temp"
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined as table"
                                ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined"
                                " as generator: %s" % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    # NOTE(review): everything after this raise is
                    # unreachable until column renaming is implemented.
                    raise NotImplementedError("Renaming columns"
                        " not yet implemented.")
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb, "No such column in table %s"
                                ": %s" % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb,
                                "Column already exists"
                                " in table %s: %s" %
                                (repr(table), repr(cmd.new))
                            )
                    # Update bayesdb_column. Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = """
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_column_sql,
                        {"table": table, "old": cmd.old, "new": cmd.new})
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = """
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        """
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(bdb,
                                generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, "No such generator: %s" %
                            (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(bdb,
                        cmd.generator)
                    # Clear any previous default, then set the new one.
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                    set_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                else:
                    assert False, "Invalid alter table command: %s" % (cmd,)
            return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, "No such metamodel: %s" %
                (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]
        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            def instantiate(columns):
                return instantiate_generator(
                    bdb, phrase.name, phrase.table, metamodel, columns,
                    ifnotexists=phrase.ifnotexists,
                    default=phrase.default,
                )
            metamodel.create_generator(bdb, phrase.table, phrase.schema,
                instantiate)
        # All done. Nothing to return.
        return empty_cursor(bdb)
    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, "No such generator: %s" %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)
            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = """
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            """
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = """
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            """
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = """
                DELETE FROM bayesdb_generator WHERE id = ?
            """
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)
    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, "No such generator: %s" %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                "Name already defined as table"
                                ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined"
                                " as generator: %s" % (repr(cmd.name),))
                    # Update bayesdb_generator. Everything else
                    # refers to it by id.
                    update_generator_sql = """
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, "Invalid ALTER GENERATOR command: %s" % \
                        (repr(cmd),)
            return empty_cursor(bdb)
    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb,
            phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None         # XXX For now.
        with bdb.savepoint():
            # Find the model numbers. Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(
                    modelno
                    for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno)
                )
            else:
                existing = set(
                    modelno
                    for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno)
                )
                if 0 < len(existing):
                    raise BQLError(
                        bdb,
                        "Generator %s already has models: %s" %
                        (repr(phrase.generator), sorted(existing))
                    )
            # Stop now if there's nothing to initialize.
            # NOTE(review): this returns None rather than an empty
            # cursor, unlike every other branch -- confirm callers
            # tolerate a None result before changing it.
            if len(modelnos) == 0:
                return
            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = """
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            """
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql,
                    {"generator_id": generator_id, "modelno": modelno,
                        "iterations": 0})
            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                model_config)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError("No background analysis -- use WAIT.")
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb,
            phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
        )
        return empty_cursor(bdb)
    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(bdb,
                phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                # Check every requested model number before dropping any.
                lookup_model_sql = """
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql,
                        {"generator_id": generator_id, "modelno": modelno})
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb,
                            "No such model"
                            " in generator %s: %s" %
                            (repr(phrase.generator), repr(modelno))
                        )
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = """
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                """
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = """
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql,
                        {"generator_id": generator_id, "modelno": modelno})
        return empty_cursor(bdb)
    assert False                # XXX
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    Dispatches on the AST node type: queries are compiled and executed;
    DDL-like phrases (CREATE/DROP/ALTER of tables, generators, models)
    are performed inside savepoints and return an empty cursor.
    """
    if isinstance(phrase, ast.Parametrized):
        # Unwrap the parametrized phrase, keeping its parameter counts/map.
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings. XXX Bad idea?
    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists. Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())
    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            # The new table's name must not collide with a generator or
            # an existing table.
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as generator: %s' %
                    (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as table: %s' %
                    (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(
                    bdb, phrase.simulation.generator):
                raise BQLError(
                    bdb, 'No such generator: %s' %
                    (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            # NOTE(review): qcns is iterated twice below (CREATE TABLE and
            # INSERT); this relies on Python-2 map() returning a list --
            # on Python 3 the second iteration would see it exhausted.
            qcns = map(sqlite3_quote_name, column_names)
            # Collect the declared SQL types of the source table's columns.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                            repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                            repr(column_name)))
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            # Evaluate nsamples, modelno, and the constraint values by
            # compiling them into one SELECT and executing it.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.nsamples, out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                    phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                    out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = \
                [(core.bayesdb_generator_column_number(bdb, generator_id,
                        name), value)
                    for (name, _expression), value
                        in zip(phrase.simulation.constraints, cursor[0][2:])]
            colnos = \
                [core.bayesdb_generator_column_number(bdb, generator_id, name)
                    for name in column_names]
            # Create the destination table and fill it with simulated rows.
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '',
                 qn,
                 ','.join(
                    '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                    for qcn, column_name in zip(qcns, column_names))))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in bqlfn.bayesdb_simulate(bdb, generator_id, constraints,
                    colnos, modelno=modelno, numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
            return empty_cursor(bdb)
    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            # Refuse to drop a table that any generator still models.
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators? Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(
                    bdb, 'Table still in use by generators: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))
    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename. Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                                core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    # NOTE(review): everything after this raise is
                    # unreachable until column renaming is implemented.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column. Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(
                            bdb, 'No such generator: %s' %
                            (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(
                        bdb, cmd.generator)
                    # Defaults require schema version 6 or later.
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    # Clear any previous default, then set the new one.
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb._sqlite3.totalchanges() - total_changes \
                        in (0, 1)
                    set_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb._sqlite3.totalchanges() - total_changes \
                        in (0, 1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
            return empty_cursor(bdb)
    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(
                bdb, 'No such metamodel: %s' %
                (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]
        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                # Existing generator: an error unless IF NOT EXISTS.
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name),))
            else:
                def instantiate(columns):
                    return instantiate_generator(bdb, phrase.name,
                        phrase.table, metamodel, columns,
                        default=phrase.default)
                metamodel.create_generator(bdb, phrase.table, phrase.schema,
                    instantiate)
        # All done. Nothing to return.
        return empty_cursor(bdb)
    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)
            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)
    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name),))
                    # Update bayesdb_generator. Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            return empty_cursor(bdb)
    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None         # XXX For now.
        with bdb.savepoint():
            # Find the model numbers. Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))
            # Stop now if there's nothing to initialize.
            # NOTE(review): this returns None rather than an empty
            # cursor, unlike every other branch -- confirm callers
            # tolerate a None result before changing it.
            if len(modelnos) == 0:
                return
            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })
            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                model_config)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)
    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                # Check every requested model number before dropping any.
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)
    assert False                # XXX
def bayesdb_maxrowid(bdb, generator_id):
    """Return the largest ``_rowid_`` in the table backing `generator_id`."""
    quoted_table = bql_quote_name(
        core.bayesdb_generator_table(bdb, generator_id))
    query = 'SELECT MAX(_rowid_) FROM %s' % (quoted_table,)
    cursor = bdb.sql_execute(query)
    return cursor_value(cursor)
def bayesdb_maxrowid(bdb, generator_id):
    """Return the maximum SQLite ``_rowid_`` of the generator's table."""
    backing_table = core.bayesdb_generator_table(bdb, generator_id)
    select_max = 'SELECT MAX(_rowid_) FROM %s' % (bql_quote_name(backing_table),)
    return cursor_value(bdb.sql_execute(select_max))
def _cmd_render_crosscat(self, query, sql=None, **kwargs):
    '''Returns a rendering of the specified crosscat state Usage: .render_crosscat [options] <generator> <modelno>. Options: --subsample=<n> --width=<w> --height=<c> --rowlabels=<colname> --progress=[True|False] --yticklabeslize=<fontsize> --xticklabeslize=<fontsize> The allowable fontsize strings are: xx-small, x-small, # small, medium, large, x-large, xx-large '''
    # Parse "<generator> <modelno>" from the raw command text; any other
    # token count is a usage error reported to stderr.
    tokens = query.split()
    if len(tokens) != 2:
        self.write_stderr('Usage: .render_crosscat <generator> <modelno>')
        return
    generator = tokens[0]
    modelno = int(tokens[1])
    if not bayesdb_has_generator(self._bdb, None, generator):
        self.write_stderr('No such generator: %s.' % (generator, ))
        return
    generator_id = bayesdb_get_generator(self._bdb, None, generator)
    population_id = bayesdb_generator_population(self._bdb, generator_id)
    backend = bayesdb_generator_backend(self._bdb, generator_id)
    # Only the cgpm backend exposes the crosscat state object needed below.
    if backend.name() != 'cgpm':
        self.write_stderr('.render_crosscat requires generator from the '
            'cgpm backend')
        return
    # NOTE(review): reaches into the backend's private engine accessor —
    # presumably stable within this project, but confirm against the
    # cgpm backend implementation.
    engine = backend._engine(self._bdb, generator_id)
    # Translate the user-visible model number into cgpm's internal
    # model number; NULL means the model does not exist.
    cursor = self._bdb.sql_execute('''
        SELECT cgpm_modelno FROM bayesdb_cgpm_modelno
            WHERE generator_id = ? AND modelno = ?
    ''', (generator_id, modelno,))
    cgpm_modelno = cursor_value(cursor, nullok=True)
    if cgpm_modelno is None:
        self.write_stderr('No such model number: %d.' % (modelno, ))
        return
    state = engine.get_state(cgpm_modelno)
    # Optional row labels: pull the values of the named column for the
    # rows this generator actually models.
    row_names = None
    row_index_column = kwargs.get('rowlabels', None)
    if row_index_column is not None:
        table_name = bayesdb_generator_table(self._bdb, generator_id)
        qt = bql_quote_name(table_name)
        qc = bql_quote_name(row_index_column)
        cursor = self._bdb.sql_execute('''
            SELECT %s FROM %s WHERE oid IN (
                SELECT table_rowid FROM bayesdb_cgpm_individual
                    WHERE generator_id = ?
            )
        ''' % (qc, qt), (generator_id, ))
        row_names = [c[0] for c in cursor]
    if 'progress' in kwargs:
        sys.stdout.write('Creating figure...\n')
    # Imported lazily so the shell works without cgpm's rendering deps.
    import cgpm.utils.render
    if 'variable' not in kwargs:
        # Plot the entire state.
        col_names = [
            bayesdb_variable_name(self._bdb, population_id, None, colno)
            for colno in state.outputs
        ]
        fig, _ax = cgpm.utils.render.viz_state(state, col_names=col_names,
            row_names=row_names, **kwargs)
    else:
        # Plot the view of the requested variable only.
        varno = bayesdb_variable_number(self._bdb, population_id,
            generator_id, kwargs['variable'])
        view = state.view_for(varno)
        # view.outputs[0] is presumably the view's latent output, so
        # labels start at outputs[1:] — TODO confirm against cgpm.
        col_names = [
            bayesdb_variable_name(self._bdb, population_id, None, colno)
            for colno in view.outputs[1:]
        ]
        fig, _ax = cgpm.utils.render.viz_view(view, col_names=col_names,
            row_names=row_names, **kwargs)
    # Apply optional width/height overrides to the rendered figure.
    (width, height) = fig.get_size_inches()
    if 'width' in kwargs:
        width = float(kwargs['width'])
        fig.set_size_inches(width, height)
    if 'height' in kwargs:
        height = float(kwargs['height'])
        fig.set_size_inches(width, height)
    if 'progress' in kwargs:
        sys.stdout.write('Rendering figure...\n')
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname` ends in
        ``.pkl.gz``, decompress with gzip first
    :raises ValueError: on unsupported metamodel, missing table/generator,
        conflicting generator, or schema mismatch with existing models
    :raises IOError: if the pickled file is malformed
    """
    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    #
    # WARNING: pickle.load executes arbitrary code embedded in the
    # file; only load legacy model files from trusted sources.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if 'cctype' not in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    # Map each case-folded column name to its case-folded modern stattype.
    column_stattypes = dict((casefold(column_name),
            casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table),))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                # Bug fix: the original passed `bdb` instead of `sql`
                # as the query text, leaving `sql` unused.
                cursor = bdb.sql_execute(sql, (generator_id,))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                        (repr(generator),))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg,))
                bayesdb_create_legacy_generator(bdb, generator, table,
                    column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator),))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:          # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                    (modelno,))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                    ': no view_state in X_L[%u]' %
                    (modelno,))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                        ' in view state %u of X_L[%u]' % (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                        ': non-list for view %u columns in X_L[%u]' %
                        (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                            (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name

        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id,))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(insert_model_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'iterations': iterations,
            })
            bdb.sql_execute(insert_theta_json_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'theta_json': json.dumps(theta),
            })
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname` ends in
        ``.pkl.gz``, decompress with gzip first
    :raises ValueError: on unsupported metamodel, missing table/generator,
        conflicting generator, or schema mismatch with existing models
    :raises IOError: if the pickled file is malformed
    """
    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    #
    # WARNING: pickle.load executes arbitrary code embedded in the
    # file; only load legacy model files from trusted sources.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if 'cctype' not in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    # Map each case-folded column name to its case-folded modern stattype.
    column_stattypes = dict(
        (casefold(column_name), casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table), ))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                                 (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                # Bug fix: the original passed `bdb` instead of `sql`
                # as the query text, leaving `sql` unused.
                cursor = bdb.sql_execute(sql, (generator_id, ))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                                     (repr(generator), ))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg, ))
                bayesdb_create_legacy_generator(bdb, generator, table,
                                                column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                                            column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator), ))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:          # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                              (modelno, ))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                              ': no view_state in X_L[%u]' % (modelno, ))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                                  ' in view state %u of X_L[%u]' %
                                  (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                                  ': non-list for view %u columns in X_L[%u]' %
                                  (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                                      (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name

        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id, ))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(
                insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'iterations': iterations,
                })
            bdb.sql_execute(
                insert_theta_json_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'theta_json': json.dumps(theta),
                })