def create_generator(self, bdb, generator_id, schema, **kwargs):
    """Create a NIG-Normal generator.

    Records the (count, sum, sum-of-squares) sufficient statistics for
    every modeled column, then registers each `deviation` clause in
    `schema` as a latent numerical variable.

    Raises `BQLError` if a modeled column is not numerical, if a schema
    clause is malformed, or if a deviation clause names an unknown
    variable.
    """
    # XXX Do something with the schema.
    insert_column_sql = ''' INSERT INTO bayesdb_nig_normal_column (population_id, generator_id, colno, count, sum, sumsq) VALUES (:population_id, :generator_id, :colno, :count, :sum, :sumsq) '''
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)
    for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
        column_name = core.bayesdb_variable_name(bdb, population_id,
            generator_id, colno)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
            generator_id, colno)
        if not stattype == 'numerical':
            raise BQLError(bdb, 'NIG-Normal only supports'
                ' numerical columns, but %s is %s'
                % (repr(column_name), repr(stattype)))
        # Cache the sufficient statistics of the observed data for this
        # column.
        (count, xsum, sumsq) = data_suff_stats(bdb, table, column_name)
        bdb.sql_execute(insert_column_sql, {
            'population_id': population_id,
            'generator_id': generator_id,
            'colno': colno,
            'count': count,
            'sum': xsum,
            'sumsq': sumsq,
        })
    # XXX Make the schema a little more flexible.
    if schema == [[]]:
        return
    for clause in schema:
        # Each clause must look like [<dev_var>, 'deviation', [<obs_var>]].
        if not (len(clause) == 3 and \
                isinstance(clause[0], str) and \
                clause[1] == 'deviation' and \
                isinstance(clause[2], list) and \
                len(clause[2]) == 1 and \
                isinstance(clause[2][0], str)):
            raise BQLError(bdb, 'Invalid nig_normal clause: %r' % (clause,))
        dev_var = clause[0]
        obs_var = clause[2][0]
        if not core.bayesdb_has_variable(bdb, population_id, None, obs_var):
            raise BQLError(bdb, 'No such variable: %r' % (obs_var,))
        obs_colno = core.bayesdb_variable_number(bdb, population_id, None,
            obs_var)
        # Register the deviation variable as a latent of this generator.
        dev_colno = core.bayesdb_add_latent(bdb, population_id, generator_id,
            dev_var, 'numerical')
        bdb.sql_execute(''' INSERT INTO bayesdb_nig_normal_deviation (population_id, generator_id, deviation_colno, observed_colno) VALUES (?, ?, ?, ?) ''', (population_id, generator_id, dev_colno, obs_colno))
def analyze_models(self, bdb, generator_id, modelnos=None, iterations=1,
        max_seconds=None, ckpt_iterations=None, ckpt_seconds=None,
        program=None):
    """Resample the NIG-Normal model parameters.

    Timing and iteration controls are ignored because one Gibbs step
    already reaches the posterior.  NOTE: the model iteration count is
    not updated; since inference converges in one step, the only loss
    is the metadata of whether that step was taken.
    """
    if program is not None:
        # XXX Analysis programs are not supported by this backend.
        raise NotImplementedError('nig_normal analysis programs')
    sql = ''' UPDATE bayesdb_nig_normal_model SET mu = :mu, sigma = :sigma WHERE population_id = :population_id AND generator_id = :generator_id AND colno = :colno AND modelno = :modelno '''
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    if modelnos is None:
        # Models x columns is assumed to form a dense rectangle in the
        # database, so default to every model of this generator.
        modelnos = self._modelnos(bdb, generator_id)
    self._set_models(bdb, population_id, generator_id, modelnos, sql)
def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
        colnos1, constraints, numsamples):
    """Estimate the mutual information between two sets of columns,
    conditioned on `constraints`, averaged over the conditioning rows.

    Returns a one-element list containing the mean of loom's estimates.
    """
    population_id = bayesdb_generator_population(bdb, generator_id)
    def _names(colnos):
        return [
            str(bayesdb_variable_name(bdb, population_id, generator_id, c))
            for c in colnos
        ]
    server = self._get_preql_server(bdb, generator_id)
    target_set = server._cols_to_mask(server.encode_set(_names(colnos0)))
    query_set = server._cols_to_mask(server.encode_set(_names(colnos1)))
    if self._marginize_cmi(constraints):
        # Average over sampled conditioning rows.
        conditioning_rows = self._get_constraint_rows(
            constraints, bdb, generator_id, population_id, modelnos,
            server, numsamples)
    else:
        conditioning_rows = [
            self._get_constraint_row(
                constraints, bdb, generator_id, population_id, server)
        ]
    estimates = []
    for conditioning_row in conditioning_rows:
        result = server._query_server.mutual_information(
            target_set, query_set, entropys=None,
            sample_count=loom.preql.SAMPLE_COUNT,
            conditioning_row=conditioning_row)
        estimates.append(result.mean)
    # Output requires an iterable.
    return [arithmetic_mean(estimates)]
def _get_ordered_column_names(self, bdb, generator_id):
    """Return the population's column names in loom rank order."""
    population_id = bayesdb_generator_population(bdb, generator_id)
    names = []
    for colno in self._get_ordered_column_numbers(bdb, generator_id):
        names.append(bayesdb_variable_name(bdb, population_id, None, colno))
    return names
def logpdf_joint(self, bdb, generator_id, modelnos, rowid, targets,
        constraints):
    """Return log Pr[targets | constraints] under the loom model.

    Since Pr[T|C] = Pr[T, C] / Pr[C], this is computed as
    score(targets and constraints) - score(constraints), where each
    score argument is a dense row over loom's column order with None
    marking unspecified cells.
    """
    population_id = bayesdb_generator_population(bdb, generator_id)
    names = self._get_ordered_column_names(bdb, generator_id)
    # Numerator row (targets and constraints) and denominator row
    # (constraints only).
    numerator = OrderedDict((name, None) for name in names)
    denominator = OrderedDict((name, None) for name in names)
    for colno, value in targets:
        name = bayesdb_variable_name(bdb, population_id, None, colno)
        numerator[name] = self._convert_to_proper_stattype(
            bdb, generator_id, colno, value)
        denominator[name] = None
    for colno, value in constraints:
        name = bayesdb_variable_name(bdb, population_id, None, colno)
        converted = self._convert_to_proper_stattype(
            bdb, generator_id, colno, value)
        # Constraints appear in both numerator and denominator.
        numerator[name] = converted
        denominator[name] = converted
    server = self._get_query_server(bdb, generator_id)
    return server.score(numerator.values()) \
        - server.score(denominator.values())
def predict_confidence(self, bdb, generator_id, modelnos, rowid, colno,
        numsamples=None):
    """Impute a value for `colno` at `rowid`, returning (value, confidence).

    Nominal columns are imputed with the sample mode (confidence is the
    mode's empirical frequency); numerical columns with the sample mean
    (confidence punted to 0).
    """
    if not numsamples:
        numsamples = 2
    assert numsamples > 0
    def _impute_categorical(sample):
        # Mode of the sampled values; ties broken by whichever value
        # comes first in the counter's iteration order.
        counts = Counter(s[0] for s in sample)
        mode_count = max(counts[v] for v in counts)
        pred = iter(v for v in counts if counts[v] == mode_count).next()
        conf = float(mode_count) / numsamples
        return pred, conf
    def _impute_numerical(sample):
        # Sample mean; confidence is punted to 0.
        pred = sum(s[0] for s in sample) / float(len(sample))
        conf = 0
        return pred, conf
    # Retrieve the samples: specifying rowid suffices to ensure that
    # relevant constraints are retrieved by simulate_joint.
    sample = self.simulate_joint(
        bdb, generator_id, modelnos, rowid, [colno], [], numsamples)
    # Determine the imputation strategy (mode or mean).
    population_id = bayesdb_generator_population(bdb, generator_id)
    stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
    # Run the imputation.
    if _is_nominal(stattype):
        return _impute_categorical(sample)
    else:
        return _impute_numerical(sample)
def logpdf_joint(self, bdb, generator_id, modelnos, rowid, targets,
        constraints):
    """Return log Pr[targets | constraints] under the loom model.

    Computed as score(targets and constraints) - score(constraints),
    since Pr[T|C] = Pr[T, C] / Pr[C].
    """
    population_id = bayesdb_generator_population(bdb, generator_id)
    ordered_column_names = self._get_ordered_column_names(
        bdb, generator_id)
    # Pr[targets|constraints] = Pr[targets, constraints] / Pr[constraints]
    # The numerator is and_case; denominator is conditional_case.
    # Both are dense rows over loom's column order, with None marking
    # unspecified cells.
    and_case = OrderedDict([(a, None) for a in ordered_column_names])
    conditional_case = OrderedDict([(a, None) for a in ordered_column_names])
    for (colno, value) in targets:
        column_name = bayesdb_variable_name(bdb, population_id, None, colno)
        and_case[column_name] = self._convert_to_proper_stattype(
            bdb, generator_id, colno, value)
        conditional_case[column_name] = None
    for (colno, value) in constraints:
        column_name = bayesdb_variable_name(bdb, population_id, None, colno)
        processed_value = self._convert_to_proper_stattype(
            bdb, generator_id, colno, value)
        # Constraints appear in both the numerator and the denominator.
        and_case[column_name] = processed_value
        conditional_case[column_name] = processed_value
    and_case = and_case.values()
    conditional_case = conditional_case.values()
    server = self._get_query_server(bdb, generator_id)
    and_score = server.score(and_case)
    conditional_score = server.score(conditional_case)
    return and_score - conditional_score
def _get_ordered_column_names(self, bdb, generator_id):
    """Return the population's column names in loom rank order."""
    population_id = bayesdb_generator_population(bdb, generator_id)
    ordered_colnos = self._get_ordered_column_numbers(bdb, generator_id)
    return [
        bayesdb_variable_name(bdb, population_id, None, c)
        for c in ordered_colnos
    ]
def analyze_models(self, bdb, generator_id, modelnos=None, iterations=1,
        max_seconds=None, ckpt_iterations=None, ckpt_seconds=None,
        program=None):
    """Resample the NIG-Normal model parameters for `modelnos`
    (default: all models of the generator)."""
    if program is not None:
        # XXX
        raise NotImplementedError('nig_normal analysis programs')
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    # Ignore analysis timing control, because one step reaches the
    # posterior anyway.
    # NOTE: Does not update the model iteration count.  This would
    # manifest as failing to count the number of inference
    # iterations taken.  Since inference converges in one step,
    # this consists of failing to track the metadata of whether
    # that one step was done or not.
    update_sample_sql = ''' UPDATE bayesdb_nig_normal_model SET mu = :mu, sigma = :sigma WHERE population_id = :population_id AND generator_id = :generator_id AND colno = :colno AND modelno = :modelno '''
    if modelnos is None:
        # This assumes that models x columns forms a dense
        # rectangle in the database, which it should.
        modelnos = self._modelnos(bdb, generator_id)
    self._set_models(bdb, population_id, generator_id, modelnos,
        update_sample_sql)
def predict_confidence(self, bdb, generator_id, modelnos, rowid, colno,
        numsamples=None):
    """Impute a value for `colno` at `rowid`, returning (value, confidence).

    Nominal columns take the sample mode with its empirical frequency
    as confidence; numerical columns take the sample mean with
    confidence 0.
    """
    if not numsamples:
        numsamples = 2
    assert numsamples > 0

    def _mode_impute(samples):
        # Most frequent sampled value; confidence is its frequency.
        tallies = Counter(s[0] for s in samples)
        top = max(tallies[v] for v in tallies)
        winner = iter(v for v in tallies if tallies[v] == top).next()
        return winner, float(top) / numsamples

    def _mean_impute(samples):
        # Sample mean; confidence is punted to 0.
        mean = sum(s[0] for s in samples) / float(len(samples))
        return mean, 0

    # Specifying `rowid` lets simulate pick up the relevant constraints,
    # so none are passed explicitly.
    samples = self.simulate_joint(
        bdb, generator_id, modelnos, rowid, [colno], [], numsamples)
    # Choose mode- or mean-imputation by the column's stattype.
    population_id = bayesdb_generator_population(bdb, generator_id)
    stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
    if _is_nominal(stattype):
        return _mode_impute(samples)
    return _mean_impute(samples)
def create_generator(self, bdb, generator_id, schema, **kwargs):
    """Create a NIG-Normal generator.

    Records the (count, sum, sum-of-squares) sufficient statistics for
    every modeled column, then registers each `deviation` clause in
    `schema` as a latent numerical variable.

    Raises `BQLError` for non-numerical columns, malformed clauses, or
    unknown observed variables.
    """
    # XXX Do something with the schema.
    column_sql = ''' INSERT INTO bayesdb_nig_normal_column (population_id, generator_id, colno, count, sum, sumsq) VALUES (:population_id, :generator_id, :colno, :count, :sum, :sumsq) '''
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)
    for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
        name = core.bayesdb_variable_name(
            bdb, population_id, generator_id, colno)
        stattype = core.bayesdb_variable_stattype(
            bdb, population_id, generator_id, colno)
        if stattype != 'numerical':
            raise BQLError(bdb, 'NIG-Normal only supports'
                ' numerical columns, but %s is %s'
                % (repr(name), repr(stattype)))
        # Cache the column's sufficient statistics.
        count, xsum, sumsq = data_suff_stats(bdb, table, name)
        bdb.sql_execute(column_sql, {
            'population_id': population_id,
            'generator_id': generator_id,
            'colno': colno,
            'count': count,
            'sum': xsum,
            'sumsq': sumsq,
        })
    # XXX Make the schema a little more flexible.
    if schema == [[]]:
        return
    deviation_sql = ''' INSERT INTO bayesdb_nig_normal_deviation (population_id, generator_id, deviation_colno, observed_colno) VALUES (?, ?, ?, ?) '''
    for clause in schema:
        # A clause must look like [<dev_var>, 'deviation', [<obs_var>]].
        well_formed = (
            len(clause) == 3
            and isinstance(clause[0], str)
            and clause[1] == 'deviation'
            and isinstance(clause[2], list)
            and len(clause[2]) == 1
            and isinstance(clause[2][0], str))
        if not well_formed:
            raise BQLError(bdb, 'Invalid nig_normal clause: %r' % (clause,))
        dev_var = clause[0]
        obs_var = clause[2][0]
        if not core.bayesdb_has_variable(bdb, population_id, None, obs_var):
            raise BQLError(bdb, 'No such variable: %r' % (obs_var,))
        obs_colno = core.bayesdb_variable_number(
            bdb, population_id, None, obs_var)
        # Register the deviation variable as a latent of this generator.
        dev_colno = core.bayesdb_add_latent(
            bdb, population_id, generator_id, dev_var, 'numerical')
        bdb.sql_execute(deviation_sql,
            (population_id, generator_id, dev_colno, obs_colno))
def initialize_models(self, bdb, generator_id, modelnos):
    """Insert a fresh (mu, sigma) row for every model in `modelnos`;
    parameter values are supplied by `_set_models`."""
    sql = ''' INSERT INTO bayesdb_nig_normal_model (population_id, generator_id, colno, modelno, mu, sigma) VALUES (:population_id, :generator_id, :colno, :modelno, :mu, :sigma) '''
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    self._set_models(bdb, population_id, generator_id, modelnos, sql)
def initialize_models(self, bdb, generator_id, modelnos):
    """Insert an initial (mu, sigma) row for every model in `modelnos`;
    parameter values are supplied by `_set_models`."""
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    insert_sample_sql = ''' INSERT INTO bayesdb_nig_normal_model (population_id, generator_id, colno, modelno, mu, sigma) VALUES (:population_id, :generator_id, :colno, :modelno, :mu, :sigma) '''
    self._set_models(bdb, population_id, generator_id, modelnos,
        insert_sample_sql)
def _convert_to_proper_stattype(self, bdb, generator_id, colno, value):
    """Coerce `value` into the form Loom expects for this column.

    Nominal values become their integer codes; everything else is cast
    to float.  None passes through unchanged.
    """
    if value is None:
        return None
    population_id = bayesdb_generator_population(bdb, generator_id)
    stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
    if _is_nominal(stattype):
        # Nominal values travel as their integer codes.
        return self._get_integer_form(bdb, generator_id, colno, value)
    return float(value)
def create_generator(self, bdb, generator_id, schema, **kwargs):
    """Create a loom generator: register it, ingest the population's
    data into a loom project, and persist the encoding and rowid
    mappings in the bdb."""
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # Store generator info in bdb.
    name = self._generate_name(bdb, generator_id)
    bdb.sql_execute(''' INSERT INTO bayesdb_loom_generator (generator_id, name, loom_store_path) VALUES (?, ?, ?) ''', (generator_id, name, self.loom_store_path))
    headers = []
    data = []
    data_by_column = {}
    # Pull every modeled column out of the base table, column-major.
    for colno in bayesdb_variable_numbers(bdb, population_id, None):
        column_name = bayesdb_variable_name(bdb, population_id, None, colno)
        headers.append(column_name)
        qt = sqlite3_quote_name(table)
        qcn = sqlite3_quote_name(column_name)
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcn, qt))
        col_data = [item for (item,) in cursor.fetchall()]
        data.append(col_data)
        data_by_column[column_name] = col_data
    # Transpose to row-major for the CSV file.
    data = [list(i) for i in zip(*data)]
    # Ingest data into loom.
    schema_file = self._data_to_schema(bdb, population_id, data_by_column)
    csv_file = self._data_to_csv(bdb, headers, data)
    project_path = self._get_loom_project_path(bdb, generator_id)
    loom.tasks.ingest(project_path, rows_csv=csv_file.name,
        schema=schema_file.name)
    # Store encoding info in bdb.
    self._store_encoding_info(bdb, generator_id)
    # Store rowid mapping in the bdb.  Loom rowids are the 0-based
    # positions of the table rowids in oid order.
    qt = sqlite3_quote_name(table)
    rowids = bdb.sql_execute('SELECT oid FROM %s' % (qt,)).fetchall()
    # NOTE(review): the VALUES list is spliced in as text rather than
    # bound parameters; this is safe only while all three fields are
    # integers.
    insertions = ','.join(
        str((generator_id, table_rowid, loom_rowid))
        for loom_rowid, (table_rowid,) in enumerate(rowids))
    bdb.sql_execute(''' INSERT INTO bayesdb_loom_rowid_mapping (generator_id, table_rowid, loom_rowid) VALUES %s ''' % (insertions,))
def create_generator(self, bdb, generator_id, schema, **kwargs):
    """Create a loom generator: register it, ingest the population's
    data into a loom project, and persist the encoding and rowid
    mappings in the bdb."""
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # Register the generator and its loom store location.
    name = self._generate_name(bdb, generator_id)
    bdb.sql_execute(''' INSERT INTO bayesdb_loom_generator (generator_id, name, loom_store_path) VALUES (?, ?, ?) ''', (generator_id, name, self.loom_store_path))
    # Pull every modeled column out of the base table, column-major.
    column_names = []
    columns = []
    data_by_column = {}
    for colno in bayesdb_variable_numbers(bdb, population_id, None):
        column_name = bayesdb_variable_name(bdb, population_id, None, colno)
        column_names.append(column_name)
        qt = sqlite3_quote_name(table)
        qcn = sqlite3_quote_name(column_name)
        values = [
            item for (item,)
            in bdb.sql_execute('SELECT %s FROM %s' % (qcn, qt)).fetchall()
        ]
        columns.append(values)
        data_by_column[column_name] = values
    # Transpose to row-major for the CSV file, then ingest into loom.
    rows = [list(r) for r in zip(*columns)]
    schema_file = self._data_to_schema(bdb, population_id, data_by_column)
    csv_file = self._data_to_csv(bdb, column_names, rows)
    loom.tasks.ingest(self._get_loom_project_path(bdb, generator_id),
        rows_csv=csv_file.name, schema=schema_file.name)
    # Persist loom's encodings in the bdb.
    self._store_encoding_info(bdb, generator_id)
    # Persist the mapping from table rowids to loom's 0-based rowids
    # (positions in oid order).
    table_rowids = bdb.sql_execute(
        'SELECT oid FROM %s' % (sqlite3_quote_name(table),)).fetchall()
    insertions = ','.join(
        str((generator_id, table_rowid, loom_rowid))
        for loom_rowid, (table_rowid,) in enumerate(table_rowids))
    bdb.sql_execute(''' INSERT INTO bayesdb_loom_rowid_mapping (generator_id, table_rowid, loom_rowid) VALUES %s ''' % (insertions,))
def _data(self, bdb, generator_id, vars):
    """Return the data for `vars` as a list of row tuples, with each
    value cast to its stattype's SQL affinity and then mapped to a
    numeric code, restricted to the rows this generator incorporated."""
    # Get the column numbers and statistical types.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    colnos = [
        core.bayesdb_variable_number(bdb, population_id, generator_id, var)
        for var in vars
    ]
    # NOTE(review): other call sites in this file pass four arguments to
    # bayesdb_variable_stattype (a generator id or None before colno);
    # confirm this three-argument call matches the core API in use.
    stattypes = [
        core.bayesdb_variable_stattype(bdb, population_id, colno)
        for colno in colnos
    ]
    # Get the table name, quoted for constructing SQL.
    table_name = core.bayesdb_generator_table(bdb, generator_id)
    qt = sqlite3_quote_name(table_name)
    # Create SQL expressions to cast each variable to the correct
    # affinity for its statistical type.
    def cast(var, colno, stattype):
        # colno < 0 has no base-table column (presumably a latent
        # variable -- TODO confirm); select NULL for it.
        if colno < 0:
            return 'NULL'
        qv = sqlite3_quote_name(var)
        affinity = core.bayesdb_stattype_affinity(bdb, stattype)
        qa = sqlite3_quote_name(affinity)
        return 'CAST(t.%s AS %s)' % (qv, qa)
    qexpressions = ','.join(map(cast, vars, colnos, stattypes))
    # Get a cursor over the incorporated rows, in rowid order.
    cursor = bdb.sql_execute(''' SELECT %s FROM %s AS t, bayesdb_cgpm_individual AS ci WHERE ci.generator_id = ? AND ci.table_rowid = t._rowid_ ORDER BY t._rowid_ ASC ''' % (qexpressions, qt), (generator_id,))
    # Map values to codes.
    def map_value(colno, value):
        return self._to_numeric(bdb, generator_id, colno, value)
    return [
        tuple(map_value(colno, x) for colno, x in zip(colnos, row))
        for row in cursor
    ]
def _store_encoding_info(self, bdb, generator_id):
    """Read loom's ingest encoding file and persist the string-to-code
    mappings and the column ordering in the bdb."""
    encoding_path = os.path.join(
        self._get_loom_project_path(bdb, generator_id),
        'ingest', 'encoding.json.gz')
    with gzip.open(encoding_path) as encoding_file:
        encoding = json.loads(encoding_file.read().decode('ascii'))
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # Store string encoding.
    insert_string_encoding = ''' INSERT INTO bayesdb_loom_string_encoding (generator_id, colno, string_form, integer_form) VALUES (:generator_id, :colno, :string_form, :integer_form) '''
    for col in encoding:
        # Only columns with a 'symbols' table have string encodings.
        if 'symbols' in col:
            colno = bayesdb_table_column_number(bdb, table, str(col['name']))
            for string_form, integer_form in col['symbols'].iteritems():
                bdb.sql_execute(insert_string_encoding, {
                    'generator_id': generator_id,
                    'colno': colno,
                    'string_form': string_form,
                    'integer_form': integer_form
                })
    # Store ordering of columns: the rank is the column's position in
    # the encoding file.
    insert_order_sql = ''' INSERT INTO bayesdb_loom_column_ordering (generator_id, colno, rank) VALUES (:generator_id, :colno, :rank) '''
    for col_index in xrange(len(encoding)):
        colno = bayesdb_table_column_number(
            bdb, table, str(encoding[col_index]['name']))
        bdb.sql_execute(insert_order_sql, {
            'generator_id': generator_id,
            'colno': colno,
            'rank': col_index
        })
def _store_encoding_info(self, bdb, generator_id):
    """Read loom's ingest encoding file and persist the string-to-code
    mappings and the column ordering in the bdb."""
    encoding_path = os.path.join(
        self._get_loom_project_path(bdb, generator_id),
        'ingest', 'encoding.json.gz')
    with gzip.open(encoding_path) as f:
        encoding = json.loads(f.read().decode('ascii'))
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # Persist string encodings for columns that have a symbol table.
    string_sql = ''' INSERT INTO bayesdb_loom_string_encoding (generator_id, colno, string_form, integer_form) VALUES (:generator_id, :colno, :string_form, :integer_form) '''
    for col in encoding:
        if 'symbols' not in col:
            continue
        colno = bayesdb_table_column_number(bdb, table, str(col['name']))
        for symbol, code in col['symbols'].iteritems():
            bdb.sql_execute(string_sql, {
                'generator_id': generator_id,
                'colno': colno,
                'string_form': symbol,
                'integer_form': code,
            })
    # Persist the column ordering: rank is the column's position in the
    # encoding file.
    order_sql = ''' INSERT INTO bayesdb_loom_column_ordering (generator_id, colno, rank) VALUES (:generator_id, :colno, :rank) '''
    for rank, col in enumerate(encoding):
        colno = bayesdb_table_column_number(bdb, table, str(col['name']))
        bdb.sql_execute(order_sql, {
            'generator_id': generator_id,
            'colno': colno,
            'rank': rank,
        })
def _store_kind_partition(self, bdb, generator_id, modelnos):
    """Persist each model's column-to-kind and row-to-cluster
    assignments from loom into the bdb, replacing any stale rows."""
    population_id = bayesdb_generator_population(bdb, generator_id)
    if modelnos is None:
        modelnos = range(self._get_num_models(bdb, generator_id))
    with bdb.savepoint():
        for modelno in modelnos:
            column_partition = self._retrieve_column_partition(
                bdb, generator_id, modelno)
            # Bulk insertion of mapping from colno to kind_id.  Loom
            # keys the partition by column rank, so translate colno to
            # rank first.
            colnos = bayesdb_variable_numbers(bdb, population_id, None)
            ranks = [self._get_loom_rank(bdb, generator_id, colno)
                for colno in colnos]
            # NOTE(review): the VALUES tuples are spliced in as text;
            # safe only while all fields are integers.
            insertions = ','.join(
                str((generator_id, modelno, colno, column_partition[rank]))
                for colno, rank in zip(colnos, ranks)
            )
            bdb.sql_execute(''' INSERT OR REPLACE INTO bayesdb_loom_column_kind_partition (generator_id, modelno, colno, kind_id) VALUES %s ''' % (insertions,))
            # Bulk insertion of mapping from (kind_id, rowid) to cluster_id.
            row_partition = self._retrieve_row_partition(
                bdb, generator_id, modelno)
            rowids = bdb.sql_execute(''' SELECT table_rowid, loom_rowid FROM bayesdb_loom_rowid_mapping ''').fetchall()
            # Each kind's cluster list is assumed to be parallel to the
            # rowid mapping order -- TODO confirm.
            insertions = ','.join(
                str((generator_id, modelno, rowid[0], rowid[1], kind_id,
                    partition_id))
                for kind_id in row_partition
                for rowid, partition_id
                in zip(rowids, row_partition[kind_id]))
            bdb.sql_execute(''' INSERT OR REPLACE INTO bayesdb_loom_row_kind_partition (generator_id, modelno, table_rowid, loom_rowid, kind_id, partition_id) VALUES %s ''' % (insertions,))
def _store_kind_partition(self, bdb, generator_id, modelnos):
    """Persist each model's column-to-kind and row-to-cluster
    assignments from loom into the bdb, replacing any stale rows."""
    population_id = bayesdb_generator_population(bdb, generator_id)
    if modelnos is None:
        modelnos = range(self._get_num_models(bdb, generator_id))
    with bdb.savepoint():
        for modelno in modelnos:
            column_partition = self._retrieve_column_partition(
                bdb, generator_id, modelno)
            # Bulk insertion of mapping from colno to kind_id.  Loom
            # keys the partition by column rank, so translate colno to
            # rank first.
            colnos = bayesdb_variable_numbers(bdb, population_id, None)
            ranks = [
                self._get_loom_rank(bdb, generator_id, colno)
                for colno in colnos
            ]
            # NOTE(review): the VALUES tuples are spliced in as text;
            # safe only while all fields are integers.
            insertions = ','.join(
                str((generator_id, modelno, colno, column_partition[rank]))
                for colno, rank in zip(colnos, ranks))
            bdb.sql_execute(''' INSERT OR REPLACE INTO bayesdb_loom_column_kind_partition (generator_id, modelno, colno, kind_id) VALUES %s ''' % (insertions,))
            # Bulk insertion of mapping from (kind_id, rowid) to cluster_id.
            row_partition = self._retrieve_row_partition(
                bdb, generator_id, modelno)
            rowids = bdb.sql_execute(''' SELECT table_rowid, loom_rowid FROM bayesdb_loom_rowid_mapping ''').fetchall()
            # Each kind's cluster list is assumed to be parallel to the
            # rowid mapping order -- TODO confirm.
            insertions = ','.join(
                str((generator_id, modelno, rowid[0], rowid[1], kind_id,
                    partition_id))
                for kind_id in row_partition
                for rowid, partition_id
                in zip(rowids, row_partition[kind_id]))
            bdb.sql_execute(''' INSERT OR REPLACE INTO bayesdb_loom_row_kind_partition (generator_id, modelno, table_rowid, loom_rowid, kind_id, partition_id) VALUES %s ''' % (insertions,))
def _convert_to_proper_stattype(self, bdb, generator_id, colno, value):
    """Coerce `value` into the form Loom expects for this column.

    Nominal values become their integer codes, countable values are
    truncated to int, continuous values are cast to float.  None passes
    through unchanged.
    """
    if value is None:
        return None
    population_id = bayesdb_generator_population(bdb, generator_id)
    stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
    if _is_nominal(stattype):
        # Nominal values travel as their integer codes.
        return self._get_integer_form(bdb, generator_id, colno, value)
    if _is_countable(stattype):
        # XXX Truncates: a count of 2.4 evaluates to 2.  Still better
        # than a StopIteration error coming from Loom.
        return int(value)
    if _is_continuous(stattype):
        return float(value)
    assert False, 'Unknown stattype'
def _reorder_row(self, bdb, generator_id, row, dense=True):
    """Reorder a row of values into loom's column order.

    `row` is a sequence of values, taken to correspond to colnos
    1..len(row).  Returns (column label, value-as-string) pairs in loom
    order; when `dense` is False, pairs whose value is missing are
    dropped.
    """
    labels = self._get_ordered_column_labels(bdb, generator_id)
    ordered = OrderedDict((label, None) for label in labels)
    population_id = bayesdb_generator_population(bdb, generator_id)
    for colno, value in enumerate(row, 1):
        name = bayesdb_variable_name(bdb, population_id, None, colno)
        ordered[name] = str(value)
    if dense is False:
        return [(label, value) for (label, value) in ordered.iteritems()
            if value is not None]
    return ordered.iteritems()
def _initialize_engine(self, bdb, generator_id, n, variables):
    """Construct a cgpm Engine with `n` states over `variables`.

    `variables` is a list of (name, stattype, cctype, distargs)
    tuples.  Raises `BQLError` if any modeled column contains only
    null values.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    def map_var(var):
        # Resolve a variable name to its colno within this generator.
        return core.bayesdb_variable_number(
            bdb, population_id, generator_id, var)
    # If no variables in the population modeled by the gpmcc, then create 1
    # dummy variable with one measurement.  The design space for how to
    # refactor cgpm.crosscat.State to initialize without any variables is
    # not simple, so we will live with this workaround for now.
    if not variables:
        (outputs, cctypes, distargs, gpmcc_data) = \
            [7**10], ['bernoulli'], [None], [[0]]
    else:
        outputs = [map_var(var) for var, _st, _cct, _da in variables]
        cctypes = [cctype for _n, _st, cctype, _da in variables]
        distargs = [distargs for _n, _st, _cct, distargs in variables]
        gpmcc_vars = [var for var, _stattype, _dist, _params in variables]
        gpmcc_data = self._data(bdb, generator_id, gpmcc_vars)
        # If gpmcc_data has any column which is all null, then crash early
        # and notify the user of all offending column names.
        n_rows = len(gpmcc_data[0])
        nulls = [
            v for i, v in enumerate(gpmcc_vars)
            if all(math.isnan(gpmcc_data[r][i]) for r in xrange(n_rows))
        ]
        if nulls:
            raise BQLError(bdb, 'Failed to initialize, '
                'columns have all null values: %s' % repr(nulls))
    return Engine(
        gpmcc_data, num_states=n, rng=bdb.np_prng,
        multiprocess=self._ncpu, outputs=outputs, cctypes=cctypes,
        distargs=distargs)
def _convert_to_proper_stattype(self, bdb, generator_id, colno, value):
    """Convert a value returned by the logpdf_joint method parameters
    into a form that Loom can handle.  For instance, convert from an
    integer to real or, from a string to an integer.

    None passes through unchanged.
    """
    if value is None:
        return value
    population_id = bayesdb_generator_population(bdb, generator_id)
    stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
    # If nominal, then return the integer code.
    if _is_nominal(stattype):
        return self._get_integer_form(bdb, generator_id, colno, value)
    # If countable, then return value as integer.
    elif _is_countable(stattype):
        # XXX This is going to cause a counts of 2.4 to evaluate to 2.
        # Better than having a StopIteration error coming from Loom.
        return int(value)
    # If continuous, return the value as float.
    elif _is_continuous(stattype):
        return float(value)
    else:
        assert False, 'Unknown stattype'
def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
        colnos1, constraints, numsamples):
    """Estimate the mutual information between two sets of columns via
    loom's preql query server."""
    # XXX Why are the constraints being ignored?  If Loom does not
    # support conditioning, then implement constraints using the simple
    # Monte Carlo estimator.
    population_id = bayesdb_generator_population(bdb, generator_id)
    def _names(colnos):
        return [
            str(bayesdb_variable_name(bdb, population_id, None, c))
            for c in colnos
        ]
    server = self._get_cache_entry(bdb, generator_id, 'preql_server')
    target_set = server._cols_to_mask(server.encode_set(_names(colnos0)))
    query_set = server._cols_to_mask(server.encode_set(_names(colnos1)))
    return server._query_server.mutual_information(
        target_set, query_set, entropys=None,
        sample_count=loom.preql.SAMPLE_COUNT)
def _initialize_cgpm(self, bdb, generator_id, cgpm_ext):
    """Instantiate the registered CGPM described by `cgpm_ext` and
    incorporate the generator's data into it.

    Raises `BQLError` if the named CGPM is not in the registry.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    def map_var(var):
        # Resolve a variable name to its colno within this generator.
        return core.bayesdb_variable_number(
            bdb, population_id, generator_id, var)
    name = cgpm_ext['name']
    outputs = map(map_var, cgpm_ext['outputs'])
    inputs = map(map_var, cgpm_ext['inputs'])
    args = cgpm_ext.get('args', ())
    kwds = cgpm_ext.get('kwds', {})
    if name not in self._cgpm_registry:
        raise BQLError(bdb, 'Unknown CGPM: %s' % (repr(name),))
    cls = self._cgpm_registry[name]
    # Fetch outputs followed by inputs, in one pass.
    cgpm_vars = cgpm_ext['outputs'] + cgpm_ext['inputs']
    cgpm_data = self._data(bdb, generator_id, cgpm_vars)
    cgpm = cls(outputs, inputs, rng=bdb.np_prng, *args, **kwds)
    for cgpm_rowid, row in enumerate(cgpm_data):
        # CGPMs do not uniformly handle null values or missing
        # values sensibly yet, so until we have that sorted
        # out we both (a) omit nulls and (b) ignore errors in
        # incorporate.
        query = {
            colno: row[i]
            for i, colno in enumerate(outputs)
            if not math.isnan(row[i])
        }
        n = len(outputs)
        evidence = {
            colno: row[n + i]
            for i, colno in enumerate(inputs)
            if not math.isnan(row[n + i])
        }
        try:
            cgpm.incorporate(cgpm_rowid, query, evidence)
        except Exception:
            # Deliberate best-effort: see comment above.
            pass
    return cgpm
def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
        colnos1, constraints, numsamples):
    """Estimate the mutual information between two sets of columns,
    conditioned on `constraints`, averaged over the conditioning rows.

    Returns a one-element list containing the mean of loom's estimates.
    """
    population_id = bayesdb_generator_population(bdb, generator_id)
    # NOTE(review): generator_id is passed to bayesdb_variable_name here
    # while other loom helpers in this file pass None -- presumably so
    # this generator's latent variables resolve; confirm.
    colnames0 = [
        str(bayesdb_variable_name(bdb, population_id, generator_id, colno))
        for colno in colnos0
    ]
    colnames1 = [
        str(bayesdb_variable_name(bdb, population_id, generator_id, colno))
        for colno in colnos1
    ]
    server = self._get_preql_server(bdb, generator_id)
    target_set = server._cols_to_mask(server.encode_set(colnames0))
    query_set = server._cols_to_mask(server.encode_set(colnames1))
    if self._marginize_cmi(constraints):
        # Average over sampled conditioning rows.
        inner_numsamples = numsamples
        conditioning_rows_loom_format = self._get_constraint_rows(
            constraints, bdb, generator_id, population_id, modelnos,
            server, inner_numsamples)
    else:
        conditioning_rows_loom_format = [
            self._get_constraint_row(
                constraints, bdb, generator_id, population_id, server)
        ]
    mi_estimates = [
        server._query_server.mutual_information(
            target_set, query_set, entropys=None,
            sample_count=loom.preql.SAMPLE_COUNT,
            conditioning_row=conditioning_row_loom_format
        ).mean
        for conditioning_row_loom_format in conditioning_rows_loom_format
    ]
    # Output requires an iterable.
    return [arithmetic_mean(mi_estimates)]
def predict_confidence(self, bdb, generator_id, modelno, colno, rowid,
        numsamples=None):
    """Predict a value for `colno` at `rowid`, with a confidence score.

    Returns a (prediction, confidence) pair.  Categorical variables are
    imputed with the sample mode and the confidence is the mode's
    relative frequency; numerical variables are imputed with the sample
    mean and the confidence is punted (always 0).
    """
    if not numsamples:
        numsamples = 2
    assert numsamples > 0

    def _impute_categorical(sample):
        # Mode of the sampled values; confidence is its frequency.
        counts = Counter(s[0] for s in sample)
        mode_count = max(counts[v] for v in counts)
        # next() instead of Py2-only iterator .next() -- works on both.
        pred = next(v for v in counts if counts[v] == mode_count)
        conf = float(mode_count) / numsamples
        return pred, conf

    def _impute_numerical(sample):
        pred = sum(s[0] for s in sample) / float(len(sample))
        conf = 0    # XXX Punt confidence for now
        return pred, conf

    constraints = []
    # If rowid is a hypothetical cell for cgpm (did not exist at the time
    # of INITIALIZE), but exists in the base table (by INSERT INTO), then
    # retrieve all values for rowid as the constraints.
    exists = rowid < core.bayesdb_generator_fresh_row_id(bdb, generator_id)
    max_cgpm_rowid = bdb.sql_execute('''
        SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
    ''', (generator_id,)).fetchall()[0][0]
    hypothetical = rowid > max_cgpm_rowid
    if exists and hypothetical:
        population_id = core.bayesdb_generator_population(
            bdb, generator_id)
        # Retrieve all other variables except colno, and ignore latents in
        # generator_id, and place them in the constraints.
        pop_names = core.bayesdb_variable_names(bdb, population_id, None)
        avoid_name = core.bayesdb_variable_name(bdb, population_id, colno)
        constraints_names = [n for n in pop_names if n != avoid_name]
        # Obtain the row.
        qt_names = str.join(',', map(sqlite3_quote_name, constraints_names))
        qt_table = sqlite3_quote_name(
            core.bayesdb_population_table(bdb, population_id))
        data = bdb.sql_execute('''
            SELECT %s FROM %s WHERE oid = ?
        ''' % (qt_names, qt_table,), (rowid,)).fetchall()[0]
        # Build the constraints.  Skip only missing (NULL) cells: the
        # previous `and v` filter also dropped legitimate falsy values
        # such as 0, silently leaving those cells unconstrained.
        pop_nos = core.bayesdb_variable_numbers(bdb, population_id, None)
        constraints_nos = [n for n in pop_nos if n != colno]
        assert len(data) == len(constraints_nos)
        constraints = [
            (rowid, c, v) for c, v in zip(constraints_nos, data)
            if v is not None
        ]
    # Retrieve the samples.
    sample = self.simulate_joint(
        bdb, generator_id, [(rowid, colno)], constraints, modelno,
        numsamples)
    # Determine the imputation strategy (mode or mean).
    stattype = core.bayesdb_variable_stattype(
        bdb, core.bayesdb_generator_population(bdb, generator_id), colno)
    if _is_categorical(stattype):
        return _impute_categorical(sample)
    else:
        return _impute_numerical(sample)
def analyze_models(self, bdb, generator_id, modelnos=None, iterations=1,
        max_seconds=None, ckpt_iterations=None, ckpt_seconds=None,
        program=None):
    """Run cgpm inference transitions for this generator.

    Parses `program` as an ANALYZE program to decide which variables to
    transition (VARIABLES/SKIP) and whether to use the optimized lovecat
    backend (OPTIMIZED), runs the transitions, and stores the updated
    serialized engine back into `bayesdb_cgpm_generator`.

    Checkpoint control (`ckpt_iterations`/`ckpt_seconds`) is not
    supported; `modelnos` must be None (all models are analyzed).
    """
    assert modelnos is None
    if ckpt_iterations is not None or ckpt_seconds is not None:
        # XXX
        raise NotImplementedError(
            'CGpm analysis checkpoint not supported.')
    if program is None:
        program = []
    population_id = core.bayesdb_generator_population(bdb, generator_id)

    def retrieve_analyze_variables(ast):
        # Returns (varnos, seen_optimized): the column numbers to
        # transition, and whether an OPTIMIZED clause was seen.
        # Transition all variables by default.
        variables = None
        # Exactly 1 VARIABLES or SKIP clause supported for simplicity.
        seen_variables, seen_skip, seen_optimized = False, False, False
        for clause in ast:
            # Transition user specified variables only.
            if isinstance(clause, cgpm_analyze.parse.Variables):
                if seen_variables or seen_skip:
                    raise BQLError(bdb,
                        'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
                seen_variables = True
                included = set()
                unknown = set()
                for var in clause.vars:
                    if not core.bayesdb_has_variable(
                            bdb, population_id, generator_id, var):
                        unknown.add(var)
                    included.add(var)
                if unknown:
                    raise BQLError(bdb,
                        'Unknown variables in ANALYZE: %r'
                        % (sorted(unknown),))
                variables = sorted(included)
            # Transition all variables except user specified skip.
            elif isinstance(clause, cgpm_analyze.parse.Skip):
                if seen_variables or seen_skip:
                    raise BQLError(bdb,
                        'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
                seen_skip = True
                excluded = set()
                unknown = set()
                for var in clause.vars:
                    if not core.bayesdb_has_variable(
                            bdb, population_id, generator_id, var):
                        unknown.add(var)
                    excluded.add(var)
                if unknown:
                    raise BQLError(bdb,
                        'Unknown variables in ANALYZE: %r'
                        % (sorted(unknown),))
                all_vars = core.bayesdb_variable_names(
                    bdb, population_id, generator_id)
                variables = sorted(set(all_vars) - excluded)
            elif isinstance(clause, cgpm_analyze.parse.Optimized):
                seen_optimized = True
            # Unknown/impossible clause.
            else:
                raise ValueError('Unknown clause in ANALYZE: %s.' % ast)
        if variables is None:
            variables = core.bayesdb_variable_names(
                bdb, population_id, generator_id)
        varnos = [
            core.bayesdb_variable_number(bdb, population_id, generator_id, v)
            for v in variables
        ]
        # TODO Perform error checking if the OPTIMIZED clause is used.
        # In particular, the variables in OPTIMIZED must correspond
        # EXACTLY to the variables that are modeled by the CrossCat
        # baseline. Avoided this check for now since the nature of a
        # variable is not stored in the bdb. For now, just check the
        # user did not include a VARIABLES clause.
        if seen_optimized:
            if seen_variables:
                raise BQLError(bdb, 'OPTIMIZED incompatible with VARIABLES')
            # TODO Check if varnos are exactly the CrossCat variables.
            # raise BQLError(bdb,
            #     'The OPTIMIZED phrase in ANALYZE must target all the '
            #     'variables modeled by the baseline, only. '
            #     'Use SKIP to explicitly ignore analysis of overriden '
            #     'variables')
        return varnos, seen_optimized

    # Retrieve target variables and whether optimized.
    analyze_ast = cgpm_analyze.parse.parse(program)
    varnos, optimized = retrieve_analyze_variables(analyze_ast)
    engine = self._engine(bdb, generator_id)
    # OPTIMIZED uses the lovecat backend over all variables; otherwise
    # transition only the selected columns.
    if optimized:
        engine.transition_lovecat(
            N=iterations, S=max_seconds, multiprocess=self._ncpu)
    else:
        engine.transition(
            N=iterations, S=max_seconds, cols=varnos,
            multiprocess=self._ncpu)
    # Serialize the engine.
    engine_json = json_dumps(engine.to_metadata())
    # Update the engine.
    bdb.sql_execute('''
        UPDATE bayesdb_cgpm_generator SET engine_json = :engine_json
            WHERE generator_id = :generator_id
    ''', {
        'generator_id': generator_id,
        'engine_json': engine_json
    })
def _retrieve_analyze_variables(bdb, generator_id, ast):
    """Resolve which variables an ANALYZE program targets.

    Returns a (variable_numbers, seen_optimized) pair, where
    variable_numbers is None when all variables should be transitioned.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)

    def require_known(varnames):
        # Raise unless every name is a variable of this generator.
        unknown = set(
            var for var in varnames
            if not core.bayesdb_has_variable(
                bdb, population_id, generator_id, var))
        if unknown:
            raise BQLError(bdb,
                'Unknown variables in ANALYZE: %r' % (sorted(unknown), ))

    def require_single_clause(seen_variables, seen_skip):
        # Exactly 1 VARIABLES or SKIP clause supported for simplicity.
        if seen_variables or seen_skip:
            raise BQLError(bdb,
                'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')

    # Transitions all variables by default.
    variables = None
    seen_variables = False
    seen_skip = False
    seen_optimized = False
    for clause in ast:
        if isinstance(clause, cgpm_analyze.parse.Variables):
            # Transition user specified variables only.
            require_single_clause(seen_variables, seen_skip)
            seen_variables = True
            require_known(clause.vars)
            variables = sorted(set(clause.vars))
        elif isinstance(clause, cgpm_analyze.parse.Skip):
            # Transition all variables except user specified skip.
            require_single_clause(seen_variables, seen_skip)
            seen_skip = True
            require_known(clause.vars)
            all_vars = core.bayesdb_variable_names(
                bdb, population_id, generator_id)
            variables = sorted(set(all_vars) - set(clause.vars))
        elif isinstance(clause, cgpm_analyze.parse.Optimized):
            # OPTIMIZED is incompatible with any other clause.
            seen_optimized = True
        else:
            # Unknown/impossible clause.
            raise BQLError(bdb, 'Unknown clause in ANALYZE: %s.' % (ast, ))
    # OPTIMIZED is incompatible with any other clause.
    if seen_optimized and (seen_variables or seen_skip):
        raise BQLError(bdb, 'OPTIMIZED incompatible with other clauses.')
    variable_numbers = None
    if variables:
        variable_numbers = [
            core.bayesdb_variable_number(bdb, population_id, generator_id, v)
            for v in variables
        ]
    return (variable_numbers, seen_optimized)
def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
        constraints, num_samples=1, accuracy=None):
    """Simulate `targets` jointly, conditioned on `constraints`.

    When `rowid` exists in the base table, its observed cells are added
    to the constraints.  Returns a list of `num_samples` rows, each a
    list of simulated values in the order of `targets`.
    """
    # Retrieve the population id.
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # Prepare list of full constraints, potentially adding data from table.
    constraints_full = constraints
    # If rowid exist in base table, retrieve conditioning data.
    # Conditioning values are fetched for any rowid that exists in the base
    # table irrespective of whether the rowid is incorporated in the Loom
    # model or whether it was added after creation.
    if bayesdb_table_has_rowid(bdb, table, rowid):
        # Fetch population column numbers and row values.
        colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
        rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
        observations = [(colno, rowval)
            for colno, rowval in zip(colnos, rowvals)
            if rowval is not None and colno not in targets]
        # Raise error if a constraint overrides an observed cell.
        colnos_constrained = [constraint[0] for constraint in constraints]
        colnos_observed = [observation[0] for observation in observations]
        if set.intersection(set(colnos_constrained), set(colnos_observed)):
            raise BQLError(bdb, 'Overlap between constraints and'
                ' target row in simulate.')
        # Update the constraints.
        constraints_full = constraints + observations
    # Store mapping from target column name to column number and stattype.
    # XXX Fixed: these lookups previously passed generator_id where
    # population_id belongs (cf. the stattype lookup below and
    # _get_ordered_column_labels), yielding names for the wrong entity.
    target_colno_to_name = {
        colno: bayesdb_variable_name(bdb, population_id, None, colno)
        for colno in targets
    }
    target_colno_to_stattype = {
        colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
        for colno in targets
    }
    # Construct the CSV row for targets.
    row_targets = {target_colno_to_name[colno]: '' for colno in targets}
    row_constraints = {
        bayesdb_variable_name(bdb, population_id, None, colno): value
        for colno, value in constraints_full
    }
    row = dict(
        itertools.chain(row_targets.iteritems(),
                        row_constraints.iteritems()))
    # Fetch the server.
    server = self._get_preql_server(bdb, generator_id)
    # Prepare the csv header and values.
    csv_headers = map(str, row.iterkeys())
    csv_values = map(str, row.itervalues())
    # Prepare streams for the server.
    outfile = StringIO()
    writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
    reader = iter([csv_headers] + [csv_values])
    # Obtain the prediction.
    server._predict(reader, num_samples, writer, False)
    # Parse the CSV output.
    output_csv = writer.result()
    output_rows = output_csv.strip().split('\r\n')
    # Extract the header of the CSV file.
    header = output_rows[0].split(CSV_DELIMITER)
    # Extract list of simulated rows.  Each simulated row is represented
    # as a dictionary mapping column name to its simulated value.
    simulated_rows = [
        dict(zip(header, line.split(CSV_DELIMITER)))
        for line in output_rows[1:]
    ]
    # Prepare the return list of simulated_rows.
    def _extract_simulated_value(simulated_row, colno):
        colname = target_colno_to_name[colno]
        stattype = target_colno_to_stattype[colno]
        value = simulated_row[colname]
        return value if _is_nominal(stattype) else float(value)
    # Return the list of samples.
    return [
        [_extract_simulated_value(simulated_row, colno) for colno in targets]
        for simulated_row in simulated_rows
    ]
def _create_schema(bdb, generator_id, schema_ast, **kwargs):
    """Build the CGPM schema dict for a generator from its schema AST.

    Processes Basic, Latent, Foreign, and Subsample clauses (after
    normalizing Foreign.exposed lists and a non-crosscat baseline into
    clauses), assigns default distributions to unmentioned variables,
    and fills in the deferred input/output stattype slots of foreign
    models.  Returns a dict with keys 'variables', 'cgpm_composition',
    'subsample', and 'latents'.  Raises BQLError on duplicate, unknown,
    or unmodellable variables, or unknown statistical types.
    """
    # Get some parameters.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)

    # State.
    variables = []
    variable_dist = {}
    latents = {}
    cgpm_composition = []
    modelled = set()
    default_modelled = set()
    subsample = None
    # deferred_input/deferred_output record slots (list, list, index)
    # to be filled in once the variable's distribution is known.
    deferred_input = defaultdict(lambda: [])
    deferred_output = dict()

    # Error-reporting state.
    duplicate = set()
    unknown = set()
    needed = set()
    existing_latent = set()
    must_exist = []
    unknown_stattype = {}

    # XXX Convert all Foreign.exposed lists to Latent clauses.
    # Retrieve Foreign clauses with exposed variables.
    foreign_clauses = [
        c for c in schema_ast
        if isinstance(c, cgpm_schema.parse.Foreign) and len(c.exposed) > 0
    ]
    # Add the exposed variables to Foreign.outputs
    # Note that this assumes if there are K exposed variables, then they are
    # necessarily the last K outputs of the fc.outputs.
    for fc in foreign_clauses:
        fc.outputs.extend([e[0] for e in fc.exposed])

    # Convert exposed entries into Latent clauses.
    latent_vars = list(itertools.chain.from_iterable(
        c.exposed for c in foreign_clauses))
    latent_clauses = [cgpm_schema.parse.Latent(v, s) for (v, s) in latent_vars]
    # Append the Latent clauses to the ast.
    schema_ast.extend(latent_clauses)

    # XXX Convert the baseline to a Foreign clause.
    # Currently the baselines do not accept a schema, and will fail if
    # `schema_ast` has any entries.
    baseline = kwargs.get('baseline', None)
    if baseline is not None and casefold(baseline.name) != 'crosscat':
        if schema_ast:
            raise BQLError(bdb,
                'Cannot accept schema with baseline: %s.' % schema_ast)
        # Retrieve all variable names in the population
        outputs = core.bayesdb_variable_names(bdb, population_id, None)
        # Convert the LITERAL namedtuples to their raw values.
        ps, vs = zip(*baseline.params)
        vs_new = [v.value for v in vs]
        params = zip(ps, vs_new)
        # Create the clause.
        clause = cgpm_schema.parse.Foreign(
            outputs, [], [], baseline.name, params)
        # And append it to the schema_ast.
        schema_ast.append(clause)

    # Process each clause one by one.
    for clause in schema_ast:

        if isinstance(clause, cgpm_schema.parse.Basic):
            # Basic Crosscat component model: one variable to be put
            # into Crosscat views.
            var = clause.var
            dist = clause.dist
            params = dict(clause.params)    # XXX error checking

            # Reject if the variable does not exist.
            if not core.bayesdb_has_variable(bdb, population_id, None, var):
                unknown.add(var)
                continue

            # Reject if the variable has already been modelled.
            if var in modelled:
                duplicate.add(var)
                continue

            # Reject if the variable is latent.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Get the column number.
            colno = core.bayesdb_variable_number(bdb, population_id, None,
                var)
            assert 0 <= colno

            # Add it to the list and mark it modelled by default.
            stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            variables.append([var, stattype, dist, params])
            assert var not in variable_dist
            variable_dist[var] = (stattype, dist, params)
            modelled.add(var)
            default_modelled.add(var)

        elif isinstance(clause, cgpm_schema.parse.Latent):
            var = clause.name
            stattype = clause.stattype

            # Reject if the variable has already been modelled by the
            # default model.
            if var in default_modelled:
                duplicate.add(var)
                continue

            # Reject if the variable even *exists* in the population
            # at all yet.
            if core.bayesdb_has_variable(bdb, population_id, None, var):
                duplicate.add(var)
                continue

            # Reject if the variable is already latent, from another
            # generator.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Reject if we've already processed it.
            if var in latents:
                duplicate.add(var)
                continue

            # Add it to the set of latent variables.
            latents[var] = stattype

        elif isinstance(clause, cgpm_schema.parse.Foreign):
            # Foreign model: some set of output variables is to be
            # modelled by foreign logic, possibly conditional on some
            # set of input variables.
            #
            # Gather up the state for a cgpm_composition record, which
            # we may have to do incrementally because it must refer to
            # the distribution types of variables we may not have
            # seen.
            name = clause.name
            outputs = clause.outputs
            inputs = clause.inputs

            # These lists are mutated in place later through the
            # deferred_input/deferred_output slot records; distargs
            # holds references to them.
            output_stattypes = []
            output_statargs = []
            input_stattypes = []
            input_statargs = []
            distargs = {
                'inputs': {
                    'stattypes': input_stattypes,
                    'statargs': input_statargs
                },
                'outputs': {
                    'stattypes': output_stattypes,
                    'statargs': output_statargs,
                }
            }
            kwds = {'distargs': distargs}
            kwds.update(clause.params)

            # First make sure all the output variables exist and have
            # not yet been modelled.
            for var in outputs:
                must_exist.append(var)
                if var in modelled:
                    duplicate.add(var)
                    continue
                modelled.add(var)
                # Add the output statistical type and its parameters.
                i = len(output_stattypes)
                assert i == len(output_statargs)
                output_stattypes.append(None)
                output_statargs.append(None)
                deferred_output[var] = (output_stattypes, output_statargs, i)

            # Next make sure all the input variables exist, mark them
            # needed, and record where to put their distribution type
            # and parameters.
            for var in inputs:
                must_exist.append(var)
                needed.add(var)
                i = len(input_stattypes)
                assert i == len(input_statargs)
                input_stattypes.append(None)
                input_statargs.append(None)
                deferred_input[var].append(
                    (input_stattypes, input_statargs, i))

            # Finally, add a cgpm_composition record.
            cgpm_composition.append({
                'name': name,
                'inputs': inputs,
                'outputs': outputs,
                'kwds': kwds,
            })

        elif isinstance(clause, cgpm_schema.parse.Subsample):
            if subsample is not None:
                raise BQLError(bdb,
                    'Duplicate subsample: %r' % (clause.n, ))
            subsample = clause.n

        else:
            raise BQLError(bdb, 'Unknown clause: %r' % (clause, ))

    # Make sure all the outputs and inputs exist, either in the
    # population or as latents in this generator.
    for var in must_exist:
        if core.bayesdb_has_variable(bdb, population_id, None, var):
            continue
        if var in latents:
            continue
        unknown.add(var)

    # Raise an exception if there were duplicates or unknown
    # variables.
    if duplicate:
        raise BQLError(bdb,
            'Duplicate model variables: %r' % (sorted(duplicate), ))
    if existing_latent:
        raise BQLError(bdb,
            'Latent variables already defined: %r'
            % (sorted(existing_latent), ))
    if unknown:
        raise BQLError(bdb,
            'Unknown model variables: %r' % (sorted(unknown), ))

    def default_dist(var, stattype):
        # Return (dist, params) for stattype's default distribution, or
        # None (recording the stattype as unknown) if there is none.
        stattype = casefold(stattype)
        if stattype not in _DEFAULT_DIST:
            if var in unknown_stattype:
                assert unknown_stattype[var] == stattype
            else:
                unknown_stattype[var] = stattype
            return None
        dist, params = _DEFAULT_DIST[stattype](bdb, generator_id, var)
        return dist, params

    # Use the default distribution for any variables that remain to be
    # modelled, excluding any that are latent or that have statistical
    # types we don't know about.
    for var in core.bayesdb_variable_names(bdb, population_id, None):
        if var in modelled:
            continue
        colno = core.bayesdb_variable_number(bdb, population_id, None, var)
        assert 0 <= colno
        stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
        distparams = default_dist(var, stattype)
        if distparams is None:
            continue
        dist, params = distparams
        variables.append([var, stattype, dist, params])
        assert var not in variable_dist
        variable_dist[var] = (stattype, dist, params)
        modelled.add(var)

    # Fill in the deferred_input statistical type assignments.
    for var in sorted(deferred_input.iterkeys()):
        # Check whether the variable is modelled.  If not, skip -- we
        # will fail later because this variable is guaranteed to also
        # be in needed.
        if var not in modelled:
            assert var in needed
            continue
        # Determine (possibly fictitious) distribution and parameters.
        if var in default_modelled:
            # Manifest variable modelled by default Crosscat model.
            assert var in variable_dist
            stattype, dist, params = variable_dist[var]
        else:
            # Modelled by a foreign model.  Assign a fictitious
            # default distribution because the 27B/6 of CGPM requires
            # this.
            if var in latents:
                # Latent variable modelled by a foreign model.  Use
                # the statistical type specified for it.
                stattype = latents[var]
            else:
                # Manifest variable modelled by a foreign model.  Use
                # the statistical type in the population.
                assert core.bayesdb_has_variable(
                    bdb, population_id, None, var)
                colno = core.bayesdb_variable_number(
                    bdb, population_id, None, var)
                stattype = core.bayesdb_variable_stattype(
                    bdb, population_id, colno)
            distparams = default_dist(var, stattype)
            if distparams is None:
                continue
            dist, params = distparams
        # Assign the distribution and parameters.
        for cctypes, ccargs, i in deferred_input[var]:
            assert cctypes[i] is None
            assert ccargs[i] is None
            cctypes[i] = dist
            ccargs[i] = params

    # Fill in the deferred_output statistical type assignments.  They need
    # to be in the form NUMERICAL or CATEGORICAL.
    for var in deferred_output:
        if var in latents:
            # Latent variable modelled by a foreign model.  Use
            # the statistical type specified for it.
            var_stattype = casefold(latents[var])
            if var_stattype not in _DEFAULT_DIST:
                if var in unknown_stattype:
                    assert unknown_stattype[var] == var_stattype
                else:
                    unknown_stattype[var] = var_stattype
            # XXX Cannot specify statargs for a latent variable.  Trying
            # to use default_dist might lookup the counts for unique
            # values of the categorical in the base table causing a
            # failure.
            var_statargs = {}
        else:
            # Manifest variable modelled by a foreign model.  Use
            # the statistical type and arguments from the population.
            assert core.bayesdb_has_variable(bdb, population_id, None, var)
            colno = core.bayesdb_variable_number(
                bdb, population_id, None, var)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            distparams = default_dist(var, var_stattype)
            if distparams is None:
                continue
            _, var_statargs = distparams
        stattypes, statargs, i = deferred_output[var]
        assert stattypes[i] is None
        assert statargs[i] is None
        stattypes[i] = var_stattype
        statargs[i] = var_statargs

    if unknown_stattype:
        raise BQLError(bdb,
            'Unknown statistical types for variables: %r'
            % (sorted(unknown_stattype.iteritems(), )))

    # If there remain any variables that we needed to model, because
    # others are conditional on them, fail.
    needed -= modelled
    if needed:
        raise BQLError(bdb, 'Unmodellable variables: %r' % (needed, ))

    # Finally, create a CGPM schema.
    return {
        'variables': variables,
        'cgpm_composition': cgpm_composition,
        'subsample': subsample,
        'latents': latents,
    }
def _cmd_render_crosscat(self, query, sql=None, **kwargs):
    '''Returns a rendering of the specified crosscat state

    Usage: .render_crosscat [options] <generator> <modelno>.

    Options:
        --subsample=<n>
        --width=<w>
        --height=<c>
        --rowlabels=<colname>
        --progress=[True|False]
        --yticklabeslize=<fontsize>
        --xticklabeslize=<fontsize>

    The allowable fontsize strings are: xx-small, x-small, small,
    medium, large, x-large, xx-large
    '''
    # NOTE(review): the `sql` parameter is unused here -- presumably part
    # of the shell-command calling convention; confirm against the shell
    # dispatcher before removing.
    tokens = query.split()
    if len(tokens) != 2:
        self.write_stderr('Usage: .render_crosscat <generator> <modelno>')
        return
    generator = tokens[0]
    modelno = int(tokens[1])
    if not bayesdb_has_generator(self._bdb, None, generator):
        self.write_stderr('No such generator: %s.' % (generator, ))
        return
    generator_id = bayesdb_get_generator(self._bdb, None, generator)
    population_id = bayesdb_generator_population(self._bdb, generator_id)
    backend = bayesdb_generator_backend(self._bdb, generator_id)
    # Only cgpm generators carry a crosscat state to render.
    if backend.name() != 'cgpm':
        self.write_stderr('.render_crosscat requires generator from the '
            'cgpm backend')
        return
    engine = backend._engine(self._bdb, generator_id)
    # Map the user-visible model number to the internal cgpm model number.
    cursor = self._bdb.sql_execute('''
        SELECT cgpm_modelno FROM bayesdb_cgpm_modelno
            WHERE generator_id = ? AND modelno = ?
    ''', (
        generator_id,
        modelno,
    ))
    cgpm_modelno = cursor_value(cursor, nullok=True)
    if cgpm_modelno is None:
        self.write_stderr('No such model number: %d.' % (modelno, ))
        return
    state = engine.get_state(cgpm_modelno)
    # Optionally label rows with values from a base-table column.
    row_names = None
    row_index_column = kwargs.get('rowlabels', None)
    if row_index_column is not None:
        table_name = bayesdb_generator_table(self._bdb, generator_id)
        qt = bql_quote_name(table_name)
        qc = bql_quote_name(row_index_column)
        # Restrict to the rows actually incorporated in the generator.
        cursor = self._bdb.sql_execute('''
            SELECT %s FROM %s WHERE oid IN (
                SELECT table_rowid FROM bayesdb_cgpm_individual
                    WHERE generator_id = ?
            )
        ''' % (qc, qt), (generator_id, ))
        row_names = [c[0] for c in cursor]
    if 'progress' in kwargs:
        sys.stdout.write('Creating figure...\n')
    import cgpm.utils.render
    if 'variable' not in kwargs:
        # Plot the entire state.
        col_names = [
            bayesdb_variable_name(self._bdb, population_id, None, colno)
            for colno in state.outputs
        ]
        fig, _ax = cgpm.utils.render.viz_state(state,
            col_names=col_names,
            row_names=row_names,
            **kwargs)
    else:
        # Plot the view of the requested variable.
        varno = bayesdb_variable_number(self._bdb, population_id,
            generator_id, kwargs['variable'])
        view = state.view_for(varno)
        col_names = [
            bayesdb_variable_name(self._bdb, population_id, None, colno)
            for colno in view.outputs[1:]
        ]
        fig, _ax = cgpm.utils.render.viz_view(view,
            col_names=col_names,
            row_names=row_names,
            **kwargs)
    # Apply requested figure dimensions, keeping the other axis as-is.
    (width, height) = fig.get_size_inches()
    if 'width' in kwargs:
        width = float(kwargs['width'])
        fig.set_size_inches(width, height)
    if 'height' in kwargs:
        height = float(kwargs['height'])
        fig.set_size_inches(width, height)
    if 'progress' in kwargs:
        sys.stdout.write('Rendering figure...\n')
def _get_ordered_column_labels(self, bdb, generator_id):
    """Return the population's variable names in this backend's order."""
    population_id = bayesdb_generator_population(bdb, generator_id)
    labels = []
    for colno in self._get_order(bdb, generator_id):
        labels.append(
            bayesdb_variable_name(bdb, population_id, None, colno))
    return labels
def execute_phrase(bdb, phrase, bindings=()): """Execute the BQL AST phrase `phrase` and return a cursor of results.""" if isinstance(phrase, ast.Parametrized): n_numpar = phrase.n_numpar nampar_map = phrase.nampar_map phrase = phrase.phrase assert 0 < n_numpar else: n_numpar = 0 nampar_map = None # Ignore extraneous bindings. XXX Bad idea? if ast.is_query(phrase): # Compile the query in the transaction in case we need to # execute subqueries to determine column lists. Compiling is # a quick tree descent, so this should be fast. out = compiler.Output(n_numpar, nampar_map, bindings) with bdb.savepoint(): compiler.compile_query(bdb, phrase, out) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.Begin): txn.bayesdb_begin_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Rollback): txn.bayesdb_rollback_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Commit): txn.bayesdb_commit_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabAs): assert ast.is_query(phrase.query) with bdb.savepoint(): if core.bayesdb_has_table(bdb, phrase.name): if phrase.ifnotexists: return empty_cursor(bdb) else: raise BQLError( bdb, 'Name already defined as table: %s' % (repr(phrase.name), )) out = compiler.Output(n_numpar, nampar_map, bindings) qt = sqlite3_quote_name(phrase.name) temp = 'TEMP ' if phrase.temp else '' ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else '' out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt)) compiler.compile_query(bdb, phrase.query, out) winders, unwinders = out.getwindings() with compiler.bayesdb_wind(bdb, winders, unwinders): bdb.sql_execute(out.getvalue(), out.getbindings()) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabCsv): with bdb.savepoint(): table_exists = core.bayesdb_has_table(bdb, phrase.name) if table_exists: if phrase.ifnotexists: return empty_cursor(bdb) else: 
raise BQLError( bdb, 'Table already exists: %s' % (repr(phrase.name), )) bayesdb_read_csv_file(bdb, phrase.name, phrase.csv, header=True, create=True) return empty_cursor(bdb) if isinstance(phrase, ast.DropTab): with bdb.savepoint(): sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?' cursor = bdb.sql_execute(sql, (phrase.name, )) if 0 < cursor_value(cursor): raise BQLError( bdb, 'Table still in use by populations: %s' % (repr(phrase.name), )) bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?', (phrase.name, )) ifexists = 'IF EXISTS ' if phrase.ifexists else '' qt = sqlite3_quote_name(phrase.name) return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt)) if isinstance(phrase, ast.AlterTab): with bdb.savepoint(): table = phrase.table if not core.bayesdb_has_table(bdb, table): raise BQLError(bdb, 'No such table: %s' % (repr(table), )) for cmd in phrase.commands: if isinstance(cmd, ast.AlterTabRenameTab): # If the names differ only in case, we have to do # some extra work because SQLite will reject the # table rename. Note that we may even have table # == cmd.name here, but if the stored table name # differs in case from cmd.name, we want to update # it anyway. if casefold(table) == casefold(cmd.name): # Go via a temporary table. temp = table + '_temp' while core.bayesdb_has_table(bdb, temp): temp += '_temp' rename_table(bdb, table, temp) rename_table(bdb, temp, cmd.name) else: # Make sure nothing else has this name and # rename it. if core.bayesdb_has_table(bdb, cmd.name): raise BQLError( bdb, 'Name already defined as table: %s' % (repr(cmd.name), )) rename_table(bdb, table, cmd.name) # If table has implicit population, rename it too. 
if core.bayesdb_table_has_implicit_population( bdb, cmd.name): populations = \ core.bayesdb_table_populations(bdb, cmd.name) assert len(populations) == 1 population_name = core.bayesdb_population_name( bdb, populations[0]) qt = sqlite3_quote_name(cmd.name) qp = sqlite3_quote_name(population_name) bdb.execute('ALTER POPULATION %s RENAME TO %s' % (qp, qt)) # Remember the new name for subsequent commands. table = cmd.name elif isinstance(cmd, ast.AlterTabRenameCol): # XXX Need to deal with this in the compiler. raise NotImplementedError('Renaming columns' ' not yet implemented.') # Make sure the old name exist and the new name does not. old_folded = casefold(cmd.old) new_folded = casefold(cmd.new) if old_folded != new_folded: if not core.bayesdb_table_has_column( bdb, table, cmd.old): raise BQLError( bdb, 'No such column in table %s' ': %s' % (repr(table), repr(cmd.old))) if core.bayesdb_table_has_column(bdb, table, cmd.new): raise BQLError( bdb, 'Column already exists' ' in table %s: %s' % (repr(table), repr(cmd.new))) # Update bayesdb_column. Everything else refers # to columns by (tabname, colno) pairs rather than # by names. update_column_sql = ''' UPDATE bayesdb_column SET name = :new WHERE tabname = :table AND name = :old ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_column_sql, { 'table': table, 'old': cmd.old, 'new': cmd.new, }) assert bdb._sqlite3.totalchanges() - total_changes == 1 # ...except backends may have the (case-folded) name cached. if old_folded != new_folded: populations_sql = ''' SELECT id FROM bayesdb_population WHERE tabname = ? 
''' cursor = bdb.sql_execute(populations_sql, (table, )) generators = [ core.bayesdb_population_generators( bdb, population_id) for (population_id, ) in cursor ] for generator_id in set(generators): backend = core.bayesdb_generator_backend( bdb, generator_id) backend.rename_column(bdb, generator_id, old_folded, new_folded) else: assert False, 'Invalid alter table command: %s' % \ (cmd,) return empty_cursor(bdb) if isinstance(phrase, ast.GuessSchema): if not core.bayesdb_has_table(bdb, phrase.table): raise BQLError(bdb, 'No such table : %s' % phrase.table) out = compiler.Output(0, {}, {}) with bdb.savepoint(): qt = sqlite3_quote_name(phrase.table) temptable = bdb.temp_table_name() qtt = sqlite3_quote_name(temptable) cursor = bdb.sql_execute('SELECT * FROM %s' % (qt, )) column_names = [d[0] for d in cursor.description] rows = cursor.fetchall() stattypes = bayesdb_guess_stattypes(column_names, rows) distinct_value_counts = [ len(set([row[i] for row in rows])) for i in range(len(column_names)) ] out.winder( ''' CREATE TEMP TABLE %s ( column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT ) ''' % (qtt, ), ()) for cn, st, ct in zip(column_names, stattypes, distinct_value_counts): out.winder( ''' INSERT INTO %s VALUES (?, ?, ?, ?) 
''' % (qtt), (cn, st[0], ct, st[1])) out.write('SELECT * FROM %s' % (qtt, )) out.unwinder('DROP TABLE %s' % (qtt, ), ()) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.CreatePop): with bdb.savepoint(): _create_population(bdb, phrase) return empty_cursor(bdb) if isinstance(phrase, ast.DropPop): with bdb.savepoint(): if not core.bayesdb_has_population(bdb, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such population: %r' % (phrase.name, )) population_id = core.bayesdb_get_population(bdb, phrase.name) generator_ids = core.bayesdb_population_generators( bdb, population_id) if generator_ids: generators = [ core.bayesdb_generator_name(bdb, gid) for gid in generator_ids ] raise BQLError( bdb, 'Population %r still has generators: %r' % (phrase.name, generators)) # XXX helpful error checking if generators still exist # XXX check change counts bdb.sql_execute( ''' DELETE FROM bayesdb_variable WHERE population_id = ? ''', (population_id, )) bdb.sql_execute( ''' DELETE FROM bayesdb_population WHERE id = ? ''', (population_id, )) return empty_cursor(bdb) if isinstance(phrase, ast.AlterPop): with bdb.savepoint(): population = phrase.population if not core.bayesdb_has_population(bdb, population): raise BQLError(bdb, 'No such population: %s' % (repr(population), )) population_id = core.bayesdb_get_population(bdb, population) for cmd in phrase.commands: if isinstance(cmd, ast.AlterPopRenamePop): table = core.bayesdb_population_table(bdb, population_id) # Prevent renaming of implicit population directly, unless # being called by ast.AlterTabRenameTab in which case the # table name and population name will not be matching. 
if core.bayesdb_population_is_implicit(bdb, population_id) \ and casefold(population) == casefold(table): raise BQLError( bdb, 'Cannot rename implicit' 'population %s; rename base table instead' % (population, )) # Make sure nothing else has this name. if casefold(population) != casefold(cmd.name): if core.bayesdb_has_population(bdb, cmd.name): raise BQLError( bdb, 'Name already defined as population' ': %s' % (repr(cmd.name), )) # Update bayesdb_population. Everything else # refers to it by id. update_generator_sql = ''' UPDATE bayesdb_population SET name = ? WHERE id = ? ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_generator_sql, (cmd.name, population_id)) assert bdb._sqlite3.totalchanges() - total_changes == 1 # If population has implicit generator, rename it too. if core.bayesdb_population_has_implicit_generator( bdb, population_id): generators = core.bayesdb_population_generators( bdb, population_id) assert len(generators) == 1 generator_name = core.bayesdb_generator_name( bdb, generators[0]) qp = sqlite3_quote_name(cmd.name) qg = sqlite3_quote_name(generator_name) bdb.execute('ALTER GENERATOR %s RENAME TO %s' % ( qg, qp, )) # Remember the new name for subsequent commands. population = cmd.name elif isinstance(cmd, ast.AlterPopAddVar): # Ensure column exists in base table. table = core.bayesdb_population_table(bdb, population_id) if not core.bayesdb_table_has_column(bdb, table, cmd.name): raise BQLError( bdb, 'No such variable in base table: %s' % (cmd.name)) # Ensure variable not already in population. if core.bayesdb_has_variable(bdb, population_id, None, cmd.name): raise BQLError( bdb, 'Variable already in population: %s' % (cmd.name)) # Ensure there is at least observation in the column. 
qt = sqlite3_quote_name(table) qc = sqlite3_quote_name(cmd.name) cursor = bdb.sql_execute( 'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' % (qt, qc)) if cursor_value(cursor) == 0: raise BQLError( bdb, 'Cannot add variable without any values: %s' % (cmd.name)) # If stattype is None, guess. if cmd.stattype is None: cursor = bdb.sql_execute('SELECT %s FROM %s' % (qc, qt)) rows = cursor.fetchall() [stattype, reason] = bayesdb_guess_stattypes([cmd.name], rows)[0] # Fail if trying to model a key. if stattype == 'key': raise BQLError( bdb, 'Values in column %s appear to be keys.' % (cmd.name, )) # Fail if cannot determine a stattype. elif stattype == 'ignore': raise BQLError( bdb, 'Failed to determine a stattype for %s, ' 'please specify one manually.' % (cmd.name, )) # If user specified stattype, ensure it exists. elif not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError(bdb, 'Invalid stattype: %s' % (cmd.stattype)) else: stattype = cmd.stattype # Check that strings are not being modeled as numerical. if stattype == 'numerical' \ and _column_contains_string(bdb, table, cmd.name): raise BQLError( bdb, 'Numerical column contains string values: %r ' % (qc, )) with bdb.savepoint(): # Add the variable to the population. core.bayesdb_add_variable(bdb, population_id, cmd.name, stattype) colno = core.bayesdb_variable_number( bdb, population_id, None, cmd.name) # Add the variable to each (initialized) generator in # the population. generator_ids = filter( lambda g: core.bayesdb_generator_modelnos(bdb, g), core.bayesdb_population_generators( bdb, population_id), ) for generator_id in generator_ids: backend = core.bayesdb_generator_backend( bdb, generator_id) backend.add_column(bdb, generator_id, colno) elif isinstance(cmd, ast.AlterPopStatType): # Check the no generators are defined for this population. 
generators = core.bayesdb_population_generators( bdb, population_id) if generators: raise BQLError( bdb, 'Cannot update statistical types for population ' '%s, it has generators: %s' % ( repr(population), repr(generators), )) # Check all the variables are in the population. unknown = [ c for c in cmd.names if not core.bayesdb_has_variable( bdb, population_id, None, c) ] if unknown: raise BQLError( bdb, 'No such variables in population: %s' % (repr(unknown))) # Check the statistical type is valid. if not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError( bdb, 'Invalid statistical type: %r' % (repr(cmd.stattype), )) # Check that strings are not being modeled as numerical. if cmd.stattype == 'numerical': table = core.bayesdb_population_table( bdb, population_id) numerical_string_vars = [ col for col in cmd.names if _column_contains_string(bdb, table, col) ] if numerical_string_vars: raise BQLError( bdb, 'Columns with string values modeled as ' 'numerical: %r' % (numerical_string_vars, )) # Perform the stattype update. colnos = [ core.bayesdb_variable_number(bdb, population_id, None, c) for c in cmd.names ] qcolnos = ','.join('%d' % (colno, ) for colno in colnos) update_stattype_sql = ''' UPDATE bayesdb_variable SET stattype = ? WHERE population_id = ? AND colno IN (%s) ''' % (qcolnos, ) bdb.sql_execute(update_stattype_sql, ( casefold(cmd.stattype), population_id, )) else: assert False, 'Invalid ALTER POPULATION command: %s' % \ (repr(cmd),) return empty_cursor(bdb) if isinstance(phrase, ast.CreateGen): # Find the population. if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population, )) population_id = core.bayesdb_get_population(bdb, phrase.population) # Find the backend, or use the default. 
backend_name = phrase.backend if phrase.backend is None: backend_name = 'cgpm' if backend_name not in bdb.backends: raise BQLError(bdb, 'No such backend: %s' % (repr(backend_name), )) backend = bdb.backends[backend_name] # Retrieve the (possibility implicit) generator name. generator_name = phrase.name or phrase.population implicit = 1 if phrase.name is None else 0 with bdb.savepoint(): if core.bayesdb_has_generator(bdb, population_id, generator_name): if not phrase.ifnotexists: raise BQLError( bdb, 'Name already defined as generator: %s' % (repr(generator_name), )) else: # Insert a record into bayesdb_generator and get the # assigned id. bdb.sql_execute( ''' INSERT INTO bayesdb_generator (name, population_id, backend, implicit) VALUES (?, ?, ?, ?) ''', (generator_name, population_id, backend.name(), implicit)) generator_id = core.bayesdb_get_generator( bdb, population_id, generator_name) # Do any backend-specific initialization. backend.create_generator(bdb, generator_id, phrase.schema) # All done. Nothing to return. return empty_cursor(bdb) if isinstance(phrase, ast.DropGen): with bdb.savepoint(): if not core.bayesdb_has_generator(bdb, None, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such generator: %s' % (repr(phrase.name), )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.name) backend = core.bayesdb_generator_backend(bdb, generator_id) # Backend-specific destruction. backend.drop_generator(bdb, generator_id) # Drop latent variables, models, and, finally, generator. drop_columns_sql = ''' DELETE FROM bayesdb_variable WHERE generator_id = ? ''' bdb.sql_execute(drop_columns_sql, (generator_id, )) drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? ''' bdb.sql_execute(drop_model_sql, (generator_id, )) drop_generator_sql = ''' DELETE FROM bayesdb_generator WHERE id = ? 
''' bdb.sql_execute(drop_generator_sql, (generator_id, )) return empty_cursor(bdb) if isinstance(phrase, ast.AlterGen): with bdb.savepoint(): generator = phrase.generator if not core.bayesdb_has_generator(bdb, None, generator): raise BQLError(bdb, 'No such generator: %s' % (repr(generator), )) generator_id = core.bayesdb_get_generator(bdb, None, generator) cmds_generic = [] for cmd in phrase.commands: if isinstance(cmd, ast.AlterGenRenameGen): population_id = core.bayesdb_generator_population( bdb, generator_id) population = core.bayesdb_population_name( bdb, population_id) # Prevent renaming of implicit generator directly, unless # being called by ast.AlterPopRenamePop in which case the # population name and generator name will not be matching. if core.bayesdb_population_is_implicit(bdb, generator_id) \ and casefold(generator) == casefold(population): raise BQLError( bdb, 'Cannot rename implicit ' 'generator; rename base population instead') # Disable modelnos with AlterGenRenameGen. if phrase.modelnos is not None: raise BQLError(bdb, 'Cannot specify models for RENAME') # Make sure nothing else has this name. if casefold(generator) != casefold(cmd.name): if core.bayesdb_has_generator(bdb, None, cmd.name): raise BQLError( bdb, 'Name already defined' ' as generator: %s' % (repr(cmd.name), )) # Update bayesdb_generator. Everything else # refers to it by id. update_generator_sql = ''' UPDATE bayesdb_generator SET name = ? WHERE id = ? ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_generator_sql, (cmd.name, generator_id)) assert bdb._sqlite3.totalchanges() - total_changes == 1 # Remember the new name for subsequent commands. 
generator = cmd.name elif isinstance(cmd, ast.AlterGenGeneric): cmds_generic.append(cmd.command) else: assert False, 'Invalid ALTER GENERATOR command: %s' % \ (repr(cmd),) if cmds_generic: modelnos = phrase.modelnos modelnos_invalid = None if modelnos is None else [ modelno for modelno in modelnos if not core.bayesdb_generator_has_model( bdb, generator_id, modelno) ] if modelnos_invalid: raise BQLError( bdb, 'No such models in generator %s: %s' % (repr(phrase.generator), repr(modelnos))) # Call generic alternations on the backend. backend = core.bayesdb_generator_backend(bdb, generator_id) backend.alter(bdb, generator_id, modelnos, cmds_generic) return empty_cursor(bdb) if isinstance(phrase, ast.InitModels): if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) modelnos = range(phrase.nmodels) with bdb.savepoint(): # Find the model numbers. Omit existing ones for # ifnotexists; reject existing ones otherwise. if phrase.ifnotexists: modelnos = set(modelno for modelno in modelnos if not core.bayesdb_generator_has_model( bdb, generator_id, modelno)) else: existing = set(modelno for modelno in modelnos if core.bayesdb_generator_has_model( bdb, generator_id, modelno)) if 0 < len(existing): raise BQLError( bdb, 'Generator %s already has models: %s' % (repr(phrase.generator), sorted(existing))) # Stop now if there's nothing to initialize. if len(modelnos) == 0: return # Create the bayesdb_generator_model records. modelnos = sorted(modelnos) insert_model_sql = ''' INSERT INTO bayesdb_generator_model (generator_id, modelno) VALUES (:generator_id, :modelno) ''' for modelno in modelnos: bdb.sql_execute(insert_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) # Do backend-specific initialization. 
backend = core.bayesdb_generator_backend(bdb, generator_id) backend.initialize_models(bdb, generator_id, modelnos) return empty_cursor(bdb) if isinstance(phrase, ast.AnalyzeModels): # WARNING: It is the backend's responsibility to work in a # transaction. # # WARNING: It is the backend's responsibility to update the # iteration count in bayesdb_generator_model records. # # We do this so that the backend can save incremental # progress in case of ^C in the middle. # # XXX Put these warning somewhere more appropriate. if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, )) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) backend = core.bayesdb_generator_backend(bdb, generator_id) # XXX Should allow parameters for iterations and ckpt/iter. backend.analyze_models(bdb, generator_id, modelnos=phrase.modelnos, iterations=phrase.iterations, max_seconds=phrase.seconds, ckpt_iterations=phrase.ckpt_iterations, ckpt_seconds=phrase.ckpt_seconds, program=phrase.program) return empty_cursor(bdb) if isinstance(phrase, ast.DropModels): with bdb.savepoint(): generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) backend = core.bayesdb_generator_backend(bdb, generator_id) modelnos = None if phrase.modelnos is not None: lookup_model_sql = ''' SELECT COUNT(*) FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' modelnos = sorted(list(phrase.modelnos)) for modelno in modelnos: cursor = bdb.sql_execute(lookup_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) if cursor_value(cursor) == 0: raise BQLError( bdb, 'No such model' ' in generator %s: %s' % (repr(phrase.generator), repr(modelno))) backend.drop_models(bdb, generator_id, modelnos=modelnos) if modelnos is None: drop_models_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? 
''' bdb.sql_execute(drop_models_sql, (generator_id, )) else: drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' for modelno in modelnos: bdb.sql_execute(drop_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) return empty_cursor(bdb) if isinstance(phrase, ast.Regress): # Retrieve the population. if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population, )) population_id = core.bayesdb_get_population(bdb, phrase.population) # Retrieve the generator generator_id = None if phrase.generator: if not core.bayesdb_has_generator(bdb, population_id, phrase.generator): raise BQLError(bdb, 'No such generator: %r' % (phrase.generator, )) generator_id = core.bayesdb_get_generator(bdb, population_id, phrase.generator) # Retrieve the target variable. if not core.bayesdb_has_variable(bdb, population_id, None, phrase.target): raise BQLError(bdb, 'No such variable: %r' % (phrase.target, )) colno_target = core.bayesdb_variable_number(bdb, population_id, None, phrase.target) stattype = core.bayesdb_variable_stattype(bdb, population_id, generator_id, colno_target) if stattype != 'numerical': raise BQLError( bdb, 'Target variable is not numerical: %r' % (phrase.target, )) # Build the given variables. if any(isinstance(col, ast.SelColAll) for col in phrase.givens): # Using * is not allowed to be mixed with other variables. if len(phrase.givens) > 1: raise BQLError(bdb, 'Cannot use (*) with other givens.') colno_givens = core.bayesdb_variable_numbers( bdb, population_id, None) else: if any(isinstance(col, ast.SelColSub) for col in phrase.givens): # Subexpression needs special compiling. 
out = compiler.Output(n_numpar, nampar_map, bindings) bql_compiler = compiler.BQLCompiler_None() givens = compiler.expand_select_columns( bdb, phrase.givens, True, bql_compiler, out) else: givens = phrase.givens colno_givens = [ core.bayesdb_variable_number(bdb, population_id, None, given.expression.column) for given in givens ] # Build the arguments to bqlfn.bayesdb_simulate. colno_givens_unique = set(colno for colno in colno_givens if colno != colno_target) if len(colno_givens_unique) == 0: raise BQLError(bdb, 'No matching given columns.') constraints = [] colnos = [colno_target] + list(colno_givens_unique) nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value modelnos = None if phrase.modelnos is None else str(phrase.modelnos) rows = bqlfn.bayesdb_simulate(bdb, population_id, generator_id, modelnos, constraints, colnos, numpredictions=nsamp) # Retrieve the stattypes. stattypes = [ core.bayesdb_variable_stattype(bdb, population_id, generator_id, colno_given) for colno_given in colno_givens_unique ] # Separate the target values from the given values. target_values = [row[0] for row in rows] given_values = [row[1:] for row in rows] given_names = [ core.bayesdb_variable_name(bdb, population_id, generator_id, given) for given in colno_givens_unique ] # Compute the coefficients. The import to regress_ols is here since the # feature depends on pandas + sklearn, so avoid module-wide import. from bayeslite.regress import regress_ols coefficients = regress_ols(target_values, given_values, given_names, stattypes) # Store the results in a winder. temptable = bdb.temp_table_name() qtt = sqlite3_quote_name(temptable) out = compiler.Output(0, {}, {}) out.winder( ''' CREATE TEMP TABLE %s (variable TEXT, coefficient REAL); ''' % (qtt, ), ()) for variable, coef in coefficients: out.winder( ''' INSERT INTO %s VALUES (?, ?) 
''' % (qtt), ( variable, coef, )) out.write('SELECT * FROM %s ORDER BY variable' % (qtt, )) out.unwinder('DROP TABLE %s' % (qtt, ), ()) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) assert False # XXX
def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
        constraints, num_samples=1, accuracy=None):
    """Simulate `targets` jointly from the Loom posterior server.

    Returns a list of `num_samples` samples; each sample is a list of
    values aligned with `targets` (colnos).  Nominal values are returned
    as strings, everything else is coerced with float().

    `constraints` is a list of (colno, value) pairs.  If `rowid` exists
    in the base table, the observed (non-NULL) cells of that row are
    appended to the constraints; a BQLError is raised if an explicit
    constraint overlaps an observed cell.

    NOTE(review): `modelnos` and `accuracy` are accepted but never used
    in this body — presumably part of the backend interface; confirm
    against the abstract backend signature.
    """
    # Retrieve the population id.
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # Prepare list of full constraints, potentially adding data from table.
    constraints_full = constraints
    # If rowid exist in base table, retrieve conditioning data.
    # Conditioning values are fetched for any rowid that exists in the base
    # table irrespective of whether the rowid is incorporated in the Loom
    # model or whether it was added after creation.
    if bayesdb_table_has_rowid(bdb, table, rowid):
        # Fetch population column numbers and row values.
        colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
        rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
        # Keep only observed (non-NULL) cells that are not being simulated.
        observations = [
            (colno, rowval)
            for colno, rowval in zip(colnos, rowvals)
            if rowval is not None and colno not in targets
        ]
        # Raise error if a constraint overrides an observed cell.
        colnos_constrained = [constraint[0] for constraint in constraints]
        colnos_observed = [observation[0] for observation in observations]
        if set.intersection(set(colnos_constrained), set(colnos_observed)):
            raise BQLError(bdb, 'Overlap between constraints and'
                ' target row in simulate.')
        # Update the constraints.
        constraints_full = constraints + observations
    # Store mapping from target column name to column number and stattype.
    # NOTE(review): `bayesdb_variable_name` is called here with
    # (bdb, generator_id, None, colno); sibling code elsewhere passes
    # (bdb, population_id, generator_id, colno).  Looks like the first
    # argument should be population_id — confirm against core's signature.
    target_colno_to_name = {
        colno: bayesdb_variable_name(bdb, generator_id, None, colno)
        for colno in targets
    }
    target_colno_to_stattype = {
        colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
        for colno in targets
    }
    # Construct the CSV row for targets.  Target cells are left blank ('')
    # so the server fills them in; constraint cells carry their values.
    row_targets = {target_colno_to_name[colno] : '' for colno in targets}
    row_constraints = {
        bayesdb_variable_name(bdb, generator_id, None, colno) : value
        for colno, value in constraints_full
    }
    row = dict(itertools.chain(
        row_targets.iteritems(), row_constraints.iteritems()))
    # Fetch the server.
    server = self._get_preql_server(bdb, generator_id)
    # Prepare the csv header and values.  Both iterate `row` without
    # mutation in between, so iterkeys/itervalues yield pairwise-aligned
    # sequences (a documented CPython dict guarantee).
    csv_headers = map(str, row.iterkeys())
    csv_values = map(str, row.itervalues())
    # Prepare streams for the server: an in-memory CSV writer for the
    # output and a two-row (header, values) iterator for the input.
    outfile = StringIO()
    writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
    reader = iter([csv_headers]+[csv_values])
    # Obtain the prediction.
    server._predict(reader, num_samples, writer, False)
    # Parse the CSV output.  The server terminates lines with CRLF.
    output_csv = writer.result()
    output_rows = output_csv.strip().split('\r\n')
    # Extract the header of the CSV file.
    header = output_rows[0].split(CSV_DELIMITER)
    # Extract list of simulated rows. Each simulated row is represented
    # as a dictionary mapping column name to its simulated value.
    simulated_rows = [
        dict(zip(header, row.split(CSV_DELIMITER)))
        for row in output_rows[1:]
    ]
    # Prepare the return list of simulated_rows.
    def _extract_simulated_value(row, colno):
        # Map a simulated row dict back to the value for `colno`,
        # coercing to float unless the stattype is nominal.
        colname = target_colno_to_name[colno]
        stattype = target_colno_to_stattype[colno]
        value = row[colname]
        return value if _is_nominal(stattype) else float(value)
    # Return the list of samples, each aligned with `targets`.
    return [
        [_extract_simulated_value(row, colno) for colno in targets]
        for row in simulated_rows
    ]
def execute_phrase(bdb, phrase, bindings=()): """Execute the BQL AST phrase `phrase` and return a cursor of results.""" if isinstance(phrase, ast.Parametrized): n_numpar = phrase.n_numpar nampar_map = phrase.nampar_map phrase = phrase.phrase assert 0 < n_numpar else: n_numpar = 0 nampar_map = None # Ignore extraneous bindings. XXX Bad idea? if ast.is_query(phrase): # Compile the query in the transaction in case we need to # execute subqueries to determine column lists. Compiling is # a quick tree descent, so this should be fast. out = compiler.Output(n_numpar, nampar_map, bindings) with bdb.savepoint(): compiler.compile_query(bdb, phrase, out) winders, unwinders = out.getwindings() return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.Begin): txn.bayesdb_begin_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Rollback): txn.bayesdb_rollback_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.Commit): txn.bayesdb_commit_transaction(bdb) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabAs): assert ast.is_query(phrase.query) with bdb.savepoint(): if core.bayesdb_has_table(bdb, phrase.name): if phrase.ifnotexists: return empty_cursor(bdb) else: raise BQLError(bdb, 'Name already defined as table: %s' % (repr(phrase.name),)) out = compiler.Output(n_numpar, nampar_map, bindings) qt = sqlite3_quote_name(phrase.name) temp = 'TEMP ' if phrase.temp else '' ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else '' out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt)) compiler.compile_query(bdb, phrase.query, out) winders, unwinders = out.getwindings() with compiler.bayesdb_wind(bdb, winders, unwinders): bdb.sql_execute(out.getvalue(), out.getbindings()) return empty_cursor(bdb) if isinstance(phrase, ast.CreateTabCsv): with bdb.savepoint(): table_exists = core.bayesdb_has_table(bdb, phrase.name) if table_exists: if phrase.ifnotexists: return empty_cursor(bdb) else: 
raise BQLError(bdb, 'Table already exists: %s' % (repr(phrase.name),)) bayesdb_read_csv_file( bdb, phrase.name, phrase.csv, header=True, create=True) return empty_cursor(bdb) if isinstance(phrase, ast.DropTab): with bdb.savepoint(): sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?' cursor = bdb.sql_execute(sql, (phrase.name,)) if 0 < cursor_value(cursor): raise BQLError(bdb, 'Table still in use by populations: %s' % (repr(phrase.name),)) bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?', (phrase.name,)) ifexists = 'IF EXISTS ' if phrase.ifexists else '' qt = sqlite3_quote_name(phrase.name) return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt)) if isinstance(phrase, ast.AlterTab): with bdb.savepoint(): table = phrase.table if not core.bayesdb_has_table(bdb, table): raise BQLError(bdb, 'No such table: %s' % (repr(table),)) for cmd in phrase.commands: if isinstance(cmd, ast.AlterTabRenameTab): # If the names differ only in case, we have to do # some extra work because SQLite will reject the # table rename. Note that we may even have table # == cmd.name here, but if the stored table name # differs in case from cmd.name, we want to update # it anyway. if casefold(table) == casefold(cmd.name): # Go via a temporary table. temp = table + '_temp' while core.bayesdb_has_table(bdb, temp): temp += '_temp' rename_table(bdb, table, temp) rename_table(bdb, temp, cmd.name) else: # Make sure nothing else has this name and # rename it. if core.bayesdb_has_table(bdb, cmd.name): raise BQLError(bdb, 'Name already defined as table: %s' % (repr(cmd.name),)) rename_table(bdb, table, cmd.name) # If table has implicit population, rename it too. 
if core.bayesdb_table_has_implicit_population( bdb, cmd.name): populations = \ core.bayesdb_table_populations(bdb, cmd.name) assert len(populations) == 1 population_name = core.bayesdb_population_name( bdb, populations[0]) qt = sqlite3_quote_name(cmd.name) qp = sqlite3_quote_name(population_name) bdb.execute('ALTER POPULATION %s RENAME TO %s' % (qp, qt)) # Remember the new name for subsequent commands. table = cmd.name elif isinstance(cmd, ast.AlterTabRenameCol): # XXX Need to deal with this in the compiler. raise NotImplementedError('Renaming columns' ' not yet implemented.') # Make sure the old name exist and the new name does not. old_folded = casefold(cmd.old) new_folded = casefold(cmd.new) if old_folded != new_folded: if not core.bayesdb_table_has_column(bdb, table, cmd.old): raise BQLError(bdb, 'No such column in table %s' ': %s' % (repr(table), repr(cmd.old))) if core.bayesdb_table_has_column(bdb, table, cmd.new): raise BQLError(bdb, 'Column already exists' ' in table %s: %s' % (repr(table), repr(cmd.new))) # Update bayesdb_column. Everything else refers # to columns by (tabname, colno) pairs rather than # by names. update_column_sql = ''' UPDATE bayesdb_column SET name = :new WHERE tabname = :table AND name = :old ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_column_sql, { 'table': table, 'old': cmd.old, 'new': cmd.new, }) assert bdb._sqlite3.totalchanges() - total_changes == 1 # ...except backends may have the (case-folded) name cached. if old_folded != new_folded: populations_sql = ''' SELECT id FROM bayesdb_population WHERE tabname = ? 
''' cursor = bdb.sql_execute(populations_sql, (table,)) generators = [ core.bayesdb_population_generators( bdb, population_id) for (population_id,) in cursor ] for generator_id in set(generators): backend = core.bayesdb_generator_backend(bdb, generator_id) backend.rename_column(bdb, generator_id, old_folded, new_folded) else: assert False, 'Invalid alter table command: %s' % \ (cmd,) return empty_cursor(bdb) if isinstance(phrase, ast.GuessSchema): if not core.bayesdb_has_table(bdb, phrase.table): raise BQLError(bdb, 'No such table : %s' % phrase.table) out = compiler.Output(0, {}, {}) with bdb.savepoint(): qt = sqlite3_quote_name(phrase.table) temptable = bdb.temp_table_name() qtt = sqlite3_quote_name(temptable) cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,)) column_names = [d[0] for d in cursor.description] rows = cursor.fetchall() stattypes = bayesdb_guess_stattypes(column_names, rows) distinct_value_counts = [ len(set([row[i] for row in rows])) for i in range(len(column_names)) ] out.winder(''' CREATE TEMP TABLE %s ( column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT ) ''' % (qtt,), ()) for cn, st, ct in zip(column_names, stattypes, distinct_value_counts): out.winder(''' INSERT INTO %s VALUES (?, ?, ?, ?) 
''' % (qtt), (cn, st[0], ct, st[1])) out.write('SELECT * FROM %s' % (qtt,)) out.unwinder('DROP TABLE %s' % (qtt,), ()) winders, unwinders = out.getwindings() return execute_wound( bdb, winders, unwinders, out.getvalue(), out.getbindings()) if isinstance(phrase, ast.CreatePop): with bdb.savepoint(): _create_population(bdb, phrase) return empty_cursor(bdb) if isinstance(phrase, ast.DropPop): with bdb.savepoint(): if not core.bayesdb_has_population(bdb, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such population: %r' % (phrase.name,)) population_id = core.bayesdb_get_population(bdb, phrase.name) generator_ids = core.bayesdb_population_generators( bdb, population_id) if generator_ids: generators = [core.bayesdb_generator_name(bdb, gid) for gid in generator_ids] raise BQLError(bdb, 'Population %r still has generators: %r' % (phrase.name, generators)) # XXX helpful error checking if generators still exist # XXX check change counts bdb.sql_execute(''' DELETE FROM bayesdb_variable WHERE population_id = ? ''', (population_id,)) bdb.sql_execute(''' DELETE FROM bayesdb_population WHERE id = ? ''', (population_id,)) return empty_cursor(bdb) if isinstance(phrase, ast.AlterPop): with bdb.savepoint(): population = phrase.population if not core.bayesdb_has_population(bdb, population): raise BQLError(bdb, 'No such population: %s' % (repr(population),)) population_id = core.bayesdb_get_population(bdb, population) for cmd in phrase.commands: if isinstance(cmd, ast.AlterPopRenamePop): table = core.bayesdb_population_table(bdb, population_id) # Prevent renaming of implicit population directly, unless # being called by ast.AlterTabRenameTab in which case the # table name and population name will not be matching. 
if core.bayesdb_population_is_implicit(bdb, population_id) \ and casefold(population) == casefold(table): raise BQLError(bdb, 'Cannot rename implicit' 'population %s; rename base table instead' % (population,)) # Make sure nothing else has this name. if casefold(population) != casefold(cmd.name): if core.bayesdb_has_population(bdb, cmd.name): raise BQLError(bdb, 'Name already defined as population' ': %s' % (repr(cmd.name),)) # Update bayesdb_population. Everything else # refers to it by id. update_generator_sql = ''' UPDATE bayesdb_population SET name = ? WHERE id = ? ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_generator_sql, (cmd.name, population_id)) assert bdb._sqlite3.totalchanges() - total_changes == 1 # If population has implicit generator, rename it too. if core.bayesdb_population_has_implicit_generator( bdb, population_id): generators = core.bayesdb_population_generators( bdb, population_id) assert len(generators) == 1 generator_name = core.bayesdb_generator_name( bdb, generators[0]) qp = sqlite3_quote_name(cmd.name) qg = sqlite3_quote_name(generator_name) bdb.execute('ALTER GENERATOR %s RENAME TO %s' % (qg, qp,)) # Remember the new name for subsequent commands. population = cmd.name elif isinstance(cmd, ast.AlterPopAddVar): # Ensure column exists in base table. table = core.bayesdb_population_table(bdb, population_id) if not core.bayesdb_table_has_column( bdb, table, cmd.name): raise BQLError(bdb, 'No such variable in base table: %s' % (cmd.name)) # Ensure variable not already in population. if core.bayesdb_has_variable( bdb, population_id, None, cmd.name): raise BQLError(bdb, 'Variable already in population: %s' % (cmd.name)) # Ensure there is at least observation in the column. 
qt = sqlite3_quote_name(table) qc = sqlite3_quote_name(cmd.name) cursor = bdb.sql_execute( 'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' % (qt, qc)) if cursor_value(cursor) == 0: raise BQLError(bdb, 'Cannot add variable without any values: %s' % (cmd.name)) # If stattype is None, guess. if cmd.stattype is None: cursor = bdb.sql_execute( 'SELECT %s FROM %s' % (qc, qt)) rows = cursor.fetchall() [stattype, reason] = bayesdb_guess_stattypes( [cmd.name], rows)[0] # Fail if trying to model a key. if stattype == 'key': raise BQLError(bdb, 'Values in column %s appear to be keys.' % (cmd.name,)) # Fail if cannot determine a stattype. elif stattype == 'ignore': raise BQLError(bdb, 'Failed to determine a stattype for %s, ' 'please specify one manually.' % (cmd.name,)) # If user specified stattype, ensure it exists. elif not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError(bdb, 'Invalid stattype: %s' % (cmd.stattype)) else: stattype = cmd.stattype # Check that strings are not being modeled as numerical. if stattype == 'numerical' \ and _column_contains_string(bdb, table, cmd.name): raise BQLError(bdb, 'Numerical column contains string values: %r ' % (qc,)) with bdb.savepoint(): # Add the variable to the population. core.bayesdb_add_variable( bdb, population_id, cmd.name, stattype) colno = core.bayesdb_variable_number( bdb, population_id, None, cmd.name) # Add the variable to each (initialized) generator in # the population. generator_ids = filter( lambda g: core.bayesdb_generator_modelnos(bdb, g), core.bayesdb_population_generators( bdb, population_id), ) for generator_id in generator_ids: backend = core.bayesdb_generator_backend( bdb, generator_id) backend.add_column(bdb, generator_id, colno) elif isinstance(cmd, ast.AlterPopStatType): # Check the no generators are defined for this population. 
generators = core.bayesdb_population_generators( bdb, population_id) if generators: raise BQLError(bdb, 'Cannot update statistical types for population ' '%s, it has generators: %s' % (repr(population), repr(generators),)) # Check all the variables are in the population. unknown = [ c for c in cmd.names if not core.bayesdb_has_variable(bdb, population_id, None, c) ] if unknown: raise BQLError(bdb, 'No such variables in population: %s' % (repr(unknown))) # Check the statistical type is valid. if not core.bayesdb_has_stattype(bdb, cmd.stattype): raise BQLError(bdb, 'Invalid statistical type: %r' % (repr(cmd.stattype),)) # Check that strings are not being modeled as numerical. if cmd.stattype == 'numerical': table = core.bayesdb_population_table( bdb, population_id) numerical_string_vars = [ col for col in cmd.names if _column_contains_string(bdb, table, col) ] if numerical_string_vars: raise BQLError(bdb, 'Columns with string values modeled as ' 'numerical: %r' % (numerical_string_vars,)) # Perform the stattype update. colnos = [ core.bayesdb_variable_number( bdb, population_id, None, c) for c in cmd.names ] qcolnos = ','.join('%d' % (colno,) for colno in colnos) update_stattype_sql = ''' UPDATE bayesdb_variable SET stattype = ? WHERE population_id = ? AND colno IN (%s) ''' % (qcolnos,) bdb.sql_execute( update_stattype_sql, (casefold(cmd.stattype), population_id,)) else: assert False, 'Invalid ALTER POPULATION command: %s' % \ (repr(cmd),) return empty_cursor(bdb) if isinstance(phrase, ast.CreateGen): # Find the population. if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population,)) population_id = core.bayesdb_get_population(bdb, phrase.population) # Find the backend, or use the default. 
backend_name = phrase.backend if phrase.backend is None: backend_name = 'cgpm' if backend_name not in bdb.backends: raise BQLError(bdb, 'No such backend: %s' % (repr(backend_name),)) backend = bdb.backends[backend_name] # Retrieve the (possibility implicit) generator name. generator_name = phrase.name or phrase.population implicit = 1 if phrase.name is None else 0 with bdb.savepoint(): if core.bayesdb_has_generator(bdb, population_id, generator_name): if not phrase.ifnotexists: raise BQLError( bdb, 'Name already defined as generator: %s' % (repr(generator_name),)) else: # Insert a record into bayesdb_generator and get the # assigned id. bdb.sql_execute(''' INSERT INTO bayesdb_generator (name, population_id, backend, implicit) VALUES (?, ?, ?, ?) ''', (generator_name, population_id, backend.name(), implicit)) generator_id = core.bayesdb_get_generator( bdb, population_id, generator_name) # Do any backend-specific initialization. backend.create_generator(bdb, generator_id, phrase.schema) # All done. Nothing to return. return empty_cursor(bdb) if isinstance(phrase, ast.DropGen): with bdb.savepoint(): if not core.bayesdb_has_generator(bdb, None, phrase.name): if phrase.ifexists: return empty_cursor(bdb) raise BQLError(bdb, 'No such generator: %s' % (repr(phrase.name),)) generator_id = core.bayesdb_get_generator(bdb, None, phrase.name) backend = core.bayesdb_generator_backend(bdb, generator_id) # Backend-specific destruction. backend.drop_generator(bdb, generator_id) # Drop latent variables, models, and, finally, generator. drop_columns_sql = ''' DELETE FROM bayesdb_variable WHERE generator_id = ? ''' bdb.sql_execute(drop_columns_sql, (generator_id,)) drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? ''' bdb.sql_execute(drop_model_sql, (generator_id,)) drop_generator_sql = ''' DELETE FROM bayesdb_generator WHERE id = ? 
''' bdb.sql_execute(drop_generator_sql, (generator_id,)) return empty_cursor(bdb) if isinstance(phrase, ast.AlterGen): with bdb.savepoint(): generator = phrase.generator if not core.bayesdb_has_generator(bdb, None, generator): raise BQLError(bdb, 'No such generator: %s' % (repr(generator),)) generator_id = core.bayesdb_get_generator(bdb, None, generator) cmds_generic = [] for cmd in phrase.commands: if isinstance(cmd, ast.AlterGenRenameGen): population_id = core.bayesdb_generator_population( bdb, generator_id) population = core.bayesdb_population_name( bdb, population_id) # Prevent renaming of implicit generator directly, unless # being called by ast.AlterPopRenamePop in which case the # population name and generator name will not be matching. if core.bayesdb_population_is_implicit(bdb, generator_id) \ and casefold(generator) == casefold(population): raise BQLError(bdb, 'Cannot rename implicit ' 'generator; rename base population instead') # Disable modelnos with AlterGenRenameGen. if phrase.modelnos is not None: raise BQLError(bdb, 'Cannot specify models for RENAME') # Make sure nothing else has this name. if casefold(generator) != casefold(cmd.name): if core.bayesdb_has_generator(bdb, None, cmd.name): raise BQLError(bdb, 'Name already defined' ' as generator: %s' % (repr(cmd.name),)) # Update bayesdb_generator. Everything else # refers to it by id. update_generator_sql = ''' UPDATE bayesdb_generator SET name = ? WHERE id = ? ''' total_changes = bdb._sqlite3.totalchanges() bdb.sql_execute(update_generator_sql, (cmd.name, generator_id)) assert bdb._sqlite3.totalchanges() - total_changes == 1 # Remember the new name for subsequent commands. 
generator = cmd.name elif isinstance(cmd, ast.AlterGenGeneric): cmds_generic.append(cmd.command) else: assert False, 'Invalid ALTER GENERATOR command: %s' % \ (repr(cmd),) if cmds_generic: modelnos = phrase.modelnos modelnos_invalid = None if modelnos is None else [ modelno for modelno in modelnos if not core.bayesdb_generator_has_model(bdb, generator_id, modelno) ] if modelnos_invalid: raise BQLError(bdb, 'No such models in generator %s: %s' % (repr(phrase.generator), repr(modelnos))) # Call generic alternations on the backend. backend = core.bayesdb_generator_backend(bdb, generator_id) backend.alter(bdb, generator_id, modelnos, cmds_generic) return empty_cursor(bdb) if isinstance(phrase, ast.InitModels): if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator,)) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) modelnos = range(phrase.nmodels) with bdb.savepoint(): # Find the model numbers. Omit existing ones for # ifnotexists; reject existing ones otherwise. if phrase.ifnotexists: modelnos = set(modelno for modelno in modelnos if not core.bayesdb_generator_has_model(bdb, generator_id, modelno)) else: existing = set(modelno for modelno in modelnos if core.bayesdb_generator_has_model(bdb, generator_id, modelno)) if 0 < len(existing): raise BQLError(bdb, 'Generator %s already has models: %s' % (repr(phrase.generator), sorted(existing))) # Stop now if there's nothing to initialize. if len(modelnos) == 0: return # Create the bayesdb_generator_model records. modelnos = sorted(modelnos) insert_model_sql = ''' INSERT INTO bayesdb_generator_model (generator_id, modelno) VALUES (:generator_id, :modelno) ''' for modelno in modelnos: bdb.sql_execute(insert_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) # Do backend-specific initialization. 
backend = core.bayesdb_generator_backend(bdb, generator_id) backend.initialize_models(bdb, generator_id, modelnos) return empty_cursor(bdb) if isinstance(phrase, ast.AnalyzeModels): # WARNING: It is the backend's responsibility to work in a # transaction. # # WARNING: It is the backend's responsibility to update the # iteration count in bayesdb_generator_model records. # # We do this so that the backend can save incremental # progress in case of ^C in the middle. # # XXX Put these warning somewhere more appropriate. if not core.bayesdb_has_generator(bdb, None, phrase.generator): raise BQLError(bdb, 'No such generator: %s' % (phrase.generator,)) generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator) backend = core.bayesdb_generator_backend(bdb, generator_id) # XXX Should allow parameters for iterations and ckpt/iter. backend.analyze_models(bdb, generator_id, modelnos=phrase.modelnos, iterations=phrase.iterations, max_seconds=phrase.seconds, ckpt_iterations=phrase.ckpt_iterations, ckpt_seconds=phrase.ckpt_seconds, program=phrase.program) return empty_cursor(bdb) if isinstance(phrase, ast.DropModels): with bdb.savepoint(): generator_id = core.bayesdb_get_generator( bdb, None, phrase.generator) backend = core.bayesdb_generator_backend(bdb, generator_id) modelnos = None if phrase.modelnos is not None: lookup_model_sql = ''' SELECT COUNT(*) FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' modelnos = sorted(list(phrase.modelnos)) for modelno in modelnos: cursor = bdb.sql_execute(lookup_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) if cursor_value(cursor) == 0: raise BQLError(bdb, 'No such model' ' in generator %s: %s' % (repr(phrase.generator), repr(modelno))) backend.drop_models(bdb, generator_id, modelnos=modelnos) if modelnos is None: drop_models_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = ? 
''' bdb.sql_execute(drop_models_sql, (generator_id,)) else: drop_model_sql = ''' DELETE FROM bayesdb_generator_model WHERE generator_id = :generator_id AND modelno = :modelno ''' for modelno in modelnos: bdb.sql_execute(drop_model_sql, { 'generator_id': generator_id, 'modelno': modelno, }) return empty_cursor(bdb) if isinstance(phrase, ast.Regress): # Retrieve the population. if not core.bayesdb_has_population(bdb, phrase.population): raise BQLError(bdb, 'No such population: %r' % (phrase.population,)) population_id = core.bayesdb_get_population(bdb, phrase.population) # Retrieve the generator generator_id = None if phrase.generator: if not core.bayesdb_has_generator(bdb, population_id, phrase.generator): raise BQLError(bdb, 'No such generator: %r' % (phrase.generator,)) generator_id = core.bayesdb_get_generator( bdb, population_id, phrase.generator) # Retrieve the target variable. if not core.bayesdb_has_variable( bdb, population_id, None, phrase.target): raise BQLError(bdb, 'No such variable: %r' % (phrase.target,)) colno_target = core.bayesdb_variable_number( bdb, population_id, None, phrase.target) stattype = core.bayesdb_variable_stattype(bdb, population_id, generator_id, colno_target) if stattype != 'numerical': raise BQLError(bdb, 'Target variable is not numerical: %r' % (phrase.target,)) # Build the given variables. if any(isinstance(col, ast.SelColAll) for col in phrase.givens): # Using * is not allowed to be mixed with other variables. if len(phrase.givens) > 1: raise BQLError(bdb, 'Cannot use (*) with other givens.') colno_givens = core.bayesdb_variable_numbers( bdb, population_id, None) else: if any(isinstance(col, ast.SelColSub) for col in phrase.givens): # Subexpression needs special compiling. 
out = compiler.Output(n_numpar, nampar_map, bindings) bql_compiler = compiler.BQLCompiler_None() givens = compiler.expand_select_columns( bdb, phrase.givens, True, bql_compiler, out) else: givens = phrase.givens colno_givens = [ core.bayesdb_variable_number( bdb, population_id, None, given.expression.column) for given in givens ] # Build the arguments to bqlfn.bayesdb_simulate. colno_givens_unique = set( colno for colno in colno_givens if colno!= colno_target ) if len(colno_givens_unique) == 0: raise BQLError(bdb, 'No matching given columns.') constraints = [] colnos = [colno_target] + list(colno_givens_unique) nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value modelnos = None if phrase.modelnos is None else str(phrase.modelnos) rows = bqlfn.bayesdb_simulate( bdb, population_id, generator_id, modelnos, constraints, colnos, numpredictions=nsamp) # Retrieve the stattypes. stattypes = [ core.bayesdb_variable_stattype( bdb, population_id, generator_id, colno_given) for colno_given in colno_givens_unique ] # Separate the target values from the given values. target_values = [row[0] for row in rows] given_values = [row[1:] for row in rows] given_names = [ core.bayesdb_variable_name(bdb, population_id, generator_id, given) for given in colno_givens_unique ] # Compute the coefficients. The import to regress_ols is here since the # feature depends on pandas + sklearn, so avoid module-wide import. from bayeslite.regress import regress_ols coefficients = regress_ols( target_values, given_values, given_names, stattypes) # Store the results in a winder. temptable = bdb.temp_table_name() qtt = sqlite3_quote_name(temptable) out = compiler.Output(0, {}, {}) out.winder(''' CREATE TEMP TABLE %s (variable TEXT, coefficient REAL); ''' % (qtt,), ()) for variable, coef in coefficients: out.winder(''' INSERT INTO %s VALUES (?, ?) 
''' % (qtt), (variable, coef,)) out.write('SELECT * FROM %s ORDER BY variable' % (qtt,)) out.unwinder('DROP TABLE %s' % (qtt,), ()) winders, unwinders = out.getwindings() return execute_wound( bdb, winders, unwinders, out.getvalue(), out.getbindings()) assert False # XXX
def create_generator(self, bdb, generator_id, schema_tokens, **kwargs):
    """Initialize backend-side state for a new CGPM generator.

    Parses `schema_tokens` into a schema, stores it as JSON in
    `bayesdb_cgpm_generator`, registers the schema's latent variables,
    assigns integer codes to the categories of each categorical
    variable, and maps table rowids to contiguous 0-indexed cgpm
    rowids (optionally via reservoir subsampling).
    """
    # Parse and validate the schema before touching the database.
    schema_ast = cgpm_schema.parse.parse(schema_tokens)
    schema = _create_schema(bdb, generator_id, schema_ast, **kwargs)

    # Store the schema.
    bdb.sql_execute('''
        INSERT INTO bayesdb_cgpm_generator (generator_id, schema_json)
            VALUES (?, ?)
    ''', (generator_id, json_dumps(schema)))

    # Get the underlying population and table.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)
    qt = sqlite3_quote_name(table)

    # Assign latent variable numbers.  Sorted iteration keeps the
    # assignment order deterministic across runs.
    for var, stattype in sorted(schema['latents'].iteritems()):
        core.bayesdb_add_latent(
            bdb, population_id, generator_id, var, stattype)

    # Assign codes to categories and consecutive column numbers to
    # the modelled variables.  Only non-latent variables (colno >= 0)
    # are considered here.
    vars_cursor = bdb.sql_execute('''
        SELECT colno, name, stattype FROM bayesdb_variable
            WHERE population_id = ? AND 0 <= colno
    ''', (population_id, ))
    for colno, name, stattype in vars_cursor:
        if _is_categorical(stattype):
            qn = sqlite3_quote_name(name)
            # Enumerate the distinct observed values; the enumeration
            # index becomes the integer code for that category.
            cursor = bdb.sql_execute('''
                SELECT DISTINCT %s FROM %s WHERE %s IS NOT NULL
            ''' % (qn, qt, qn))
            for code, (value, ) in enumerate(cursor):
                bdb.sql_execute('''
                    INSERT INTO bayesdb_cgpm_category
                        (generator_id, colno, value, code)
                        VALUES (?, ?, ?, ?)
                ''', (generator_id, colno, value, code))

    # Assign contiguous 0-indexed ids to the individuals in the
    # table.
    if schema['subsample']:
        # Draw a uniform subsample of k rowids without knowing the
        # table size in advance.
        k = schema['subsample']
        # NOTE(review): n is computed but never used afterwards --
        # presumably a leftover; the reservoir below does not need it.
        n = cursor_value(
            bdb.sql_execute('SELECT COUNT(*) FROM %s' % (qt, )))
        cursor = bdb.sql_execute(
            'SELECT _rowid_ FROM %s ORDER BY _rowid_ ASC' % (qt, ))
        uniform = bdb._prng.weakrandom_uniform
        # https://en.wikipedia.org/wiki/Reservoir_sampling
        samples = []
        for i, row in enumerate(cursor):
            if i < k:
                # Fill the reservoir with the first k rows.
                samples.append(row)
            else:
                # Replace a random reservoir slot with probability
                # k/(i+1).
                r = uniform(i + 1)
                if r < k:
                    samples[r] = row
        cursor = samples
    else:
        cursor = bdb.sql_execute('SELECT _rowid_ FROM %s' % (qt, ))
    for cgpm_rowid, (table_rowid, ) in enumerate(cursor):
        bdb.sql_execute('''
            INSERT INTO bayesdb_cgpm_individual
                (generator_id, table_rowid, cgpm_rowid)
                VALUES (?, ?, ?)
        ''', (generator_id, table_rowid, cgpm_rowid))
def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
        constraints, num_samples=1, accuracy=None):
    """Simulate the `targets` variables jointly, given `constraints`.

    `targets` is a list of column numbers; `constraints` is a list of
    (colno, value) pairs.  If `rowid` names an existing row, its
    observed values are folded into the constraints (without mutating
    the caller's list).  Returns `num_samples` rows, each a list of
    values in `targets` order, drawn from the Loom server.

    `accuracy` is accepted for interface compatibility but unused.
    """
    # Retrieve the population id.
    population_id = bayesdb_generator_population(bdb, generator_id)
    # If rowid exists, retrieve conditioning data from the table.
    # NOTE(review): bayesdb_population_fresh_row_id is handed
    # generator_id here, not population_id -- confirm against its
    # signature.
    if rowid != bayesdb_population_fresh_row_id(bdb, generator_id):
        row_values_raw = bayesdb_population_row_values(
            bdb, population_id, rowid)
        # Loom's CSV interface wants str, not unicode.
        row_values = [
            str(a) if isinstance(a, unicode) else a
            for a in row_values_raw
        ]
        # Keep only the observed cells, as (colno, value) pairs.
        row = [
            entry for entry in enumerate(row_values)
            if entry[1] is not None
        ]
        constraints_colnos = set(c[0] for c in constraints)
        row_colnos = set(r[0] for r in row)
        if constraints_colnos & row_colnos:
            # Fixed: the original message lacked the space between
            # 'and' and 'target'.
            raise BQLError(bdb, 'Overlap between constraints and '
                'target row in simulate.')
        # Extend a local copy rather than mutating the caller's list.
        constraints = list(constraints) + row
    # Prepare the query row to provide to Loom: targets are marked
    # with an empty string, constraints carry their fixed values.
    # NOTE(review): bayesdb_variable_name is called with generator_id
    # in the population_id position -- confirm argument order.
    row = {}
    target_num_to_name = {}
    for colno in targets:
        name = bayesdb_variable_name(bdb, generator_id, None, colno)
        target_num_to_name[colno] = name
        row[name] = ''
    for (colno, value) in constraints:
        name = bayesdb_variable_name(bdb, generator_id, None, colno)
        row[name] = value
    # Fetch the server.
    server = self._get_cache_entry(bdb, generator_id, 'preql_server')
    # Prepare the csv header.  Loom lowercases column names, so keep
    # a map back to the original casing for the returned header row.
    csv_headers, csv_values = zip(*row.iteritems())
    lower_to_upper = {str(a).lower(): str(a) for a in csv_headers}
    csv_headers = lower_to_upper.keys()
    csv_values = [str(a) for a in csv_values]
    # Retrieve the samples from the server.
    outfile = StringIO()
    writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
    reader = iter([csv_headers] + [csv_values])
    server._predict(reader, num_samples, writer, False)
    output = writer.result()
    # Parse output: first CRLF-separated line is the header, the rest
    # are one sample per line.
    returned_headers = [
        lower_to_upper[a]
        for a in output.strip().split('\r\n')[0].split(CSV_DELIMITER)
    ]
    loom_output = [
        zip(returned_headers, a.split(CSV_DELIMITER))
        for a in output.strip().split('\r\n')[1:]
    ]
    return_list = []
    for row in loom_output:
        # Prepare the row, converting non-nominal values back to
        # float.
        row_values = []
        row_dict = dict(row)
        for colno in targets:
            colname = target_num_to_name[colno]
            value = row_dict[colname]
            stattype = bayesdb_variable_stattype(
                bdb, population_id, None, colno)
            if not _is_nominal(stattype):
                value = float(value)
            row_values.append(value)
        # Add this row to the return list.
        return_list.append(row_values)
    return return_list