def get_schema_as_list(bdb, population_name):
    """Describe every variable of a population as a list of dicts.

    Each entry carries a 'name' and a 'stat_type' key.  Entries for
    variables whose statistical type is 'nominal' additionally carry a
    'unique_values' list holding the distinct non-NULL values found for
    that variable in the population's base table.

    Raises KeyError if a variable has a statistical type other than
    'numerical', 'nominal' or 'categorical'.
    """
    # Translation from the population's stattype to the emitted 'stat_type'.
    stat_type_names = {
        'numerical': 'realAdditive',
        'nominal': 'categorical',
        'categorical': 'categorical',
    }

    pop_id = bayesdb_get_population(bdb, population_name)
    quoted_table = bql_quote_name(bayesdb_population_table(bdb, pop_id))

    def describe(name):
        # Build the schema entry for a single variable.
        number = bayesdb_variable_number(bdb, pop_id, None, name)
        kind = bayesdb_variable_stattype(bdb, pop_id, None, number)
        entry = {'name': name, 'stat_type': stat_type_names[kind]}
        if kind != 'nominal':
            return entry
        quoted_var = bql_quote_name(name)
        sql = ''' SELECT DISTINCT(%s) FROM %s WHERE %s IS NOT NULL ''' % (
            quoted_var, quoted_table, quoted_var)
        # NOTE(review): the result is used like a pandas DataFrame
        # (.columns, .unique(), .tolist()) -- confirm against utils_bql.query.
        frame = utils_bql.query(bdb, sql)
        entry['unique_values'] = frame[frame.columns[0]].unique().tolist()
        return entry

    return [
        describe(name)
        for name in bayesdb_variable_names(bdb, pop_id, None)
    ]
def test_case():
    """Population variable names keep the spelling and order of the table
    columns they were created from ('x' and 'Y')."""
    setup_sql = (
        'create table t(x,Y)',
        'insert into t values(1,2)',
        'insert into t values(3,4)',
        'insert into t values(1,4)',
        'insert into t values(2,2)',
    )
    with bayeslite.bayesdb_open(':memory:') as bdb:
        for statement in setup_sql:
            bdb.sql_execute(statement)
        bdb.execute('create population p for t(guess(*))')
        pop_id = core.bayesdb_get_population(bdb, 'p')
        names = core.bayesdb_variable_names(bdb, pop_id, None)
        assert names == ['x', 'Y']
def _create_schema(bdb, generator_id, schema_ast, **kwargs):
    """Turn a parsed CGPM schema into the schema dict for a generator.

    Parameters:
        bdb: database handle, passed through to the `core` lookups.
        generator_id: generator whose population supplies the variables.
        schema_ast: list of parsed clauses (Basic, Latent, Foreign,
            Subsample).  It is modified in place: Latent clauses derived
            from the `exposed` entries of Foreign clauses are appended,
            the `outputs` of those Foreign clauses are extended with the
            exposed names, and a Foreign clause is appended for a
            non-crosscat baseline.
        **kwargs: only 'baseline' is read.  When given and its name is
            not 'crosscat', it must expose `.name` and `.params`, an
            iterable of (name, literal) pairs whose literals have a
            `.value` attribute.

    Returns:
        A dict with keys 'variables' (list of
        [name, stattype, dist, params]), 'cgpm_composition' (list of
        dicts with 'name', 'inputs', 'outputs', 'kwds'), 'subsample' and
        'latents' (name -> stattype).

    Raises:
        BQLError: a schema is combined with a non-crosscat baseline; a
            clause is unknown or SUBSAMPLE is duplicated; variables are
            duplicated, unknown, already latent, of unknown statistical
            type, or needed as inputs but not modelled.
    """
    # Get some parameters.
    population_id = core.bayesdb_generator_population(bdb, generator_id)

    # State.
    variables = []
    variable_dist = {}
    latents = {}
    cgpm_composition = []
    modelled = set()
    default_modelled = set()
    subsample = None
    deferred_input = defaultdict(lambda: [])
    deferred_output = dict()

    # Error-reporting state.
    duplicate = set()
    unknown = set()
    needed = set()
    existing_latent = set()
    must_exist = []
    unknown_stattype = {}

    # XXX Convert all Foreign.exposed lists to Latent clauses.
    # Retrieve Foreign clauses with exposed variables.
    foreign_clauses = [
        c for c in schema_ast
        if isinstance(c, cgpm_schema.parse.Foreign) and len(c.exposed) > 0
    ]
    # Add the exposed variables to Foreign.outputs
    # Note that this assumes if there are K exposed variables, then they are
    # necessarily the last K outputs of the fc.outputs.
    for fc in foreign_clauses:
        fc.outputs.extend([e[0] for e in fc.exposed])

    # Convert exposed entries into Latent clauses.
    latent_vars = list(
        itertools.chain.from_iterable(c.exposed for c in foreign_clauses))
    latent_clauses = [cgpm_schema.parse.Latent(v, s) for (v, s) in latent_vars]
    # Append the Latent clauses to the ast.
    schema_ast.extend(latent_clauses)

    # XXX Convert the baseline to a Foreign clause.
    # Currently the baselines do not accept a schema, and will fail if
    # `schema_ast` has any entries.
    baseline = kwargs.get('baseline', None)
    if baseline is not None and casefold(baseline.name) != 'crosscat':
        if schema_ast:
            raise BQLError(
                bdb, 'Cannot accept schema with baseline: %s.' % schema_ast)
        # Retrieve all variable names in the population
        outputs = core.bayesdb_variable_names(bdb, population_id, None)
        # Convert the LITERAL namedtuples to their raw values.  Built
        # pair by pair so that a baseline without parameters yields an
        # empty list instead of failing to unpack `zip(*[])`.
        params = [(p, v.value) for (p, v) in baseline.params]
        # Create the clause.
        clause = cgpm_schema.parse.Foreign(
            outputs, [], [], baseline.name, params)
        # And append it to the schema_ast.
        schema_ast.append(clause)

    # Process each clause one by one.
    for clause in schema_ast:

        if isinstance(clause, cgpm_schema.parse.Basic):
            # Basic Crosscat component model: one variable to be put
            # into Crosscat views.
            var = clause.var
            dist = clause.dist
            params = dict(clause.params)  # XXX error checking

            # Reject if the variable does not exist.
            if not core.bayesdb_has_variable(bdb, population_id, None, var):
                unknown.add(var)
                continue

            # Reject if the variable has already been modelled.
            if var in modelled:
                duplicate.add(var)
                continue

            # Reject if the variable is latent.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Get the column number.
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            assert 0 <= colno

            # Add it to the list and mark it modelled by default.
            stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            variables.append([var, stattype, dist, params])
            assert var not in variable_dist
            variable_dist[var] = (stattype, dist, params)
            modelled.add(var)
            default_modelled.add(var)

        elif isinstance(clause, cgpm_schema.parse.Latent):
            var = clause.name
            stattype = clause.stattype

            # Reject if the variable has already been modelled by the
            # default model.
            if var in default_modelled:
                duplicate.add(var)
                continue

            # Reject if the variable even *exists* in the population
            # at all yet.
            if core.bayesdb_has_variable(bdb, population_id, None, var):
                duplicate.add(var)
                continue

            # Reject if the variable is already latent, from another
            # generator.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Reject if we've already processed it.
            if var in latents:
                duplicate.add(var)
                continue

            # Add it to the set of latent variables.
            latents[var] = stattype

        elif isinstance(clause, cgpm_schema.parse.Foreign):
            # Foreign model: some set of output variables is to be
            # modelled by foreign logic, possibly conditional on some
            # set of input variables.
            #
            # Gather up the state for a cgpm_composition record, which
            # we may have to do incrementally because it must refer to
            # the distribution types of variables we may not have
            # seen.
            name = clause.name
            outputs = clause.outputs
            inputs = clause.inputs

            output_stattypes = []
            output_statargs = []
            input_stattypes = []
            input_statargs = []
            distargs = {
                'inputs': {
                    'stattypes': input_stattypes,
                    'statargs': input_statargs
                },
                'outputs': {
                    'stattypes': output_stattypes,
                    'statargs': output_statargs,
                }
            }
            kwds = {'distargs': distargs}
            kwds.update(clause.params)

            # First make sure all the output variables exist and have
            # not yet been modelled.
            for var in outputs:
                must_exist.append(var)
                if var in modelled:
                    duplicate.add(var)
                    continue
                modelled.add(var)
                # Add the output statistical type and its parameters.
                # The slot is reserved now and filled in once every
                # clause has been seen.
                i = len(output_stattypes)
                assert i == len(output_statargs)
                output_stattypes.append(None)
                output_statargs.append(None)
                deferred_output[var] = (output_stattypes, output_statargs, i)

            # Next make sure all the input variables exist, mark them
            # needed, and record where to put their distribution type
            # and parameters.
            for var in inputs:
                must_exist.append(var)
                needed.add(var)
                i = len(input_stattypes)
                assert i == len(input_statargs)
                input_stattypes.append(None)
                input_statargs.append(None)
                deferred_input[var].append(
                    (input_stattypes, input_statargs, i))

            # Finally, add a cgpm_composition record.
            cgpm_composition.append({
                'name': name,
                'inputs': inputs,
                'outputs': outputs,
                'kwds': kwds,
            })

        elif isinstance(clause, cgpm_schema.parse.Subsample):
            if subsample is not None:
                raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n, ))
            subsample = clause.n

        else:
            raise BQLError(bdb, 'Unknown clause: %r' % (clause, ))

    # Make sure all the outputs and inputs exist, either in the
    # population or as latents in this generator.
    for var in must_exist:
        if core.bayesdb_has_variable(bdb, population_id, None, var):
            continue
        if var in latents:
            continue
        unknown.add(var)

    # Raise an exception if there were duplicates or unknown
    # variables.
    if duplicate:
        raise BQLError(
            bdb, 'Duplicate model variables: %r' % (sorted(duplicate), ))
    if existing_latent:
        raise BQLError(
            bdb,
            'Latent variables already defined: %r'
            % (sorted(existing_latent), ))
    if unknown:
        raise BQLError(
            bdb, 'Unknown model variables: %r' % (sorted(unknown), ))

    def default_dist(var, stattype):
        # Return (dist, params) for a known stattype; otherwise record
        # the variable in unknown_stattype and return None.
        stattype = casefold(stattype)
        if stattype not in _DEFAULT_DIST:
            if var in unknown_stattype:
                assert unknown_stattype[var] == stattype
            else:
                unknown_stattype[var] = stattype
            return None
        dist, params = _DEFAULT_DIST[stattype](bdb, generator_id, var)
        return dist, params

    # Use the default distribution for any variables that remain to be
    # modelled, excluding any that are latent or that have statistical
    # types we don't know about.
    for var in core.bayesdb_variable_names(bdb, population_id, None):
        if var in modelled:
            continue
        colno = core.bayesdb_variable_number(bdb, population_id, None, var)
        assert 0 <= colno
        stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
        distparams = default_dist(var, stattype)
        if distparams is None:
            continue
        dist, params = distparams
        variables.append([var, stattype, dist, params])
        assert var not in variable_dist
        variable_dist[var] = (stattype, dist, params)
        modelled.add(var)

    # Fill in the deferred_input statistical type assignments.
    for var in sorted(deferred_input):
        # Check whether the variable is modelled.  If not, skip -- we
        # will fail later because this variable is guaranteed to also
        # be in needed.
        if var not in modelled:
            assert var in needed
            continue

        # Determine (possibly fictitious) distribution and parameters.
        if var in default_modelled:
            # Manifest variable modelled by default Crosscat model.
            assert var in variable_dist
            stattype, dist, params = variable_dist[var]
        else:
            # Modelled by a foreign model.  Assign a fictitious
            # default distribution because the 27B/6 of CGPM requires
            # this.
            if var in latents:
                # Latent variable modelled by a foreign model.  Use
                # the statistical type specified for it.
                stattype = latents[var]
            else:
                # Manifest variable modelled by a foreign model.  Use
                # the statistical type in the population.
                assert core.bayesdb_has_variable(bdb, population_id, None, var)
                colno = core.bayesdb_variable_number(
                    bdb, population_id, None, var)
                stattype = core.bayesdb_variable_stattype(
                    bdb, population_id, colno)
            distparams = default_dist(var, stattype)
            if distparams is None:
                continue
            dist, params = distparams

        # Assign the distribution and parameters.
        for cctypes, ccargs, i in deferred_input[var]:
            assert cctypes[i] is None
            assert ccargs[i] is None
            cctypes[i] = dist
            ccargs[i] = params

    # Fill in the deferred_output statistical type assignments.  They need
    # to be in the form NUMERICAL or CATEGORICAL.
    for var in deferred_output:
        if var in latents:
            # Latent variable modelled by a foreign model.  Use
            # the statistical type specified for it.
            var_stattype = casefold(latents[var])
            if var_stattype not in _DEFAULT_DIST:
                if var in unknown_stattype:
                    assert unknown_stattype[var] == var_stattype
                else:
                    unknown_stattype[var] = var_stattype
            # XXX Cannot specify statargs for a latent variable.  Trying to
            # use default_dist might look up the counts for unique values
            # of the categorical in the base table, causing a failure.
            var_statargs = {}
        else:
            # Manifest variable modelled by a foreign model.  Use
            # the statistical type and arguments from the population.
            assert core.bayesdb_has_variable(bdb, population_id, None, var)
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            distparams = default_dist(var, var_stattype)
            if distparams is None:
                continue
            _, var_statargs = distparams

        stattypes, statargs, i = deferred_output[var]
        assert stattypes[i] is None
        assert statargs[i] is None
        stattypes[i] = var_stattype
        statargs[i] = var_statargs

    if unknown_stattype:
        raise BQLError(
            bdb,
            'Unknown statistical types for variables: %r'
            % (sorted(unknown_stattype.items()), ))

    # If there remain any variables that we needed to model, because
    # others are conditional on them, fail.
    needed -= modelled
    if needed:
        raise BQLError(bdb, 'Unmodellable variables: %r' % (needed, ))

    # Finally, create a CGPM schema.
    return {
        'variables': variables,
        'cgpm_composition': cgpm_composition,
        'subsample': subsample,
        'latents': latents,
    }
def predict_confidence(self, bdb, generator_id, modelno, colno, rowid,
        numsamples=None):
    """Impute the cell (rowid, colno) and report a confidence.

    Draws `numsamples` joint samples for the cell via
    `self.simulate_joint` and summarizes them:

    - categorical variable: the prediction is a most frequent sampled
      value and the confidence is its share of `numsamples`;
    - any other variable: the prediction is the sample mean and the
      confidence is always 0.

    `numsamples` falls back to 2 when it is None or 0.

    Returns a `(prediction, confidence)` pair.
    """
    if not numsamples:
        numsamples = 2
    assert numsamples > 0

    def _impute_categorical(sample):
        counts = Counter(s[0] for s in sample)
        mode_count = max(counts[v] for v in counts)
        # First value (in Counter iteration order) reaching the top count.
        pred = next(v for v in counts if counts[v] == mode_count)
        conf = float(mode_count) / numsamples
        return pred, conf

    def _impute_numerical(sample):
        pred = sum(s[0] for s in sample) / float(len(sample))
        conf = 0  # XXX Punt confidence for now
        return pred, conf

    population_id = core.bayesdb_generator_population(bdb, generator_id)

    constraints = []
    # If rowid is a hypothetical cell for cgpm (did not exist at the time
    # of INITIALIZE), but exists in the base table (by INSERT INTO), then
    # retrieve all values for rowid as the constraints.
    exists = rowid < core.bayesdb_generator_fresh_row_id(bdb, generator_id)
    max_cgpm_rowid = bdb.sql_execute(
        ''' SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual WHERE generator_id = ? ''',
        (generator_id, )).fetchall()[0][0]
    # MAX() yields NULL (None) when the generator has no recorded rows;
    # every rowid then counts as hypothetical.  Spelled out because
    # ordering an int against None only works on Python 2.
    hypothetical = max_cgpm_rowid is None or rowid > max_cgpm_rowid
    if exists and hypothetical:
        # Retrieve all other variables except colno, and ignore latents in
        # generator_id, and place them in the constraints.
        pop_names = core.bayesdb_variable_names(bdb, population_id, None)
        avoid_name = core.bayesdb_variable_name(bdb, population_id, colno)
        constraints_names = [n for n in pop_names if n != avoid_name]
        # Obtain the row.
        qt_names = str.join(',', map(sqlite3_quote_name, constraints_names))
        qt_table = sqlite3_quote_name(
            core.bayesdb_population_table(bdb, population_id))
        data = bdb.sql_execute(
            ''' SELECT %s FROM %s WHERE oid = ? ''' % (qt_names, qt_table, ),
            (rowid, )).fetchall()[0]
        # Build the constraints.
        pop_nos = core.bayesdb_variable_numbers(bdb, population_id, None)
        constraints_nos = [n for n in pop_nos if n != colno]
        assert len(data) == len(constraints_nos)
        # NOTE(review): the truthiness test drops NULLs and empty strings,
        # but also a stored 0 / 0.0 -- confirm whether zeros should be
        # kept as constraints.
        constraints = [
            (rowid, c, v)
            for c, v in zip(constraints_nos, data)
            if (v is not None) and v
        ]

    # Retrieve the samples.
    sample = self.simulate_joint(
        bdb, generator_id, [(rowid, colno)], constraints, modelno,
        numsamples)

    # Determine the imputation strategy (mode or mean).
    stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
    if _is_categorical(stattype):
        return _impute_categorical(sample)
    return _impute_numerical(sample)
def retrieve_analyze_variables(ast):
    """Resolve the clauses of an ANALYZE program into variable numbers.

    NOTE(review): `bdb`, `population_id` and `generator_id` are not
    parameters; they are presumably bound in an enclosing scope that is
    not visible here -- confirm.

    Parameters:
        ast: iterable of parsed ANALYZE clauses (Variables, Skip,
            Optimized).

    Returns:
        `(varnos, seen_optimized)`: the variable numbers to transition
        (all variables when neither VARIABLES nor SKIP is given) and
        whether an OPTIMIZED clause was present.

    Raises:
        BQLError: more than one VARIABLES/SKIP clause, unknown variable
            names, or OPTIMIZED combined with VARIABLES.
        ValueError: a clause of an unrecognized type.
    """
    def _known_variable_set(names):
        # Return `names` as a set, rejecting any not known to the
        # population/generator.
        unknown = set()
        collected = set()
        for var in names:
            if not core.bayesdb_has_variable(
                    bdb, population_id, generator_id, var):
                unknown.add(var)
            collected.add(var)
        if unknown:
            raise BQLError(
                bdb,
                'Unknown variables in ANALYZE: %r' % (sorted(unknown), ))
        return collected

    # Transition all variables by default.
    variables = None

    # Exactly 1 VARIABLES or SKIP clause supported for simplicity.
    seen_variables, seen_skip, seen_optimized = False, False, False
    for clause in ast:
        # Transition user specified variables only.
        if isinstance(clause, cgpm_analyze.parse.Variables):
            if seen_variables or seen_skip:
                raise BQLError(
                    bdb,
                    'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            seen_variables = True
            variables = sorted(_known_variable_set(clause.vars))
        # Transition all variables except user specified skip.
        elif isinstance(clause, cgpm_analyze.parse.Skip):
            if seen_variables or seen_skip:
                raise BQLError(
                    bdb,
                    'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            seen_skip = True
            excluded = _known_variable_set(clause.vars)
            all_vars = core.bayesdb_variable_names(
                bdb, population_id, generator_id)
            variables = sorted(set(all_vars) - excluded)
        elif isinstance(clause, cgpm_analyze.parse.Optimized):
            seen_optimized = True
        # Unknown/impossible clause.
        else:
            # Wrap in a tuple so a tuple-valued `ast` is formatted as one
            # value instead of being unpacked as format arguments.
            raise ValueError('Unknown clause in ANALYZE: %s.' % (ast, ))

    if variables is None:
        variables = core.bayesdb_variable_names(
            bdb, population_id, generator_id)
    varnos = [
        core.bayesdb_variable_number(bdb, population_id, generator_id, v)
        for v in variables
    ]

    # TODO Perform error checking if the OPTIMIZED clause is used.
    # In particular, the variables in OPTIMIZED must correspond
    # EXACTLY to the variables that are modeled by the CrossCat
    # baseline.  Avoided this check for now since the nature of a
    # variable is not stored in the bdb.  For now, just check the
    # user did not include a VARIABLES clause.
    if seen_optimized:
        if seen_variables:
            raise BQLError(bdb, 'OPTIMIZED incompatible with VARIABLES')
        # TODO Check if varnos are exactly the CrossCat variables, and
        # otherwise fail with a message along the lines of: "The
        # OPTIMIZED phrase in ANALYZE must target all the variables
        # modeled by the baseline, only.  Use SKIP to explicitly ignore
        # analysis of overriden variables".
    return varnos, seen_optimized
def _retrieve_analyze_variables(bdb, generator_id, ast):
    """Resolve the clauses of an ANALYZE program for a generator.

    Returns `(variable_numbers, seen_optimized)`.  `variable_numbers` is
    the list of variable numbers selected by a VARIABLES or SKIP clause,
    or None when no such clause was given or the selection came out
    empty.  `seen_optimized` tells whether an OPTIMIZED clause occurred.

    Raises BQLError for repeated VARIABLES/SKIP clauses, unknown
    variable names, unrecognized clauses, and OPTIMIZED combined with
    VARIABLES or SKIP.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)

    def _checked(names):
        # Collect `names` into a set; fail if any of them is unknown.
        collected, missing = set(), set()
        for name in names:
            known = core.bayesdb_has_variable(
                bdb, population_id, generator_id, name)
            if not known:
                missing.add(name)
            collected.add(name)
        if missing:
            raise BQLError(
                bdb, 'Unknown variables in ANALYZE: %r' % (sorted(missing), ))
        return collected

    # None means "no explicit selection": transition everything.
    selected = None
    # At most one VARIABLES or SKIP clause is supported, so a single flag
    # covers both of them.
    selector_seen = False
    seen_optimized = False

    for clause in ast:
        is_variables = isinstance(clause, cgpm_analyze.parse.Variables)
        if is_variables or isinstance(clause, cgpm_analyze.parse.Skip):
            if selector_seen:
                raise BQLError(
                    bdb, 'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            selector_seen = True
            named = _checked(clause.vars)
            if is_variables:
                # Transition the user specified variables only.
                selected = sorted(named)
            else:
                # Transition everything except the skipped variables.
                everything = core.bayesdb_variable_names(
                    bdb, population_id, generator_id)
                selected = sorted(set(everything) - named)
            continue
        if isinstance(clause, cgpm_analyze.parse.Optimized):
            seen_optimized = True
            continue
        # Unknown/impossible clause.
        raise BQLError(bdb, 'Unknown clause in ANALYZE: %s.' % (ast, ))

    # OPTIMIZED is incompatible with any other clause.
    if seen_optimized and selector_seen:
        raise BQLError(bdb, 'OPTIMIZED incompatible with other clauses.')

    if not selected:
        return (None, seen_optimized)

    numbers = []
    for name in selected:
        numbers.append(
            core.bayesdb_variable_number(
                bdb, population_id, generator_id, name))
    return (numbers, seen_optimized)