def analyze_models(self, bdb, generator_id, modelnos=None, iterations=1,
        max_seconds=None, ckpt_iterations=None, ckpt_seconds=None,
        program=None):
    """Run Loom inference over every model of `generator_id`.

    Loom only supports whole-population analysis for a fixed number of
    iterations, so each unsupported knob is rejected up front.
    """
    # Validate the unsupported options in the same order as before.
    rejections = [
        (max_seconds is not None,
            'Loom analyze does not support number of seconds.'),
        (ckpt_iterations is not None or ckpt_seconds is not None,
            'Loom analyze does not support checkpoint.'),
        (program is not None,
            'Loom analyze does not support programs.'),
        (modelnos is not None,
            'Loom cannot analyze specific model numbers.'),
    ]
    for unsupported, message in rejections:
        if unsupported:
            raise BQLError(bdb, message)
    # Loom runs `extra_passes` full passes over each of the project's
    # models; at least one pass is always requested.
    sample_count = self._get_num_models(bdb, generator_id)
    extra_passes = max(int(iterations), 1)
    inference_config = {'schedule': {'extra_passes': extra_passes}}
    project_path = self._get_loom_project_path(bdb, generator_id)
    loom.tasks.infer(
        project_path, sample_count=sample_count, config=inference_config)
    # Persist the learned column and row partitions, then drop any
    # cached servers, which are now stale.
    self._store_kind_partition(bdb, generator_id, modelnos)
    self._close_query_server(bdb, generator_id)
    self._close_preql_server(bdb, generator_id)
def create_generator(self, bdb, generator_id, schema, **kwargs):
    """Set up NIG-Normal state for `generator_id`.

    Records sufficient statistics for every (numerical) variable of the
    population and, for each `deviation` clause in `schema`, registers a
    latent deviation variable.
    """
    # XXX Do something with the schema.
    insert_column_sql = '''
        INSERT INTO bayesdb_nig_normal_column
            (population_id, generator_id, colno, count, sum, sumsq)
            VALUES (:population_id, :generator_id, :colno,
                :count, :sum, :sumsq)
    '''
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)
    # Every modeled variable must be numerical; store its suff stats.
    for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
        column_name = core.bayesdb_variable_name(
            bdb, population_id, generator_id, colno)
        stattype = core.bayesdb_variable_stattype(
            bdb, population_id, generator_id, colno)
        if stattype != 'numerical':
            raise BQLError(bdb, 'NIG-Normal only supports'
                ' numerical columns, but %s is %s'
                % (repr(column_name), repr(stattype)))
        count, xsum, sumsq = data_suff_stats(bdb, table, column_name)
        bdb.sql_execute(insert_column_sql, {
            'population_id': population_id,
            'generator_id': generator_id,
            'colno': colno,
            'count': count,
            'sum': xsum,
            'sumsq': sumsq,
        })
    # XXX Make the schema a little more flexible.
    if schema == [[]]:
        return
    for clause in schema:
        # Accept only clauses of the form [<dev_var>, 'deviation', [<obs_var>]].
        well_formed = (len(clause) == 3
            and isinstance(clause[0], str)
            and clause[1] == 'deviation'
            and isinstance(clause[2], list)
            and len(clause[2]) == 1
            and isinstance(clause[2][0], str))
        if not well_formed:
            raise BQLError(bdb, 'Invalid nig_normal clause: %r' % (clause,))
        dev_var = clause[0]
        obs_var = clause[2][0]
        if not core.bayesdb_has_variable(bdb, population_id, None, obs_var):
            raise BQLError(bdb, 'No such variable: %r' % (obs_var,))
        obs_colno = core.bayesdb_variable_number(
            bdb, population_id, None, obs_var)
        # The deviation variable is latent: visible to this generator only.
        dev_colno = core.bayesdb_add_latent(
            bdb, population_id, generator_id, dev_var, 'numerical')
        bdb.sql_execute('''
            INSERT INTO bayesdb_nig_normal_deviation
                (population_id, generator_id, deviation_colno, observed_colno)
                VALUES (?, ?, ?, ?)
        ''', (population_id, generator_id, dev_colno, obs_colno))
def bayesdb_generator_row_values(bdb, generator_id, rowid):
    """Return the values stored in `rowid` of the table backing
    `generator_id`.

    Raises `BQLError` if the row does not exist, or if the rowid lookup
    somehow matches more than one row.
    """
    table_name = bayesdb_generator_table(bdb, generator_id)
    column_names = bayesdb_generator_column_names(bdb, generator_id)
    qt = sqlite3_quote_name(table_name)
    qcns = ','.join(map(sqlite3_quote_name, column_names))
    select_sql = ('SELECT %s FROM %s WHERE _rowid_ = ?' % (qcns, qt))
    cursor = bdb.sql_execute(select_sql, (rowid, ))
    row = None
    try:
        row = cursor.next()
    except StopIteration:
        # NOTE(review): this looks up the table name, not the generator
        # name -- presumably bayesdb_generator_name was intended; verify.
        generator = bayesdb_generator_table(bdb, generator_id)
        # Bug fix: the repr'd generator and rowid were formatted with %d,
        # which raises TypeError on strings; use %s for the repr values.
        raise BQLError(
            bdb, 'No such row in table %s'
            ' for generator %s: %s' %
            (repr(table_name), repr(generator), repr(rowid)))
    try:
        cursor.next()
    except StopIteration:
        pass
    else:
        generator = bayesdb_generator_table(bdb, generator_id)
        # Bug fix: rowid was repr'd but formatted with %d; use %s.
        raise BQLError(
            bdb, 'More than one such row'
            ' in table %s for generator %s: %s' %
            (repr(table_name), repr(generator), repr(rowid)))
    return row
def simulate_column(phrase):
    """Evaluate one simulated expression for a SIMULATE query.

    Only MUTUAL INFORMATION is implemented; the other BQL estimands
    raise BQLError.
    """
    if isinstance(phrase, ast.ExpBQLDepProb):
        raise BQLError(
            bdb, 'DEPENDENCE PROBABILITY simulation still unsupported.')
    if isinstance(phrase, ast.ExpBQLProb):
        raise BQLError(bdb, 'PROBABILITY OF simulation still unsupported.')
    if isinstance(phrase, ast.ExpBQLMutInf):
        colno0 = retrieve_variable(phrase.column0)
        colno1 = retrieve_variable(phrase.column1)
        # Flatten the (variable, value) constraint pairs into one
        # argument tuple for the bqlfn entry point.
        constraint_args = ()
        if phrase.constraints is not None:
            pairs = [[retrieve_variable(colname), retrieve_literal(expr)]
                for colname, expr in phrase.constraints]
            constraint_args = tuple(itertools.chain.from_iterable(pairs))
        nsamples = phrase.nsamples and retrieve_literal(phrase.nsamples)
        # One mi_list per generator of the population.
        mi_lists = bqlfn._bql_column_mutual_information(
            bdb, population_id, generator_id, colno0, colno1, nsamples,
            *constraint_args)
        return list(itertools.chain.from_iterable(mi_lists))
    raise BQLError(
        bdb, 'Only constants can be simulated: %s.' % (simulation, ))
def analyze_models(self, bdb, generator_id, modelnos=None, iterations=1,
        max_seconds=None, ckpt_iterations=None, ckpt_seconds=None,
        program=None):
    """Run Loom inference over all models of `generator_id` and refresh
    the cached query/preql servers with the new analysis results.

    Loom only supports whole-population analysis for a fixed number of
    iterations, so every unsupported option raises BQLError.
    """
    if max_seconds is not None:
        raise BQLError(bdb,
            'Loom analyze does not support number of seconds.')
    if ckpt_iterations is not None or ckpt_seconds is not None:
        raise BQLError(bdb, 'Loom analyze does not support checkpoint.')
    if program is not None:
        raise BQLError(bdb, 'Loom analyze does not support programs.')
    if modelnos is not None:
        raise BQLError(bdb, 'Loom cannot analyze specific model numbers.')
    # Prepare arguments for loom.tasks.infer invocation.
    num_models = self._get_num_models(bdb, generator_id)
    iterations = max(int(iterations), 1)
    config = {'schedule': {'extra_passes': iterations}}
    # Hoisted: compute the project path once instead of three times.
    project_path = self._get_loom_project_path(bdb, generator_id)
    # Run inference.
    loom.tasks.infer(project_path, sample_count=num_models, config=config)
    # Save the column and row partitions.
    self._store_kind_partition(bdb, generator_id, modelnos)
    # Refresh the cached servers so later queries see the new analysis.
    self._set_cache_entry(bdb, generator_id, 'q_server',
        loom.query.get_server(project_path))
    preql_server = loom.tasks.query(project_path)
    self._set_cache_entry(bdb, generator_id, 'preql_server', preql_server)
def bayesdb_population_row_values(bdb, population_id, rowid):
    """Return values stored in `rowid` of given `population_id`."""
    table_name = bayesdb_population_table(bdb, population_id)
    column_names = bayesdb_variable_names(bdb, population_id, None)
    quoted_table = sqlite3_quote_name(table_name)
    quoted_columns = ','.join(
        sqlite3_quote_name(name) for name in column_names)
    cursor = bdb.sql_execute(
        'SELECT %s FROM %s WHERE oid = ?' % (quoted_columns, quoted_table),
        (rowid,))
    try:
        row = cursor.next()
    except StopIteration:
        population = bayesdb_population_table(bdb, population_id)
        raise BQLError(bdb,
            'No such row in table %s for population %s: %d'
            % (repr(table_name), repr(population), rowid))
    # An oid lookup should match at most one row; more than one means
    # the table is corrupt.
    try:
        cursor.next()
    except StopIteration:
        pass
    else:
        population = bayesdb_population_table(bdb, population_id)
        raise BQLError(bdb,
            'More than one such row in table %s for population %s: %d'
            % (repr(table_name), repr(population), rowid))
    return row
def bql_row_predictive_relevance(bdb, population_id, generator_id, modelnos,
        rowid_target, rowid_query, colno, *constraint_args):
    # Mean, over the applicable generators, of the predictive relevance
    # of `rowid_target` to the query rows in the context of `colno`.
    #
    # `rowid_query` arrives as a JSON-encoded list of rowids.
    # `constraint_args` is a flat sequence of alternating (colno, value)
    # entries describing hypothetical rows, where a None entry terminates
    # each hypothetical row.
    if rowid_target is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    rowid_query = json.loads(rowid_query)
    modelnos = _retrieve_modelnos(modelnos)
    # Build the list of hypothetical values.
    # Each sequence of values is separated by None to demarcate between rows.
    splits = [-1] + [i for i, x in enumerate(constraint_args) if x is None]
    # The final constraint argument must be a None sentinel.
    assert splits[-1] == len(constraint_args) - 1
    rows_list = [
        constraint_args[splits[i] + 1:splits[i + 1]]
        for i in range(len(splits) - 1)
    ]
    # Each row is alternating (colno, value) entries, hence even length.
    assert all(len(row) % 2 == 0 for row in rows_list)
    # Pair up each row's alternating colnos and values.
    hypotheticals = [zip(row[::2], row[1::2]) for row in rows_list]
    if len(rowid_query) == 0 and len(hypotheticals) == 0:
        raise BQLError(bdb, 'No matching rows for PREDICTIVE RELEVANCE.')
    def generator_similarity(generator_id):
        # One relevance list per generator: delegate to its metamodel.
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        return metamodel.predictive_relevance(bdb, generator_id, modelnos,
            rowid_target, rowid_query, hypotheticals, colno)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    sims = map(generator_similarity, generator_ids)
    # Mean over generators of the mean over query rows.
    return stats.arithmetic_mean([stats.arithmetic_mean(s) for s in sims])
def parse(schema, subsample_default):
    '''Parses a generator schema as passed to CrosscatMetamodel.

    schema is a tokenized expression of the form

        [['GUESS', ['*']], ['x', 'NUMERICAL'], ...]

    that is passed to CrosscatMetamodel.create_generator and
    represents the argument to "crosscat" in CREATE GENERATOR ...
    FOR ... USING crosscat(...).

    Returns a GeneratorSchema.  See test_crosscat_generator_schema.py
    for examples.
    '''
    guess = False
    subsample = subsample_default
    columns = []
    dep_constraints = []
    for directive in schema:
        if directive == []:
            # Skip extra commas so you can write
            #
            #    CREATE GENERATOR t_cc FOR t USING crosscat(
            #        x,
            #        y,
            #        z,
            #    )
            continue
        if (not isinstance(directive, list) or len(directive) != 2
                or not isinstance(directive[0], basestring)):
            raise BQLError(
                None,
                'Invalid crosscat column model directive: %r' % (directive,))
        op = casefold(directive[0])
        if op == 'guess' and directive[1] == ['*']:
            guess = True
        elif (op == 'subsample' and isinstance(directive[1], list)
                and len(directive[1]) == 1):
            subsample = _parse_subsample_clause(directive[1][0])
        elif op == 'dependent':
            constraint = (_parse_dependent_clause(directive[1]), True)
            dep_constraints.append(constraint)
        elif op == 'independent':
            constraint = (_parse_dependent_clause(directive[1]), False)
            dep_constraints.append(constraint)
        elif (op != 'guess' and isinstance(directive[1], basestring)
                and casefold(directive[1]) != 'guess'):
            # Bug fix: guard with isinstance before casefold, so a
            # malformed non-string model type falls through to the
            # BQLError below instead of crashing inside casefold.
            columns.append((directive[0], directive[1]))
        else:
            raise BQLError(
                None, 'Invalid crosscat column model: %r' % (directive,))
    return GeneratorSchema(
        guess=guess, subsample=subsample, columns=columns,
        dep_constraints=dep_constraints)
def bayesdb_simulate(bdb, population_id, constraints, colnos,
        generator_id=None, numpredictions=1, accuracy=None):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, with a value for each
    column specified in the list `colnos`, conditioned on the
    constraints in the list `constraints` of tuples ``(colno,
    value)``.

    The results are simulated from the predictive distribution on
    fresh rows.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints is not None:
        # A rowid token among the constraints overrides the fresh rowid.
        user_rowid = [
            v for c, v in constraints
            if c in core.bayesdb_rowid_tokens(bdb)
        ]
        if len(user_rowid) == 1:
            rowid = user_rowid[0]
        elif len(user_rowid) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
        constraints = [
            (rowid, c, v) for c, v in constraints
            if c not in core.bayesdb_rowid_tokens(bdb)
        ]
    targets = [(rowid, colno) for colno in colnos]
    def loglikelihood(generator_id, metamodel):
        # Log probability of the constraints under one generator, used
        # to weight the mixture over generators.
        if not constraints:
            return 0
        return metamodel.logpdf_joint(
            bdb, generator_id, constraints, [], None)
    def simulate(generator_id, metamodel, n):
        return metamodel.simulate_joint(
            bdb, generator_id, targets, constraints, None,
            num_predictions=n, accuracy=accuracy)
    generator_ids = [generator_id] if generator_id is not None else \
        core.bayesdb_population_generators(bdb, population_id)
    metamodels = [core.bayesdb_generator_metamodel(bdb, generator_id)
        for generator_id in generator_ids]
    if len(generator_ids) > 1:
        # Sample how many predictions to draw from each generator,
        # weighted by the likelihood of the constraints under it.
        loglikelihoods = map(loglikelihood, generator_ids, metamodels)
        likelihoods = map(math.exp, loglikelihoods)
        total_likelihood = sum(likelihoods)
        if total_likelihood == 0:
            # XXX Show the constraints with symbolic names.
            raise BQLError(bdb, 'Impossible constraints: %r' % (constraints,))
        probabilities = [likelihood/total_likelihood
            for likelihood in likelihoods]
        countses = bdb.np_prng.multinomial(
            numpredictions, probabilities, size=1)
        counts = countses[0]
    elif len(generator_ids) == 1:
        counts = [numpredictions]
    else:
        # Bug fix: with no generators, Python 2's multi-sequence map
        # would pad counts with None and crash inside simulate; return
        # no rows instead, matching the newer bayesdb_simulate.
        counts = []
    rowses = map(simulate, generator_ids, metamodels, counts)
    all_rows = [row for rows in rowses for row in rows]
    assert all(isinstance(row, (tuple, list)) for row in all_rows)
    return all_rows
def simulate_column(exp):
    # Evaluate one SIMULATE expression against the population's models.
    #
    # Supports plain column references (via bayesdb_simulate) and
    # MUTUAL INFORMATION; the remaining BQL estimands raise BQLError.
    if isinstance(exp, ast.ExpCol):
        # XXX This is wrong -- it returns independent samples from
        # the marginals of each variable, not one sample from the
        # joint on all variables.
        if False:
            # Disabled branch: conditional simulation is unimplemented,
            # so constraints are always empty below.
            raise BQLError(
                bdb, 'SIMULATE FROM MODELS OF can\'t sample conditional')
            # XXX Gotta weight each model by probability of
            # constraints.
            constraints = [(retrieve_variable(v), retrieve_literal(e))
                for v, e in simulation.constraints]
        else:
            constraints = []
        colnos = [retrieve_variable(exp.column)]
        accuracy = 1            # XXX Allow nontrivial accuracy?
        samples = bqlfn.bayesdb_simulate(
            bdb, population_id, constraints, colnos,
            generator_id=generator_id, numpredictions=1,
            accuracy=accuracy)
        # Each sample is a 1-tuple for the single requested column.
        return [sample[0] for sample in samples]
    elif isinstance(exp, ast.ExpBQLDepProb):
        raise BQLError(
            bdb, 'DEPENDENCE PROBABILITY simulation still unsupported.')
    elif isinstance(exp, ast.ExpBQLProbDensity):
        raise BQLError(
            bdb, 'PROBABILITY DENSITY OF simulation still unsupported.')
    elif isinstance(exp, ast.ExpBQLMutInf):
        colnos0 = [retrieve_variable(c) for c in exp.columns0]
        colnos1 = [retrieve_variable(c) for c in exp.columns1]
        # Flatten (variable, value) constraint pairs into one argument
        # tuple for the bqlfn entry point.
        constraint_args = ()
        if exp.constraints is not None:
            constraint_args = tuple(
                itertools.chain.from_iterable(
                    [[retrieve_variable(colname), retrieve_literal(expr)]
                        for colname, expr in exp.constraints]))
        nsamples = exp.nsamples and retrieve_literal(exp.nsamples)
        # One mi_list per generator of the population.
        #
        # XXX fsaad@20170625: Setting modelnos = None arbitrarily, figure
        # out how to set the modelnos argument.
        mi_lists = bqlfn._bql_column_mutual_information(
            bdb, population_id, generator_id, None, colnos0, colnos1,
            nsamples, *constraint_args)
        return list(itertools.chain.from_iterable(mi_lists))
    else:
        raise BQLError(
            bdb, 'Only constants can be simulated: %s.' % (simulation, ))
def bql_column_correlation(bdb, population_id, _generator_id, colno0, colno1):
    """Correlation between two observed variables of a population.

    Latent variables (negative colnos) have no observed data and are
    rejected; so are stattype pairs without a registered method.
    """
    # Check colno0 first, then colno1, to preserve error ordering.
    for colno in (colno0, colno1):
        if colno < 0:
            raise BQLError(bdb, 'No correlation for latent variable: %r' %
                (core.bayesdb_variable_name(bdb, population_id, colno),))
    st0, st1, data0, data1 = bql_variable_stattypes_and_data(
        bdb, population_id, colno0, colno1)
    if (st0, st1) not in correlation_methods:
        raise NotImplementedError('No correlation method for %s/%s.' %
            (st0, st1))
    return correlation_methods[st0, st1](data0, data1)
def _schema(self, bdb, generator_id):
    """Return the deserialized schema for `generator_id`, caching it."""
    cache = self._cache(bdb)
    # Fast path: already deserialized for this generator.
    if cache is not None and generator_id in cache.schema:
        return cache.schema[generator_id]
    # Slow path: load the JSON blob from the database.
    cursor = bdb.sql_execute('''
        SELECT schema_json FROM bayesdb_cgpm_generator
            WHERE generator_id = ?
    ''', (generator_id,))
    schema_json = cursor_value(cursor, nullok=True)
    if schema_json is None:
        generator = core.bayesdb_generator_name(bdb, generator_id)
        raise BQLError(bdb, 'No such CGPM generator: %r' % (generator,))
    schema = json.loads(schema_json)
    # Cache the deserialized schema, if we can.
    if cache is not None:
        cache.schema[generator_id] = schema
    return schema
def create_generator(self, bdb, table, schema, instantiate):
    """Instantiate a NIG-Normal generator and record suff stats.

    The schema is the column list.  May want to change this later
    to make room for specifying the hyperparameters, etc.
    """
    insert_column_sql = '''
        INSERT INTO bayesdb_nig_normal_column
            (generator_id, colno, count, sum, sumsq)
            VALUES (:generator_id, :colno, :count, :sum, :sumsq)
    '''
    with bdb.savepoint():
        generator_id, column_list = instantiate(schema)
        for colno, column_name, stattype in column_list:
            # Only numerical columns have NIG-Normal suff stats.
            if stattype != 'numerical':
                raise BQLError(bdb, 'NIG-Normal only supports'
                    ' numerical columns, but %s is %s'
                    % (repr(column_name), repr(stattype)))
            count, xsum, sumsq = data_suff_stats(bdb, table, column_name)
            bdb.sql_execute(insert_column_sql, {
                'generator_id': generator_id,
                'colno': colno,
                'count': count,
                'sum': xsum,
                'sumsq': sumsq,
            })
def _parse_subsample_clause(clause):
    """Interpret a crosscat SUBSAMPLE clause.

    'off' (case-insensitive) disables subsampling; an integer gives the
    subsample size; anything else is a BQLError.
    """
    if isinstance(clause, basestring) and casefold(clause) == 'off':
        return False
    if isinstance(clause, int):
        return clause
    raise BQLError(None, 'Invalid subsampling: %r' % (clause,))
def _engine(self, bdb, generator_id):
    """Return the deserialized CGPM engine for `generator_id`, caching it."""
    cache = self._cache(bdb)
    # Fast path: engine already deserialized for this generator.
    if cache is not None and generator_id in cache.engine:
        return cache.engine[generator_id]
    # Slow path: pull the serialized engine out of the database.
    cursor = bdb.sql_execute('''
        SELECT engine_json FROM bayesdb_cgpm_generator
            WHERE generator_id = ?
    ''', (generator_id,))
    engine_json = cursor_value(cursor)
    if engine_json is None:
        generator = core.bayesdb_generator_name(bdb, generator_id)
        raise BQLError(bdb,
            'No models initialized for generator: %r' % (generator,))
    engine = Engine.from_metadata(json.loads(engine_json),
        rng=bdb.np_prng, multiprocess=self._ncpu)
    # Cache the deserialized engine, if we can.
    if cache is not None:
        cache.engine[generator_id] = engine
    return engine
def drop_models(self, bdb, generator_id, modelnos=None):
    """Drop all Loom models for `generator_id`.

    Loom can only drop every model at once, so `modelnos` must be None.
    Clears the stored partitions, closes cached servers, zeroes the
    model count, and removes the sample directories on disk.
    """
    with bdb.savepoint():
        if modelnos is not None:
            raise BQLError(bdb, 'Loom cannot drop specific model numbers.')
        bdb.sql_execute('''
            DELETE FROM bayesdb_loom_column_kind_partition
                WHERE generator_id = ?
        ''', (generator_id,))
        bdb.sql_execute('''
            DELETE FROM bayesdb_loom_row_kind_partition
                WHERE generator_id = ?
        ''', (generator_id,))
        # Close the servers.
        self._close_query_server(bdb, generator_id)
        self._close_preql_server(bdb, generator_id)
        bdb.sql_execute('''
            UPDATE bayesdb_loom_generator_model_info
                SET num_models = 0
                WHERE generator_id = ?
        ''', (generator_id,))
        # Remove directories stored on disk.
        project_path = self._get_loom_project_path(bdb, generator_id)
        paths = loom.store.get_paths(project_path)
        if 'root' in paths:
            folder_with_models = os.path.join(paths['root'], 'samples')
            # Bug fix: os.system('rm -rf ...') breaks on paths containing
            # shell metacharacters and silently ignores failures; use
            # shutil.rmtree instead (ignore_errors mirrors rm -f's
            # tolerance of a missing directory).
            import shutil
            shutil.rmtree(folder_with_models, ignore_errors=True)
def ungrouped_schema():
    # Render the guessed stattypes as a flat MODEL/IGNORE schema string.
    #
    # NOTE(review): the "'''#" fragments exit the generated schema's
    # triple-quoted string to emit a comment, and the trailing "''' "
    # re-enters it before the next clause -- confirm against the
    # template this string is spliced into.
    schema = ''
    for i, var in enumerate(guesses.keys()):
        if len(var) > 0:
            guessed_type_reason = guesses[var]
            guessed_type = guessed_type_reason[0].lower()
            guessed_reason = guessed_type_reason[1]
            # Ignore the type key as well as ignore.
            if guessed_type in ['key', 'ignore']:
                schema += 'IGNORE %s' % (var,)
            else:
                schema += 'MODEL %s AS %s' % (var, guessed_type.upper(),)
            # Append a semicolon if not last var in schema.
            if i != len(guesses.keys()) - 1:
                schema += ';'
            # Emit the guess reason (or the key note) as a comment.
            if len(guessed_reason) > 0:
                schema += "'''# %s" % (guessed_reason,)
            else:
                if guessed_type == 'key':
                    schema += "'''# This variable is a key."
            schema += os.linesep
        else:
            raise BQLError(bdb, 'Empty column name(s) in table %s' % \
                (tablename,))
        # If reason was commented on previous line, need triple quote to
        # re-enter schema string.
        if len(guessed_reason) > 0 or guessed_type == 'key':
            schema += "''' %s" % (os.linesep,)
    return schema
def bql_row_similarity(bdb, generator_id, modelno, rowid, target_rowid,
        *colnos):
    """Similarity of `rowid` to `target_rowid` with respect to `colnos`.

    With no columns given, all of the generator's columns are used.
    """
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    if not colnos:
        colnos = core.bayesdb_generator_column_numbers(bdb, generator_id)
    return metamodel.row_similarity(
        bdb, generator_id, modelno, rowid, target_rowid, colnos)
def _parse_dependent_clause(args): i = 0 dep_columns = [] while i < len(args): dep_columns.append(args[i]) if i + 1 < len(args) and args[i + 1] != ',': raise BQLError(None, 'Invalid dependent columns: %r' % (args, )) i += 2 return dep_columns
def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
        *colnos):
    """Mean similarity of `rowid` to `target_rowid` over the population's
    generators, in the context of exactly one column."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if not colnos:
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
            generator_id)
    # Similarity is defined in the context of a single column only.
    if len(colnos) != 1:
        raise BQLError(bdb, 'Multiple with respect to columns: %s.'
            % (colnos, ))
    def generator_similarity(gid):
        metamodel = core.bayesdb_generator_metamodel(bdb, gid)
        return metamodel.row_similarity(bdb, gid, None, rowid,
            target_rowid, colnos)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    return stats.arithmetic_mean(
        [generator_similarity(gid) for gid in generator_ids])
def register(self, bdb):
    """Install or migrate the NIG-Normal schema in `bdb`."""
    with bdb.savepoint():
        version = bayesdb_metamodel_version(bdb, self.name())
        # Walk the schema forward one migration at a time:
        # unregistered -> 1 -> 2.
        migrations = {
            None: (nig_normal_schema_1, 1),
            1: (nig_normal_schema_2, 2),
        }
        while version in migrations:
            schema_sql, next_version = migrations[version]
            bdb.sql_execute(schema_sql)
            version = next_version
        if version != 2:
            raise BQLError(bdb, 'NIG-Normal already installed'
                ' with unknown schema version: %d' % (version,))
def bayesdb_simulate(
        bdb, population_id, generator_id, modelnos, constraints, colnos,
        numpredictions=1, accuracy=None):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, with a value for each
    column specified in the list `colnos`, conditioned on the
    constraints in the list `constraints` of tuples ``(colno,
    value)``.

    The results are simulated from the predictive distribution on
    fresh rows.
    """
    modelnos = _retrieve_modelnos(modelnos)
    rowid, constraints = _retrieve_rowid_constraints(
        bdb, population_id, constraints)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    metamodels = [
        core.bayesdb_generator_metamodel(bdb, gid) for gid in generator_ids
    ]
    def constraint_loglikelihood(gid, metamodel):
        # Log probability of the constraints under one generator, used
        # to weight the mixture over generators.
        if not constraints:
            return 0
        return metamodel.logpdf_joint(
            bdb, gid, modelnos, rowid, constraints, [])
    def draw(gid, metamodel, n):
        return metamodel.simulate_joint(
            bdb, gid, modelnos, rowid, colnos, constraints,
            num_samples=n, accuracy=accuracy)
    # Decide how many predictions each generator contributes.
    if len(generator_ids) == 0:
        counts = []
    elif len(generator_ids) == 1:
        counts = [numpredictions]
    else:
        # Apportion predictions among generators in proportion to the
        # likelihood of the constraints under each.
        likelihoods = [
            math.exp(constraint_loglikelihood(gid, metamodel))
            for gid, metamodel in zip(generator_ids, metamodels)
        ]
        total_likelihood = sum(likelihoods)
        if total_likelihood == 0:
            # XXX Show the constraints with symbolic names.
            raise BQLError(bdb, 'Impossible constraints: %r' % (constraints,))
        probabilities = [
            likelihood/total_likelihood for likelihood in likelihoods
        ]
        counts = bdb.np_prng.multinomial(
            numpredictions, probabilities, size=1)[0]
    all_rows = []
    for gid, metamodel, n in zip(generator_ids, metamodels, counts):
        all_rows.extend(draw(gid, metamodel, n))
    assert all(isinstance(row, (tuple, list)) for row in all_rows)
    return all_rows
def bql_row_similarity(
        bdb, population_id, generator_id, modelnos, rowid, target_rowid,
        colno):
    """Mean similarity of `rowid` to `target_rowid` in the context of
    `colno`, averaged over the applicable generators."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    modelnos = _retrieve_modelnos(modelnos)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    similarities = []
    for gid in generator_ids:
        metamodel = core.bayesdb_generator_metamodel(bdb, gid)
        # XXX Change [colno] to colno by updating IBayesDBMetamodel.
        similarities.append(metamodel.row_similarity(
            bdb, gid, modelnos, rowid, target_rowid, [colno]))
    return stats.arithmetic_mean(similarities)
def _retrieve_rowid_constraints(bdb, population_id, constraints):
    """Split a user-specified rowid out of `constraints`.

    Returns (rowid, constraints): rowid is the user's if exactly one
    rowid token appears (a fresh row id otherwise), and the returned
    constraints have every rowid token removed.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints:
        rowid_tokens = core.bayesdb_rowid_tokens(bdb)
        user_rowid = [v for c, v in constraints if c in rowid_tokens]
        if len(user_rowid) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.'
                % (constraints, ))
        if len(user_rowid) == 1:
            rowid = user_rowid[0]
        constraints = [
            (c, v) for c, v in constraints if c not in rowid_tokens]
    return rowid, constraints
def register(self, bdb):
    """Install the CGPM schema into `bdb` if not already present."""
    with bdb.savepoint():
        # Get the current version, if there is one.
        version = bayesdb_metamodel_version(bdb, self.name())
        if version is None:
            # First registration: instantiate the CGPM schema.
            bdb.sql_execute(CGPM_SCHEMA_1)
            version = 1
        if version != 1:
            # Anything else is an unrecognized installation.
            raise BQLError(
                bdb, 'CGPM already installed'
                ' with unknown schema version: %d' % (version, ))
def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
        *colnos):
    """Mean similarity of `rowid` to `target_rowid` over the population's
    generators, with respect to `colnos` (default: all variables)."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if not colnos:
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
            generator_id)
    # Restrict to the named generator when one is given.
    if generator_id is not None:
        generator_ids = [generator_id]
    else:
        generator_ids = core.bayesdb_population_generators(
            bdb, population_id)
    similarities = []
    for gid in generator_ids:
        metamodel = core.bayesdb_generator_metamodel(bdb, gid)
        similarities.append(metamodel.row_similarity(
            bdb, gid, None, rowid, target_rowid, colnos))
    return stats.arithmetic_mean(similarities)
def bayesdb_generator_cell_value(bdb, generator_id, rowid, colno):
    """Return the value of column `colno` in row `rowid` of the table
    backing `generator_id`, raising BQLError if the row is missing."""
    table_name = bayesdb_generator_table(bdb, generator_id)
    colname = bayesdb_generator_column_name(bdb, generator_id, colno)
    value_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (
        sqlite3_quote_name(colname), sqlite3_quote_name(table_name))
    value_cursor = bdb.sql_execute(value_sql, (rowid,))
    try:
        row = value_cursor.next()
    except StopIteration:
        generator = bayesdb_generator_name(bdb, generator_id)
        raise BQLError(bdb,
            'No such row in %s: %d' % (repr(generator), rowid))
    # A rowid lookup of one column yields exactly one value.
    assert len(row) == 1
    return row[0]
def _check_loom_initialized(self, bdb, generator_id):
    """Raise BQLError unless Loom analysis results exist for
    `generator_id`.

    Not invoked on a per-query basis due to high overhead.
    """
    cursor = bdb.sql_execute('''
        SELECT COUNT(*) FROM bayesdb_loom_row_kind_partition
            WHERE generator_id = ?
    ''', (generator_id, ))
    count_row = cursor.fetchall()
    # Bug fix: this second query previously re-counted
    # bayesdb_loom_row_kind_partition; as count_col's name indicates,
    # it must check the column partition table instead.
    cursor = bdb.sql_execute('''
        SELECT COUNT(*) FROM bayesdb_loom_column_kind_partition
            WHERE generator_id = ?
    ''', (generator_id, ))
    count_col = cursor.fetchall()
    if count_row[0][0] == 0 or count_col[0][0] == 0:
        raise BQLError(bdb, 'Analyze must be run before any BQL'\
            ' queries when using loom.')
def predictive_relevance(self, bdb, generator_id, modelnos, rowid_target,
        rowid_queries, hypotheticals, colno):
    """For each query row, the fraction of models in which it shares
    `rowid_target`'s partition within the kind containing `colno`."""
    if len(hypotheticals) > 0:
        raise BQLError(bdb, 'Loom cannot handle hypothetical rows'
            ' because it is unable to insert rows into CrossCat')
    if modelnos is None:
        modelnos = range(self._get_num_models(bdb, generator_id))
    hit_counts = [0] * len(rowid_queries)
    for modelno in modelnos:
        kind_id = self._get_kind_id(bdb, generator_id, modelno, colno)
        target_partition = self._get_partition_id(
            bdb, generator_id, modelno, kind_id, rowid_target)
        # Count the models in which each query row lands in the
        # target's partition.
        for idx, rowid in enumerate(rowid_queries):
            query_partition = self._get_partition_id(
                bdb, generator_id, modelno, kind_id, rowid)
            if query_partition == target_partition:
                hit_counts[idx] += 1
    # XXX This procedure appears to be computing the wrong thing.
    return [hits / float(len(modelnos)) for hits in hit_counts]
def register(self, bdb):
    """Install the IID-Gaussian schema into `bdb` if needed."""
    with bdb.savepoint():
        schema_sql = 'SELECT version FROM bayesdb_metamodel WHERE name = ?'
        cursor = bdb.sql_execute(schema_sql, (self.name(),))
        try:
            row = cursor.next()
        except StopIteration:
            # Never registered before.
            version = 0
        else:
            version = row[0]
        assert version is not None
        if version == 0:
            # XXX WHATTAKLUDGE!
            for stmt in std_normal_schema_1.split(';'):
                bdb.sql_execute(stmt)
            version = 1
        if version != 1:
            raise BQLError(bdb, 'IID-Gaussian already installed'
                ' with unknown schema version: %d' % (version,))