def dump_session_as_json(self, session_id): """Returns a JSON string representing the list of SQL or BQL entries (e.g. queries) executed within session `session_id`.""" if session_id > self.session_id or session_id < 1: raise ValueError('No such session (%d)' % session_id) version = cursor_value(self._sql(''' SELECT version FROM bayesdb_session WHERE id = ? ''', (session_id,))) cursor = self._sql(''' SELECT * FROM bayesdb_session_entries WHERE session_id = ? ORDER BY start_time DESC ''', (session_id,)) # XXX Get the description first because apsw cursors, for # whatever reason, don't let you get the description after # you've gotten all the results. # (see also bql.py BayesDBCursor.__init__) fields = [] try: fields = [d[0] for d in cursor.description] except apsw.ExecutionCompleteError: pass # Probably no rows. entries = cursor.fetchall() session = { 'entries': entries, 'fields': fields, 'version': version, } return json.dumps(session, sort_keys=True)
def _get_num_models(self, bdb, generator_id): cursor = bdb.sql_execute(''' SELECT num_models FROM bayesdb_loom_generator_model_info WHERE generator_id = ? ''', (generator_id,)) return cursor_value(cursor)
def bayesdb_variable_name(bdb, population_id, colno): """Return the name a population variable.""" cursor = bdb.sql_execute( ''' SELECT name FROM bayesdb_variable WHERE population_id = ? AND colno = ? ''', (population_id, colno)) return cursor_value(cursor)
def bayesdb_has_generator(bdb, name): """True if there is a generator named `name` in `bdb`. Only actual generator names are considered. """ sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE name = ?' return 0 != cursor_value(bdb.sql_execute(sql, (name,)))
def cardinality(bdb, table, cols=None): """Compute the number of unique values in the columns of a table. Parameters ---------- bdb : bayeslite.BayesDB Active BayesDB instance. table : str Name of table. cols : list<str>, optional Columns to compute the unique values. Defaults to all. Returns ------- counts : list<tuple<str,int>> A list of tuples of the form [(col_1, cardinality_1), ...] """ # If no columns specified, use all. if not cols: sql = "PRAGMA table_info(%s)" % (quote(table),) res = bdb.sql_execute(sql) cols = [r[1] for r in res] counts = [] for col in cols: sql = """ SELECT COUNT (DISTINCT %s) FROM %s """ % ( quote(col), quote(table), ) res = bdb.sql_execute(sql) counts.append((col, cursor_value(res))) return counts
def _add_entry(self, qid, type, query, bindings): '''Save a session entry into the database. The entry is initially in the not-completed state. Return the new entry's id so that it can be set to completed when appropriate. qid: str, num, or anything that can be used as a hash key. Any identifier unique to this query. type: str The class of query, usually "bql" or "sql". Case-sensitive. query: str The text of the query, perhaps with unbound positional references. bindings: iterable(str) Fillers for unbound references in query, if any. ''' # check for errors on this session and suggest if we haven't already if not self._suggested_send: self._check_error_entries(self.session_id) data = query + json.dumps(bindings) self._sql(''' INSERT INTO bayesdb_session_entries (session_id, type, data, start_time) VALUES (?,?,?,?) ''', (self.session_id, type, data, time.time())) entry_id = cursor_value(self._sql('SELECT last_insert_rowid()')) self._qid_to_entry_id[qid] = entry_id
def _add_entry(self, qid, type, query, bindings): '''Save a session entry into the database. The entry is initially in the not-completed state. Return the new entry's id so that it can be set to completed when appropriate. qid: str, num, or anything that can be used as a hash key. Any identifier unique to this query. type: str The class of query, usually "bql" or "sql". Case-sensitive. query: str The text of the query, perhaps with unbound positional references. bindings: iterable(str) Fillers for unbound references in query, if any. ''' # check for errors on this session and suggest if we haven't already if not self._suggested_send: self._check_error_entries(self.session_id) data = query + json.dumps(bindings) self._sql( ''' INSERT INTO bayesdb_session_entries (session_id, type, data, start_time) VALUES (?,?,?,?) ''', (self.session_id, type, data, time.time())) entry_id = cursor_value(self._sql('SELECT last_insert_rowid()')) self._qid_to_entry_id[qid] = entry_id
def dump_session_as_json(self, session_id): """Returns a JSON string representing the list of SQL or BQL entries (e.g. queries) executed within session `session_id`.""" if session_id > self.session_id or session_id < 1: raise ValueError('No such session (%d)' % session_id) version = cursor_value( self._sql( ''' SELECT version FROM bayesdb_session WHERE id = ? ''', (session_id, ))) cursor = self._sql( ''' SELECT * FROM bayesdb_session_entries WHERE session_id = ? ORDER BY start_time DESC ''', (session_id, )) # XXX Get the description first because apsw cursors, for # whatever reason, don't let you get the description after # you've gotten all the results. # (see also bql.py BayesDBCursor.__init__) fields = [] try: fields = [d[0] for d in cursor.description] except apsw.ExecutionCompleteError: pass # Probably no rows. entries = cursor.fetchall() session = { 'entries': entries, 'fields': fields, 'version': version, } return json.dumps(session, sort_keys=True)
def bayesdb_generator_column_stattype(bdb, generator_id, colno): """Return the statistical type of the column `colno` in `generator_id`.""" sql = ''' SELECT stattype FROM bayesdb_generator_column WHERE generator_id = ? AND colno = ? ''' cursor = bdb.sql_execute(sql, (generator_id, colno)) try: row = cursor.next() except StopIteration: generator = bayesdb_generator_name(bdb, generator_id) sql = ''' SELECT COUNT(*) FROM bayesdb_generator AS g, bayesdb_column AS c WHERE g.id = :generator_id AND g.tabname = c.tabname AND c.colno = :colno ''' cursor = bdb.sql_execute(sql, { 'generator_id': generator_id, 'colno': colno, }) if cursor_value(cursor) == 0: raise ValueError('No such column in generator %s: %d' % (generator, colno)) else: raise ValueError('Column not modelled in generator %s: %d' % (generator, colno)) else: assert len(row) == 1 return row[0]
def bayesdb_has_latent(bdb, population_id, var): """True if the population has a latent variable by the given name.""" cursor = bdb.sql_execute(''' SELECT COUNT(*) FROM bayesdb_variable WHERE population_id = ? AND name = ? AND generator_id IS NOT NULL ''', (population_id, var)) return cursor_value(cursor)
def _get_name(self, bdb, generator_id): return cursor_value( bdb.sql_execute( ''' SELECT name FROM bayesdb_loom_generator WHERE generator_id = ? ''', (generator_id, )))
def bayesdb_variable_stattype(bdb, population_id, generator_id, colno): """Return the statistical type of a population variable.""" sql = ''' SELECT stattype FROM bayesdb_variable WHERE population_id = ? AND (generator_id IS NULL OR generator_id = ?) AND colno = ? ''' cursor = bdb.sql_execute(sql, (population_id, generator_id, colno)) try: row = cursor.next() except StopIteration: population = bayesdb_population_name(bdb, population_id) sql = ''' SELECT COUNT(*) FROM bayesdb_population AS p, bayesdb_column AS c WHERE p.id = :population_id AND p.tabname = c.tabname AND c.colno = :colno ''' cursor = bdb.sql_execute(sql, { 'population_id': population_id, 'colno': colno, }) if cursor_value(cursor) == 0: raise ValueError('No such variable in population %s: %d' % (population, colno)) else: raise ValueError('Variable not modeled in population %s: %d' % (population, colno)) else: assert len(row) == 1 return row[0]
def bayesdb_generator_has_model(bdb, generator_id, modelno): """True if `generator_id` has a model numbered `modelno`.""" sql = ''' SELECT COUNT(*) FROM bayesdb_generator_model AS m WHERE generator_id = ? AND modelno = ? ''' return cursor_value(bdb.sql_execute(sql, (generator_id, modelno)))
def cardinality(bdb, table, cols=None): """Compute the number of unique values in the columns of a table. Parameters ---------- bdb : __population_to_bdb__ table : __population_name__ Name of table. cols : list<str>, optional Columns to compute the unique values. Defaults to all. Returns ------- counts : pandas.DataFrame whose .columns are ['name', 'distinct_count']. """ # If no columns specified, use all. if not cols: sql = 'PRAGMA table_info(%s)' % (quote(table), ) res = bdb.sql_execute(sql) cols = [r[1] for r in res] names = [] counts = [] for col in cols: sql = ''' SELECT COUNT (DISTINCT %s) FROM %s ''' % (quote(col), quote(table)) res = bdb.sql_execute(sql) names.append(col) counts.append(cursor_value(res)) return pd.DataFrame({'name': names, 'distinct_count': counts})
def _schema(self, bdb, generator_id): # Probe the cache. cache = self._cache(bdb) if cache is not None: if generator_id in cache.schema: return cache.schema[generator_id] # Not cached. Load the schema from the database. cursor = bdb.sql_execute( ''' SELECT schema_json FROM bayesdb_cgpm_generator WHERE generator_id = ? ''', (generator_id, )) schema_json = cursor_value(cursor, nullok=True) if schema_json is None: generator = core.bayesdb_generator_name(bdb, generator_id) raise BQLError(bdb, 'No such CGPM generator: %r' % (generator, )) # Deserialize the schema. schema = json.loads(schema_json) # Cache it, if we can. if cache is not None: cache.schema[generator_id] = schema return schema
def _default_categorical(bdb, generator_id, var): table = core.bayesdb_generator_table(bdb, generator_id) qt = sqlite3_quote_name(table) qv = sqlite3_quote_name(var) cursor = bdb.sql_execute('SELECT COUNT(DISTINCT %s) FROM %s' % (qv, qt)) k = cursor_value(cursor) return 'categorical', {'k': k}
def _engine(self, bdb, generator_id): # Probe the cache. cache = self._cache(bdb) if cache is not None and generator_id in cache.engine: return cache.engine[generator_id] # Not cached. Load the engine from the database. cursor = bdb.sql_execute( ''' SELECT engine_json FROM bayesdb_cgpm_generator WHERE generator_id = ? ''', (generator_id, )) engine_json = cursor_value(cursor) if engine_json is None: generator = core.bayesdb_generator_name(bdb, generator_id) raise BQLError( bdb, 'No models initialized for generator: %r' % (generator, )) # Deserialize the engine. engine = Engine.from_metadata(json.loads(engine_json), rng=bdb.np_prng, multiprocess=self._ncpu) # Cache it, if we can. if cache is not None: cache.engine[generator_id] = engine return engine
def bayesdb_has_generator_default(bdb, name): """True if there is a generator or default-modelled table named `name`.""" sql = ''' SELECT COUNT(*) FROM bayesdb_generator WHERE name = :name OR (defaultp AND tabname = :name) ''' return 0 != cursor_value(bdb.sql_execute(sql, {'name': name}))
def test_math_func_one_param(name, probe): # Retrieve result from python. python_value_error = None python_type_error = None try: result_python = get_python_math_call(name, probe) except ValueError: python_value_error = True except TypeError: python_type_error = True # Retrieve result from SQL. sql_value_error = None sql_type_error = None try: with bayesdb_open(':memory:') as bdb: cursor = bdb.execute(get_sql_math_call(name, probe)) result_sql = cursor_value(cursor) except ValueError: sql_value_error = True except (TypeError, apsw.SQLError): sql_type_error = True # Domain error on both. if python_value_error or sql_value_error: assert python_value_error and sql_value_error # Arity error on both. elif python_type_error or sql_type_error: assert python_type_error and sql_type_error # Both invocations succeeded, confirm results match. else: assert abserr(result_python, result_sql) < 1e-4
def bayesdb_has_generator(bdb, name): """True if there is a generator named `name` in `bdb`. Only actual generator names are considered. """ sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE name = ?' return 0 != cursor_value(bdb.sql_execute(sql, (name, )))
def _cgpm_rowid(self, bdb, generator_id, table_rowid): cursor = bdb.sql_execute( ''' SELECT cgpm_rowid FROM bayesdb_cgpm_individual WHERE generator_id = ? AND table_rowid = ? ''', (generator_id, table_rowid)) cgpm_rowid = cursor_value(cursor, nullok=True) return cgpm_rowid if cgpm_rowid is not None else -1
def bayesdb_generator_fresh_row_id(bdb, generator_id): table_name = bayesdb_generator_table(bdb, generator_id) qt = sqlite3_quote_name(table_name) cursor = bdb.sql_execute('SELECT MAX(_rowid_) FROM %s' % (qt, )) max_rowid = cursor_value(cursor) if max_rowid is None: max_rowid = 0 return max_rowid + 1 # Synthesize a non-existent SQLite row id
def _get_num_models(self, bdb, generator_id): cursor = bdb.sql_execute( ''' SELECT num_models FROM bayesdb_loom_generator_model_info WHERE generator_id = ? ''', (generator_id, )) return cursor_value(cursor)
def _get_is_incorporated_rowid(self, bdb, generator_id, rowid): """Return True iff the rowid is incorporated in the loom model.""" cursor = bdb.sql_execute(''' SELECT COUNT(*) FROM bayesdb_loom_rowid_mapping WHERE generator_id = ? AND table_rowid = ? ''', (generator_id, rowid)) return cursor_value(cursor) > 0
def bayesdb_generator_fresh_row_id(bdb, generator_id): table_name = bayesdb_generator_table(bdb, generator_id) qt = sqlite3_quote_name(table_name) cursor = bdb.sql_execute('SELECT MAX(_rowid_) FROM %s' % (qt,)) max_rowid = cursor_value(cursor) if max_rowid is None: max_rowid = 0 return max_rowid + 1 # Synthesize a non-existent SQLite row id
def bayesdb_population_fresh_row_id(bdb, population_id): """Return one plus maximum rowid in base table of given `population_id`.""" table_name = bayesdb_population_table(bdb, population_id) qt = sqlite3_quote_name(table_name) cursor = bdb.sql_execute('SELECT MAX(_rowid_) FROM %s' % (qt,)) max_rowid = cursor_value(cursor) if max_rowid is None: max_rowid = 0 return max_rowid + 1 # Synthesize a non-existent SQLite row id
def bayesdb_variable_number(bdb, population_id, generator_id, name): """Return the column number of a population variable.""" cursor = bdb.sql_execute(''' SELECT colno FROM bayesdb_variable WHERE population_id = ? AND (generator_id IS NULL OR generator_id = ?) AND name = ? ''', (population_id, generator_id, name)) return cursor_value(cursor)
def bayesdb_variable_name(bdb, population_id, generator_id, colno): """Return the name a population variable.""" cursor = bdb.sql_execute(''' SELECT name FROM bayesdb_variable WHERE population_id = ? AND (generator_id IS NULL OR generator_id = ?) AND colno = ? ''', (population_id, generator_id, colno)) return cursor_value(cursor)
def _get_loom_rank(self, bdb, generator_id, colno): """Return the loom rank (column number) for the given colno.""" cursor = bdb.sql_execute(''' SELECT rank FROM bayesdb_loom_column_ordering WHERE generator_id = ? AND colno = ? ''', (generator_id, colno,)) return cursor_value(cursor)
def _get_is_incorporated_rowid(self, bdb, generator_id, rowid): """Return True iff the rowid is incorporated in the loom model.""" cursor = bdb.sql_execute( ''' SELECT COUNT(*) FROM bayesdb_loom_rowid_mapping WHERE generator_id = ? AND table_rowid = ? ''', (generator_id, rowid)) return cursor_value(cursor) > 0
def _get_integer_form(self, bdb, generator_id, colno, string_form): """Return integer code representing the string.""" cursor = bdb.sql_execute(''' SELECT integer_form FROM bayesdb_loom_string_encoding WHERE generator_id = ? AND colno = ? AND string_form = ? ''', (generator_id, colno, string_form,)) return cursor_value(cursor)
def bayesdb_table_has_rowid(bdb, table, rowid): """True if the table named `table` has record with given rowid. `bdb` must have a table named `table`. If you're not sure, call :func:`bayesdb_has_table` first. """ qt = sqlite3_quote_name(table) sql = 'SELECT COUNT(*) FROM %s WHERE oid = ?' cursor = bdb.sql_execute(sql % (qt,), (rowid,)) return cursor_value(cursor) != 0
def _get_kind_id(self, bdb, generator_id, modelno, colno): """Return kind_id (view assignment) of colno in modelno.""" cursor = bdb.sql_execute(''' SELECT kind_id FROM bayesdb_loom_column_kind_partition WHERE generator_id = ? AND modelno = ? AND colno = ? ''', (generator_id, modelno, colno,)) return cursor_value(cursor)
def _get_partition_id(self, bdb, generator_id, modelno, kind_id, rowid): cursor = bdb.sql_execute( ''' SELECT partition_id FROM bayesdb_loom_row_kind_partition WHERE generator_id = ? AND modelno = ? AND kind_id = ? AND rowid = ? ''', (generator_id, modelno, kind_id, rowid)) return cursor_value(cursor)
def _simulate_1(self, bdb, generator_id, mus, sigmas, colno): if colno < 0: dev_colno = colno cursor = bdb.sql_execute(''' SELECT observed_colno FROM bayesdb_nig_normal_deviation WHERE generator_id = ? AND deviation_colno = ? ''', (generator_id, dev_colno)) obs_colno = cursor_value(cursor) return self.prng.gauss(0, sigmas[obs_colno]) else: return self.prng.gauss(mus[colno], sigmas[colno])
def _logpdf_1(self, bdb, generator_id, mus, sigmas, colno, x): if colno < 0: dev_colno = colno cursor = bdb.sql_execute(''' SELECT observed_colno FROM bayesdb_nig_normal_deviation WHERE generator_id = ? AND deviation_colno = ? ''', (generator_id, dev_colno)) obs_colno = cursor_value(cursor) return logpdf_gaussian(x, 0, sigmas[obs_colno]) else: return logpdf_gaussian(x, mus[colno], sigmas[colno])
def _get_partition_id(self, bdb, generator_id, modelno, kind_id, rowid): """Return row partition_id of given rowid, within kind_id of modelno.""" cursor = bdb.sql_execute(''' SELECT partition_id FROM bayesdb_loom_row_kind_partition WHERE generator_id = ? AND modelno = ? AND kind_id = ? AND table_rowid = ? ''', (generator_id, modelno, kind_id, rowid)) return cursor_value(cursor)
def _get_partition_id(self, bdb, generator_id, modelno, kind_id, rowid): """Return row partition_id of given rowid, within kind_id of modelno.""" cursor = bdb.sql_execute( ''' SELECT partition_id FROM bayesdb_loom_row_kind_partition WHERE generator_id = ? AND modelno = ? AND kind_id = ? AND table_rowid = ? ''', (generator_id, modelno, kind_id, rowid)) return cursor_value(cursor)
def _get_loom_rank(self, bdb, generator_id, colno): cursor = bdb.sql_execute( ''' SELECT rank FROM bayesdb_loom_column_ordering WHERE generator_id = ? AND colno = ? ''', ( generator_id, colno, )) return cursor_value(cursor)
def _check_error_entries(self, session_id): '''Check if the previous session contains queries that resulted in errors and suggest sending the session''' error_entries = cursor_value(self._sql(''' SELECT COUNT(*) FROM bayesdb_session_entries WHERE error IS NOT NULL AND session_id = ? ''', (session_id,))) # suggest sending sessions but don't suggest more than once if (error_entries > 0 and not self._suggested_send): self._logger.warn(_error_previous_session_msg) self._suggested_send = True return error_entries
def bayesdb_table_has_column(bdb, table, name): """True if the table named `table` has a column named `name`. `bdb` must have a table named `table`. If you're not sure, call :func:`bayesdb_has_table` first. WARNING: This may modify the database by populating the ``bayesdb_column`` table if it has not yet been populated. """ bayesdb_table_guarantee_columns(bdb, table) sql = 'SELECT COUNT(*) FROM bayesdb_column WHERE tabname = ? AND name = ?' return cursor_value(bdb.sql_execute(sql, (table, name)))
def bayesdb_has_generator(bdb, population_id, name): """True if there is a generator named `name` in `bdb`.""" if population_id is None: sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE name = ?' cursor = bdb.sql_execute(sql, (name, )) else: sql = ''' SELECT COUNT(*) FROM bayesdb_generator WHERE name = ? AND population_id = ? ''' cursor = bdb.sql_execute(sql, (name, population_id)) return 0 != cursor_value(cursor)
def bayesdb_has_variable(bdb, population_id, generator_id, name): """True if the population has a given variable. generator_id is None for manifest variables and the id of a generator for variables that may be latent. """ cursor = bdb.sql_execute(''' SELECT COUNT(*) FROM bayesdb_variable WHERE population_id = ? AND (generator_id IS NULL OR generator_id = ?) AND name = ? ''', (population_id, generator_id, name)) return cursor_value(cursor) != 0
def run_queries(target, m): extra = 'MODELED BY %s' % (m,) if m is not None else '' bdb.execute(''' ESTIMATE PROBABILITY DENSITY OF %s = 1 BY p %s ''' % (target, extra,)).fetchall() for other in ['age', 'gender', 'salary']: cursor = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF %s WITH %s BY p %s ''' % (target, other, extra)) assert cursor_value(cursor) >= 0 bdb.execute(''' ESTIMATE SIMILARITY IN THE CONTEXT OF %s FROM PAIRWISE p %s; ''' % (target, extra,)).fetchall()
def _basic_test_trace(executor): # a new session is automatically initialized with id 1 assert get_num_sessions(executor) == 1 # the above select query counts should become one or more entries num_entries = cursor_value(executor(''' SELECT COUNT(*) FROM bayesdb_session_entries ''')) assert num_entries > 0 # entries are ordered starting from 1 for id, entry in enumerate(get_entries(executor)): assert entry.session_id == 1 assert entry.id == id + 1
def _add_entry(self, qid, type, query, bindings): '''Save a session entry into the database. The entry is initially in the not-completed state. Return the new entry's id so that it can be set to completed when appropriate.''' # check for errors on this session and suggest if we haven't already if not self._suggested_send: self._check_error_entries(self.session_id) data = query + json.dumps(bindings) self._sql(''' INSERT INTO bayesdb_session_entries (session_id, type, data, start_time) VALUES (?,?,?,?) ''', (self.session_id, type, data, time.time())) entry_id = cursor_value(self._sql('SELECT last_insert_rowid()')) self._qid_to_entry_id[qid] = entry_id
def bayesdb_generator_has_column(bdb, generator_id, column_name): """True if `generator_id` models a column named `name`.""" sql = ''' SELECT COUNT(*) FROM bayesdb_generator AS g, bayesdb_generator_column as gc, bayesdb_column AS c WHERE g.id = :generator_id AND c.name = :column_name AND g.id = gc.generator_id AND g.tabname = c.tabname AND gc.colno = c.colno ''' cursor = bdb.sql_execute(sql, { 'generator_id': generator_id, 'column_name': column_name, }) return cursor_value(cursor)