def create_empty_table(bdb, column_names):
    """Create a fresh empty table with the given column names.

    Every column is given the NUMERIC data type in the underlying SQL.
    The table's name is obtained from `bdb.temp_table_name()`.

    :param bdb: BayesDB instance used to name and create the table
    :param column_names: iterable of names for the new table's columns
    :return: the name of the new table
    """
    new_table = bdb.temp_table_name()
    quoted_table = sqlite3_quote_name(new_table)
    column_defs = [
        '%s NUMERIC' % (sqlite3_quote_name(column_name),)
        for column_name in column_names
    ]
    create_sql = 'CREATE TABLE %s(%s)' % (quoted_table, ','.join(column_defs))
    bdb.sql_execute(create_sql)
    # Make sure the bayesdb_column table knows the new table's columns.
    core.bayesdb_table_guarantee_columns(bdb, new_table)
    return new_table
def create_empty_table(bdb, column_names):
    """Make a new, empty table whose columns are `column_names`.

    All columns are declared NUMERIC in the underlying SQL.  The table
    name is whatever `bdb.temp_table_name()` hands out.

    :param bdb: BayesDB instance used to name and create the table
    :param column_names: iterable of names for the new table's columns
    :return: the name of the new table
    """
    table_name = bdb.temp_table_name()
    quoted_table_name = sqlite3_quote_name(table_name)
    schema_parts = []
    for column_name in column_names:
        quoted_column_name = sqlite3_quote_name(column_name)
        schema_parts.append('%s NUMERIC' % (quoted_column_name,))
    bdb.sql_execute(
        'CREATE TABLE %s(%s)' % (quoted_table_name, ','.join(schema_parts)))
    # Make sure the bayesdb_column table knows the new table's columns.
    core.bayesdb_table_guarantee_columns(bdb, table_name)
    return table_name
def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False):
    """Read data from a pandas dataframe into a table.

    Each record of `df.to_records()` is bound, in order, to the
    `_rowid_` column followed by the dataframe's own columns.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true` and `table`
        exists, read data into it anyway
    :raises ValueError: if `ifnotexists` is given without `create`, if
        the dataframe has a column named `_rowid_`, if the table exists
        and may not be reused, if the dataframe has columns the existing
        table lacks, or if the table is missing and `create` is false
    """
    if not create and ifnotexists:
        raise ValueError('Not creating table whether or not exists!')
    # XXX Whattakludge!
    idxcol = '_rowid_'
    if idxcol in df.columns:
        raise ValueError('Column `_rowid_\' is not allowed.')
    # Materialize the column labels as a plain list.  Adding a list to a
    # pandas Index is element-wise arithmetic, not list concatenation,
    # so both branches below must start from a real list.
    df_column_names = list(df.columns)
    column_names = [idxcol] + df_column_names
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(
                name for name in df_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
        elif create:
            schema = ','.join(
                '%s NUMERIC' % (sqlite3_quote_name(name),)
                for name in column_names)
            bdb.sql_execute(
                'CREATE TABLE %s(%s)' % (sqlite3_quote_name(table), schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        qt = sqlite3_quote_name(table)
        qcns = [sqlite3_quote_name(name) for name in column_names]
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in df.to_records():
            bdb.sql_execute(sql, row)
def dot_describe(self, line):
    '''describe BayesDB entities
    [table(s)|population(s)|variables|generator(s)|model(s)] [<name>...]

    Print a human-readable description of the specified BayesDB
    entities.
    '''
    # NOTE(review): the docstring layout (summary line, argument
    # synopsis, description) looks like it may be consumed as help text;
    # confirm before restructuring it.
    #
    # `line` is the argument text after the command name.  Its first
    # whitespace-separated token selects the kind of entity to describe
    # and the remaining tokens name the entities.  All output goes to
    # self.stdout; nothing is returned.

    # Single usage text shared by the "no arguments" and "unknown
    # entity" paths, so the two can no longer drift apart.
    usage = (
        'Usage: .describe table(s) [<table>...]\n',
        ' .describe population(s) [<pop>...]\n',
        ' .describe variables <pop>\n',
        ' .describe generator(s) [<gen>...]\n',
        ' .describe model(s) <gen> [<model>...]\n',
    )

    def write_usage():
        for usage_line in usage:
            self.stdout.write(usage_line)

    # XXX Lousy, lousy tokenizer.
    tokens = line.split()
    if len(tokens) == 0:
        write_usage()
        return
    entity = casefold(tokens[0])
    if entity in ('table', 'tables'):
        if len(tokens) == 1:
            # No names given: describe every table.
            params = ()
            qualifier = '1'
        else:
            params = tokens[1:]
            qualifier = \
                '(' + ' OR '.join(['tabname = ?' for _p in params]) + ')'
        # Report every unknown table before giving up.
        ok = True
        for table in params:
            if not core.bayesdb_has_table(self._bdb, table):
                self.stdout.write('No such table: %s\n' % (repr(table),))
                ok = False
        if not ok:
            return
        for table in params:
            core.bayesdb_table_guarantee_columns(self._bdb, table)
        sql = (
            ' SELECT tabname, colno, name, shortname'
            ' FROM bayesdb_column'
            ' WHERE %s'
            ' ORDER BY tabname ASC, colno ASC '
        ) % (qualifier,)
        with self._bdb.savepoint():
            pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
    elif entity in ('population', 'populations'):
        if len(tokens) == 1:
            params = ()
            qualifier = '1'
        else:
            params = tokens[1:]
            # Numbered placeholders: ?1,?2,...
            names = ','.join('?%d' % (i + 1,) for i in range(len(params)))
            qualifier = '(name IN (%s))' % (names,)
        ok = True
        for population in params:
            if not core.bayesdb_has_population(self._bdb, population):
                self.stdout.write('No such population: %s\n' %
                    (repr(population),))
                ok = False
        if not ok:
            return
        with self._bdb.savepoint():
            cursor = self._bdb.sql_execute(
                (
                    ' SELECT id, name, tabname'
                    ' FROM bayesdb_population'
                    ' WHERE %s '
                ) % (qualifier,),
                params)
            pretty.pp_cursor(self.stdout, cursor)
    elif entity in ('generator', 'generators'):
        if len(tokens) == 1:
            params = ()
            qualifier = '1'
        else:
            params = tokens[1:]
            names = ','.join('?%d' % (i + 1,) for i in range(len(params)))
            qualifier = ' (name IN ({names})) '.format(names=names)
        ok = True
        for generator in params:
            if not core.bayesdb_has_generator(self._bdb, None, generator):
                self.stdout.write('No such generator: %s\n' %
                    (repr(generator),))
                ok = False
        if not ok:
            return
        sql = (
            ' SELECT id, name, tabname, backend'
            ' FROM bayesdb_generator'
            ' WHERE %s '
        ) % (qualifier,)
        with self._bdb.savepoint():
            pretty.pp_cursor(self.stdout, self._bdb.sql_execute(sql, params))
    elif entity == 'variables':
        if len(tokens) != 2:
            self.stdout.write('Usage: .describe variables <population>\n')
            return
        population = tokens[1]
        with self._bdb.savepoint():
            if not core.bayesdb_has_population(self._bdb, population):
                self.stdout.write('No such population: %r\n' % (population,))
                return
            population_id = core.bayesdb_get_population(
                self._bdb, population)
            sql = (
                ' SELECT c.colno AS colno, c.name AS name,'
                ' v.stattype AS stattype, c.shortname AS shortname'
                ' FROM bayesdb_population AS p,'
                ' (bayesdb_column AS c LEFT OUTER JOIN bayesdb_variable AS v'
                ' USING (colno))'
                ' WHERE p.id = ?'
                ' AND p.id = v.population_id'
                ' AND p.tabname = c.tabname'
                ' ORDER BY colno ASC; '
            )
            cursor = self._bdb.sql_execute(sql, (population_id,))
            pretty.pp_cursor(self.stdout, cursor)
    elif entity in ('model', 'models'):
        if len(tokens) < 2:
            self.stdout.write('Describe models of what generator?\n')
            return
        generator = tokens[1]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator(self._bdb, None, generator):
                self.stdout.write('No such generator: %s\n' %
                    (repr(generator),))
                return
            generator_id = core.bayesdb_get_generator(
                self._bdb, None, generator)
            if len(tokens) == 2:
                # No model numbers given: describe all models.
                qualifier = '1'
            else:
                modelnos = []
                for token in tokens[2:]:
                    try:
                        modelno = int(token)
                    except ValueError:
                        self.stdout.write('Invalid model number: %s\n' %
                            (repr(token),))
                        return
                    if not core.bayesdb_generator_has_model(
                            self._bdb, generator_id, modelno):
                        self.stdout.write('No such model: %d\n' % (modelno,))
                        return
                    modelnos.append(modelno)
                # The model numbers were parsed by int() above, so
                # splicing them into the SQL text is safe.
                qualifier = 'modelno IN (%s)' % \
                    (','.join(str(modelno) for modelno in modelnos),)
            sql = (
                ' SELECT modelno, iterations FROM bayesdb_generator_model'
                ' WHERE generator_id = ? AND %s '
            ) % (qualifier,)
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)
    else:
        write_usage()
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
        ifnotexists=None, default=None):
    """Create the generator record `gen_name` over `table`.

    When the generator already exists and `ifnotexists` is true, the
    existing record is reused: no generator or column rows are inserted,
    but the requested columns are still validated.

    :param bdb: BayesDB instance
    :param gen_name: name of the generator
    :param table: name of the table the generator is defined over
    :param metamodel: object whose `name()` is stored as the metamodel
    :param columns: sequence of `(column name, stattype)` pairs; it is
        iterated more than once
    :param ifnotexists: if true, an existing generator of this name is
        not an error; `None` means false
    :param default: value stored in `defaultp`; `None` means false
    :return: `(generator_id, column_list)`, where `column_list` is the
        sorted list of `(colno, name, stattype)` triples
    :raises BQLError: if the name is already a table, if the generator
        exists and `ifnotexists` is false, or if any column is missing,
        duplicated, or has an invalid statistical type
    """
    if ifnotexists is None:
        ifnotexists = False
    if default is None:
        default = False

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, "Name already defined as table: %s" %
            (repr(gen_name),))

    # Make sure there's no generator by this name unless we were asked
    # to tolerate it.
    generator_already_existed = core.bayesdb_has_generator(bdb, gen_name)
    if generator_already_existed and not ifnotexists:
        raise BQLError(bdb, "Name already defined as generator: %s" %
            (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    if generator_already_existed:
        # Look the id up explicitly.  An ignored INSERT OR IGNORE leaves
        # the connection's last insert rowid untouched, so lastrowid
        # would not identify the existing generator.
        generator_id = core.bayesdb_get_generator(bdb, gen_name)
    else:
        # Create the generator record.
        generator_sql = (
            " INSERT INTO bayesdb_generator"
            " (name, tabname, metamodel, defaultp)"
            " VALUES (:name, :table, :metamodel, :defaultp) "
        )
        cursor = bdb.sql_execute(generator_sql, {
            "name": gen_name,
            "table": table,
            "metamodel": metamodel.name(),
            "defaultp": default,
        })
        generator_id = cursor.lastrowid
    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = (
        " SELECT colno FROM bayesdb_column"
        " WHERE tabname = :table AND name = :column_name "
    )
    stattype_sql = (
        " SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype "
    )
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            "table": table,
            "column_name": name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {"stattype": stattype})
        if cursor_value(cursor) == 0:
            invalid.add(stattype)
            continue
        column_map[name_folded] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, "No such columns in table %s: %s" %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, "Duplicate column names: %s" %
            (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, "Invalid statistical types: %s" %
            (repr(list(invalid)),))

    if not generator_already_existed:
        # Insert column records.  Skipped for a reused generator so its
        # columns are not inserted a second time.
        column_sql = (
            " INSERT INTO bayesdb_generator_column"
            " (generator_id, colno, stattype)"
            " VALUES (:generator_id, :colno, :stattype) "
        )
        for name, stattype in columns:
            bdb.sql_execute(column_sql, {
                "generator_id": generator_id,
                "colno": column_map[casefold(name)],
                "stattype": casefold(stattype),
            })

    column_list = sorted((column_map[casefold(name)], name, stattype)
        for name, stattype in columns)
    return generator_id, column_list
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
        default=None):
    """Ensure a generator record named `gen_name` exists over `table`.

    If no generator of that name exists yet, its record and one column
    record per entry of `columns` are inserted.  If it already exists,
    nothing is inserted, but `columns` is still validated.

    :param bdb: BayesDB instance
    :param gen_name: name of the generator
    :param table: name of the table the generator is defined over
    :param metamodel: object whose `name()` is stored as the metamodel
    :param columns: sequence of `(column name, stattype)` pairs; it is
        iterated more than once
    :param default: value stored in `defaultp`; `None` means false
    :return: `(generator_id, column_list)`, where `column_list` is the
        sorted list of `(colno, name, stattype)` triples
    :raises BQLError: if the name is already a table, or if any column
        is missing, duplicated, or has an invalid statistical type
    """
    if default is None:
        default = False

    # The generator name must not clash with a table name.
    if core.bayesdb_has_table(bdb, gen_name):
        message = 'Name already defined as table: %s' % (repr(gen_name),)
        raise BQLError(bdb, message)

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    generator_already_existed = False
    if core.bayesdb_has_generator(bdb, gen_name):
        generator_already_existed = True
    if not generator_already_existed:
        # Create the generator record.
        insert_generator_sql = (
            'INSERT INTO bayesdb_generator'
            ' (name, tabname, metamodel, defaultp)'
            ' VALUES (:name, :table, :metamodel, :defaultp)'
        )
        generator_bindings = {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': default,
        }
        bdb.sql_execute(insert_generator_sql, generator_bindings)
    generator_id = core.bayesdb_get_generator(bdb, gen_name)
    assert generator_id
    assert 0 < generator_id

    # Map each folded column name to its colno, collecting
    # - duplicates,
    # - nonexistent columns,
    # - invalid statistical types
    # along the way.
    colno_by_name = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = (
        ' SELECT colno FROM bayesdb_column'
        ' WHERE tabname = :table AND name = :column_name '
    )
    stattype_sql = (
        ' SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype '
    )
    for column_name, column_stattype in columns:
        folded = casefold(column_name)
        if folded in colno_by_name:
            duplicates.add(column_name)
            continue
        colno_cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': column_name,
        })
        try:
            colno_row = colno_cursor.next()
        except StopIteration:
            missing.add(column_name)
            continue
        colno = colno_row[0]
        assert isinstance(colno, int)
        stattype_cursor = bdb.sql_execute(stattype_sql, {
            'stattype': column_stattype,
        })
        if cursor_value(stattype_cursor) == 0:
            invalid.add(column_stattype)
            continue
        colno_by_name[folded] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %s' %
            (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %s' %
            (repr(list(invalid)),))

    if not generator_already_existed:
        # Insert column records for the freshly created generator.
        insert_column_sql = (
            ' INSERT INTO bayesdb_generator_column'
            ' (generator_id, colno, stattype)'
            ' VALUES (:generator_id, :colno, :stattype) '
        )
        for column_name, column_stattype in columns:
            bdb.sql_execute(insert_column_sql, {
                'generator_id': generator_id,
                'colno': colno_by_name[casefold(column_name)],
                'stattype': casefold(column_stattype),
            })

    column_list = sorted(
        (colno_by_name[casefold(column_name)], column_name, column_stattype)
        for column_name, column_stattype in columns)
    return generator_id, column_list
def bayesdb_read_csv(bdb, table, f, header=False, create=False,
        ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if the flags are inconsistent (`create` without
        `header`, `ifnotexists` without `create`), if the table exists
        and may not be reused, or if it is missing and `create` is false
    :raises IOError: if the header is missing, empty, has duplicate or
        unknown columns, or if a data row has the wrong number of fields
    """
    if not header and create:
        raise ValueError('Can\'t create table from headerless CSV!')
    if not create and ifnotexists:
        raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            # Column names are compared case-insensitively.
            column_name_map = {}
            duplicates = set([])
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                schema = ','.join(
                    '%s NUMERIC' % (sqlite3_quote_name(name),)
                    for name in column_names)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(
                    name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            # Guaranteed by the flag checks at the top.
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            # reader.line_num is the number of source lines consumed so
            # far, so error messages point at the offending row instead
            # of always naming the first data line.
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (reader.line_num, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (reader.line_num, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
def bayesdb_read_csv(bdb, table, f, header=False, create=False,
        ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if the flags are inconsistent (`create` without
        `header`, `ifnotexists` without `create`), if the table exists
        and may not be reused, or if it is missing and `create` is false
    :raises IOError: if the header is missing, empty, has blank,
        duplicate or unknown column names, or if a data row has the
        wrong number of fields
    """
    if not header and create:
        raise ValueError('Can\'t create table from headerless CSV!')
    if not create and ifnotexists:
        raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            if any(len(c) == 0 for c in column_names):
                raise IOError(
                    'Missing column names in header: %s' % repr(column_names))
            # Column names are compared case-insensitively.
            column_name_map = {}
            duplicates = set([])
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                schema = ','.join(
                    '%s NUMERIC' % (sqlite3_quote_name(name),)
                    for name in column_names)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(
                    name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            # Guaranteed by the flag checks at the top.
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            # reader.line_num is the number of source lines consumed so
            # far, so error messages point at the offending row instead
            # of always naming the first data line.
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (reader.line_num, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (reader.line_num, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False,
        index=None):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true` and `table`
        exists, read data into it anyway
    :param str index: name of column for index

    If `index` is `None`, then the dataframe's index dtype must be
    convertible to int64, and it is mapped to the table's rowids.  If
    the dataframe's index dtype is not convertible to int64, you must
    specify `index` to give a primary key for the table.

    :raises ValueError: if `ifnotexists` is given without `create`, if
        the index is non-integral and `index` is not given, if `index`
        collides with a dataframe column, if the table exists and may
        not be reused, if the dataframe has columns the existing table
        lacks, or if the table is missing and `create` is false
    """
    if not create and ifnotexists:
        raise ValueError('Not creating table whether or not exists!')
    column_names = [str(column) for column in df.columns]
    if index is None:
        # The dataframe index becomes the rowid, so it is inserted but
        # never declared in the schema.
        create_column_names = column_names
        insert_column_names = ['_rowid_'] + column_names
        try:
            key_index = df.index.astype('int64')
        except ValueError:
            raise ValueError('Must specify index name for non-integral index!')
    else:
        if index in df.columns:
            raise ValueError('Index name collides with column name: %r' %
                (index,))
        create_column_names = [index] + column_names
        insert_column_names = create_column_names
        key_index = df.index
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(
                name for name in create_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
        elif create:
            # Only the explicitly named index column is the primary key.
            schema = ','.join(
                ('%s NUMERIC PRIMARY KEY' if name == index else '%s NUMERIC')
                % (sqlite3_quote_name(name),)
                for name in create_column_names)
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        qt = sqlite3_quote_name(table)
        qicns = [sqlite3_quote_name(name) for name in insert_column_names]
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qicns), ','.join('?' for _qicn in qicns))
        # Fetch rows by position: `.ix` is gone from current pandas, and
        # a label lookup returns several rows at once when index labels
        # repeat.
        for position, key in enumerate(key_index):
            bdb.sql_execute(sql, (key,) + tuple(df.iloc[position]))
def _create_population(bdb, phrase):
    """Create the population described by a parsed CREATE POPULATION phrase.

    Reads `phrase.name`, `phrase.table`, `phrase.ifnotexists` and
    `phrase.schema`.  Inserts a `bayesdb_population` record and one
    `bayesdb_variable` record per variable that is not ignored.

    :param bdb: BayesDB instance
    :param phrase: parsed phrase with the attributes listed above
    :raises BQLError: if the population exists and `ifnotexists` is
        false, if a wildcard GUESS is mixed with named variables, if
        some table column has no modeling policy, or if any variable is
        missing, duplicated, or has an invalid statistical type
    """
    if core.bayesdb_has_population(bdb, phrase.name):
        if not phrase.ifnotexists:
            raise BQLError(bdb, 'Name already defined as population: %r' %
                (phrase.name,))
        return

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Every column of the base table needs a strategy from the user:
    # MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    insert_population_sql = (
        ' INSERT INTO bayesdb_population (name, tabname)'
        ' VALUES (?, ?) '
    )
    bdb.sql_execute(insert_population_sql, (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)

    # Sort the schema clauses into modeled, ignored and guessed variables.
    pop_model_vars = []
    pop_ignore_vars = []
    pop_guess = []
    for clause in phrase.schema:
        if isinstance(clause, ast.PopModelVars):
            pop_model_vars.extend(
                (var, clause.stattype) for var in clause.names)
        if isinstance(clause, ast.PopIgnoreVars):
            pop_ignore_vars.extend((var, 'ignore') for var in clause.names)
        if isinstance(clause, ast.PopGuessVars):
            pop_guess.extend(clause.names)

    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(bdb,
                'Cannot use wildcard GUESS with variables names: %r' %
                (pop_guess,))
        # The wildcard stands for every base-table column without an
        # explicit policy.
        avoid = set(
            casefold(var) for var, _st in pop_model_vars + pop_ignore_vars)
        pop_guess = [
            column for column in base_table_columns
            if casefold(column) not in avoid
        ]

    # Perform the guessing.
    pop_guess_vars = []
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(sqlite3_quote_name(column) for column in pop_guess)
        rows = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt)).fetchall()
        # XXX This function returns a stattype called `key`, which we
        # will add to the pop_ignore_vars.
        guessed_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        for column, guessed in zip(pop_guess, guessed_stattypes):
            if guessed == 'key':
                pop_ignore_vars.append((column, 'ignore'))
            else:
                pop_guess_vars.append((column, guessed))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that every base-table column has a known policy.
    known = set(casefold(var) for var, _st in pop_all_vars)
    not_found = [
        column for column in base_table_columns
        if casefold(column) not in known
    ]
    if not_found:
        raise BQLError(bdb,
            'Cannot determine a modeling policy for variables: %r' %
            (not_found,))

    # Get a map from variable name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    variable_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = (
        ' SELECT colno FROM bayesdb_column'
        ' WHERE tabname = :table AND name = :column_name '
    )
    stattype_sql = (
        ' SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype '
    )
    for raw_name, raw_stattype in pop_all_vars:
        name = casefold(raw_name)
        stattype = casefold(raw_stattype)
        if name in variable_map:
            duplicates.add(name)
            continue
        colno_cursor = bdb.sql_execute(colno_sql, {
            'table': phrase.table,
            'column_name': name,
        })
        try:
            colno_row = colno_cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = colno_row[0]
        assert isinstance(colno, int)
        stattype_cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        # `ignore` is a policy, not a registered statistical type.
        if cursor_value(stattype_cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        variable_map[name] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' %
            (list(duplicates),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' %
            (list(invalid),))

    # Insert variable records for everything that is not ignored.
    insert_variable_sql = (
        ' INSERT INTO bayesdb_variable'
        ' (population_id, name, colno, stattype)'
        ' VALUES (?, ?, ?, ?) '
    )
    for raw_name, raw_stattype in pop_all_vars:
        name = casefold(raw_name)
        colno = variable_map[name]
        stattype = casefold(raw_stattype)
        if stattype != 'ignore':
            bdb.sql_execute(
                insert_variable_sql, (population_id, name, colno, stattype))
def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False,
        index=None):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true` and `table`
        exists, read data into it anyway
    :param str index: name of column for index

    If `index` is `None`, then the dataframe's index dtype must be
    convertible to int64, and it is mapped to the table's rowids.  If
    the dataframe's index dtype is not convertible to int64, you must
    specify `index` to give a primary key for the table.

    :raises ValueError: if `ifnotexists` is given without `create`, if
        the index is non-integral and `index` is not given, if `index`
        collides with a dataframe column, if the table exists and may
        not be reused, if the dataframe has columns the existing table
        lacks, or if the table is missing and `create` is false
    """
    if not create and ifnotexists:
        raise ValueError('Not creating table whether or not exists!')
    column_names = [str(column) for column in df.columns]
    if index is None:
        # The dataframe index becomes the rowid, so it is inserted but
        # never declared in the schema.
        create_column_names = column_names
        insert_column_names = ['_rowid_'] + column_names
        try:
            key_index = df.index.astype('int64')
        except ValueError:
            raise ValueError('Must specify index name for non-integral index!')
    else:
        if index in df.columns:
            raise ValueError('Index name collides with column name: %r' %
                (index,))
        create_column_names = [index] + column_names
        insert_column_names = create_column_names
        key_index = df.index
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(
                name for name in create_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
        elif create:
            # Only the explicitly named index column is the primary key.
            schema = ','.join(
                ('%s NUMERIC PRIMARY KEY' if name == index else '%s NUMERIC')
                % (sqlite3_quote_name(name),)
                for name in create_column_names)
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        qt = sqlite3_quote_name(table)
        qicns = [sqlite3_quote_name(name) for name in insert_column_names]
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qicns), ','.join('?' for _qicn in qicns))
        # Fetch rows by position: `.ix` is gone from current pandas, and
        # a label lookup returns several rows at once when index labels
        # repeat.
        for position, key in enumerate(key_index):
            bdb.sql_execute(sql, (key,) + tuple(df.iloc[position]))
def _create_population(bdb, phrase):
    """Create the population described by a parsed CREATE POPULATION phrase.

    Reads `phrase.name`, `phrase.table`, `phrase.ifnotexists` and
    `phrase.schema`.  When `phrase.name` is missing the population is
    named after the table and recorded as implicit.  Inserts a
    `bayesdb_population` record and adds one variable per column that
    is not ignored.

    :param bdb: BayesDB instance
    :param phrase: parsed phrase with the attributes listed above
    :raises BQLError: if the population exists and `ifnotexists` is
        false, if a wildcard GUESS is mixed with named variables, if a
        string-valued column is modeled as numerical, if some table
        column has no modeling policy, or if any variable is missing,
        duplicated, or has an invalid statistical type
    """
    # Retrieve the (possibility implicit) population name.
    population_name = phrase.name or phrase.table
    implicit = 1 if phrase.name is None else 0

    # Handle IF NOT EXISTS.
    if core.bayesdb_has_population(bdb, population_name):
        if phrase.ifnotexists:
            return
        raise BQLError(bdb, 'Name already defined as population: %r' %
            (population_name,))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table.  The user is required
    # to provide a strategy for each single variable, either MODEL,
    # IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    bdb.sql_execute(
        ' INSERT INTO bayesdb_population (name, tabname, implicit)'
        ' VALUES (?, ?, ?) ',
        (population_name, phrase.table, implicit))
    population_id = core.bayesdb_get_population(bdb, population_name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [[(name, s.stattype) for name in s.names]
         for s in phrase.schema if isinstance(s, ast.PopModelVars)]))

    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [[(name, 'ignore') for name in s.names]
         for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)]))

    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        [s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(bdb,
                'Cannot use wildcard GUESS with variables names: %r' %
                (pop_guess,))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]

    # Perform the guessing.
    pop_guess_vars = []
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(sqlite3_quote_name(col) for col in pop_guess)
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we
        # will add to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        # Only the first element of each guess is used as the stattype.
        for col, guess in zip(pop_guess, pop_guess_stattypes):
            if guess[0] == 'key':
                pop_ignore_vars.append((col, 'ignore'))
            else:
                pop_guess_vars.append((col, guess[0]))

    # Ensure no string-valued variables are being modeled as numerical.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars
        if stattype == 'numerical'
        and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(bdb,
            'Column(s) with string values modeled as numerical: %r' %
            (numerical_string_vars,))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = set(casefold(t[0]) for t in pop_all_vars)
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(bdb,
            'Cannot determine a modeling policy for variables: %r' %
            (not_found,))

    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = (
        ' SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype '
    )
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        # `ignore` is a policy, not a registered statistical type.
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        # Record the folded name, which is what the membership test
        # above looks up.  Recording the raw name meant a repeated
        # variable was never flagged unless it was already lowercase.
        seen_variables.add(name)

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' %
            (list(duplicates),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' %
            (list(invalid),))

    # Insert variable records.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)
def dot_describe(self, line):
    '''describe BayesDB entities
    [table(s)|generator(s)|columns|model(s)] [<name>...]

    Print a human-readable description of the specified BayesDB entities.
    '''
    # The docstring wording above is deliberately left alone: it is
    # presumably surfaced to the user as help text -- TODO confirm.

    def usage():
        # One write per usage line.
        for text in (
                'Usage: .describe table(s) [<table>...]\n',
                ' .describe generator(s) [<gen>...]\n',
                ' .describe columns <gen>\n',
                ' .describe model(s) <gen> [<model>...]\n',
        ):
            self.stdout.write(text)

    def no_such_generator(name):
        self.stdout.write('No such generator: %s\n' % (repr(name),))

    # XXX Lousy, lousy tokenizer.
    tokens = line.split()
    if not tokens:
        usage()
        return

    what = casefold(tokens[0])
    names = tokens[1:]

    if what in ('table', 'tables'):
        if not names:
            # No table named: describe every known column.
            params = ()
            qualifier = '1'
        else:
            params = names
            qualifier = \
                '(' + ' OR '.join('tabname = ?' for _p in params) + ')'
            # Report every unknown table before giving up.
            unknown = 0
            for table in params:
                if not core.bayesdb_has_table(self._bdb, table):
                    self.stdout.write('No such table: %s\n' %
                        (repr(table),))
                    unknown += 1
            if unknown:
                return
            for table in params:
                core.bayesdb_table_guarantee_columns(self._bdb, table)
        sql = (
            ' SELECT tabname, colno, name, shortname'
            ' FROM bayesdb_column'
            ' WHERE %s'
            ' ORDER BY tabname ASC, colno ASC '
        ) % (qualifier,)
        with self._bdb.savepoint():
            # NOTE(review): this branch uses `execute`, whereas every
            # other branch uses `sql_execute` -- confirm that is intended.
            cursor = self._bdb.execute(sql, params)
            pretty.pp_cursor(self.stdout, cursor)

    elif what in ('generator', 'generators'):
        if not names:
            params = ()
            qualifier = '1'
        else:
            params = names
            # Numbered placeholders (?1, ?2, ...) so that the same
            # parameters can be referenced in both IN lists.
            placeholders = ','.join(
                '?%d' % (n,) for n in range(1, len(params) + 1))
            qualifier = (
                ' (name IN ({names})'
                ' OR (defaultp AND tabname IN ({names}))) '
            ).format(names=placeholders)
            unknown = 0
            for generator in params:
                if not core.bayesdb_has_generator_default(self._bdb,
                        generator):
                    no_such_generator(generator)
                    unknown += 1
            if unknown:
                return
        sql = (
            ' SELECT id, name, tabname, metamodel'
            ' FROM bayesdb_generator'
            ' WHERE %s '
        ) % (qualifier,)
        with self._bdb.savepoint():
            cursor = self._bdb.sql_execute(sql, params)
            pretty.pp_cursor(self.stdout, cursor)

    elif what == 'columns':
        if len(names) != 1:
            self.stdout.write('Describe columns of what generator?\n')
            return
        generator = names[0]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator_default(self._bdb,
                    generator):
                no_such_generator(generator)
                return
            generator_id = core.bayesdb_get_generator_default(self._bdb,
                generator)
            sql = (
                ' SELECT c.colno AS colno, c.name AS name,'
                ' gc.stattype AS stattype, c.shortname AS shortname'
                ' FROM bayesdb_generator AS g,'
                ' (bayesdb_column AS c LEFT OUTER JOIN'
                ' bayesdb_generator_column AS gc'
                ' USING (colno))'
                ' WHERE g.id = ? AND g.id = gc.generator_id'
                ' AND g.tabname = c.tabname'
                ' ORDER BY colno ASC; '
            )
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)

    elif what in ('model', 'models'):
        if not names:
            self.stdout.write('Describe models of what generator?\n')
            return
        generator = names[0]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator_default(self._bdb,
                    generator):
                no_such_generator(generator)
                return
            generator_id = core.bayesdb_get_generator_default(self._bdb,
                generator)
            if len(names) == 1:
                # No model numbers given: list all models.
                qualifier = '1'
            else:
                modelnos = []
                for token in names[1:]:
                    try:
                        modelno = int(token)
                    except ValueError:
                        self.stdout.write('Invalid model number: %s\n' %
                            (repr(token),))
                        return
                    if not core.bayesdb_generator_has_model(
                            self._bdb, generator_id, modelno):
                        self.stdout.write('No such model: %d\n' %
                            (modelno,))
                        return
                    modelnos.append(modelno)
                # Only validated integers are spliced into the SQL text.
                qualifier = 'modelno IN (%s)' % \
                    (','.join(str(m) for m in modelnos),)
            sql = (
                ' SELECT modelno, iterations'
                ' FROM bayesdb_generator_model'
                ' WHERE generator_id = ?'
                ' AND %s '
            ) % (qualifier,)
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)

    else:
        usage()
def dot_describe(self, line):
    """describe BayesDB entities
    [table(s)|generator(s)|columns|model(s)] [<name>...]

    Print a human-readable description of the specified BayesDB entities.
    """
    # The docstring wording above is deliberately left alone: it is
    # presumably surfaced to the user as help text -- TODO confirm.
    write = self.stdout.write
    usage = (
        "Usage: .describe table(s) [<table>...]\n",
        " .describe generator(s) [<gen>...]\n",
        " .describe columns <gen>\n",
        " .describe model(s) <gen> [<model>...]\n",
    )

    # XXX Lousy, lousy tokenizer.
    tokens = line.split()
    if not tokens:
        for text in usage:
            write(text)
        return

    entity = casefold(tokens[0])
    args = tokens[1:]

    # --- .describe table(s) [<table>...] ---
    if entity in ("table", "tables"):
        if args:
            params = args
            qualifier = "(" + " OR ".join(["tabname = ?"] * len(params)) + ")"
            # Name every unknown table before bailing out.
            all_known = True
            for table in params:
                if core.bayesdb_has_table(self._bdb, table):
                    continue
                write("No such table: %s\n" % (repr(table),))
                all_known = False
            if not all_known:
                return
            for table in params:
                core.bayesdb_table_guarantee_columns(self._bdb, table)
        else:
            # No table named: describe every known column.
            params = ()
            qualifier = "1"
        sql = (
            " SELECT tabname, colno, name, shortname"
            " FROM bayesdb_column"
            " WHERE %s"
            " ORDER BY tabname ASC, colno ASC "
        ) % (qualifier,)
        with self._bdb.savepoint():
            # NOTE(review): this branch uses `execute`, whereas every
            # other branch uses `sql_execute` -- confirm that is intended.
            pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
        return

    # --- .describe generator(s) [<gen>...] ---
    if entity in ("generator", "generators"):
        if args:
            params = args
            # Numbered placeholders (?1, ?2, ...) let both IN lists refer
            # to the same bound parameters.
            numbered = ["?%d" % (i + 1,) for i, _p in enumerate(params)]
            qualifier = (
                " (name IN ({names})"
                " OR (defaultp AND tabname IN ({names}))) "
            ).format(names=",".join(numbered))
            all_known = True
            for generator in params:
                if core.bayesdb_has_generator_default(self._bdb, generator):
                    continue
                write("No such generator: %s\n" % (repr(generator),))
                all_known = False
            if not all_known:
                return
        else:
            params = ()
            qualifier = "1"
        sql = (
            " SELECT id, name, tabname, metamodel"
            " FROM bayesdb_generator"
            " WHERE %s "
        ) % (qualifier,)
        with self._bdb.savepoint():
            pretty.pp_cursor(self.stdout, self._bdb.sql_execute(sql, params))
        return

    # --- .describe columns <gen> ---
    if entity == "columns":
        if len(args) != 1:
            write("Describe columns of what generator?\n")
            return
        generator = args[0]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator_default(self._bdb, generator):
                write("No such generator: %s\n" % (repr(generator),))
                return
            generator_id = core.bayesdb_get_generator_default(
                self._bdb, generator)
            sql = (
                " SELECT c.colno AS colno, c.name AS name,"
                " gc.stattype AS stattype, c.shortname AS shortname"
                " FROM bayesdb_generator AS g,"
                " (bayesdb_column AS c LEFT OUTER JOIN"
                " bayesdb_generator_column AS gc"
                " USING (colno))"
                " WHERE g.id = ? AND g.id = gc.generator_id"
                " AND g.tabname = c.tabname"
                " ORDER BY colno ASC; "
            )
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)
        return

    # --- .describe model(s) <gen> [<model>...] ---
    if entity in ("model", "models"):
        if not args:
            write("Describe models of what generator?\n")
            return
        generator = args[0]
        requested = args[1:]
        with self._bdb.savepoint():
            if not core.bayesdb_has_generator_default(self._bdb, generator):
                write("No such generator: %s\n" % (repr(generator),))
                return
            generator_id = core.bayesdb_get_generator_default(
                self._bdb, generator)
            if requested:
                modelnos = []
                for token in requested:
                    try:
                        modelno = int(token)
                    except ValueError:
                        write("Invalid model number: %s\n" % (repr(token),))
                        return
                    known = core.bayesdb_generator_has_model(
                        self._bdb, generator_id, modelno)
                    if not known:
                        write("No such model: %d\n" % (modelno,))
                        return
                    modelnos.append(modelno)
                # Only validated integers are spliced into the SQL text.
                qualifier = "modelno IN (%s)" % (
                    ",".join([str(m) for m in modelnos]))
            else:
                # No model numbers given: list all models.
                qualifier = "1"
            sql = (
                " SELECT modelno, iterations"
                " FROM bayesdb_generator_model"
                " WHERE generator_id = ?"
                " AND %s "
            ) % (qualifier,)
            cursor = self._bdb.sql_execute(sql, (generator_id,))
            pretty.pp_cursor(self.stdout, cursor)
        return

    # Unrecognised entity kind.
    for text in usage:
        write(text)
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
        default=None):
    """Record generator `gen_name` over `table` and validate `columns`.

    `columns` is a re-iterable collection of (name, stattype) pairs.

    If no generator named `gen_name` is recorded yet, insert one row
    into bayesdb_generator (using `metamodel.name()` and `default`, with
    None treated as False) and, once validation has passed, one row
    into bayesdb_generator_column per pair in `columns`.  If such a
    generator is already recorded, nothing is inserted, but `columns`
    is still validated.

    Return `(generator_id, column_list)`, where `column_list` is the
    sorted list of `(colno, name, stattype)` triples for `columns`.

    Raise BQLError if `gen_name` is already the name of a table, or if
    `columns` mentions columns missing from `table`, duplicate column
    names, or unknown statistical types (checked in that order; only
    the first failing category is reported).
    """
    generator_sql = (
        'INSERT INTO bayesdb_generator'
        ' (name, tabname, metamodel, defaultp)'
        ' VALUES (:name, :table, :metamodel, :defaultp)'
    )
    colno_sql = (
        ' SELECT colno FROM bayesdb_column'
        ' WHERE tabname = :table AND name = :column_name '
    )
    stattype_sql = (
        ' SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype '
    )
    column_sql = (
        ' INSERT INTO bayesdb_generator_column'
        ' (generator_id, colno, stattype)'
        ' VALUES (:generator_id, :colno, :stattype) '
    )

    defaultp = False if default is None else default

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, 'Name already defined as table: %s' %
            (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    # Create the generator record unless one is already there.
    preexisting = True if core.bayesdb_has_generator(bdb, gen_name) else False
    if not preexisting:
        bdb.sql_execute(generator_sql, {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': defaultp,
        })
    generator_id = core.bayesdb_get_generator(bdb, gen_name)
    assert generator_id
    assert generator_id > 0

    # Build a map from case-folded column name to colno, collecting
    # - duplicates,
    # - nonexistent columns,
    # - invalid statistical types
    # along the way.
    column_map = {}
    missing = set()
    duplicates = set()
    invalid = set()
    for name, stattype in columns:
        folded = casefold(name)
        if folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': name,
        })
        try:
            # An empty result means the table has no such column.
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0:
            invalid.add(stattype)
            continue
        column_map[folded] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %s' %
            (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %s' %
            (repr(list(invalid)),))

    # Insert column records, but only for a generator created above.
    if not preexisting:
        for name, stattype in columns:
            bdb.sql_execute(column_sql, {
                'generator_id': generator_id,
                'colno': column_map[casefold(name)],
                'stattype': casefold(stattype),
            })

    # The returned triples carry the stattype exactly as the caller
    # gave it, not the case-folded form stored above.
    column_list = [
        (column_map[casefold(name)], name, stattype)
        for name, stattype in columns
    ]
    column_list.sort()
    return generator_id, column_list