def _store_encoding_info(self, bdb, generator_id):
    """Persist Loom's encoding metadata for `generator_id` into bayesdb.

    Reads ``ingest/encoding.json.gz`` from the Loom project directory
    and mirrors two pieces of it into SQL: the string<->integer code
    mapping of each categorical column, and the rank (position) Loom
    assigned to every column.
    """
    project_path = self._get_loom_project_path(bdb, generator_id)
    encoding_path = os.path.join(project_path, 'ingest', 'encoding.json.gz')
    with gzip.open(encoding_path) as encoding_file:
        encoding = json.loads(encoding_file.read().decode('ascii'))
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # Record the string encoding of each column that has symbols.
    insert_string_encoding = '''
        INSERT INTO bayesdb_loom_string_encoding
        (generator_id, colno, string_form, integer_form)
        VALUES (:generator_id, :colno, :string_form, :integer_form)
    '''
    for column in encoding:
        if 'symbols' not in column:
            continue
        colno = bayesdb_table_column_number(bdb, table, str(column['name']))
        for string_form, integer_form in column['symbols'].iteritems():
            bdb.sql_execute(insert_string_encoding, {
                'generator_id': generator_id,
                'colno': colno,
                'string_form': string_form,
                'integer_form': integer_form,
            })
    # Record the ordering Loom assigned to the columns.
    insert_order_sql = '''
        INSERT INTO bayesdb_loom_column_ordering
        (generator_id, colno, rank)
        VALUES (:generator_id, :colno, :rank)
    '''
    for rank, column in enumerate(encoding):
        colno = bayesdb_table_column_number(bdb, table, str(column['name']))
        bdb.sql_execute(insert_order_sql, {
            'generator_id': generator_id,
            'colno': colno,
            'rank': rank,
        })
def _store_encoding_info(self, bdb, generator_id):
    """Store Loom's string encoding and column ordering for a generator.

    Parses ``ingest/encoding.json.gz`` under the Loom project path and
    inserts its contents into ``bayesdb_loom_string_encoding`` and
    ``bayesdb_loom_column_ordering``.
    """
    encoding_path = os.path.join(
        self._get_loom_project_path(bdb, generator_id),
        'ingest',
        'encoding.json.gz',
    )
    with gzip.open(encoding_path) as encoding_file:
        raw = encoding_file.read().decode('ascii')
    encoding = json.loads(raw)
    population_id = bayesdb_generator_population(bdb, generator_id)
    table = bayesdb_population_table(bdb, population_id)
    # First pass: the symbol tables (string <-> integer codes).
    insert_string_encoding = '''
        INSERT INTO bayesdb_loom_string_encoding
        (generator_id, colno, string_form, integer_form)
        VALUES (:generator_id, :colno, :string_form, :integer_form)
    '''
    for entry in encoding:
        if 'symbols' in entry:
            colno = bayesdb_table_column_number(bdb, table, str(entry['name']))
            for string_form, integer_form in entry['symbols'].iteritems():
                params = {
                    'generator_id': generator_id,
                    'colno': colno,
                    'string_form': string_form,
                    'integer_form': integer_form,
                }
                bdb.sql_execute(insert_string_encoding, params)
    # Second pass: the position of each column in Loom's ordering.
    insert_order_sql = '''
        INSERT INTO bayesdb_loom_column_ordering
        (generator_id, colno, rank)
        VALUES (:generator_id, :colno, :rank)
    '''
    for position, entry in enumerate(encoding):
        colno = bayesdb_table_column_number(bdb, table, str(entry['name']))
        bdb.sql_execute(insert_order_sql, {
            'generator_id': generator_id,
            'colno': colno,
            'rank': position,
        })
def test_bayesdb_population_add_variable():
    """bayesdb_add_variable: membership, numbering, and failure modes."""
    with bayesdb() as bdb:
        bdb.sql_execute('create table t (a real, b ignore, c real)')
        bdb.execute('''
            create population p for t with schema(
                set stattypes of a, c to numerical;
                b ignore;
            );
        ''')
        pid = core.bayesdb_get_population(bdb, 'p')
        # Initial state: a and c are modeled variables, b is not.
        for name, colno, modeled in (
                ('a', 0, True),
                ('b', 1, False),
                ('c', 2, True)):
            assert core.bayesdb_table_column_number(bdb, 't', name) == colno
            if modeled:
                assert core.bayesdb_has_variable(bdb, pid, None, name)
                assert core.bayesdb_variable_number(
                    bdb, pid, None, name) == colno
            else:
                assert not core.bayesdb_has_variable(bdb, pid, None, name)
        # Adding an already-modeled variable fails.
        with pytest.raises(apsw.ConstraintError):
            core.bayesdb_add_variable(bdb, pid, 'c', 'nominal')
        # Adding a variable with an unknown stattype fails.
        with pytest.raises(apsw.ConstraintError):
            core.bayesdb_add_variable(bdb, pid, 'b', 'quzz')
        # b can now join the population with a valid stattype.
        core.bayesdb_add_variable(bdb, pid, 'b', 'nominal')
        assert core.bayesdb_variable_number(bdb, pid, None, 'b') == 1
        # A column added to t after the fact can also join p.
        bdb.sql_execute('alter table t add column q real;')
        assert core.bayesdb_table_column_number(bdb, 't', 'q') == 3
        assert not core.bayesdb_has_variable(bdb, pid, None, 'q')
        core.bayesdb_add_variable(bdb, pid, 'q', 'numerical')
        assert core.bayesdb_has_variable(bdb, pid, None, 'q')
        assert core.bayesdb_variable_number(bdb, pid, None, 'q') == 3
def test_bayesdb_population_add_variable():
    """Exercise adding variables to an existing population."""
    with bayesdb() as bdb:
        bdb.sql_execute('create table t (a real, b ignore, c real)')
        bdb.execute('''
            create population p for t with schema(
                set stattypes of a, c to numerical;
                b ignore;
            );
        ''')
        popid = core.bayesdb_get_population(bdb, 'p')
        # Column a: present in the population at its table position.
        assert core.bayesdb_table_column_number(bdb, 't', 'a') == 0
        assert core.bayesdb_has_variable(bdb, popid, None, 'a')
        assert core.bayesdb_variable_number(bdb, popid, None, 'a') == 0
        # Column b: exists in the table but was ignored by the schema.
        assert core.bayesdb_table_column_number(bdb, 't', 'b') == 1
        assert not core.bayesdb_has_variable(bdb, popid, None, 'b')
        # Column c: present in the population at its table position.
        assert core.bayesdb_table_column_number(bdb, 't', 'c') == 2
        assert core.bayesdb_has_variable(bdb, popid, None, 'c')
        assert core.bayesdb_variable_number(bdb, popid, None, 'c') == 2
        # Re-adding c, or adding b with a bogus stattype, must fail.
        with pytest.raises(apsw.ConstraintError):
            core.bayesdb_add_variable(bdb, popid, 'c', 'nominal')
        with pytest.raises(apsw.ConstraintError):
            core.bayesdb_add_variable(bdb, popid, 'b', 'quzz')
        # With a legitimate stattype, b enters the population.
        core.bayesdb_add_variable(bdb, popid, 'b', 'nominal')
        assert core.bayesdb_variable_number(bdb, popid, None, 'b') == 1
        # Extend t with a fresh column q, then add it to p as well.
        bdb.sql_execute('alter table t add column q real;')
        assert core.bayesdb_table_column_number(bdb, 't', 'q') == 3
        assert not core.bayesdb_has_variable(bdb, popid, None, 'q')
        core.bayesdb_add_variable(bdb, popid, 'q', 'numerical')
        assert core.bayesdb_has_variable(bdb, popid, None, 'q')
        assert core.bayesdb_variable_number(bdb, popid, None, 'q') == 3
def bayesdb_load_codebook_csv_file(bdb, table, pathname):
    """Load a codebook for `table` from the CSV file at `pathname`.

    The file must have the header ``name,shortname,description,value_map``.
    Each data row names an existing column of `table`; ``value_map`` is a
    JSON object mapping values to descriptions, or empty/'NaN' for none.

    Raises IOError if the file is empty, has the wrong header, has a row
    of the wrong width, names a column not in `table`, or contains an
    unparseable value map.
    """
    codebook = None
    with open(pathname, 'rU') as f:
        reader = csv.reader(f)
        try:
            header = reader.next()
        except StopIteration:
            raise IOError('Empty codebook file')
        header = [unicode(h, 'utf8').strip() for h in header]
        if header != ['name','shortname','description','value_map']:
            raise IOError('Wrong CSV header for codebook')
        codebook = []
        # The header occupies line 1, so the first data row is line 2.
        # (Starting the counter at 1 made every error message off by one.)
        line = 2
        for row in reader:
            if len(row) != 4:
                raise IOError('Wrong number of columns at line %d: %d' %
                    (line, len(row)))
            codebook.append(row)
            line += 1
    with bdb.savepoint():
        for column_name, shortname, description, value_map_json in codebook:
            if not core.bayesdb_table_has_column(bdb, table, column_name):
                raise IOError('Column does not exist in table %s: %s' %
                    (repr(table), repr(column_name)))
            colno = core.bayesdb_table_column_number(bdb, table, column_name)
            try:
                value_map = dict(json.loads(value_map_json))
            except (ValueError, TypeError):
                # Treat empty or 'NaN' cells (e.g. from a pandas export)
                # as having no value map; anything else is malformed.
                if value_map_json == '' or value_map_json.lower() == 'nan':
                    value_map = {}
                else:
                    raise IOError('Invalid value map for column %r: %r' %
                        (column_name, value_map_json))
            # Replace any previously stored value map for this column.
            sql = '''
                DELETE FROM bayesdb_column_map
                    WHERE tabname = ? AND colno = ?
            '''
            bdb.sql_execute(sql, (table, colno))
            sql = '''
                INSERT INTO bayesdb_column_map
                    (tabname, colno, key, value)
                    VALUES (?, ?, ?, ?)
            '''
            # Sort keys so insertion order is deterministic.
            for key in sorted(value_map.keys()):
                value = value_map[key]
                bdb.sql_execute(sql, (table, colno, key, value))
            sql = '''
                UPDATE bayesdb_column
                    SET shortname = :shortname, description = :description
                    WHERE tabname = :table AND colno = :colno
            '''
            total_changes = bdb._sqlite3.totalchanges()
            bdb.sql_execute(sql, {
                'shortname': shortname,
                'description': description,
                'table': table,
                'colno': colno,
            })
            # The column-existence check above guarantees exactly one row.
            assert bdb._sqlite3.totalchanges() - total_changes == 1
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of BayesDB,
    you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    """
    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')
    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)
    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']
    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if not 'cctype' in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')
    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!
    column_stattypes = dict((casefold(column_name),
            casefold(schema[column_name]['cctype']))
        for column_name in schema)
    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():
        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table),))
        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                # Bug fix: the query string `sql` must be the first
                # argument; previously `bdb` itself was passed here,
                # which cannot work as a SQL statement.
                cursor = bdb.sql_execute(sql, (generator_id,))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                        (repr(generator),))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg,))
                bayesdb_create_legacy_generator(bdb, generator, table,
                    column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator),))
        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:    # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                    (modelno,))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                    ': no view_state in X_L[%u]' % (modelno,))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                        ' in view state %u of X_L[%u]' % (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                        ': non-list for view %u columns in X_L[%u]'
                        % (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                            (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table,
                        name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name
        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id,))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1
        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(insert_model_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'iterations': iterations,
            })
            bdb.sql_execute(insert_theta_json_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'theta_json': json.dumps(theta),
            })
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of BayesDB,
    you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    """
    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')
    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)
    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']
    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if not 'cctype' in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')
    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!
    column_stattypes = dict(
        (casefold(column_name), casefold(schema[column_name]['cctype']))
        for column_name in schema)
    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():
        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table), ))
        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                # Bug fix: pass the SQL string `sql`, not the bdb
                # object, as the statement to execute (cf. the
                # modelno_max_sql query below for the correct pattern).
                cursor = bdb.sql_execute(sql, (generator_id, ))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                        (repr(generator), ))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg, ))
                bayesdb_create_legacy_generator(bdb, generator, table,
                    column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator), ))
        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:    # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                    (modelno, ))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                    ': no view_state in X_L[%u]' % (modelno, ))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                        ' in view state %u of X_L[%u]' % (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                        ': non-list for view %u columns in X_L[%u]'
                        % (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                            (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table,
                        name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name
        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id, ))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1
        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(
                insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'iterations': iterations,
                })
            bdb.sql_execute(
                insert_theta_json_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'theta_json': json.dumps(theta),
                })