Example #1
0
def create_empty_table(bdb, column_names):
    """Create a new, empty table whose columns are `column_names`.

    The table's name is obtained from `bdb.temp_table_name()`, and
    every column is declared NUMERIC in the underlying SQL schema.

    Return the name of the new table.
    """
    new_table = bdb.temp_table_name()
    quoted_table = sqlite3_quote_name(new_table)
    column_defs = []
    for column_name in column_names:
        quoted_column = sqlite3_quote_name(column_name)
        column_defs.append('%s NUMERIC' % (quoted_column, ))
    create_sql = 'CREATE TABLE %s(%s)' % (quoted_table, ','.join(column_defs))
    bdb.sql_execute(create_sql)
    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, new_table)
    return new_table
def create_empty_table(bdb, column_names):
    """Make an empty table with one NUMERIC column per given name.

    The name of the table is whatever `bdb.temp_table_name()` hands
    back; that name is also this function's return value.
    """
    table_name = bdb.temp_table_name()
    quoted_table = sqlite3_quote_name(table_name)
    quoted_columns = [sqlite3_quote_name(cn) for cn in column_names]
    column_schema = ','.join(
        ['%s NUMERIC' % (quoted, ) for quoted in quoted_columns])
    bdb.sql_execute('CREATE TABLE %s(%s)' % (quoted_table, column_schema))
    # Register the new table's columns with BayesDB before handing it out.
    core.bayesdb_table_guarantee_columns(bdb, table_name)
    return table_name
def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False):
    """Read data from a pandas dataframe into a table.

    Each record of `df.to_records()` is inserted as one row; the first
    field of a record (the dataframe index, by pandas' default) goes
    into the `_rowid_` column, so `df` may not have a column with that
    name.  All database work happens inside one savepoint on `bdb`.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true and `table`
        exists, read data into it anyway
    :raises ValueError: if `ifnotexists` is given without `create`, if
        `df` has a `_rowid_` column, if `table` exists and must not (or
        does not exist and is not to be created), or if `df` has
        columns the existing table lacks
    """
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    # XXX Whattakludge!
    idxcol = '_rowid_'
    if idxcol in df.columns:
        raise ValueError('Column `_rowid_\' is not allowed.')
    # Convert the pandas Index to a plain list before concatenating:
    # `[idxcol] + df.columns` is evaluated by pandas as an elementwise
    # operation on the Index, not as list concatenation.
    column_names = [idxcol] + list(df.columns)
    qt = sqlite3_quote_name(table)
    # A real list, because it is consumed more than once below.
    qcns = [sqlite3_quote_name(name) for name in column_names]
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(name for name in df.columns
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
        elif create:
            # The index column is declared explicitly, like every other
            # column, with NUMERIC type.
            schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in df.to_records():
            bdb.sql_execute(sql, row)
Example #4
0
    def dot_describe(self, line):
        '''describe BayesDB entities
        [table(s)|population(s)|variables|generator(s)|model(s)] [<name>...]

        Print a human-readable description of the specified BayesDB
        entities.
        '''
        # One usage text shared by the no-argument case and the
        # unrecognized-entity case, so the two cannot drift apart.
        usage = (
            'Usage: .describe table(s) [<table>...]\n',
            '       .describe population(s) [<pop>...]\n',
            '       .describe variables <pop>\n',
            '       .describe generator(s) [<gen>...]\n',
            '       .describe model(s) <gen> [<model>...]\n',
        )
        # XXX Lousy, lousy tokenizer.
        tokens = line.split()
        if len(tokens) == 0:
            for usage_line in usage:
                self.stdout.write(usage_line)
            return
        # The first token selects the kind of entity; matching is done
        # on its casefolded form.
        entity = casefold(tokens[0])
        if entity in ('table', 'tables'):
            if len(tokens) == 1:
                # No names given: `WHERE 1' selects every row.
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                qualifier = \
                    '(' + ' OR '.join(['tabname = ?' for _p in params]) + ')'
                # Report every missing table before giving up, rather
                # than stopping at the first one.
                ok = True
                for table in params:
                    if not core.bayesdb_has_table(self._bdb, table):
                        self.stdout.write('No such table: %s\n' %
                                          (repr(table), ))
                        ok = False
                if not ok:
                    return
                for table in params:
                    core.bayesdb_table_guarantee_columns(self._bdb, table)
            sql = '''
                SELECT tabname, colno, name, shortname
                    FROM bayesdb_column
                    WHERE %s
                    ORDER BY tabname ASC, colno ASC
            ''' % (qualifier, )
            # NOTE(review): this branch goes through `execute' whereas
            # every other branch uses `sql_execute' -- confirm that is
            # intentional.
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
        elif entity in ('population', 'populations'):
            if len(tokens) == 1:
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                # Numbered placeholders ?1, ?2, ... -- one per name.
                names = ','.join('?%d' % (i + 1, )
                                 for i in range(len(params)))
                qualifier = '(name IN (%s))' % (names, )
                ok = True
                for population in params:
                    if not core.bayesdb_has_population(self._bdb, population):
                        self.stdout.write('No such population: %s\n' %
                                          (repr(population), ))
                        ok = False
                if not ok:
                    return
            with self._bdb.savepoint():
                cursor = self._bdb.sql_execute(
                    '''
                    SELECT id, name, tabname
                        FROM bayesdb_population
                        WHERE %s
                ''' % (qualifier, ), params)
                pretty.pp_cursor(self.stdout, cursor)
        elif entity in ('generator', 'generators'):
            if len(tokens) == 1:
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                names = ','.join('?%d' % (i + 1, ) for i in range(len(params)))
                qualifier = '''
                    (name IN ({names}))
                '''.format(names=names)
                ok = True
                for generator in params:
                    if not core.bayesdb_has_generator(self._bdb, None,
                                                      generator):
                        self.stdout.write('No such generator: %s\n' %
                                          (repr(generator), ))
                        ok = False
                if not ok:
                    return
            sql = '''
                SELECT id, name, tabname, backend
                    FROM bayesdb_generator
                    WHERE %s
            ''' % (qualifier, )
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout,
                                 self._bdb.sql_execute(sql, params))
        elif entity == 'variables':
            # Exactly one population name is required here.
            if len(tokens) != 2:
                self.stdout.write('Usage: .describe variables <population>\n')
                return
            population = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_population(self._bdb, population):
                    self.stdout.write('No such population: %r\n' %
                                      (population, ))
                    return
                population_id = core.bayesdb_get_population(
                    self._bdb, population)
                # NOTE(review): the WHERE clause requires
                # p.id = v.population_id, which drops the rows the LEFT
                # OUTER JOIN would otherwise keep with a NULL `v' --
                # confirm that columns without a variable are meant to
                # be omitted.
                sql = '''
                    SELECT c.colno AS colno, c.name AS name,
                            v.stattype AS stattype, c.shortname AS shortname
                        FROM bayesdb_population AS p,
                            (bayesdb_column AS c LEFT OUTER JOIN
                                bayesdb_variable AS v
                                USING (colno))
                        WHERE p.id = ? AND p.id = v.population_id
                            AND p.tabname = c.tabname
                        ORDER BY colno ASC;
                '''
                cursor = self._bdb.sql_execute(sql, (population_id, ))
                pretty.pp_cursor(self.stdout, cursor)
        elif entity in ('model', 'models'):
            if len(tokens) < 2:
                self.stdout.write('Describe models of what generator?\n')
                return
            generator = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator(self._bdb, None, generator):
                    self.stdout.write('No such generator: %s\n' %
                                      (repr(generator), ))
                    return
                generator_id = core.bayesdb_get_generator(
                    self._bdb, None, generator)
                if len(tokens) == 2:
                    # No model numbers given: describe all models.
                    qualifier = '1'
                else:
                    modelnos = []
                    for token in tokens[2:]:
                        try:
                            modelno = int(token)
                        except ValueError:
                            self.stdout.write('Invalid model number: %s\n' %
                                              (repr(token), ))
                            return
                        else:
                            if not core.bayesdb_generator_has_model(
                                    self._bdb, generator_id, modelno):
                                self.stdout.write('No such model: %d\n' %
                                                  (modelno, ))
                                return
                            modelnos.append(modelno)
                    # Every entry went through int() above, so it is
                    # safe to splice the numbers into the SQL text.
                    qualifier = 'modelno IN (%s)' % \
                        (','.join(str(modelno) for modelno in modelnos), )
                sql = '''
                    SELECT modelno, iterations FROM bayesdb_generator_model
                        WHERE generator_id = ? AND %s
                ''' % (qualifier, )
                cursor = self._bdb.sql_execute(sql, (generator_id, ))
                pretty.pp_cursor(self.stdout, cursor)
        else:
            # Unrecognized entity kind: show the same usage as above.
            for usage_line in usage:
                self.stdout.write(usage_line)
Example #5
0
def instantiate_generator(bdb, gen_name, table, metamodel, columns, ifnotexists=None, default=None):
    """Record generator `gen_name` over `table` and return its id and columns.

    :param bdb: BayesDB instance; all SQL goes through `bdb.sql_execute`
    :param gen_name: name of the generator; must not name a table
    :param table: name of the table the generator is defined over
    :param metamodel: object whose `name()` is stored with the generator
    :param columns: sequence of `(column name, statistical type)` pairs;
        it is iterated more than once
    :param ifnotexists: if true, an existing generator named `gen_name`
        is not an error; its id is returned and no records are written.
        `None` means false.
    :param default: value stored in the generator's `defaultp` field.
        `None` means false.
    :return: `(generator_id, column_list)`, where `column_list` is the
        sorted list of `(colno, name, stattype)` triples for `columns`
    :raises BQLError: if `gen_name` names a table, if it names a
        generator and `ifnotexists` is false, or if `columns` has
        columns missing from `table`, duplicate names, or statistical
        types not in `bayesdb_stattype`
    """
    if ifnotexists is None:
        ifnotexists = False
    if default is None:
        default = False

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, "Name already defined as table: %s" % (repr(gen_name),))

    # Make sure there's no generator by this name unless we were asked
    # to tolerate that.
    generator_already_existed = bool(core.bayesdb_has_generator(bdb, gen_name))
    if generator_already_existed and not ifnotexists:
        raise BQLError(bdb, "Name already defined as generator: %s" % (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    # Create the generator record, unless one is already there.
    if not generator_already_existed:
        generator_sql = """
            INSERT INTO bayesdb_generator
                (name, tabname, metamodel, defaultp)
                VALUES (:name, :table, :metamodel, :defaultp)
        """
        bdb.sql_execute(
            generator_sql, {"name": gen_name, "table": table, "metamodel": metamodel.name(), "defaultp": default}
        )
    # Look the id up by name instead of trusting cursor.lastrowid: when
    # an INSERT OR IGNORE inserts nothing, lastrowid is not the id of
    # the existing generator.
    generator_id = core.bayesdb_get_generator(bdb, gen_name)
    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = """
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    """
    stattype_sql = """
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    """
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {"table": table, "column_name": name})
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {"stattype": stattype})
        if cursor_value(cursor) == 0:
            invalid.add(stattype)
            continue
        column_map[name_folded] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, "No such columns in table %s: %s" % (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, "Duplicate column names: %s" % (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, "Invalid statistical types: %s" % (repr(list(invalid)),))

    # Insert column records.  An existing generator already has its
    # column records, so leave them alone.
    if not generator_already_existed:
        column_sql = """
            INSERT INTO bayesdb_generator_column
                (generator_id, colno, stattype)
                VALUES (:generator_id, :colno, :stattype)
        """
        for name, stattype in columns:
            colno = column_map[casefold(name)]
            bdb.sql_execute(
                column_sql, {"generator_id": generator_id, "colno": colno, "stattype": casefold(stattype)}
            )

    # The returned stattypes are the caller's spellings, not casefolded.
    column_list = sorted((column_map[casefold(name)], name, stattype) for name, stattype in columns)
    return generator_id, column_list
Example #6
0
def instantiate_generator(bdb,
                          gen_name,
                          table,
                          metamodel,
                          columns,
                          default=None):
    """Record generator `gen_name` over `table`; return its id and columns.

    If a generator named `gen_name` already exists, no generator or
    column records are written, but `columns` is still validated
    against `table` and the existing generator's id is returned.

    `columns` is a sequence of `(name, stattype)` pairs and is
    iterated more than once.  `default` is stored in the generator's
    `defaultp` field, with `None` meaning false.

    Return `(generator_id, column_list)`, where `column_list` is the
    sorted list of `(colno, name, stattype)` triples.

    Raise BQLError if `gen_name` names a table, or if `columns` has
    names missing from `table`, duplicate names, or statistical types
    not present in `bayesdb_stattype`.
    """
    defaultp = False if default is None else default

    # A generator may not share its name with a table.
    if core.bayesdb_has_table(bdb, gen_name):
        message = 'Name already defined as table: %s' % (repr(gen_name), )
        raise BQLError(bdb, message)

    # Bring the bayesdb_column table up to date for this table.
    core.bayesdb_table_guarantee_columns(bdb, table)

    preexisting = True if core.bayesdb_has_generator(bdb, gen_name) else False
    if not preexisting:
        # No generator by this name yet, so record one.
        generator_sql = '''INSERT INTO bayesdb_generator
                           (name, tabname, metamodel, defaultp)
                           VALUES (:name, :table, :metamodel, :defaultp)'''
        bindings = {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': defaultp,
        }
        bdb.sql_execute(generator_sql, bindings)
    generator_id = core.bayesdb_get_generator(bdb, gen_name)

    assert generator_id
    assert 0 < generator_id

    # Resolve every column name to its colno, collecting the names
    # that repeat, the names the table lacks, and the statistical
    # types that are not registered.
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    colno_by_name = {}
    repeated, absent, unregistered = set(), set(), set()
    for column_name, stattype in columns:
        folded = casefold(column_name)
        if folded in colno_by_name:
            repeated.add(column_name)
            continue
        colno_cursor = bdb.sql_execute(
            colno_sql, {'table': table, 'column_name': column_name})
        try:
            colno_row = colno_cursor.next()
        except StopIteration:
            absent.add(column_name)
            continue
        colno = colno_row[0]
        assert isinstance(colno, int)
        stattype_cursor = bdb.sql_execute(
            stattype_sql, {'stattype': stattype})
        if cursor_value(stattype_cursor) == 0:
            unregistered.add(stattype)
            continue
        colno_by_name[folded] = colno

    # XXX Would be nice to report these simultaneously.
    if absent:
        message = 'No such columns in table %s: %s' % \
            (repr(table), repr(list(absent)))
        raise BQLError(bdb, message)
    if repeated:
        message = 'Duplicate column names: %s' % (repr(list(repeated)), )
        raise BQLError(bdb, message)
    if unregistered:
        message = 'Invalid statistical types: %s' % \
            (repr(list(unregistered)), )
        raise BQLError(bdb, message)

    if not preexisting:
        # Only a freshly created generator gets column records.
        column_sql = '''
            INSERT INTO bayesdb_generator_column
            (generator_id, colno, stattype)
            VALUES (:generator_id, :colno, :stattype)
        '''
        for column_name, stattype in columns:
            bdb.sql_execute(column_sql, {
                'generator_id': generator_id,
                'colno': colno_by_name[casefold(column_name)],
                'stattype': casefold(stattype),
            })

    # The stattypes handed back are the caller's own spellings.
    column_list = [(colno_by_name[casefold(column_name)], column_name, stattype)
                   for column_name, stattype in columns]
    column_list.sort()
    return generator_id, column_list
Example #7
0
def bayesdb_read_csv(bdb,
                     table,
                     f,
                     header=False,
                     create=False,
                     ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    Every cell is decoded as UTF-8 and stripped of surrounding
    whitespace before it is inserted.  All database work happens
    inside one savepoint on `bdb`.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names;
        otherwise the table's existing columns are used, in order
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if `create` is given without `header`, if
        `ifnotexists` is given without `create`, if `table` exists and
        must not, or if it does not exist and is not to be created
    :raises IOError: if the header is missing, empty, has duplicate
        names, or names columns the existing table lacks, or if a row
        has the wrong number of columns
    """
    if create and not header:
        raise ValueError('Can\'t create table from headerless CSV!')
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        table_exists = core.bayesdb_has_table(bdb, table)
        if table_exists:
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table), ))
        reader = csv.reader(f)
        # Number of the CSV record about to be read, for error messages.
        line = 1
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            # Column names are compared casefolded, so `A' and `a' clash.
            seen = set()
            duplicates = set()
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in seen:
                    duplicates.add(name_folded)
                else:
                    seen.add(name_folded)
            if duplicates:
                raise IOError('Duplicate columns in CSV: %s' %
                              (repr(list(duplicates)), ))
            if create and not table_exists:
                qt = sqlite3_quote_name(table)
                qcns = [sqlite3_quote_name(name) for name in column_names]
                schema = ','.join('%s NUMERIC' % (qcn, ) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(
                    name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown), ))
        else:
            # Guaranteed by the argument checks at the top.
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        # A real list, because it is consumed more than once below.
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                              (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                              (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
            # Advance the counter so the next error names the right
            # record.  It counts records, so it can lag the physical
            # line number after a quoted field that spans lines.
            line += 1
Example #8
0
def bayesdb_read_csv(bdb, table, f, header=False,
        create=False, ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    Every cell is decoded as UTF-8 and stripped of surrounding
    whitespace before it is inserted.  All database work happens
    inside one savepoint on `bdb`.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names;
        otherwise the table's existing columns are used, in order
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if `create` is given without `header`, if
        `ifnotexists` is given without `create`, if `table` exists and
        must not, or if it does not exist and is not to be created
    :raises IOError: if the header is missing, empty, has blank or
        duplicate names, or names columns the existing table lacks, or
        if a row has the wrong number of columns
    """
    if create and not header:
        raise ValueError('Can\'t create table from headerless CSV!')
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        table_exists = core.bayesdb_has_table(bdb, table)
        if table_exists:
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        # Number of the CSV record about to be read, for error messages.
        line = 1
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            if any(len(c) == 0 for c in column_names):
                raise IOError(
                    'Missing column names in header: %s' %repr(column_names))
            # Column names are compared casefolded, so `A' and `a' clash.
            seen = set()
            duplicates = set()
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in seen:
                    duplicates.add(name_folded)
                else:
                    seen.add(name_folded)
            if duplicates:
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not table_exists:
                qt = sqlite3_quote_name(table)
                qcns = [sqlite3_quote_name(name) for name in column_names]
                schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            # Guaranteed by the argument checks at the top.
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        # A real list, because it is consumed more than once below.
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
            # Advance the counter so the next error names the right
            # record.  It counts records, so it can lag the physical
            # line number after a quoted field that spans lines.
            line += 1
Example #9
0
def bayesdb_read_pandas_df(bdb,
                           table,
                           df,
                           create=False,
                           ifnotexists=False,
                           index=None):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true and `table`
        exists, read data into it anyway
    :param str index: name of column for index
    :raises ValueError: if `ifnotexists` is given without `create`, if
        `index` is `None` and the dataframe's index is not integral,
        if `index` is also the name of a dataframe column, if `table`
        exists and must not (or does not exist and is not to be
        created), or if the dataframe has columns the existing table
        lacks

    If `index` is `None`, then the dataframe's index dtype must be
    convertible to int64, and it is mapped to the table's rowids.  If
    the dataframe's index dtype is not convertible to int64, you must
    specify `index` to give a primary key for the table.
    """
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    column_names = [str(column) for column in df.columns]
    if index is None:
        # The index becomes the rowid: it is inserted into `_rowid_'
        # but is not a declared column of the table.
        create_column_names = column_names
        insert_column_names = ['_rowid_'] + column_names
        try:
            key_index = df.index.astype('int64')
        except ValueError:
            raise ValueError('Must specify index name for non-integral index!')
    else:
        if index in df.columns:
            raise ValueError('Index name collides with column name: %r' %
                             (index, ))
        # The index becomes an ordinary, declared column named `index'.
        create_column_names = [index] + column_names
        insert_column_names = create_column_names
        key_index = df.index
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(
                name for name in create_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown), ))
        elif create:

            def column_schema(column_name):
                # Only the user-named index column is the primary key.
                qcn = sqlite3_quote_name(column_name)
                if column_name == index:
                    return '%s NUMERIC PRIMARY KEY' % (qcn, )
                else:
                    return '%s NUMERIC' % (qcn, )

            schema = ','.join(
                column_schema(ccn) for ccn in create_column_names)
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table), ))
        qt = sqlite3_quote_name(table)
        # A real list, because it is consumed more than once below.
        qicns = [sqlite3_quote_name(name) for name in insert_column_names]
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qicns), ','.join('?' for _qicn in qicns))
        # Fetch each row by position: `DataFrame.ix' has been removed
        # from pandas, and a positional lookup always yields exactly
        # one row even when index labels repeat.
        for position, key in enumerate(key_index):
            bdb.sql_execute(sql, (key, ) + tuple(df.iloc[position]))
Example #10
0
def _create_population(bdb, phrase):
    """Create the population described by a parsed CREATE POPULATION phrase.

    Reads `phrase.name`, `phrase.table`, `phrase.ifnotexists`, and
    `phrase.schema`.  Schema clauses are `ast.PopModelVars` (explicit
    stattype), `ast.PopIgnoreVars`, or `ast.PopGuessVars`; a guessed name
    of `*` stands for every base-table column not otherwise mentioned.
    Guessed columns whose stattype comes back as `key` are ignored.

    Inserts a row into `bayesdb_population`, then one row into
    `bayesdb_variable` per variable whose stattype is not `ignore`.
    Returns None.  If the population already exists and
    `phrase.ifnotexists` is true, returns without changing anything.

    Raises BQLError when:
    - the name is already a population and `ifnotexists` is false;
    - a wildcard GUESS is combined with other guessed names;
    - some base-table column has no MODEL/IGNORE/GUESS policy;
    - the schema names missing columns, repeats a column, or uses a
      statistical type absent from `bayesdb_stattype`.
    """
    if core.bayesdb_has_population(bdb, phrase.name):
        if phrase.ifnotexists:
            return
        raise BQLError(
            bdb,
            'Name already defined as population: %r' % (phrase.name, ))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to provide
    # a strategy for each single variable, either MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    # NOTE(review): this row is inserted before the schema is validated;
    # presumably the caller runs us inside a savepoint so that a BQLError
    # below rolls it back -- confirm.
    bdb.sql_execute(
        '''
        INSERT INTO bayesdb_population (name, tabname) VALUES (?, ?)
    ''', (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = [
        (name, s.stattype)
        for s in phrase.schema if isinstance(s, ast.PopModelVars)
        for name in s.names
    ]

    # Extract the ignored columns.
    pop_ignore_vars = [
        (name, 'ignore')
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)
        for name in s.names
    ]

    # Extract the columns to guess.
    pop_guess = [
        name
        for s in phrase.schema if isinstance(s, ast.PopGuessVars)
        for name in s.names
    ]
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r' %
                (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]

    # Perform the guessing.
    pop_guess_vars = []
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we will add
        # to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        # Partition in one pass rather than zip-then-remove: this does not
        # depend on zip() returning a mutable list.
        for col, st in zip(pop_guess, pop_guess_stattypes):
            if st == 'key':
                pop_ignore_vars.append((col, 'ignore'))
            else:
                pop_guess_vars.append((col, st))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = set(casefold(t[0]) for t in pop_all_vars)
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r' %
            (not_found, ))

    # Get a map from variable name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    variable_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in variable_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': phrase.table,
            'column_name': name,
        })
        try:
            # Builtin next() works with both the Python 2 and Python 3
            # iterator protocols.
            row = next(cursor)
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        # `ignore` is accepted here without a bayesdb_stattype lookup hit.
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        variable_map[name] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb,
            'No such columns in table %r: %r' % (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb,
                       'Duplicate column names: %r' % (list(duplicates), ))
    if invalid:
        raise BQLError(bdb,
                       'Invalid statistical types: %r' % (list(invalid), ))

    # Insert variable records; ignored columns get no bayesdb_variable row.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        colno = variable_map[name]
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_variable
                (population_id, name, colno, stattype)
                VALUES (?, ?, ?, ?)
        ''', (population_id, name, colno, stattype))
Example #11
0
def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False,
        index=None):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true and `table`
        exists, read data into it anyway
    :param str index: name of column for index

    If `index` is `None`, then the dataframe's index dtype must be
    convertible to int64, and it is mapped to the table's rowids.  If
    the dataframe's index dtype is not convertible to int64, you must
    specify `index` to give a primary key for the table.

    All table creation and row insertion happens inside one
    `bdb.savepoint()` block.

    :raises ValueError: if `ifnotexists` is given without `create`; if
        the index is not integral and no `index` name is given; if
        `index` collides with a dataframe column; if the table exists
        and `create` is true without `ifnotexists`; if the existing
        table lacks some of the needed columns; or if the table does not
        exist and `create` is false.
    """
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    column_names = [str(column) for column in df.columns]
    if index is None:
        create_column_names = column_names
        insert_column_names = ['_rowid_'] + column_names
        try:
            key_index = df.index.astype('int64')
        except ValueError:
            raise ValueError('Must specify index name for non-integral index!')
    else:
        if index in df.columns:
            raise ValueError('Index name collides with column name: %r'
                % (index,))
        create_column_names = [index] + column_names
        insert_column_names = create_column_names
        key_index = df.index
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(name for name in create_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if unknown:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
        elif create:
            # Only the explicitly named index column becomes the primary
            # key; with index=None the keys go into _rowid_ instead.
            def column_schema(column_name):
                qcn = sqlite3_quote_name(column_name)
                if column_name == index:
                    return '%s NUMERIC PRIMARY KEY' % (qcn,)
                else:
                    return '%s NUMERIC' % (qcn,)
            schema = ','.join(column_schema(ccn)
                for ccn in create_column_names)
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        qt = sqlite3_quote_name(table)
        # Materialize the quoted names: they are consumed twice below, which
        # a lazy map() object would not survive.
        qicns = [sqlite3_quote_name(name) for name in insert_column_names]
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qicns), ','.join('?' for _qicn in qicns))
        # Fetch rows by position, not by label: `df.ix` no longer exists in
        # pandas, and a label lookup yields a whole DataFrame (whose tuple()
        # is the column names) when index labels repeat.
        for position, key in enumerate(key_index):
            bdb.sql_execute(sql, (key,) + tuple(df.iloc[position]))
Example #12
0
def _create_population(bdb, phrase):
    """Create the population described by a parsed CREATE POPULATION phrase.

    Reads `phrase.name`, `phrase.table`, `phrase.ifnotexists`, and
    `phrase.schema`.  When `phrase.name` is None the population takes the
    table's name and is recorded with `implicit = 1`.  Schema clauses are
    `ast.PopModelVars` (explicit stattype), `ast.PopIgnoreVars`, or
    `ast.PopGuessVars`; a guessed name of `*` stands for every base-table
    column not otherwise mentioned.  Guessed columns whose stattype comes
    back as `key` are ignored.

    Inserts a row into `bayesdb_population`, then registers every variable
    whose stattype is not `ignore` via `core.bayesdb_add_variable`.
    Returns None.  If the population already exists and
    `phrase.ifnotexists` is true, returns without changing anything.

    Raises BQLError when:
    - the name is already a population and `ifnotexists` is false;
    - a wildcard GUESS is combined with other guessed names;
    - a column modeled as `numerical` contains string values;
    - some base-table column has no MODEL/IGNORE/GUESS policy;
    - the schema names missing columns, repeats a column, or uses a
      statistical type absent from `bayesdb_stattype`.
    """
    # Retrieve the (possibly implicit) population name.
    population_name = phrase.name or phrase.table
    implicit = 1 if phrase.name is None else 0

    # Handle IF NOT EXISTS.
    if core.bayesdb_has_population(bdb, population_name):
        if phrase.ifnotexists:
            return
        raise BQLError(bdb, 'Name already defined as population: %r' %
            (population_name,))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to provide
    # a strategy for each single variable, either MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    # NOTE(review): this row is inserted before the schema is validated;
    # presumably the caller runs us inside a savepoint so that a BQLError
    # below rolls it back -- confirm.
    bdb.sql_execute('''
        INSERT INTO bayesdb_population (name, tabname, implicit)
            VALUES (?, ?, ?)
    ''', (population_name, phrase.table, implicit))
    population_id = core.bayesdb_get_population(bdb, population_name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = [
        (name, s.stattype)
        for s in phrase.schema if isinstance(s, ast.PopModelVars)
        for name in s.names
    ]

    # Extract the ignored columns.
    pop_ignore_vars = [
        (name, 'ignore')
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)
        for name in s.names
    ]

    # Extract the columns to guess.
    pop_guess = [
        name
        for s in phrase.schema if isinstance(s, ast.PopGuessVars)
        for name in s.names
    ]
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r'
                % (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]

    # Perform the guessing.
    pop_guess_vars = []
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we will add
        # to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        # Each guess result is indexed; element 0 is used as the stattype.
        # Partition in one pass rather than zip-then-remove: this does not
        # depend on zip() returning a mutable list.
        for col, guess in zip(pop_guess, pop_guess_stattypes):
            st = guess[0]
            if st == 'key':
                pop_ignore_vars.append((col, 'ignore'))
            else:
                pop_guess_vars.append((col, st))

    # Ensure no string-valued variables are being modeled as numerical.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars
        if stattype == 'numerical'
            and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(bdb,
            'Column(s) with string values modeled as numerical: %r'
            % (numerical_string_vars, ))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = set(casefold(t[0]) for t in pop_all_vars)
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r'
            % (not_found, ))

    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        # `ignore` is accepted here without a bayesdb_stattype lookup hit.
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        # Record the folded name: the membership test above uses the folded
        # name, so storing the raw spelling would let a repeated column slip
        # through whenever its spelling is not already folded.
        seen_variables.add(name)
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' % (list(duplicates),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' % (list(invalid),))

    # Insert variable records; ignored columns are not registered.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)
Example #13
0
    def dot_describe(self, line):
        '''describe BayesDB entities
        [table(s)|generator(s)|columns|model(s)] [<name>...]

        Print a human-readable description of the specified BayesDB
        entities.
        '''
        # NOTE(review): the docstring above is deliberately left untouched;
        # it presumably doubles as the shell's help text -- confirm before
        # rewording it.
        #
        # Dispatch on the first whitespace-separated word of `line`
        # (case-folded); the remaining words are the names to describe.
        # Everything is written to self.stdout; nothing is returned.

        # XXX Lousy, lousy tokenizer.
        tokens = line.split()
        usage = (
            'Usage: .describe table(s) [<table>...]\n',
            '       .describe generator(s) [<gen>...]\n',
            '       .describe columns <gen>\n',
            '       .describe model(s) <gen> [<model>...]\n',
        )
        if not tokens:
            for usage_line in usage:
                self.stdout.write(usage_line)
            return
        what = casefold(tokens[0])
        names = tokens[1:]

        if what in ('table', 'tables'):
            if not names:
                # No names given: describe the columns of every table.
                params = ()
                qualifier = '1'
            else:
                params = names
                qualifier = \
                    '(' + ' OR '.join(['tabname = ?' for _p in params]) + ')'
                # Report every unknown table before giving up.
                any_missing = False
                for table in params:
                    if not core.bayesdb_has_table(self._bdb, table):
                        self.stdout.write('No such table: %s\n' %
                                          (repr(table),))
                        any_missing = True
                if any_missing:
                    return
                for table in params:
                    core.bayesdb_table_guarantee_columns(self._bdb, table)
            sql = '''
                SELECT tabname, colno, name, shortname
                    FROM bayesdb_column
                    WHERE %s
                    ORDER BY tabname ASC, colno ASC
            ''' % (qualifier,)
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
            return

        if what in ('generator', 'generators'):
            if not names:
                params = ()
                qualifier = '1'
            else:
                params = names
                # Numbered placeholders (?1, ?2, ...) so the same parameters
                # can be referenced twice in the qualifier.
                placeholders = ','.join(
                    '?%d' % (n,) for n in range(1, len(params) + 1))
                qualifier = '''
                    (name IN ({names}) OR (defaultp AND tabname IN ({names})))
                '''.format(names=placeholders)
                any_missing = False
                for generator in params:
                    if not core.bayesdb_has_generator_default(self._bdb,
                            generator):
                        self.stdout.write('No such generator: %s\n' %
                            (repr(generator),))
                        any_missing = True
                if any_missing:
                    return
            sql = '''
                SELECT id, name, tabname, metamodel
                    FROM bayesdb_generator
                    WHERE %s
            ''' % (qualifier,)
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout,
                    self._bdb.sql_execute(sql, params))
            return

        if what == 'columns':
            # Exactly one generator name is required.
            if len(tokens) != 2:
                self.stdout.write('Describe columns of what generator?\n')
                return
            generator = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator_default(self._bdb,
                        generator):
                    self.stdout.write('No such generator: %s\n' %
                        (repr(generator),))
                    return
                generator_id = core.bayesdb_get_generator_default(self._bdb,
                    generator)
                sql = '''
                    SELECT c.colno AS colno, c.name AS name,
                            gc.stattype AS stattype, c.shortname AS shortname
                        FROM bayesdb_generator AS g,
                            (bayesdb_column AS c LEFT OUTER JOIN
                                bayesdb_generator_column AS gc
                                USING (colno))
                        WHERE g.id = ? AND g.id = gc.generator_id
                            AND g.tabname = c.tabname
                        ORDER BY colno ASC;
                '''
                cursor = self._bdb.sql_execute(sql, (generator_id,))
                pretty.pp_cursor(self.stdout, cursor)
            return

        if what in ('model', 'models'):
            if len(tokens) < 2:
                self.stdout.write('Describe models of what generator?\n')
                return
            generator = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator_default(self._bdb,
                        generator):
                    self.stdout.write('No such generator: %s\n' %
                        (repr(generator),))
                    return
                generator_id = core.bayesdb_get_generator_default(self._bdb,
                    generator)
                wanted = tokens[2:]
                if not wanted:
                    qualifier = '1'
                else:
                    # Stop at the first token that is not an existing
                    # model number.
                    modelnos = []
                    for token in wanted:
                        try:
                            modelno = int(token)
                        except ValueError:
                            self.stdout.write('Invalid model number: %s\n' %
                                (repr(token),))
                            return
                        if not core.bayesdb_generator_has_model(
                                self._bdb, generator_id, modelno):
                            self.stdout.write('No such model: %d\n' %
                                (modelno,))
                            return
                        modelnos.append(modelno)
                    qualifier = 'modelno IN (%s)' % \
                        (','.join(str(m) for m in modelnos),)
                sql = '''
                    SELECT modelno, iterations FROM bayesdb_generator_model
                        WHERE generator_id = ? AND %s
                ''' % (qualifier,)
                cursor = self._bdb.sql_execute(sql, (generator_id,))
                pretty.pp_cursor(self.stdout, cursor)
            return

        # Unrecognized keyword.
        for usage_line in usage:
            self.stdout.write(usage_line)
Example #14
0
    def dot_describe(self, line):
        """describe BayesDB entities
        [table(s)|generator(s)|columns|model(s)] [<name>...]

        Print a human-readable description of the specified BayesDB
        entities.
        """
        # NOTE(review): the docstring above is deliberately left untouched;
        # it presumably doubles as the shell's help text -- confirm before
        # rewording it.
        #
        # The first word of `line` (case-folded) selects what to describe;
        # any further words name the entities.  Output goes to self.stdout
        # and the method returns None.
        bdb = self._bdb
        out = self.stdout

        def usage():
            out.write("Usage: .describe table(s) [<table>...]\n")
            out.write("       .describe generator(s) [<gen>...]\n")
            out.write("       .describe columns <gen>\n")
            out.write("       .describe model(s) <gen> [<model>...]\n")

        # XXX Lousy, lousy tokenizer.
        tokens = line.split()
        if not tokens:
            usage()
            return
        command = casefold(tokens[0])
        if command == "table" or command == "tables":
            if len(tokens) > 1:
                params = tokens[1:]
                qualifier = "(" + " OR ".join("tabname = ?" for _p in params) + ")"
                # Report all unknown tables, then bail out if there were any.
                all_found = True
                for tabname in params:
                    if not core.bayesdb_has_table(bdb, tabname):
                        out.write("No such table: %s\n" % (repr(tabname),))
                        all_found = False
                if not all_found:
                    return
                for tabname in params:
                    core.bayesdb_table_guarantee_columns(bdb, tabname)
            else:
                # No names: a constant-true qualifier selects every table.
                params = ()
                qualifier = "1"
            sql = """
                SELECT tabname, colno, name, shortname
                    FROM bayesdb_column
                    WHERE %s
                    ORDER BY tabname ASC, colno ASC
            """ % (qualifier,)
            with bdb.savepoint():
                pretty.pp_cursor(out, bdb.execute(sql, params))
        elif command == "generator" or command == "generators":
            if len(tokens) > 1:
                params = tokens[1:]
                # Numbered placeholders let the qualifier mention the same
                # parameters in both IN lists.
                numbered = ",".join(
                    "?%d" % (position,)
                    for position in range(1, len(params) + 1)
                )
                qualifier = """
                    (name IN ({names}) OR (defaultp AND tabname IN ({names})))
                """.format(names=numbered)
                all_found = True
                for gen_name in params:
                    if not core.bayesdb_has_generator_default(bdb, gen_name):
                        out.write("No such generator: %s\n" % (repr(gen_name),))
                        all_found = False
                if not all_found:
                    return
            else:
                params = ()
                qualifier = "1"
            sql = """
                SELECT id, name, tabname, metamodel
                    FROM bayesdb_generator
                    WHERE %s
            """ % (qualifier,)
            with bdb.savepoint():
                pretty.pp_cursor(out, bdb.sql_execute(sql, params))
        elif command == "columns":
            # Exactly one generator name must follow.
            if len(tokens) != 2:
                out.write("Describe columns of what generator?\n")
                return
            gen_name = tokens[1]
            with bdb.savepoint():
                if not core.bayesdb_has_generator_default(bdb, gen_name):
                    out.write("No such generator: %s\n" % (repr(gen_name),))
                    return
                generator_id = core.bayesdb_get_generator_default(bdb, gen_name)
                sql = """
                    SELECT c.colno AS colno, c.name AS name,
                            gc.stattype AS stattype, c.shortname AS shortname
                        FROM bayesdb_generator AS g,
                            (bayesdb_column AS c LEFT OUTER JOIN
                                bayesdb_generator_column AS gc
                                USING (colno))
                        WHERE g.id = ? AND g.id = gc.generator_id
                            AND g.tabname = c.tabname
                        ORDER BY colno ASC;
                """
                pretty.pp_cursor(out, bdb.sql_execute(sql, (generator_id,)))
        elif command == "model" or command == "models":
            if len(tokens) < 2:
                out.write("Describe models of what generator?\n")
                return
            gen_name = tokens[1]
            with bdb.savepoint():
                if not core.bayesdb_has_generator_default(bdb, gen_name):
                    out.write("No such generator: %s\n" % (repr(gen_name),))
                    return
                generator_id = core.bayesdb_get_generator_default(bdb, gen_name)
                if len(tokens) == 2:
                    qualifier = "1"
                else:
                    # Give up at the first token that is not an existing
                    # model number.
                    modelnos = []
                    for token in tokens[2:]:
                        try:
                            modelno = int(token)
                        except ValueError:
                            out.write("Invalid model number: %s\n" % (repr(token),))
                            return
                        if not core.bayesdb_generator_has_model(
                            bdb, generator_id, modelno
                        ):
                            out.write("No such model: %d\n" % (modelno,))
                            return
                        modelnos.append(modelno)
                    qualifier = "modelno IN (%s)" % (
                        ",".join(str(number) for number in modelnos)
                    )
                sql = """
                    SELECT modelno, iterations FROM bayesdb_generator_model
                        WHERE generator_id = ? AND %s
                """ % (qualifier,)
                pretty.pp_cursor(out, bdb.sql_execute(sql, (generator_id,)))
        else:
            usage()
Example #15
0
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
                          default=None):
    """Record a generator named `gen_name` over `table` and its columns.

    `columns` is a sequence of `(name, stattype)` pairs; it is iterated
    several times.  `metamodel.name()` is stored as the generator's
    metamodel, and `default` (None is treated as False) as its `defaultp`
    flag.

    If a generator with this name already exists, no rows are inserted,
    but the columns are still validated and the result is still returned.

    Returns `(generator_id, column_list)`, where `column_list` is the
    sorted list of `(colno, name, stattype)` triples for `columns`.

    Raises BQLError if `gen_name` is already a table, or if `columns`
    names nonexistent columns, repeats a column name (compared
    case-folded), or uses a statistical type absent from
    `bayesdb_stattype`.
    """
    defaultp = False if default is None else default

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, 'Name already defined as table: %s' %
            (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    generator_already_existed = bool(core.bayesdb_has_generator(bdb, gen_name))
    if not generator_already_existed:
        # Create the generator record.
        generator_sql = '''INSERT INTO bayesdb_generator
                           (name, tabname, metamodel, defaultp)
                           VALUES (:name, :table, :metamodel, :defaultp)'''
        bdb.sql_execute(generator_sql, {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': defaultp,
        })
    generator_id = core.bayesdb_get_generator(bdb, gen_name)

    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for column_name, column_stattype in columns:
        folded = casefold(column_name)
        if folded in column_map:
            duplicates.add(column_name)
            continue
        colno_cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': column_name,
        })
        try:
            row = colno_cursor.next()
        except StopIteration:
            # The query returned no row: the table has no such column.
            missing.add(column_name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        stattype_cursor = bdb.sql_execute(stattype_sql, {
            'stattype': column_stattype,
        })
        if cursor_value(stattype_cursor) == 0:
            invalid.add(column_stattype)
            continue
        column_map[folded] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %s' %
            (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %s' %
            (repr(list(invalid)),))

    if not generator_already_existed:
        # Insert column records, storing the case-folded stattype.
        column_sql = '''
            INSERT INTO bayesdb_generator_column
            (generator_id, colno, stattype)
            VALUES (:generator_id, :colno, :stattype)
        '''
        for column_name, column_stattype in columns:
            bdb.sql_execute(column_sql, {
                'generator_id': generator_id,
                'colno': column_map[casefold(column_name)],
                'stattype': casefold(column_stattype),
            })

    # The returned triples carry the stattype exactly as the caller gave it.
    column_list = [
        (column_map[casefold(column_name)], column_name, column_stattype)
        for column_name, column_stattype in columns
    ]
    column_list.sort()
    return generator_id, column_list