def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true` and `table`
        exists, read data into it anyway
    """
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    # XXX Whattakludge!
    idxcol = '_rowid_'
    if idxcol in df.columns:
        raise ValueError('Column `_rowid_\' is not allowed.')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(name for name in df.columns
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
            column_names = ['_rowid_'] + df.columns
        elif create:
            column_names = [idxcol] + list(df.columns)
            qcns = map(sqlite3_quote_name, column_names)
            schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        qt = sqlite3_quote_name(table)
        qcns = map(sqlite3_quote_name, column_names)
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in df.to_records():
            bdb.sql_execute(sql, row)
Exemple #2
0
def subsample_table_columns(bdb, table, new_table, limit, keep, drop, seed):
    """Return a subsample of the columns in the table."""
    if not bayesdb_has_table(bdb, table):
        raise ValueError('No such table: %s' % (table, ))
    if bayesdb_has_table(bdb, new_table):
        raise ValueError('Table already exists: %s' % (new_table, ))
    keep = map(casefold, keep)
    drop = map(casefold, drop)
    skip = keep + drop
    unknown = [
        column for column in skip
        if not bayesdb_table_has_column(bdb, table, column)
    ]
    if unknown:
        raise ValueError('No such columns: %s' % (unknown, ))
    overlap = [column for column in drop if column in keep]
    if overlap:
        raise ValueError('Cannot both drop and keep columns: %s' % (overlap, ))
    num_sample = limit - len(keep)
    if num_sample < 0:
        raise ValueError('Must sample at least as many columns to keep.')
    subselect_columns = [
        column for column in bayesdb_table_column_names(bdb, table)
        if casefold(column) not in skip
    ]
    rng = np.random.RandomState(seed)
    subsample_columns = rng.choice(subselect_columns,
                                   replace=False,
                                   size=min(len(subselect_columns),
                                            num_sample))
    qt = bql_quote_name(table)
    qnt = bql_quote_name(new_table)
    qc = ','.join(map(bql_quote_name, itertools.chain(keep,
                                                      subsample_columns)))
    cursor = bdb.execute('''
        CREATE TABLE %s AS SELECT %s FROM %s
    ''' % (qnt, qc, qt))
    return cursor_to_df(cursor)
Exemple #3
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as generator: %s' %
                    (repr(phrase.name), ))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as table: %s' %
                    (repr(phrase.name), ))
            if not core.bayesdb_has_generator_default(
                    bdb, phrase.simulation.generator):
                raise BQLError(
                    bdb,
                    'No such generator: %s' % (phrase.simulation.generator, ))
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.nsamples,
                                                  out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.modelno,
                                                  out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                                         out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = \
                [(core.bayesdb_generator_column_number(bdb, generator_id, name),
                        value)
                    for (name, _expression), value in
                        zip(phrase.simulation.constraints, cursor[0][2:])]
            colnos = \
                [core.bayesdb_generator_column_number(bdb, generator_id, name)
                    for name in column_names]
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, ','.join(
                     '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                     for qcn, column_name in zip(qcns, column_names))))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in bqlfn.bayesdb_simulate(bdb,
                                              generator_id,
                                              constraints,
                                              colnos,
                                              modelno=modelno,
                                              numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(
                    bdb, 'Table still in use by generators: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                              core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(
                            bdb,
                            'No such generator: %s' % (repr(cmd.generator), ))
                    generator_id = core.bayesdb_get_generator(
                        bdb, cmd.generator)
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                    set_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id, ))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(
                bdb, 'No such metamodel: %s' % (repr(phrase.metamodel), ))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:

                def instantiate(columns):
                    return instantiate_generator(bdb,
                                                 phrase.name,
                                                 phrase.table,
                                                 metamodel,
                                                 columns,
                                                 default=phrase.default)

                metamodel.create_generator(bdb, phrase.table, phrase.schema,
                                           instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                                        model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    assert False  # XXX
Exemple #4
0
def bayesdb_read_csv(bdb,
                     table,
                     f,
                     header=False,
                     create=False,
                     ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    """
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table), ))
        reader = csv.reader(f)
        line = 1
        if header:
            row = None
            try:
                row = reader.next()
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            column_name_map = {}
            duplicates = set([])
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                              (repr(list(duplicates)), ))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = map(sqlite3_quote_name, column_names)
                schema = ','.join('%s NUMERIC' % (qcn, ) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(
                    name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown), ))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = map(sqlite3_quote_name, column_names)
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                              (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                              (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
Exemple #5
0
def bayesdb_read_csv(bdb, table, f, header=False,
        create=False, ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    """
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        line = 1
        if header:
            row = None
            try:
                row = reader.next()
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            if any(len(c)==0 for c in column_names):
                raise IOError(
                    'Missing column names in header: %s' %repr(column_names))
            column_name_map = {}
            duplicates = set([])
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = map(sqlite3_quote_name, column_names)
                schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = map(sqlite3_quote_name, column_names)
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
Exemple #6
0
def bayesdb_read_pandas_df(bdb,
                           table,
                           df,
                           create=False,
                           ifnotexists=False,
                           index=None):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true` and `table`
        exists, read data into it anyway
    :param str index: name of column for index

    If `index` is `None`, then the dataframe's index dtype must be
    convertible to int64, and it is mapped to the table's rowids.  If
    the dataframe's index dtype is not convertible to int64, you must
    specify `index` to give a primary key for the table.
    """
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    column_names = [str(column) for column in df.columns]
    if index is None:
        create_column_names = column_names
        insert_column_names = ['_rowid_'] + column_names
        try:
            key_index = df.index.astype('int64')
        except ValueError:
            raise ValueError('Must specify index name for non-integral index!')
    else:
        if index in df.columns:
            raise ValueError('Index name collides with column name: %r' %
                             (index, ))
        create_column_names = [index] + column_names
        insert_column_names = create_column_names
        key_index = df.index
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(
                name for name in create_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown), ))
        elif create:
            qccns = map(sqlite3_quote_name, create_column_names)

            def column_schema(column_name, qcn):
                if column_name == index:
                    return '%s NUMERIC PRIMARY KEY' % (qcn, )
                else:
                    return '%s NUMERIC' % (qcn, )

            schema = ','.join(
                column_schema(ccn, qccn)
                for ccn, qccn in zip(create_column_names, qccns))
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table), ))
        qt = sqlite3_quote_name(table)
        qicns = map(sqlite3_quote_name, insert_column_names)
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qicns), ','.join('?' for _qicn in qicns))
        for key, i in zip(key_index, df.index):
            bdb.sql_execute(sql, (key, ) + tuple(df.ix[i]))
Exemple #7
0
         if core.bayesdb_has_table(bdb, cmd.name):
             raise BQLError(
                 bdb, 'Name already defined as table'
                 ': %s' % (repr(cmd.name), ))
         rename_table(bdb, table, cmd.name)
     # Remember the new name for subsequent commands.
     table = cmd.name
 elif isinstance(cmd, ast.AlterTabRenameCol):
     # XXX Need to deal with this in the compiler.
     raise NotImplementedError('Renaming columns'
                               ' not yet implemented.')
     # Make sure the old name exist and the new name does not.
     old_folded = casefold(cmd.old)
     new_folded = casefold(cmd.new)
     if old_folded != new_folded:
         if not core.bayesdb_table_has_column(
                 bdb, table, cmd.old):
             raise BQLError(
                 bdb, 'No such column in table %s'
                 ': %s' % (repr(table), repr(cmd.old)))
         if core.bayesdb_table_has_column(bdb, table, cmd.new):
             raise BQLError(
                 bdb, 'Column already exists'
                 ' in table %s: %s' %
                 (repr(table), repr(cmd.new)))
     # Update bayesdb_column.  Everything else refers
     # to columns by (tabname, colno) pairs rather than
     # by names.
     update_column_sql = '''
         UPDATE bayesdb_column SET name = :new
             WHERE tabname = :table AND name = :old
     '''
Exemple #8
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSimModels):
        assert isinstance(phrase.simulation, ast.SimulateModels)
        with bdb.savepoint():
            # Check if table exists.
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                raise BQLError(
                    bdb,
                    'Name already defined as table: %s' % (phrase.name),
                )
            # Set up schema and create the new table.
            qn = sqlite3_quote_name(phrase.name)
            qcns = map(sqlite3_quote_name, [
                simcol.name if simcol.name is not None else str(simcol.col)
                for simcol in phrase.simulation.columns
            ])
            temp = '' if phrase.temp is None else 'TEMP'
            bdb.sql_execute('''
                CREATE %s TABLE %s (%s)
            ''' % (temp, qn, str.join(',', qcns)))
            # Retrieve the rows.
            rows = simulate_models_rows(bdb, phrase.simulation)
            # Insert the rows into the table.
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in rows:
                bdb.sql_execute(insert_sql, row)
            return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        schema = guess.guess_to_schema(guess.bayesdb_guess_stattypes, bdb,
                                       phrase.table)
        # Print schema to console, so user can edit it and/or copy/paste it into
        # the schema definition when creating a population.
        print schema
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            if core.bayesdb_population_generators(bdb, population_id):
                raise BQLError(
                    bdb,
                    'Population still has generators: %r' % (phrase.name, ))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopStatType):
                    # Check the no metamodels are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb, 'Cannot update statistical types '
                            'for population %s, it has metamodels: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population'
                            ': %s' % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type'
                            ': %r' % (repr(cmd.stattype), ))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        table = core.bayesdb_population_table(bdb, population_id)

        # Find the metamodel, or use the default.
        metamodel_name = phrase.metamodel
        if phrase.metamodel is None:
            metamodel_name = 'cgpm'
        if metamodel_name not in bdb.metamodels:
            raise BQLError(bdb,
                           'No such metamodel: %s' % (repr(metamodel_name), ))
        metamodel = bdb.metamodels[metamodel_name]

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

                # Do any metamodel-specific initialization.
                metamodel.create_generator(bdb,
                                           generator_id,
                                           phrase.schema,
                                           baseline=phrase.baseline)

                # Populate bayesdb_generator_column with any latent
                # variables that metamodel.create_generator has added
                # with bayesdb_add_latent.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id = :generator_id
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds,
                                 program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    assert False  # XXX
Exemple #9
0
def _create_population(bdb, phrase):
    # Retrieve the (possibility implicit) population name.
    population_name = phrase.name or phrase.table
    implicit = 1 if phrase.name is None else 0

    # Handle IF NOT EXISTS.
    if core.bayesdb_has_population(bdb, population_name):
        if phrase.ifnotexists:
            return
        else:
            raise BQLError(bdb, 'Name already defined as population: %r' %
                (population_name,))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to provide
    # a strategy for each single variable, either MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    bdb.sql_execute('''
        INSERT INTO bayesdb_population (name, tabname, implicit)
            VALUES (?, ?, ?)
    ''', (population_name, phrase.table, implicit))
    population_id = core.bayesdb_get_population(bdb, population_name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [[(name, s.stattype) for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopModelVars)]))

    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [[(name, 'ignore') for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)]))

    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        [s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r'
                % (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]
    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we will add
        # to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        pop_guess_vars = zip(pop_guess, [st[0] for st in pop_guess_stattypes])
        migrate = [(col, st) for col, st in pop_guess_vars if st=='key']
        for col, st in migrate:
            pop_guess_vars.remove((col, st))
            pop_ignore_vars.append((col, 'ignore'))
    else:
        pop_guess_vars = []

    # Ensure no string-valued variables are being modeled as numerical.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars
        if stattype == 'numerical'
            and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(bdb,
            'Column(s) with string values modeled as numerical: %r'
            % (numerical_string_vars, ))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = [casefold(t[0]) for t in pop_all_vars]
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r'
            % (not_found, ))

    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        seen_variables.add(nm)
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' % (list(duplicates),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' % (list(invalid),))

    # Insert variable records.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)
Exemple #10
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb,
                        'Name already defined as table: %s' %
                        (repr(phrase.name),))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb, 'Table already exists: %s' %
                        (repr(phrase.name),))
            bayesdb_read_csv_file(
                bdb, phrase.name, phrase.csv, header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                raise BQLError(bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table: %s'
                                % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # If table has implicit population, rename it too.
                    if core.bayesdb_table_has_implicit_population(
                                bdb, cmd.name):
                        populations = \
                            core.bayesdb_table_populations(bdb, cmd.name)
                        assert len(populations) == 1
                        population_name = core.bayesdb_population_name(
                            bdb, populations[0])
                        qt = sqlite3_quote_name(cmd.name)
                        qp = sqlite3_quote_name(population_name)
                        bdb.execute('ALTER POPULATION %s RENAME TO %s'
                            % (qp, qt))
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb, 'No such column in table %s'
                                ': %s' %
                                (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except backends may have the (case-folded) name cached.
                    if old_folded != new_folded:
                        populations_sql = '''
                            SELECT id FROM bayesdb_population WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(populations_sql, (table,))
                        generators = [
                            core.bayesdb_population_generators(
                                bdb, population_id)
                            for (population_id,) in cursor
                        ]
                        for generator_id in set(generators):
                            backend = core.bayesdb_generator_backend(bdb,
                                generator_id)
                            backend.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder('''
                CREATE TEMP TABLE %s (
                    column TEXT,
                    stattype TEXT,
                    num_distinct INTEGER,
                    reason TEXT
                )
            ''' % (qtt,), ())
            for cn, st, ct in zip(column_names, stattypes, distinct_value_counts):
                out.winder('''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt,))
            out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name,))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids]
                raise BQLError(bdb, 'Population %r still has generators: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute('''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id,))
            bdb.sql_execute('''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb, 'No such population: %s' %
                    (repr(population),))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopRenamePop):
                    table = core.bayesdb_population_table(bdb, population_id)
                    # Prevent renaming of implicit population directly, unless
                    # being called by ast.AlterTabRenameTab in which case the
                    # table name and population name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, population_id) \
                            and casefold(population) == casefold(table):
                        raise BQLError(bdb, 'Cannot rename implicit'
                            'population %s; rename base table instead'
                            % (population,))
                    # Make sure nothing else has this name.
                    if casefold(population) != casefold(cmd.name):
                        if core.bayesdb_has_population(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as population' ': %s'
                                % (repr(cmd.name),))
                    # Update bayesdb_population.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_population SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, population_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # If population has implicit generator, rename it too.
                    if core.bayesdb_population_has_implicit_generator(
                            bdb, population_id):
                        generators = core.bayesdb_population_generators(
                            bdb, population_id)
                        assert len(generators) == 1
                        generator_name = core.bayesdb_generator_name(
                            bdb, generators[0])
                        qp = sqlite3_quote_name(cmd.name)
                        qg = sqlite3_quote_name(generator_name)
                        bdb.execute('ALTER GENERATOR %s RENAME TO %s'
                            % (qg, qp,))
                    # Remember the new name for subsequent commands.
                    population = cmd.name
                elif isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(
                            bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'No such variable in base table: %s'
                            % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(
                            bdb, population_id, None, cmd.name):
                        raise BQLError(bdb,
                            'Variable already in population: %s'
                            % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb,
                            'Cannot add variable without any values: %s'
                            % (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute(
                            'SELECT %s FROM %s' % (qc, qt))
                        rows = cursor.fetchall()
                        [stattype, reason] = bayesdb_guess_stattypes(
                            [cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(bdb,
                                'Values in column %s appear to be keys.'
                                % (cmd.name,))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(bdb,
                                'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name,))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'Numerical column contains string values: %r '
                            % (qc,))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(
                            bdb, population_id, cmd.name, stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) generator in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no generators are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(bdb,
                            'Cannot update statistical types for population '
                            '%s, it has generators: %s'
                            % (repr(population), repr(generators),))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not
                        core.bayesdb_has_variable(bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(bdb,
                            'No such variables in population: %s'
                            % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid statistical type: %r'
                            % (repr(cmd.stattype),))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(bdb,
                                'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars,))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(
                            bdb, population_id, None, c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno,) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos,)
                    bdb.sql_execute(
                        update_stattype_sql,
                        (casefold(cmd.stattype), population_id,))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' %
                (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)

        # Find the backend, or use the default.
        backend_name = phrase.backend
        if phrase.backend is None:
            backend_name = 'cgpm'
        if backend_name not in bdb.backends:
            raise BQLError(bdb, 'No such backend: %s' %
                (repr(backend_name),))
        backend = bdb.backends[backend_name]

        # Retrieve the (possibility implicit) generator name.
        generator_name = phrase.name or phrase.population
        implicit = 1 if phrase.name is None else 0

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, generator_name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(generator_name),))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute('''
                    INSERT INTO bayesdb_generator
                        (name, population_id, backend, implicit)
                        VALUES (?, ?, ?, ?)
                ''', (generator_name, population_id, backend.name(), implicit))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, generator_name)
                # Do any backend-specific initialization.
                backend.create_generator(bdb, generator_id, phrase.schema)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            backend = core.bayesdb_generator_backend(bdb, generator_id)

            # Backend-specific destruction.
            backend.drop_generator(bdb, generator_id)

            # Drop latent variables, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_variable WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    population_id = core.bayesdb_generator_population(
                        bdb, generator_id)
                    population = core.bayesdb_population_name(
                        bdb, population_id)
                    # Prevent renaming of implicit generator directly, unless
                    # being called by ast.AlterPopRenamePop in which case the
                    # population name and generator name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, generator_id) \
                            and casefold(generator) == casefold(population):
                        raise BQLError(bdb, 'Cannot rename implicit '
                            'generator; rename base population instead')
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' %
                                (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos if not
                    core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(bdb,
                        'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the backend.
                backend = core.bayesdb_generator_backend(bdb, generator_id)
                backend.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
                if 0 < len(existing):
                    raise BQLError(bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno)
                    VALUES (:generator_id, :modelno)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                })

            # Do backend-specific initialization.
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            backend.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        # WARNING: It is the backend's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the backend's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the backend can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        backend.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
            program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(
                bdb, None, phrase.generator)
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            backend.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' % (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the generator
        generator_id = None
        if phrase.generator:
            if not core.bayesdb_has_generator(bdb, population_id,
                    phrase.generator):
                raise BQLError(bdb,
                    'No such generator: %r' % (phrase.generator,))
            generator_id = core.bayesdb_get_generator(
                bdb, population_id, phrase.generator)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(
                bdb, population_id, None, phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target,))
        colno_target = core.bayesdb_variable_number(
            bdb, population_id, None, phrase.target)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
            generator_id, colno_target)
        if stattype != 'numerical':
            raise BQLError(bdb,
                'Target variable is not numerical: %r' % (phrase.target,))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(
                    bdb, population_id, None, given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(
            colno for colno in colno_givens if colno!= colno_target
        )
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(
            bdb, population_id, generator_id, modelnos, constraints,
            colnos, numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, generator_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(
            target_values, given_values, given_names, stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder('''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt,), ())
        for variable, coef in coefficients:
            out.winder('''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (variable, coef,))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt,))
        out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    assert False                # XXX
Exemple #11
0
def bayesdb_load_codebook_csv_file(bdb, table, pathname):
    """Load a codebook for `table` from the CSV file at `pathname`."""
    codebook = None
    with open(pathname, 'rU') as f:
        reader = csv.reader(f)
        try:
            header = reader.next()
        except StopIteration:
            raise IOError('Empty codebook file')
        header = [unicode(h, 'utf8').strip() for h in header]
        if header != ['name','shortname','description','value_map']:
            raise IOError('Wrong CSV header for codebook')
        codebook = []
        line = 1
        for row in reader:
            if len(row) != 4:
                raise IOError('Wrong number of columns at line %d: %d' %
                    (line, len(row)))
            column_name, _shortname, _description, _value_map_json = row
            codebook.append(row)
            line += 1
    with bdb.savepoint():
        for column_name, shortname, description, value_map_json in codebook:
            if not core.bayesdb_table_has_column(bdb, table, column_name):
                raise IOError('Column does not exist in table %s: %s' %
                    (repr(table), repr(column_name)))
            colno = core.bayesdb_table_column_number(bdb, table, column_name)
            try:
                value_map = dict(json.loads(value_map_json))
            except (ValueError, TypeError):
                if value_map_json == '' or value_map_json.lower() == 'nan':
                    value_map = {}
                else:
                    raise IOError('Invalid value map for column %r: %r' %
                                  (column_name, value_map_json))
            sql = '''
                DELETE FROM bayesdb_column_map
                    WHERE tabname = ? AND colno = ?
            '''
            bdb.sql_execute(sql, (table, colno))
            sql = '''
                INSERT INTO bayesdb_column_map
                    (tabname, colno, key, value)
                    VALUES (?, ?, ?, ?)
            '''
            for key in sorted(value_map.keys()):
                value = value_map[key]
                bdb.sql_execute(sql, (table, colno, key, value))
            sql = '''
                UPDATE bayesdb_column
                    SET shortname = :shortname, description = :description
                    WHERE tabname = :table AND colno = :colno
            '''
            total_changes = bdb._sqlite3.totalchanges()
            bdb.sql_execute(sql, {
                'shortname': shortname,
                'description': description,
                'table': table,
                'colno': colno,
            })
            assert bdb._sqlite3.totalchanges() - total_changes == 1
Exemple #12
0
def _create_population(bdb, phrase):
    if core.bayesdb_has_population(bdb, phrase.name):
        if phrase.ifnotexists:
            return
        else:
            raise BQLError(
                bdb,
                'Name already defined as population: %r' % (phrase.name, ))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to provide
    # a strategy for each single variable, either MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)
    seen_columns = []

    # Create the population record and get the assigned id.
    bdb.sql_execute(
        '''
        INSERT INTO bayesdb_population (name, tabname) VALUES (?, ?)
    ''', (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(
        itertools.chain.from_iterable([[(name, s.stattype) for name in s.names]
                                       for s in phrase.schema
                                       if isinstance(s, ast.PopModelVars)]))

    # Extract the ignored columns.
    pop_ignore_vars = list(
        itertools.chain.from_iterable([[(name, 'ignore') for name in s.names]
                                       for s in phrase.schema
                                       if isinstance(s, ast.PopIgnoreVars)]))

    # Extract the columns to guess.
    pop_guess = list(
        itertools.chain.from_iterable([
            s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)
        ]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r' %
                (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]
    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we will add
        # to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        pop_guess_vars = zip(pop_guess, [st[0] for st in pop_guess_stattypes])
        migrate = [(col, st) for col, st in pop_guess_vars if st == 'key']
        for col, st in migrate:
            pop_guess_vars.remove((col, st))
            pop_ignore_vars.append((col, 'ignore'))
    else:
        pop_guess_vars = []

    # Ensure no string-valued variables are being modeled as numerical.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars if stattype == 'numerical'
        and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(
            bdb, 'Column(s) with string values modeled as numerical: %r' %
            (numerical_string_vars, ))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = [casefold(t[0]) for t in pop_all_vars]
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r' %
            (not_found, ))

    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        seen_variables.add(nm)
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb,
            'No such columns in table %r: %r' % (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb,
                       'Duplicate column names: %r' % (list(duplicates), ))
    if invalid:
        raise BQLError(bdb,
                       'Invalid statistical types: %r' % (list(invalid), ))

    # Insert variable records.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    """

    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if not 'cctype' in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    column_stattypes = dict((casefold(column_name),
                             casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table),))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                cursor = bdb.sql_execute(bdb, (generator_id,))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                        (repr(generator),))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg,))
                bayesdb_create_legacy_generator(bdb, generator, table,
                    column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator),))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:      # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                    (modelno,))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                    ': no view_state in X_L[%u]' %
                    (modelno,))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                        ' in view state %u of X_L[%u]' % (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                        ': non-list for view %u columns in X_L[%u]'
                        % (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                            (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name

        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id,))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(insert_model_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'iterations': iterations,
            })
            bdb.sql_execute(insert_theta_json_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'theta_json': json.dumps(theta),
            })
Exemple #14
0
def bayesdb_load_legacy_models(bdb,
                               generator,
                               table,
                               metamodel,
                               pathname,
                               create=False,
                               ifnotexists=False,
                               gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    """

    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if not 'cctype' in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    column_stattypes = dict(
        (casefold(column_name), casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table), ))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                                 (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                cursor = bdb.sql_execute(bdb, (generator_id, ))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                                     (repr(generator), ))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg, ))
                bayesdb_create_legacy_generator(bdb, generator, table,
                                                column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                                            column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator), ))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:  # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                              (modelno, ))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                              ': no view_state in X_L[%u]' % (modelno, ))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                                  ' in view state %u of X_L[%u]' %
                                  (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                                  ': non-list for view %u columns in X_L[%u]' %
                                  (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                                      (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name

        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id, ))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(
                insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'iterations': iterations,
                })
            bdb.sql_execute(
                insert_theta_json_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'theta_json': json.dumps(theta),
                })
Exemple #15
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = "TEMP " if phrase.temp else ""
            ifnotexists = "IF NOT EXISTS " if phrase.ifnotexists else ""
            out.write("CREATE %sTABLE %s%s AS " % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as generator: %s" % (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as table: %s" % (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb, phrase.simulation.generator):
                raise BQLError(bdb, "No such generator: %s" % (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            cursor = bdb.sql_execute("PRAGMA table_info(%s)" % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table), repr(column_name)),
                    )
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table), repr(column_name)),
                    )
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write("SELECT ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb, phrase.simulation.nsamples, out)
            out.write(", ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb, phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(", ")
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(), out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = [
                (core.bayesdb_generator_column_number(bdb, generator_id, name), value)
                for (name, _expression), value in zip(phrase.simulation.constraints, cursor[0][2:])
            ]
            colnos = [core.bayesdb_generator_column_number(bdb, generator_id, name) for name in column_names]
            bdb.sql_execute(
                "CREATE %sTABLE %s%s (%s)"
                % (
                    "TEMP " if phrase.temp else "",
                    "IF NOT EXISTS " if phrase.ifnotexists else "",
                    qn,
                    ",".join(
                        "%s %s" % (qcn, column_sqltypes[casefold(column_name)])
                        for qcn, column_name in zip(qcns, column_names)
                    ),
                )
            )
            insert_sql = """
                INSERT INTO %s (%s) VALUES (%s)
            """ % (
                qn,
                ",".join(qcns),
                ",".join("?" for qcn in qcns),
            )
            for row in bqlfn.bayesdb_simulate(
                bdb, generator_id, constraints, colnos, modelno=modelno, numpredictions=nsamples
            ):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = "SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?"
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, "Table still in use by generators: %s" % (repr(phrase.name),))
            bdb.sql_execute("DELETE FROM bayesdb_column WHERE tabname = ?", (phrase.name,))
            ifexists = "IF EXISTS " if phrase.ifexists else ""
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute("DROP TABLE %s%s" % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, "No such table: %s" % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + "_temp"
                        while core.bayesdb_has_table(bdb, temp) or core.bayesdb_has_generator(bdb, temp):
                            temp += "_temp"
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined as table" ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined" " as generator: %s" % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError("Renaming columns" " not yet implemented.")
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table, cmd.old):
                            raise BQLError(bdb, "No such column in table %s" ": %s" % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, "Column already exists" " in table %s: %s" % (repr(table), repr(cmd.new))
                            )
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = """
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_column_sql, {"table": table, "old": cmd.old, "new": cmd.new})
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = """
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        """
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id, old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, "No such generator: %s" % (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(bdb, cmd.generator)
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                    set_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                else:
                    assert False, "Invalid alter table command: %s" % (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, "No such metamodel: %s" % (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():

            def instantiate(columns):
                return instantiate_generator(
                    bdb,
                    phrase.name,
                    phrase.table,
                    metamodel,
                    columns,
                    ifnotexists=phrase.ifnotexists,
                    default=phrase.default,
                )

            metamodel.create_generator(bdb, phrase.table, phrase.schema, instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, "No such generator: %s" % (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = """
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            """
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = """
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            """
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = """
                DELETE FROM bayesdb_generator WHERE id = ?
            """
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, "No such generator: %s" % (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined as table" ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined" " as generator: %s" % (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = """
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_generator_sql, (cmd.name, generator_id))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, "Invalid ALTER GENERATOR command: %s" % (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(
                    modelno for modelno in modelnos if not core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                )
            else:
                existing = set(
                    modelno for modelno in modelnos if core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                )
                if 0 < len(existing):
                    raise BQLError(
                        bdb, "Generator %s already has models: %s" % (repr(phrase.generator), sorted(existing))
                    )

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = """
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            """
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {"generator_id": generator_id, "modelno": modelno, "iterations": 0})

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos, model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError("No background analysis -- use WAIT.")
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb,
            generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
        )
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = """
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {"generator_id": generator_id, "modelno": modelno})
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, "No such model" " in generator %s: %s" % (repr(phrase.generator), repr(modelno))
                        )
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = """
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                """
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = """
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {"generator_id": generator_id, "modelno": modelno})
        return empty_cursor(bdb)

    assert False  # XXX
def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False,
        index=None):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true` and `table`
        exists, read data into it anyway
    :param str index: name of column for index

    If `index` is `None`, then the dataframe's index dtype must be
    convertible to int64, and it is mapped to the table's rowids.  If
    the dataframe's index dtype is not convertible to int64, you must
    specify `index` to give a primary key for the table.
    """
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    column_names = [str(column) for column in df.columns]
    if index is None:
        create_column_names = column_names
        insert_column_names = ['_rowid_'] + column_names
        try:
            key_index = df.index.astype('int64')
        except ValueError:
            raise ValueError('Must specify index name for non-integral index!')
    else:
        if index in df.columns:
            raise ValueError('Index name collides with column name: %r'
                % (index,))
        create_column_names = [index] + column_names
        insert_column_names = create_column_names
        key_index = df.index
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(name for name in create_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
        elif create:
            qccns = map(sqlite3_quote_name, create_column_names)
            def column_schema(column_name, qcn):
                if column_name == index:
                    return '%s NUMERIC PRIMARY KEY' % (qcn,)
                else:
                    return '%s NUMERIC' % (qcn,)
            schema = ','.join(column_schema(ccn, qccn)
                for ccn, qccn in zip(create_column_names, qccns))
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        qt = sqlite3_quote_name(table)
        qicns = map(sqlite3_quote_name, insert_column_names)
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qicns), ','.join('?' for _qicn in qicns))
        for key, i in zip(key_index, df.index):
            bdb.sql_execute(sql, (key,) + tuple(df.ix[i]))
Exemple #17
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt, ))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder(
                '''
                CREATE TEMP TABLE %s (column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT)
            ''' % (qtt), ())
            for cn, st, ct in zip(column_names, stattypes,
                                  distinct_value_counts):
                out.winder(
                    '''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt, ))
            out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [
                    core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids
                ]
                raise BQLError(
                    bdb, 'Population %r still has metamodels: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'No such variable in base table: %s' % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(bdb, population_id, None,
                                                 cmd.name):
                        raise BQLError(
                            bdb,
                            'Variable already in population: %s' % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'Cannot add variable without any values: %s' %
                            (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute('SELECT %s FROM %s' %
                                                 (qc, qt))
                        rows = cursor.fetchall()
                        [stattype,
                         reason] = bayesdb_guess_stattypes([cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(
                                bdb, 'Values in column %s appear to be keys.' %
                                (cmd.name, ))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(
                                bdb, 'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name, ))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                                       'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'Numerical column contains string values: %r ' %
                            (qc, ))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(bdb, population_id, cmd.name,
                                                  stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) metamodel in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            # XXX Omit needless bayesdb_generator_column table
                            # Github issue #441.
                            bdb.sql_execute(
                                '''
                                INSERT INTO bayesdb_generator_column
                                    VALUES (:generator_id, :colno, :stattype)
                            ''', {
                                    'generator_id': generator_id,
                                    'colno': colno,
                                    'stattype': stattype,
                                })
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no metamodels are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb,
                            'Cannot update statistical types for population '
                            '%s, it has metamodels: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population: %s' %
                            (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type: %r' %
                            (repr(cmd.stattype), ))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(
                                bdb, 'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars, ))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        table = core.bayesdb_population_table(bdb, population_id)

        # Find the metamodel, or use the default.
        metamodel_name = phrase.metamodel
        if phrase.metamodel is None:
            metamodel_name = 'cgpm'
        if metamodel_name not in bdb.metamodels:
            raise BQLError(bdb,
                           'No such metamodel: %s' % (repr(metamodel_name), ))
        metamodel = bdb.metamodels[metamodel_name]

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

                # Do any metamodel-specific initialization.
                metamodel.create_generator(bdb,
                                           generator_id,
                                           phrase.schema,
                                           baseline=phrase.baseline)

                # Populate bayesdb_generator_column with any latent
                # variables that metamodel.create_generator has added
                # with bayesdb_add_latent.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id = :generator_id
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(
                        bdb, 'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the metamodel.
                metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                metamodel.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds,
                                 program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the metamodel.
        generator_id = None
        if phrase.metamodel:
            if not core.bayesdb_has_generator(bdb, population_id,
                                              phrase.metamodel):
                raise BQLError(bdb,
                               'No such metamodel: %r' % (phrase.population, ))
            generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                      phrase.metamodel)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(bdb, population_id, None,
                                         phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target, ))
        colno_target = core.bayesdb_variable_number(bdb, population_id, None,
                                                    phrase.target)
        if core.bayesdb_variable_stattype(bdb, population_id, colno_target) != \
                'numerical':
            raise BQLError(
                bdb,
                'Target variable is not numerical: %r' % (phrase.target, ))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(bdb, population_id, None,
                                             given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(colno for colno in colno_givens
                                  if colno != colno_target)
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(bdb,
                                      population_id,
                                      generator_id,
                                      modelnos,
                                      constraints,
                                      colnos,
                                      numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(bdb, population_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(target_values, given_values, given_names,
                                   stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder(
            '''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt, ), ())
        for variable, coef in coefficients:
            out.winder(
                '''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (
                    variable,
                    coef,
                ))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt, ))
        out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    assert False  # XXX