Beispiel #1
0
def bql_row_similarity(bdb, population_id, generator_id, modelnos, rowid,
                       target_rowid, colno):
    """Return the mean similarity of `rowid` to `target_rowid`.

    One similarity is requested from the metamodel of every generator
    selected by `_retrieve_generator_ids(bdb, population_id, generator_id)`,
    and the results are combined with `stats.arithmetic_mean`.

    `modelnos` is normalized through `_retrieve_modelnos` before being
    handed to the metamodels.  `colno` is the single context column.

    Raises BQLError if `target_rowid` is None.
    """
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    modelnos = _retrieve_modelnos(modelnos)

    similarities = []
    for gen_id in _retrieve_generator_ids(bdb, population_id, generator_id):
        metamodel = core.bayesdb_generator_metamodel(bdb, gen_id)
        # XXX Change [colno] to colno by updating IBayesDBMetamodel.
        similarity = metamodel.row_similarity(
            bdb, gen_id, modelnos, rowid, target_rowid, [colno])
        similarities.append(similarity)
    return stats.arithmetic_mean(similarities)
Beispiel #2
0
def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
                       *colnos):
    """Return the mean similarity of `rowid` to `target_rowid`.

    `colnos` are the context columns; when none are given, the numbers
    returned by `core.bayesdb_variable_numbers` are used instead.

    If `generator_id` is None, every generator reported by
    `core.bayesdb_population_generators` for the population is consulted;
    otherwise only `generator_id` is.  The per-generator similarities are
    combined with `stats.arithmetic_mean`.

    Raises BQLError if `target_rowid` is None.
    """
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    # colnos is always a tuple here (it comes from *args), so an empty
    # tuple means the caller named no columns.
    if not colnos:
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
                                               generator_id)

    if generator_id is None:
        generator_ids = core.bayesdb_population_generators(bdb, population_id)
    else:
        generator_ids = [generator_id]

    similarities = []
    for gen_id in generator_ids:
        metamodel = core.bayesdb_generator_metamodel(bdb, gen_id)
        # None is passed in the modelnos position.
        similarities.append(
            metamodel.row_similarity(bdb, gen_id, None, rowid, target_rowid,
                                     colnos))
    return stats.arithmetic_mean(similarities)
Beispiel #3
0
def _retrieve_rowid_constraints(bdb, population_id, constraints):
    """Separate a user-supplied rowid from the other `constraints`.

    `constraints` is an iterable of (key, value) pairs, or a falsy value
    when there are none.  A pair whose key is one of
    `core.bayesdb_rowid_tokens(bdb)` names the rowid rather than
    constraining a variable.

    Returns a pair (rowid, constraints):
    - rowid is the value of the single rowid pair if one was given,
      otherwise the result of `core.bayesdb_population_fresh_row_id`;
    - constraints is the input with rowid pairs removed (a falsy input is
      returned unchanged).

    Raises BQLError if more than one rowid pair is given.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if not constraints:
        return rowid, constraints

    # The token set does not depend on the constraint being examined, so
    # look it up once instead of once per constraint per pass.
    rowid_tokens = core.bayesdb_rowid_tokens(bdb)

    user_rowids = [v for c, v in constraints if c in rowid_tokens]
    if len(user_rowids) > 1:
        raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
    if user_rowids:
        rowid = user_rowids[0]

    remaining = [(c, v) for c, v in constraints if c not in rowid_tokens]
    return rowid, remaining
Beispiel #4
0
def bayesdb_generator_cell_value(bdb, generator_id, rowid, colno):
    """Return the value stored at (`rowid`, `colno`) in a generator's table.

    The table and column names are looked up from `generator_id` and
    `colno`, quoted, and used to select the single cell whose `_rowid_`
    equals `rowid`.

    Raises BQLError if the table has no such row.
    """
    table_name = bayesdb_generator_table(bdb, generator_id)
    colname = bayesdb_generator_column_name(bdb, generator_id, colno)
    # Identifiers cannot be bound as parameters, so they are quoted and
    # interpolated; the rowid itself is bound.
    quoted_table = sqlite3_quote_name(table_name)
    quoted_column = sqlite3_quote_name(colname)
    select_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (quoted_column,
                                                          quoted_table)
    cursor = bdb.sql_execute(select_sql, (rowid, ))
    try:
        row = cursor.next()
    except StopIteration:
        generator = bayesdb_generator_name(bdb, generator_id)
        raise BQLError(bdb, 'No such row in %s: %d' % (repr(generator), rowid))
    assert len(row) == 1
    return row[0]
 def _check_loom_initialized(self, bdb, generator_id):
     # Not invoked on a per-query basis due to high overhead.
     cursor = bdb.sql_execute(
         '''
         SELECT COUNT(*)
         FROM bayesdb_loom_row_kind_partition
         WHERE generator_id = ?
     ''', (generator_id, ))
     count_row = cursor.fetchall()
     cursor = bdb.sql_execute(
         '''
         SELECT COUNT(*)
         FROM bayesdb_loom_row_kind_partition
         WHERE generator_id = ?
     ''', (generator_id, ))
     count_col = cursor.fetchall()
     if count_row[0][0] == 0 or count_col[0][0] == 0:
         raise BQLError(bdb, 'Analyze must be run before any BQL'\
             ' queries when using loom.')
 def predictive_relevance(self, bdb, generator_id, modelnos, rowid_target,
                          rowid_queries, hypotheticals, colno):
     """Return, per query row, the fraction of models agreeing with the target.

     For each model in `modelnos` (all models of the generator when
     `modelnos` is None), the kind containing `colno` is looked up and a
     query row scores a hit when its partition id within that kind equals
     the target row's.  The result is a list parallel to `rowid_queries`
     holding hits divided by the number of models.

     Raises BQLError if `hypotheticals` is non-empty.
     """
     if len(hypotheticals) != 0:
         raise BQLError(
             bdb,
             'Loom cannot handle hypothetical rows'
             ' because it is unable to insert rows into CrossCat')
     if modelnos is None:
         modelnos = range(self._get_num_models(bdb, generator_id))

     hits = [0] * len(rowid_queries)
     for modelno in modelnos:
         kind_id = self._get_kind_id(bdb, generator_id, modelno, colno)
         target_partition = self._get_partition_id(
             bdb, generator_id, modelno, kind_id, rowid_target)
         for position, query_rowid in enumerate(rowid_queries):
             query_partition = self._get_partition_id(
                 bdb, generator_id, modelno, kind_id, query_rowid)
             if query_partition == target_partition:
                 hits[position] += 1

     # XXX This procedure appears to be computing the wrong thing.
     n_models = float(len(modelnos))
     return [count / n_models for count in hits]
Beispiel #7
0
 def register(self, bdb):
     """Ensure this metamodel's schema is installed in `bdb`.

     Reads the version recorded for `self.name()` in bayesdb_metamodel,
     treating a missing record as version 0.  At version 0 the statements
     of `std_normal_schema_1` are executed one by one; version 1 needs no
     work.  Everything runs inside a savepoint.

     Raises BQLError for any other recorded version.
     """
     with bdb.savepoint():
         cursor = bdb.sql_execute(
             'SELECT version FROM bayesdb_metamodel WHERE name = ?',
             (self.name(),))
         try:
             row = cursor.next()
         except StopIteration:
             # No record yet: nothing has been installed.
             version = 0
         else:
             version = row[0]
         assert version is not None
         if version == 0:
             # XXX WHATTAKLUDGE!
             for statement in std_normal_schema_1.split(';'):
                 bdb.sql_execute(statement)
         elif version != 1:
             raise BQLError(
                 bdb,
                 'IID-Gaussian already installed'
                 ' with unknown schema version: %d' % (version,))
Beispiel #8
0
def bayesdb_population_cell_value(bdb, population_id, rowid, colno):
    """Return the value stored at (`rowid`, `colno`) in a population's table.

    A negative `colno` denotes a latent variable, which has no column in
    the table, so None is returned without touching the database.

    Otherwise the table and variable names are looked up, quoted, and used
    to select the single cell whose `_rowid_` equals `rowid`.

    Raises BQLError if the table has no such row.
    """
    if colno < 0:
        # Latent variables do not appear in the table.
        return None
    table_name = bayesdb_population_table(bdb, population_id)
    var = bayesdb_variable_name(bdb, population_id, colno)
    # Identifiers cannot be bound as parameters, so they are quoted and
    # interpolated; the rowid itself is bound.
    qt = sqlite3_quote_name(table_name)
    qv = sqlite3_quote_name(var)
    value_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qv, qt)
    value_cursor = bdb.sql_execute(value_sql, (rowid, ))
    try:
        row = value_cursor.next()
    except StopIteration:
        population = bayesdb_population_name(bdb, population_id)
        # Message previously misspelled "individual" as "invidual".
        raise BQLError(
            bdb,
            'No such individual in population %r: %d' % (population, rowid))
    assert len(row) == 1
    return row[0]
Beispiel #9
0
    def _initialize_engine(self, bdb, generator_id, n, variables):
        """Build an Engine with `n` states over the generator's variables.

        `variables` is a sequence of 4-tuples
        (name, stattype, cctype, distargs).  Names are mapped to variable
        numbers for the Engine's outputs, and the data for those variables
        is fetched with `self._data`.

        Raises BQLError naming every variable whose data column contains
        only NaN values.
        """
        population_id = core.bayesdb_generator_population(bdb, generator_id)

        def map_var(var):
            return core.bayesdb_variable_number(bdb, population_id,
                                                generator_id, var)

        # If no variables in the population modeled by the gpmcc, then create 1
        # dummy variable with one measurement. The design space for how to
        # refactor cgpm.crosscat.State to initialize without any variables is
        # not simple, so we will live with this workaround for now.
        if not variables:
            (outputs, cctypes, distargs, gpmcc_data) = \
                [7**10], ['bernoulli'], [None], [[0]]
        else:
            outputs = [map_var(var) for var, _st, _cct, _da in variables]
            cctypes = [cctype for _n, _st, cctype, _da in variables]
            distargs = [distargs for _n, _st, _cct, distargs in variables]
            gpmcc_vars = [var for var, _stattype, _dist, _params in variables]
            gpmcc_data = self._data(bdb, generator_id, gpmcc_vars)
            # If gpmcc_data has any column which is all null, then crash early
            # and notify the user of all offending column names.
            #
            # gpmcc_data is indexed [row][variable], so walk the rows
            # themselves.  The previous code took the row count from
            # len(gpmcc_data[0]), which is the number of variables: it
            # inspected the wrong number of rows and raised IndexError when
            # there were fewer rows than variables (or no rows at all).
            nulls = [
                var for i, var in enumerate(gpmcc_vars)
                if all(math.isnan(row[i]) for row in gpmcc_data)
            ]
            if nulls:
                raise BQLError(
                    bdb, 'Failed to initialize, '
                    'columns have all null values: %s' % repr(nulls))

        return Engine(gpmcc_data,
                      num_states=n,
                      rng=bdb.np_prng,
                      multiprocess=self._ncpu,
                      outputs=outputs,
                      cctypes=cctypes,
                      distargs=distargs)
Beispiel #10
0
 def _from_numeric(self, bdb, generator_id, colno, value):
     """Convert value in cgpm to equivalent bayeslite format."""
     # XXX Latent variables are not associated with an entry in
     # bayesdb_cgpm_category, so just pass through whatever value cgpm
     # returns.
     if colno < 0:
         return value
     if math.isnan(value):
         return None
     stattype = core.bayesdb_generator_column_stattype(
         bdb, generator_id, colno)
     if _is_categorical(stattype):
         cursor = bdb.sql_execute(
             '''
             SELECT value FROM bayesdb_cgpm_category
                 WHERE generator_id = ? AND colno = ? AND code = ?
         ''', (generator_id, colno, value))
         text = cursor_value(cursor, nullok=True)
         if text is None:
             raise BQLError('Invalid category: %r' % (value, ))
         return text
     else:
         return value
Beispiel #11
0
 def create_generator(self, bdb, table, schema, instantiate):
     """Create a generator and record sufficient statistics per column.

     `instantiate(schema)` must return (generator_id, column_list), where
     column_list holds (colno, column_name, stattype) triples.  For each
     column, the count, sum and sum of squares from `data_suff_stats` are
     inserted into bayesdb_nig_normal_column.  All of this happens inside
     one savepoint.

     Raises BQLError if any column's stattype is not 'numerical'.
     """
     # The schema is the column list. May want to change this later
     # to make room for specifying the hyperparameters, etc.
     insert_column_sql = '''
         INSERT INTO bayesdb_nig_normal_column
             (generator_id, colno, count, sum, sumsq)
             VALUES (:generator_id, :colno, :count, :sum, :sumsq)
     '''
     with bdb.savepoint():
         generator_id, column_list = instantiate(schema)
         for colno, column_name, stattype in column_list:
             if stattype != 'numerical':
                 message = (
                     'NIG-Normal only supports'
                     ' numerical columns, but %s is %s'
                     % (repr(column_name), repr(stattype)))
                 raise BQLError(bdb, message)
             count, xsum, sumsq = data_suff_stats(bdb, table, column_name)
             bindings = {
                 'generator_id': generator_id,
                 'colno': colno,
                 'count': count,
                 'sum': xsum,
                 'sumsq': sumsq,
             }
             bdb.sql_execute(insert_column_sql, bindings)
Beispiel #12
0
    def _initialize_cgpm(self, bdb, generator_id, cgpm_ext):
        """Instantiate the CGPM described by `cgpm_ext` and feed it the data.

        `cgpm_ext` is a dict with keys 'name', 'outputs' and 'inputs', and
        optionally 'args' and 'kwds'.  The class registered under 'name' in
        `self._cgpm_registry` is constructed with the output and input
        variable numbers, `rng=bdb.np_prng`, and the extra args/kwds.  Each
        row from `self._data` is then incorporated, outputs first and
        inputs after them within the row.

        Raises BQLError if 'name' is not in the registry.
        """
        population_id = core.bayesdb_generator_population(bdb, generator_id)

        def map_var(var):
            return core.bayesdb_variable_number(bdb, population_id,
                                                generator_id, var)

        name = cgpm_ext['name']
        outputs = [map_var(var) for var in cgpm_ext['outputs']]
        inputs = [map_var(var) for var in cgpm_ext['inputs']]
        args = cgpm_ext.get('args', ())
        kwds = cgpm_ext.get('kwds', {})
        if name not in self._cgpm_registry:
            raise BQLError(bdb, 'Unknown CGPM: %s' % (repr(name), ))
        cls = self._cgpm_registry[name]
        cgpm_vars = cgpm_ext['outputs'] + cgpm_ext['inputs']
        cgpm_data = self._data(bdb, generator_id, cgpm_vars)
        cgpm = cls(outputs, inputs, rng=bdb.np_prng, *args, **kwds)

        # Input values sit after the output values in every row.
        n_outputs = len(outputs)
        for cgpm_rowid, row in enumerate(cgpm_data):
            # CGPMs do not uniformly handle null values or missing
            # values sensibly yet, so until we have that sorted
            # out we both (a) omit nulls and (b) ignore errors in
            # incorporate.
            query = {}
            for i, colno in enumerate(outputs):
                if not math.isnan(row[i]):
                    query[colno] = row[i]
            evidence = {}
            for i, colno in enumerate(inputs):
                if not math.isnan(row[n_outputs + i]):
                    evidence[colno] = row[n_outputs + i]
            try:
                cgpm.incorporate(cgpm_rowid, query, evidence)
            except Exception:
                pass
        return cgpm
Beispiel #13
0
def instantiate_generator(bdb,
                          gen_name,
                          table,
                          metamodel,
                          columns,
                          default=None):
    """Record a generator and its modelled columns, and return its id.

    `columns` is an iterable of (name, stattype) pairs that is walked more
    than once.  `metamodel` only needs a `name()` method here.  `default`
    is stored in the generator's `defaultp` field; None is treated as
    False.

    If a generator called `gen_name` already exists, no generator or
    column rows are inserted, but the columns are still validated and the
    existing generator's id is returned.

    Returns (generator_id, column_list), where column_list is the sorted
    list of (colno, name, stattype) triples.  The stattype in these triples
    is the caller's original spelling, whereas the stored one is casefolded.

    Raises BQLError if `gen_name` is already a table name, or if `columns`
    names a column missing from `table`, repeats a column name (compared
    casefolded), or uses a stattype absent from bayesdb_stattype.  Only the
    first failing category is reported, in that order.

    NOTE(review): validation failures are raised after the generator row
    has been inserted -- presumably callers run this inside a savepoint so
    the insert is rolled back; confirm at the call sites.
    """
    if default is None:
        default = False

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(
            bdb, 'Name already defined as table: %s' % (repr(gen_name), ))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    generator_already_existed = False
    if core.bayesdb_has_generator(bdb, gen_name):
        generator_already_existed = True
    else:
        # Create the generator record.
        generator_sql = '''INSERT INTO bayesdb_generator
                           (name, tabname, metamodel, defaultp)
                           VALUES (:name, :table, :metamodel, :defaultp)'''
        # The returned cursor is not used afterwards.
        cursor = bdb.sql_execute(
            generator_sql, {
                'name': gen_name,
                'table': table,
                'metamodel': metamodel.name(),
                'defaultp': default,
            })
    # Look the id up by name whether the record is new or pre-existing.
    generator_id = core.bayesdb_get_generator(bdb, gen_name)

    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for name, stattype in columns:
        # Duplicates are detected on the casefolded name.
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            # No bayesdb_column row: the table has no such column.
            missing.add(name)
            continue
        else:
            colno = row[0]
            assert isinstance(colno, int)
            # NOTE(review): the stattype is looked up as spelled by the
            # caller, but stored casefolded further down -- confirm the
            # lookup is meant to be case-insensitive.
            cursor = bdb.sql_execute(stattype_sql, {
                'stattype': stattype,
            })
            if cursor_value(cursor) == 0:
                invalid.add(stattype)
                continue
            column_map[casefold(name)] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(
            bdb, 'Duplicate column names: %s' % (repr(list(duplicates)), ))
    if invalid:
        raise BQLError(
            bdb, 'Invalid statistical types: %s' % (repr(list(invalid)), ))

    if not generator_already_existed:
        # Insert column records.
        column_sql = '''
            INSERT INTO bayesdb_generator_column
            (generator_id, colno, stattype)
            VALUES (:generator_id, :colno, :stattype)
        '''
        for name, stattype in columns:
            colno = column_map[casefold(name)]
            # The stored stattype is casefolded.
            stattype = casefold(stattype)
            bdb.sql_execute(
                column_sql, {
                    'generator_id': generator_id,
                    'colno': colno,
                    'stattype': stattype,
                })

    # Sorted by colno first, since colno leads each triple.
    column_list = sorted((column_map[casefold(name)], name, stattype)
                         for name, stattype in columns)
    return generator_id, column_list
Beispiel #14
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as generator: %s' %
                    (repr(phrase.name), ))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as table: %s' %
                    (repr(phrase.name), ))
            if not core.bayesdb_has_generator_default(
                    bdb, phrase.simulation.generator):
                raise BQLError(
                    bdb,
                    'No such generator: %s' % (phrase.simulation.generator, ))
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.nsamples,
                                                  out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.modelno,
                                                  out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                                         out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = \
                [(core.bayesdb_generator_column_number(bdb, generator_id, name),
                        value)
                    for (name, _expression), value in
                        zip(phrase.simulation.constraints, cursor[0][2:])]
            colnos = \
                [core.bayesdb_generator_column_number(bdb, generator_id, name)
                    for name in column_names]
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, ','.join(
                     '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                     for qcn, column_name in zip(qcns, column_names))))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in bqlfn.bayesdb_simulate(bdb,
                                              generator_id,
                                              constraints,
                                              colnos,
                                              modelno=modelno,
                                              numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(
                    bdb, 'Table still in use by generators: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                              core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(
                            bdb,
                            'No such generator: %s' % (repr(cmd.generator), ))
                    generator_id = core.bayesdb_get_generator(
                        bdb, cmd.generator)
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                    set_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id, ))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(
                bdb, 'No such metamodel: %s' % (repr(phrase.metamodel), ))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:

                def instantiate(columns):
                    return instantiate_generator(bdb,
                                                 phrase.name,
                                                 phrase.table,
                                                 metamodel,
                                                 columns,
                                                 default=phrase.default)

                metamodel.create_generator(bdb, phrase.table, phrase.schema,
                                           instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                                        model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    assert False  # XXX
Beispiel #15
0
def simulate_models_rows(bdb, simulation):
    """Evaluate the columns of a model-level simulation and return rows.

    Every entry of `simulation.columns` must be an `ast.SimCol`.  Only
    mutual information expressions can be evaluated; dependence
    probability, probability, and any other expression raise `BQLError`.
    Each column evaluates to a flat list of values, and the lists are
    transposed with `zip` so that every returned tuple carries one value
    per requested column.
    """
    assert all(isinstance(c, ast.SimCol) for c in simulation.columns)
    population_id = core.bayesdb_get_population(bdb, simulation.population)
    if simulation.generator is None:
        generator_id = None
    else:
        if not core.bayesdb_has_generator(bdb, population_id,
                                          simulation.generator):
            raise BQLError(bdb,
                           'No such generator: %r' % (simulation.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                  simulation.generator)

    def retrieve_literal(expression):
        # Unwrap a literal expression into the plain Python value it holds.
        assert isinstance(expression, ast.ExpLit)
        literal = expression.value
        if isinstance(literal, ast.LitNull):
            return None
        if isinstance(literal, (ast.LitInt, ast.LitFloat, ast.LitString)):
            return literal.value
        assert False

    def retrieve_variable(var):
        # Resolve a variable name to its column number, or fail loudly.
        known = core.bayesdb_has_variable(bdb, population_id, generator_id,
                                          var)
        if not known:
            raise BQLError(bdb, 'No such population variable: %s' % (var, ))
        return core.bayesdb_variable_number(bdb, population_id, generator_id,
                                            var)

    def simulate_column(phrase):
        # Reject everything except mutual information up front.
        if isinstance(phrase, ast.ExpBQLDepProb):
            raise BQLError(
                bdb, 'DEPENDENCE PROBABILITY simulation still unsupported.')
        if isinstance(phrase, ast.ExpBQLProb):
            raise BQLError(bdb, 'PROBABILITY OF simulation still unsupported.')
        if not isinstance(phrase, ast.ExpBQLMutInf):
            raise BQLError(
                bdb, 'Only constants can be simulated: %s.' % (simulation, ))

        colno0 = retrieve_variable(phrase.column0)
        colno1 = retrieve_variable(phrase.column1)
        # Flatten the constraints to colno, value, colno, value, ...
        constraint_args = []
        if phrase.constraints is not None:
            for colname, expr in phrase.constraints:
                constraint_args.append(retrieve_variable(colname))
                constraint_args.append(retrieve_literal(expr))
        if phrase.nsamples:
            nsamples = retrieve_literal(phrase.nsamples)
        else:
            nsamples = phrase.nsamples
        # One mi_list per generator of the population.
        mi_lists = bqlfn._bql_column_mutual_information(
            bdb, population_id, generator_id, colno0, colno1, nsamples,
            *constraint_args)
        flattened = []
        for mi_list in mi_lists:
            flattened.extend(mi_list)
        return flattened

    columns = []
    for simcol in simulation.columns:
        columns.append(simulate_column(simcol.col))
    # All queries must return the same number of rows, equal to the number of
    # models of all generators implied by the query.
    assert all(len(column) == len(columns[0]) for column in columns)
    # Transpose: one tuple per row, one entry per column.
    return zip(*columns)
Beispiel #16
0
def bayesdb_simulate(bdb,
                     population_id,
                     constraints,
                     colnos,
                     generator_id=None,
                     numpredictions=1,
                     accuracy=None):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, with a value for each
    column specified in the list `colnos`, conditioned on the
    constraints in the list `constraints` of tuples ``(colno,
    value)``.

    The row id to simulate for and the effective constraints are
    obtained from `_retrieve_rowid_constraints`.

    When more than one generator is involved, the `numpredictions`
    samples are divided among the generators by a multinomial draw from
    `bdb.np_prng`, with probabilities proportional to each generator's
    likelihood of the constraints.

    Raises `BQLError` if every generator assigns zero likelihood to the
    constraints.
    """
    rowid, constraints = _retrieve_rowid_constraints(bdb, population_id,
                                                     constraints)

    def loglikelihood(generator_id, metamodel):
        # With nothing to condition on, every generator is equally likely.
        if not constraints:
            return 0
        return metamodel.logpdf_joint(bdb, generator_id, rowid, constraints,
                                      [], None)

    def simulate(generator_id, metamodel, n):
        return metamodel.simulate_joint(bdb,
                                        generator_id,
                                        rowid,
                                        colnos,
                                        constraints,
                                        None,
                                        num_samples=n,
                                        accuracy=accuracy)

    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    # Use a distinct loop variable so the `generator_id` parameter is not
    # rebound by the comprehension.
    metamodels = [
        core.bayesdb_generator_metamodel(bdb, gid) for gid in generator_ids
    ]
    if len(generator_ids) > 1:
        loglikelihoods = [
            loglikelihood(gid, metamodel)
            for gid, metamodel in zip(generator_ids, metamodels)
        ]
        # Normalize in log space: subtracting the maximum before
        # exponentiating keeps the largest weight at exactly 1, so finite
        # but very negative log-likelihoods no longer underflow to an
        # all-zero vector and get misreported as impossible constraints.
        max_loglikelihood = max(loglikelihoods)
        if max_loglikelihood == float('-inf'):
            # XXX Show the constraints with symbolic names.
            raise BQLError(bdb, 'Impossible constraints: %r' % (constraints, ))
        weights = [
            math.exp(ll - max_loglikelihood) for ll in loglikelihoods
        ]
        total_weight = sum(weights)
        probabilities = [weight / total_weight for weight in weights]
        countses = bdb.np_prng.multinomial(numpredictions,
                                           probabilities,
                                           size=1)
        counts = countses[0]
    else:
        counts = [numpredictions]
    rowses = [
        simulate(gid, metamodel, count)
        for gid, metamodel, count in zip(generator_ids, metamodels, counts)
    ]
    all_rows = [row for rows in rowses for row in rows]
    assert all(isinstance(row, (tuple, list)) for row in all_rows)
    return all_rows
Beispiel #17
0
    def simulate_joint(self,
                       bdb,
                       generator_id,
                       modelnos,
                       rowid,
                       targets,
                       constraints,
                       num_samples=1,
                       accuracy=None):
        """Simulate joint samples of the `targets` columns via Loom.

        `targets` is a list of column numbers to sample, and `constraints`
        is a list of ``(colno, value)`` pairs to condition on; the
        caller's list is left untouched.  If `rowid` is not the
        population's fresh row id, the non-null values stored for that
        row are added as further constraints.  `modelnos` and `accuracy`
        are accepted but not used by this implementation.

        Returns a list with one entry per row of Loom output; each entry
        lists the sampled values in the order of `targets`, with values
        of non-nominal variables converted to float.

        Raises `BQLError` if a column is both constrained explicitly and
        observed in the existing row.
        """
        # Retrieve the population id.
        population_id = bayesdb_generator_population(bdb, generator_id)

        # Work on a private copy: callers may pass the same constraints
        # list to several generators, and extending it in place would
        # make later calls see the row values as spurious overlaps.
        constraints = list(constraints)

        # If rowid exists, retrieve conditioning data from the table.
        # The fresh row id is a property of the population, so it is
        # looked up by population id.
        if rowid != bayesdb_population_fresh_row_id(bdb, population_id):
            row_values_raw = bayesdb_population_row_values(
                bdb, population_id, rowid)
            observed = [
                (colno, str(value) if isinstance(value, unicode) else value)
                for colno, value in enumerate(row_values_raw)
                if value is not None
            ]
            constrained_colnos = set(c[0] for c in constraints)
            if any(entry[0] in constrained_colnos for entry in observed):
                raise BQLError(bdb, 'Overlap between constraints and '
                    'target row in simulate.')
            constraints.extend(observed)

        # Prepare the query row to provide to Loom: targets are left
        # blank, constraints carry their values.
        # NOTE(review): bayesdb_variable_name is given generator_id as its
        # first id argument while bayesdb_variable_stattype below is given
        # population_id -- confirm which one the helper expects.
        query = {}
        target_num_to_name = {}
        for colno in targets:
            name = bayesdb_variable_name(bdb, generator_id, None, colno)
            target_num_to_name[colno] = name
            query[name] = ''
        for (colno, value) in constraints:
            name = bayesdb_variable_name(bdb, generator_id, None, colno)
            query[name] = value

        # Fetch the server.
        server = self._get_cache_entry(bdb, generator_id, 'preql_server')

        # Prepare the csv header.  Headers and values are derived from
        # the same item list so that they stay aligned column by column;
        # headers are sent lowercased and mapped back afterwards.
        items = list(query.items())
        lower_to_upper = dict(
            (str(item[0]).lower(), str(item[0])) for item in items)
        csv_headers = [str(item[0]).lower() for item in items]
        csv_values = [str(item[1]) for item in items]

        # Retrieve the samples from the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers] + [csv_values])
        server._predict(reader, num_samples, writer, False)
        output = writer.result()

        # Parse output: first line is the header, the rest are samples.
        lines = output.strip().split('\r\n')
        returned_headers = [
            lower_to_upper[header]
            for header in lines[0].split(CSV_DELIMITER)
        ]

        # The statistical type of a target does not change between rows,
        # so decide once per target whether its values become floats.
        is_numeric = {}
        for colno in targets:
            stattype = bayesdb_variable_stattype(bdb, population_id, None,
                                                 colno)
            is_numeric[colno] = not _is_nominal(stattype)

        return_list = []
        for line in lines[1:]:
            row_dict = dict(zip(returned_headers, line.split(CSV_DELIMITER)))
            row_values = []
            for colno in targets:
                value = row_dict[target_num_to_name[colno]]
                if is_numeric[colno]:
                    value = float(value)
                row_values.append(value)
            # Add this row to the return list.
            return_list.append(row_values)

        return return_list
Beispiel #18
0
    def grouped_schema():
        """Render the guessed statistical types as a grouped schema string.

        Reads `guesses` from the enclosing scope, a mapping from variable
        name to a ``(type, reason)`` pair.  Variables guessed as nominal
        or numerical are emitted in ``MODEL ... AS <TYPE>`` clauses;
        variables guessed as ignore or key are emitted in an ``IGNORE``
        clause.  Variables with any other guessed type are left out.  A
        non-empty reason is attached to its variable as a ``#`` comment
        wrapped in triple quotes.  Clauses are separated by ``;`` plus a
        line separator, and the final separator is dropped.

        Raises `BQLError` if any variable name is empty.
        """
        nominal = []
        numerical = []
        ignore = []

        for var in guesses:
            if len(var) == 0:
                raise BQLError(bdb, 'Empty column name(s) in table %s' %
                    (tablename,))
            guessed_type_reason = guesses[var]
            guessed_type = guessed_type_reason[0].lower()
            guessed_reason = guessed_type_reason[1]

            if guessed_type == 'nominal':
                nominal.append([var, guessed_reason])
            elif guessed_type == 'numerical':
                numerical.append([var, guessed_reason])
            elif guessed_type == 'ignore':
                ignore.append([var, guessed_reason])
            elif guessed_type == 'key':
                # Keys are ignored too; supply a reason if none was given.
                if len(guessed_reason) > 0:
                    ignore.append([var, guessed_reason])
                else:
                    ignore.append([var, 'This variable is a key.'])

        stattype_var_list_pairs = [
            ['NOMINAL', nominal],
            ['NUMERICAL', numerical],
            ['IGNORE', ignore]
        ]

        # Every clause ends with this terminator; the last one is removed
        # again below.
        terminator = ';%s' % (os.linesep,)
        pieces = []

        for stattype, var_list in stattype_var_list_pairs:
            # Empty names were rejected above, so only skip empty groups.
            if len(var_list) == 0:
                continue

            if stattype == 'IGNORE':
                pieces.append('IGNORE ')
            else:
                pieces.append('MODEL %s ' % (os.linesep,))

            last = len(var_list) - 1
            for i, (var, reason) in enumerate(var_list):
                pieces.append('\t %s' % (var,))

                # Don't append a comma for last item in list; add a space
                # between the last variable and 'AS' for proper parsing.
                pieces.append(',' if i != last else ' ')

                if len(reason) > 0:
                    # Add reason as a comment.
                    pieces.append(" '''# %s" % (reason,))

                # Each variable (and reason) on a separate line.
                pieces.append(os.linesep)

                # If reason was commented on previous line, need triple
                # quote to re-enter schema string.
                if len(reason) > 0:
                    pieces.append("'''")

            if stattype != 'IGNORE':
                pieces.append('AS %s \t %s' % (os.linesep, stattype,))

            pieces.append(terminator)

        schema = ''.join(pieces)
        # Strip last semicolon and newline - not needed at end of schema.
        # Measure the terminator instead of assuming two characters, since
        # os.linesep is '\r\n' on some platforms.
        if schema.endswith(terminator):
            schema = schema[:-len(terminator)]
        return schema
Beispiel #19
0
def _create_population(bdb, phrase):
    """Create the population described by `phrase`.

    Returns without doing anything if the population already exists and
    `phrase.ifnotexists` is set; raises `BQLError` if it exists
    otherwise.  Every column of the base table must be covered by a
    MODEL, IGNORE, or GUESS clause of `phrase.schema`.  Guessed columns
    whose statistical type comes back as `key` are treated as ignored.
    One `bayesdb_variable` record is inserted for each non-ignored
    variable.

    Raises `BQLError` for a wildcard GUESS combined with named
    variables, for columns without a modeling policy, for nonexistent
    columns, for duplicate column names, and for invalid statistical
    types.
    """
    if core.bayesdb_has_population(bdb, phrase.name):
        if not phrase.ifnotexists:
            raise BQLError(
                bdb,
                'Name already defined as population: %r' % (phrase.name, ))
        return

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to provide
    # a strategy for each single variable, either MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    insert_population_sql = '''
        INSERT INTO bayesdb_population (name, tabname) VALUES (?, ?)
    '''
    bdb.sql_execute(insert_population_sql, (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)

    # Population column names paired with their declared stattypes.
    pop_model_vars = [
        (varname, clause.stattype)
        for clause in phrase.schema
        if isinstance(clause, ast.PopModelVars)
        for varname in clause.names
    ]

    # Columns the user asked to ignore.
    pop_ignore_vars = [
        (varname, 'ignore')
        for clause in phrase.schema
        if isinstance(clause, ast.PopIgnoreVars)
        for varname in clause.names
    ]

    # Columns whose stattype is to be guessed.
    pop_guess = [
        varname
        for clause in phrase.schema
        if isinstance(clause, ast.PopGuessVars)
        for varname in clause.names
    ]
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r' %
                (pop_guess, ))
        # The wildcard stands for every base table column that has no
        # explicit MODEL or IGNORE policy.
        avoid = set(
            casefold(pair[0]) for pair in pop_model_vars + pop_ignore_vars)
        pop_guess = [
            column for column in base_table_columns
            if casefold(column) not in avoid
        ]

    # Perform the guessing.
    pop_guess_vars = []
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join([sqlite3_quote_name(column) for column in pop_guess])
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we will add
        # to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        for column, guessed in zip(pop_guess, pop_guess_stattypes):
            if guessed == 'key':
                pop_ignore_vars.append((column, 'ignore'))
            else:
                pop_guess_vars.append((column, guessed))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = set(casefold(pair[0]) for pair in pop_all_vars)
    not_found = [
        column for column in base_table_columns
        if casefold(column) not in known
    ]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r' %
            (not_found, ))

    # Get a map from variable name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    variable_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in variable_map:
            duplicates.add(name)
            continue
        colno_cursor = bdb.sql_execute(colno_sql, {
            'table': phrase.table,
            'column_name': name,
        })
        try:
            row = colno_cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        stattype_cursor = bdb.sql_execute(stattype_sql,
                                          {'stattype': stattype})
        # `ignore` is a policy rather than a registered stattype.
        if cursor_value(stattype_cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        variable_map[name] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb,
            'No such columns in table %r: %r' % (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb,
                       'Duplicate column names: %r' % (list(duplicates), ))
    if invalid:
        raise BQLError(bdb,
                       'Invalid statistical types: %r' % (list(invalid), ))

    # Insert variable records for everything that is not ignored.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        colno = variable_map[name]
        stattype = casefold(st)
        if stattype != 'ignore':
            bdb.sql_execute(
                '''
            INSERT INTO bayesdb_variable
                (population_id, name, colno, stattype)
                VALUES (?, ?, ?, ?)
        ''', (population_id, name, colno, stattype))
Beispiel #20
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    Handles queries, BEGIN / ROLLBACK / COMMIT, and the CREATE TABLE
    variants built from a query, a CSV file, or a simulation.  A
    parametrized phrase is unwrapped first and its parameter counts are
    handed to the compiler together with `bindings`.  Queries return the
    cursor produced by `execute_wound`; the other phrases return an
    empty cursor.
    """
    n_numpar = 0
    nampar_map = None
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    # Otherwise: ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        query_out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, query_out)
        winders, unwinders = query_out.getwindings()
        return execute_wound(bdb, winders, unwinders, query_out.getvalue(),
                             query_out.getbindings())

    # Transaction control phrases map directly onto txn functions.
    transaction_actions = (
        (ast.Begin, txn.bayesdb_begin_transaction),
        (ast.Rollback, txn.bayesdb_rollback_transaction),
        (ast.Commit, txn.bayesdb_commit_transaction),
    )
    for phrase_type, action in transaction_actions:
        if isinstance(phrase, phrase_type):
            action(bdb)
            return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            quoted_name = sqlite3_quote_name(phrase.name)
            temp_kw = ''
            if phrase.temp:
                temp_kw = 'TEMP '
            guard_kw = ''
            if phrase.ifnotexists:
                guard_kw = 'IF NOT EXISTS '
            out.write('CREATE %sTABLE %s%s AS ' %
                      (temp_kw, guard_kw, quoted_name))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
                return empty_cursor(bdb)
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        sim = phrase.simulation
        assert isinstance(sim, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
                return empty_cursor(bdb)
            if not core.bayesdb_has_population(bdb, sim.population):
                raise BQLError(
                    bdb, 'No such population: %s' % (sim.population, ))
            population_id = core.bayesdb_get_population(bdb, sim.population)
            if sim.generator is None:
                generator_id = None
            else:
                if not core.bayesdb_has_generator(bdb, population_id,
                                                  sim.generator):
                    raise BQLError(
                        bdb, 'No such generator: %r' % (sim.generator, ))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, sim.generator)
            table = core.bayesdb_population_table(bdb, population_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            column_names = sim.columns
            qcns = [sqlite3_quote_name(cn) for cn in column_names]

            # Collect the SQL type of each base table column, keyed by
            # casefolded name.
            column_sqltypes = {}
            table_info = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            for _colno, name, sqltype, _nonnull, _default, _primary in \
                    table_info:
                folded = casefold(name)
                assert folded not in column_sqltypes
                column_sqltypes[folded] = sqltype
            assert 0 < len(column_sqltypes)

            # Every simulated column must exist; constraint columns may
            # also be one of the rowid tokens.
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such variable in population %r: %s' %
                        (sim.population, column_name))
            for column_name, _expression in sim.constraints:
                folded = casefold(column_name)
                if (folded not in column_sqltypes
                        and folded not in core.bayesdb_rowid_tokens(bdb)):
                    raise BQLError(
                        bdb, 'No such variable in population %s: %s' %
                        (sim.population, column_name))

            # Evaluate the sample count and constraint values in a single
            # SELECT.
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb, sim.nsamples, out)
            for _column_name, expression in sim.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                evaluated = bdb.sql_execute(out.getvalue(),
                                            out.getbindings()).fetchall()
            assert len(evaluated) == 1
            nsamples = evaluated[0][0]
            assert isinstance(nsamples, int)

            def map_var(var):
                # Rowid tokens pass through casefolded; anything else is
                # resolved to its variable number.
                if casefold(var) in core.bayesdb_rowid_tokens(bdb):
                    return casefold(var)
                return core.bayesdb_variable_number(
                    bdb, population_id, generator_id, var)

            constraints = [
                (map_var(constraint[0]), constraint_value)
                for constraint, constraint_value in zip(
                    sim.constraints, evaluated[0][1:])
            ]
            colnos = [map_var(cn) for cn in column_names]

            # Create the output table with the base table's column types.
            schema = ','.join([
                '%s %s' % (qcn, column_sqltypes[casefold(cn)])
                for qcn, cn in zip(qcns, column_names)
            ])
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, schema))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join(['?'] * len(qcns)))
            simulated_rows = bqlfn.bayesdb_simulate(
                bdb,
                population_id,
                constraints,
                colnos,
                generator_id=generator_id,
                numpredictions=nsamples,
                accuracy=sim.accuracy)
            for row in simulated_rows:
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)
Beispiel #21
0
                    population_id,
                    constraints,
                    colnos,
                    generator_id=generator_id,
                    numpredictions=nsamples,
                    accuracy=phrase.simulation.accuracy):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
Beispiel #22
0
def simulate_models_rows(bdb, simulation):
    """Evaluate the columns of a SIMULATE ... FROM MODELS OF phrase.

    Every entry of `simulation.columns` must be an `ast.SelColExp`.  Each
    column expression is evaluated to a list of values; the per-column
    lists are then transposed into rows with `zip`.

    Supported column expressions are a plain population variable
    (`ast.ExpCol`) and mutual information (`ast.ExpBQLMutInf`).

    Raises BQLError if `simulation.generator` names a generator that does
    not exist, if a referenced variable is not in the population, or if
    a column expression is of an unsupported kind.
    """
    assert all(isinstance(c, ast.SelColExp) for c in simulation.columns)
    population_id = core.bayesdb_get_population(bdb, simulation.population)

    # Resolve the optional generator; None means no generator was named.
    generator_name = simulation.generator
    if generator_name is None:
        generator_id = None
    elif core.bayesdb_has_generator(bdb, population_id, generator_name):
        generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                  generator_name)
    else:
        raise BQLError(bdb, 'No such generator: %r' % (generator_name, ))

    def literal_value(expression):
        # Unwrap a literal AST expression into a plain Python value.
        assert isinstance(expression, ast.ExpLit)
        literal = expression.value
        if isinstance(literal, ast.LitNull):
            return None
        if isinstance(literal, (ast.LitInt, ast.LitFloat, ast.LitString)):
            return literal.value
        assert False

    def variable_number(name):
        # Map a variable name to its column number, or fail loudly.
        if core.bayesdb_has_variable(bdb, population_id, generator_id, name):
            return core.bayesdb_variable_number(bdb, population_id,
                                                generator_id, name)
        raise BQLError(bdb, 'No such population variable: %s' % (name, ))

    def sample_variable(name):
        # XXX This is wrong -- it returns independent samples from
        # the marginals of each variable, not one sample from the
        # joint on all variables.
        if False:
            # Deliberately disabled placeholder for conditional simulation.
            #
            # XXX Gotta weight each model by probability of
            # constraints.
            raise BQLError(
                bdb, 'SIMULATE FROM MODELS OF can\'t sample conditional')
        given = []
        targets = [variable_number(name)]
        accuracy = 1  # XXX Allow nontrivial accuracy?
        draws = bqlfn.bayesdb_simulate(
            bdb, population_id, given, targets,
            generator_id=generator_id,
            numpredictions=1,
            accuracy=accuracy)
        # Each draw is a row over `targets`; keep its single entry.
        return [draw[0] for draw in draws]

    def mutual_information(exp):
        lhs = [variable_number(c) for c in exp.columns0]
        rhs = [variable_number(c) for c in exp.columns1]
        # Constraints are passed flattened: colno, value, colno, value, ...
        flat_constraints = []
        if exp.constraints is not None:
            for colname, expr in exp.constraints:
                colno = variable_number(colname)
                value = literal_value(expr)
                flat_constraints.extend([colno, value])
        nsamples = exp.nsamples
        if nsamples:
            nsamples = literal_value(nsamples)
        # One mi_list per generator of the population.
        #
        # XXX fsaad@20170625: Setting modelnos = None arbitrarily, figure
        # out how to set the modelnos argument.
        mi_lists = bqlfn._bql_column_mutual_information(
            bdb, population_id, generator_id, None, lhs, rhs, nsamples,
            *flat_constraints)
        return [mi for mi_list in mi_lists for mi in mi_list]

    def column_values(exp):
        # Dispatch on the kind of column expression.
        if isinstance(exp, ast.ExpCol):
            return sample_variable(exp.column)
        if isinstance(exp, ast.ExpBQLDepProb):
            raise BQLError(
                bdb, 'DEPENDENCE PROBABILITY simulation still unsupported.')
        if isinstance(exp, ast.ExpBQLProbDensity):
            raise BQLError(
                bdb, 'PROBABILITY DENSITY OF simulation still unsupported.')
        if isinstance(exp, ast.ExpBQLMutInf):
            return mutual_information(exp)
        raise BQLError(
            bdb, 'Only constants can be simulated: %s.' % (simulation, ))

    columns = [column_values(c.expression) for c in simulation.columns]
    # All queries must return the same number of rows, equal to the number of
    # models of all generators implied by the query.
    assert len(set(len(column) for column in columns)) <= 1
    # Convert the columns into rows.
    return zip(*columns)
Beispiel #23
0
 def retrieve_variable(var):
     """Return the variable number of `var`; raise BQLError if unknown.

     `bdb`, `population_id` and `generator_id` are read from the
     enclosing scope, which is not visible in this excerpt.
     """
     known = core.bayesdb_has_variable(bdb, population_id, generator_id,
                                       var)
     if known:
         return core.bayesdb_variable_number(bdb, population_id,
                                             generator_id, var)
     raise BQLError(bdb, 'No such population variable: %s' % (var, ))
Beispiel #24
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exists and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt, ))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder(
                '''
                CREATE TEMP TABLE %s (column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT)
            ''' % (qtt), ())
            for cn, st, ct in zip(column_names, stattypes,
                                  distinct_value_counts):
                out.winder(
                    '''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt, ))
            out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [
                    core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids
                ]
                raise BQLError(
                    bdb, 'Population %r still has metamodels: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'No such variable in base table: %s' % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(bdb, population_id, None,
                                                 cmd.name):
                        raise BQLError(
                            bdb,
                            'Variable already in population: %s' % (cmd.name))
                    # Ensure there is at least one observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'Cannot add variable without any values: %s' %
                            (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute('SELECT %s FROM %s' %
                                                 (qc, qt))
                        rows = cursor.fetchall()
                        [stattype,
                         reason] = bayesdb_guess_stattypes([cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(
                                bdb, 'Values in column %s appear to be keys.' %
                                (cmd.name, ))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(
                                bdb, 'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name, ))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                                       'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'Numerical column contains string values: %r ' %
                            (qc, ))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(bdb, population_id, cmd.name,
                                                  stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) metamodel in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            # XXX Omit needless bayesdb_generator_column table
                            # Github issue #441.
                            bdb.sql_execute(
                                '''
                                INSERT INTO bayesdb_generator_column
                                    VALUES (:generator_id, :colno, :stattype)
                            ''', {
                                    'generator_id': generator_id,
                                    'colno': colno,
                                    'stattype': stattype,
                                })
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check that no metamodels are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb,
                            'Cannot update statistical types for population '
                            '%s, it has metamodels: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population: %s' %
                            (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type: %r' %
                            (repr(cmd.stattype), ))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(
                                bdb, 'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars, ))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        table = core.bayesdb_population_table(bdb, population_id)

        # Find the metamodel, or use the default.
        metamodel_name = phrase.metamodel
        if phrase.metamodel is None:
            metamodel_name = 'cgpm'
        if metamodel_name not in bdb.metamodels:
            raise BQLError(bdb,
                           'No such metamodel: %s' % (repr(metamodel_name), ))
        metamodel = bdb.metamodels[metamodel_name]

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

                # Do any metamodel-specific initialization.
                metamodel.create_generator(bdb,
                                           generator_id,
                                           phrase.schema,
                                           baseline=phrase.baseline)

                # Populate bayesdb_generator_column with any latent
                # variables that metamodel.create_generator has added
                # with bayesdb_add_latent.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id = :generator_id
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(
                        bdb, 'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alterations on the metamodel.
                metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                metamodel.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds,
                                 program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the metamodel.
        generator_id = None
        if phrase.metamodel:
            if not core.bayesdb_has_generator(bdb, population_id,
                                              phrase.metamodel):
                raise BQLError(bdb,
                               'No such metamodel: %r' % (phrase.population, ))
            generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                      phrase.metamodel)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(bdb, population_id, None,
                                         phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target, ))
        colno_target = core.bayesdb_variable_number(bdb, population_id, None,
                                                    phrase.target)
        if core.bayesdb_variable_stattype(bdb, population_id, colno_target) != \
                'numerical':
            raise BQLError(
                bdb,
                'Target variable is not numerical: %r' % (phrase.target, ))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(bdb, population_id, None,
                                             given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(colno for colno in colno_givens
                                  if colno != colno_target)
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(bdb,
                                      population_id,
                                      generator_id,
                                      modelnos,
                                      constraints,
                                      colnos,
                                      numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(bdb, population_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(target_values, given_values, given_names,
                                   stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder(
            '''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt, ), ())
        for variable, coef in coefficients:
            out.winder(
                '''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (
                    variable,
                    coef,
                ))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt, ))
        out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    assert False  # XXX
Beispiel #25
0
        def retrieve_analyze_variables(ast):
            """Resolve which variables a parsed ANALYZE program transitions.

            `bdb`, `population_id` and `generator_id` are read from the
            enclosing scope.

            :param ast: iterable of parsed ANALYZE clauses; each one is
                expected to be a `cgpm_analyze.parse.Variables`, `Skip` or
                `Optimized` instance.
            :returns: pair `(varnos, seen_optimized)`: the variable numbers
                to transition, and whether an OPTIMIZED clause was present.
            :raises BQLError: if more than one VARIABLES/SKIP clause is given,
                if a clause names an unknown variable, or if OPTIMIZED is
                combined with VARIABLES.
            :raises ValueError: if a clause has an unrecognized type.
            """
            only_one_msg = 'Only 1 VARIABLES or SKIP clause allowed in ANALYZE'

            def checked_names(names):
                # Gather every name mentioned by the clause, and reject the
                # clause as a whole if any name is not a known variable.
                mentioned = set()
                unknown = set()
                for var in names:
                    if not core.bayesdb_has_variable(
                            bdb, population_id, generator_id, var):
                        unknown.add(var)
                    mentioned.add(var)
                if unknown:
                    raise BQLError(
                        bdb, 'Unknown variables in ANALYZE: %r' %
                        (sorted(unknown), ))
                return mentioned

            # Transition all variables by default.
            variables = None

            # Exactly 1 VARIABLES or SKIP clause supported for simplicity.
            seen_variables, seen_skip, seen_optimized = False, False, False
            for clause in ast:
                # Transition user specified variables only.
                if isinstance(clause, cgpm_analyze.parse.Variables):
                    if seen_variables or seen_skip:
                        raise BQLError(bdb, only_one_msg)
                    seen_variables = True
                    variables = sorted(checked_names(clause.vars))
                # Transition all variables except user specified skip.
                elif isinstance(clause, cgpm_analyze.parse.Skip):
                    if seen_variables or seen_skip:
                        raise BQLError(bdb, only_one_msg)
                    seen_skip = True
                    excluded = checked_names(clause.vars)
                    all_vars = core.bayesdb_variable_names(
                        bdb, population_id, generator_id)
                    variables = sorted(set(all_vars) - excluded)
                elif isinstance(clause, cgpm_analyze.parse.Optimized):
                    seen_optimized = True
                # Unknown/impossible clause.
                else:
                    # Wrap the operand in a 1-tuple: a bare tuple-valued `ast`
                    # would otherwise be consumed as the argument tuple of `%`
                    # and raise TypeError instead of this error.
                    raise ValueError(
                        'Unknown clause in ANALYZE: %s.' % (ast, ))

            if variables is None:
                variables = core.bayesdb_variable_names(
                    bdb, population_id, generator_id)

            varnos = [
                core.bayesdb_variable_number(bdb, population_id, generator_id,
                                             v) for v in variables
            ]

            # TODO Perform error checking if the OPTIMIZED clause is used.
            # In particular, the variables in OPTIMIZED must correspond
            # EXACTLY to the variables that are modeled by the CrossCat
            # baseline. Avoided this check for now since the nature of a
            # variable is not stored in the bdb. For now, just check the
            # user did not include a VARIABLES clause.
            if seen_optimized:
                if seen_variables:
                    raise BQLError(bdb,
                                   'OPTIMIZED incompatible with VARIABLES')
                # TODO Check if varnos are exactly the CrossCat variables.
                # raise BQLError(bdb,
                #     'The OPTIMIZED phrase in ANALYZE must target all the '
                #     'variables modeled by the baseline, only. '
                #     'Use SKIP to explicitly ignore analysis of overridden '
                #     'variables')

            return varnos, seen_optimized
Beispiel #26
0
    def simulate_joint(self,
                       bdb,
                       generator_id,
                       modelnos,
                       rowid,
                       targets,
                       constraints,
                       num_samples=1,
                       accuracy=None):
        """Sample the `targets` columns jointly through the preql server.

        When `rowid` exists in the population's base table, the non-null
        cells of that row (other than the targets) are added as extra
        conditioning values.  `modelnos` and `accuracy` are accepted but not
        consulted here.

        :param targets: column numbers to simulate.
        :param constraints: list of `(colno, value)` pairs to condition on.
        :param num_samples: number of samples requested from the server.
        :returns: one list per simulated row, holding the values of
            `targets` in order; values of nominal columns are returned as
            produced by the server, all others are converted with `float`.
        :raises BQLError: if a constraint names a column that is already
            observed in the row `rowid`.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        base_table = bayesdb_population_table(bdb, population_id)

        # Begin with the caller's constraints; they are extended with observed
        # cells below whenever the row is present in the base table, whether
        # or not that row was incorporated into the Loom model.
        conditions = constraints
        if bayesdb_table_has_rowid(bdb, base_table, rowid):
            all_colnos = bayesdb_variable_numbers(
                bdb, population_id, generator_id)
            cell_values = bayesdb_population_row_values(
                bdb, population_id, rowid)
            observed = []
            for colno, cell in zip(all_colnos, cell_values):
                if cell is None or colno in targets:
                    continue
                observed.append((colno, cell))
            # A constraint may not override a cell that is already observed.
            constrained_colnos = set(entry[0] for entry in constraints)
            observed_colnos = set(entry[0] for entry in observed)
            if constrained_colnos & observed_colnos:
                raise BQLError(
                    bdb,
                    'Overlap between constraints and target row in simulate.')
            conditions = constraints + observed

        # Look up the name and the stattype of every target column.
        name_of = {}
        for colno in targets:
            name_of[colno] = bayesdb_variable_name(
                bdb, generator_id, None, colno)
        stattype_of = {}
        for colno in targets:
            stattype_of[colno] = bayesdb_variable_stattype(
                bdb, population_id, None, colno)

        # The request row leaves targets blank and fills in the conditions.
        blank_cells = {name_of[colno]: '' for colno in targets}
        fixed_cells = {
            bayesdb_variable_name(bdb, generator_id, None, colno): value
            for colno, value in conditions
        }
        request = dict(
            itertools.chain(blank_cells.iteritems(),
                            fixed_cells.iteritems()))

        server = self._get_preql_server(bdb, generator_id)

        # Header and value lines of the CSV request sent to the server.
        header_cells = [str(key) for key in request.iterkeys()]
        value_cells = [str(val) for val in request.itervalues()]

        # The writer collects the server's CSV reply in an in-memory buffer.
        sink = StringIO()
        writer = loom.preql.CsvWriter(sink, returns=sink.getvalue)
        server._predict(
            iter([header_cells, value_cells]), num_samples, writer, False)

        # First line of the reply is the header, the rest are the samples.
        reply_lines = writer.result().strip().split('\r\n')
        columns = reply_lines[0].split(CSV_DELIMITER)

        samples = []
        for line in reply_lines[1:]:
            simulated = dict(zip(columns, line.split(CSV_DELIMITER)))
            sample = []
            for colno in targets:
                colname = name_of[colno]
                stattype = stattype_of[colno]
                raw = simulated[colname]
                if _is_nominal(stattype):
                    sample.append(raw)
                else:
                    sample.append(float(raw))
            samples.append(sample)
        return samples
Beispiel #27
0
def _retrieve_analyze_variables(bdb, generator_id, ast):
    """Interpret the clauses of a parsed ANALYZE program.

    :param ast: iterable of parsed clauses, each an instance of
        `cgpm_analyze.parse.Variables`, `Skip` or `Optimized`.
    :returns: pair `(variable_numbers, seen_optimized)`; `variable_numbers`
        is None when no variables were selected by the program, otherwise
        the list of numbers of the selected variables.
    :raises BQLError: on more than one VARIABLES/SKIP clause, on unknown
        variable names, on an unrecognized clause, or when OPTIMIZED is
        combined with VARIABLES or SKIP.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)

    def _checked_names(names):
        # Collect all names of a clause; fail if any is not a known variable.
        mentioned, missing = set(), set()
        for name in names:
            if not core.bayesdb_has_variable(bdb, population_id,
                                             generator_id, name):
                missing.add(name)
            mentioned.add(name)
        if missing:
            raise BQLError(
                bdb,
                'Unknown variables in ANALYZE: %r' % (sorted(missing), ))
        return mentioned

    # None means: no explicit selection, transition everything.
    variables = None
    seen_variables = seen_skip = seen_optimized = False

    for clause in ast:
        selects = isinstance(clause, cgpm_analyze.parse.Variables)

        # VARIABLES and SKIP share validation; at most one of them may occur.
        if selects or isinstance(clause, cgpm_analyze.parse.Skip):
            if seen_variables or seen_skip:
                raise BQLError(
                    bdb, 'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            named = _checked_names(clause.vars)
            if selects:
                # Transition the user specified variables only.
                seen_variables = True
                variables = sorted(named)
            else:
                # Transition every variable except the skipped ones.
                seen_skip = True
                everything = core.bayesdb_variable_names(
                    bdb, population_id, generator_id)
                variables = sorted(set(everything) - named)
            continue

        if isinstance(clause, cgpm_analyze.parse.Optimized):
            seen_optimized = True
            continue

        # Unknown/impossible clause.
        raise BQLError(bdb, 'Unknown clause in ANALYZE: %s.' % (ast, ))

    # OPTIMIZED is incompatible with any other clause.
    if seen_optimized and (seen_variables or seen_skip):
        raise BQLError(bdb, 'OPTIMIZED incompatible with other clauses.')

    if not variables:
        return (None, seen_optimized)

    variable_numbers = []
    for name in variables:
        variable_numbers.append(
            core.bayesdb_variable_number(bdb, population_id, generator_id,
                                         name))
    return (variable_numbers, seen_optimized)
Beispiel #28
0
    def analyze_models(self, bdb, generator_id, modelnos=None,
                       iterations=None, max_seconds=None,
                       ckpt_iterations=None, ckpt_seconds=None,
                       program=None):
        """Run analysis on the generator's engine and store the result.

        :param modelnos: must be empty/None; per-model analysis is rejected.
        :param iterations: passed to the engine transitions as `N`.
        :param max_seconds: passed to the engine transitions as `S`.
        :param ckpt_iterations: must be None; checkpointing is rejected.
        :param ckpt_seconds: must be None; checkpointing is rejected.
        :param program: ANALYZE sub-program handed to
            `cgpm_analyze.parse.parse`; None is treated as an empty list.
        :raises NotImplementedError: if `modelnos` is non-empty or any
            checkpoint argument is given.
        :raises BQLError: if `max_seconds` is given while both baseline and
            foreign variables are targeted.
        """
        # Not sure why model-based analysis is useful.
        if modelnos:
            raise NotImplementedError('CGpm analysis by models not supported.')
        # XXX https://github.com/probcomp/cgpm/issues/167
        if not (ckpt_iterations is None and ckpt_seconds is None):
            raise NotImplementedError(
                'CGpm analysis checkpoint not supported.')

        engine = self._engine(bdb, generator_id)

        # Work out which variables the user asked to transition, if any.
        clauses = cgpm_analyze.parse.parse([] if program is None else program)
        requested, use_lovecat = _retrieve_analyze_variables(
            bdb, generator_id, clauses)

        if not use_lovecat:
            # XXX Retrieve all, baseline, and foreign variable indices.
            state = engine.states[0]
            baseline_all = state.outputs
            foreign_all = [
                output
                for hooked in state.hooked_cgpms.itervalues()
                for output in hooked.outputs
            ]

            # Without a user selection only the baseline is transitioned;
            # otherwise split the selection into baseline and foreign parts.
            baseline_targets = baseline_all
            foreign_targets = None
            if requested:
                baseline_targets = [
                    var for var in requested if var in baseline_all
                ]
                foreign_targets = [
                    var for var in requested if var in foreign_all
                ]

            assert baseline_targets or foreign_targets

            # Timed analysis is incompatible with mixed baseline and foreign.
            if max_seconds and baseline_targets and foreign_targets:
                raise BQLError(
                    bdb,
                    'Timed analysis accepts foreign xor baseline variables.')

            if baseline_targets:
                engine.transition(
                    N=iterations,
                    S=max_seconds,
                    cols=baseline_targets,
                    multiprocess=self._multiprocess)
            if foreign_targets:
                engine.transition_foreign(
                    N=iterations,
                    S=max_seconds,
                    cols=foreign_targets,
                    multiprocess=self._multiprocess)
        else:
            # OPTIMIZED: transition the baseline variables using lovecat.
            engine.transition_lovecat(
                N=iterations,
                S=max_seconds,
                multiprocess=self._multiprocess)

        # Persist the serialized engine for this generator.
        update_engine_sql = '''
            UPDATE bayesdb_cgpm_generator
                SET engine_json = :engine_json
                WHERE generator_id = :generator_id
        '''
        bindings = {
            'generator_id': generator_id,
            'engine_json': json_dumps(engine.to_metadata()),
        }
        bdb.sql_execute(update_engine_sql, bindings)
Beispiel #29
0
def _create_schema(bdb, generator_id, schema_ast, **kwargs):
    # Get some parameters.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)

    # State.
    variables = []
    variable_dist = {}
    latents = {}
    cgpm_composition = []
    modelled = set()
    default_modelled = set()
    subsample = None
    deferred_input = defaultdict(lambda: [])
    deferred_output = dict()

    # Error-reporting state.
    duplicate = set()
    unknown = set()
    needed = set()
    existing_latent = set()
    must_exist = []
    unknown_stattype = {}

    # XXX Convert all Foreign.exposed lists to Latent clauses.
    # Retrieve Foreign clauses with exposed variables.
    foreign_clauses = [
        c for c in schema_ast
        if isinstance(c, cgpm_schema.parse.Foreign) and len(c.exposed) > 0
    ]
    # Add the exposed variables to Foreign.outputs
    # Note that this assumes if there are K exposed variables, then they are
    # necessarily the last K outputs of the fc.outputs.
    for fc in foreign_clauses:
        fc.outputs.extend([e[0] for e in fc.exposed])

    # Convert exposed entries into Latent clauses.
    latent_vars = list(
        itertools.chain.from_iterable(c.exposed for c in foreign_clauses))
    latent_clauses = [cgpm_schema.parse.Latent(v, s) for (v, s) in latent_vars]
    # Append the Latent clauses to the ast.
    schema_ast.extend(latent_clauses)

    # XXX Convert the baseline to a Foreign clause.
    # Currently the baselines do not accept a schema, and will fail if
    # `schema_ast` has any entries.
    baseline = kwargs.get('baseline', None)
    if baseline is not None and casefold(baseline.name) != 'crosscat':
        if schema_ast:
            raise BQLError(
                bdb, 'Cannot accept schema with baseline: %s.' % schema_ast)
        # Retrieve all variable names in the population
        outputs = core.bayesdb_variable_names(bdb, population_id, None)
        # Convert the LITERAL namedtuples to their raw values.
        ps, vs = zip(*baseline.params)
        vs_new = [v.value for v in vs]
        params = zip(ps, vs_new)
        # Create the clause.
        clause = cgpm_schema.parse.Foreign(outputs, [], [], baseline.name,
                                           params)
        # And add append it to the schema_ast.
        schema_ast.append(clause)

    # Process each clause one by one.
    for clause in schema_ast:

        if isinstance(clause, cgpm_schema.parse.Basic):
            # Basic Crosscat component model: one variable to be put
            # into Crosscat views.
            var = clause.var
            dist = clause.dist
            params = dict(clause.params)  # XXX error checking

            # Reject if the variable does not exist.
            if not core.bayesdb_has_variable(bdb, population_id, None, var):
                unknown.add(var)
                continue

            # Reject if the variable has already been modelled.
            if var in modelled:
                duplicate.add(var)
                continue

            # Reject if the variable is latent.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Get the column number.
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            assert 0 <= colno

            # Add it to the list and mark it modelled by default.
            stattype = core.bayesdb_variable_stattype(bdb, population_id,
                                                      colno)
            variables.append([var, stattype, dist, params])
            assert var not in variable_dist
            variable_dist[var] = (stattype, dist, params)
            modelled.add(var)
            default_modelled.add(var)

        elif isinstance(clause, cgpm_schema.parse.Latent):
            var = clause.name
            stattype = clause.stattype

            # Reject if the variable has already been modelled by the
            # default model.
            if var in default_modelled:
                duplicate.add(var)
                continue

            # Reject if the variable even *exists* in the population
            # at all yet.
            if core.bayesdb_has_variable(bdb, population_id, None, var):
                duplicate.add(var)
                continue

            # Reject if the variable is already latent, from another
            # generator.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Reject if we've already processed it.
            if var in latents:
                duplicate.add(var)
                continue

            # Add it to the set of latent variables.
            latents[var] = stattype

        elif isinstance(clause, cgpm_schema.parse.Foreign):
            # Foreign model: some set of output variables is to be
            # modelled by foreign logic, possibly conditional on some
            # set of input variables.
            #
            # Gather up the state for a cgpm_composition record, which
            # we may have to do incrementally because it must refer to
            # the distribution types of variables we may not have
            # seen.
            name = clause.name
            outputs = clause.outputs
            inputs = clause.inputs

            output_stattypes = []
            output_statargs = []
            input_stattypes = []
            input_statargs = []
            distargs = {
                'inputs': {
                    'stattypes': input_stattypes,
                    'statargs': input_statargs
                },
                'outputs': {
                    'stattypes': output_stattypes,
                    'statargs': output_statargs,
                }
            }
            kwds = {'distargs': distargs}
            kwds.update(clause.params)

            # First make sure all the output variables exist and have
            # not yet been modelled.
            for var in outputs:
                must_exist.append(var)
                if var in modelled:
                    duplicate.add(var)
                    continue
                modelled.add(var)
                # Add the output statistical type and its parameters.
                i = len(output_stattypes)
                assert i == len(output_statargs)
                output_stattypes.append(None)
                output_statargs.append(None)
                deferred_output[var] = (output_stattypes, output_statargs, i)

            # Next make sure all the input variables exist, mark them
            # needed, and record where to put their distribution type
            # and parameters.
            for var in inputs:
                must_exist.append(var)
                needed.add(var)
                i = len(input_stattypes)
                assert i == len(input_statargs)
                input_stattypes.append(None)
                input_statargs.append(None)
                deferred_input[var].append(
                    (input_stattypes, input_statargs, i))

            # Finally, add a cgpm_composition record.
            cgpm_composition.append({
                'name': name,
                'inputs': inputs,
                'outputs': outputs,
                'kwds': kwds,
            })

        elif isinstance(clause, cgpm_schema.parse.Subsample):
            if subsample is not None:
                raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n, ))
            subsample = clause.n

        else:
            raise BQLError(bdb, 'Unknown clause: %r' % (clause, ))

    # Make sure all the outputs and inputs exist, either in the
    # population or as latents in this generator.
    for var in must_exist:
        if core.bayesdb_has_variable(bdb, population_id, None, var):
            continue
        if var in latents:
            continue
        unknown.add(var)

    # Raise an exception if there were duplicates or unknown
    # variables.
    if duplicate:
        raise BQLError(bdb,
                       'Duplicate model variables: %r' % (sorted(duplicate), ))
    if existing_latent:
        raise BQLError(
            bdb, 'Latent variables already defined: %r' %
            (sorted(existing_latent), ))
    if unknown:
        raise BQLError(bdb,
                       'Unknown model variables: %r' % (sorted(unknown), ))

    def default_dist(var, stattype):
        stattype = casefold(stattype)
        if stattype not in _DEFAULT_DIST:
            if var in unknown_stattype:
                assert unknown_stattype[var] == stattype
            else:
                unknown_stattype[var] = stattype
            return None
        dist, params = _DEFAULT_DIST[stattype](bdb, generator_id, var)
        return dist, params

    # Use the default distribution for any variables that remain to be
    # modelled, excluding any that are latent or that have statistical
    # types we don't know about.
    for var in core.bayesdb_variable_names(bdb, population_id, None):
        if var in modelled:
            continue
        colno = core.bayesdb_variable_number(bdb, population_id, None, var)
        assert 0 <= colno
        stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
        distparams = default_dist(var, stattype)
        if distparams is None:
            continue
        dist, params = distparams
        variables.append([var, stattype, dist, params])
        assert var not in variable_dist
        variable_dist[var] = (stattype, dist, params)
        modelled.add(var)

    # Fill in the deferred_input statistical type assignments.
    for var in sorted(deferred_input.iterkeys()):
        # Check whether the variable is modelled.  If not, skip -- we
        # will fail later because this variable is guaranteed to also
        # be in needed.
        if var not in modelled:
            assert var in needed
            continue

        # Determine (possibly fictitious) distribution and parameters.
        if var in default_modelled:
            # Manifest variable modelled by default Crosscat model.
            assert var in variable_dist
            stattype, dist, params = variable_dist[var]
        else:
            # Modelled by a foreign model.  Assign a fictitious
            # default distribution because the 27B/6 of CGPM requires
            # this.
            if var in latents:
                # Latent variable modelled by a foreign model.  Use
                # the statistical type specified for it.
                stattype = latents[var]
            else:
                # Manifest variable modelled by a foreign model.  Use
                # the statistical type in the population.
                assert core.bayesdb_has_variable(bdb, population_id, None, var)
                colno = core.bayesdb_variable_number(bdb, population_id, None,
                                                     var)
                stattype = core.bayesdb_variable_stattype(
                    bdb, population_id, colno)
            distparams = default_dist(var, stattype)
            if distparams is None:
                continue
            dist, params = distparams

        # Assign the distribution and parameters.
        for cctypes, ccargs, i in deferred_input[var]:
            assert cctypes[i] is None
            assert ccargs[i] is None
            cctypes[i] = dist
            ccargs[i] = params

    # Fill in the deferred_output statistical type assignments. They need to
    # be in the form NUMERICAL or CATEGORICAL.
    for var in deferred_output:
        if var in latents:
            # Latent variable modelled by a foreign model.  Use
            # the statistical type specified for it.
            var_stattype = casefold(latents[var])
            if var_stattype not in _DEFAULT_DIST:
                if var in unknown_stattype:
                    assert unknown_stattype[var] == var_stattype
                else:
                    unknown_stattype[var] = var_stattype
            # XXX Cannot specify statargs for a latent variable. Trying to use
            # default_dist might look up the counts for unique values of the
            # categorical in the base table, causing a failure.
            var_statargs = {}
        else:
            # Manifest variable modelled by a foreign model.  Use
            # the statistical type and arguments from the population.
            assert core.bayesdb_has_variable(bdb, population_id, None, var)
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            distparams = default_dist(var, var_stattype)
            if distparams is None:
                continue
            _, var_statargs = distparams

        stattypes, statargs, i = deferred_output[var]
        assert stattypes[i] is None
        assert statargs[i] is None
        stattypes[i] = var_stattype
        statargs[i] = var_statargs

    if unknown_stattype:
        raise BQLError(
            bdb, 'Unknown statistical types for variables: %r' %
            (sorted(unknown_stattype.iteritems(), )))

    # If there remain any variables that we needed to model, because
    # others are conditional on them, fail.
    needed -= modelled
    if needed:
        raise BQLError(bdb, 'Unmodellable variables: %r' % (needed, ))

    # Finally, create a CGPM schema.
    return {
        'variables': variables,
        'cgpm_composition': cgpm_composition,
        'subsample': subsample,
        'latents': latents,
    }
Beispiel #30
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    `phrase` may be wrapped in `ast.Parametrized`, in which case its
    numbered/named parameter information is handed to the compiler
    together with `bindings`.  Only queries and CREATE TABLE AS use
    `bindings`; for every other phrase they are ignored.

    Dispatch is by the type of `phrase`:

    - Queries (per `ast.is_query`) are compiled and executed, and the
      resulting cursor is returned.
    - BEGIN / ROLLBACK / COMMIT delegate to the `txn` module.
    - CREATE / DROP / ALTER of tables, populations and generators, and
      INITIALIZE / ANALYZE / DROP MODELS, update the bayesdb_* catalog
      tables and call into the generator's metamodel where needed.

    Every non-query phrase returns `empty_cursor(bdb)`, except DROP
    TABLE, which returns the cursor of the underlying SQL statement.

    Raises BQLError when a referenced table, population, generator,
    metamodel, variable, statistical type or model does not exist, or
    when a name to be created is already taken.  Raises
    NotImplementedError for column renames and for ANALYZE without
    WAIT.  Fails an assertion if `phrase` is of no recognized type.
    """
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSimModels):
        assert isinstance(phrase.simulation, ast.SimulateModels)
        with bdb.savepoint():
            # Check if table exists.
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                raise BQLError(
                    bdb,
                    'Name already defined as table: %s' % (phrase.name, ),
                )
            # Set up schema and create the new table.
            qn = sqlite3_quote_name(phrase.name)
            # Build a real list: the quoted names are used several
            # times below, which a one-shot iterator would not survive.
            qcns = [
                sqlite3_quote_name(
                    simcol.name if simcol.name is not None
                    else str(simcol.col))
                for simcol in phrase.simulation.columns
            ]
            # Treat `temp` as a boolean flag, the same way CREATE TABLE
            # AS does above.  Testing `is None` would request a TEMP
            # table for a flag that is false but not None.
            temp = 'TEMP' if phrase.temp else ''
            bdb.sql_execute('''
                CREATE %s TABLE %s (%s)
            ''' % (temp, qn, ','.join(qcns)))
            # Retrieve the rows.
            rows = simulate_models_rows(bdb, phrase.simulation)
            # Insert the rows into the table.
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for _qcn in qcns))
            for row in rows:
                bdb.sql_execute(insert_sql, row)
            return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            # Refuse to drop a table that a population is built on.
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            # NOTE: unlike the other non-query phrases, this returns
            # the cursor of the DROP TABLE statement itself.
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # NOTE: the rest of this branch is unreachable
                    # because of the raise above; it is the intended
                    # implementation, kept until renaming is supported.
                    #
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        schema = guess.guess_to_schema(guess.bayesdb_guess_stattypes, bdb,
                                       phrase.table)
        # Print schema to console, so user can edit it and/or copy/paste it into
        # the schema definition when creating a population.  The
        # parenthesized single-argument form prints the same text
        # whether `print` is a statement or a function.
        print(schema)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            if core.bayesdb_population_generators(bdb, population_id):
                raise BQLError(
                    bdb,
                    'Population still has generators: %r' % (phrase.name, ))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopStatType):
                    # Check the no metamodels are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb, 'Cannot update statistical types '
                            'for population %s, it has metamodels: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population'
                            ': %s' % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type'
                            ': %r' % (repr(cmd.stattype), ))
                    # Perform the stattype update.  The column numbers
                    # are formatted with %d, so only integers can end
                    # up interpolated into the SQL text.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        table = core.bayesdb_population_table(bdb, population_id)

        # Find the metamodel, or use the default.
        metamodel_name = phrase.metamodel
        if phrase.metamodel is None:
            metamodel_name = 'cgpm'
        if metamodel_name not in bdb.metamodels:
            raise BQLError(bdb,
                           'No such metamodel: %s' % (repr(metamodel_name), ))
        metamodel = bdb.metamodels[metamodel_name]

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                # With IF NOT EXISTS an existing generator is left
                # untouched; otherwise the name clash is an error.
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

                # Do any metamodel-specific initialization.
                metamodel.create_generator(bdb,
                                           generator_id,
                                           phrase.schema,
                                           baseline=phrase.baseline)

                # Populate bayesdb_generator_column with any latent
                # variables that metamodel.create_generator has added
                # with bayesdb_add_latent.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id = :generator_id
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if existing:
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.  Still hand
            # back a cursor, like every other exit from this branch,
            # so callers never receive None.
            if not modelnos:
                return empty_cursor(bdb)

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds,
                                 program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            # modelnos stays None to mean "all models of the generator".
            modelnos = None
            if phrase.modelnos is not None:
                # Verify every requested model exists before dropping
                # anything.
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(phrase.modelnos)
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    assert False  # XXX