Exemple #1
0
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Map each generator column name to its statistical type.

    Column names are taken from core.bayesdb_generator_column_names and
    each type from core.bayesdb_generator_column_stattype.  Both the keys
    and the values of the returned dict are casefolded.
    """
    names = core.bayesdb_generator_column_names(bdb, generator_id)
    return {
        casefold(name): casefold(
            core.bayesdb_generator_column_stattype(bdb, generator_id, name))
        for name in names
    }
def bayesdb_generator_column_stattypes(bdb, generator_id):
    """Return a dict of casefolded column name -> casefolded stattype.

    Covers every column reported by core.bayesdb_generator_column_names
    for the generator identified by `generator_id`.
    """
    def folded_pair(name):
        # Look the type up under the stored spelling, then fold both sides.
        stattype = core.bayesdb_generator_column_stattype(
            bdb, generator_id, name)
        return casefold(name), casefold(stattype)

    return dict(
        folded_pair(name)
        for name in core.bayesdb_generator_column_names(bdb, generator_id))
Exemple #3
0
def parse(schema, subsample_default):
    '''Turn a tokenized crosscat schema into a GeneratorSchema.

    schema is a tokenized expression of the form [['GUESS', ['*']], ['x',
    'NUMERICAL'], ...], as passed to CrosscatMetamodel.create_generator; it
    is the argument to "crosscat" in CREATE GENERATOR ... FOR ... USING
    crosscat(...).

    Each directive is either an empty list, which is ignored, or a
    two-element list whose first element is a string.  The first element
    is matched case-insensitively:

    - GUESS with ['*'] turns guessing on.
    - SUBSAMPLE with a one-element list replaces subsample_default with
      the result of _parse_subsample_clause.
    - DEPENDENT / INDEPENDENT record the clause parsed by
      _parse_dependent_clause, paired with True / False respectively.
    - Anything else is recorded as a (name, stattype) column pair, unless
      either side is 'guess'.

    Raises BQLError for a malformed directive or an invalid column model.

    See test_crosscat_generator_schema.py for examples.
    '''

    guess = False
    subsample = subsample_default
    columns = []
    dep_constraints = []
    for directive in schema:

        if directive == []:
            # An empty directive is what a trailing comma leaves behind,
            # so that this is accepted:
            #
            #    CREATE GENERATOR t_cc FOR t USING crosscat(
            #        x,
            #        y,
            #        z,
            #    )
            continue

        well_formed = (isinstance(directive, list) and len(directive) == 2
                       and isinstance(directive[0], basestring))
        if not well_formed:
            raise BQLError(
                None,
                'Invalid crosscat column model directive: %r' % (directive,))

        head, arg = directive
        op = casefold(head)

        if op == 'guess' and arg == ['*']:
            guess = True
            continue

        if op == 'subsample' and isinstance(arg, list) and len(arg) == 1:
            subsample = _parse_subsample_clause(arg[0])
            continue

        if op == 'dependent':
            dep_constraints.append((_parse_dependent_clause(arg), True))
            continue

        if op == 'independent':
            dep_constraints.append((_parse_dependent_clause(arg), False))
            continue

        # What remains must be a plain column model; 'guess' is not
        # acceptable on either side of it.
        if op == 'guess' or casefold(arg) == 'guess':
            raise BQLError(
                None,
                'Invalid crosscat column model: %r' % (directive,))
        columns.append((head, arg))

    return GeneratorSchema(guess=guess,
                           subsample=subsample,
                           columns=columns,
                           dep_constraints=dep_constraints)
def parse(schema, subsample_default):
    '''Parse a crosscat generator schema and return a GeneratorSchema.

    schema is a tokenized expression of the form [['GUESS', ['*']], ['x',
    'NUMERICAL'], ...] that is passed to CrosscatMetamodel.create_generator
    and represents the argument to "crosscat" in CREATE GENERATOR ... FOR
    ... USING crosscat(...).

    subsample_default is the subsample setting used when the schema has no
    well-formed SUBSAMPLE directive.

    Raises BQLError when a directive is not an empty list or a two-element
    list headed by a string, or when a column model involves 'guess'.

    See test_crosscat_generator_schema.py for examples.
    '''

    # Directive keyword -> flag stored alongside the parsed clause.
    dependence = {'dependent': True, 'independent': False}

    guess = False
    subsample = subsample_default
    columns = []
    dep_constraints = []
    for directive in schema:

        if directive == []:
            # Tolerate stray commas, e.g.
            #
            #    CREATE GENERATOR t_cc FOR t USING crosscat(
            #        x,
            #        y,
            #        z,
            #    )
            continue

        malformed = (
            not isinstance(directive, list) or
            len(directive) != 2 or
            not isinstance(directive[0], basestring))
        if malformed:
            raise BQLError(
                None,
                'Invalid crosscat column model directive: %r' % (directive,))

        op = casefold(directive[0])
        arg = directive[1]
        if op == 'guess' and arg == ['*']:
            guess = True
        elif op == 'subsample' and isinstance(arg, list) and len(arg) == 1:
            subsample = _parse_subsample_clause(arg[0])
        elif op in dependence:
            dep_constraints.append(
                (_parse_dependent_clause(arg), dependence[op]))
        elif op == 'guess' or casefold(arg) == 'guess':
            # 'guess' may appear only in the GUESS(*) form handled above.
            raise BQLError(
                None, 'Invalid crosscat column model: %r' % (directive,))
        else:
            columns.append((directive[0], arg))
    return GeneratorSchema(
        guess=guess, subsample=subsample, columns=columns,
        dep_constraints=dep_constraints)
Exemple #5
0
def bayesdb_add_variable(bdb, population_id, name, stattype):
    """Insert a row for variable `name` into bayesdb_variable.

    The column number is looked up in the population's base table, and the
    name stored is the spelling the table itself reports for that column
    rather than the caller's spelling.  The two spellings are asserted to
    be equal after casefolding.
    """
    base_table = bayesdb_population_table(bdb, population_id)
    colno = bayesdb_table_column_number(bdb, base_table, name)
    stored_name = bayesdb_table_column_name(bdb, base_table, colno)
    assert casefold(name) == casefold(stored_name)
    insert_sql = '''
        INSERT INTO bayesdb_variable
            (population_id, name, colno, stattype)
            VALUES (?, ?, ?, ?)
    '''
    bdb.sql_execute(insert_sql, (population_id, stored_name, colno, stattype))
Exemple #6
0
def tokenize(tokenses):
    """Yield (token type, token) pairs for the parser.

    Every group in `tokenses` is passed through flatten, and the results
    are combined by intersperse with ',' as the separator.  String tokens
    are classified as keywords (matched after casefolding), punctuation
    (matched exactly) or names; ints and floats become numbers.  A final
    (0, '') pair marks end of input.

    Raises IOError for a token of any other type.
    """
    flattened = [flatten(tokens) for tokens in tokenses]
    for token in intersperse(',', flattened):
        if isinstance(token, (int, float)) and not isinstance(token, str):
            yield grammar.L_NUMBER, token
            continue
        if not isinstance(token, str):
            raise IOError('Invalid token: %r' % (token,))
        folded = casefold(token)
        if folded in KEYWORDS:
            yield KEYWORDS[folded], token
        elif token in PUNCTUATION:
            yield PUNCTUATION[token], token
        else:
            # XXX check for alphanumeric/_
            yield grammar.L_NAME, token
    # EOF
    yield 0, ''
Exemple #7
0
def _parse_subsample_clause(clause):
    """Interpret the argument of a SUBSAMPLE directive.

    Returns False when `clause` is the string 'off' in any letter case,
    returns `clause` itself when it is an int, and raises BQLError for
    anything else.
    """
    if isinstance(clause, basestring) and casefold(clause) == 'off':
        return False
    if isinstance(clause, int):
        return clause
    raise BQLError(None, 'Invalid subsampling: %r' % (clause, ))
def _parse_subsample_clause(clause):
    """Return the subsample setting denoted by `clause`.

    An int is returned unchanged; the string 'off' (case-insensitive)
    yields False.  Any other value raises BQLError.
    """
    # No value is both an int and a string, so the order of these two
    # checks does not affect the result.
    if isinstance(clause, int):
        return clause
    switched_off = isinstance(clause, basestring) and casefold(clause) == 'off'
    if switched_off:
        return False
    raise BQLError(None, 'Invalid subsampling: %r' % (clause,))
def bayesdb_create_legacy_generator(bdb, generator, table, column_stattypes):
    """Create a crosscat generator named `generator` for `table`.

    `column_stattypes` maps column names to statistical types.  Its keys
    are compared as given against the casefolded table column names, and
    the schema is built by looking up the casefolded name of every table
    column, so each table column needs an entry under its casefolded name.

    Every stattype is asserted to be in allowed_column_stattypes.
    Raises IOError when `column_stattypes` names a column the table does
    not have.
    """
    table_columns = core.bayesdb_table_column_names(bdb, table)
    quoted_columns = [sqlite3_quote_name(column) for column in table_columns]
    for name in column_stattypes:
        assert column_stattypes[name] in allowed_column_stattypes
    known = {casefold(column) for column in table_columns}
    for name in column_stattypes:
        if name in known:
            continue
        raise IOError('No such column in table %s: %s' %
            (repr(table), repr(name)))
    clauses = []
    for column, quoted in zip(table_columns, quoted_columns):
        clauses.append('%s %s' % (quoted, column_stattypes[casefold(column)]))
    schema = ','.join(clauses)
    quoted_generator = sqlite3_quote_name(generator)
    quoted_table = sqlite3_quote_name(table)
    statement = 'CREATE GENERATOR %s FOR %s USING %s(%s)' % (
        quoted_generator, quoted_table, 'crosscat', schema)
    bdb.execute(statement)
Exemple #10
0
def bayesdb_create_legacy_generator(bdb, generator, table, column_stattypes):
    """Issue CREATE GENERATOR ... USING crosscat(...) for `table`.

    The crosscat schema lists every column of `table`, quoted, followed by
    the stattype found in `column_stattypes` under the column's casefolded
    name.

    All stattypes are asserted to belong to allowed_column_stattypes.
    Raises IOError if a key of `column_stattypes` is not among the
    casefolded column names of the table.
    """
    names = core.bayesdb_table_column_names(bdb, table)
    quoted_names = [sqlite3_quote_name(name) for name in names]

    for key in column_stattypes:
        assert column_stattypes[key] in allowed_column_stattypes

    folded_names = set()
    for name in names:
        folded_names.add(casefold(name))
    for key in column_stattypes:
        if key not in folded_names:
            raise IOError('No such column in table %s: %s' %
                          (repr(table), repr(key)))

    def column_clause(name, quoted):
        # "<quoted column> <stattype>", one item of the crosscat schema.
        return '%s %s' % (quoted, column_stattypes[casefold(name)])

    schema = ','.join(
        column_clause(name, quoted)
        for name, quoted in zip(names, quoted_names))
    arguments = (
        sqlite3_quote_name(generator),
        sqlite3_quote_name(table),
        'crosscat',
        schema,
    )
    bdb.execute('CREATE GENERATOR %s FOR %s USING %s(%s)' % arguments)
Exemple #11
0
 def _cmd_interactive_pairplot(self, query, sql=None, **kwargs):
     """Run `query` and hand its result to jsviz.interactive_pairplot.

     `query` is executed with self._bdb.sql_execute when `sql` is truthy
     and with self._bdb.execute otherwise.  The keyword argument
     `population` is required; a ValueError is raised without it.

     Result columns that match no schema entry by casefolded name are
     reported and deleted from the data frame.  Schema entries that do
     match have their 'name' rewritten to the data frame's spelling.
     """
     population = kwargs.get('population', None)
     if population is None:
         raise ValueError('Specify --population=<name> argument.')
     if sql:
         cursor = self._bdb.sql_execute(query)
     else:
         cursor = self._bdb.execute(query)
     df = utils_bql.cursor_to_df(cursor)
     schema = utils_mml.get_schema_as_list(self._bdb, population)
     for colname in df.columns:
         matches = [
             entry for entry in schema
             if casefold(entry['name']) == casefold(colname)
         ]
         for entry in matches:
             entry['name'] = colname
         if not matches:
             # A single parenthesized argument prints the same text whether
             # print is a statement or a function.
             print("Ignoring non-modelled column %s" % (colname, ))
             del df[colname]
     return jsviz.interactive_pairplot(df, schema)
Exemple #12
0
 def default_dist(var, stattype):
     """Return the default (dist, params) for `var`, or None if unknown.

     The stattype is casefolded and looked up in _DEFAULT_DIST; the entry
     found is called with (bdb, generator_id, var).  A stattype with no
     entry is remembered in unknown_stattype under `var`, and None is
     returned.  If `var` was already remembered, the remembered stattype
     is asserted to be the same.

     NOTE(review): bdb, generator_id, unknown_stattype and _DEFAULT_DIST
     are not defined here; presumably they come from the enclosing scope.
     """
     stattype = casefold(stattype)
     if stattype in _DEFAULT_DIST:
         make_default = _DEFAULT_DIST[stattype]
         dist, params = make_default(bdb, generator_id, var)
         return dist, params
     if var not in unknown_stattype:
         unknown_stattype[var] = stattype
     else:
         assert unknown_stattype[var] == stattype
     return None
Exemple #13
0
def scan_nampar(scanner, text):
    """Produce an L_NAMPAR token for the named parameter `text`.

    The name is casefolded.  A name seen for the first time is assigned
    the next parameter number and recorded in scanner.nampar_map; a name
    seen before reuses its recorded number.  The token value is the pair
    (number, casefolded name).
    """
    name = casefold(text)
    if name not in scanner.nampar_map:
        # Numbered parameters are 1-indexed, so bump the counter first.
        scanner.n_numpar += 1
        scanner.nampar_map[name] = scanner.n_numpar
    scanner.produce(grammar.L_NAMPAR, (scanner.nampar_map[name], name))
Exemple #14
0
def scan_nampar(scanner, text):
    """Emit a named-parameter token, numbering the name on first use.

    scanner.nampar_map maps casefolded parameter names to parameter
    numbers, and scanner.n_numpar is the highest number handed out so
    far.  The token produced is grammar.L_NAMPAR with the value
    (number, casefolded name).
    """
    folded = casefold(text)
    numbers = scanner.nampar_map
    if folded in numbers:
        number = numbers[folded]
    else:
        # Numbered parameters are 1-indexed.
        number = scanner.n_numpar + 1
        scanner.n_numpar = number
        numbers[folded] = number
    scanner.produce(grammar.L_NAMPAR, (number, folded))
Exemple #15
0
    def register_foreign_predictor(self, builder):
        """Register a builder of foreign predictors under its name.

        Foreign predictor instances need not be initialized explicitly.
        The `composer` will create, train, and serialize all foreign
        predictors declared in the `schema` when the BQL query INITIALIZE
        is called.

        Registration must be repeated each time the database is launched.

        Parameters
        ----------
        builder : :class:`.IBayesDBForeignPredictorFactory`
            Must provide `create`, `serialize`, `deserialize` and `name`.
            The pattern used by the extant predictors is to include these
            four methods in the class implementing
            :class:`~.IBayesDBForeignPredictor` as `@classmethods`. For
            example, registering
            :class:`bdbcontrib.predictors.random_forest.RandomForest` is
            achieved by registering the **class** instance::

                >> from bdbcontrib.predictors.random_forest import RandomForest
                >> composer.register_foreign_predictor(RandomForest)

        Builders are keyed by the casefolded result of `builder.name()`;
        registering a second builder under an existing key raises a
        ValueError wrapped in BLE.
        """
        # Validate the builder by duck typing.  An isinstance check against
        # predictor.IBayesDBForeignPredictorFactory would not work, because
        # classes following the classmethod pattern are not instances of it.
        for required in ('create', 'serialize', 'deserialize', 'name'):
            assert hasattr(builder, required)
        # Check for duplicates.
        key = casefold(builder.name())
        if key in self.predictor_builder:
            raise BLE(ValueError(
                'A foreign predictor with name "{}" has already '
                'been registered. Currently registered: {}'.format(
                    builder.name(), self.predictor_builder)))
        self.predictor_builder[key] = builder
Exemple #16
0
def subsample_table_columns(bdb, table, new_table, limit, keep, drop, seed):
    """Create `new_table` from a random subset of the columns of `table`.

    The new table always contains the columns in `keep`, never contains
    the columns in `drop`, and is filled up to `limit` columns in total
    with columns drawn without replacement from the remaining ones.

    :param bdb: BayesDB handle the statements are executed on
    :param table: name of the existing source table
    :param new_table: name of the table to create; must not exist yet
    :param int limit: maximum number of columns in the new table
    :param keep: names of columns that must be included
    :param drop: names of columns that must be excluded
    :param seed: seed for the ``np.random.RandomState`` doing the sampling

    Names in `keep` and `drop` are casefolded before use.

    Returns the result of ``cursor_to_df`` applied to the cursor of the
    ``CREATE TABLE ... AS SELECT`` statement.

    Raises ValueError if `table` does not exist, `new_table` already
    exists, `keep` or `drop` names a column the table lacks, a column is
    in both `keep` and `drop`, or `limit` is smaller than ``len(keep)``.
    """
    if not bayesdb_has_table(bdb, table):
        raise ValueError('No such table: %s' % (table, ))
    if bayesdb_has_table(bdb, new_table):
        raise ValueError('Table already exists: %s' % (new_table, ))
    # Build real lists: the names are concatenated, measured and traversed
    # several times below, which a lazy map() result would not support.
    keep = [casefold(column) for column in keep]
    drop = [casefold(column) for column in drop]
    skip = keep + drop
    unknown = [
        column for column in skip
        if not bayesdb_table_has_column(bdb, table, column)
    ]
    if unknown:
        raise ValueError('No such columns: %s' % (unknown, ))
    keep_set = set(keep)
    overlap = [column for column in drop if column in keep_set]
    if overlap:
        raise ValueError('Cannot both drop and keep columns: %s' % (overlap, ))
    num_sample = limit - len(keep)
    if num_sample < 0:
        raise ValueError('Must sample at least as many columns to keep.')
    skip_set = set(skip)
    subselect_columns = [
        column for column in bayesdb_table_column_names(bdb, table)
        if casefold(column) not in skip_set
    ]
    size = min(len(subselect_columns), num_sample)
    if size == 0:
        # Nothing to draw.  Skip the sampler, which on some NumPy versions
        # rejects an empty population even when asked for zero samples.
        subsample_columns = []
    else:
        rng = np.random.RandomState(seed)
        subsample_columns = rng.choice(
            subselect_columns, replace=False, size=size)
    qt = bql_quote_name(table)
    qnt = bql_quote_name(new_table)
    selected = itertools.chain(keep, subsample_columns)
    qc = ','.join(bql_quote_name(column) for column in selected)
    cursor = bdb.execute('''
        CREATE TABLE %s AS SELECT %s FROM %s
    ''' % (qnt, qc, qt))
    return cursor_to_df(cursor)
Exemple #17
0
def bayesdb_has_stattype(bdb, stattype):
    """Return True if bayesdb_stattype has a row named `stattype`.

    The name is casefolded before being compared with the `name` column.
    """
    cursor = bdb.sql_execute(
        'SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype',
        {'stattype': casefold(stattype)})
    return 0 < cursor_value(cursor)
Exemple #18
0
def bayesdb_guess_stattypes(column_names, rows,
        count_cutoff=None, ratio_cutoff=None, overrides=None):
    """Heuristically guess a statistical type for each column of `rows`.

    Return a list of statistical types, one per entry of `column_names`
    and in the same order.  Each type is the column's override if there
    is one, and otherwise ``key``, ``numerical`` or ``categorical``.

    :param int count_cutoff: number of distinct values below which
        columns whose values can all be parsed as numbers will be
        considered categorical anyway (default 20)
    :param real ratio_cutoff: ratio of distinct values to total values
        below which columns whose values can all be parsed as numbers
        will be considered categorical anyway (default 0.02)
    :param list overrides: list of ``(name, stattype)``, overriding
        any guessed statistical type for columns by those names

    In addition to statistical types, the overrides may specify
    ``key`` or ``ignore``.

    Column names are compared after casefolding.  At most one column is
    guessed to be the key, and none is if an override already names one.

    Raises ValueError for duplicate column names, overrides naming an
    unknown column or the same column twice, a row whose length differs
    from the number of columns, a column overridden as key whose values
    are not keyable, and more than one column overridden as key.
    """

    # Fill in default arguments.
    count_cutoff = 20 if count_cutoff is None else count_cutoff
    ratio_cutoff = 0.02 if ratio_cutoff is None else ratio_cutoff
    overrides = [] if overrides is None else overrides

    # Fold every column name once; all lookups below use the folded form.
    folded_names = [casefold(name) for name in column_names]

    # Build a set of the column names.
    column_name_set = set()
    duplicates = set()
    for name, folded in zip(column_names, folded_names):
        if folded in column_name_set:
            duplicates.add(name)
        column_name_set.add(folded)
    if duplicates:
        raise ValueError('Duplicate column names: %s' %
            (repr(list(duplicates)),))

    # Build a map for the overrides.
    #
    # XXX Support more than just stattype: allow arbitrary column
    # descriptions.
    override_map = {}
    unknown = set()
    duplicates = set()
    for name, stattype in overrides:
        folded = casefold(name)
        if folded not in column_name_set:
            unknown.add(name)
        elif folded in override_map:
            duplicates.add(name)
        else:
            override_map[folded] = casefold(stattype)
    if unknown:
        raise ValueError('Unknown columns overridden: %s' %
            (repr(list(unknown)),))
    if duplicates:
        raise ValueError('Duplicate columns overridden: %s' %
            (repr(list(duplicates)),))

    # Sanity-check the inputs.
    ncols = len(column_names)
    assert ncols == len(unique(folded_names))
    for ri, row in enumerate(rows):
        width = len(row)
        if width < ncols:
            raise ValueError('Row %d: Too few columns: %d < %d' %
                (ri, width, ncols))
        if width > ncols:
            raise ValueError('Row %d: Too many columns: %d > %d' %
                (ri, width, ncols))

    # Find a key first, if it has been specified as an override.
    key = None
    duplicate_keys = set()
    for ci, (column_name, folded) in enumerate(
            zip(column_names, folded_names)):
        if override_map.get(folded) != 'key':
            continue
        if key is not None:
            duplicate_keys.add(column_name)
            continue
        # Prefer the integer reading of the column; otherwise use the
        # raw values.
        column = integerify(rows, ci) or [row[ci] for row in rows]
        if not keyable_p(column):
            raise ValueError('Column non-unique but specified as key: %s' %
                (repr(column_name),))
        key = column_name
    if duplicate_keys:
        raise ValueError('Multiple columns overridden as keys: %s' %
            (repr(list(duplicate_keys)),))

    # Now go through and guess the other column stattypes or use the
    # override.
    stattypes = []
    for ci, (column_name, folded) in enumerate(
            zip(column_names, folded_names)):
        if folded in override_map:
            stattypes.append(override_map[folded])
            continue
        # Try integers, then floats; a column is numericable only if one
        # of the two conversions gave a non-empty result.
        column = integerify(rows, ci) or floatify(rows, ci)
        numericable = bool(column)
        if not numericable:
            column = [row[ci] for row in rows]
        if key is None and keyable_p(column):
            stattype = 'key'
            key = column_name
        elif numericable and numerical_p(column, count_cutoff, ratio_cutoff):
            stattype = 'numerical'
        else:
            stattype = 'categorical'
        stattypes.append(stattype)
    return stattypes
Exemple #19
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = "TEMP " if phrase.temp else ""
            ifnotexists = "IF NOT EXISTS " if phrase.ifnotexists else ""
            out.write("CREATE %sTABLE %s%s AS " % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as generator: %s" % (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as table: %s" % (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb, phrase.simulation.generator):
                raise BQLError(bdb, "No such generator: %s" % (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            cursor = bdb.sql_execute("PRAGMA table_info(%s)" % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table), repr(column_name)),
                    )
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table), repr(column_name)),
                    )
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write("SELECT ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb, phrase.simulation.nsamples, out)
            out.write(", ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb, phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(", ")
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(), out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = [
                (core.bayesdb_generator_column_number(bdb, generator_id, name), value)
                for (name, _expression), value in zip(phrase.simulation.constraints, cursor[0][2:])
            ]
            colnos = [core.bayesdb_generator_column_number(bdb, generator_id, name) for name in column_names]
            bdb.sql_execute(
                "CREATE %sTABLE %s%s (%s)"
                % (
                    "TEMP " if phrase.temp else "",
                    "IF NOT EXISTS " if phrase.ifnotexists else "",
                    qn,
                    ",".join(
                        "%s %s" % (qcn, column_sqltypes[casefold(column_name)])
                        for qcn, column_name in zip(qcns, column_names)
                    ),
                )
            )
            insert_sql = """
                INSERT INTO %s (%s) VALUES (%s)
            """ % (
                qn,
                ",".join(qcns),
                ",".join("?" for qcn in qcns),
            )
            for row in bqlfn.bayesdb_simulate(
                bdb, generator_id, constraints, colnos, modelno=modelno, numpredictions=nsamples
            ):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = "SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?"
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, "Table still in use by generators: %s" % (repr(phrase.name),))
            bdb.sql_execute("DELETE FROM bayesdb_column WHERE tabname = ?", (phrase.name,))
            ifexists = "IF EXISTS " if phrase.ifexists else ""
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute("DROP TABLE %s%s" % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, "No such table: %s" % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + "_temp"
                        while core.bayesdb_has_table(bdb, temp) or core.bayesdb_has_generator(bdb, temp):
                            temp += "_temp"
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined as table" ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined" " as generator: %s" % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError("Renaming columns" " not yet implemented.")
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table, cmd.old):
                            raise BQLError(bdb, "No such column in table %s" ": %s" % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, "Column already exists" " in table %s: %s" % (repr(table), repr(cmd.new))
                            )
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = """
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_column_sql, {"table": table, "old": cmd.old, "new": cmd.new})
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = """
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        """
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id, old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, "No such generator: %s" % (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(bdb, cmd.generator)
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                    set_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                else:
                    assert False, "Invalid alter table command: %s" % (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, "No such metamodel: %s" % (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():

            def instantiate(columns):
                return instantiate_generator(
                    bdb,
                    phrase.name,
                    phrase.table,
                    metamodel,
                    columns,
                    ifnotexists=phrase.ifnotexists,
                    default=phrase.default,
                )

            metamodel.create_generator(bdb, phrase.table, phrase.schema, instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, "No such generator: %s" % (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = """
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            """
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = """
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            """
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = """
                DELETE FROM bayesdb_generator WHERE id = ?
            """
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, "No such generator: %s" % (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined as table" ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined" " as generator: %s" % (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = """
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_generator_sql, (cmd.name, generator_id))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, "Invalid ALTER GENERATOR command: %s" % (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(
                    modelno for modelno in modelnos if not core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                )
            else:
                existing = set(
                    modelno for modelno in modelnos if core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                )
                if 0 < len(existing):
                    raise BQLError(
                        bdb, "Generator %s already has models: %s" % (repr(phrase.generator), sorted(existing))
                    )

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = """
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            """
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {"generator_id": generator_id, "modelno": modelno, "iterations": 0})

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos, model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError("No background analysis -- use WAIT.")
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb,
            generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
        )
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = """
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {"generator_id": generator_id, "modelno": modelno})
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, "No such model" " in generator %s: %s" % (repr(phrase.generator), repr(modelno))
                        )
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = """
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                """
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = """
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {"generator_id": generator_id, "modelno": modelno})
        return empty_cursor(bdb)

    assert False  # XXX
Exemple #20
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    `bdb` is the database handle the phrase is executed against.
    `phrase` is a BQL AST node, optionally wrapped in `ast.Parametrized`.
    `bindings` holds the parameter values handed to the compiler for
    queries, CREATE TABLE ... AS, and CREATE TABLE ... FROM SIMULATE.

    Queries return the cursor produced by `execute_wound`; DROP TABLE
    returns the cursor from the underlying `bdb.sql_execute` call; every
    other phrase returns `empty_cursor(bdb)`.

    Raises `BQLError` when a referenced table, generator, column,
    metamodel, or model does not exist, or when a requested name is
    already taken.  Raises `NotImplementedError` for column renames and
    for ANALYZE without WAIT.
    """
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            # Prefix the compiled query with the CREATE TABLE clause so
            # the whole thing runs as a single SQL statement.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as generator: %s' %
                    (repr(phrase.name), ))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as table: %s' %
                    (repr(phrase.name), ))
            if not core.bayesdb_has_generator_default(
                    bdb, phrase.simulation.generator):
                raise BQLError(
                    bdb,
                    'No such generator: %s' % (phrase.simulation.generator, ))
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            # Collect the SQL type of every column of the generator's
            # table, keyed by case-folded column name, so the new table
            # can be declared with matching column types.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            # Both the simulated columns and the constrained columns
            # must exist in the underlying table.
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            #
            # Evaluate nsamples, modelno, and every constraint value in
            # one SELECT: the result row is
            # (nsamples, modelno, constraint value, ...).
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.nsamples,
                                                  out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.modelno,
                                                  out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                                         out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            # Translate column names to generator column numbers and
            # pair each constrained column with its evaluated value.
            constraints = \
                [(core.bayesdb_generator_column_number(bdb, generator_id, name),
                        value)
                    for (name, _expression), value in
                        zip(phrase.simulation.constraints, cursor[0][2:])]
            colnos = \
                [core.bayesdb_generator_column_number(bdb, generator_id, name)
                    for name in column_names]
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, ','.join(
                     '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                     for qcn, column_name in zip(qcns, column_names))))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            # Store each simulated row in the newly created table.
            for row in bqlfn.bayesdb_simulate(bdb,
                                              generator_id,
                                              constraints,
                                              colnos,
                                              modelno=modelno,
                                              numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(
                    bdb, 'Table still in use by generators: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                              core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # NOTE: Everything below in this branch is
                    # unreachable because of the raise above; it is the
                    # intended implementation, kept for when the
                    # compiler side is dealt with.
                    #
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(
                            bdb,
                            'No such generator: %s' % (repr(cmd.generator), ))
                    generator_id = core.bayesdb_get_generator(
                        bdb, cmd.generator)
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    # Clear any existing default for the table first,
                    # then mark the requested generator as the default.
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                    set_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id, ))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(
                bdb, 'No such metamodel: %s' % (repr(phrase.metamodel), ))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                # With IF NOT EXISTS an existing generator is silently
                # left alone; otherwise it is an error.
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:

                def instantiate(columns):
                    return instantiate_generator(bdb,
                                                 phrase.name,
                                                 phrase.table,
                                                 metamodel,
                                                 columns,
                                                 default=phrase.default)

                metamodel.create_generator(bdb, phrase.table, phrase.schema,
                                           instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.  Still hand
            # back a cursor so that every path through this function
            # returns one, as the docstring promises.
            if len(modelnos) == 0:
                return empty_cursor(bdb)

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                                        model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            # modelnos of None means "drop every model of the generator".
            modelnos = None
            if phrase.modelnos is not None:
                # Verify that every requested model exists before
                # dropping anything.
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    assert False  # XXX
Exemple #21
0
    def p_foreign_name(self, foreign):
        """Return `foreign` after passing it through `casefold`."""
        folded = casefold(foreign)
        return folded

    def p_given_opt_none(self):
        """Return a fresh empty list."""
        return list()
def _is_continuous(stattype):
    """Return True if `casefold(stattype)` names a continuous stattype.

    The stattypes accepted here are 'cyclic', 'numerical', and 'boolean'.
    """
    # NOTE(review): 'boolean' is counted as continuous -- confirm that this
    # is intended.
    continuous_stattypes = ('cyclic', 'numerical', 'boolean')
    folded = casefold(stattype)
    return folded in continuous_stattypes
Exemple #23
0
 def create_generator(self, bdb, table, schema, instantiate):
     """Create this generator and its internal crosscat generator.

     Parses `schema` with `self.parse`, calls `instantiate` on the parsed
     column items to obtain the generator id and its column records, and
     executes a CREATE GENERATOR ... USING crosscat(...) statement for an
     internal generator named after this one with a `_cc` suffix.  Then,
     inside a savepoint, fills the bayesdb_composer_* tables with: the
     crosscat generator id, the local flag of every column, the parents of
     each foreign column, the position of each column in the order
     returned by `self.topological_sort`, and the predictor name of each
     foreign column.
     """
     # Break the schema into its parsed components.
     (columns, lcols, _fcols, fcol_to_pcols, fcol_to_fpred,
         dependencies) = self.parse(schema)
     # Register **this** generator; learn its id and column records.
     genid, bdbcolumns = instantiate(columns.items())
     # The internal crosscat generator is named after this generator,
     # with a _cc suffix appended.
     SUFFIX = '_cc'
     cc_name = bayeslite.core.bayesdb_generator_name(bdb, genid) + SUFFIX
     # Render the column clauses of the crosscat schema.
     col_clauses = [
         '{} {}'.format(quote(lcol), quote(columns[lcol]))
         for lcol in lcols
     ]
     cc_cols = ','.join(col_clauses)
     # Render the dependency clauses of the crosscat schema.
     dep_clauses = []
     for is_dependent, colnames in dependencies:
         qcns = ','.join(map(quote, colnames))
         template = 'DEPENDENT({})' if is_dependent else 'INDEPENDENT({})'
         dep_clauses.append(template.format(qcns))
     create_cc_bql = """
         CREATE GENERATOR {} FOR {} USING crosscat(
             {}, {}
         );
     """.format(quote(cc_name), quote(table), cc_cols, ','.join(dep_clauses))
     bdb.execute(create_cc_bql)
     # Translate foreign/parent column names into column numbers.
     fcolno_to_pcolnos = {}
     for fcol in fcol_to_pcols:
         fcolno = core.bayesdb_generator_column_number(bdb, genid, fcol)
         parent_colnos = []
         for pcol in fcol_to_pcols[fcol]:
             parent_colnos.append(
                 core.bayesdb_generator_column_number(bdb, genid, pcol))
         fcolno_to_pcolnos[fcolno] = parent_colnos
     with bdb.savepoint():
         # Record the id of the internal crosscat generator.
         cc_genid = core.bayesdb_get_generator(bdb, cc_name)
         bdb.sql_execute('''
             INSERT INTO bayesdb_composer_cc_id
                 (generator_id, crosscat_generator_id) VALUES (?,?)
         ''', (genid, cc_genid,))
         # Record, per column, whether it is local (has no parents entry).
         for colno, _, _ in bdbcolumns:
             is_local = int(colno not in fcolno_to_pcolnos)
             bdb.sql_execute('''
                 INSERT INTO bayesdb_composer_column_owner
                     (generator_id, colno, local) VALUES (?,?,?)
             ''', (genid, colno, is_local,))
         # Record the parents of every foreign column.
         for fcolno, pcolnos in fcolno_to_pcolnos.items():
             for pcolno in pcolnos:
                 bdb.sql_execute('''
                     INSERT INTO bayesdb_composer_column_parents
                         (generator_id, fcolno, pcolno) VALUES (?,?,?)
                 ''', (genid, fcolno, pcolno,))
         # Record the topological order of the columns.
         ordering = self.topological_sort(fcolno_to_pcolnos)
         for position, (colno, _) in enumerate(ordering):
             bdb.sql_execute('''
                 INSERT INTO bayesdb_composer_column_toposort
                     (generator_id, colno, position) VALUES (?,?,?)
                 ''', (genid, colno, position,))
         # Record the predictor name of every foreign column.
         for fcolno in fcolno_to_pcolnos:
             fcol_name = core.bayesdb_generator_column_name(
                 bdb, genid, fcolno)
             fp_name = fcol_to_fpred[casefold(fcol_name)]
             bdb.sql_execute('''
                 INSERT INTO bayesdb_composer_column_foreign_predictor
                     (generator_id, colno, predictor_name) VALUES (?,?,?)
             ''', (genid, fcolno, casefold(fp_name)))
Exemple #24
0
def _create_population(bdb, phrase):
    """Create the population described by the CREATE POPULATION `phrase`.

    Reads `phrase.name`, `phrase.table`, `phrase.ifnotexists`, and
    `phrase.schema`.  When `phrase.name` is None the population takes the
    name of the table and is recorded as implicit.

    Every column of the base table must be covered by exactly one policy
    in the schema: modeled with a given stattype, ignored, or guessed.
    Guessed columns whose stattype comes back as `key` are moved to the
    ignored set.  Ignored columns get no bayesdb_variable record.

    The bayesdb_population row is inserted before the schema is validated,
    so callers should run this inside a savepoint (as execute_phrase does)
    so that a validation failure rolls the row back.

    Returns None.

    Raises BQLError if:
    - the name is already a population and IF NOT EXISTS was not given;
    - the wildcard GUESS is combined with other guessed variable names;
    - a column modeled as numerical contains string values;
    - some base table column has no modeling policy;
    - the schema names a column that is not in the table, names a column
      more than once (compared after `casefold`), or uses an invalid
      statistical type.
    """
    # Retrieve the (possibly implicit) population name.
    population_name = phrase.name or phrase.table
    implicit = 1 if phrase.name is None else 0

    # Handle IF NOT EXISTS.
    if core.bayesdb_has_population(bdb, population_name):
        if phrase.ifnotexists:
            return
        raise BQLError(bdb, 'Name already defined as population: %r' %
            (population_name,))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to provide
    # a strategy for each single variable, either MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    bdb.sql_execute('''
        INSERT INTO bayesdb_population (name, tabname, implicit)
            VALUES (?, ?, ?)
    ''', (population_name, phrase.table, implicit))
    population_id = core.bayesdb_get_population(bdb, population_name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [[(name, s.stattype) for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopModelVars)]))

    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [[(name, 'ignore') for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)]))

    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        [s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r'
                % (pop_guess, ))
        # The wildcard stands for every base table column that has not
        # been given an explicit MODEL or IGNORE policy.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]

    # Perform the guessing.
    pop_guess_vars = []
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        # XXX The guesser may return a stattype called `key`; such columns
        # are ignored rather than modeled.  Partition in a single pass so
        # that this does not depend on zip returning a mutable list.
        for col, guess in zip(pop_guess, pop_guess_stattypes):
            guessed_stattype = guess[0]
            if guessed_stattype == 'key':
                pop_ignore_vars.append((col, 'ignore'))
            else:
                pop_guess_vars.append((col, guessed_stattype))

    # Ensure no string-valued variables are being modeled as numerical.
    # The stattype is case-folded before it is stored below, so fold it
    # here too; otherwise e.g. NUMERICAL would slip past this check.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars
        if casefold(stattype) == 'numerical'
            and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(bdb,
            'Column(s) with string values modeled as numerical: %r'
            % (numerical_string_vars, ))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = set(casefold(t[0]) for t in pop_all_vars)
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r'
            % (not_found, ))

    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        # Record the case-folded name: the membership test above uses the
        # folded form, so storing the raw name would let duplicates whose
        # spelling contains uppercase characters go undetected.
        seen_variables.add(name)
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' % (list(duplicates),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' % (list(invalid),))

    # Insert variable records; ignored columns get none.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)
Exemple #25
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb,
                        'Name already defined as table: %s' %
                        (repr(phrase.name),))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb, 'Table already exists: %s' %
                        (repr(phrase.name),))
            bayesdb_read_csv_file(
                bdb, phrase.name, phrase.csv, header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                raise BQLError(bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table: %s'
                                % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # If table has implicit population, rename it too.
                    if core.bayesdb_table_has_implicit_population(
                                bdb, cmd.name):
                        populations = \
                            core.bayesdb_table_populations(bdb, cmd.name)
                        assert len(populations) == 1
                        population_name = core.bayesdb_population_name(
                            bdb, populations[0])
                        qt = sqlite3_quote_name(cmd.name)
                        qp = sqlite3_quote_name(population_name)
                        bdb.execute('ALTER POPULATION %s RENAME TO %s'
                            % (qp, qt))
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb, 'No such column in table %s'
                                ': %s' %
                                (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except backends may have the (case-folded) name cached.
                    if old_folded != new_folded:
                        populations_sql = '''
                            SELECT id FROM bayesdb_population WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(populations_sql, (table,))
                        generators = [
                            core.bayesdb_population_generators(
                                bdb, population_id)
                            for (population_id,) in cursor
                        ]
                        for generator_id in set(generators):
                            backend = core.bayesdb_generator_backend(bdb,
                                generator_id)
                            backend.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder('''
                CREATE TEMP TABLE %s (
                    column TEXT,
                    stattype TEXT,
                    num_distinct INTEGER,
                    reason TEXT
                )
            ''' % (qtt,), ())
            for cn, st, ct in zip(column_names, stattypes, distinct_value_counts):
                out.winder('''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt,))
            out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name,))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids]
                raise BQLError(bdb, 'Population %r still has generators: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute('''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id,))
            bdb.sql_execute('''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb, 'No such population: %s' %
                    (repr(population),))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopRenamePop):
                    table = core.bayesdb_population_table(bdb, population_id)
                    # Prevent renaming of implicit population directly, unless
                    # being called by ast.AlterTabRenameTab in which case the
                    # table name and population name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, population_id) \
                            and casefold(population) == casefold(table):
                        raise BQLError(bdb, 'Cannot rename implicit'
                            'population %s; rename base table instead'
                            % (population,))
                    # Make sure nothing else has this name.
                    if casefold(population) != casefold(cmd.name):
                        if core.bayesdb_has_population(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as population' ': %s'
                                % (repr(cmd.name),))
                    # Update bayesdb_population.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_population SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, population_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # If population has implicit generator, rename it too.
                    if core.bayesdb_population_has_implicit_generator(
                            bdb, population_id):
                        generators = core.bayesdb_population_generators(
                            bdb, population_id)
                        assert len(generators) == 1
                        generator_name = core.bayesdb_generator_name(
                            bdb, generators[0])
                        qp = sqlite3_quote_name(cmd.name)
                        qg = sqlite3_quote_name(generator_name)
                        bdb.execute('ALTER GENERATOR %s RENAME TO %s'
                            % (qg, qp,))
                    # Remember the new name for subsequent commands.
                    population = cmd.name
                elif isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(
                            bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'No such variable in base table: %s'
                            % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(
                            bdb, population_id, None, cmd.name):
                        raise BQLError(bdb,
                            'Variable already in population: %s'
                            % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb,
                            'Cannot add variable without any values: %s'
                            % (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute(
                            'SELECT %s FROM %s' % (qc, qt))
                        rows = cursor.fetchall()
                        [stattype, reason] = bayesdb_guess_stattypes(
                            [cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(bdb,
                                'Values in column %s appear to be keys.'
                                % (cmd.name,))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(bdb,
                                'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name,))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'Numerical column contains string values: %r '
                            % (qc,))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(
                            bdb, population_id, cmd.name, stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) generator in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no generators are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(bdb,
                            'Cannot update statistical types for population '
                            '%s, it has generators: %s'
                            % (repr(population), repr(generators),))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not
                        core.bayesdb_has_variable(bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(bdb,
                            'No such variables in population: %s'
                            % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid statistical type: %r'
                            % (repr(cmd.stattype),))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(bdb,
                                'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars,))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(
                            bdb, population_id, None, c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno,) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos,)
                    bdb.sql_execute(
                        update_stattype_sql,
                        (casefold(cmd.stattype), population_id,))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' %
                (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)

        # Find the backend, or use the default.
        backend_name = phrase.backend
        if phrase.backend is None:
            backend_name = 'cgpm'
        if backend_name not in bdb.backends:
            raise BQLError(bdb, 'No such backend: %s' %
                (repr(backend_name),))
        backend = bdb.backends[backend_name]

        # Retrieve the (possibility implicit) generator name.
        generator_name = phrase.name or phrase.population
        implicit = 1 if phrase.name is None else 0

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, generator_name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(generator_name),))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute('''
                    INSERT INTO bayesdb_generator
                        (name, population_id, backend, implicit)
                        VALUES (?, ?, ?, ?)
                ''', (generator_name, population_id, backend.name(), implicit))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, generator_name)
                # Do any backend-specific initialization.
                backend.create_generator(bdb, generator_id, phrase.schema)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            backend = core.bayesdb_generator_backend(bdb, generator_id)

            # Backend-specific destruction.
            backend.drop_generator(bdb, generator_id)

            # Drop latent variables, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_variable WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    population_id = core.bayesdb_generator_population(
                        bdb, generator_id)
                    population = core.bayesdb_population_name(
                        bdb, population_id)
                    # Prevent renaming of implicit generator directly, unless
                    # being called by ast.AlterPopRenamePop in which case the
                    # population name and generator name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, generator_id) \
                            and casefold(generator) == casefold(population):
                        raise BQLError(bdb, 'Cannot rename implicit '
                            'generator; rename base population instead')
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' %
                                (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos if not
                    core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(bdb,
                        'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the backend.
                backend = core.bayesdb_generator_backend(bdb, generator_id)
                backend.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
                if 0 < len(existing):
                    raise BQLError(bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno)
                    VALUES (:generator_id, :modelno)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                })

            # Do backend-specific initialization.
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            backend.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        # WARNING: It is the backend's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the backend's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the backend can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        backend.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
            program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(
                bdb, None, phrase.generator)
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            backend.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' % (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the generator
        generator_id = None
        if phrase.generator:
            if not core.bayesdb_has_generator(bdb, population_id,
                    phrase.generator):
                raise BQLError(bdb,
                    'No such generator: %r' % (phrase.generator,))
            generator_id = core.bayesdb_get_generator(
                bdb, population_id, phrase.generator)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(
                bdb, population_id, None, phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target,))
        colno_target = core.bayesdb_variable_number(
            bdb, population_id, None, phrase.target)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
            generator_id, colno_target)
        if stattype != 'numerical':
            raise BQLError(bdb,
                'Target variable is not numerical: %r' % (phrase.target,))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(
                    bdb, population_id, None, given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(
            colno for colno in colno_givens if colno!= colno_target
        )
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(
            bdb, population_id, generator_id, modelnos, constraints,
            colnos, numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, generator_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(
            target_values, given_values, given_names, stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder('''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt,), ())
        for variable, coef in coefficients:
            out.winder('''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (variable, coef,))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt,))
        out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    assert False                # XXX
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    The file must be a pickle, optionally gzipped, of a dict with a
    ``schema`` entry (a dict mapping each column name to a dict that
    has a ``cctype``) and a ``models`` entry (a dict mapping external
    model numbers to theta dicts, each with an ``X_L`` holding a
    ``view_state`` list).  The models are appended to the generator,
    numbered consecutively after its current highest model number.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    :raises ValueError: if `metamodel` is not ``crosscat``, if
        `ifnotexists` is given without `create`, if `table` does not
        exist, if the generator exists (or does not) contrary to
        `create`/`ifnotexists`, if the generator is for another table,
        or if the legacy schema mismatches a generator that already
        has models
    :raises IOError: if the content of the file is not a well-formed
        legacy model
    """

    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    if ifnotexists and not create:
        raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    #
    # XXX SECURITY: Unpickling can execute arbitrary code.  Only load
    # legacy model files that come from a trusted source.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if 'cctype' not in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    column_stattypes = dict((casefold(column_name),
                             casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table),))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                count_models_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                # Execute the query text, not the bdb object itself.
                cursor = bdb.sql_execute(count_models_sql, (generator_id,))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                        (repr(generator),))
                # No models yet: recreate the generator with the
                # legacy schema.
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg,))
                bayesdb_create_legacy_generator(bdb, generator, table,
                    column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator),))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:      # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                    (modelno,))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                    ': no view_state in X_L[%u]' %
                    (modelno,))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                        ' in view state %u of X_L[%u]' % (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                        ': non-list for view %u columns in X_L[%u]'
                        % (viewno, modelno))
                for i, name in enumerate(view_column_names):
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                            (repr(table), repr(name)))
                    # Canonicalize the case, in place, so the theta
                    # stored below uses the table's own column names.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    view_column_names[i] = \
                        core.bayesdb_table_column_name(bdb, table, colno)

        # Determine where to start numbering the new models.  Look the
        # generator up again: it may just have been (re)created above.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id,))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models)):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            # Fall back to zero if the theta carries no usable
            # iteration count.
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(insert_model_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'iterations': iterations,
            })
            bdb.sql_execute(insert_theta_json_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'theta_json': json.dumps(theta),
            })
Exemple #27
0
    def parse(self, schema):
        """Parse the given `schema` for a `composer` metamodel.

        An example of a schema is::

            CREATE GENERATOR foo FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner
                    CATEGORICAL, Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL, Apogee_km
                    NUMERICAL, Eccentricity NUMERICAL, Launch_Mass_kg NUMERICAL,
                    Dry_Mass_kg NUMERICAL, Power_watts NUMERICAL, Date_of_Launch
                    NUMERICAL, Anticipated_Lifetime NUMERICAL, Contractor
                    CATEGORICAL, Country_of_Contractor CATEGORICAL, Launch_Site
                    CATEGORICAL, Launch_Vehicle CATEGORICAL,
                    Source_Used_for_Orbital_Data CATEGORICAL,
                    longitude_radians_of_geo NUMERICAL, Inclination_radians
                    NUMERICAL
                ),
                random_forest (
                    Type_of_Orbit CATEGORICAL
                        GIVEN Apogee_km, Perigee_km,
                            Eccentricity, Period_minutes, Launch_Mass_kg,
                            Power_watts, Anticipated_Lifetime, Class_of_orbit
                ),
                keplers_law (
                    Period_minutes NUMERICAL
                        GIVEN Perigee_km, Apogee_km
                ),
                multiple_regression (
                    Anticipated_Lifetime NUMERICAL
                        GIVEN Dry_Mass_kg, Launch_Mass_kg, Purpose
                ),
                dependent(Launch_Mass_kg, Dry_Mass_kg, Power_watts),
                dependent(Perigee_km, Apogee_km),
                independent(Operator_Owner, Inclination_radians)
            );

        The schema must adhere to the following rules:

        - Default metamodel is identified by `default` or
          `crosscat`. Every `colname` must have its `stattype`
          declared. IGNORE and GUESS(*) are forbidden.

        - Foreign predictors are identified by the `name()` method of
          the object used when `Composer.register_foreign_predictor`
          was invoked.

        For example::

            >> from bdbcontrib.foreign.random_forest import RandomForest
            >> composer.register_foreign_predictor(random_forest.RandomForest)
            >> RandomForest.name()
            random_forest

        The grammar inside foreign predictor directives is::

            <target> <stattype> GIVEN <condition> [...[condition]]

        All columns specified in `dependent` and `independent`
        directives must be modeled by the `default` metamodel.

        The `schema` passed in is not modified.

        Parameters
        ----------
        schema : list<list>
            The `schema` as parsed by bayesdb.

        Returns
        -------
        columns : dict(str:str)
            A dict(colname:stattype) mapping every `colname` declared in
            `schema` to its `stattype`.

        lcols : list<str>
            A list of columns modeled by `default` model.

        fcols : list<str>
            A list of columns modeled by foreign predictor.

        fcol_to_pcols : dict(str:list<str>)
            A dict(fcol:conditions) mapping `fcol` to a list of its parent
            columns.

        fcol_to_fpred : dict(str:str)
            A dict(fcol:fpred) mapping `fcol` to the name of its foreign
            predictor. The values in the dictionary are keys in
            `self.predictor_builder`.

        dependencies : list(<tuple(<bool>,<list<str>)>)
            A list of dependency constraints. Each entry in the list
            is a tuple.  For example, (True, ['foo', 'bar', 'baz'])
            means the three variables are mutually and pairwise
            *dependent*.

        Raises
        ------
        ValueError
            If a directive is unknown, malformed or truncated, a
            stattype is invalid, a column is modeled more than once, a
            parent column of a foreign predictor has no declared
            stattype, or a dependency constraint names a column that
            is not modeled by the default metamodel.

        """
        # Allowed keywords.  Build a real list from the registry so the
        # concatenation does not depend on keys() returning a list.
        DIRECTIVES = ['crosscat', 'default', 'dependent', 'independent'] + \
            list(self.predictor_builder)
        STATTYPES = ['numerical', 'categorical']

        def take(tokens, directive, expected):
            # Next casefolded token of a directive; a directive that
            # ends early is a schema error, not an IndexError.
            token = next(tokens, None)
            if token is None:
                raise ValueError('Incomplete "{}" directive: expected {}.'\
                    .format(directive, expected))
            return casefold(token)

        def names(tokens):
            # Remaining casefolded tokens, comma separators dropped.
            folded = (casefold(token) for token in tokens)
            return [name for name in folded if name != ',']

        # Data structures to return.
        columns = {}
        lcols = []
        fcols = []
        fcol_to_pcols = dict()
        fcol_to_fpred = dict()
        dependencies = []
        # Parse!  Tokens are read through iterators so that the
        # caller's schema lists are left intact.
        for block in schema:
            if len(block) == 0:
                continue
            directive = casefold(block[0])
            if directive not in DIRECTIVES:
                raise ValueError('Unknown directive "{}".\n'
                    'Available directives: {}.'.format(directive, DIRECTIVES))
            commands = block[1] if 1 < len(block) else None
            if not isinstance(commands, list):
                raise ValueError('Unknown commands in "{}" directive: {}.'\
                        .format(directive, commands))
            if directive == 'default' or directive == 'crosscat':
                tokens = iter(commands)
                for token in tokens:
                    c = casefold(token)
                    if c == ',':
                        continue
                    # The stattype is the token right after the column.
                    s = take(tokens, directive,
                        'a stattype for column "{}"'.format(c))
                    if s not in STATTYPES:
                        raise ValueError('Invalid stattype "{}".'.format(s))
                    columns[c] = s
                    lcols.append(c)
            elif directive == 'independent':
                dependencies.append((False, names(commands)))
            elif directive == 'dependent':
                dependencies.append((True, names(commands)))
            elif directive in self.predictor_builder:
                tokens = iter(commands)
                c = take(tokens, directive, 'a target column')
                s = take(tokens, directive, 'a stattype')
                if s not in STATTYPES:
                    raise ValueError('Invalid stattype "{}".'.format(s))
                columns[c] = s
                given = take(tokens, directive, 'the GIVEN keyword')
                if given != 'given':
                    raise ValueError('Expected GIVEN keyword, received: {}.'\
                            .format(given))
                # Everything after GIVEN is a parent column.
                fcols.append(c)
                fcol_to_pcols[c] = names(tokens)
                fcol_to_fpred[c] = directive
        # Unique lcols.
        if len(lcols) != len(set(lcols)):
            raise ValueError('Duplicate default columns encountered: {}.'\
                .format(lcols))
        # Unique fcols.
        if len(fcols) != len(set(fcols)):
            raise ValueError('Duplicate foreign columns encountered: {}.'\
                .format(fcols))
        # All stattypes declared.
        for pcols in fcol_to_pcols.values():
            for r in pcols:
                if r not in columns:
                    raise ValueError('No stattype declaration for "{}".'\
                        .format(r))
        # No col both lcol and fcol.
        for l in lcols:
            if l in fcol_to_pcols:
                raise ValueError('Column "{}" can only be modeled once.'\
                    .format(l))
        # No non-default dependencies.
        for dep in dependencies:
            for col in dep[1]:
                if col not in lcols:
                    raise ValueError('Column "{}" with dependency constraint '
                        'must have default model.'.format(col))
        # Return the hodgepodge.
        return (columns, lcols, fcols, fcol_to_pcols, fcol_to_fpred,
            dependencies)
Exemple #28
0
def bayesdb_has_stattype(bdb, stattype):
    """Return true if `stattype` is registered in the `bdb` instance.

    Counts the rows of ``bayesdb_stattype`` whose name equals the
    casefolded `stattype`.
    """
    bindings = {'stattype': casefold(stattype)}
    count = cursor_value(bdb.sql_execute(
        'SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype',
        bindings))
    return count > 0
Exemple #29
0
def bayesdb_stattype_affinity(_bdb, stattype):
    """Return the `_STATTYPE_TO_AFFINITY` entry for `stattype`.

    The table is indexed by the casefolded name.  `_bdb` is only used
    to assert that `stattype` is registered.
    """
    assert bayesdb_has_stattype(_bdb, stattype)
    key = casefold(stattype)
    affinity = _STATTYPE_TO_AFFINITY[key]
    return affinity
Exemple #30
0
 def p_dist_name(self, dist):
     """Return the casefolded form of the distribution name `dist`."""
     # NOTE(review): presumably a grammar reduction action -- confirm.
     folded = casefold(dist)
     return folded
 def p_foreign_name(self, foreign):
     """Return the casefolded form of the foreign name `foreign`."""
     # NOTE(review): presumably a grammar reduction action -- confirm.
     folded = casefold(foreign)
     return folded
Exemple #31
0
def bayesdb_read_csv(bdb, table, f, header=False,
        create=False, ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if `create` is given without `header`, if
        `ifnotexists` is given without `create`, if the table exists and
        `create` was requested without `ifnotexists`, or if the table
        does not exist and `create` was not requested
    :raises IOError: if the header is missing or empty, contains a blank,
        duplicate or unknown column name, or if a data row has the wrong
        number of fields

    All database work happens inside a single ``bdb.savepoint()``.
    """
    if not header and create:
        raise ValueError('Can\'t create table from headerless CSV!')
    if not create and ifnotexists:
        raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            if any(len(c) == 0 for c in column_names):
                raise IOError(
                    'Missing column names in header: %s' %repr(column_names))
            # Detect duplicates on the case-folded names.
            seen = set()
            duplicates = set()
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in seen:
                    duplicates.add(name_folded)
                else:
                    seen.add(name_folded)
            if duplicates:
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = [sqlite3_quote_name(name) for name in column_names]
                schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        # Materialize as a list: the quoted names are consumed twice below.
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            # reader.line_num counts the source lines consumed so far, so
            # it names the line on which the current record ends.  A
            # hand-kept counter that is never advanced would report the
            # same line for every bad row.
            line = reader.line_num
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
Exemple #32
0
    def p_var_name(self, var):
        """Return `var` case-folded."""
        return casefold(var)

    def p_stattype_s(self, st):
        """Return `st` unchanged."""
        return st
Exemple #33
0
def bayesdb_guess_stattypes(column_names, rows, null_values=None,
        numcat_count=None, numcat_ratio=None, distinct_ratio=None,
        nullify_ratio=None, overrides=None):
    """Heuristically guess a statistical type for each column of `rows`.

    Returns a list with one statistical type per entry of
    `column_names`, in the same order.

    :param set null_values: values to nullify.
    :param int numcat_count: number of distinct values below which
        columns whose values can all be parsed as numbers will be
        considered categorical anyway
    :param real numcat_ratio: ratio of distinct values to total values
        below which columns whose values can all be parsed as numbers
        will be considered categorical anyway
    :param real distinct_ratio: ratio of distinct values to total values
        above which a column will be ignored as a pseudo-key
        (only if count > numcat_count).
    :param real nullify_ratio: ratio of count of the most numerous value to
        total number of values above which the most numerous value should be
        nullified (set to 1 to turn off).
    :param list overrides: list of ``(name, stattype)``, overriding
        any guessed statistical type for columns by those names

    Besides statistical types, an override may also be ``key`` or
    ``ignore``.

    :raises ValueError: on duplicate column names, unknown or duplicate
        overrides, rows of the wrong width, more than one ``key``
        override, or a ``key`` override on a column that is not keyable
    """

    # Substitute defaults for every argument left as None.
    null_values = set(("", "N/A", "none", "None")) \
        if null_values is None else null_values
    numcat_count = 20 if numcat_count is None else numcat_count
    numcat_ratio = 0.02 if numcat_ratio is None else numcat_ratio
    distinct_ratio = 0.9 if distinct_ratio is None else distinct_ratio
    nullify_ratio = 0.9 if nullify_ratio is None else nullify_ratio
    overrides = [] if overrides is None else overrides

    # Collect the case-folded column names, noting any repeats.
    column_name_set = set()
    repeated = set()
    for name in column_names:
        folded = casefold(name)
        if folded in column_name_set:
            repeated.add(name)
        column_name_set.add(folded)
    if repeated:
        raise ValueError('Duplicate column names: %s' %
            (repr(list(repeated)),))

    # Index the overrides by case-folded column name.
    #
    # XXX Support more than just stattype: allow arbitrary column
    # descriptions.
    override_map = {}
    unknown = set()
    overridden_twice = set()
    for name, stattype in overrides:
        folded = casefold(name)
        if folded not in column_name_set:
            unknown.add(name)
        elif folded in override_map:
            overridden_twice.add(name)
        else:
            override_map[folded] = casefold(stattype)
    if unknown:
        raise ValueError('Unknown columns overridden: %s' %
            (repr(list(unknown)),))
    if overridden_twice:
        raise ValueError('Duplicate columns overridden: %s' %
            (repr(list(overridden_twice)),))

    # Every row must be exactly as wide as the list of column names.
    ncols = len(column_names)
    assert ncols == len(unique([casefold(name) for name in column_names]))
    for ri, row in enumerate(rows):
        width = len(row)
        if width < ncols:
            raise ValueError('Row %d: Too few columns: %d < %d' %
                (ri, width, ncols))
        if width > ncols:
            raise ValueError('Row %d: Too many columns: %d > %d' %
                (ri, width, ncols))

    # An override may designate the key; settle that before guessing.
    key = None
    extra_keys = set()
    for ci, column_name in enumerate(column_names):
        if override_map.get(casefold(column_name)) != 'key':
            continue
        if key is not None:
            extra_keys.add(column_name)
            continue
        values = [row[ci] for row in rows]
        # Use the integerified values when integerify yields any.
        values = integerify(values) or values
        if not keyable_p(values):
            raise ValueError('Column non-unique but specified as key'
                ': %s' % (repr(column_name),))
        key = column_name
    if extra_keys:
        raise ValueError('Multiple columns overridden as keys: %s' %
            (repr(list(extra_keys)),))

    # Take the override where there is one; otherwise guess.
    stattypes = []
    for ci, column_name in enumerate(column_names):
        folded = casefold(column_name)
        if folded in override_map:
            stattypes.append(override_map[folded])
            continue
        guessed = guess_column_stattype(
            nullify(null_values, rows, ci),
            distinct_ratio=distinct_ratio,
            nullify_ratio=nullify_ratio,
            numcat_count=numcat_count,
            numcat_ratio=numcat_ratio,
            have_key=(key is not None))
        if guessed == 'key':
            # Later columns are guessed with have_key=True.
            key = column_name
        stattypes.append(guessed)
    return stattypes
def _is_nominal(stattype):
    """True if `stattype` case-folds to 'nominal' or 'unbounded_nominal'."""
    nominal_types = ('nominal', 'unbounded_nominal')
    return casefold(stattype) in nominal_types
Exemple #35
0
    def dot_describe(self, line):
        """describe BayesDB entities
        [table(s)|generator(s)|columns|model(s)] [<name>...]

        Print a human-readable description of the specified BayesDB
        entities.
        """
        # NOTE(review): the docstring is kept verbatim in case the shell
        # shows it as help text -- confirm before rewording it.
        write = self.stdout.write

        def usage():
            # One write per usage line.
            for text in (
                    "Usage: .describe table(s) [<table>...]\n",
                    "       .describe generator(s) [<gen>...]\n",
                    "       .describe columns <gen>\n",
                    "       .describe model(s) <gen> [<model>...]\n"):
                write(text)

        # XXX Lousy, lousy tokenizer.
        tokens = line.split()
        if not tokens:
            usage()
            return
        kind = casefold(tokens[0])

        if kind in ("table", "tables"):
            if len(tokens) == 1:
                # No names given: describe every table.
                params = ()
                qualifier = "1"
            else:
                params = tokens[1:]
                qualifier = "(%s)" % (
                    " OR ".join("tabname = ?" for _p in params),)
                # Report every unknown table before giving up.
                nmissing = 0
                for table in params:
                    if not core.bayesdb_has_table(self._bdb, table):
                        write("No such table: %s\n" % (repr(table),))
                        nmissing += 1
                if nmissing:
                    return
                for table in params:
                    core.bayesdb_table_guarantee_columns(self._bdb, table)
            sql = """
                SELECT tabname, colno, name, shortname
                    FROM bayesdb_column
                    WHERE %s
                    ORDER BY tabname ASC, colno ASC
            """ % (qualifier,)
            with self._bdb.savepoint():
                cursor = self._bdb.execute(sql, params)
                pretty.pp_cursor(self.stdout, cursor)

        elif kind in ("generator", "generators"):
            if len(tokens) == 1:
                # No names given: describe every generator.
                params = ()
                qualifier = "1"
            else:
                params = tokens[1:]
                # Numbered placeholders, since each name is matched twice.
                placeholders = ",".join(
                    "?%d" % (n,) for n in range(1, len(params) + 1))
                qualifier = """
                    (name IN ({names}) OR (defaultp AND tabname IN ({names})))
                """.format(names=placeholders)
                # Report every unknown generator before giving up.
                nmissing = 0
                for generator in params:
                    if not core.bayesdb_has_generator_default(
                            self._bdb, generator):
                        write("No such generator: %s\n" % (repr(generator),))
                        nmissing += 1
                if nmissing:
                    return
            sql = """
                SELECT id, name, tabname, metamodel
                    FROM bayesdb_generator
                    WHERE %s
            """ % (qualifier,)
            with self._bdb.savepoint():
                cursor = self._bdb.sql_execute(sql, params)
                pretty.pp_cursor(self.stdout, cursor)

        elif kind == "columns":
            if len(tokens) != 2:
                write("Describe columns of what generator?\n")
                return
            generator = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator_default(
                        self._bdb, generator):
                    write("No such generator: %s\n" % (repr(generator),))
                    return
                generator_id = core.bayesdb_get_generator_default(
                    self._bdb, generator)
                sql = """
                    SELECT c.colno AS colno, c.name AS name,
                            gc.stattype AS stattype, c.shortname AS shortname
                        FROM bayesdb_generator AS g,
                            (bayesdb_column AS c LEFT OUTER JOIN
                                bayesdb_generator_column AS gc
                                USING (colno))
                        WHERE g.id = ? AND g.id = gc.generator_id
                            AND g.tabname = c.tabname
                        ORDER BY colno ASC;
                """
                pretty.pp_cursor(
                    self.stdout, self._bdb.sql_execute(sql, (generator_id,)))

        elif kind in ("model", "models"):
            if len(tokens) < 2:
                write("Describe models of what generator?\n")
                return
            generator = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator_default(
                        self._bdb, generator):
                    write("No such generator: %s\n" % (repr(generator),))
                    return
                generator_id = core.bayesdb_get_generator_default(
                    self._bdb, generator)
                if len(tokens) == 2:
                    # No model numbers given: describe every model.
                    qualifier = "1"
                else:
                    modelnos = []
                    for token in tokens[2:]:
                        try:
                            modelno = int(token)
                        except ValueError:
                            write("Invalid model number: %s\n" %
                                (repr(token),))
                            return
                        if not core.bayesdb_generator_has_model(
                                self._bdb, generator_id, modelno):
                            write("No such model: %d\n" % (modelno,))
                            return
                        modelnos.append(modelno)
                    # Only validated integers are spliced into the SQL.
                    qualifier = "modelno IN (%s)" % (
                        ",".join(str(m) for m in modelnos),)
                sql = """
                    SELECT modelno, iterations FROM bayesdb_generator_model
                        WHERE generator_id = ? AND %s
                """ % (qualifier,)
                pretty.pp_cursor(
                    self.stdout, self._bdb.sql_execute(sql, (generator_id,)))

        else:
            usage()
def _is_countable(stattype):
    """True if `stattype` case-folds to 'counts' or 'boolean'."""
    countable_types = ('counts', 'boolean')
    return casefold(stattype) in countable_types
Exemple #37
0
def instantiate_generator(bdb, gen_name, table, metamodel, columns,
                          default=None):
    """Record a generator for `table` and resolve its modelled columns.

    If no generator named `gen_name` exists yet, a ``bayesdb_generator``
    row and one ``bayesdb_generator_column`` row per column are inserted;
    if one already exists, the columns are only validated.

    :param bdb: BayesDB instance, used through ``sql_execute``
    :param gen_name: name of the generator
    :param table: name of the table the generator is for
    :param metamodel: object whose ``name()`` is stored with the generator
    :param columns: re-iterable sequence of ``(name, stattype)`` pairs
    :param default: stored as ``defaultp``; None is treated as False
    :return: ``(generator_id, column_list)``, where `column_list` is the
        sorted list of ``(colno, name, stattype)`` triples
    :raises BQLError: if `gen_name` names a table, or if any column is
        missing from `table`, repeated, or has an unregistered
        statistical type
    """
    if default is None:
        default = False

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, 'Name already defined as table: %s' %
            (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    generator_already_existed = bool(core.bayesdb_has_generator(bdb, gen_name))
    if not generator_already_existed:
        # Create the generator record.
        generator_sql = '''INSERT INTO bayesdb_generator
                           (name, tabname, metamodel, defaultp)
                           VALUES (:name, :table, :metamodel, :defaultp)'''
        bdb.sql_execute(generator_sql, {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': default,
        })
    generator_id = core.bayesdb_get_generator(bdb, gen_name)

    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        # Validate the case-folded stattype, since that is the form
        # inserted into bayesdb_generator_column below.  Checking the raw
        # spelling could reject a type that the insert would accept.
        cursor = bdb.sql_execute(stattype_sql, {
            'stattype': casefold(stattype),
        })
        if cursor_value(cursor) == 0:
            # Report the spelling the caller supplied.
            invalid.add(stattype)
            continue
        column_map[name_folded] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %s' %
            (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %s' %
            (repr(list(invalid)),))

    if not generator_already_existed:
        # Insert column records.
        column_sql = '''
            INSERT INTO bayesdb_generator_column
            (generator_id, colno, stattype)
            VALUES (:generator_id, :colno, :stattype)
        '''
        for name, stattype in columns:
            bdb.sql_execute(column_sql, {
                'generator_id': generator_id,
                'colno': column_map[casefold(name)],
                'stattype': casefold(stattype),
            })

    column_list = sorted((column_map[casefold(name)], name, stattype)
        for name, stattype in columns)
    return generator_id, column_list
Exemple #38
0
def scan_name(_scanner, text):
    """Return the keyword token for `text`, or ``grammar.L_NAME``.

    `text` is looked up in ``keywords`` first as written and then
    case-folded; the first truthy hit wins.
    """
    token = keywords.get(text)
    if not token:
        token = keywords.get(casefold(text))
    if not token:
        token = grammar.L_NAME
    return token
Exemple #39
0
def define_correlation_p(stattype0, stattype1, method):
    """Register `method` in ``correlation_p_methods`` for a stattype pair.

    Asserts that both names are already case-folded and that the pair
    ``(stattype0, stattype1)`` has not been registered before.
    """
    for stattype in (stattype0, stattype1):
        assert casefold(stattype) == stattype
    pair = (stattype0, stattype1)
    assert pair not in correlation_p_methods
    correlation_p_methods[pair] = method
Exemple #40
0
def bayesdb_read_csv(bdb,
                     table,
                     f,
                     header=False,
                     create=False,
                     ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if `create` is given without `header`, if
        `ifnotexists` is given without `create`, if the table exists and
        `create` was requested without `ifnotexists`, or if the table
        does not exist and `create` was not requested
    :raises IOError: if the header is missing or empty, contains a
        duplicate or unknown column name, or if a data row has the wrong
        number of fields

    All database work happens inside a single ``bdb.savepoint()``.
    """
    if not header and create:
        raise ValueError('Can\'t create table from headerless CSV!')
    if not create and ifnotexists:
        raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table), ))
        reader = csv.reader(f)
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            # Detect duplicates on the case-folded names.
            seen = set()
            duplicates = set()
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in seen:
                    duplicates.add(name_folded)
                else:
                    seen.add(name_folded)
            if duplicates:
                raise IOError('Duplicate columns in CSV: %s' %
                              (repr(list(duplicates)), ))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = [sqlite3_quote_name(name) for name in column_names]
                schema = ','.join('%s NUMERIC' % (qcn, ) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(
                    name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown), ))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        # Materialize as a list: the quoted names are consumed twice below.
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            # reader.line_num counts the source lines consumed so far, so
            # it names the line on which the current record ends.  A
            # hand-kept counter that is never advanced would report the
            # same line for every bad row.
            line = reader.line_num
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                              (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                              (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
Exemple #41
0
    try:
        cursor.next()
    except StopIteration:
        pass
    else:
        generator = bayesdb_generator_table(bdb, generator_id)
        raise BQLError(bdb, 'More than one such row'
            ' in table %s for generator %s: %d' %
            (repr(table_name), repr(generator), repr(rowid)))
    return row

def bayesdb_generator_fresh_row_id(bdb, generator_id):
    """Return a row id one past the largest ``_rowid_`` in the table.

    The table is the one ``bayesdb_generator_table`` reports for
    `generator_id`.  When ``MAX(_rowid_)`` comes back as None the result
    is 1.
    """
    qt = sqlite3_quote_name(bayesdb_generator_table(bdb, generator_id))
    largest = cursor_value(
        bdb.sql_execute('SELECT MAX(_rowid_) FROM %s' % (qt,)))
    # Synthesize a non-existent SQLite row id.
    return 1 if largest is None else largest + 1

# XXX This mapping belongs in the database, as an extra column of the
# bayesdb_stattype table -- to be done once we are willing to contemplate
# adding statistical types, e.g. COUNT, SCALE, or NONNEGATIVE REAL.
#
# Both the statistical type names and the affinities are case-folded.
_STATTYPE_TO_AFFINITY = {
    casefold(st): casefold(af)
    for st, af in [
        ('categorical', 'text'),
        ('cyclic', 'real'),
        ('numerical', 'real'),
    ]
}
def bayesdb_stattype_affinity(_bdb, stattype):
    """Return the entry of ``_STATTYPE_TO_AFFINITY`` for `stattype`.

    The table is indexed by the case-folded name; `_bdb` is not used.
    """
    folded = casefold(stattype)
    return _STATTYPE_TO_AFFINITY[folded]
Exemple #42
0
def bayesdb_has_stattype(bdb, stattype):
    """Return whether `stattype` is registered in the `bdb` instance.

    The name is case-folded before it is looked up in the
    ``bayesdb_stattype`` table.
    """
    bindings = {'stattype': casefold(stattype)}
    count = cursor_value(bdb.sql_execute(
        'SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype',
        bindings))
    return count > 0
Exemple #43
0
def _create_schema(bdb, generator_id, schema_ast, **kwargs):
    # Get some parameters.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)

    # State.
    variables = []
    variable_dist = {}
    latents = {}
    cgpm_composition = []
    modelled = set()
    default_modelled = set()
    subsample = None
    deferred_input = defaultdict(lambda: [])
    deferred_output = dict()

    # Error-reporting state.
    duplicate = set()
    unknown = set()
    needed = set()
    existing_latent = set()
    must_exist = []
    unknown_stattype = {}

    # XXX Convert all Foreign.exposed lists to Latent clauses.
    # Retrieve Foreign clauses with exposed variables.
    foreign_clauses = [
        c for c in schema_ast
        if isinstance(c, cgpm_schema.parse.Foreign) and len(c.exposed) > 0
    ]
    # Add the exposed variables to Foreign.outputs
    # Note that this assumes if there are K exposed variables, then they are
    # necessarily the last K outputs of the fc.outputs.
    for fc in foreign_clauses:
        fc.outputs.extend([e[0] for e in fc.exposed])

    # Convert exposed entries into Latent clauses.
    latent_vars = list(
        itertools.chain.from_iterable(c.exposed for c in foreign_clauses))
    latent_clauses = [cgpm_schema.parse.Latent(v, s) for (v, s) in latent_vars]
    # Append the Latent clauses to the ast.
    schema_ast.extend(latent_clauses)

    # XXX Convert the baseline to a Foreign clause.
    # Currently the baselines do not accept a schema, and will fail if
    # `schema_ast` has any entries.
    baseline = kwargs.get('baseline', None)
    if baseline is not None and casefold(baseline.name) != 'crosscat':
        if schema_ast:
            raise BQLError(
                bdb, 'Cannot accept schema with baseline: %s.' % schema_ast)
        # Retrieve all variable names in the population
        outputs = core.bayesdb_variable_names(bdb, population_id, None)
        # Convert the LITERAL namedtuples to their raw values.
        ps, vs = zip(*baseline.params)
        vs_new = [v.value for v in vs]
        params = zip(ps, vs_new)
        # Create the clause.
        clause = cgpm_schema.parse.Foreign(outputs, [], [], baseline.name,
                                           params)
        # And add append it to the schema_ast.
        schema_ast.append(clause)

    # Process each clause one by one.
    for clause in schema_ast:

        if isinstance(clause, cgpm_schema.parse.Basic):
            # Basic Crosscat component model: one variable to be put
            # into Crosscat views.
            var = clause.var
            dist = clause.dist
            params = dict(clause.params)  # XXX error checking

            # Reject if the variable does not exist.
            if not core.bayesdb_has_variable(bdb, population_id, None, var):
                unknown.add(var)
                continue

            # Reject if the variable has already been modelled.
            if var in modelled:
                duplicate.add(var)
                continue

            # Reject if the variable is latent.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Get the column number.
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            assert 0 <= colno

            # Add it to the list and mark it modelled by default.
            stattype = core.bayesdb_variable_stattype(bdb, population_id,
                                                      colno)
            variables.append([var, stattype, dist, params])
            assert var not in variable_dist
            variable_dist[var] = (stattype, dist, params)
            modelled.add(var)
            default_modelled.add(var)

        elif isinstance(clause, cgpm_schema.parse.Latent):
            var = clause.name
            stattype = clause.stattype

            # Reject if the variable has already been modelled by the
            # default model.
            if var in default_modelled:
                duplicate.add(var)
                continue

            # Reject if the variable even *exists* in the population
            # at all yet.
            if core.bayesdb_has_variable(bdb, population_id, None, var):
                duplicate.add(var)
                continue

            # Reject if the variable is already latent, from another
            # generator.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Reject if we've already processed it.
            if var in latents:
                duplicate.add(var)
                continue

            # Add it to the set of latent variables.
            latents[var] = stattype

        elif isinstance(clause, cgpm_schema.parse.Foreign):
            # Foreign model: some set of output variables is to be
            # modelled by foreign logic, possibly conditional on some
            # set of input variables.
            #
            # Gather up the state for a cgpm_composition record, which
            # we may have to do incrementally because it must refer to
            # the distribution types of variables we may not have
            # seen.
            name = clause.name
            outputs = clause.outputs
            inputs = clause.inputs

            output_stattypes = []
            output_statargs = []
            input_stattypes = []
            input_statargs = []
            distargs = {
                'inputs': {
                    'stattypes': input_stattypes,
                    'statargs': input_statargs
                },
                'outputs': {
                    'stattypes': output_stattypes,
                    'statargs': output_statargs,
                }
            }
            kwds = {'distargs': distargs}
            kwds.update(clause.params)

            # First make sure all the output variables exist and have
            # not yet been modelled.
            for var in outputs:
                must_exist.append(var)
                if var in modelled:
                    duplicate.add(var)
                    continue
                modelled.add(var)
                # Add the output statistical type and its parameters.
                i = len(output_stattypes)
                assert i == len(output_statargs)
                output_stattypes.append(None)
                output_statargs.append(None)
                deferred_output[var] = (output_stattypes, output_statargs, i)

            # Next make sure all the input variables exist, mark them
            # needed, and record where to put their distribution type
            # and parameters.
            for var in inputs:
                must_exist.append(var)
                needed.add(var)
                i = len(input_stattypes)
                assert i == len(input_statargs)
                input_stattypes.append(None)
                input_statargs.append(None)
                deferred_input[var].append(
                    (input_stattypes, input_statargs, i))

            # Finally, add a cgpm_composition record.
            cgpm_composition.append({
                'name': name,
                'inputs': inputs,
                'outputs': outputs,
                'kwds': kwds,
            })

        elif isinstance(clause, cgpm_schema.parse.Subsample):
            if subsample is not None:
                raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n, ))
            subsample = clause.n

        else:
            raise BQLError(bdb, 'Unknown clause: %r' % (clause, ))

    # Make sure all the outputs and inputs exist, either in the
    # population or as latents in this generator.
    for var in must_exist:
        if core.bayesdb_has_variable(bdb, population_id, None, var):
            continue
        if var in latents:
            continue
        unknown.add(var)

    # Raise an exception if there were duplicates or unknown
    # variables.
    if duplicate:
        raise BQLError(bdb,
                       'Duplicate model variables: %r' % (sorted(duplicate), ))
    if existing_latent:
        raise BQLError(
            bdb, 'Latent variables already defined: %r' %
            (sorted(existing_latent), ))
    if unknown:
        raise BQLError(bdb,
                       'Unknown model variables: %r' % (sorted(unknown), ))

    def default_dist(var, stattype):
        stattype = casefold(stattype)
        if stattype not in _DEFAULT_DIST:
            if var in unknown_stattype:
                assert unknown_stattype[var] == stattype
            else:
                unknown_stattype[var] = stattype
            return None
        dist, params = _DEFAULT_DIST[stattype](bdb, generator_id, var)
        return dist, params

    # Use the default distribution for any variables that remain to be
    # modelled, excluding any that are latent or that have statistical
    # types we don't know about.
    for var in core.bayesdb_variable_names(bdb, population_id, None):
        if var in modelled:
            continue
        colno = core.bayesdb_variable_number(bdb, population_id, None, var)
        assert 0 <= colno
        stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
        distparams = default_dist(var, stattype)
        if distparams is None:
            continue
        dist, params = distparams
        variables.append([var, stattype, dist, params])
        assert var not in variable_dist
        variable_dist[var] = (stattype, dist, params)
        modelled.add(var)

    # Fill in the deferred_input statistical type assignments.
    for var in sorted(deferred_input.iterkeys()):
        # Check whether the variable is modelled.  If not, skip -- we
        # will fail later because this variable is guaranteed to also
        # be in needed.
        if var not in modelled:
            assert var in needed
            continue

        # Determine (possibly fictitious) distribution and parameters.
        if var in default_modelled:
            # Manifest variable modelled by default Crosscat model.
            assert var in variable_dist
            stattype, dist, params = variable_dist[var]
        else:
            # Modelled by a foreign model.  Assign a fictitious
            # default distribution because the 27B/6 of CGPM requires
            # this.
            if var in latents:
                # Latent variable modelled by a foreign model.  Use
                # the statistical type specified for it.
                stattype = latents[var]
            else:
                # Manifest variable modelled by a foreign model.  Use
                # the statistical type in the population.
                assert core.bayesdb_has_variable(bdb, population_id, None, var)
                colno = core.bayesdb_variable_number(bdb, population_id, None,
                                                     var)
                stattype = core.bayesdb_variable_stattype(
                    bdb, population_id, colno)
            distparams = default_dist(var, stattype)
            if distparams is None:
                continue
            dist, params = distparams

        # Assign the distribution and parameters.
        for cctypes, ccargs, i in deferred_input[var]:
            assert cctypes[i] is None
            assert ccargs[i] is None
            cctypes[i] = dist
            ccargs[i] = params

    # Fill in the deferred_output statistical type assignments. The need to be
    # in the form NUMERICAL or CATEGORICAL.
    for var in deferred_output:
        if var in latents:
            # Latent variable modelled by a foreign model.  Use
            # the statistical type specified for it.
            var_stattype = casefold(latents[var])
            if var_stattype not in _DEFAULT_DIST:
                if var in unknown_stattype:
                    assert unknown_stattype[var] == var_stattype
                else:
                    unknown_stattype[var] = var_stattype
            # XXX Cannot specify statargs for a latent variable. Trying to using
            # default_dist might lookup the counts for unique values of the
            # categorical in the base table causing a failure.
            var_statargs = {}
        else:
            # Manifest variable modelled by a foreign model.  Use
            # the statistical type and arguments from the population.
            assert core.bayesdb_has_variable(bdb, population_id, None, var)
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            distparams = default_dist(var, var_stattype)
            if distparams is None:
                continue
            _, var_statargs = distparams

        stattypes, statargs, i = deferred_output[var]
        assert stattypes[i] is None
        assert statargs[i] is None
        stattypes[i] = var_stattype
        statargs[i] = var_statargs

    if unknown_stattype:
        raise BQLError(
            bdb, 'Unknown statistical types for variables: %r' %
            (sorted(unknown_stattype.iteritems(), )))

    # If there remain any variables that we needed to model, because
    # others are conditional on them, fail.
    needed -= modelled
    if needed:
        raise BQLError(bdb, 'Unmodellable variables: %r' % (needed, ))

    # Finally, create a CGPM schema.
    return {
        'variables': variables,
        'cgpm_composition': cgpm_composition,
        'subsample': subsample,
        'latents': latents,
    }
Exemple #44
0
def instantiate_generator(bdb,
                          gen_name,
                          table,
                          metamodel,
                          columns,
                          default=None):
    """Get or create the generator record named `gen_name` for `table`.

    When no generator by that name exists, a row is inserted into
    bayesdb_generator and, after validation, one row per requested
    column into bayesdb_generator_column.  When one already exists its
    stored records are left untouched, but the requested columns are
    still validated.

    Arguments:
    bdb -- database handle, used through bdb.sql_execute.
    gen_name -- name of the generator.
    table -- name of the table the generator is defined over.
    metamodel -- object whose name() is stored in the metamodel column.
    columns -- sequence of (name, stattype) pairs; it is iterated more
        than once.
    default -- value stored in the defaultp column; None means False.

    Returns a pair (generator_id, column_list), where column_list is
    the sorted list of (colno, name, stattype) triples.

    Raises BQLError if gen_name is already a table, or if the columns
    include names missing from the table, duplicate names, or
    statistical types not listed in bayesdb_stattype.
    """
    defaultp = False if default is None else default

    # A generator may not share its name with a table.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(
            bdb, 'Name already defined as table: %s' % (repr(gen_name), ))

    # bayesdb_column must list every column of the table before the
    # column numbers can be looked up below.
    core.bayesdb_table_guarantee_columns(bdb, table)

    preexisting = core.bayesdb_has_generator(bdb, gen_name)
    if not preexisting:
        # Create the generator record.
        generator_sql = '''INSERT INTO bayesdb_generator
                           (name, tabname, metamodel, defaultp)
                           VALUES (:name, :table, :metamodel, :defaultp)'''
        bindings = {
            'name': gen_name,
            'table': table,
            'metamodel': metamodel.name(),
            'defaultp': defaultp,
        }
        bdb.sql_execute(generator_sql, bindings)
    generator_id = core.bayesdb_get_generator(bdb, gen_name)

    assert generator_id
    assert 0 < generator_id

    # Build the map from case-folded column name to colno, collecting
    # every problem before reporting any:
    # - duplicate names,
    # - names that are not columns of the table,
    # - statistical types that are not registered.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for name, stattype in columns:
        key = casefold(name)
        if key in column_map:
            duplicates.add(name)
            continue
        lookup = bdb.sql_execute(colno_sql, {
            'table': table,
            'column_name': name,
        })
        try:
            row = lookup.next()
        except StopIteration:
            # No row: the table has no such column.
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        n_known = cursor_value(
            bdb.sql_execute(stattype_sql, {
                'stattype': stattype,
            }))
        if n_known == 0:
            invalid.add(stattype)
            continue
        column_map[key] = colno

    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb, 'No such columns in table %s: %s' %
            (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(
            bdb, 'Duplicate column names: %s' % (repr(list(duplicates)), ))
    if invalid:
        raise BQLError(
            bdb, 'Invalid statistical types: %s' % (repr(list(invalid)), ))

    if not preexisting:
        # Insert column records, storing the case-folded stattype.
        column_sql = '''
            INSERT INTO bayesdb_generator_column
            (generator_id, colno, stattype)
            VALUES (:generator_id, :colno, :stattype)
        '''
        for name, stattype in columns:
            record = {
                'generator_id': generator_id,
                'colno': column_map[casefold(name)],
                'stattype': casefold(stattype),
            }
            bdb.sql_execute(column_sql, record)

    # The returned triples carry the caller's original spelling of
    # each name and stattype, ordered by column number.
    column_list = [(column_map[casefold(name)], name, stattype)
                   for name, stattype in columns]
    column_list.sort()
    return generator_id, column_list
Exemple #45
0
def _is_countable(stattype):
    """Return True if `stattype` case-folds to 'counts' or 'boolean'."""
    folded = casefold(stattype)
    return folded == 'counts' or folded == 'boolean'
Exemple #46
0
def instantiate_generator(bdb, gen_name, table, metamodel, columns, ifnotexists=None, default=None):
    """Create the generator `gen_name` for `table`, or reuse an existing one.

    Arguments:
    bdb -- database handle, used through bdb.sql_execute.
    gen_name -- name of the generator.
    table -- name of the table the generator is defined over.
    metamodel -- object whose name() is stored in the metamodel column.
    columns -- sequence of (name, stattype) pairs; it is iterated more
        than once.
    ifnotexists -- if true, an existing generator named gen_name is
        tolerated and reused; None means False.
    default -- value stored in the defaultp column; None means False.

    Returns a pair (generator_id, column_list), where column_list is
    the sorted list of (colno, name, stattype) triples.

    Raises BQLError if gen_name is already a table, if it is already a
    generator and ifnotexists is false, or if the columns include
    names missing from the table, duplicate names, or statistical
    types not listed in bayesdb_stattype.
    """
    if ifnotexists is None:
        ifnotexists = False
    if default is None:
        default = False

    # Make sure there is no table by this name.
    if core.bayesdb_has_table(bdb, gen_name):
        raise BQLError(bdb, "Name already defined as table: %s" % (repr(gen_name),))

    # Make sure there's no generator by this name unless we were asked
    # to tolerate an existing one.
    generator_already_existed = core.bayesdb_has_generator(bdb, gen_name)
    if generator_already_existed and not ifnotexists:
        raise BQLError(bdb, "Name already defined as generator: %s" % (repr(gen_name),))

    # Make sure the bayesdb_column table knows all the columns.
    core.bayesdb_table_guarantee_columns(bdb, table)

    if generator_already_existed:
        # Look the id up by name.  An ignored INSERT does not update
        # the last-insert rowid, so cursor.lastrowid would be stale or
        # unset here and must not be used as the generator id.
        generator_id = core.bayesdb_get_generator(bdb, gen_name)
    else:
        # Create the generator record.
        generator_sql = """
            INSERT INTO bayesdb_generator
                (name, tabname, metamodel, defaultp)
                VALUES (:name, :table, :metamodel, :defaultp)
        """
        cursor = bdb.sql_execute(
            generator_sql, {"name": gen_name, "table": table, "metamodel": metamodel.name(), "defaultp": default}
        )
        generator_id = cursor.lastrowid
    assert generator_id
    assert 0 < generator_id

    # Get a map from column name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    column_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = """
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    """
    stattype_sql = """
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    """
    for name, stattype in columns:
        name_folded = casefold(name)
        if name_folded in column_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {"table": table, "column_name": name})
        try:
            row = cursor.next()
        except StopIteration:
            # No row: the table has no such column.
            missing.add(name)
            continue
        else:
            colno = row[0]
            assert isinstance(colno, int)
            cursor = bdb.sql_execute(stattype_sql, {"stattype": stattype})
            if cursor_value(cursor) == 0:
                invalid.add(stattype)
                continue
            column_map[name_folded] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, "No such columns in table %s: %s" % (repr(table), repr(list(missing))))
    if duplicates:
        raise BQLError(bdb, "Duplicate column names: %s" % (repr(list(duplicates)),))
    if invalid:
        raise BQLError(bdb, "Invalid statistical types: %s" % (repr(list(invalid)),))

    if not generator_already_existed:
        # Insert column records.  An existing generator already has
        # its column records, so inserting them again would either
        # fail or attach them to the wrong generator.
        column_sql = """
            INSERT INTO bayesdb_generator_column
                (generator_id, colno, stattype)
                VALUES (:generator_id, :colno, :stattype)
        """
        for name, stattype in columns:
            colno = column_map[casefold(name)]
            stattype = casefold(stattype)
            bdb.sql_execute(column_sql, {"generator_id": generator_id, "colno": colno, "stattype": stattype})

    # The returned triples carry the caller's original spelling of
    # each name and stattype, ordered by column number.
    column_list = sorted((column_map[casefold(name)], name, stattype) for name, stattype in columns)
    return generator_id, column_list
Exemple #47
0
    def dot_describe(self, line):
        '''describe BayesDB entities
        [table(s)|generator(s)|columns|model(s)] [<name>...]

        Print a human-readable description of the specified BayesDB
        entities.
        '''
        def show_usage():
            # One write per usage line, in display order.
            for text in (
                    'Usage: .describe table(s) [<table>...]\n',
                    '       .describe generator(s) [<gen>...]\n',
                    '       .describe columns <gen>\n',
                    '       .describe model(s) <gen> [<model>...]\n'):
                self.stdout.write(text)

        # XXX Lousy, lousy tokenizer.
        tokens = line.split()
        if not tokens:
            show_usage()
            return

        # The first token selects what to describe; the rest are names.
        what = casefold(tokens[0])
        args = tokens[1:]

        if what in ('table', 'tables'):
            if not args:
                # No names given: describe the columns of every table.
                params = ()
                qualifier = '1'
            else:
                params = args
                qualifier = \
                    '(' + ' OR '.join('tabname = ?' for _ in params) + ')'
                # Report every unknown table before giving up.
                n_unknown = 0
                for table in params:
                    if core.bayesdb_has_table(self._bdb, table):
                        continue
                    self.stdout.write('No such table: %s\n' %
                                      (repr(table),))
                    n_unknown += 1
                if n_unknown:
                    return
                for table in params:
                    core.bayesdb_table_guarantee_columns(self._bdb, table)
            sql = '''
                SELECT tabname, colno, name, shortname
                    FROM bayesdb_column
                    WHERE %s
                    ORDER BY tabname ASC, colno ASC
            ''' % (qualifier,)
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
            return

        if what in ('generator', 'generators'):
            if not args:
                # No names given: describe every generator.
                params = ()
                qualifier = '1'
            else:
                params = args
                # Numbered placeholders (?1,?2,...) so the same
                # bindings serve both IN lists of the qualifier.
                names = ','.join(
                    '?%d' % (n,) for n in range(1, len(params) + 1))
                qualifier = '''
                    (name IN ({names}) OR (defaultp AND tabname IN ({names})))
                '''.format(names=names)
                # Report every unknown generator before giving up.
                n_unknown = 0
                for generator in params:
                    if core.bayesdb_has_generator_default(self._bdb,
                            generator):
                        continue
                    self.stdout.write('No such generator: %s\n' %
                        (repr(generator),))
                    n_unknown += 1
                if n_unknown:
                    return
            sql = '''
                SELECT id, name, tabname, metamodel
                    FROM bayesdb_generator
                    WHERE %s
            ''' % (qualifier,)
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout,
                    self._bdb.sql_execute(sql, params))
            return

        if what == 'columns':
            # Exactly one generator name is required.
            if len(args) != 1:
                self.stdout.write('Describe columns of what generator?\n')
                return
            generator = args[0]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator_default(self._bdb,
                        generator):
                    self.stdout.write('No such generator: %s\n' %
                        (repr(generator),))
                    return
                generator_id = core.bayesdb_get_generator_default(self._bdb,
                    generator)
                sql = '''
                    SELECT c.colno AS colno, c.name AS name,
                            gc.stattype AS stattype, c.shortname AS shortname
                        FROM bayesdb_generator AS g,
                            (bayesdb_column AS c LEFT OUTER JOIN
                                bayesdb_generator_column AS gc
                                USING (colno))
                        WHERE g.id = ? AND g.id = gc.generator_id
                            AND g.tabname = c.tabname
                        ORDER BY colno ASC;
                '''
                cursor = self._bdb.sql_execute(sql, (generator_id,))
                pretty.pp_cursor(self.stdout, cursor)
            return

        if what in ('model', 'models'):
            if not args:
                self.stdout.write('Describe models of what generator?\n')
                return
            generator = args[0]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator_default(self._bdb,
                        generator):
                    self.stdout.write('No such generator: %s\n' %
                        (repr(generator),))
                    return
                generator_id = core.bayesdb_get_generator_default(self._bdb,
                    generator)
                if len(args) == 1:
                    # No model numbers given: describe every model.
                    qualifier = '1'
                else:
                    # Stop at the first token that is not an integer
                    # or does not name a model of this generator.
                    modelnos = []
                    for token in args[1:]:
                        try:
                            modelno = int(token)
                        except ValueError:
                            self.stdout.write('Invalid model number: %s\n' %
                                (repr(token),))
                            return
                        if not core.bayesdb_generator_has_model(
                                self._bdb, generator_id, modelno):
                            self.stdout.write('No such model: %d\n' %
                                (modelno,))
                            return
                        modelnos.append(modelno)
                    # Only validated integers are interpolated here.
                    qualifier = 'modelno IN (%s)' % \
                        (','.join(str(modelno) for modelno in modelnos),)
                sql = '''
                    SELECT modelno, iterations FROM bayesdb_generator_model
                        WHERE generator_id = ? AND %s
                ''' % (qualifier,)
                cursor = self._bdb.sql_execute(sql, (generator_id,))
                pretty.pp_cursor(self.stdout, cursor)
            return

        # Unrecognized entity kind.
        show_usage()
Exemple #48
0
    def dot_describe(self, line):
        '''describe BayesDB entities
        [table(s)|population(s)|variables|generator(s)|model(s)] [<name>...]

        Print a human-readable description of the specified BayesDB
        entities.
        '''
        # NOTE(review): the docstring's summary/syntax layout looks like
        # it is displayed as help text, so it is kept in that shape and
        # now lists exactly the entity kinds handled below.

        def show_usage():
            # Single source for the usage text, so the empty-line and
            # unknown-entity cases cannot drift apart.
            for text in (
                    'Usage: .describe table(s) [<table>...]\n',
                    '       .describe population(s) [<pop>...]\n',
                    '       .describe variables <pop>\n',
                    '       .describe generator(s) [<gen>...]\n',
                    '       .describe model(s) <gen> [<model>...]\n'):
                self.stdout.write(text)

        # XXX Lousy, lousy tokenizer.
        tokens = line.split()
        if len(tokens) == 0:
            show_usage()
            return

        # The first token selects what to describe; the rest are names.
        command = casefold(tokens[0])
        if command in ('table', 'tables'):
            params = None
            qualifier = None
            if len(tokens) == 1:
                # No names given: describe the columns of every table.
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                qualifier = \
                    '(' + ' OR '.join(['tabname = ?' for _p in params]) + ')'
                # Report every unknown table before giving up.
                ok = True
                for table in params:
                    if not core.bayesdb_has_table(self._bdb, table):
                        self.stdout.write('No such table: %s\n' %
                                          (repr(table), ))
                        ok = False
                if not ok:
                    return
                for table in params:
                    core.bayesdb_table_guarantee_columns(self._bdb, table)
            sql = '''
                SELECT tabname, colno, name, shortname
                    FROM bayesdb_column
                    WHERE %s
                    ORDER BY tabname ASC, colno ASC
            ''' % (qualifier, )
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout, self._bdb.execute(sql, params))
        elif command in ('population', 'populations'):
            params = None
            qualifier = None
            if len(tokens) == 1:
                # No names given: describe every population.
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                # Numbered placeholders: ?1,?2,...
                names = ','.join('?%d' % (i + 1, )
                                 for i in range(len(params)))
                qualifier = '(name IN (%s))' % (names, )
                # Report every unknown population before giving up.
                ok = True
                for population in params:
                    if not core.bayesdb_has_population(self._bdb, population):
                        self.stdout.write('No such population: %s\n' %
                                          (repr(population), ))
                        ok = False
                if not ok:
                    return
            with self._bdb.savepoint():
                cursor = self._bdb.sql_execute(
                    '''
                    SELECT id, name, tabname
                        FROM bayesdb_population
                        WHERE %s
                ''' % (qualifier, ), params)
                pretty.pp_cursor(self.stdout, cursor)
        elif command in ('generator', 'generators'):
            params = None
            qualifier = None
            if len(tokens) == 1:
                # No names given: describe every generator.
                params = ()
                qualifier = '1'
            else:
                params = tokens[1:]
                # Numbered placeholders: ?1,?2,...
                names = ','.join('?%d' % (i + 1, ) for i in range(len(params)))
                qualifier = '''
                    (name IN ({names}))
                '''.format(names=names)
                # Report every unknown generator before giving up.
                ok = True
                for generator in params:
                    if not core.bayesdb_has_generator(self._bdb, None,
                                                      generator):
                        self.stdout.write('No such generator: %s\n' %
                                          (repr(generator), ))
                        ok = False
                if not ok:
                    return
            sql = '''
                SELECT id, name, tabname, backend
                    FROM bayesdb_generator
                    WHERE %s
            ''' % (qualifier, )
            with self._bdb.savepoint():
                pretty.pp_cursor(self.stdout,
                                 self._bdb.sql_execute(sql, params))
        elif command == 'variables':
            # Exactly one population name is required.
            if len(tokens) != 2:
                self.stdout.write('Usage: .describe variables <population>\n')
                return
            population = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_population(self._bdb, population):
                    self.stdout.write('No such population: %r\n' %
                                      (population, ))
                    return
                population_id = core.bayesdb_get_population(
                    self._bdb, population)
                sql = '''
                    SELECT c.colno AS colno, c.name AS name,
                            v.stattype AS stattype, c.shortname AS shortname
                        FROM bayesdb_population AS p,
                            (bayesdb_column AS c LEFT OUTER JOIN
                                bayesdb_variable AS v
                                USING (colno))
                        WHERE p.id = ? AND p.id = v.population_id
                            AND p.tabname = c.tabname
                        ORDER BY colno ASC;
                '''
                cursor = self._bdb.sql_execute(sql, (population_id, ))
                pretty.pp_cursor(self.stdout, cursor)
        elif command in ('model', 'models'):
            if len(tokens) < 2:
                self.stdout.write('Describe models of what generator?\n')
                return
            generator = tokens[1]
            with self._bdb.savepoint():
                if not core.bayesdb_has_generator(self._bdb, None, generator):
                    self.stdout.write('No such generator: %s\n' %
                                      (repr(generator), ))
                    return
                generator_id = core.bayesdb_get_generator(
                    self._bdb, None, generator)
                qualifier = None
                if len(tokens) == 2:
                    # No model numbers given: describe every model.
                    qualifier = '1'
                else:
                    # Stop at the first token that is not an integer
                    # or does not name a model of this generator.
                    modelnos = []
                    for token in tokens[2:]:
                        try:
                            modelno = int(token)
                        except ValueError:
                            self.stdout.write('Invalid model number: %s\n' %
                                              (repr(token), ))
                            return
                        else:
                            if not core.bayesdb_generator_has_model(
                                    self._bdb, generator_id, modelno):
                                self.stdout.write('No such model: %d\n' %
                                                  (modelno, ))
                                return
                            modelnos.append(modelno)
                    # Only validated integers are interpolated here.
                    qualifier = 'modelno IN (%s)' % \
                        (','.join(map(str, modelnos),))
                sql = '''
                    SELECT modelno, iterations FROM bayesdb_generator_model
                        WHERE generator_id = ? AND %s
                ''' % (qualifier, )
                cursor = self._bdb.sql_execute(sql, (generator_id, ))
                pretty.pp_cursor(self.stdout, cursor)
        else:
            # Unrecognized entity kind: show the same usage text as for
            # an empty command line.
            show_usage()
Exemple #49
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    `phrase` may be wrapped in ast.Parametrized; its parameter counts
    and name map are then handed, together with `bindings`, to every
    compiler.Output built here.

    The branches visible here handle queries (ast.is_query), ast.Begin,
    ast.Rollback, ast.Commit, ast.CreateTabAs, ast.CreateTabCsv and
    ast.CreateTabSim.  A query returns whatever execute_wound returns;
    every other handled phrase returns empty_cursor(bdb).

    Raises BQLError when:
    - a CreateTabCsv/CreateTabSim target table already exists and
      `ifnotexists` is not set,
    - a CreateTabSim names a population or generator that does not
      exist,
    - a simulated column or constraint variable is not a column of the
      population's base table (constraints may also be rowid tokens).

    NOTE(review): as written here, a phrase matching none of the
    branches falls off the end of the function and returns None.
    """
    if isinstance(phrase, ast.Parametrized):
        # Unwrap the phrase, keeping the parameter metadata for the
        # compiler output objects created below.
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            # Write the CREATE TABLE prefix first, then let the compiler
            # append the SQL for the query to the same output.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                # An existing table is a no-op under IF NOT EXISTS and
                # an error otherwise.
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            # Validate the target table name, the population and the
            # optional generator before doing any work.
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            if not core.bayesdb_has_population(bdb,
                                               phrase.simulation.population):
                raise BQLError(
                    bdb, 'No such population: %s' %
                    (phrase.simulation.population, ))
            population_id = core.bayesdb_get_population(
                bdb, phrase.simulation.population)
            generator_id = None
            if phrase.simulation.generator is not None:
                if not core.bayesdb_has_generator(bdb, population_id,
                                                  phrase.simulation.generator):
                    raise BQLError(
                        bdb, 'No such generator: %r' %
                        (phrase.simulation.generator, ))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.simulation.generator)
            table = core.bayesdb_population_table(bdb, population_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            # Record the declared SQL type of every column of the
            # population's base table, keyed by casefolded column name.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            # Every simulated column must be a column of the base table.
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such variable'
                        ' in population %r: %s' %
                        (phrase.simulation.population, column_name))
            # Constraint targets may be base-table columns or rowid tokens.
            for column_name, _expression in phrase.simulation.constraints:
                cn = casefold(column_name)
                if (cn not in column_sqltypes
                        and cn not in core.bayesdb_rowid_tokens(bdb)):
                    raise BQLError(
                        bdb, 'No such variable in population %s: %s' %
                        (phrase.simulation.population, column_name))
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            # Build a single SELECT whose first column is nsamples cast
            # to INTEGER and whose remaining columns are the constraint
            # expressions, in order; its one result row supplies the
            # concrete values used below.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.nsamples,
                                                  out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                # Despite its name, `cursor` now holds the fetched rows.
                cursor = bdb.sql_execute(out.getvalue(),
                                         out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)

            def map_var(var):
                # Rowid tokens pass through casefolded; any other name
                # is resolved to its variable number.
                if casefold(var) not in core.bayesdb_rowid_tokens(bdb):
                    return core.bayesdb_variable_number(
                        bdb, population_id, generator_id, var)
                else:
                    return casefold(var)

            # NOTE: tuple-parameter unpacking in the signature below is
            # Python 2-only syntax.
            def map_constraint(((var, _expression), value)):
                return (map_var(var), value)

            # Pair each constraint with its evaluated value (the columns
            # after nsamples in the single result row).
            constraints = map(
                map_constraint,
                zip(phrase.simulation.constraints, cursor[0][1:]))
            colnos = map(map_var, column_names)
            # The new table reuses the base table's declared SQL types
            # for the simulated columns.
            schema = ','.join('%s %s' %
                              (qcn, column_sqltypes[casefold(column_name)])
                              for qcn, column_name in zip(qcns, column_names))
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, schema))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            # Insert each simulated row into the new table.
            for row in bqlfn.bayesdb_simulate(
                    bdb,
                    population_id,
                    constraints,
                    colnos,
                    generator_id=generator_id,
                    numpredictions=nsamples,
                    accuracy=phrase.simulation.accuracy):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)
Exemple #50
0
def bayesdb_stattype_affinity(_bdb, stattype):
    """Return the affinity listed for `stattype` in _STATTYPE_TO_AFFINITY.

    The statistical type name is casefolded before the lookup; `_bdb`
    is not used.  A name missing from the table raises KeyError.
    """
    folded = casefold(stattype)
    return _STATTYPE_TO_AFFINITY[folded]
Exemple #51
0
def _create_population(bdb, phrase):
    """Create the population described by the AST node `phrase`.

    Every column of the base table `phrase.table` must be given a
    policy by `phrase.schema`: modeled with an explicit statistical
    type (ast.PopModelVars), ignored (ast.PopIgnoreVars), or guessed
    (ast.PopGuessVars).  A guess list consisting of the single name '*'
    stands for every base-table column not otherwise mentioned.
    Guessed columns whose stattype comes back as 'key' are ignored.

    Inserts one row into bayesdb_population and one row into
    bayesdb_variable for each variable that is not ignored.  Returns
    None.  If the population already exists and `phrase.ifnotexists`
    is set, nothing is done.

    Raises BQLError if:
    - the population name is already defined (without `ifnotexists`),
    - '*' is combined with other names in the guess list,
    - some base-table column has no policy,
    - a named column does not exist in the table, is named more than
      once, or has a statistical type that is neither 'ignore' nor
      present in bayesdb_stattype.

    NOTE(review): the bayesdb_population row is inserted before the
    schema is validated; presumably the caller runs this inside a
    savepoint so a BQLError rolls it back -- confirm against callers.
    """
    if core.bayesdb_has_population(bdb, phrase.name):
        if phrase.ifnotexists:
            return
        raise BQLError(
            bdb,
            'Name already defined as population: %r' % (phrase.name, ))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to provide
    # a strategy for each single variable, either MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    bdb.sql_execute(
        '''
        INSERT INTO bayesdb_population (name, tabname) VALUES (?, ?)
    ''', (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = [
        (name, s.stattype)
        for s in phrase.schema if isinstance(s, ast.PopModelVars)
        for name in s.names
    ]

    # Extract the ignored columns.
    pop_ignore_vars = [
        (name, 'ignore')
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)
        for name in s.names
    ]

    # Extract the columns to guess.
    pop_guess = [
        name
        for s in phrase.schema if isinstance(s, ast.PopGuessVars)
        for name in s.names
    ]
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r' %
                (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]

    # Perform the guessing.  The guesser may return the stattype `key`;
    # such columns are moved to the ignored variables in one pass rather
    # than being removed from a list afterwards.
    pop_guess_vars = []
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        for col, st in zip(pop_guess, pop_guess_stattypes):
            if st == 'key':
                pop_ignore_vars.append((col, 'ignore'))
            else:
                pop_guess_vars.append((col, st))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known; it
    # is a set because it is only used for membership tests.
    known = set(casefold(t[0]) for t in pop_all_vars)
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r' %
            (not_found, ))

    # Get a map from variable name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    variable_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in variable_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': phrase.table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            # No bayesdb_column row: the table has no such column.
            missing.add(name)
            continue
        colno = row[0]
        assert isinstance(colno, int)
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        # 'ignore' is accepted even though it is not a registered stattype.
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        variable_map[name] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb,
            'No such columns in table %r: %r' % (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb,
                       'Duplicate column names: %r' % (list(duplicates), ))
    if invalid:
        raise BQLError(bdb,
                       'Invalid statistical types: %r' % (list(invalid), ))

    # Insert variable records; ignored columns get no bayesdb_variable row.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        colno = variable_map[name]
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_variable
                (population_id, name, colno, stattype)
                VALUES (?, ?, ?, ?)
        ''', (population_id, name, colno, stattype))
Exemple #52
0
def _is_categorical(stattype):
    """Return True if casefolded `stattype` is 'categorical' or 'nominal'."""
    folded = casefold(stattype)
    return folded in ('categorical', 'nominal')
Exemple #53
0
def bayesdb_rowid_tokens(bdb):
    """Return a list of every token in the bayesdb_rowid_tokens table."""
    cursor = bdb.sql_execute('''
        SELECT token FROM bayesdb_rowid_tokens
    ''')
    # Each fetched row has a single column; unwrap it.
    return [row[0] for row in cursor.fetchall()]


def bayesdb_has_stattype(bdb, stattype):
    """Return True if bayesdb_stattype has a row named casefolded `stattype`."""
    bindings = {'stattype': casefold(stattype)}
    cursor = bdb.sql_execute(
        'SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype',
        bindings)
    count = cursor_value(cursor)
    return count > 0


# Map from casefolded statistical type name to the casefolded name of
# its affinity ('text' or 'real').
# NOTE(review): "affinity" presumably means the SQLite column type
# affinity used for values of that statistical type -- confirm.
#
# XXX This should be stored in the database by adding a column to the
# bayesdb_stattype table -- when we are later willing to contemplate
# adding statistical types, e.g. COUNT, SCALE, or NONNEGATIVE REAL.
_STATTYPE_TO_AFFINITY = dict((casefold(st), casefold(af)) for st, af in (
    ('categorical', 'text'),
    ('cyclic', 'real'),
    ('numerical', 'real'),
    ('counts', 'real'),
    ('magnitude', 'real'),
    ('nominal', 'text'),
    ('numericalranged', 'real'),
))


def bayesdb_stattype_affinity(_bdb, stattype):
    """Return the affinity listed for `stattype` in _STATTYPE_TO_AFFINITY.

    Asserts that `stattype` is registered according to
    bayesdb_has_stattype, then looks up its casefolded name.  A name
    missing from the table raises KeyError.
    """
    # Kept inline so that no query is issued when asserts are disabled.
    assert bayesdb_has_stattype(_bdb, stattype)
    folded = casefold(stattype)
    return _STATTYPE_TO_AFFINITY[folded]
Exemple #54
0
def define_correlation_p(stattype0, stattype1, method):
    """Register `method` in correlation_p_methods for a stattype pair.

    Both statistical type names must already be casefolded, and the
    ordered pair (stattype0, stattype1) must not be registered yet;
    both conditions are enforced with assertions.
    """
    for stattype in (stattype0, stattype1):
        assert casefold(stattype) == stattype
    pair = (stattype0, stattype1)
    assert pair not in correlation_p_methods
    correlation_p_methods[pair] = method
Exemple #55
0
def _is_continuous(stattype):
    """Return True if casefolded `stattype` is cyclic, numerical or boolean."""
    folded = casefold(stattype)
    return folded in ('cyclic', 'numerical', 'boolean')