Exemple #1
0
def get_schema_as_list(bdb, population_name):
    """Describe every variable of a population as a list of dicts.

    Each entry holds the variable's ``name`` and a ``stat_type`` translated
    from the statistical type recorded for it.  Entries for ``nominal``
    variables additionally hold ``unique_values``: the distinct non-NULL
    values of that column in the population's base table.

    Raises KeyError when a variable's statistical type has no translation
    in the lookup table below.
    """
    # Translation from recorded statistical types to the emitted names.
    stattype_lookup = {
        'numerical': 'realAdditive',
        'nominal': 'categorical',
        'categorical': 'categorical',
    }

    population_id = bayesdb_get_population(bdb, population_name)
    quoted_table = bql_quote_name(
        bayesdb_population_table(bdb, population_id))

    schema = []
    for name in bayesdb_variable_names(bdb, population_id, None):
        number = bayesdb_variable_number(bdb, population_id, None, name)
        stattype = bayesdb_variable_stattype(bdb, population_id, None, number)
        entry = {'name': name, 'stat_type': stattype_lookup[stattype]}
        if stattype == 'nominal':
            # Only nominal variables get their observed values enumerated.
            quoted_var = bql_quote_name(name)
            sql = '''
                SELECT DISTINCT(%s) FROM %s
                WHERE %s IS NOT NULL
            ''' % (quoted_var, quoted_table, quoted_var)
            frame = utils_bql.query(bdb, sql)
            first_column = frame.columns[0]
            entry['unique_values'] = frame[first_column].unique().tolist()
        schema.append(entry)
    return schema
Exemple #2
0
def test_case():
    """Check that a guessed population reports variable names as declared.

    The table is created with a lower-case ``x`` and an upper-case ``Y``;
    the population's variable names must come back as exactly that.
    """
    setup_statements = (
        'create table t(x,Y)',
        'insert into t values(1,2)',
        'insert into t values(3,4)',
        'insert into t values(1,4)',
        'insert into t values(2,2)',
    )
    with bayeslite.bayesdb_open(':memory:') as bdb:
        for statement in setup_statements:
            bdb.sql_execute(statement)
        bdb.execute('create population p for t(guess(*))')
        pid = core.bayesdb_get_population(bdb, 'p')
        names = core.bayesdb_variable_names(bdb, pid, None)
        assert names == ['x', 'Y']
Exemple #3
0
def _create_schema(bdb, generator_id, schema_ast, **kwargs):
    """Build the CGPM schema dict for a generator from its parsed clauses.

    Parameters:
        bdb: the BayesDB handle, passed through to the `core` lookups and
            to every BQLError raised here.
        generator_id: generator whose population supplies the variables.
        schema_ast: list of parsed clauses (Basic, Latent, Foreign,
            Subsample).  NOTE: this list is modified in place -- Latent
            clauses derived from Foreign ``exposed`` entries, and a Foreign
            clause derived from a non-crosscat baseline, are appended to
            it -- and the ``outputs`` of Foreign clauses with exposed
            variables are extended in place.
        **kwargs: only ``baseline`` is read; when given and its name is not
            'crosscat' it is turned into a Foreign clause over all the
            population's variables.

    Returns:
        A dict with keys 'variables' (list of [var, stattype, dist,
        params]), 'cgpm_composition' (list of foreign-model records),
        'subsample' and 'latents' (dict of latent name -> stattype).

    Raises:
        BQLError: schema given together with a non-crosscat baseline;
            duplicate subsample; unknown clause; duplicate, unknown or
            already-latent variables; unknown statistical types; input
            variables that nothing models.
    """
    # Get some parameters.
    population_id = core.bayesdb_generator_population(bdb, generator_id)
    table = core.bayesdb_population_table(bdb, population_id)

    # State.
    variables = []
    variable_dist = {}
    latents = {}
    cgpm_composition = []
    modelled = set()
    default_modelled = set()
    subsample = None
    deferred_input = defaultdict(list)
    deferred_output = dict()

    # Error-reporting state.
    duplicate = set()
    unknown = set()
    needed = set()
    existing_latent = set()
    must_exist = []
    unknown_stattype = {}

    # XXX Convert all Foreign.exposed lists to Latent clauses.
    # Retrieve Foreign clauses with exposed variables.
    foreign_clauses = [
        c for c in schema_ast
        if isinstance(c, cgpm_schema.parse.Foreign) and len(c.exposed) > 0
    ]
    # Add the exposed variables to Foreign.outputs
    # Note that this assumes if there are K exposed variables, then they are
    # necessarily the last K outputs of the fc.outputs.
    for fc in foreign_clauses:
        fc.outputs.extend([e[0] for e in fc.exposed])

    # Convert exposed entries into Latent clauses.
    latent_vars = list(
        itertools.chain.from_iterable(c.exposed for c in foreign_clauses))
    latent_clauses = [cgpm_schema.parse.Latent(v, s) for (v, s) in latent_vars]
    # Append the Latent clauses to the ast.
    schema_ast.extend(latent_clauses)

    # XXX Convert the baseline to a Foreign clause.
    # Currently the baselines do not accept a schema, and will fail if
    # `schema_ast` has any entries.
    baseline = kwargs.get('baseline', None)
    if baseline is not None and casefold(baseline.name) != 'crosscat':
        if schema_ast:
            raise BQLError(
                bdb, 'Cannot accept schema with baseline: %s.' % schema_ast)
        # Retrieve all variable names in the population
        outputs = core.bayesdb_variable_names(bdb, population_id, None)
        # Convert the LITERAL namedtuples to their raw values.  Done
        # pairwise rather than through zip(*...) unpacking so that a
        # baseline without any parameters yields an empty list instead of
        # failing to unpack.
        params = [(p, v.value) for (p, v) in baseline.params]
        # Create the clause.
        clause = cgpm_schema.parse.Foreign(outputs, [], [], baseline.name,
                                           params)
        # And append it to the schema_ast.
        schema_ast.append(clause)

    # Process each clause one by one.
    for clause in schema_ast:

        if isinstance(clause, cgpm_schema.parse.Basic):
            # Basic Crosscat component model: one variable to be put
            # into Crosscat views.
            var = clause.var
            dist = clause.dist
            params = dict(clause.params)  # XXX error checking

            # Reject if the variable does not exist.
            if not core.bayesdb_has_variable(bdb, population_id, None, var):
                unknown.add(var)
                continue

            # Reject if the variable has already been modelled.
            if var in modelled:
                duplicate.add(var)
                continue

            # Reject if the variable is latent.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Get the column number.
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            assert 0 <= colno

            # Add it to the list and mark it modelled by default.
            stattype = core.bayesdb_variable_stattype(bdb, population_id,
                                                      colno)
            variables.append([var, stattype, dist, params])
            assert var not in variable_dist
            variable_dist[var] = (stattype, dist, params)
            modelled.add(var)
            default_modelled.add(var)

        elif isinstance(clause, cgpm_schema.parse.Latent):
            var = clause.name
            stattype = clause.stattype

            # Reject if the variable has already been modelled by the
            # default model.
            if var in default_modelled:
                duplicate.add(var)
                continue

            # Reject if the variable even *exists* in the population
            # at all yet.
            if core.bayesdb_has_variable(bdb, population_id, None, var):
                duplicate.add(var)
                continue

            # Reject if the variable is already latent, from another
            # generator.
            if core.bayesdb_has_latent(bdb, population_id, var):
                existing_latent.add(var)
                continue

            # Reject if we've already processed it.
            if var in latents:
                duplicate.add(var)
                continue

            # Add it to the set of latent variables.
            latents[var] = stattype

        elif isinstance(clause, cgpm_schema.parse.Foreign):
            # Foreign model: some set of output variables is to be
            # modelled by foreign logic, possibly conditional on some
            # set of input variables.
            #
            # Gather up the state for a cgpm_composition record, which
            # we may have to do incrementally because it must refer to
            # the distribution types of variables we may not have
            # seen.
            name = clause.name
            outputs = clause.outputs
            inputs = clause.inputs

            # These four lists are shared by reference with `distargs`
            # and with the deferred_* records, so filling in slot i later
            # updates the composition record too.
            output_stattypes = []
            output_statargs = []
            input_stattypes = []
            input_statargs = []
            distargs = {
                'inputs': {
                    'stattypes': input_stattypes,
                    'statargs': input_statargs
                },
                'outputs': {
                    'stattypes': output_stattypes,
                    'statargs': output_statargs,
                }
            }
            kwds = {'distargs': distargs}
            kwds.update(clause.params)

            # First make sure all the output variables exist and have
            # not yet been modelled.
            for var in outputs:
                must_exist.append(var)
                if var in modelled:
                    duplicate.add(var)
                    continue
                modelled.add(var)
                # Add the output statistical type and its parameters.
                i = len(output_stattypes)
                assert i == len(output_statargs)
                output_stattypes.append(None)
                output_statargs.append(None)
                deferred_output[var] = (output_stattypes, output_statargs, i)

            # Next make sure all the input variables exist, mark them
            # needed, and record where to put their distribution type
            # and parameters.
            for var in inputs:
                must_exist.append(var)
                needed.add(var)
                i = len(input_stattypes)
                assert i == len(input_statargs)
                input_stattypes.append(None)
                input_statargs.append(None)
                deferred_input[var].append(
                    (input_stattypes, input_statargs, i))

            # Finally, add a cgpm_composition record.
            cgpm_composition.append({
                'name': name,
                'inputs': inputs,
                'outputs': outputs,
                'kwds': kwds,
            })

        elif isinstance(clause, cgpm_schema.parse.Subsample):
            if subsample is not None:
                raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n, ))
            subsample = clause.n

        else:
            raise BQLError(bdb, 'Unknown clause: %r' % (clause, ))

    # Make sure all the outputs and inputs exist, either in the
    # population or as latents in this generator.
    for var in must_exist:
        if core.bayesdb_has_variable(bdb, population_id, None, var):
            continue
        if var in latents:
            continue
        unknown.add(var)

    # Raise an exception if there were duplicates or unknown
    # variables.
    if duplicate:
        raise BQLError(bdb,
                       'Duplicate model variables: %r' % (sorted(duplicate), ))
    if existing_latent:
        raise BQLError(
            bdb, 'Latent variables already defined: %r' %
            (sorted(existing_latent), ))
    if unknown:
        raise BQLError(bdb,
                       'Unknown model variables: %r' % (sorted(unknown), ))

    def default_dist(var, stattype):
        # Return (dist, params) for a known statistical type; otherwise
        # record the variable in unknown_stattype and return None so the
        # caller can skip it and the error is reported in one batch later.
        stattype = casefold(stattype)
        if stattype not in _DEFAULT_DIST:
            if var in unknown_stattype:
                assert unknown_stattype[var] == stattype
            else:
                unknown_stattype[var] = stattype
            return None
        dist, params = _DEFAULT_DIST[stattype](bdb, generator_id, var)
        return dist, params

    # Use the default distribution for any variables that remain to be
    # modelled, excluding any that are latent or that have statistical
    # types we don't know about.
    for var in core.bayesdb_variable_names(bdb, population_id, None):
        if var in modelled:
            continue
        colno = core.bayesdb_variable_number(bdb, population_id, None, var)
        assert 0 <= colno
        stattype = core.bayesdb_variable_stattype(bdb, population_id, colno)
        distparams = default_dist(var, stattype)
        if distparams is None:
            continue
        dist, params = distparams
        variables.append([var, stattype, dist, params])
        assert var not in variable_dist
        variable_dist[var] = (stattype, dist, params)
        modelled.add(var)

    # Fill in the deferred_input statistical type assignments.
    for var in sorted(deferred_input.iterkeys()):
        # Check whether the variable is modelled.  If not, skip -- we
        # will fail later because this variable is guaranteed to also
        # be in needed.
        if var not in modelled:
            assert var in needed
            continue

        # Determine (possibly fictitious) distribution and parameters.
        if var in default_modelled:
            # Manifest variable modelled by default Crosscat model.
            assert var in variable_dist
            stattype, dist, params = variable_dist[var]
        else:
            # Modelled by a foreign model.  Assign a fictitious
            # default distribution because the 27B/6 of CGPM requires
            # this.
            if var in latents:
                # Latent variable modelled by a foreign model.  Use
                # the statistical type specified for it.
                stattype = latents[var]
            else:
                # Manifest variable modelled by a foreign model.  Use
                # the statistical type in the population.
                assert core.bayesdb_has_variable(bdb, population_id, None, var)
                colno = core.bayesdb_variable_number(bdb, population_id, None,
                                                     var)
                stattype = core.bayesdb_variable_stattype(
                    bdb, population_id, colno)
            distparams = default_dist(var, stattype)
            if distparams is None:
                continue
            dist, params = distparams

        # Assign the distribution and parameters.
        for cctypes, ccargs, i in deferred_input[var]:
            assert cctypes[i] is None
            assert ccargs[i] is None
            cctypes[i] = dist
            ccargs[i] = params

    # Fill in the deferred_output statistical type assignments.  They need
    # to be in the form NUMERICAL or CATEGORICAL.
    for var in deferred_output:
        if var in latents:
            # Latent variable modelled by a foreign model.  Use
            # the statistical type specified for it.
            var_stattype = casefold(latents[var])
            if var_stattype not in _DEFAULT_DIST:
                if var in unknown_stattype:
                    assert unknown_stattype[var] == var_stattype
                else:
                    unknown_stattype[var] = var_stattype
            # XXX Cannot specify statargs for a latent variable.  Trying to
            # use default_dist might look up the counts for unique values of
            # the categorical in the base table, causing a failure.
            var_statargs = {}
        else:
            # Manifest variable modelled by a foreign model.  Use
            # the statistical type and arguments from the population.
            assert core.bayesdb_has_variable(bdb, population_id, None, var)
            colno = core.bayesdb_variable_number(bdb, population_id, None, var)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, population_id, colno)
            distparams = default_dist(var, var_stattype)
            if distparams is None:
                continue
            _, var_statargs = distparams

        stattypes, statargs, i = deferred_output[var]
        assert stattypes[i] is None
        assert statargs[i] is None
        stattypes[i] = var_stattype
        statargs[i] = var_statargs

    if unknown_stattype:
        raise BQLError(
            bdb, 'Unknown statistical types for variables: %r' %
            (sorted(unknown_stattype.iteritems(), )))

    # If there remain any variables that we needed to model, because
    # others are conditional on them, fail.
    needed -= modelled
    if needed:
        raise BQLError(bdb, 'Unmodellable variables: %r' % (needed, ))

    # Finally, create a CGPM schema.
    return {
        'variables': variables,
        'cgpm_composition': cgpm_composition,
        'subsample': subsample,
        'latents': latents,
    }
Exemple #4
0
    def predict_confidence(self,
                           bdb,
                           generator_id,
                           modelno,
                           colno,
                           rowid,
                           numsamples=None):
        """Impute the cell (rowid, colno) and return ``(prediction, conf)``.

        Draws `numsamples` joint simulations of the cell (2 when
        `numsamples` is None or otherwise falsy).  For a categorical
        variable the prediction is a most frequent sampled value and the
        confidence is that value's share of `numsamples`; for any other
        variable the prediction is the sample mean and the confidence is
        0 (not yet implemented).

        When `rowid` is below the generator's fresh row id but above the
        largest row id recorded in bayesdb_cgpm_individual for this
        generator, the row's other observed values are read from the base
        table and used as constraints for the simulation.
        """
        if not numsamples:
            numsamples = 2
        assert numsamples > 0

        def _impute_categorical(sample):
            counts = Counter(s[0] for s in sample)
            mode_count = max(counts[v] for v in counts)
            # First value (in Counter iteration order) reaching the mode.
            # next() rather than the Python-2-only .next() method.
            pred = next(v for v in counts if counts[v] == mode_count)
            conf = float(mode_count) / numsamples
            return pred, conf

        def _impute_numerical(sample):
            pred = sum(s[0] for s in sample) / float(len(sample))
            conf = 0  # XXX Punt confidence for now
            return pred, conf

        constraints = []
        # If rowid is a hypothetical cell for cgpm (did not exist at the time
        # of INITIALIZE), but exists in the base table (by INSERT INTO), then
        # retrieve all values for rowid as the constraints.
        exists = rowid < core.bayesdb_generator_fresh_row_id(bdb, generator_id)
        max_cgpm_rowid = bdb.sql_execute(
            '''
            SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id, )).fetchall()[0][0]
        # MAX() is NULL when the generator has no recorded rows; every rowid
        # then counts as hypothetical.  Spelled out instead of relying on
        # Python 2 ordering an int above None.
        hypothetical = max_cgpm_rowid is None or rowid > max_cgpm_rowid
        if exists and hypothetical:
            population_id = core.bayesdb_generator_population(
                bdb, generator_id)
            # Retrieve all other variables except colno, and ignore latents in
            # generator_id, and place them in the constraints.
            pop_names = core.bayesdb_variable_names(bdb, population_id, None)
            avoid_name = core.bayesdb_variable_name(bdb, population_id, colno)
            constraints_names = [n for n in pop_names if n != avoid_name]
            # Obtain the row.
            qt_names = ','.join(map(sqlite3_quote_name, constraints_names))
            qt_table = sqlite3_quote_name(
                core.bayesdb_population_table(bdb, population_id))
            data = bdb.sql_execute(
                '''
                SELECT %s FROM %s WHERE oid = ?
            ''' % (
                    qt_names,
                    qt_table,
                ), (rowid, )).fetchall()[0]
            # Build the constraints.
            pop_nos = core.bayesdb_variable_numbers(bdb, population_id, None)
            constraints_nos = [n for n in pop_nos if n != colno]
            assert len(data) == len(constraints_nos)
            # Skip missing cells (NULL or empty string) only.  A plain
            # truthiness test would also discard observed zeros, which are
            # valid values to condition on.
            constraints = [(rowid, c, v)
                           for c, v in zip(constraints_nos, data)
                           if v is not None and v != '']

        # Retrieve the samples.
        sample = self.simulate_joint(bdb, generator_id, [(rowid, colno)],
                                     constraints, modelno, numsamples)

        # Determine the imputation strategy (mode or mean).
        stattype = core.bayesdb_variable_stattype(
            bdb, core.bayesdb_generator_population(bdb, generator_id), colno)
        if _is_categorical(stattype):
            return _impute_categorical(sample)
        else:
            return _impute_numerical(sample)
Exemple #5
0
        def retrieve_analyze_variables(ast):
            """Resolve the ANALYZE clauses in `ast` to variable numbers.

            Returns ``(varnos, seen_optimized)``: the numbers of the
            variables to transition and whether an OPTIMIZED clause was
            present.  With no VARIABLES or SKIP clause, every variable of
            the generator is selected.

            Raises BQLError for more than one VARIABLES/SKIP clause, for
            unknown variable names, and for OPTIMIZED combined with
            VARIABLES; raises ValueError for an unrecognised clause.
            """
            def checked_names(names):
                # Return `names` as a set, raising BQLError when any of
                # them is not a variable of this generator.
                collected = set()
                unknown = set()
                for var in names:
                    if not core.bayesdb_has_variable(
                            bdb, population_id, generator_id, var):
                        unknown.add(var)
                    collected.add(var)
                if unknown:
                    raise BQLError(
                        bdb, 'Unknown variables in ANALYZE: %r' %
                        (sorted(unknown), ))
                return collected

            # Transition all variables by default.
            variables = None

            # Exactly 1 VARIABLES or SKIP clause supported for simplicity.
            seen_variables, seen_skip, seen_optimized = False, False, False
            for clause in ast:
                is_variables = isinstance(clause, cgpm_analyze.parse.Variables)
                is_skip = isinstance(clause, cgpm_analyze.parse.Skip)
                if is_variables or is_skip:
                    if seen_variables or seen_skip:
                        raise BQLError(
                            bdb,
                            'Only 1 VARIABLES or SKIP clause allowed in ANALYZE'
                        )
                    if is_variables:
                        # Transition user specified variables only.
                        seen_variables = True
                        variables = sorted(checked_names(clause.vars))
                    else:
                        # Transition all variables except user specified
                        # skip.
                        seen_skip = True
                        excluded = checked_names(clause.vars)
                        all_vars = core.bayesdb_variable_names(
                            bdb, population_id, generator_id)
                        variables = sorted(set(all_vars) - excluded)
                elif isinstance(clause, cgpm_analyze.parse.Optimized):
                    seen_optimized = True
                # Unknown/impossible clause.
                else:
                    # Wrap in a 1-tuple so the message formats correctly
                    # whatever sequence type `ast` is.
                    raise ValueError(
                        'Unknown clause in ANALYZE: %s.' % (ast, ))

            if variables is None:
                variables = core.bayesdb_variable_names(
                    bdb, population_id, generator_id)

            varnos = [
                core.bayesdb_variable_number(bdb, population_id, generator_id,
                                             v) for v in variables
            ]

            # TODO Perform error checking if the OPTIMIZED clause is used.
            # In particular, the variables in OPTIMIZED must correspond
            # EXACTLY to the variables that are modeled by the CrossCat
            # baseline. Avoided this check for now since the nature of a
            # variable is not stored in the bdb. For now, just check the
            # user did not include a VARIABLES clause.
            if seen_optimized:
                if seen_variables:
                    raise BQLError(bdb,
                                   'OPTIMIZED incompatible with VARIABLES')
                # TODO Check if varnos are exactly the CrossCat variables,
                # and otherwise raise a BQLError telling the user that
                # OPTIMIZED must target only the variables modeled by the
                # baseline, and to use SKIP for overridden variables.

            return varnos, seen_optimized
Exemple #6
0
def _retrieve_analyze_variables(bdb, generator_id, ast):
    """Resolve the ANALYZE clauses in `ast` for a generator.

    Returns ``(variable_numbers, seen_optimized)``.  `variable_numbers` is
    the list of numbers of the variables selected by a VARIABLES or SKIP
    clause, or None when the selection is empty or no such clause was
    given.  `seen_optimized` tells whether an OPTIMIZED clause was present.

    Raises BQLError for more than one VARIABLES/SKIP clause, unknown
    variable names, an unrecognised clause, or OPTIMIZED combined with
    VARIABLES or SKIP.
    """
    population_id = core.bayesdb_generator_population(bdb, generator_id)

    def checked_names(names):
        # Return `names` as a set; raise if the generator lacks any of them.
        collected, missing = set(), set()
        for name in names:
            known = core.bayesdb_has_variable(bdb, population_id,
                                              generator_id, name)
            if not known:
                missing.add(name)
            collected.add(name)
        if missing:
            raise BQLError(
                bdb,
                'Unknown variables in ANALYZE: %r' % (sorted(missing), ))
        return collected

    # None means "no explicit selection": transition everything.
    selected = None

    # At most one VARIABLES or SKIP clause is supported, for simplicity.
    seen_variables = seen_skip = seen_optimized = False

    for clause in ast:
        is_variables = isinstance(clause, cgpm_analyze.parse.Variables)
        is_skip = isinstance(clause, cgpm_analyze.parse.Skip)

        if is_variables or is_skip:
            if seen_variables or seen_skip:
                raise BQLError(
                    bdb, 'Only 1 VARIABLES or SKIP clause allowed in ANALYZE')
            if is_variables:
                # Only the variables the user listed.
                seen_variables = True
                selected = sorted(checked_names(clause.vars))
            else:
                # Every variable except those the user listed.
                seen_skip = True
                skipped = checked_names(clause.vars)
                everything = core.bayesdb_variable_names(
                    bdb, population_id, generator_id)
                selected = sorted(set(everything) - skipped)
            continue

        if isinstance(clause, cgpm_analyze.parse.Optimized):
            seen_optimized = True
            continue

        # Unknown/impossible clause.
        raise BQLError(bdb, 'Unknown clause in ANALYZE: %s.' % (ast, ))

    # OPTIMIZED cannot be combined with VARIABLES or SKIP.
    if seen_optimized and (seen_variables or seen_skip):
        raise BQLError(bdb, 'OPTIMIZED incompatible with other clauses.')

    # NOTE(review): an empty selection (e.g. SKIP naming every variable)
    # yields None just like "no clause given" -- confirm callers expect it.
    if not selected:
        return (None, seen_optimized)

    variable_numbers = []
    for name in selected:
        variable_numbers.append(
            core.bayesdb_variable_number(bdb, population_id, generator_id,
                                         name))
    return (variable_numbers, seen_optimized)