Ejemplo n.º 1
0
    def analyze_models(self,
                       bdb,
                       generator_id,
                       modelnos=None,
                       iterations=1,
                       max_seconds=None,
                       ckpt_iterations=None,
                       ckpt_seconds=None,
                       program=None):
        """Run Loom inference over all models of `generator_id`.

        Loom only supports iteration-count analysis over the whole
        ensemble, so the other scheduling options are rejected up front.
        """
        # Reject the scheduling features Loom does not expose.
        if max_seconds is not None:
            raise BQLError(bdb,
                           'Loom analyze does not support number of seconds.')
        if ckpt_iterations is not None or ckpt_seconds is not None:
            raise BQLError(bdb, 'Loom analyze does not support checkpoint.')
        if program is not None:
            raise BQLError(bdb, 'Loom analyze does not support programs.')
        if modelnos is not None:
            raise BQLError(bdb, 'Loom cannot analyze specific model numbers.')

        # Assemble loom.tasks.infer arguments: always at least one pass.
        extra_passes = max(int(iterations), 1)
        infer_config = {'schedule': {'extra_passes': extra_passes}}
        sample_count = self._get_num_models(bdb, generator_id)
        root = self._get_loom_project_path(bdb, generator_id)

        # Run inference.
        loom.tasks.infer(root, sample_count=sample_count, config=infer_config)

        # Persist the inferred column/row partitions, then drop any stale
        # cached query servers so they are rebuilt against the new samples.
        self._store_kind_partition(bdb, generator_id, modelnos)
        self._close_query_server(bdb, generator_id)
        self._close_preql_server(bdb, generator_id)
Ejemplo n.º 2
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Record sufficient statistics and latent deviation variables.

        Every modeled variable must be numerical; its (count, sum, sum of
        squares) triple is stored in bayesdb_nig_normal_column.  Schema
        clauses of the form [dev, 'deviation', [obs]] additionally create
        a latent deviation variable alongside an observed one.
        """
        # XXX Do something with the schema.
        insert_column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
            name = core.bayesdb_variable_name(bdb, population_id,
                                              generator_id, colno)
            stattype = core.bayesdb_variable_stattype(bdb, population_id,
                                                      generator_id, colno)
            if stattype != 'numerical':
                raise BQLError(
                    bdb, 'NIG-Normal only supports'
                    ' numerical columns, but %s is %s' %
                    (repr(name), repr(stattype)))
            count, xsum, sumsq = data_suff_stats(bdb, table, name)
            bdb.sql_execute(insert_column_sql, {
                'population_id': population_id,
                'generator_id': generator_id,
                'colno': colno,
                'count': count,
                'sum': xsum,
                'sumsq': sumsq,
            })

        # XXX Make the schema a little more flexible.
        if schema == [[]]:
            return
        for clause in schema:
            well_formed = (len(clause) == 3
                           and isinstance(clause[0], str)
                           and clause[1] == 'deviation'
                           and isinstance(clause[2], list)
                           and len(clause[2]) == 1
                           and isinstance(clause[2][0], str))
            if not well_formed:
                raise BQLError(bdb,
                               'Invalid nig_normal clause: %r' % (clause, ))
            dev_var, obs_var = clause[0], clause[2][0]
            if not core.bayesdb_has_variable(bdb, population_id, None,
                                             obs_var):
                raise BQLError(bdb, 'No such variable: %r' % (obs_var, ))
            obs_colno = core.bayesdb_variable_number(bdb, population_id, None,
                                                     obs_var)
            dev_colno = core.bayesdb_add_latent(bdb, population_id,
                                                generator_id, dev_var,
                                                'numerical')
            bdb.sql_execute(
                '''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            ''', (population_id, generator_id, dev_colno, obs_colno))
Ejemplo n.º 3
0
def bayesdb_generator_row_values(bdb, generator_id, rowid):
    """Return the tuple of values stored in `rowid` of the generator's table.

    Raises `BQLError` if the row does not exist, or if more than one row
    matches (which would indicate a corrupt table).
    """
    table_name = bayesdb_generator_table(bdb, generator_id)
    column_names = bayesdb_generator_column_names(bdb, generator_id)
    qt = sqlite3_quote_name(table_name)
    qcns = ','.join(map(sqlite3_quote_name, column_names))
    select_sql = ('SELECT %s FROM %s WHERE _rowid_ = ?' % (qcns, qt))
    cursor = bdb.sql_execute(select_sql, (rowid, ))
    row = None
    try:
        # next() builtin rather than cursor.next(), which does not exist
        # on Python 3 iterators.
        row = next(cursor)
    except StopIteration:
        generator = bayesdb_generator_table(bdb, generator_id)
        # %s, not %d: the arguments are repr() strings, and a bad format
        # here would mask the real error with a TypeError.
        raise BQLError(
            bdb, 'No such row in table %s'
            ' for generator %s: %s' %
            (repr(table_name), repr(generator), repr(rowid)))
    try:
        next(cursor)
    except StopIteration:
        pass
    else:
        generator = bayesdb_generator_table(bdb, generator_id)
        raise BQLError(
            bdb, 'More than one such row'
            ' in table %s for generator %s: %s' %
            (repr(table_name), repr(generator), repr(rowid)))
    return row
Ejemplo n.º 4
0
 def simulate_column(phrase):
     """Simulate the BQL expression `phrase`; only MUTUAL INFORMATION is
     currently supported among the BQL estimands."""
     # Reject the estimands that are not yet simulable, in order.
     if isinstance(phrase, ast.ExpBQLDepProb):
         raise BQLError(
             bdb, 'DEPENDENCE PROBABILITY simulation still unsupported.')
     if isinstance(phrase, ast.ExpBQLProb):
         raise BQLError(bdb, 'PROBABILITY OF simulation still unsupported.')
     if not isinstance(phrase, ast.ExpBQLMutInf):
         raise BQLError(
             bdb, 'Only constants can be simulated: %s.' % (simulation, ))
     col_a = retrieve_variable(phrase.column0)
     col_b = retrieve_variable(phrase.column1)
     given = ()
     if phrase.constraints is not None:
         # Flatten [(name, expr), ...] into (colno, value, colno, value, ...).
         pairs = [[retrieve_variable(name), retrieve_literal(expr)]
                  for name, expr in phrase.constraints]
         given = tuple(itertools.chain.from_iterable(pairs))
     n_samples = phrase.nsamples and retrieve_literal(phrase.nsamples)
     # One mi_list per generator of the population.
     per_generator = bqlfn._bql_column_mutual_information(
         bdb, population_id, generator_id, col_a, col_b, n_samples,
         *given)
     return list(itertools.chain.from_iterable(per_generator))
Ejemplo n.º 5
0
    def analyze_models(self,
                       bdb,
                       generator_id,
                       modelnos=None,
                       iterations=1,
                       max_seconds=None,
                       ckpt_iterations=None,
                       ckpt_seconds=None,
                       program=None):
        """Run Loom inference over all models of `generator_id`.

        Loom only supports whole-ensemble analysis driven by an
        iteration count, so time limits, checkpoints, inference
        programs, and per-model analysis are all rejected with
        `BQLError`.  After inference the learned partitions are
        persisted and fresh query servers are cached.
        """
        # Reject scheduling features that Loom does not expose.
        if max_seconds is not None:
            raise BQLError(bdb,
                           'Loom analyze does not support number of seconds.')
        if ckpt_iterations is not None or ckpt_seconds is not None:
            raise BQLError(bdb, 'Loom analyze does not support checkpoint.')
        if program is not None:
            raise BQLError(bdb, 'Loom analyze does not support programs.')
        if modelnos is not None:
            raise BQLError(bdb, 'Loom cannot analyze specific model numbers.')

        # Prepare loom.tasks.infer arguments: always run at least one pass.
        num_models = (self._get_num_models(bdb, generator_id))
        iterations = max(int(iterations), 1)
        config = {'schedule': {'extra_passes': iterations}}
        project_path = self._get_loom_project_path(bdb, generator_id)

        # Run inference over the full ensemble.
        loom.tasks.infer(project_path, sample_count=num_models, config=config)

        # Persist the inferred column and row partitions.
        self._store_kind_partition(bdb, generator_id, modelnos)
        # Cache fresh query servers built against the new samples.
        self._set_cache_entry(
            bdb, generator_id, 'q_server',
            loom.query.get_server(
                self._get_loom_project_path(bdb, generator_id)))
        preqlServer = loom.tasks.query(
            self._get_loom_project_path(bdb, generator_id))
        self._set_cache_entry(bdb, generator_id, 'preql_server', preqlServer)
Ejemplo n.º 6
0
def bayesdb_population_row_values(bdb, population_id, rowid):
    """Return values stored in `rowid` of given `population_id`.

    Raises `BQLError` if the row does not exist, or if more than one row
    matches (which would indicate a corrupt table).
    """
    table_name = bayesdb_population_table(bdb, population_id)
    column_names = bayesdb_variable_names(bdb, population_id, None)
    qt = sqlite3_quote_name(table_name)
    qcns = ','.join(map(sqlite3_quote_name, column_names))
    select_sql = ('SELECT %s FROM %s WHERE oid = ?' % (qcns, qt))
    cursor = bdb.sql_execute(select_sql, (rowid,))
    row = None
    try:
        # next() builtin rather than cursor.next(), which does not exist
        # on Python 3 iterators.
        row = next(cursor)
    except StopIteration:
        population = bayesdb_population_table(bdb, population_id)
        raise BQLError(bdb, 'No such row in table %s for population %s: %d'
            % (repr(table_name), repr(population), rowid))
    try:
        next(cursor)
    except StopIteration:
        pass
    else:
        population = bayesdb_population_table(bdb, population_id)
        raise BQLError(bdb,
            'More than one such row in table %s for population %s: %d'
            % (repr(table_name), repr(population), rowid))
    return row
Ejemplo n.º 7
0
def bql_row_predictive_relevance(bdb, population_id, generator_id, modelnos,
                                 rowid_target, rowid_query, colno,
                                 *constraint_args):
    """Mean predictive relevance of `rowid_target` to the query rows.

    `rowid_query` is a JSON-encoded list of rowids.  `constraint_args`
    encodes hypothetical rows as flat runs of alternating (colno, value)
    entries, with a None sentinel terminating each row (the final element
    must be a sentinel).  The per-generator relevances are averaged over
    the generators selected by `_retrieve_generator_ids`.
    """
    if rowid_target is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    rowid_query = json.loads(rowid_query)
    modelnos = _retrieve_modelnos(modelnos)
    # Build the list of hypothetical values.
    # Each sequence of values is separated by None to demarcate between rows.
    # `splits` holds the index just before each row's first entry; the
    # trailing sentinel must be the last element of constraint_args.
    splits = [-1] + [i for i, x in enumerate(constraint_args) if x is None]
    assert splits[-1] == len(constraint_args) - 1
    rows_list = [
        constraint_args[splits[i] + 1:splits[i + 1]]
        for i in range(len(splits) - 1)
    ]
    # Each row must be a flat (colno, value, colno, value, ...) run.
    assert all(len(row) % 2 == 0 for row in rows_list)
    hypotheticals = [zip(row[::2], row[1::2]) for row in rows_list]
    if len(rowid_query) == 0 and len(hypotheticals) == 0:
        raise BQLError(bdb, 'No matching rows for PREDICTIVE RELEVANCE.')

    def generator_similarity(generator_id):
        # Relevance of the target row to the query rows under one generator.
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        return metamodel.predictive_relevance(bdb, generator_id, modelnos,
                                              rowid_target, rowid_query,
                                              hypotheticals, colno)

    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    sims = map(generator_similarity, generator_ids)
    return stats.arithmetic_mean([stats.arithmetic_mean(s) for s in sims])
Ejemplo n.º 8
0
def parse(schema, subsample_default):
    '''Parses a generator schema as passed to CrosscatMetamodel.

    schema is a tokenized expression of the form [['GUESS', ['*']], ['x',
    'NUMERICAL'], ...] that is passed to CrosscatMetamodel.create_generator and
    represents the argument to "crosscat" in CREATE GENERATOR ... FOR ... USING
    crosscat(...).

    Returns a GeneratorSchema.

    See test_crosscat_generator_schema.py for examples.
    '''

    guess = False
    subsample = subsample_default
    columns = []
    dep_constraints = []
    for directive in schema:

        if directive == []:
            # Skip extra commas so you can write
            #
            #    CREATE GENERATOR t_cc FOR t USING crosscat(
            #        x,
            #        y,
            #        z,
            #    )
            continue

        # Every directive must be a two-element [name, argument] list.
        if (not isinstance(directive, list) or len(directive) != 2
                or not isinstance(directive[0], basestring)):
            raise BQLError(
                None,
                'Invalid crosscat column model directive: %r' % (directive, ))

        op = casefold(directive[0])
        if op == 'guess' and directive[1] == ['*']:
            # GUESS(*): guess the model for every column.
            guess = True
        elif (op == 'subsample' and isinstance(directive[1], list)
              and len(directive[1]) == 1):
            # SUBSAMPLE(<count>) or SUBSAMPLE(OFF).
            subsample = _parse_subsample_clause(directive[1][0])
        elif op == 'dependent':
            constraint = (_parse_dependent_clause(directive[1]), True)
            dep_constraints.append(constraint)
        elif op == 'independent':
            constraint = (_parse_dependent_clause(directive[1]), False)
            dep_constraints.append(constraint)
        elif op != 'guess' and casefold(directive[1]) != 'guess':
            # <column> <stattype>: an explicit column model.
            columns.append((directive[0], directive[1]))
        else:
            raise BQLError(
                None,
                'Invalid crosscat column model: %r' % (directive),
            )
    return GeneratorSchema(guess=guess,
                           subsample=subsample,
                           columns=columns,
                           dep_constraints=dep_constraints)
Ejemplo n.º 9
0
def bayesdb_simulate(bdb, population_id, constraints, colnos,
        generator_id=None, numpredictions=1, accuracy=None):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, with a value for each
    column specified in the list `colnos`, conditioned on the
    constraints in the list `constraints` of tuples ``(colno,
    value)``.

    The results are simulated from the predictive distribution on
    fresh rows.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints is not None:
        # A constraint whose "column" is a rowid token names an existing
        # row instead; at most one such rowid may be supplied.
        user_rowid = [
            v for c, v in constraints
            if c in core.bayesdb_rowid_tokens(bdb)
        ]
        if len(user_rowid) == 1:
            rowid = user_rowid[0]
        elif len(user_rowid) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints,))
        constraints = [
            (rowid, c, v) for c, v in constraints
            if c not in core.bayesdb_rowid_tokens(bdb)
        ]
    targets = [(rowid, colno) for colno in colnos]
    def loglikelihood(generator_id, metamodel):
        # Mixture weight for this generator: likelihood of the constraints.
        if not constraints:
            return 0
        return metamodel.logpdf_joint(
            bdb, generator_id, constraints, [], None)
    def simulate(generator_id, metamodel, n):
        # Draw n joint samples of the targets from one generator.
        return metamodel.simulate_joint(
            bdb, generator_id, targets,
            constraints, None, num_predictions=n, accuracy=accuracy)
    generator_ids = [generator_id] if generator_id is not None else \
        core.bayesdb_population_generators(bdb, population_id)
    metamodels = [core.bayesdb_generator_metamodel(bdb, generator_id)
        for generator_id in generator_ids]
    if len(generator_ids) > 1:
        # Apportion the requested predictions among the generators
        # according to each one's posterior weight.
        loglikelihoods = map(loglikelihood, generator_ids, metamodels)
        likelihoods = map(math.exp, loglikelihoods)
        total_likelihood = sum(likelihoods)
        if total_likelihood == 0:
            # XXX Show the constraints with symbolic names.
            raise BQLError(bdb, 'Impossible constraints: %r' % (constraints,))
        probabilities = [likelihood/total_likelihood
            for likelihood in likelihoods]
        countses = bdb.np_prng.multinomial(
            numpredictions, probabilities, size=1)
        counts = countses[0]
    elif len(generator_ids) == 1:
        counts = [numpredictions]
    else:
        # No generators in the population: nothing to simulate.  (The
        # previous unconditional [numpredictions] made map() pad the
        # empty generator list with None when generator_ids was empty.)
        counts = []
    rowses = map(simulate, generator_ids, metamodels, counts)
    all_rows = [row for rows in rowses for row in rows]
    assert all(isinstance(row, (tuple, list)) for row in all_rows)
    return all_rows
Ejemplo n.º 10
0
 def simulate_column(exp):
     """Return a list of simulated values for the expression `exp`.

     Supports plain column references and MUTUAL INFORMATION; the other
     BQL estimands are not yet simulable and raise `BQLError`.
     """
     if isinstance(exp, ast.ExpCol):
         # XXX This is wrong -- it returns independent samples from
         # the marginals of each variable, not one sample from the
         # joint on all variables.
         if False:
             # Dead branch: conditional simulation is not supported yet.
             raise BQLError(
                 bdb, 'SIMULATE FROM MODELS OF can\'t sample conditional')
             # XXX Gotta weight each model by probability of
             # constraints.
             constraints = [(retrieve_variable(v), retrieve_literal(e))
                            for v, e in simulation.constraints]
         else:
             constraints = []
         colnos = [retrieve_variable(exp.column)]
         accuracy = 1  # XXX Allow nontrivial accuracy?
         samples = bqlfn.bayesdb_simulate(bdb,
                                          population_id,
                                          constraints,
                                          colnos,
                                          generator_id=generator_id,
                                          numpredictions=1,
                                          accuracy=accuracy)
         # Each sample is a tuple with one entry per target column; we
         # requested a single column, so keep only that entry.
         return [sample[0] for sample in samples]
     elif isinstance(exp, ast.ExpBQLDepProb):
         raise BQLError(
             bdb, 'DEPENDENCE PROBABILITY simulation still unsupported.')
     elif isinstance(exp, ast.ExpBQLProbDensity):
         raise BQLError(
             bdb, 'PROBABILITY DENSITY OF simulation still unsupported.')
     elif isinstance(exp, ast.ExpBQLMutInf):
         colnos0 = [retrieve_variable(c) for c in exp.columns0]
         colnos1 = [retrieve_variable(c) for c in exp.columns1]
         constraint_args = ()
         if exp.constraints is not None:
             # Flatten [(name, expr), ...] into (colno, value, ...).
             constraint_args = tuple(
                 itertools.chain.from_iterable(
                     [[retrieve_variable(colname),
                       retrieve_literal(expr)]
                      for colname, expr in exp.constraints]))
         nsamples = exp.nsamples and retrieve_literal(exp.nsamples)
         # One mi_list per generator of the population.
         #
         # XXX fsaad@20170625: Setting modelnos = None arbitrarily, figure
         # out how to set the modelnos argument.
         mi_lists = bqlfn._bql_column_mutual_information(
             bdb, population_id, generator_id, None, colnos0, colnos1,
             nsamples, *constraint_args)
         return list(itertools.chain.from_iterable(mi_lists))
     else:
         raise BQLError(
             bdb, 'Only constants can be simulated: %s.' % (simulation, ))
Ejemplo n.º 11
0
def bql_column_correlation(bdb, population_id, _generator_id, colno0, colno1):
    """Correlation of the observed data for two population variables.

    Latent variables (negative colnos) have no observed data and are
    rejected; the correlation method is chosen by the pair of stattypes.
    """
    # Check both columns, in order, for latency.
    for colno in (colno0, colno1):
        if colno < 0:
            raise BQLError(bdb, 'No correlation for latent variable: %r' %
                (core.bayesdb_variable_name(bdb, population_id, colno),))
    st0, st1, data0, data1 = bql_variable_stattypes_and_data(bdb,
        population_id, colno0, colno1)
    if (st0, st1) not in correlation_methods:
        raise NotImplementedError('No correlation method for %s/%s.' %
            (st0, st1))
    return correlation_methods[st0, st1](data0, data1)
Ejemplo n.º 12
0
    def _schema(self, bdb, generator_id):
        """Return the deserialized CGPM schema for `generator_id`,
        memoizing it in the per-bdb cache when one is available."""
        cache = self._cache(bdb)
        if cache is not None and generator_id in cache.schema:
            return cache.schema[generator_id]

        # Cache miss: fetch the schema JSON from the database.
        cursor = bdb.sql_execute(
            '''
            SELECT schema_json FROM bayesdb_cgpm_generator
                WHERE generator_id = ?
        ''', (generator_id, ))
        schema_json = cursor_value(cursor, nullok=True)
        if schema_json is None:
            generator = core.bayesdb_generator_name(bdb, generator_id)
            raise BQLError(bdb, 'No such CGPM generator: %r' % (generator, ))

        # Deserialize, then memoize for next time if possible.
        schema = json.loads(schema_json)
        if cache is not None:
            cache.schema[generator_id] = schema
        return schema
Ejemplo n.º 13
0
 def create_generator(self, bdb, table, schema, instantiate):
     """Instantiate the generator and store per-column sufficient
     statistics (count, sum, sum of squares).

     The schema is the column list.  May want to change this later
     to make room for specifying the hyperparameters, etc.  Every
     modeled column must be numerical.
     """
     insert_column_sql = '''
         INSERT INTO bayesdb_nig_normal_column
             (generator_id, colno, count, sum, sumsq)
             VALUES (:generator_id, :colno, :count, :sum, :sumsq)
     '''
     with bdb.savepoint():
         generator_id, column_list = instantiate(schema)
         for colno, name, stattype in column_list:
             if stattype != 'numerical':
                 raise BQLError(
                     bdb, 'NIG-Normal only supports'
                     ' numerical columns, but %s is %s' %
                     (repr(name), repr(stattype)))
             count, xsum, sumsq = data_suff_stats(bdb, table, name)
             bdb.sql_execute(insert_column_sql, {
                 'generator_id': generator_id,
                 'colno': colno,
                 'count': count,
                 'sum': xsum,
                 'sumsq': sumsq,
             })
Ejemplo n.º 14
0
def _parse_subsample_clause(clause):
    """Interpret a crosscat SUBSAMPLE clause.

    'off' (case-insensitive) disables subsampling; an integer gives the
    subsample size; anything else is an error.
    """
    if isinstance(clause, int):
        return clause
    if isinstance(clause, basestring) and casefold(clause) == 'off':
        return False
    raise BQLError(None, 'Invalid subsampling: %r' % (clause, ))
Ejemplo n.º 15
0
    def _engine(self, bdb, generator_id):
        """Return the deserialized CGPM engine for `generator_id`.

        The engine is loaded from bayesdb_cgpm_generator and memoized
        in the per-bdb cache when one is available.  Raises `BQLError`
        if no models have been initialized for the generator.
        """
        # Probe the cache.
        cache = self._cache(bdb)
        if cache is not None and generator_id in cache.engine:
            return cache.engine[generator_id]

        # Not cached.  Load the engine from the database.
        cursor = bdb.sql_execute(
            '''
            SELECT engine_json FROM bayesdb_cgpm_generator
                WHERE generator_id = ?
        ''', (generator_id, ))
        engine_json = cursor_value(cursor)
        if engine_json is None:
            generator = core.bayesdb_generator_name(bdb, generator_id)
            raise BQLError(
                bdb, 'No models initialized for generator: %r' % (generator, ))

        # Deserialize the engine, sharing the bdb's numpy PRNG.
        engine = Engine.from_metadata(json.loads(engine_json),
                                      rng=bdb.np_prng,
                                      multiprocess=self._ncpu)

        # Cache it, if we can.
        if cache is not None:
            cache.engine[generator_id] = engine
        return engine
Ejemplo n.º 16
0
 def drop_models(self, bdb, generator_id, modelnos=None):
     """Drop all Loom models for `generator_id`.

     Loom can only drop the whole ensemble, so `modelnos` must be None.
     Clears the stored partitions, closes cached servers, zeroes the
     model count, and deletes the sample directories on disk.
     """
     import shutil
     with bdb.savepoint():
         if modelnos is not None:
             raise BQLError(bdb, 'Loom cannot drop specific model numbers.')
         bdb.sql_execute(
             '''
             DELETE FROM bayesdb_loom_column_kind_partition
             WHERE generator_id = ?
         ''', (generator_id, ))
         bdb.sql_execute(
             '''
             DELETE FROM bayesdb_loom_row_kind_partition
             WHERE generator_id = ?
         ''', (generator_id, ))
         # Close the servers.
         self._close_query_server(bdb, generator_id)
         self._close_preql_server(bdb, generator_id)
         bdb.sql_execute(
             '''
             UPDATE bayesdb_loom_generator_model_info
             SET num_models = 0
             WHERE generator_id = ?
         ''', (generator_id, ))
         # Remove directories stored on disk.
         project_path = self._get_loom_project_path(bdb, generator_id)
         paths = loom.store.get_paths(project_path)
         if 'root' in paths:
             folder_with_models = os.path.join(paths['root'], 'samples')
             # shutil.rmtree instead of os.system('rm -rf ...'): no shell,
             # so no quoting hazard if the path contains spaces or shell
             # metacharacters; ignore_errors matches rm -f's best-effort.
             shutil.rmtree(folder_with_models, ignore_errors=True)
Ejemplo n.º 17
0
    def ungrouped_schema():
        """Render `guesses` as an MML schema string.

        Each variable maps to a (type, reason) pair; 'key'/'ignore'
        types become IGNORE clauses and everything else becomes a
        MODEL ... AS clause, with the guessing reason appended as a
        triple-quote comment.  Empty variable names are an error.
        """
        schema = ''
        for i, var in enumerate(guesses.keys()):
            if len(var) > 0:
                guessed_type_reason = guesses[var]
                guessed_type = guessed_type_reason[0].lower()
                guessed_reason = guessed_type_reason[1]

                # Ignore the type key as well as ignore.
                if guessed_type in ['key', 'ignore']:
                    schema += 'IGNORE %s' % (var,)
                else:
                    schema += 'MODEL %s AS %s' % (var, guessed_type.upper(),)

                # Append a semicolon if not last var in schema.
                if i != len(guesses.keys()) - 1:
                    schema += ';'

                # Comment out the reason (if any) with a triple-quote so
                # the remainder of the line is ignored by the parser.
                if len(guessed_reason) > 0:
                    schema += "'''# %s" % (guessed_reason,)
                else:
                    if guessed_type == 'key':
                        schema += "'''# This variable is a key."

                schema += os.linesep
            else:
                raise BQLError(bdb, 'Empty column name(s) in table %s' % \
                    (tablename,))

            # If reason was commented on previous line, need triple quote to
            # re-enter schema string.
            if len(guessed_reason) > 0 or guessed_type == 'key':
                schema += "''' %s" % (os.linesep,)

        return schema
Ejemplo n.º 18
0
def bql_row_similarity(bdb, generator_id, modelno, rowid, target_rowid,
                       *colnos):
    """Similarity of `rowid` to `target_rowid` under the generator's
    metamodel, in the context of `colnos` (every generator column when
    none are given)."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    context = colnos or core.bayesdb_generator_column_numbers(bdb,
        generator_id)
    return metamodel.row_similarity(bdb, generator_id, modelno, rowid,
                                    target_rowid, context)
Ejemplo n.º 19
0
def _parse_dependent_clause(args):
    i = 0
    dep_columns = []
    while i < len(args):
        dep_columns.append(args[i])
        if i + 1 < len(args) and args[i + 1] != ',':
            raise BQLError(None, 'Invalid dependent columns: %r' % (args, ))
        i += 2
    return dep_columns
Ejemplo n.º 20
0
def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
                       *colnos):
    """Mean similarity of `rowid` to `target_rowid` across the selected
    generators, in the context of exactly one column."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if len(colnos) == 0:
        # Default context: every variable of the population.
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
                                               generator_id)
    if len(colnos) != 1:
        raise BQLError(bdb,
                       'Multiple with respect to columns: %s.' % (colnos, ))

    def similarity_under(gid):
        mm = core.bayesdb_generator_metamodel(bdb, gid)
        return mm.row_similarity(bdb, gid, None, rowid, target_rowid, colnos)

    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    return stats.arithmetic_mean(
        [similarity_under(gid) for gid in generator_ids])
Ejemplo n.º 21
0
 def register(self, bdb):
     """Install or migrate the NIG-Normal schema in `bdb`.

     Schema versions are applied cumulatively: a fresh database gets
     version 1 and then the 1->2 upgrade; ending on any version other
     than 2 is an error.
     """
     with bdb.savepoint():
         version = bayesdb_metamodel_version(bdb, self.name())
         if version is None:
             # Never installed: create schema version 1.
             bdb.sql_execute(nig_normal_schema_1)
             version = 1
         if version == 1:
             # Upgrade version 1 to version 2.
             bdb.sql_execute(nig_normal_schema_2)
             version = 2
         if version != 2:
             raise BQLError(bdb, 'NIG-Normal already installed'
                 ' with unknown schema version: %d' % (version,))
Ejemplo n.º 22
0
def bayesdb_simulate(
        bdb, population_id, generator_id, modelnos, constraints, colnos,
        numpredictions=1, accuracy=None):
    """Simulate rows from a generative model, subject to constraints.

    Returns a list of `numpredictions` tuples, with a value for each
    column specified in the list `colnos`, conditioned on the
    constraints in the list `constraints` of tuples ``(colno,
    value)``.

    The results are simulated from the predictive distribution on
    fresh rows.
    """
    modelnos = _retrieve_modelnos(modelnos)
    # Separate any user-supplied rowid from the value constraints.
    rowid, constraints = _retrieve_rowid_constraints(
        bdb, population_id, constraints)
    def loglikelihood(generator_id, metamodel):
        # Mixture weight for a generator: log likelihood of constraints.
        if not constraints:
            return 0
        return metamodel.logpdf_joint(
            bdb, generator_id, modelnos, rowid, constraints, [])
    def simulate(generator_id, metamodel, n):
        # Draw n joint samples of the target columns from one generator.
        return metamodel.simulate_joint(
            bdb, generator_id, modelnos, rowid, colnos, constraints,
            num_samples=n, accuracy=accuracy)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    metamodels = [
        core.bayesdb_generator_metamodel(bdb, generator_id)
        for generator_id in generator_ids
    ]
    if len(generator_ids) > 1:
        # Multiple generators: apportion the predictions among them by
        # the likelihood of the constraints under each one.
        loglikelihoods = map(loglikelihood, generator_ids, metamodels)
        likelihoods = map(math.exp, loglikelihoods)
        total_likelihood = sum(likelihoods)
        if total_likelihood == 0:
            # XXX Show the constraints with symbolic names.
            raise BQLError(bdb, 'Impossible constraints: %r' % (constraints,))
        probabilities = [
            likelihood / total_likelihood
            for likelihood in likelihoods
        ]
        countses = bdb.np_prng.multinomial(
            numpredictions, probabilities, size=1)
        counts = countses[0]
    elif len(generator_ids) == 1:
        counts = [numpredictions]
    else:
        # No generators: nothing to simulate.
        counts = []
    rowses = map(simulate, generator_ids, metamodels, counts)
    all_rows = [row for rows in rowses for row in rows]
    assert all(isinstance(row, (tuple, list)) for row in all_rows)
    return all_rows
Ejemplo n.º 23
0
def bql_row_similarity(
        bdb, population_id, generator_id, modelnos, rowid, target_rowid, colno):
    """Mean similarity of `rowid` to `target_rowid` with respect to
    `colno`, averaged over the selected generators."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    modelnos = _retrieve_modelnos(modelnos)
    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    # XXX Change [colno] to colno by updating IBayesDBMetamodel.
    per_generator = [
        core.bayesdb_generator_metamodel(bdb, gid).row_similarity(
            bdb, gid, modelnos, rowid, target_rowid, [colno])
        for gid in generator_ids
    ]
    return stats.arithmetic_mean(per_generator)
Ejemplo n.º 24
0
def _retrieve_rowid_constraints(bdb, population_id, constraints):
    """Split any user-specified rowid out of `constraints`.

    Returns (rowid, constraints) where rowid is the user-supplied one
    when exactly one was given, otherwise a fresh row id, and the
    returned constraints have the rowid tokens removed.  More than one
    rowid is an error.
    """
    rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    if constraints:
        tokens = core.bayesdb_rowid_tokens(bdb)
        rowids = [v for c, v in constraints if c in tokens]
        if len(rowids) > 1:
            raise BQLError(bdb, 'Multiple rowids given: %s.' % (constraints, ))
        if rowids:
            rowid = rowids[0]
        constraints = [(c, v) for c, v in constraints if c not in tokens]
    return rowid, constraints
Ejemplo n.º 25
0
 def register(self, bdb):
     """Ensure the CGPM metamodel schema is installed in `bdb`."""
     with bdb.savepoint():
         # Get the current version, if there is one.
         version = bayesdb_metamodel_version(bdb, self.name())
         # Check the version.
         if version is None:
             # No version -- CGPM schema not instantiated.
             # Instantiate it.
             bdb.sql_execute(CGPM_SCHEMA_1)
             version = 1
         if version != 1:
             # Unrecognized version.
             raise BQLError(
                 bdb, 'CGPM already installed'
                 ' with unknown schema version: %d' % (version, ))
Ejemplo n.º 26
0
def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
        *colnos):
    """Mean similarity of `rowid` to `target_rowid` over the population's
    generators, in the context of `colnos` (all variables when empty)."""
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if len(colnos) == 0:
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
            generator_id)
    if generator_id is not None:
        generator_ids = [generator_id]
    else:
        generator_ids = core.bayesdb_population_generators(bdb, population_id)
    similarities = [
        core.bayesdb_generator_metamodel(bdb, gid).row_similarity(
            bdb, gid, None, rowid, target_rowid, colnos)
        for gid in generator_ids
    ]
    return stats.arithmetic_mean(similarities)
Ejemplo n.º 27
0
def bayesdb_generator_cell_value(bdb, generator_id, rowid, colno):
    """Return the value stored at (`rowid`, `colno`) in the generator's
    table.

    Raises `BQLError` if the row does not exist.
    """
    table_name = bayesdb_generator_table(bdb, generator_id)
    colname = bayesdb_generator_column_name(bdb, generator_id, colno)
    qt = sqlite3_quote_name(table_name)
    qcn = sqlite3_quote_name(colname)
    value_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qcn, qt)
    value_cursor = bdb.sql_execute(value_sql, (rowid, ))
    try:
        # next() builtin rather than cursor.next(), which does not exist
        # on Python 3 iterators.
        row = next(value_cursor)
    except StopIteration:
        generator = bayesdb_generator_name(bdb, generator_id)
        raise BQLError(bdb, 'No such row in %s: %d' % (repr(generator), rowid))
    # One selected column, so the row is a 1-tuple.
    assert len(row) == 1
    return row[0]
Ejemplo n.º 28
0
 def _check_loom_initialized(self, bdb, generator_id):
     """Raise `BQLError` unless ANALYZE has populated Loom's partitions.

     Both the row and the column kind-partition tables must be nonempty
     for this generator.
     """
     # Not invoked on a per-query basis due to high overhead.
     cursor = bdb.sql_execute(
         '''
         SELECT COUNT(*)
         FROM bayesdb_loom_row_kind_partition
         WHERE generator_id = ?
     ''', (generator_id, ))
     count_row = cursor.fetchall()
     # Bug fix: this query previously re-read the *row* partition table,
     # so an empty column partition went undetected.
     cursor = bdb.sql_execute(
         '''
         SELECT COUNT(*)
         FROM bayesdb_loom_column_kind_partition
         WHERE generator_id = ?
     ''', (generator_id, ))
     count_col = cursor.fetchall()
     if count_row[0][0] == 0 or count_col[0][0] == 0:
         raise BQLError(bdb, 'Analyze must be run before any BQL'\
             ' queries when using loom.')
Ejemplo n.º 29
0
 def predictive_relevance(self, bdb, generator_id, modelnos, rowid_target,
                          rowid_queries, hypotheticals, colno):
     """For each query row, the fraction of models in which it shares a
     partition with `rowid_target` within the kind containing `colno`.

     Hypothetical rows are rejected: Loom cannot insert new rows into
     an existing CrossCat state.
     """
     if len(hypotheticals) > 0:
         raise BQLError(bdb, 'Loom cannot handle hypothetical rows' \
             ' because it is unable to insert rows into CrossCat')
     if modelnos is None:
         # Default: consider every model of the generator.
         modelnos = range(self._get_num_models(bdb, generator_id))
     # One co-occurrence tally per query row.
     relevances = [0] * len(rowid_queries)
     for modelno in modelnos:
         # The kind (view) that owns `colno` in this model.
         kind_id_context = self._get_kind_id(bdb, generator_id, modelno,
                                             colno)
         partition_id_target = self._get_partition_id(
             bdb, generator_id, modelno, kind_id_context, rowid_target)
         for idx, rowid in enumerate(rowid_queries):
             partition_id_query = self._get_partition_id(
                 bdb, generator_id, modelno, kind_id_context, rowid)
             if partition_id_target == partition_id_query:
                 relevances[idx] += 1
     # XXX This procedure appears to be computing the wrong thing.
     return [xsum / float(len(modelnos)) for xsum in relevances]
Ejemplo n.º 30
0
 def register(self, bdb):
     """Install the IID-Gaussian schema in `bdb` if it is not present.

     A missing bayesdb_metamodel entry means version 0 (never
     installed); anything other than version 1 afterwards is an error.
     """
     with bdb.savepoint():
         schema_sql = 'SELECT version FROM bayesdb_metamodel WHERE name = ?'
         cursor = bdb.sql_execute(schema_sql, (self.name(),))
         try:
             # next() builtin rather than cursor.next(), which does not
             # exist on Python 3 iterators.
             row = next(cursor)
         except StopIteration:
             version = 0
         else:
             version = row[0]
         if version == 0:
             # XXX WHATTAKLUDGE!
             for stmt in std_normal_schema_1.split(';'):
                 bdb.sql_execute(stmt)
             version = 1
         if version != 1:
             raise BQLError(bdb, 'IID-Gaussian already installed'
                 ' with unknown schema version: %d' % (version,))