Beispiel #1
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Record the NIG-Normal state for a newly created generator.

        For every variable number that core.bayesdb_variable_numbers
        reports for the generator's population (queried with a generator
        of None), insert the (count, sum, sumsq) triple returned by
        data_suff_stats into bayesdb_nig_normal_column.

        Then, unless schema is exactly [[]], treat each schema clause as
        [<name>, 'deviation', [<observed name>]]: add <name> as a latent
        'numerical' variable and record its link to the observed
        variable in bayesdb_nig_normal_deviation.

        Raises BQLError if a variable's stattype is not 'numerical', if
        a clause does not have the expected shape, or if the observed
        variable does not exist in the population.
        """
        def is_deviation_clause(clause):
            # Expected shape: [<str>, 'deviation', [<str>]].
            return (
                len(clause) == 3
                and isinstance(clause[0], str)
                and clause[1] == 'deviation'
                and isinstance(clause[2], list)
                and len(clause[2]) == 1
                and isinstance(clause[2][0], str)
            )

        # XXX Do something with the schema.
        pop_id = core.bayesdb_generator_population(bdb, generator_id)
        table_name = core.bayesdb_population_table(bdb, pop_id)
        insert_column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''

        # One row of sufficient statistics per population variable.
        for colno in core.bayesdb_variable_numbers(bdb, pop_id, None):
            var_name = core.bayesdb_variable_name(
                bdb, pop_id, generator_id, colno)
            var_stattype = core.bayesdb_variable_stattype(
                bdb, pop_id, generator_id, colno)
            if var_stattype != 'numerical':
                message = (
                    'NIG-Normal only supports numerical columns, but %s is %s'
                    % (repr(var_name), repr(var_stattype)))
                raise BQLError(bdb, message)
            n_obs, total, total_sq = data_suff_stats(bdb, table_name, var_name)
            row = {
                'population_id': pop_id,
                'generator_id': generator_id,
                'colno': colno,
                'count': n_obs,
                'sum': total,
                'sumsq': total_sq,
            }
            bdb.sql_execute(insert_column_sql, row)

        # XXX Make the schema a little more flexible.
        if schema == [[]]:
            # Empty schema: no deviation variables to register.
            return
        for clause in schema:
            if not is_deviation_clause(clause):
                raise BQLError(bdb,
                               'Invalid nig_normal clause: %r' % (clause, ))
            latent_name = clause[0]
            observed_name = clause[2][0]
            # The observed variable must already exist; the lookup is
            # done with a generator of None.
            observed_known = core.bayesdb_has_variable(
                bdb, pop_id, None, observed_name)
            if not observed_known:
                raise BQLError(bdb,
                               'No such variable: %r' % (observed_name, ))
            observed_colno = core.bayesdb_variable_number(
                bdb, pop_id, None, observed_name)
            latent_colno = core.bayesdb_add_latent(
                bdb, pop_id, generator_id, latent_name, 'numerical')
            bdb.sql_execute(
                '''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            ''', (pop_id, generator_id, latent_colno, observed_colno))
Beispiel #2
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Populate the NIG-Normal tables for a freshly created generator.

        Step 1: for each variable number of the generator's population,
        require its stattype to be 'numerical' and store the values
        returned by data_suff_stats (count, sum, sum of squares as named
        by the target columns) in bayesdb_nig_normal_column.

        Step 2: a schema equal to [[]] adds nothing further.  Otherwise
        each clause must look like [<str>, 'deviation', [<str>]]; the
        first name is added as a latent 'numerical' variable and paired
        with the existing variable named inside the list via a row in
        bayesdb_nig_normal_deviation.

        Raises BQLError for a non-numerical variable, a malformed
        clause, or an unknown observed variable.
        """
        # XXX Do something with the schema.
        column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''
        population = core.bayesdb_generator_population(bdb, generator_id)
        base_table = core.bayesdb_population_table(bdb, population)
        colnos = core.bayesdb_variable_numbers(bdb, population, None)
        for colno in colnos:
            name = core.bayesdb_variable_name(
                bdb, population, generator_id, colno)
            stattype = core.bayesdb_variable_stattype(
                bdb, population, generator_id, colno)
            if stattype != 'numerical':
                raise BQLError(
                    bdb,
                    'NIG-Normal only supports numerical columns, but %s is %s'
                    % (repr(name), repr(stattype)))
            stats = data_suff_stats(bdb, base_table, name)
            (count, xsum, sumsq) = stats
            bdb.sql_execute(column_sql, {
                'population_id': population,
                'generator_id': generator_id,
                'colno': colno,
                'count': count,
                'sum': xsum,
                'sumsq': sumsq,
            })

        # XXX Make the schema a little more flexible.
        # A schema of exactly [[]] carries no clauses to process.
        clauses = [] if schema == [[]] else schema
        for clause in clauses:
            # Reject anything that is not [<str>, 'deviation', [<str>]].
            if (len(clause) != 3
                    or not isinstance(clause[0], str)
                    or clause[1] != 'deviation'
                    or not isinstance(clause[2], list)
                    or len(clause[2]) != 1
                    or not isinstance(clause[2][0], str)):
                raise BQLError(bdb, 'Invalid nig_normal clause: %r' %
                    (clause,))
            deviation_name, observed_name = clause[0], clause[2][0]
            if not core.bayesdb_has_variable(
                    bdb, population, None, observed_name):
                raise BQLError(bdb, 'No such variable: %r' % (observed_name,))
            observed_colno = core.bayesdb_variable_number(
                bdb, population, None, observed_name)
            # The deviation variable is added as a latent variable of
            # this generator.
            deviation_colno = core.bayesdb_add_latent(
                bdb, population, generator_id, deviation_name, 'numerical')
            link = (population, generator_id, deviation_colno, observed_colno)
            bdb.sql_execute('''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            ''', link)
Beispiel #3
0
    def create_generator(self, bdb, generator_id, schema_tokens, **kwargs):
        """Create the CGPM bookkeeping rows for a new generator.

        Parses schema_tokens, builds the schema with _create_schema, and
        then:

        * stores the schema as JSON in bayesdb_cgpm_generator;
        * adds every entry of schema['latents'] as a latent variable, in
          sorted order;
        * for each categorical variable with a nonnegative column
          number, assigns consecutive integer codes (starting at 0) to
          its distinct non-NULL values in bayesdb_cgpm_category;
        * assigns consecutive 0-indexed cgpm_rowid values to table rows
          in bayesdb_cgpm_individual, using either every row or, when
          schema['subsample'] is truthy, a reservoir sample of that many
          rows.

        bdb: database handle providing sql_execute and _prng.
        generator_id: id of the generator being created.
        schema_tokens: tokens passed to cgpm_schema.parse.parse.
        kwargs: forwarded unchanged to _create_schema.
        """
        schema_ast = cgpm_schema.parse.parse(schema_tokens)
        schema = _create_schema(bdb, generator_id, schema_ast, **kwargs)

        # Store the schema.
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_cgpm_generator (generator_id, schema_json)
                VALUES (?, ?)
        ''', (generator_id, json_dumps(schema)))

        # Get the underlying population and table.
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        qt = sqlite3_quote_name(table)

        # Assign latent variable numbers.  Sorting makes the order in
        # which latents are added independent of dict iteration order.
        for var, stattype in sorted(schema['latents'].iteritems()):
            core.bayesdb_add_latent(bdb, population_id, generator_id, var,
                                    stattype)

        # Assign codes to categories and consecutive column numbers to
        # the modelled variables.
        vars_cursor = bdb.sql_execute(
            '''
            SELECT colno, name, stattype FROM bayesdb_variable
                WHERE population_id = ? AND 0 <= colno
        ''', (population_id, ))
        for colno, name, stattype in vars_cursor:
            if _is_categorical(stattype):
                qn = sqlite3_quote_name(name)
                # Identifiers cannot be bound as SQL parameters, so the
                # quoted names are interpolated into the query text.
                cursor = bdb.sql_execute('''
                    SELECT DISTINCT %s FROM %s WHERE %s IS NOT NULL
                ''' % (qn, qt, qn))
                for code, (value, ) in enumerate(cursor):
                    bdb.sql_execute(
                        '''
                        INSERT INTO bayesdb_cgpm_category
                            (generator_id, colno, value, code)
                            VALUES (?, ?, ?, ?)
                    ''', (generator_id, colno, value, code))

        # Assign contiguous 0-indexed ids to the individuals in the
        # table.
        k = schema['subsample']
        if k:
            cursor = bdb.sql_execute(
                'SELECT _rowid_ FROM %s ORDER BY _rowid_ ASC' % (qt, ))
            uniform = bdb._prng.weakrandom_uniform
            # https://en.wikipedia.org/wiki/Reservoir_sampling
            # The first k rows fill the reservoir; each later row i
            # replaces slot r when the draw r falls below k.
            # NOTE(review): assumes weakrandom_uniform(m) returns an
            # integer in [0, m) -- confirm against the PRNG.
            samples = []
            for i, row in enumerate(cursor):
                if i < k:
                    samples.append(row)
                else:
                    r = uniform(i + 1)
                    if r < k:
                        samples[r] = row
            cursor = samples
        else:
            cursor = bdb.sql_execute('SELECT _rowid_ FROM %s' % (qt, ))
        for cgpm_rowid, (table_rowid, ) in enumerate(cursor):
            bdb.sql_execute(
                '''
                INSERT INTO bayesdb_cgpm_individual
                    (generator_id, table_rowid, cgpm_rowid)
                    VALUES (?, ?, ?)
            ''', (generator_id, table_rowid, cgpm_rowid))