예제 #1
0
 def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
                               colnos1, constraints, numsamples):
     """Estimate the (conditional) mutual information of two column sets.

     The column numbers in `colnos0` and `colnos1` are translated to
     variable names, encoded into loom column masks, and handed to the
     loom query server once per conditioning row.  When
     `self._marginize_cmi(constraints)` is true the conditioning rows come
     from `self._get_constraint_rows` (which receives `numsamples`);
     otherwise the single row from `self._get_constraint_row` is used.

     Returns a one-element list holding the arithmetic mean of the
     `.mean` attribute of each per-row estimate.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)

     def _names(colnos):
         # Names are passed through `str` before being encoded.
         return [
             str(bayesdb_variable_name(
                 bdb, population_id, generator_id, colno))
             for colno in colnos
         ]

     names0 = _names(colnos0)
     names1 = _names(colnos1)
     server = self._get_preql_server(bdb, generator_id)
     target_mask = server._cols_to_mask(server.encode_set(names0))
     query_mask = server._cols_to_mask(server.encode_set(names1))

     if self._marginize_cmi(constraints):
         conditioning_rows = self._get_constraint_rows(
             constraints, bdb, generator_id, population_id, modelnos,
             server, numsamples)
     else:
         conditioning_rows = [self._get_constraint_row(
             constraints, bdb, generator_id, population_id, server)]

     estimates = []
     for conditioning_row in conditioning_rows:
         result = server._query_server.mutual_information(
             target_mask,
             query_mask,
             entropys=None,
             sample_count=loom.preql.SAMPLE_COUNT,
             conditioning_row=conditioning_row)
         estimates.append(result.mean)
     # The caller expects an iterable, hence the single-element list.
     return [arithmetic_mean(estimates)]
예제 #2
0
    def logpdf_joint(self, bdb, generator_id, modelnos, rowid, targets,
            constraints):
        """Score `targets` conditioned on `constraints`.

        Based on Pr[targets|constraints] = Pr[targets, constraints] /
        Pr[constraints]: one row holding targets and constraints and one
        row holding only the constraints are scored by the query server,
        and the difference of the two scores is returned (presumably the
        scores are log densities).  `modelnos` and `rowid` are not used.

        `targets` and `constraints` are iterables of (colno, value) pairs.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        column_names = self._get_ordered_column_names(bdb, generator_id)

        # One slot per ordered column; None is kept for unset columns.
        joint_row = OrderedDict.fromkeys(column_names)
        given_row = OrderedDict.fromkeys(column_names)

        # Targets are set in the joint (numerator) row only.
        for colno, value in targets:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            joint_row[name] = self._convert_to_proper_stattype(
                bdb, generator_id, colno, value)
            given_row[name] = None

        # Constraints are set in both rows.
        for colno, value in constraints:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            converted = self._convert_to_proper_stattype(
                bdb, generator_id, colno, value)
            joint_row[name] = converted
            given_row[name] = converted

        joint_values = joint_row.values()
        given_values = given_row.values()

        server = self._get_query_server(bdb, generator_id)
        joint_score = server.score(joint_values)
        given_score = server.score(given_values)
        return joint_score - given_score
예제 #3
0
    def logpdf_joint(self, bdb, generator_id, modelnos, rowid, targets,
                     constraints):
        """Return score(targets and constraints) minus score(constraints).

        This follows Pr[targets|constraints] = Pr[targets, constraints] /
        Pr[constraints]; the numerator row carries targets and constraints,
        the denominator row carries the constraints only.  Both rows list
        every column from `_get_ordered_column_names`, with None for
        columns that were not given a value.  `modelnos` and `rowid` are
        accepted but not used.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        names_in_order = self._get_ordered_column_names(bdb, generator_id)

        def _resolve(colno, value):
            # Map a (colno, value) pair to (variable name, converted value).
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            converted = self._convert_to_proper_stattype(
                bdb, generator_id, colno, value)
            return name, converted

        numerator = OrderedDict((name, None) for name in names_in_order)
        denominator = OrderedDict((name, None) for name in names_in_order)

        for colno, value in targets:
            name, converted = _resolve(colno, value)
            numerator[name] = converted
            denominator[name] = None
        for colno, value in constraints:
            name, converted = _resolve(colno, value)
            numerator[name] = converted
            denominator[name] = converted

        numerator_values = numerator.values()
        denominator_values = denominator.values()

        server = self._get_query_server(bdb, generator_id)
        numerator_score = server.score(numerator_values)
        denominator_score = server.score(denominator_values)
        return numerator_score - denominator_score
예제 #4
0
파일: bqlfn.py 프로젝트: PeterZs/bayeslite
def bql_column_correlation(bdb, population_id, _generator_id, colno0, colno1):
    """Apply the registered correlation method to two population columns.

    Raises BQLError if either column number is negative (reported as a
    latent variable) and NotImplementedError if `correlation_methods` has
    no entry for the pair of statistical types.  `_generator_id` is
    accepted but not used.
    """
    # Negative column numbers are rejected, first colno0 and then colno1.
    for colno in (colno0, colno1):
        if colno < 0:
            name = core.bayesdb_variable_name(bdb, population_id, colno)
            raise BQLError(
                bdb, 'No correlation for latent variable: %r' % (name,))
    fetched = bql_variable_stattypes_and_data(
        bdb, population_id, colno0, colno1)
    (st0, st1, data0, data1) = fetched
    method_key = (st0, st1)
    if method_key not in correlation_methods:
        raise NotImplementedError(
            'No correlation method for %s/%s.' % method_key)
    method = correlation_methods[method_key]
    return method(data0, data1)
예제 #5
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Store NIG-Normal column statistics and deviation variables.

        For every population variable, the sufficient statistics returned
        by `data_suff_stats` are inserted into `bayesdb_nig_normal_column`.
        Unless `schema` equals `[[]]`, each clause must have the shape
        `[dev_var, 'deviation', [obs_var]]`; a latent numerical variable
        `dev_var` is added and linked to `obs_var` in
        `bayesdb_nig_normal_deviation`.

        Raises BQLError for a non-numerical variable, a malformed clause,
        or a clause naming an unknown observed variable.
        """
        # XXX Do something with the schema.
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        insert_column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''
        for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
            name = core.bayesdb_variable_name(
                bdb, population_id, generator_id, colno)
            stattype = core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno)
            if stattype != 'numerical':
                message = (
                    'NIG-Normal only supports numerical columns, but %s is %s'
                    % (repr(name), repr(stattype)))
                raise BQLError(bdb, message)
            count, xsum, sumsq = data_suff_stats(bdb, table, name)
            column_stats = {
                'population_id': population_id,
                'generator_id': generator_id,
                'colno': colno,
                'count': count,
                'sum': xsum,
                'sumsq': sumsq,
            }
            bdb.sql_execute(insert_column_sql, column_stats)

        # XXX Make the schema a little more flexible.
        if schema == [[]]:
            return
        insert_deviation_sql = '''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            '''
        for clause in schema:
            # The checks short-circuit left to right, so the indexing in
            # the later terms is only reached for clauses of the right size.
            well_formed = (
                len(clause) == 3
                and isinstance(clause[0], str)
                and clause[1] == 'deviation'
                and isinstance(clause[2], list)
                and len(clause[2]) == 1
                and isinstance(clause[2][0], str))
            if not well_formed:
                raise BQLError(
                    bdb, 'Invalid nig_normal clause: %r' % (clause,))
            dev_var = clause[0]
            obs_var = clause[2][0]
            known = core.bayesdb_has_variable(
                bdb, population_id, None, obs_var)
            if not known:
                raise BQLError(bdb, 'No such variable: %r' % (obs_var,))
            obs_colno = core.bayesdb_variable_number(
                bdb, population_id, None, obs_var)
            dev_colno = core.bayesdb_add_latent(
                bdb, population_id, generator_id, dev_var, 'numerical')
            bdb.sql_execute(
                insert_deviation_sql,
                (population_id, generator_id, dev_colno, obs_colno))
예제 #6
0
 def _get_ordered_column_names(self, bdb, generator_id):
     """Return the variable names in loom rank order.

     The order is that of the column numbers produced by
     `self._get_ordered_column_numbers`.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)
     names = []
     for colno in self._get_ordered_column_numbers(bdb, generator_id):
         names.append(
             bayesdb_variable_name(bdb, population_id, None, colno))
     return names
예제 #7
0
 def _get_ordered_column_names(self, bdb, generator_id):
     """List the population variable names by loom rank.

     Each column number from `self._get_ordered_column_numbers` is mapped
     to its variable name, keeping the order of the column numbers.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)
     ordered_colnos = self._get_ordered_column_numbers(bdb, generator_id)

     def _name_of(colno):
         return bayesdb_variable_name(bdb, population_id, None, colno)

     return [_name_of(colno) for colno in ordered_colnos]
예제 #8
0
def bql_variable_stattypes_and_data(bdb, population_id, colno0, colno1):
    """Fetch the stattypes and the jointly non-NULL data of two variables.

    Returns the tuple (st0, st1, data0, data1): the statistical types of
    the two variables followed by two equally long lists of their values,
    taken from the rows of the population's table in which both columns
    are non-NULL.
    """
    st0 = core.bayesdb_variable_stattype(bdb, population_id, colno0)
    st1 = core.bayesdb_variable_stattype(bdb, population_id, colno1)
    qt = sqlite3_quote_name(
        core.bayesdb_population_table(bdb, population_id))
    name0 = core.bayesdb_variable_name(bdb, population_id, colno0)
    name1 = core.bayesdb_variable_name(bdb, population_id, colno1)
    qvn0 = sqlite3_quote_name(name0)
    qvn1 = sqlite3_quote_name(name1)
    # Identifiers cannot be bound as parameters, so the quoted names are
    # interpolated into the statement text.
    data_sql = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (qvn0, qvn1, qt, qvn0, qvn1)
    rows = bdb.sql_execute(data_sql).fetchall()
    data0 = []
    data1 = []
    for row in rows:
        data0.append(row[0])
        data1.append(row[1])
    return (st0, st1, data0, data1)
예제 #9
0
파일: bqlfn.py 프로젝트: probcomp/bayeslite
def bql_variable_stattypes_and_data(bdb, population_id, colno0, colno1):
    """Return (st0, st1, data0, data1) for two population variables.

    `st0` and `st1` are the variables' statistical types.  `data0` and
    `data1` are parallel lists holding the two columns' values for every
    row of the population's table where neither value is NULL.
    """
    st0 = core.bayesdb_variable_stattype(bdb, population_id, None, colno0)
    st1 = core.bayesdb_variable_stattype(bdb, population_id, None, colno1)
    qt = sqlite3_quote_name(
        core.bayesdb_population_table(bdb, population_id))
    varname0 = core.bayesdb_variable_name(bdb, population_id, None, colno0)
    varname1 = core.bayesdb_variable_name(bdb, population_id, None, colno1)
    qvn0, qvn1 = sqlite3_quote_name(varname0), sqlite3_quote_name(varname1)
    # Quoted identifiers are spliced into the SQL text; no values are.
    data_sql = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (qvn0, qvn1, qt, qvn0, qvn1)
    cursor = bdb.sql_execute(data_sql)
    pairs = cursor.fetchall()
    first_column = [pair[0] for pair in pairs]
    second_column = [pair[1] for pair in pairs]
    return (st0, st1, first_column, second_column)
예제 #10
0
파일: bqlfn.py 프로젝트: probcomp/bayeslite
def bql_column_correlation_pvalue(bdb, population_id, generator_id, _modelnos,
        colno0, colno1):
    """Apply the registered correlation p-value method to two columns.

    Raises BQLError if either column number is negative (reported as a
    latent variable) and NotImplementedError if `correlation_p_methods`
    has no entry for the pair of statistical types.  `_modelnos` is
    accepted but not used.
    """
    # Negative column numbers are rejected, first colno0 and then colno1.
    for colno in (colno0, colno1):
        if colno < 0:
            varname = core.bayesdb_variable_name(
                bdb, population_id, generator_id, colno)
            message = (
                'No correlation p-value for latent variable: %r'
                % (varname,))
            raise BQLError(bdb, message)
    fetched = bql_variable_stattypes_and_data(
        bdb, population_id, colno0, colno1)
    (st0, st1, data0, data1) = fetched
    method_key = (st0, st1)
    if method_key not in correlation_p_methods:
        raise NotImplementedError(
            'No correlation pvalue method for %s/%s.' % method_key)
    p_method = correlation_p_methods[method_key]
    return p_method(data0, data1)
예제 #11
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Initialise the NIG-Normal bookkeeping tables for a generator.

        Step 1: every population variable must be numerical; its
        (count, sum, sum of squares) from `data_suff_stats` is written to
        `bayesdb_nig_normal_column`.
        Step 2: unless `schema == [[]]`, every clause of the form
        `[dev_var, 'deviation', [obs_var]]` adds a latent numerical
        variable and records the (deviation, observed) column pair in
        `bayesdb_nig_normal_deviation`.

        Raises BQLError on a non-numerical variable, an invalid clause, or
        an unknown observed variable.
        """
        # XXX Do something with the schema.
        insert_column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        colnos = core.bayesdb_variable_numbers(bdb, population_id, None)
        for colno in colnos:
            column_name = core.bayesdb_variable_name(
                bdb, population_id, generator_id, colno)
            stattype = core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno)
            if stattype != 'numerical':
                raise BQLError(
                    bdb,
                    'NIG-Normal only supports numerical columns, but %s is %s'
                    % (repr(column_name), repr(stattype)))
            suff_stats = data_suff_stats(bdb, table, column_name)
            (count, xsum, sumsq) = suff_stats
            bindings = dict(
                population_id=population_id,
                generator_id=generator_id,
                colno=colno,
                count=count,
                sum=xsum,
                sumsq=sumsq,
            )
            bdb.sql_execute(insert_column_sql, bindings)

        # XXX Make the schema a little more flexible.
        if schema == [[]]:
            return
        for clause in schema:
            # Evaluated left to right with short-circuiting, so each index
            # access is guarded by the length/type test before it.
            valid = len(clause) == 3 \
                and isinstance(clause[0], str) \
                and clause[1] == 'deviation' \
                and isinstance(clause[2], list) \
                and len(clause[2]) == 1 \
                and isinstance(clause[2][0], str)
            if not valid:
                raise BQLError(
                    bdb, 'Invalid nig_normal clause: %r' % (clause,))
            dev_var, obs_var = clause[0], clause[2][0]
            if not core.bayesdb_has_variable(
                    bdb, population_id, None, obs_var):
                raise BQLError(bdb, 'No such variable: %r' % (obs_var,))
            obs_colno = core.bayesdb_variable_number(
                bdb, population_id, None, obs_var)
            dev_colno = core.bayesdb_add_latent(
                bdb, population_id, generator_id, dev_var, 'numerical')
            deviation_row = (
                population_id, generator_id, dev_colno, obs_colno)
            bdb.sql_execute('''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            ''', deviation_row)
예제 #12
0
 def _data_to_schema(self, bdb, population_id, data_by_column):
     """Write the population's loom schema to a temporary JSON file.

     Each population variable is mapped through `STATTYPE_TO_LOOMTYPE`;
     a 'nominal' variable whose column in `data_by_column` has more than
     256 distinct values is treated as 'unbounded_nominal'.

     Returns the NamedTemporaryFile object after its `with` block has
     exited.  It is created with `delete=False`.
     """
     loom_types = {}
     for colno in bayesdb_variable_numbers(bdb, population_id, None):
         name = bayesdb_variable_name(bdb, population_id, None, colno)
         stattype = bayesdb_variable_stattype(
             bdb, population_id, None, colno)
         if stattype == 'nominal':
             distinct_values = set(data_by_column[name])
             if len(distinct_values) > 256:
                 stattype = 'unbounded_nominal'
         loom_types[name] = STATTYPE_TO_LOOMTYPE[stattype]
     with tempfile.NamedTemporaryFile(delete=False) as schema_file:
         serialized = json.dumps(loom_types)
         schema_file.write(serialized)
     return schema_file
예제 #13
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Register a loom generator and ingest its population data.

        Steps, in order: insert a row into `bayesdb_loom_generator`; read
        every population variable's column from the base table; write the
        schema and CSV files and call `loom.tasks.ingest`; store the
        encoding info; record the (generator_id, table_rowid, loom_rowid)
        mapping, where loom_rowid is the 0-based position of the row in
        the `SELECT oid` result.  `schema` and `kwargs` are not used.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store generator info in bdb.
        name = self._generate_name(bdb, generator_id)
        generator_row = (generator_id, name, self.loom_store_path)
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_loom_generator
            (generator_id, name, loom_store_path)
            VALUES (?, ?, ?)
        ''', generator_row)

        # Read the table one column at a time.
        headers = []
        columns = []
        data_by_column = {}
        for colno in bayesdb_variable_numbers(bdb, population_id, None):
            column_name = bayesdb_variable_name(
                bdb, population_id, None, colno)
            headers.append(column_name)
            qt = sqlite3_quote_name(table)
            qcn = sqlite3_quote_name(column_name)
            cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcn, qt))
            values = []
            for (item,) in cursor.fetchall():
                values.append(item)
            columns.append(values)
            data_by_column[column_name] = values
        # Transpose the per-column lists into per-row lists.
        rows = [list(cells) for cells in zip(*columns)]

        # Ingest data into loom.
        schema_file = self._data_to_schema(bdb, population_id, data_by_column)
        csv_file = self._data_to_csv(bdb, headers, rows)
        project_path = self._get_loom_project_path(bdb, generator_id)
        loom.tasks.ingest(
            project_path, rows_csv=csv_file.name, schema=schema_file.name)

        # Store encoding info in bdb.
        self._store_encoding_info(bdb, generator_id)

        # Store rowid mapping in the bdb.
        # NOTE(review): with no rows, `insertions` is empty and the
        # statement ends in a bare VALUES -- confirm an empty table never
        # reaches this point.
        qt = sqlite3_quote_name(table)
        table_rowids = bdb.sql_execute(
            'SELECT oid FROM %s' % (qt,)).fetchall()
        mapping_rows = []
        for loom_rowid, (table_rowid,) in enumerate(table_rowids):
            mapping_rows.append(
                str((generator_id, table_rowid, loom_rowid)))
        insertions = ','.join(mapping_rows)
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_rowid_mapping
                (generator_id, table_rowid, loom_rowid)
                VALUES %s
        ''' % (insertions,))
예제 #14
0
 def _get_constraint_row(self, constraints, bdb, generator_id, population_id,
         server):
     """For a tuple of constraints, return a conditioning row loom style."""
     if not constraints:
         return None
     else:
         row_constraints = {
             bayesdb_variable_name(
                 bdb, population_id, generator_id, colno) : value
             for colno, value in constraints
         }
         csv_headers_str = map(str, row_constraints.iterkeys())
         csv_values_str  = map(str, row_constraints.itervalues())
         return server.encode_row(csv_values_str, csv_headers_str)
예제 #15
0
 def _data_to_schema(self, bdb, population_id, data_by_column):
     """Dump a {variable name: loom type} mapping to a temporary file.

     The loom type of each population variable is looked up in
     `STATTYPE_TO_LOOMTYPE` by its statistical type, except that a
     'nominal' variable with more than 256 distinct values in
     `data_by_column` is looked up as 'unbounded_nominal'.

     The JSON text is written to a NamedTemporaryFile created with
     `delete=False`; that file object is returned once the `with` block
     has exited.
     """
     def _loom_stattype(name, stattype):
         # Only nominals with more than 256 distinct values are remapped.
         if stattype != 'nominal':
             return stattype
         if len(set(data_by_column[name])) > 256:
             return 'unbounded_nominal'
         return stattype

     json_dict = {}
     for colno in bayesdb_variable_numbers(bdb, population_id, None):
         column_name = bayesdb_variable_name(
             bdb, population_id, None, colno)
         declared = bayesdb_variable_stattype(
             bdb, population_id, None, colno)
         effective = _loom_stattype(column_name, declared)
         json_dict[column_name] = STATTYPE_TO_LOOMTYPE[effective]
     with tempfile.NamedTemporaryFile(delete=False) as schema_file:
         schema_file.write(json.dumps(json_dict))
     return schema_file
예제 #16
0
 def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
                               colnos1, constraints, numsamples):
     """Return loom's mutual information result for two column sets.

     The column numbers are translated to variable names and encoded as
     loom column masks; the value returned by the query server's
     `mutual_information` call is passed back unchanged.  `modelnos`,
     `constraints` and `numsamples` are not used.
     """
     # XXX Why are the constraints being ignored? If Loom does not support
     # conditioning, then implement constraints using the simple Monte Carlo
     # estimator.
     population_id = bayesdb_generator_population(bdb, generator_id)

     def _names(colnos):
         return [
             str(bayesdb_variable_name(bdb, population_id, None, colno))
             for colno in colnos
         ]

     names0 = _names(colnos0)
     names1 = _names(colnos1)
     server = self._get_cache_entry(bdb, generator_id, 'preql_server')
     target_mask = server._cols_to_mask(server.encode_set(names0))
     query_mask = server._cols_to_mask(server.encode_set(names1))
     return server._query_server.mutual_information(
         target_mask,
         query_mask,
         entropys=None,
         sample_count=loom.preql.SAMPLE_COUNT)
예제 #17
0
 def _get_constraint_row(self, constraints, bdb, generator_id,
                         population_id, server):
     """For a tuple of constraints, return a conditioning row loom style."""
     if not constraints:
         return None
     else:
         row_constraints = {
             bayesdb_variable_name(bdb, population_id, generator_id, colno):
             value
             for colno, value in constraints
         }
         csv_headers_str = map(str, row_constraints.iterkeys())
         csv_values_str = map(str, row_constraints.itervalues())
         return server.encode_row(csv_values_str, csv_headers_str)
예제 #18
0
 def column_mutual_information(self, bdb, generator_id, modelnos, colnos0,
         colnos1, constraints, numsamples):
     """Average loom's mutual information over the conditioning rows.

     `colnos0` and `colnos1` are the two sets of column numbers.  If
     `self._marginize_cmi(constraints)` holds, the conditioning rows are
     obtained from `self._get_constraint_rows` with `numsamples`;
     otherwise there is exactly one conditioning row, from
     `self._get_constraint_row`.

     Returns `[m]`, where `m` is the arithmetic mean of the `.mean`
     attribute of the server's result for each conditioning row.
     """
     population_id = bayesdb_generator_population(bdb, generator_id)

     first_names = []
     for colno in colnos0:
         name = bayesdb_variable_name(
             bdb, population_id, generator_id, colno)
         first_names.append(str(name))
     second_names = []
     for colno in colnos1:
         name = bayesdb_variable_name(
             bdb, population_id, generator_id, colno)
         second_names.append(str(name))

     server = self._get_preql_server(bdb, generator_id)
     target_set = server._cols_to_mask(server.encode_set(first_names))
     query_set = server._cols_to_mask(server.encode_set(second_names))

     if not self._marginize_cmi(constraints):
         single_row = self._get_constraint_row(
             constraints, bdb, generator_id, population_id, server)
         loom_rows = [single_row]
     else:
         loom_rows = self._get_constraint_rows(
             constraints, bdb, generator_id, population_id, modelnos,
             server, numsamples)

     def _estimate(loom_row):
         return server._query_server.mutual_information(
             target_set,
             query_set,
             entropys=None,
             sample_count=loom.preql.SAMPLE_COUNT,
             conditioning_row=loom_row,
         ).mean

     mi_estimates = [_estimate(loom_row) for loom_row in loom_rows]
     # Output requires an iterable.
     return [arithmetic_mean(mi_estimates)]
예제 #19
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Create the loom project for a generator and record it in bdb.

        The generator is inserted into `bayesdb_loom_generator`, the base
        table is read column by column, the schema and CSV files are
        handed to `loom.tasks.ingest`, the encoding info is stored, and
        finally each table rowid is paired with its 0-based position in
        the `SELECT oid` result and written to
        `bayesdb_loom_rowid_mapping`.  `schema` and `kwargs` are not used.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store generator info in bdb.
        name = self._generate_name(bdb, generator_id)
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_generator
            (generator_id, name, loom_store_path)
            VALUES (?, ?, ?)
        ''', (generator_id, name, self.loom_store_path))

        def _fetch_column(column_name):
            # Read one column of the base table as a flat list.
            qt = sqlite3_quote_name(table)
            qcn = sqlite3_quote_name(column_name)
            cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcn, qt))
            return [item for (item,) in cursor.fetchall()]

        headers = []
        column_major = []
        data_by_column = {}
        for colno in bayesdb_variable_numbers(bdb, population_id, None):
            column_name = bayesdb_variable_name(
                bdb, population_id, None, colno)
            headers.append(column_name)
            col_data = _fetch_column(column_name)
            column_major.append(col_data)
            data_by_column[column_name] = col_data
        # Switch from one list per column to one list per row.
        row_major = [list(cells) for cells in zip(*column_major)]

        # Ingest data into loom.
        schema_file = self._data_to_schema(bdb, population_id, data_by_column)
        csv_file = self._data_to_csv(bdb, headers, row_major)
        project_path = self._get_loom_project_path(bdb, generator_id)
        loom.tasks.ingest(
            project_path,
            rows_csv=csv_file.name,
            schema=schema_file.name)

        # Store encoding info in bdb.
        self._store_encoding_info(bdb, generator_id)

        # Store rowid mapping in the bdb.
        # NOTE(review): an empty table gives an empty `insertions` string
        # and therefore a bare VALUES clause -- confirm that cannot happen.
        qt = sqlite3_quote_name(table)
        cursor = bdb.sql_execute('SELECT oid FROM %s' % (qt,))
        rowids = cursor.fetchall()
        value_tuples = [
            str((generator_id, table_rowid, loom_rowid))
            for loom_rowid, (table_rowid,) in enumerate(rowids)
        ]
        insertions = ','.join(value_tuples)
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_rowid_mapping
                (generator_id, table_rowid, loom_rowid)
                VALUES %s
        ''' % (insertions,))
예제 #20
0
def test_t1_column_value_probability(colno, rowid):
    """Smoke-test bql_column_value_probability from Python and from SQL.

    The cell value at (rowid, colno) is read and passed to
    `bqlfn.bql_column_value_probability`; the same function is then
    invoked through a SQL query.  Neither result is checked, so the test
    only verifies that both calls complete.
    """
    population = analyzed_bayesdb_population(t1(), 1, 1)
    with population as (bdb, population_id, generator_id):
        # A rowid of 0 is replaced by the result of bayesdb_maxrowid.
        if rowid == 0:
            rowid = bayesdb_maxrowid(bdb, population_id)

        # Direct Python call.
        cell_value = core.bayesdb_population_cell_value(
            bdb, population_id, rowid, colno)
        bqlfn.bql_column_value_probability(
            bdb, population_id, None, None, colno, cell_value)

        # The same computation through the SQL function.
        qt = bql_quote_name(
            core.bayesdb_population_table(bdb, population_id))
        qv = bql_quote_name(
            core.bayesdb_variable_name(bdb, population_id, None, colno))
        sql = '''
            select bql_column_value_probability(?, NULL, NULL, ?,
                (select %s from %s where rowid = ?))
        ''' % (qv, qt)
        cursor = bdb.sql_execute(sql, (population_id, colno, rowid))
        cursor.fetchall()
예제 #21
0
def test_t1_column_value_probability(colno, rowid):
    """Exercise column value probability via the Python and SQL entry points.

    No assertion is made on the returned probabilities; the test passes
    when both the direct call and the SQL query run to completion.
    """
    with analyzed_bayesdb_population(t1(), 1, 1) \
            as (bdb, population_id, generator_id):
        # rowid 0 is swapped for whatever bayesdb_maxrowid reports.
        rowid = bayesdb_maxrowid(bdb, population_id) if rowid == 0 else rowid
        value = core.bayesdb_population_cell_value(
            bdb, population_id, rowid, colno)
        bqlfn.bql_column_value_probability(
            bdb, population_id, None, None, colno, value)

        table_name = core.bayesdb_population_table(bdb, population_id)
        var = core.bayesdb_variable_name(bdb, population_id, None, colno)
        quoted = (bql_quote_name(var), bql_quote_name(table_name))
        sql = '''
            select bql_column_value_probability(?, NULL, NULL, ?,
                (select %s from %s where rowid = ?))
        ''' % quoted
        bindings = (population_id, colno, rowid)
        bdb.sql_execute(sql, bindings).fetchall()
예제 #22
0
    def _reorder_row(self, bdb, generator_id, row, dense=True):
        """Arrange the values of `row` in loom's column order.

        The i-th element of `row` (counting from 1) is treated as the
        value of population column number i and is stored, converted with
        `str`, under that column's variable name.  Labels returned by
        `self._get_ordered_column_labels` that receive no value keep None.

        Returns (label, value) pairs: an `iteritems()` iterator over all
        labels unless `dense is False`, in which case a list restricted to
        the labels whose value is not None is returned.
        """
        labels = self._get_ordered_column_labels(bdb, generator_id)
        values_by_label = OrderedDict.fromkeys(labels)

        population_id = bayesdb_generator_population(bdb, generator_id)
        # Column numbers are assigned positionally, starting at 1.
        for colno, value in enumerate(row, 1):
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            values_by_label[name] = str(value)

        if dense is not False:
            return values_by_label.iteritems()
        return [
            (label, value)
            for (label, value) in values_by_label.iteritems()
            if value is not None
        ]
예제 #23
0
    def predict_confidence(self,
                           bdb,
                           generator_id,
                           modelno,
                           colno,
                           rowid,
                           numsamples=None):
        """Impute the cell (rowid, colno) and return (prediction, confidence).

        Samples are drawn with `self.simulate_joint`.  For a categorical
        statistical type the prediction is a most frequent sampled value
        and the confidence is its count divided by `numsamples`; otherwise
        the prediction is the mean of the samples and the confidence is 0.

        `numsamples` falls back to 2 when it is None or otherwise falsy.
        """
        if not numsamples:
            numsamples = 2
        assert numsamples > 0

        def _impute_categorical(sample):
            # Each sample is indexed with [0]; the first value reaching the
            # maximal count is the prediction.
            counts = Counter(s[0] for s in sample)
            mode_count = max(counts[v] for v in counts)
            # NOTE(review): `.next()` is the Python 2 iterator method.
            pred = iter(v for v in counts if counts[v] == mode_count).next()
            conf = float(mode_count) / numsamples
            return pred, conf

        def _impute_numerical(sample):
            # Mean of the first component of every sample.
            pred = sum(s[0] for s in sample) / float(len(sample))
            conf = 0  # XXX Punt confidence for now
            return pred, conf

        constraints = []
        # If rowid is a hypothetical cell for cgpm (did not exist at the time
        # of INITIALIZE), but exists in the base table (by INSERT INTO), then
        # retrieve all values for rowid as the constraints.
        exists = rowid < core.bayesdb_generator_fresh_row_id(bdb, generator_id)
        # NOTE(review): MAX() yields NULL (None) when no row matches;
        # confirm the comparison below behaves as intended in that case.
        max_cgpm_rowid = bdb.sql_execute(
            '''
            SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id, )).fetchall()[0][0]
        hypothetical = rowid > max_cgpm_rowid
        if exists and hypothetical:
            population_id = core.bayesdb_generator_population(
                bdb, generator_id)
            # Retrieve all other variables except colno, and ignore latents in
            # generator_id, and place them in the constraints.
            pop_names = core.bayesdb_variable_names(bdb, population_id, None)
            # NOTE(review): called with three arguments here while the
            # neighbouring core calls pass a generator slot -- confirm
            # against the installed core API.
            avoid_name = core.bayesdb_variable_name(bdb, population_id, colno)
            constraints_names = [n for n in pop_names if n != avoid_name]
            # Obtain the row.
            qt_names = str.join(',', map(sqlite3_quote_name,
                                         constraints_names))
            qt_table = sqlite3_quote_name(
                core.bayesdb_population_table(bdb, population_id))
            data = bdb.sql_execute(
                '''
                SELECT %s FROM %s WHERE oid = ?
            ''' % (
                    qt_names,
                    qt_table,
                ), (rowid, )).fetchall()[0]
            # Build the constraints.
            pop_nos = core.bayesdb_variable_numbers(bdb, population_id, None)
            constraints_nos = [n for n in pop_nos if n != colno]
            # The selected names and the column numbers must line up.
            assert len(data) == len(constraints_nos)
            # NOTE(review): `and v` also drops falsy values such as 0 and
            # '' -- confirm that excluding them is intended.
            constraints = [(rowid, c, v)
                           for c, v in zip(constraints_nos, data)
                           if (v is not None) and v]

        # Retrieve the samples.
        sample = self.simulate_joint(bdb, generator_id, [(rowid, colno)],
                                     constraints, modelno, numsamples)

        # Determine the imputation strategy (mode or mean).
        stattype = core.bayesdb_variable_stattype(
            bdb, core.bayesdb_generator_population(bdb, generator_id), colno)
        if _is_categorical(stattype):
            return _impute_categorical(sample)
        else:
            return _impute_numerical(sample)
예제 #24
0
    def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
            constraints, num_samples=1, accuracy=None):
        """Simulate the target columns jointly through the loom server.

        Parameters:
            bdb: database handle used for every lookup.
            generator_id: generator whose loom server is queried.
            modelnos: accepted but not used here.
            rowid: row being simulated.  If it exists in the base table,
                its non-NULL cells outside `targets` are added to the
                constraints.
            targets: list of column numbers to simulate.
            constraints: list of (colno, value) pairs to condition on.
            num_samples: number of samples requested from the server.
            accuracy: accepted but not used here.

        Returns:
            A list with one entry per simulated row.  Each entry is a
            list of values in the order of `targets`; values of nominal
            variables are returned as read from the CSV output, all
            others are converted with `float`.

        Raises:
            BQLError: if a constraint refers to a column that is already
                observed in the row.
        """
        # Retrieve the population id.
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Prepare list of full constraints, potentially adding data from table.
        constraints_full = constraints

        # If rowid exist in base table, retrieve conditioning data.
        # Conditioning values are fetched for any rowid that exists in the base
        # table irrespective of whether the rowid is incorporated in the Loom
        # model or whether it was added after creation.
        if bayesdb_table_has_rowid(bdb, table, rowid):
            # Fetch population column numbers and row values.
            colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
            rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
            observations = [
                (colno, rowval)
                for colno, rowval in zip(colnos, rowvals)
                if rowval is not None and colno not in targets
            ]
            # Raise error if a constraint overrides an observed cell.
            colnos_constrained = [constraint[0] for constraint in constraints]
            colnos_observed = [observation[0] for observation in observations]
            if set.intersection(set(colnos_constrained), set(colnos_observed)):
                raise BQLError(bdb, 'Overlap between constraints and'
                    ' target row in simulate.')
            # Update the constraints.
            constraints_full = constraints + observations

        # Store mapping from target column name to column number and stattype.
        # Variable names are keyed by population id (the second argument of
        # bayesdb_variable_name), the same key used for the stattype lookup.
        target_colno_to_name = {
            colno: bayesdb_variable_name(bdb, population_id, None, colno)
            for colno in targets
        }
        target_colno_to_stattype = {
            colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
            for colno in targets
        }

        # Construct the CSV row for targets: empty cells mark the columns
        # to be simulated, constraint cells carry their values.
        row_targets = {target_colno_to_name[colno] : '' for colno in targets}
        row_constraints = {
            bayesdb_variable_name(bdb, population_id, None, colno) : value
            for colno, value in constraints_full
        }
        row = dict(itertools.chain(
            row_targets.iteritems(), row_constraints.iteritems()))

        # Fetch the server.
        server = self._get_preql_server(bdb, generator_id)

        # Prepare the csv header and values.
        csv_headers = map(str, row.iterkeys())
        csv_values = map(str, row.itervalues())

        # Prepare streams for the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers]+[csv_values])

        # Obtain the prediction.
        server._predict(reader, num_samples, writer, False)

        # Parse the CSV output.
        output_csv = writer.result()
        output_rows = output_csv.strip().split('\r\n')

        # Extract the header of the CSV file.
        header = output_rows[0].split(CSV_DELIMITER)

        # Extract list of simulated rows. Each simulated row is represented
        # as a dictionary mapping column name to its simulated value.
        simulated_rows = [
            dict(zip(header, output_row.split(CSV_DELIMITER)))
            for output_row in output_rows[1:]
        ]

        # Prepare the return list of simulated_rows.
        def _extract_simulated_value(simulated_row, colno):
            colname = target_colno_to_name[colno]
            stattype = target_colno_to_stattype[colno]
            value = simulated_row[colname]
            return value if _is_nominal(stattype) else float(value)

        # Return the list of samples.
        return [
            [_extract_simulated_value(simulated_row, colno)
                for colno in targets]
            for simulated_row in simulated_rows
        ]
예제 #25
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exists and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt, ))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder(
                '''
                CREATE TEMP TABLE %s (column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT)
            ''' % (qtt), ())
            for cn, st, ct in zip(column_names, stattypes,
                                  distinct_value_counts):
                out.winder(
                    '''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt, ))
            out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [
                    core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids
                ]
                raise BQLError(
                    bdb, 'Population %r still has metamodels: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'No such variable in base table: %s' % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(bdb, population_id, None,
                                                 cmd.name):
                        raise BQLError(
                            bdb,
                            'Variable already in population: %s' % (cmd.name))
                    # Ensure there is at least one observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'Cannot add variable without any values: %s' %
                            (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute('SELECT %s FROM %s' %
                                                 (qc, qt))
                        rows = cursor.fetchall()
                        [stattype,
                         reason] = bayesdb_guess_stattypes([cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(
                                bdb, 'Values in column %s appear to be keys.' %
                                (cmd.name, ))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(
                                bdb, 'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name, ))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                                       'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'Numerical column contains string values: %r ' %
                            (qc, ))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(bdb, population_id, cmd.name,
                                                  stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) metamodel in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            # XXX Omit needless bayesdb_generator_column table
                            # Github issue #441.
                            bdb.sql_execute(
                                '''
                                INSERT INTO bayesdb_generator_column
                                    VALUES (:generator_id, :colno, :stattype)
                            ''', {
                                    'generator_id': generator_id,
                                    'colno': colno,
                                    'stattype': stattype,
                                })
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check that no metamodels are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb,
                            'Cannot update statistical types for population '
                            '%s, it has metamodels: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population: %s' %
                            (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type: %r' %
                            (repr(cmd.stattype), ))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(
                                bdb, 'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars, ))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        table = core.bayesdb_population_table(bdb, population_id)

        # Find the metamodel, or use the default.
        metamodel_name = phrase.metamodel
        if phrase.metamodel is None:
            metamodel_name = 'cgpm'
        if metamodel_name not in bdb.metamodels:
            raise BQLError(bdb,
                           'No such metamodel: %s' % (repr(metamodel_name), ))
        metamodel = bdb.metamodels[metamodel_name]

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

                # Do any metamodel-specific initialization.
                metamodel.create_generator(bdb,
                                           generator_id,
                                           phrase.schema,
                                           baseline=phrase.baseline)

                # Populate bayesdb_generator_column with any latent
                # variables that metamodel.create_generator has added
                # with bayesdb_add_latent.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id = :generator_id
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(
                        bdb, 'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alterations on the metamodel.
                metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                metamodel.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warnings somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds,
                                 program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the metamodel.
        generator_id = None
        if phrase.metamodel:
            if not core.bayesdb_has_generator(bdb, population_id,
                                              phrase.metamodel):
                raise BQLError(bdb,
                               'No such metamodel: %r' % (phrase.population, ))
            generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                      phrase.metamodel)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(bdb, population_id, None,
                                         phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target, ))
        colno_target = core.bayesdb_variable_number(bdb, population_id, None,
                                                    phrase.target)
        if core.bayesdb_variable_stattype(bdb, population_id, colno_target) != \
                'numerical':
            raise BQLError(
                bdb,
                'Target variable is not numerical: %r' % (phrase.target, ))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(bdb, population_id, None,
                                             given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(colno for colno in colno_givens
                                  if colno != colno_target)
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(bdb,
                                      population_id,
                                      generator_id,
                                      modelnos,
                                      constraints,
                                      colnos,
                                      numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(bdb, population_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(target_values, given_values, given_names,
                                   stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder(
            '''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt, ), ())
        for variable, coef in coefficients:
            out.winder(
                '''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (
                    variable,
                    coef,
                ))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt, ))
        out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    assert False  # XXX
예제 #26
0
파일: bql.py 프로젝트: probcomp/bayeslite
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb,
                        'Name already defined as table: %s' %
                        (repr(phrase.name),))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb, 'Table already exists: %s' %
                        (repr(phrase.name),))
            bayesdb_read_csv_file(
                bdb, phrase.name, phrase.csv, header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                raise BQLError(bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table: %s'
                                % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # If table has implicit population, rename it too.
                    if core.bayesdb_table_has_implicit_population(
                                bdb, cmd.name):
                        populations = \
                            core.bayesdb_table_populations(bdb, cmd.name)
                        assert len(populations) == 1
                        population_name = core.bayesdb_population_name(
                            bdb, populations[0])
                        qt = sqlite3_quote_name(cmd.name)
                        qp = sqlite3_quote_name(population_name)
                        bdb.execute('ALTER POPULATION %s RENAME TO %s'
                            % (qp, qt))
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Make sure the old name exists and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb, 'No such column in table %s'
                                ': %s' %
                                (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except backends may have the (case-folded) name cached.
                    if old_folded != new_folded:
                        populations_sql = '''
                            SELECT id FROM bayesdb_population WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(populations_sql, (table,))
                        generators = [
                            core.bayesdb_population_generators(
                                bdb, population_id)
                            for (population_id,) in cursor
                        ]
                        for generator_id in set(generators):
                            backend = core.bayesdb_generator_backend(bdb,
                                generator_id)
                            backend.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder('''
                CREATE TEMP TABLE %s (
                    column TEXT,
                    stattype TEXT,
                    num_distinct INTEGER,
                    reason TEXT
                )
            ''' % (qtt,), ())
            for cn, st, ct in zip(column_names, stattypes, distinct_value_counts):
                out.winder('''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt,))
            out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name,))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids]
                raise BQLError(bdb, 'Population %r still has generators: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute('''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id,))
            bdb.sql_execute('''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb, 'No such population: %s' %
                    (repr(population),))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopRenamePop):
                    table = core.bayesdb_population_table(bdb, population_id)
                    # Prevent renaming of implicit population directly, unless
                    # being called by ast.AlterTabRenameTab in which case the
                    # table name and population name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, population_id) \
                            and casefold(population) == casefold(table):
                        raise BQLError(bdb, 'Cannot rename implicit'
                            'population %s; rename base table instead'
                            % (population,))
                    # Make sure nothing else has this name.
                    if casefold(population) != casefold(cmd.name):
                        if core.bayesdb_has_population(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as population' ': %s'
                                % (repr(cmd.name),))
                    # Update bayesdb_population.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_population SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, population_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # If population has implicit generator, rename it too.
                    if core.bayesdb_population_has_implicit_generator(
                            bdb, population_id):
                        generators = core.bayesdb_population_generators(
                            bdb, population_id)
                        assert len(generators) == 1
                        generator_name = core.bayesdb_generator_name(
                            bdb, generators[0])
                        qp = sqlite3_quote_name(cmd.name)
                        qg = sqlite3_quote_name(generator_name)
                        bdb.execute('ALTER GENERATOR %s RENAME TO %s'
                            % (qg, qp,))
                    # Remember the new name for subsequent commands.
                    population = cmd.name
                elif isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(
                            bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'No such variable in base table: %s'
                            % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(
                            bdb, population_id, None, cmd.name):
                        raise BQLError(bdb,
                            'Variable already in population: %s'
                            % (cmd.name))
                    # Ensure there is at least one observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb,
                            'Cannot add variable without any values: %s'
                            % (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute(
                            'SELECT %s FROM %s' % (qc, qt))
                        rows = cursor.fetchall()
                        [stattype, reason] = bayesdb_guess_stattypes(
                            [cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(bdb,
                                'Values in column %s appear to be keys.'
                                % (cmd.name,))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(bdb,
                                'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name,))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'Numerical column contains string values: %r '
                            % (qc,))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(
                            bdb, population_id, cmd.name, stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) generator in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check that no generators are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(bdb,
                            'Cannot update statistical types for population '
                            '%s, it has generators: %s'
                            % (repr(population), repr(generators),))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not
                        core.bayesdb_has_variable(bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(bdb,
                            'No such variables in population: %s'
                            % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid statistical type: %r'
                            % (repr(cmd.stattype),))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(bdb,
                                'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars,))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(
                            bdb, population_id, None, c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno,) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos,)
                    bdb.sql_execute(
                        update_stattype_sql,
                        (casefold(cmd.stattype), population_id,))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' %
                (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)

        # Find the backend, or use the default.
        backend_name = phrase.backend
        if phrase.backend is None:
            backend_name = 'cgpm'
        if backend_name not in bdb.backends:
            raise BQLError(bdb, 'No such backend: %s' %
                (repr(backend_name),))
        backend = bdb.backends[backend_name]

        # Retrieve the (possibly implicit) generator name.
        generator_name = phrase.name or phrase.population
        implicit = 1 if phrase.name is None else 0

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, generator_name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(generator_name),))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute('''
                    INSERT INTO bayesdb_generator
                        (name, population_id, backend, implicit)
                        VALUES (?, ?, ?, ?)
                ''', (generator_name, population_id, backend.name(), implicit))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, generator_name)
                # Do any backend-specific initialization.
                backend.create_generator(bdb, generator_id, phrase.schema)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            backend = core.bayesdb_generator_backend(bdb, generator_id)

            # Backend-specific destruction.
            backend.drop_generator(bdb, generator_id)

            # Drop latent variables, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_variable WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    population_id = core.bayesdb_generator_population(
                        bdb, generator_id)
                    population = core.bayesdb_population_name(
                        bdb, population_id)
                    # Prevent renaming of implicit generator directly, unless
                    # being called by ast.AlterPopRenamePop in which case the
                    # population name and generator name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, generator_id) \
                            and casefold(generator) == casefold(population):
                        raise BQLError(bdb, 'Cannot rename implicit '
                            'generator; rename base population instead')
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' %
                                (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos if not
                    core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(bdb,
                        'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alterations on the backend.
                backend = core.bayesdb_generator_backend(bdb, generator_id)
                backend.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
                if 0 < len(existing):
                    raise BQLError(bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno)
                    VALUES (:generator_id, :modelno)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                })

            # Do backend-specific initialization.
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            backend.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        # WARNING: It is the backend's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the backend's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the backend can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warnings somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        backend.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
            program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(
                bdb, None, phrase.generator)
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            backend.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' % (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the generator
        generator_id = None
        if phrase.generator:
            if not core.bayesdb_has_generator(bdb, population_id,
                    phrase.generator):
                raise BQLError(bdb,
                    'No such generator: %r' % (phrase.generator,))
            generator_id = core.bayesdb_get_generator(
                bdb, population_id, phrase.generator)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(
                bdb, population_id, None, phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target,))
        colno_target = core.bayesdb_variable_number(
            bdb, population_id, None, phrase.target)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
            generator_id, colno_target)
        if stattype != 'numerical':
            raise BQLError(bdb,
                'Target variable is not numerical: %r' % (phrase.target,))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(
                    bdb, population_id, None, given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(
            colno for colno in colno_givens if colno!= colno_target
        )
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(
            bdb, population_id, generator_id, modelnos, constraints,
            colnos, numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, generator_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(
            target_values, given_values, given_names, stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder('''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt,), ())
        for variable, coef in coefficients:
            out.winder('''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (variable, coef,))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt,))
        out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    assert False                # XXX
예제 #27
0
    def simulate_joint(self,
                       bdb,
                       generator_id,
                       modelnos,
                       rowid,
                       targets,
                       constraints,
                       num_samples=1,
                       accuracy=None):
        """Draw joint samples of the target columns through the Loom server.

        Arguments:
        bdb -- BayesDB handle.
        generator_id -- id of the generator to simulate from.
        modelnos -- accepted for interface compatibility; not used here.
        rowid -- row to condition on.  If it exists in the base table, its
            non-null cells for non-target columns are added as constraints.
        targets -- column numbers to simulate.
        constraints -- list of (colno, value) pairs to condition on.  The
            list itself is not modified.
        num_samples -- passed to the server's _predict call as the number
            of rows to produce.
        accuracy -- accepted for interface compatibility; not used here.

        Returns a list with one entry per row of the server's CSV output;
        each entry lists the simulated values in the order of `targets`.
        Nominal values are returned as the strings read from the CSV; all
        other values are converted to float.

        Raises BQLError if a constraint refers to a column that is already
        observed in the conditioning row.
        """
        # Retrieve the population id and its base table.
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Prepare list of full constraints, potentially adding data from table.
        constraints_full = constraints

        # If rowid exist in base table, retrieve conditioning data.
        # Conditioning values are fetched for any rowid that exists in the base
        # table irrespective of whether the rowid is incorporated in the Loom
        # model or whether it was added after creation.
        if bayesdb_table_has_rowid(bdb, table, rowid):
            # Fetch population column numbers and row values.
            colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
            rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
            observations = [(colno, rowval)
                            for colno, rowval in zip(colnos, rowvals)
                            if rowval is not None and colno not in targets]
            # Raise error if a constraint overrides an observed cell.
            colnos_constrained = set(
                constraint[0] for constraint in constraints)
            colnos_observed = set(
                observation[0] for observation in observations)
            if colnos_constrained & colnos_observed:
                raise BQLError(
                    bdb, 'Overlap between constraints and'
                    ' target row in simulate.')
            # Build a new list; the caller's constraints stay untouched.
            constraints_full = constraints + observations

        # Store mapping from target column number to column name and stattype.
        # Variable names are looked up by population id, as in the stattype
        # lookup below (the generator id is not a valid substitute for it).
        target_colno_to_name = {
            colno: bayesdb_variable_name(bdb, population_id, None, colno)
            for colno in targets
        }
        target_colno_to_stattype = {
            colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
            for colno in targets
        }

        # Construct the CSV row: targets are left blank so the server fills
        # them in, constrained columns carry their conditioning value.
        row_targets = {target_colno_to_name[colno]: '' for colno in targets}
        row_constraints = {
            bayesdb_variable_name(bdb, population_id, None, colno): value
            for colno, value in constraints_full
        }
        row = dict(
            itertools.chain(row_targets.iteritems(),
                            row_constraints.iteritems()))

        # Fetch the server.
        server = self._get_preql_server(bdb, generator_id)

        # Prepare the csv header and values.  Keys and values of the same
        # unmodified dict are iterated in corresponding order.
        csv_headers = map(str, row.iterkeys())
        csv_values = map(str, row.itervalues())

        # Prepare streams for the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers] + [csv_values])

        # Obtain the prediction.
        server._predict(reader, num_samples, writer, False)

        # Parse the CSV output.
        output_csv = writer.result()
        output_rows = output_csv.strip().split('\r\n')

        # Extract the header of the CSV file.
        header = output_rows[0].split(CSV_DELIMITER)

        # Extract list of simulated rows. Each simulated row is represented
        # as a dictionary mapping column name to its simulated value.
        simulated_rows = [
            dict(zip(header, output_row.split(CSV_DELIMITER)))
            for output_row in output_rows[1:]
        ]

        # Convert one simulated cell: nominal values stay as strings, every
        # other stattype is parsed as a float.
        def _extract_simulated_value(simulated_row, colno):
            colname = target_colno_to_name[colno]
            stattype = target_colno_to_stattype[colno]
            value = simulated_row[colname]
            return value if _is_nominal(stattype) else float(value)

        # Return the list of samples.
        return [[_extract_simulated_value(simulated_row, colno)
                 for colno in targets]
                for simulated_row in simulated_rows]
예제 #28
0
    def simulate_joint(self,
                       bdb,
                       generator_id,
                       modelnos,
                       rowid,
                       targets,
                       constraints,
                       num_samples=1,
                       accuracy=None):
        """Draw joint samples of the target columns through the Loom server.

        Arguments:
        bdb -- BayesDB handle.
        generator_id -- id of the generator to simulate from.
        modelnos -- accepted for interface compatibility; not used here.
        rowid -- row to condition on.  Unless it equals the population's
            fresh row id, the row's non-null cells are added as constraints.
        targets -- column numbers to simulate.
        constraints -- list of (colno, value) pairs to condition on.  The
            list is copied and never modified.
        num_samples -- passed to the server's _predict call as the number
            of rows to produce.
        accuracy -- accepted for interface compatibility; not used here.

        Returns a list with one entry per row of the server's CSV output;
        each entry lists the simulated values in the order of `targets`.
        Nominal values are returned as the strings read from the CSV; all
        other values are converted to float.

        Raises BQLError if a constraint refers to a column that is already
        observed in the conditioning row.
        """
        # Retrieve the population id.
        population_id = bayesdb_generator_population(bdb, generator_id)

        # Work on a copy so the caller's constraints list is never mutated.
        constraints_full = list(constraints)

        # If rowid exists, retrieve conditioning data from the table.  The
        # fresh row id is a property of the population, so it is looked up by
        # population id.
        if rowid != bayesdb_population_fresh_row_id(bdb, population_id):
            row_values_raw = bayesdb_population_row_values(
                bdb, population_id, rowid)
            row_values = [
                str(a) if isinstance(a, unicode) else a for a in row_values_raw
            ]
            # NOTE(review): pairing values with enumerate() assumes that the
            # position of a value in the row equals its column number --
            # confirm against bayesdb_population_row_values.
            observations = [
                (colno, value) for colno, value in enumerate(row_values)
                if value is not None
            ]
            constrained_colnos = set(c[0] for c in constraints_full)
            if any(colno in constrained_colnos for colno, _ in observations):
                raise BQLError(bdb, 'Overlap between constraints and'
                    ' target row in simulate.')
            constraints_full.extend(observations)

        # Prepare the query row to provide to Loom: targets are left blank,
        # constrained columns carry their conditioning value.  Variable names
        # are looked up by population id.
        row = {}
        target_num_to_name = {}
        for colno in targets:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            target_num_to_name[colno] = name
            row[name] = ''
        for (colno, value) in constraints_full:
            name = bayesdb_variable_name(bdb, population_id, None, colno)
            row[name] = value

        # Fetch the server.
        server = self._get_cache_entry(bdb, generator_id, 'preql_server')

        # Prepare the csv header and values.  Both lists are derived from the
        # same name sequence so that header i always describes value i; the
        # lower-cased headers are mapped back to the original names when the
        # output is parsed.
        names = list(row)
        lower_to_upper = {str(name).lower(): str(name) for name in names}
        csv_headers = [str(name).lower() for name in names]
        csv_values = [str(row[name]) for name in names]

        # Retrieve the samples from the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers] + [csv_values])
        server._predict(reader, num_samples, writer, False)
        output = writer.result()

        # Parse output: first line is the header, the rest are samples.
        output_lines = output.strip().split('\r\n')
        returned_headers = [
            lower_to_upper[a]
            for a in output_lines[0].split(CSV_DELIMITER)
        ]

        # Look up each target's stattype once instead of once per sample.
        target_num_to_stattype = {
            colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
            for colno in targets
        }

        return_list = []
        for line in output_lines[1:]:
            row_dict = dict(zip(returned_headers, line.split(CSV_DELIMITER)))
            # Prepare the row in the order of the requested targets.
            sample = []
            for colno in targets:
                value = row_dict[target_num_to_name[colno]]
                if not _is_nominal(target_num_to_stattype[colno]):
                    value = float(value)
                sample.append(value)
            # Add this row to the return list.
            return_list.append(sample)

        return return_list
예제 #29
0
 def _get_ordered_column_labels(self, bdb, generator_id):
     """Return the variable names of the generator's columns, following
     the column order given by self._get_order."""
     population_id = bayesdb_generator_population(bdb, generator_id)
     labels = []
     for colno in self._get_order(bdb, generator_id):
         label = bayesdb_variable_name(bdb, population_id, None, colno)
         labels.append(label)
     return labels
예제 #30
0
파일: magics.py 프로젝트: probcomp/iventure
    def _cmd_render_crosscat(self, query, sql=None, **kwargs):
        '''Returns a rendering of the specified crosscat state

        Usage: .render_crosscat [options] <generator> <modelno>.

        Options:
            --subsample=<n>
            --width=<w>
            --height=<c>
            --rowlabels=<colname>
            --progress=[True|False]
            --yticklabeslize=<fontsize>
            --xticklabeslize=<fontsize>

        The allowable fontsize strings are:
            xx-small, x-small, # small, medium, large, x-large, xx-large
        '''
        tokens = query.split()
        if len(tokens) != 2:
            self.write_stderr('Usage: .render_crosscat <generator> <modelno>')
            return
        generator = tokens[0]
        modelno = int(tokens[1])
        if not bayesdb_has_generator(self._bdb, None, generator):
            self.write_stderr('No such generator: %s.' % (generator, ))
            return
        generator_id = bayesdb_get_generator(self._bdb, None, generator)
        population_id = bayesdb_generator_population(self._bdb, generator_id)
        backend = bayesdb_generator_backend(self._bdb, generator_id)
        if backend.name() != 'cgpm':
            self.write_stderr('.render_crosscat requires generator from the '
                              'cgpm backend')
            return
        engine = backend._engine(self._bdb, generator_id)
        cursor = self._bdb.sql_execute(
            '''
            SELECT cgpm_modelno FROM bayesdb_cgpm_modelno
            WHERE generator_id = ? AND modelno = ?
        ''', (
                generator_id,
                modelno,
            ))
        cgpm_modelno = cursor_value(cursor, nullok=True)
        if cgpm_modelno is None:
            self.write_stderr('No such model number: %d.' % (modelno, ))
            return
        state = engine.get_state(cgpm_modelno)
        row_names = None
        row_index_column = kwargs.get('rowlabels', None)
        if row_index_column is not None:
            table_name = bayesdb_generator_table(self._bdb, generator_id)
            qt = bql_quote_name(table_name)
            qc = bql_quote_name(row_index_column)
            cursor = self._bdb.sql_execute(
                '''
                SELECT %s FROM %s WHERE oid IN (
                    SELECT table_rowid FROM bayesdb_cgpm_individual
                    WHERE generator_id = ?
                )
            ''' % (qc, qt), (generator_id, ))
            row_names = [c[0] for c in cursor]
        if 'progress' in kwargs:
            sys.stdout.write('Creating figure...\n')
        import cgpm.utils.render
        if 'variable' not in kwargs:
            # Plot the entire state.
            col_names = [
                bayesdb_variable_name(self._bdb, population_id, None, colno)
                for colno in state.outputs
            ]
            fig, _ax = cgpm.utils.render.viz_state(state,
                                                   col_names=col_names,
                                                   row_names=row_names,
                                                   **kwargs)
        else:
            # Plot the view of the requested variable.
            varno = bayesdb_variable_number(self._bdb, population_id,
                                            generator_id, kwargs['variable'])
            view = state.view_for(varno)
            col_names = [
                bayesdb_variable_name(self._bdb, population_id, None, colno)
                for colno in view.outputs[1:]
            ]
            fig, _ax = cgpm.utils.render.viz_view(view,
                                                  col_names=col_names,
                                                  row_names=row_names,
                                                  **kwargs)
        (width, height) = fig.get_size_inches()
        if 'width' in kwargs:
            width = float(kwargs['width'])
            fig.set_size_inches(width, height)
        if 'height' in kwargs:
            height = float(kwargs['height'])
            fig.set_size_inches(width, height)
        if 'progress' in kwargs:
            sys.stdout.write('Rendering figure...\n')