Python bayesdb_variable_numbers 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: bayeslite.core

메소드/함수: bayesdb_variable_numbers

hotexamples.com에서의 예제들: 21

Python bayesdb_variable_numbers - 21개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 bayeslite.core.bayesdb_variable_numbers에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

파일: test_nig_normal.py 프로젝트: probcomp/bayeslite

def test_nig_normal_latent_numbering():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_backend(bdb, NIGNormalBackend())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)', (x, x*x - 100))
        bdb.execute('''
            create population p for t(
                id ignore;
                set stattypes of x,y to numerical;
            )
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')

        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]

예제 #2

파일 보기

def test_nig_normal_latent_numbering():
    with bayesdb_open(':memory:') as bdb:
        bayesdb_register_metamodel(bdb, NIGNormalMetamodel())
        bdb.sql_execute('create table t(id integer primary key, x, y)')
        for x in xrange(100):
            bdb.sql_execute('insert into t(x, y) values(?, ?)',
                            (x, x * x - 100))
        bdb.execute('''
            create population p for t(id ignore; model x,y as numerical)
        ''')
        assert core.bayesdb_has_population(bdb, 'p')
        pid = core.bayesdb_get_population(bdb, 'p')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]

        bdb.execute('create generator g0 for p using nig_normal')
        bdb.execute('''
            create generator g1 for p using nig_normal(xe deviation(x))
        ''')

        assert core.bayesdb_has_generator(bdb, pid, 'g0')
        g0 = core.bayesdb_get_generator(bdb, pid, 'g0')
        assert core.bayesdb_has_generator(bdb, pid, 'g1')
        g1 = core.bayesdb_get_generator(bdb, pid, 'g1')
        assert core.bayesdb_variable_numbers(bdb, pid, None) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g0) == [1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g0) == [1, 2]
        assert core.bayesdb_variable_numbers(bdb, pid, g1) == [-1, 1, 2]
        assert core.bayesdb_generator_column_numbers(bdb, g1) == [-1, 1, 2]

예제 #3

파일 보기

def bql_row_column_predictive_probability(bdb, population_id, generator_id,
                                          rowid, colno):
    value = core.bayesdb_population_cell_value(bdb, population_id, rowid,
                                               colno)
    if value is None:
        return None
    # Retrieve all other values in the row.
    row_values = core.bayesdb_population_row_values(bdb, population_id, rowid)
    variable_numbers = core.bayesdb_variable_numbers(bdb, population_id, None)
    # Build the constraints and query from rowid, using a fresh rowid.
    fresh_rowid = core.bayesdb_population_fresh_row_id(bdb, population_id)
    query = [(colno, value)]
    constraints = [(col, value)
                   for (col, value) in zip(variable_numbers, row_values)
                   if (value is not None) and (col != colno)]

    def generator_predprob(generator_id):
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        return metamodel.logpdf_joint(bdb, generator_id, fresh_rowid, query,
                                      constraints, None)

    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    predprobs = map(generator_predprob, generator_ids)
    r = logmeanexp(predprobs)
    return ieee_exp(r)

예제 #4

파일 보기

파일: nig_normal.py 프로젝트: vishalbelsare/bayeslite

    def create_generator(self, bdb, generator_id, schema, **kwargs):
        # XXX Do something with the schema.
        insert_column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
            column_name = core.bayesdb_variable_name(bdb, population_id,
                                                     generator_id, colno)
            stattype = core.bayesdb_variable_stattype(bdb, population_id,
                                                      generator_id, colno)
            if not stattype == 'numerical':
                raise BQLError(
                    bdb, 'NIG-Normal only supports'
                    ' numerical columns, but %s is %s' %
                    (repr(column_name), repr(stattype)))
            (count, xsum, sumsq) = data_suff_stats(bdb, table, column_name)
            bdb.sql_execute(
                insert_column_sql, {
                    'population_id': population_id,
                    'generator_id': generator_id,
                    'colno': colno,
                    'count': count,
                    'sum': xsum,
                    'sumsq': sumsq,
                })

        # XXX Make the schema a little more flexible.
        if schema == [[]]:
            return
        for clause in schema:
            if not (len(clause) == 3 and \
                    isinstance(clause[0], str) and \
                    clause[1] == 'deviation' and \
                    isinstance(clause[2], list) and \
                    len(clause[2]) == 1 and \
                    isinstance(clause[2][0], str)):
                raise BQLError(bdb,
                               'Invalid nig_normal clause: %r' % (clause, ))
            dev_var = clause[0]
            obs_var = clause[2][0]
            if not core.bayesdb_has_variable(bdb, population_id, None,
                                             obs_var):
                raise BQLError(bdb, 'No such variable: %r' % (obs_var, ))
            obs_colno = core.bayesdb_variable_number(bdb, population_id, None,
                                                     obs_var)
            dev_colno = core.bayesdb_add_latent(bdb, population_id,
                                                generator_id, dev_var,
                                                'numerical')
            bdb.sql_execute(
                '''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            ''', (population_id, generator_id, dev_colno, obs_colno))

예제 #5

파일 보기

파일: test_core.py 프로젝트: LiuFang816/SALSTM_py_data

def test_row_similarity(exname, source, target, colnos):
    if exname == 't0' and any(colno > 0 for colno in colnos):
        pytest.skip('Not enough columns in t0.')
    if exname.startswith('t1_sub') and any(colno > 1 for colno in colnos):
        pytest.skip('Not enough columns in %s.' % (exname, ))
    with analyzed_bayesdb_population(examples[exname](), 1, 1) \
            as (bdb, population_id, generator_id):
        popcols = core.bayesdb_variable_numbers(bdb, population_id,
                                                generator_id)

        # Forbid multiple columns specified in WITH RESPECT TO.
        def test_row_similarity_one(f):
            try:
                f()
                if len(colnos) != 1 and len(popcols) != 1:
                    pytest.fail('No exception on similarity with respect to.')
            except bayeslite.BQLError:
                if len(colnos) == 1:
                    pytest.fail('Bad exception on similarity with respect to.')

        def f_api():
            bqlfn.bql_row_similarity(bdb, population_id, None, source, target,
                                     *colnos)

        def f_sql():
            sql = 'select bql_row_similarity(?, NULL, ?, ?%s%s)' % \
                ('' if 0 == len(colnos) else ', ', ', '.join(map(str, colnos)))
            bdb.sql_execute(sql, (population_id, source, target)).fetchall()

        test_row_similarity_one(f_sql)
        test_row_similarity_one(f_api)

예제 #6

파일 보기

파일: nig_normal.py 프로젝트: probcomp/bayeslite

    def create_generator(self, bdb, generator_id, schema, **kwargs):
        # XXX Do something with the schema.
        insert_column_sql = '''
            INSERT INTO bayesdb_nig_normal_column
                (population_id, generator_id, colno, count, sum, sumsq)
                VALUES (:population_id, :generator_id, :colno,
                    :count, :sum, :sumsq)
        '''
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        for colno in core.bayesdb_variable_numbers(bdb, population_id, None):
            column_name = core.bayesdb_variable_name(
                bdb, population_id, generator_id, colno)
            stattype = core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno)
            if not stattype == 'numerical':
                raise BQLError(bdb, 'NIG-Normal only supports'
                    ' numerical columns, but %s is %s'
                    % (repr(column_name), repr(stattype)))
            (count, xsum, sumsq) = data_suff_stats(bdb, table, column_name)
            bdb.sql_execute(insert_column_sql, {
                'population_id': population_id,
                'generator_id': generator_id,
                'colno': colno,
                'count': count,
                'sum': xsum,
                'sumsq': sumsq,
            })

        # XXX Make the schema a little more flexible.
        if schema == [[]]:
            return
        for clause in schema:
            if not (len(clause) == 3 and \
                    isinstance(clause[0], str) and \
                    clause[1] == 'deviation' and \
                    isinstance(clause[2], list) and \
                    len(clause[2]) == 1 and \
                    isinstance(clause[2][0], str)):
                raise BQLError(bdb, 'Invalid nig_normal clause: %r' %
                    (clause,))
            dev_var = clause[0]
            obs_var = clause[2][0]
            if not core.bayesdb_has_variable(bdb, population_id, None,
                    obs_var):
                raise BQLError(bdb, 'No such variable: %r' % (obs_var,))
            obs_colno = core.bayesdb_variable_number(bdb, population_id, None,
                obs_var)
            dev_colno = core.bayesdb_add_latent(bdb, population_id,
                generator_id, dev_var, 'numerical')
            bdb.sql_execute('''
                INSERT INTO bayesdb_nig_normal_deviation
                    (population_id, generator_id, deviation_colno,
                        observed_colno)
                    VALUES (?, ?, ?, ?)
            ''', (population_id, generator_id, dev_colno, obs_colno))

예제 #7

파일 보기

파일: loom_backend.py 프로젝트: probcomp/bayeslite

 def _data_to_schema(self, bdb, population_id, data_by_column):
     json_dict = {}
     for colno in bayesdb_variable_numbers(bdb, population_id, None):
         column_name = bayesdb_variable_name(bdb, population_id, None, colno)
         stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
         if stattype == 'nominal' \
                 and len(set(data_by_column[column_name])) > 256:
             stattype = 'unbounded_nominal'
         json_dict[column_name] = STATTYPE_TO_LOOMTYPE[stattype]
     with tempfile.NamedTemporaryFile(delete=False) as schema_file:
         schema_file.write(json.dumps(json_dict))
     return schema_file

예제 #8

파일 보기

파일: loom_backend.py 프로젝트: vishalbelsare/bayeslite

 def _data_to_schema(self, bdb, population_id, data_by_column):
     json_dict = {}
     for colno in bayesdb_variable_numbers(bdb, population_id, None):
         column_name = bayesdb_variable_name(bdb, population_id, None,
                                             colno)
         stattype = bayesdb_variable_stattype(bdb, population_id, None,
                                              colno)
         if stattype == 'nominal' \
                 and len(set(data_by_column[column_name])) > 256:
             stattype = 'unbounded_nominal'
         json_dict[column_name] = STATTYPE_TO_LOOMTYPE[stattype]
     with tempfile.NamedTemporaryFile(delete=False) as schema_file:
         schema_file.write(json.dumps(json_dict))
     return schema_file

예제 #9

파일 보기

파일: loom_backend.py 프로젝트: vishalbelsare/bayeslite

    def create_generator(self, bdb, generator_id, schema, **kwargs):
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store generator info in bdb.
        name = self._generate_name(bdb, generator_id)
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_loom_generator
            (generator_id, name, loom_store_path)
            VALUES (?, ?, ?)
        ''', (generator_id, name, self.loom_store_path))

        headers = []
        data = []
        data_by_column = {}
        for colno in bayesdb_variable_numbers(bdb, population_id, None):
            column_name = bayesdb_variable_name(bdb, population_id, None,
                                                colno)
            headers.append(column_name)
            qt = sqlite3_quote_name(table)
            qcn = sqlite3_quote_name(column_name)
            cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcn, qt))
            col_data = [item for (item, ) in cursor.fetchall()]
            data.append(col_data)
            data_by_column[column_name] = col_data
        data = [list(i) for i in zip(*data)]

        # Ingest data into loom.
        schema_file = self._data_to_schema(bdb, population_id, data_by_column)
        csv_file = self._data_to_csv(bdb, headers, data)
        project_path = self._get_loom_project_path(bdb, generator_id)
        loom.tasks.ingest(project_path,
                          rows_csv=csv_file.name,
                          schema=schema_file.name)

        # Store encoding info in bdb.
        self._store_encoding_info(bdb, generator_id)

        # Store rowid mapping in the bdb.
        qt = sqlite3_quote_name(table)
        rowids = bdb.sql_execute('SELECT oid FROM %s' % (qt, )).fetchall()
        insertions = ','.join(
            str((generator_id, table_rowid, loom_rowid))
            for loom_rowid, (table_rowid, ) in enumerate(rowids))
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_rowid_mapping
                (generator_id, table_rowid, loom_rowid)
                VALUES %s
        ''' % (insertions, ))

예제 #10

파일 보기

파일: bqlfn.py 프로젝트: PeterZs/bayeslite

def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
        *colnos):
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if len(colnos) == 0:
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
            generator_id)
    def generator_similarity(generator_id):
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        return metamodel.row_similarity(bdb, generator_id, None, rowid,
            target_rowid, colnos)
    generator_ids = [generator_id] if generator_id is not None else \
        core.bayesdb_population_generators(bdb, population_id)
    similarities = map(generator_similarity, generator_ids)
    return stats.arithmetic_mean(similarities)

예제 #11

파일 보기

파일: loom_backend.py 프로젝트: probcomp/bayeslite

    def create_generator(self, bdb, generator_id, schema, **kwargs):
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store generator info in bdb.
        name = self._generate_name(bdb, generator_id)
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_generator
            (generator_id, name, loom_store_path)
            VALUES (?, ?, ?)
        ''', (generator_id, name, self.loom_store_path))

        headers = []
        data = []
        data_by_column = {}
        for colno in bayesdb_variable_numbers(bdb, population_id, None):
            column_name = bayesdb_variable_name(bdb, population_id, None, colno)
            headers.append(column_name)
            qt = sqlite3_quote_name(table)
            qcn = sqlite3_quote_name(column_name)
            cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcn, qt))
            col_data = [item for (item,) in cursor.fetchall()]
            data.append(col_data)
            data_by_column[column_name] = col_data
        data = [list(i) for i in zip(*data)]

        # Ingest data into loom.
        schema_file = self._data_to_schema(bdb, population_id, data_by_column)
        csv_file = self._data_to_csv(bdb, headers, data)
        project_path = self._get_loom_project_path(bdb, generator_id)
        loom.tasks.ingest(project_path, rows_csv=csv_file.name,
            schema=schema_file.name)

        # Store encoding info in bdb.
        self._store_encoding_info(bdb, generator_id)

        # Store rowid mapping in the bdb.
        qt = sqlite3_quote_name(table)
        rowids = bdb.sql_execute('SELECT oid FROM %s' % (qt,)).fetchall()
        insertions = ','.join(
            str((generator_id, table_rowid, loom_rowid))
            for loom_rowid, (table_rowid,) in enumerate(rowids)
        )
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_rowid_mapping
                (generator_id, table_rowid, loom_rowid)
                VALUES %s
        ''' % (insertions,))

예제 #12

파일 보기

def bql_row_similarity(bdb, population_id, generator_id, rowid, target_rowid,
                       *colnos):
    if target_rowid is None:
        raise BQLError(bdb, 'No such target row for SIMILARITY')
    if len(colnos) == 0:
        colnos = core.bayesdb_variable_numbers(bdb, population_id,
                                               generator_id)
    if len(colnos) != 1:
        raise BQLError(bdb,
                       'Multiple with respect to columns: %s.' % (colnos, ))

    def generator_similarity(generator_id):
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        return metamodel.row_similarity(bdb, generator_id, None, rowid,
                                        target_rowid, colnos)

    generator_ids = _retrieve_generator_ids(bdb, population_id, generator_id)
    similarities = map(generator_similarity, generator_ids)
    return stats.arithmetic_mean(similarities)

예제 #13

파일 보기

파일: loom_backend.py 프로젝트: probcomp/bayeslite

 def _store_kind_partition(self, bdb, generator_id, modelnos):
     population_id = bayesdb_generator_population(bdb, generator_id)
     if modelnos is None:
         modelnos = range(self._get_num_models(bdb, generator_id))
     with bdb.savepoint():
         for modelno in modelnos:
             column_partition = self._retrieve_column_partition(
                 bdb, generator_id, modelno)
             # Bulk insertion of mapping from colno to kind_id.
             colnos = bayesdb_variable_numbers(bdb, population_id, None)
             ranks = [self._get_loom_rank(bdb, generator_id, colno)
                 for colno in colnos]
             insertions = ','.join(
                 str((generator_id, modelno, colno, column_partition[rank]))
                 for colno, rank in zip(colnos, ranks)
             )
             bdb.sql_execute('''
                 INSERT OR REPLACE INTO bayesdb_loom_column_kind_partition
                 (generator_id, modelno, colno, kind_id)
                 VALUES %s
             ''' % (insertions,))
             # Bulk insertion of mapping from (kind_id, rowid) to cluster_id.
             row_partition = self._retrieve_row_partition(
                 bdb, generator_id, modelno)
             rowids = bdb.sql_execute('''
                 SELECT table_rowid, loom_rowid
                     FROM bayesdb_loom_rowid_mapping
             ''').fetchall()
             insertions = ','.join(
                 str((generator_id, modelno, rowid[0], rowid[1],
                         kind_id, partition_id))
                 for kind_id in row_partition
                 for rowid, partition_id
                     in zip(rowids, row_partition[kind_id]))
             bdb.sql_execute('''
                 INSERT OR REPLACE INTO
                     bayesdb_loom_row_kind_partition
                 (generator_id, modelno, table_rowid, loom_rowid,
                     kind_id, partition_id)
                 VALUES %s
             ''' % (insertions,))

예제 #14

파일 보기

파일: loom_backend.py 프로젝트: vishalbelsare/bayeslite

 def _store_kind_partition(self, bdb, generator_id, modelnos):
     population_id = bayesdb_generator_population(bdb, generator_id)
     if modelnos is None:
         modelnos = range(self._get_num_models(bdb, generator_id))
     with bdb.savepoint():
         for modelno in modelnos:
             column_partition = self._retrieve_column_partition(
                 bdb, generator_id, modelno)
             # Bulk insertion of mapping from colno to kind_id.
             colnos = bayesdb_variable_numbers(bdb, population_id, None)
             ranks = [
                 self._get_loom_rank(bdb, generator_id, colno)
                 for colno in colnos
             ]
             insertions = ','.join(
                 str((generator_id, modelno, colno, column_partition[rank]))
                 for colno, rank in zip(colnos, ranks))
             bdb.sql_execute('''
                 INSERT OR REPLACE INTO bayesdb_loom_column_kind_partition
                 (generator_id, modelno, colno, kind_id)
                 VALUES %s
             ''' % (insertions, ))
             # Bulk insertion of mapping from (kind_id, rowid) to cluster_id.
             row_partition = self._retrieve_row_partition(
                 bdb, generator_id, modelno)
             rowids = bdb.sql_execute('''
                 SELECT table_rowid, loom_rowid
                     FROM bayesdb_loom_rowid_mapping
             ''').fetchall()
             insertions = ','.join(
                 str((generator_id, modelno, rowid[0], rowid[1], kind_id,
                      partition_id)) for kind_id in row_partition for rowid,
                 partition_id in zip(rowids, row_partition[kind_id]))
             bdb.sql_execute('''
                 INSERT OR REPLACE INTO
                     bayesdb_loom_row_kind_partition
                 (generator_id, modelno, table_rowid, loom_rowid,
                     kind_id, partition_id)
                 VALUES %s
             ''' % (insertions, ))

예제 #15

파일 보기

파일: test_vscgpm.py 프로젝트: utanapishtim/bayeslite

def test_cgpm_extravaganza__ci_slow():
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        return
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(
                        0, len(countries))]
                    name = 'sat-%s-%d' % (country,
                                          bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute(
                        '''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)

        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')

        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()

        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')

예제 #16

파일 보기

파일: loom_backend.py 프로젝트: probcomp/bayeslite

    def simulate_joint(self, bdb, generator_id, modelnos, rowid, targets,
            constraints, num_samples=1, accuracy=None):
        # Retrieve the population id.
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Prepare list of full constraints, potentially adding data from table.
        constraints_full = constraints

        # If rowid exist in base table, retrieve conditioning data.
        # Conditioning values are fetched for any rowid that exists in the base
        # table irrespective of whether the rowid is incorporated in the Loom
        # model or whether it was added after creation.
        if bayesdb_table_has_rowid(bdb, table, rowid):
            # Fetch population column numbers and row values.
            colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
            rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
            observations = [
                (colno, rowval)
                for colno, rowval in zip(colnos, rowvals)
                if rowval is not None and colno not in targets
            ]
            # Raise error if a constraint overrides an observed cell.
            colnos_constrained = [constraint[0] for constraint in constraints]
            colnos_observed = [observation[0] for observation in observations]
            if set.intersection(set(colnos_constrained), set(colnos_observed)):
                raise BQLError(bdb, 'Overlap between constraints and'
                    ' target row in simulate.')
            # Update the constraints.
            constraints_full = constraints + observations

        # Store mapping from target column name to column number and stattype.
        target_colno_to_name = {
            colno: bayesdb_variable_name(bdb, generator_id, None, colno)
            for colno in targets
        }
        target_colno_to_stattype = {
            colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
            for colno in targets
        }

        # Construct the CSV row for targets.
        row_targets = {target_colno_to_name[colno] : '' for colno in targets}
        row_constraints = {
            bayesdb_variable_name(bdb, generator_id, None, colno) : value
            for colno, value in constraints_full
        }
        row = dict(itertools.chain(
            row_targets.iteritems(), row_constraints.iteritems()))

        # Fetch the server.
        server = self._get_preql_server(bdb, generator_id)

        # Prepare the csv header and values.
        csv_headers = map(str, row.iterkeys())
        csv_values = map(str, row.itervalues())

        # Prepare streams for the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers]+[csv_values])

        # Obtain the prediction.
        server._predict(reader, num_samples, writer, False)

        # Parse the CSV output.
        output_csv = writer.result()
        output_rows = output_csv.strip().split('\r\n')

        # Extract the header of the CSV file.
        header = output_rows[0].split(CSV_DELIMITER)

        # Extract list of simulated rows. Each simulated row is represented
        # as a dictionary mapping column name to its simulated value.
        simulated_rows = [
            dict(zip(header, row.split(CSV_DELIMITER)))
            for row in output_rows[1:]
        ]

        # Prepare the return list of simulated_rows.
        def _extract_simulated_value(row, colno):
            colname = target_colno_to_name[colno]
            stattype = target_colno_to_stattype[colno]
            value = row[colname]
            return value if _is_nominal(stattype) else float(value)

        # Return the list of samples.
        return [
            [_extract_simulated_value(row, colno) for colno in targets]
            for row in simulated_rows
        ]

예제 #17

파일 보기

def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb, 'Name already defined as table: %s' %
                        (repr(phrase.name), ))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(
                        bdb,
                        'Table already exists: %s' % (repr(phrase.name), ))
            bayesdb_read_csv_file(bdb,
                                  phrase.name,
                                  phrase.csv,
                                  header=True,
                                  create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                raise BQLError(
                    bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt, ))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder(
                '''
                CREATE TEMP TABLE %s (column TEXT, stattype TEXT, num_distinct INTEGER, reason TEXT)
            ''' % (qtt), ())
            for cn, st, ct in zip(column_names, stattypes,
                                  distinct_value_counts):
                out.winder(
                    '''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt, ))
            out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name, ))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [
                    core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids
                ]
                raise BQLError(
                    bdb, 'Population %r still has metamodels: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id, ))
            bdb.sql_execute(
                '''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb,
                               'No such population: %s' % (repr(population), ))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'No such variable in base table: %s' % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(bdb, population_id, None,
                                                 cmd.name):
                        raise BQLError(
                            bdb,
                            'Variable already in population: %s' % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'Cannot add variable without any values: %s' %
                            (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute('SELECT %s FROM %s' %
                                                 (qc, qt))
                        rows = cursor.fetchall()
                        [stattype,
                         reason] = bayesdb_guess_stattypes([cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(
                                bdb, 'Values in column %s appear to be keys.' %
                                (cmd.name, ))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(
                                bdb, 'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name, ))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                                       'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(
                            bdb,
                            'Numerical column contains string values: %r ' %
                            (qc, ))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(bdb, population_id, cmd.name,
                                                  stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) metamodel in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            # XXX Omit needless bayesdb_generator_column table
                            # Github issue #441.
                            bdb.sql_execute(
                                '''
                                INSERT INTO bayesdb_generator_column
                                    VALUES (:generator_id, :colno, :stattype)
                            ''', {
                                    'generator_id': generator_id,
                                    'colno': colno,
                                    'stattype': stattype,
                                })
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no metamodels are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(
                            bdb,
                            'Cannot update statistical types for population '
                            '%s, it has metamodels: %s' % (
                                repr(population),
                                repr(generators),
                            ))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not core.bayesdb_has_variable(
                            bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(
                            bdb, 'No such variables in population: %s' %
                            (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(
                            bdb, 'Invalid statistical type: %r' %
                            (repr(cmd.stattype), ))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(
                                bdb, 'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars, ))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(bdb, population_id, None,
                                                     c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno, ) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos, )
                    bdb.sql_execute(update_stattype_sql, (
                        casefold(cmd.stattype),
                        population_id,
                    ))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        table = core.bayesdb_population_table(bdb, population_id)

        # Find the metamodel, or use the default.
        metamodel_name = phrase.metamodel
        if phrase.metamodel is None:
            metamodel_name = 'cgpm'
        if metamodel_name not in bdb.metamodels:
            raise BQLError(bdb,
                           'No such metamodel: %s' % (repr(metamodel_name), ))
        metamodel = bdb.metamodels[metamodel_name]

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, phrase.name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator
                        (name, tabname, population_id, metamodel)
                        VALUES (?, ?, ?, ?)
                ''', (phrase.name, table, population_id, metamodel.name()))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, phrase.name)

                # Populate bayesdb_generator_column.
                #
                # XXX Omit needless bayesdb_generator_column table --
                # Github issue #441.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id IS NULL
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

                # Do any metamodel-specific initialization.
                metamodel.create_generator(bdb,
                                           generator_id,
                                           phrase.schema,
                                           baseline=phrase.baseline)

                # Populate bayesdb_generator_column with any latent
                # variables that metamodel.create_generator has added
                # with bayesdb_add_latent.
                bdb.sql_execute(
                    '''
                    INSERT INTO bayesdb_generator_column
                        (generator_id, colno, stattype)
                        SELECT :generator_id, colno, stattype
                            FROM bayesdb_variable
                            WHERE population_id = :population_id
                                AND generator_id = :generator_id
                ''', {
                        'generator_id': generator_id,
                        'population_id': population_id,
                    })

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(
                        bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(
                        bdb, 'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the metamodel.
                metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                metamodel.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds,
                                 program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(bdb, None,
                                                      phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb,
                           'No such population: %r' % (phrase.population, ))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the metamodel.
        generator_id = None
        if phrase.metamodel:
            if not core.bayesdb_has_generator(bdb, population_id,
                                              phrase.metamodel):
                raise BQLError(bdb,
                               'No such metamodel: %r' % (phrase.population, ))
            generator_id = core.bayesdb_get_generator(bdb, population_id,
                                                      phrase.metamodel)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(bdb, population_id, None,
                                         phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target, ))
        colno_target = core.bayesdb_variable_number(bdb, population_id, None,
                                                    phrase.target)
        if core.bayesdb_variable_stattype(bdb, population_id, colno_target) != \
                'numerical':
            raise BQLError(
                bdb,
                'Target variable is not numerical: %r' % (phrase.target, ))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(bdb, population_id, None,
                                             given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(colno for colno in colno_givens
                                  if colno != colno_target)
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(bdb,
                                      population_id,
                                      generator_id,
                                      modelnos,
                                      constraints,
                                      colnos,
                                      numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(bdb, population_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(target_values, given_values, given_names,
                                   stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder(
            '''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt, ), ())
        for variable, coef in coefficients:
            out.winder(
                '''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (
                    variable,
                    coef,
                ))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt, ))
        out.unwinder('DROP TABLE %s' % (qtt, ), ())
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    assert False  # XXX

예제 #18

파일 보기

파일: loom_backend.py 프로젝트: vishalbelsare/bayeslite

    def simulate_joint(self,
                       bdb,
                       generator_id,
                       modelnos,
                       rowid,
                       targets,
                       constraints,
                       num_samples=1,
                       accuracy=None):
        # Retrieve the population id.
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Prepare list of full constraints, potentially adding data from table.
        constraints_full = constraints

        # If rowid exist in base table, retrieve conditioning data.
        # Conditioning values are fetched for any rowid that exists in the base
        # table irrespective of whether the rowid is incorporated in the Loom
        # model or whether it was added after creation.
        if bayesdb_table_has_rowid(bdb, table, rowid):
            # Fetch population column numbers and row values.
            colnos = bayesdb_variable_numbers(bdb, population_id, generator_id)
            rowvals = bayesdb_population_row_values(bdb, population_id, rowid)
            observations = [(colno, rowval)
                            for colno, rowval in zip(colnos, rowvals)
                            if rowval is not None and colno not in targets]
            # Raise error if a constraint overrides an observed cell.
            colnos_constrained = [constraint[0] for constraint in constraints]
            colnos_observed = [observation[0] for observation in observations]
            if set.intersection(set(colnos_constrained), set(colnos_observed)):
                raise BQLError(
                    bdb, 'Overlap between constraints and'
                    ' target row in simulate.')
            # Update the constraints.
            constraints_full = constraints + observations

        # Store mapping from target column name to column number and stattype.
        target_colno_to_name = {
            colno: bayesdb_variable_name(bdb, generator_id, None, colno)
            for colno in targets
        }
        target_colno_to_stattype = {
            colno: bayesdb_variable_stattype(bdb, population_id, None, colno)
            for colno in targets
        }

        # Construct the CSV row for targets.
        row_targets = {target_colno_to_name[colno]: '' for colno in targets}
        row_constraints = {
            bayesdb_variable_name(bdb, generator_id, None, colno): value
            for colno, value in constraints_full
        }
        row = dict(
            itertools.chain(row_targets.iteritems(),
                            row_constraints.iteritems()))

        # Fetch the server.
        server = self._get_preql_server(bdb, generator_id)

        # Prepare the csv header and values.
        csv_headers = map(str, row.iterkeys())
        csv_values = map(str, row.itervalues())

        # Prepare streams for the server.
        outfile = StringIO()
        writer = loom.preql.CsvWriter(outfile, returns=outfile.getvalue)
        reader = iter([csv_headers] + [csv_values])

        # Obtain the prediction.
        server._predict(reader, num_samples, writer, False)

        # Parse the CSV output.
        output_csv = writer.result()
        output_rows = output_csv.strip().split('\r\n')

        # Extract the header of the CSV file.
        header = output_rows[0].split(CSV_DELIMITER)

        # Extract list of simulated rows. Each simulated row is represented
        # as a dictionary mapping column name to its simulated value.
        simulated_rows = [
            dict(zip(header, row.split(CSV_DELIMITER)))
            for row in output_rows[1:]
        ]

        # Prepare the return list of simulated_rows.
        def _extract_simulated_value(row, colno):
            colname = target_colno_to_name[colno]
            stattype = target_colno_to_stattype[colno]
            value = row[colname]
            return value if _is_nominal(stattype) else float(value)

        # Return the list of samples.
        return [[_extract_simulated_value(row, colno) for colno in targets]
                for row in simulated_rows]

예제 #19

파일 보기

파일: bql.py 프로젝트: probcomp/bayeslite

def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb,
                        'Name already defined as table: %s' %
                        (repr(phrase.name),))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb, 'Table already exists: %s' %
                        (repr(phrase.name),))
            bayesdb_read_csv_file(
                bdb, phrase.name, phrase.csv, header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                raise BQLError(bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table: %s'
                                % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # If table has implicit population, rename it too.
                    if core.bayesdb_table_has_implicit_population(
                                bdb, cmd.name):
                        populations = \
                            core.bayesdb_table_populations(bdb, cmd.name)
                        assert len(populations) == 1
                        population_name = core.bayesdb_population_name(
                            bdb, populations[0])
                        qt = sqlite3_quote_name(cmd.name)
                        qp = sqlite3_quote_name(population_name)
                        bdb.execute('ALTER POPULATION %s RENAME TO %s'
                            % (qp, qt))
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb, 'No such column in table %s'
                                ': %s' %
                                (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except backends may have the (case-folded) name cached.
                    if old_folded != new_folded:
                        populations_sql = '''
                            SELECT id FROM bayesdb_population WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(populations_sql, (table,))
                        generators = [
                            core.bayesdb_population_generators(
                                bdb, population_id)
                            for (population_id,) in cursor
                        ]
                        for generator_id in set(generators):
                            backend = core.bayesdb_generator_backend(bdb,
                                generator_id)
                            backend.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder('''
                CREATE TEMP TABLE %s (
                    column TEXT,
                    stattype TEXT,
                    num_distinct INTEGER,
                    reason TEXT
                )
            ''' % (qtt,), ())
            for cn, st, ct in zip(column_names, stattypes, distinct_value_counts):
                out.winder('''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt,))
            out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name,))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids]
                raise BQLError(bdb, 'Population %r still has generators: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute('''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id,))
            bdb.sql_execute('''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb, 'No such population: %s' %
                    (repr(population),))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopRenamePop):
                    table = core.bayesdb_population_table(bdb, population_id)
                    # Prevent renaming of implicit population directly, unless
                    # being called by ast.AlterTabRenameTab in which case the
                    # table name and population name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, population_id) \
                            and casefold(population) == casefold(table):
                        raise BQLError(bdb, 'Cannot rename implicit'
                            'population %s; rename base table instead'
                            % (population,))
                    # Make sure nothing else has this name.
                    if casefold(population) != casefold(cmd.name):
                        if core.bayesdb_has_population(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as population' ': %s'
                                % (repr(cmd.name),))
                    # Update bayesdb_population.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_population SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, population_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # If population has implicit generator, rename it too.
                    if core.bayesdb_population_has_implicit_generator(
                            bdb, population_id):
                        generators = core.bayesdb_population_generators(
                            bdb, population_id)
                        assert len(generators) == 1
                        generator_name = core.bayesdb_generator_name(
                            bdb, generators[0])
                        qp = sqlite3_quote_name(cmd.name)
                        qg = sqlite3_quote_name(generator_name)
                        bdb.execute('ALTER GENERATOR %s RENAME TO %s'
                            % (qg, qp,))
                    # Remember the new name for subsequent commands.
                    population = cmd.name
                elif isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(
                            bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'No such variable in base table: %s'
                            % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(
                            bdb, population_id, None, cmd.name):
                        raise BQLError(bdb,
                            'Variable already in population: %s'
                            % (cmd.name))
                    # Ensure there is at least observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb,
                            'Cannot add variable without any values: %s'
                            % (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute(
                            'SELECT %s FROM %s' % (qc, qt))
                        rows = cursor.fetchall()
                        [stattype, reason] = bayesdb_guess_stattypes(
                            [cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(bdb,
                                'Values in column %s appear to be keys.'
                                % (cmd.name,))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(bdb,
                                'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name,))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'Numerical column contains string values: %r '
                            % (qc,))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(
                            bdb, population_id, cmd.name, stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) generator in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check the no generators are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(bdb,
                            'Cannot update statistical types for population '
                            '%s, it has generators: %s'
                            % (repr(population), repr(generators),))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not
                        core.bayesdb_has_variable(bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(bdb,
                            'No such variables in population: %s'
                            % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid statistical type: %r'
                            % (repr(cmd.stattype),))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(bdb,
                                'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars,))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(
                            bdb, population_id, None, c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno,) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos,)
                    bdb.sql_execute(
                        update_stattype_sql,
                        (casefold(cmd.stattype), population_id,))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' %
                (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)

        # Find the backend, or use the default.
        backend_name = phrase.backend
        if phrase.backend is None:
            backend_name = 'cgpm'
        if backend_name not in bdb.backends:
            raise BQLError(bdb, 'No such backend: %s' %
                (repr(backend_name),))
        backend = bdb.backends[backend_name]

        # Retrieve the (possibility implicit) generator name.
        generator_name = phrase.name or phrase.population
        implicit = 1 if phrase.name is None else 0

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, generator_name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(generator_name),))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute('''
                    INSERT INTO bayesdb_generator
                        (name, population_id, backend, implicit)
                        VALUES (?, ?, ?, ?)
                ''', (generator_name, population_id, backend.name(), implicit))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, generator_name)
                # Do any backend-specific initialization.
                backend.create_generator(bdb, generator_id, phrase.schema)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            backend = core.bayesdb_generator_backend(bdb, generator_id)

            # Backend-specific destruction.
            backend.drop_generator(bdb, generator_id)

            # Drop latent variables, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_variable WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    population_id = core.bayesdb_generator_population(
                        bdb, generator_id)
                    population = core.bayesdb_population_name(
                        bdb, population_id)
                    # Prevent renaming of implicit generator directly, unless
                    # being called by ast.AlterPopRenamePop in which case the
                    # population name and generator name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, generator_id) \
                            and casefold(generator) == casefold(population):
                        raise BQLError(bdb, 'Cannot rename implicit '
                            'generator; rename base population instead')
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' %
                                (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos if not
                    core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(bdb,
                        'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the backend.
                backend = core.bayesdb_generator_backend(bdb, generator_id)
                backend.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
                if 0 < len(existing):
                    raise BQLError(bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno)
                    VALUES (:generator_id, :modelno)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                })

            # Do backend-specific initialization.
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            backend.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        # WARNING: It is the backend's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the backend's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the backend can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        backend.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
            program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(
                bdb, None, phrase.generator)
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            backend.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' % (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the generator
        generator_id = None
        if phrase.generator:
            if not core.bayesdb_has_generator(bdb, population_id,
                    phrase.generator):
                raise BQLError(bdb,
                    'No such generator: %r' % (phrase.generator,))
            generator_id = core.bayesdb_get_generator(
                bdb, population_id, phrase.generator)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(
                bdb, population_id, None, phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target,))
        colno_target = core.bayesdb_variable_number(
            bdb, population_id, None, phrase.target)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
            generator_id, colno_target)
        if stattype != 'numerical':
            raise BQLError(bdb,
                'Target variable is not numerical: %r' % (phrase.target,))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(
                    bdb, population_id, None, given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(
            colno for colno in colno_givens if colno!= colno_target
        )
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(
            bdb, population_id, generator_id, modelnos, constraints,
            colnos, numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, generator_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(
            target_values, given_values, given_names, stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder('''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt,), ())
        for variable, coef in coefficients:
            out.winder('''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (variable, coef,))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt,))
        out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    assert False                # XXX

예제 #20

파일 보기

    def predict_confidence(self,
                           bdb,
                           generator_id,
                           modelno,
                           colno,
                           rowid,
                           numsamples=None):
        if not numsamples:
            numsamples = 2
        assert numsamples > 0

        def _impute_categorical(sample):
            counts = Counter(s[0] for s in sample)
            mode_count = max(counts[v] for v in counts)
            pred = iter(v for v in counts if counts[v] == mode_count).next()
            conf = float(mode_count) / numsamples
            return pred, conf

        def _impute_numerical(sample):
            pred = sum(s[0] for s in sample) / float(len(sample))
            conf = 0  # XXX Punt confidence for now
            return pred, conf

        constraints = []
        # If rowid is a hypothetical cell for cgpm (did not exist at the time
        # of INITIALIZE), but exists in the base table (by INSERT INTO), then
        # retrieve all values for rowid as the constraints.
        exists = rowid < core.bayesdb_generator_fresh_row_id(bdb, generator_id)
        max_cgpm_rowid = bdb.sql_execute(
            '''
            SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id, )).fetchall()[0][0]
        hypothetical = rowid > max_cgpm_rowid
        if exists and hypothetical:
            population_id = core.bayesdb_generator_population(
                bdb, generator_id)
            # Retrieve all other variables except colno, and ignore latents in
            # generator_id, and place them in the constraints.
            pop_names = core.bayesdb_variable_names(bdb, population_id, None)
            avoid_name = core.bayesdb_variable_name(bdb, population_id, colno)
            constraints_names = [n for n in pop_names if n != avoid_name]
            # Obtain the row.
            qt_names = str.join(',', map(sqlite3_quote_name,
                                         constraints_names))
            qt_table = sqlite3_quote_name(
                core.bayesdb_population_table(bdb, population_id))
            data = bdb.sql_execute(
                '''
                SELECT %s FROM %s WHERE oid = ?
            ''' % (
                    qt_names,
                    qt_table,
                ), (rowid, )).fetchall()[0]
            # Build the constraints.
            pop_nos = core.bayesdb_variable_numbers(bdb, population_id, None)
            constraints_nos = [n for n in pop_nos if n != colno]
            # import ipdb; ipdb.set_trace()
            assert len(data) == len(constraints_nos)
            constraints = [(rowid, c, v)
                           for c, v in zip(constraints_nos, data)
                           if (v is not None) and v]

        # Retrieve the samples.
        sample = self.simulate_joint(bdb, generator_id, [(rowid, colno)],
                                     constraints, modelno, numsamples)

        # Determine the imputation strategy (mode or mean).
        stattype = core.bayesdb_variable_stattype(
            bdb, core.bayesdb_generator_population(bdb, generator_id), colno)
        if _is_categorical(stattype):
            return _impute_categorical(sample)
        else:
            return _impute_numerical(sample)

예제 #21

파일 보기

파일: test_vscgpm.py 프로젝트: probcomp/bayeslite

def test_cgpm_extravaganza__ci_slow():
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        return
    with bayesdb_open(':memory:', builtin_backends=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        for l, f in [
            ('geo', lambda x, y: x + y**2),
            ('leo', lambda x, y: math.sin(x + y)),
        ]:
            for x in xrange(1000):
                for y in xrange(10):
                    countries = ['US', 'Russia', 'China', 'Bulgaria']
                    country = countries[bdb._np_prng.randint(0, len(countries))]
                    name = 'sat-%s-%d' % (
                        country, bdb._np_prng.randint(0, 10**8))
                    mass = bdb._np_prng.normal(1000, 50)
                    bdb.sql_execute('''
                        INSERT INTO satellites_ucs
                            (name, country_of_operator, launch_mass,
                                class_of_orbit, apogee, perigee, period)
                            VALUES (?,?,?,?,?,?,?)
                    ''', (name, country, mass, l, x, y, f(x, y)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit NOMINAL;
                country_of_operator NOMINAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')

        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
            ''').fetchall()

        cgpm_registry = {
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        }
        cgpmt = CGPM_Backend(cgpm_registry)
        bayesdb_register_backend(bdb, cgpmt)

        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    SET CATEGORY MODEL FOR apoge TO NORMAL
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
                )
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE GENERATOR g0 FOR satellites USING cgpm (
                    LATENT apogee NUMERICAL
                )
            ''')

        bdb.execute('''
            CREATE GENERATOR g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        assert core.bayesdb_variable_numbers(bdb, population_id, None) \
            == [1, 2, 3, 4, 5, 6]
        assert core.bayesdb_variable_numbers(bdb, population_id, generator_id) \
            == [-2, -1, 1, 2, 3, 4, 5, 6]

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration (
                SKIP kepler_cluster_id, kepler_noise, period;
            )
        ''')
        bdb.execute('ANALYZE g0 FOR 1 ITERATION')

        bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELED BY g0
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELED BY g0 LIMIT 1
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT kepler_noise CONFIDENCE kepler_noise_conf
                FROM satellites MODELED BY g0 LIMIT 2;
        ''').fetchall()
        bdb.execute('''
            INFER EXPLICIT PREDICT apogee CONFIDENCE apogee_conf
                FROM satellites MODELED BY g0 LIMIT 1;
        ''').fetchall()
        bdb.execute('''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
        ''').fetchall()

        bdb.execute('''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELED BY g0 LIMIT 4
        ''').fetchall()

        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP GENERATOR g0')
        bdb.execute('DROP POPULATION satellites')
        bdb.execute('DROP TABLE satellites_ucs')