Esempio n. 1
0
    def _store_encoding_info(self, bdb, generator_id):
        """Load Loom's ingest/encoding.json.gz and persist it in BayesDB.

        Stores (a) the string<->integer symbol map of each categorical
        column in `bayesdb_loom_string_encoding`, and (b) the Loom
        column ordering in `bayesdb_loom_column_ordering`.
        """
        encoding_path = os.path.join(
            self._get_loom_project_path(bdb, generator_id), 'ingest',
            'encoding.json.gz')
        with gzip.open(encoding_path) as encoding_file:
            encoding = json.loads(encoding_file.read().decode('ascii'))

        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store string encoding.
        insert_string_encoding = '''
            INSERT INTO bayesdb_loom_string_encoding
            (generator_id, colno, string_form, integer_form)
            VALUES (:generator_id, :colno, :string_form, :integer_form)
        '''
        for col in encoding:
            # Only categorical columns carry a 'symbols' map.
            if 'symbols' in col:
                colno = bayesdb_table_column_number(bdb, table,
                                                    str(col['name']))
                # items() rather than Python-2-only iteritems() so this
                # runs under both Python 2 and Python 3.
                for string_form, integer_form in col['symbols'].items():
                    bdb.sql_execute(
                        insert_string_encoding, {
                            'generator_id': generator_id,
                            'colno': colno,
                            'string_form': string_form,
                            'integer_form': integer_form
                        })

        # Store ordering of columns: a column's index in the encoding
        # list is its Loom rank.
        insert_order_sql = '''
            INSERT INTO bayesdb_loom_column_ordering
            (generator_id, colno, rank)
            VALUES (:generator_id, :colno, :rank)
        '''
        for rank, col in enumerate(encoding):
            colno = bayesdb_table_column_number(
                bdb, table, str(col['name']))
            bdb.sql_execute(insert_order_sql, {
                'generator_id': generator_id,
                'colno': colno,
                'rank': rank
            })
Esempio n. 2
0
    def _store_encoding_info(self, bdb, generator_id):
        """Read Loom's encoding.json.gz for `generator_id` and record it.

        Persists the per-column string<->integer symbol maps (for the
        columns that have one) and Loom's column ordering into the
        `bayesdb_loom_string_encoding` and `bayesdb_loom_column_ordering`
        tables.
        """
        encoding_path = os.path.join(
            self._get_loom_project_path(bdb, generator_id),
            'ingest', 'encoding.json.gz'
        )
        with gzip.open(encoding_path) as encoding_file:
            encoding = json.loads(encoding_file.read().decode('ascii'))

        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)

        # Store string encoding.
        insert_string_encoding = '''
            INSERT INTO bayesdb_loom_string_encoding
            (generator_id, colno, string_form, integer_form)
            VALUES (:generator_id, :colno, :string_form, :integer_form)
        '''
        for col in encoding:
            # A 'symbols' map is present only for categorical columns.
            if 'symbols' in col:
                colno = bayesdb_table_column_number(bdb, table, str(col['name']))
                # Use items() (not Python-2-only iteritems()) for
                # Python 2/3 portability.
                for string_form, integer_form in col['symbols'].items():
                    bdb.sql_execute(insert_string_encoding, {
                        'generator_id': generator_id,
                        'colno': colno,
                        'string_form': string_form,
                        'integer_form': integer_form
                    })

        # Store ordering of columns; the index in the encoding list is
        # the column's Loom rank.
        insert_order_sql = '''
            INSERT INTO bayesdb_loom_column_ordering
            (generator_id, colno, rank)
            VALUES (:generator_id, :colno, :rank)
        '''
        for rank, col in enumerate(encoding):
            colno = bayesdb_table_column_number(
                bdb, table, str(col['name']))
            bdb.sql_execute(insert_order_sql, {
                'generator_id': generator_id,
                'colno': colno,
                'rank': rank
            })
Esempio n. 3
0
def test_bayesdb_population_add_variable():
    """Exercise bayesdb_add_variable on a small three-column population."""
    with bayesdb() as bdb:
        bdb.sql_execute('create table t (a real, b ignore, c real)')
        bdb.execute('''
            create population p for t with schema(
                set stattypes of a, c to numerical;
                b ignore;
            );
        ''')
        population_id = core.bayesdb_get_population(bdb, 'p')
        # Local aliases keep the assertions readable.
        has_var = core.bayesdb_has_variable
        col_no = core.bayesdb_table_column_number
        var_no = core.bayesdb_variable_number
        # Column a: in the population, numbered 0.
        assert has_var(bdb, population_id, None, 'a')
        assert col_no(bdb, 't', 'a') == 0
        assert var_no(bdb, population_id, None, 'a') == 0
        # Column b: in the table at index 1, but not in the population yet.
        assert not has_var(bdb, population_id, None, 'b')
        assert col_no(bdb, 't', 'b') == 1
        # Column c: in the population, numbered 2.
        assert has_var(bdb, population_id, None, 'c')
        assert col_no(bdb, 't', 'c') == 2
        assert var_no(bdb, population_id, None, 'c') == 2
        # Re-adding 'c' must fail: it already exists.
        with pytest.raises(apsw.ConstraintError):
            core.bayesdb_add_variable(bdb, population_id, 'c', 'nominal')
        # Adding 'b' with an unknown stattype must fail.
        with pytest.raises(apsw.ConstraintError):
            core.bayesdb_add_variable(bdb, population_id, 'b', 'quzz')
        # Adding 'b' with a valid stattype succeeds.
        core.bayesdb_add_variable(bdb, population_id, 'b', 'nominal')
        assert var_no(bdb, population_id, None, 'b') == 1
        # Extend table t with a fresh column q, then add it to p.
        bdb.sql_execute('alter table t add column q real;')
        assert col_no(bdb, 't', 'q') == 3
        assert not has_var(bdb, population_id, None, 'q')
        core.bayesdb_add_variable(bdb, population_id, 'q', 'numerical')
        assert has_var(bdb, population_id, None, 'q')
        assert var_no(bdb, population_id, None, 'q') == 3
Esempio n. 4
0
def test_bayesdb_population_add_variable():
    """Verify adding variables to an existing population, end to end."""
    with bayesdb() as bdb:
        bdb.sql_execute('create table t (a real, b ignore, c real)')
        bdb.execute('''
            create population p for t with schema(
                set stattypes of a, c to numerical;
                b ignore;
            );
        ''')
        pid = core.bayesdb_get_population(bdb, 'p')
        # Column a is modeled and sits at position 0.
        assert core.bayesdb_has_variable(bdb, pid, None, 'a')
        assert core.bayesdb_table_column_number(bdb, 't', 'a') == 0
        assert core.bayesdb_variable_number(bdb, pid, None, 'a') == 0
        # Column b exists in the table (index 1) but is ignored so far.
        assert not core.bayesdb_has_variable(bdb, pid, None, 'b')
        assert core.bayesdb_table_column_number(bdb, 't', 'b') == 1
        # Column c is modeled and sits at position 2.
        assert core.bayesdb_has_variable(bdb, pid, None, 'c')
        assert core.bayesdb_table_column_number(bdb, 't', 'c') == 2
        assert core.bayesdb_variable_number(bdb, pid, None, 'c') == 2
        # 'c' is already a variable, so adding it again raises.
        with pytest.raises(apsw.ConstraintError):
            core.bayesdb_add_variable(bdb, pid, 'c', 'nominal')
        # A bogus stattype for 'b' raises as well.
        with pytest.raises(apsw.ConstraintError):
            core.bayesdb_add_variable(bdb, pid, 'b', 'quzz')
        # With a legal stattype, 'b' joins the population at position 1.
        core.bayesdb_add_variable(bdb, pid, 'b', 'nominal')
        assert core.bayesdb_variable_number(bdb, pid, None, 'b') == 1
        # Grow the table with a new column q and add it to p too.
        bdb.sql_execute('alter table t add column q real;')
        assert core.bayesdb_table_column_number(bdb, 't', 'q') == 3
        assert not core.bayesdb_has_variable(bdb, pid, None, 'q')
        core.bayesdb_add_variable(bdb, pid, 'q', 'numerical')
        assert core.bayesdb_has_variable(bdb, pid, None, 'q')
        assert core.bayesdb_variable_number(bdb, pid, None, 'q') == 3
Esempio n. 5
0
def bayesdb_load_codebook_csv_file(bdb, table, pathname):
    """Load a codebook for `table` from the CSV file at `pathname`.

    The CSV must have the exact header ``name,shortname,description,
    value_map``.  For each data row, any existing value map for the
    named column is replaced and the column's shortname and description
    are updated, all within a single savepoint.

    Raises `IOError` on a malformed codebook or an unknown column.
    """
    codebook = None
    with open(pathname, 'rU') as f:
        reader = csv.reader(f)
        try:
            # next(reader), not reader.next(): portable across
            # Python 2 and Python 3.
            header = next(reader)
        except StopIteration:
            raise IOError('Empty codebook file')
        header = [unicode(h, 'utf8').strip() for h in header]
        if header != ['name','shortname','description','value_map']:
            raise IOError('Wrong CSV header for codebook')
        codebook = []
        # The header occupied physical line 1, so the first data row
        # is line 2 -- report file line numbers accordingly.
        line = 2
        for row in reader:
            if len(row) != 4:
                raise IOError('Wrong number of columns at line %d: %d' %
                    (line, len(row)))
            codebook.append(row)
            line += 1
    with bdb.savepoint():
        for column_name, shortname, description, value_map_json in codebook:
            if not core.bayesdb_table_has_column(bdb, table, column_name):
                raise IOError('Column does not exist in table %s: %s' %
                    (repr(table), repr(column_name)))
            colno = core.bayesdb_table_column_number(bdb, table, column_name)
            try:
                value_map = dict(json.loads(value_map_json))
            except (ValueError, TypeError):
                # An empty or 'NaN' cell means "no value map"; any
                # other unparseable content is an error.
                if value_map_json == '' or value_map_json.lower() == 'nan':
                    value_map = {}
                else:
                    raise IOError('Invalid value map for column %r: %r' %
                                  (column_name, value_map_json))
            # Replace any existing value map for this column.
            sql = '''
                DELETE FROM bayesdb_column_map
                    WHERE tabname = ? AND colno = ?
            '''
            bdb.sql_execute(sql, (table, colno))
            sql = '''
                INSERT INTO bayesdb_column_map
                    (tabname, colno, key, value)
                    VALUES (?, ?, ?, ?)
            '''
            # Insert in sorted key order for deterministic rowids.
            for key in sorted(value_map.keys()):
                value = value_map[key]
                bdb.sql_execute(sql, (table, colno, key, value))
            sql = '''
                UPDATE bayesdb_column
                    SET shortname = :shortname, description = :description
                    WHERE tabname = :table AND colno = :colno
            '''
            # The UPDATE must hit exactly one row; verify via the
            # total-changes counter.
            total_changes = bdb._sqlite3.totalchanges()
            bdb.sql_execute(sql, {
                'shortname': shortname,
                'description': description,
                'table': table,
                'colno': colno,
            })
            assert bdb._sqlite3.totalchanges() - total_changes == 1
Esempio n. 6
0
def bayesdb_load_legacy_models(bdb, generator, table, metamodel, pathname,
        create=False, ifnotexists=False, gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    """

    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    # `ifnotexists' is meaningful only together with `create'.
    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    #
    # SECURITY NOTE: pickle.load executes arbitrary code embedded in
    # the file.  Only load legacy model files from trusted sources.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if not 'cctype' in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    # Case-normalized column name -> stattype, for comparison with the
    # existing generator's schema.
    column_stattypes = dict((casefold(column_name),
                             casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table),))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                # Bug fix: this previously passed `bdb` instead of the
                # prepared `sql` query to sql_execute.
                cursor = bdb.sql_execute(sql, (generator_id,))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                        (repr(generator),))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg,))
                bayesdb_create_legacy_generator(bdb, generator, table,
                    column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator),))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:      # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                    (modelno,))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                    ': no view_state in X_L[%u]' %
                    (modelno,))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                        ' in view state %u of X_L[%u]' % (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                        ': non-list for view %u columns in X_L[%u]'
                        % (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                            (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name

        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id,))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            # Default to zero iterations when the theta does not carry
            # a sane iteration count.
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(insert_model_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'iterations': iterations,
            })
            bdb.sql_execute(insert_theta_json_sql, {
                'generator_id': generator_id,
                'modelno': modelno,
                'theta_json': json.dumps(theta),
            })
Esempio n. 7
0
def bayesdb_load_legacy_models(bdb,
                               generator,
                               table,
                               metamodel,
                               pathname,
                               create=False,
                               ifnotexists=False,
                               gzipped=None):
    """Load legacy BayesDB models from a file.

    Legacy models are from the previous incarnation of BayesDB, before
    bayeslite.  If you did not use the previous incarnation of
    BayesDB, you need not worry about this.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str generator: name of generator
    :param str table: name of table
    :param str metamodel: name of metamodel, must be ``crosscat``
    :param str pathname: pathname of legacy models file
    :param bool create: if true and `generator` does not exist, create it
    :param bool ifnotexists: if true and `generator` exists, do it anyway
    :param bool gzipped: if true, or if ``None`` and `pathname`
        ends in ``.pkl.gz``, decompress with gzip first
    """

    if metamodel != 'crosscat':
        raise ValueError('Only crosscat legacy models are supported.')

    # `ifnotexists' is meaningful only together with `create'.
    if not create:
        if ifnotexists:
            raise ValueError('Not creating generator whether or not exists!')

    # Load the pickled file -- gzipped, if gzipped is true or if
    # gzipped is not specified and the file ends in .pkl.gz.
    #
    # SECURITY NOTE: pickle.load executes arbitrary code embedded in
    # the file.  Only load legacy model files from trusted sources.
    pickled = None
    with open(pathname, 'rb') as f:
        if gzipped or (gzipped is None and pathname.endswith('.pkl.gz')):
            with gzip.GzipFile(fileobj=f) as gzf:
                pickled = pickle.load(gzf)
        else:
            pickled = pickle.load(f)

    # Pick apart the schema and model data.
    #
    # XXX Support even older models formats, from before the schema
    # was included.  Not sure exactly how they were structured.
    if 'schema' not in pickled:
        raise IOError('Invalid legacy model: missing schema')
    if 'models' not in pickled:
        raise IOError('Invalid legacy model: missing models')
    schema = pickled['schema']
    models = pickled['models']

    # Make sure the schema looks sensible.  Map legacy stattypes
    # (`cctypes') to modern stattypes.
    if not isinstance(schema, dict):
        raise IOError('Invalid legacy model: schema is not a dict')
    for column_name in schema:
        column_schema = schema[column_name]
        if not isinstance(column_schema, dict):
            raise IOError('Invalid legacy model: column schema is not a dict')
        if not 'cctype' in column_schema:
            raise IOError('Invalid legacy model: column schema missing cctype')
        if column_schema['cctype'] in renamed_column_stattypes:
            column_schema['cctype'] = \
                renamed_column_stattypes[column_schema['cctype']]
        if column_schema['cctype'] not in allowed_column_stattypes:
            raise IOError('Invalid legacy model: unknown column type')

    # XXX Check whether the schema resembles a sane generator schema.
    # XXX Check whether models is a dict mapping integers to thetas.
    # XXX Check whether the thetas look sensible.
    # XXX Check whether the metamodel makes sense of it!

    # Case-normalized column name -> stattype, for comparison against
    # any existing generator's schema.
    column_stattypes = dict(
        (casefold(column_name), casefold(schema[column_name]['cctype']))
        for column_name in schema)

    # Ready to update the database.  Do it in a savepoint in case
    # anything goes wrong.
    with bdb.savepoint():

        # Ensure the table exists.  Can't do anything if we have no
        # data.
        if not core.bayesdb_has_table(bdb, table):
            raise ValueError('No such table: %s' % (repr(table), ))

        # Ensure the generator exists.
        if core.bayesdb_has_generator(bdb, generator):
            if create and not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                                 (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            generator_table = core.bayesdb_generator_table(bdb, generator_id)
            if casefold(table) != generator_table:
                raise ValueError(
                    'Generator %r is for table %r, not for table: %r' %
                    (generator, generator_table, table))
            # Generator exists.  If the schema differs and there are
            # existing models, fail.  If the schema differs and there
            # are no existing models, change the schema.
            #
            # XXX Not clear changing the schema is really appropriate.
            generator_id = core.bayesdb_get_generator(bdb, generator)
            old_types = bayesdb_generator_column_stattypes(bdb, generator_id)
            if column_stattypes != old_types:
                sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = ?
                '''
                # Bug fix: this previously passed `bdb` instead of the
                # prepared `sql` query to sql_execute.
                cursor = bdb.sql_execute(sql, (generator_id, ))
                if 0 < cursor_value(cursor):
                    raise ValueError('Legacy models mismatch schema: %s' %
                                     (repr(generator), ))
                qg = sqlite3_quote_name(generator)
                bdb.execute('DROP GENERATOR %s' % (qg, ))
                bayesdb_create_legacy_generator(bdb, generator, table,
                                                column_stattypes)
        elif create:
            bayesdb_create_legacy_generator(bdb, generator, table,
                                            column_stattypes)
        else:
            raise ValueError('No such generator: %s' % (repr(generator), ))

        # Map the case of the column names in the models.
        #
        # XXX Check more than just the column names.
        for modelno in models:  # dictionary
            theta = models[modelno]
            if 'X_L' not in theta:
                raise IOError('Invalid legacy model: no X_L in theta[%u]' %
                              (modelno, ))
            X_L = theta['X_L']
            if 'view_state' not in X_L:
                raise IOError('Invalid legacy model'
                              ': no view_state in X_L[%u]' % (modelno, ))
            for viewno, view_state in enumerate(X_L['view_state']):
                if 'column_names' not in view_state:
                    raise IOError('Invalid legacy model: no column names'
                                  ' in view state %u of X_L[%u]' %
                                  (viewno, modelno))
                view_column_names = view_state['column_names']
                if not isinstance(view_column_names, list):
                    raise IOError('Invalid legacy model'
                                  ': non-list for view %u columns in X_L[%u]' %
                                  (viewno, modelno))
                for i in range(len(view_column_names)):
                    name = view_column_names[i]
                    if not core.bayesdb_table_has_column(bdb, table, name):
                        raise IOError('No such column in table %s: %s' %
                                      (repr(table), repr(name)))
                    # Canonicalize the case.
                    colno = core.bayesdb_table_column_number(bdb, table, name)
                    name = core.bayesdb_table_column_name(bdb, table, colno)
                    view_column_names[i] = name

        # Determine where to start numbering the new models.
        generator_id = core.bayesdb_get_generator(bdb, generator)
        modelno_max_sql = '''
            SELECT MAX(modelno) FROM bayesdb_generator_model
                WHERE generator_id = ?
        '''
        cursor = bdb.sql_execute(modelno_max_sql, (generator_id, ))
        modelno_max = cursor_value(cursor)
        modelno_start = 0 if modelno_max is None else modelno_max + 1

        # Consistently number the models consecutively in order of the
        # external numbering starting at the smallest nonnegative
        # model number not currently used.  Do not vary based on the
        # ordering of Python dict iteration.
        insert_model_sql = '''
            INSERT INTO bayesdb_generator_model
                (generator_id, modelno, iterations)
                VALUES (:generator_id, :modelno, :iterations)
        '''
        insert_theta_json_sql = '''
            INSERT INTO bayesdb_crosscat_theta
                (generator_id, modelno, theta_json)
                VALUES (:generator_id, :modelno, :theta_json)
        '''
        for i, modelno_ext in enumerate(sorted(models.keys())):
            modelno = modelno_start + i
            theta = models[modelno_ext]
            # Default to zero iterations when theta lacks a sane count.
            iterations = 0
            if 'iterations' in theta and isinstance(theta['iterations'], int):
                iterations = theta['iterations']
            bdb.sql_execute(
                insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'iterations': iterations,
                })
            bdb.sql_execute(
                insert_theta_json_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                    'theta_json': json.dumps(theta),
                })