Ejemplo n.º 1
0
def rename_table(bdb, old, new):
    """Rename the SQL table `old` to `new` and update bayesdb metadata.

    Preconditions, checked by assertion only: a table named `old`
    exists, no table named `new` exists, and no generator is named
    either `old` or `new`.

    After the SQL rename, the `tabname` column of bayesdb_column,
    bayesdb_column_map and bayesdb_generator is rewritten from `old`
    to `new`.
    """
    assert core.bayesdb_has_table(bdb, old)
    assert not core.bayesdb_has_generator(bdb, old)
    assert not core.bayesdb_has_table(bdb, new)
    assert not core.bayesdb_has_generator(bdb, new)

    # Rename the underlying SQL table first.
    quoted_old = sqlite3_quote_name(old)
    quoted_new = sqlite3_quote_name(new)
    bdb.sql_execute('ALTER TABLE %s RENAME TO %s' % (quoted_old, quoted_new))

    # Metadata tables refer to the table by name, so each one needs the
    # same tabname rewrite.  They are run in this order: columns, column
    # maps, generators.
    column_sql = '''
        UPDATE bayesdb_column SET tabname = ? WHERE tabname = ?
    '''
    column_map_sql = '''
        UPDATE bayesdb_column_map SET tabname = ? WHERE tabname = ?
    '''
    generator_sql = '''
        UPDATE bayesdb_generator SET tabname = ? WHERE tabname = ?
    '''
    bindings = (new, old)
    for statement in (column_sql, column_map_sql, generator_sql):
        bdb.sql_execute(statement, bindings)
Ejemplo n.º 2
0
def rename_table(bdb, old, new):
    """Rename the SQL table `old` to `new` and update bayesdb metadata.

    Preconditions, checked by assertion only: a table named `old`
    exists, no table named `new` exists, and no generator is named
    either `old` or `new`.

    After the SQL rename, the `tabname` column of bayesdb_column,
    bayesdb_column_map and bayesdb_generator is rewritten from `old`
    to `new`.
    """
    assert core.bayesdb_has_table(bdb, old)
    assert not core.bayesdb_has_generator(bdb, old)
    assert not core.bayesdb_has_table(bdb, new)
    assert not core.bayesdb_has_generator(bdb, new)

    # Rename the underlying SQL table first.
    quoted_old = sqlite3_quote_name(old)
    quoted_new = sqlite3_quote_name(new)
    bdb.sql_execute("ALTER TABLE %s RENAME TO %s" % (quoted_old, quoted_new))

    # Metadata tables refer to the table by name, so each one needs the
    # same tabname rewrite.  They are run in this order: columns, column
    # maps, generators.
    column_sql = """
        UPDATE bayesdb_column SET tabname = ? WHERE tabname = ?
    """
    column_map_sql = """
        UPDATE bayesdb_column_map SET tabname = ? WHERE tabname = ?
    """
    generator_sql = """
        UPDATE bayesdb_generator SET tabname = ? WHERE tabname = ?
    """
    bindings = (new, old)
    for statement in (column_sql, column_map_sql, generator_sql):
        bdb.sql_execute(statement, bindings)
Ejemplo n.º 3
0
def _default_categorical(bdb, generator_id, var):
    """Return ``('categorical', {'k': k})`` for the column `var`.

    `k` is the result of ``COUNT(DISTINCT var)`` over the table of the
    generator identified by `generator_id`.
    """
    table_name = core.bayesdb_generator_table(bdb, generator_id)
    quoted_table = sqlite3_quote_name(table_name)
    quoted_var = sqlite3_quote_name(var)
    count_sql = 'SELECT COUNT(DISTINCT %s) FROM %s' % (
        quoted_var, quoted_table)
    num_distinct = cursor_value(bdb.sql_execute(count_sql))
    return 'categorical', {'k': num_distinct}
Ejemplo n.º 4
0
 def cast(var, colno, stattype):
     """Return an SQL expression casting ``t.<var>`` to its affinity.

     A negative `colno` yields the literal ``'NULL'``.  Otherwise the
     affinity is looked up from `stattype` and the result has the form
     ``CAST(t.<var> AS <affinity>)`` with both names quoted.
     """
     if colno < 0:
         return 'NULL'
     quoted_var = sqlite3_quote_name(var)
     # `bdb` is not a parameter; it comes from the enclosing scope.
     affinity_name = core.bayesdb_stattype_affinity(bdb, stattype)
     quoted_affinity = sqlite3_quote_name(affinity_name)
     cast_expr = 'CAST(t.%s AS %s)' % (quoted_var, quoted_affinity)
     return cast_expr
Ejemplo n.º 5
0
def bayesdb_guess_generator(bdb, generator, table, metamodel,
        ifnotexists=None, count_cutoff=None, ratio_cutoff=None,
        default=None, overrides=None):
    """Create a generator for `table` with heuristically guessed types.

    Reads every row of `table`, asks `bayesdb_guess_stattypes` for a
    statistical type per column, and executes a ``CREATE GENERATOR``
    statement named `generator` using `metamodel`.  Columns guessed (or
    overridden) as ``key`` or ``ignore`` are left out of the schema.

    :param bool ifnotexists: if true and `generator` already exists,
        return without doing anything; if false or ``None``, an
        existing generator raises :class:`ValueError`.
    :param int count_cutoff: passed through to
        `bayesdb_guess_stattypes`.
    :param real ratio_cutoff: passed through to
        `bayesdb_guess_stattypes`.
    :param bool default: if true, ``CREATE DEFAULT GENERATOR`` is
        issued; ``None`` is treated as false.
    :param list overrides: passed through to
        `bayesdb_guess_stattypes`.
    :raises ValueError: if the generator already exists and
        `ifnotexists` is not true, or if no modelled columns remain.
    """

    # Normalize the optional flags: None means False for both.
    ifnotexists = False if ifnotexists is None else ifnotexists
    default = False if default is None else default

    with bdb.savepoint():
        if core.bayesdb_has_generator(bdb, generator):
            if not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            return

        # Pull the whole table so the guesser can inspect the data.
        quoted_table = sqlite3_quote_name(table)
        cursor = bdb.sql_execute('SELECT * FROM %s' % (quoted_table,))
        all_names = [desc[0] for desc in cursor.description]
        all_rows = cursor.fetchall()
        guessed = bayesdb_guess_stattypes(all_names, all_rows,
            count_cutoff=count_cutoff, ratio_cutoff=ratio_cutoff,
            overrides=overrides)

        # Keep only the columns that are actually modelled.
        modelled = [(name, stattype)
            for name, stattype in zip(all_names, guessed)
            if stattype not in ('key', 'ignore')]
        column_names, stattypes = unzip(modelled)
        if not column_names:
            raise ValueError('Table has no modelled columns: %s' %
                (repr(table),))

        quoted_generator = sqlite3_quote_name(generator)
        quoted_metamodel = sqlite3_quote_name(metamodel)
        schema = ','.join(
            sqlite3_quote_name(name) + ' ' + sqlite3_quote_name(stattype)
            for name, stattype in zip(column_names, stattypes))
        default_keyword = 'DEFAULT ' if default else ''
        bdb.execute('CREATE %sGENERATOR %s FOR %s USING %s(%s)' %
            (default_keyword, quoted_generator, quoted_table,
                quoted_metamodel, schema))
Ejemplo n.º 6
0
def bayesdb_guess_generator(bdb,
                            generator,
                            table,
                            metamodel,
                            ifnotexists=None,
                            default=None,
                            **kwargs):
    """Create a generator for `table` with heuristically guessed types.

    Reads every row of `table`, asks `bayesdb_guess_stattypes` for a
    statistical type per column, and executes a ``CREATE GENERATOR``
    statement named `generator` using `metamodel`.  Columns guessed as
    ``key`` or ``ignore`` are left out of the schema.

    :param bool ifnotexists: if true and `generator` already exists,
        return without doing anything; if false or ``None``, an
        existing generator raises :class:`ValueError`.
    :param bool default: if true, ``CREATE DEFAULT GENERATOR`` is
        issued; ``None`` is treated as false.
    :param dict **kwargs: forwarded unchanged to
        `bayesdb_guess_stattypes`.
    :raises ValueError: if the generator already exists and
        `ifnotexists` is not true, or if no modelled columns remain.
    """

    # Normalize the optional flags: None means False for both.
    ifnotexists = False if ifnotexists is None else ifnotexists
    default = False if default is None else default

    with bdb.savepoint():
        if core.bayesdb_has_generator(bdb, generator):
            if not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                                 (repr(generator), ))
            return

        # Pull the whole table so the guesser can inspect the data.
        quoted_table = sqlite3_quote_name(table)
        cursor = bdb.sql_execute('SELECT * FROM %s' % (quoted_table, ))
        all_names = [desc[0] for desc in cursor.description]
        all_rows = cursor.fetchall()
        guessed = bayesdb_guess_stattypes(all_names, all_rows, **kwargs)

        # Keep only the columns that are actually modelled.
        modelled = [(name, stattype)
                    for name, stattype in zip(all_names, guessed)
                    if stattype not in ('key', 'ignore')]
        column_names, stattypes = unzip(modelled)
        if not column_names:
            raise ValueError('Table has no modelled columns: %s' %
                             (repr(table), ))

        quoted_generator = sqlite3_quote_name(generator)
        quoted_metamodel = sqlite3_quote_name(metamodel)
        schema = ','.join(
            sqlite3_quote_name(name) + ' ' + sqlite3_quote_name(stattype)
            for name, stattype in zip(column_names, stattypes))
        default_keyword = 'DEFAULT ' if default else ''
        bdb.execute('CREATE %sGENERATOR %s FOR %s USING %s(%s)' %
                    (default_keyword, quoted_generator, quoted_table,
                     quoted_metamodel, schema))
Ejemplo n.º 7
0
def bayesdb_guess_population(bdb,
                             population,
                             table,
                             ifnotexists=None,
                             **kwargs):
    """Create a population for `table` with heuristically guessed types.

    Reads every row of `table`, takes the first element of each result
    from `bayesdb_guess_stattypes` as that column's statistical type,
    and executes ``CREATE POPULATION`` for `population`.  A column
    guessed as ``key`` is recorded as ``ignore``.

    :param bool ifnotexists: if true and `population` already exists,
        return without doing anything; if false or ``None``, an
        existing population raises :class:`ValueError`.
    :param dict kwargs: forwarded unchanged to
        `bayesdb_guess_stattypes`.
    :raises ValueError: if the population already exists and
        `ifnotexists` is not true, or if every column is ignored.

    """

    # None means False.
    ifnotexists = False if ifnotexists is None else ifnotexists

    with bdb.savepoint():
        if core.bayesdb_has_population(bdb, population):
            if not ifnotexists:
                raise ValueError('Population exists: %r' % (population, ))
            return

        # Pull the whole table so the guesser can inspect the data.
        quoted_table = sqlite3_quote_name(table)
        cursor = bdb.sql_execute('SELECT * FROM %s' % (quoted_table, ))
        all_names = [desc[0] for desc in cursor.description]
        all_rows = cursor.fetchall()
        guessed = [
            guess[0]
            for guess in bayesdb_guess_stattypes(all_names, all_rows,
                                                 **kwargs)
        ]

        # The schema has no `key` type here, so key columns are ignored.
        pairs = [(name, 'ignore' if stattype == 'key' else stattype)
                 for name, stattype in zip(all_names, guessed)]
        column_names, stattypes = unzip(pairs)
        if all(stattype == 'ignore' for stattype in stattypes):
            raise ValueError('Table has no modelled columns: %s' %
                             (repr(table), ))

        quoted_population = sqlite3_quote_name(population)
        schema = ';'.join(
            sqlite3_quote_name(name) + ' ' + sqlite3_quote_name(stattype)
            for name, stattype in zip(column_names, stattypes))
        bdb.execute('CREATE POPULATION %s FOR %s(%s)' %
                    (quoted_population, quoted_table, schema))
Ejemplo n.º 8
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Register the generator, ingest its data into loom, and record
        the mapping between table rowids and loom row indices.

        `schema` and `kwargs` are accepted but not used here.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)
        quoted_table = sqlite3_quote_name(table)

        # Record this generator's name and loom store path in the bdb.
        generator_name = self._generate_name(bdb, generator_id)
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_loom_generator
            (generator_id, name, loom_store_path)
            VALUES (?, ?, ?)
        ''', (generator_id, generator_name, self.loom_store_path))

        # Read the table one population variable at a time.
        headers = []
        columns = []
        data_by_column = {}
        for colno in bayesdb_variable_numbers(bdb, population_id, None):
            variable = bayesdb_variable_name(bdb, population_id, None,
                                             colno)
            select_sql = 'SELECT %s FROM %s' % (
                sqlite3_quote_name(variable), quoted_table)
            values = [row[0] for row in bdb.sql_execute(select_sql).fetchall()]
            headers.append(variable)
            columns.append(values)
            data_by_column[variable] = values
        # Transpose the per-column lists into per-row lists for the CSV.
        rows = [list(row) for row in zip(*columns)]

        # Ingest data into loom.
        schema_file = self._data_to_schema(bdb, population_id, data_by_column)
        csv_file = self._data_to_csv(bdb, headers, rows)
        project_path = self._get_loom_project_path(bdb, generator_id)
        loom.tasks.ingest(project_path,
                          rows_csv=csv_file.name,
                          schema=schema_file.name)

        # Store encoding info in bdb.
        self._store_encoding_info(bdb, generator_id)

        # Store the rowid mapping: the loom row index is the position of
        # the table rowid in the order returned by the SELECT.
        table_rowids = bdb.sql_execute(
            'SELECT oid FROM %s' % (quoted_table, )).fetchall()
        value_tuples = [
            str((generator_id, table_rowid, loom_rowid))
            for loom_rowid, (table_rowid, ) in enumerate(table_rowids)
        ]
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_rowid_mapping
                (generator_id, table_rowid, loom_rowid)
                VALUES %s
        ''' % (','.join(value_tuples), ))
Ejemplo n.º 9
0
def bayesdb_guess_generator(bdb, generator, table, metamodel,
        ifnotexists=None, default=None, **kwargs):
    """Create a generator for `table` with heuristically guessed types.

    Reads every row of `table`, asks `bayesdb_guess_stattypes` for a
    statistical type per column, and executes a ``CREATE GENERATOR``
    statement named `generator` using `metamodel`.  Columns guessed as
    ``key`` or ``ignore`` are left out of the schema.

    :param bool ifnotexists: if true and `generator` already exists,
        return without doing anything; if false or ``None``, an
        existing generator raises :class:`ValueError`.
    :param bool default: if true, ``CREATE DEFAULT GENERATOR`` is
        issued; ``None`` is treated as false.
    :param dict **kwargs: forwarded unchanged to
        `bayesdb_guess_stattypes`.
    :raises ValueError: if the generator already exists and
        `ifnotexists` is not true, or if no modelled columns remain.
    """

    # Normalize the optional flags: None means False for both.
    ifnotexists = False if ifnotexists is None else ifnotexists
    default = False if default is None else default

    with bdb.savepoint():
        if core.bayesdb_has_generator(bdb, generator):
            if not ifnotexists:
                raise ValueError('Generator already exists: %s' %
                    (repr(generator),))
            return

        # Pull the whole table so the guesser can inspect the data.
        quoted_table = sqlite3_quote_name(table)
        cursor = bdb.sql_execute('SELECT * FROM %s' % (quoted_table,))
        all_names = [desc[0] for desc in cursor.description]
        all_rows = cursor.fetchall()
        guessed = bayesdb_guess_stattypes(all_names, all_rows, **kwargs)

        # Keep only the columns that are actually modelled.
        modelled = [(name, stattype)
            for name, stattype in zip(all_names, guessed)
            if stattype not in ('key', 'ignore')]
        column_names, stattypes = unzip(modelled)
        if not column_names:
            raise ValueError('Table has no modelled columns: %s' %
                (repr(table),))

        quoted_generator = sqlite3_quote_name(generator)
        quoted_metamodel = sqlite3_quote_name(metamodel)
        schema = ','.join(
            sqlite3_quote_name(name) + ' ' + sqlite3_quote_name(stattype)
            for name, stattype in zip(column_names, stattypes))
        default_keyword = 'DEFAULT ' if default else ''
        bdb.execute('CREATE %sGENERATOR %s FOR %s USING %s(%s)' %
            (default_keyword, quoted_generator, quoted_table,
                quoted_metamodel, schema))
Ejemplo n.º 10
0
    def create_generator(self, bdb, generator_id, schema, **kwargs):
        """Register the generator, ingest its data into loom, and record
        the mapping between table rowids and loom row indices.

        `schema` and `kwargs` are accepted but not used here.
        """
        population_id = bayesdb_generator_population(bdb, generator_id)
        table = bayesdb_population_table(bdb, population_id)
        quoted_table = sqlite3_quote_name(table)

        # Record this generator's name and loom store path in the bdb.
        generator_name = self._generate_name(bdb, generator_id)
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_generator
            (generator_id, name, loom_store_path)
            VALUES (?, ?, ?)
        ''', (generator_id, generator_name, self.loom_store_path))

        # Read the table one population variable at a time.
        headers = []
        columns = []
        data_by_column = {}
        for colno in bayesdb_variable_numbers(bdb, population_id, None):
            variable = bayesdb_variable_name(bdb, population_id, None, colno)
            select_sql = 'SELECT %s FROM %s' % (
                sqlite3_quote_name(variable), quoted_table)
            values = [row[0] for row in bdb.sql_execute(select_sql).fetchall()]
            headers.append(variable)
            columns.append(values)
            data_by_column[variable] = values
        # Transpose the per-column lists into per-row lists for the CSV.
        rows = [list(row) for row in zip(*columns)]

        # Ingest data into loom.
        schema_file = self._data_to_schema(bdb, population_id, data_by_column)
        csv_file = self._data_to_csv(bdb, headers, rows)
        project_path = self._get_loom_project_path(bdb, generator_id)
        loom.tasks.ingest(project_path, rows_csv=csv_file.name,
            schema=schema_file.name)

        # Store encoding info in bdb.
        self._store_encoding_info(bdb, generator_id)

        # Store the rowid mapping: the loom row index is the position of
        # the table rowid in the order returned by the SELECT.
        table_rowids = bdb.sql_execute(
            'SELECT oid FROM %s' % (quoted_table,)).fetchall()
        value_tuples = [
            str((generator_id, table_rowid, loom_rowid))
            for loom_rowid, (table_rowid,) in enumerate(table_rowids)
        ]
        bdb.sql_execute('''
            INSERT INTO bayesdb_loom_rowid_mapping
                (generator_id, table_rowid, loom_rowid)
                VALUES %s
        ''' % (','.join(value_tuples),))
Ejemplo n.º 11
0
def create_prior_gen(bdb, target_metamodel, schema, column_names,
                     prior_samples):
    """Build a generator over a freshly created empty table and
    initialize `prior_samples` models for it.

    Returns the generator object produced by `create_temp_gen`.
    """
    empty_table = create_empty_table(bdb, column_names)
    generator = create_temp_gen(bdb, empty_table, target_metamodel, schema)
    quoted_name = sqlite3_quote_name(generator.name)
    initialize_bql = 'INITIALIZE %s MODELS FOR %s' % (
        prior_samples, quoted_name)
    bdb.execute(initialize_bql)
    return generator
Ejemplo n.º 12
0
def bql_variable_stattypes_and_data(bdb, population_id, colno0, colno1):
    """Return ``(st0, st1, data0, data1)`` for two population variables.

    `st0`/`st1` are the variables' statistical types.  `data0`/`data1`
    are parallel lists of their values, taken only from rows of the
    population's table where both values are non-NULL.
    """
    stattype0 = core.bayesdb_variable_stattype(bdb, population_id, colno0)
    stattype1 = core.bayesdb_variable_stattype(bdb, population_id, colno1)
    quoted_table = sqlite3_quote_name(
        core.bayesdb_population_table(bdb, population_id))
    name0 = core.bayesdb_variable_name(bdb, population_id, colno0)
    name1 = core.bayesdb_variable_name(bdb, population_id, colno1)
    quoted0 = sqlite3_quote_name(name0)
    quoted1 = sqlite3_quote_name(name1)
    pairs_sql = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (quoted0, quoted1, quoted_table, quoted0, quoted1)
    pairs = bdb.sql_execute(pairs_sql).fetchall()
    # Split the (value0, value1) rows into two parallel lists.
    values0 = [pair[0] for pair in pairs]
    values1 = [pair[1] for pair in pairs]
    return (stattype0, stattype1, values0, values1)
Ejemplo n.º 13
0
def bql_quote_name(name):
    """Return `name` quoted for use as a BQL identifier.

    Identifiers are things like table and column names.  This is NOT
    for string values such as data being inserted into a table; pass
    those as query parameters instead.
    """
    # BQL identifiers are quoted with the sqlite3 name-quoting helper.
    quoted = sqlite3_quote_name(name)
    return quoted
Ejemplo n.º 14
0
def bayesdb_generator_row_values(bdb, generator_id, rowid):
    """Return the row with the given `rowid` from the generator's table.

    Only the generator's columns (as listed by
    `bayesdb_generator_column_names`) are selected, in that order.

    :raises BQLError: if no row, or more than one row, matches `rowid`.
    """
    table_name = bayesdb_generator_table(bdb, generator_id)
    column_names = bayesdb_generator_column_names(bdb, generator_id)
    qt = sqlite3_quote_name(table_name)
    qcns = ','.join(map(sqlite3_quote_name, column_names))
    select_sql = ('SELECT %s FROM %s WHERE _rowid_ = ?' % (qcns, qt))
    cursor = bdb.sql_execute(select_sql, (rowid,))
    try:
        row = cursor.next()
    except StopIteration:
        # NOTE(review): the value reported as the generator is obtained
        # from bayesdb_generator_table, i.e. it is the table name --
        # confirm whether the generator's own name was intended.
        generator = bayesdb_generator_table(bdb, generator_id)
        # Every argument is a repr() string, so all conversions must be
        # %s.  The previous %d conversions raised TypeError here instead
        # of the intended BQLError.
        raise BQLError(bdb, 'No such row in table %s'
            ' for generator %s: %s' %
            (repr(table_name), repr(generator), repr(rowid)))
    # Make sure the rowid identifies exactly one row.
    try:
        cursor.next()
    except StopIteration:
        pass
    else:
        generator = bayesdb_generator_table(bdb, generator_id)
        raise BQLError(bdb, 'More than one such row'
            ' in table %s for generator %s: %s' %
            (repr(table_name), repr(generator), repr(rowid)))
    return row
Ejemplo n.º 15
0
def bql_column_stattypes_and_data(bdb, generator_id, colno0, colno1):
    """Return ``(st0, st1, data0, data1)`` for two generator columns.

    `st0`/`st1` are the columns' statistical types.  `data0`/`data1`
    are parallel lists of their values, taken only from rows of the
    generator's table where both values are non-NULL.
    """
    stattype0 = core.bayesdb_generator_column_stattype(
        bdb, generator_id, colno0)
    stattype1 = core.bayesdb_generator_column_stattype(
        bdb, generator_id, colno1)
    quoted_table = sqlite3_quote_name(
        core.bayesdb_generator_table(bdb, generator_id))
    name0 = core.bayesdb_generator_column_name(bdb, generator_id, colno0)
    name1 = core.bayesdb_generator_column_name(bdb, generator_id, colno1)
    quoted0 = sqlite3_quote_name(name0)
    quoted1 = sqlite3_quote_name(name1)
    pairs_sql = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (quoted0, quoted1, quoted_table, quoted0, quoted1)
    pairs = bdb.sql_execute(pairs_sql).fetchall()
    # Split the (value0, value1) rows into two parallel lists.
    values0 = [pair[0] for pair in pairs]
    values1 = [pair[1] for pair in pairs]
    return (stattype0, stattype1, values0, values1)
Ejemplo n.º 16
0
def bayesdb_population_row_values(bdb, population_id, rowid):
    """Return the row `rowid` of the population's base table.

    Only the population's variables (as listed by
    `bayesdb_variable_names`) are selected, in that order.

    :raises BQLError: if no row, or more than one row, matches `rowid`.
    """
    table_name = bayesdb_population_table(bdb, population_id)
    variable_names = bayesdb_variable_names(bdb, population_id, None)
    quoted_table = sqlite3_quote_name(table_name)
    quoted_columns = ','.join(map(sqlite3_quote_name, variable_names))
    cursor = bdb.sql_execute(
        'SELECT %s FROM %s WHERE oid = ?' % (quoted_columns, quoted_table),
        (rowid,))

    try:
        first_row = cursor.next()
    except StopIteration:
        # NOTE(review): the value reported as the population comes from
        # bayesdb_population_table, i.e. it is the table name.
        population = bayesdb_population_table(bdb, population_id)
        message = 'No such row in table %s for population %s: %d' % (
            repr(table_name), repr(population), rowid)
        raise BQLError(bdb, message)

    # A second row would mean the rowid is not unique.
    try:
        cursor.next()
    except StopIteration:
        return first_row
    population = bayesdb_population_table(bdb, population_id)
    message = 'More than one such row in table %s for population %s: %d' % (
        repr(table_name), repr(population), rowid)
    raise BQLError(bdb, message)
Ejemplo n.º 17
0
def bql_column_stattypes_and_data(bdb, generator_id, colno0, colno1):
    """Return ``(st0, st1, data0, data1)`` for two generator columns.

    `st0`/`st1` are the columns' statistical types.  `data0`/`data1`
    are parallel lists of their values, taken only from rows of the
    generator's table where both values are non-NULL.
    """
    stattype0 = core.bayesdb_generator_column_stattype(
        bdb, generator_id, colno0)
    stattype1 = core.bayesdb_generator_column_stattype(
        bdb, generator_id, colno1)
    quoted_table = sqlite3_quote_name(
        core.bayesdb_generator_table(bdb, generator_id))
    name0 = core.bayesdb_generator_column_name(bdb, generator_id, colno0)
    name1 = core.bayesdb_generator_column_name(bdb, generator_id, colno1)
    quoted0 = sqlite3_quote_name(name0)
    quoted1 = sqlite3_quote_name(name1)
    pairs_sql = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (quoted0, quoted1, quoted_table, quoted0, quoted1)
    pairs = bdb.sql_execute(pairs_sql).fetchall()
    # Split the (value0, value1) rows into two parallel lists.
    values0 = [pair[0] for pair in pairs]
    values1 = [pair[1] for pair in pairs]
    return (stattype0, stattype1, values0, values1)
Ejemplo n.º 18
0
def bayesdb_population_row_values(bdb, population_id, rowid):
    """Return the row `rowid` of the population's base table.

    Only the population's variables (as listed by
    `bayesdb_variable_names`) are selected, in that order.

    :raises BQLError: if no row, or more than one row, matches `rowid`.
    """
    table_name = bayesdb_population_table(bdb, population_id)
    variable_names = bayesdb_variable_names(bdb, population_id, None)
    quoted_table = sqlite3_quote_name(table_name)
    quoted_columns = ','.join(map(sqlite3_quote_name, variable_names))
    cursor = bdb.sql_execute(
        'SELECT %s FROM %s WHERE oid = ?' % (quoted_columns, quoted_table),
        (rowid,))

    try:
        first_row = cursor.next()
    except StopIteration:
        # NOTE(review): the value reported as the population comes from
        # bayesdb_population_table, i.e. it is the table name.
        population = bayesdb_population_table(bdb, population_id)
        message = 'No such row in table %s for population %s: %d' % (
            repr(table_name), repr(population), rowid)
        raise BQLError(bdb, message)

    # A second row would mean the rowid is not unique.
    try:
        cursor.next()
    except StopIteration:
        return first_row
    population = bayesdb_population_table(bdb, population_id)
    message = 'More than one such row in table %s for population %s: %d' % (
        repr(table_name), repr(population), rowid)
    raise BQLError(bdb, message)
Ejemplo n.º 19
0
def bql_variable_stattypes_and_data(bdb, population_id, colno0, colno1):
    """Return ``(st0, st1, data0, data1)`` for two population variables.

    `st0`/`st1` are the variables' statistical types.  `data0`/`data1`
    are parallel lists of their values, taken only from rows of the
    population's table where both values are non-NULL.
    """
    stattype0 = core.bayesdb_variable_stattype(
        bdb, population_id, None, colno0)
    stattype1 = core.bayesdb_variable_stattype(
        bdb, population_id, None, colno1)
    quoted_table = sqlite3_quote_name(
        core.bayesdb_population_table(bdb, population_id))
    name0 = core.bayesdb_variable_name(bdb, population_id, None, colno0)
    name1 = core.bayesdb_variable_name(bdb, population_id, None, colno1)
    quoted0 = sqlite3_quote_name(name0)
    quoted1 = sqlite3_quote_name(name1)
    pairs_sql = '''
        SELECT %s, %s FROM %s WHERE %s IS NOT NULL AND %s IS NOT NULL
    ''' % (quoted0, quoted1, quoted_table, quoted0, quoted1)
    pairs = bdb.sql_execute(pairs_sql).fetchall()
    # Split the (value0, value1) rows into two parallel lists.
    values0 = [pair[0] for pair in pairs]
    values1 = [pair[1] for pair in pairs]
    return (stattype0, stattype1, values0, values1)
Ejemplo n.º 20
0
def create_prior_gen(bdb, target_metamodel, schema, column_names,
                     prior_samples):
    """Build a generator over a freshly created empty table and
    initialize `prior_samples` models for it.

    Returns the generator object produced by `create_temp_gen`.
    """
    empty_table = create_empty_table(bdb, column_names)
    generator = create_temp_gen(bdb, empty_table, target_metamodel, schema)
    quoted_name = sqlite3_quote_name(generator.name)
    initialize_bql = 'INITIALIZE %s MODELS FOR %s' % (
        prior_samples, quoted_name)
    bdb.execute(initialize_bql)
    return generator
Ejemplo n.º 21
0
def create_geweke_chain_gen(bdb, target_metamodel, schema, column_names,
                            target_cells, geweke_samples, geweke_iterates):
    """Build a generator over an empty table and run Geweke chain steps.

    `geweke_samples` models are initialized.  Then, `geweke_iterates`
    times, each model in turn simulates values for `target_cells`,
    has them inserted, is analyzed on its own, and has them removed
    again.  Returns the generator object from `create_temp_gen`.
    """
    empty_table = create_empty_table(bdb, column_names)
    chain_gen = create_temp_gen(bdb, empty_table, target_metamodel, schema)
    quoted_name = sqlite3_quote_name(chain_gen.name)
    bdb.execute('INITIALIZE %s MODELS FOR %s' %
                (geweke_samples, quoted_name))
    for _ in range(geweke_iterates):
        for modelno in range(geweke_samples):
            # Every Geweke chain must simulate its own data.  This is
            # done with model-controlled simulation and inference in a
            # single generator, which relies on insert-remove
            # invariance for the models that are not being analyzed.
            # The alternative, N generators with one model each, was
            # judged messier at the time of writing because the KL
            # computation would have to aggregate across them.
            [simulated] = chain_gen.simulate_joint(target_cells, [],
                                                   modelno=modelno,
                                                   num_predictions=1)
            # Temporarily add the simulated cells ...
            for (cell, value) in zip(target_cells, simulated):
                (rowno, colno) = cell
                chain_gen.insert((rowno, colno, value))
            # ... analyze only this chain's model ...
            chain_gen.analyze_models(modelnos=[modelno])
            # ... and take the same cells back out.
            for (cell, value) in zip(target_cells, simulated):
                (rowno, colno) = cell
                chain_gen.remove((rowno, colno, value))
    return chain_gen
Ejemplo n.º 22
0
def bayesdb_generator_row_values(bdb, generator_id, rowid):
    """Return the row with the given `rowid` from the generator's table.

    Only the generator's columns (as listed by
    `bayesdb_generator_column_names`) are selected, in that order.

    :raises BQLError: if no row, or more than one row, matches `rowid`.
    """
    table_name = bayesdb_generator_table(bdb, generator_id)
    column_names = bayesdb_generator_column_names(bdb, generator_id)
    qt = sqlite3_quote_name(table_name)
    qcns = ','.join(map(sqlite3_quote_name, column_names))
    select_sql = ('SELECT %s FROM %s WHERE _rowid_ = ?' % (qcns, qt))
    cursor = bdb.sql_execute(select_sql, (rowid, ))
    try:
        row = cursor.next()
    except StopIteration:
        # NOTE(review): the value reported as the generator is obtained
        # from bayesdb_generator_table, i.e. it is the table name --
        # confirm whether the generator's own name was intended.
        generator = bayesdb_generator_table(bdb, generator_id)
        # Every argument is a repr() string, so all conversions must be
        # %s.  The previous %d conversions raised TypeError here instead
        # of the intended BQLError.
        raise BQLError(
            bdb, 'No such row in table %s'
            ' for generator %s: %s' %
            (repr(table_name), repr(generator), repr(rowid)))
    # Make sure the rowid identifies exactly one row.
    try:
        cursor.next()
    except StopIteration:
        pass
    else:
        generator = bayesdb_generator_table(bdb, generator_id)
        raise BQLError(
            bdb, 'More than one such row'
            ' in table %s for generator %s: %s' %
            (repr(table_name), repr(generator), repr(rowid)))
    return row
Ejemplo n.º 23
0
def bql_quote_name(name):
    """Return `name` quoted for use as a BQL identifier.

    Identifiers are things like table and column names.  This is NOT
    for string values such as data being inserted into a table; pass
    those as query parameters instead.
    """
    # BQL identifiers are quoted with the sqlite3 name-quoting helper.
    quoted = sqlite3_quote_name(name)
    return quoted
Ejemplo n.º 24
0
def data_suff_stats(bdb, table, column_name):
    """Return ``(count, sum, sum of squares)`` of a column's values.

    Every value of `column_name` in `table` is read and accumulated in
    Python; this is the bulk counterpart of incorporate/remove.
    """
    quoted_table = sqlite3_quote_name(table)
    quoted_column = sqlite3_quote_name(column_name)
    # TODO Do this computation inside the database?
    select_sql = '''
        SELECT %s FROM %s
    ''' % (quoted_column, quoted_table)
    n = 0
    total = 0
    total_of_squares = 0
    for row in bdb.sql_execute(select_sql):
        (value,) = row
        n = n + 1
        total = total + value
        total_of_squares = total_of_squares + value * value
    return (n, total, total_of_squares)
Ejemplo n.º 25
0
def bayesdb_generator_fresh_row_id(bdb, generator_id):
    """Return one plus the maximum rowid in the generator's table.

    When ``MAX(_rowid_)`` is NULL the result is 1.
    """
    quoted_table = sqlite3_quote_name(
        bayesdb_generator_table(bdb, generator_id))
    max_sql = 'SELECT MAX(_rowid_) FROM %s' % (quoted_table, )
    largest = cursor_value(bdb.sql_execute(max_sql))
    # One past the current maximum is a rowid that does not exist yet.
    return (0 if largest is None else largest) + 1
Ejemplo n.º 26
0
def bayesdb_generator_cell_value(bdb, generator_id, rowid, colno):
    """Return the value at (`rowid`, `colno`) in the generator's table.

    :raises BQLError: if the table has no row with the given rowid.
    """
    table_name = bayesdb_generator_table(bdb, generator_id)
    column_name = bayesdb_generator_column_name(bdb, generator_id, colno)
    quoted_table = sqlite3_quote_name(table_name)
    quoted_column = sqlite3_quote_name(column_name)
    cell_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (
        quoted_column, quoted_table)
    cursor = bdb.sql_execute(cell_sql, (rowid, ))
    try:
        row = cursor.next()
    except StopIteration:
        generator = bayesdb_generator_name(bdb, generator_id)
        raise BQLError(bdb, 'No such row in %s: %d' % (repr(generator), rowid))
    # Exactly one column was selected.
    assert len(row) == 1
    return row[0]
Ejemplo n.º 27
0
def bayesdb_create_legacy_generator(bdb, generator, table, column_stattypes):
    """Create a crosscat generator for `table` from a stattype mapping.

    `column_stattypes` maps column names to statistical types.  Every
    type must be in `allowed_column_stattypes` (checked by assertion),
    and it is indexed by the casefolded name of each table column when
    the schema is built.

    :raises IOError: if a key of `column_stattypes` is not among the
        casefolded column names of `table`.
    """
    table_columns = core.bayesdb_table_column_names(bdb, table)
    quoted_columns = map(sqlite3_quote_name, table_columns)
    assert all(stattype in allowed_column_stattypes
               for stattype in column_stattypes.values())

    # Reject stattypes given for columns the table does not have.
    known_columns = set(casefold(column) for column in table_columns)
    for name in column_stattypes:
        if name not in known_columns:
            raise IOError('No such column in table %s: %s' %
                          (repr(table), repr(name)))

    # One "<quoted column> <stattype>" entry per table column.
    schema_entries = [
        '%s %s' % (quoted, column_stattypes[casefold(column)])
        for column, quoted in zip(table_columns, quoted_columns)
    ]
    create_bql = 'CREATE GENERATOR %s FOR %s USING %s(%s)' % (
        sqlite3_quote_name(generator), sqlite3_quote_name(table),
        'crosscat', ','.join(schema_entries))
    bdb.execute(create_bql)
Ejemplo n.º 28
0
def bayesdb_create_legacy_generator(bdb, generator, table, column_stattypes):
    """Create a crosscat generator for `table` from a stattype mapping.

    `column_stattypes` maps column names to statistical types.  Every
    type must be in `allowed_column_stattypes` (checked by assertion),
    and it is indexed by the casefolded name of each table column when
    the schema is built.

    :raises IOError: if a key of `column_stattypes` is not among the
        casefolded column names of `table`.
    """
    table_columns = core.bayesdb_table_column_names(bdb, table)
    quoted_columns = map(sqlite3_quote_name, table_columns)
    assert all(stattype in allowed_column_stattypes
        for stattype in column_stattypes.values())

    # Reject stattypes given for columns the table does not have.
    known_columns = set(casefold(column) for column in table_columns)
    for name in column_stattypes:
        if name not in known_columns:
            raise IOError('No such column in table %s: %s' %
                (repr(table), repr(name)))

    # One "<quoted column> <stattype>" entry per table column.
    schema_entries = [
        '%s %s' % (quoted, column_stattypes[casefold(column)])
        for column, quoted in zip(table_columns, quoted_columns)
    ]
    create_bql = 'CREATE GENERATOR %s FOR %s USING %s(%s)' % (
        sqlite3_quote_name(generator), sqlite3_quote_name(table),
        'crosscat', ','.join(schema_entries))
    bdb.execute(create_bql)
Ejemplo n.º 29
0
def data_suff_stats(bdb, table, column_name):
    """Return ``(count, sum, sum of squares)`` of a column's values.

    Every value of `column_name` in `table` is read and accumulated in
    Python; this is the bulk counterpart of incorporate/remove.
    """
    quoted_table = sqlite3_quote_name(table)
    quoted_column = sqlite3_quote_name(column_name)
    # TODO Do this computation inside the database?
    select_sql = '''
        SELECT %s FROM %s
    ''' % (quoted_column, quoted_table)
    n = 0
    total = 0
    total_of_squares = 0
    for row in bdb.sql_execute(select_sql):
        (value,) = row
        n = n + 1
        total = total + value
        total_of_squares = total_of_squares + value * value
    return (n, total, total_of_squares)
Ejemplo n.º 30
0
def bayesdb_generator_fresh_row_id(bdb, generator_id):
    """Return one plus the maximum rowid in the generator's table.

    When ``MAX(_rowid_)`` is NULL the result is 1.
    """
    quoted_table = sqlite3_quote_name(
        bayesdb_generator_table(bdb, generator_id))
    max_sql = 'SELECT MAX(_rowid_) FROM %s' % (quoted_table,)
    largest = cursor_value(bdb.sql_execute(max_sql))
    # One past the current maximum is a rowid that does not exist yet.
    return (0 if largest is None else largest) + 1
Ejemplo n.º 31
0
def bayesdb_population_fresh_row_id(bdb, population_id):
    """Return one plus the maximum rowid in the population's base table.

    When ``MAX(_rowid_)`` is NULL the result is 1.
    """
    quoted_table = sqlite3_quote_name(
        bayesdb_population_table(bdb, population_id))
    max_sql = 'SELECT MAX(_rowid_) FROM %s' % (quoted_table,)
    largest = cursor_value(bdb.sql_execute(max_sql))
    # One past the current maximum is a rowid that does not exist yet.
    return (0 if largest is None else largest) + 1
Ejemplo n.º 32
0
def bayesdb_generator_cell_value(bdb, generator_id, rowid, colno):
    """Return the value at (`rowid`, `colno`) in the generator's table.

    :raises BQLError: if the table has no row with the given rowid.
    """
    table_name = bayesdb_generator_table(bdb, generator_id)
    column_name = bayesdb_generator_column_name(bdb, generator_id, colno)
    quoted_table = sqlite3_quote_name(table_name)
    quoted_column = sqlite3_quote_name(column_name)
    cell_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (
        quoted_column, quoted_table)
    cursor = bdb.sql_execute(cell_sql, (rowid,))
    try:
        row = cursor.next()
    except StopIteration:
        generator = bayesdb_generator_name(bdb, generator_id)
        raise BQLError(bdb, 'No such row in %s: %d' %
            (repr(generator), rowid))
    # Exactly one column was selected.
    assert len(row) == 1
    return row[0]
Ejemplo n.º 33
0
def bayesdb_population_fresh_row_id(bdb, population_id):
    """Synthesize a rowid absent from the base table of `population_id`.

    The value is the table's maximum ``_rowid_`` plus one; a maximum of
    ``None`` is treated as zero.
    """
    base_table = bayesdb_population_table(bdb, population_id)
    max_sql = 'SELECT MAX(_rowid_) FROM %s' % (
        sqlite3_quote_name(base_table),)
    highest = cursor_value(bdb.sql_execute(max_sql))
    if highest is None:
        return 1
    return highest + 1
Ejemplo n.º 34
0
def bayesdb_table_has_rowid(bdb, table, rowid):
    """Tell whether the table named `table` holds a record with `rowid`.

    `table` must already exist in `bdb`; check with
    :func:`bayesdb_has_table` first when in doubt.
    """
    count_sql = 'SELECT COUNT(*) FROM %s WHERE oid = ?' % (
        sqlite3_quote_name(table),)
    matches = cursor_value(bdb.sql_execute(count_sql, (rowid,)))
    return matches != 0
Ejemplo n.º 35
0
def rename_table(bdb, old, new):
    """Rename SQL table `old` to `new` and update metadata that names it.

    `old` must exist and `new` must not; both preconditions are checked
    by assertion only, so callers are expected to validate beforehand.
    """
    assert core.bayesdb_has_table(bdb, old)
    assert not core.bayesdb_has_table(bdb, new)

    # Rename the underlying SQL table itself.
    alter_sql = 'ALTER TABLE %s RENAME TO %s' % (
        sqlite3_quote_name(old), sqlite3_quote_name(new))
    bdb.sql_execute(alter_sql)

    # Repoint the rows in bayesdb_column, then in bayesdb_population,
    # that refer to the table by its old name.
    retarget_columns_sql = '''
        UPDATE bayesdb_column SET tabname = ? WHERE tabname = ?
    '''
    retarget_populations_sql = '''
        UPDATE bayesdb_population SET tabname = ? WHERE tabname = ?
    '''
    names = (new, old)
    for statement in (retarget_columns_sql, retarget_populations_sql):
        bdb.sql_execute(statement, names)
Ejemplo n.º 36
0
def bayesdb_table_has_rowid(bdb, table, rowid):
    """Return True when a record with `rowid` exists in `table`.

    The table named `table` must be present in `bdb`.  If that is not
    certain, call :func:`bayesdb_has_table` beforehand.
    """
    quoted_table = sqlite3_quote_name(table)
    template = 'SELECT COUNT(*) FROM %s WHERE oid = ?'
    found = cursor_value(
        bdb.sql_execute(template % (quoted_table,), (rowid,)))
    return not (found == 0)
Ejemplo n.º 37
0
def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False):
    """Read data from a pandas dataframe into a table.

    The dataframe's index is stored in a ``_rowid_`` column, so `df`
    itself must not have a column of that name.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true and `table`
        exists, read data into it anyway
    :raises ValueError: if `ifnotexists` is given without `create`; if
        `df` has a ``_rowid_`` column; if `table` exists but `create` is
        true and `ifnotexists` is false; if `df` has columns unknown to
        an existing table; or if `table` does not exist and `create` is
        false.
    """
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    # XXX Whattakludge!
    idxcol = '_rowid_'
    if idxcol in df.columns:
        raise ValueError('Column `_rowid_\' is not allowed.')
    # Materialize df.columns as a list before prepending: adding a list
    # to a pandas Index is arithmetic on the Index, not concatenation.
    column_names = [idxcol] + list(df.columns)
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(name for name in df.columns
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
        elif create:
            schema = ','.join('%s NUMERIC' % (sqlite3_quote_name(name),)
                for name in column_names)
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        qt = sqlite3_quote_name(table)
        # Build a real list: the quoted names are iterated twice below.
        qcns = [sqlite3_quote_name(name) for name in column_names]
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        # to_records() puts the index first, lining up with idxcol.
        for row in df.to_records():
            bdb.sql_execute(sql, row)
Ejemplo n.º 38
0
def table_to_df(bdb, table_name, column_names=None):
    """Fetch a table's contents as a pandas DataFrame.

    All columns are selected unless `column_names` is given, in which
    case only those columns are fetched.
    """
    quoted_table = sqlite3_quote_name(table_name)
    if column_names is None:
        selection = '*'
    else:
        selection = ','.join(
            sqlite3_quote_name(name) for name in column_names)
    query = 'SELECT %s FROM %s' % (selection, quoted_table)
    return cursor_to_df(bdb.sql_execute(query))
Ejemplo n.º 39
0
def bayesdb_population_cell_value(bdb, population_id, rowid, colno):
    """Return the value at (`rowid`, `colno`) in a population's base table.

    Returns None for a negative `colno`, since latent variables do not
    appear in the table.

    Raises BQLError if the base table has no row with the given `rowid`.
    """
    if colno < 0:
        # Latent variables do not appear in the table.
        return None
    table_name = bayesdb_population_table(bdb, population_id)
    var = bayesdb_variable_name(bdb, population_id, colno)
    qt = sqlite3_quote_name(table_name)
    qv = sqlite3_quote_name(var)
    value_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qv, qt)
    value_cursor = bdb.sql_execute(value_sql, (rowid, ))
    try:
        # The builtin next() works on any iterator, unlike the
        # Python 2-only .next() method.
        row = next(value_cursor)
    except StopIteration:
        population = bayesdb_population_name(bdb, population_id)
        raise BQLError(
            bdb,
            'No such individual in population %r: %d' % (population, rowid))
    # Exactly one column was selected.
    assert len(row) == 1
    return row[0]
Ejemplo n.º 40
0
def table_to_df(bdb, table_name, column_names=None):
    """Load the given table into a pandas DataFrame.

    Pass `column_names` to restrict the result to those columns;
    otherwise every column is fetched.
    """
    quoted_table = sqlite3_quote_name(table_name)
    wanted = (
        "*" if column_names is None
        else ",".join([sqlite3_quote_name(name) for name in column_names])
    )
    cursor = bdb.sql_execute("SELECT %s FROM %s" % (wanted, quoted_table))
    return cursor_to_df(cursor)
Ejemplo n.º 41
0
def bayesdb_has_table(bdb, name):
    """True if there is a table named `name` in `bdb`.

    The table need not be modeled.  Existence is decided by whether
    ``PRAGMA table_info`` yields at least one row for `name`.
    """
    qt = sqlite3_quote_name(name)
    cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt,))
    try:
        # The builtin next() works on any iterator, unlike the
        # Python 2-only .next() method.
        next(cursor)
    except StopIteration:
        return False
    return True
Ejemplo n.º 42
0
def create_empty_table(bdb, column_names):
    """Make a new, empty table whose columns are `column_names`.

    Every column is declared NUMERIC in the underlying SQL.  The table
    name comes from ``bdb.temp_table_name()`` and is returned.
    """
    table = bdb.temp_table_name()
    quoted_table = sqlite3_quote_name(table)
    column_defs = [
        '%s NUMERIC' % (sqlite3_quote_name(name),)
        for name in column_names
    ]
    create_sql = 'CREATE TABLE %s(%s)' % (quoted_table, ','.join(column_defs))
    bdb.sql_execute(create_sql)
    # Record the new table's columns in bayesdb's own bookkeeping.
    core.bayesdb_table_guarantee_columns(bdb, table)
    return table
Ejemplo n.º 43
0
def bayesdb_has_table(bdb, name):
    """True if there is a table named `name` in `bdb`.

    The table need not be modelled.  Existence is decided by whether
    ``PRAGMA table_info`` yields at least one row for `name`.
    """
    qt = sqlite3_quote_name(name)
    cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt,))
    try:
        # The builtin next() works on any iterator, unlike the
        # Python 2-only .next() method.
        next(cursor)
    except StopIteration:
        return False
    return True
Ejemplo n.º 44
0
def bayesdb_population_cell_value(bdb, population_id, rowid, colno):
    """Return value stored in `rowid` and `colno` of given `population_id`.

    Returns None for a negative `colno`, since latent variables do not
    appear in the table.

    Raises BQLError if the base table has no row with the given `rowid`.
    """
    if colno < 0:
        # Latent variables do not appear in the table.
        return None
    table_name = bayesdb_population_table(bdb, population_id)
    var = bayesdb_variable_name(bdb, population_id, None, colno)
    qt = sqlite3_quote_name(table_name)
    qv = sqlite3_quote_name(var)
    value_sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qv, qt)
    value_cursor = bdb.sql_execute(value_sql, (rowid,))
    try:
        # The builtin next() works on any iterator, unlike the
        # Python 2-only .next() method.
        row = next(value_cursor)
    except StopIteration:
        population = bayesdb_population_name(bdb, population_id)
        raise BQLError(bdb, 'No such individual in population %r: %d'
            % (population, rowid))
    # Exactly one column was selected.
    assert len(row) == 1
    return row[0]
Ejemplo n.º 45
0
def create_empty_table(bdb, column_names):
    """Create and return the name of a new empty table.

    The table gets one NUMERIC column per entry of `column_names`; its
    name is obtained from ``bdb.temp_table_name()``.
    """
    new_table = bdb.temp_table_name()
    quoted_table = sqlite3_quote_name(new_table)
    declarations = []
    for column_name in column_names:
        declarations.append(
            '%s NUMERIC' % (sqlite3_quote_name(column_name), ))
    bdb.sql_execute(
        'CREATE TABLE %s(%s)' % (quoted_table, ','.join(declarations)))
    # Record the new table's columns in bayesdb's own bookkeeping.
    core.bayesdb_table_guarantee_columns(bdb, new_table)
    return new_table
Ejemplo n.º 46
0
    def _data(self, bdb, generator_id, vars):
        """Return the generator's rows for `vars`, mapped to numeric codes.

        Each variable is selected from the generator's table, cast to
        the SQL affinity of its statistical type (or NULL when its
        column number is negative), for the rows listed in
        ``bayesdb_cgpm_individual`` for `generator_id`, in ascending
        rowid order.  Every value is then passed through
        ``self._to_numeric``.  The result is a list of tuples, one per
        row.
        """
        population_id = core.bayesdb_generator_population(bdb, generator_id)

        # Resolve each variable to its column number, then each column
        # number to its statistical type.
        colnos = []
        for var in vars:
            colnos.append(core.bayesdb_variable_number(
                bdb, population_id, generator_id, var))
        stattypes = []
        for colno in colnos:
            stattypes.append(
                core.bayesdb_variable_stattype(bdb, population_id, colno))

        # Quote the table name for use in the SQL text.
        quoted_table = sqlite3_quote_name(
            core.bayesdb_generator_table(bdb, generator_id))

        # One select term per variable: a CAST to the stattype's
        # affinity, or NULL for a negative column number.
        select_terms = []
        for var, colno, stattype in zip(vars, colnos, stattypes):
            if colno < 0:
                select_terms.append('NULL')
                continue
            quoted_var = sqlite3_quote_name(var)
            affinity = core.bayesdb_stattype_affinity(bdb, stattype)
            quoted_affinity = sqlite3_quote_name(affinity)
            select_terms.append(
                'CAST(t.%s AS %s)' % (quoted_var, quoted_affinity))
        select_list = ','.join(select_terms)

        # Run the query over the rows registered for this generator.
        select_sql = '''
            SELECT %s FROM %s AS t, bayesdb_cgpm_individual AS ci
                WHERE ci.generator_id = ?
                    AND ci.table_rowid = t._rowid_
            ORDER BY t._rowid_ ASC
        ''' % (select_list, quoted_table)
        cursor = bdb.sql_execute(select_sql, (generator_id, ))

        # Convert every cell to its numeric code.
        coded_rows = []
        for row in cursor:
            coded_rows.append(tuple(
                self._to_numeric(bdb, generator_id, colno, value)
                for colno, value in zip(colnos, row)))
        return coded_rows
Ejemplo n.º 47
0
def bayesdb_table_guarantee_columns(bdb, table):
    """Ensure ``bayesdb_column`` has an entry for each column of `table`.

    `table` must exist in `bdb`; use :func:`bayesdb_has_table` to check
    beforehand.  Raises ValueError if ``PRAGMA table_info`` reports no
    columns for `table`.
    """
    with bdb.savepoint():
        quoted_table = sqlite3_quote_name(table)
        insert_column_sql = '''
            INSERT OR IGNORE INTO bayesdb_column (tabname, colno, name)
                VALUES (?, ?, ?)
        '''
        saw_column = False
        columns = bdb.sql_execute('PRAGMA table_info(%s)' % (quoted_table,))
        for colno, name, _sqltype, _notnull, _default, _primary_key in columns:
            saw_column = True
            # OR IGNORE leaves already-recorded columns untouched.
            bdb.sql_execute(insert_column_sql, (table, colno, name))
        if not saw_column:
            raise ValueError('No such table: %s' % (repr(table),))
Ejemplo n.º 48
0
def bayesdb_table_guarantee_columns(bdb, table):
    """Populate ``bayesdb_column`` with the columns of `table`.

    The table named `table` must be present in `bdb`.  If that is not
    certain, call :func:`bayesdb_has_table` first.  A ValueError is
    raised when ``PRAGMA table_info`` returns no rows for `table`.
    """
    with bdb.savepoint():
        pragma_sql = 'PRAGMA table_info(%s)' % (sqlite3_quote_name(table),)
        insert_column_sql = '''
            INSERT OR IGNORE INTO bayesdb_column (tabname, colno, name)
                VALUES (?, ?, ?)
        '''
        ncolumns = 0
        for info in bdb.sql_execute(pragma_sql):
            ncolumns = ncolumns + 1
            # Each table_info row must have exactly these six fields.
            (colno, name, _sqltype, _notnull, _default, _primary_key) = info
            bdb.sql_execute(insert_column_sql, (table, colno, name))
        if not ncolumns:
            raise ValueError('No such table: %s' % (repr(table),))
Ejemplo n.º 49
0
def create_geweke_chain_gen(bdb, target_metamodel, schema, column_names,
                            target_cells, geweke_samples, geweke_iterates):
    """Create a temporary generator and advance its Geweke chains.

    An empty table with `column_names` backs a temporary generator for
    `target_metamodel` and `schema`, initialized with `geweke_samples`
    models.  Each of the `geweke_iterates` sweeps advances every model
    once.  Returns the generator.
    """
    table = create_empty_table(bdb, column_names)
    chain_gen = create_temp_gen(bdb, table, target_metamodel, schema)
    init_bql = 'INITIALIZE %s MODELS FOR %s' % (
        geweke_samples, sqlite3_quote_name(chain_gen.name))
    bdb.execute(init_bql)

    def advance_chain(modelno):
        # Every Geweke chain has to hallucinate its own data.  This is
        # done with model-controlled simulation and inference inside a
        # single generator, which relies on insert-remove invariance
        # for the models that are not being analyzed.  The alternative,
        # N generators of one model each, would be messier because the
        # KL computation would have to aggregate them.
        [simulated] = chain_gen.simulate_joint(
            target_cells, [], modelno=modelno, num_predictions=1)
        cells = [
            (i, j, datum)
            for ((i, j), datum) in zip(target_cells, simulated)
        ]
        for cell in cells:
            chain_gen.insert(cell)
        chain_gen.analyze_models(modelnos=[modelno])
        for cell in cells:
            chain_gen.remove(cell)

    for _sweep in range(geweke_iterates):
        for modelno in range(geweke_samples):
            advance_chain(modelno)
    return chain_gen
Ejemplo n.º 50
0
    def predict_confidence(self,
                           bdb,
                           generator_id,
                           modelno,
                           colno,
                           rowid,
                           numsamples=None):
        """Impute the cell (`rowid`, `colno`) and report a confidence.

        Draws `numsamples` joint samples of the cell (2 when `numsamples`
        is None or zero) and summarizes them according to the variable's
        statistical type:

        - categorical: the most frequent sampled value, with confidence
          equal to its count divided by `numsamples`;
        - otherwise: the mean of the samples, with confidence 0.

        When `rowid` exists in the base table but is greater than every
        rowid registered for the generator in ``bayesdb_cgpm_individual``,
        the other population variables of that row are passed to the
        simulation as constraints.

        Returns a ``(prediction, confidence)`` pair.
        """
        if not numsamples:
            numsamples = 2
        assert numsamples > 0

        def _impute_categorical(sample):
            # Mode of the sampled values.  Use the builtin next() rather
            # than the Python 2-only .next() method to take the first
            # value that reaches the modal count.
            counts = Counter(s[0] for s in sample)
            mode_count = max(counts[v] for v in counts)
            pred = next(v for v in counts if counts[v] == mode_count)
            conf = float(mode_count) / numsamples
            return pred, conf

        def _impute_numerical(sample):
            # Mean of the sampled values.
            pred = sum(s[0] for s in sample) / float(len(sample))
            conf = 0  # XXX Punt confidence for now
            return pred, conf

        constraints = []
        # If rowid is a hypothetical cell for cgpm (did not exist at the time
        # of INITIALIZE), but exists in the base table (by INSERT INTO), then
        # retrieve all values for rowid as the constraints.
        exists = rowid < core.bayesdb_generator_fresh_row_id(bdb, generator_id)
        max_cgpm_rowid = bdb.sql_execute(
            '''
            SELECT MAX(table_rowid) FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id, )).fetchall()[0][0]
        # MAX() is NULL (None) when no rows are registered; every rowid
        # then counts as hypothetical.  Python 2 ordered None below all
        # integers implicitly; spell it out so the comparison does not
        # depend on that.
        hypothetical = max_cgpm_rowid is None or rowid > max_cgpm_rowid
        if exists and hypothetical:
            population_id = core.bayesdb_generator_population(
                bdb, generator_id)
            # Retrieve all other variables except colno, and ignore latents in
            # generator_id, and place them in the constraints.
            pop_names = core.bayesdb_variable_names(bdb, population_id, None)
            avoid_name = core.bayesdb_variable_name(bdb, population_id, colno)
            constraints_names = [n for n in pop_names if n != avoid_name]
            # Obtain the row.
            qt_names = str.join(',', map(sqlite3_quote_name,
                                         constraints_names))
            qt_table = sqlite3_quote_name(
                core.bayesdb_population_table(bdb, population_id))
            data = bdb.sql_execute(
                '''
                SELECT %s FROM %s WHERE oid = ?
            ''' % (
                    qt_names,
                    qt_table,
                ), (rowid, )).fetchall()[0]
            # Build the constraints.
            pop_nos = core.bayesdb_variable_numbers(bdb, population_id, None)
            constraints_nos = [n for n in pop_nos if n != colno]
            assert len(data) == len(constraints_nos)
            # NOTE(review): the truthiness test drops falsy values such
            # as 0 and '' in addition to None -- confirm that excluding
            # a genuine 0 from the constraints is intended.
            constraints = [(rowid, c, v)
                           for c, v in zip(constraints_nos, data)
                           if (v is not None) and v]

        # Retrieve the samples.
        sample = self.simulate_joint(bdb, generator_id, [(rowid, colno)],
                                     constraints, modelno, numsamples)

        # Determine the imputation strategy (mode or mean).
        stattype = core.bayesdb_variable_stattype(
            bdb, core.bayesdb_generator_population(bdb, generator_id), colno)
        if _is_categorical(stattype):
            return _impute_categorical(sample)
        else:
            return _impute_numerical(sample)
Ejemplo n.º 51
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = "TEMP " if phrase.temp else ""
            ifnotexists = "IF NOT EXISTS " if phrase.ifnotexists else ""
            out.write("CREATE %sTABLE %s%s AS " % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as generator: %s" % (repr(phrase.name),))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(bdb, "Name already defined as table: %s" % (repr(phrase.name),))
            if not core.bayesdb_has_generator_default(bdb, phrase.simulation.generator):
                raise BQLError(bdb, "No such generator: %s" % (phrase.simulation.generator,))
            generator_id = core.bayesdb_get_generator_default(bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            cursor = bdb.sql_execute("PRAGMA table_info(%s)" % (qt,))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table), repr(column_name)),
                    )
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb,
                        "No such column"
                        " in generator %s table %s: %s"
                        % (repr(phrase.simulation.generator), repr(table), repr(column_name)),
                    )
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write("SELECT ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb, phrase.simulation.nsamples, out)
            out.write(", ")
            with compiler.compiling_paren(bdb, out, "CAST(", " AS INTEGER)"):
                compiler.compile_nobql_expression(bdb, phrase.simulation.modelno, out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(", ")
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(), out.getbindings()).fetchall()
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = [
                (core.bayesdb_generator_column_number(bdb, generator_id, name), value)
                for (name, _expression), value in zip(phrase.simulation.constraints, cursor[0][2:])
            ]
            colnos = [core.bayesdb_generator_column_number(bdb, generator_id, name) for name in column_names]
            bdb.sql_execute(
                "CREATE %sTABLE %s%s (%s)"
                % (
                    "TEMP " if phrase.temp else "",
                    "IF NOT EXISTS " if phrase.ifnotexists else "",
                    qn,
                    ",".join(
                        "%s %s" % (qcn, column_sqltypes[casefold(column_name)])
                        for qcn, column_name in zip(qcns, column_names)
                    ),
                )
            )
            insert_sql = """
                INSERT INTO %s (%s) VALUES (%s)
            """ % (
                qn,
                ",".join(qcns),
                ",".join("?" for qcn in qcns),
            )
            for row in bqlfn.bayesdb_simulate(
                bdb, generator_id, constraints, colnos, modelno=modelno, numpredictions=nsamples
            ):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = "SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?"
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(bdb, "Table still in use by generators: %s" % (repr(phrase.name),))
            bdb.sql_execute("DELETE FROM bayesdb_column WHERE tabname = ?", (phrase.name,))
            ifexists = "IF EXISTS " if phrase.ifexists else ""
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute("DROP TABLE %s%s" % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, "No such table: %s" % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + "_temp"
                        while core.bayesdb_has_table(bdb, temp) or core.bayesdb_has_generator(bdb, temp):
                            temp += "_temp"
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined as table" ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined" " as generator: %s" % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError("Renaming columns" " not yet implemented.")
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table, cmd.old):
                            raise BQLError(bdb, "No such column in table %s" ": %s" % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, "Column already exists" " in table %s: %s" % (repr(table), repr(cmd.new))
                            )
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = """
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_column_sql, {"table": table, "old": cmd.old, "new": cmd.new})
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = """
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        """
                        cursor = bdb.sql_execute(generators_sql, (table,))
                        for (generator_id,) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id, old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(bdb, "No such generator: %s" % (repr(cmd.generator),))
                    generator_id = core.bayesdb_get_generator(bdb, cmd.generator)
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                    set_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(set_default_sql, (generator_id,))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = """
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(unset_default_sql, (table,))
                    assert bdb.sqlite3.total_changes - total_changes in (0, 1)
                else:
                    assert False, "Invalid alter table command: %s" % (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(bdb, "No such metamodel: %s" % (repr(phrase.metamodel),))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():

            def instantiate(columns):
                return instantiate_generator(
                    bdb,
                    phrase.name,
                    phrase.table,
                    metamodel,
                    columns,
                    ifnotexists=phrase.ifnotexists,
                    default=phrase.default,
                )

            metamodel.create_generator(bdb, phrase.table, phrase.schema, instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, "No such generator: %s" % (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = """
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            """
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = """
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            """
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = """
                DELETE FROM bayesdb_generator WHERE id = ?
            """
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb, "No such generator: %s" % (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined as table" ": %s" % (repr(cmd.name),))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(bdb, "Name already defined" " as generator: %s" % (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = """
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    """
                    total_changes = bdb.sqlite3.total_changes
                    bdb.sql_execute(update_generator_sql, (cmd.name, generator_id))
                    assert bdb.sqlite3.total_changes - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, "Invalid ALTER GENERATOR command: %s" % (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(
                    modelno for modelno in modelnos if not core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                )
            else:
                existing = set(
                    modelno for modelno in modelnos if core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                )
                if 0 < len(existing):
                    raise BQLError(
                        bdb, "Generator %s already has models: %s" % (repr(phrase.generator), sorted(existing))
                    )

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = """
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            """
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {"generator_id": generator_id, "modelno": modelno, "iterations": 0})

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos, model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError("No background analysis -- use WAIT.")
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warnings somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, "No such generator: %s" % (phrase.generator,))
        generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(
            bdb,
            generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
        )
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = """
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {"generator_id": generator_id, "modelno": modelno})
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, "No such model" " in generator %s: %s" % (repr(phrase.generator), repr(modelno))
                        )
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = """
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                """
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = """
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                """
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {"generator_id": generator_id, "modelno": modelno})
        return empty_cursor(bdb)

    assert False  # XXX
Ejemplo n.º 52
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results."""
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
            out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            if core.bayesdb_has_table(bdb, phrase.name):
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb,
                        'Name already defined as table: %s' %
                        (repr(phrase.name),))
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabCsv):
        with bdb.savepoint():
            table_exists = core.bayesdb_has_table(bdb, phrase.name)
            if table_exists:
                if phrase.ifnotexists:
                    return empty_cursor(bdb)
                else:
                    raise BQLError(bdb, 'Table already exists: %s' %
                        (repr(phrase.name),))
            bayesdb_read_csv_file(
                bdb, phrase.name, phrase.csv, header=True, create=True)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            sql = 'SELECT COUNT(*) FROM bayesdb_population WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name,))
            if 0 < cursor_value(cursor):
                raise BQLError(bdb, 'Table still in use by populations: %s' %
                    (repr(phrase.name),))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                (phrase.name,))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table),))
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as table: %s'
                                % (repr(cmd.name),))
                        rename_table(bdb, table, cmd.name)
                    # If table has implicit population, rename it too.
                    if core.bayesdb_table_has_implicit_population(
                                bdb, cmd.name):
                        populations = \
                            core.bayesdb_table_populations(bdb, cmd.name)
                        assert len(populations) == 1
                        population_name = core.bayesdb_population_name(
                            bdb, populations[0])
                        qt = sqlite3_quote_name(cmd.name)
                        qp = sqlite3_quote_name(population_name)
                        bdb.execute('ALTER POPULATION %s RENAME TO %s'
                            % (qp, qt))
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                        ' not yet implemented.')
                    # Make sure the old name exists and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(bdb, table,
                                cmd.old):
                            raise BQLError(bdb, 'No such column in table %s'
                                ': %s' %
                                (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except backends may have the (case-folded) name cached.
                    if old_folded != new_folded:
                        populations_sql = '''
                            SELECT id FROM bayesdb_population WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(populations_sql, (table,))
                        generators = [
                            core.bayesdb_population_generators(
                                bdb, population_id)
                            for (population_id,) in cursor
                        ]
                        for generator_id in set(generators):
                            backend = core.bayesdb_generator_backend(bdb,
                                generator_id)
                            backend.rename_column(bdb, generator_id,
                                old_folded, new_folded)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.GuessSchema):
        if not core.bayesdb_has_table(bdb, phrase.table):
            raise BQLError(bdb, 'No such table : %s' % phrase.table)
        out = compiler.Output(0, {}, {})
        with bdb.savepoint():
            qt = sqlite3_quote_name(phrase.table)
            temptable = bdb.temp_table_name()
            qtt = sqlite3_quote_name(temptable)
            cursor = bdb.sql_execute('SELECT * FROM %s' % (qt,))
            column_names = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
            stattypes = bayesdb_guess_stattypes(column_names, rows)
            distinct_value_counts = [
                len(set([row[i] for row in rows]))
                for i in range(len(column_names))
            ]
            out.winder('''
                CREATE TEMP TABLE %s (
                    column TEXT,
                    stattype TEXT,
                    num_distinct INTEGER,
                    reason TEXT
                )
            ''' % (qtt,), ())
            for cn, st, ct in zip(column_names, stattypes, distinct_value_counts):
                out.winder('''
                    INSERT INTO %s VALUES (?, ?, ?, ?)
                ''' % (qtt), (cn, st[0], ct, st[1]))
            out.write('SELECT * FROM %s' % (qtt,))
            out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    if isinstance(phrase, ast.CreatePop):
        with bdb.savepoint():
            _create_population(bdb, phrase)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropPop):
        with bdb.savepoint():
            if not core.bayesdb_has_population(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such population: %r' % (phrase.name,))
            population_id = core.bayesdb_get_population(bdb, phrase.name)
            generator_ids = core.bayesdb_population_generators(
                bdb, population_id)
            if generator_ids:
                generators = [core.bayesdb_generator_name(bdb, gid)
                    for gid in generator_ids]
                raise BQLError(bdb, 'Population %r still has generators: %r' %
                    (phrase.name, generators))
            # XXX helpful error checking if generators still exist
            # XXX check change counts
            bdb.sql_execute('''
                DELETE FROM bayesdb_variable WHERE population_id = ?
            ''', (population_id,))
            bdb.sql_execute('''
                DELETE FROM bayesdb_population WHERE id = ?
            ''', (population_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterPop):
        with bdb.savepoint():
            population = phrase.population
            if not core.bayesdb_has_population(bdb, population):
                raise BQLError(bdb, 'No such population: %s' %
                    (repr(population),))
            population_id = core.bayesdb_get_population(bdb, population)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterPopRenamePop):
                    table = core.bayesdb_population_table(bdb, population_id)
                    # Prevent renaming of implicit population directly, unless
                    # being called by ast.AlterTabRenameTab in which case the
                    # table name and population name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, population_id) \
                            and casefold(population) == casefold(table):
                        raise BQLError(bdb, 'Cannot rename implicit'
                            'population %s; rename base table instead'
                            % (population,))
                    # Make sure nothing else has this name.
                    if casefold(population) != casefold(cmd.name):
                        if core.bayesdb_has_population(bdb, cmd.name):
                            raise BQLError(bdb,
                                'Name already defined as population' ': %s'
                                % (repr(cmd.name),))
                    # Update bayesdb_population.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_population SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, population_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # If population has implicit generator, rename it too.
                    if core.bayesdb_population_has_implicit_generator(
                            bdb, population_id):
                        generators = core.bayesdb_population_generators(
                            bdb, population_id)
                        assert len(generators) == 1
                        generator_name = core.bayesdb_generator_name(
                            bdb, generators[0])
                        qp = sqlite3_quote_name(cmd.name)
                        qg = sqlite3_quote_name(generator_name)
                        bdb.execute('ALTER GENERATOR %s RENAME TO %s'
                            % (qg, qp,))
                    # Remember the new name for subsequent commands.
                    population = cmd.name
                elif isinstance(cmd, ast.AlterPopAddVar):
                    # Ensure column exists in base table.
                    table = core.bayesdb_population_table(bdb, population_id)
                    if not core.bayesdb_table_has_column(
                            bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'No such variable in base table: %s'
                            % (cmd.name))
                    # Ensure variable not already in population.
                    if core.bayesdb_has_variable(
                            bdb, population_id, None, cmd.name):
                        raise BQLError(bdb,
                            'Variable already in population: %s'
                            % (cmd.name))
                    # Ensure there is at least one observation in the column.
                    qt = sqlite3_quote_name(table)
                    qc = sqlite3_quote_name(cmd.name)
                    cursor = bdb.sql_execute(
                        'SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' %
                        (qt, qc))
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb,
                            'Cannot add variable without any values: %s'
                            % (cmd.name))
                    # If stattype is None, guess.
                    if cmd.stattype is None:
                        cursor = bdb.sql_execute(
                            'SELECT %s FROM %s' % (qc, qt))
                        rows = cursor.fetchall()
                        [stattype, reason] = bayesdb_guess_stattypes(
                            [cmd.name], rows)[0]
                        # Fail if trying to model a key.
                        if stattype == 'key':
                            raise BQLError(bdb,
                                'Values in column %s appear to be keys.'
                                % (cmd.name,))
                        # Fail if cannot determine a stattype.
                        elif stattype == 'ignore':
                            raise BQLError(bdb,
                                'Failed to determine a stattype for %s, '
                                'please specify one manually.' % (cmd.name,))
                    # If user specified stattype, ensure it exists.
                    elif not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid stattype: %s' % (cmd.stattype))
                    else:
                        stattype = cmd.stattype
                    # Check that strings are not being modeled as numerical.
                    if stattype == 'numerical' \
                            and _column_contains_string(bdb, table, cmd.name):
                        raise BQLError(bdb,
                            'Numerical column contains string values: %r '
                            % (qc,))
                    with bdb.savepoint():
                        # Add the variable to the population.
                        core.bayesdb_add_variable(
                            bdb, population_id, cmd.name, stattype)
                        colno = core.bayesdb_variable_number(
                            bdb, population_id, None, cmd.name)
                        # Add the variable to each (initialized) generator in
                        # the population.
                        generator_ids = filter(
                            lambda g: core.bayesdb_generator_modelnos(bdb, g),
                            core.bayesdb_population_generators(
                                bdb, population_id),
                        )
                        for generator_id in generator_ids:
                            backend = core.bayesdb_generator_backend(
                                bdb, generator_id)
                            backend.add_column(bdb, generator_id, colno)
                elif isinstance(cmd, ast.AlterPopStatType):
                    # Check that no generators are defined for this population.
                    generators = core.bayesdb_population_generators(
                        bdb, population_id)
                    if generators:
                        raise BQLError(bdb,
                            'Cannot update statistical types for population '
                            '%s, it has generators: %s'
                            % (repr(population), repr(generators),))
                    # Check all the variables are in the population.
                    unknown = [
                        c for c in cmd.names if not
                        core.bayesdb_has_variable(bdb, population_id, None, c)
                    ]
                    if unknown:
                        raise BQLError(bdb,
                            'No such variables in population: %s'
                            % (repr(unknown)))
                    # Check the statistical type is valid.
                    if not core.bayesdb_has_stattype(bdb, cmd.stattype):
                        raise BQLError(bdb,
                            'Invalid statistical type: %r'
                            % (repr(cmd.stattype),))
                    # Check that strings are not being modeled as numerical.
                    if cmd.stattype == 'numerical':
                        table = core.bayesdb_population_table(
                            bdb, population_id)
                        numerical_string_vars = [
                            col for col in cmd.names
                            if _column_contains_string(bdb, table, col)
                        ]
                        if numerical_string_vars:
                            raise BQLError(bdb,
                                'Columns with string values modeled as '
                                'numerical: %r' % (numerical_string_vars,))
                    # Perform the stattype update.
                    colnos = [
                        core.bayesdb_variable_number(
                            bdb, population_id, None, c) for c in cmd.names
                    ]
                    qcolnos = ','.join('%d' % (colno,) for colno in colnos)
                    update_stattype_sql = '''
                        UPDATE bayesdb_variable SET stattype = ?
                            WHERE population_id = ? AND colno IN (%s)
                    ''' % (qcolnos,)
                    bdb.sql_execute(
                        update_stattype_sql,
                        (casefold(cmd.stattype), population_id,))
                else:
                    assert False, 'Invalid ALTER POPULATION command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' %
                (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)

        # Find the backend, or use the default.
        backend_name = phrase.backend
        if phrase.backend is None:
            backend_name = 'cgpm'
        if backend_name not in bdb.backends:
            raise BQLError(bdb, 'No such backend: %s' %
                (repr(backend_name),))
        backend = bdb.backends[backend_name]

        # Retrieve the (possibly implicit) generator name.
        generator_name = phrase.name or phrase.population
        implicit = 1 if phrase.name is None else 0

        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, population_id, generator_name):
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(generator_name),))
            else:
                # Insert a record into bayesdb_generator and get the
                # assigned id.
                bdb.sql_execute('''
                    INSERT INTO bayesdb_generator
                        (name, population_id, backend, implicit)
                        VALUES (?, ?, ?, ?)
                ''', (generator_name, population_id, backend.name(), implicit))
                generator_id = core.bayesdb_get_generator(
                    bdb, population_id, generator_name)
                # Do any backend-specific initialization.
                backend.create_generator(bdb, generator_id, phrase.schema)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, None, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(phrase.name),))
            generator_id = core.bayesdb_get_generator(bdb, None, phrase.name)
            backend = core.bayesdb_generator_backend(bdb, generator_id)

            # Backend-specific destruction.
            backend.drop_generator(bdb, generator_id)

            # Drop latent variables, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_variable WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id,))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id,))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id,))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, None, generator):
                raise BQLError(bdb, 'No such generator: %s' %
                    (repr(generator),))
            generator_id = core.bayesdb_get_generator(bdb, None, generator)
            cmds_generic = []
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    population_id = core.bayesdb_generator_population(
                        bdb, generator_id)
                    population = core.bayesdb_population_name(
                        bdb, population_id)
                    # Prevent renaming of implicit generator directly, unless
                    # being called by ast.AlterPopRenamePop in which case the
                    # population name and generator name will not be matching.
                    if core.bayesdb_population_is_implicit(bdb, generator_id) \
                            and casefold(generator) == casefold(population):
                        raise BQLError(bdb, 'Cannot rename implicit '
                            'generator; rename base population instead')
                    # Disable modelnos with AlterGenRenameGen.
                    if phrase.modelnos is not None:
                        raise BQLError(bdb, 'Cannot specify models for RENAME')
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_generator(bdb, None, cmd.name):
                            raise BQLError(bdb, 'Name already defined'
                                ' as generator: %s' %
                                (repr(cmd.name),))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                        (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                elif isinstance(cmd, ast.AlterGenGeneric):
                    cmds_generic.append(cmd.command)
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
            if cmds_generic:
                modelnos = phrase.modelnos
                modelnos_invalid = None if modelnos is None else [
                    modelno for modelno in modelnos if not
                    core.bayesdb_generator_has_model(bdb, generator_id, modelno)
                ]
                if modelnos_invalid:
                    raise BQLError(bdb,
                        'No such models in generator %s: %s' %
                        (repr(phrase.generator), repr(modelnos)))
                # Call generic alternations on the backend.
                backend = core.bayesdb_generator_backend(bdb, generator_id)
                backend.alter(bdb, generator_id, modelnos, cmds_generic)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        modelnos = range(phrase.nmodels)

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                    if not core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
            else:
                existing = set(modelno for modelno in modelnos
                    if core.bayesdb_generator_has_model(bdb, generator_id,
                        modelno))
                if 0 < len(existing):
                    raise BQLError(bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.
            if len(modelnos) == 0:
                return

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno)
                    VALUES (:generator_id, :modelno)
            '''
            for modelno in modelnos:
                bdb.sql_execute(insert_model_sql, {
                    'generator_id': generator_id,
                    'modelno': modelno,
                })

            # Do backend-specific initialization.
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            backend.initialize_models(bdb, generator_id, modelnos)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        # WARNING: It is the backend's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the backend's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the backend can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator(bdb, None, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' %
                (phrase.generator,))
        generator_id = core.bayesdb_get_generator(bdb, None, phrase.generator)
        backend = core.bayesdb_generator_backend(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        backend.analyze_models(bdb, generator_id,
            modelnos=phrase.modelnos,
            iterations=phrase.iterations,
            max_seconds=phrase.seconds,
            ckpt_iterations=phrase.ckpt_iterations,
            ckpt_seconds=phrase.ckpt_seconds,
            program=phrase.program)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator(
                bdb, None, phrase.generator)
            backend = core.bayesdb_generator_backend(bdb, generator_id)
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            backend.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id,))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Regress):
        # Retrieve the population.
        if not core.bayesdb_has_population(bdb, phrase.population):
            raise BQLError(bdb, 'No such population: %r' % (phrase.population,))
        population_id = core.bayesdb_get_population(bdb, phrase.population)
        # Retrieve the generator
        generator_id = None
        if phrase.generator:
            if not core.bayesdb_has_generator(bdb, population_id,
                    phrase.generator):
                raise BQLError(bdb,
                    'No such generator: %r' % (phrase.generator,))
            generator_id = core.bayesdb_get_generator(
                bdb, population_id, phrase.generator)
        # Retrieve the target variable.
        if not core.bayesdb_has_variable(
                bdb, population_id, None, phrase.target):
            raise BQLError(bdb, 'No such variable: %r' % (phrase.target,))
        colno_target = core.bayesdb_variable_number(
            bdb, population_id, None, phrase.target)
        stattype = core.bayesdb_variable_stattype(bdb, population_id,
            generator_id, colno_target)
        if stattype != 'numerical':
            raise BQLError(bdb,
                'Target variable is not numerical: %r' % (phrase.target,))
        # Build the given variables.
        if any(isinstance(col, ast.SelColAll) for col in phrase.givens):
            # Using * is not allowed to be mixed with other variables.
            if len(phrase.givens) > 1:
                raise BQLError(bdb, 'Cannot use (*) with other givens.')
            colno_givens = core.bayesdb_variable_numbers(
                bdb, population_id, None)
        else:
            if any(isinstance(col, ast.SelColSub) for col in phrase.givens):
                # Subexpression needs special compiling.
                out = compiler.Output(n_numpar, nampar_map, bindings)
                bql_compiler = compiler.BQLCompiler_None()
                givens = compiler.expand_select_columns(
                    bdb, phrase.givens, True, bql_compiler, out)
            else:
                givens = phrase.givens
            colno_givens = [
                core.bayesdb_variable_number(
                    bdb, population_id, None, given.expression.column)
                for given in givens
            ]
        # Build the arguments to bqlfn.bayesdb_simulate.
        colno_givens_unique = set(
            colno for colno in colno_givens if colno!= colno_target
        )
        if len(colno_givens_unique) == 0:
            raise BQLError(bdb, 'No matching given columns.')
        constraints = []
        colnos = [colno_target] + list(colno_givens_unique)
        nsamp = 100 if phrase.nsamp is None else phrase.nsamp.value.value
        modelnos = None if phrase.modelnos is None else str(phrase.modelnos)
        rows = bqlfn.bayesdb_simulate(
            bdb, population_id, generator_id, modelnos, constraints,
            colnos, numpredictions=nsamp)
        # Retrieve the stattypes.
        stattypes = [
            core.bayesdb_variable_stattype(
                bdb, population_id, generator_id, colno_given)
            for colno_given in colno_givens_unique
        ]
        # Separate the target values from the given values.
        target_values = [row[0] for row in rows]
        given_values = [row[1:] for row in rows]
        given_names = [
            core.bayesdb_variable_name(bdb, population_id, generator_id, given)
            for given in colno_givens_unique
        ]
        # Compute the coefficients. The import to regress_ols is here since the
        # feature depends on pandas + sklearn, so avoid module-wide import.
        from bayeslite.regress import regress_ols
        coefficients = regress_ols(
            target_values, given_values, given_names, stattypes)
        # Store the results in a winder.
        temptable = bdb.temp_table_name()
        qtt = sqlite3_quote_name(temptable)
        out = compiler.Output(0, {}, {})
        out.winder('''
            CREATE TEMP TABLE %s (variable TEXT, coefficient REAL);
        ''' % (qtt,), ())
        for variable, coef in coefficients:
            out.winder('''
                INSERT INTO %s VALUES (?, ?)
            ''' % (qtt), (variable, coef,))
        out.write('SELECT * FROM %s ORDER BY variable' % (qtt,))
        out.unwinder('DROP TABLE %s' % (qtt,), ())
        winders, unwinders = out.getwindings()
        return execute_wound(
            bdb, winders, unwinders, out.getvalue(), out.getbindings())

    assert False                # XXX
Ejemplo n.º 53
0
def bayesdb_read_pandas_df(bdb,
                           table,
                           df,
                           create=False,
                           ifnotexists=False,
                           index=None):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true and `table`
        exists, read data into it anyway
    :param str index: name of column for index
    :raises ValueError: if `ifnotexists` is given without `create`, if
        the dataframe's index is not integral and `index` is `None`, if
        `index` collides with a dataframe column, if the table exists
        and `create` was requested without `ifnotexists`, if the table
        lacks some of the dataframe's columns, or if the table does not
        exist and `create` is false

    If `index` is `None`, then the dataframe's index dtype must be
    convertible to int64, and it is mapped to the table's rowids.  If
    the dataframe's index dtype is not convertible to int64, you must
    specify `index` to give a primary key for the table.
    """
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    column_names = [str(column) for column in df.columns]
    if index is None:
        # The dataframe index becomes the sqlite rowid, so it is not a
        # declared column of a newly created table.
        create_column_names = column_names
        insert_column_names = ['_rowid_'] + column_names
        try:
            key_index = df.index.astype('int64')
        except ValueError:
            raise ValueError('Must specify index name for non-integral index!')
    else:
        if index in df.columns:
            raise ValueError('Index name collides with column name: %r' %
                             (index, ))
        create_column_names = [index] + column_names
        insert_column_names = create_column_names
        key_index = df.index
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(
                name for name in create_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown), ))
        elif create:
            qccns = [sqlite3_quote_name(name) for name in create_column_names]

            def column_schema(column_name, qcn):
                # Only the explicitly named index column is the primary key.
                if column_name == index:
                    return '%s NUMERIC PRIMARY KEY' % (qcn, )
                else:
                    return '%s NUMERIC' % (qcn, )

            schema = ','.join(
                column_schema(ccn, qccn)
                for ccn, qccn in zip(create_column_names, qccns))
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table), ))
        qt = sqlite3_quote_name(table)
        # Materialize the quoted names: they are consumed twice below.
        qicns = [sqlite3_quote_name(name) for name in insert_column_names]
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qicns), ','.join('?' for _qicn in qicns))
        # Address rows by position rather than by label: `DataFrame.ix`
        # is deprecated (and absent from current pandas), and a label
        # lookup returns several rows when index labels repeat.
        for pos, key in enumerate(key_index):
            bdb.sql_execute(sql, (key, ) + tuple(df.iloc[pos]))
Ejemplo n.º 54
0
    def create_generator(self, bdb, generator_id, schema_tokens, **kwargs):
        """Record the CGPM-specific state for a newly created generator.

        Parses `schema_tokens`, builds the schema with `_create_schema`
        (forwarding `kwargs`), and then stores:

        - the schema, serialized as JSON, in `bayesdb_cgpm_generator`;
        - one latent variable per entry of the schema's `latents`;
        - an integer code for every distinct non-null value of each
          categorical variable, in `bayesdb_cgpm_category`;
        - a contiguous 0-based `cgpm_rowid` for each selected row of the
          population's base table, in `bayesdb_cgpm_individual`.  When
          the schema's `subsample` is set, only that many rows are
          selected, by reservoir sampling.

        :param bdb: BayesDB instance used to run all SQL
        :param generator_id: id of the generator being created
        :param schema_tokens: tokens handed to `cgpm_schema.parse.parse`
        :param kwargs: extra keyword arguments passed to `_create_schema`
        """
        schema_ast = cgpm_schema.parse.parse(schema_tokens)
        schema = _create_schema(bdb, generator_id, schema_ast, **kwargs)

        # Store the schema.
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_cgpm_generator (generator_id, schema_json)
                VALUES (?, ?)
        ''', (generator_id, json_dumps(schema)))

        # Get the underlying population and table.
        population_id = core.bayesdb_generator_population(bdb, generator_id)
        table = core.bayesdb_population_table(bdb, population_id)
        qt = sqlite3_quote_name(table)

        # Assign latent variable numbers, in sorted order of name.
        for var, stattype in sorted(schema['latents'].iteritems()):
            core.bayesdb_add_latent(bdb, population_id, generator_id, var,
                                    stattype)

        # Assign codes to the distinct values of every categorical
        # variable with a nonnegative column number.
        vars_cursor = bdb.sql_execute(
            '''
            SELECT colno, name, stattype FROM bayesdb_variable
                WHERE population_id = ? AND 0 <= colno
        ''', (population_id, ))
        for colno, name, stattype in vars_cursor:
            if _is_categorical(stattype):
                qn = sqlite3_quote_name(name)
                cursor = bdb.sql_execute('''
                    SELECT DISTINCT %s FROM %s WHERE %s IS NOT NULL
                ''' % (qn, qt, qn))
                for code, (value, ) in enumerate(cursor):
                    bdb.sql_execute(
                        '''
                        INSERT INTO bayesdb_cgpm_category
                            (generator_id, colno, value, code)
                            VALUES (?, ?, ?, ?)
                    ''', (generator_id, colno, value, code))

        # Assign contiguous 0-indexed ids to the individuals in the
        # table.
        if schema['subsample']:
            k = schema['subsample']
            cursor = bdb.sql_execute(
                'SELECT _rowid_ FROM %s ORDER BY _rowid_ ASC' % (qt, ))
            uniform = bdb._prng.weakrandom_uniform
            # https://en.wikipedia.org/wiki/Reservoir_sampling
            # Keep the first k rows, then let each later row replace a
            # random slot of the reservoir.  The table size is not
            # needed, so no COUNT(*) query is issued.
            # NOTE(review): assumes weakrandom_uniform(n) returns an
            # integer drawn uniformly from [0, n) -- confirm.
            samples = []
            for i, row in enumerate(cursor):
                if i < k:
                    samples.append(row)
                else:
                    r = uniform(i + 1)
                    if r < k:
                        samples[r] = row
            cursor = samples
        else:
            cursor = bdb.sql_execute('SELECT _rowid_ FROM %s' % (qt, ))
        for cgpm_rowid, (table_rowid, ) in enumerate(cursor):
            bdb.sql_execute(
                '''
                INSERT INTO bayesdb_cgpm_individual
                    (generator_id, table_rowid, cgpm_rowid)
                    VALUES (?, ?, ?)
            ''', (generator_id, table_rowid, cgpm_rowid))
Ejemplo n.º 55
0
def bayesdb_read_csv(bdb,
                     table,
                     f,
                     header=False,
                     create=False,
                     ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if `create` is given without `header`, if
        `ifnotexists` is given without `create`, if the table exists
        and `create` was requested without `ifnotexists`, or if the
        table does not exist and `create` is false
    :raises IOError: if the header is missing, empty, contains
        duplicate (case-folded) names or names unknown to an existing
        table, or if a data row has the wrong number of columns

    Header names and cell values are decoded as UTF-8 and stripped of
    surrounding whitespace.  Everything happens inside one savepoint.
    """
    if create and not header:
        raise ValueError('Can\'t create table from headerless CSV!')
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table), ))
        reader = csv.reader(f)
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            # Column names are compared case-insensitively.
            column_name_map = {}
            duplicates = set()
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                              (repr(list(duplicates)), ))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = [sqlite3_quote_name(name) for name in column_names]
                schema = ','.join('%s NUMERIC' % (qcn, ) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(
                    name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown), ))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        # Materialize the quoted names: they are consumed twice below.
        qcns = [sqlite3_quote_name(name) for name in column_names]
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            # reader.line_num counts the source lines consumed so far,
            # so it names the line the offending record ends on (the
            # header, when present, is line 1).
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                              (reader.line_num, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                              (reader.line_num, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
Ejemplo n.º 56
0
def bayesdb_read_pandas_df(bdb, table, df, create=False, ifnotexists=False,
        index=None):
    """Read data from a pandas dataframe into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param pandas.DataFrame df: pandas dataframe
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true, and `create` is true and `table`
        exists, read data into it anyway
    :param str index: name of column for index
    :raises ValueError: if `ifnotexists` is given without `create`, if
        the dataframe's index is not integral and `index` is `None`, if
        `index` collides with a dataframe column, if the table exists
        and `create` was requested without `ifnotexists`, if the table
        lacks some of the dataframe's columns, or if the table does not
        exist and `create` is false

    If `index` is `None`, then the dataframe's index dtype must be
    convertible to int64, and it is mapped to the table's rowids.  If
    the dataframe's index dtype is not convertible to int64, you must
    specify `index` to give a primary key for the table.
    """
    if ifnotexists and not create:
        raise ValueError('Not creating table whether or not exists!')
    column_names = [str(column) for column in df.columns]
    if index is None:
        # The dataframe index becomes the sqlite rowid, so it is not a
        # declared column of a newly created table.
        create_column_names = column_names
        insert_column_names = ['_rowid_'] + column_names
        try:
            key_index = df.index.astype('int64')
        except ValueError:
            raise ValueError('Must specify index name for non-integral index!')
    else:
        if index in df.columns:
            raise ValueError('Index name collides with column name: %r'
                % (index,))
        create_column_names = [index] + column_names
        insert_column_names = create_column_names
        key_index = df.index
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
            core.bayesdb_table_guarantee_columns(bdb, table)
            unknown = set(name for name in create_column_names
                if not core.bayesdb_table_has_column(bdb, table, name))
            if len(unknown) != 0:
                raise ValueError('Unknown columns: %s' % (list(unknown),))
        elif create:
            qccns = [sqlite3_quote_name(name) for name in create_column_names]
            def column_schema(column_name, qcn):
                # Only the explicitly named index column is the primary key.
                if column_name == index:
                    return '%s NUMERIC PRIMARY KEY' % (qcn,)
                else:
                    return '%s NUMERIC' % (qcn,)
            schema = ','.join(column_schema(ccn, qccn)
                for ccn, qccn in zip(create_column_names, qccns))
            qt = sqlite3_quote_name(table)
            bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
            core.bayesdb_table_guarantee_columns(bdb, table)
        else:
            raise ValueError('No such table: %s' % (repr(table),))
        qt = sqlite3_quote_name(table)
        # Materialize the quoted names: they are consumed twice below.
        qicns = [sqlite3_quote_name(name) for name in insert_column_names]
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qicns), ','.join('?' for _qicn in qicns))
        # Address rows by position rather than by label: `DataFrame.ix`
        # is deprecated (and absent from current pandas), and a label
        # lookup returns several rows when index labels repeat.
        for pos, key in enumerate(key_index):
            bdb.sql_execute(sql, (key,) + tuple(df.iloc[pos]))
Ejemplo n.º 57
0
def _create_population(bdb, phrase):
    """Create the population described by `phrase`.

    Inserts a `bayesdb_population` record for the population (named
    after the base table, and flagged implicit, when `phrase.name` is
    None), works out a statistical type for every column of the base
    table from the MODEL / IGNORE / GUESS clauses in `phrase.schema`,
    validates the result, and registers every non-ignored column with
    `core.bayesdb_add_variable`.

    Returns None; returns early without changes when the population
    already exists and `phrase.ifnotexists` is set.

    :raises BQLError: if the population name is already defined (and
        `ifnotexists` is not set), if a wildcard GUESS is combined with
        named variables, if a column holding strings is modeled as
        numerical, if some base-table column has no modeling policy, or
        if the schema names duplicate columns, nonexistent columns, or
        invalid statistical types
    """
    # Retrieve the (possibly implicit) population name.
    population_name = phrase.name or phrase.table
    implicit = 1 if phrase.name is None else 0

    # Handle IF NOT EXISTS.
    if core.bayesdb_has_population(bdb, population_name):
        if phrase.ifnotexists:
            return
        else:
            raise BQLError(bdb, 'Name already defined as population: %r' %
                (population_name,))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to provide
    # a strategy for each single variable, either MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    bdb.sql_execute('''
        INSERT INTO bayesdb_population (name, tabname, implicit)
            VALUES (?, ?, ?)
    ''', (population_name, phrase.table, implicit))
    population_id = core.bayesdb_get_population(bdb, population_name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [[(name, s.stattype) for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopModelVars)]))

    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [[(name, 'ignore') for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)]))

    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        [s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variables names: %r'
                % (pop_guess, ))
        # Guess every base-table column not already given a policy.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]
    # Perform the guessing.
    pop_guess_vars = []
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        # XXX The guesser can return a stattype called `key`; such
        # columns are ignored rather than modeled.
        for col, guess in zip(pop_guess, pop_guess_stattypes):
            st = guess[0]
            if st == 'key':
                pop_ignore_vars.append((col, 'ignore'))
            else:
                pop_guess_vars.append((col, st))

    # Ensure no string-valued variables are being modeled as numerical.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars
        if stattype == 'numerical'
            and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(bdb,
            'Column(s) with string values modeled as numerical: %r'
            % (numerical_string_vars, ))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that everyone in the population is modeled.
    # `known` contains all the variables for which a policy is known.
    known = set(casefold(t[0]) for t in pop_all_vars)
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r'
            % (not_found, ))

    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        # Record the case-folded name, since that is what the duplicate
        # test above looks up; storing the raw name would let duplicates
        # containing uppercase characters slip through.
        seen_variables.add(name)
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' % (list(duplicates),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' % (list(invalid),))

    # Insert variable records; ignored columns get none.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)
Ejemplo n.º 58
0
def execute_phrase(bdb, phrase, bindings=()):
    """Execute the BQL AST phrase `phrase` and return a cursor of results.

    `phrase` may be wrapped in an `ast.Parametrized` node, in which
    case `bindings` supplies the values for its parameters.  Queries
    are compiled to SQL and executed, and the resulting cursor is
    returned.  Every other supported phrase (transaction control,
    CREATE/DROP/ALTER of tables and generators, and INITIALIZE /
    ANALYZE / DROP MODELS) is executed for effect and returns an empty
    cursor, except DROP TABLE, which returns the cursor of the
    underlying SQL `DROP TABLE` statement.

    Raises BQLError when a phrase names a table, generator, column,
    metamodel, or model that does not exist, or a name that is already
    taken.  Raises NotImplementedError for column renames and for
    ANALYZE MODELS without WAIT.
    """
    # Unwrap a parametrized phrase, remembering how many numbered
    # parameters it has and how named parameters map onto them.
    if isinstance(phrase, ast.Parametrized):
        n_numpar = phrase.n_numpar
        nampar_map = phrase.nampar_map
        phrase = phrase.phrase
        assert 0 < n_numpar
    else:
        n_numpar = 0
        nampar_map = None
        # Ignore extraneous bindings.  XXX Bad idea?

    if ast.is_query(phrase):
        # Compile the query in the transaction in case we need to
        # execute subqueries to determine column lists.  Compiling is
        # a quick tree descent, so this should be fast.
        out = compiler.Output(n_numpar, nampar_map, bindings)
        with bdb.savepoint():
            compiler.compile_query(bdb, phrase, out)
        winders, unwinders = out.getwindings()
        return execute_wound(bdb, winders, unwinders, out.getvalue(),
                             out.getbindings())

    if isinstance(phrase, ast.Begin):
        txn.bayesdb_begin_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Rollback):
        txn.bayesdb_rollback_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.Commit):
        txn.bayesdb_commit_transaction(bdb)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabAs):
        assert ast.is_query(phrase.query)
        with bdb.savepoint():
            # Emit `CREATE [TEMP] TABLE [IF NOT EXISTS] <name> AS'
            # followed by the compiled query, and run it as one
            # SQL statement.
            out = compiler.Output(n_numpar, nampar_map, bindings)
            qt = sqlite3_quote_name(phrase.name)
            temp = 'TEMP ' if phrase.temp else ''
            ifnotexists = 'IF NOT EXISTS ' if phrase.ifnotexists else ''
            out.write('CREATE %sTABLE %s%s AS ' % (temp, ifnotexists, qt))
            compiler.compile_query(bdb, phrase.query, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                bdb.sql_execute(out.getvalue(), out.getbindings())
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateTabSim):
        assert isinstance(phrase.simulation, ast.Simulate)
        with bdb.savepoint():
            # The new table's name must not collide with anything.
            if core.bayesdb_has_generator(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as generator: %s' %
                    (repr(phrase.name), ))
            if core.bayesdb_has_table(bdb, phrase.name):
                raise BQLError(
                    bdb, 'Name already defined as table: %s' %
                    (repr(phrase.name), ))
            if not core.bayesdb_has_generator_default(
                    bdb, phrase.simulation.generator):
                raise BQLError(
                    bdb,
                    'No such generator: %s' % (phrase.simulation.generator, ))
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.simulation.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            table = core.bayesdb_generator_table(bdb, generator_id)
            qn = sqlite3_quote_name(phrase.name)
            qt = sqlite3_quote_name(table)
            qgn = sqlite3_quote_name(phrase.simulation.generator)
            column_names = phrase.simulation.columns
            qcns = map(sqlite3_quote_name, column_names)
            # Collect the SQL type of every column of the generator's
            # table, keyed by case-folded column name, so the
            # simulated table can reuse them.
            cursor = bdb.sql_execute('PRAGMA table_info(%s)' % (qt, ))
            column_sqltypes = {}
            for _colno, name, sqltype, _nonnull, _default, _primary in cursor:
                assert casefold(name) not in column_sqltypes
                column_sqltypes[casefold(name)] = sqltype
            assert 0 < len(column_sqltypes)
            # Every simulated column and every constrained column
            # must exist in the generator's table.
            for column_name in column_names:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            for column_name, _expression in phrase.simulation.constraints:
                if casefold(column_name) not in column_sqltypes:
                    raise BQLError(
                        bdb, 'No such column'
                        ' in generator %s table %s: %s' %
                        (repr(phrase.simulation.generator), repr(table),
                         repr(column_name)))
            # Evaluate the sample count, the model number, and every
            # constraint expression with a single one-row SELECT.
            # XXX Move to compiler.py.
            # XXX Copypasta of this in compile_simulate!
            out = compiler.Output(n_numpar, nampar_map, bindings)
            out.write('SELECT ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.nsamples,
                                                  out)
            out.write(', ')
            with compiler.compiling_paren(bdb, out, 'CAST(', ' AS INTEGER)'):
                compiler.compile_nobql_expression(bdb,
                                                  phrase.simulation.modelno,
                                                  out)
            for _column_name, expression in phrase.simulation.constraints:
                out.write(', ')
                compiler.compile_nobql_expression(bdb, expression, out)
            winders, unwinders = out.getwindings()
            with compiler.bayesdb_wind(bdb, winders, unwinders):
                cursor = bdb.sql_execute(out.getvalue(),
                                         out.getbindings()).fetchall()
            # Row layout: (nsamples, modelno, constraint values...).
            assert len(cursor) == 1
            nsamples = cursor[0][0]
            assert isinstance(nsamples, int)
            modelno = cursor[0][1]
            assert modelno is None or isinstance(modelno, int)
            constraints = \
                [(core.bayesdb_generator_column_number(bdb, generator_id, name),
                        value)
                    for (name, _expression), value in
                        zip(phrase.simulation.constraints, cursor[0][2:])]
            colnos = \
                [core.bayesdb_generator_column_number(bdb, generator_id, name)
                    for name in column_names]
            # Create the destination table with the same SQL types as
            # the source columns, then fill it with simulated rows.
            bdb.sql_execute(
                'CREATE %sTABLE %s%s (%s)' %
                ('TEMP ' if phrase.temp else '',
                 'IF NOT EXISTS ' if phrase.ifnotexists else '', qn, ','.join(
                     '%s %s' % (qcn, column_sqltypes[casefold(column_name)])
                     for qcn, column_name in zip(qcns, column_names))))
            insert_sql = '''
                INSERT INTO %s (%s) VALUES (%s)
            ''' % (qn, ','.join(qcns), ','.join('?' for qcn in qcns))
            for row in bqlfn.bayesdb_simulate(bdb,
                                              generator_id,
                                              constraints,
                                              colnos,
                                              modelno=modelno,
                                              numpredictions=nsamples):
                bdb.sql_execute(insert_sql, row)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropTab):
        with bdb.savepoint():
            # Refuse to drop a table that generators still refer to.
            sql = 'SELECT COUNT(*) FROM bayesdb_generator WHERE tabname = ?'
            cursor = bdb.sql_execute(sql, (phrase.name, ))
            if 0 < cursor_value(cursor):
                # XXX Automatically delete the generators?  Generators
                # are more interesting than triggers and indices, so
                # automatic deletion is not obviously right.
                raise BQLError(
                    bdb, 'Table still in use by generators: %s' %
                    (repr(phrase.name), ))
            bdb.sql_execute('DELETE FROM bayesdb_column WHERE tabname = ?',
                            (phrase.name, ))
            ifexists = 'IF EXISTS ' if phrase.ifexists else ''
            qt = sqlite3_quote_name(phrase.name)
            return bdb.sql_execute('DROP TABLE %s%s' % (ifexists, qt))

    if isinstance(phrase, ast.AlterTab):
        with bdb.savepoint():
            table = phrase.table
            if not core.bayesdb_has_table(bdb, table):
                raise BQLError(bdb, 'No such table: %s' % (repr(table), ))
            # Commands are applied in order; `table` tracks the
            # table's current name across renames.
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterTabRenameTab):
                    # If the names differ only in case, we have to do
                    # some extra work because SQLite will reject the
                    # table rename.  Note that we may even have table
                    # == cmd.name here, but if the stored table name
                    # differs in case from cmd.name, we want to update
                    # it anyway.
                    if casefold(table) == casefold(cmd.name):
                        # Go via a temporary table.
                        temp = table + '_temp'
                        while core.bayesdb_has_table(bdb, temp) or \
                              core.bayesdb_has_generator(bdb, temp):
                            temp += '_temp'
                        rename_table(bdb, table, temp)
                        rename_table(bdb, temp, cmd.name)
                    else:
                        # Make sure nothing else has this name and
                        # rename it.
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                        rename_table(bdb, table, cmd.name)
                    # Remember the new name for subsequent commands.
                    table = cmd.name
                elif isinstance(cmd, ast.AlterTabRenameCol):
                    # XXX Need to deal with this in the compiler.
                    raise NotImplementedError('Renaming columns'
                                              ' not yet implemented.')
                    # NOTE: The rest of this branch is unreachable
                    # until the raise above is removed.
                    # Make sure the old name exist and the new name does not.
                    old_folded = casefold(cmd.old)
                    new_folded = casefold(cmd.new)
                    if old_folded != new_folded:
                        if not core.bayesdb_table_has_column(
                                bdb, table, cmd.old):
                            raise BQLError(
                                bdb, 'No such column in table %s'
                                ': %s' % (repr(table), repr(cmd.old)))
                        if core.bayesdb_table_has_column(bdb, table, cmd.new):
                            raise BQLError(
                                bdb, 'Column already exists'
                                ' in table %s: %s' %
                                (repr(table), repr(cmd.new)))
                    # Update bayesdb_column.  Everything else refers
                    # to columns by (tabname, colno) pairs rather than
                    # by names.
                    update_column_sql = '''
                        UPDATE bayesdb_column SET name = :new
                            WHERE tabname = :table AND name = :old
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_column_sql, {
                        'table': table,
                        'old': cmd.old,
                        'new': cmd.new,
                    })
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # ...except metamodels may have the (case-folded)
                    # name cached.
                    if old_folded != new_folded:
                        generators_sql = '''
                            SELECT id FROM bayesdb_generator WHERE tabname = ?
                        '''
                        cursor = bdb.sql_execute(generators_sql, (table, ))
                        for (generator_id, ) in cursor:
                            metamodel = core.bayesdb_generator_metamodel(
                                bdb, generator_id)
                            metamodel.rename_column(bdb, generator_id,
                                                    old_folded, new_folded)
                elif isinstance(cmd, ast.AlterTabSetDefGen):
                    if not core.bayesdb_has_generator(bdb, cmd.generator):
                        raise BQLError(
                            bdb,
                            'No such generator: %s' % (repr(cmd.generator), ))
                    generator_id = core.bayesdb_get_generator(
                        bdb, cmd.generator)
                    bayesdb_schema_required(bdb, 6, "generator defaults")
                    # Clear the current default for this table, if
                    # any, before marking the requested generator.
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                    set_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 1 WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(set_default_sql, (generator_id, ))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                elif isinstance(cmd, ast.AlterTabUnsetDefGen):
                    unset_default_sql = '''
                        UPDATE bayesdb_generator SET defaultp = 0
                            WHERE tabname = ? AND defaultp
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(unset_default_sql, (table, ))
                    assert bdb._sqlite3.totalchanges() - total_changes in (0,
                                                                           1)
                else:
                    assert False, 'Invalid alter table command: %s' % \
                        (cmd,)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.CreateGen):
        # Find the metamodel.
        if phrase.metamodel not in bdb.metamodels:
            raise BQLError(
                bdb, 'No such metamodel: %s' % (repr(phrase.metamodel), ))
        metamodel = bdb.metamodels[phrase.metamodel]

        # Let the metamodel parse the schema itself and call
        # create_generator with the modelled columns.
        with bdb.savepoint():
            if core.bayesdb_has_generator(bdb, phrase.name):
                # An existing generator is an error unless the phrase
                # said IF NOT EXISTS, in which case do nothing.
                if not phrase.ifnotexists:
                    raise BQLError(
                        bdb, 'Name already defined as generator: %s' %
                        (repr(phrase.name), ))
            else:

                def instantiate(columns):
                    return instantiate_generator(bdb,
                                                 phrase.name,
                                                 phrase.table,
                                                 metamodel,
                                                 columns,
                                                 default=phrase.default)

                metamodel.create_generator(bdb, phrase.table, phrase.schema,
                                           instantiate)

        # All done.  Nothing to return.
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropGen):
        with bdb.savepoint():
            if not core.bayesdb_has_generator(bdb, phrase.name):
                if phrase.ifexists:
                    return empty_cursor(bdb)
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(phrase.name), ))
            generator_id = core.bayesdb_get_generator(bdb, phrase.name)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)

            # Metamodel-specific destruction.
            metamodel.drop_generator(bdb, generator_id)

            # Drop the columns, models, and, finally, generator.
            drop_columns_sql = '''
                DELETE FROM bayesdb_generator_column WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_columns_sql, (generator_id, ))
            drop_model_sql = '''
                DELETE FROM bayesdb_generator_model WHERE generator_id = ?
            '''
            bdb.sql_execute(drop_model_sql, (generator_id, ))
            drop_generator_sql = '''
                DELETE FROM bayesdb_generator WHERE id = ?
            '''
            bdb.sql_execute(drop_generator_sql, (generator_id, ))
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AlterGen):
        with bdb.savepoint():
            generator = phrase.generator
            if not core.bayesdb_has_generator(bdb, generator):
                raise BQLError(bdb,
                               'No such generator: %s' % (repr(generator), ))
            generator_id = core.bayesdb_get_generator(bdb, generator)
            for cmd in phrase.commands:
                if isinstance(cmd, ast.AlterGenRenameGen):
                    # Make sure nothing else has this name.
                    if casefold(generator) != casefold(cmd.name):
                        if core.bayesdb_has_table(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined as table'
                                ': %s' % (repr(cmd.name), ))
                        if core.bayesdb_has_generator(bdb, cmd.name):
                            raise BQLError(
                                bdb, 'Name already defined'
                                ' as generator: %s' % (repr(cmd.name), ))
                    # Update bayesdb_generator.  Everything else
                    # refers to it by id.
                    update_generator_sql = '''
                        UPDATE bayesdb_generator SET name = ? WHERE id = ?
                    '''
                    total_changes = bdb._sqlite3.totalchanges()
                    bdb.sql_execute(update_generator_sql,
                                    (cmd.name, generator_id))
                    assert bdb._sqlite3.totalchanges() - total_changes == 1
                    # Remember the new name for subsequent commands.
                    generator = cmd.name
                else:
                    assert False, 'Invalid ALTER GENERATOR command: %s' % \
                        (repr(cmd),)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.InitModels):
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        modelnos = range(phrase.nmodels)
        model_config = None  # XXX For now.

        with bdb.savepoint():
            # Find the model numbers.  Omit existing ones for
            # ifnotexists; reject existing ones otherwise.
            if phrase.ifnotexists:
                modelnos = set(modelno for modelno in modelnos
                               if not core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
            else:
                existing = set(modelno for modelno in modelnos
                               if core.bayesdb_generator_has_model(
                                   bdb, generator_id, modelno))
                if 0 < len(existing):
                    raise BQLError(
                        bdb, 'Generator %s already has models: %s' %
                        (repr(phrase.generator), sorted(existing)))

            # Stop now if there's nothing to initialize.  Still hand
            # back a cursor, as every other branch does, so callers
            # never receive None.
            if len(modelnos) == 0:
                return empty_cursor(bdb)

            # Create the bayesdb_generator_model records.
            modelnos = sorted(modelnos)
            insert_model_sql = '''
                INSERT INTO bayesdb_generator_model
                    (generator_id, modelno, iterations)
                    VALUES (:generator_id, :modelno, :iterations)
            '''
            for modelno in modelnos:
                bdb.sql_execute(
                    insert_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                        'iterations': 0,
                    })

            # Do metamodel-specific initialization.
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            metamodel.initialize_models(bdb, generator_id, modelnos,
                                        model_config)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.AnalyzeModels):
        if not phrase.wait:
            raise NotImplementedError('No background analysis -- use WAIT.')
        # WARNING: It is the metamodel's responsibility to work in a
        # transaction.
        #
        # WARNING: It is the metamodel's responsibility to update the
        # iteration count in bayesdb_generator_model records.
        #
        # We do this so that the metamodel can save incremental
        # progress in case of ^C in the middle.
        #
        # XXX Put these warning somewhere more appropriate.
        if not core.bayesdb_has_generator_default(bdb, phrase.generator):
            raise BQLError(bdb, 'No such generator: %s' % (phrase.generator, ))
        generator_id = core.bayesdb_get_generator_default(
            bdb, phrase.generator)
        metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
        # XXX Should allow parameters for iterations and ckpt/iter.
        metamodel.analyze_models(bdb,
                                 generator_id,
                                 modelnos=phrase.modelnos,
                                 iterations=phrase.iterations,
                                 max_seconds=phrase.seconds,
                                 ckpt_iterations=phrase.ckpt_iterations,
                                 ckpt_seconds=phrase.ckpt_seconds)
        return empty_cursor(bdb)

    if isinstance(phrase, ast.DropModels):
        with bdb.savepoint():
            generator_id = core.bayesdb_get_generator_default(
                bdb, phrase.generator)
            metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
            # modelnos stays None to mean `all models of the
            # generator'; otherwise every requested model must exist.
            modelnos = None
            if phrase.modelnos is not None:
                lookup_model_sql = '''
                    SELECT COUNT(*) FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                modelnos = sorted(list(phrase.modelnos))
                for modelno in modelnos:
                    cursor = bdb.sql_execute(lookup_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
                    if cursor_value(cursor) == 0:
                        raise BQLError(
                            bdb, 'No such model'
                            ' in generator %s: %s' %
                            (repr(phrase.generator), repr(modelno)))
            # Metamodel-specific destruction first, then the
            # bayesdb_generator_model records.
            metamodel.drop_models(bdb, generator_id, modelnos=modelnos)
            if modelnos is None:
                drop_models_sql = '''
                    DELETE FROM bayesdb_generator_model WHERE generator_id = ?
                '''
                bdb.sql_execute(drop_models_sql, (generator_id, ))
            else:
                drop_model_sql = '''
                    DELETE FROM bayesdb_generator_model
                        WHERE generator_id = :generator_id
                        AND modelno = :modelno
                '''
                for modelno in modelnos:
                    bdb.sql_execute(drop_model_sql, {
                        'generator_id': generator_id,
                        'modelno': modelno,
                    })
        return empty_cursor(bdb)

    # No known phrase type matched.
    assert False  # XXX
Ejemplo n.º 59
0
def bayesdb_read_csv(bdb, table, f, header=False,
        create=False, ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `table` exists, do it anyway
    :raises ValueError: if `create` is given without `header`, if
        `ifnotexists` is given without `create`, if the table exists
        and `create` was requested without `ifnotexists`, or if the
        table does not exist and `create` was not requested
    :raises IOError: if the header is missing, empty, has blank or
        duplicate names, or names columns not in the table; or if a
        data row has the wrong number of fields

    All work happens inside one savepoint of `bdb`.
    """
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        # 1-based number of the CSV record about to be read, used in
        # error messages.
        line = 1
        if header:
            try:
                row = next(reader)
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            if any(len(c)==0 for c in column_names):
                raise IOError(
                    'Missing column names in header: %s' %repr(column_names))
            # Column names are compared case-insensitively, so detect
            # duplicates on the case-folded names.
            column_name_map = {}
            duplicates = set()
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not core.bayesdb_has_table(bdb, table):
                # New table: one NUMERIC column per header name.
                qt = sqlite3_quote_name(table)
                qcns = map(sqlite3_quote_name, column_names)
                schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                # Existing table: every header name must be one of
                # its columns.
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = map(sqlite3_quote_name, column_names)
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        # Number the records as we go so that the error messages name
        # the offending record rather than always the first one.
        for line, row in enumerate(reader, line):
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
Ejemplo n.º 60
0
def _column_contains_string(bdb, table, column):
    """Return True if some value in `column` of `table` is a unicode string.

    Selects the column from the table and stops at the first row whose
    value is an instance of `unicode`; returns False if no row has one.
    """
    quoted_table = sqlite3_quote_name(table)
    quoted_column = sqlite3_quote_name(column)
    query = 'SELECT %s FROM %s' % (quoted_column, quoted_table)
    for record in bdb.sql_execute(query):
        if isinstance(record[0], unicode):
            return True
    return False