Example #1
0
def get_schema_as_list(bdb, population_name):
    """Return the schema of a population as a list of per-variable dicts.

    Each entry has keys 'name' and 'stat_type'; nominal variables
    additionally carry 'unique_values', the distinct non-NULL values
    observed in the backing table.
    """
    population_id = bayesdb_get_population(bdb, population_name)
    table_name = bayesdb_population_table(bdb, population_id)
    qt = bql_quote_name(table_name)
    variable_names = bayesdb_variable_names(bdb, population_id, None)
    # Hoisted out of the loop: this mapping is invariant across variables.
    # Unknown stattypes still raise KeyError, as before.
    stattype_lookup = {
        'numerical': 'realAdditive',
        'nominal': 'categorical',
        'categorical': 'categorical',
    }
    schema = []
    for variable_name in variable_names:
        colno = bayesdb_variable_number(bdb, population_id, None,
                                        variable_name)
        stattype = bayesdb_variable_stattype(bdb, population_id, None, colno)
        schema_entry = {
            'name': variable_name,
            'stat_type': stattype_lookup[stattype]
        }
        if stattype == 'nominal':
            # Enumerate the observed categories for nominal variables.
            qv = bql_quote_name(variable_name)
            values = utils_bql.query(
                bdb, '''
                SELECT DISTINCT(%s) FROM %s
                WHERE %s IS NOT NULL
            ''' % (qv, qt, qv))
            schema_entry['unique_values'] = \
                values[values.columns[0]].unique().tolist()
        schema.append(schema_entry)
    return schema
Example #2
0
def to_mml(mml_json, table, generator):
    """Returns a CREATE GENERATOR MML statement which will create a generator
    specified by mml_json.

    Parameters
    ----------
    mml_json
        A json representation of the generator. Must validate against
        MML_SCHEMA
    table : str
        The name of the input table to the generator
    generator : str
        The name of the generator to create
    """
    # Fail fast on malformed input before composing any SQL.
    jsonschema.validate(mml_json, MML_SCHEMA)
    # Columns marked IGNORE are left out of the generator schema.
    column_clauses = []
    for col, v in mml_json['columns'].items():
        if v['stattype'] != 'IGNORE':
            column_clauses.append(
                "%s %s" % (bql_quote_name(col), v['stattype']))
    schema_phrase = ','.join(column_clauses)
    subsample = mml_json.get('subsample', None)
    subsample_phrase = 'SUBSAMPLE(%d),' % subsample if subsample else ''
    template = Template('CREATE GENERATOR $gen FOR $table '
                        'USING $metamodel($subsample $schema_phrase);')
    return template.substitute(
        gen=bql_quote_name(generator),
        table=bql_quote_name(table),
        # TODO(asilvers): This can't be quoted, but should be
        # restricted to a known-good set of metamodels.
        metamodel=mml_json['metamodel'],
        subsample=subsample_phrase,
        schema_phrase=schema_phrase)
Example #3
0
def to_mml(mml_json, table, generator):
    """Returns a CREATE GENERATOR MML statement which will create a generator
    specified by mml_json.

    Parameters
    ----------
    mml_json
        A json representation of the generator. Must validate against
        MML_SCHEMA
    table : str
        The name of the input table to the generator
    generator : str
        The name of the generator to create
    """
    # Fail fast on malformed input before composing any SQL.
    jsonschema.validate(mml_json, MML_SCHEMA)
    # Columns marked IGNORE are omitted from the generator schema entirely.
    schema_phrase = ','.join([
        "%s %s" % (bql_quote_name(col), v['stattype'])
        for col, v in mml_json['columns'].items() if v['stattype'] != 'IGNORE'
    ])
    subsample = mml_json.get('subsample', None)
    return (Template(
        'CREATE GENERATOR $gen FOR $table '
        'USING $metamodel($subsample $schema_phrase);'
    ).substitute(
        gen=bql_quote_name(generator),
        table=bql_quote_name(table),
        # TODO(asilvers): This can't be quoted, but should be
        # restricted to a known-good set of metamodels.
        metamodel=mml_json['metamodel'],
        # SUBSAMPLE clause only when a subsample size was requested.
        subsample='SUBSAMPLE(%d),' % subsample if subsample else '',
        schema_phrase=schema_phrase))
def simulate_from_rowid(bdb, table, column, rowid, limit=1000):
    """Simulate `limit` values of `column` conditioned on the given rowid."""
    quoted_table = bayeslite.bql_quote_name(table)
    quoted_column = bayeslite.bql_quote_name(str(column))
    bql = '''
        SIMULATE %s FROM %s GIVEN rowid=? LIMIT ?
    ''' % (quoted_column, quoted_table)
    cursor = bdb.execute(bql, bindings=(rowid, limit))
    return [float(row[0]) for row in cursor]
Example #5
0
def bayesdb_population(mkbdb,
                       tab,
                       pop,
                       gen,
                       table_schema,
                       data,
                       columns,
                       backend_name='cgpm'):
    """Create a population and generator over `tab`, then yield
    (bdb, population_id, generator_id)."""
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        # Quote every identifier before it is spliced into BQL.
        qt, qp, qg, qmm = [
            bql_quote_name(name) for name in (tab, pop, gen, backend_name)]
        create_pop = 'CREATE POPULATION %s FOR %s(%s)' % (
            qp, qt, ';'.join(columns))
        bdb.execute(create_pop)
        create_gen = 'CREATE GENERATOR %s FOR %s USING %s;' % (qg, qp, qmm)
        bdb.execute(create_gen)
        population_id = core.bayesdb_get_population(bdb, pop)
        generator_id = core.bayesdb_get_generator(bdb, population_id, gen)
        yield bdb, population_id, generator_id
Example #6
0
 def interpret_bql(self, query_string):
     '''Replace %t and %g as appropriate.'''
     # Quote the table name first, then the generator name, matching the
     # evaluation order of the original nested form.
     quoted_table = bayeslite.bql_quote_name(self.name)
     quoted_generator = bayeslite.bql_quote_name(self.generator_name)
     # %g tokens are substituted first, then %t tokens; each token must
     # appear at start-of-string or after whitespace.
     with_generator = re.sub(
         r'(^|(?<=\s))%g\b', quoted_generator, query_string)
     return re.sub(r'(^|(?<=\s))%t\b', quoted_table, with_generator)
Example #7
0
 def interpret_query(self, query_string):
   '''Replace %t and %g as appropriate.'''
   # %g is substituted first (inner sub), then %t; each token must be at
   # start-of-string or preceded by whitespace to be replaced.
   return re.sub(r'(^|(?<=\s))%t\b',
                 bayeslite.bql_quote_name(self.name),
                 re.sub(r'(^|(?<=\s))%g\b',
                        bayeslite.bql_quote_name(self.generator_name),
                        query_string))
def simulate_from_rowid(bdb, table, column, rowid, limit=1000):
    # Simulate `limit` draws of `column` conditioned on row `rowid`,
    # returning them as a list of floats.
    qt = bayeslite.bql_quote_name(table)
    qc = bayeslite.bql_quote_name(str(column))
    cursor = bdb.execute('''
        SIMULATE %s FROM %s GIVEN rowid=? LIMIT ?
    ''' % (qc, qt),
                         bindings=(rowid, limit))
    return [float(x[0]) for x in cursor]
Example #9
0
def bayesdb_generator(mkbdb, tab, gen, table_schema, data, columns, metamodel_name="crosscat"):
    """Create a generator over `tab` and yield (bdb, generator_id)."""
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        # Quote all identifiers before splicing them into the statement.
        quoted_table = bql_quote_name(tab)
        quoted_gen = bql_quote_name(gen)
        quoted_mm = bql_quote_name(metamodel_name)
        stmt = "CREATE GENERATOR %s FOR %s USING %s(%s)" % (
            quoted_gen, quoted_table, quoted_mm, ",".join(columns))
        bdb.execute(stmt)
        yield bdb, core.bayesdb_get_generator(bdb, gen)
Example #10
0
def bayesdb_generator(mkbdb, tab, gen, table_schema, data, columns,
        metamodel_name='crosscat'):
    """Context helper: populate the table via the given callables, create a
    generator over it, and yield (bdb, generator_id)."""
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        # Quote all identifiers before splicing them into the statement.
        qt = bql_quote_name(tab)
        qg = bql_quote_name(gen)
        qmm = bql_quote_name(metamodel_name)
        bdb.execute('CREATE GENERATOR %s FOR %s USING %s(%s)' %
            (qg, qt, qmm, ','.join(columns)))
        yield bdb, core.bayesdb_get_generator(bdb, gen)
Example #11
0
def bayesdb_generator_cell_value(bdb, generator_id, colno, rowid):
    """Return the table cell value at (rowid, colno) for a generator.

    Fails with an AssertionError if no row with that rowid exists.
    """
    table_name = core.bayesdb_generator_table(bdb, generator_id)
    qt = bql_quote_name(table_name)
    colname = core.bayesdb_generator_column_name(bdb, generator_id, colno)
    qcn = bql_quote_name(colname)
    sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qcn, qt)
    cursor = bdb.sql_execute(sql, (rowid, ))
    try:
        # next() builtin instead of the Python-2-only cursor.next();
        # identical behavior under Python 2, portable to Python 3.
        row = next(cursor)
    except StopIteration:
        assert False, 'Missing row at %d!' % (rowid, )
    else:
        return row[0]
Example #12
0
def bayesdb_generator_cell_value(bdb, generator_id, colno, rowid):
    """Return the table cell value at (rowid, colno) for a generator.

    Fails with an AssertionError if no row with that rowid exists.
    """
    table_name = core.bayesdb_generator_table(bdb, generator_id)
    qt = bql_quote_name(table_name)
    colname = core.bayesdb_generator_column_name(bdb, generator_id, colno)
    qcn = bql_quote_name(colname)
    sql = 'SELECT %s FROM %s WHERE _rowid_ = ?' % (qcn, qt)
    cursor = bdb.sql_execute(sql, (rowid,))
    try:
        # Use the next() builtin rather than cursor.next(), which is
        # Python-2-only; behavior is identical under Python 2.
        row = next(cursor)
    except StopIteration:
        assert False, 'Missing row at %d!' % (rowid,)
    else:
        return row[0]
Example #13
0
def _retest_example(bdb, exname):
    """Re-run analysis sanity checks against a previously created example."""
    example = examples[exname]
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = example
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    # The table, generator, and two models must already exist.
    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, g)
    gid = core.bayesdb_get_generator(bdb, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)
    # One joint iteration, then one per-model iteration each.
    for bql in ("ANALYZE %s FOR 1 ITERATION WAIT" % (qg,),
                "ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT" % (qg,),
                "ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT" % (qg,)):
        bdb.execute(bql)
Example #14
0
def test_t1_column_value_probability(colno, rowid):
    """Exercise bql_column_value_probability via bqlfn and via raw SQL."""
    with analyzed_bayesdb_generator(t1(), 1, 1) as (bdb, generator_id):
        # rowid 0 is a sentinel for "the last row of the table".
        if rowid == 0:
            rowid = bayesdb_maxrowid(bdb, generator_id)
        value = bayesdb_generator_cell_value(bdb, generator_id, colno, rowid)
        bqlfn.bql_column_value_probability(
            bdb, generator_id, None, colno, value)
        table_name = core.bayesdb_generator_table(bdb, generator_id)
        colname = core.bayesdb_generator_column_name(bdb, generator_id, colno)
        quoted_table = bql_quote_name(table_name)
        quoted_col = bql_quote_name(colname)
        sql = '''
            select bql_column_value_probability(?, NULL, ?,
                (select %s from %s where rowid = ?))
        ''' % (quoted_col, quoted_table)
        bdb.sql_execute(sql, (generator_id, colno, rowid)).fetchall()
Example #15
0
def test_t1_column_value_probability(colno, rowid):
    """Exercise bql_column_value_probability via bqlfn and via raw SQL."""
    with analyzed_bayesdb_generator(t1(), 1, 1) as (bdb, generator_id):
        # rowid 0 is a sentinel for "the last row of the table".
        if rowid == 0: rowid = bayesdb_maxrowid(bdb, generator_id)
        value = bayesdb_generator_cell_value(bdb, generator_id, colno, rowid)
        bqlfn.bql_column_value_probability(bdb, generator_id, None, colno,
                                           value)
        table_name = core.bayesdb_generator_table(bdb, generator_id)
        colname = core.bayesdb_generator_column_name(bdb, generator_id, colno)
        qt = bql_quote_name(table_name)
        qc = bql_quote_name(colname)
        # Same computation again, this time through the SQL function.
        sql = '''
            select bql_column_value_probability(?, NULL, ?,
                (select %s from %s where rowid = ?))
        ''' % (qc, qt)
        bdb.sql_execute(sql, (generator_id, colno, rowid)).fetchall()
Example #16
0
 def compile_nominal_case(variable, categories):
     """Compile a nominal variable's coefficients into a SQL CASE expression.

     `categories` maps category value -> coefficient; any unlisted
     category falls through to NULL.
     """
     # dict.items() instead of the Python-2-only iteritems(); iteration
     # behavior is identical under Python 2 and also works on Python 3.
     cases = str.join('\n', [
         '\t\t\tWHEN \'%s\' THEN %1.4f' % (category, coefficient)
         for category, coefficient in categories.items()
     ])
     return 'CASE %s\n%s\n\t\t\tELSE NULL\n\t\tEND'\
         % (bql_quote_name(variable), cases,)
Example #17
0
def do_test(bdb, t, df, index=None):
    """Exercise bayesdb_read_pandas_df error and success paths on table t."""
    qt = bql_quote_name(t)
    count_sql = 'select count(*) from %s' % (qt, )
    assert not bayesdb_has_table(bdb, t)

    # Without create=True, reading into a missing table must fail.
    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb, t, df, index=index)

    bayesdb_read_pandas_df(
        bdb, t, df, create=True, ifnotexists=False, index=index)
    assert len(df.index) == bdb.execute(count_sql).fetchvalue()

    # Creating again without ifnotexists must fail and leave data intact.
    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(
            bdb, t, df, create=True, ifnotexists=False, index=index)
    assert 4 == bdb.execute(count_sql).fetchvalue()

    # With ifnotexists, the re-insert conflicts on existing rows.
    with pytest.raises(apsw.ConstraintError):
        bayesdb_read_pandas_df(
            bdb, t, df, create=True, ifnotexists=True, index=index)
    assert 4 == bdb.execute(count_sql).fetchvalue()
Example #18
0
def _retest_example(bdb, exname):
    """Re-run analysis sanity checks against a previously created example."""
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    # The table, generator, and both models must already exist.
    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, g)
    gid = core.bayesdb_get_generator(bdb, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)
    # One joint iteration, then one per-model iteration each.
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qg, ))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qg, ))
Example #19
0
def quick_similar_rows(self, identify_row_by, nsimilar=10):
  """Explore rows similar to the identified one.

  identify_row_by : dict
      Dictionary of column names to their values. These will be turned into
      a WHERE clause in BQL, and must identify one unique row.
  nsimilar : positive integer
      The number of similar rows to retrieve.
  """
  import hashlib
  # Derive a deterministic temp-table name from the row spec and the
  # analysis status, so repeated calls with the same inputs reuse it.
  # NOTE(review): hashing a str works under Python 2; under Python 3
  # md5 would require bytes — confirm the target runtime.
  table_name = 'tmptbl_' + hashlib.md5('\x00'.join(
      [repr(identify_row_by), str(self.status)])).hexdigest()
  # Column name encodes the identifying values, with non-word chars
  # normalized to underscores.
  column_name = 'similarity_to_' + "__".join(
      re.sub(r'\W', '_', str(val)) for val in identify_row_by.values())
  query_params = []
  query_columns = []
  for k, v in identify_row_by.iteritems():
    query_columns.append('''%s = ? ''' % bayeslite.bql_quote_name(k))
    query_params.append(v)
  query_attrs = ' and '.join(query_columns)

  with self.bdb.savepoint():
    # The WHERE clause must match exactly one row.
    row_exists = self.query('SELECT COUNT(*) FROM %s WHERE %s;' %
                            (self.name, query_attrs))
    if row_exists.ix[0][0] != 1:
      raise BLE(NotImplementedError(
          'identify_row_by found %d rows instead of exactly 1 in %s.' %
          (row_exists.ix[0][0], self.csv_path)))
    # %%g escapes to the literal %g token, which self.query expands to
    # the quoted generator name.
    creation_query = ('''CREATE TEMP TABLE IF NOT EXISTS %s AS ESTIMATE *,
                         SIMILARITY TO (%s) AS %s FROM %%g LIMIT %d;''' %
                      (table_name, query_attrs, column_name, nsimilar))
    self.query(creation_query, query_params)
    result = self.query('''SELECT * FROM %s ORDER BY %s DESC;''' %
                        (table_name, column_name))
  return result
Example #20
0
def bayesdb_population(mkbdb, tab, pop, gen, table_schema, data, columns,
        backend_name='cgpm'):
    """Context helper: create a population and generator over `tab`, then
    yield (bdb, population_id, generator_id)."""
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        # Quote every identifier before it is spliced into BQL.
        qt = bql_quote_name(tab)
        qp = bql_quote_name(pop)
        qg = bql_quote_name(gen)
        qmm = bql_quote_name(backend_name)
        bdb.execute('CREATE POPULATION %s FOR %s(%s)' %
            (qp, qt, ';'.join(columns)))
        bdb.execute('CREATE GENERATOR %s FOR %s USING %s;' %
            (qg, qp, qmm,))
        population_id = core.bayesdb_get_population(bdb, pop)
        generator_id = core.bayesdb_get_generator(bdb, population_id, gen)
        yield bdb, population_id, generator_id
Example #21
0
def cardinality(bdb, table, columns=None):
    """Compute the number of unique values in the columns of a table."""
    qtable = bql_quote_name(table)
    if not columns:
        # Default to every column reported by SQLite's table_info pragma.
        info = bdb.sql_execute('PRAGMA table_info(%s)' % (qtable, ))
        columns = [record[1] for record in info]
    names = []
    counts = []
    for column in columns:
        qcolumn = bql_quote_name(column)
        count_cursor = bdb.sql_execute('''
            SELECT COUNT (DISTINCT %s) FROM %s
        ''' % (qcolumn, qtable))
        names.append(column)
        counts.append(cursor_value(count_cursor))
    return pd.DataFrame({'name': names, 'distinct_count': counts})
Example #22
0
def nullify(bdb, table, value):
    """Replace specified values in a SQL table with ``NULL``.

    Returns the total number of cells changed across all columns.
    """
    qtable = bql_quote_name(table)
    cursor = bdb.sql_execute('pragma table_info(%s)' % (qtable, ))
    columns = [row[1] for row in cursor]
    # The '' sentinel is inlined into the SQL (no ? placeholder), so it
    # must not also be passed as a binding; every other value is bound.
    if value == '\'\'':
        sql = 'UPDATE %s SET {0} = NULL WHERE {0} = \'\'' % (qtable, )
        bindings = ()
    else:
        sql = 'UPDATE %s SET {0} = NULL WHERE {0} = ?' % (qtable, )
        bindings = (value, )
    cells_changed = 0
    for col in columns:
        qcol = bql_quote_name(col)
        # Diff of total change count isolates this statement's effect.
        old_changes = bdb._sqlite3.totalchanges()
        bdb.sql_execute(sql.format(qcol), bindings)
        rows_changed = bdb._sqlite3.totalchanges() - old_changes
        cells_changed += rows_changed
    return cells_changed
Example #23
0
def validate_schema(bdb, table, mml_json):
    """Returns a modified JSON representation of a generator expression,
    changing the stattypes of any columns which cause issues during analysis
    to IGNORE.

    This creates a single model for each column and analyzes it for a single
    iteration. If this succeeds the column and stattype are deemed good. If it
    fails the stattype is changed to IGNORE and the existing stattype is placed
    into that column's 'guessed' field, overwriting it if it exists.

    Parameters
    ----------
    mml_json
        A json representation of the generator. Must validate against
        MML_SCHEMA
    """
    bad_cols = []
    for col, typ in mml_json['columns'].items():
        # If the column is already ignored there's nothing to check
        if typ['stattype'] == 'IGNORE':
            continue
        # Smoke-test this one column in isolation: a schema containing
        # only it, one model, one iteration.
        one_col_json = copy.deepcopy(mml_json)
        one_col_json['columns'] = {col: typ}
        # Create a temp generator
        gen_name = uuid.uuid4().hex
        try:
            bdb.execute(to_mml(one_col_json, table, gen_name))
            bdb.execute('INITIALIZE 1 MODEL FOR %s'
                        % (bql_quote_name(gen_name),))
            bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT'
                        % (bql_quote_name(gen_name),))
        # NOTE(review): only AssertionError marks a column bad; other
        # exception types propagate.  The DROP in finally also runs when
        # CREATE itself failed — confirm that path cannot mask errors.
        except AssertionError:
            bad_cols.append(col)
        finally:
            # Drop our temp generator
            bdb.execute('DROP GENERATOR %s' % bql_quote_name(gen_name))
    modified_schema = copy.deepcopy(mml_json)
    # TODO(asilvers): Should we also return a summary of the modifications?
    for col in bad_cols:
        # Preserve the failing stattype in 'guessed' before ignoring it.
        modified_schema['columns'][col]['guessed'] = (
            modified_schema['columns'][col]['stattype'])
        modified_schema['columns'][col]['stattype'] = 'IGNORE'
        modified_schema['columns'][col]['reason'] = 'Caused ANALYZE to error'
    jsonschema.validate(modified_schema, MML_SCHEMA)
    return modified_schema
Example #24
0
def validate_schema(bdb, table, mml_json):
    """Returns a modified JSON representation of a generator expression,
    changing the stattypes of any columns which cause issues during analysis
    to IGNORE.

    This creates a single model for each column and analyzes it for a single
    iteration. If this succeeds the column and stattype are deemed good. If it
    fails the stattype is changed to IGNORE and the existing stattype is placed
    into that column's 'guessed' field, overwriting it if it exists.

    Parameters
    ----------
    mml_json
        A json representation of the generator. Must validate against
        MML_SCHEMA
    """
    bad_cols = []
    for col, typ in mml_json['columns'].items():
        # If the column is already ignored there's nothing to check
        if typ['stattype'] == 'IGNORE':
            continue
        # Smoke-test this one column in isolation: a schema containing
        # only it, one model, one iteration.
        one_col_json = copy.deepcopy(mml_json)
        one_col_json['columns'] = {col: typ}
        # Create a temp generator
        gen_name = uuid.uuid4().hex
        try:
            bdb.execute(to_mml(one_col_json, table, gen_name))
            bdb.execute('INITIALIZE 1 MODEL FOR %s' %
                        (bql_quote_name(gen_name), ))
            bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' %
                        (bql_quote_name(gen_name), ))
        # NOTE(review): only AssertionError marks a column bad; other
        # exception types propagate.  The DROP in finally also runs when
        # CREATE itself failed — confirm that path cannot mask errors.
        except AssertionError:
            bad_cols.append(col)
        finally:
            # Drop our temp generator
            bdb.execute('DROP GENERATOR %s' % bql_quote_name(gen_name))
    modified_schema = copy.deepcopy(mml_json)
    # TODO(asilvers): Should we also return a summary of the modifications?
    for col in bad_cols:
        # Preserve the failing stattype in 'guessed' before ignoring it.
        modified_schema['columns'][col]['guessed'] = (
            modified_schema['columns'][col]['stattype'])
        modified_schema['columns'][col]['stattype'] = 'IGNORE'
        modified_schema['columns'][col]['reason'] = 'Caused ANALYZE to error'
    jsonschema.validate(modified_schema, MML_SCHEMA)
    return modified_schema
Example #25
0
    def _cmd_table(self, args):
        '''Returns a table of the PRAGMA schema of <table>.

        Usage: .table <table>
        '''
        # The whole argument string is the table name.
        quoted = bql_quote_name(args)
        pragma = 'PRAGMA table_info(%s)' % (quoted, )
        cursor = self._bdb.sql_execute(pragma)
        return utils_bql.cursor_to_df(cursor)
Example #26
0
def test_t1_column_value_probability(colno, rowid):
    """Exercise bql_column_value_probability for a population via bqlfn and
    via raw SQL."""
    with analyzed_bayesdb_population(t1(), 1, 1) \
            as (bdb, population_id, generator_id):
        # rowid 0 is a sentinel for "the last row of the table".
        if rowid == 0:
            rowid = bayesdb_maxrowid(bdb, population_id)
        value = core.bayesdb_population_cell_value(
            bdb, population_id, rowid, colno)
        bqlfn.bql_column_value_probability(
            bdb, population_id, None, None, colno, value)
        table_name = core.bayesdb_population_table(bdb, population_id)
        var = core.bayesdb_variable_name(bdb, population_id, None, colno)
        quoted_table = bql_quote_name(table_name)
        quoted_var = bql_quote_name(var)
        sql = '''
            select bql_column_value_probability(?, NULL, NULL, ?,
                (select %s from %s where rowid = ?))
        ''' % (quoted_var, quoted_table)
        bdb.sql_execute(sql, (population_id, colno, rowid)).fetchall()
Example #27
0
def test_t1_column_value_probability(colno, rowid):
    """Exercise bql_column_value_probability for a population via bqlfn and
    via raw SQL."""
    with analyzed_bayesdb_population(t1(), 1, 1) \
            as (bdb, population_id, generator_id):
        # rowid 0 is a sentinel for "the last row of the table".
        if rowid == 0:
            rowid = bayesdb_maxrowid(bdb, population_id)
        value = core.bayesdb_population_cell_value(bdb, population_id, rowid,
                                                   colno)
        bqlfn.bql_column_value_probability(bdb, population_id, None, None,
                                           colno, value)
        table_name = core.bayesdb_population_table(bdb, population_id)
        var = core.bayesdb_variable_name(bdb, population_id, None, colno)
        qt = bql_quote_name(table_name)
        qv = bql_quote_name(var)
        # Same computation again, this time through the SQL function.
        sql = '''
            select bql_column_value_probability(?, NULL, NULL, ?,
                (select %s from %s where rowid = ?))
        ''' % (qv, qt)
        bdb.sql_execute(sql, (population_id, colno, rowid)).fetchall()
Example #28
0
def regression_to_sql(df, table=None):
    """Convert df output of BQL REGRESS to a SQL SELECT query.

    `df` must have exactly two columns: variable names (including
    'intercept' and dummy-coded nominals named '<var>_dum_<category>')
    and their fitted coefficients.  If `table` is given, a FROM clause
    is appended.

    Raises
    ------
    ValueError
        If `df` does not have exactly two columns.
    """
    if df.shape[1] != 2:
        raise ValueError('Exactly two columns required.')
    variables = df[df.columns[0]]
    coefficients = df[df.columns[1]]

    intercept = coefficients[variables == 'intercept'].iloc[0]

    numericals = {
        var: coefficients[variables == var].iloc[0]
        for var in variables if '_dum_' not in var and var != 'intercept'
    }

    nominals = set([c.split('_dum_')[0] for c in variables if '_dum_' in c])
    # Match dummy columns by prefix, not substring, so that a variable
    # whose name is a suffix of another (e.g. 'age' vs 'wage') cannot
    # pick up the other variable's dummy-coded columns.
    categories = {
        var: {
            v.split('_dum_')[1]: coefficients[variables == v].iloc[0]
            for v in variables if v.startswith('%s_dum_' % (var, ))
        }
        for var in nominals
    }

    def compile_numerical(variable, coefficient):
        # One linear term: coefficient * "variable".
        return '%1.4f  *  %s' % (coefficient, bql_quote_name(variable))

    def compile_nominal_case(variable, categories):
        # One CASE expression mapping each category to its coefficient;
        # unseen categories yield NULL.  .items() works on Python 2 and 3.
        cases = str.join('\n', [
            '\t\t\tWHEN \'%s\' THEN %1.4f' % (category, coefficient)
            for category, coefficient in categories.items()
        ])
        return 'CASE %s\n%s\n\t\t\tELSE NULL\n\t\tEND'\
            % (bql_quote_name(variable), cases,)

    # Write query to the buffer.
    out = StringIO()
    # SELECT
    out.write('SELECT\n\t\t')
    # Intercept.
    out.write('%1.4f\n\t' % (intercept, ))
    # Numerical variables.
    out.writelines([
        '+\t%s\n\t' % (compile_numerical(var, coef))
        for var, coef in numericals.items()
    ])
    # Nominal variables.
    for i, variable in enumerate(categories):
        out.write('+\t%s' %
                  compile_nominal_case(variable, categories[variable]))
        out.write('\n')
        if i < len(categories) - 1:
            out.write('\t')
    # FROM table.
    if table:
        out.write('\rFROM %s' % (bql_quote_name(table), ))
    return out.getvalue()
def run_heatmap(bdb, location, gen, *args, **kwargs):
    # Render a pairwise dependence-probability heatmap for generator `gen`
    # and save it to `location`.  Extra args/kwargs are forwarded to
    # bdbcontrib.plot_utils.heatmap.
    plt.figure()
    qg = bayeslite.bql_quote_name(gen)
    df = cursor_to_df(bdb.execute('''
        estimate dependence probability from pairwise columns of %s
    ''' % (qg,)))
    bdbcontrib.plot_utils.heatmap(df, *args, **kwargs)
    plt.show()
    plt.savefig(location)
    print 'wrote heatmap to %r' % (location,)
Example #30
0
def guess_types(bdb, table):
    """Guesses stattypes of a given table.

    Returns a dictionary from column names to a tuple of
    (guessed stattype, reason).

    You will most often want to pass this straight through to to_json, but this
    form is a bit easier to programatically manipulate, so it provides an easy
    place to hook in and make adjustments before serializing to json.
    """
    types = {}
    for col in _column_names(bdb, table):
        sql = 'SELECT DISTINCT %s FROM %s' % (
            bql_quote_name(col), bql_quote_name(table))
        cursor = bdb.sql_execute(sql)
        # TODO(asilvers): We don't necessarily need to read all of these in,
        # and for some datasets this may be prohibitive. But it's fine for now.
        # None and '' are treated as missing and excluded from the guess.
        observed = {row[0] for row in cursor if row[0] not in [None, '']}
        types[col] = _type_given_vals(observed)
    return types
def run_heatmap(bdb, location, gen, *args, **kwargs):
    # Render a pairwise dependence-probability heatmap for generator `gen`
    # and save it to `location`.  Extra args/kwargs are forwarded to
    # bdbcontrib.plot_utils.heatmap.
    plt.figure()
    qg = bayeslite.bql_quote_name(gen)
    df = cursor_to_df(
        bdb.execute('''
        estimate dependence probability from pairwise columns of %s
    ''' % (qg, )))
    bdbcontrib.plot_utils.heatmap(df, *args, **kwargs)
    plt.show()
    plt.savefig(location)
    print 'wrote heatmap to %r' % (location, )
Example #32
0
def guess_types(bdb, table):
    """Guesses stattypes of a given table.

    Returns a dictionary from column names to a tuple of
    (guessed stattype, reason).

    You will most often want to pass this straight through to to_json, but this
    form is a bit easier to programatically manipulate, so it provides an easy
    place to hook in and make adjustments before serializing to json.
    """
    types = {}
    for col in _column_names(bdb, table):
        cursor = bdb.sql_execute(
            'SELECT DISTINCT %s FROM %s'
            % (bql_quote_name(col), bql_quote_name(table)))
        # TODO(asilvers): We don't necessarily need to read all of these in,
        # and for some datasets this may be prohibitive. But it's fine for now.
        # None and '' are treated as missing and excluded from the guess.
        vals = {row[0] for row in cursor if row[0] not in [None, '']}
        types[col] = _type_given_vals(vals)
    return types
Example #33
0
def bayesdb_generator(mkbdb, tab, gen, table_schema, data, columns,
        metamodel_name='crosscat'):
    """Context helper: create a generator over `tab` and yield
    (bdb, generator_id), looking the id up in the bayesdb_generator catalog.
    """
    with mkbdb as bdb:
        table_schema(bdb)
        data(bdb)
        qt = bql_quote_name(tab)
        qg = bql_quote_name(gen)
        qmm = bql_quote_name(metamodel_name)
        bdb.execute('CREATE GENERATOR %s FOR %s USING %s(%s)' %
            (qg, qt, qmm, ','.join(columns)))
        sql = 'SELECT id FROM bayesdb_generator WHERE name = ?'
        cursor = bdb.sql_execute(sql, (gen,))
        try:
            # next() builtin instead of the Python-2-only cursor.next().
            row = next(cursor)
        except StopIteration:
            assert False, 'Generator didn\'t make it!'
        else:
            assert len(row) == 1
            assert isinstance(row[0], int)
            generator_id = row[0]
            yield bdb, generator_id
def prepare_bdb(bdb, samples, table):
    """Load `samples` into `table` and build an analyzed loom population.

    Creates the table from a pandas DataFrame, guesses stattypes, creates
    a loom generator, and analyzes 4 models for 100 iterations.
    """
    qt = bayeslite.bql_quote_name(table)
    dataframe = pd.DataFrame(data=samples)
    # Write the data under `table` (not a hard-coded name) so the
    # population/generator statements below refer to the same table.
    read_pandas.bayesdb_read_pandas_df(bdb, table, dataframe, create=True)

    bdb.execute('''
        CREATE POPULATION FOR %s WITH SCHEMA (
            GUESS STATTYPES OF (*)
        )
    ''' % (qt,))
    bdb.execute('CREATE GENERATOR FOR %s USING loom;' % (qt,))
    bdb.execute('INITIALIZE 4 MODELS FOR %s;' % (qt,))
    bdb.execute('ANALYZE %s FOR 100 ITERATIONS;' % (qt,))
def prepare_bdb(bdb, samples, table):
    """Load `samples` into `table` and build an analyzed loom population.

    Creates the table from a pandas DataFrame, guesses stattypes, creates
    a loom generator, and analyzes 4 models for 100 iterations.
    """
    qt = bayeslite.bql_quote_name(table)
    dataframe = pd.DataFrame(data=samples)
    # Write the data under `table` (not a hard-coded name) so the
    # population/generator statements below refer to the same table.
    read_pandas.bayesdb_read_pandas_df(bdb, table, dataframe, create=True)

    bdb.execute('''
        CREATE POPULATION FOR %s WITH SCHEMA (
            GUESS STATTYPES OF (*)
        )
    ''' % (qt, ))
    bdb.execute('CREATE GENERATOR FOR %s USING loom;' % (qt, ))
    bdb.execute('INITIALIZE 4 MODELS FOR %s;' % (qt, ))
    bdb.execute('ANALYZE %s FOR 100 ITERATIONS;' % (qt, ))
Example #36
0
def subsample_table_columns(bdb, table, new_table, limit, keep, drop, seed):
    """Return a subsample of the columns in the table.

    Keeps the `keep` columns, excludes the `drop` columns, and fills the
    remainder (up to `limit` columns total) with a seeded random sample of
    the other columns, materializing the projection into `new_table`.

    Raises
    ------
    ValueError
        If `table` is missing, `new_table` exists, a keep/drop column is
        unknown, a column is both kept and dropped, or `limit` is smaller
        than the number of kept columns.
    """
    if not bayesdb_has_table(bdb, table):
        raise ValueError('No such table: %s' % (table, ))
    if bayesdb_has_table(bdb, new_table):
        raise ValueError('Table already exists: %s' % (new_table, ))
    # Materialize as lists: map() is lazy on Python 3, and these results
    # are concatenated and searched repeatedly below.
    keep = list(map(casefold, keep))
    drop = list(map(casefold, drop))
    skip = keep + drop
    unknown = [
        column for column in skip
        if not bayesdb_table_has_column(bdb, table, column)
    ]
    if unknown:
        raise ValueError('No such columns: %s' % (unknown, ))
    overlap = [column for column in drop if column in keep]
    if overlap:
        raise ValueError('Cannot both drop and keep columns: %s' % (overlap, ))
    num_sample = limit - len(keep)
    if num_sample < 0:
        raise ValueError('Must sample at least as many columns to keep.')
    subselect_columns = [
        column for column in bayesdb_table_column_names(bdb, table)
        if casefold(column) not in skip
    ]
    # Seeded RNG makes the subsample reproducible.
    rng = np.random.RandomState(seed)
    subsample_columns = rng.choice(subselect_columns,
                                   replace=False,
                                   size=min(len(subselect_columns),
                                            num_sample))
    qt = bql_quote_name(table)
    qnt = bql_quote_name(new_table)
    qc = ','.join(map(bql_quote_name, itertools.chain(keep,
                                                      subsample_columns)))
    cursor = bdb.execute('''
        CREATE TABLE %s AS SELECT %s FROM %s
    ''' % (qnt, qc, qt))
    return cursor_to_df(cursor)
Example #37
0
  def query(self, query_string, *bindings):
    '''Basic querying without session capture or reporting.

    help_for_query'''
    self.check_representation()
    # Expand %t / %g tokens (at start-of-string or after whitespace) to
    # the quoted table and generator names.
    query_string = re.sub(r'(^|(?<=\s))%t\b',
                          bayeslite.bql_quote_name(self.name),
                          re.sub(r'(^|(?<=\s))%g\b',
                                 bayeslite.bql_quote_name(self.generator_name),
                                 query_string))
    self.logger.info("BQL [%s] [%r]", query_string, bindings)
    with self.bdb.savepoint():
      try:
        res = self.bdb.execute(query_string, bindings)
        assert res is not None and res.description is not None
        self.logger.debug("BQL [%s] [%r] has returned a cursor." %
                          (query_string, bindings))
        df = bdbcontrib.cursor_to_df(res)
        self.logger.debug("BQL [%s] [%r] has created a dataframe." %
                            (query_string, bindings))
        return df
      except:
        # NOTE(review): deliberately best-effort — any failure is logged
        # with traceback and the method falls through, returning None.
        self.logger.exception("")
Example #38
0
 def _cmd_interactive_heatmap(self, query, sql=None, **kwargs):
     '''Run `query` (SQL if `sql` is truthy, else BQL) and render the last
     three result columns (name0, name1, value) as an interactive heatmap.

     Optional kwargs `table`, `label0`, `label1` name a lookup table and
     two of its columns used to relabel the heatmap axes.
     '''
     c = self._bdb.sql_execute(query) if sql else self._bdb.execute(query)
     df = utils_bql.cursor_to_df(c)
     # XXX Take the last three columns of the dataframe. This behavior is
     # intended to allow BQL PAIRWISE queries to be passed through directly
     # to %bql .interactive_heatmap. Unfortunately, PAIRWISE will return
     # four columns, where the first column is the population_id, and the
     # second, third, and fourth are name0, name1, and value, respectively.
     df = df.iloc[:, -3:]
     table = kwargs.get('table', None)
     label0 = kwargs.get('label0', None)
     label1 = kwargs.get('label1', None)
     if table and label0 and label1:
         # Build a label0 -> label1 mapping from the lookup table and
         # apply it to both name columns of the heatmap frame.
         qt = bql_quote_name(table)
         qc0 = bql_quote_name(label0)
         qc1 = bql_quote_name(label1)
         c = self._bdb.sql_execute('''
             SELECT %s, %s FROM %s
         ''' % (qc0, qc1, qt))
         df_lookup = utils_bql.cursor_to_df(c)
         lookup = dict(zip(df_lookup[label0], df_lookup[label1]))
         df = df.replace({df.columns[0]: lookup, df.columns[1]: lookup})
     return jsviz.interactive_heatmap(df)
Example #39
0
def analyzed_bayesdb_generator(mkbdb, nmodels, nsteps, max_seconds=None):
    """Yield (bdb, generator_id) after initializing and analyzing models.

    Enters the `mkbdb` context, initializes `nmodels` models for the
    generator, and analyzes them either for `max_seconds` seconds (when
    given) or for `nsteps` iterations.
    """
    with mkbdb as (bdb, generator_id):
        name = core.bayesdb_generator_name(bdb, generator_id)
        quoted = bql_quote_name(name)
        bdb.execute('INITIALIZE %d MODELS FOR %s' % (nmodels, quoted))
        # XXX Syntax currently doesn't support both nsteps and
        # max_seconds, so pick one duration unit.
        duration = ('%d SECONDS' % (max_seconds,) if max_seconds
                    else '%d ITERATIONS' % (nsteps,))
        bdb.execute('ANALYZE %s FOR %s WAIT' % (quoted, duration))
        yield bdb, generator_id
Example #40
0
def analyzed_bayesdb_population(mkbdb, nmodels, nsteps, max_seconds=None):
    """Yield (bdb, population_id, generator_id) after model analysis.

    Enters the `mkbdb` context, initializes `nmodels` models for the
    population's generator, and analyzes them either for `max_seconds`
    seconds (when given) or for `nsteps` iterations.
    """
    with mkbdb as (bdb, population_id, generator_id):
        name = core.bayesdb_generator_name(bdb, generator_id)
        quoted = bql_quote_name(name)
        bdb.execute('INITIALIZE %d MODELS FOR %s' % (nmodels, quoted))
        # XXX Syntax currently doesn't support both nsteps and
        # max_seconds, so pick one duration unit.
        duration = ('%d SECONDS' % (max_seconds, ) if max_seconds
                    else '%d ITERATIONS' % (nsteps, ))
        bdb.execute('ANALYZE %s FOR %s' % (quoted, duration))
        yield bdb, population_id, generator_id
Example #41
0
def do_test(bdb, t, df, index=None):
    """Exercise bayesdb_read_pandas_df's create/ifnotexists contract.

    NOTE(review): the trailing assertions hard-code a row count of 4, so
    callers appear to pass a 4-row dataframe -- confirm at call sites.
    """
    count_query = 'select count(*) from %s' % (bql_quote_name(t),)
    assert not bayesdb_has_table(bdb, t)

    # Reading into a nonexistent table without create=True must fail.
    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb, t, df, index=index)

    # First read creates the table and loads every row of df.
    bayesdb_read_pandas_df(
        bdb, t, df, create=True, ifnotexists=False, index=index)
    assert bdb.execute(count_query).fetchvalue() == len(df.index)

    # A second CREATE without ifnotexists must fail, leaving data intact.
    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(
            bdb, t, df, create=True, ifnotexists=False, index=index)
    assert bdb.execute(count_query).fetchvalue() == 4

    # With ifnotexists=True the CREATE is skipped, but re-inserting the
    # same rows violates a uniqueness constraint; data stays intact.
    with pytest.raises(apsw.ConstraintError):
        bayesdb_read_pandas_df(
            bdb, t, df, create=True, ifnotexists=True, index=index)
    assert bdb.execute(count_query).fetchvalue() == 4
Example #42
0
def _retest_example(bdb, exname):
    """Re-run analysis against a previously set-up example `exname`.

    Assumes the example's table, population, generator, and models 0/1
    already exist in `bdb`; registers the backend and re-analyzes.
    """
    (mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
     cleanup) = examples[exname]
    qg = bql_quote_name(g)

    metamodel = mm()
    # Register the instance we just constructed.  (Previously a second,
    # unused instance was created by calling mm() again here.)
    bayeslite.bayesdb_register_backend(bdb, metamodel)
    p_id = core.bayesdb_get_population(bdb, p)

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)

    bdb.execute('ANALYZE %s FOR 1 ITERATION' % (qg, ))
    try:
        # Test analyzing individual models.
        bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION' % (qg, ))
        bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION' % (qg, ))
    except bayeslite.BQLError:
        # loom does not allow model numbers to be specified in analyze models
        assert exname == 'loom'
Example #43
0
def _retest_example(bdb, exname):
    """Re-run analysis against a previously set-up example `exname`.

    Assumes the example's table, population, generator, and models 0/1
    already exist in `bdb`; registers the backend and re-analyzes.
    """
    (be, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qg = bql_quote_name(g)

    backend = be()
    bayeslite.bayesdb_register_backend(bdb, backend)
    p_id = core.bayesdb_get_population(bdb, p)

    assert core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert core.bayesdb_generator_has_model(bdb, gid, 0)
    assert core.bayesdb_generator_has_model(bdb, gid, 1)

    bdb.execute('ANALYZE %s FOR 1 ITERATION' % (qg,))
    try:
        # Test analyzing individual models.
        bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION' % (qg,))
        bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION' % (qg,))
    except bayeslite.BQLError:
        # Previously `except bayeslite.BQLError, e:` (Python-2-only
        # syntax, and `e` was unused).
        # loom does not allow model numbers to be specified in analyze models
        assert exname == 'loom'
Example #44
0
def quick_similar_rows(self, identify_row_by, nsimilar=10):
    """Explore rows similar to the identified one.

    Parameters
    ----------
    identify_row_by : dict
        Dictionary of column names to their values.  These are turned
        into a conjunctive WHERE clause in BQL and must identify exactly
        one row of the population table.
    nsimilar : positive integer
        The number of similar rows to retrieve.

    Returns
    -------
    The result of `self.query` on the ordered similarity query --
    presumably a pandas DataFrame (confirm against the query wrapper).

    Raises
    ------
    BLE wrapping NotImplementedError when `identify_row_by` does not
    match exactly one row.
    """
    import hashlib
    # Temp-table name is derived from the lookup spec and self.status so
    # repeated calls with the same arguments reuse the same table
    # (CREATE ... IF NOT EXISTS below).
    table_name = 'tmptbl_' + hashlib.md5('\x00'.join(
        [repr(identify_row_by), str(self.status)])).hexdigest()
    # Column name encodes the identifying values, sanitized to word chars.
    column_name = 'similarity_to_' + "__".join(
        re.sub(r'\W', '_', str(val)) for val in identify_row_by.values())
    query_params = []
    query_columns = []
    # NOTE: iteritems() is Python-2-only; values are passed as bound
    # parameters while column names are BQL-quoted.
    for k, v in identify_row_by.iteritems():
        query_columns.append('''%s = ? ''' % bayeslite.bql_quote_name(k))
        query_params.append(v)
    query_attrs = ' and '.join(query_columns)

    with self.bdb.savepoint():
        # Verify the WHERE clause matches exactly one row before building
        # the similarity table.
        # NOTE(review): self.name is interpolated unquoted here, unlike
        # the quoted column names above -- confirm it is a safe identifier.
        row_exists = self.query('SELECT COUNT(*) FROM %s WHERE %s;' %
                                (self.name, query_attrs))
        if row_exists.ix[0][0] != 1:
            raise BLE(
                NotImplementedError(
                    'identify_row_by found %d rows instead of exactly 1 in %s.'
                    % (row_exists.ix[0][0], self.csv_path)))
        # %%g escapes to a literal %g, which the query wrapper substitutes
        # with the quoted generator name (see the %g re.sub in query).
        creation_query = ('''CREATE TEMP TABLE IF NOT EXISTS %s AS ESTIMATE *,
                         SIMILARITY TO (%s) AS %s FROM %%g LIMIT %d;''' %
                          (table_name, query_attrs, column_name, nsimilar))
        self.query(creation_query, query_params)
        result = self.query('''SELECT * FROM %s ORDER BY %s DESC;''' %
                            (table_name, column_name))
    return result
Example #45
0
def estimate_log_likelihood(bdb, table, generator, targets=None, givens=None, n_samples=None):
    """Estimate the log likelihood for observations in a table.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    table : str
        Name of table.
    generator : str
        Name of generator.
    targets : list<str>, optional
        List of columns in the table for which to compute the log-likelihood.
        Defaults to all the columns.
    givens : list<tuple>, optional
        A list of [(column, value)] pairs on which to condition on. Defaults to
        no conditionals. See example for more details.
    n_samples : int, optional
        Number of rows from table to use in the computation. Defaults to all
        the rows.

    Returns
    -------
    ll : float
        The log likelihood of the table[columns] under the conditional
        distribution (specified by givens) of generator.

    Example:
    estimate_log_likelihood(bdb, 'people', 'people_gen',
        targets=['weight', 'height'],
        givens=[('nationality', 'USA'), ('age', 17)])
    """
    # Defaults to all columns if targets is None.
    targets = extract_target_cols(bdb, generator, targets=targets)

    # Defaults to no givens if givens is None.
    givens = extract_given_cols_vals(givens=givens)
    givens = ",".join(["{}={}".format(c, v) for (c, v) in givens])

    # Obtain the dataset table.
    table = bql_quote_name(table.strip(";"))
    sql = """
        SELECT {} FROM {};
    """.format(
        ",".join(targets), table
    )
    dataset = bdb.execute(sql)

    # Obtain number of rows in the dataset; cap n_samples at the row count.
    n_rows = bdb.execute(
        """
        SELECT COUNT(*) FROM {}""".format(
            table
        )
    ).fetchvalue()
    if n_samples is None or n_rows < n_samples:
        n_samples = n_rows

    # Compute the log-likelihood of the targets, subject to givens.
    # XXX This code is currently wrong due to shortcomings in BQL:
    #  - BQL cannot evaluate joint density. Assume that all the rows are IID,
    #  and that all the columns factor into their marginal density.
    ll, i = 0, 0
    for row in dataset:
        # Stop once exactly n_samples rows have been consumed.  (The
        # previous `if i > n_samples` bound processed one extra row.)
        if i >= n_samples:
            break
        i += 1
        # XXX Wrong: assume joint factors into product of marginals.
        for col, val in zip(targets, row):
            if givens:
                # XXX TODO write GIVEN in this query using bindings.
                bql = """
                    ESTIMATE PROBABILITY OF {}=? GIVEN ({}) FROM {} LIMIT 1
                """.format(
                    col, givens, bql_quote_name(generator)
                )
            else:
                bql = """
                    ESTIMATE PROBABILITY OF {}=? FROM {} LIMIT 1
                """.format(
                    col, bql_quote_name(generator)
                )

            ll += math.log(bdb.execute(bql, (val,)).fetchvalue())

    return ll
Example #46
0
def _test_example(bdb, exname):
    """Full lifecycle test for example `exname`: table and data setup,
    population and generator creation with savepoint semantics, model
    initialization, and dropping of tables/generators/models."""
    (mm, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
     cleanup) = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    metamodel = mm()
    bayeslite.bayesdb_register_backend(bdb, metamodel)

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt, )).fetchvalue() == n

    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id + 1, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)

    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg, ))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg, ))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg, ))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt, ))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt, ))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg, ))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        try:
            bdb.execute('DROP MODEL 1 FROM %s' % (qg, ))
            assert core.bayesdb_generator_has_model(bdb, gid, 0)
            assert not core.bayesdb_generator_has_model(bdb, gid, 1)
            assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
        except bayeslite.BQLError:
            # Previously `except bayeslite.BQLError, e:` (Python-2-only
            # syntax, and `e` was unused).
            # loom does not allow model numbers to be specified in drop models
            assert exname == 'loom'
Example #47
0
def _test_example(bdb, exname):
    """Full lifecycle test for example `exname`: table and data setup,
    population and generator creation with savepoint semantics, model
    initialization, and dropping of tables/generators/models."""
    (be, t, t_sql, data_sql, data, p, g, p_bql, g_bql, g_bqlbad0, g_bqlbad1,
        cleanup) = examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    backend = be()
    bayeslite.bayesdb_register_backend(bdb, backend)

    # Create a table.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == n

    # Create a population.
    assert not core.bayesdb_has_population(bdb, p)
    bdb.execute(p_bql)
    p_id = core.bayesdb_get_population(bdb, p)

    # Create a generator.  Make sure savepoints work for this.
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert not core.bayesdb_has_generator(bdb, p_id+1, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, p_id, g)

    gid = core.bayesdb_get_generator(bdb, p_id, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg,))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg,))

    # Test dropping things.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt,))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt,))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, None, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, None, g)
    assert core.bayesdb_has_generator(bdb, p_id, g)
    assert core.bayesdb_has_generator(bdb, None, g)
    assert gid == core.bayesdb_get_generator(bdb, p_id, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        try:
            bdb.execute('DROP MODEL 1 FROM %s' % (qg,))
            assert core.bayesdb_generator_has_model(bdb, gid, 0)
            assert not core.bayesdb_generator_has_model(bdb, gid, 1)
            assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
        except bayeslite.BQLError:
            # Previously `except bayeslite.BQLError, e:` (Python-2-only
            # syntax, `e` unused) with a misindented handler body.
            # loom does not allow model numbers to be specified in drop models
            assert exname == 'loom'
def insert_row(bdb, table, x, y):
    qt = bayeslite.bql_quote_name(table)
    query = 'INSERT INTO %s ("0", "1") VALUES (?, ?)' % (qt,)
    bdb.sql_execute(query, bindings=(x, y))
    cursor = bdb.sql_execute('SELECT last_insert_rowid()')
    return cursor.fetchone()[0]
Example #49
0
def _test_example(bdb, exname):
    """Full lifecycle test for example `exname` (metamodel-era API):
    table and data setup, generator creation with savepoint semantics,
    model initialization, dropping, and per-model analysis."""
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qtable = bql_quote_name(t)
    qgen = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    # Creating the table inside a rolled-back savepoint must not persist.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Load the example rows and confirm the row count.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qtable, )).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    nrows = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qtable, )).fetchvalue() \
        == nrows

    # Creating the generator: bad schemas fail, savepoints roll back,
    # and a duplicate CREATE is rejected.
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert not core.bayesdb_has_generator(bdb, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)

    # Model initialization, with savepoint rollbacks.
    generator_id = core.bayesdb_get_generator(bdb, g)
    assert not core.bayesdb_generator_has_model(bdb, generator_id, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, generator_id)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qgen, ))
        assert core.bayesdb_generator_has_model(bdb, generator_id, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, generator_id)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qgen, ))
        for modelno in range(10):
            assert core.bayesdb_generator_has_model(
                bdb, generator_id, modelno)
            assert range(10) == core.bayesdb_generator_modelnos(
                bdb, generator_id)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qgen, ))

    # Dropping the table through BQL is rejected while the generator
    # exists; raw SQL offers no such protection.
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qtable, ))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qtable, ))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qgen, ))
        assert not core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qgen, ))
        assert not core.bayesdb_has_generator(bdb, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    assert generator_id == core.bayesdb_get_generator(bdb, g)

    # Dropping a single model rolls back cleanly.
    with bdb.savepoint_rollback():
        bdb.execute('DROP MODEL 1 FROM %s' % (qgen, ))
        assert core.bayesdb_generator_has_model(bdb, generator_id, 0)
        assert not core.bayesdb_generator_has_model(bdb, generator_id, 1)
        assert [0] == core.bayesdb_generator_modelnos(bdb, generator_id)

    # Analysis: whole generator, then individual models.
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qgen, ))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qgen, ))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qgen, ))
Example #50
0
def estimate_kl_divergence(bdb,
                           generatorA,
                           generatorB,
                           targets=None,
                           givens=None,
                           n_samples=None):
    """Estimate the KL divergence KL(generatorA || generatorB).

    The KL divergence is a measure of the "information lost" when generatorB
    (the approximating generator) is used to approximate generatorA (the base
    generator). KL divergence is not symmetric: KL(genA||genB) is not
    necessarily equal to KL(genB||genA).

    TODO: Monte Carlo estimation is a terrible way to compute the KL divergence.
    (Not to say there are better methods in general). One illustration of this
    is that the estimated KL divergence has empirically been shown to obtain
    negative realizations for high-dimensional data.

    Computing the KL divergence in general (of high dimensional distributions)
    is a very hard problem; most research uses the structure of the
    distributions to find good estimators. Adaptive quadrature or exact methods
    for numerical integration could outperform Monte Carlo?

    TODO: More sophisticated algorithm for detecting cases where absolute
    continuity could be a problem (currently have a heuristic).
    As it stands, Monte Carlo estimates may have infinite variance depending
    on simulated values from generatorA.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    generatorA : str
        Name of base generator.
    generatorB : str
        Name of approximating generator.
    targets : list<str>, optional
        List of columns in the table for which to compute the log-likelihood.
        Defaults to all the columns.
    givens : list<tuple>, optional
        A list of [(column, value)] pairs on which to condition on. Defaults to
        no conditionals. See example for more details.
    n_samples: int, optional
        Number of simulated samples to use in the Monte Carlo estimate.

    Returns
    -------
    kl : float
        The KL divergence. May be infinity.

    Example:
    estimate_kl_divergence(bdb, 'crosscat_gen', 'baxcat_gen',
        targets=['weight', 'height'],
        givens=[('nationality', 'USA'), ('age', 17)])
    """
    # XXX Default to 10,000 samples
    if n_samples is None:
        n_samples = 10000

    # Defaults to all columns if targets is None.
    targets = extract_target_cols(bdb, generatorA, targets=targets)

    # Defaults to no givens if givens is None.
    givens = extract_given_cols_vals(givens=givens)
    givens = ','.join(['{}={}'.format(c, v) for (c, v) in givens])

    # Obtain samples from the base distribution.
    if givens:
        # XXX TODO write GIVEN in this query using bindings.
        bql = '''
            SIMULATE {} FROM {} GIVEN {} LIMIT {}
        '''.format(','.join(targets), bql_quote_name(generatorA), givens,
                   n_samples)
    else:
        bql = '''
            SIMULATE {} FROM {} LIMIT {}
        '''.format(','.join(targets), bql_quote_name(generatorA), n_samples)
    samples = bdb.execute(bql)

    kl = 0
    for s in samples:
        logp_a, logp_b = 0, 0
        # XXX Assume joint probability factors by summing univariate
        # (conditional) probability of each cell value. This is clearly wrong,
        # until we can evaluate joint densities in BQL.
        for col, val in zip(targets, s):
            bql = '''
                ESTIMATE PROBABILITY OF {}=? FROM {} LIMIT 1
            '''.format(col, bql_quote_name(generatorA))
            crs = bdb.execute(bql, (val, ))
            p_a = crs.fetchvalue()

            bql = '''
                ESTIMATE PROBABILITY OF {}=? FROM {} LIMIT 1
            '''.format(col, bql_quote_name(generatorB))
            crs = bdb.execute(bql, (val, ))
            p_b = crs.fetchvalue()

            # XXX Heuristic to detect when genA is not absolutely
            # continuous wrt genB
            if p_a == 0:
                # How on earth did we simulate a value from genA with zero
                # density/prob under genA?  (Message typo "simluate" fixed.)
                raise BLE(
                    ValueError(
                        'Fatal error: simulated a (col,val)=({},{}) '
                        'from base generatorA ({}) with zero density. Check '
                        'implementation of simulate and/or logpdf of '
                        'generator.'.format(col, val, generatorA)))
            if p_b == 0:
                # Detected failure of absolute continuity
                # (under assumption that joint factors into marginals)
                return float('inf')

            logp_a += math.log(p_a)
            logp_b += math.log(p_b)

        kl += (logp_a - logp_b)

    # XXX Assertion may fail, see TODO in docstring.
    # assert kl > 0
    if kl < 0:
        # Message previously rendered "using 10000samples" -- the space
        # between the adjacent string literals was missing.
        raise BLE(
            ValueError(
                'Cannot compute reasonable value for KL divergence. '
                'Try increasing the number of samples (currently using {} '
                'samples).'.format(n_samples)))

    return kl / n_samples
Example #51
0
def estimate_log_likelihood(bdb,
                            table,
                            generator,
                            targets=None,
                            givens=None,
                            n_samples=None):
    """Estimate the log likelihood for observations in a table.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    table : str
        Name of table.
    generator : str
        Name of generator.
    targets : list<str>, optional
        List of columns in the table for which to compute the log-likelihood.
        Defaults to all the columns.
    givens : list<tuple>, optional
        A list of [(column, value)] pairs on which to condition on. Defaults to
        no conditionals. See example for more details.
    n_samples : int, optional
        Number of rows from table to use in the computation. Defaults to all
        the rows.

    Returns
    -------
    ll : float
        The log likelihood of the table[columns] under the conditional
        distribution (specified by givens) of generator.

    Example:
    estimate_log_likelihood(bdb, 'people', 'people_gen',
        targets=['weight', 'height'],
        givens=[('nationality', 'USA'), ('age', 17)])
    """
    # Defaults to all columns if targets is None.
    targets = extract_target_cols(bdb, generator, targets=targets)

    # Defaults to no givens if givens is None.
    givens = extract_given_cols_vals(givens=givens)
    givens = ','.join(['{}={}'.format(c, v) for (c, v) in givens])

    # Obtain the dataset table.
    table = bql_quote_name(table.strip(';'))
    sql = '''
        SELECT {} FROM {};
    '''.format(','.join(targets), table)
    dataset = bdb.execute(sql)

    # Obtain number of rows in the dataset; cap n_samples at the row count.
    n_rows = bdb.execute('''
        SELECT COUNT(*) FROM {}'''.format(table)).fetchvalue()
    if n_samples is None or n_rows < n_samples:
        n_samples = n_rows

    # Compute the log-likelihood of the targets, subject to givens.
    # XXX This code is currently wrong due to shortcomings in BQL:
    #  - BQL cannot evaluate joint density. Assume that all the rows are IID,
    #  and that all the columns factor into their marginal density.
    ll, i = 0, 0
    for row in dataset:
        # Stop once exactly n_samples rows have been consumed.  (The
        # previous `if i > n_samples` bound processed one extra row.)
        if i >= n_samples:
            break
        i += 1
        # XXX Wrong: assume joint factors into product of marginals.
        for col, val in zip(targets, row):
            if givens:
                # XXX TODO write GIVEN in this query using bindings.
                bql = '''
                    ESTIMATE PROBABILITY OF {}=? GIVEN ({}) FROM {} LIMIT 1
                '''.format(col, givens, bql_quote_name(generator))
            else:
                bql = '''
                    ESTIMATE PROBABILITY OF {}=? FROM {} LIMIT 1
                '''.format(col, bql_quote_name(generator))

            ll += math.log(bdb.execute(bql, (val, )).fetchvalue())

    return ll
Example #52
0
def bayesdb_maxrowid(bdb, generator_id):
    """Return the largest _rowid_ in the table backing `generator_id`."""
    table = core.bayesdb_generator_table(bdb, generator_id)
    query = 'SELECT MAX(_rowid_) FROM %s' % (bql_quote_name(table),)
    return cursor_value(bdb.sql_execute(query))
Example #53
0
def estimate_kl_divergence(bdb, generatorA, generatorB, targets=None, givens=None, n_samples=None):
    """Estimate the KL divergence KL(generatorA || generatorB) by Monte Carlo.

    The KL divergence is a measure of the "information lost" when generatorB
    (the approximating generator) is used to approximate generatorA (the base
    generator). KL divergence is not symmetric: KL(genA||genB) is not
    necessarily equal to KL(genB||genA).

    TODO: Monte Carlo estimation is a terrible way to compute the KL divergence.
    (Not to say there are better methods in general). One illustration of this
    is that the estimated KL divergence has empirically been shown to obtain
    negative realizations for high-dimensional data.

    Computing the KL divergence in general (of high dimensional distributions)
    is a very hard problem; most research uses the structure of the
    distributions to find good estimators. Adaptive quadrature or exact methods
    for numerical integration could outperform Monte Carlo?

    TODO: More sophisticated algorithm for detecting cases where absolute
    continuity could be a problem (currently have a heuristic).
    As it stands, Monte Carlo estimates may have infinite variance depending
    on simulated values from generatorA.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    generatorA : str
        Name of base generator.
    generatorB : str
        Name of approximating generator.
    targets : list<str>, optional
        List of columns in the table for which to compute the log-likelihood.
        Defaults to all the columns.
    givens : list<tuple>, optional
        A list of [(column, value)] pairs on which to condition on. Defaults to
        no conditionals. See example for more details.
    n_samples: int, optional
        Number of simulated samples to use in the Monte Carlo estimate.
        Defaults to 10,000.

    Returns
    -------
    kl : float
        The KL divergence. May be infinity.

    Raises
    ------
    BLE
        If a simulated value has zero density under generatorA, or if the
        Monte Carlo estimate comes out negative (see TODO in docstring).

    Example:
    estimate_kl_divergence(bdb, 'crosscat_gen', 'baxcat_gen',
        targets=['weight', 'height'],
        givens=[('nationality', 'USA'), ('age', 17)])
    """
    # XXX Default to 10,000 samples
    if n_samples is None:
        n_samples = 10000

    # Defaults to all columns if targets is None.
    targets = extract_target_cols(bdb, generatorA, targets=targets)

    # Defaults to no givens if givens is None
    givens = extract_given_cols_vals(givens=givens)
    givens = ",".join(["{}={}".format(c, v) for (c, v) in givens])

    # Obtain samples from the base distribution.
    if givens:
        # XXX TODO write GIVEN in this query using bindings.
        bql = """
            SIMULATE {} FROM {} GIVEN {} LIMIT {}
        """.format(
            ",".join(targets), bql_quote_name(generatorA), givens, n_samples
        )
    else:
        bql = """
            SIMULATE {} FROM {} LIMIT {}
        """.format(
            ",".join(targets), bql_quote_name(generatorA), n_samples
        )
    samples = bdb.execute(bql)

    kl = 0
    for s in samples:
        logp_a, logp_b = 0, 0
        # XXX Assume joint probability factors by summing univariate
        # (conditional) probability of each cell value. This is clearly wrong,
        # until we can evaluate joint densities in BQL.
        for col, val in zip(targets, s):
            bql = """
                ESTIMATE PROBABILITY OF {}=? FROM {} LIMIT 1
            """.format(
                col, bql_quote_name(generatorA)
            )
            crs = bdb.execute(bql, (val,))
            p_a = crs.fetchvalue()

            bql = """
                ESTIMATE PROBABILITY OF {}=? FROM {} LIMIT 1
            """.format(
                col, bql_quote_name(generatorB)
            )
            crs = bdb.execute(bql, (val,))
            p_b = crs.fetchvalue()

            # XXX Heuristic to detect when genA is not absolutely
            # continuous wrt genB
            if p_a == 0:
                # How on earth did we simulate a value from genA with zero
                # density/prob under genA?
                raise BLE(
                    ValueError(
                        "Fatal error: simulated a (col,val)=({},{}) "
                        "from base generatorA ({}) with zero density. Check "
                        "implementation of simulate and/or logpdf of "
                        "generator.".format(col, val, generatorA)
                    )
                )
            if p_b == 0:
                # Detected failure of absolute continuity
                # (under assumption that joint factors into marginals)
                return float("inf")

            logp_a += math.log(p_a)
            logp_b += math.log(p_b)

        kl += logp_a - logp_b

    # XXX Assertion may fail, see TODO in docstring.
    # assert kl > 0
    if kl < 0:
        raise BLE(
            ValueError(
                "Cannot compute reasonable value for KL divergence. "
                "Try increasing the number of samples (currently using {} "
                "samples).".format(n_samples)
            )
        )

    return kl / n_samples
def _test_example(bdb, exname):
    """Run the full lifecycle smoke test for the example named *exname*.

    Unpacks the example tuple (metamodel class, table name, table DDL,
    per-row insert SQL, data rows, generator name, CREATE GENERATOR BQL,
    and two intentionally invalid CREATE GENERATOR statements), then
    exercises table creation, data insertion, generator creation, model
    initialization, dropping, and analysis -- verifying at each stage that
    savepoint rollback undoes the change.
    """
    mm, t, t_sql, data_sql, data, g, g_bql, g_bqlbad0, g_bqlbad1 = \
        examples[exname]
    qt = bql_quote_name(t)
    qg = bql_quote_name(g)

    bayeslite.bayesdb_register_metamodel(bdb, mm())

    # Create a table.  First inside a rolled-back savepoint (the table must
    # disappear), then for real.
    assert not core.bayesdb_has_table(bdb, t)
    with bdb.savepoint_rollback():
        bdb.sql_execute(t_sql)
        assert core.bayesdb_has_table(bdb, t)
    assert not core.bayesdb_has_table(bdb, t)
    bdb.sql_execute(t_sql)
    assert core.bayesdb_has_table(bdb, t)

    # Insert data into the table, one row at a time, and confirm the count.
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == 0
    for row in data:
        bdb.sql_execute(data_sql, row)
    n = len(data)
    assert bdb.execute('SELECT COUNT(*) FROM %s' % (qt,)).fetchvalue() == n

    # Create a generator.  Make sure savepoints work for this.
    # The two bad BQL statements must fail and leave no generator behind.
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad0)
    assert not core.bayesdb_has_generator(bdb, g)
    with pytest.raises(Exception):
        with bdb.savepoint():
            bdb.execute(g_bqlbad1)
    assert not core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert not core.bayesdb_has_generator(bdb, g)
    bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)
    # Creating the same generator twice must fail.
    with pytest.raises(Exception):
        bdb.execute(g_bql)
    assert core.bayesdb_has_generator(bdb, g)

    # Initialize models, confirming rollback restores the model-less state.
    gid = core.bayesdb_get_generator(bdb, g)
    assert not core.bayesdb_generator_has_model(bdb, gid, 0)
    assert [] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 1 MODEL FOR %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)
    with bdb.savepoint_rollback():
        bdb.execute('INITIALIZE 10 MODELS FOR %s' % (qg,))
        for i in range(10):
            assert core.bayesdb_generator_has_model(bdb, gid, i)
            # NOTE(review): compares range(10) directly against the modelno
            # list -- valid on Python 2, where range returns a list; on
            # Python 3 this is always False.  Confirm target version.
            assert range(10) == core.bayesdb_generator_modelnos(bdb, gid)
    bdb.execute('INITIALIZE 2 MODELS FOR %s' % (qg,))

    # Test dropping things.
    # BQL refuses to drop a table that still has generators ...
    with pytest.raises(bayeslite.BQLError):
        bdb.execute('DROP TABLE %s' % (qt,))
    with bdb.savepoint_rollback():
        # Note that sql_execute does not protect us!
        bdb.sql_execute('DROP TABLE %s' % (qt,))
        assert not core.bayesdb_has_table(bdb, t)
    assert core.bayesdb_has_table(bdb, t)
    # XXX Should we reject dropping a generator when there remain
    # models?  Should we not reject dropping a table when there remain
    # generators?  A table can be dropped when there remain indices.
    #
    # with pytest.raises(bayeslite.BQLError):
    #     # Models remain.
    #     bdb.execute('DROP GENERATOR %s' % (qg,))
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    with bdb.savepoint_rollback():
        bdb.execute('DROP GENERATOR %s' % (qg,))
        assert not core.bayesdb_has_generator(bdb, g)
        bdb.execute(g_bql)
        assert core.bayesdb_has_generator(bdb, g)
    assert core.bayesdb_has_generator(bdb, g)
    # After the rollback the original generator row is back, same id.
    assert gid == core.bayesdb_get_generator(bdb, g)

    # Test dropping models.
    with bdb.savepoint_rollback():
        bdb.execute('DROP MODEL 1 FROM %s' % (qg,))
        assert core.bayesdb_generator_has_model(bdb, gid, 0)
        assert not core.bayesdb_generator_has_model(bdb, gid, 1)
        assert [0] == core.bayesdb_generator_modelnos(bdb, gid)

    # Test analyzing models: all models, then each of the two by number.
    bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (qg,))
    bdb.execute('ANALYZE %s MODEL 0 FOR 1 ITERATION WAIT' % (qg,))
    bdb.execute('ANALYZE %s MODEL 1 FOR 1 ITERATION WAIT' % (qg,))
Example #55
0
def bayesdb_maxrowid(bdb, population_id):
    """Return the maximum ``_rowid_`` of the table behind *population_id*."""
    table = core.bayesdb_population_table(bdb, population_id)
    cursor = bdb.sql_execute(
        'SELECT MAX(_rowid_) FROM %s' % (bql_quote_name(table),))
    return cursor_value(cursor)
Example #56
0
def _column_names(bdb, table):
    """Return the column names of *table*, in schema order."""
    qt = bql_quote_name(table)
    with bdb.savepoint():
        # PRAGMA table_info yields one row per column; name is field 1.
        rows = bdb.sql_execute('PRAGMA table_info(%s)' % (qt,))
        return [row[1] for row in rows]
Example #57
0
def estimate_pairwise_similarity(bdb_file, table, model, sim_table=None,
                                 cores=None, N=None, overwrite=False):
    """
    Estimate pairwise similarity from the given model, splitting processing
    across multiple processors, and save results into sim_table.

    Because called methods in this function must also open up separate BayesDB
    instances, this function accepts a BayesDB filename, rather than an actual
    bayeslite.BayesDB object.

    Parameters
    ----------
    bdb_file : str
        File location of the BayesDB database object. This function will
        handle opening the file with bayeslite.bayesdb_open.
    table : str
        Name of the table containing the raw data.
    model : str
        Name of the metamodel to estimate from.
    sim_table : str
        Name of the table to insert similarity results into. Defaults to
        table name + '_similarity'.
    cores : int
        Number of processors to use. Defaults to the number of cores as
        identified by multiprocessing.cpu_count.
    N : int
        Number of rows for which to estimate pairwise similarities (so
        N^2 calculations are done). Should be used just to test small
        batches; currently, there is no control over what specific pairwise
        similarities are estimated with this parameter.
    overwrite : bool
        Whether to overwrite the sim_table if it already exists. If
        overwrite=False and the table exists, function will raise
        sqlite3.OperationalError. Default False.

    Raises
    ------
    BLE
        If `cores` is less than 1, or `N` exceeds the number of rows in
        `table`.
    """
    # NOTE(review): the handle is never closed before return -- presumably
    # relied on by callers or process teardown; confirm before changing.
    bdb = bayesdb_open(pathname=bdb_file)

    if cores is None:
        cores = mp.cpu_count()

    if cores < 1:
        raise BLE(ValueError(
            "Invalid number of cores {}".format(cores)))

    if sim_table is None:
        sim_table = table + '_similarity'

    # Get number of occurrences in the database
    count_cursor = bdb.execute(
        'SELECT COUNT(*) FROM {}'.format(bql_quote_name(table))
    )
    table_count = cursor_value(count_cursor)
    if N is None:
        N = table_count
    elif N > table_count:
        raise BLE(ValueError(
            "Asked for N={} rows but {} rows in table".format(N, table_count)))

    # Calculate the size (# of similarities to compute) and
    # offset (where to start) calculation for each worker query.

    # Divide sizes evenly, and make the last job finish the remainder.
    # Use floor division: under Python 3, `/` would produce floats, which
    # are not valid LIMIT/OFFSET bindings.
    sizes = [(N * N) // cores for _ in range(cores)]
    sizes[-1] += (N * N) % cores

    total = 0
    offsets = [total]
    for size in sizes[:-1]:
        total += size
        offsets.append(total)

    # Create the similarity table. Assumes original table has rowid column.
    # XXX: tables don't necessarily have an autoincrementing primary key
    # other than rowid, which is implicit and can't be set as a foreign key.
    # We ought to ask for an optional user-specified foreign key, but
    # ESTIMATE SIMILARITY returns numerical values rather than row names, so
    # changing numerical rownames into that foreign key would be finicky. For
    # now, we eliminate REFERENCE {table}(foreign_key) from the rowid0 and
    # rowid1 specs.
    sim_table_q = bql_quote_name(sim_table)
    if overwrite:
        bdb.sql_execute('DROP TABLE IF EXISTS {}'.format(sim_table_q))

    bdb.sql_execute('''
        CREATE TABLE {} (
            rowid0 INTEGER NOT NULL,
            rowid1 INTEGER NOT NULL,
            value DOUBLE NOT NULL
        )
    '''.format(sim_table_q))

    # Define the helper which inserts data into table in batches
    def insert_into_sim(df):
        """
        Use the main thread bdb handle to successively insert results of
        ESTIMATEs into the table.
        """
        rows = map(list, df.values)
        insert_sql = '''
            INSERT INTO {} (rowid0, rowid1, value) VALUES (?, ?, ?)
        '''.format(sim_table_q)
        # Avoid sqlite3 500-insert limit by grouping insert statements
        # into one transaction.
        with bdb.transaction():
            for row in rows:
                bdb.sql_execute(insert_sql, row)

    pool = mp.Pool(processes=cores)

    manager = mp.Manager()
    queue = manager.Queue()

    # Construct the estimate query template.
    q_template = '''
        ESTIMATE SIMILARITY FROM PAIRWISE {} LIMIT ? OFFSET ?
    ''' .format(bql_quote_name(model))

    # Fan each (size, offset) slice out to a worker; results come back on
    # the shared queue as dataframes.
    for so in zip(sizes, offsets):
        pool.apply_async(
            _query_into_queue, args=(q_template, so, queue, bdb_file)
        )

    # Close pool and wait for processes to finish
    # FIXME: This waits for all processes to finish before inserting
    # into the table, which means that memory usage is potentially very
    # high!
    pool.close()
    pool.join()

    # Process returned results
    while not queue.empty():
        df = queue.get()
        insert_into_sim(df)
Example #58
0
 def clean(column_name):
     """Null out empty-string and 'NaN' entries of *column_name* in t."""
     qcn = bql_quote_name(column_name)
     bdb.sql_execute(
         "UPDATE t SET %s = NULL WHERE %s = '' OR %s LIKE 'NaN'"
         % (qcn, qcn, qcn))