Example #1
def bayesdb_create_legacy_generator(bdb, generator, table, column_stattypes):
    column_names = core.bayesdb_table_column_names(bdb, table)
    qcns = map(sqlite3_quote_name, column_names)
    assert all(column_stattypes[name] in allowed_column_stattypes
        for name in column_stattypes)
    column_name_set = set(casefold(name) for name in column_names)
    for name in column_stattypes:
        if name not in column_name_set:
            raise IOError('No such column in table %s: %s' %
                (repr(table), repr(name)))
    schema = ','.join('%s %s' % (qcn, column_stattypes[casefold(name)])
        for name, qcn in zip(column_names, qcns))
    qg = sqlite3_quote_name(generator)
    qt = sqlite3_quote_name(table)
    qmm = 'crosscat'
    bdb.execute('CREATE GENERATOR %s FOR %s USING %s(%s)' %
        (qg, qt, qmm, schema))
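
A minimal usage sketch follows. The import path, database file, table, and
column names are illustrative assumptions; the helper is internal to
bayeslite's legacy-model support, and the stattype map must cover every
column of the table, since the schema join above indexes it once per column.

import bayeslite
# Assumed import path for the helper defined above.
from bayeslite.legacy_models import bayesdb_create_legacy_generator

bdb = bayeslite.bayesdb_open(pathname='people.bdb')
# Keys are casefolded column names of the people table (assumed here to have
# exactly these two columns); values are legacy statistical types.
column_stattypes = {
    'age': 'numerical',
    'division': 'categorical',
}
# Issues, via bdb.execute:
#   CREATE GENERATOR "people_cc" FOR "people"
#       USING crosscat("age" numerical,"division" categorical)
# (a metamodel named crosscat must be registered with this bdb).
bayesdb_create_legacy_generator(bdb, 'people_cc', 'people', column_stattypes)
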
Example #2
def bayesdb_create_legacy_generator(bdb, generator, table, column_stattypes):
    column_names = core.bayesdb_table_column_names(bdb, table)
    qcns = map(sqlite3_quote_name, column_names)
    assert all(column_stattypes[name] in allowed_column_stattypes
               for name in column_stattypes)
    column_name_set = set(casefold(name) for name in column_names)
    for name in column_stattypes:
        if name not in column_name_set:
            raise IOError('No such column in table %s: %s' %
                          (repr(table), repr(name)))
    schema = ','.join('%s %s' % (qcn, column_stattypes[casefold(name)])
                      for name, qcn in zip(column_names, qcns))
    qg = sqlite3_quote_name(generator)
    qt = sqlite3_quote_name(table)
    qmm = 'crosscat'
    bdb.execute('CREATE GENERATOR %s FOR %s USING %s(%s)' %
                (qg, qt, qmm, schema))
Example #3
def subsample_table_columns(bdb, table, new_table, limit, keep, drop, seed):
    """Return a subsample of the columns in the table."""
    if not bayesdb_has_table(bdb, table):
        raise ValueError('No such table: %s' % (table, ))
    if bayesdb_has_table(bdb, new_table):
        raise ValueError('Table already exists: %s' % (new_table, ))
    keep = map(casefold, keep)
    drop = map(casefold, drop)
    skip = keep + drop
    unknown = [
        column for column in skip
        if not bayesdb_table_has_column(bdb, table, column)
    ]
    if unknown:
        raise ValueError('No such columns: %s' % (unknown, ))
    overlap = [column for column in drop if column in keep]
    if overlap:
        raise ValueError('Cannot both drop and keep columns: %s' % (overlap, ))
    num_sample = limit - len(keep)
    if num_sample < 0:
        raise ValueError(
            'Limit must be at least the number of columns to keep.')
    subselect_columns = [
        column for column in bayesdb_table_column_names(bdb, table)
        if casefold(column) not in skip
    ]
    rng = np.random.RandomState(seed)
    subsample_columns = rng.choice(subselect_columns,
                                   replace=False,
                                   size=min(len(subselect_columns),
                                            num_sample))
    qt = bql_quote_name(table)
    qnt = bql_quote_name(new_table)
    qc = ','.join(map(bql_quote_name, itertools.chain(keep,
                                                      subsample_columns)))
    cursor = bdb.execute('''
        CREATE TABLE %s AS SELECT %s FROM %s
    ''' % (qnt, qc, qt))
    return cursor_to_df(cursor)
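
A hedged usage sketch, assuming the function above is in scope and that a
wide table named census, with columns including age, income, and ssn, has
already been loaded; the database file, table, column names, and seed are
illustrative.

import bayeslite

bdb = bayeslite.bayesdb_open(pathname='census.bdb')
# Always keep age and income, never include ssn, and fill the remaining
# slots (10 columns total) with a seeded random draw from the other columns.
subsample_table_columns(bdb, 'census', 'census_sub', 10,
    keep=['age', 'income'], drop=['ssn'], seed=17)
# The chosen columns now live in census_sub.
cursor = bdb.execute('SELECT * FROM "census_sub" LIMIT 5')
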
Example #4
def bayesdb_read_csv(bdb,
                     table,
                     f,
                     header=False,
                     create=False,
                     ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `create` is true, do not fail when
        `table` already exists; load the rows into the existing table
    """
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not it exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table), ))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table), ))
        reader = csv.reader(f)
        line = 1
        if header:
            row = None
            try:
                row = reader.next()
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            column_name_map = {}
            duplicates = set([])
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                              (repr(list(duplicates)), ))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = map(sqlite3_quote_name, column_names)
                schema = ','.join('%s NUMERIC' % (qcn, ) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(
                    name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown), ))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = map(sqlite3_quote_name, column_names)
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                              (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                              (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
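
A usage sketch, assuming bayesdb_read_csv is importable from the top-level
bayeslite package and that people.csv begins with a header row; the file and
table names are illustrative.

import bayeslite
from bayeslite import bayesdb_read_csv  # assumed top-level export

bdb = bayeslite.bayesdb_open(pathname='people.bdb')
with open('people.csv', 'rU') as f:
    # The header row supplies column names; create the table if it is
    # missing, and do not fail if it already exists.
    bayesdb_read_csv(bdb, 'people', f, header=True, create=True,
        ifnotexists=True)
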
Example #5
def bayesdb_read_csv(bdb, table, f, header=False,
        create=False, ifnotexists=False):
    """Read CSV data from a line iterator into a table.

    :param bayeslite.BayesDB bdb: BayesDB instance
    :param str table: name of table
    :param iterable f: iterator returning lines as :class:`str`
    :param bool header: if true, first line specifies column names
    :param bool create: if true and `table` does not exist, create it
    :param bool ifnotexists: if true and `create` is true, do not fail when
        `table` already exists; load the rows into the existing table
    """
    if not header:
        if create:
            raise ValueError('Can\'t create table from headerless CSV!')
    if not create:
        if ifnotexists:
            raise ValueError('Not creating table whether or not it exists!')
    with bdb.savepoint():
        if core.bayesdb_has_table(bdb, table):
            if create and not ifnotexists:
                raise ValueError('Table already exists: %s' % (repr(table),))
        elif not create:
            raise ValueError('No such table: %s' % (repr(table),))
        reader = csv.reader(f)
        line = 1
        if header:
            row = None
            try:
                row = reader.next()
            except StopIteration:
                raise IOError('Missing header in CSV file')
            line += 1
            column_names = [unicode(name, 'utf8').strip() for name in row]
            if len(column_names) == 0:
                raise IOError('No columns in CSV file!')
            if any(len(c) == 0 for c in column_names):
                raise IOError(
                    'Missing column names in header: %s' % repr(column_names))
            column_name_map = {}
            duplicates = set([])
            for name in column_names:
                name_folded = casefold(name)
                if name_folded in column_name_map:
                    duplicates.add(name_folded)
                else:
                    column_name_map[name_folded] = name
            if 0 < len(duplicates):
                raise IOError('Duplicate columns in CSV: %s' %
                    (repr(list(duplicates)),))
            if create and not core.bayesdb_has_table(bdb, table):
                qt = sqlite3_quote_name(table)
                qcns = map(sqlite3_quote_name, column_names)
                schema = ','.join('%s NUMERIC' % (qcn,) for qcn in qcns)
                bdb.sql_execute('CREATE TABLE %s(%s)' % (qt, schema))
                core.bayesdb_table_guarantee_columns(bdb, table)
            else:
                core.bayesdb_table_guarantee_columns(bdb, table)
                unknown = set(name for name in column_names
                    if not core.bayesdb_table_has_column(bdb, table, name))
                if len(unknown) != 0:
                    raise IOError('Unknown columns: %s' % (list(unknown),))
        else:
            assert not create
            assert not ifnotexists
            column_names = core.bayesdb_table_column_names(bdb, table)
        ncols = len(column_names)
        qt = sqlite3_quote_name(table)
        qcns = map(sqlite3_quote_name, column_names)
        # XXX Would be nice if we could prepare this statement before
        # reading any rows in order to check whether there are missing
        # nonnull columns with no default value.  However, the only
        # way to prepare a statement in the Python wrapper is to
        # execute a cursor, which also binds and steps the statement.
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % \
            (qt, ','.join(qcns), ','.join('?' for _qcn in qcns))
        for row in reader:
            if len(row) < ncols:
                raise IOError('Line %d: Too few columns: %d < %d' %
                    (line, len(row), ncols))
            if len(row) > ncols:
                raise IOError('Line %d: Too many columns: %d > %d' %
                    (line, len(row), ncols))
            bdb.sql_execute(sql, [unicode(v, 'utf8').strip() for v in row])
Example #6
def _create_population(bdb, phrase):
    if core.bayesdb_has_population(bdb, phrase.name):
        if phrase.ifnotexists:
            return
        else:
            raise BQLError(
                bdb,
                'Name already defined as population: %r' % (phrase.name, ))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to
    # provide a policy for every variable: MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)
    seen_columns = []

    # Create the population record and get the assigned id.
    bdb.sql_execute(
        '''
        INSERT INTO bayesdb_population (name, tabname) VALUES (?, ?)
    ''', (phrase.name, phrase.table))
    population_id = core.bayesdb_get_population(bdb, phrase.name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(
        itertools.chain.from_iterable([[(name, s.stattype) for name in s.names]
                                       for s in phrase.schema
                                       if isinstance(s, ast.PopModelVars)]))

    # Extract the ignored columns.
    pop_ignore_vars = list(
        itertools.chain.from_iterable([[(name, 'ignore') for name in s.names]
                                       for s in phrase.schema
                                       if isinstance(s, ast.PopIgnoreVars)]))

    # Extract the columns to guess.
    pop_guess = list(
        itertools.chain.from_iterable([
            s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)
        ]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variable names: %r' %
                (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]
    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we will add
        # to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        pop_guess_vars = zip(pop_guess, pop_guess_stattypes)
        migrate = [(col, st) for col, st in pop_guess_vars if st == 'key']
        for col, st in migrate:
            pop_guess_vars.remove((col, st))
            pop_ignore_vars.append((col, 'ignore'))
    else:
        pop_guess_vars = []

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that every column of the base table has been assigned a policy.
    # `known` contains all the variables for which a policy is known.
    known = [casefold(t[0]) for t in pop_all_vars]
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r' %
            (not_found, ))

    # Get a map from variable name to colno.  Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    variable_map = {}
    duplicates = set()
    missing = set()
    invalid = set()
    colno_sql = '''
        SELECT colno FROM bayesdb_column
            WHERE tabname = :table AND name = :column_name
    '''
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in variable_map:
            duplicates.add(name)
            continue
        cursor = bdb.sql_execute(colno_sql, {
            'table': phrase.table,
            'column_name': name,
        })
        try:
            row = cursor.next()
        except StopIteration:
            missing.add(name)
            continue
        else:
            colno = row[0]
            assert isinstance(colno, int)
            cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
            if cursor_value(cursor) == 0 and stattype != 'ignore':
                invalid.add(stattype)
                continue
            variable_map[name] = colno
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(
            bdb,
            'No such columns in table %r: %r' % (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb,
                       'Duplicate column names: %r' % (list(duplicates), ))
    if invalid:
        raise BQLError(bdb,
                       'Invalid statistical types: %r' % (list(invalid), ))

    # Insert variable records.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        colno = variable_map[name]
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        bdb.sql_execute(
            '''
            INSERT INTO bayesdb_variable
                (population_id, name, colno, stattype)
                VALUES (?, ?, ?, ?)
        ''', (population_id, name, colno, stattype))
Example #7
def _create_population(bdb, phrase):
    # Retrieve the (possibly implicit) population name.
    population_name = phrase.name or phrase.table
    implicit = 1 if phrase.name is None else 0

    # Handle IF NOT EXISTS.
    if core.bayesdb_has_population(bdb, population_name):
        if phrase.ifnotexists:
            return
        else:
            raise BQLError(bdb, 'Name already defined as population: %r' %
                (population_name,))

    # Make sure the bayesdb_column table knows all the columns of the
    # underlying table.
    core.bayesdb_table_guarantee_columns(bdb, phrase.table)

    # Retrieve all columns from the base table. The user is required to
    # provide a policy for every variable: MODEL, IGNORE, or GUESS.
    base_table_columns = core.bayesdb_table_column_names(bdb, phrase.table)

    # Create the population record and get the assigned id.
    bdb.sql_execute('''
        INSERT INTO bayesdb_population (name, tabname, implicit)
            VALUES (?, ?, ?)
    ''', (population_name, phrase.table, implicit))
    population_id = core.bayesdb_get_population(bdb, population_name)

    # Extract the population column names and stattypes as pairs.
    pop_model_vars = list(itertools.chain.from_iterable(
        [[(name, s.stattype) for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopModelVars)]))

    # Extract the ignored columns.
    pop_ignore_vars = list(itertools.chain.from_iterable(
        [[(name, 'ignore') for name in s.names]
        for s in phrase.schema if isinstance(s, ast.PopIgnoreVars)]))

    # Extract the columns to guess.
    pop_guess = list(itertools.chain.from_iterable(
        [s.names for s in phrase.schema if isinstance(s, ast.PopGuessVars)]))
    if '*' in pop_guess:
        # Do not allow * to coincide with other variables.
        if len(pop_guess) > 1:
            raise BQLError(
                bdb, 'Cannot use wildcard GUESS with variable names: %r'
                % (pop_guess, ))
        # Retrieve all variables in the base table.
        avoid = set(casefold(t[0]) for t in pop_model_vars + pop_ignore_vars)
        pop_guess = [t for t in base_table_columns if casefold(t) not in avoid]
    # Perform the guessing.
    if pop_guess:
        qt = sqlite3_quote_name(phrase.table)
        qcns = ','.join(map(sqlite3_quote_name, pop_guess))
        cursor = bdb.sql_execute('SELECT %s FROM %s' % (qcns, qt))
        rows = cursor.fetchall()
        # XXX This function returns a stattype called `key`, which we will add
        # to the pop_ignore_vars.
        pop_guess_stattypes = bayesdb_guess_stattypes(pop_guess, rows)
        pop_guess_vars = zip(pop_guess, [st[0] for st in pop_guess_stattypes])
        migrate = [(col, st) for col, st in pop_guess_vars if st=='key']
        for col, st in migrate:
            pop_guess_vars.remove((col, st))
            pop_ignore_vars.append((col, 'ignore'))
    else:
        pop_guess_vars = []

    # Ensure no string-valued variables are being modeled as numerical.
    numerical_string_vars = [
        var for var, stattype in pop_model_vars
        if stattype == 'numerical'
            and _column_contains_string(bdb, phrase.table, var)
    ]
    if numerical_string_vars:
        raise BQLError(bdb,
            'Column(s) with string values modeled as numerical: %r'
            % (numerical_string_vars, ))

    # Pool all the variables and statistical types together.
    pop_all_vars = pop_model_vars + pop_ignore_vars + pop_guess_vars

    # Check that every column of the base table has been assigned a policy.
    # `known` contains all the variables for which a policy is known.
    known = [casefold(t[0]) for t in pop_all_vars]
    not_found = [t for t in base_table_columns if casefold(t) not in known]
    if not_found:
        raise BQLError(
            bdb, 'Cannot determine a modeling policy for variables: %r'
            % (not_found, ))

    # Check
    # - for duplicates,
    # - for nonexistent columns,
    # - for invalid statistical types.
    seen_variables = set()
    duplicates = set()
    missing = set()
    invalid = set()
    stattype_sql = '''
        SELECT COUNT(*) FROM bayesdb_stattype WHERE name = :stattype
    '''
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if name in seen_variables:
            duplicates.add(name)
            continue
        if not core.bayesdb_table_has_column(bdb, phrase.table, nm):
            missing.add(name)
            continue
        cursor = bdb.sql_execute(stattype_sql, {'stattype': stattype})
        if cursor_value(cursor) == 0 and stattype != 'ignore':
            invalid.add(stattype)
            continue
        seen_variables.add(nm)
    # XXX Would be nice to report these simultaneously.
    if missing:
        raise BQLError(bdb, 'No such columns in table %r: %r' %
            (phrase.table, list(missing)))
    if duplicates:
        raise BQLError(bdb, 'Duplicate column names: %r' % (list(duplicates),))
    if invalid:
        raise BQLError(bdb, 'Invalid statistical types: %r' % (list(invalid),))

    # Insert variable records.
    for nm, st in pop_all_vars:
        name = casefold(nm)
        stattype = casefold(st)
        if stattype == 'ignore':
            continue
        core.bayesdb_add_variable(bdb, population_id, name, stattype)
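
Both variants implement the handler behind BQL's CREATE POPULATION, so they
are normally reached through bdb.execute rather than called directly. The
sketch below is illustrative only: the table and column names, and the exact
schema-clause spelling (which has varied across bayeslite releases), are
assumptions, but the MODEL/IGNORE/GUESS vocabulary matches the
ast.PopModelVars, ast.PopIgnoreVars, and ast.PopGuessVars clauses handled
above.

import bayeslite

bdb = bayeslite.bayesdb_open(pathname='people.bdb')
# Every column of the base table must receive a policy: MODEL ... AS
# <stattype>, IGNORE, or GUESS.  A wildcard GUESS cannot be combined with
# named GUESS variables.
bdb.execute('''
    CREATE POPULATION people_pop FOR people WITH SCHEMA (
        MODEL age, income AS NUMERICAL;
        IGNORE id;
        GUESS STATTYPES OF (*)
    )
''')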