def test_one_variable():
    (df, bdb) = prepare()
    for var in [
            'categorical_1', 'few_ints_3', 'floats_3', 'many_ints_4',
            'skewed_numeric_5'
    ]:
        cursor = bdb.execute('SELECT %s FROM plottest' % (var, ))
        df = cursor_to_df(cursor)
        f = BytesIO()
        run_pairplot((df, bdb), f, show_contour=False)
        assert has_nontrivial_contents_over_white_background(flush(f))
        cursor = bdb.execute('SELECT %s, categorical_2 FROM plottest' %
                             (var, ))
        df = cursor_to_df(cursor)
        f = BytesIO()
        run_pairplot((df, bdb), f, colorby='categorical_2', show_contour=False)
        assert has_nontrivial_contents_over_white_background(flush(f))
        f = BytesIO()
        run_pairplot((df, bdb), f, colorby='categorical_2', show_contour=True)
        assert has_nontrivial_contents_over_white_background(flush(f))
    with pytest.raises(BLE) as exc:
        run_pairplot((df, bdb), f, colorby='floats_3')
        assert 'non-categorical' in str(exc)
    with pytest.raises(BLE):
        run_pairplot((df, bdb),
                     f,
                     colorby='categorical_2',
                     stattypes={'categorical_2': 'numerical'})
        assert 'non-categorical' in str(exc)
def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.
    """
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            # n = 40 -> 40**2 -> 1600 rows total
            temp.write(_bigger_csv_data(40))
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # test N = 0
        parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', N=0)
        assert cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')).shape == (0, 0)

        # test other values of N
        for N in [1, 2, 10, 20, 40]:
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  N=N,
                                                  overwrite=True)
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')).shape == (N**2, 3)
        # N too high should fail
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  N=41,
                                                  overwrite=True)

        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
        parallel_sim.index = range(parallel_sim.shape[0])

        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))

        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
Exemple #3
0
def barplot(bdb, bql):
    """Plot bar-plot of query giving categories and heights.

    First column specifies names; second column specifies heights.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    bql : str
        The BQL to run and histogram. Must have a two-column result.

    Returns
    ----------
    figure: matplotlib.figure.Figure
    """
    df = bqlu.cursor_to_df(bdb.execute(bql))
    if df.shape[1] != 2:
        raise BLE(ValueError(
            'Need two columns of output from SELECT for barplot.'))
    height_inches = df.shape[0] / 2.0
    figure, ax = plt.subplots(figsize=(height_inches, 5))

    ax.bar([x - .5 for x in range(df.shape[0])], df.ix[:, 1].values,
        color='#333333', edgecolor='#333333')
    ax.set_xticks(range(df.shape[0]))
    ax.set_xticklabels(df.ix[:, 0].values, rotation=90)
    ax.set_xlim([-1, df.shape[0] - .5])
    ax.set_ylabel(df.columns[1])
    ax.set_xlabel(df.columns[0])
    ax.set_title('\n    '.join(wrap(bql, 80)))

    return figure
Exemple #4
0
def heatmap(bdb, bql=None, df=None, **kwargs):
    """Plot clustered heatmap of pairwise matrix.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    bql : str
        The BQL to run and plot. Must be a PAIRWISE BQL query if specified.
        One of bql or df must be specified.
    df : pandas.DataFrame(columns=['generator_id', 'name0', 'name1', 'value'])
        If bql is not specified, take data from here.

    **kwargs : dict
        Passed to zmatrix: vmin, vmax, row_ordering, col_ordering

    Returns
    -------
    clustermap: seaborn.clustermap
    """
    assert bql is not None or df is not None
    assert bql is None or df is None
    if bql is not None:
        df = bqlu.cursor_to_df(bdb.execute(bql))
    df.fillna(0, inplace=True)
    return zmatrix(df, **kwargs)
Exemple #5
0
def _query_into_queue(query_string, queue, bdb_file):
    """
    Estimate pairwise similarity of a certain subset of the bdb according to
    query_string; place it in the multiprocessing Manager.Queue().

    For two technical reasons, this function is defined as a toplevel class and
    independently creates a bdb handle:

    1) Multiprocessing workers must be pickleable, and thus must be
       declared as toplevel functions;
    2) Multiple threads cannot access the same bdb handle, lest concurrency
       issues arise with corrupt data.

    Parameters
    ----------
    query_string : str
        Name of the query to execute, determined by estimate_similarity_mp.
    queue : multiprocessing.Manager.Queue
        Queue to place results into
    bdb_file : str
        File location of the BayesDB database. This function will
        independently open a new BayesDB handler.
    """
    bdb = bayesdb_open(pathname=bdb_file)
    res = bdb.execute(query_string)
    queue.put(cursor_to_df(res))
Exemple #6
0
def _query_into_queue(query_string, params, queue, bdb_file):
    """
    Estimate pairwise similarity of a certain subset of the bdb according to
    query_string; place it in the multiprocessing Manager.Queue().

    For two technical reasons, this function is defined as a toplevel class and
    independently creates a bdb handle:

    1) Multiprocessing workers must be pickleable, and thus must be
       declared as toplevel functions;
    2) Multiple threads cannot access the same bdb handle, lest concurrency
       issues arise with corrupt data.

    Parameters
    ----------
    query_string : str
        Name of the query to execute, determined by estimate_similarity_mp.
    queue : multiprocessing.Manager.Queue
        Queue to place results into
    bdb_file : str
        File location of the BayesDB database. This function will
        independently open a new BayesDB handler.
    """
    bdb = bayesdb_open(pathname=bdb_file)
    res = bdb.execute(query_string, params)
    queue.put(cursor_to_df(res))
Exemple #7
0
def histogram(bdb, bql, bins=15, normed=None):
    """Plot histogram of one- or two-column table.

    If two-column, subdivide the first column according to labels in
    the second column

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    bql : str
        The BQL to run and histogram.
    bins : int, optional
        Number of bins in the histogram.
    normed : bool, optional
        Normalize the histograms?

    Returns
    ----------
    figure: matplotlib.figure.Figure
    """
    df = bqlu.cursor_to_df(bdb.execute(bql))
    figure = comparative_hist(df, bdb=bdb, nbins=bins, normed=normed)

    return figure
Exemple #8
0
def test_one_variable():
    (df, bdb) = prepare()
    for var in ['categorical_1', 'few_ints_3', 'floats_3', 'many_ints_4',
                'skewed_numeric_5']:
      cursor = bdb.execute('SELECT %s FROM plottest' % (var,))
      df = cursor_to_df(cursor)
      f = BytesIO()
      do((df, bdb), f, show_contour=False)
      assert has_nontrivial_contents_over_white_background(flush(f))
      cursor = bdb.execute('SELECT %s, categorical_2 FROM plottest' % (var,))
      df = cursor_to_df(cursor)
      f = BytesIO()
      do((df, bdb), f, colorby='categorical_2', show_contour=False)
      assert has_nontrivial_contents_over_white_background(flush(f))
      f = BytesIO()
      do((df, bdb), f, colorby='categorical_2', show_contour=True)
      assert has_nontrivial_contents_over_white_background(flush(f))
def run_heatmap(bdb, location, gen, *args, **kwargs):
    plt.figure()
    qg = bayeslite.bql_quote_name(gen)
    df = cursor_to_df(bdb.execute('''
        estimate dependence probability from pairwise columns of %s
    ''' % (qg,)))
    bdbcontrib.plot_utils.heatmap(df, *args, **kwargs)
    plt.show()
    plt.savefig(location)
    print 'wrote heatmap to %r' % (location,)
def run_heatmap(bdb, location, gen, *args, **kwargs):
    plt.figure()
    qg = bayeslite.bql_quote_name(gen)
    df = cursor_to_df(
        bdb.execute('''
        estimate dependence probability from pairwise columns of %s
    ''' % (qg, )))
    bdbcontrib.plot_utils.heatmap(df, *args, **kwargs)
    plt.show()
    plt.savefig(location)
    print 'wrote heatmap to %r' % (location, )
def test_one_variable():
    (df, bdb) = prepare()
    for var in ['categorical_1', 'few_ints_3', 'floats_3', 'many_ints_4',
                'skewed_numeric_5']:
      cursor = bdb.execute('SELECT %s FROM plottest' % (var,))
      df = cursor_to_df(cursor)
      f = BytesIO()
      run_pairplot((df, bdb), f, show_contour=False)
      assert has_nontrivial_contents_over_white_background(flush(f))
      cursor = bdb.execute('SELECT %s, categorical_2 FROM plottest' % (var,))
      df = cursor_to_df(cursor)
      f = BytesIO()
      run_pairplot((df, bdb), f, colorby='categorical_2', show_contour=False)
      assert has_nontrivial_contents_over_white_background(flush(f))
      f = BytesIO()
      run_pairplot((df, bdb), f, colorby='categorical_2', show_contour=True)
      assert has_nontrivial_contents_over_white_background(flush(f))
    with pytest.raises(BLE) as exc:
      run_pairplot((df, bdb), f, colorby='floats_3')
      assert 'non-categorical' in str(exc)
    with pytest.raises(BLE):
      run_pairplot((df, bdb), f, colorby='categorical_2',
                   stattypes={'categorical_2': 'numerical'})
      assert 'non-categorical' in str(exc)
def prepare():
    (df, csv_str) = dataset()
    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    bdb = bayeslite.bayesdb_open()
    # XXX Do we not have a bayesdb_read_df ?
    bayesdb_read_csv(bdb, 'plottest', flush(csv_str), header=True, create=True)
    bdb.execute('''
        create generator plottest_cc for plottest using crosscat(guess(*))
    ''')

    # do a plot where a some sub-violins are removed
    _remove_violin_bql = """
        DELETE FROM plottest
            WHERE categorical_1 = "B"
                AND (few_ints_3 = 2 OR few_ints_3 = 1);
    """
    cursor = bdb.execute('SELECT * FROM plottest')
    df = cursor_to_df(cursor)
    return (df, bdb)
Exemple #13
0
def prepare():
    (df, csv_str) = dataset()
    os.environ['BAYESDB_WIZARD_MODE']='1'
    bdb = bayeslite.bayesdb_open()
    # XXX Do we not have a bayesdb_read_df ?
    bayesdb_read_csv(bdb, 'plottest', flush(csv_str),
                     header=True, create=True)
    bdb.execute('''
        create generator plottest_cc for plottest using crosscat(guess(*))
    ''')

    # do a plot where a some sub-violins are removed
    _remove_violin_bql = """
        DELETE FROM plottest
            WHERE categorical_1 = "B"
                AND (few_ints_3 = 2 OR few_ints_3 = 1);
    """
    cursor = bdb.execute('SELECT * FROM plottest')
    df = cursor_to_df(cursor)
    return (df, bdb)
Exemple #14
0
def pairplot(bdb, bql, generator_name=None, show_contour=False, colorby=None,
        show_missing=False, show_full=False):
    """Plot array of plots for all pairs of columns.

    Plots continuous-continuous pairs as scatter (optional KDE contour).
    Plots continuous-categorical pairs as violinplot.
    Plots categorical-categorical pairs as heatmap.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    bql : str
        The BQL to run and pairplot.
    generator_name : str, optional
        The name of generator; explicitly passing in provides optimizations.
    show_contour : bool, optional
        If True, KDE contours are plotted on top of scatter plots
        and histograms.
    show_missing : bool, optional
        If True, rows with one missing value are plotted as lines on scatter
        plots.
    colorby : str, optional
        Name of a column to use to color data points in histograms and scatter
        plots.
    show_full : bool, optional
        Show full pairwise plots, rather than only lower triangular plots.

    Returns
    -------
    figure : matplotlib.figure.Figure
    """
    df = bqlu.cursor_to_df(bdb.execute(bql))
    figure = _pairplot(df, bdb=bdb, generator_name=generator_name,
        show_contour=show_contour, colorby=colorby, show_missing=show_missing,
        show_full=show_full)
    figure.tight_layout()
    inches = len(df.columns) * 4
    figure.set_size_inches((inches, inches))

    return figure
Exemple #15
0
def selected_heatmaps(bdb, selectors, bql=None, df=None, **kwargs):
    """Yield heatmaps of pairwise matrix, broken up according to selectors.

    Parameters
    ----------
    bdb : bayeslite.BayesDB
        Active BayesDB instance.
    selectors : [lambda name --> bool]
        Rather than plot the full NxN matrix all together, make separate plots
        for each combination of these selectors, plotting them in sequence.
        If selectors are specified, yields clustermaps, which caller is
        responsible for showing or saving, and then closing.
    bql : str
        The BQL to run and plot. Must be a PAIRWISE BQL query if specified.
        One of bql or df must be specified.
    df : pandas.DataFrame(columns=['generator_id', 'name0', 'name1', 'value'])
        If bql is not specified, then take data from here instead.
    **kwargs : dict
        Passed to zmatrix: vmin, vmax, row_ordering, col_ordering

    Yields
    ------
    The triple (clustermap, selector1, selector2).  It is recommended that
    caller keep a dict of these functions to names to help identify each one.
    """
    # Cannot specify neither or both.
    assert bql is not None or df is not None
    assert bql is None or df is None
    if bql is not None:
        df = bqlu.cursor_to_df(bdb.execute(bql))
    df.fillna(0, inplace=True)
    for n0selector in selectors:
        n0selection = df.iloc[:, 1].map(n0selector)
        for n1selector in selectors:
            n1selection = df.iloc[:, 2].map(n1selector)
            this_block = df[n0selection & n1selection]
            if len(this_block) > 0:
                yield (zmatrix(this_block, vmin=0, vmax=1, **kwargs),
                       n0selector, n1selector)
Exemple #16
0
def test_cursor_to_df():
    with bayeslite.bayesdb_open() as bdb:
        bql_utils.cursor_to_df(bdb.execute('select * from sqlite_master'))
        bql_utils.cursor_to_df(
            bdb.execute('select * from sqlite_master'
                        ' where 0 = 1'))
Exemple #17
0
def estimate_pairwise_similarity(bdb_file, table, model, sim_table=None,
                                 cores=None, N=None, overwrite=False):
    """
    Estimate pairwise similarity from the given model, splitting processing
    across multiple processors, and save results into sim_table.

    Because called methods in this function must also open up separate BayesDB
    instances, this function accepts a BayesDB filename, rather than an actual
    bayeslite.BayesDB object.

    Parameters
    ----------
    bdb_file : str
        File location of the BayesDB database object. This function will
        handle opening the file with bayeslite.bayesdb_open.
    table : str
        Name of the table containing the raw data.
    model : str
        Name of the metamodel to estimate from.
    sim_table : str
        Name of the table to insert similarity results into. Defaults to
        table name + '_similarity'.
    cores : int
        Number of processors to use. Defaults to the number of cores as
        identified by multiprocessing.num_cores.
    N : int
        Number of rows for which to estimate pairwise similarities (so
        N^2 calculations are done). Should be used just to test small
        batches; currently, there is no control over what specific pairwise
        similarities are estimated with this parameter.
    overwrite : bool
        Whether to overwrite the sim_table if it already exists. If
        overwrite=False and the table exists, function will raise
        sqlite3.OperationalError. Default True.
    """
    bdb = bayesdb_open(pathname=bdb_file)

    if cores is None:
        cores = mp.cpu_count()

    if cores < 1:
        raise BLE(ValueError(
            "Invalid number of cores {}".format(cores)))

    if sim_table is None:
        sim_table = table + '_similarity'

    # Get number of occurrences in the database
    count_cursor = bdb.execute('SELECT COUNT(*) FROM {}'.format(table))
    table_count = int(cursor_to_df(count_cursor)['"COUNT"(*)'][0])
    if N is None:
        N = table_count
    elif N > table_count:
        raise BLE(ValueError(
            "Asked for N={} rows but {} rows in table".format(N, table_count)))

    # Calculate the size (# of similarities to compute) and
    # offset (where to start) calculation for each worker query.

    # Divide sizes evenly, and make the last job finish the remainder
    sizes = [(N * N) / cores for i in range(cores)]
    sizes[-1] += (N * N) % cores

    total = 0
    offsets = [total]
    for size in sizes[:-1]:
        total += size
        offsets.append(total)

    q_template = ('ESTIMATE SIMILARITY FROM PAIRWISE {} '.format(model) +
                  'LIMIT {} OFFSET {}')  # Format sizes/offsets later

    queries = [q_template.format(*so) for so in zip(sizes, offsets)]

    # Create the similarity table. Assumes original table has rowid column.
    # XXX: tables from verbnet bdb don't necessarily have an
    # autoincrementing primary key other than rowid (doesn't work).
    # So we ought to ask for a foreign key, but ESTIMATE SIMILARITY
    # returns numerical values rather than row names, so that code
    # would have to be changed first. For now, we eliminate
    # REFERENCE {table}(foreign_key) from the name0 and name1 col specs.
    if overwrite:
        bdb.sql_execute('DROP TABLE IF EXISTS {}'.format(sim_table))

    bdb.sql_execute('''
        CREATE TABLE {sim_table} (
            rowid0 INTEGER NOT NULL,
            rowid1 INTEGER NOT NULL,
            value DOUBLE NOT NULL
        )
    '''.format(sim_table=sim_table))

    # Define the helper which inserts data into table in batches
    def insert_into_sim(df):
        """
        Use the main thread bdb handle to successively insert results of
        ESTIMATEs into the table.
        """
        # Because the bayeslite implementation of sqlite3 doesn't allow
        # inserts of > 500 rows at a time (else sqlite3.OperationalError),
        # we split the list into chunks of size 500 and perform multiple
        # insert statements.
        rows = map(list, df.values)
        rows_str = ['({})'.format(','.join(map(str, r))) for r in rows]
        for rows_chunk in _chunks(rows_str, 500):
            insert_str = '''
                INSERT INTO {} (rowid0, rowid1, value) VALUES {};
            '''.format(sim_table, ','.join(rows_chunk))
            bdb.sql_execute(insert_str)

    pool = mp.Pool(processes=cores)

    manager = mp.Manager()
    queue = manager.Queue()

    for query in queries:
        pool.apply_async(
            _query_into_queue, args=(query, queue, bdb_file)
        )

    # Close pool and wait for processes to finish
    # FIXME: This waits for all processes to finish before inserting
    # into the table, which means that memory usage is potentially very
    # high!
    pool.close()
    pool.join()

    # Process returned results
    while not queue.empty():
        df = queue.get()
        insert_into_sim(df)
Exemple #18
0
def test_cursor_to_df():
    with bayeslite.bayesdb_open() as bdb:
        bql_utils.cursor_to_df(bdb.execute('select * from sqlite_master'))
        bql_utils.cursor_to_df(bdb.execute('select * from sqlite_master'
                ' where 0 = 1'))
def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.
    """
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(test_bql_utils.csv_data)
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=True)

        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # How to properly use the estimate_pairwise_similarity function.
        parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')

        # Should complain with bad core value
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  cores=0)

        # Should complain if overwrite flag is not set, but t_similarity
        # exists
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')
        # Should complain if model and table don't exist
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(bdb_file.name, 'foo',
                                                  'foo_cc')
        # Should complain if bdb_file doesn't exist
        with tempfile.NamedTemporaryFile() as does_not_exist:
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(does_not_exist.name, 't',
                                                      't_cc')

        # Should run fine if overwrite flag is set
        parallel.estimate_pairwise_similarity(bdb_file.name,
                                              't',
                                              't_cc',
                                              overwrite=True)

        # Should be able to specify another table name
        parallel.estimate_pairwise_similarity(bdb_file.name,
                                              't',
                                              't_cc',
                                              sim_table='t_similarity_2')

        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
        parallel_sim_2 = cursor_to_df(
            bdb.execute(
                'SELECT * FROM t_similarity_2 ORDER BY rowid0, rowid1'))

        # Results may be returned out of order. So we sort the values,
        # as above, and we reorder the numeric index
        parallel_sim.index = range(parallel_sim.shape[0])
        parallel_sim_2.index = range(parallel_sim_2.shape[0])

        # The data from two successive parallel pairwise estimates should be
        # identical to each other...
        assert_frame_equal(parallel_sim,
                           parallel_sim_2,
                           check_column_type=True)
        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)