def test_one_variable(): (df, bdb) = prepare() for var in [ 'categorical_1', 'few_ints_3', 'floats_3', 'many_ints_4', 'skewed_numeric_5' ]: cursor = bdb.execute('SELECT %s FROM plottest' % (var, )) df = cursor_to_df(cursor) f = BytesIO() run_pairplot((df, bdb), f, show_contour=False) assert has_nontrivial_contents_over_white_background(flush(f)) cursor = bdb.execute('SELECT %s, categorical_2 FROM plottest' % (var, )) df = cursor_to_df(cursor) f = BytesIO() run_pairplot((df, bdb), f, colorby='categorical_2', show_contour=False) assert has_nontrivial_contents_over_white_background(flush(f)) f = BytesIO() run_pairplot((df, bdb), f, colorby='categorical_2', show_contour=True) assert has_nontrivial_contents_over_white_background(flush(f)) with pytest.raises(BLE) as exc: run_pairplot((df, bdb), f, colorby='floats_3') assert 'non-categorical' in str(exc) with pytest.raises(BLE): run_pairplot((df, bdb), f, colorby='categorical_2', stattypes={'categorical_2': 'numerical'}) assert 'non-categorical' in str(exc)
def test_estimate_pairwise_similarity_long(): """ Tests larger queries that need to be broken into batch inserts of 500 values each, as well as the N parameter. """ with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file: bdb = bayeslite.bayesdb_open(bdb_file.name) with tempfile.NamedTemporaryFile() as temp: # n = 40 -> 40**2 -> 1600 rows total temp.write(_bigger_csv_data(40)) temp.seek(0) bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True, create=True) bdb.execute(''' CREATE GENERATOR t_cc FOR t USING crosscat ( GUESS(*), id IGNORE ) ''') bdb.execute('INITIALIZE 3 MODELS FOR t_cc') bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT') # test N = 0 parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', N=0) assert cursor_to_df( bdb.execute('SELECT * FROM t_similarity')).shape == (0, 0) # test other values of N for N in [1, 2, 10, 20, 40]: parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', N=N, overwrite=True) assert cursor_to_df( bdb.execute('SELECT * FROM t_similarity')).shape == (N**2, 3) # N too high should fail with pytest.raises(BLE): parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', N=41, overwrite=True) parallel_sim = cursor_to_df( bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1')) parallel_sim.index = range(parallel_sim.shape[0]) std_sim = cursor_to_df( bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')) assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
def barplot(bdb, bql): """Plot bar-plot of query giving categories and heights. First column specifies names; second column specifies heights. Parameters ---------- bdb : bayeslite.BayesDB Active BayesDB instance. bql : str The BQL to run and histogram. Must have a two-column result. Returns ---------- figure: matplotlib.figure.Figure """ df = bqlu.cursor_to_df(bdb.execute(bql)) if df.shape[1] != 2: raise BLE(ValueError( 'Need two columns of output from SELECT for barplot.')) height_inches = df.shape[0] / 2.0 figure, ax = plt.subplots(figsize=(height_inches, 5)) ax.bar([x - .5 for x in range(df.shape[0])], df.ix[:, 1].values, color='#333333', edgecolor='#333333') ax.set_xticks(range(df.shape[0])) ax.set_xticklabels(df.ix[:, 0].values, rotation=90) ax.set_xlim([-1, df.shape[0] - .5]) ax.set_ylabel(df.columns[1]) ax.set_xlabel(df.columns[0]) ax.set_title('\n '.join(wrap(bql, 80))) return figure
def heatmap(bdb, bql=None, df=None, **kwargs): """Plot clustered heatmap of pairwise matrix. Parameters ---------- bdb : bayeslite.BayesDB Active BayesDB instance. bql : str The BQL to run and plot. Must be a PAIRWISE BQL query if specified. One of bql or df must be specified. df : pandas.DataFrame(columns=['generator_id', 'name0', 'name1', 'value']) If bql is not specified, take data from here. **kwargs : dict Passed to zmatrix: vmin, vmax, row_ordering, col_ordering Returns ------- clustermap: seaborn.clustermap """ assert bql is not None or df is not None assert bql is None or df is None if bql is not None: df = bqlu.cursor_to_df(bdb.execute(bql)) df.fillna(0, inplace=True) return zmatrix(df, **kwargs)
def _query_into_queue(query_string, queue, bdb_file): """ Estimate pairwise similarity of a certain subset of the bdb according to query_string; place it in the multiprocessing Manager.Queue(). For two technical reasons, this function is defined as a toplevel class and independently creates a bdb handle: 1) Multiprocessing workers must be pickleable, and thus must be declared as toplevel functions; 2) Multiple threads cannot access the same bdb handle, lest concurrency issues arise with corrupt data. Parameters ---------- query_string : str Name of the query to execute, determined by estimate_similarity_mp. queue : multiprocessing.Manager.Queue Queue to place results into bdb_file : str File location of the BayesDB database. This function will independently open a new BayesDB handler. """ bdb = bayesdb_open(pathname=bdb_file) res = bdb.execute(query_string) queue.put(cursor_to_df(res))
def _query_into_queue(query_string, params, queue, bdb_file): """ Estimate pairwise similarity of a certain subset of the bdb according to query_string; place it in the multiprocessing Manager.Queue(). For two technical reasons, this function is defined as a toplevel class and independently creates a bdb handle: 1) Multiprocessing workers must be pickleable, and thus must be declared as toplevel functions; 2) Multiple threads cannot access the same bdb handle, lest concurrency issues arise with corrupt data. Parameters ---------- query_string : str Name of the query to execute, determined by estimate_similarity_mp. queue : multiprocessing.Manager.Queue Queue to place results into bdb_file : str File location of the BayesDB database. This function will independently open a new BayesDB handler. """ bdb = bayesdb_open(pathname=bdb_file) res = bdb.execute(query_string, params) queue.put(cursor_to_df(res))
def histogram(bdb, bql, bins=15, normed=None): """Plot histogram of one- or two-column table. If two-column, subdivide the first column according to labels in the second column Parameters ---------- bdb : bayeslite.BayesDB Active BayesDB instance. bql : str The BQL to run and histogram. bins : int, optional Number of bins in the histogram. normed : bool, optional Normalize the histograms? Returns ---------- figure: matplotlib.figure.Figure """ df = bqlu.cursor_to_df(bdb.execute(bql)) figure = comparative_hist(df, bdb=bdb, nbins=bins, normed=normed) return figure
def test_one_variable(): (df, bdb) = prepare() for var in ['categorical_1', 'few_ints_3', 'floats_3', 'many_ints_4', 'skewed_numeric_5']: cursor = bdb.execute('SELECT %s FROM plottest' % (var,)) df = cursor_to_df(cursor) f = BytesIO() do((df, bdb), f, show_contour=False) assert has_nontrivial_contents_over_white_background(flush(f)) cursor = bdb.execute('SELECT %s, categorical_2 FROM plottest' % (var,)) df = cursor_to_df(cursor) f = BytesIO() do((df, bdb), f, colorby='categorical_2', show_contour=False) assert has_nontrivial_contents_over_white_background(flush(f)) f = BytesIO() do((df, bdb), f, colorby='categorical_2', show_contour=True) assert has_nontrivial_contents_over_white_background(flush(f))
def run_heatmap(bdb, location, gen, *args, **kwargs): plt.figure() qg = bayeslite.bql_quote_name(gen) df = cursor_to_df(bdb.execute(''' estimate dependence probability from pairwise columns of %s ''' % (qg,))) bdbcontrib.plot_utils.heatmap(df, *args, **kwargs) plt.show() plt.savefig(location) print 'wrote heatmap to %r' % (location,)
def run_heatmap(bdb, location, gen, *args, **kwargs): plt.figure() qg = bayeslite.bql_quote_name(gen) df = cursor_to_df( bdb.execute(''' estimate dependence probability from pairwise columns of %s ''' % (qg, ))) bdbcontrib.plot_utils.heatmap(df, *args, **kwargs) plt.show() plt.savefig(location) print 'wrote heatmap to %r' % (location, )
def test_one_variable(): (df, bdb) = prepare() for var in ['categorical_1', 'few_ints_3', 'floats_3', 'many_ints_4', 'skewed_numeric_5']: cursor = bdb.execute('SELECT %s FROM plottest' % (var,)) df = cursor_to_df(cursor) f = BytesIO() run_pairplot((df, bdb), f, show_contour=False) assert has_nontrivial_contents_over_white_background(flush(f)) cursor = bdb.execute('SELECT %s, categorical_2 FROM plottest' % (var,)) df = cursor_to_df(cursor) f = BytesIO() run_pairplot((df, bdb), f, colorby='categorical_2', show_contour=False) assert has_nontrivial_contents_over_white_background(flush(f)) f = BytesIO() run_pairplot((df, bdb), f, colorby='categorical_2', show_contour=True) assert has_nontrivial_contents_over_white_background(flush(f)) with pytest.raises(BLE) as exc: run_pairplot((df, bdb), f, colorby='floats_3') assert 'non-categorical' in str(exc) with pytest.raises(BLE): run_pairplot((df, bdb), f, colorby='categorical_2', stattypes={'categorical_2': 'numerical'}) assert 'non-categorical' in str(exc)
def prepare(): (df, csv_str) = dataset() os.environ['BAYESDB_WIZARD_MODE'] = '1' bdb = bayeslite.bayesdb_open() # XXX Do we not have a bayesdb_read_df ? bayesdb_read_csv(bdb, 'plottest', flush(csv_str), header=True, create=True) bdb.execute(''' create generator plottest_cc for plottest using crosscat(guess(*)) ''') # do a plot where a some sub-violins are removed _remove_violin_bql = """ DELETE FROM plottest WHERE categorical_1 = "B" AND (few_ints_3 = 2 OR few_ints_3 = 1); """ cursor = bdb.execute('SELECT * FROM plottest') df = cursor_to_df(cursor) return (df, bdb)
def prepare(): (df, csv_str) = dataset() os.environ['BAYESDB_WIZARD_MODE']='1' bdb = bayeslite.bayesdb_open() # XXX Do we not have a bayesdb_read_df ? bayesdb_read_csv(bdb, 'plottest', flush(csv_str), header=True, create=True) bdb.execute(''' create generator plottest_cc for plottest using crosscat(guess(*)) ''') # do a plot where a some sub-violins are removed _remove_violin_bql = """ DELETE FROM plottest WHERE categorical_1 = "B" AND (few_ints_3 = 2 OR few_ints_3 = 1); """ cursor = bdb.execute('SELECT * FROM plottest') df = cursor_to_df(cursor) return (df, bdb)
def pairplot(bdb, bql, generator_name=None, show_contour=False, colorby=None, show_missing=False, show_full=False): """Plot array of plots for all pairs of columns. Plots continuous-continuous pairs as scatter (optional KDE contour). Plots continuous-categorical pairs as violinplot. Plots categorical-categorical pairs as heatmap. Parameters ---------- bdb : bayeslite.BayesDB Active BayesDB instance. bql : str The BQL to run and pairplot. generator_name : str, optional The name of generator; explicitly passing in provides optimizations. show_contour : bool, optional If True, KDE contours are plotted on top of scatter plots and histograms. show_missing : bool, optional If True, rows with one missing value are plotted as lines on scatter plots. colorby : str, optional Name of a column to use to color data points in histograms and scatter plots. show_full : bool, optional Show full pairwise plots, rather than only lower triangular plots. Returns ------- figure : matplotlib.figure.Figure """ df = bqlu.cursor_to_df(bdb.execute(bql)) figure = _pairplot(df, bdb=bdb, generator_name=generator_name, show_contour=show_contour, colorby=colorby, show_missing=show_missing, show_full=show_full) figure.tight_layout() inches = len(df.columns) * 4 figure.set_size_inches((inches, inches)) return figure
def selected_heatmaps(bdb, selectors, bql=None, df=None, **kwargs): """Yield heatmaps of pairwise matrix, broken up according to selectors. Parameters ---------- bdb : bayeslite.BayesDB Active BayesDB instance. selectors : [lambda name --> bool] Rather than plot the full NxN matrix all together, make separate plots for each combination of these selectors, plotting them in sequence. If selectors are specified, yields clustermaps, which caller is responsible for showing or saving, and then closing. bql : str The BQL to run and plot. Must be a PAIRWISE BQL query if specified. One of bql or df must be specified. df : pandas.DataFrame(columns=['generator_id', 'name0', 'name1', 'value']) If bql is not specified, then take data from here instead. **kwargs : dict Passed to zmatrix: vmin, vmax, row_ordering, col_ordering Yields ------ The triple (clustermap, selector1, selector2). It is recommended that caller keep a dict of these functions to names to help identify each one. """ # Cannot specify neither or both. assert bql is not None or df is not None assert bql is None or df is None if bql is not None: df = bqlu.cursor_to_df(bdb.execute(bql)) df.fillna(0, inplace=True) for n0selector in selectors: n0selection = df.iloc[:, 1].map(n0selector) for n1selector in selectors: n1selection = df.iloc[:, 2].map(n1selector) this_block = df[n0selection & n1selection] if len(this_block) > 0: yield (zmatrix(this_block, vmin=0, vmax=1, **kwargs), n0selector, n1selector)
def test_cursor_to_df(): with bayeslite.bayesdb_open() as bdb: bql_utils.cursor_to_df(bdb.execute('select * from sqlite_master')) bql_utils.cursor_to_df( bdb.execute('select * from sqlite_master' ' where 0 = 1'))
def estimate_pairwise_similarity(bdb_file, table, model, sim_table=None, cores=None, N=None, overwrite=False): """ Estimate pairwise similarity from the given model, splitting processing across multiple processors, and save results into sim_table. Because called methods in this function must also open up separate BayesDB instances, this function accepts a BayesDB filename, rather than an actual bayeslite.BayesDB object. Parameters ---------- bdb_file : str File location of the BayesDB database object. This function will handle opening the file with bayeslite.bayesdb_open. table : str Name of the table containing the raw data. model : str Name of the metamodel to estimate from. sim_table : str Name of the table to insert similarity results into. Defaults to table name + '_similarity'. cores : int Number of processors to use. Defaults to the number of cores as identified by multiprocessing.num_cores. N : int Number of rows for which to estimate pairwise similarities (so N^2 calculations are done). Should be used just to test small batches; currently, there is no control over what specific pairwise similarities are estimated with this parameter. overwrite : bool Whether to overwrite the sim_table if it already exists. If overwrite=False and the table exists, function will raise sqlite3.OperationalError. Default True. """ bdb = bayesdb_open(pathname=bdb_file) if cores is None: cores = mp.cpu_count() if cores < 1: raise BLE(ValueError( "Invalid number of cores {}".format(cores))) if sim_table is None: sim_table = table + '_similarity' # Get number of occurrences in the database count_cursor = bdb.execute('SELECT COUNT(*) FROM {}'.format(table)) table_count = int(cursor_to_df(count_cursor)['"COUNT"(*)'][0]) if N is None: N = table_count elif N > table_count: raise BLE(ValueError( "Asked for N={} rows but {} rows in table".format(N, table_count))) # Calculate the size (# of similarities to compute) and # offset (where to start) calculation for each worker query. # Divide sizes evenly, and make the last job finish the remainder sizes = [(N * N) / cores for i in range(cores)] sizes[-1] += (N * N) % cores total = 0 offsets = [total] for size in sizes[:-1]: total += size offsets.append(total) q_template = ('ESTIMATE SIMILARITY FROM PAIRWISE {} '.format(model) + 'LIMIT {} OFFSET {}') # Format sizes/offsets later queries = [q_template.format(*so) for so in zip(sizes, offsets)] # Create the similarity table. Assumes original table has rowid column. # XXX: tables from verbnet bdb don't necessarily have an # autoincrementing primary key other than rowid (doesn't work). # So we ought to ask for a foreign key, but ESTIMATE SIMILARITY # returns numerical values rather than row names, so that code # would have to be changed first. For now, we eliminate # REFERENCE {table}(foreign_key) from the name0 and name1 col specs. if overwrite: bdb.sql_execute('DROP TABLE IF EXISTS {}'.format(sim_table)) bdb.sql_execute(''' CREATE TABLE {sim_table} ( rowid0 INTEGER NOT NULL, rowid1 INTEGER NOT NULL, value DOUBLE NOT NULL ) '''.format(sim_table=sim_table)) # Define the helper which inserts data into table in batches def insert_into_sim(df): """ Use the main thread bdb handle to successively insert results of ESTIMATEs into the table. """ # Because the bayeslite implementation of sqlite3 doesn't allow # inserts of > 500 rows at a time (else sqlite3.OperationalError), # we split the list into chunks of size 500 and perform multiple # insert statements. rows = map(list, df.values) rows_str = ['({})'.format(','.join(map(str, r))) for r in rows] for rows_chunk in _chunks(rows_str, 500): insert_str = ''' INSERT INTO {} (rowid0, rowid1, value) VALUES {}; '''.format(sim_table, ','.join(rows_chunk)) bdb.sql_execute(insert_str) pool = mp.Pool(processes=cores) manager = mp.Manager() queue = manager.Queue() for query in queries: pool.apply_async( _query_into_queue, args=(query, queue, bdb_file) ) # Close pool and wait for processes to finish # FIXME: This waits for all processes to finish before inserting # into the table, which means that memory usage is potentially very # high! pool.close() pool.join() # Process returned results while not queue.empty(): df = queue.get() insert_into_sim(df)
def test_cursor_to_df(): with bayeslite.bayesdb_open() as bdb: bql_utils.cursor_to_df(bdb.execute('select * from sqlite_master')) bql_utils.cursor_to_df(bdb.execute('select * from sqlite_master' ' where 0 = 1'))
def test_estimate_pairwise_similarity(): """ Tests basic estimate pairwise similarity functionality against existing BQL estimate queries. """ with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file: bdb = bayeslite.bayesdb_open(bdb_file.name) with tempfile.NamedTemporaryFile() as temp: temp.write(test_bql_utils.csv_data) temp.seek(0) bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True, create=True) bdb.execute(''' CREATE GENERATOR t_cc FOR t USING crosscat ( GUESS(*), id IGNORE ) ''') bdb.execute('INITIALIZE 3 MODELS FOR t_cc') bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT') # How to properly use the estimate_pairwise_similarity function. parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc') # Should complain with bad core value with pytest.raises(BLE): parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', cores=0) # Should complain if overwrite flag is not set, but t_similarity # exists with pytest.raises(SQLError): parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc') # Should complain if model and table don't exist with pytest.raises(SQLError): parallel.estimate_pairwise_similarity(bdb_file.name, 'foo', 'foo_cc') # Should complain if bdb_file doesn't exist with tempfile.NamedTemporaryFile() as does_not_exist: with pytest.raises(SQLError): parallel.estimate_pairwise_similarity(does_not_exist.name, 't', 't_cc') # Should run fine if overwrite flag is set parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', overwrite=True) # Should be able to specify another table name parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', sim_table='t_similarity_2') parallel_sim = cursor_to_df( bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1')) parallel_sim_2 = cursor_to_df( bdb.execute( 'SELECT * FROM t_similarity_2 ORDER BY rowid0, rowid1')) # Results may be returned out of order. So we sort the values, # as above, and we reorder the numeric index parallel_sim.index = range(parallel_sim.shape[0]) parallel_sim_2.index = range(parallel_sim_2.shape[0]) # The data from two successive parallel pairwise estimates should be # identical to each other... assert_frame_equal(parallel_sim, parallel_sim_2, check_column_type=True) # ...and to a standard estimate pairwise similarity. std_sim = cursor_to_df( bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')) assert_frame_equal(std_sim, parallel_sim, check_column_type=True)