def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.

    Loads the CSV produced by ``_bigger_csv_data(40)`` into table ``t``,
    creates and analyzes the crosscat generator ``t_cc``, then checks the
    shape of ``t_similarity`` for several values of N, that N=41 raises
    ``BLE``, and that the stored result equals the output of
    ``ESTIMATE SIMILARITY FROM PAIRWISE t_cc``.
    """
    # NOTE(review): this is set for the whole process and never restored,
    # so it remains set for whatever runs after this test.
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        try:
            with tempfile.NamedTemporaryFile() as temp:
                # n = 40 -> 40**2 -> 1600 rows total
                temp.write(_bigger_csv_data(40))
                # Rewind after writing so the data is visible when the
                # file is re-opened by name below.
                temp.seek(0)
                bayeslite.bayesdb_read_csv_file(
                    bdb, 't', temp.name, header=True, create=True)

            bdb.execute(
                ''' CREATE GENERATOR t_cc FOR t USING crosscat ( GUESS(*), id IGNORE ) ''')
            bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
            bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

            # test N = 0
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=0
            )
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')
            ).shape == (0, 0)

            # test other values of N
            for N in [1, 2, 10, 20, 40]:
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc', N=N, overwrite=True
                )
                assert cursor_to_df(
                    bdb.execute('SELECT * FROM t_similarity')
                ).shape == (N**2, 3)

            # N too high should fail
            with pytest.raises(BLE):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc', N=41, overwrite=True
                )

            # The comparison below reads t_similarity after the failed N=41
            # call, so it relies on the N=40 result from the loop above
            # still being in place.
            parallel_sim = cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')
            ).sort_values(by=['rowid0', 'rowid1'])
            # Sorting keeps the old index labels; renumber them 0..n-1 so
            # the frames can be compared positionally.
            parallel_sim.index = range(parallel_sim.shape[0])

            std_sim = cursor_to_df(
                bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')
            )

            assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
        finally:
            # Release the database handle before the enclosing with-block
            # deletes the temporary .bdb file backing it.
            bdb.close()
def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.

    Loads the CSV produced by ``_bigger_csv_data(40)`` into table ``t``,
    creates and analyzes the crosscat generator ``t_cc``, then checks the
    shape of ``t_similarity`` for several values of N, that N=41 raises
    ``BLE``, and that the stored result equals the output of
    ``ESTIMATE SIMILARITY FROM PAIRWISE t_cc``.
    """
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        try:
            with tempfile.NamedTemporaryFile() as temp:
                # n = 40 -> 40**2 -> 1600 rows total
                temp.write(_bigger_csv_data(40))
                # Rewind after writing so the data is visible when the
                # file is re-opened by name below.
                temp.seek(0)
                bayeslite.bayesdb_read_csv_file(
                    bdb, 't', temp.name, header=True, create=True)

            bdb.execute(
                ''' CREATE GENERATOR t_cc FOR t USING crosscat ( GUESS(*), id IGNORE ) ''')
            bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
            bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

            # test N = 0
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=0)
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')).shape == (0, 0)

            # test other values of N
            for N in [1, 2, 10, 20, 40]:
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc', N=N, overwrite=True)
                assert cursor_to_df(
                    bdb.execute('SELECT * FROM t_similarity')
                ).shape == (N**2, 3)

            # N too high should fail
            with pytest.raises(BLE):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc', N=41, overwrite=True)

            # The comparison below reads t_similarity after the failed N=41
            # call, so it relies on the N=40 result from the loop above
            # still being in place.  Ordering is done in SQL.
            parallel_sim = cursor_to_df(
                bdb.execute(
                    'SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
            parallel_sim.index = range(parallel_sim.shape[0])

            std_sim = cursor_to_df(
                bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))

            assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
        finally:
            # Release the database handle before the enclosing with-block
            # deletes the temporary .bdb file backing it.
            bdb.close()
def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.

    Loads ``test_utils.csv_data`` into table ``t``, creates and analyzes
    the crosscat generator ``t_cc``, exercises the error cases of
    ``parallel.estimate_pairwise_similarity`` (bad ``cores``, existing
    output table without ``overwrite``, unknown table/generator), and
    finally checks that two parallel runs agree with each other and with
    ``ESTIMATE SIMILARITY FROM PAIRWISE t_cc``.
    """
    # NOTE(review): this is set for the whole process and never restored,
    # so it remains set for whatever runs after this test.
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        try:
            with tempfile.NamedTemporaryFile() as temp:
                temp.write(test_utils.csv_data)
                # Rewind after writing so the data is visible when the
                # file is re-opened by name below.
                temp.seek(0)
                bayeslite.bayesdb_read_csv_file(
                    bdb, 't', temp.name, header=True, create=True)

            bdb.execute(
                ''' CREATE GENERATOR t_cc FOR t USING crosscat ( GUESS(*), id IGNORE ) ''')
            bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
            bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

            # How to properly use the estimate_pairwise_similarity function.
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc'
            )

            # Should complain with bad core value
            with pytest.raises(BLE):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc', cores=0
                )

            # Should complain if overwrite flag is not set, but t_similarity
            # exists
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc'
                )

            # Should complain if model and table don't exist
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 'foo', 'foo_cc'
                )

            # Should complain if the bdb file has no such table/generator.
            # NamedTemporaryFile creates the file, so this exercises a
            # fresh, empty file rather than a missing path.
            with tempfile.NamedTemporaryFile() as empty_bdb_file:
                with pytest.raises(SQLError):
                    parallel.estimate_pairwise_similarity(
                        empty_bdb_file.name, 't', 't_cc'
                    )

            # Should run fine if overwrite flag is set
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', overwrite=True
            )

            # Should be able to specify another table name
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', sim_table='t_similarity_2'
            )

            parallel_sim = cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')
            ).sort_values(by=['rowid0', 'rowid1'])
            parallel_sim_2 = cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity_2')
            ).sort_values(by=['rowid0', 'rowid1'])

            # Results may be returned out of order. So we sort the values,
            # as above, and we reorder the numeric index
            parallel_sim.index = range(parallel_sim.shape[0])
            parallel_sim_2.index = range(parallel_sim_2.shape[0])

            # The data from two successive parallel pairwise estimates
            # should be identical to each other...
            assert_frame_equal(
                parallel_sim, parallel_sim_2, check_column_type=True)
            # ...and to a standard estimate pairwise similarity.
            std_sim = cursor_to_df(
                bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')
            )
            assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
        finally:
            # Release the database handle before the enclosing with-block
            # deletes the temporary .bdb file backing it.
            bdb.close()
def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.

    Loads ``test_bql_utils.csv_data`` into table ``t``, creates and
    analyzes the crosscat generator ``t_cc``, exercises the error cases of
    ``parallel.estimate_pairwise_similarity`` (bad ``cores``, existing
    output table without ``overwrite``, unknown table/generator), and
    finally checks that two parallel runs agree with each other and with
    ``ESTIMATE SIMILARITY FROM PAIRWISE t_cc``.
    """
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        try:
            with tempfile.NamedTemporaryFile() as temp:
                temp.write(test_bql_utils.csv_data)
                # Rewind after writing so the data is visible when the
                # file is re-opened by name below.
                temp.seek(0)
                bayeslite.bayesdb_read_csv_file(
                    bdb, 't', temp.name, header=True, create=True)

            bdb.execute(
                ''' CREATE GENERATOR t_cc FOR t USING crosscat ( GUESS(*), id IGNORE ) ''')
            bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
            bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

            # How to properly use the estimate_pairwise_similarity function.
            parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')

            # Should complain with bad core value
            with pytest.raises(BLE):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc', cores=0)

            # Should complain if overwrite flag is not set, but t_similarity
            # exists
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 't', 't_cc')

            # Should complain if model and table don't exist
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(
                    bdb_file.name, 'foo', 'foo_cc')

            # Should complain if the bdb file has no such table/generator.
            # NamedTemporaryFile creates the file, so this exercises a
            # fresh, empty file rather than a missing path.
            with tempfile.NamedTemporaryFile() as empty_bdb_file:
                with pytest.raises(SQLError):
                    parallel.estimate_pairwise_similarity(
                        empty_bdb_file.name, 't', 't_cc')

            # Should run fine if overwrite flag is set
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', overwrite=True)

            # Should be able to specify another table name
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', sim_table='t_similarity_2')

            parallel_sim = cursor_to_df(
                bdb.execute(
                    'SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
            parallel_sim_2 = cursor_to_df(
                bdb.execute(
                    'SELECT * FROM t_similarity_2 ORDER BY rowid0, rowid1'))

            # Results may be returned out of order. So we sort the values,
            # as above, and we reorder the numeric index
            parallel_sim.index = range(parallel_sim.shape[0])
            parallel_sim_2.index = range(parallel_sim_2.shape[0])

            # The data from two successive parallel pairwise estimates
            # should be identical to each other...
            assert_frame_equal(
                parallel_sim, parallel_sim_2, check_column_type=True)
            # ...and to a standard estimate pairwise similarity.
            std_sim = cursor_to_df(
                bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
            assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
        finally:
            # Release the database handle before the enclosing with-block
            # deletes the temporary .bdb file backing it.
            bdb.close()