def test_estimate_pairwise_similarity_long():
    """
    Exercise parallel.estimate_pairwise_similarity on a larger table.

    The input is big enough that the results need to be broken into batch
    inserts of 500 values each, and the N parameter is checked for zero,
    in-range and out-of-range values.  The final table is compared with
    the result of a plain BQL ESTIMATE SIMILARITY query.
    """
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb_path = bdb_file.name
        bdb = bayeslite.bayesdb_open(bdb_path)

        def similarity_frame():
            # Current contents of the default similarity table.
            return cursor_to_df(bdb.execute('SELECT * FROM t_similarity'))

        with tempfile.NamedTemporaryFile() as csv_file:
            # n = 40 -> 40**2 -> 1600 rows total
            csv_file.write(_bigger_csv_data(40))
            csv_file.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', csv_file.name, header=True, create=True)

        for statement in (
            ''' CREATE GENERATOR t_cc FOR t USING crosscat ( GUESS(*), id IGNORE ) ''',
            'INITIALIZE 3 MODELS FOR t_cc',
            'ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT',
        ):
            bdb.execute(statement)

        # N = 0 is expected to leave a completely empty frame.
        parallel.estimate_pairwise_similarity(bdb_path, 't', 't_cc', N=0)
        assert similarity_frame().shape == (0, 0)

        # Every other accepted N is expected to give N**2 rows of 3 columns.
        for row_count in (1, 2, 10, 20, 40):
            parallel.estimate_pairwise_similarity(
                bdb_path, 't', 't_cc', N=row_count, overwrite=True)
            assert similarity_frame().shape == (row_count ** 2, 3)

        # N too high should fail
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_path, 't', 't_cc', N=41, overwrite=True)

        # Sort and renumber so row order cannot affect the comparison.
        parallel_sim = similarity_frame().sort_values(
            by=['rowid0', 'rowid1'])
        parallel_sim.index = range(len(parallel_sim))

        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))

        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
def query(self, query_string, *bindings):
    '''Basic querying without session capture or reporting.

    The query text is first passed through ``self.interpret_query`` and
    then executed, with ``bindings``, inside a savepoint on ``self.bdb``.
    The resulting cursor is converted with ``bdbcontrib.cursor_to_df``.

    Returns the dataframe on success.  If executing the query or building
    the dataframe raises an ``Exception``, the traceback is logged through
    ``self.logger.exception`` and ``None`` is returned implicitly.
    ``KeyboardInterrupt`` and ``SystemExit`` are not caught, so they
    propagate to the caller.

    help_for_query'''
    self.check_representation()
    query_string = self.interpret_query(query_string)
    self.logger.info("BQL [%s] [%r]", query_string, bindings)
    with self.bdb.savepoint():
        # The try block sits inside the savepoint on purpose: a logged
        # failure leaves the ``with`` block normally instead of raising
        # through it.
        try:
            res = self.bdb.execute(query_string, bindings)
            # A failed check is reported by the handler below, like any
            # other error.
            assert res is not None and res.description is not None
            self.logger.debug(
                "BQL [%s] [%r] has returned a cursor.",
                query_string, bindings)
            df = bdbcontrib.cursor_to_df(res)
            self.logger.debug(
                "BQL [%s] [%r] has created a dataframe.",
                query_string, bindings)
            return df
        except Exception:
            # Best effort: report the failure and fall through to return
            # None.  Catching Exception rather than everything keeps
            # Ctrl-C and interpreter shutdown working.
            self.logger.exception("")
def query(self, query_string, *bindings):
    '''Basic querying without session capture or reporting.

    Before execution, a ``%t`` token is replaced by the quoted
    ``self.name`` and a ``%g`` token by the quoted
    ``self.generator_name``.  A token is only recognised at the start of
    the query or directly after whitespace, and must end at a word
    boundary.  The query is then executed, with ``bindings``, inside a
    savepoint on ``self.bdb`` and the resulting cursor is converted with
    ``bdbcontrib.cursor_to_df``.

    Returns the dataframe on success.  If executing the query or building
    the dataframe raises an ``Exception``, the traceback is logged through
    ``self.logger.exception`` and ``None`` is returned implicitly.
    ``KeyboardInterrupt`` and ``SystemExit`` are not caught, so they
    propagate to the caller.

    help_for_query'''
    self.check_representation()
    table_name = bayeslite.bql_quote_name(self.name)
    generator_name = bayeslite.bql_quote_name(self.generator_name)
    # Callable replacements insert the names literally; a plain
    # replacement string would have its backslash escapes interpreted by
    # re.sub.  %g is expanded first, then %t on the result.
    query_string = re.sub(
        r'(^|(?<=\s))%g\b', lambda _match: generator_name, query_string)
    query_string = re.sub(
        r'(^|(?<=\s))%t\b', lambda _match: table_name, query_string)
    self.logger.info("BQL [%s] [%r]", query_string, bindings)
    with self.bdb.savepoint():
        # The try block sits inside the savepoint on purpose: a logged
        # failure leaves the ``with`` block normally instead of raising
        # through it.
        try:
            res = self.bdb.execute(query_string, bindings)
            # A failed check is reported by the handler below, like any
            # other error.
            assert res is not None and res.description is not None
            self.logger.debug(
                "BQL [%s] [%r] has returned a cursor.",
                query_string, bindings)
            df = bdbcontrib.cursor_to_df(res)
            self.logger.debug(
                "BQL [%s] [%r] has created a dataframe.",
                query_string, bindings)
            return df
        except Exception:
            # Best effort: report the failure and fall through to return
            # None.  Catching Exception rather than everything keeps
            # Ctrl-C and interpreter shutdown working.
            self.logger.exception("")
def test_estimate_pairwise_similarity():
    """
    Check parallel.estimate_pairwise_similarity against plain BQL.

    Covers the default call, the error cases (bad core count, existing
    output table without overwrite, unknown table/generator, a file that
    is not the populated database), the overwrite flag and a custom
    output table name.  The parallel results are then compared with each
    other and with a standard ESTIMATE SIMILARITY query.
    """
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb_path = bdb_file.name
        bdb = bayeslite.bayesdb_open(bdb_path)

        def sorted_frame(select_statement):
            # Results may be returned out of order, so sort by the row id
            # pair and renumber the index from zero.
            frame = cursor_to_df(bdb.execute(select_statement))
            frame = frame.sort_values(by=['rowid0', 'rowid1'])
            frame.index = range(len(frame))
            return frame

        with tempfile.NamedTemporaryFile() as csv_file:
            csv_file.write(test_utils.csv_data)
            csv_file.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', csv_file.name, header=True, create=True)

        for statement in (
            ''' CREATE GENERATOR t_cc FOR t USING crosscat ( GUESS(*), id IGNORE ) ''',
            'INITIALIZE 3 MODELS FOR t_cc',
            'ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT',
        ):
            bdb.execute(statement)

        # The plain, intended way to call the function.
        parallel.estimate_pairwise_similarity(bdb_path, 't', 't_cc')

        # A bad core count must be rejected.
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_path, 't', 't_cc', cores=0)

        # t_similarity now exists, so a second call without the overwrite
        # flag must be rejected.
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(bdb_path, 't', 't_cc')

        # An unknown table and generator must be rejected.
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_path, 'foo', 'foo_cc')

        # A freshly created temporary file, rather than the populated
        # database, must be rejected.
        with tempfile.NamedTemporaryFile() as does_not_exist, \
                pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                does_not_exist.name, 't', 't_cc')

        # With the overwrite flag the existing table may be replaced.
        parallel.estimate_pairwise_similarity(
            bdb_path, 't', 't_cc', overwrite=True)

        # Another output table name may be given.
        parallel.estimate_pairwise_similarity(
            bdb_path, 't', 't_cc', sim_table='t_similarity_2')

        parallel_sim = sorted_frame('SELECT * FROM t_similarity')
        parallel_sim_2 = sorted_frame('SELECT * FROM t_similarity_2')

        # The data from two successive parallel pairwise estimates should
        # be identical to each other...
        assert_frame_equal(
            parallel_sim, parallel_sim_2, check_column_type=True)

        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)