def cgpm_smoke_bdb():
    """Yield an in-memory BayesDB with a small table `t` and population `p`.

    A CGPM metamodel whose registry holds only `piecewise` is registered,
    `t` is filled with 27 rows derived from the index triple (i, j, k), and
    population `p` is created over it.  An odd i, j, or k stores NULL in
    the output, cat, or input column respectively.

    NOTE(review): this is a bare generator; presumably a context-manager
    decorator is applied outside the visible source -- confirm.
    """
    insert_row = '''
        INSERT INTO t (output, cat, input) VALUES (?, ?, ?)
    '''
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        metamodel = CGPM_Metamodel({'piecewise': PieceWise}, multiprocess=0)
        bayesdb_register_metamodel(bdb, metamodel)
        bdb.sql_execute('CREATE TABLE t (Output, cat, Input)')
        for i in xrange(3):
            for j in xrange(3):
                for k in xrange(3):
                    # Each column is nulled out when its own index is odd.
                    out_val = None if i % 2 else i + j / (k + 1)
                    if j % 2:
                        cat_val = None
                    elif (i + j * k) % 2:
                        cat_val = -1
                    else:
                        cat_val = +1
                    in_val = None if k % 2 else (i * j - k)**2
                    bdb.sql_execute(insert_row, (out_val, cat_val, in_val))
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                MODEL output, input AS NUMERICAL;
                MODEL cat AS CATEGORICAL
            )
        ''')
        yield bdb
def cgpm_dummy_satellites_pop_bdb():
    """Yield the dummy satellites BayesDB with a population and CGPM backend.

    Creates the `satellites` population over `satellites_ucs` and then
    registers a CGPM metamodel with an empty registry.

    NOTE(review): this is a bare generator; presumably a context-manager
    decorator is applied outside the visible source -- confirm.
    """
    population_schema = '''
        create population satellites for satellites_ucs with schema(
            model apogee as numerical;
            model class_of_orbit as categorical;
            model country_of_operator as categorical;
            model launch_mass as numerical;
            model perigee as numerical;
            model period as numerical
        )
    '''
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute(population_schema)
        cgpm_backend = CGPM_Metamodel(dict(), multiprocess=0)
        bayesdb_register_metamodel(bdb, cgpm_backend)
        yield bdb
def test_bad_analyze_vars():
    """ANALYZE rejects VARIABLES and SKIP clauses naming unknown variables.

    Skipped when the cgpm linear-regression module cannot be imported.
    """
    try:
        from cgpm.regressions.linreg import LinearRegression
    except ImportError:
        pytest.skip('no sklearn')
        return
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        cgpm_backend = CGPM_Metamodel({
            'kepler': Kepler,
            'linreg': LinearRegression,
        })
        bayesdb_register_metamodel(bdb, cgpm_backend)
        bdb.execute('''
            CREATE METAMODEL satellites_cgpm FOR satellites USING cgpm
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_cgpm')

        # Both an empty option list and no option list at all are accepted.
        bdb.execute('ANALYZE satellites_cgpm FOR 1 ITERATION WAIT ()')
        bdb.execute('ANALYZE satellites_cgpm FOR 1 ITERATION WAIT')

        # Either clause must fail on the misspelled variable.
        for clause in ('VARIABLES', 'SKIP'):
            with pytest.raises(BQLError):
                # Unknown variable `perige'.
                bdb.execute('''
                    ANALYZE satellites_cgpm FOR 1 ITERATION WAIT (
                        %s period, perige
                    )
                ''' % (clause,))
def test_regress_bonanza__ci_integration():
    """Exercise REGRESS over numerical, nominal, mixed and selected givens.

    Every successful REGRESS result is validated by
    `check_regression_variables`: each row is a pair whose first element is
    a unique coefficient name that is either an expected numerical name or
    a dummy-coded `<nominal>_dum_...` name.  Invalid queries must raise.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_metamodel(
            bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS NOMINAL;
                MODEL country_of_operator AS NOMINAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE METAMODEL m FOR satellites WITH BASELINE crosscat;
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')

        def check_regression_variables(results, numericals, nominals):
            """Assert each result row names a distinct, expected coefficient."""
            seen = set()
            for r in results:
                assert len(r) == 2
                variable = r[0]
                assert variable not in seen
                assert variable in numericals or \
                    any(variable.startswith('%s_dum_' % (nominal,))
                        for nominal in nominals)
                seen.add(variable)

        # Regression on 1 numerical variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        assert len(results) == 2
        check_regression_variables(results, ['intercept', 'perigee'], [])

        # Regression on 1 nominal variable.
        results = bdb.execute('''
            REGRESS apogee GIVEN (country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept'], ['country_of_operator'])

        # Regression on 1 nominal + 1 numerical variable.  The result must
        # be captured here; otherwise the check below would silently
        # re-validate the rows of the previous query.
        results = bdb.execute('''
            REGRESS apogee GIVEN (perigee, country_of_operator)
            USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results, ['intercept', 'perigee'], ['country_of_operator'])

        # Regression on all variables.  The query has no placeholder, so
        # no bindings are supplied.
        results = bdb.execute('''
            REGRESS apogee GIVEN (*) USING 12 SAMPLES BY satellites;
        ''').fetchall()
        check_regression_variables(
            results,
            ['intercept', 'perigee', 'launch_mass', 'period',],
            ['country_of_operator', 'class_of_orbit',],
        )

        # Regression on column selector subexpression with a binding.
        results = bdb.execute('''
            REGRESS apogee GIVEN (
                satellites.(
                    ESTIMATE * FROM VARIABLES OF satellites
                    ORDER BY dependence probability with apogee DESC
                    LIMIT ?
                )
            )
            USING 12 SAMPLES BY satellites MODELLED BY m USING MODEL 1;
        ''', (3,)).fetchall()

        # Re-run the selector on its own to learn which variables the
        # regression above should have used.
        cursor = bdb.execute('''
            ESTIMATE * FROM VARIABLES OF satellites
            ORDER BY dependence probability with apogee DESC
            LIMIT ?
        ''', (3,)).fetchall()
        top_variables = [c[0] for c in cursor]
        nominals = [
            var for var in top_variables
            if var in ['country_of_operator', 'class_of_orbit',]
        ]
        numericals = [var for var in top_variables if var not in nominals]
        check_regression_variables(
            results, numericals + ['intercept'], nominals)

        # Cannot mix * with other variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                REGRESS apogee GIVEN (*, class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()

        # Not enough data for regression, 1 unique nominal variable.
        with pytest.raises(ValueError):
            bdb.execute('''
                REGRESS apogee GIVEN (class_of_orbit)
                USING 1 SAMPLES BY satellites;
            ''').fetchall()
def test_analysis_subproblems_basic():
    """Run ANALYZE with SUBPROBLEM(S), VARIABLES and ROWS restrictions.

    Each subproblem is exercised with and without OPTIMIZED; variable
    hyperparameters are checked separately because OPTIMIZED rejects them.
    ROWS is accepted for rows in the generator's subsample and rejected
    for the remaining rowids below 20.
    """
    row_analysis_template = '''
        ANALYZE g0 ANALYSIS 3 FOR 1 ITERATION WAIT (
            VARIABLES class_of_orbit;
            ROWS %s;
            SUBPROBLEMS (
                row clustering,
                row clustering concentration
            );
            %s
        )
    '''
    backends = ['', 'OPTIMIZED;']
    subproblems = [
        'variable clustering',
        'variable clustering concentration',
        'row clustering',
        'row clustering concentration',
    ]
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bayesdb_register_metamodel(
            bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE ANALYSIS SCHEMA g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 4 ANALYSES FOR g0')

        # Test each subproblem individually except for variable
        # hyperparameters.
        for backend in backends:
            for subproblem in subproblems:
                bdb.execute('''
                    ANALYZE g0 ANALYSES 0,1 FOR 4 ITERATION WAIT(
                        SUBPROBLEM %s;
                        %s
                    );
                ''' % (subproblem, backend))

        # Test variable hyperparameters.
        bdb.execute('''
            ANALYZE g0 FOR 1 ITERATION WAIT (
                VARIABLES period, launch_mass;
                SUBPROBLEM variable hyperparameters;
            )
        ''')
        with pytest.raises(BQLError):
            # OPTIMIZED backend does not support variable hyperparameters.
            bdb.execute('''
                ANALYZE g0 FOR 1 SECONDS WAIT (
                    SUBPROBLEM variable hyperparameters;
                    OPTIMIZED;
                )
            ''')

        # Test rows.
        generator_id = bayeslite.core.bayesdb_get_generator(bdb, None, 'g0')
        subsample_cursor = bdb.execute('''
            SELECT table_rowid FROM bayesdb_cgpm_individual
            WHERE generator_id = ?
        ''', (generator_id,))
        subsample_rows = [row[0] for row in subsample_cursor]
        bad_rows = sorted(set(xrange(20)) - set(subsample_rows))
        good_row_list = ','.join(map(str, subsample_rows))
        bad_row_list = ','.join(map(str, bad_rows))
        for backend in backends:
            bdb.execute(row_analysis_template % (good_row_list, backend))
            with pytest.raises(BQLError):
                # Fail on rows not in the population or subsample.
                bdb.execute(row_analysis_template % (bad_row_list, backend))
'bayesdb_open', 'bayesdb_read_csv', 'bayesdb_read_csv_file', 'bayesdb_register_metamodel', 'bayesdb_upgrade_schema', 'bql_quote_name', 'IBayesDBMetamodel', 'IBayesDBTracer', ] # Register crosscat as a builtin metamodel. from bayeslite.metamodels.crosscat import CrosscatMetamodel from crosscat.LocalEngine import LocalEngine as CrosscatLocalEngine bayesdb_builtin_metamodel(CrosscatMetamodel(CrosscatLocalEngine(seed=0))) # Register cgpm as a builtin metamodel. from bayeslite.metamodels.cgpm_metamodel import CGPM_Metamodel bayesdb_builtin_metamodel(CGPM_Metamodel({}, multiprocess=True)) import bayeslite.remote import os if not 'BAYESDB_DISABLE_VERSION_CHECK' in os.environ: bayeslite.remote.version_check() # Notebooks should contain comment lines documenting this behavior and # offering a solution, like so: # Please keep BayesDB up to date. To disable remote version checking: # import os; os.environ['BAYESDB_DISABLE_VERSION_CHECK'] = '1'
def test_predictive_relevance():
    """Exercise ESTIMATE PREDICTIVE RELEVANCE in its FROM and BY forms.

    Covers self-relevance of existing rows, mixed EXISTING/HYPOTHETICAL
    query rows, parameter bindings, error cases (no matching target row,
    unknown context variable, no matching existing row, unknown categorical
    value), and rows inserted after analysis, which have not been
    incorporated into the models.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_metamodel(bdb, CGPM_Metamodel(cgpm_registry=dict()))
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA (
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bdb.execute('CREATE METAMODEL m FOR satellites;')
        bdb.execute('INITIALIZE 2 MODELS FOR m;')
        bdb.execute('ANALYZE m FOR 25 ITERATION WAIT (OPTIMIZED);')

        # Check self-similarities, and also provide coverage of bindings.
        # Each of the first four rows is compared against itself; binding
        # the loop's rowid (rather than a constant) is what makes the
        # loop test more than one row.
        rowids = bdb.execute('SELECT OID from satellites_ucs;').fetchall()
        for (rowid,) in rowids[:4]:
            cursor = bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    TO EXISTING ROWS (rowid = ?)
                    IN THE CONTEXT OF "period"
                FROM satellites
                WHERE rowid = ?
            ''', (rowid, rowid,))
            assert next(cursor)[0] == 1.

        # A full extravaganza query, using FROM (as a 1-row).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                TO EXISTING ROWS
                    (country_of_operator = 'Russia' AND period < 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (perigee=1.0, launch_mass=120),
                    (country_of_operator='Bulgaria', perigee=2.0))
                IN THE CONTEXT OF "country_of_operator"
            FROM satellites
            LIMIT 5
        ''').fetchall()
        assert len(cursor) == 5
        assert all(0 <= c[0] <= 1 for c in cursor)

        # A full extravaganza query, using BY (as a constant).
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS
                    (country_of_operator = 'Russia' AND period < 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0),
                    (country_of_operator='Bulgaria'))
                IN THE CONTEXT OF "country_of_operator"
            BY satellites
        ''').fetchall()
        assert len(cursor) == 1
        assert all(0 <= c[0] <= 1 for c in cursor)

        # Hypothetical satellite with negative perigee should not be
        # similar, and use a binding to just ensure that they work.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (perigee = ?))
                IN THE CONTEXT OF "perigee"
            FROM satellites
            LIMIT 5
        ''', (-10000,)).fetchall()
        assert len(cursor) == 5
        assert all(np.allclose(c[0], 0) for c in cursor)

        # No matching target OF row.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid < 0)
                    TO EXISTING ROWS (rowid = 10)
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Unknown CONTEXT variable "banana".
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 1)
                    TO EXISTING ROWS (rowid = 2)
                    IN THE CONTEXT OF "banana"
                BY satellites
            ''')

        # No matching EXISTING ROW.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 10)
                    TO EXISTING ROWS (rowid < 0)
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Unknown categorical values 'Mongolia' in HYPOTHETICAL ROWS.
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE RELEVANCE
                    OF (rowid = 10)
                    TO HYPOTHETICAL ROWS WITH VALUES (
                        (country_of_operator='Mongolia'),
                        (country_of_operator='Bulgaria', perigee=2.0))
                    IN THE CONTEXT OF "launch_mass"
                BY satellites
            ''')

        # Create a new row.
        bdb.sql_execute('''
            INSERT INTO satellites_ucs (apogee, launch_mass)
            VALUES (12.128, 12.128)
        ''')

        # TARGET ROW not yet incorporated should produce no relevance
        # value; the cursor surfaces that as None.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (apogee = 12.128)
                TO HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0))
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None

        # EXISTING ROW not yet incorporated should likewise produce None,
        # since there is no hypothetical row to fall back on.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128)
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is None

        # Although apogee = 12.128 is EXISTING but not incorporated, there
        # are other EXISTING ROWS with apogee > 0, so we should still get
        # a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None

        # Although apogee = 12.128 is EXISTING but not incorporated, there
        # are other HYPOTHETICAL ROWS, so we should still get a result.
        cursor = bdb.execute('''
            ESTIMATE PREDICTIVE RELEVANCE
                OF (rowid = 1)
                TO EXISTING ROWS (apogee = 12.128 OR apogee > 0)
                AND HYPOTHETICAL ROWS WITH VALUES (
                    (country_of_operator='China', perigee=1.0),
                    (country_of_operator='Bulgaria'))
                IN THE CONTEXT OF "launch_mass"
            BY satellites
        ''')
        result = cursor_value(cursor)
        assert result is not None
def test_initialize_with_all_nulls():
    """Initializing models over an all-NULL manifest column must fail.

    Column `a` of table `t` is NULL in every row.  INITIALIZE raises
    BQLError when `a` is modeled by the crosscat baseline as NUMERICAL or
    NOMINAL.  It succeeds when `a` is ignored, and also when `a` is
    overridden by the dummy `barebones` CGPM.
    """
    rows = [
        (None, None, 3),
        (None, None, 1),
        (None, None, 1),
        (None, -2, 1),
        (None, -5, 1),
        (None, 2, 3),
    ]
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        cgpm_backend = CGPM_Metamodel(
            {'barebones': BareBonesCGpm}, multiprocess=0)
        bayesdb_register_metamodel(bdb, cgpm_backend)

        # Create table with all missing values for a.
        bdb.sql_execute('''
            CREATE TABLE t (a REAL, b REAL, c REAL);
        ''')
        for row in rows:
            bdb.sql_execute('INSERT INTO t VALUES (?,?,?)', row)

        # Fail when a is numerical and modeled by crosscat.
        bdb.execute('''
            CREATE POPULATION p FOR t WITH SCHEMA(
                MODEL a, b, c AS NUMERICAL
            )
        ''')
        bdb.execute('''
            CREATE METAMODEL m FOR p WITH BASELINE crosscat;
        ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                INITIALIZE 2 MODELS FOR m;
            ''')

        # Fail when a is nominal and modeled by crosscat.
        bdb.execute('''
            CREATE POPULATION p2 FOR t WITH SCHEMA(
                MODEL a AS NOMINAL;
                MODEL b, c AS NUMERICAL
            )
        ''')
        bdb.execute('CREATE METAMODEL m2 FOR p2 WITH BASELINE crosscat;')
        with pytest.raises(BQLError):
            bdb.execute('INITIALIZE 2 MODELS FOR m2;')

        # Succeed when a is ignored.
        bdb.execute('''
            CREATE POPULATION p3 FOR t WITH SCHEMA(
                IGNORE a;
                MODEL b, c AS NUMERICAL
            )
        ''')
        bdb.execute('CREATE METAMODEL m3 FOR p3 WITH BASELINE crosscat;')
        bdb.execute('INITIALIZE 2 MODELS FOR m3;')

        # Succeed when a is numerical but overridden using a dummy CGPM.
        bdb.execute('''
            CREATE METAMODEL m4 FOR p WITH BASELINE crosscat(
                OVERRIDE MODEL FOR a GIVEN b USING barebones
            )
        ''')
        bdb.execute('INITIALIZE 2 MODELS FOR m4;')
        bdb.execute('ANALYZE m4 FOR 1 ITERATION WAIT;')
def test_output_stattypes():
    """Check statistical-type handling for factor-analysis overrides.

    A population schema must give a policy for every column.  Factor
    analysis with a categorical manifest or categorical exposed latent
    fails at INITIALIZE, and declaring the same latent in both LATENT and
    EXPOSE fails at CREATE METAMODEL.  An all-numerical configuration with
    two latents can be initialized, analyzed and queried.

    Skipped when the cgpm factor-analysis module cannot be imported.
    """
    try:
        from cgpm.factor.factor import FactorAnalysis
    except ImportError:
        pytest.skip('no sklearn')
        return
    with cgpm_dummy_satellites_bdb() as bdb:
        # Missing policy for class_of_orbit, perigee, period
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    MODEL apogee, launch_mass AS NUMERICAL;
                    MODEL country_of_operator AS CATEGORICAL
                )
            ''')
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                IGNORE class_of_orbit, perigee, period;
                MODEL apogee, launch_mass AS NUMERICAL;
                MODEL country_of_operator AS CATEGORICAL
            )
        ''')
        cgpm_backend = CGPM_Metamodel({'factor_analysis': FactorAnalysis})
        bayesdb_register_metamodel(bdb, cgpm_backend)

        # Creating factor analysis with categorical manifest should crash.
        bdb.execute('''
            CREATE METAMODEL satellites_g0 FOR satellites(
                OVERRIDE MODEL FOR apogee, country_of_operator
                AND EXPOSE pc_1 NUMERICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g0')

        with pytest.raises(BQLError):
            # Duplicate pc_2 in LATENT and EXPOSE.
            bdb.execute('''
                CREATE METAMODEL satellites_g1 FOR satellites(
                    LATENT pc_2 CATEGORICAL,
                    OVERRIDE GENERATIVE MODEL FOR
                        apogee, launch_mass
                    AND EXPOSE pc_2 CATEGORICAL
                    USING factor_analysis(L=1)
                )
            ''')

        # Creating factor analysis with categorical latent should crash.
        bdb.execute('''
            CREATE METAMODEL satellites_g1 FOR satellites(
                OVERRIDE GENERATIVE MODEL FOR
                    apogee, launch_mass
                AND EXPOSE pc_2 CATEGORICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g1')

        # Creating factor analysis with all numerical should be ok.
        bdb.execute('''
            CREATE METAMODEL satellites_g2 FOR satellites USING cgpm(
                LATENT pc_3 NUMERICAL;
                OVERRIDE MODEL FOR apogee, launch_mass, pc_3, pc_4
                USING factor_analysis(L=2);
                LATENT pc_4 NUMERICAL
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR satellites_g2')
        bdb.execute('ANALYZE satellites_g2 FOR 2 ITERATION WAIT;')

        # Cannot transition baseline and foreign using timed analysis.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE satellites_g2 FOR 2 SECONDS WAIT (
                    VARIABLES country_of_operator, apogee, launch_mass, pc_3);
            ''')
        bdb.execute('''
            ANALYZE satellites_g2 FOR 1 ITERATION WAIT (
                VARIABLES apogee, launch_mass);
        ''')

        # Dependence probability of manifest with latent, then of latent
        # with latent.
        for first, second in [('apogee', 'pc_3'), ('pc_3', 'pc_4')]:
            depprob_rows = bdb.execute('''
                ESTIMATE DEPENDENCE PROBABILITY OF %s WITH %s
                BY satellites MODELED BY satellites_g2;
            ''' % (first, second)).fetchall()
            assert depprob_rows[0][0] == 1.

        # Mutual information of latent with manifest, then of latent with
        # latent.  These only need to run without crashing.
        for first, second in [('apogee', 'pc_4'), ('pc_3', 'pc_4')]:
            bdb.execute('''
                ESTIMATE MUTUAL INFORMATION OF %s WITH %s USING 1 SAMPLES
                BY satellites MODELED BY satellites_g2;
            ''' % (first, second)).fetchall()
def test_unknown_stattype():
    """Check how an invented statistical type (QUAGGA) is handled.

    QUAGGA is rejected in a population schema until it is inserted into
    `bayesdb_stattype`.  After that, a CGPM metamodel cannot model it by
    default or use it as an input, but accepts it once an explicit
    category model is set for the column.

    Skipped when the cgpm linear-regression module cannot be imported.
    """
    try:
        from cgpm.regressions.linreg import LinearRegression
    except ImportError:
        pytest.skip('no sklearn')
        return
    quagga_population = '''
        CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
            MODEL apogee, perigee, launch_mass, period AS NUMERICAL;
            MODEL class_of_orbit, country_of_operator AS NOMINAL;
            MODEL relaunches AS QUAGGA
        )
    '''
    with cgpm_dummy_satellites_bdb() as bdb:
        # Add a column called relaunches, sum of apogee and perigee.
        bdb.sql_execute('ALTER TABLE satellites_ucs ADD COLUMN relaunches')
        n_rows = bdb.sql_execute('''
            SELECT COUNT(*) FROM satellites_ucs
        ''').next()[0]
        # Rowids are addressed as 1..n_rows.
        for table_rowid in xrange(1, n_rows + 1):
            bdb.sql_execute('''
                UPDATE satellites_ucs
                    SET relaunches = (SELECT apogee + perigee)
                    WHERE _rowid_ = ?
            ''', (table_rowid,))

        # Nobody will ever create a QUAGGA statistical type!
        with pytest.raises(BQLError):
            # No such statistical type at the moment.
            bdb.execute(quagga_population)

        # Invent the statistical type.
        bdb.sql_execute(
            'INSERT INTO bayesdb_stattype VALUES (?)', ('quagga',))
        bdb.execute(quagga_population)

        cgpm_backend = CGPM_Metamodel({
            'kepler': Kepler,
            'linreg': LinearRegression,
        })
        bayesdb_register_metamodel(bdb, cgpm_backend)

        with pytest.raises(BQLError):
            # Can't model QUAGGA by default.
            bdb.execute('CREATE METAMODEL g0 FOR satellites USING cgpm')
        with pytest.raises(BQLError):
            # Can't model QUAGGA as input.
            bdb.execute('''
                CREATE METAMODEL g0 FOR satellites USING cgpm (
                    OVERRIDE MODEL FOR relaunches GIVEN apogee USING linreg;
                    OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
                )
            ''')

        # Can model QUAGGA with an explicit distribution family.
        bdb.execute('''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON
            )
        ''')
        bdb.execute('''
            CREATE METAMODEL g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR relaunches TO POISSON;
                OVERRIDE MODEL FOR period GIVEN relaunches USING linreg
            )
        ''')
def test_cgpm_kepler():
    """Smoke-test two CGPM generators that override the model for `period`.

    `g0` overrides period with linreg; `g1` overrides it with kepler, uses
    an exponential category model for launch_mass and a subsample of 20.
    The test checks the subsample bookkeeping, the accepted and rejected
    ANALYZE option combinations, and that a range of queries run and
    return results of the expected shape and type.

    Skipped when the cgpm linear-regression module cannot be imported.
    """
    try:
        from cgpm.regressions.linreg import LinearRegression
    except ImportError:
        pytest.skip('no sklearn')
        return
    count_individuals = 'SELECT COUNT(*) FROM bayesdb_cgpm_individual'
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bdb.execute('''
            ESTIMATE CORRELATION from PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        cgpm_backend = CGPM_Metamodel(
            {'kepler': Kepler, 'linreg': LinearRegression},
            multiprocess=0)
        bayesdb_register_metamodel(bdb, cgpm_backend)
        bdb.execute('''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                OVERRIDE GENERATIVE MODEL FOR period
                GIVEN apogee, perigee
                USING linreg
            )
        ''')
        bdb.execute('INITIALIZE 1 MODEL FOR g0')
        individuals_before = bdb.execute(count_individuals).fetchvalue()

        # Another generator: exponential launch mass instead of normal.
        bdb.execute('''
            CREATE METAMODEL g1 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR launch_mass TO EXPONENTIAL;
                OVERRIDE MODEL FOR period GIVEN apogee, perigee
                    USING kepler(quagga = eland);
                SUBSAMPLE 20
            )
        ''')
        individuals_after = bdb.execute(count_individuals).fetchvalue()
        assert individuals_after - individuals_before == 20
        bdb.execute('INITIALIZE 1 MODEL IF NOT EXISTS FOR g1')

        for generator in ('g0', 'g1'):
            bdb.execute('ANALYZE %s FOR 1 ITERATION WAIT' % (generator,))
            bdb.execute(
                'ANALYZE %s FOR 1 ITERATION WAIT (VARIABLES period)'
                % (generator,))

        # OPTIMIZED is ignored because period is a foreign variable.
        bdb.execute('''
            ANALYZE g1 FOR 1 ITERATION WAIT (OPTIMIZED; VARIABLES period)
        ''')
        # This should fail since we have a SET CATEGORY MODEL which is not
        # compatible with lovecat. The ValueError is from cgpm not
        # bayeslite.
        with pytest.raises(ValueError):
            bdb.execute('''
                ANALYZE g1 FOR 1 ITERATION WAIT
                    (OPTIMIZED; VARIABLES launch_mass)
            ''')
        # Cannot use timed analysis with mixed variables.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS WAIT (VARIABLES period, apogee)
            ''')
        # Cannot use timed analysis with mixed variables (period by SKIP).
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g1 FOR 5 SECONDS WAIT (SKIP apogee)
            ''')
        # OK to use iteration analysis with mixed values.
        bdb.execute('''
            ANALYZE g1 FOR 1 ITERATION WAIT (VARIABLES period, apogee)
        ''')

        # Queries that only need to run to completion.
        crash_tests = [
            '''
            ESTIMATE DEPENDENCE PROBABILITY
            FROM PAIRWISE VARIABLES OF satellites
            ''',
            '''
            ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
            ''',
            '''
            ESTIMATE PROBABILITY DENSITY OF period = 42
                GIVEN (apogee = 8 AND perigee = 7)
            BY satellites
            ''',
            '''
            SIMULATE apogee, perigee, period FROM satellites LIMIT 100
            ''',
            '''
            INFER EXPLICIT
                PREDICT apogee
                CONFIDENCE apogee_confidence
                USING 5 SAMPLES
            FROM satellites LIMIT 2
            ''',
        ]
        for query in crash_tests:
            bdb.execute(query).fetchall()

        with_confidence = bdb.execute('''
            INFER EXPLICIT
                PREDICT class_of_orbit
                CONFIDENCE class_of_orbit_confidence
            FROM satellites LIMIT 2
        ''').fetchall()
        first_row = with_confidence[0]
        assert len(first_row) == 2
        assert isinstance(first_row[0], unicode)
        assert isinstance(first_row[1], float)

        # No CONFIDENCE specified.
        without_confidence = bdb.execute('''
            INFER EXPLICIT PREDICT class_of_orbit USING 2 SAMPLES
            FROM satellites LIMIT 2
        ''').fetchall()
        first_row = without_confidence[0]
        assert len(first_row) == 1
        assert isinstance(first_row[0], unicode)

        bdb.execute('DROP MODELS FROM g0')
        bdb.execute('DROP METAMODEL g0')
        bdb.execute('DROP METAMODEL g1')
def test_using_modelnos():
    """Restrict queries and analysis to specific analyses (model numbers).

    Runs several queries with USING ANALYSIS, then analyzes the two
    analyses separately and checks via each state's `logscore` diagnostics
    that only the targeted analysis advanced.  Out-of-range model numbers
    must raise BQLError.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                MODEL apogee AS NUMERICAL;
                MODEL class_of_orbit AS CATEGORICAL;
                MODEL country_of_operator AS CATEGORICAL;
                MODEL launch_mass AS NUMERICAL;
                MODEL perigee AS NUMERICAL;
                MODEL period AS NUMERICAL
            )
        ''')
        bayesdb_register_metamodel(
            bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE ANALYSIS SCHEMA g0 FOR satellites USING cgpm(
                SUBSAMPLE 10
            );
        ''')
        bdb.execute('INITIALIZE 2 ANALYSES FOR g0')

        # Predictive probability results should be different for
        # modelnos 0, 1.

        # Crash test simulate.
        bdb.execute('''
            SIMULATE apogee, class_of_orbit
            FROM satellites
            MODELED BY g0
            USING ANALYSIS 0-1
            LIMIT 10
        ''')
        # Crash test infer explicit.
        bdb.execute('''
            INFER EXPLICIT PREDICT period, perigee
            FROM satellites
            MODELED BY g0
            USING ANALYSIS 0
            LIMIT 2
        ''')
        # Crash test dependence probability BY.
        single_depprob = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY OF launch_mass WITH period
            BY satellites
            MODELED BY g0
            USING ANALYSIS 0
        ''')
        assert cursor_value(single_depprob) in [0, 1]
        # Crash test dependence probability pairwise.
        pairwise_depprob = bdb.execute('''
            ESTIMATE DEPENDENCE PROBABILITY
            FROM PAIRWISE VARIABLES OF satellites
            MODELED BY g0
            USING ANALYSIS 1
        ''')
        for row in pairwise_depprob:
            assert row[0] in [0, 1]
        # Crash test mutual information 1row.
        bdb.execute('''
            ESTIMATE MUTUAL INFORMATION WITH (period) USING 1 SAMPLES
            FROM VARIABLES OF satellites
            USING ANALYSIS 0
        ''').fetchall()

        # Test analyze on per-model basis.
        bdb.execute('''
            ANALYZE g0
            ANALYSIS 0
            FOR 1 ITERATION
            CHECKPOINT 1 ITERATION
            WAIT;
        ''')
        engine = bdb.metamodels['cgpm']._engine(bdb, 1)

        def logscore_count(modelno):
            """Return how many logscore diagnostics a state has recorded."""
            return len(engine.states[modelno].diagnostics['logscore'])

        assert logscore_count(0) == 1
        assert logscore_count(1) == 0
        bdb.execute('''
            ANALYZE g0
            ANALYSIS 1
            FOR 4 ITERATION
            CHECKPOINT 1 ITERATION
            WAIT (OPTIMIZED);
        ''')
        assert logscore_count(0) == 1
        assert logscore_count(1) == 4

        # Some errors with bad modelnos.
        with pytest.raises(BQLError):
            bdb.execute('''
                ANALYZE g0 ANALYSIS 0-3 FOR 4 ITERATION WAIT;
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                SIMULATE apogee FROM satellites USING ANALYSIS 25 LIMIT 10;
            ''')
        with pytest.raises(BQLError):
            bdb.execute('''
                ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
                USING MODELS 0-8 LIMIT 2;
            ''')
def test_add_drop_models():
    """Track the bayesdb-to-cgpm model number mapping across add and drop.

    After each INITIALIZE / DROP MODELS step the rows of
    `bayesdb_cgpm_modelno` for the generator must match the expected
    mapping exactly: surviving models keep their relative order with
    contiguous cgpm numbers, and recreated models are appended.
    """
    with cgpm_dummy_satellites_bdb() as bdb:
        bayesdb_register_metamodel(
            bdb, CGPM_Metamodel(dict(), multiprocess=0))
        bdb.execute('''
            CREATE POPULATION p FOR satellites_ucs WITH SCHEMA(
                GUESS STATTYPES FOR (*);
            )
        ''')
        bdb.execute('CREATE METAMODEL m FOR p (SUBSAMPLE 10);')

        # Retrieve id for testing.
        population_id = bayesdb_get_population(bdb, 'p')
        generator_id = bayesdb_get_generator(bdb, population_id, 'm')

        def check_modelno_mapping(lookup):
            """Assert the stored mapping equals `lookup`; consumes `lookup`."""
            stored = bdb.sql_execute('''
                SELECT modelno, cgpm_modelno FROM bayesdb_cgpm_modelno
                WHERE generator_id = ?
            ''', (generator_id,))
            for modelno, cgpm_modelno in stored:
                assert lookup.pop(modelno) == cgpm_modelno
            # Anything left over was expected but not stored.
            assert not lookup

        # Initialize some models.
        bdb.execute('INITIALIZE 16 MODELS FOR m')
        # Assert identity mapping initially.
        check_modelno_mapping(dict((i, i) for i in xrange(16)))

        bdb.execute('ANALYZE m FOR 1 ITERATION WAIT (QUIET);')

        # Drop some models.
        bdb.execute('DROP MODELS 1, 8-12, 14 FROM m')
        # Assert cgpm models are contiguous while bayesdb models are not,
        # with the mapping preserving the strict order.
        check_modelno_mapping({
            0: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 13: 7, 15: 8,
        })

        # Run some analysis again.
        bdb.execute('ANALYZE m FOR 1 ITERATION WAIT (OPTIMIZED; QUIET);')

        # Initialize 14 models if not existing.
        bdb.execute('INITIALIZE 14 MODELS IF NOT EXISTS FOR m')
        # Assert cgpm models are 0-14, while bayesdb are 0-15 excluding 14.
        # Note that INITIALIZE 14 MODELS IF NOT EXISTS does not guarantee
        # that 14 MODELS in total will exist after the query, rather it
        # will initialize any non-existing modelnos with index 0-13, and
        # any modelnos > 14 (modelno 15 in this test case) are untouched.
        check_modelno_mapping({
            0: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 13: 7, 15: 8,
            # Recreated models.
            1: 9, 8: 10, 9: 11, 10: 12, 11: 13, 12: 14,
        })

        # Drop some more models, add them back with some more, and confirm
        # arithmetic and ordering remains correct.
        bdb.execute('DROP MODELS 0-1 FROM m')
        check_modelno_mapping({
            2: 0, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 13: 6, 15: 7,
            # Recreated models.
            8: 8, 9: 9, 10: 10, 11: 11, 12: 12,
        })
        bdb.execute('INITIALIZE 20 MODELS IF NOT EXISTS FOR m;')
        check_modelno_mapping({
            2: 0, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 13: 6, 15: 7,
            # Recreated models.
            8: 8, 9: 9, 10: 10, 11: 11, 12: 12,
            # Re-recreated models.
            0: 13, 1: 14,
            # New models.
            14: 15, 16: 16, 17: 17, 18: 18, 19: 19,
        })

        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODELS 20-50 FROM m')
        # Drop all models.
        bdb.execute('DROP MODELS FROM m;')
        # No such models.
        with pytest.raises(BQLError):
            bdb.execute('DROP MODEL 0 FROM m')

        # Assert cgpm mapping is cleared.
        remaining = bdb.sql_execute('''
            SELECT COUNT(*) FROM bayesdb_cgpm_modelno
            WHERE generator_id = ?
        ''', (generator_id,))
        assert cursor_value(remaining) == 0
def test_cgpm_extravaganza__ci_slow():
    """End-to-end run of a CGPM generator with three foreign models.

    Builds a synthetic `satellites_ucs` table (two orbit classes, 1000 x 10
    apogee/perigee pairs each), creates a generator combining
    venturescript, linreg and forest overrides with two latent variables,
    then runs analysis and a series of queries before dropping everything.

    Skipped when any of the three cgpm modules cannot be imported.
    """
    try:
        from cgpm.regressions.forest import RandomForest
        from cgpm.regressions.linreg import LinearRegression
        from cgpm.venturescript.vscgpm import VsCGpm
    except ImportError:
        pytest.skip('no sklearn or venturescript')
        return
    insert_satellite = '''
        INSERT INTO satellites_ucs
            (name, country_of_operator, launch_mass, class_of_orbit,
                apogee, perigee, period)
            VALUES (?,?,?,?,?,?,?)
    '''
    skip_foreign_analysis = '''
        ANALYZE g0 FOR 1 iteration WAIT (
            SKIP kepler_cluster_id, kepler_noise, period;
        )
    '''
    countries = ['US', 'Russia', 'China', 'Bulgaria']
    orbit_classes = [
        ('geo', lambda x, y: x + y**2),
        ('leo', lambda x, y: math.sin(x + y)),
    ]
    with bayesdb_open(':memory:', builtin_metamodels=False) as bdb:
        # XXX Use the real satellites data instead of this bogosity?
        bdb.sql_execute('''
            CREATE TABLE satellites_ucs (
                name,
                apogee,
                class_of_orbit,
                country_of_operator,
                launch_mass,
                perigee,
                period
            )
        ''')
        prng = bdb._np_prng
        for orbit_class, period_fn in orbit_classes:
            for apogee in xrange(1000):
                for perigee in xrange(10):
                    # Draw order (country, name suffix, mass) is kept
                    # fixed so the generated data does not change.
                    country = countries[prng.randint(0, len(countries))]
                    name = 'sat-%s-%d' % (country, prng.randint(0, 10**8))
                    mass = prng.normal(1000, 50)
                    bdb.sql_execute(insert_satellite, (
                        name, country, mass, orbit_class,
                        apogee, perigee, period_fn(apogee, perigee)))

        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs (
                name IGNORE;
                apogee NUMERICAL;
                class_of_orbit CATEGORICAL;
                country_of_operator CATEGORICAL;
                launch_mass NUMERICAL;
                perigee NUMERICAL;
                period NUMERICAL
            )
        ''')
        bdb.execute('''
            ESTIMATE CORRELATION FROM PAIRWISE VARIABLES OF satellites
        ''').fetchall()
        cgpmt = CGPM_Metamodel({
            'venturescript': VsCGpm,
            'linreg': LinearRegression,
            'forest': RandomForest,
        })
        bayesdb_register_metamodel(bdb, cgpmt)

        # Schemas that must be rejected: two misspell apogee, and one
        # declares apogee as a latent.
        rejected_schemas = [
            '''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apoge TO NORMAL
            )
            ''',
            '''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                OVERRIDE MODEL FOR perigee GIVEN apoge USING linreg
            )
            ''',
            '''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                LATENT apogee NUMERICAL
            )
            ''',
        ]
        for schema in rejected_schemas:
            with pytest.raises(BQLError):
                bdb.execute(schema)

        bdb.execute('''
            CREATE METAMODEL g0 FOR satellites USING cgpm (
                SET CATEGORY MODEL FOR apogee TO NORMAL;

                LATENT kepler_cluster_id NUMERICAL;
                LATENT kepler_noise NUMERICAL;

                OVERRIDE MODEL FOR kepler_cluster_id, kepler_noise, period
                GIVEN apogee, perigee
                USING venturescript (source = "{}");

                OVERRIDE MODEL FOR
                    perigee
                GIVEN apogee USING linreg;

                OVERRIDE MODEL FOR class_of_orbit
                GIVEN apogee, period, perigee, kepler_noise
                USING forest (k = 4);

                SUBSAMPLE 100,
            )
        '''.format(kepler_source))

        population_id = core.bayesdb_get_population(bdb, 'satellites')
        generator_id = core.bayesdb_get_generator(bdb, population_id, 'g0')
        manifest_numbers = [1, 2, 3, 4, 5, 6]
        all_numbers = [-2, -1] + manifest_numbers
        assert core.bayesdb_generator_column_numbers(bdb, generator_id) == \
            all_numbers
        assert core.bayesdb_variable_numbers(bdb, population_id, None) == \
            manifest_numbers
        assert core.bayesdb_variable_numbers(
            bdb, population_id, generator_id) == all_numbers

        # -- MODEL country_of_operator GIVEN class_of_orbit USING forest;
        bdb.execute('INITIALIZE 1 MODELS FOR g0')
        bdb.execute('ANALYZE g0 FOR 1 iteration WAIT (;)')
        bdb.execute('''
            ANALYZE g0 FOR 1 iteration WAIT (VARIABLES kepler_cluster_id)
        ''')
        bdb.execute(skip_foreign_analysis)
        # OPTIMIZED uses the lovecat backend.
        bdb.execute('ANALYZE g0 FOR 20 iteration WAIT (OPTIMIZED)')
        with pytest.raises(Exception):
            # Disallow both SKIP and VARIABLES clauses.
            #
            # XXX Catch a more specific exception.
            bdb.execute('''
                ANALYZE g0 FOR 1 ITERATION WAIT (
                    SKIP kepler_cluster_id;
                    VARIABLES apogee, perigee;
                )
            ''')
        bdb.execute(skip_foreign_analysis)
        bdb.execute('ANALYZE g0 FOR 1 ITERATION WAIT')

        # Queries that only need to run to completion.
        crash_tests = [
            '''
            ESTIMATE DEPENDENCE PROBABILITY
                OF kepler_cluster_id WITH period WITHIN satellites
                MODELLED BY g0
            ''',
            '''
            ESTIMATE PREDICTIVE PROBABILITY OF apogee FROM satellites LIMIT 1
            ''',
            '''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_cluster_id
                FROM satellites MODELLED BY g0 LIMIT 1
            ''',
            '''
            ESTIMATE PREDICTIVE PROBABILITY OF kepler_noise
                FROM satellites MODELLED BY g0 LIMIT 1
            ''',
            '''
            ESTIMATE PREDICTIVE PROBABILITY OF period
                FROM satellites LIMIT 1
            ''',
            '''
            INFER EXPLICIT
                    PREDICT kepler_cluster_id
                    CONFIDENCE kepler_cluster_id_conf
                FROM satellites MODELLED BY g0 LIMIT 2;
            ''',
            '''
            INFER EXPLICIT
                    PREDICT kepler_noise
                    CONFIDENCE kepler_noise_conf
                FROM satellites MODELLED BY g0 LIMIT 2;
            ''',
            '''
            INFER EXPLICIT
                    PREDICT apogee
                    CONFIDENCE apogee_conf
                FROM satellites MODELLED BY g0 LIMIT 1;
            ''',
            '''
            ESTIMATE PROBABILITY OF period = 42
                    GIVEN (apogee = 8 AND perigee = 7)
                BY satellites
            ''',
            '''
            SIMULATE kepler_cluster_id, apogee, perigee, period
                FROM satellites MODELLED BY g0 LIMIT 4
            ''',
        ]
        for query in crash_tests:
            bdb.execute(query).fetchall()

        for teardown in (
                'DROP MODELS FROM g0',
                'DROP METAMODEL g0',
                'DROP POPULATION satellites',
                'DROP TABLE satellites_ucs'):
            bdb.execute(teardown)
def test_output_stattypes():
    """Check statistical-type handling for factor-analysis overrides.

    A population schema must give a policy for every column.  Factor
    analysis with a categorical manifest or categorical exposed latent
    fails at INITIALIZE, and declaring the same latent in both LATENT and
    EXPOSE fails at CREATE METAMODEL.  An all-numerical configuration can
    be initialized and analyzed.

    Skipped when the cgpm factor-analysis module cannot be imported.
    """
    try:
        from cgpm.factor.factor import FactorAnalysis
    except ImportError:
        pytest.skip('no sklearn')
        return
    with cgpm_dummy_satellites_bdb() as bdb:
        # Missing policy for class_of_orbit, perigee, period
        with pytest.raises(BQLError):
            bdb.execute('''
                CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                    MODEL apogee, launch_mass AS NUMERICAL;
                    MODEL country_of_operator AS CATEGORICAL
                )
            ''')
        bdb.execute('''
            CREATE POPULATION satellites FOR satellites_ucs WITH SCHEMA(
                IGNORE class_of_orbit, perigee, period;
                MODEL apogee, launch_mass AS NUMERICAL;
                MODEL country_of_operator AS CATEGORICAL
            )
        ''')
        factor_backend = CGPM_Metamodel({'factor_analysis': FactorAnalysis})
        bayesdb_register_metamodel(bdb, factor_backend)

        # Creating factor analysis with categorical manifest should crash.
        bdb.execute('''
            CREATE METAMODEL satellites_g0 FOR satellites(
                OVERRIDE MODEL FOR apogee, country_of_operator
                AND EXPOSE pc_1 NUMERICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g0')

        with pytest.raises(BQLError):
            # Duplicate pc_2 in LATENT and EXPOSE.
            bdb.execute('''
                CREATE METAMODEL satellites_g1 FOR satellites(
                    LATENT pc_2 CATEGORICAL,
                    OVERRIDE GENERATIVE MODEL FOR
                        apogee, launch_mass
                    AND EXPOSE pc_2 CATEGORICAL
                    USING factor_analysis(L=1)
                )
            ''')

        # Creating factor analysis with categorical latent should crash.
        bdb.execute('''
            CREATE METAMODEL satellites_g1 FOR satellites(
                OVERRIDE GENERATIVE MODEL FOR
                    apogee, launch_mass
                AND EXPOSE pc_2 CATEGORICAL
                USING factor_analysis(L=1)
            )
        ''')
        with pytest.raises(ValueError):
            bdb.execute('INITIALIZE 1 MODEL FOR satellites_g1')

        # Creating factor analysis with all numerical should be ok.
        bdb.execute('''
            CREATE METAMODEL satellites_g2 FOR satellites USING cgpm(
                LATENT pc_3 NUMERICAL;
                OVERRIDE MODEL FOR apogee, launch_mass, pc_3
                USING factor_analysis(L=1)
            )
        ''')
        for statement in (
                'INITIALIZE 1 MODEL FOR satellites_g2',
                'ANALYZE satellites_g2 FOR 2 ITERATION WAIT;'):
            bdb.execute(statement)