def nullify(self, argin):
    """replace user-specified missing value with NULL
    <table> <value>

    Example:
    bayeslite> .nullify mytable NaN
    bayeslite> .nullify mytable ''
    """
    # NOTE(review): the docstring above is kept terse and unchanged in case
    # the shell surfaces it as the command's help text -- confirm.
    #
    # argin: the raw argument string following the command name; it is
    #     split shell-style into <table> and <value>.
    # Returns None.  Argument errors are written to self.stdout instead of
    #     being raised.

    # Tokenize first: shlex.split raises ValueError on malformed input such
    # as an unbalanced quote, and that should be reported to the user like
    # any other bad command line rather than escape from the handler.
    try:
        tokens = shlex.split(argin)
    except ValueError as e:
        self.stdout.write('%s\n' % (e,))
        return

    parser = utils.ArgumentParser(prog='.nullify')
    parser.add_argument('table', type=str, help='Name of the table.')
    parser.add_argument('value', type=str, help='Target string to nullify.')

    try:
        args = parser.parse_args(tokens)
    except utils.ArgparseError as e:
        self.stdout.write('%s' % (e.message,))
        return

    bdbcontrib.nullify(self._bdb, args.table, args.value)
def nullify(self, argin):
    """replace user-specified missing value with NULL
    <table> <value>

    Example:
    bayeslite> .nullify mytable NaN
    bayeslite> .nullify mytable ''
    """
    # NOTE(review): the docstring above is kept terse and unchanged in case
    # the shell surfaces it as the command's help text -- confirm.
    #
    # argin: the raw argument string following the command name; it is
    #     split shell-style into <table> and <value>.
    # Returns None.  Argument errors are written to self.stdout instead of
    #     being raised.

    # Tokenize before building the parser: shlex.split raises ValueError on
    # malformed input (e.g. an unbalanced quote); report it to the user
    # instead of letting it propagate out of the command handler.
    try:
        tokens = shlex.split(argin)
    except ValueError as e:
        self.stdout.write('%s\n' % (e,))
        return

    parser = utils.ArgumentParser(prog='.nullify')
    parser.add_argument('table', type=str, help='Name of the table.')
    parser.add_argument('value', type=str, help='Target string to nullify.')

    try:
        args = parser.parse_args(tokens)
    except utils.ArgparseError as e:
        self.stdout.write('%s' % (e.message,))
        return

    bdbcontrib.nullify(self._bdb, args.table, args.value)
def nullify(self, value):
    """Wraps bdbcontrib.nullify by passing bdb and name.

    bdbcontrib_nullify_doc"""
    # NOTE(review): `bdbcontrib_nullify_doc` above looks like a placeholder
    # that is substituted elsewhere -- keep the token intact; confirm.
    #
    # Supply this object's own bdb handle and name so the caller only has
    # to provide the value being nullified.
    bdb, name = self.bdb, self.name
    bdbcontrib.nullify(bdb, name, value)
def test_composer_integration__ci_slow():
    """Integration test of the Composer metamodel on the satellites table.

    Loads the satellites CSV into an in-memory bdb, creates a composer
    generator mixing default (local) columns with three registered foreign
    predictors, and then exercises, in order: INITIALIZE, ANALYZE,
    DEPENDENCE PROBABILITY, SIMULATE, PROBABILITY, MUTUAL INFORMATION and
    PREDICT CONFIDENCE queries.
    """
    # It is currently difficult to separate these checks into smaller tests
    # because of their sequential nature (each section relies on the state
    # built by the previous ones). We will still test all internal functions
    # with different regimes of operation.

    # SETUP
    # -----
    # Dataset.
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    # Turn the literal string 'NaN' into NULL before modeling.
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')
    # Composer.
    composer = Composer(n_samples=5)
    composer.register_foreign_predictor(
        multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    composer.register_foreign_predictor(random_forest.RandomForest)
    # Use complex generator for interesting test cases.
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    bdb.execute(''' CREATE GENERATOR t1 FOR satellites USING composer( default ( Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL, Users CATEGORICAL, Purpose CATEGORICAL, Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL, Apogee_km NUMERICAL, Eccentricity NUMERICAL, Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL, Power_watts NUMERICAL, Date_of_Launch NUMERICAL, Contractor CATEGORICAL, Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL, Launch_Vehicle CATEGORICAL, Source_Used_for_Orbital_Data CATEGORICAL, longitude_radians_of_geo NUMERICAL, Inclination_radians NUMERICAL, ), random_forest ( Type_of_Orbit CATEGORICAL GIVEN Apogee_km, Perigee_km, Eccentricity, Period_minutes, Launch_Mass_kg, Power_watts, Anticipated_Lifetime, Class_of_orbit ), keplers_law ( Period_minutes NUMERICAL GIVEN Perigee_km, Apogee_km ), multiple_regression ( Anticipated_Lifetime NUMERICAL GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg, Contractor ), DEPENDENT(Apogee_km, Perigee_km, Eccentricity), DEPENDENT(Contractor, Country_of_Contractor), INDEPENDENT(Country_of_Operator, Date_of_Launch) );''')

    # ----------------------
    # TEST INITIALIZE MODELS
    # ----------------------

    bdb.execute('INITIALIZE 2 MODELS FOR t1')
    # Check number of models, both for the composer generator and for the
    # generator named t1_cc.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    assert len(df) == 2
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    assert len(df) == 2

    # -------------------
    # TEST ANALYZE MODELS
    # -------------------

    bdb.execute('ANALYZE t1 FOR 2 ITERATIONS WAIT;')
    # Check number of iterations of composer.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # Check number of iterations of composer_cc.
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2

    # ----------------------------------
    # TEST COLUMN DEPENDENCE PROBABILITY
    # ----------------------------------

    # Special 0/1 regimes.
    # Local with an INDEPENDENT local should be 0.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Date_of_Launch WITH Country_of_Operator FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 0
    # Local with a DEPENDENT local should be 1.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Perigee_km WITH Eccentricity FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Apogee_km WITH Eccentricity FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1
    # Foreign with a local parent should be 1.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Apogee_km FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH Power_watts FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign parent should be 1.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Type_of_Orbit WITH Anticipated_Lifetime FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1.
    # Foreign with a local non-parent DEPENDENT with local parent should be 1.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Eccentricity FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1.
    # Foreign with foreign sharing common direct ancestor should be 1.
    # Launch_Mass_kg is the common parent.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH Type_of_Orbit FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common DEPENDENT ancestor should be 1.
    # Eccentricity is a parent of Type_of_orbit, and is dependent
    # with Period_minutes through DEPENDENT(Apogee_km, Perigee_km, Eccentricity)
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Type_of_Orbit FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1.
    # Column with itself should be 1.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH Anticipated_Lifetime FROM t1 LIMIT 1 ''')
    assert curs.next()[0] == 1.

    # Unknown [0,1] regimes: only the range of the result is checked.
    # Foreign with a local of unknown relation with parents.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH longitude_radians_of_geo FROM t1 LIMIT 1 ''')
    assert 0 <= curs.next()[0] <= 1.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH longitude_radians_of_geo FROM t1 LIMIT 1 ''')
    assert 0 <= curs.next()[0] <= 1.
    # Foreign with a foreign of unknown ancestry relation.
    curs = bdb.execute(''' ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH Period_minutes FROM t1 LIMIT 1 ''')
    assert 0 <= curs.next()[0] <= 1.

    # ----------------------------------
    # TEST SIMULATE
    # ----------------------------------

    # Crash tests for various code paths. Quality of simulations ignored;
    # only the number of returned rows is checked.
    # Joint local.
    curs = bdb.execute(''' SIMULATE Power_watts, Launch_Mass_kg FROM t1 LIMIT 2; ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign.
    curs = bdb.execute(''' SIMULATE Period_minutes FROM t1 GIVEN Apogee_km = 1000, Perigee_km = 980 LIMIT 2; ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign with missing parents.
    curs = bdb.execute(''' SIMULATE Anticipated_Lifetime FROM t1 GIVEN Dry_Mass_kg = 2894, Launch_Mass_kg = 1730 LIMIT 2; ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with parents, and missing parents.
    curs = bdb.execute(''' SIMULATE Type_of_Orbit, Eccentricity FROM t1 GIVEN Dry_Mass_kg = 2894, Launch_Mass_kg = 1730 LIMIT 2; ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with non-parents.
    curs = bdb.execute(''' SIMULATE Period_minutes, Eccentricity FROM t1 GIVEN Apogee_km = 38000 LIMIT 2; ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint local conditioned on two foreigns.
    curs = bdb.execute(''' SIMULATE Country_of_Operator, Inclination_radians FROM t1 GIVEN Period_minutes = 1432, Anticipated_Lifetime = 5 LIMIT 2; ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint foreign conditioned on third foreign.
    curs = bdb.execute(''' SIMULATE Period_minutes, Anticipated_Lifetime FROM t1 GIVEN Type_of_Orbit = 'Deep Highly Eccentric' LIMIT 2 ''')
    assert len(curs.fetchall()) == 2
    # Simulate foreign conditioned on itself: the simulated value must equal
    # the conditioning value in every row.
    curs = bdb.execute(''' SIMULATE Period_minutes, Apogee_km FROM t1 GIVEN Period_minutes = 102 LIMIT 2 ''')
    assert [s[0] for s in curs] == [102] * 2

    # -----------------------------
    # TEST COLUMN VALUE PROBABILITY
    # -----------------------------

    # Crash tests for various code paths. Quality of logpdf ignored.
    # Conditional local.
    curs = bdb.execute(''' ESTIMATE PROBABILITY OF Power_watts = 800 GIVEN (Perigee_km = 980, Launch_Mass_kg = 890) FROM t1 LIMIT 1; ''')
    assert 0. <= curs.next()[0]
    # Unconditional foreign
    curs = bdb.execute(''' ESTIMATE PROBABILITY OF Period_minutes = 1020 FROM t1 LIMIT 1; ''')
    assert 0. <= curs.next()[0]
    # Conditional foreign on parent and non-parents.
    curs = bdb.execute(''' ESTIMATE PROBABILITY OF Period_minutes = 1020 GIVEN (Apogee_km = 38000, Eccentricity = 0.03) FROM t1 LIMIT 1; ''')
    assert 0 <= curs.next()[0]
    # Conditional foreign on foreign.
    curs = bdb.execute(''' ESTIMATE PROBABILITY OF Anticipated_Lifetime = 4.09 GIVEN (Class_of_Orbit = 'LEO', Purpose='Astrophysics', Period_minutes = 1436) FROM t1 LIMIT 1; ''')
    assert 0. <= curs.next()[0]
    # Categorical foreign should be less than 1.
    curs = bdb.execute(''' ESTIMATE PROBABILITY OF Type_of_Orbit = 'Polar' FROM t1 LIMIT 1; ''')
    assert curs.next()[0] <= 1.
    # Query inconsistent with evidence should be 0.
    curs = bdb.execute(''' ESTIMATE PROBABILITY OF "Type_of_Orbit" = 'Polar' GIVEN ("Type_of_Orbit" = 'Deep Highly Eccentric') FROM t1 LIMIT 1; ''')
    assert curs.next()[0] == 0.
    # In theory, query consistent with evidence should be 1, but this is very
    # hard to ensure due to stochastic sampling giving different estimates of
    # P(Y), once in joint and once in marginal Monte Carlo estimation.

    # -----------------------
    # TEST MUTUAL INFORMATION
    # -----------------------

    # Two local columns.
    curs = bdb.execute(''' ESTIMATE MUTUAL INFORMATION OF Country_of_Contractor WITH longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1; ''')
    # XXX Small sample sizes non-deterministically produce negative MI
    assert -1 <= curs.next()[0]
    # One local and one foreign column.
    curs = bdb.execute(''' ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1; ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    # The comparison against -inf below only checks that a comparable
    # number is returned; it places no real bound on the value.
    assert float("-inf") <= curs.next()[0]
    # Two foreign columns.
    curs = bdb.execute(''' ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH Anticipated_Lifetime USING 5 SAMPLES FROM t1 LIMIT 1; ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]

    # -----------------------
    # TEST PREDICT CONFIDENCE
    # -----------------------

    # In each case below, column 1 of the returned row is the confidence.
    # Continuous local column.
    curs = bdb.execute(''' INFER EXPLICIT PREDICT Dry_Mass_kg CONFIDENCE c FROM t1 LIMIT 1; ''')
    assert curs.next()[1] >= 0.
    # Discrete local column with no children.
    curs = bdb.execute(''' INFER EXPLICIT PREDICT Purpose CONFIDENCE c FROM t1 LIMIT 1; ''')
    assert 0 <= curs.next()[1] <= 1
    # Discrete local column with children.
    curs = bdb.execute(''' INFER EXPLICIT PREDICT Contractor CONFIDENCE c FROM t1 LIMIT 1; ''')
    assert 0 <= curs.next()[1] <= 1
    # Continuous foreign columns.
    curs = bdb.execute(''' INFER EXPLICIT PREDICT Period_minutes CONFIDENCE c FROM t1 LIMIT 1; ''')
    assert curs.next()[1] >= 0.
    # Discrete foreign column.
    curs = bdb.execute(''' INFER EXPLICIT PREDICT Type_of_Orbit CONFIDENCE c FROM t1 LIMIT 1; ''')
    assert 0 <= curs.next()[1] <= 1

    bdb.close()
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    """Build an on-disk satellites bdb, analyze it, and record snapshots.

    Reads satellites.csv (located next to this script) into a fresh .bdb
    file under `out_dir`, creates and initializes the `satellites_cc`
    crosscat generator, then runs ANALYZE in steps of `checkpoint_freq`
    iterations.  A copy of the database plus a metadata text file is saved
    before the first step and after every step.  Finally a diagnostics
    plot and a final metadata file are written, the bdb is closed, and a
    `satellites.bdb` symlink to the result is created in `out_dir`.

    Args:
        out_dir: Directory for all output files; created if missing.
        num_models: Number of models to initialize for the generator.
        num_iters: Analysis keeps running, one checkpoint step at a time,
            while the completed iteration count is below this value.
        checkpoint_freq: Iterations per ANALYZE call (and per snapshot).
        seed: Seed handed to the crosscat multiprocessing engine.

    Exits the process with status 1 if the target .bdb file already exists.
    """
    # Function-scope import: only this function needs shutil.
    import shutil

    then = time.time()
    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user

    def out_file_name(base, ext):
        # All outputs share the "-<date>-<user>" stamp.
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')

    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        # Single-argument print behaves the same as a statement or a call.
        print('Error: File %s already exists. Please remove it.'
              % (bdb_file,))
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        # Log every BQL statement before running it.
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
                                    header=True, create=True,
                                    ifnotexists=True)

    # Add a "not applicable" orbit sub-type.  This must happen before the
    # nullify step below, which would otherwise turn these 'NaN' cells
    # into NULL.
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
          AND type_of_orbit = 'NaN'
    ''')

    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')

    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)

    # create the crosscat generator
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models,))

    # Read by snapshot() each time it is called; advanced by the loop below.
    cur_iter_ct = 0

    def snapshot():
        # Save a copy of the database as it stands, plus its metadata.
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        # Copy in-process rather than via a shell command line, so paths
        # containing spaces or shell metacharacters work and a failed copy
        # raises instead of being silently ignored.
        shutil.copy(bdb_file, save_file_name)
        report(save_file_name, meta_file_name)

    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        # Write a human-readable description of one saved bdb file to f.
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        # NOTE(review): this records the requested num_iters, not the
        # iterations completed so far, even for intermediate snapshots --
        # confirm that is intended.
        f.write("with %s models analyzed for %s iterations\n"
                % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
                % (bayeslite.__version__, crosscat.__version__,
                   bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name, metadata_file, echo=False,
               plot_file_name=None):
        # Checksum the saved file and write its metadata, followed by a
        # copy of this script, to metadata_file.
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            # The file is opened in binary mode, so the end-of-file
            # sentinel must be the empty *bytes* value.
            for chunk in iter(lambda: fd.read(65536), b''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name,
                            sha_sum, total_time, plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            # Append the source of this script for reproducibility.
            with open(__file__, 'r') as script_fd:
                shutil.copyfileobj(script_fd, fd)

        if echo:
            record_metadata(sys.stdout, saved_file_name,
                            sha_sum, total_time, plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        bdbcontrib.plot_crosscat_chain_diagnostics(bdb, 'logscore',
                                                   'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file,
               echo=True, plot_file_name=plot_file_name)

    # Snapshot the freshly initialized models, then once per checkpoint.
    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT'
                % checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()

    final_report()

    log('closing bdb %s' % bdb_file)
    bdb.close()

    # Point a stable name at the newly built file.  The link target is
    # relative, so it resolves within out_dir.  Creating the link stays
    # best-effort: a failure (e.g. the link already exists) is logged
    # rather than raised.
    link_path = os.path.join(out_dir, 'satellites.bdb')
    try:
        os.symlink('satellites%s.bdb' % (filestamp,), link_path)
    except OSError as e:
        log('could not create symlink %s: %s' % (link_path, e))