def plot_probe_results(filename, outdir):
    """Load pickled probe results from `filename` and plot them into `outdir`.

    The pickle is expected to hold a mapping from (fname, n_models, probe)
    keys to aggregated probe values; keys are rearranged into the
    (probe, n_models, analysis-iteration-count) shape `plot_results` expects.
    """
    log("Loading probe results from %s" % filename)
    # Pickles are binary data: "rb" is required on Python 3 and on Windows,
    # and is harmless on Python 2 / POSIX.
    with open(filename, "rb") as f:
        results = pickle.load(f)
    # .items() works on both Python 2 and 3, unlike the Py2-only .iteritems().
    results = [((probe, n_models, analysis_count_from_file_name(fname)), val)
               for ((fname, n_models, probe), val) in results.items()]
    plot_results(results, outdir=outdir)
def snapshot():
    """Vacuum the database, then save a checkpoint copy of the .bdb file
    plus a metadata report, both named for the current model and
    iteration counts.

    Reads `bdb`, `num_models`, `cur_iter_ct`, `bdb_file`, and the helpers
    `out_file_name`, `log`, `report` from the enclosing scope.
    """
    log('vacuuming')
    # Vacuum first so the copied file is compact.
    bdb.sql_execute('vacuum')
    cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
    save_file_name = out_file_name('satellites', cur_infix + '.bdb')
    meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
    log('recording snapshot ' + save_file_name)
    # shutil.copyfile is portable and safe for paths containing spaces or
    # shell metacharacters, unlike os.system("cp %s %s" % ...).
    import shutil
    shutil.copyfile(bdb_file, save_file_name)
    report(save_file_name, meta_file_name)
def final_report():
    """Write the logscore diagnostics plot and the final metadata report."""
    # create a diagnostics plot
    diag_path = out_file_name('satellites', '-logscores.pdf')
    log('writing diagnostic plot to %s' % diag_path)
    _fig = bdbcontrib.plot_crosscat_chain_diagnostics(bdb, 'logscore',
                                                      'satellites_cc')
    plt.savefig(diag_path)
    # Record the final metadata, echoing it to stdout as well.
    meta_path = out_file_name('satellites', '-meta.txt')
    report(bdb_file, meta_path, echo=True, plot_file_name=diag_path)
def final_report():
    """Write the logscore diagnostics plot and the final metadata report."""
    # create a diagnostics plot
    diag_path = out_file_name('satellites', '-logscores.pdf')
    log('writing diagnostic plot to %s' % diag_path)
    _fig = bdbcontrib.crosscat_utils.plot_crosscat_chain_diagnostics(
        bdb, 'logscore', 'satellites_cc')
    plt.savefig(diag_path)
    # Record the final metadata, echoing it to stdout as well.
    meta_path = out_file_name('satellites', '-meta.txt')
    report(bdb_file, meta_path, echo=True, plot_file_name=diag_path)
def doit(files, outfile, model_schedule, n_replications):
    """Probe each fileset in `files` and pickle the results to `outfile`.

    Runs the country/purpose, unlikely-periods, and orbit-type-imputation
    probe suites against the satellites_cc generator, `n_replications`
    times per point of `model_schedule`, creating `outfile`'s directory
    if needed.
    """
    out_dir = os.path.dirname(outfile)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    results = probe_fileset(
        files, "satellites_cc",
        [country_purpose_probes, unlikely_periods_probes,
         orbit_type_imputation_probes],
        model_schedule=model_schedule,
        n_replications=n_replications)
    # Pickles are binary data: "wb" is required on Python 3 and on Windows.
    with open(outfile, "wb") as f:
        pickle.dump(results, f)
    log("Saved probe results to %s" % outfile)
def doit(files, outfile, model_schedule, n_replications):
    """Probe each fileset in `files` and pickle the results to `outfile`.

    Runs the country/purpose, unlikely-periods, and orbit-type-imputation
    probe suites against the satellites_cc generator, `n_replications`
    times per point of `model_schedule`, creating `outfile`'s directory
    if needed.
    """
    out_dir = os.path.dirname(outfile)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    results = probe_fileset(
        files, "satellites_cc",
        [country_purpose_probes, unlikely_periods_probes,
         orbit_type_imputation_probes],
        model_schedule=model_schedule,
        n_replications=n_replications,
    )
    # Pickles are binary data: "wb" is required on Python 3 and on Windows.
    with open(outfile, "wb") as f:
        pickle.dump(results, f)
    log("Saved probe results to %s" % outfile)
def plot_results(results, outdir="figures", ext=".png"): """Plot the aggregate results of probing. `results` is a list of pairs giving probe conditions and aggregated probe results. `outdir` is the name of a directory to which to write the visualizations. Default: "figures". `ext` is the file extension for visualizations, which determines the image format used. Default ".png". Each probe condition is expected to be a 3-tuple: probe name, model count, analysis iteration count. Each result is expected to be a tagged aggregate (see aggregation.py). Each numerical probe produces one plot, named after the probe. The plot facets over the model count, displays the iteration count on the x-axis, and a violin plot of the results on the y axis. All boolean probes are aggregated into one plot named "boolean-probes", whose y axis is the frequency of a "True" result. Each probe is a line giving the relationship of the frequency to the number of analysis iterations. """ if not os.path.exists(outdir): os.makedirs(outdir) replications = num_replications(results) probes = sorted( set((pname, ptype) for ((pname, _, _), (ptype, _)) in results)) for probe, ptype in probes: if not ptype == 'num': continue grid = plot_results_numerical(results, probe) grid.fig.suptitle(probe + ", %d replications" % replications) # XXX Actually shell quote the probe name figname = string.replace(probe, " ", "-").replace("/", "") + ext savepath = os.path.join(outdir, figname) grid.savefig(savepath) plt.close(grid.fig) log("Probe '%s' results saved to %s" % (probe, savepath)) grid = plot_results_boolean(results) grid.fig.suptitle("Boolean probes, %d replications" % replications) figname = "boolean-probes" + ext savepath = os.path.join(outdir, figname) grid.savefig(savepath) plt.close(grid.fig) log("Boolean probe results saved to %s" % (savepath, ))
def plot_results(results, outdir="figures", ext=".png"): """Plot the aggregate results of probing. `results` is a list of pairs giving probe conditions and aggregated probe results. `outdir` is the name of a directory to which to write the visualizations. Default: "figures". `ext` is the file extension for visualizations, which determines the image format used. Default ".png". Each probe condition is expected to be a 3-tuple: probe name, model count, analysis iteration count. Each result is expected to be a tagged aggregate (see aggregation.py). Each numerical probe produces one plot, named after the probe. The plot facets over the model count, displays the iteration count on the x-axis, and a violin plot of the results on the y axis. All boolean probes are aggregated into one plot named "boolean-probes", whose y axis is the frequency of a "True" result. Each probe is a line giving the relationship of the frequency to the number of analysis iterations. """ if not os.path.exists(outdir): os.makedirs(outdir) replications = num_replications(results) probes = sorted(set((pname, ptype) for ((pname, _, _), (ptype, _)) in results)) for probe, ptype in probes: if not ptype == 'num': continue grid = plot_results_numerical(results, probe) grid.fig.suptitle(probe + ", %d replications" % replications) # XXX Actually shell quote the probe name figname = string.replace(probe, " ", "-").replace("/", "") + ext savepath = os.path.join(outdir, figname) grid.savefig(savepath) plt.close(grid.fig) log("Probe '%s' results saved to %s" % (probe, savepath)) grid = plot_results_boolean(results) grid.fig.suptitle("Boolean probes, %d replications" % replications) figname = "boolean-probes" + ext savepath = os.path.join(outdir, figname) grid.savefig(savepath) plt.close(grid.fig) log("Boolean probe results saved to %s" % (savepath,))
def execute(bql): log("executing %s" % bql) bdb.execute(bql)
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    """Build a satellites .bdb from satellites.csv, analyze it, and snapshot.

    Creates out_dir/satellites-<date>-<user>.bdb, loads the CSV, sets up a
    crosscat generator, initializes `num_models` models, and analyzes them
    for `num_iters` iterations, saving a snapshot of the database plus a
    metadata report every `checkpoint_freq` iterations.  `seed` seeds the
    crosscat engine.  Finishes with a diagnostics plot and a symlink to
    the final database.
    """
    # Wall-clock start: used both for the date stamp and total-time report.
    then = time.time()
    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    # Run-identifying suffix, e.g. "-2015-10-21-alice".
    filestamp = '-' + timestamp + '-' + user
    def out_file_name(base, ext):
        # Name an output file in out_dir, tagged with this run's filestamp.
        return out_dir + '/' + base + filestamp + ext
    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')
    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        # Refuse to clobber a database left by an earlier run today.
        print 'Error: File', bdb_file, 'already exists. Please remove it.'
        sys.exit(1)
    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)
    def execute(bql):
        # Log and run one BQL statement.
        log("executing %s" % bql)
        bdb.execute(bql)
    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
                                    header=True, create=True,
                                    ifnotexists=True)
    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
        AND type_of_orbit = 'NaN' ''')
    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.bql_utils.nullify(bdb, 'satellites', 'NaN')
    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)
    # create the crosscat generator using
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')
    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models, ))
    # Number of ANALYZE iterations completed so far; read by snapshot().
    cur_iter_ct = 0
    def snapshot():
        # Vacuum, then copy the .bdb aside with a metadata report, both
        # named for the current model and iteration counts.
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        os.system("cp %s %s" % (bdb_file, save_file_name))
        report(save_file_name, meta_file_name)
    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        # Write the provenance record (checksum, inputs, library versions)
        # for a saved database to the file-like object f.
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n"
                % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
                % (bayeslite.__version__, crosscat.__version__,
                   bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()
    def report(saved_file_name, metadata_file, echo=False,
               plot_file_name=None):
        # Checksum the saved .bdb (streamed in 64 KiB chunks), write its
        # metadata file with this script's own source appended, and
        # optionally echo the metadata to stdout.
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            for chunk in iter(lambda: fd.read(65536), ''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name, sha_sum, total_time,
                            plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            # Flush before shelling out so cat appends after our content.
            fd.flush()
            os.system("cat %s >> %s" % (__file__, metadata_file))
        if echo:
            record_metadata(sys.stdout, saved_file_name, sha_sum, total_time,
                            plot_file_name)
    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.crosscat_utils.plot_crosscat_chain_diagnostics(
            bdb, 'logscore', 'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file, echo=True,
               plot_file_name=plot_file_name)
    # Baseline snapshot at iteration 0, then analyze in checkpoint-sized
    # chunks, snapshotting after each.
    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT'
                % checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()
    final_report()
    log('closing bdb %s' % bdb_file)
    bdb.close()
    # Convenience symlink pointing at this run's final database.
    os.system("cd %s && ln -s satellites%s.bdb satellites.bdb"
              % (out_dir, filestamp))
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    """Build a satellites .bdb from satellites.csv, analyze it, and snapshot.

    Creates out_dir/satellites-<date>-<user>.bdb, loads the CSV, sets up a
    crosscat generator, initializes `num_models` models, and analyzes them
    for `num_iters` iterations, saving a snapshot of the database plus a
    metadata report every `checkpoint_freq` iterations.  `seed` seeds the
    crosscat engine.  Finishes with a diagnostics plot and a symlink to
    the final database.
    """
    # Wall-clock start: used both for the date stamp and total-time report.
    then = time.time()
    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    # Run-identifying suffix, e.g. "-2015-10-21-alice".
    filestamp = '-' + timestamp + '-' + user
    def out_file_name(base, ext):
        # Name an output file in out_dir, tagged with this run's filestamp.
        return out_dir + '/' + base + filestamp + ext
    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')
    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE']='1'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        # Refuse to clobber a database left by an earlier run today.
        print 'Error: File', bdb_file, 'already exists. Please remove it.'
        sys.exit(1)
    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)
    def execute(bql):
        # Log and run one BQL statement.
        log("executing %s" % bql)
        bdb.execute(bql)
    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
                                    header=True, create=True,
                                    ifnotexists=True)
    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
        AND type_of_orbit = 'NaN' ''')
    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')
    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)
    # create the crosscat generator using
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')
    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models,))
    # Number of ANALYZE iterations completed so far; read by snapshot().
    cur_iter_ct = 0
    def snapshot():
        # Vacuum, then copy the .bdb aside with a metadata report, both
        # named for the current model and iteration counts.
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        os.system("cp %s %s" % (bdb_file, save_file_name))
        report(save_file_name, meta_file_name)
    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        # Write the provenance record (checksum, inputs, library versions)
        # for a saved database to the file-like object f.
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n"
                % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
                % (bayeslite.__version__, crosscat.__version__,
                   bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()
    def report(saved_file_name, metadata_file, echo=False,
               plot_file_name=None):
        # Checksum the saved .bdb (streamed in 64 KiB chunks), write its
        # metadata file with this script's own source appended, and
        # optionally echo the metadata to stdout.
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            for chunk in iter(lambda: fd.read(65536), ''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name, sha_sum, total_time,
                            plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            # Flush before shelling out so cat appends after our content.
            fd.flush()
            os.system("cat %s >> %s" % (__file__, metadata_file))
        if echo:
            record_metadata(sys.stdout, saved_file_name,
                            sha_sum, total_time, plot_file_name)
    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.plot_crosscat_chain_diagnostics(bdb, 'logscore',
                                                          'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file, echo=True,
               plot_file_name=plot_file_name)
    # Baseline snapshot at iteration 0, then analyze in checkpoint-sized
    # chunks, snapshotting after each.
    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT'
                % checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()
    final_report()
    log('closing bdb %s' % bdb_file)
    bdb.close()
    # Convenience symlink pointing at this run's final database.
    os.system("cd %s && ln -s satellites%s.bdb satellites.bdb"
              % (out_dir, filestamp))