def get_CrossCatClient(client_type, **kwargs):
    """Helper which instantiates the appropriate Engine and returns a Client"""
    client = None
    if client_type == 'local':
        import crosscat.LocalEngine as LocalEngine
        le = LocalEngine.LocalEngine(**kwargs)
        client = CrossCatClient(le)
    elif client_type == 'hadoop':
        import crosscat.HadoopEngine as HadoopEngine
        he = HadoopEngine.HadoopEngine(**kwargs)
        client = CrossCatClient(he)
    elif client_type == 'jsonrpc':
        import crosscat.JSONRPCEngine as JSONRPCEngine
        je = JSONRPCEngine.JSONRPCEngine(**kwargs)
        client = CrossCatClient(je)
    elif client_type == 'multiprocessing':
        import crosscat.MultiprocessingEngine as MultiprocessingEngine
        me = MultiprocessingEngine.MultiprocessingEngine(**kwargs)
        client = CrossCatClient(me)
    else:
        raise Exception('unknown client_type: %s' % (client_type,))
    return client
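# A minimal usage sketch (assumptions: the crosscat engines are importable,
# and the kwargs shown -- 'seed', 'cpu_count' -- are accepted by the chosen
# engine's constructor, since **kwargs is forwarded unchanged):
if __name__ == '__main__':
    local_client = get_CrossCatClient('local', seed=0)
    parallel_client = get_CrossCatClient('multiprocessing', seed=0, cpu_count=4)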
def run(stdin, stdout, stderr, argv):
    args = parse_args(argv[1:])
    progname = argv[0]
    slash = progname.rfind('/')
    if slash >= 0:  # rfind returns -1 when there is no slash
        progname = progname[slash + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname,))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname,))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
        builtin_metamodels=False)
    if args.jobs != 1:
        import crosscat.MultiprocessingEngine as ccme
        jobs = args.jobs if args.jobs > 0 else None
        crosscat = ccme.MultiprocessingEngine(seed=args.seed, cpu_count=jobs)
    else:
        import crosscat.LocalEngine as ccle
        crosscat = ccle.LocalEngine(seed=args.seed)
    metamodel = CrosscatMetamodel(crosscat)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    bdbshell = shell.Shell(bdb, 'crosscat', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.join(os.path.expanduser('~/.bayesliterc'))
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)
        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    bdbshell.stdout.write('%s is not a file. Aborting.\n'
                        % (str(path),))
                    break
        if not args.batch:
            bdbshell.cmdloop()
    return 0
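# A plausible console entry point (sketch; 'main' is a hypothetical wrapper,
# not part of the original module, wiring run() to the process streams):
def main():
    import sys
    sys.exit(run(sys.stdin, sys.stdout, sys.stderr, sys.argv))

if __name__ == '__main__':
    main()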
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    then = time.time()
    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user

    def out_file_name(base, ext):
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')

    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        print 'Error: File', bdb_file, 'already exists. Please remove it.'
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
        header=True, create=True, ifnotexists=True)

    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
            AND type_of_orbit = 'NaN'
    ''')

    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.bql_utils.nullify(bdb, 'satellites', 'NaN')

    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)

    # create the crosscat generator using the schema below
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models,))

    cur_iter_ct = 0

    def snapshot():
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        os.system("cp %s %s" % (bdb_file, save_file_name))
        report(save_file_name, meta_file_name)

    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n"
            % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
            % (bayeslite.__version__, crosscat.__version__,
               bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name, metadata_file, echo=False,
               plot_file_name=None):
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            for chunk in iter(lambda: fd.read(65536), ''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name, sha_sum, total_time,
                plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            fd.flush()
            os.system("cat %s >> %s" % (__file__, metadata_file))
        if echo:
            record_metadata(sys.stdout, saved_file_name, sha_sum, total_time,
                plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.crosscat_utils.plot_crosscat_chain_diagnostics(
            bdb, 'logscore', 'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file, echo=True,
            plot_file_name=plot_file_name)

    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT'
            % checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()

    final_report()

    log('closing bdb %s' % bdb_file)
    bdb.close()
    os.system("cd %s && ln -s satellites%s.bdb satellites.bdb"
        % (out_dir, filestamp))
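# A hypothetical command-line driver for doit() (sketch; the flags and
# defaults here are illustrative, not from the original script):
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Build satellites .bdb files')
    parser.add_argument('--out_dir', default='output')
    parser.add_argument('--num_models', type=int, default=16)
    parser.add_argument('--num_iters', type=int, default=100)
    parser.add_argument('--checkpoint_freq', type=int, default=10)
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()
    doit(args.out_dir, args.num_models, args.num_iters,
         args.checkpoint_freq, args.seed)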
# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array(
    [M_c['idx_to_name'][str(col_idx)] for col_idx in range(num_cols)])
engine = LE.LocalEngine(inf_seed)

# initialize the chains
p = Pool()
seeds = range(num_chains)
if False:
    # disabled alternative: map per-chain initialize/analyze over a raw Pool
    chain_tuples = p.map(do_initialize, seeds)
    chain_tuples = p.map(do_analyze, zip(chain_tuples, seeds))
else:
    engine = MultiprocessingEngine.MultiprocessingEngine()
    X_L, X_D = engine.initialize(M_c, M_r, T, n_chains=num_chains)
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D)
    chain_tuples = zip(X_L, X_D)

# visualize the column co-occurrence matrix
X_L_list, X_D_list = map(list, zip(*chain_tuples))

# save the progress
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)
# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
# X_D_list = to_pickle['X_D_list']
def test_kl_divergence_as_a_function_of_N_and_transitions():
    n_clusters = 3
    n_chains = 8
    do_times = 4

    # N_list = [25, 50, 100, 250, 500, 1000, 2000]
    N_list = [25, 50, 100, 175, 250, 400, 500]

    # max_transitions = 500
    max_transitions = 500
    transition_interval = 50
    t_iterations = max_transitions/transition_interval

    cctype = 'continuous'
    cluster_weights = [1.0/float(n_clusters)]*n_clusters
    separation = .5

    get_next_seed = lambda: random.randrange(2147483647)

    # data grid
    KLD = numpy.zeros((len(N_list), t_iterations+1))

    for _ in range(do_times):
        for n in range(len(N_list)):
            N = N_list[n]
            T, M_c, struc = sdg.gen_data([cctype], N, [0], [cluster_weights],
                [separation], seed=get_next_seed(), distargs=[None],
                return_structure=True)
            M_r = du.gen_M_r_from_T(T)

            # precompute the support and pdf to speed up calculation of
            # KL divergence
            support = qtu.get_mixture_support(cctype,
                ccmext.p_ContinuousComponentModel,
                struc['component_params'][0], nbins=1000, support=.995)
            true_log_pdf = qtu.get_mixture_pdf(support,
                ccmext.p_ContinuousComponentModel,
                struc['component_params'][0], cluster_weights)

            # initialize a multiprocessing engine
            mstate = mpe.MultiprocessingEngine(cpu_count=8)
            X_L_list, X_D_list = mstate.initialize(M_c, M_r, T,
                n_chains=n_chains)

            # KL divergences before any transitions
            for i in range(len(X_L_list)):
                X_L = X_L_list[i]
                X_D = X_D_list[i]
                KLD[n, 0] += qtu.KL_divergence(
                    ccmext.p_ContinuousComponentModel,
                    struc['component_params'][0], cluster_weights, M_c,
                    X_L, X_D, n_samples=1000, support=support,
                    true_log_pdf=true_log_pdf)

            # run transition_interval transitions, then take a reading.
            # Rinse and repeat.
            for t in range(t_iterations):
                X_L_list, X_D_list = mstate.analyze(M_c, T, X_L_list,
                    X_D_list, n_steps=transition_interval)
                for i in range(len(X_L_list)):
                    X_L = X_L_list[i]
                    X_D = X_D_list[i]
                    KLD[n, t+1] += qtu.KL_divergence(
                        ccmext.p_ContinuousComponentModel,
                        struc['component_params'][0], cluster_weights, M_c,
                        X_L, X_D, n_samples=1000, support=support,
                        true_log_pdf=true_log_pdf)

    KLD /= float(n_chains*do_times)

    transitions = list(range(0, max_transitions+1, transition_interval))

    pylab.subplot(1, 3, 1)
    pylab.contourf(transitions, N_list, KLD)
    pylab.title('KL divergence')
    pylab.ylabel('N')
    pylab.xlabel('# transitions')

    pylab.subplot(1, 3, 2)
    m_N = numpy.mean(KLD, axis=1)
    # standard error of the mean: std / sqrt(n)
    e_N = numpy.std(KLD, axis=1)/float(KLD.shape[1])**.5
    pylab.errorbar(N_list, m_N, yerr=e_N)
    pylab.title('KL divergence by N')
    pylab.xlabel('N')
    pylab.ylabel('KL divergence')

    pylab.subplot(1, 3, 3)
    m_t = numpy.mean(KLD, axis=0)
    e_t = numpy.std(KLD, axis=0)/float(KLD.shape[0])**.5
    pylab.errorbar(transitions, m_t, yerr=e_t)
    pylab.title('KL divergence by transitions')
    pylab.xlabel('# transitions')
    pylab.ylabel('KL divergence')

    pylab.show()

    return KLD
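# A minimal driver sketch (assumption: running this file directly is the
# intended entry point; pylab.show() above blocks until the figure is closed):
if __name__ == '__main__':
    KLD = test_kl_divergence_as_a_function_of_N_and_transitions()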