Example 1
def get_CrossCatClient(client_type, **kwargs):
    """Helper which instantiates the appropriate Engine and returns a Client

    """

    client = None
    if client_type == 'local':
        import crosscat.LocalEngine as LocalEngine
        le = LocalEngine.LocalEngine(**kwargs)
        client = CrossCatClient(le)
    elif client_type == 'hadoop':
        import crosscat.HadoopEngine as HadoopEngine
        he = HadoopEngine.HadoopEngine(**kwargs)
        client = CrossCatClient(he)
    elif client_type == 'jsonrpc':
        import crosscat.JSONRPCEngine as JSONRPCEngine
        je = JSONRPCEngine.JSONRPCEngine(**kwargs)
        client = CrossCatClient(je)
    elif client_type == 'multiprocessing':
        import crosscat.MultiprocessingEngine as MultiprocessingEngine
        me = MultiprocessingEngine.MultiprocessingEngine(**kwargs)
        client = CrossCatClient(me)
    else:
        raise ValueError('unknown client_type: %s' % client_type)
    return client
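
A minimal usage sketch for the helper above (hedged: keyword arguments are passed
straight through to the engine constructor, and the seed keyword is assumed to be
accepted by LocalEngine):

client = get_CrossCatClient('local', seed=0)  # wraps a LocalEngine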
Example 2
def run(stdin, stdout, stderr, argv):
    args = parse_args(argv[1:])
    progname = argv[0]
    # strip any leading directory components from argv[0]
    slash = progname.rfind('/')
    if slash != -1:
        progname = progname[slash + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname, ))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname, ))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
                                 builtin_metamodels=False)

    if args.jobs != 1:
        import crosscat.MultiprocessingEngine as ccme
        jobs = args.jobs if args.jobs > 0 else None
        crosscat = ccme.MultiprocessingEngine(seed=args.seed, cpu_count=jobs)
    else:
        import crosscat.LocalEngine as ccle
        crosscat = ccle.LocalEngine(seed=args.seed)
    metamodel = CrosscatMetamodel(crosscat)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    bdbshell = shell.Shell(bdb, 'crosscat', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.expanduser('~/.bayesliterc')
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)

        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    bdbshell.stdout.write('%s is not a file.  Aborting.\n' %
                                          (str(path), ))
                    break

        if not args.batch:
            bdbshell.cmdloop()
    return 0
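
A hedged entry-point sketch, assuming run lives in a module executed as a script:

if __name__ == '__main__':
    import sys
    sys.exit(run(sys.stdin, sys.stdout, sys.stderr, sys.argv))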
Example 3
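This excerpt relies on names defined elsewhere in the original script. A hedged
reconstruction of the imports it needs (the aliases ccme and plt are implied by
the calls below; log is a hypothetical stand-in for the script's logger):

import datetime
import hashlib
import os
import subprocess
import sys
import time

import matplotlib.pyplot as plt

import bayeslite
import bayeslite.metamodels.crosscat
import bdbcontrib
import bdbcontrib.bql_utils
import bdbcontrib.crosscat_utils
import crosscat
import crosscat.MultiprocessingEngine as ccme

def log(msg):
    print(msg)  # hypothetical stand-in for the script's logger
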
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    then = time.time()

    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user

    def out_file_name(base, ext):
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')

    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        print('Error: File %s already exists. Please remove it.' % bdb_file)
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb,
                                    'satellites',
                                    csv_file,
                                    header=True,
                                    create=True,
                                    ifnotexists=True)

    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
          AND type_of_orbit = 'NaN'
    ''')

    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.bql_utils.nullify(bdb, 'satellites', 'NaN')

    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)

    # create the crosscat generator
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models, ))

    cur_iter_ct = 0

    def snapshot():
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        os.system("cp %s %s" % (bdb_file, save_file_name))
        report(save_file_name, meta_file_name)

    def record_metadata(f,
                        saved_file_name,
                        sha_sum,
                        total_time,
                        plot_file_name=None):
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n" %
                (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n" %
                (bayeslite.__version__, crosscat.__version__,
                 bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name,
               metadata_file,
               echo=False,
               plot_file_name=None):
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            for chunk in iter(lambda: fd.read(65536), b''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name, sha_sum, total_time,
                            plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            fd.flush()
            os.system("cat %s >> %s" % (__file__, metadata_file))

        if echo:
            record_metadata(sys.stdout, saved_file_name, sha_sum, total_time,
                            plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.crosscat_utils.plot_crosscat_chain_diagnostics(
            bdb, 'logscore', 'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file,
               final_metadata_file,
               echo=True,
               plot_file_name=plot_file_name)

    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT' %
                checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()

    final_report()

    log('closing bdb %s' % bdb_file)
    bdb.close()
    os.system("cd %s && ln -s satellites%s.bdb satellites.bdb" %
              (out_dir, filestamp))
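
A hedged driver sketch for the function above (the argument values are hypothetical):

if __name__ == '__main__':
    doit(out_dir='output', num_models=16, num_iters=100,
         checkpoint_freq=10, seed=0)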
Example 4
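A hedged reconstruction of the imports this excerpt assumes, using the aliases
conventional in crosscat's example scripts (filename, gen_seed, inf_seed,
num_chains, pkl_filename, and the do_initialize/do_analyze helpers are assumed
to be defined earlier):

from multiprocessing import Pool

import numpy

import crosscat.LocalEngine as LE
import crosscat.MultiprocessingEngine as MultiprocessingEngine
import crosscat.utils.data_utils as du
import crosscat.utils.file_utils as fu
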
# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array(
    [M_c['idx_to_name'][str(col_idx)] for col_idx in range(num_cols)])
engine = LE.LocalEngine(inf_seed)

# initialize the chains.  The first branch would map the helper functions over
# a multiprocessing Pool; the hard-coded False disables it, so the
# MultiprocessingEngine path below is the one that runs.
p = Pool()
seeds = range(num_chains)
if False:
    chain_tuples = p.map(do_initialize, seeds)
    chain_tuples = p.map(do_analyze, zip(chain_tuples, seeds))
else:
    engine = MultiprocessingEngine.MultiprocessingEngine()
    X_L, X_D = engine.initialize(M_c, M_r, T, n_chains=num_chains)
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D)
    chain_tuples = zip(X_L, X_D)

# unpack the chain states for the column co-occurrence matrix
X_L_list, X_D_list = map(list, zip(*chain_tuples))

# save the progress
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)

# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
# X_D_list = to_pickle['X_D_list']
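
The co-occurrence comment above points at the intended next step. A hedged
sketch, assuming each X_L records its column-to-view assignments under
X_L['column_partition']['assignments'] (the crosscat metadata layout):

def column_cooccurrence(X_L_list, num_cols):
    # fraction of chains in which each pair of columns lands in the same view
    Z = numpy.zeros((num_cols, num_cols))
    for X_L in X_L_list:
        assignments = X_L['column_partition']['assignments']
        for i in range(num_cols):
            for j in range(num_cols):
                if assignments[i] == assignments[j]:
                    Z[i, j] += 1
    return Z / len(X_L_list)
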
def test_kl_divergence_as_a_function_of_N_and_transitions():

	n_clusters = 3
	n_chains = 8
	do_times = 4

	# N_list = [25, 50, 100, 250, 500, 1000, 2000]
	N_list = [25, 50, 100, 175, 250, 400, 500]

	max_transitions = 500
	transition_interval = 50
	t_iterations = max_transitions // transition_interval  # integer checkpoint count

	cctype = 'continuous'
	cluster_weights = [1.0/float(n_clusters)]*n_clusters
	separation = .5

	get_next_seed = lambda: random.randrange(2147483647)

	# data grid
	KLD = numpy.zeros((len(N_list), t_iterations+1))

	for _ in range(do_times):
		for n in range(len(N_list)):
			N = N_list[n]
			T, M_c, struc = sdg.gen_data([cctype], N, [0], [cluster_weights], 
							[separation], seed=get_next_seed(), distargs=[None],
							return_structure=True)

			M_r = du.gen_M_r_from_T(T)

			# precompute the support and pdf to speed up calculation of KL divergence
			support = qtu.get_mixture_support(cctype, 
						ccmext.p_ContinuousComponentModel, 
						struc['component_params'][0], nbins=1000, support=.995)
			true_log_pdf = qtu.get_mixture_pdf(support,
						ccmext.p_ContinuousComponentModel, 
						struc['component_params'][0],cluster_weights)

			# initialize a multiprocessing engine
			mstate = mpe.MultiprocessingEngine(cpu_count=8)
			X_L_list, X_D_list = mstate.initialize(M_c, M_r, T, n_chains=n_chains)

			for i in range(len(X_L_list)):
				X_L = X_L_list[i]
				X_D = X_D_list[i]
				KLD[n,0] += qtu.KL_divergence(ccmext.p_ContinuousComponentModel,
						struc['component_params'][0], cluster_weights, M_c, 
						X_L, X_D, n_samples=1000, support=support, 
						true_log_pdf=true_log_pdf)


			# run transition_interval transitions, then take a reading; repeat.
			for t in range(t_iterations):
				X_L_list, X_D_list = mstate.analyze(M_c, T, X_L_list, X_D_list,
							n_steps=transition_interval)

				for i in range(len(X_L_list)):
					X_L = X_L_list[i]
					X_D = X_D_list[i]
					KLD[n,t+1] += qtu.KL_divergence(ccmext.p_ContinuousComponentModel,
							struc['component_params'][0], cluster_weights, M_c, 
							X_L, X_D, n_samples=1000, support=support, 
							true_log_pdf=true_log_pdf)


	KLD /= float(n_chains*do_times)

	pylab.subplot(1,3,1)
	pylab.contourf(list(range(0, max_transitions + 1, transition_interval)), N_list, KLD)
	pylab.title('KL divergence')
	pylab.ylabel('N')
	pylab.xlabel('# transitions')


	pylab.subplot(1,3,2)
	m_N = numpy.mean(KLD,axis=1)
	e_N = numpy.std(KLD, axis=1) / float(KLD.shape[1]) ** .5  # standard error
	pylab.errorbar(N_list,  m_N, yerr=e_N)
	pylab.title('KL divergence by N')
	pylab.xlabel('N')
	pylab.ylabel('KL divergence')

	pylab.subplot(1,3,3)
	m_t = numpy.mean(KLD,axis=0)
	e_t = numpy.std(KLD, axis=0) / float(KLD.shape[0]) ** .5  # standard error
	pylab.errorbar(list(range(0, max_transitions + 1, transition_interval)), m_t, yerr=e_t)
	pylab.title('KL divergence by transitions')
	pylab.xlabel('# transitions')
	pylab.ylabel('KL divergence')

	pylab.show()

	return KLD
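
A hedged runner sketch (the aliases sdg, du, qtu, ccmext, mpe, numpy, pylab, and
random are assumed imported as in the original test module):

if __name__ == '__main__':
    KLD = test_kl_divergence_as_a_function_of_N_and_transitions()
    print('final mean KL divergence: %f' % KLD[:, -1].mean())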