def test_lnprob_calculates_multi_phase_probability_for_success(datasets_db):
    """lnprob() successfully calculates the probability for equilibrium"""
    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    comps = ['CU', 'MG', 'VA']
    phases = ['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2']
    param = 'VV0001'
    orig_val = dbf.symbols[param].args[0].expr
    initial_params = {param: orig_val}
    zpf_kwargs = {
        'zpf_data': get_zpf_data(dbf, comps, phases, datasets_db, initial_params),
        'data_weight': 1.0,
    }
    opt = EmceeOptimizer(dbf)
    res = opt.predict([10], prior_rvs=[rv_zero()], symbols_to_fit=[param], zpf_kwargs=zpf_kwargs)
    assert np.isreal(res)
    assert np.isclose(res, -31.309645520830344, rtol=1e-4)

    res_2 = opt.predict([10000000], prior_rvs=[rv_zero()], symbols_to_fit=[param], zpf_kwargs=zpf_kwargs)
    assert not np.isclose(res_2, -31.309645520830344, rtol=1e-6)


def test_lnprob_does_not_raise_on_ValueError(datasets_db):
    """lnprob() should catch ValueError raised by equilibrium and return -np.inf"""
    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    opt = EmceeOptimizer(dbf)
    comps = ['CU', 'MG', 'VA']
    phases = ['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2']
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    zpf_kwargs = {
        'dbf': dbf,
        'phases': phases,
        'zpf_data': get_zpf_data(comps, phases, datasets_db),
        'data_weight': 1.0,
    }
    res = opt.predict([10], prior_rvs=[rv_zero()], symbols_to_fit=['VV0001'], zpf_kwargs=zpf_kwargs)
    assert np.isneginf(res)


def test_parameter_initialization():
    """Deterministically generated parameters should match."""
    initial_parameters = np.array([1, 10, 100, 1000])
    opt = EmceeOptimizer(Database())
    deterministic_params = opt.initialize_new_chains(initial_parameters, 1, 0.10, deterministic=True)
    expected_parameters = np.array([
        [9.81708401e-01, 9.39027722e+00, 1.08016748e+02, 9.13512881e+02],
        [1.03116874, 9.01412995, 112.79594345, 916.44725799],
        [1.00664662e+00, 1.07178898e+01, 9.63696718e+01, 1.36872292e+03],
        [1.07642366e+00, 1.16413520e+01, 8.71742457e+01, 9.61836382e+02]])
    assert np.all(np.isclose(deterministic_params, expected_parameters))


def test_equilibrium_thermochemical_correct_probability(datasets_db):
    """Integration test for equilibrium thermochemical error."""
    dbf = Database(CU_MG_TDB)
    opt = EmceeOptimizer(dbf)
    datasets_db.insert(CU_MG_EQ_HMR_LIQUID)
    ctx = setup_context(dbf, datasets_db, ['VV0017'])
    ctx.update(opt.get_priors(None, ['VV0017'], [0]))

    prob = opt.predict(np.array([-31626.6]), **ctx)
    expected_prob = norm(loc=0, scale=500).logpdf([-31626.6 * 0.5 * 0.5]).sum()
    assert np.isclose(prob, expected_prob)

    # change to -40000
    prob = opt.predict(np.array([-40000], dtype=np.float_), **ctx)
    expected_prob = norm(loc=0, scale=500).logpdf([-40000 * 0.5 * 0.5]).sum()
    assert np.isclose(prob, expected_prob)


def test_emcee_optimizer_can_restart(datasets_db):
    """A restart trace can be passed to the Emcee optimizer"""
    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    param = 'VV0001'
    opt = EmceeOptimizer(dbf)
    # 2 chains, 10 iterations, 1 parameter
    restart_tr = np.array([[[-4], [-3], [-2], [-1], [0], [1], [2], [3], [4], [5]],
                           [[-6], [-4], [-2], [0], [2], [4], [6], [8], [10], [12]]])
    opt.fit([param], datasets_db, iterations=1, chains_per_parameter=2, restart_trace=restart_tr)
    assert opt.sampler.chain.shape == (2, 1, 1)


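# Illustrative sketch, not an actual test: in practice a restart trace usually comes
# from a tracefile written by a previous run (see the restart_trace handling in
# run_espei), rather than a hand-built array. 'previous-trace.npy' is a placeholder
# path assumed for this example.
def _example_restart_from_tracefile(datasets_db):
    dbf = Database.from_string(CU_MG_TDB, fmt='tdb')
    opt = EmceeOptimizer(dbf)
    # trace saved by a previous run; shape is (chains, iterations, parameters)
    restart_trace = np.load('previous-trace.npy')
    opt.fit(['VV0001'], datasets_db, iterations=100, chains_per_parameter=2,
            restart_trace=restart_trace)

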
def test_equilibrium_thermochemical_context_is_pickleable(datasets_db):
    """Test that the context for equilibrium thermochemical data is pickleable"""
    datasets_db.insert(CU_MG_EQ_HMR_LIQUID)
    dbf = Database(CU_MG_TDB)

    symbols_to_fit = database_symbols_to_fit(dbf)
    initial_guess = np.array([unpack_piecewise(dbf.symbols[s]) for s in symbols_to_fit])
    prior_dict = EmceeOptimizer.get_priors(None, symbols_to_fit, initial_guess)
    ctx = setup_context(dbf, datasets_db)
    ctx.update(prior_dict)

    ctx_pickle = pickle.dumps(ctx)
    ctx_unpickled = pickle.loads(ctx_pickle)

    regular_predict = EmceeOptimizer.predict(initial_guess, **ctx)
    unpickle_predict = EmceeOptimizer.predict(initial_guess, **ctx_unpickled)
    assert np.isclose(regular_predict, unpickle_predict)


def test_zpf_context_is_pickleable(datasets_db):
    """Test that the context for ZPF data is pickleable"""
    datasets_db.insert(CU_MG_DATASET_ZPF_ZERO_ERROR)
    dbf = Database(CU_MG_TDB)

    symbols_to_fit = database_symbols_to_fit(dbf)
    initial_guess = np.array([unpack_piecewise(dbf.symbols[s]) for s in symbols_to_fit])
    prior_dict = EmceeOptimizer.get_priors(None, symbols_to_fit, initial_guess)
    ctx = setup_context(dbf, datasets_db)
    ctx.update(prior_dict)

    ctx_pickle = pickle.dumps(ctx)
    ctx_unpickled = pickle.loads(ctx_pickle)

    regular_predict = EmceeOptimizer.predict(initial_guess, **ctx)
    unpickle_predict = EmceeOptimizer.predict(initial_guess, **ctx_unpickled)
    assert np.isclose(regular_predict, unpickle_predict)


def test_lnprob_calculates_single_phase_probability_for_success(datasets_db):
    """lnprob() successfully calculates the probability from single phase data"""
    dbf = Database.from_string(CU_MG_TDB_FCC_ONLY, fmt='tdb')
    datasets_db.insert(CU_MG_HM_MIX_SINGLE_FCC_A1)
    comps = ['CU', 'MG', 'VA']
    phases = ['FCC_A1']
    param = 'VV0003'
    orig_val = -14.0865
    opt = EmceeOptimizer(dbf)
    thermochemical_data = get_thermochemical_data(dbf, comps, phases, datasets_db, symbols_to_fit=[param])
    thermochemical_kwargs = {
        'dbf': dbf,
        'comps': comps,
        'thermochemical_data': thermochemical_data,
    }

    res_orig = opt.predict([orig_val], prior_rvs=[rv_zero()], symbols_to_fit=[param],
                           thermochemical_kwargs=thermochemical_kwargs)
    assert np.isreal(res_orig)
    assert np.isclose(res_orig, -9.119484935312146, rtol=1e-6)

    res_10 = opt.predict([10], prior_rvs=[rv_zero()], symbols_to_fit=[param],
                         thermochemical_kwargs=thermochemical_kwargs)
    assert np.isreal(res_10)
    assert np.isclose(res_10, -9.143559131626864, rtol=1e-6)

    res_1e5 = opt.predict([1e5], prior_rvs=[rv_zero()], symbols_to_fit=[param],
                          thermochemical_kwargs=thermochemical_kwargs)
    assert np.isreal(res_1e5)
    assert np.isclose(res_1e5, -1359.1335466316268, rtol=1e-6)


def test_lnprob_calculates_associate_tdb(datasets_db):
    """lnprob() successfully calculates the probability for equilibrium with an associate model TDB"""
    dbf = Database.from_string(CU_MG_TDB_ASSOC, fmt='tdb')
    datasets_db.insert(CU_MG_DATASET_ZPF_WORKING)
    comps = ['CU', 'MG', 'VA']
    phases = ['LIQUID', 'FCC_A1', 'HCP_A3', 'LAVES_C15', 'CUMG2']
    param = 'VV0001'
    orig_val = dbf.symbols[param].args[0].expr
    initial_params = {param: orig_val}
    zpf_kwargs = {
        'zpf_data': get_zpf_data(dbf, comps, phases, datasets_db, initial_params),
        'data_weight': 1.0,
    }
    opt = EmceeOptimizer(dbf)
    res = opt.predict([10], prior_rvs=[rv_zero()], symbols_to_fit=[param], zpf_kwargs=zpf_kwargs)
    assert np.isreal(res)
    assert not np.isinf(res)
    assert np.isclose(res, -31.309645520830344, rtol=1e-6)

    # The purpose of this part is to test that the driving forces (and probability)
    # are different than the case of VV0001 = 10.
    res_2 = opt.predict([-10000000], prior_rvs=[rv_zero()], symbols_to_fit=[param], zpf_kwargs=zpf_kwargs)
    assert np.isreal(res_2)
    assert not np.isinf(res_2)
    # Accept a large rtol because the results should be _very_ different
    assert not np.isclose(res_2, -31.309645520830344, rtol=1e-2)


def run_espei(run_settings):
    """Wrapper around the ESPEI fitting procedure, taking only a settings dictionary.

    Parameters
    ----------
    run_settings : dict
        Dictionary of input settings

    Returns
    -------
    Either a Database (for generate parameters only) or a tuple of (Database, sampler)
    """
    run_settings = get_run_settings(run_settings)
    system_settings = run_settings['system']
    output_settings = run_settings['output']
    generate_parameters_settings = run_settings.get('generate_parameters')
    mcmc_settings = run_settings.get('mcmc')

    # handle verbosity
    verbosity = {
        0: logging.WARNING,
        1: logging.INFO,
        2: TRACE,
        3: logging.DEBUG,
    }
    logging.basicConfig(level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])

    log_version_info()

    # load datasets and handle i/o
    logging.log(TRACE, 'Loading and checking datasets.')
    dataset_path = system_settings['datasets']
    datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json')))
    if len(datasets.all()) == 0:
        logging.warning('No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.'.format(dataset_path))
    apply_tags(datasets, system_settings.get('tags', dict()))
    add_ideal_exclusions(datasets)
    logging.log(TRACE, 'Finished checking datasets')

    with open(system_settings['phase_models']) as fp:
        phase_models = json.load(fp)

    if generate_parameters_settings is not None:
        refdata = generate_parameters_settings['ref_state']
        excess_model = generate_parameters_settings['excess_model']
        ridge_alpha = generate_parameters_settings['ridge_alpha']
        aicc_penalty = generate_parameters_settings['aicc_penalty_factor']
        input_dbf = generate_parameters_settings.get('input_db', None)
        if input_dbf is not None:
            input_dbf = Database(input_dbf)
        dbf = generate_parameters(phase_models, datasets, refdata, excess_model,
                                  ridge_alpha=ridge_alpha, dbf=input_dbf,
                                  aicc_penalty_factor=aicc_penalty)
        dbf.to_file(output_settings['output_db'], if_exists='overwrite')

    if mcmc_settings is not None:
        tracefile = output_settings['tracefile']
        probfile = output_settings['probfile']
        # check that the MCMC output files do not already exist
        # only matters if we are actually running MCMC
        if os.path.exists(tracefile):
            raise OSError('Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.'.format(tracefile))
        if os.path.exists(probfile):
            raise OSError('Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.'.format(probfile))

        # scheduler setup
        if mcmc_settings['scheduler'] == 'dask':
            _raise_dask_work_stealing()  # check for work-stealing
            from distributed import LocalCluster
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if cores > multiprocessing.cpu_count():
                cores = multiprocessing.cpu_count()
                logging.warning("The number of cores chosen is larger than available. "
                                "Defaulting to run on the {} available cores.".format(cores))
            # TODO: make dask-scheduler-verbosity a YAML input so that users can debug.
            #       Should have the same log levels as verbosity
            scheduler = LocalCluster(n_workers=cores, threads_per_worker=1, processes=True, memory_limit=0)
            client = ImmediateClient(scheduler)
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" % (scheduler, sum(client.ncores().values())))
            try:
                bokeh_server_info = client.scheduler_info()['services']['bokeh']
                logging.info("bokeh server for dask scheduler at localhost:{}".format(bokeh_server_info))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'None':
            client = None
            logging.info("Not using a parallel scheduler. ESPEI is running MCMC on a single core.")
        else:  # we were passed a scheduler file name
            _raise_dask_work_stealing()  # check for work-stealing
            client = ImmediateClient(scheduler_file=mcmc_settings['scheduler'])
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" % (client.scheduler, sum(client.ncores().values())))

        # get a Database
        if mcmc_settings.get('input_db'):
            dbf = Database(mcmc_settings.get('input_db'))

        # load the restart trace if needed
        if mcmc_settings.get('restart_trace'):
            restart_trace = np.load(mcmc_settings.get('restart_trace'))
        else:
            restart_trace = None

        # load the remaining mcmc fitting parameters
        iterations = mcmc_settings.get('iterations')
        save_interval = mcmc_settings.get('save_interval')
        chains_per_parameter = mcmc_settings.get('chains_per_parameter')
        chain_std_deviation = mcmc_settings.get('chain_std_deviation')
        deterministic = mcmc_settings.get('deterministic')
        prior = mcmc_settings.get('prior')
        data_weights = mcmc_settings.get('data_weights')
        syms = mcmc_settings.get('symbols')

        # set up and run the EmceeOptimizer
        optimizer = EmceeOptimizer(dbf, scheduler=client)
        optimizer.save_interval = save_interval
        all_symbols = syms if syms is not None else database_symbols_to_fit(dbf)
        optimizer.fit(all_symbols, datasets, prior=prior, iterations=iterations,
                      chains_per_parameter=chains_per_parameter,
                      chain_std_deviation=chain_std_deviation,
                      deterministic=deterministic, restart_trace=restart_trace,
                      tracefile=tracefile, probfile=probfile,
                      mcmc_data_weights=data_weights)
        optimizer.commit()

        optimizer.dbf.to_file(output_settings['output_db'], if_exists='overwrite')
        # close the scheduler, if possible
        if hasattr(client, 'close'):
            client.close()
        return optimizer.dbf, optimizer.sampler
    return dbf


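# Illustrative sketch of calling run_espei() with a settings dictionary that mirrors
# the YAML input schema normalized by get_run_settings(). The paths 'my_phases.json',
# 'input-datasets', and 'out.tdb' are placeholders assumed for this example; omitted
# keys (e.g. ridge_alpha, verbosity) fall back to the schema defaults.
def _example_run_espei():
    settings = {
        'system': {
            'phase_models': 'my_phases.json',  # JSON describing phases and sublattice models
            'datasets': 'input-datasets',      # directory of *.json dataset files
        },
        'generate_parameters': {
            'excess_model': 'linear',
            'ref_state': 'SGTE91',
        },
        'output': {
            'output_db': 'out.tdb',
        },
    }
    # With only 'generate_parameters' given, a pycalphad Database is returned;
    # adding an 'mcmc' section would return (Database, sampler) instead.
    dbf = run_espei(settings)
    return dbf

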
def mcmc_fit(dbf, datasets, iterations=1000, save_interval=1, chains_per_parameter=2,
             chain_std_deviation=0.1, scheduler=None, tracefile=None, probfile=None,
             restart_trace=None, deterministic=True, prior=None, mcmc_data_weights=None):
    """
    Run MCMC via the EmceeOptimizer class

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit, with symbols to fit prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase data to fit
    iterations : int
        Number of trace iterations to calculate in MCMC. Default is 1000 iterations.
    save_interval : int
        Interval of iterations at which to save the tracefile and probfile
    chains_per_parameter : int
        Number of chains for each parameter. Must be an even integer greater than
        or equal to 2. Defaults to 2.
    chain_std_deviation : float
        Standard deviation of the normal distribution used for parameter
        initialization, as a fraction of each parameter. Must be greater than 0.
        Default is 0.1, which is 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        Filename to store the trace with numpy.save. Array has shape
        (chains, iterations, parameters).
    probfile : str
        Filename to store the log probability with numpy.save. Has shape
        (chains, iterations).
    restart_trace : np.ndarray
        ndarray of the previous trace. Should have shape
        (chains, iterations, parameters).
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic sampling
        draws. This ensures that runs with the exact same database,
        chains_per_parameter, and chain_std_deviation (or restart_trace) will
        produce exactly the same results.
    prior : str
        Prior to use to generate priors. Defaults to 'zero', which keeps
        backwards compatibility. Can currently choose 'normal', 'uniform',
        'triangular', or 'zero'.
    mcmc_data_weights : dict
        Dictionary of weights for each data type, e.g. {'ZPF': 20, 'HM': 2}

    Returns
    -------
    tuple of (Database, sampler)
    """
    warnings.warn("The mcmc convenience function will be removed in ESPEI 0.8")
    all_symbols = database_symbols_to_fit(dbf)
    optimizer = EmceeOptimizer(dbf, scheduler=scheduler)
    optimizer.save_interval = save_interval
    optimizer.fit(all_symbols, datasets, prior=prior, iterations=iterations,
                  chains_per_parameter=chains_per_parameter,
                  chain_std_deviation=chain_std_deviation,
                  deterministic=deterministic, restart_trace=restart_trace,
                  tracefile=tracefile, probfile=probfile,
                  mcmc_data_weights=mcmc_data_weights)
    optimizer.commit()
    return optimizer.dbf, optimizer.sampler


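# Illustrative sketch of calling the deprecated mcmc_fit convenience wrapper directly.
# 'my_db.tdb', 'input-datasets', 'trace.npy', and 'lnprob.npy' are placeholder paths
# assumed for this example; load_datasets and recursive_glob are the same helpers
# used in run_espei above.
def _example_mcmc_fit():
    dbf = Database('my_db.tdb')  # must contain VV-prefixed symbols to fit
    datasets = load_datasets(sorted(recursive_glob('input-datasets', '*.json')))
    fitted_dbf, sampler = mcmc_fit(dbf, datasets, iterations=1000,
                                   chains_per_parameter=2, chain_std_deviation=0.1,
                                   tracefile='trace.npy', probfile='lnprob.npy')
    return fitted_dbf, sampler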