def __init__(self, dbf):
    """Initialize the optimization graph from a starting Database.

    Two independent deep copies of ``dbf`` are kept: ``orig_dbf`` as the
    untouched reference and ``dbf`` as the working copy that later commits
    mutate. The root node of the graph holds the current values of every
    fittable symbol together with an empty dataset collection.
    """
    self.orig_dbf = copy.deepcopy(dbf)
    self.dbf = copy.deepcopy(dbf)
    # Current value of every fittable symbol, keyed by symbol name.
    starting_params = {}
    for name in database_symbols_to_fit(dbf):
        starting_params[name] = unpack_piecewise(dbf.symbols[name])
    empty_ds = load_datasets([])  # empty TinyDB
    start_node = OptNode(starting_params, empty_ds)
    self.current_node = start_node
    self.staged_nodes = []
    self.graph = OptGraph(start_node)
def parameter_labels(dbf, formatted=True):
    """Return identifiers for every fittable parameter in ``dbf``.

    Parameters
    ----------
    dbf : pycalphad.Database
        Database whose fittable symbols are labeled.
    formatted : bool, optional
        If True (default), return human-readable multi-line labels built
        from the formatted parameter metadata; otherwise return the raw
        symbol names.

    Returns
    -------
    list
        Formatted label strings, or the raw symbol names.
    """
    syms = database_symbols_to_fit(dbf)
    if not formatted:
        return syms
    labels = []
    for sym in syms:
        fp = formatted_parameter(dbf, sym)
        labels.append("{}({})\n{}: {}".format(fp.phase_name, fp.interaction, fp.parameter_type, fp.term_symbol))
    return labels
def test_equilibrium_thermochemical_context_is_pickleable(datasets_db):
    """Test that the context for equilibrium thermochemical data is pickleable"""
    datasets_db.insert(CU_MG_EQ_HMR_LIQUID)
    dbf = Database(CU_MG_TDB)
    fit_syms = database_symbols_to_fit(dbf)
    guess = np.array([unpack_piecewise(dbf.symbols[sym]) for sym in fit_syms])
    ctx = setup_context(dbf, datasets_db)
    ctx.update(EmceeOptimizer.get_priors(None, fit_syms, guess))
    # Round-trip the context through pickle and check predictions agree.
    restored = pickle.loads(pickle.dumps(ctx))
    before = EmceeOptimizer.predict(guess, **ctx)
    after = EmceeOptimizer.predict(guess, **restored)
    assert np.isclose(before, after)
def test_zpf_context_is_pickleable(datasets_db):
    """Test that the context for ZPF data is pickleable"""
    datasets_db.insert(CU_MG_DATASET_ZPF_ZERO_ERROR)
    dbf = Database(CU_MG_TDB)
    fit_syms = database_symbols_to_fit(dbf)
    guess = np.array([unpack_piecewise(dbf.symbols[sym]) for sym in fit_syms])
    ctx = setup_context(dbf, datasets_db)
    ctx.update(EmceeOptimizer.get_priors(None, fit_syms, guess))
    # Round-trip the context through pickle and check predictions agree.
    restored = pickle.loads(pickle.dumps(ctx))
    before = EmceeOptimizer.predict(guess, **ctx)
    after = EmceeOptimizer.predict(guess, **restored)
    assert np.isclose(before, after)
def get_thermochemical_data(dbf, comps, phases, datasets, weight_dict=None, symbols_to_fit=None, model=None):
    """
    Build the per-phase/per-property data dictionaries used to compute
    non-equilibrium thermochemical error.

    Parameters
    ----------
    dbf : pycalphad.Database
        Database to consider
    comps : list
        List of active component names
    phases : list
        List of phases to consider
    datasets : espei.utils.PickleableTinyDB
        Datasets that contain single phase data
    weight_dict : dict
        Dictionary of weights for each data type, e.g. {'HM': 200, 'SM': 2}
    symbols_to_fit : list
        Parameters to fit. Used to build the models and PhaseRecords.
    model : dict, optional
        Mapping of phase name to a custom Model class to instantiate instead
        of the default pycalphad ``Model``. Added because ``setup_context``
        calls this function with ``model=model_dict``; previously the keyword
        did not exist and that call raised TypeError.

    Returns
    -------
    list
        List of data dictionaries to iterate over
    """
    # phase by phase, then property by property, then by model exclusions
    if weight_dict is None:
        weight_dict = {}
    if symbols_to_fit is not None:
        symbols_to_fit = sorted(symbols_to_fit)
    else:
        symbols_to_fit = database_symbols_to_fit(dbf)
    if model is None:
        model = {}
    # estimated from NIST TRC uncertainties
    property_std_deviation = {
        'HM': 500.0 / weight_dict.get('HM', 1.0),  # J/mol
        'SM': 0.2 / weight_dict.get('SM', 1.0),  # J/K-mol
        'CPM': 0.2 / weight_dict.get('CPM', 1.0),  # J/K-mol
    }
    properties = ['HM_FORM', 'SM_FORM', 'CPM_FORM', 'HM_MIX', 'SM_MIX', 'CPM_MIX']
    # Reference states for shifting _FORM data to the formation reference.
    ref_states = []
    for el in get_pure_elements(dbf, comps):
        ref_state = ReferenceState(el, dbf.refstates[el]['phase'])
        ref_states.append(ref_state)
    all_data_dicts = []
    for phase_name in phases:
        for prop in properties:
            desired_data = get_prop_data(comps, phase_name, prop, datasets, additional_query=(where('solver').exists()))
            if len(desired_data) == 0:
                continue
            # Data are grouped by their (sorted) excluded model contributions
            # so that one Model per exclusion set is built.
            unique_exclusions = set([tuple(sorted(d.get('excluded_model_contributions', []))) for d in desired_data])
            for exclusion in unique_exclusions:
                data_dict = {
                    'phase_name': phase_name,
                    'prop': prop,
                    # needs the following keys to be added:
                    # species, calculate_dict, phase_records, model, output, weights
                }
                # get all the data with these model exclusions
                if exclusion == tuple([]):
                    exc_search = (~where('excluded_model_contributions').exists()) & (where('solver').exists())
                else:
                    exc_search = (where('excluded_model_contributions').test(lambda x: tuple(sorted(x)) == exclusion)) & (where('solver').exists())
                curr_data = get_prop_data(comps, phase_name, prop, datasets, additional_query=exc_search)
                calculate_dict = get_prop_samples(dbf, comps, phase_name, curr_data)
                # Honor a user-supplied Model subclass for this phase, if any.
                model_cls = model.get(phase_name, Model)
                mod = model_cls(dbf, comps, phase_name, parameters=symbols_to_fit)
                if prop.endswith('_FORM'):
                    # e.g. HM_FORM -> HMR (referenced property)
                    output = ''.join(prop.split('_')[:-1]) + 'R'
                    mod.shift_reference_state(ref_states, dbf, contrib_mods={e: sympy.S.Zero for e in exclusion})
                else:
                    output = prop
                # Zero out the excluded contributions in both the model and
                # its reference model.
                for contrib in exclusion:
                    mod.models[contrib] = sympy.S.Zero
                    mod.reference_model.models[contrib] = sympy.S.Zero
                species = sorted(unpack_components(dbf, comps), key=str)
                data_dict['species'] = species
                phase_model_dict = {phase_name: mod}
                # Keep only entries of calculate_dict that name state variables.
                statevar_dict = {getattr(v, c, None): vals for c, vals in calculate_dict.items() if isinstance(getattr(v, c, None), v.StateVariable)}
                statevar_dict = OrderedDict(sorted(statevar_dict.items(), key=lambda x: str(x[0])))
                str_statevar_dict = OrderedDict((str(k), vals) for k, vals in statevar_dict.items())
                phase_records = build_phase_records(dbf, species, [phase_name], statevar_dict, phase_model_dict, output=output, parameters={s: 0 for s in symbols_to_fit}, build_gradients=False, build_hessians=False)
                data_dict['str_statevar_dict'] = str_statevar_dict
                data_dict['phase_records'] = phase_records
                data_dict['calculate_dict'] = calculate_dict
                data_dict['model'] = phase_model_dict
                data_dict['output'] = output
                # Per-sample weights: property std deviation scaled by the
                # dataset-level weights (popped so they don't reach calculate).
                data_dict['weights'] = np.array(property_std_deviation[prop.split('_')[0]]) / np.array(calculate_dict.pop('weights'))
                all_data_dicts.append(data_dict)
    return all_data_dicts
def setup_context(dbf, datasets, symbols_to_fit=None, data_weights=None, phase_models=None, make_callables=True):
    """
    Set up a context dictionary for calculating error.

    Parameters
    ----------
    dbf : Database
        A pycalphad Database that will be fit
    datasets : PickleableTinyDB
        A database of single- and multi-phase data to fit
    symbols_to_fit : list of str
        List of symbols in the Database that will be fit. If None (default) are
        passed, then all parameters prefixed with `VV` followed by a number,
        e.g. VV0001 will be fit.
    data_weights : dict, optional
        Mapping of data type (e.g. ``'ZPF'``, ``'ACR'``, ``'HM'``) to a weight
        factor applied to that data's contribution to the likelihood.
        Defaults to an empty dict (all weights 1.0).
    phase_models : dict, optional
        Phase models settings dict. When given, its ``'components'`` entry
        defines the active components and custom Model classes are extracted
        from it via ``get_model_dict``.
    make_callables : bool, optional
        If True (default), build pycalphad callables (with gradients and
        Hessians) for equilibrium calculations; otherwise ``callables`` in the
        returned context is None.

    Returns
    -------
    dict
        Context with keys ``symbols_to_fit``, ``zpf_kwargs``,
        ``equilibrium_thermochemical_kwargs``, ``thermochemical_kwargs`` and
        ``activity_kwargs`` for the log-probability function.

    Raises
    ------
    ValueError
        If the Database contains no fittable degrees of freedom.

    Notes
    -----
    A copy of the Database is made and used in the context. To commit changes
    back to the original database, the dbf.symbols.update method should be
    used.
    """
    dbf = copy.deepcopy(dbf)
    if phase_models is not None:
        comps = sorted(phase_models['components'])
    else:
        comps = sorted([sp for sp in dbf.elements])
    if symbols_to_fit is None:
        symbols_to_fit = database_symbols_to_fit(dbf)
    else:
        symbols_to_fit = sorted(symbols_to_fit)
    data_weights = data_weights if data_weights is not None else {}
    if len(symbols_to_fit) == 0:
        raise ValueError('No degrees of freedom. Database must contain symbols starting with \'V\' or \'VV\', followed by a number.')
    else:
        _log.info('Fitting %s degrees of freedom.', len(symbols_to_fit))
    for x in symbols_to_fit:
        if isinstance(dbf.symbols[x], symengine.Piecewise):
            # Piecewise symbols are replaced by their first expression so the
            # fit parameter is a plain scalar symbol.
            _log.debug('Replacing %s in database', x)
            dbf.symbols[x] = dbf.symbols[x].args[0]
    # construct the models for each phase, substituting in the SymEngine symbol to fit.
    if phase_models is not None:
        model_dict = get_model_dict(phase_models)
    else:
        model_dict = {}
    _log.trace('Building phase models (this may take some time)')
    import time
    t1 = time.time()
    phases = sorted(filter_phases(dbf, unpack_components(dbf, comps), dbf.phases.keys()))
    parameters = dict(zip(symbols_to_fit, [0] * len(symbols_to_fit)))
    models = instantiate_models(dbf, comps, phases, model=model_dict, parameters=parameters)
    if make_callables:
        eq_callables = build_callables(dbf, comps, phases, models, parameter_symbols=symbols_to_fit, output='GM', build_gradients=True, build_hessians=True, additional_statevars={v.N, v.P, v.T})
    else:
        eq_callables = None
    t2 = time.time()
    _log.trace('Finished building phase models (%0.2fs)', t2 - t1)
    _log.trace('Getting non-equilibrium thermochemical data (this may take some time)')
    t1 = time.time()
    thermochemical_data = get_thermochemical_data(dbf, comps, phases, datasets, model=model_dict, weight_dict=data_weights, symbols_to_fit=symbols_to_fit)
    t2 = time.time()
    _log.trace('Finished getting non-equilibrium thermochemical data (%0.2fs)', t2 - t1)
    _log.trace('Getting equilibrium thermochemical data (this may take some time)')
    t1 = time.time()
    eq_thermochemical_data = get_equilibrium_thermochemical_data(dbf, comps, phases, datasets, model=model_dict, parameters=parameters, data_weight_dict=data_weights)
    t2 = time.time()
    _log.trace('Finished getting equilibrium thermochemical data (%0.2fs)', t2 - t1)
    _log.trace('Getting ZPF data (this may take some time)')
    t1 = time.time()
    zpf_data = get_zpf_data(dbf, comps, phases, datasets, model=model_dict, parameters=parameters)
    t2 = time.time()
    _log.trace('Finished getting ZPF data (%0.2fs)', t2 - t1)
    # context for the log probability function
    # for all cases, parameters argument addressed in MCMC loop
    error_context = {
        'symbols_to_fit': symbols_to_fit,
        'zpf_kwargs': {
            'zpf_data': zpf_data,
            'data_weight': data_weights.get('ZPF', 1.0),
        },
        'equilibrium_thermochemical_kwargs': {
            'eq_thermochemical_data': eq_thermochemical_data,
        },
        'thermochemical_kwargs': {
            'thermochemical_data': thermochemical_data,
        },
        'activity_kwargs': {
            'dbf': dbf,
            'comps': comps,
            'phases': phases,
            'datasets': datasets,
            'phase_models': models,
            'callables': eq_callables,
            'data_weight': data_weights.get('ACR', 1.0),
        },
    }
    return error_context
def invariant_samples(dbf, params, X, P, Tl, Tu, comp, client=None, comps=None, phases=None):
    """
    Find the composition and temperature of the invariants for parameter
    sets in params (for a binary)

    Parameters
    ----------
    dbf : Database
        Thermodynamic database containing the relevant parameters
    params : numpy array
        Array where the rows contain the parameter sets
        for the pycalphad equilibrium calculation
    X : float
        Guess for the mole fraction (of comp) of the invariant
    P : float
        Pressure (in Pa) at which to search for the invariants
    Tl : float
        Lower temperature bound to search for the invariants
    Tu : float
        Upper temperature bound to search for the invariants
    comp : str
        Name of the species
    client : Client, optional
        interface to dask.distributed compute cluster
    comps : list, optional
        Names of species to consider in the calculation
    phases : list or dict, optional
        Names of phases to consider in the calculation

    Returns
    -------
    Tv : list
        List of invariant temperatures corresponding to the parameter sets
    phv : list of list
        List of lists of phases
    bndv : numpy array
        Array where the first index corresponds to the parameter set, and
        the second index corresponds to the composition of the zero phase
        fraction bounaries of the first and last phases in phv, and of the
        three phase equilibrium.

    Examples
    --------
    >>> # let's do a multicore example
    >>> # first import modules and functions
    >>> import numpy as np
    >>> from dask.distributed import Client
    >>> from distributed.deploy.local import LocalCluster
    >>> from pycalphad import Database, variables as v
    >>> from pduq.invariant_calc import invariant_samples
    >>> # start the distributed client to parallelize the calculation
    >>> c = LocalCluster(n_workers=2, threads_per_worker=1)
    >>> client = Client(c)
    >>> # load the pycalphad database
    >>> dbf = Database('CU-MG_param_gen.tdb')
    >>> # load the parameter file
    >>> params = np.loadtxt('trace.csv', delimiter=',')[-2:, :]
    >>> # calculate the locations of invariant points for these two
    >>> # parameter sets in params
    >>> Tv, phv, bndv = invariant_samples(
    >>>     dbf, params, client=client, X=.2, P=101325,
    >>>     Tl=600, Tu=1400, comp='MG')
    >>> # print the temperatures of the invariant points
    >>> print(Tv)
    [1008.29467773 993.89038086]
    >>> # print the phases in equilibrium at the invariant points
    >>> print(phv)
    [['FCC_A1' 'LIQUID' 'LAVES_C15'], ['FCC_A1' 'LIQUID' 'LAVES_C15']]
    >>> # print the Mg molar fractions for the left phase boundary,
    >>> # the invariant, and the right phase boundary
    >>> print(bndv)
    [[0.04005779 0.21173958 0.33261747]
     [0.04096447 0.21720666 0.33295817]]
    """
    # Default to every element / every phase in the database.
    if comps is None:
        comps = list(dbf.elements)
    if phases is None:
        phases = list(dbf.phases.keys())
    neq = params.shape[0]  # calculate invariants for neq parameter sets
    symbols_to_fit = database_symbols_to_fit(dbf)
    # eq_callables = get_eq_callables_(dbf, comps, phases, symbols_to_fit)
    eq_callables = None
    # eq_callables is disabled for current pycalcphad
    kwargs = {'dbf': dbf, 'comps': comps, 'phases': phases, 'X': X,
              'P': P, 'Tl': Tl, 'Tu': Tu, 'comp': comp, 'params': params,
              'symbols_to_fit': symbols_to_fit, 'eq_callables': eq_callables}
    # invariant_(0, **kwargs)
    # define the map for the invariant calculation for neq parameter sets
    if client is None:
        # Serial path: evaluate each parameter set in turn.
        invL = []
        for ii in range(neq):
            invL.append(invariant_(ii, **kwargs))
    else:
        # Parallel path: fan out one invariant_ call per parameter set.
        # NOTE(review): client.close() here closes the caller's client after a
        # single call — confirm this is the intended ownership model.
        A = client.map(invariant_, range(neq), **kwargs)
        invL = client.gather(A)
        client.close()
    # collect the key results after the map
    # invL[ii] is (temperature, phase list, boundary compositions).
    Tv = np.zeros((neq, ))
    phv = neq * [None]
    bndv = np.zeros((neq, 3))
    for ii in range(neq):
        Tv[ii] = invL[ii][0]
        phv[ii] = invL[ii][1]
        bndv[ii, :] = invL[ii][2]
    return Tv, phv, bndv
def run_espei(run_settings):
    """Wrapper around the ESPEI fitting procedure, taking only a settings
    dictionary.

    Parameters
    ----------
    run_settings : dict
        Dictionary of input settings

    Returns
    -------
    Either a Database (for generate parameters only) or a tuple of
    (Database, sampler)

    Raises
    ------
    OSError
        If the MCMC trace or probability output files already exist.
    ValueError
        If MCMC is requested but no Database is available (no
        ``mcmc.input_db`` and no ``generate_parameters`` step).
    """
    run_settings = get_run_settings(run_settings)
    system_settings = run_settings['system']
    output_settings = run_settings['output']
    generate_parameters_settings = run_settings.get('generate_parameters')
    mcmc_settings = run_settings.get('mcmc')

    # handle verbosity
    verbosity = {0: logging.WARNING, 1: logging.INFO, 2: TRACE, 3: logging.DEBUG}
    logging.basicConfig(level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
    log_version_info()

    # load datasets and handle i/o
    logging.log(TRACE, 'Loading and checking datasets.')
    dataset_path = system_settings['datasets']
    datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json')))
    if len(datasets.all()) == 0:
        logging.warning('No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.'.format(dataset_path))
    apply_tags(datasets, system_settings.get('tags', dict()))
    add_ideal_exclusions(datasets)
    logging.log(TRACE, 'Finished checking datasets')

    with open(system_settings['phase_models']) as fp:
        phase_models = json.load(fp)

    if generate_parameters_settings is not None:
        refdata = generate_parameters_settings['ref_state']
        excess_model = generate_parameters_settings['excess_model']
        ridge_alpha = generate_parameters_settings['ridge_alpha']
        aicc_penalty = generate_parameters_settings['aicc_penalty_factor']
        input_dbf = generate_parameters_settings.get('input_db', None)
        if input_dbf is not None:
            input_dbf = Database(input_dbf)
        dbf = generate_parameters(phase_models, datasets, refdata, excess_model, ridge_alpha=ridge_alpha, dbf=input_dbf, aicc_penalty_factor=aicc_penalty)
        dbf.to_file(output_settings['output_db'], if_exists='overwrite')

    if mcmc_settings is not None:
        tracefile = output_settings['tracefile']
        probfile = output_settings['probfile']
        # check that the MCMC output files do not already exist
        # only matters if we are actually running MCMC
        if os.path.exists(tracefile):
            raise OSError('Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.'.format(tracefile))
        if os.path.exists(probfile):
            raise OSError('Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.'.format(probfile))

        # scheduler setup
        if mcmc_settings['scheduler'] == 'dask':
            _raise_dask_work_stealing()  # check for work-stealing
            from distributed import LocalCluster
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if (cores > multiprocessing.cpu_count()):
                cores = multiprocessing.cpu_count()
                logging.warning("The number of cores chosen is larger than available. "
                                "Defaulting to run on the {} available cores.".format(cores))
            # TODO: make dask-scheduler-verbosity a YAML input so that users
            # can debug. Should have the same log levels as verbosity.
            scheduler = LocalCluster(n_workers=cores, threads_per_worker=1, processes=True, memory_limit=0)
            client = ImmediateClient(scheduler)
            # replicate logging configuration on the workers
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" % (scheduler, sum(client.ncores().values())))
            try:
                bokeh_server_info = client.scheduler_info()['services']['bokeh']
                logging.info("bokeh server for dask scheduler at localhost:{}".format(bokeh_server_info))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'None':
            client = None
            logging.info("Not using a parallel scheduler. ESPEI is running MCMC on a single core.")
        else:  # we were passed a scheduler file name
            _raise_dask_work_stealing()  # check for work-stealing
            client = ImmediateClient(scheduler_file=mcmc_settings['scheduler'])
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" % (client.scheduler, sum(client.ncores().values())))

        # get a Database
        if mcmc_settings.get('input_db'):
            dbf = Database(mcmc_settings.get('input_db'))
        elif generate_parameters_settings is None:
            # Previously this fell through and raised a NameError on the
            # unbound `dbf`; fail loudly with a clear message instead.
            raise ValueError('No input database for MCMC. Provide `mcmc.input_db` or run a `generate_parameters` step first.')

        # load the restart trace if needed
        if mcmc_settings.get('restart_trace'):
            restart_trace = np.load(mcmc_settings.get('restart_trace'))
        else:
            restart_trace = None

        # load the remaining mcmc fitting parameters
        iterations = mcmc_settings.get('iterations')
        save_interval = mcmc_settings.get('save_interval')
        chains_per_parameter = mcmc_settings.get('chains_per_parameter')
        chain_std_deviation = mcmc_settings.get('chain_std_deviation')
        deterministic = mcmc_settings.get('deterministic')
        prior = mcmc_settings.get('prior')
        data_weights = mcmc_settings.get('data_weights')
        syms = mcmc_settings.get('symbols')

        # set up and run the EmceeOptimizer
        optimizer = EmceeOptimizer(dbf, scheduler=client)
        optimizer.save_interval = save_interval
        all_symbols = syms if syms is not None else database_symbols_to_fit(dbf)
        optimizer.fit(all_symbols, datasets, prior=prior, iterations=iterations, chains_per_parameter=chains_per_parameter, chain_std_deviation=chain_std_deviation, deterministic=deterministic, restart_trace=restart_trace, tracefile=tracefile, probfile=probfile, mcmc_data_weights=data_weights)
        optimizer.commit()
        optimizer.dbf.to_file(output_settings['output_db'], if_exists='overwrite')
        # close the scheduler, if possible
        if hasattr(client, 'close'):
            client.close()
        return optimizer.dbf, optimizer.sampler
    return dbf
def mcmc_fit(
        dbf,
        datasets,
        iterations=1000,
        save_interval=100,
        chains_per_parameter=2,
        chain_std_deviation=0.1,
        scheduler=None,
        tracefile=None,
        probfile=None,
        restart_trace=None,
        deterministic=True,
):
    """
    Run Markov Chain Monte Carlo on the Database given datasets

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit with symbols to fit prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase to fit
    iterations : int
        Number of trace iterations to calculate in MCMC. Default is 1000
        iterations.
    save_interval : int
        interval of iterations to save the tracefile and probfile
    chains_per_parameter : int
        number of chains for each parameter. Must be an even integer greater
        or equal to 2. Defaults to 2.
    chain_std_deviation : float
        standard deviation of normal for parameter initialization as a
        fraction of each parameter. Must be greater than 0. Default is 0.1,
        which is 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        filename to store the trace with NumPy.save. Array has shape
        (chains, iterations, parameters)
    probfile : str
        filename to store the log probability with NumPy.save. Has shape
        (chains, iterations)
    restart_trace : np.ndarray
        ndarray of the previous trace. Should have shape
        (chains, iterations, parameters)
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic
        sampling draws. This will ensure that the runs with the exact same
        database, chains_per_parameter, and chain_std_deviation (or
        restart_trace) will produce exactly the same results.

    Returns
    -------
    dbf : Database
        Resulting pycalphad database of optimized parameters
    sampler : EnsembleSampler, ndarray)
        emcee sampler for further data wrangling
    """
    comps = sorted([sp for sp in dbf.elements])
    symbols_to_fit = database_symbols_to_fit(dbf)
    if len(symbols_to_fit) == 0:
        raise ValueError('No degrees of freedom. Database must contain symbols starting with \'V\' or \'VV\', followed by a number.')
    else:
        logging.info('Fitting {} degrees of freedom.'.format(len(symbols_to_fit)))
    for x in symbols_to_fit:
        if isinstance(dbf.symbols[x], sympy.Piecewise):
            # Replace Piecewise symbols with their first branch's expression
            # so the fit parameter is a plain scalar.
            logging.debug('Replacing {} in database'.format(x))
            dbf.symbols[x] = dbf.symbols[x].args[0].expr
    # get initial parameters and remove these from the database
    # we'll replace them with SymPy symbols initialized to 0 in the phase models
    initial_parameters = np.array([np.array(float(dbf.symbols[x])) for x in symbols_to_fit])
    # construct the models for each phase, substituting in the SymPy symbol to fit.
    logging.debug('Building phase models (this may take some time)')
    # 0 is placeholder value
    phases = sorted(dbf.phases.keys())
    sympy_symbols_to_fit = [sympy.Symbol(sym) for sym in symbols_to_fit]
    orig_parameters = {sym: p for sym, p in zip(symbols_to_fit, initial_parameters)}
    eq_callables = build_callables(dbf, comps, phases, model=Model, parameters=orig_parameters)
    # because error_context expects 'phase_models' key, change it
    eq_callables['phase_models'] = eq_callables.pop('model')
    eq_callables.pop('phase_records')
    # we also need to build models that have no ideal mixing for
    # thermochemical error and to build them for each property we might calculate
    # TODO: potential optimization to only calculate for phase/property combos
    # that we have in the datasets
    # first construct dict of models without ideal mixing
    mods_no_idmix = {}
    for phase_name in phases:
        # we have to pass the list of Symbol objects to fit so they are popped
        # from the database and can properly be replaced.
        mods_no_idmix[phase_name] = Model(dbf, comps, phase_name, parameters=sympy_symbols_to_fit)
        mods_no_idmix[phase_name].models['idmix'] = 0
    # now construct callables for each possible property that can be calculated
    thermochemical_callables = {}  # will be dict of {output_property: eq_callables_dict}
    whitelist_properties = ['HM', 'SM', 'CPM']
    whitelist_properties = whitelist_properties + [prop + '_MIX' for prop in whitelist_properties]
    for prop in whitelist_properties:
        thermochemical_callables[prop] = build_callables(dbf, comps, phases, model=mods_no_idmix, output=prop, parameters=orig_parameters, build_gradients=False)
        # pop off the callables not used in properties because we don't want
        # them around (they should be None, anyways)
        thermochemical_callables[prop].pop('phase_records')
        thermochemical_callables[prop].pop('model')
    logging.debug('Finished building phase models')

    # context for the log probability function
    error_context = {
        'comps': comps, 'dbf': dbf,
        'phases': phases,
        'phase_models': eq_callables['phase_models'],
        'datasets': datasets,
        'symbols_to_fit': symbols_to_fit,
        'thermochemical_callables': thermochemical_callables,
        'callables': eq_callables,
    }

    def save_sampler_state(sampler):
        # Persist the current chain and log-probability arrays, if requested.
        if tracefile:
            logging.debug('Writing trace to {}'.format(tracefile))
            np.save(tracefile, sampler.chain)
        if probfile:
            logging.debug('Writing lnprob to {}'.format(probfile))
            np.save(probfile, sampler.lnprobability)

    # initialize the walkers either fresh or from the restart
    if restart_trace is not None:
        # Take the last non-zero step of each chain as the new walker start.
        walkers = restart_trace[np.nonzero(restart_trace)].reshape(
            (restart_trace.shape[0], -1, restart_trace.shape[2]))[:, -1, :]
        nwalkers = walkers.shape[0]
        ndim = walkers.shape[1]
        initial_parameters = walkers.mean(axis=0)
        logging.info('Restarting from previous calculation with {} chains ({} per parameter).'.format(nwalkers, nwalkers / ndim))
        logging.debug('Means of restarting parameters are {}'.format(initial_parameters))
        logging.debug('Standard deviations of restarting parameters are {}'.format(walkers.std(axis=0)))
    else:
        logging.debug('Initial parameters: {}'.format(initial_parameters))
        ndim = initial_parameters.size
        nwalkers = ndim * chains_per_parameter
        logging.info('Initializing {} chains with {} chains per parameter.'.format(nwalkers, chains_per_parameter))
        walkers = generate_parameter_distribution(initial_parameters, nwalkers, chain_std_deviation, deterministic=deterministic)

    # the pool must implement a map function
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, kwargs=error_context, pool=scheduler)
    if deterministic:
        from espei.rstate import numpy_rstate
        sampler.random_state = numpy_rstate
        logging.info('Using a deterministic ensemble sampler.')
    progbar_width = 30
    logging.info('Running MCMC for {} iterations.'.format(iterations))
    try:
        for i, result in enumerate(sampler.sample(walkers, iterations=iterations)):
            # progress bar
            if (i + 1) % save_interval == 0:
                save_sampler_state(sampler)
                logging.debug('Acceptance ratios for parameters: {}'.format(sampler.acceptance_fraction))
            n = int((progbar_width + 1) * float(i) / iterations)
            sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#' * n, ' ' * (progbar_width - n), i + 1, iterations))
        n = int((progbar_width + 1) * float(i + 1) / iterations)
        sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#' * n, ' ' * (progbar_width - n), i + 1, iterations))
    except KeyboardInterrupt:
        # allow a clean interrupt; state is saved below
        pass
    # final processing
    save_sampler_state(sampler)
    optimal_params = optimal_parameters(sampler.chain, sampler.lnprobability)
    logging.debug('Intial parameters: {}'.format(initial_parameters))
    logging.debug('Optimal parameters: {}'.format(optimal_params))
    logging.debug('Change in parameters: {}'.format(np.abs(initial_parameters - optimal_params) / initial_parameters))
    # Write the optimal parameter values back into the database.
    for param_name, value in zip(symbols_to_fit, optimal_params):
        dbf.symbols[param_name] = value
    logging.info('MCMC complete.')
    return dbf, sampler
def plot_property(dbf, comps, phaseL, params, T, prop, config=None, datasets=None, xlim=None, xlabel=None, ylabel=None, yscale=None, phase_label_dict=None, unit='kJ/mol.', cdict=None, figsize=None):
    """
    Plot a property of interest versus temperature with uncertainty bounds
    for all phases of interest

    Parameters
    ----------
    dbf : Database
        Thermodynamic database containing the relevant parameters
    comps : list
        Names of components to consider in the calculation
    phaseL : list
        Names of phases to plot properties for
    params : numpy array
        Array where the rows contain the parameter sets
        for the pycalphad equilibrium calculation
    T : list, array or x-array object
        Temperature values at which to plot the selected property
    prop : str
        property (or attribute in pycalphad terminology) to sample,
        e.g. GM for molar gibbs energy or H_MIX for the enthalpy of mixing
    config : tuple, optional
        Sublattice configuration as a tuple, e.g. ("CU", ("CU", "MG"))
    datasets : espei.utils.PickleableTinyDB, optional
        Database of datasets to search for data
    xlim : list or tuple of float, optional
        List or tuple with two floats corresponding to the minimum and
        maximum molar composition of comp
    xlabel : str, optional
        plot x label
    ylabel : str, optional
        plot y label
    yscale : int or float, optional
        scaling factor to apply to property (e.g. to plot kJ/mol. instead
        of J/mol. choose yscale to be 0.001)
    phase_label_dict : dict, optional
        Dictionary with keys given by phase names and corresponding strings
        to use in plotting (e.g. to enable LaTeX labels)
    unit : str, optional
        Unit to plot on the y-axis for the property of interest
    cdict : dict, optional
        Dictionary with phase names and corresponding colors
    figsize : tuple or list of int or float, optional
        Plot dimensions in inches

    Returns
    -------

    Examples
    --------
    >>> import numpy as np
    >>> import pduq.uq_plot as uq
    >>> from pycalphad import Database
    >>> dbf = Database('CU-MG_param_gen.tdb')
    >>> comps = ['MG', 'CU', 'VA']
    >>> phaseL = ['CUMG2', 'LIQUID']
    >>> params = np.loadtxt('params.npy')[: -1, :]
    >>> T = 650
    >>> prop = 'GM'
    >>> # Plot the molar gibbs energy of all phases in phaseL
    >>> # versus molar fraction of MG at 650K. This will have
    >>> # uncertainty intervals generated by the parameter sets
    >>> # in params
    >>> uq.plot_property(dbf, comps, phaseL, params, T, prop)
    """
    symbols_to_fit = database_symbols_to_fit(dbf)
    CI = 95  # confidence interval percentage for the uncertainty band
    nph = len(phaseL)
    colorL = sns.color_palette("cubehelix", nph)
    markerL = 10 * ['o', 'D', '^', 'x', 'h', 's', 'v', '*', 'P', 'p', '>', 'd', '<']
    plt.figure(figsize=figsize)
    # compute uncertainty in property for each phase in list
    for ii in range(nph):
        phase = phaseL[ii]
        print('starting', prop, 'evaluations for the', phase, 'phase')
        # for each parameter sample calculate the property
        # for each possible site occupancy ratios
        compL = []
        for index in range(params.shape[0]):
            param_dict = {param_name: param for param_name, param in zip(symbols_to_fit, params[index, :])}
            parameters = OrderedDict(sorted(param_dict.items(), key=str))
            comp = calculate(dbf, comps, phase, P=101325, T=T, output=prop, parameters=parameters)
            compL += [comp]
        # concatenate the calculate results in an xarray along
        # an axis named 'sample'
        compC = xr.concat(compL, 'sample')
        compC.coords['sample'] = np.arange(params.shape[0])
        # The composition vector is the same for all samples
        if hasattr(T, "__len__"):
            Xvals = T
        else:
            Xvals = comp.X.sel(component=comps[0]).values.squeeze()
        Pvals = compC[prop].where(compC.Phase == phase).values.squeeze()
        if np.array(Xvals).size == 1:
            print('phase is a line compound')
            Xvals_ = np.array([Xvals - 0.002, Xvals + 0.002])
            Pvals_ = np.vstack([Pvals, Pvals]).T
        else:
            # find the lower hull of the property by finding
            # the configuration with the lowest value within
            # each interval. In each interval record the composition
            # and property
            indxL = np.array([])
            # Xbnds = np.arange(0, 1.01, 0.01)
            Xbnds = np.linspace(Xvals.min(), Xvals.max(), 100)
            for lb, ub in zip(Xbnds[:-1], Xbnds[1:]):
                # print('lb: ', lb, ', ub: ', ub)
                boolA = (lb <= Xvals) * (Xvals < ub)
                if boolA.sum() == 0:
                    continue
                indxA = np.arange(boolA.size)[boolA]
                P_ = Pvals[0, boolA]
                indxL = np.append(indxL, indxA[P_.argmin()])
                # indxL = np.append(indxL, indxA[P_.argmax()])
            indxL = indxL.astype('int32')
            if indxL.size == 1:
                print('only one point found')
                # np.asscalar was removed in NumPy 1.23; .item() is the
                # documented replacement.
                Xvals_ = Xvals[indxL.item()]
                Pvals_ = Pvals[:, indxL.item()]
            else:
                Xvals_ = Xvals[indxL]
                Pvals_ = Pvals[:, indxL]
        # Xvals_ = Xvals
        # Pvals_ = Pvals
        # for ii in range(params.shape[0]):
        #     plt.plot(Xvals_, Pvals_[ii, :], 'k-', linewidth=0.5, alpha=0.1)
        # plt.show()
        if yscale is not None:
            Pvals_ *= yscale
        # percentile band across the parameter samples
        low, mid, high = np.percentile(Pvals_, [0.5 * (100 - CI), 50, 100 - 0.5 * (100 - CI)], axis=0)
        if cdict is not None:
            color = cdict[phase]
        else:
            color = colorL[ii]
        if phase_label_dict is not None:
            label = phase_label_dict[phase]
        else:
            label = phase
        plt.plot(Xvals_, mid, linestyle='-', color=color, label=label)
        plt.fill_between(np.atleast_1d(Xvals_), low, high, alpha=0.3, facecolor=color)
    # collect and plot experimental data
    # NOTE(review): `phase` and `color` here refer to the last phase in
    # phaseL — confirm that plotting data against only the last phase's
    # styling is intended.
    if config is not None and datasets is not None:
        symmetry = None
        data = get_data(comps, phase, config, symmetry, datasets, prop)
        print(data)
        for data_s, marker in zip(data, markerL):
            occupancies = data_s['solver']['sublattice_occupancies']
            # at the moment this needs to be changed manually
            X_vec = [row[0][0] for row in occupancies]
            values = np.squeeze(data_s['values'])
            if yscale is not None:
                values *= yscale
            plt.plot(X_vec, values, linestyle='', marker=marker, markerfacecolor='none', markeredgecolor=color, markersize=6, alpha=0.9, label=data_s['reference'])
    if xlim is None:
        plt.xlim([Xvals_.min(), Xvals_.max()])
    else:
        plt.xlim(xlim)
    if xlabel is not None:
        plt.xlabel(xlabel)
    else:
        plt.xlabel(r'$X_{%s}$' % comps[0])
    if ylabel is not None:
        plt.ylabel(ylabel)
    else:
        plt.ylabel(prop + ' (' + unit + ')')
    plt.legend()
    plt.tight_layout()
def mcmc_fit(dbf, datasets, mcmc_steps=1000, save_interval=100, chains_per_parameter=2,
             chain_std_deviation=0.1, scheduler=None, tracefile=None, probfile=None,
             restart_chain=None, deterministic=True,):
    """Run Markov Chain Monte Carlo on the Database given datasets

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit with symbols to fit prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase to fit
    mcmc_steps : int
        Number of chain steps to calculate in MCMC. Note the flattened chain will
        have (mcmc_steps*DOF) values. Default is 1000 steps.
    save_interval : int
        interval of steps to save the chain to the tracefile and probfile
    chains_per_parameter : int
        number of chains for each parameter. Must be an even integer greater or
        equal to 2. Defaults to 2.
    chain_std_deviation : float
        standard deviation of normal for parameter initialization as a fraction
        of each parameter. Must be greater than 0. Default is 0.1, which is 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        filename to store the flattened chain with NumPy.save. Array has shape
        (nwalkers, iterations, nparams)
    probfile : str
        filename to store the flattened ln probability with NumPy.save
    restart_chain : np.ndarray
        ndarray of the previous chain. Should have shape
        (nwalkers, iterations, nparams)
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic sampling
        draws. This will ensure that the runs with the exact same database,
        chains_per_parameter, and chain_std_deviation (or restart_chain) will
        produce exactly the same results.

    Returns
    -------
    dbf : Database
        Resulting pycalphad database of optimized parameters
    sampler : EnsembleSampler, ndarray)
        emcee sampler for further data wrangling
    """
    comps = sorted([sp for sp in dbf.elements])
    symbols_to_fit = database_symbols_to_fit(dbf)

    if len(symbols_to_fit) == 0:
        raise ValueError('No degrees of freedom. Database must contain symbols starting with \'V\' or \'VV\', followed by a number.')
    else:
        logging.info('Fitting {} degrees of freedom.'.format(len(symbols_to_fit)))

    # Piecewise symbols cannot be fit directly; use the expression of the
    # first piece as the scalar starting value.
    for x in symbols_to_fit:
        if isinstance(dbf.symbols[x], sympy.Piecewise):
            logging.debug('Replacing {} in database'.format(x))
            dbf.symbols[x] = dbf.symbols[x].args[0].expr

    # get initial parameters and remove these from the database
    # we'll replace them with SymPy symbols initialized to 0 in the phase models
    initial_parameters = np.array([float(dbf.symbols[x]) for x in symbols_to_fit])
    for x in symbols_to_fit:
        del dbf.symbols[x]

    # construct the models for each phase, substituting in the SymPy symbol to fit.
    phase_models = dict()
    logging.debug('Building phase models')
    # 0 is placeholder value
    phases = sorted(dbf.phases.keys())
    for phase_name in phases:
        mod = CompiledModel(dbf, comps, phase_name, parameters=OrderedDict([(sympy.Symbol(s), 0) for s in symbols_to_fit]))
        phase_models[phase_name] = mod
    logging.debug('Finished building phase models')

    # context for the log probability function
    error_context = {'comps': comps, 'dbf': dbf,
                     'phases': phases, 'phase_models': phase_models,
                     'datasets': datasets, 'symbols_to_fit': symbols_to_fit,
                     }

    def save_sampler_state(sampler):
        # Persist the full chain and log-probabilities so a run can be
        # inspected or restarted after an interruption.
        if tracefile:
            logging.debug('Writing chain to {}'.format(tracefile))
            np.save(tracefile, sampler.chain)
        if probfile:
            logging.debug('Writing lnprob to {}'.format(probfile))
            np.save(probfile, sampler.lnprobability)

    # initialize the walkers either fresh or from the restart
    if restart_chain is not None:
        # Drop zero-padded (unwritten) iterations, then take the last
        # recorded position of each walker as the new starting point.
        walkers = restart_chain[np.nonzero(restart_chain)].reshape(
            (restart_chain.shape[0], -1, restart_chain.shape[2]))[:, -1, :]
        nwalkers = walkers.shape[0]
        ndim = walkers.shape[1]
        initial_parameters = walkers.mean(axis=0)
        logging.info('Restarting from previous calculation with {} chains ({} per parameter).'.format(nwalkers, nwalkers / ndim))
        logging.debug('Means of restarting parameters are {}'.format(initial_parameters))
        logging.debug('Standard deviations of restarting parameters are {}'.format(walkers.std(axis=0)))
    else:
        logging.debug('Initial parameters: {}'.format(initial_parameters))
        ndim = initial_parameters.size
        nwalkers = ndim * chains_per_parameter
        logging.info('Initializing {} chains with {} chains per parameter.'.format(nwalkers, chains_per_parameter))
        walkers = generate_parameter_distribution(initial_parameters, nwalkers, chain_std_deviation, deterministic=deterministic)

    # the pool must implement a map function
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, kwargs=error_context, pool=scheduler)
    if deterministic:
        from espei.rstate import numpy_rstate
        sampler.random_state = numpy_rstate
        logging.info('Using a deterministic ensemble sampler.')

    progbar_width = 30
    logging.info('Running MCMC with {} steps.'.format(mcmc_steps))
    try:
        for i, _ in enumerate(sampler.sample(walkers, iterations=mcmc_steps)):
            if (i + 1) % save_interval == 0:
                save_sampler_state(sampler)
                logging.debug('Acceptance ratios for parameters: {}'.format(sampler.acceptance_fraction))
            # text progress bar; the trailing newline means each update is
            # printed on its own line rather than overwriting in place
            n = int((progbar_width + 1) * float(i) / mcmc_steps)
            sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#'*n, ' '*(progbar_width - n), i + 1, mcmc_steps))
        # final progress bar update after the loop completes
        n = int((progbar_width + 1) * float(i + 1) / mcmc_steps)
        sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#'*n, ' '*(progbar_width - n), i + 1, mcmc_steps))
    except KeyboardInterrupt:
        # allow the user to stop sampling early; partial results are still saved
        pass

    # final processing
    save_sampler_state(sampler)
    optimal_params = optimal_parameters(sampler.chain, sampler.lnprobability)
    logging.debug('Initial parameters: {}'.format(initial_parameters))
    logging.debug('Optimal parameters: {}'.format(optimal_params))
    logging.debug('Change in parameters: {}'.format(np.abs(initial_parameters - optimal_params) / initial_parameters))
    # write the optimized parameter values back into the database
    for param_name, value in zip(symbols_to_fit, optimal_params):
        dbf.symbols[param_name] = value
    logging.info('MCMC complete.')

    return dbf, sampler
def eq_calc_samples(dbf, conds, params, client=None, comps=None,
                    phases=None, savef=None):
    """
    Perform equilibrium calculations for the parameter sets in params

    Parameters
    ----------
    dbf : Database
        Thermodynamic database containing the relevant parameters
    conds : dict or list of dict
        StateVariables and their corresponding value
    params : numpy array
        Array where the rows contain the parameter sets
        for the pycalphad equilibrium calculation
    client : Client, optional
        interface to dask.distributed compute cluster
    comps : list
        Names of components to consider in the calculation
    phases : list or dict
        Names of phases to consider in the calculation
    savef : str
        Save file for the equilibrium calculations

    Returns
    -------
    structured equilibrium calculation
        structured equilibrium calculations for parameter
        sets in params

    Examples
    --------
    >>> # let's do a multicore example
    >>> # first import modules and functions
    >>> import numpy as np
    >>> from pycalphad import Database, variables as v
    >>> from distributed import Client
    >>> from distributed.deploy.local import LocalCluster
    >>> from pduq.dbf_calc import eq_calc_samples
    >>> # start the distributed client to parallelize the calculation
    >>> c = LocalCluster(n_workers=2, threads_per_worker=1)
    >>> client = Client(c)
    >>> # load the pycalphad database
    >>> dbf = Database('CU-MG_param_gen.tdb')
    >>> # load the parameter file
    >>> params = np.loadtxt('trace.csv', delimiter=',')
    >>> # define the equilibrium conditions
    >>> conds = {v.P: 101325, v.T: 1003, v.X('MG'): 0.214}
    >>> # perform the parallel equilibrium calculations for the last two
    >>> # parameter sets in param
    >>> eqC = eq_calc_samples(dbf, conds, params[-2:, :], client=client)
    >>> # let's look at the phases in equilibrium for the two parameter
    >>> # sets
    >>> print(np.squeeze(eqC.Phase.values))
    [['FCC_A1' 'LAVES_C15' '']
     ['LIQUID' '' '']]
    """
    if comps is None:
        comps = list(dbf.elements)
    if phases is None:
        phases = list(dbf.phases.keys())

    symbols_to_fit = database_symbols_to_fit(dbf)

    # Pre-built pycalphad callables are currently disabled; equilibrium
    # builds its own callables for each call.
    # eq_callables = get_eq_callables_(dbf, comps, phases, symbols_to_fit)
    eq_callables = None

    kwargs = {
        'dbf': dbf, 'comps': comps, 'phases': phases,
        'conds': conds, 'params': params,
        'symbols_to_fit': symbols_to_fit,
        'eq_callables': eq_callables}

    # Split the parameter-set indices round-robin into at most 20 chunks
    # so the work can be spread across dask workers.
    neq = params.shape[0]
    nch = min(neq, 20)
    chunks = [list(range(neq))[ii::nch] for ii in range(nch)]

    if client is None:
        # serial fallback: evaluate each chunk locally
        eqL = []
        for chunk in chunks:
            eqL += eq_calc_chunk_(chunk, **kwargs)
    else:
        # fan the chunks out to the cluster and gather the results
        A = client.map(eq_calc_chunk_, chunks, **kwargs)
        eqL = client.gather(A)
        eqL = list(chain.from_iterable(eqL))
        client.close()

    # stack the per-sample equilibrium results along a new 'sample' dimension
    eqC = xr.concat(eqL, 'sample')
    eqC.coords['sample'] = np.arange(neq)

    logging.info(str(eqC))

    if savef is not None:
        with open(savef, 'wb') as buff:
            pickle.dump(eqC, buff)

    return eqC
def mcmc_fit(dbf, datasets, iterations=1000, save_interval=1, chains_per_parameter=2,
             chain_std_deviation=0.1, scheduler=None, tracefile=None, probfile=None,
             restart_trace=None, deterministic=True, prior=None, mcmc_data_weights=None):
    """
    Deprecated convenience wrapper that runs MCMC via the EmceeOptimizer class.

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit with symbols to fit prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase data to fit
    iterations : int
        Number of trace iterations to calculate in MCMC. Default is 1000.
    save_interval : int
        interval of iterations to save the tracefile and probfile
    chains_per_parameter : int
        number of chains for each parameter. Must be an even integer greater
        or equal to 2. Defaults to 2.
    chain_std_deviation : float
        standard deviation of normal for parameter initialization as a
        fraction of each parameter. Must be greater than 0. Default is 0.1,
        which is 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        filename to store the trace with NumPy.save. Array has shape
        (chains, iterations, parameters)
    probfile : str
        filename to store the log probability with NumPy.save. Has shape
        (chains, iterations)
    restart_trace : np.ndarray
        ndarray of the previous trace. Should have shape
        (chains, iterations, parameters)
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic
        sampling draws. This will ensure that runs with the exact same
        database, chains_per_parameter, and chain_std_deviation (or
        restart_trace) produce exactly the same results.
    prior : str
        Prior to use to generate priors. Defaults to 'zero', which keeps
        backwards compatibility. Can currently choose 'normal', 'uniform',
        'triangular', or 'zero'.
    mcmc_data_weights : dict
        Dictionary of weights for each data type, e.g. {'ZPF': 20, 'HM': 2}

    Returns
    -------
    (Database, EnsembleSampler)
        The database with optimized parameters committed and the underlying
        emcee sampler.
    """
    warnings.warn("The mcmc convenience function will be removed in ESPEI 0.8")

    # collect the fit settings once so the delegation below stays readable
    fit_settings = dict(
        prior=prior,
        iterations=iterations,
        chains_per_parameter=chains_per_parameter,
        chain_std_deviation=chain_std_deviation,
        deterministic=deterministic,
        restart_trace=restart_trace,
        tracefile=tracefile,
        probfile=probfile,
        mcmc_data_weights=mcmc_data_weights,
    )

    # delegate the actual sampling to EmceeOptimizer
    optimizer = EmceeOptimizer(dbf, scheduler=scheduler)
    optimizer.save_interval = save_interval
    optimizer.fit(database_symbols_to_fit(dbf), datasets, **fit_settings)
    optimizer.commit()

    return optimizer.dbf, optimizer.sampler