def _fit(
    self,
    symbols,
    ds,
    prior=None,
    iterations=1000,
    chains_per_parameter=2,
    chain_std_deviation=0.1,
    deterministic=True,
    restart_trace=None,
    tracefile=None,
    probfile=None,
    mcmc_data_weights=None,
    approximate_equilibrium=False,
):
    """
    Parameters
    ----------
    symbols : list of str
    ds : PickleableTinyDB
    prior : str
        Prior to use to generate priors. Defaults to 'zero', which keeps
        backwards compatibility. Can currently choose 'normal', 'uniform',
        'triangular', or 'zero'.
    iterations : int
        Number of iterations to calculate in MCMC. Default is 1000.
    chains_per_parameter : int
        Number of chains for each parameter. Must be an even integer greater
        or equal to 2. Defaults to 2.
    chain_std_deviation : float
        Standard deviation of the normal for parameter initialization as a
        fraction of each parameter. Must be greater than 0. Defaults to 0.1.
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic
        sampling draws. This ensures that runs with the exact same database,
        chains_per_parameter, and chain_std_deviation (or restart_trace) will
        produce exactly the same results.
    restart_trace : np.ndarray
        ndarray of the previous trace. Should have shape
        (chains, iterations, parameters)
    tracefile : str
        Filename to store the trace with NumPy.save. Array has shape
        (chains, iterations, parameters)
    probfile : str
        Filename to store the log probability with NumPy.save. Has shape
        (chains, iterations)
    mcmc_data_weights : dict
        Dictionary of weights for each data type, e.g. {'ZPF': 20, 'HM': 2}

    Returns
    -------
    Dict[str, float]
    """
    # Set NumPy print options so logged arrays print on one line. Reset at the end.
    np.set_printoptions(linewidth=sys.maxsize)
    cbs = self.scheduler is None
    ctx = setup_context(self.dbf, ds, symbols, data_weights=mcmc_data_weights,
                        phase_models=self.phase_models, make_callables=cbs)
    symbols_to_fit = ctx['symbols_to_fit']
    initial_guess = np.array([unpack_piecewise(self.dbf.symbols[s]) for s in symbols_to_fit])
    prior_dict = self.get_priors(prior, symbols_to_fit, initial_guess)
    ctx.update(prior_dict)
    if 'zpf_kwargs' in ctx:
        ctx['zpf_kwargs']['approximate_equilibrium'] = approximate_equilibrium
    if 'equilibrium_thermochemical_kwargs' in ctx:
        ctx['equilibrium_thermochemical_kwargs']['approximate_equilibrium'] = approximate_equilibrium

    # Run the initial parameters for guessing purposes:
    _log.trace("Probability for initial parameters")
    self.predict(initial_guess, **ctx)

    if restart_trace is not None:
        chains = self.initialize_chains_from_trace(restart_trace)
        # TODO: check that the shape is valid with the existing parameters
    else:
        chains = self.initialize_new_chains(initial_guess, chains_per_parameter,
                                            chain_std_deviation, deterministic)

    sampler = emcee.EnsembleSampler(chains.shape[0], initial_guess.size,
                                    self.predict, kwargs=ctx, pool=self.scheduler)
    if deterministic:
        from espei.rstate import numpy_rstate
        sampler.random_state = numpy_rstate
        _log.info('Using a deterministic ensemble sampler.')
    self.sampler = sampler
    self.tracefile = tracefile
    self.probfile = probfile
    # Run the MCMC simulation
    self.do_sampling(chains, iterations)
    # Post process
    optimal_params = optimal_parameters(sampler.chain, sampler.lnprobability)
    _log.trace('Initial parameters: %s', initial_guess)
    _log.trace('Optimal parameters: %s', optimal_params)
    _log.trace('Change in parameters: %s', np.abs(initial_guess - optimal_params) / initial_guess)
    parameters = dict(zip(symbols_to_fit, optimal_params))
    np.set_printoptions(linewidth=75)
    return parameters
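# A minimal, self-contained sketch (hypothetical helper, not part of espei) of
# the determinism guarantee documented above: a seeded NumPy random state
# reproduces identical draws, which is why assigning a fixed random state to
# the emcee sampler makes runs repeatable. The seed value here is arbitrary;
# espei.rstate.numpy_rstate is the pre-built state actually assigned.
def _example_deterministic_draws():
    import numpy as np
    state_a = np.random.RandomState(42)  # same (arbitrary) seed...
    state_b = np.random.RandomState(42)
    assert np.allclose(state_a.normal(size=5), state_b.normal(size=5))  # ...same draws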
def mcmc_fit(dbf, datasets, mcmc_steps=1000, save_interval=100,
             chains_per_parameter=2, chain_std_deviation=0.1, scheduler=None,
             tracefile=None, probfile=None, restart_chain=None,
             deterministic=True):
    """Run Markov Chain Monte Carlo on the Database given datasets

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit with symbols to fit prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase data to fit
    mcmc_steps : int
        Number of chain steps to calculate in MCMC. Note the flattened chain
        will have (mcmc_steps*DOF) values. Default is 1000 steps.
    save_interval : int
        Interval of steps at which to save the chain to the tracefile and probfile
    chains_per_parameter : int
        Number of chains for each parameter. Must be an even integer greater
        or equal to 2. Defaults to 2.
    chain_std_deviation : float
        Standard deviation of the normal for parameter initialization as a
        fraction of each parameter. Must be greater than 0. Default is 0.1,
        which is 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        Filename to store the flattened chain with NumPy.save. Array has
        shape (nwalkers, iterations, nparams)
    probfile : str
        Filename to store the flattened ln probability with NumPy.save
    restart_chain : np.ndarray
        ndarray of the previous chain. Should have shape
        (nwalkers, iterations, nparams)
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic
        sampling draws. This ensures that runs with the exact same database,
        chains_per_parameter, and chain_std_deviation (or restart_chain) will
        produce exactly the same results.

    Returns
    -------
    dbf : Database
        Resulting pycalphad database of optimized parameters
    sampler : EnsembleSampler
        emcee sampler for further data wrangling
    """
    comps = sorted([sp for sp in dbf.elements])
    symbols_to_fit = database_symbols_to_fit(dbf)

    if len(symbols_to_fit) == 0:
        raise ValueError('No degrees of freedom. Database must contain symbols starting with \'V\' or \'VV\', followed by a number.')
    else:
        logging.info('Fitting {} degrees of freedom.'.format(len(symbols_to_fit)))

    for x in symbols_to_fit:
        if isinstance(dbf.symbols[x], sympy.Piecewise):
            logging.debug('Replacing {} in database'.format(x))
            dbf.symbols[x] = dbf.symbols[x].args[0].expr

    # get initial parameters and remove these from the database
    # we'll replace them with SymPy symbols initialized to 0 in the phase models
    initial_parameters = np.array([np.array(float(dbf.symbols[x])) for x in symbols_to_fit])
    for x in symbols_to_fit:
        del dbf.symbols[x]

    # construct the models for each phase, substituting in the SymPy symbol to fit.
    logging.debug('Building phase models')
    phase_models = dict()
    phases = sorted(dbf.phases.keys())
    for phase_name in phases:
        # 0 is a placeholder value for each symbol to fit
        mod = CompiledModel(dbf, comps, phase_name,
                            parameters=OrderedDict([(sympy.Symbol(s), 0) for s in symbols_to_fit]))
        phase_models[phase_name] = mod
    logging.debug('Finished building phase models')

    # context for the log probability function
    error_context = {'comps': comps, 'dbf': dbf,
                     'phases': phases, 'phase_models': phase_models,
                     'datasets': datasets, 'symbols_to_fit': symbols_to_fit,
                     }

    def save_sampler_state(sampler):
        if tracefile:
            logging.debug('Writing chain to {}'.format(tracefile))
            np.save(tracefile, sampler.chain)
        if probfile:
            logging.debug('Writing lnprob to {}'.format(probfile))
            np.save(probfile, sampler.lnprobability)

    # initialize the walkers either fresh or from the restart
    if restart_chain is not None:
        walkers = restart_chain[np.nonzero(restart_chain)].reshape(
            (restart_chain.shape[0], -1, restart_chain.shape[2]))[:, -1, :]
        nwalkers = walkers.shape[0]
        ndim = walkers.shape[1]
        initial_parameters = walkers.mean(axis=0)
        logging.info('Restarting from previous calculation with {} chains ({} per parameter).'.format(nwalkers, nwalkers / ndim))
        logging.debug('Means of restarting parameters are {}'.format(initial_parameters))
        logging.debug('Standard deviations of restarting parameters are {}'.format(walkers.std(axis=0)))
    else:
        logging.debug('Initial parameters: {}'.format(initial_parameters))
        ndim = initial_parameters.size
        nwalkers = ndim * chains_per_parameter
        logging.info('Initializing {} chains with {} chains per parameter.'.format(nwalkers, chains_per_parameter))
        walkers = generate_parameter_distribution(initial_parameters, nwalkers,
                                                  chain_std_deviation,
                                                  deterministic=deterministic)

    # the pool must implement a map function
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, kwargs=error_context, pool=scheduler)
    if deterministic:
        from espei.rstate import numpy_rstate
        sampler.random_state = numpy_rstate
        logging.info('Using a deterministic ensemble sampler.')

    progbar_width = 30
    logging.info('Running MCMC with {} steps.'.format(mcmc_steps))
    try:
        for i, result in enumerate(sampler.sample(walkers, iterations=mcmc_steps)):
            # progress bar
            if (i + 1) % save_interval == 0:
                save_sampler_state(sampler)
                logging.debug('Acceptance ratios for parameters: {}'.format(sampler.acceptance_fraction))
            n = int((progbar_width + 1) * float(i) / mcmc_steps)
            sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#'*n, ' '*(progbar_width - n), i + 1, mcmc_steps))
        n = int((progbar_width + 1) * float(i + 1) / mcmc_steps)
        sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#'*n, ' '*(progbar_width - n), i + 1, mcmc_steps))
    except KeyboardInterrupt:
        pass

    # final processing
    save_sampler_state(sampler)
    optimal_params = optimal_parameters(sampler.chain, sampler.lnprobability)
    logging.debug('Initial parameters: {}'.format(initial_parameters))
    logging.debug('Optimal parameters: {}'.format(optimal_params))
    logging.debug('Change in parameters: {}'.format(np.abs(initial_parameters - optimal_params) / initial_parameters))
    for param_name, value in zip(symbols_to_fit, optimal_params):
        dbf.symbols[param_name] = value
    logging.info('MCMC complete.')
    return dbf, sampler
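# A minimal sketch (hypothetical helper, not part of espei) of the walker
# initialization that generate_parameter_distribution performs per the
# docstring above: each chain starts at the initial parameters perturbed by a
# normal draw whose standard deviation is chain_std_deviation times the
# parameter magnitude. This mirrors the documented behavior, not necessarily
# the exact implementation.
def _example_init_walkers():
    import numpy as np
    initial_parameters = np.array([1000.0, -50.0])
    nwalkers, chain_std_deviation = 4, 0.1
    rng = np.random.RandomState(0)  # fixed seed, as in the deterministic case
    scale = chain_std_deviation * np.abs(initial_parameters)
    walkers = rng.normal(loc=initial_parameters, scale=scale,
                         size=(nwalkers, initial_parameters.size))
    return walkers  # shape (nwalkers, ndim)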
def mcmc_fit(dbf, datasets, iterations=1000, save_interval=100,
             chains_per_parameter=2, chain_std_deviation=0.1, scheduler=None,
             tracefile=None, probfile=None, restart_trace=None,
             deterministic=True):
    """
    Run Markov Chain Monte Carlo on the Database given datasets

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit with symbols to fit prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase data to fit
    iterations : int
        Number of trace iterations to calculate in MCMC. Default is 1000 iterations.
    save_interval : int
        Interval of iterations at which to save the tracefile and probfile
    chains_per_parameter : int
        Number of chains for each parameter. Must be an even integer greater
        or equal to 2. Defaults to 2.
    chain_std_deviation : float
        Standard deviation of the normal for parameter initialization as a
        fraction of each parameter. Must be greater than 0. Default is 0.1,
        which is 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        Filename to store the trace with NumPy.save. Array has shape
        (chains, iterations, parameters)
    probfile : str
        Filename to store the log probability with NumPy.save. Has shape
        (chains, iterations)
    restart_trace : np.ndarray
        ndarray of the previous trace. Should have shape
        (chains, iterations, parameters)
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic
        sampling draws. This ensures that runs with the exact same database,
        chains_per_parameter, and chain_std_deviation (or restart_trace) will
        produce exactly the same results.

    Returns
    -------
    dbf : Database
        Resulting pycalphad database of optimized parameters
    sampler : EnsembleSampler
        emcee sampler for further data wrangling
    """
    comps = sorted([sp for sp in dbf.elements])
    symbols_to_fit = database_symbols_to_fit(dbf)

    if len(symbols_to_fit) == 0:
        raise ValueError('No degrees of freedom. Database must contain symbols starting with \'V\' or \'VV\', followed by a number.')
    else:
        logging.info('Fitting {} degrees of freedom.'.format(len(symbols_to_fit)))

    for x in symbols_to_fit:
        if isinstance(dbf.symbols[x], sympy.Piecewise):
            logging.debug('Replacing {} in database'.format(x))
            dbf.symbols[x] = dbf.symbols[x].args[0].expr

    # get initial parameters and remove these from the database
    # we'll replace them with SymPy symbols initialized to 0 in the phase models
    initial_parameters = np.array([np.array(float(dbf.symbols[x])) for x in symbols_to_fit])

    # construct the models for each phase, substituting in the SymPy symbol to fit.
    logging.debug('Building phase models (this may take some time)')
    phases = sorted(dbf.phases.keys())
    sympy_symbols_to_fit = [sympy.Symbol(sym) for sym in symbols_to_fit]
    orig_parameters = {sym: p for sym, p in zip(symbols_to_fit, initial_parameters)}
    eq_callables = build_callables(dbf, comps, phases, model=Model, parameters=orig_parameters)
    # because error_context expects a 'phase_models' key, rename it
    eq_callables['phase_models'] = eq_callables.pop('model')
    eq_callables.pop('phase_records')
    # we also need to build models that have no ideal mixing for the
    # thermochemical error, and to build them for each property we might calculate
    # TODO: potential optimization to only calculate for phase/property combos that we have in the datasets
    # first construct a dict of models without ideal mixing
    mods_no_idmix = {}
    for phase_name in phases:
        # we have to pass the list of Symbol objects to fit so they are popped
        # from the database and can properly be replaced.
        mods_no_idmix[phase_name] = Model(dbf, comps, phase_name, parameters=sympy_symbols_to_fit)
        mods_no_idmix[phase_name].models['idmix'] = 0
    # now construct callables for each possible property that can be calculated
    thermochemical_callables = {}  # will be a dict of {output_property: eq_callables_dict}
    whitelist_properties = ['HM', 'SM', 'CPM']
    whitelist_properties = whitelist_properties + [prop + '_MIX' for prop in whitelist_properties]
    for prop in whitelist_properties:
        thermochemical_callables[prop] = build_callables(dbf, comps, phases,
                                                         model=mods_no_idmix,
                                                         output=prop,
                                                         parameters=orig_parameters,
                                                         build_gradients=False)
        # pop off the callables not used in properties because we don't want
        # them around (they should be None, anyways)
        thermochemical_callables[prop].pop('phase_records')
        thermochemical_callables[prop].pop('model')
    logging.debug('Finished building phase models')

    # context for the log probability function
    error_context = {'comps': comps, 'dbf': dbf,
                     'phases': phases, 'phase_models': eq_callables['phase_models'],
                     'datasets': datasets, 'symbols_to_fit': symbols_to_fit,
                     'thermochemical_callables': thermochemical_callables,
                     'callables': eq_callables,
                     }

    def save_sampler_state(sampler):
        if tracefile:
            logging.debug('Writing trace to {}'.format(tracefile))
            np.save(tracefile, sampler.chain)
        if probfile:
            logging.debug('Writing lnprob to {}'.format(probfile))
            np.save(probfile, sampler.lnprobability)

    # initialize the walkers either fresh or from the restart
    if restart_trace is not None:
        walkers = restart_trace[np.nonzero(restart_trace)].reshape(
            (restart_trace.shape[0], -1, restart_trace.shape[2]))[:, -1, :]
        nwalkers = walkers.shape[0]
        ndim = walkers.shape[1]
        initial_parameters = walkers.mean(axis=0)
        logging.info('Restarting from previous calculation with {} chains ({} per parameter).'.format(nwalkers, nwalkers / ndim))
        logging.debug('Means of restarting parameters are {}'.format(initial_parameters))
        logging.debug('Standard deviations of restarting parameters are {}'.format(walkers.std(axis=0)))
    else:
        logging.debug('Initial parameters: {}'.format(initial_parameters))
        ndim = initial_parameters.size
        nwalkers = ndim * chains_per_parameter
        logging.info('Initializing {} chains with {} chains per parameter.'.format(nwalkers, chains_per_parameter))
        walkers = generate_parameter_distribution(initial_parameters, nwalkers,
                                                  chain_std_deviation,
                                                  deterministic=deterministic)

    # the pool must implement a map function
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, kwargs=error_context, pool=scheduler)
    if deterministic:
        from espei.rstate import numpy_rstate
        sampler.random_state = numpy_rstate
        logging.info('Using a deterministic ensemble sampler.')

    progbar_width = 30
    logging.info('Running MCMC for {} iterations.'.format(iterations))
    try:
        for i, result in enumerate(sampler.sample(walkers, iterations=iterations)):
            # progress bar
            if (i + 1) % save_interval == 0:
                save_sampler_state(sampler)
                logging.debug('Acceptance ratios for parameters: {}'.format(sampler.acceptance_fraction))
            n = int((progbar_width + 1) * float(i) / iterations)
            sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#' * n, ' ' * (progbar_width - n), i + 1, iterations))
        n = int((progbar_width + 1) * float(i + 1) / iterations)
        sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#' * n, ' ' * (progbar_width - n), i + 1, iterations))
    except KeyboardInterrupt:
        pass

    # final processing
    save_sampler_state(sampler)
    optimal_params = optimal_parameters(sampler.chain, sampler.lnprobability)
    logging.debug('Initial parameters: {}'.format(initial_parameters))
    logging.debug('Optimal parameters: {}'.format(optimal_params))
    logging.debug('Change in parameters: {}'.format(np.abs(initial_parameters - optimal_params) / initial_parameters))
    for param_name, value in zip(symbols_to_fit, optimal_params):
        dbf.symbols[param_name] = value
    logging.info('MCMC complete.')
    return dbf, sampler
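# A minimal, self-contained demonstration (hypothetical helper, not part of
# espei) of the restart logic above: a saved trace is zero-filled past the
# last completed iteration, so the current walker positions are recovered by
# dropping the zero entries and taking the final completed iteration.
# Synthetic shapes stand in for a real restart_trace loaded from a tracefile.
def _example_restart_walkers():
    import numpy as np
    nwalkers, max_iters, nparams = 4, 10, 2
    trace = np.zeros((nwalkers, max_iters, nparams))
    rng = np.random.RandomState(0)
    trace[:, :6, :] = rng.uniform(0.5, 1.5, size=(nwalkers, 6, nparams))  # 6 completed iterations
    walkers = trace[np.nonzero(trace)].reshape((nwalkers, -1, nparams))[:, -1, :]
    assert walkers.shape == (nwalkers, nparams)
    assert np.allclose(walkers, trace[:, 5, :])  # last completed iteration
    return walkers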
def optimal_parameters_dict(dbf, trace, lnprob):
    return dict(zip(parameter_labels(dbf, formatted=False),
                    optimal_parameters(trace, lnprob, 0)))
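# A minimal sketch (hypothetical helper, not part of espei) of the core idea
# behind optimal_parameters as used above: the optimal parameter set is the
# trace entry with the highest log probability. Illustrative only, with
# synthetic arrays in place of a real trace; espei's optimal_parameters also
# takes a third argument (0 above) that is not modeled here.
def _example_best_trace_entry():
    import numpy as np
    rng = np.random.RandomState(0)
    trace = rng.normal(size=(4, 10, 2))   # (chains, iterations, parameters)
    lnprob = rng.normal(size=(4, 10))     # (chains, iterations)
    chain_idx, iter_idx = np.unravel_index(np.argmax(lnprob), lnprob.shape)
    return trace[chain_idx, iter_idx, :]  # parameter set with max lnprob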