Example #1
    def _fit(
        self,
        symbols,
        ds,
        prior=None,
        iterations=1000,
        chains_per_parameter=2,
        chain_std_deviation=0.1,
        deterministic=True,
        restart_trace=None,
        tracefile=None,
        probfile=None,
        mcmc_data_weights=None,
        approximate_equilibrium=False,
    ):
        """

        Parameters
        ----------
        symbols : list of str
        ds : PickleableTinyDB
        prior : str
            Name of the prior distribution to use for the parameters. Defaults
            to 'zero', which keeps backwards compatibility. Can currently
            choose 'normal', 'uniform', 'triangular', or 'zero'.
        iterations : int
            Number of iterations to calculate in MCMC. Default is 1000.
        chains_per_parameter : int
            Number of chains for each parameter. Must be an even integer
            greater than or equal to 2. Defaults to 2.
        chain_std_deviation : float
            Standard deviation of the normal distribution used to initialize
            the chains, as a fraction of each parameter. Must be greater than
            0. Defaults to 0.1.
        deterministic : bool
            If True, the emcee sampler will be seeded to give deterministic
            sampling draws. This ensures that runs with the exact same
            database, chains_per_parameter, and chain_std_deviation (or
            restart_trace) produce exactly the same results.
        restart_trace : np.ndarray
            ndarray of the previous trace. Should have shape (chains, iterations, parameters)
        tracefile : str
            filename to store the trace with numpy.save. Array has shape
            (chains, iterations, parameters)
        probfile : str
            filename to store the log probability with numpy.save. Has shape
            (chains, iterations)
        mcmc_data_weights : dict
            Dictionary of weights for each data type, e.g. {'ZPF': 20, 'HM': 2}
        approximate_equilibrium : bool
            If True, use an approximate version of the equilibrium solver for
            the ZPF and equilibrium thermochemical likelihood calculations.

        Returns
        -------
        Dict[str, float]
            Optimized parameter values keyed by symbol name.

        """
        # Set NumPy print options so logged arrays print on one line. Reset at the end.
        np.set_printoptions(linewidth=sys.maxsize)
        # Build fast callables only when running without a scheduler.
        cbs = self.scheduler is None
        ctx = setup_context(self.dbf,
                            ds,
                            symbols,
                            data_weights=mcmc_data_weights,
                            phase_models=self.phase_models,
                            make_callables=cbs)
        symbols_to_fit = ctx['symbols_to_fit']
        initial_guess = np.array(
            [unpack_piecewise(self.dbf.symbols[s]) for s in symbols_to_fit])

        prior_dict = self.get_priors(prior, symbols_to_fit, initial_guess)
        ctx.update(prior_dict)
        if 'zpf_kwargs' in ctx:
            ctx['zpf_kwargs']['approximate_equilibrium'] = approximate_equilibrium
        if 'equilibrium_thermochemical_kwargs' in ctx:
            ctx['equilibrium_thermochemical_kwargs']['approximate_equilibrium'] = approximate_equilibrium
        # Evaluate the probability of the initial parameters for logging/diagnostics:
        _log.trace("Probability for initial parameters")
        self.predict(initial_guess, **ctx)
        if restart_trace is not None:
            chains = self.initialize_chains_from_trace(restart_trace)
            # TODO: check that the shape is valid with the existing parameters
        else:
            chains = self.initialize_new_chains(initial_guess,
                                                chains_per_parameter,
                                                chain_std_deviation,
                                                deterministic)
        sampler = emcee.EnsembleSampler(chains.shape[0],
                                        initial_guess.size,
                                        self.predict,
                                        kwargs=ctx,
                                        pool=self.scheduler)
        if deterministic:
            from espei.rstate import numpy_rstate
            sampler.random_state = numpy_rstate
            _log.info('Using a deterministic ensemble sampler.')
        self.sampler = sampler
        self.tracefile = tracefile
        self.probfile = probfile
        # Run the MCMC simulation
        self.do_sampling(chains, iterations)

        # Post process
        optimal_params = optimal_parameters(sampler.chain,
                                            sampler.lnprobability)
        _log.trace('Initial parameters: %s', initial_guess)
        _log.trace('Optimal parameters: %s', optimal_params)
        _log.trace('Change in parameters: %s',
                   np.abs(initial_guess - optimal_params) / initial_guess)
        parameters = dict(zip(symbols_to_fit, optimal_params))
        np.set_printoptions(linewidth=75)
        return parameters
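Usage sketch (hypothetical, not from the source): inspecting the files written
via the `tracefile` and `probfile` arguments of `_fit` above. The file names
are illustrative, and the zero-masking follows the restart handling shown in
the later examples, which assumes unwritten iterations are stored as zeros.

import numpy as np

trace = np.load('trace.npy')    # shape (chains, iterations, parameters)
lnprob = np.load('lnprob.npy')  # shape (chains, iterations)

# Keep only iterations that have been written (unwritten steps are zeros).
completed = np.any(lnprob != 0, axis=0)
trace, lnprob = trace[:, completed, :], lnprob[:, completed]

# Index of the highest log-probability sample across all chains.
chain_idx, step_idx = np.unravel_index(np.argmax(lnprob), lnprob.shape)
print('Best parameters:', trace[chain_idx, step_idx, :])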
Example #2
def mcmc_fit(dbf, datasets, mcmc_steps=1000, save_interval=100, chains_per_parameter=2,
             chain_std_deviation=0.1, scheduler=None, tracefile=None, probfile=None,
             restart_chain=None, deterministic=True,):
    """Run Markov Chain Monte Carlo on the Database given datasets

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit with symbols to fit prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase data to fit
    mcmc_steps : int
        Number of steps for each chain to take in MCMC. Note that the
        flattened chain will have (mcmc_steps * nwalkers) samples. Default is
        1000 steps.
    save_interval : int
        interval of steps at which to save the chain to the tracefile and
        probfile
    chains_per_parameter : int
        number of chains for each parameter. Must be an even integer greater
        than or equal to 2. Defaults to 2.
    chain_std_deviation : float
        standard deviation of the normal distribution used to initialize the
        chains, as a fraction of each parameter. Must be greater than 0.
        Default is 0.1, which is 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        filename to store the chain with numpy.save. Array has shape
        (nwalkers, iterations, nparams)
    probfile : str
        filename to store the ln probability with numpy.save. Has shape
        (nwalkers, iterations)
    restart_chain : np.ndarray
        ndarray of the previous chain. Should have shape (nwalkers, iterations, nparams)
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic
        sampling draws. This ensures that runs with the exact same database,
        chains_per_parameter, and chain_std_deviation (or restart_chain)
        produce exactly the same results.

    Returns
    -------
    dbf : Database
        Resulting pycalphad database of optimized parameters
    sampler : emcee.EnsembleSampler
        emcee sampler for further data wrangling
    """
    comps = sorted(dbf.elements)
    symbols_to_fit = database_symbols_to_fit(dbf)

    if len(symbols_to_fit) == 0:
        raise ValueError('No degrees of freedom. Database must contain symbols starting with \'V\' or \'VV\', followed by a number.')
    else:
        logging.info('Fitting {} degrees of freedom.'.format(len(symbols_to_fit)))

    for x in symbols_to_fit:
        if isinstance(dbf.symbols[x], sympy.Piecewise):
            logging.debug('Replacing {} in database'.format(x))
            dbf.symbols[x] = dbf.symbols[x].args[0].expr

    # get initial parameters and remove these from the database
    # we'll replace them with SymPy symbols initialized to 0 in the phase models
    initial_parameters = np.array([float(dbf.symbols[x]) for x in symbols_to_fit])
    for x in symbols_to_fit:
        del dbf.symbols[x]

    # construct the models for each phase, substituting in the SymPy symbol to fit.
    phase_models = dict()
    logging.debug('Building phase models')
    # 0 is placeholder value
    phases = sorted(dbf.phases.keys())
    for phase_name in phases:
        mod = CompiledModel(dbf, comps, phase_name, parameters=OrderedDict([(sympy.Symbol(s), 0) for s in symbols_to_fit]))
        phase_models[phase_name] = mod
    logging.debug('Finished building phase models')

    # context for the log probability function
    error_context = {'comps': comps, 'dbf': dbf,
                     'phases': phases,
                     'phase_models': phase_models,
                     'datasets': datasets, 'symbols_to_fit': symbols_to_fit,
                     }

    def save_sampler_state(sampler):
        if tracefile:
            logging.debug('Writing chain to {}'.format(tracefile))
            np.save(tracefile, sampler.chain)
        if probfile:
            logging.debug('Writing lnprob to {}'.format(probfile))
            np.save(probfile, sampler.lnprobability)


    # initialize the walkers either fresh or from the restart
    if restart_chain is not None:
        walkers = restart_chain[np.nonzero(restart_chain)].reshape(
            (restart_chain.shape[0], -1, restart_chain.shape[2]))[:, -1, :]
        nwalkers = walkers.shape[0]
        ndim = walkers.shape[1]
        initial_parameters = walkers.mean(axis=0)
        logging.info('Restarting from previous calculation with {} chains ({} per parameter).'.format(nwalkers, nwalkers // ndim))
        logging.debug('Means of restarting parameters are {}'.format(initial_parameters))
        logging.debug('Standard deviations of restarting parameters are {}'.format(walkers.std(axis=0)))
    else:
        logging.debug('Initial parameters: {}'.format(initial_parameters))
        ndim = initial_parameters.size
        nwalkers = ndim * chains_per_parameter
        logging.info('Initializing {} chains with {} chains per parameter.'.format(nwalkers, chains_per_parameter))
        walkers = generate_parameter_distribution(initial_parameters, nwalkers, chain_std_deviation, deterministic=deterministic)

    # the pool must implement a map function
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, kwargs=error_context, pool=scheduler)
    if deterministic:
        from espei.rstate import numpy_rstate
        sampler.random_state = numpy_rstate
        logging.info('Using a deterministic ensemble sampler.')
    progbar_width = 30
    logging.info('Running MCMC with {} steps.'.format(mcmc_steps))
    try:
        for i, result in enumerate(sampler.sample(walkers, iterations=mcmc_steps)):
            # progress bar
            if (i + 1) % save_interval == 0:
                save_sampler_state(sampler)
                logging.debug('Acceptance ratios for parameters: {}'.format(sampler.acceptance_fraction))
            n = int((progbar_width + 1) * float(i) / mcmc_steps)
            sys.stdout.write("\r[{0}{1}] ({2} of {3})".format('#'*n, ' '*(progbar_width - n), i + 1, mcmc_steps))
            sys.stdout.flush()
        # finish the progress bar with a trailing newline
        n = int((progbar_width + 1) * float(i + 1) / mcmc_steps)
        sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format('#'*n, ' '*(progbar_width - n), i + 1, mcmc_steps))
    except KeyboardInterrupt:
        pass
    # final processing
    save_sampler_state(sampler)
    optimal_params = optimal_parameters(sampler.chain, sampler.lnprobability)
    logging.debug('Initial parameters: {}'.format(initial_parameters))
    logging.debug('Optimal parameters: {}'.format(optimal_params))
    logging.debug('Change in parameters: {}'.format(np.abs(initial_parameters - optimal_params) / initial_parameters))
    for param_name, value in zip(symbols_to_fit, optimal_params):
        dbf.symbols[param_name] = value
    logging.info('MCMC complete.')
    return dbf, sampler
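Usage sketch (hypothetical): driving the `mcmc_fit` above end to end. The
`Database`, `load_datasets`, and `recursive_glob` imports follow the public
pycalphad/ESPEI APIs; file names are illustrative.

from pycalphad import Database
from espei.datasets import load_datasets, recursive_glob

dbf = Database('my-system.tdb')  # contains fittable symbols, e.g. VV0001
datasets = load_datasets(recursive_glob('input-data', '*.json'))
dbf, sampler = mcmc_fit(dbf, datasets, mcmc_steps=500, save_interval=50,
                        tracefile='chain.npy', probfile='lnprob.npy')
dbf.to_file('optimized.tdb', if_exists='overwrite')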
Example #3
def mcmc_fit(
    dbf,
    datasets,
    iterations=1000,
    save_interval=100,
    chains_per_parameter=2,
    chain_std_deviation=0.1,
    scheduler=None,
    tracefile=None,
    probfile=None,
    restart_trace=None,
    deterministic=True,
):
    """
    Run Markov Chain Monte Carlo on the Database given datasets

    Parameters
    ----------
    dbf : Database
        A pycalphad Database to fit with symbols to fit prefixed with `VV`
        followed by a number, e.g. `VV0001`
    datasets : PickleableTinyDB
        A database of single- and multi-phase data to fit
    iterations : int
        Number of trace iterations to calculate in MCMC. Default is 1000 iterations.
    save_interval : int
        interval of iterations at which to save the tracefile and probfile
    chains_per_parameter : int
        number of chains for each parameter. Must be an even integer greater
        than or equal to 2. Defaults to 2.
    chain_std_deviation : float
        standard deviation of the normal distribution used to initialize the
        chains, as a fraction of each parameter. Must be greater than 0.
        Default is 0.1, which is 10%.
    scheduler : callable
        Scheduler to use with emcee. Must implement a map method.
    tracefile : str
        filename to store the trace with numpy.save. Array has shape
        (chains, iterations, parameters)
    probfile : str
        filename to store the log probability with numpy.save. Has shape
        (chains, iterations)
    restart_trace : np.ndarray
        ndarray of the previous trace. Should have shape (chains, iterations, parameters)
    deterministic : bool
        If True, the emcee sampler will be seeded to give deterministic
        sampling draws. This ensures that runs with the exact same database,
        chains_per_parameter, and chain_std_deviation (or restart_trace)
        produce exactly the same results.

    Returns
    -------
    dbf : Database
        Resulting pycalphad database of optimized parameters
    sampler : emcee.EnsembleSampler
        emcee sampler for further data wrangling
    """
    comps = sorted(dbf.elements)
    symbols_to_fit = database_symbols_to_fit(dbf)

    if len(symbols_to_fit) == 0:
        raise ValueError(
            'No degrees of freedom. Database must contain symbols starting with \'V\' or \'VV\', followed by a number.'
        )
    else:
        logging.info('Fitting {} degrees of freedom.'.format(
            len(symbols_to_fit)))

    for x in symbols_to_fit:
        if isinstance(dbf.symbols[x], sympy.Piecewise):
            logging.debug('Replacing {} in database'.format(x))
            dbf.symbols[x] = dbf.symbols[x].args[0].expr

    # get initial parameters and remove these from the database
    # we'll replace them with SymPy symbols initialized to 0 in the phase models
    initial_parameters = np.array([float(dbf.symbols[x]) for x in symbols_to_fit])

    # construct the models for each phase, substituting in the SymPy symbol to fit.
    logging.debug('Building phase models (this may take some time)')
    phases = sorted(dbf.phases.keys())
    sympy_symbols_to_fit = [sympy.Symbol(sym) for sym in symbols_to_fit]
    orig_parameters = {
        sym: p
        for sym, p in zip(symbols_to_fit, initial_parameters)
    }
    eq_callables = build_callables(dbf,
                                   comps,
                                   phases,
                                   model=Model,
                                   parameters=orig_parameters)
    # error_context expects a 'phase_models' key, so rename the 'model' key
    eq_callables['phase_models'] = eq_callables.pop('model')
    eq_callables.pop('phase_records')
    # We also need to build models without ideal mixing for the thermochemical
    # error, for each property we might calculate.
    # TODO: potential optimization to only calculate for phase/property combos that we have in the datasets
    # first construct dict of models without ideal mixing
    mods_no_idmix = {}
    for phase_name in phases:
        # we have to pass the list of Symbol objects to fit so they are popped from the database and can properly be replaced.
        mods_no_idmix[phase_name] = Model(dbf,
                                          comps,
                                          phase_name,
                                          parameters=sympy_symbols_to_fit)
        mods_no_idmix[phase_name].models['idmix'] = 0
    # now construct callables for each possible property that can be calculated
    # will be a dict of {output_property: eq_callables_dict}
    thermochemical_callables = {}
    whitelist_properties = ['HM', 'SM', 'CPM']
    whitelist_properties = whitelist_properties + [
        prop + '_MIX' for prop in whitelist_properties
    ]
    for prop in whitelist_properties:
        thermochemical_callables[prop] = build_callables(
            dbf,
            comps,
            phases,
            model=mods_no_idmix,
            output=prop,
            parameters=orig_parameters,
            build_gradients=False)
        # Pop off the callables not used for properties (they should be None anyway).
        thermochemical_callables[prop].pop('phase_records')
        thermochemical_callables[prop].pop('model')
    logging.debug('Finished building phase models')

    # context for the log probability function
    error_context = {
        'comps': comps,
        'dbf': dbf,
        'phases': phases,
        'phase_models': eq_callables['phase_models'],
        'datasets': datasets,
        'symbols_to_fit': symbols_to_fit,
        'thermochemical_callables': thermochemical_callables,
        'callables': eq_callables,
    }

    def save_sampler_state(sampler):
        if tracefile:
            logging.debug('Writing trace to {}'.format(tracefile))
            np.save(tracefile, sampler.chain)
        if probfile:
            logging.debug('Writing lnprob to {}'.format(probfile))
            np.save(probfile, sampler.lnprobability)

    # initialize the walkers either fresh or from the restart
    if restart_trace is not None:
        walkers = restart_trace[np.nonzero(restart_trace)].reshape(
            (restart_trace.shape[0], -1, restart_trace.shape[2]))[:, -1, :]
        nwalkers = walkers.shape[0]
        ndim = walkers.shape[1]
        initial_parameters = walkers.mean(axis=0)
        logging.info(
            'Restarting from previous calculation with {} chains ({} per parameter).'
            .format(nwalkers, nwalkers // ndim))
        logging.debug(
            'Means of restarting parameters are {}'.format(initial_parameters))
        logging.debug(
            'Standard deviations of restarting parameters are {}'.format(
                walkers.std(axis=0)))
    else:
        logging.debug('Initial parameters: {}'.format(initial_parameters))
        ndim = initial_parameters.size
        nwalkers = ndim * chains_per_parameter
        logging.info(
            'Initializing {} chains with {} chains per parameter.'.format(
                nwalkers, chains_per_parameter))
        walkers = generate_parameter_distribution(initial_parameters,
                                                  nwalkers,
                                                  chain_std_deviation,
                                                  deterministic=deterministic)

    # the pool must implement a map function
    sampler = emcee.EnsembleSampler(nwalkers,
                                    ndim,
                                    lnprob,
                                    kwargs=error_context,
                                    pool=scheduler)
    if deterministic:
        from espei.rstate import numpy_rstate
        sampler.random_state = numpy_rstate
        logging.info('Using a deterministic ensemble sampler.')
    progbar_width = 30
    logging.info('Running MCMC for {} iterations.'.format(iterations))
    try:
        for i, result in enumerate(
                sampler.sample(walkers, iterations=iterations)):
            # progress bar
            if (i + 1) % save_interval == 0:
                save_sampler_state(sampler)
                logging.debug('Acceptance ratios for parameters: {}'.format(
                    sampler.acceptance_fraction))
            n = int((progbar_width + 1) * float(i) / iterations)
            sys.stdout.write("\r[{0}{1}] ({2} of {3})".format(
                '#' * n, ' ' * (progbar_width - n), i + 1, iterations))
            sys.stdout.flush()
        # finish the progress bar with a trailing newline
        n = int((progbar_width + 1) * float(i + 1) / iterations)
        sys.stdout.write("\r[{0}{1}] ({2} of {3})\n".format(
            '#' * n, ' ' * (progbar_width - n), i + 1, iterations))
    except KeyboardInterrupt:
        pass
    # final processing
    save_sampler_state(sampler)
    optimal_params = optimal_parameters(sampler.chain, sampler.lnprobability)
    logging.debug('Initial parameters: {}'.format(initial_parameters))
    logging.debug('Optimal parameters: {}'.format(optimal_params))
    logging.debug('Change in parameters: {}'.format(
        np.abs(initial_parameters - optimal_params) / initial_parameters))
    for param_name, value in zip(symbols_to_fit, optimal_params):
        dbf.symbols[param_name] = value
    logging.info('MCMC complete.')
    return dbf, sampler
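Restart sketch (hypothetical): resuming this version of `mcmc_fit` from a
previously saved trace, matching the `restart_trace` handling above. `dbf` and
`datasets` are assumed to be set up as in the earlier sketch; file names are
illustrative.

import numpy as np

previous_trace = np.load('chain.npy')  # shape (chains, iterations, parameters)
dbf, sampler = mcmc_fit(dbf, datasets, iterations=500,
                        restart_trace=previous_trace,
                        tracefile='chain.npy', probfile='lnprob.npy')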
Example #4
def optimal_parameters_dict(dbf, trace, lnprob):
    """Return the optimal (highest log-probability) parameters as a dict keyed by parameter label."""
    return dict(
        zip(parameter_labels(dbf, formatted=False),
            optimal_parameters(trace, lnprob, 0)))
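Usage sketch (hypothetical): mapping the best sample in a saved trace back to
the database symbol names with `optimal_parameters_dict`. The trace and
log-probability arrays come from files like those written in the earlier
examples; file names are illustrative.

import numpy as np

trace = np.load('trace.npy')
lnprob = np.load('lnprob.npy')
for symbol_name, value in optimal_parameters_dict(dbf, trace, lnprob).items():
    print(symbol_name, '=', value)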