Example #1
0
def test_ternary_candidate_models_are_constructed_correctly():
    """Check the candidate models built for a ternary interaction.

    For every feature, the expected result contains the models written in
    terms of ``YS`` alone, followed by the variants in which each term is
    multiplied by ``V_I``, ``V_J`` and ``V_K`` in turn.
    """
    YS = sympy.Symbol('YS')
    V_I, V_J, V_K = (sympy.Symbol(name) for name in ('V_I', 'V_J', 'V_K'))
    v_syms = (V_I, V_J, V_K)
    log_t = sympy.log(v.T)

    feature_spec = OrderedDict()
    feature_spec["CPM_FORM"] = (v.T * log_t, v.T**2)
    feature_spec["SM_FORM"] = (v.T, )
    feature_spec["HM_FORM"] = (sympy.S.One, )

    # Per-symbol terms, always ordered V_I, V_J, V_K.
    cpm_log_terms = [v.T * v_sym * YS * log_t for v_sym in v_syms]
    cpm_sq_terms = [v.T**2 * v_sym * YS for v_sym in v_syms]
    sm_terms = [v.T * v_sym * YS for v_sym in v_syms]
    hm_terms = [v_sym * YS for v_sym in v_syms]

    expected = OrderedDict()
    expected['CPM_FORM'] = [
        [v.T * YS * log_t],
        [v.T * YS * log_t, v.T**2 * YS],
        cpm_log_terms,
        # All T*ln(T) terms come first, then all T**2 terms.
        cpm_log_terms + cpm_sq_terms,
    ]
    expected['SM_FORM'] = [[v.T * YS], sm_terms]
    expected['HM_FORM'] = [[YS], hm_terms]

    built = build_candidate_models((('A', 'B', 'C'), 'A'), feature_spec)
    assert built == expected
Example #2
0
def test_binary_candidate_models_are_constructed_correctly():
    """Check the candidate models built for a binary interaction.

    The expected CPM_FORM table is spelled out with named terms: ``lnK`` is the
    ``T*ln(T)`` term and ``sqK`` the ``T**2`` term, each multiplied by ``YS``
    and ``Z**K``. For every highest power of ``Z`` (0 through 3) the ``lnK``
    terms are always present and every on/off combination of the ``sqK`` terms
    is listed.
    """
    YS = symengine.Symbol('YS')
    Z = symengine.Symbol('Z')
    log_t = symengine.log(v.T)

    feature_spec = OrderedDict()
    feature_spec["CPM_FORM"] = (v.T*log_t, v.T**2)
    feature_spec["SM_FORM"] = (v.T,)
    feature_spec["HM_FORM"] = (symengine.S.One,)

    # T*ln(T) terms by power of Z
    ln0 = v.T*YS*log_t
    ln1 = v.T*YS*Z*log_t
    ln2 = v.T*YS*Z**2*log_t
    ln3 = v.T*YS*Z**3*log_t
    # T**2 terms by power of Z
    sq0 = v.T**2*YS
    sq1 = v.T**2*YS*Z
    sq2 = v.T**2*YS*Z**2
    sq3 = v.T**2*YS*Z**3

    expected_cpm = [
        # highest Z power: 0
        [ln0],
        [ln0, sq0],
        # highest Z power: 1
        [ln0, ln1],
        [ln0, ln1, sq1],
        [ln0, sq0, ln1],
        [ln0, sq0, ln1, sq1],
        # highest Z power: 2
        [ln0, ln1, ln2],
        [ln0, ln1, ln2, sq2],
        [ln0, ln1, sq1, ln2],
        [ln0, ln1, sq1, ln2, sq2],
        [ln0, sq0, ln1, ln2],
        [ln0, sq0, ln1, ln2, sq2],
        [ln0, sq0, ln1, sq1, ln2],
        [ln0, sq0, ln1, sq1, ln2, sq2],
        # highest Z power: 3
        [ln0, ln1, ln2, ln3],
        [ln0, ln1, ln2, ln3, sq3],
        [ln0, ln1, ln2, sq2, ln3],
        [ln0, ln1, ln2, sq2, ln3, sq3],
        [ln0, ln1, sq1, ln2, ln3],
        [ln0, ln1, sq1, ln2, ln3, sq3],
        [ln0, ln1, sq1, ln2, sq2, ln3],
        [ln0, ln1, sq1, ln2, sq2, ln3, sq3],
        [ln0, sq0, ln1, ln2, ln3],
        [ln0, sq0, ln1, ln2, ln3, sq3],
        [ln0, sq0, ln1, ln2, sq2, ln3],
        [ln0, sq0, ln1, ln2, sq2, ln3, sq3],
        [ln0, sq0, ln1, sq1, ln2, ln3],
        [ln0, sq0, ln1, sq1, ln2, ln3, sq3],
        [ln0, sq0, ln1, sq1, ln2, sq2, ln3],
        [ln0, sq0, ln1, sq1, ln2, sq2, ln3, sq3],
    ]

    expected_sm = [
        [v.T*YS],
        [v.T*YS, v.T*YS*Z],
        [v.T*YS, v.T*YS*Z, v.T*YS*Z**2],
        [v.T*YS, v.T*YS*Z, v.T*YS*Z**2, v.T*YS*Z**3],
    ]

    expected_hm = [
        [YS],
        [YS, YS*Z],
        [YS, YS*Z, YS*Z**2],
        [YS, YS*Z, YS*Z**2, YS*Z**3],
    ]

    expected = OrderedDict()
    expected['CPM_FORM'] = expected_cpm
    expected['SM_FORM'] = expected_sm
    expected['HM_FORM'] = expected_hm

    built = build_candidate_models((('A', 'B'), 'A'), feature_spec)
    assert built == expected
Example #3
0
def fit_formation_energy(dbf, comps, phase_name, configuration, symmetry, datasets, ridge_alpha=None, aicc_phase_penalty=None, features=None):
    """
    Find suitable linear model parameters for the given phase.
    We do this by successively fitting heat capacities, entropies and
    enthalpies of formation, and selecting against criteria to prevent
    overfitting. The "best" set of parameters minimizes the error
    without overfitting.

    Parameters
    ----------
    dbf : Database
        pycalphad Database. Partially complete, so we know what degrees of freedom to fix.
    comps : [str]
        Names of the relevant components.
    phase_name : str
        Name of the desired phase for which the parameters will be found.
    configuration : ndarray
        Configuration of the sublattices for the fitting procedure.
    symmetry : [[int]]
        Symmetry of the sublattice configuration.
    datasets : PickleableTinyDB
        All the datasets desired to fit to.
    ridge_alpha : float
        Value of the :math:`\\alpha` hyperparameter used in ridge regression. Passed unchanged to
        ``select_model``. Defaults to None; how None is interpreted is decided by ``select_model``.
        For now, the parameter is applied to all features.
    aicc_phase_penalty : dict
        Map of feature type (the part of the property name before the first underscore,
        e.g. "CPM", "SM" or "HM") to a multiplication factor for the AICc's parameter penalty.
        Feature types that are missing use a factor of 1.0. Defaults to None, which is
        treated as an empty map.
    features : dict
        Maps "property" to a list of features for the linear model.
        These will be transformed from "GM" coefficients
        e.g., {"CPM_FORM": (v.T*sympy.log(v.T), v.T**2, v.T**-1, v.T**3)} (Default value = None)

    Returns
    -------
    dict
        {feature: estimated_value}. Every coefficient of every candidate model
        is present; coefficients that were not selected keep the value 0.

    """
    aicc_feature_factors = aicc_phase_penalty if aicc_phase_penalty is not None else {}
    if interaction_test(configuration):
        _log.debug('ENDMEMBERS FROM INTERACTION: %s', endmembers_from_interaction(configuration))
        fitting_steps = (["CPM_FORM", "CPM_MIX"], ["SM_FORM", "SM_MIX"], ["HM_FORM", "HM_MIX"])

    else:
        # We are only fitting an endmember; no mixing data needed
        fitting_steps = (["CPM_FORM"], ["SM_FORM"], ["HM_FORM"])

    # create the candidate models and fitting steps
    if features is None:
        features = OrderedDict([("CPM_FORM", (v.T * sympy.log(v.T), v.T**2, v.T**-1, v.T**3)),
                                ("SM_FORM", (v.T,)),
                                ("HM_FORM", (sympy.S.One,)),
                                ])
    # dict of {feature, [candidate_models]}
    candidate_models_features = build_candidate_models(configuration, features)

    # All possible parameter values that could be taken on. This is some legacy
    # code from before there were many candidate models built. For very large
    # sets of candidate models, this could be quite slow.
    # TODO: we might be able to remove this initialization for clarity, depends on fixed portions
    parameters = {}
    for candidate_models in candidate_models_features.values():
        for model in candidate_models:
            for coef in model:
                parameters[coef] = 0

    # This is our previously fit partial model from previous steps
    # Subtract out all of these contributions (zero out reference state because these are formation properties)
    fixed_model = Model(dbf, comps, phase_name, parameters={'GHSER'+(c.upper()*2)[:2]: 0 for c in comps})
    fixed_portions = [0]

    for desired_props in fitting_steps:
        feature_type = desired_props[0].split('_')[0]  # HM_FORM -> HM
        aicc_factor = aicc_feature_factors.get(feature_type, 1.0)
        solver_qry = (where('solver').test(symmetry_filter, configuration, recursive_tuplify(symmetry) if symmetry else symmetry))
        desired_data = get_prop_data(comps, phase_name, desired_props, datasets, additional_query=solver_qry)
        desired_data = filter_configurations(desired_data, configuration, symmetry)
        desired_data = filter_temperatures(desired_data)
        _log.trace('%s: datasets found: %s', desired_props, len(desired_data))
        if len(desired_data) == 0:
            # Nothing to fit in this step; previously fixed portions stay as they are.
            continue

        config_tup = tuple(map(tuplify, configuration))
        calculate_dict = get_prop_samples(desired_data, config_tup)
        sample_condition_dicts = _get_sample_condition_dicts(calculate_dict, list(map(len, config_tup)))
        weights = calculate_dict['weights']
        assert len(sample_condition_dicts) == len(weights)

        # We assume all properties in the same fitting step have the same
        # features (all CPM, all HM, etc., but different ref states).
        # data quantities are the same for each candidate model and can be computed up front
        data_qtys = get_data_quantities(feature_type, fixed_model, fixed_portions, desired_data, sample_condition_dicts)

        # build the candidate model transformation matrix and response vector (A, b in Ax=b)
        step_candidates = candidate_models_features[desired_props[0]]
        to_feature_type = feature_transforms[feature_type]
        feature_matrices = []
        data_quantities = []
        for candidate_coefficients in step_candidates:
            # Map coefficients in G to coefficients in the feature_type (H, S, CP)
            transformed_coefficients = list(map(to_feature_type, candidate_coefficients))
            # The same feature matrix builder is used whether or not the
            # configuration is a ternary interaction.
            feature_matrices.append(_build_feature_matrix(sample_condition_dicts, transformed_coefficients))
            data_quantities.append(data_qtys)

        # provide candidate models and get back a selected model.
        selected_model = select_model(zip(step_candidates, feature_matrices, data_quantities), ridge_alpha, weights=weights, aicc_factor=aicc_factor)
        selected_features, selected_values = selected_model
        parameters.update(zip(selected_features, selected_values))
        # Add these parameters to be fixed for the next fitting step
        fixed_portion = np.array(selected_features, dtype=np.object_)
        fixed_portion = np.dot(fixed_portion, selected_values)
        fixed_portions.append(fixed_portion)
    return parameters
Example #4
0
def fit_formation_energy(dbf,
                         comps,
                         phase_name,
                         configuration,
                         symmetry,
                         datasets,
                         ridge_alpha=1.0e-100,
                         features=None):
    """
    Find suitable linear model parameters for the given phase.
    We do this by successively fitting heat capacities, entropies and
    enthalpies of formation, and selecting against criteria to prevent
    overfitting. The "best" set of parameters minimizes the error
    without overfitting.

    Parameters
    ----------
    dbf : Database
        pycalphad Database. Partially complete, so we know what degrees of freedom to fix.
    comps : [str]
        Names of the relevant components.
    phase_name : str
        Name of the desired phase for which the parameters will be found.
    configuration : ndarray
        Configuration of the sublattices for the fitting procedure.
    symmetry : [[int]]
        Symmetry of the sublattice configuration.
    datasets : PickleableTinyDB
        All the datasets desired to fit to.
    ridge_alpha : float
        Value of the $alpha$ hyperparameter used in ridge regression. Defaults to 1.0e-100, which should be degenerate
        with ordinary least squares regression. For now, the parameter is applied to all features.
    features : dict
        Maps "property" to a list of features for the linear model.
        These will be transformed from "GM" coefficients
        e.g., {"CPM_FORM": (v.T*sympy.log(v.T), v.T**2, v.T**-1, v.T**3)} (Default value = None)

    Returns
    -------
    dict
        {feature: estimated_value}. Every coefficient of every candidate model
        is present; coefficients that were not selected keep the value 0.

    """
    if interaction_test(configuration):
        logging.debug('ENDMEMBERS FROM INTERACTION: {}'.format(
            endmembers_from_interaction(configuration)))
        fitting_steps = (["CPM_FORM", "CPM_MIX"],
                         ["SM_FORM", "SM_MIX"],
                         ["HM_FORM", "HM_MIX"])

    else:
        # We are only fitting an endmember; no mixing data needed
        fitting_steps = (["CPM_FORM"], ["SM_FORM"], ["HM_FORM"])

    # create the candidate models and fitting steps
    if features is None:
        features = OrderedDict([("CPM_FORM", (v.T * sympy.log(v.T), v.T**2,
                                              v.T**-1, v.T**3)),
                                ("SM_FORM", (v.T, )),
                                ("HM_FORM", (sympy.S.One, ))])
    # dict of {feature, [candidate_models]}
    candidate_models_features = build_candidate_models(configuration, features)

    # All possible parameter values that could be taken on. This is some legacy
    # code from before there were many candidate models built. For very large
    # sets of candidate models, this could be quite slow.
    # TODO: we might be able to remove this initialization for clarity, depends on fixed portions
    parameters = {}
    for candidate_models in candidate_models_features.values():
        for model in candidate_models:
            for coef in model:
                parameters[coef] = 0

    # This is our previously fit partial model from previous steps
    # Subtract out all of these contributions (zero out reference state because these are formation properties)
    fixed_model = Model(
        dbf,
        comps,
        phase_name,
        parameters={'GHSER' + (c.upper() * 2)[:2]: 0
                    for c in comps})
    fixed_model.models['idmix'] = 0
    fixed_portions = [0]

    # Moles of (non-vacancy) atoms per formula unit: each sublattice adds its
    # site count, scaled by (1 - y_VA) when VA is one of its constituents.
    moles_per_formula_unit = sympy.S(0)
    YS = sympy.Symbol('YS')  # site fraction symbol that we will reuse
    Z = sympy.Symbol('Z')  # site fraction symbol that we will reuse
    phase = dbf.phases[phase_name]
    for subl_idx, (num_sites, const) in enumerate(
            zip(phase.sublattices, phase.constituents)):
        if v.Species('VA') in const:
            moles_per_formula_unit += num_sites * (
                1 - v.SiteFraction(phase_name, subl_idx, v.Species('VA')))
        else:
            moles_per_formula_unit += num_sites

    for desired_props in fitting_steps:
        desired_data = get_data(comps, phase_name, configuration, symmetry,
                                datasets, desired_props)
        logging.debug('{}: datasets found: {}'.format(desired_props,
                                                      len(desired_data)))
        if len(desired_data) == 0:
            # Nothing to fit in this step; previously fixed portions stay as they are.
            continue

        # We assume all properties in the same fitting step have the same features (all CPM, all HM, etc.) (but different ref states)
        all_samples = get_samples(desired_data)
        # One build_sitefractions result per (dataset, temperature) pair.
        # Occupancies default to an all-ones array shaped like the configurations.
        # np.float64 replaces the np.float alias that was removed from NumPy.
        site_fractions = [
            build_sitefractions(
                phase_name, ds['solver']['sublattice_configurations'],
                ds['solver'].get(
                    'sublattice_occupancies',
                    np.ones((
                        len(ds['solver']['sublattice_configurations']),
                        len(ds['solver']['sublattice_configurations'][0])),
                            dtype=np.float64))) for ds in desired_data
            for _ in ds['conditions']['T']
        ]
        # Flatten list
        site_fractions = list(itertools.chain.from_iterable(site_fractions))

        # build the candidate model transformation matrix and response vector (A, b in Ax=b)
        prop = desired_props[0]
        step_candidates = candidate_models_features[prop]
        feature_matrices = []
        data_quantities = []
        for candidate_model in step_candidates:
            if interaction_test(configuration, 3):
                feature_matrices.append(
                    build_ternary_feature_matrix(prop, candidate_model,
                                                 desired_data))
            else:
                feature_matrices.append(
                    _build_feature_matrix(prop, candidate_model,
                                          desired_data))

            # NOTE: nothing below reads candidate_model, so the response
            # vector is recomputed for every candidate.
            data_qtys = np.concatenate(shift_reference_state(
                desired_data, feature_transforms[prop], fixed_model),
                                       axis=-1)

            # Remove existing partial model contributions from the data
            data_qtys = data_qtys - feature_transforms[prop](fixed_model.ast)
            # Subtract out high-order (in T) parameters we've already fit
            data_qtys = data_qtys - feature_transforms[prop](
                sum(fixed_portions)) / moles_per_formula_unit

            # if any site fractions show up in our data_qtys that aren't in this datasets site fractions, set them to zero.
            for sf, i, (_, (sf_product, inter_product)) in zip(
                    site_fractions, data_qtys, all_samples):
                missing_variables = sympy.S(
                    i * moles_per_formula_unit).atoms(
                        v.SiteFraction) - set(sf.keys())
                sf.update({x: 0. for x in missing_variables})
                # The equations we have just have the site fractions as YS
                # and interaction products as Z, so take the product of all
                # the site fractions that we see in our data qtys
                sf.update({YS: sf_product, Z: inter_product})

            # moles_per_formula_unit factor is here because our data is stored per-atom
            # but all of our fits are per-formula-unit
            data_qtys = [
                sympy.S(i * moles_per_formula_unit).xreplace(sf).xreplace({
                    v.T: ixx[0]
                }).evalf() for i, sf, ixx in zip(data_qtys, site_fractions,
                                                 all_samples)
            ]
            data_qtys = np.asarray(data_qtys, dtype=np.float64)
            data_quantities.append(data_qtys)

        # provide candidate models and get back a selected model.
        selected_model = select_model(
            zip(step_candidates, feature_matrices, data_quantities),
            ridge_alpha)
        selected_features, selected_values = selected_model
        parameters.update(zip(selected_features, selected_values))
        # Add these parameters to be fixed for the next fitting step
        # (np.object_ replaces the np.object alias that was removed from NumPy)
        fixed_portion = np.array(selected_features, dtype=np.object_)
        fixed_portion = np.dot(fixed_portion, selected_values)
        fixed_portions.append(fixed_portion)
    return parameters