def _build_feature_matrix(prop, features, desired_data):
    """
    Return an MxN matrix of M data samples and N features.

    Parameters
    ----------
    prop : str
        String name of the property, e.g. 'HM_MIX'
    features : tuple
        Tuple of SymPy parameters that can be fit for this property.
    desired_data : dict
        Full dataset dictionary containing values, conditions, etc.

    Returns
    -------
    numpy.ndarray
        An MxN matrix of M samples (from desired data) and N features.

    """
    # Convert every candidate parameter into the form it takes for `prop`.
    transformed_features = sympy.Matrix(
        [feature_transforms[prop](i) for i in features])
    all_samples = get_samples(desired_data)
    # `np.float` was only an alias of the builtin `float` and has been removed
    # from NumPy; the builtin produces the same float64 array.
    feature_matrix = np.empty((len(all_samples), len(transformed_features)),
                              dtype=float)
    # One row per sample: substitute that sample's temperature and its two
    # composition factors (used as 'YS' and 'Z') into every feature.
    feature_matrix[:, :] = [
        transformed_features.subs({
            v.T: temp,
            'YS': compf[0],
            'Z': compf[1]
        }).evalf() for temp, compf in all_samples
    ]
    return feature_matrix
def build_ternary_feature_matrix(prop, candidate_models, desired_data):
    """
    Return an MxN matrix of M data samples and N features.

    Parameters
    ----------
    prop : str
        String name of the property, e.g. 'HM_MIX'
    candidate_models : list
        List of SymPy parameters that can be fit for this property.
    desired_data : dict
        Full dataset dictionary containing values, conditions, etc.

    Returns
    -------
    numpy.ndarray
        An MxN matrix of M samples (from desired data) and N features.

    """
    # Convert every candidate parameter into the form it takes for `prop`.
    transformed_features = sympy.Matrix(
        [feature_transforms[prop](i) for i in candidate_models])
    all_samples = get_samples(desired_data)
    # `np.float` was only an alias of the builtin `float` and has been removed
    # from NumPy; the builtin produces the same float64 array.
    feature_matrix = np.empty((len(all_samples), len(transformed_features)),
                              dtype=float)
    # One row per sample. Each sample unpacks as
    # (temperature, (YS, (V_I, V_J, V_K))); the three V_* values are
    # substituted individually.
    feature_matrix[:, :] = [
        transformed_features.subs({
            v.T: temp,
            'YS': ys,
            'V_I': v_i,
            'V_J': v_j,
            'V_K': v_k
        }).evalf() for temp, (ys, (v_i, v_j, v_k)) in all_samples
    ]
    return feature_matrix
def _build_feature_matrix(prop, features, desired_data):
    """
    Return an MxN matrix of M data samples and N features.

    Parameters
    ----------
    prop : str
        Property name used to look up the transform in ``feature_transforms``.
    features : iterable
        SymPy expressions that are candidate features for this property.
    desired_data : object
        Datasets handed to ``get_samples`` to obtain the sample points.

    Returns
    -------
    numpy.ndarray
        Float array with one row per sample and one column per feature.

    """
    # Convert every candidate parameter into the form it takes for `prop`.
    transformed_features = sympy.Matrix(
        [feature_transforms[prop](i) for i in features])
    all_samples = get_samples(desired_data)
    # `np.float` was only an alias of the builtin `float` and has been removed
    # from NumPy; the builtin produces the same float64 array.
    feature_matrix = np.empty((len(all_samples), len(transformed_features)),
                              dtype=float)
    # One row per sample: substitute that sample's temperature and its two
    # composition factors (used as 'YS' and 'Z') into every feature.
    feature_matrix[:, :] = [
        transformed_features.subs({
            v.T: temp,
            'YS': compf[0],
            'Z': compf[1]
        }).evalf() for temp, compf in all_samples
    ]
    return feature_matrix
def _compare_data_to_parameters(dbf, comps, phase_name, desired_data, mod,
                                configuration, x, y, ax=None):
    """
    Return one set of plotted Axes with data compared to calculated parameters

    Parameters
    ----------
    dbf : Database
        pycalphad thermodynamic database containing the relevant parameters.
    comps : list
        Names of components to consider in the calculation.
    phase_name : str
        Name of the considered phase
    desired_data : iterable of dict
        Datasets to plot. Each entry is read for its 'values', 'conditions',
        'output', 'reference' and (for x == 'Z') 'solver' keys.
    mod : Model
        A pycalphad Model. The Model may or may not have the reference state zeroed out for formation properties.
    configuration : sequence
        Sublattice configuration. Entries that are tuples (after
        ``list_to_tuple``) are treated as interacting sublattices.
    x : str
        Model property to plot on the x-axis e.g. 'T', 'HM_MIX', 'SM_FORM'
    y : str
        Model property to plot on the y-axis e.g. 'T', 'HM_MIX', 'SM_FORM'
    ax : matplotlib.Axes
        Axes to draw on. A new square figure is created if not specified.

    Returns
    -------
    matplotlib.Axes

    Raises
    ------
    NotImplementedError
        If the configuration does not reduce to one or two endpoints.

    """
    # `np.object` / `np.float` were aliases of the builtins and have been
    # removed from NumPy, so the builtins are used throughout this function.
    all_samples = np.array(get_samples(desired_data), dtype=object)
    endpoints = endmembers_from_interaction(configuration)
    interacting_subls = [
        c for c in list_to_tuple(configuration) if isinstance(c, tuple)
    ]
    disordered_config = False
    if (len(set(interacting_subls)) == 1) and (len(interacting_subls[0]) == 2):
        # This configuration describes all sublattices with the same two elements interacting
        # In general this is a high-dimensional space; just plot the diagonal to see the disordered mixing
        endpoints = [endpoints[0], endpoints[-1]]
        disordered_config = True
    if ax is None:
        fig = plt.figure(figsize=plt.figaspect(1))
        ax = fig.gca()
    bar_chart = False
    bar_labels = []
    bar_data = []
    if y.endswith('_FORM'):
        # We were passed a Model object with zeroed out reference states
        yattr = y[:-5]
    else:
        yattr = y

    if len(endpoints) == 1:
        # This is an endmember so we can just compute T-dependent stuff
        temperatures = np.array([i[0] for i in all_samples], dtype=float)
        if temperatures.min() != temperatures.max():
            temperatures = np.linspace(temperatures.min(),
                                       temperatures.max(), num=100)
        else:
            # We only have one temperature: let's do a bar chart instead
            bar_chart = True
            temperatures = temperatures.min()
        endmember = _translate_endmember_to_array(
            endpoints[0], mod.ast.atoms(v.SiteFraction))[None, None]
        predicted_quantities = calculate(dbf, comps, [phase_name],
                                         output=yattr, T=temperatures,
                                         P=101325, points=endmember,
                                         model=mod, mode='numpy')
        if y == 'HM' and x == 'T':
            # Shift enthalpy data so that value at minimum T is zero.
            # `temperatures` is a NumPy scalar in the single-temperature case
            # and cannot be indexed; `.min()` works for both the scalar and
            # the ascending linspace (whose minimum is its first element).
            predicted_quantities[yattr] -= predicted_quantities[yattr].sel(
                T=temperatures.min()).values.flatten()
        response_data = predicted_quantities[yattr].values.flatten()
        if not bar_chart:
            extra_kwargs = {}
            if len(response_data) < 10:
                # Few points: draw large markers instead of a line.
                extra_kwargs['markersize'] = 20
                extra_kwargs['marker'] = '.'
                extra_kwargs['linestyle'] = 'none'
                extra_kwargs['clip_on'] = False
            ax.plot(temperatures, response_data, label='This work',
                    color='k', **extra_kwargs)
            ax.set_xlabel(plot_mapping.get(x, x))
            ax.set_ylabel(plot_mapping.get(y, y))
        else:
            bar_labels.append('This work')
            bar_data.append(response_data[0])
    elif len(endpoints) == 2:
        # Binary interaction parameter
        first_endpoint = _translate_endmember_to_array(
            endpoints[0], mod.ast.atoms(v.SiteFraction))
        second_endpoint = _translate_endmember_to_array(
            endpoints[1], mod.ast.atoms(v.SiteFraction))
        # Linear path between the two endpoints, sampled at 100 points.
        mixing_fraction = np.linspace(0, 1, num=100)
        point_matrix = mixing_fraction[None].T * second_endpoint + \
            (1 - mixing_fraction)[None].T * first_endpoint
        # TODO: Real temperature support
        point_matrix = point_matrix[None, None]
        predicted_quantities = calculate(dbf, comps, [phase_name],
                                         output=yattr, T=300, P=101325,
                                         points=point_matrix, model=mod,
                                         mode='numpy')
        response_data = predicted_quantities[yattr].values.flatten()
        if not bar_chart:
            extra_kwargs = {}
            if len(response_data) < 10:
                extra_kwargs['markersize'] = 20
                extra_kwargs['marker'] = '.'
                extra_kwargs['linestyle'] = 'none'
                extra_kwargs['clip_on'] = False
            ax.plot(np.linspace(0, 1, num=100), response_data,
                    label='This work', color='k', **extra_kwargs)
            ax.set_xlim((0, 1))
            ax.set_xlabel(
                str(':'.join(endpoints[0])) + ' to ' +
                str(':'.join(endpoints[1])))
            ax.set_ylabel(plot_mapping.get(y, y))
        else:
            bar_labels.append('This work')
            bar_data.append(response_data[0])
    else:
        raise NotImplementedError(
            'No support for plotting configuration {}'.format(configuration))

    bib_reference_keys = sorted({entry['reference'] for entry in desired_data})
    symbol_map = bib_marker_map(bib_reference_keys)

    for data in desired_data:
        indep_var_data = None
        response_data = np.zeros_like(data['values'], dtype=float)
        if x == 'T' or x == 'P':
            indep_var_data = np.array(data['conditions'][x],
                                      dtype=float).flatten()
        elif x == 'Z':
            if disordered_config:
                # Take the second element of the first interacting sublattice as the coordinate
                # Because it's disordered all sublattices should be equivalent
                # TODO: Fix this to filter because we need to guarantee the plot points are disordered
                occ = data['solver']['sublattice_occupancies']
                subl_idx = np.nonzero(
                    [isinstance(c, (list, tuple)) for c in occ[0]])[0]
                # Always take the first interacting sublattice. Indexing
                # explicitly avoids int() on a 1-element array, which newer
                # NumPy deprecates.
                subl_idx = int(subl_idx[0])
                indep_var_data = [c[subl_idx][1] for c in occ]
            else:
                interactions = np.array(
                    [i[1][1] for i in get_samples([data])], dtype=float)
                indep_var_data = 1 - (interactions + 1) / 2
            if y.endswith('_MIX') and data['output'].endswith('_FORM'):
                # All the _FORM data we have still has the lattice stability contribution
                # Need to zero it out to shift formation data to mixing
                mod_latticeonly = Model(
                    dbf, comps, phase_name,
                    parameters={'GHSER' + c.upper(): 0 for c in comps})
                mod_latticeonly.models = {
                    key: value
                    for key, value in mod_latticeonly.models.items()
                    if key == 'ref'
                }
                temps = data['conditions'].get('T', 300)
                pressures = data['conditions'].get('P', 101325)
                points = build_sitefractions(
                    phase_name,
                    data['solver']['sublattice_configurations'],
                    data['solver']['sublattice_occupancies'])
                for point_idx in range(len(points)):
                    missing_variables = mod_latticeonly.ast.atoms(
                        v.SiteFraction) - set(points[point_idx].keys())
                    # Set unoccupied values to zero
                    points[point_idx].update(
                        {key: 0 for key in missing_variables})
                    # Change entry to a sorted array of site fractions
                    points[point_idx] = list(
                        OrderedDict(sorted(points[point_idx].items(),
                                           key=str)).values())
                points = np.array(points, dtype=float)
                # TODO: Real temperature support
                points = points[None, None]
                stability = calculate(dbf, comps, [phase_name],
                                      output=data['output'][:-5], T=temps,
                                      P=pressures, points=points,
                                      model=mod_latticeonly, mode='numpy')
                response_data -= stability[data['output'][:-5]].values

        response_data += np.array(data['values'], dtype=float)
        response_data = response_data.flatten()
        if not bar_chart:
            extra_kwargs = {}
            if len(response_data) < 10:
                extra_kwargs['markersize'] = 8
                extra_kwargs['linestyle'] = 'none'
                extra_kwargs['clip_on'] = False
            ref = data.get('reference', '')
            mark = symbol_map[ref]['markers']
            ax.plot(indep_var_data, response_data,
                    label=symbol_map[ref]['formatted'],
                    marker=mark['marker'], fillstyle=mark['fillstyle'],
                    **extra_kwargs)
        else:
            bar_labels.append(data.get('reference', None))
            bar_data.append(response_data[0])

    if bar_chart:
        ax.barh(0.02 * np.arange(len(bar_data)), bar_data, color='k',
                height=0.01)
        endmember_title = ' to '.join([':'.join(i) for i in endpoints])
        ax.get_figure().suptitle('{} (T = {} K)'.format(
            endmember_title, temperatures), fontsize=20)
        ax.set_yticks(0.02 * np.arange(len(bar_data)))
        ax.set_yticklabels(bar_labels, fontsize=20)
        # This bar chart is rotated 90 degrees, so "y" is now x
        ax.set_xlabel(plot_mapping.get(y, y))
    else:
        ax.set_frame_on(False)
        leg = ax.legend(loc='best')
        leg.get_frame().set_edgecolor('black')
    return ax
def fit_formation_energy(dbf, comps, phase_name, configuration, symmetry,
                         datasets, features=None):
    """
    Find suitable linear model parameters for the given phase.
    We do this by successively fitting heat capacities, entropies and
    enthalpies of formation, and selecting against criteria to prevent
    overfitting. The "best" set of parameters minimizes the error
    without overfitting.

    Parameters
    ----------
    dbf : Database
        pycalphad Database. Partially complete, so we know what degrees of freedom to fix.
    comps : [str]
        Names of the relevant components.
    phase_name : str
        Name of the desired phase for which the parameters will be found.
    configuration : ndarray
        Configuration of the sublattices for the fitting procedure.
    symmetry : [[int]]
        Symmetry of the sublattice configuration.
    datasets : PickleableTinyDB
        All the datasets desired to fit to.
    features : dict
        Maps "property" to a list of features for the linear model.
        These will be transformed from "GM" coefficients
        e.g., {"CPM_FORM": (v.T*sympy.log(v.T), v.T**2, v.T**-1, v.T**3)} (Default value = None)

    Returns
    -------
    dict
        {feature: estimated_value}

    """
    if features is None:
        features = [("CPM_FORM", (v.T * sympy.log(v.T), v.T**2, v.T**-1,
                                  v.T**3)),
                    ("SM_FORM", (v.T, )),
                    ("HM_FORM", (sympy.S.One, ))]
    # Always work on an ordered copy: the interaction branch below rewrites
    # the feature lists and must not modify a mapping owned by the caller.
    features = OrderedDict(features)
    if any(isinstance(conf, (list, tuple)) for conf in configuration):
        # TODO: assumes binary interaction here
        fitting_steps = (["CPM_FORM", "CPM_MIX"], ["SM_FORM", "SM_MIX"],
                         ["HM_FORM", "HM_MIX"])
        # Product of all nonzero site fractions in all sublattices
        YS = sympy.Symbol('YS')
        # Product of all binary interaction terms
        Z = sympy.Symbol('Z')
        redlich_kister_features = (YS, YS * Z, YS * (Z**2), YS * (Z**3))
        # Cross every temperature feature with every Redlich-Kister term.
        for feature in features.keys():
            all_features = list(
                itertools.product(redlich_kister_features, features[feature]))
            features[feature] = [i[0] * i[1] for i in all_features]
        logging.debug('ENDMEMBERS FROM INTERACTION: {}'.format(
            endmembers_from_interaction(configuration)))
    else:
        # We are only fitting an endmember; no mixing data needed
        fitting_steps = (["CPM_FORM"], ["SM_FORM"], ["HM_FORM"])

    # Every candidate coefficient starts at zero; fitted ones are overwritten.
    parameters = {}
    for feature in features.values():
        for coef in feature:
            parameters[coef] = 0

    # These is our previously fit partial model
    # Subtract out all of these contributions (zero out reference state because these are formation properties)
    fixed_model = Model(
        dbf, comps, phase_name,
        parameters={'GHSER' + (c.upper() * 2)[:2]: 0 for c in comps})
    fixed_model.models['idmix'] = 0
    fixed_portions = [0]

    # Sites per formula unit, not counting the vacancy ('VA') site fraction on
    # sublattices that can hold vacancies.
    moles_per_formula_unit = sympy.S(0)
    for subl_idx, (num_sites, const) in enumerate(
            zip(dbf.phases[phase_name].sublattices,
                dbf.phases[phase_name].constituents)):
        if Species('VA') in const:
            moles_per_formula_unit += num_sites * (
                1 - v.SiteFraction(phase_name, subl_idx, Species('VA')))
        else:
            moles_per_formula_unit += num_sites

    for desired_props in fitting_steps:
        desired_data = get_data(comps, phase_name, configuration, symmetry,
                                datasets, desired_props)
        logging.debug('{}: datasets found: {}'.format(desired_props,
                                                      len(desired_data)))
        if len(desired_data) > 0:
            # We assume all properties in the same fitting step have the same features (but different ref states)
            prop = desired_props[0]
            feature_matrix = _build_feature_matrix(prop, features[prop],
                                                   desired_data)
            all_samples = get_samples(desired_data)
            data_quantities = np.concatenate(_shift_reference_state(
                desired_data, feature_transforms[prop], fixed_model), axis=-1)
            # One site fraction list per dataset per temperature. When a
            # dataset has no occupancies, every configuration entry defaults
            # to an occupancy of 1.
            site_fractions = [
                build_sitefractions(
                    phase_name, ds['solver']['sublattice_configurations'],
                    ds['solver'].get(
                        'sublattice_occupancies',
                        np.ones((
                            len(ds['solver']['sublattice_configurations']),
                            len(ds['solver']['sublattice_configurations'][0])),
                            dtype=float)))
                for ds in desired_data for _ in ds['conditions']['T']
            ]
            # Flatten list
            site_fractions = list(itertools.chain(*site_fractions))
            # Remove existing partial model contributions from the data
            data_quantities = data_quantities - feature_transforms[prop](
                fixed_model.ast)
            # Subtract out high-order (in T) parameters we've already fit
            data_quantities = data_quantities - \
                feature_transforms[prop](sum(fixed_portions)) / moles_per_formula_unit
            # Site fractions that appear in the expressions but not in a
            # sample's own site fractions are set to zero.
            for sf, i in zip(site_fractions, data_quantities):
                missing_variables = sympy.S(i * moles_per_formula_unit).atoms(
                    v.SiteFraction) - set(sf.keys())
                sf.update({x: 0. for x in missing_variables})
            # moles_per_formula_unit factor is here because our data is stored per-atom
            # but all of our fits are per-formula-unit
            data_quantities = [
                sympy.S(i * moles_per_formula_unit).xreplace(sf).xreplace({
                    v.T: ixx[0]
                }).evalf()
                for i, sf, ixx in zip(data_quantities, site_fractions,
                                      all_samples)
            ]
            # `np.float` / `np.object` were aliases of the builtins and have
            # been removed from NumPy; the builtins behave identically.
            data_quantities = np.asarray(data_quantities, dtype=float)
            parameters.update(
                _fit_parameters(feature_matrix, data_quantities,
                                features[prop]))
            # Add these parameters to be fixed for the next fitting step
            fixed_portion = np.array(features[prop], dtype=object)
            fixed_portion = np.dot(fixed_portion, [
                parameters[feature] for feature in features[prop]
            ])
            fixed_portions.append(fixed_portion)
    return parameters
def fit_formation_energy(dbf, comps, phase_name, configuration, symmetry,
                         datasets, ridge_alpha=1.0e-100, features=None):
    """
    Find suitable linear model parameters for the given phase.
    We do this by successively fitting heat capacities, entropies and
    enthalpies of formation, and selecting against criteria to prevent
    overfitting. The "best" set of parameters minimizes the error
    without overfitting.

    Parameters
    ----------
    dbf : Database
        pycalphad Database. Partially complete, so we know what degrees of freedom to fix.
    comps : [str]
        Names of the relevant components.
    phase_name : str
        Name of the desired phase for which the parameters will be found.
    configuration : ndarray
        Configuration of the sublattices for the fitting procedure.
    symmetry : [[int]]
        Symmetry of the sublattice configuration.
    datasets : PickleableTinyDB
        All the datasets desired to fit to.
    ridge_alpha : float
        Value of the $alpha$ hyperparameter used in ridge regression. Defaults to 1.0e-100, which should be degenerate
        with ordinary least squares regression. For now, the parameter is applied to all features.
    features : dict
        Maps "property" to a list of features for the linear model.
        These will be transformed from "GM" coefficients
        e.g., {"CPM_FORM": (v.T*sympy.log(v.T), v.T**2, v.T**-1, v.T**3)} (Default value = None)

    Returns
    -------
    dict
        {feature: estimated_value}

    """
    if interaction_test(configuration):
        logging.debug('ENDMEMBERS FROM INTERACTION: {}'.format(
            endmembers_from_interaction(configuration)))
        fitting_steps = (["CPM_FORM", "CPM_MIX"], ["SM_FORM", "SM_MIX"],
                         ["HM_FORM", "HM_MIX"])
    else:
        # We are only fitting an endmember; no mixing data needed
        fitting_steps = (["CPM_FORM"], ["SM_FORM"], ["HM_FORM"])

    # create the candidate models and fitting steps
    if features is None:
        features = OrderedDict([("CPM_FORM", (v.T * sympy.log(v.T), v.T**2,
                                              v.T**-1, v.T**3)),
                                ("SM_FORM", (v.T, )),
                                ("HM_FORM", (sympy.S.One, ))])
    # dict of {feature, [candidate_models]}
    candidate_models_features = build_candidate_models(configuration, features)

    # All possible parameter values that could be taken on. This is some legacy
    # code from before there were many candidate models built. For very large
    # sets of candidate models, this could be quite slow.
    # TODO: we might be able to remove this initialization for clarity, depends on fixed portions
    parameters = {}
    for candidate_models in candidate_models_features.values():
        for model in candidate_models:
            for coef in model:
                parameters[coef] = 0

    # These is our previously fit partial model from previous steps
    # Subtract out all of these contributions (zero out reference state because these are formation properties)
    fixed_model = Model(
        dbf, comps, phase_name,
        parameters={'GHSER' + (c.upper() * 2)[:2]: 0 for c in comps})
    fixed_model.models['idmix'] = 0
    fixed_portions = [0]

    YS = sympy.Symbol('YS')  # site fraction symbol that we will reuse
    Z = sympy.Symbol('Z')  # site fraction symbol that we will reuse
    # Sites per formula unit, not counting the vacancy ('VA') site fraction on
    # sublattices that can hold vacancies.
    moles_per_formula_unit = sympy.S(0)
    for subl_idx, (num_sites, const) in enumerate(
            zip(dbf.phases[phase_name].sublattices,
                dbf.phases[phase_name].constituents)):
        if v.Species('VA') in const:
            moles_per_formula_unit += num_sites * (
                1 - v.SiteFraction(phase_name, subl_idx, v.Species('VA')))
        else:
            moles_per_formula_unit += num_sites

    for desired_props in fitting_steps:
        desired_data = get_data(comps, phase_name, configuration, symmetry,
                                datasets, desired_props)
        logging.debug('{}: datasets found: {}'.format(desired_props,
                                                      len(desired_data)))
        if len(desired_data) > 0:
            # We assume all properties in the same fitting step have the same
            # features (all CPM, all HM, etc.) (but different ref states)
            prop = desired_props[0]
            all_samples = get_samples(desired_data)
            # One site fraction list per dataset per temperature. When a
            # dataset has no occupancies, every configuration entry defaults
            # to an occupancy of 1. `np.float` was an alias of the builtin
            # `float` and has been removed from NumPy.
            site_fractions = [
                build_sitefractions(
                    phase_name, ds['solver']['sublattice_configurations'],
                    ds['solver'].get(
                        'sublattice_occupancies',
                        np.ones((
                            len(ds['solver']['sublattice_configurations']),
                            len(ds['solver']['sublattice_configurations'][0])),
                            dtype=float)))
                for ds in desired_data for _ in ds['conditions']['T']
            ]
            # Flatten list
            site_fractions = list(itertools.chain(*site_fractions))

            # build the candidate model transformation matrix and response vector (A, b in Ax=b)
            feature_matricies = []
            data_quantities = []
            for candidate_model in candidate_models_features[prop]:
                if interaction_test(configuration, 3):
                    feature_matricies.append(
                        build_ternary_feature_matrix(prop, candidate_model,
                                                     desired_data))
                else:
                    feature_matricies.append(
                        _build_feature_matrix(prop, candidate_model,
                                              desired_data))

                data_qtys = np.concatenate(shift_reference_state(
                    desired_data, feature_transforms[prop], fixed_model),
                    axis=-1)

                # Remove existing partial model contributions from the data
                data_qtys = data_qtys - feature_transforms[prop](
                    fixed_model.ast)
                # Subtract out high-order (in T) parameters we've already fit
                data_qtys = data_qtys - feature_transforms[prop](
                    sum(fixed_portions)) / moles_per_formula_unit

                # if any site fractions show up in our data_qtys that aren't in this datasets site fractions, set them to zero.
                for sf, i, (_, (sf_product, inter_product)) in zip(
                        site_fractions, data_qtys, all_samples):
                    missing_variables = sympy.S(
                        i * moles_per_formula_unit).atoms(
                            v.SiteFraction) - set(sf.keys())
                    sf.update({x: 0. for x in missing_variables})
                    # The equations we have just have the site fractions as YS
                    # and interaction products as Z, so take the product of all
                    # the site fractions that we see in our data qtys
                    sf.update({YS: sf_product, Z: inter_product})

                # moles_per_formula_unit factor is here because our data is stored per-atom
                # but all of our fits are per-formula-unit
                data_qtys = [
                    sympy.S(i * moles_per_formula_unit).xreplace(sf).xreplace({
                        v.T: ixx[0]
                    }).evalf()
                    for i, sf, ixx in zip(data_qtys, site_fractions,
                                          all_samples)
                ]
                data_qtys = np.asarray(data_qtys, dtype=float)
                data_quantities.append(data_qtys)

            # provide candidate models and get back a selected model.
            selected_model = select_model(
                zip(candidate_models_features[prop], feature_matricies,
                    data_quantities), ridge_alpha)
            selected_features, selected_values = selected_model
            parameters.update(zip(selected_features, selected_values))
            # Add these parameters to be fixed for the next fitting step.
            # `np.object` was an alias of the builtin `object` and has been
            # removed from NumPy.
            fixed_portion = np.array(selected_features, dtype=object)
            fixed_portion = np.dot(fixed_portion, selected_values)
            fixed_portions.append(fixed_portion)
    return parameters
def get_data_quantities(desired_property, fixed_model, fixed_portions, data):
    """
    Return the data values with the already-fixed model contributions removed.

    Parameters
    ----------
    desired_property : str
        String property corresponding to the features that could be fit, e.g. HM, SM_FORM, CPM_MIX
    fixed_model : pycalphad.Model
        Model with all lower order (in composition) terms already fit. Pure
        element reference state (GHSER functions) should be set to zero.
    fixed_portions : List[sympy.Expr]
        SymPy expressions for model parameters and interaction productions for
        higher order (in T) terms for this property, e.g. [0, 3.0*YS*v.T]. In
        [qty]/mole-formula.
    data : List[Dict[str, Any]]
        ESPEI single phase datasets for this property.

    Returns
    -------
    np.ndarray[:]
        Ravelled data quantities in [qty]/mole-formula

    Notes
    -----
    pycalphad Model parameters (and therefore fixed_portions) are stored as per
    mole-formula quantites, but the calculated properties and our data are all
    in [qty]/mole-atoms. We multiply by mole-atoms/mole-formula to convert the
    units to [qty]/mole-formula.

    """
    mole_atoms_per_mole_formula_unit = fixed_model._site_ratio_normalization
    samples = get_samples(data)
    # Define site fraction symbols that will be reused
    YS = sympy.Symbol('YS')
    Z = sympy.Symbol('Z')
    V_I, V_J, V_K = sympy.Symbol('V_I'), sympy.Symbol('V_J'), sympy.Symbol('V_K')
    phase_name = fixed_model.phase_name

    # Construct flattened list of site fractions corresponding to the ravelled data (from shift_reference_state)
    site_fractions = []
    for ds in data:
        for _ in ds['conditions']['T']:
            # Rebuilt for every temperature on purpose: the entries are
            # updated per sample further down.
            # NOTE(review): presumably build_sitefractions returns fresh dicts
            # on each call - confirm before hoisting this out of the loop.
            solver = ds['solver']
            configurations = solver['sublattice_configurations']
            # Datasets without occupancies default to an occupancy of 1 for
            # every configuration entry. `np.float` was an alias of the
            # builtin `float` and has been removed from NumPy.
            default_occupancies = np.ones(
                (len(configurations), len(configurations[0])), dtype=float)
            sf = build_sitefractions(
                phase_name, configurations,
                solver.get('sublattice_occupancies', default_occupancies))
            site_fractions.append(sf)
    site_fractions = list(itertools.chain(*site_fractions))

    feat_transform = feature_transforms[desired_property]
    data_qtys = np.concatenate(
        shift_reference_state(data, feat_transform, fixed_model,
                              mole_atoms_per_mole_formula_unit),
        axis=-1)
    # Remove existing partial model contributions from the data, convert to per mole-formula units
    data_qtys = data_qtys - feat_transform(fixed_model.ast) * mole_atoms_per_mole_formula_unit
    # Subtract out high-order (in T) parameters we've already fit, already in per mole-formula units
    data_qtys = data_qtys - feat_transform(sum(fixed_portions))

    # if any site fractions show up in our data_qtys that aren't in this datasets site fractions, set them to zero.
    for sf, i, (_, (sf_product, inter_product)) in zip(site_fractions,
                                                      data_qtys, samples):
        missing_variables = sympy.S(i).atoms(v.SiteFraction) - set(sf.keys())
        sf.update({x: 0. for x in missing_variables})
        # The equations we have just have the site fractions as YS
        # and interaction products as Z, so take the product of all
        # the site fractions that we see in our data qtys
        if hasattr(inter_product, '__len__'):
            # Z is an array of [V_I, V_J, V_K]
            sf.update({YS: sf_product, V_I: inter_product[0],
                       V_J: inter_product[1], V_K: inter_product[2]})
        else:
            # Z is probably a number
            sf.update({YS: sf_product, Z: inter_product})

    # Substitute each sample's site fractions and temperature to get numbers.
    data_qtys = [
        sympy.S(i).xreplace(sf).xreplace({v.T: ixx[0]}).evalf()
        for i, sf, ixx in zip(data_qtys, site_fractions, samples)
    ]
    data_qtys = np.asarray(data_qtys, dtype=float)
    return data_qtys