def test_pickelable_tinydb_can_be_pickled_and_unpickled():
    """A PickleableTinyDB must keep its documents across a pickle round trip."""
    document = {'test_key': ['test', 'values']}
    original_db = PickleableTinyDB(storage=MemoryStorage)
    original_db.insert(document)

    # Serialize and immediately deserialize to obtain the round-tripped database.
    restored_db = pickle.loads(pickle.dumps(original_db))

    matches = restored_db.search(where('test_key').exists())
    assert matches[0] == document
def load_datasets(dataset_filenames):
    """
    Create a PickleableTinyDB with the data from a list of filenames.

    Each file is parsed as JSON, validated with ``check_dataset`` and inserted
    into an in-memory database after being passed through ``clean_dataset``.

    Parameters
    ----------
    dataset_filenames : [str]
        List of filenames to load as datasets

    Returns
    -------
    PickleableTinyDB

    Raises
    ------
    ValueError
        If a ``ValueError`` (which includes JSON decoding errors) is raised
        while loading, checking or inserting a dataset. The message names the
        offending file and the original exception is chained as the cause.
    DatasetError
        If a ``DatasetError`` is raised while loading, checking or inserting a
        dataset. The message names the offending file and the original
        exception is chained as the cause.
    """
    ds_database = PickleableTinyDB(storage=MemoryStorage)
    for fname in dataset_filenames:
        with open(fname) as file_:
            try:
                d = json.load(file_)
                check_dataset(d)
                ds_database.insert(clean_dataset(d))
            except ValueError as e:
                # Re-raise with the filename; chain explicitly so the original
                # traceback is reported as the direct cause.
                raise ValueError('JSON Error in {}: {}'.format(fname, e)) from e
            except DatasetError as e:
                raise DatasetError('Dataset Error in {}: {}'.format(fname, e)) from e
    return ds_database
def load_datasets(dataset_filenames, include_disabled=False) -> PickleableTinyDB:
    """
    Create a PickleableTinyDB with the data from a list of filenames.

    Each file is parsed as JSON, validated with ``check_dataset`` and inserted
    into an in-memory database after being passed through ``clean_dataset``.

    Parameters
    ----------
    dataset_filenames : [str]
        List of filenames to load as datasets
    include_disabled : bool
        If False (the default), datasets whose top-level ``disabled`` key is
        truthy are skipped without being checked or inserted. If True, they
        are loaded like any other dataset.

    Returns
    -------
    PickleableTinyDB

    Raises
    ------
    ValueError
        If a ``ValueError`` (which includes JSON decoding errors) is raised
        while loading, checking or inserting a dataset. The message names the
        offending file and the original exception is chained as the cause.
    DatasetError
        If a ``DatasetError`` is raised while loading, checking or inserting a
        dataset. The message names the offending file and the original
        exception is chained as the cause.
    """
    ds_database = PickleableTinyDB(storage=MemoryStorage)
    for fname in dataset_filenames:
        with open(fname) as file_:
            try:
                d = json.load(file_)
                if not include_disabled and d.get('disabled', False):
                    # The dataset is disabled and not included
                    continue
                check_dataset(d)
                ds_database.insert(clean_dataset(d))
            except ValueError as e:
                # Re-raise with the filename; chain explicitly so the original
                # traceback is reported as the direct cause.
                raise ValueError('JSON Error in {}: {}'.format(fname, e)) from e
            except DatasetError as e:
                raise DatasetError('Dataset Error in {}: {}'.format(fname, e)) from e
    return ds_database
def apply_tags(datasets: PickleableTinyDB, tags):
    """
    Apply tag-driven updates to every dataset carrying a matching tag.

    Parameters
    ----------
    datasets : PickleableTinyDB
        Datasets to modify (updated in place)
    tags : dict
        Dictionary of {tag: update_dict}

    Returns
    -------
    None

    Notes
    -----
    Updates either replace or add to what is already stored. For each
    key/value pair of a tag's update dictionary:

    1. A list value is appended to the existing list (or to an empty list if the key is absent)
    2. A scalar value overrides any previous value
    3. A dict value is merged into the existing dict (or into an empty dict if the key is absent)
    4. Any other value overrides the previous value

    Examples
    --------
    >>> from espei.utils import PickleableTinyDB
    >>> from tinydb.storages import MemoryStorage
    >>> ds = PickleableTinyDB(storage=MemoryStorage)
    >>> doc_id = ds.insert({'tags': ['dft'], 'excluded_model_contributions': ['contrib']})
    >>> my_tags = {'dft': {'excluded_model_contributions': ['idmix', 'mag'], 'weight': 5.0}}
    >>> from espei.datasets import apply_tags
    >>> apply_tags(ds, my_tags)
    >>> all_data = ds.all()
    >>> all(d['excluded_model_contributions'] == ['contrib', 'idmix', 'mag'] for d in all_data)
    True
    >>> all(d['weight'] == 5.0 for d in all_data)
    True

    """
    for tag, update_dict in tags.items():
        tagged_docs = datasets.search(where("tags").test(lambda doc_tags: tag in doc_tags))
        for key, value in update_dict.items():
            for doc in tagged_docs:
                if isinstance(value, list):
                    # Lists are additive: concatenate onto whatever is there.
                    doc[key] = doc.get(key, []) + value
                elif not np.isscalar(value) and isinstance(value, dict):
                    # Dicts are merged into the existing mapping.
                    merged = doc.get(key, dict())
                    merged.update(value)
                    doc[key] = merged
                else:
                    # Scalars and anything else simply override.
                    doc[key] = value
                datasets.update(doc, doc_ids=[doc.doc_id])
def test_dataplot_plots_binary_equilibria_types():
    """dataplot should handle a single boundary (null) equilibrium, a tieline and a 3 phase equilibrium for a binary."""
    components = ['A', 'B']
    phase_names = ["PHASE_1", "PHASE_2", "PHASE_3"]
    conditions = {v.P: 101325, v.T: (0, 400, 40), v.X('B'): (0, 1, 0.01)}

    datasets = PickleableTinyDB(storage=MemoryStorage)
    datasets.insert(A_B_DATASET_BINARY_PHASE_EQUILIBRIA)

    # Only checks that plotting and adjusting the axes limits does not raise.
    axes = dataplot(components, phase_names, conditions, datasets)
    axes.set_xlim(0, 1)
    axes.set_ylim(0, 400)
def test_get_data_for_a_minimal_example():
    """Given a dataset and the configuration pertaining to that dataset, we should find the values."""
    sample_dataset = {
        "components": ["CU", "MG", "VA"],
        "phases": ["LAVES_C15"],
        "solver": {
            "mode": "manual",
            "sublattice_site_ratios": [2, 1],
            "sublattice_configurations": [
                ["CU", "MG"],
                ["MG", "CU"],
                ["MG", "MG"],
                ["CU", "CU"],
            ],
        },
        "conditions": {
            "P": 101325,
            "T": 298.15,
        },
        "output": "HM_FORM",
        "values": [[[-15720, 34720, 7000, 15500]]],
    }
    db = PickleableTinyDB(storage=MemoryStorage)
    db.insert(sample_dataset)

    comps = ['CU', 'MG', 'VA']
    phase_name = 'LAVES_C15'
    configuration = ('MG', 'CU')
    symmetry = None
    desired_props = ['HM_FORM']

    # The following lines replace "get_data" in a more functional form:
    # build the solver query, fetch the property data, then filter it.
    tuplified_symmetry = recursive_tuplify(symmetry) if symmetry else symmetry
    solver_query = tinydb.where('solver').test(
        symmetry_filter, configuration, tuplified_symmetry)
    found = get_prop_data(comps, phase_name, desired_props, db,
                          additional_query=solver_query)
    found = filter_configurations(found, configuration, symmetry)
    found = filter_temperatures(found)

    assert len(found) == 1
    entry = found[0]
    assert entry['components'] == comps
    assert entry['phases'][0] == phase_name
    assert entry['solver']['sublattice_site_ratios'] == [2, 1]
    assert entry['solver']['sublattice_configurations'] == (('MG', 'CU'), )
    assert entry['conditions']['P'] == 101325
    assert entry['conditions']['T'] == 298.15
    assert entry['output'] == 'HM_FORM'
    assert entry['values'] == np.array([[[34720.0]]])
def get_equilibrium_thermochemical_data(
        dbf: Database,
        comps: Sequence[str],
        phases: Sequence[str],
        datasets: PickleableTinyDB,
        model: Optional[Dict[str, Model]] = None,
        parameters: Optional[Dict[str, float]] = None,
        data_weight_dict: Optional[Dict[str, float]] = None,
) -> Sequence[EqPropData]:
    """
    Build an EqPropData for every matching equilibrium thermochemical dataset.

    Parameters
    ----------
    dbf : Database
        Database with parameters to fit
    comps : Sequence[str]
        List of pure element components used to find matching datasets.
    phases : Sequence[str]
        List of phases used to search for matching datasets.
    datasets : PickleableTinyDB
        Datasets that contain single phase data
    model : Optional[Dict[str, Type[Model]]]
        Dictionary phase names to pycalphad Model classes.
    parameters : Optional[Dict[str, float]]
        Mapping of parameter symbols to values.
    data_weight_dict : Optional[Dict[str, float]]
        Mapping of a data type (e.g. `HM` or `SM`) to a weight.

    Notes
    -----
    Matching datasets have components and phases that are subsets of the
    given ``comps`` and ``phases``.

    A dataset counts as equilibrium thermochemical data when it has no
    `solver` key, its output is not `ZPF` and its output does not contain
    `ACR` (those correspond to data types that cannot be calculated here).

    Returns
    -------
    Sequence[EqPropData]
    """
    def _components_allowed(dataset_comps):
        return set(dataset_comps).issubset(comps)

    def _phases_allowed(dataset_phases):
        return set(dataset_phases).issubset(set(phases))

    def _is_not_activity(output):
        # activity data not supported yet
        return 'ACR' not in output

    # data that isn't ZPF or non-equilibrium thermochemical
    not_zpf = where('output') != 'ZPF'
    has_no_solver = ~where('solver').exists()
    not_activity = where('output').test(_is_not_activity)
    comps_match = where('components').test(_components_allowed)
    phases_match = where('phases').test(_phases_allowed)
    query = not_zpf & has_no_solver & not_activity & comps_match & phases_match

    # 1:1 correspondence with each dataset
    return [
        build_eqpropdata(dataset, dbf, model=model, parameters=parameters,
                         data_weight_dict=data_weight_dict)
        for dataset in datasets.search(query)
    ]
def test_get_data_for_a_minimal_example():
    """Given a dataset and the configuration pertaining to that dataset, we should find the values."""
    sample_dataset = {
        "components": ["CU", "MG", "VA"],
        "phases": ["LAVES_C15"],
        "solver": {
            "mode": "manual",
            "sublattice_site_ratios": [2, 1],
            "sublattice_configurations": [
                ["CU", "MG"],
                ["MG", "CU"],
                ["MG", "MG"],
                ["CU", "CU"],
            ],
        },
        "conditions": {
            "P": 101325,
            "T": 298.15,
        },
        "output": "HM_FORM",
        "values": [[[-15720, 34720, 7000, 15500]]],
    }
    db = PickleableTinyDB(storage=MemoryStorage)
    db.insert(sample_dataset)

    comps = ['CU', 'MG', 'VA']
    phase_name = 'LAVES_C15'
    configuration = ('MG', 'CU')
    symmetry = None
    desired_props = ['HM_FORM']

    found = get_data(comps, phase_name, configuration, symmetry, db, desired_props)

    # Exactly one dataset should match, reduced to the requested configuration.
    assert len(found) == 1
    entry = found[0]
    assert entry['components'] == comps
    assert entry['phases'][0] == phase_name
    assert entry['solver']['sublattice_site_ratios'] == [2, 1]
    assert entry['solver']['sublattice_configurations'] == (('MG', 'CU'), )
    assert entry['conditions']['P'] == 101325
    assert entry['conditions']['T'] == 298.15
    assert entry['output'] == 'HM_FORM'
    assert entry['values'] == np.array([[[34720.0]]])
def test_weighting_invariance():
    """Test that weights do not affect model selection using perfect L0 and L1 cases."""
    phase_models = {
        "components": ["AL", "B"],
        "phases": {
            "ALPHA": {
                "sublattice_model": [["AL", "B"]],
                "sublattice_site_ratios": [1],
            },
        },
    }

    L0_data = {
        "components": ["AL", "B"],
        "phases": ["ALPHA"],
        "solver": {
            "sublattice_site_ratios": [1],
            "sublattice_occupancies": [[[0.5, 0.5]]],
            "sublattice_configurations": [[["AL", "B"]]],
            "mode": "manual",
        },
        "conditions": {
            "P": 101325,
            "T": 298.15,
        },
        "output": "HM_MIX",
        "values": [[[-1000]]],
    }

    L1_data = {
        "components": ["AL", "B"],
        "phases": ["ALPHA"],
        "solver": {
            "sublattice_site_ratios": [1],
            "sublattice_occupancies": [[[0.25, 0.75]], [[0.5, 0.5]], [[0.75, 0.25]]],
            "sublattice_configurations": [[["AL", "B"]], [["AL", "B"]], [["AL", "B"]]],
            "mode": "manual",
        },
        "conditions": {
            "P": 101325,
            "T": 298.15,
        },
        "output": "HM_MIX",
        "values": [[[-1000.0, 0, 1000.0]]],
    }

    def fit_and_report(data):
        """Fit to a single dataset in a fresh database; print and return (dbf, L parameters)."""
        db = PickleableTinyDB(storage=MemoryStorage)
        db.insert(data)
        fitted = generate_parameters(phase_models, db, 'SGTE91', 'linear')
        db.close()
        found = fitted._parameters.search(where('parameter_type') == 'L')
        print([f"L{p['parameter_order']}: {p['parameter']}" for p in found])
        print({str(p['parameter']): fitted.symbols[str(p['parameter'])] for p in found})
        return fitted, found

    # Perfect L0, no weight
    dbf, params = fit_and_report(L0_data)
    assert len(params) == 1
    assert dbf.symbols['VV0000'] == -4000

    # Perfect L0, with weight
    L0_data['weight'] = 0.1  # lower weight
    dbf, params = fit_and_report(L0_data)
    assert len(params) == 1
    assert dbf.symbols['VV0000'] == -4000

    # Perfect L1, no weight
    dbf, params = fit_and_report(L1_data)
    assert len(params) == 2
    assert np.isclose(dbf.symbols['VV0000'], 1000 * 32 / 3)  # L1
    assert np.isclose(dbf.symbols['VV0001'], 0)  # L0

    # Perfect L1, with weight
    L1_data['weight'] = 0.1  # lower weight
    dbf, params = fit_and_report(L1_data)
    # TODO: sometimes the presence of L0 terms can be flaky
    # assert len(params) == 2
    assert np.isclose(dbf.symbols['VV0000'], 1000 * 32 / 3)  # L1
def get_zpf_data(dbf: Database, comps: Sequence[str], phases: Sequence[str],
                 datasets: PickleableTinyDB, parameters: Dict[str, float],
                 model: Optional[Dict[str, Type[Model]]] = None):
    """
    Return the ZPF data used in the calculation of ZPF error

    Parameters
    ----------
    dbf : Database
        Database passed to ``unpack_components``, ``filter_phases``,
        ``instantiate_models`` and ``build_phase_records``.
    comps : list
        List of active component names
    phases : list
        List of phases to consider
    datasets : espei.utils.PickleableTinyDB
        Datasets that contain single phase data
    parameters : dict
        Dictionary mapping symbols to optimize to their initial values
    model : Optional[Dict[str, Type[Model]]]
        Dictionary phase names to pycalphad Model classes.

    Returns
    -------
    list
        List of data dictionaries with keys ``weight``, ``phase_regions`` and ``dataset_reference``.

    Notes
    -----
    A dataset is selected when its ``output`` is ``'ZPF'``, its components
    are a subset of ``comps`` and it shares at least one phase with ``phases``.
    ``weight`` defaults to 1.0 when the dataset does not define it. Each
    selected dataset must have a ``reference`` key, because it is read with
    a plain subscript.

    """
    # Select ZPF datasets whose components are covered by `comps` and that
    # mention at least one of the candidate phases.
    desired_data = datasets.search(
        (tinydb.where('output') == 'ZPF') &
        (tinydb.where('components').test(lambda x: set(x).issubset(comps))) &
        (tinydb.where('phases').test(
            lambda x: len(set(phases).intersection(x)) > 0)))

    zpf_data = []  # 1:1 correspondence with each dataset
    for data in desired_data:
        # 'VA' is always added to the dataset's components
        data_comps = list(set(data['components']).union({'VA'}))
        species = sorted(unpack_components(dbf, data_comps), key=str)
        data_phases = filter_phases(dbf, species, candidate_phases=phases)
        models = instantiate_models(dbf, species, data_phases, model=model,
                                    parameters=parameters)
        # assumed N, P, T state variables
        phase_recs = build_phase_records(dbf, species, data_phases,
                                         {v.N, v.P, v.T}, models,
                                         parameters=parameters,
                                         build_gradients=True,
                                         build_hessians=True)
        # Sample candidate points once per phase; they are subsampled per vertex below
        all_phase_points = {
            phase_name: _sample_phase_constitution(models[phase_name],
                                                   point_sample, True, 50)
            for phase_name in data_phases
        }
        all_regions = data['values']
        conditions = data['conditions']
        phase_regions = []
        # Each phase_region is one set of phases in equilibrium (on a tie-line),
        # e.g. [["ALPHA", ["B"], [0.25]], ["BETA", ["B"], [0.5]]]
        for idx, phase_region in enumerate(all_regions):
            # Extract the conditions for entire phase region
            pot_conds = _extract_pot_conds(conditions, idx)
            pot_conds.setdefault(v.N, 1.0)  # Add v.N condition, if missing
            # Extract all the phases and compositions from the tie-line points
            vertices = []
            for vertex in phase_region:
                phase_name, comp_conds, disordered_flag = _extract_phases_comps(
                    vertex)
                # Construct single-phase points satisfying the conditions for each phase in the region
                mod = models[phase_name]
                composition = _compute_vertex_composition(
                    data_comps, comp_conds)
                if np.any(np.isnan(composition)):
                    # We can't construct points because we don't have a known composition
                    has_missing_comp_cond = True
                    phase_points = None
                elif _phase_is_stoichiometric(mod):
                    # Composition is known, but no points are subsampled for this phase
                    has_missing_comp_cond = False
                    phase_points = None
                else:
                    has_missing_comp_cond = False
                    # Only sample points that have an average mass residual within tol
                    tol = 0.02
                    phase_points = _subsample_phase_points(
                        phase_recs[phase_name], all_phase_points[phase_name],
                        composition, tol)
                    # NOTE(review): this validation is an assert, so it is skipped under `python -O`
                    assert phase_points.shape[
                        0] > 0, f"phase {phase_name} must have at least one set of points within the target tolerance {pot_conds} {comp_conds}"
                vtx = RegionVertex(phase_name, composition, comp_conds,
                                   phase_points, phase_recs, disordered_flag,
                                   has_missing_comp_cond)
                vertices.append(vtx)
            region = PhaseRegion(vertices, pot_conds, species, data_phases,
                                 models)
            phase_regions.append(region)

        # One summary entry per dataset
        data_dict = {
            'weight': data.get('weight', 1.0),
            'phase_regions': phase_regions,
            'dataset_reference': data['reference']
        }
        zpf_data.append(data_dict)
    return zpf_data
def datasets_db():
    """
    Yield a clean, in-memory PickleableTinyDB for datasets.

    The database is closed once the consumer is done with it. The ``finally``
    clause guarantees the close also happens when the generator is closed
    early or an exception is thrown into it at the ``yield``.

    Yields
    ------
    PickleableTinyDB
    """
    db = PickleableTinyDB(storage=MemoryStorage)
    try:
        yield db
    finally:
        db.close()
def generate_parameters(phase_models, datasets, ref_state, excess_model,
                        ridge_alpha=None, aicc_penalty_factor=None, dbf=None):
    """Generate parameters from given phase models and datasets

    Phases are fit one at a time, in alphabetical order, against the datasets
    that list exactly that phase and contain a ``solver`` key.

    Parameters
    ----------
    phase_models : dict
        Dictionary of components and phases to fit.
    datasets : PickleableTinyDB
        database of single- and multi-phase to fit.
    ref_state : str
        String of the reference data to use, e.g. 'SGTE91' or 'SR2016'
    excess_model : str
        String of the type of excess model to fit to, e.g. 'linear'
        NOTE(review): this argument is not referenced in the function body.
    ridge_alpha : float
        Value of the :math:`\\alpha` hyperparameter used in ridge regression. Defaults to None,
        which falls back to ordinary least squares regression.
        For now, the parameter is applied to all features.
    aicc_penalty_factor : dict
        Map of phase name to feature to a multiplication factor for the AICc's parameter penalty.
    dbf : Database
        Initial pycalphad Database that can have parameters that would not be fit by ESPEI

    Returns
    -------
    pycalphad.Database

    Notes
    -----
    NumPy's print ``linewidth`` is widened while fitting so logged arrays
    print on one line. The caller's print options are restored on exit, also
    when fitting raises.

    """
    # Widen NumPy's line width so logged arrays print on one line. The context
    # manager restores the previous print options instead of forcing a
    # hard-coded width, and does so even if an exception propagates.
    with np.printoptions(linewidth=sys.maxsize):
        _log.info('Generating parameters.')
        _log.trace('Found the following user reference states: %s',
                   espei.refdata.INSERTED_USER_REFERENCE_STATES)
        refdata = getattr(espei.refdata, ref_state)
        aliases = extract_aliases(phase_models)
        dbf = initialize_database(phase_models, ref_state, dbf)
        # Fit phases in alphabetic order so the VV#### counter is consistent between runs
        for phase_name, phase_data in sorted(phase_models['phases'].items(),
                                             key=operator.itemgetter(0)):
            if phase_name not in dbf.phases:
                continue
            symmetry = phase_data.get('equivalent_sublattices', None)
            # Filter datasets by thermochemical data for this phase
            dataset = tinydb.Query()
            phase_filtered_datasets = PickleableTinyDB(
                storage=tinydb.storages.MemoryStorage)
            single_phase_thermochemical_query = (
                (dataset.phases == [phase_name])  # TODO: aliases support
                & dataset.solver.exists())
            phase_filtered_datasets.insert_multiple(
                datasets.search(single_phase_thermochemical_query))
            phase_fit(dbf, phase_name, symmetry, phase_filtered_datasets,
                      refdata, ridge_alpha, aicc_penalty=aicc_penalty_factor,
                      aliases=aliases)
        _log.info('Finished generating parameters.')
    return dbf
def get_zpf_data(dbf: Database, comps: Sequence[str], phases: Sequence[str],
                 datasets: PickleableTinyDB, parameters: Dict[str, float]):
    """
    Return the ZPF data used in the calculation of ZPF error

    Parameters
    ----------
    dbf : Database
        Database passed to ``unpack_components``, ``filter_phases``,
        ``instantiate_models``, ``build_phase_records`` and ``PhaseRegion``.
    comps : list
        List of active component names
    phases : list
        List of phases to consider
    datasets : espei.utils.PickleableTinyDB
        Datasets that contain single phase data
    parameters : dict
        Dictionary mapping symbols to optimize to their initial values

    Returns
    -------
    list
        List of data dictionaries with keys ``weight``, ``data_comps``,
        ``phase_regions`` and ``dataset_reference``. ``data_comps`` are the
        components for the data in question. ``phase_regions`` are the ZPF
        phases, state variables and compositions. ``dataset_reference`` is the
        dataset's ``reference`` entry.

    Notes
    -----
    A dataset is selected when its ``output`` is ``'ZPF'``, its components
    are a subset of ``comps`` and it shares at least one phase with ``phases``.
    Regions with fewer than two entries are skipped. ``weight`` defaults to
    1.0 when the dataset does not define it. Each selected dataset must have
    a ``reference`` key, because it is read with a plain subscript.

    """
    # Select ZPF datasets whose components are covered by `comps` and that
    # mention at least one of the candidate phases.
    desired_data = datasets.search(
        (tinydb.where('output') == 'ZPF') &
        (tinydb.where('components').test(lambda x: set(x).issubset(comps))) &
        (tinydb.where('phases').test(
            lambda x: len(set(phases).intersection(x)) > 0)))

    zpf_data = []  # 1:1 correspondence with each dataset
    for data in desired_data:
        # 'VA' is always added to the dataset's components
        data_comps = list(set(data['components']).union({'VA'}))
        species = sorted(unpack_components(dbf, data_comps), key=str)
        data_phases = filter_phases(dbf, species, candidate_phases=phases)
        models = instantiate_models(dbf, species, data_phases,
                                    parameters=parameters)
        all_regions = data['values']
        conditions = data['conditions']
        phase_regions = []
        # Each phase_region is one set of phases in equilibrium (on a tie-line),
        # e.g. [["ALPHA", ["B"], [0.25]], ["BETA", ["B"], [0.5]]]
        for idx, phase_region in enumerate(all_regions):
            # We need to construct a PhaseRegion by matching up phases/compositions to the conditions
            if len(phase_region) < 2:
                # Skip single-phase regions for fitting purposes
                continue
            # Extract the conditions for entire phase region
            region_potential_conds = extract_conditions(conditions, idx)
            # `or 1.0` also replaces a falsy stored value (None or 0), not only a missing key
            region_potential_conds[v.N] = region_potential_conds.get(
                v.N) or 1.0  # Add v.N condition, if missing
            # Extract all the phases and compositions from the tie-line points
            region_phases, region_comp_conds, phase_flags = extract_phases_comps(
                phase_region)
            # One build_phase_records call per entry of region_comp_conds, using the
            # region's potential conditions merged with that entry's composition conditions
            region_phase_records = [
                build_phase_records(dbf, species, data_phases, {
                    **region_potential_conds,
                    **comp_conds
                }, models, parameters=parameters, build_gradients=True,
                                    build_hessians=True)
                for comp_conds in region_comp_conds
            ]
            phase_regions.append(
                PhaseRegion(region_phases, region_potential_conds,
                            region_comp_conds, phase_flags, dbf, species,
                            data_phases, models, region_phase_records))

        # One summary entry per dataset
        data_dict = {
            'weight': data.get('weight', 1.0),
            'data_comps': data_comps,
            'phase_regions': phase_regions,
            'dataset_reference': data['reference']
        }
        zpf_data.append(data_dict)
    return zpf_data