def __init__(self, **kwargs):
    """
    Set up a 3-body map.

    The fixed descriptors of the map are stored on the instance, the
    parent initializer is then run with ``kwargs``, the bounds are reset
    through ``set_bounds(None, None)``, and finally the species code and
    the ``kv3`` name are derived from ``self.species``.
    """

    self.bodies = 3
    self.grid_dim = 3
    self.kernel_name = "threebody"
    self.pred_perm = [[0, 1, 2], [1, 0, 2]]

    super().__init__(**kwargs)

    # initialize bounds
    self.set_bounds(None, None)

    # Chemical symbols of the first three species entries, underscore-joined
    spc = self.species
    self.species_code = "_".join(Z_to_element(spc[idx]) for idx in range(3))
    self.kv3name = f"kv3_{self.species_code}"
def test_Z_to_element():
    """Z_to_element accepts ints and numeric strings, rejects other strings."""
    # Atomic numbers 1 through 117 must all map to a string
    for atomic_number in range(1, 118):
        symbol = Z_to_element(atomic_number)
        assert isinstance(symbol, str)

    # Spot checks, mixing integer and string inputs
    expected_symbols = {1: "H", 6: "C", "8": "O", "118": "Og"}
    for given, symbol in expected_symbols.items():
        assert Z_to_element(given) == symbol

    # A non-numeric string is rejected
    with raises(ValueError):
        Z_to_element("a")
def test_Z_to_element():
    """Exercise Z_to_element with valid and invalid inputs."""
    atomic_numbers = range(1, 118)
    for z in atomic_numbers:
        assert isinstance(Z_to_element(z), str)

    # Inputs given as ints or as numeric strings, with their symbols
    inputs = [1, 6, '8', '118']
    symbols = ['H', 'C', 'O', 'Og']
    for given, expected in zip(inputs, symbols):
        assert Z_to_element(given) == expected

    with raises(ValueError):
        Z_to_element('a')
def training_statistics(self) -> dict:
    """
    Summarize the current training data of every expert.

    The returned dictionary contains the total environment count ("N"),
    the count of each expert ("N_<i>"), the distinct species found in the
    training environments ("species"), and the number of environments per
    species ("envs_by_species").

    :return: dictionary of summary statistics
    """

    stats = {"N": 0}
    symbols = []

    for expert in range(self.n_experts):
        n_envs = self.n_envs_prev[expert]
        stats["N"] += n_envs
        stats[f"N_{expert}"] = n_envs

        # One symbol per (environment, label) pair of this expert
        symbols.extend(
            Z_to_element(env.structure.coded_species[env.atom])
            for env, _ in zip(
                self.training_data[expert], self.training_labels[expert]
            )
        )

    # Summarize the relevant information
    stats["species"] = list(set(symbols))
    stats["envs_by_species"] = dict(Counter(symbols))

    return stats
def update_gp_and_print(
    self,
    frame: Structure,
    train_atoms: List[int],
    uncertainties: List[int] = None,
    train: bool = True,
):
    """
    Add the selected atoms of a frame to the GP training set and log it.

    Nothing happens when ``train_atoms`` is empty. Otherwise the atoms are
    reported grouped by element, the uncertainties are logged, and the GP
    database is updated unless the model is mapped. When ``train`` is
    True the GP is re-trained after the update.

    :param frame: Structure to train on
    :param train_atoms: Index atoms to train on
    :param uncertainties: Uncertainties to print, pass in [] to silence;
        None falls back to ``frame.stds`` of the training atoms
    :param train: Train or not
    :return: None
    """

    if not train_atoms:
        return

    # Group added atoms by species for easier output
    symbols = [Z_to_element(frame.coded_species[idx]) for idx in train_atoms]
    atoms_by_symbol = {symbol: [] for symbol in set(symbols)}
    for idx, symbol in zip(train_atoms, symbols):
        atoms_by_symbol[symbol].append(idx)

    logger = logging.getLogger(self.logger_name)
    summary = json.dumps(atoms_by_symbol, cls=NumpyEncoder)
    logger.info(f"Adding atom(s) {summary} to the training set.")

    shown = frame.stds[train_atoms] if uncertainties is None else uncertainties
    if shown is not None and len(shown) != 0:
        logger.info(f"Uncertainties: {shown}.")

    # NOTE(review): this is logged before update_db runs below, so the
    # statistics shown here do not yet include the atoms being added.
    stats_dump = json.dumps(self.gp.training_statistics)
    logger.info(f"New GP Statistics: {stats_dump}\n")

    # update gp model; handling differently if it's an MGP
    if self.gp_is_mapped:
        logger.warning("Warning: Adding data to an MGP is not yet supported.")
        return

    frame_energy = frame.energy if self.include_energies else None
    self.gp.update_db(
        frame, frame.forces, custom_range=train_atoms, energy=frame_energy
    )

    if train:
        self.train_gp()
def __init__(self, **kwargs):
    """
    Set up a 2-body map.

    The fixed descriptors of the map are stored on the instance, the
    parent initializer is then run with ``kwargs``, the bounds are reset
    through ``set_bounds(None, None)``, and the species code is derived
    from the first two entries of ``self.species``.
    """

    self.bodies = 2
    self.grid_dim = 1
    self.kernel_name = "twobody"
    self.pred_perm = [[0]]

    super().__init__(**kwargs)

    # initialize bounds
    self.set_bounds(None, None)

    # Chemical symbols of the pair, underscore-joined
    spc = self.species
    self.species_code = "_".join(Z_to_element(spc[idx]) for idx in (0, 1))
def test_to_xyz(varied_test_struc):
    """Check the plain and the extended xyz output of a structure."""
    struc = varied_test_struc

    # --- plain xyz: symbol followed by the three coordinates ---
    plain_lines = struc.to_xyz(
        extended_xyz=False,
        print_stds=False,
        print_forces=False,
        print_max_stds=False,
    ).split("\n")

    assert len(plain_lines) - 2 == len(struc)

    # NOTE(review): the [2:-1] slice leaves the final atom line unchecked,
    # given the length assertion above.
    for idx, atom_line in enumerate(plain_lines[2:-1]):
        tokens = atom_line.split()
        assert tokens[0] == Z_to_element(int(struc.species_labels[idx]))
        for axis in range(3):
            assert float(tokens[1 + axis]) == struc.positions[idx][axis]

    # --- extended xyz with stds, forces and max stds ---
    full_lines = struc.to_xyz(True, True, True, True).split("\n")

    assert len(full_lines) - 2 == len(struc)

    for idx, atom_line in enumerate(full_lines[2:-1]):
        tokens = atom_line.split()
        assert tokens[0] == Z_to_element(int(struc.species_labels[idx]))

        # Columns 1-3: positions, 4-6: stds, 7-9: forces
        for axis in range(3):
            assert float(tokens[1 + axis]) == struc.positions[idx][axis]
        for axis in range(3):
            assert float(tokens[4 + axis]) == struc.stds[idx][axis]
        for axis in range(3):
            assert float(tokens[7 + axis]) == struc.forces[idx][axis]

        # Column 10: largest std component of the atom
        assert float(tokens[10]) == np.max(struc.stds[idx])
def update_gp_and_print(self, frame: Structure, train_atoms: List[int],
                        uncertainties: List[int] = None,
                        train: bool = True):
    """
    Update the internal GP model training set with a list of training
    atoms indexing atoms within the frame. If train is True, re-train
    the GP by optimizing hyperparameters.

    The added atoms are logged grouped by element. Uncertainties are
    logged as well unless an empty sequence is passed in.

    :param frame: Structure to train on
    :param train_atoms: Index atoms to train on
    :param uncertainties: Uncertainties to print, pass in [] to silence.
        When None, ``frame.stds`` of the training atoms is printed;
        any other value is printed as given.
    :param train: Train or not
    :return: None
    """

    # Group added atoms by species for easier output
    added_species = [Z_to_element(frame.coded_species[at])
                     for at in train_atoms]
    added_atoms = {spec: [] for spec in set(added_species)}

    for atom, spec in zip(train_atoms, added_species):
        added_atoms[spec].append(atom)

    logger = logging.getLogger(self.logger_name)
    logger.info('Adding atom(s) '
                f'{json.dumps(added_atoms,cls=NumpyEncoder)}'
                ' to the training set.')

    # Fall back to the frame's own uncertainties only when the caller
    # supplied none. Previously a non-empty argument was also replaced,
    # so explicitly passed uncertainties were never the ones printed.
    if uncertainties is None:
        uncertainties = frame.stds[train_atoms]

    if len(uncertainties) != 0:
        logger.info(f'Uncertainties: '
                    f'{uncertainties}.')

    # update gp model; handling differently if it's an MGP
    if not self.mgp:
        self.gp.update_db(frame, frame.forces, custom_range=train_atoms)

        if train:
            self.train_gp()

    else:
        logger.warning("Warning: Adding data to an MGP is not yet "
                       "supported.")
def training_statistics(self) -> dict:
    """
    Summarize the current training data in a dictionary.

    The result holds the number of training environments ("N"), the
    distinct species found in them ("species"), and the number of
    environments per species ("envs_by_species").

    :return: dictionary of summary statistics
    """

    stats = {"N": len(self.training_data)}

    # Count all of the present species in the atomic env. data
    symbols = [
        Z_to_element(env.structure.coded_species[env.atom])
        for env, _ in zip(self.training_data, self.training_labels)
    ]

    # Summarize the relevant information
    stats["species"] = list(set(symbols))
    stats["envs_by_species"] = dict(Counter(symbols))

    return stats
def write_gp_dft_comparison(
    self,
    curr_step,
    frame,
    start_time,
    dft_forces,
    dft_energy,
    error,
    local_energies=None,
    KE=None,
    mgp=False,
    cell=None,
    stress=None,
):
    """Write the GP / DFT comparison of one frame to the logfile.

    :param curr_step: current timestep
    :param frame: Structure object that contains the current GP
        calculation results.
    :param start_time: start time for time profiling
    :param dft_forces: list of forces computed by DFT
    :param dft_energy: total energy computed by DFT
    :param error: list of force differences between DFT and GP prediction
    :param local_energies: local atomic energy
    :param KE: total kinetic energy
    :param mgp: if True, the force column header reads "MGP" instead of "GP"
    :param cell: print the unit cell of the structure
    :param stress: print the stress acting on the cell (not implemented;
        a truthy value raises NotImplementedError)
    :return:
    """

    def three_columns(vector):
        # Three components, each in a 10-wide field followed by a space
        return "".join(f"{vector[axis]:10.5} " for axis in range(3))

    # Mark if a frame had DFT forces with an asterisk
    pieces = [f"\n*-Frame: {curr_step}"]

    # Construct Header line
    pieces.append("\nEl Position (A) \t\t\t\t ")
    if mgp:
        pieces.append("M")
    pieces.append("GP Force (ev/A) \t\t\t\t")
    pieces.append("Std. Dev (ev/A) \t\t\t\t")
    pieces.append("DFT Force (ev/A) \t\t\t\t \n")

    # Construct atom-by-atom description
    for atom in range(len(frame.positions)):
        pieces.append(f"{frame.species_labels[atom]} ")
        pieces.append(three_columns(frame.positions[atom]))
        pieces.append("\t")
        pieces.append(three_columns(frame.forces[atom]))
        pieces.append("\t")
        pieces.append(three_columns(frame.stds[atom]))
        pieces.append("\t")
        pieces.append(three_columns(dft_forces[atom]))
        pieces.append("\n")
    pieces.append("\n")

    # Print stress & cell related parameters
    if cell is not None:
        rounded_cell = np.round(cell, 4)
        pieces.append(f"cell: {[list(vec) for vec in rounded_cell]} \n")

    if stress:
        raise NotImplementedError

    # Compute errors and errors by species
    mae = np.nanmean(error) * 1000
    mac = np.mean(np.abs(dft_forces)) * 1000
    pieces.append(f"mean absolute error: {mae:.2f} meV/A\n")
    pieces.append(f"mean absolute dft component: {mac:.2f} meV/A\n")

    # Compact one-line summary; it is assembled but not written anywhere
    # (the write call is commented out at the end of this method).
    stat = f"{curr_step} {mae:.2} {mac:.2}"

    species = [Z_to_element(Z) for Z in set(frame.coded_species)]
    mae_per_species = dict.fromkeys(species, 0)
    count_per_species = dict.fromkeys(species, 0)

    for atom in range(frame.nat):
        ele = Z_to_element(frame.coded_species[atom])
        atom_error = np.sum(error[atom, :])
        # Atoms whose error contains NaN are left out of the averages
        if np.isnan(atom_error):
            continue
        mae_per_species[ele] += atom_error
        count_per_species[ele] += 1

    pieces.append("mae per species\n")
    for ele in species:
        n_counted = count_per_species[ele]
        if n_counted > 0:
            mae_per_species[ele] /= n_counted * 3
            mae_per_species[ele] *= 1000  # Put in meV/A
            pieces.append(f"type {ele} mae: {mae_per_species[ele]:.2f} meV/A\n")
        stat += f" {mae_per_species[ele]:.2f}"

    # calculate potential and total energy
    if local_energies is None:
        pot_en = float("nan")
    else:
        pot_en = np.sum(local_energies)
        tot_en = KE + pot_en
        pieces.append(
            f"potential energy: {pot_en:10.6} eV (DFT: {dft_energy} eV\n"
        )
        pieces.append(f"total energy: {tot_en:10.6} eV \n")
        stat += f" {pot_en:10.6} {tot_en:10.6}"

    if self.print_as_xyz:
        self.write_xyz_config(
            curr_step,
            frame,
            forces=frame.forces,
            stds=frame.stds,
            dft_forces=dft_forces,
            dft_energy=dft_energy,
            predict_energy=pot_en,
        )

    log = logging.getLogger(self.basename + "log")
    log.info("".join(pieces))
    self.write_wall_time(start_time)

    # stat += f' {dt}\n'
    # logging.getLogger('stat').write(stat)

    if self.always_flush:
        log.handlers[0].flush()
def __init__(
    self,
    grid_params: dict,
    unique_species: list = [],
    GP: GaussianProcess = None,
    var_map: str = None,
    container_only: bool = True,
    lmp_file_name: str = "lmp",
    n_cpus: int = None,
    n_sample: int = 10,
):
    """
    Store the settings and create one map object per "*body" entry of
    ``grid_params``.

    :param grid_params: settings dictionary; the keys "load_grid",
        "update" and "lower_bound_relax" are read directly, and every key
        containing "body" must be "twobody" or "threebody" and hold the
        settings of that map
    :param unique_species: species given as element symbols (str) or
        atomic numbers (int); they are stored sorted by atomic number
    :param GP: not used in this method
    :param var_map: stored on the instance
    :param container_only: not used in this method
    :param lmp_file_name: stored on the instance
    :param n_cpus: stored on the instance
    :param n_sample: stored on the instance
    :raises KeyError: for a "*body" key other than "twobody"/"threebody"
    """

    # load all arguments as attributes
    # (the attribute set matters: self.__dict__ is forwarded to the maps)
    self.var_map = var_map
    self.lmp_file_name = lmp_file_name
    self.n_cpus = n_cpus
    self.n_sample = n_sample
    self.grid_params = grid_params
    self.species_labels = []
    self.coded_species = []
    self.hyps_mask = None
    self.cutoffs = None
    self.training_statistics = None

    # Collect both the symbol and the atomic number of each species
    labels = []
    codes = []
    for ele in unique_species:
        if isinstance(ele, str):
            labels.append(ele)
            codes.append(element_to_Z(ele))
        elif isinstance(ele, int):
            codes.append(ele)
            labels.append(Z_to_element(ele))
        else:
            print("element type not accepted", ele, type(ele))

    # Store them ordered by atomic number
    for idx in np.argsort(codes):
        self.coded_species.append(codes[idx])
        self.species_labels.append(labels[idx])

    self.load_grid = grid_params.get("load_grid", None)
    self.update = grid_params.get("update", False)
    self.lower_bound_relax = grid_params.get("lower_bound_relax", 0.1)

    self.maps = {}

    optional_xb_params = ["lower_bound", "upper_bound", "svd_rank"]
    for key in grid_params:
        if "body" not in key:
            continue

        if key == "twobody":
            mapxbody = Map2body
        elif key == "threebody":
            mapxbody = Map3body
        else:
            raise KeyError("Only 'twobody' & 'threebody' are allowed")

        xb_dict = grid_params[key]

        # set to 'auto' if the param is not given
        args = {name: xb_dict.get(name, "auto") for name in optional_xb_params}
        args["grid_num"] = xb_dict.get("grid_num", None)
        # Every user-supplied entry is passed on as well
        args.update(xb_dict)

        self.maps[key] = mapxbody(**args, **self.__dict__)
def run_passive_learning(
    self,
    frames: List[Structure] = (),
    environments: List[AtomicEnvironment] = (),
    max_atoms_per_frame: int = np.inf,
    post_training_iterations: int = 0,
    post_build_matrices: bool = False,
    max_elts_per_frame: Dict[str, int] = None,
    max_model_size: int = np.inf,
    max_model_elts: Dict[str, int] = None,
):
    """
    Load the GP with seed environments and with atoms sampled from the
    seed frames, before commencing the run through the AIMD trajectory.

    If you want to skip frames, splice the input as frames[::skip_n].
    If you want to randomize the frame order, try the random module's
    shuffle function.

    ALL environments passed in will be added. From each frame, atoms are
    sampled species by species, subject to the limits below.

    :param frames: structures to sample training atoms from
    :param environments: atomic environments added unconditionally
    :param max_atoms_per_frame: cap on the atoms taken from one frame
    :param post_training_iterations: if non-zero, train the GP afterwards
        with this value as ``max_iter``
    :param post_build_matrices: if True (and no post training is
        requested), call ``check_L_alpha`` on the GP afterwards
    :param max_elts_per_frame: per-species cap on the atoms taken from
        one frame; keys may be atomic numbers or element symbols
    :param max_model_size: cap on the total size of the training set
    :param max_model_elts: per-species cap on the training set; keys may
        be atomic numbers or element symbols
    :raises NotImplementedError: if the GP is mapped
    """

    if self.gp_is_mapped:
        raise NotImplementedError(
            "Passive learning not yet configured for MGP")

    # Work on copies: the key normalisation below adds entries, and the
    # caller's dictionaries should not be modified as a side effect.
    max_elts_per_frame = dict(max_elts_per_frame or {})
    max_model_elts = dict(max_model_elts or {})

    logger = logging.getLogger(self.logger_name)
    logger.debug("Beginning passive learning.")

    # If seed environments were passed in, add them to the GP.
    for env in environments:
        self.gp.add_one_env(env, env.force, train=False)

    # Ensure compatibility with number / symbol elemental notation
    for cur_dict in [max_elts_per_frame, max_model_elts]:
        for key in list(cur_dict.keys()):
            if isinstance(key, int):
                cur_dict[Z_to_element(key)] = cur_dict[key]
            elif isinstance(key, str):
                cur_dict[element_to_Z(key)] = cur_dict[key]

    # Main frame loop
    total_added = 0
    for frame in frames:
        current_stats = self.gp.training_statistics
        available_to_add = max_model_size - current_stats["N"]

        train_atoms = []
        for species_i in set(frame.coded_species):
            # Get a randomized set of atoms of species i from the frame
            # So that it is not always the lowest-indexed atoms chosen
            elt = Z_to_element(species_i)
            atoms_of_specie = frame.indices_of_specie(species_i)
            n_at = len(atoms_of_specie)

            # Determine how many to add based on user defined cutoffs.
            # available_to_add is decremented after every species, so it
            # already accounts for the atoms chosen in this frame and
            # must not have len(train_atoms) subtracted a second time.
            n_add = min(
                n_at,
                max_elts_per_frame.get(species_i, np.inf),
                max_atoms_per_frame - len(train_atoms),
                available_to_add,
                max_model_elts.get(elt, np.inf)
                - current_stats["envs_by_species"].get(elt, 0),
            )
            # Bounded above by n_at, so the cast is safe; sample() needs
            # an integer even if a limit was supplied as a float.
            n_add = int(max(0, n_add))

            train_atoms += sample(atoms_of_specie, n_add)
            available_to_add -= n_add
            total_added += n_add

        self.update_gp_and_print(
            frame=frame,
            train_atoms=train_atoms,
            uncertainties=[],
            train=False,
        )

    logger.info(f"Added {total_added} atoms to "
                "GP.\n"
                "Current GP Statistics: "
                f"{json.dumps(self.gp.training_statistics)} ")

    if post_training_iterations:
        logger.debug("Now commencing pre-run training of GP (which has "
                     "non-empty training set)")
        time0 = time.time()
        self.train_gp(max_iter=post_training_iterations)
        logger.debug(f"Done train_gp {time.time() - time0}")
    elif post_build_matrices:
        logger.debug(
            "Now commencing pre-run set up of GP (which has non-empty training set)"
        )
        time0 = time.time()
        self.gp.check_L_alpha()
        logger.debug(f"Done check_L_alpha {time.time() - time0}")
def from_dict(hyps_mask, verbose=False, init_spec=[]):
    """convert dictionary mask to HM instance

    This function is not tested yet

    A ParameterHelper is created and filled with species groups, kernel
    groups and their parameters, the noise value, and the cutoffs found
    in ``hyps_mask``.

    NOTE(review): the nesting of this function was reconstructed from a
    copy that had lost its indentation; please confirm the branch
    structure against the project source.

    :param hyps_mask: dictionary mask; the keys read here are "hyps",
        "cutoffs", "kernels", "nspecie", "specie_mask" (only when
        nspecie > 1), and per kernel "n<kernel>", "<kernel>_mask" and
        "<kernel>_cutoff_list"
    :param verbose: forwarded to the ParameterHelper constructor
    :param init_spec: if non-empty, only elements named in it get a
        species group (it is only read, never modified, here)
    :return: the populated ParameterHelper
    """

    Parameters.check_instantiation(
        hyps_mask["hyps"], hyps_mask["cutoffs"], hyps_mask["kernels"], hyps_mask
    )

    pm = ParameterHelper(verbose=verbose)

    # --- species groups ---
    nspecie = hyps_mask["nspecie"]
    if nspecie > 1:
        max_species = np.max(hyps_mask["specie_mask"])
        specie_mask = hyps_mask["specie_mask"]
        for i in range(max_species + 1):
            # Positions in the mask that are assigned to group i
            elelist = np.where(specie_mask == i)[0]
            if len(elelist) > 0:
                for ele in elelist:
                    # Position 0 is skipped
                    if ele != 0:
                        elename = Z_to_element(ele)
                        if len(init_spec) > 0:
                            if elename in init_spec:
                                pm.define_group("specie", i, [elename])
                        else:
                            pm.define_group("specie", i, [elename])
    else:
        # NOTE(review): if this branch really pairs with `nspecie > 1`,
        # `i` is not bound on this path and the call would raise a
        # NameError -- verify the intended group name.
        pm.define_group("specie", i, ["*"])

    # --- kernel and cutoff groups ---
    for kernel in hyps_mask["kernels"] + ParameterHelper.cutoff_types_keys:
        n = hyps_mask.get("n" + kernel, 0)
        if n >= 0:
            if kernel not in ParameterHelper.cutoff_types:
                # Hyperparameters of this kernel and their constraints
                chyps, copt = Parameters.get_component_hyps(
                    hyps_mask, kernel, constraint=True, noise=False
                )
                sig = chyps[0]
                ls = chyps[1]
                csig = copt[0]
                cls = copt[1]
                cutoff = hyps_mask["cutoffs"][kernel]
                pm.set_parameters("cutoff_" + kernel, cutoff)
                # Default: the kernel cutoff repeated once per entry of sig
                cutoff_list = hyps_mask.get(
                    f"{kernel}_cutoff_list", np.ones(len(sig)) * cutoff
                )
            elif kernel in ParameterHelper.cutoff_types and n > 1:
                cutoff_list = hyps_mask[
                    ParameterHelper.cutoff_types[kernel] + "_cutoff_list"
                ]

            if n > 1:
                # More than one group: go through every species combination
                all_specie = np.arange(nspecie)
                all_comb = combinations_with_replacement(
                    all_specie, ParameterHelper.ndim[kernel]
                )
                for comb in all_comb:
                    # Fold the combination into a single index, treating
                    # its entries as digits in base `nspecie`
                    mask_id = 0
                    for ele in comb:
                        mask_id += ele
                        mask_id *= nspecie
                    # Undo the extra multiplication of the last iteration
                    mask_id = mask_id // nspecie
                    ttype = hyps_mask[f"{kernel}_mask"][mask_id]
                    pm.define_group(f"{kernel}", f"{kernel}{ttype}", comb)

                    if (kernel not in ParameterHelper.cutoff_types) and (
                        kernel not in ParameterHelper.cutoff_types_values
                    ):
                        pm.set_parameters(
                            f"{kernel}{ttype}",
                            [sig[ttype], ls[ttype], cutoff_list[ttype]],
                            opt=[csig[ttype], cls[ttype]],
                        )
                    elif kernel in ParameterHelper.cutoff_types_values:
                        pm.set_parameters(
                            f"{kernel}{ttype}",
                            [sig[ttype], ls[ttype]],
                            opt=[csig[ttype], cls[ttype]],
                        )
                    else:
                        # NOTE(review): sig/ls/csig/cls are only assigned in
                        # the first branch above; this branch uses
                        # cutoff_list alone.
                        pm.set_parameters(f"{kernel}{ttype}", cutoff_list[ttype])
            else:
                # Single group: one wildcard group covering all species
                pm.define_group(
                    kernel, kernel, ["*"] * ParameterHelper.ndim[kernel]
                )
                if kernel not in ParameterHelper.cutoff_types_keys:
                    pm.set_parameters(
                        kernel, parameters=np.hstack([sig, ls, cutoff]), opt=copt
                    )
                else:
                    # NOTE(review): `cutoff` is only assigned in the
                    # non-cutoff branch of an earlier iteration or above;
                    # confirm it is always bound when this line runs.
                    pm.set_parameters(kernel, parameters=cutoff)

    # --- noise: the last entry of the hyperparameter array ---
    hyps = Parameters.get_hyps(hyps_mask)
    pm.set_parameters("noise", hyps[-1])

    # --- cutoffs ---
    if "cutoffs" in hyps_mask:
        cutoffs = hyps_mask["cutoffs"]
        for k in cutoffs:
            pm.set_parameters(f"cutoff_{k}", cutoffs[k])

    return pm
def to_xyz(
    self,
    extended_xyz: bool = True,
    print_stds: bool = False,
    print_forces: bool = False,
    print_max_stds: bool = False,
    print_energies: bool = False,
    predict_energy=None,
    dft_forces=None,
    dft_energy=None,
    timestep=-1,
    write_file: str = "",
    append: bool = False,
) -> str:
    """
    Convenience function which turns a structure into an extended .xyz
    file; useful for further input into visualization programs like
    VESTA or Ovito. Can be saved to an output file via write_file.

    The first line holds the atom count. In extended mode the second line
    holds the lattice, optional energies / timestep, and a ``Properties``
    entry listing the per-atom columns in the order they are written;
    otherwise the second line is empty. One line per atom follows. The
    returned string has no trailing newline.

    :param extended_xyz: write the extended header and extra columns
    :param print_stds: Print the stds associated with the structure.
    :param print_forces: print the forces of the structure
    :param print_max_stds: print the largest std component of each atom
    :param print_energies: print the local energies; silently disabled
        in extended mode when ``self.local_energies`` is None
    :param predict_energy: written as ``PE=`` in the header when truthy
    :param dft_forces: per-atom DFT forces appended as the last columns
    :param dft_energy: written as ``DFT_PE=`` in the header if not None
    :param timestep: written as ``Timestep=`` in the header if positive
    :param write_file: path to write the string to (plus a final newline)
    :param append: append to ``write_file`` instead of overwriting it
    :return: the xyz string
    """
    species_list = [Z_to_element(x) for x in self.coded_species]
    xyz_str = f"{len(self.coded_species)} \n"

    # Add header line with info about lattice and properties if extended
    # xyz option is called.
    if extended_xyz:
        cell = self.cell

        xyz_str += f'Lattice="{cell[0,0]} {cell[0,1]} {cell[0,2]}'
        xyz_str += f" {cell[1,0]} {cell[1,1]} {cell[1,2]}"
        xyz_str += f' {cell[2,0]} {cell[2,1]} {cell[2,2]}"'
        if timestep > 0:
            xyz_str += f" Timestep={timestep}"
        if predict_energy:
            xyz_str += f" PE={predict_energy}"
        if dft_energy is not None:
            xyz_str += f" DFT_PE={dft_energy}"

        # The key was previously misspelled ("Proprties") and opened a
        # double quote that was never closed, which left the header
        # malformed for extended-xyz readers.
        xyz_str += " Properties=species:S:1:pos:R:3"

        if print_energies and self.local_energies is None:
            print_energies = False

        # Declared in the same order as the columns are written below
        # (local_energy comes before max_std in each atom line).
        if print_stds:
            xyz_str += ":stds:R:3"
        if print_forces:
            xyz_str += ":forces:R:3"
        if print_energies:
            xyz_str += ":local_energy:R:1"
        if print_max_stds:
            xyz_str += ":max_std:R:1"
        if dft_forces is not None:
            xyz_str += ":dft_forces:R:3"
    xyz_str += "\n"

    # The optional columns below are only written in extended mode
    show_stds = extended_xyz and print_stds
    show_forces = extended_xyz and print_forces
    show_energies = extended_xyz and print_energies
    show_max_stds = extended_xyz and print_max_stds

    stds = self.stds if (show_stds or show_max_stds) else None
    forces = self.forces if show_forces else None
    local_energies = self.local_energies if show_energies else None

    atom_lines = []
    for i, pos in enumerate(self.positions):
        # Write positions
        line = f"{species_list[i]} {pos[0]} {pos[1]} {pos[2]}"

        # If extended XYZ: Add in extra information
        if show_stds:
            line += f" {stds[i,0]} {stds[i,1]} {stds[i,2]}"
        if show_forces:
            line += f" {forces[i,0]} {forces[i,1]} {forces[i,2]}"
        if show_energies:
            line += f" {local_energies[i]}"
        if show_max_stds:
            line += f" {np.max(stds[i,:])} "
        # NOTE(review): the DFT forces are appended in plain mode too,
        # as before -- confirm that this is intended.
        if dft_forces is not None:
            line += f" {dft_forces[i, 0]} {dft_forces[i,1]} {dft_forces[i, 2]}"

        atom_lines.append(line)

    # Newlines go between atom lines only, never after the last one
    xyz_str += "\n".join(atom_lines)

    # Write to file, optionally
    if write_file:
        with open(write_file, "a" if append else "w") as f:
            f.write(xyz_str)
            f.write("\n")

    return xyz_str
def test_passive_learning():
    """Run passive learning with environments, frames, and size limits."""
    hyps = np.array([
        3.75996759e-06,
        1.53990678e-02,
        2.50624782e-05,
        5.07884426e-01,
        1.70172923e-03,
    ])
    the_gp = GaussianProcess(
        kernel_name="2+3_mc",
        hyps=hyps,
        cutoffs=np.array([5, 3]),
        hyp_labels=["l2", "s2", "l3", "s3", "n0"],
        maxiter=1,
        opt_algorithm="L-BFGS-B",
    )

    frames = Structure.from_file(
        path.join(TEST_FILE_DIR, "methanol_frames.json"))
    envs = AtomicEnvironment.from_file(
        path.join(TEST_FILE_DIR, "methanol_envs.json"))

    cur_gp = deepcopy(the_gp)
    tt = TrajectoryTrainer(frames=None, gp=cur_gp)

    # TEST ENVIRONMENT ADDITION
    envs_species = {Z_to_element(env.ctype) for env in envs}
    tt.run_passive_learning(environments=envs, post_build_matrices=False)

    stats = cur_gp.training_statistics
    assert stats["N"] == len(envs)
    assert set(cur_gp.training_statistics["species"]) == envs_species

    # TEST FRAME ADDITION: ALL ARE ADDED
    cur_gp = deepcopy(the_gp)
    tt.gp = cur_gp
    tt.run_passive_learning(frames=frames, post_build_matrices=False)

    n_atoms_total = sum(len(fr) for fr in frames)
    assert len(cur_gp.training_data) == n_atoms_total

    # TEST FRAME ADDITION: MAX OUT MODEL SIZE AT 1
    cur_gp = deepcopy(the_gp)
    tt.gp = cur_gp
    tt.run_passive_learning(frames=frames,
                            max_model_size=1,
                            post_training_iterations=1)
    assert len(cur_gp.training_data) == 1

    # TEST FRAME ADDITION: EXCLUDE OXYGEN, LIMIT CARBON TO 1, 1 H PER FRAME
    cur_gp = deepcopy(the_gp)
    tt.gp = cur_gp
    model_limits = {"O": 0, "C": 1, "H": 5}
    frame_limits = {"H": 1}
    tt.run_passive_learning(
        frames=frames,
        max_model_elts=model_limits,
        max_elts_per_frame=frame_limits,
        post_build_matrices=False,
    )

    assert "O" not in cur_gp.training_statistics["species"]
    assert cur_gp.training_statistics["envs_by_species"]["C"] == 1
    assert cur_gp.training_statistics["envs_by_species"]["H"] == 5