def extract_from_ase(self, filename, init_config):
    """
    Load an ASE trajectory as a gt Trajectory by extracting positions

    Every frame of the ASE trajectory is stored as a copy of init_config
    with its energy and forces reset to None and the atomic coordinates
    replaced by the positions of that frame. If a 'tmp_energies.txt' file
    exists in the current working directory then the first column of line
    i is used as the energy of frame i and the file is removed afterwards

    -----------------------------------------------------------------------
    :param filename: (str) Name of the ASE trajectory file
    :param init_config: Configuration used as the template for every
                        frame. Must not be None
    """
    assert init_config is not None

    traj = ase_traj.Trajectory(filename)

    # Iterate through each frame (set of atoms) in the trajectory
    for atoms in traj:
        config = init_config.copy()
        config.energy = None
        config.forces = None

        # Set the coordinate of every atom in the configuration
        for i, position in enumerate(atoms.get_positions()):
            config.atoms[i].coord = position

        self._list.append(config)

    # Set the energies from the trajectory if possible
    if os.path.exists('tmp_energies.txt'):

        # The file must be closed before it is removed, otherwise the
        # handle is leaked (and removal can fail where open files are
        # locked)
        with open('tmp_energies.txt', 'r') as energies_file:
            for i, line in enumerate(energies_file):
                try:
                    self[i].energy = float(line.split()[0])

                except (TypeError, ValueError, IndexError):
                    logger.warning('Could not set the energy')

        os.remove('tmp_energies.txt')

    return None
def _set_soap(self, atom_symbols):
    """
    Set the SOAP parameters for every non-hydrogen element

    For each unique symbol a copy of the default SOAP parameters is stored
    in self.soap, with the "other" entry listing the elements for which
    the pair with this symbol has not already been added

    :param atom_symbols: (list(str)) Atomic symbols, one per atom
    """
    unique_symbols = set(atom_symbols)
    seen_pairs = []

    for symbol in unique_symbols:

        if symbol == 'H':
            logger.warning('Not adding SOAP on H')
            continue

        soap_params = GTConfig.gap_default_soap_params.copy()

        # Collect every element whose pair with this one is still missing,
        # checking both orderings of the pair
        neighbours = []
        for other in unique_symbols:
            if other + symbol in seen_pairs or symbol + other in seen_pairs:
                continue
            neighbours.append(other)

        # A single atom of this element has no neighbour of its own kind,
        # so the self pair is dropped
        if atom_symbols.count(symbol) == 1:
            neighbours.remove(symbol)

        seen_pairs.extend(symbol + other for other in neighbours)
        soap_params["other"] = neighbours

        if not neighbours:
            logger.info(f'Not adding SOAP to {symbol} - should be covered')
            continue

        self.soap[symbol] = soap_params

    return None
def sub_sampled_data(self, data, gap=None, random=True):
    """
    Select a portion of the data to train a GAP on as an nth of the full
    training data where n is the number of GAPs in this ensemble

    -----------------------------------------------------------------------
    :param data: (gaptrain.data.Data)

    :param gap: (gaptrain.gap.GAP | None) If given, the number of
                sub-sampled data is checked against its sparse points

    :param random: (bool) Whether to take a random sample. Only True is
                   implemented

    :return: Copy of data reduced to len(data) // n_gaps entries

    :raises RuntimeError: If there would be no data left, or fewer data
                          than any number of sparse points of the gap
    :raises NotImplementedError: If random is False
    """
    # Number of points that remain once the data is cut to an n-th
    n_data = len(data) // self.n_gaps()

    if n_data == 0:
        raise RuntimeError('Insufficient configurations to sub-sample')

    if gap is not None:
        if any(n_sparse > n_data for n_sparse in gap.params.n_sparses()):
            raise RuntimeError('Number of sub-sampled data must be greater'
                               ' than or equal to the number of sparse '
                               'points')
    else:
        logger.warning('Cannot check that the number of data is larger '
                       'than the number of sparse points')

    if not random:
        raise NotImplementedError

    # Only copy once every check has passed, leaving data itself untouched
    subset = data.copy()
    subset.remove_random(remainder=n_data)

    return subset
def loss(self, configs_a, configs_b, attr):
    """
    Compute the loss between two sets of configurations by applying
    self.function to the array of pairwise differences of an attribute

    -----------------------------------------------------------------------
    :param configs_a: (gaptrain.configurations.ConfigurationSet)

    :param configs_b: (gaptrain.configurations.ConfigurationSet)

    :param attr: (str) Attribute of a configuration to use in the loss
                 function e.g. energy or forces

    :return: Result of self.function, or None if the attribute is None on
             any of the configurations
    """
    assert len(configs_a) == len(configs_b)

    differences = []

    for config_a, config_b in zip(configs_a, configs_b):
        value_a = getattr(config_a, attr)
        value_b = getattr(config_b, attr)

        # Both values are required to form a difference
        if any(value is None for value in (value_a, value_b)):
            logger.warning(f'Cannot calculate loss for {attr} at least '
                           f'one value was None')
            return None

        differences.append(value_a - value_b)

    return self.function(np.array(differences))
def simulation_steps(dt, kwargs):
    """
    Calculate the number of simulation steps

    :param dt: (float) Timestep in fs
    :param kwargs: (dict) Must hold the simulation time under one of the
                   keys 'ps', 'fs' or 'ns', which are checked in that order
    :return: (int) Number of steps, which is at least 1
    :raises ValueError: If none of the time keys is present
    """
    if dt < 0.09 or dt > 5:
        logger.warning('Unexpectedly small or large timestep - is it in fs?')

    if 'ps' in kwargs:
        time_fs = 1E3 * kwargs['ps']

    elif 'fs' in kwargs:
        time_fs = kwargs['fs']

    elif 'ns' in kwargs:
        time_fs = 1E6 * kwargs['ns']

    else:
        raise ValueError('Simulation time not found')

    # Run at least one step
    n_steps = max(int(time_fs / dt), 1)

    # Report the number that is actually returned: the ratio is truncated
    # and clamped above, so a rounded time_fs / dt could differ from it
    logger.info(f'Running {n_steps} steps with a timestep of {dt} fs')

    return n_steps
def _calculate_single(self, init_config, gap, method_name):
    """
    Calculate a single τ_acc from one configuration

    GAP-MD is run in blocks, each restarted from the last frame of the
    previous one. For every frame the absolute difference between the
    ground truth and GAP energies in excess of self.e_l is accumulated,
    and the time at which the total exceeds self.e_t is returned

    :param init_config: (gt.Configuration)
    :param gap: (gt.GAP)
    :param method_name: (str) Ground truth method e.g. dftb, orca, gpaw
    :return: Time in fs: when the cumulative error passed self.e_t, the
             time reached if the ground truth evaluation failed, or
             self.max_time
    """
    total_error = 0
    elapsed_fs = 0

    block_time = self.interval_time * gt.GTConfig.n_cores
    step_interval = self.interval_time // self.dt

    while elapsed_fs < self.max_time:

        n_md_cores = min(gt.GTConfig.n_cores, 4)
        traj = gt.md.run_gapmd(init_config,
                               gap=gap,
                               temp=self.temp,
                               dt=self.dt,
                               interval=step_interval,
                               fs=block_time,
                               n_cores=n_md_cores)

        # Ground truth energies for this block
        try:
            traj.single_point(method_name=method_name)

        except ValueError:
            logger.warning('Failed to calculate single point energies with'
                           f' {method_name}. τ_acc will be underestimated '
                           f'by <{block_time}')
            return elapsed_fs

        # GAP predictions are evaluated on a copy of the same frames
        pred = traj.copy()
        pred.parallel_gap(gap=gap)

        logger.info(' ___ |E_true - E_GAP|/eV ___')
        logger.info(' t/fs err cumul(err)')

        for j, true_frame in enumerate(traj):
            e_error = np.abs(true_frame.energy - pred[j].energy)

            # Only the part of the error above the lower bound counts
            total_error += max(e_error - self.e_l, 0)
            elapsed_fs += self.dt * step_interval

            logger.info(f'{elapsed_fs:5.0f} '
                        f'{e_error:6.4f} '
                        f'{total_error:6.4f}')

            if total_error > self.e_t:
                return elapsed_fs

        # The next block continues from where this one finished
        init_config = traj[-1]

    logger.info(f'Reached max(τ_acc) = {self.max_time} fs')
    return self.max_time
def get_init_configs(system, init_configs=None, n=10, method_name=None):
    """
    Generate a set of initial configurations to use for active learning

    :param system: System providing atom_symbols() and random()
    :param init_configs: Existing configurations. Returned unchanged if
                         every one of them has an energy
    :param n: (int) Number of configurations to generate
    :param method_name: (str | None) Name of the method used to evaluate
                        the generated configurations. If None they are
                        returned without being evaluated
    :return: The initial configurations
    """
    have_energies = (init_configs is not None
                     and all(cfg.energy is not None for cfg in init_configs))

    if have_energies:
        logger.info(f'Initialised with {len(init_configs)} configurations '
                    f'all with defined energy')
        return init_configs

    # No usable configurations, so make random ones with the largest
    # possible minimum distance between molecules
    max_vdw = max(get_vdw_radius(symbol) for symbol in system.atom_symbols())
    ideal_dist = 2*max_vdw - 0.5      # Desired minimum distance in Å

    # Lower the distance threshold in steps of 0.2 Å until at least one in
    # ten attempts at a random configuration succeeds
    dist = ideal_dist+0.2

    while True:
        dist -= 0.2
        n_successes = 0

        for _ in range(10):
            try:
                system.random(min_dist_threshold=dist)
            except ex.RandomiseFailed:
                continue

            n_successes += 1

        p_acc = n_successes / 10
        logger.info(f'Generated configurations with p={p_acc:.2f} with a '
                    f'minimum distance of {dist:.2f}')

        if not p_acc < 0.1:
            break

    init_configs = gt.Data(name='init_configs')

    # Keep trying until the requested number has been generated
    while len(init_configs) < n:
        try:
            init_configs += system.random(min_dist_threshold=dist,
                                          with_intra=True)
        except ex.RandomiseFailed:
            pass

    logger.info(f'Added {len(init_configs)} configurations with min dist = '
                f'{dist:.3f} Å')

    if method_name is None:
        logger.warning('Have no method - not evaluating energies')
        return init_configs

    # Run the desired method in parallel across the configurations
    run_method = getattr(init_configs, f'parallel_{method_name.lower()}')
    run_method()

    init_configs.save()
    return init_configs
def save(self, filename=None, append=False):
    """
    Print this configuration as an extended xyz file. Every atom line
    holds the atom symbol followed by x, y, z and, if this configuration
    has forces, the x, y, z components of the force on that atom

    -----------------------------------------------------------------------
    :param filename: (str) Defaults to "name.xyz". A .xyz extension is
                     added if it is missing

    :param append: (bool) Append to the end of this exyz file?
    """
    if filename is None:
        filename = f'{self.name}.xyz'

    logger.info(f'Saving configuration as {filename}')

    len_a, len_b, len_c = self.box.size

    if self.energy is None:
        energy_str = ''
    else:
        energy_str = f'dft_energy={self.energy:.8f}'

    has_forces = self.forces is not None
    prop_str = 'Properties=species:S:1:pos:R:3'
    if has_forces:
        prop_str = prop_str + ':dft_forces:R:3'

    if not filename.endswith('.xyz'):
        logger.warning('Filename had no .xyz extension - adding')
        filename = filename + '.xyz'

    mode = 'a' if append else 'w'

    with open(filename, mode) as exyz_file:

        # Atom count then the comment line with the lattice, the per-atom
        # properties and the energy
        header = (f'{len(self.atoms)}\n'
                  f'Lattice="{len_a:.6f} 0.000000 0.000000 '
                  f'0.000000 {len_b:.6f} 0.000000 '
                  f'0.000000 0.000000 {len_c:.6f}" '
                  f'{prop_str} '
                  f'{energy_str}')
        exyz_file.write(header + '\n')

        for idx, atom in enumerate(self.atoms):
            x, y, z = atom.coord
            fields = f'{atom.label} {x:.5f} {y:.5f} {z:.5f} '

            if has_forces:
                fx, fy, fz = self.forces[idx]
                fields = fields + f'{fx:.5f} {fy:.5f} {fz:.5f}'

            exyz_file.write(fields + '\n')

    return None
def radius(self):
    """
    Calculate the radius of this species. For a single atom this is its
    van der Waals radius, otherwise it is half the maximum distance
    between two atoms plus the van der Waals radius of H

    :return: (float) Radius in Å
    """
    if self.n_atoms == 1:
        lone_atom = self.atoms[0]
        return get_vdw_radius(atom_label=lone_atom.label)

    positions = self.coordinates
    pairwise_distances = distance_matrix(positions, positions)
    largest_separation = np.max(pairwise_distances)

    logger.warning('Assuming hydrogen on the exterior in calculating the '
                   f'radius of {self.name}')

    half_extent = largest_separation / 2.0
    return half_extent + get_vdw_radius('H')
def __init__(self, filename, init_configuration=None, charge=None,
             mult=None, box=None):
    """
    Populate this trajectory from a file, with the loader chosen from the
    filename and the parameters that are given. The first match is used:

        'geo_end.xyz'              -> extract_from_dftb
        *.traj                     -> extract_from_ase
        charge, mult and box given -> load
        *.gro                      -> extract_from_gmx

    :param filename: (str)
    :param init_configuration: Configuration handed to the extract_from_*
                               loaders
    :param charge: (int | None)
    :param mult: (int | None)
    :param box: (None) or the box passed on to load
    """
    super().__init__()

    have_all_params = not any(prm is None for prm in (charge, mult, box))

    if filename == 'geo_end.xyz':
        self.extract_from_dftb(init_config=init_configuration)

    elif filename.endswith('.traj'):
        self.extract_from_ase(filename, init_config=init_configuration)

    elif have_all_params:
        self.load(filename, box=box, charge=charge, mult=mult)

    elif filename.endswith('.gro'):
        self.extract_from_gmx(filename, init_configuration)

    if len(self) == 0:
        logger.warning('Loaded an empty trajectory')
def __init__(self, name, system=None, default_params=True):
    """
    A Gaussian Approximation Potential

    :param name: (str)
    :param system: (gaptrain.systems.System | None)
    :param default_params: (bool) Whether to build the parameters from the
                           atomic symbols of the system. Without a system,
                           or if False, the parameters are built from an
                           empty list of symbols
    """
    self.name = name

    use_system_symbols = system is not None and default_params

    if not use_system_symbols:
        self.params = Parameters(atom_symbols=[])
        logger.warning('Initialised a GAP with no parameters. '
                       'gap.train not available')
    else:
        self.params = Parameters(atom_symbols=system.atom_symbols())

    self.training_data = None
def load(self, filename=None, system=None,
         box=None, charge=None, mult=None):
    """
    Load a set of configurations from an extended xyz file - needs to have
    a system to be able to assign a charge, multiplicity and box size.
    Will set the *true* values

    ----------------------------------------------------------------------
    :param system: (gaptrain.systems.System |
                    gaptrain.configuration.Configuration) If its box,
                   charge and mult are all defined they replace the box,
                   charge and mult arguments

    :param filename: (str) Filename to load configurations from if
                     None defaults to "name.xyz"

    :param box: (gaptrain.box.Box)

    :param charge: (int)

    :param mult: (int)

    :raises LoadingFailed: If the file does not exist or the number of
                           atoms of a configuration cannot be read
    """
    filename = f'{self.name}.xyz' if filename is None else filename

    if not os.path.exists(filename):
        raise ex.LoadingFailed(f'XYZ file for {filename} did not exist')

    if system is not None:
        # Test against None rather than for truthiness - the charge of a
        # neutral system is 0, which would otherwise count as undefined
        system_prms = (system.box, system.charge, system.mult)

        if all(prm is not None for prm in system_prms):
            logger.info('Setting box, charge and multiplicity from a conf')
            box, charge, mult = system_prms

    logger.info(f'Loading configuration set from {filename}')

    with open(filename, 'r') as xyz_file:
        lines = xyz_file.readlines()

    # Stride through the file and add configuration for each
    i = 0
    while i < len(lines):

        # Configurations may have different numbers of atoms
        try:
            n_atoms = int(lines[i].split()[0])

        # int() raises a ValueError for a non-numeric string
        except (TypeError, ValueError, IndexError) as err:
            raise ex.LoadingFailed('Could not read the number of atoms in '
                                   f'{filename}') from err

        # Atom count line + comment line + one line per atom
        stride = n_atoms + 2

        configuration = Configuration()
        configuration.load(file_lines=lines[i:i + stride],
                           box=box, charge=charge, mult=mult)

        self._list.append(configuration)
        i += stride

    if self.name is None or self.name == 'data':
        # Remove only the extension. str.rstrip treats its argument as a
        # set of characters so would also eat into e.g. 'xyz.xyz'
        if filename.endswith('.xyz'):
            self.name = filename[:-len('.xyz')]
        else:
            self.name = filename

        logger.warning(f'Set self.name to {self.name}')

    return None
def load(self, filename=None, file_lines=None, box=None, charge=None,
         mult=None):
    """
    Load a configuration from a file or a list of file lines

    ----------------------------------------------------------------------
    :param filename: (str) Filename to load configurations from if
                     None defaults to "name.xyz"

    :param file_lines: (list(str)) List of extended xyz file lines to
                       read from. Takes precedence over filename

    :param box: (gaptrain.box.Box) If None the box is read from the
                Lattice entry of the second line, if there is one

    :param charge: (int)

    :param mult: (int)

    :raises LoadingFailed: If the lines cannot be read or parsed, or the
                           number of atoms found is not the number declared
    """
    if filename is None and file_lines is None:
        try:
            with open(f'{self.name}.xyz', 'r') as xyz_file:
                file_lines = xyz_file.readlines()

        except IOError as err:
            raise ex.LoadingFailed('Could not load no file or file '
                                   'lines') from err

    if filename is not None and file_lines is None:
        with open(filename, 'r') as xyz_file:
            file_lines = xyz_file.readlines()

    self.charge = charge if charge is not None else self.charge
    self.mult = mult if mult is not None else self.mult
    self.box = box if box is not None else self.box

    # Atoms, true forces and energy
    n_atoms, atoms, forces = None, [], []

    # Grab the coordinates, energy and forces 0->n_atoms + 2 inclusive
    for j, line in enumerate(file_lines):

        if j == 0:
            # First thing should be the number of atoms
            try:
                n_atoms = int(line.split()[0])

            # int() raises a ValueError for a non-numeric string
            except (IndexError, TypeError, ValueError) as err:
                raise ex.LoadingFailed('Line 1 of the xyz file '
                                       'malformatted') from err
            continue

        if j == 1:
            if 'dft_energy' in line:
                # Grab the energy, which may be in any position in the
                # line
                for item in line.split():
                    if 'dft_energy=' in item:
                        self.energy = float(item.split('=')[-1])
                        break

            # Try and load the box
            if 'Lattice="' in line and box is None:
                try:
                    # Take what is between the quotes of the Lattice
                    # entry itself, which need not be the first quoted
                    # entry on the line
                    vec_string = line.split('Lattice="')[1].split('"')[0]
                    components = [float(val) for val in vec_string.split()]

                    # Expecting all the components of the lattice
                    # vectors, so for an orthorhombic box take the
                    # diagonal elements of the a, b, c vectors
                    self.box = gt.Box(size=[components[0],
                                            components[4],
                                            components[8]])

                except (TypeError, ValueError, IndexError) as err:
                    raise ex.LoadingFailed('Failed to load the '
                                           'box') from err
            continue

        # Everything after the second line should be an atom
        fields = line.split()

        if len(fields) < 4:
            logger.warning('Unexpected line break, assuming end of atoms')
            break

        atom_label, x, y, z = fields[:4]
        atoms.append(Atom(atom_label, x=x, y=y, z=z))

        # Only a line with exactly 7 columns has forces
        if len(fields) != 7:
            continue

        fx, fy, fz = fields[4:]
        forces.append(np.array([float(fx), float(fy), float(fz)]))

    # Default charge and multiplicity if there is a box but no charge
    if self.box is not None:

        if charge is None and self.charge is None:
            logger.warning('Found a box but no charge, defaulting to 0')
            self.charge = 0

        if mult is None and self.mult is None:
            logger.warning('Found a box but no multiplicity, '
                           'defaulting to 1')
            self.mult = 1

    if len(atoms) == 0 or n_atoms is None:
        raise ex.LoadingFailed('Found no atoms in the file')

    if len(atoms) != n_atoms:
        raise ex.LoadingFailed(f'Number of atoms declared {n_atoms} not '
                               f'equal to the number found {len(atoms)}')

    self.set_atoms(atoms=atoms)

    # Set the forces if there are some
    if len(forces) > 0:
        self.forces = np.array(forces)

    return None
def get_solvent(name):
    """
    Gets solvent molecule from solvent list

    :param name: (str) Name of the solvent
    :return: The first solvent in solvents with this name, or None if
             there is no match
    """
    return next((solvent for solvent in solvents if solvent.name == name),
                None)


# Generate Solvent objects for all molecules in solvent_lib
solvents = []

for filename in os.listdir(solvent_dir):

    if not filename.endswith('.xyz'):
        continue

    # Swap only the extension, so a '.xyz' elsewhere in the name is kept
    itp_filename = filename[:-len('.xyz')] + '.itp'
    itp_filepath = os.path.join(solvent_dir, itp_filename)

    if not os.path.exists(itp_filepath):
        logger.warning(f'Found solvent xyz file without associated '
                       f'itp {filename}')
        continue

    solvent = Solvent(xyz_filename=os.path.join(solvent_dir, filename),
                      gmx_itp_filename=itp_filepath)

    # The name is the filename without its extension. str.rstrip treats
    # its argument as a set of characters, so would also remove a trailing
    # 'i', 't' or 'p' that belongs to the name
    solvent.name = os.path.splitext(os.path.basename(itp_filepath))[0]
    solvents.append(solvent)
def get_active_config_diff(config, gap, temp, e_thresh, max_time_fs,
                           ref_method_name='dftb', curr_time_fs=0,
                           n_calls=0, extra_time_fs=0, **kwargs):
    """
    Given a configuration run MD with a GAP until the absolute error
    between the predicted and true values is above a threshold. While the
    error stays within the threshold the function calls itself again with
    the same configuration, one more call counted and the time advanced

    --------------------------------------------------------------------------
    :param config: (gt.Configuration)

    :param gap: (gt.GAP)

    :param e_thresh: (float) Threshold energy error (eV) above which the
                     configuration is returned

    :param temp: (float) Temperature to propagate GAP-MD

    :param max_time_fs: (float)

    :param ref_method_name: (str)

    :param curr_time_fs: (float)

    :param n_calls: (int) Number of times this function has been called

    :param extra_time_fs: (float) Some extra time to run initially e.g. as
                          the GAP is already likely to get to e.g. 100 fs,
                          so run that initially and don't run ground truth
                          evaluations. Not passed on to the repeated calls

    :param kwargs: Passed on to gt.md.run_gapmd

    :return: (gt.Configuration | None) None once the maximum time has
             been passed

    :raises ValueError: If temp or e_thresh is negative
    """
    if float(temp) < 0:
        raise ValueError('Cannot run MD with a negative temperature')

    if float(e_thresh) < 0:
        raise ValueError(f'Error threshold {e_thresh} must be positive (eV)')

    if extra_time_fs > 0:
        logger.info(f'Running an extra {extra_time_fs:.1f} fs of MD before '
                    f'calculating an error')

    # Later calls run for longer
    md_time_fs = 2 + n_calls**3 + float(extra_time_fs)

    gap_traj = gt.md.run_gapmd(config,
                               gap=gap,
                               temp=float(temp),
                               dt=0.5,
                               interval=4,
                               fs=md_time_fs,
                               n_cores=1,
                               **kwargs)

    # Actual initial time, given this function can be called multiple times
    start_time_fs = curr_time_fs + extra_time_fs
    for frame in gap_traj:
        frame.t0 = start_time_fs

    # Evaluate the error on the final frame
    error = calc_error(frame=gap_traj[-1], gap=gap,
                       method_name=ref_method_name)

    # And the number of ground truth evaluations for this configuration
    n_evals = n_calls + 1

    if error > 100 * e_thresh:
        logger.error('Huge error: 100x threshold, returning the first frame')
        first_frame = gap_traj[0]
        first_frame.single_point(method_name=ref_method_name, n_cores=1)
        first_frame.n_evals = n_evals + 1
        return first_frame

    if error > 10 * e_thresh:
        logger.warning('Error 10 x threshold! Taking the last frame less '
                       'than 10x the threshold')

        # Stride through only 10 frames to prevent very slow backtracking
        stride = max(1, len(gap_traj)//10)

        for frame in reversed(gap_traj[::stride]):
            # NOTE: error is reassigned here on purpose - the check after
            # this loop sees the value from the last frame evaluated
            error = calc_error(frame, gap=gap, method_name=ref_method_name)
            n_evals += 1

            if e_thresh < error < 10 * e_thresh:
                frame.n_evals = n_evals
                return frame

    if error > e_thresh:
        last_frame = gap_traj[-1]
        last_frame.n_evals = n_evals
        return last_frame

    if curr_time_fs + md_time_fs > max_time_fs:
        logger.info(f'Reached the maximum time {max_time_fs} fs, returning '
                    f'None')
        return None

    # The prediction is within the threshold, so call this function again
    # with t_0 incremented to the new time
    return get_active_config_diff(config, gap, temp, e_thresh, max_time_fs,
                                  curr_time_fs=curr_time_fs + md_time_fs,
                                  ref_method_name=ref_method_name,
                                  n_calls=n_calls+1,
                                  **kwargs)