def get_init_configs(system, init_configs=None, n=10, method_name=None):
    """
    Return the configurations from which active learning is started

    Supplied configurations are handed straight back when every one of them
    has an energy. Otherwise n random configurations are generated from the
    system and, if a method name is given, evaluated with the matching
    parallel_<method_name> function and saved

    :param system: (gt.System) System used to generate random configurations
    :param init_configs: (gt.ConfigurationSet | None) Existing configurations,
                         only used if all of them have an energy
    :param n: (int) Number of random configurations to generate
    :param method_name: (str | None) Name of the method used to evaluate the
                        generated configurations. If None then no energies
                        are calculated
    :return: (gt.Data) or the unmodified init_configs
    """
    usable = (init_configs is not None
              and all(cfg.energy is not None for cfg in init_configs))
    if usable:
        logger.info(f'Initialised with {len(init_configs)} configurations '
                    f'all with defined energy')
        return init_configs

    # Nothing usable was supplied, so random configurations are required.
    # Start from the largest sensible separation between molecules
    max_vdw = max(get_vdw_radius(symbol) for symbol in system.atom_symbols())
    ideal_dist = 2*max_vdw - 0.5    # Desired minimum distance in Å

    # Step the threshold down in 0.2 increments until at least one in ten
    # random placements succeeds
    n_trials = 10
    dist = ideal_dist+0.2

    while True:
        dist -= 0.2
        n_success = 0

        for _ in range(n_trials):
            try:
                system.random(min_dist_threshold=dist)
            except ex.RandomiseFailed:
                continue
            n_success += 1

        p_acc = n_success / 10
        logger.info(f'Generated configurations with p={p_acc:.2f} with a '
                    f'minimum distance of {dist:.2f}')

        if p_acc >= 0.1:
            break

    # Keep drawing with the accepted threshold until n have been collected
    generated = gt.Data(name='init_configs')
    while len(generated) < n:
        try:
            generated += system.random(min_dist_threshold=dist,
                                       with_intra=True)
        except ex.RandomiseFailed:
            continue

    logger.info(f'Added {len(generated)} configurations with min dist = '
                f'{dist:.3f} Å')

    if method_name is None:
        logger.warning('Have no method - not evaluating energies')
        return generated

    # Evaluate every configuration with the requested method, in parallel
    run_method = getattr(generated, f'parallel_{method_name.lower()}')
    run_method()

    generated.save()
    return generated
def test_gap_train():
    """Load the random training set and, when GT_GAP is 'True', train a GAP"""
    xyz_path = os.path.join(here, 'data', 'rnd_training.xyz')

    system = gt.System(box_size=[10, 10, 10])
    training_data = gt.Data(name='test')
    training_data.load(system=system, filename=xyz_path)

    assert len(training_data) == 10
    assert len(training_data[0].atoms) == 31

    # Training is only attempted when explicitly enabled in the environment
    if os.environ.get('GT_GAP') != 'True':
        return

    gap = gt.GAP(name='test', system=system)
    gap.train(training_data)
def grid_configs(n_to_cube):
    """
    Build a grid of water configurations, preceded by a GPAW-run structure

    :param n_to_cube: (int) Number of points along each of the three
                      coordinates, so n^3 grid configurations are generated
    :return: (gt.Data) The structure passed through run_gpaw followed by the
             grid configurations
    """
    configs = gt.Data(name=f'grid_{n_to_cube}-cubed')

    # Include the minimum energy structure alongside the grid points
    minimum = get_h2o(r1=1.0, r2=1.0, r3=1.5)
    minimum.run_gpaw(max_force=0.01, n_cores=4)
    configs += minimum

    r1_axis = np.linspace(0.8, 1.5, n_to_cube)
    r2_axis = np.linspace(0.8, 1.5, n_to_cube)
    r3_axis = np.linspace(1.0, 2.5, n_to_cube)

    # r3 varies fastest, then r2, then r1
    grid_points = ((r1, r2, r3)
                   for r1 in r1_axis
                   for r2 in r2_axis
                   for r3 in r3_axis)

    for r1, r2, r3 in grid_points:
        configs += get_h2o(r1, r2, r3)

    return configs
def test_load_no_box():
    """
    Check loading without a system or box: the training data loads with
    energies and default charge/multiplicity, while a file with an empty
    Lattice entry raises LoadingFailed
    """
    data = gt.Data()
    data.load(filename=os.path.join(here, 'data', 'rnd_training.xyz'))
    assert len(data) > 0

    for config in data:
        assert config.energy is not None
        assert config.charge == 0
        assert config.mult == 1

    # Write a minimal file whose Lattice entry is empty
    with open('tmp.xyz', 'w') as test_xyz:
        print('1\nLattice=""', file=test_xyz)

    try:
        with pytest.raises(LoadingFailed):
            configs = ConfigurationSet()
            configs.load(filename='tmp.xyz')
    finally:
        # Remove the scratch file even if the expectation above fails, so a
        # failing run does not leave tmp.xyz behind
        os.remove('tmp.xyz')
def grid_configs(n_to_cube):
    """
    Build a grid of water configurations, preceded by a structure read from
    h2o_min_revPBE0.xyz, and evaluate all of them with parallel_cp2k

    :param n_to_cube: (int) Number of points along each of the three
                      coordinates, so n^3 grid configurations are generated
    :return: (gt.Data) The structure from file followed by the grid
             configurations
    """
    configs = gt.Data(name=f'grid_{n_to_cube}-cubed')

    # Include the minimum energy structure, which must have loaded its atoms
    minimum = gt.Configuration('h2o_min_revPBE0.xyz', box=gt.Box([8, 8, 8]))
    assert minimum.atoms is not None
    configs += minimum

    r1_values = np.linspace(0.8, 1.5, n_to_cube)
    r2_values = np.linspace(0.8, 1.5, n_to_cube)
    r3_values = np.linspace(1.0, 2.5, n_to_cube)

    for r1 in r1_values:
        for r2 in r2_values:
            for r3 in r3_values:
                configs += get_h2o(r1, r2, r3)

    configs.parallel_cp2k()
    return configs
import gaptrain as gt
gt.GTConfig.n_cores = 4


if __name__ == '__main__':

    # System: the molecule from znh2o6.xyz (charge +2) plus 58 'h2o' solvent
    # molecules in a cubic box
    system = gt.System(box_size=[12.42, 12.42, 12.42])
    solute = gt.Molecule('znh2o6.xyz', charge=2)
    system.add_molecules(solute)
    system.add_solvent('h2o', n=58)

    # Separate intra-molecular GAPs for solvent and solute plus an
    # inter-molecular GAP
    intra_h2o_gap = gt.gap.SolventIntraGAP(name='water_intra_gap',
                                           system=system)

    intra_znh2o6_gap = gt.gap.SoluteIntraGAP(
                                     name='intra_znh2o6',
                                     system=system,
                                     molecule=gt.Molecule('znh2o6.xyz'))

    inter_gap = gt.InterGAP(name='inter',
                            system=system,
                            default_params=False)

    # Run 30 ps of dynamics from an equilibrated point
    start_frame = gt.Data('eqm_final_frame.xyz')[0]
    combined_gap = gt.gap.SSGAP(solute_intra=intra_znh2o6_gap,
                                solvent_intra=intra_h2o_gap,
                                inter=inter_gap)

    traj = gt.md.run_gapmd(configuration=start_frame,
                           gap=combined_gap,
                           temp=300,
                           dt=0.5,
                           interval=5,
                           ps=30,
                           n_cores=4)

    traj.save(filename='zn_h2o_traj')
# Run 1 ps molecular dynamics using the GAP at 300 K using a 0.5 fs time-step. # The initial configuration is methane located at a random position in the box traj = gt.md.run_gapmd( configuration=methane.random(), gap=gap, temp=500, # Kelvin dt=0.5, # fs interval=1, # frames fs=50, n_cores=4) # save the trajectory with no energies traj.save(filename='traj.xyz') # create sets of data from the trajectory containing predicted & true energies pred = gt.Data('traj.xyz') pred.parallel_gap(gap=gap) true = gt.Data('traj.xyz') true.parallel_orca() # and plot the energies over time --------------------------------------------- import numpy as np import matplotlib.pyplot as plt plt.plot( np.linspace(0, 50, len(pred)), # 0 -> 50 fs pred.energies() - np.min(true.energies()), # rel energies label='GAP', lw=2)
def train(system: gt.System,
          method_name: str,
          gap=None,
          max_time_active_fs=1000,
          min_time_active_fs=0,
          n_configs_iter=10,
          temp=300,
          active_e_thresh=None,
          active_method='diff',
          max_energy_threshold=None,
          validate=False,
          tau=None,
          tau_max=None,
          val_interval=None,
          max_active_iters=50,
          n_init_configs=10,
          init_configs=None,
          remove_intra_init_configs=True,
          fix_init_config=False,
          bbond_energy=None,
          fbond_energy=None,
          init_active_temp=None):
    """
    Train a system using active learning, by propagating dynamics using ML
    driven molecular dynamics (MD) and adding configurations where the error
    is above a threshold. Loop looks something like

    Generate configurations -> train a GAP -> run GAP-MD -> frames with error
                                   ^                               |
                                   |________ calc true ___________

    Active learning will loop until either (1) the iteration >
    max_active_iters or (2) no configurations are found to add or (3) if
    calculated τ = max(τ) where the loop will break out

    --------------------------------------------------------------------------
    :param system: (gt.system.System)

    :param method_name: (str) Name of a method to use as the ground truth e.g.
                        dftb, orca, gpaw

    :param gap: (gt.gap.GAP) GAP to train with the active learnt data, if
                None then one will be initialised by placing SOAPs on each
                heavy atom and defining the 'other' atom types included in the
                neighbour density by their proximity. Distance cutoffs default
                to 3.5 Å for all atoms

    :param max_time_active_fs: (float) Maximum propagation time in the active
                               learning loop. Default = 1 ps

    :param min_time_active_fs: (float) Minimum propagation time for an
                               active learnt configuration. Will be updated
                               so the error is only calculated where the GAP
                               is unlikely to be accurate

    :param n_configs_iter: (int) Number of configurations to generate per
                           active learning cycle

    :param temp: (float) Temperature in K to propagate active learning at -
                 higher is better for stability but requires more training

    :param active_method: (str) Method used to generate active learnt
                          configurations. One of ['diff', 'qbc', 'gp_var']

    :param active_e_thresh: (float) Threshold in eV (E_t) above which a
                            configuration is added to the potential. If None
                            then will use 1 kcal mol-1 molecule-1

                            1. active_method='diff': |E_0 - E_GAP| > E_t

                            2. active_method='qbc': σ(E_GAP1, E_GAP2...) > E_t

                            3. active_method='gp_var': σ^2_GAP(predicted) > E_t

    :param max_energy_threshold: (float) Maximum relative energy threshold for
                                 configurations to be added to the training
                                 data

    :param validate: (bool) Whether or not to validate the potential during
                     the training. Will, by default run a τ calculation with
                     an interval max_time_active_fs / 100, so that a maximum
                     of 50 calculations are run and a maximum time of
                     max(τ) = 5 x max_time_active_fs. Results are written to
                     '<gap.name>_tau.txt'

    :param tau: (gt.loss.Tau) A instance of the τ error metric, unused if no
                validation is performed. If None and validation is requested
                then one is initialised from 5 generated configurations

    :param tau_max: (float | None) Maximum τ_acc in fs if float, will break
                    out of the active learning loop if this value is reached.
                    If None then won't break out

    :param val_interval: (int) Interval in the active training loop at which
                         to run the validation. Defaults to
                         max_active_iters // 10 if validation is requested

    :param max_active_iters: (int) Maximum number of active learning
                             iterations to perform. Will break if we hit the
                             early stopping criteria

    :param n_init_configs: (int) Number of initial configurations to generate,
                           will be ignored if init_configs is not None

    :param init_configs: (gt.ConfigurationSet) A set of configurations from
                         which to start the active learning from

    :param remove_intra_init_configs: (bool) Whether the intramolecular
                                      component of the energy/force needs to
                                      be removed prior to training with
                                      init_configs. only applies for IIGAP
                                      and init_configs != None

    :param fix_init_config: (bool) Always start from the same initial
                            configuration for the active learning loop, if
                            False then the minimum energy structure is used.
                            Useful for TS learning, where dynamics should be
                            propagated from a saddle point not the minimum

    :param bbond_energy: (dict | None) Additional energy to add to a breaking
                         bond. e.g. bbond_energy={(0, 1): 0.1} Adds 0.1 eV
                         to the 'bond' between atoms 0 and 1 as velocities
                         shared between the atoms in the breaking bond
                         direction

    :param fbond_energy: (dict | None) As bbond_energy but in the direction to
                         form a bond

    :param init_active_temp: (float | None) Initial temperature for velocities
                             in the 'active' MD search for configurations

    :return: (gt.Data, gt.GAP)
    """
    init_configs = get_init_configs(init_configs=init_configs,
                                    n=n_init_configs,
                                    method_name=method_name,
                                    system=system)

    # Remove the intra-molecular energy if an intra+inter (II) GAP is being
    # trained
    do_remove_intra = isinstance(gap, gt.IIGAP)
    if do_remove_intra and remove_intra_init_configs:
        remove_intra(init_configs, gap=gap)

    # Initial configuration must have energies
    assert all(cfg.energy is not None for cfg in init_configs)

    if gap is None:
        gap = gt.GAP(name=unique_name('active_gap'), system=system)

    # Initialise a τ metric with default parameters
    if validate and tau is None:
        # 1 ps default maximum tau
        tau = gt.loss.Tau(configs=get_init_configs(system, n=5),
                          e_lower=0.043363 * len(system.molecules),
                          max_fs=tau_max if tau_max is not None else 1000)

    # Default to validating 10 times through the training
    if validate and val_interval is None:
        val_interval = max(max_active_iters // 10, 1)

    # Initialise training data
    train_data = gt.Data(name=gap.name)
    train_data += init_configs

    # and train an initial GAP
    gap.train(init_configs)

    if active_e_thresh is None:
        if active_method.lower() == 'diff':
            # 1 kcal mol-1 molecule-1
            active_e_thresh = 0.043363 * len(system.molecules)

        if active_method.lower() == 'qbc':
            # optimised on a small box of water. std dev. for total energy
            active_e_thresh = 1E-6 * len(system.molecules)

        if active_method.lower() == 'gp_var':
            # Threshold for maximum per-atom GP variance (eV atom^-1)
            active_e_thresh = 5E-5

    # Initialise the validation output file. The header contains a non-ASCII
    # character so the encoding is fixed rather than left to the locale
    tau_file = None
    if validate:
        tau_file = open(f'{gap.name}_tau.txt', 'w', encoding='utf-8')
        print('Iteration n_evals τ_acc / fs', file=tau_file)

    try:
        # Run the active learning loop, running iterative GAP-MD
        for iteration in range(max_active_iters):

            # Set the configuration from which GAP-MD will be run
            min_idx = int(np.argmin(train_data.energies()))
            init_config = (train_data[0] if fix_init_config
                           else train_data[min_idx])

            configs = get_active_configs(init_config,
                                         gap=gap,
                                         ref_method_name=method_name,
                                         method=str(active_method),
                                         n_configs=n_configs_iter,
                                         temp=temp,
                                         e_thresh=active_e_thresh,
                                         max_time_fs=max_time_active_fs,
                                         min_time_fs=min_time_active_fs,
                                         bbond_energy=bbond_energy,
                                         fbond_energy=fbond_energy,
                                         init_temp=init_active_temp)

            # Active learning finds no configurations,,
            if len(configs) == 0:
                # Calculate the final tau if we're running with validation
                if validate:
                    tau.calculate(gap=gap, method_name=method_name)
                    print(iteration, tau.value, sep='\t\t\t', file=tau_file)

                logger.info('No configs to add. Active learning = DONE')
                break

            # The next search only needs to start checking the error from
            # the earliest time at which any of these configurations failed
            min_time_active_fs = min(config.t0 for config in configs)
            logger.info(f'All active configurations reached t = '
                        f'{min_time_active_fs} fs before an error exceeded '
                        f'the threshold of {active_e_thresh:.3f} eV')

            if do_remove_intra:
                remove_intra(configs, gap=gap)

            train_data += configs

            # If required remove high-lying energy configuration from the data
            if max_energy_threshold is not None:
                train_data.remove_above_e(max_energy_threshold)

            # Retrain on these new data
            gap.train(train_data)

            # Print the accuracy
            if validate and iteration % val_interval == 0:

                tau.calculate(gap=gap, method_name=method_name)
                print(f'{iteration:<13g}'
                      f'{sum(config.n_evals for config in train_data):<13g}'
                      f'{tau.value}', sep='\t', file=tau_file)

                if np.abs(tau.value - tau.max_time) < 1:
                    logger.info('Reached the maximum tau. '
                                'Active learning = DONE')
                    break

    finally:
        # Always release the validation file, whether the loop finished,
        # broke out early or raised
        if tau_file is not None:
            tau_file.close()

    return train_data, gap
def get_active_configs(config, gap, ref_method_name, method='diff',
                       max_time_fs=1000, n_configs=10, temp=300, e_thresh=0.1,
                       min_time_fs=0, **kwargs):
    """
    Search for n_configs 'active' configurations, running the individual
    searches in a pool of gt.GTConfig.n_cores processes

    --------------------------------------------------------------------------
    :param config: (gt.Configuration) Configuration every search starts from

    :param gap: (gt.gap.GAP) GAP used for the MD. For method='qbc' an
                ensemble is built from it and trained first

    :param ref_method_name: (str) Name of the ground truth method

    :param method: (str) Search strategy, one of 'diff', 'qbc' or 'gp_var'
                   (case insensitive)

    :param max_time_fs: (float) Maximum propagation time in the active
                        learning loop. Default = 1 ps

    :param n_configs: (int) Number of searches to submit. Must not be smaller
                      than gt.GTConfig.n_cores

    :param temp: (float) Temperature in K for the intermediate MD

    :param e_thresh: (float) Energy threshold in eV above which an MD frame
                     is returned, i.e. E_t < |E_GAP - E_true| for
                     method='diff'

    :param min_time_fs: (float) Minimum propagation time before looking for
                        a configuration above the threshold. Only passed on
                        for method='diff'

    :param kwargs: Additional keyword arguments passed to the search function

    :return: (gt.ConfigurationSet) Configurations that were found and have an
             energy; may hold fewer than n_configs
    """
    n_cores = int(gt.GTConfig.n_cores)

    if int(n_configs) < n_cores:
        raise NotImplementedError('Active learning is only implemented using '
                                  'one core for each process. Please use '
                                  'n_configs >= gt.GTConfig.n_cores')

    configs = gt.Data()
    logger.info('Searching for "active" configurations with a threshold of '
                f'{e_thresh:.6f} eV')

    strategy = method.lower()

    if strategy == 'diff':
        function = get_active_config_diff
        args = (config, gap, temp, e_thresh, max_time_fs, ref_method_name,
                0, 0, min_time_fs)

    elif strategy == 'qbc':
        function = get_active_config_qbc
        # Query by committee needs several GAPs trained on the same data
        gap = gt.gap.GAPEnsemble(name=f'{gap.name}_ensemble', gap=gap)
        gap.train()
        args = (config, gap, temp, e_thresh, max_time_fs)

    elif strategy == 'gp_var':
        function = get_active_config_gp_var
        args = (config, gap, temp, e_thresh, max_time_fs)

    else:
        raise ValueError('Unsupported active method')

    logger.info(f'Using {gt.GTConfig.n_cores} processes')

    with Pool(processes=n_cores) as pool:

        # Submit every search up front, then collect them in order
        pending = [pool.apply_async(func=function, args=args, kwds=kwargs)
                   for _ in range(n_configs)]

        for job in pending:
            # A failed search can raise many different exceptions; log it
            # and carry on with the remaining ones
            try:
                found = job.get(timeout=None)
                if found is not None and found.energy is not None:
                    configs.add(found)

            except Exception as err:
                logger.error(f'Raised an exception in calculating the energy\n'
                             f'{err}')
                continue

    if strategy != 'diff':
        logger.info('Running reference calculations on configurations '
                    f'generated by {method}')
        configs.single_point(method_name=ref_method_name)

        # Each of these configurations cost one ground truth evaluation
        for active_config in configs:
            active_config.n_evals = 1

    return configs