Python Data Examples, gaptrain.Data Python Examples

Example #1

0

Show file

File: active.py Project: duartegroup/gap-train

def get_init_configs(system, init_configs=None, n=10, method_name=None):
    """Generate a set of initial configurations to use for active learning"""

    if init_configs is not None:

        if all(cfg.energy is not None for cfg in init_configs):
            logger.info(f'Initialised with {len(init_configs)} configurations '
                        f'all with defined energy')
            return init_configs

    # Initial configurations are not defined, so make some - will use random
    # with the largest maximum distance between molecules possible
    max_vdw = max(get_vdw_radius(symbol) for symbol in system.atom_symbols())
    ideal_dist = 2*max_vdw - 0.5    # Desired minimum distance in Å

    # Reduce the distance until there is a probability at least 0.1 that a
    # random configuration can be generated with that distance threshold
    p_acc, dist = 0, ideal_dist+0.2

    while p_acc < 0.1:
        n_generated_configs = 0
        dist -= 0.2                 # Reduce the minimum distance requirement

        for _ in range(10):
            try:
                _ = system.random(min_dist_threshold=dist)
                n_generated_configs += 1

            except ex.RandomiseFailed:
                continue

        p_acc = n_generated_configs / 10
        logger.info(f'Generated configurations with p={p_acc:.2f} with a '
                    f'minimum distance of {dist:.2f}')

    init_configs = gt.Data(name='init_configs')
    # Finally generate the initial configurations
    while len(init_configs) < n:
        try:
            init_configs += system.random(min_dist_threshold=dist,
                                          with_intra=True)
        except ex.RandomiseFailed:
            continue
    logger.info(f'Added {len(init_configs)} configurations with min dist = '
                f'{dist:.3f} Å')

    if method_name is None:
        logger.warning('Have no method - not evaluating energies')
        return init_configs

    # And run the desired method in parallel across them
    method = getattr(init_configs, f'parallel_{method_name.lower()}')
    method()

    init_configs.save()
    return init_configs

Example #2

0

Show file

File: test_gap.py Project: duartegroup/gap-train

def test_gap_train():

    system = gt.System(box_size=[10, 10, 10])

    training_data = gt.Data(name='test')
    training_data.load(system=system,
                       filename=os.path.join(here, 'data', 'rnd_training.xyz'))

    assert len(training_data) == 10
    assert len(training_data[0].atoms) == 31

    if 'GT_GAP' not in os.environ or not os.environ['GT_GAP'] == 'True':
        return

    # Run GAP train with the training data
    gap = gt.GAP(name='test', system=system)
    gap.train(training_data)

Example #3

0

Show file

def grid_configs(n_to_cube):
    """
    :param n_to_cube: (int) Generate n^3 configurations
    :return: (gt.Configuration)
    """

    configs = gt.Data(name=f'grid_{n_to_cube}-cubed')

    # Also add the minimum energy strucutre
    minimum = get_h2o(r1=1.0, r2=1.0, r3=1.5)
    minimum.run_gpaw(max_force=0.01, n_cores=4)
    configs += minimum

    for r1 in np.linspace(0.8, 1.5, n_to_cube):
        for r2 in np.linspace(0.8, 1.5, n_to_cube):
            for r3 in np.linspace(1.0, 2.5, n_to_cube):
                h2o = get_h2o(r1, r2, r3)
                configs += h2o

    return configs

Example #4

0

Show file

def test_load_no_box():

    data = gt.Data()
    data.load(filename=os.path.join(here, 'data', 'rnd_training.xyz'))
    assert len(data) > 0

    for config in data:

        assert config.energy is not None
        assert config.charge == 0
        assert config.mult == 1

    with open('tmp.xyz', 'w') as test_xyz:
        print('1\nLattice=""', file=test_xyz)

    with pytest.raises(LoadingFailed):
        configs = ConfigurationSet()
        configs.load(filename='tmp.xyz')

    os.remove('tmp.xyz')

Example #5

0

Show file

File: train_intra.py Project: duartegroup/gap-train

def grid_configs(n_to_cube):
    """
    :param n_to_cube: (int) Generate n^3 configurations
    :return: (gt.Configuration)
    """

    configs = gt.Data(name=f'grid_{n_to_cube}-cubed')

    # Also add the minimum energy strucutre
    minimum = gt.Configuration('h2o_min_revPBE0.xyz', box=gt.Box([8, 8, 8]))
    assert minimum.atoms is not None
    configs += minimum

    for r1 in np.linspace(0.8, 1.5, n_to_cube):
        for r2 in np.linspace(0.8, 1.5, n_to_cube):
            for r3 in np.linspace(1.0, 2.5, n_to_cube):
                h2o = get_h2o(r1, r2, r3)
                configs += h2o

    configs.parallel_cp2k()
    return configs

Example #6

0

Show file

import gaptrain as gt
gt.GTConfig.n_cores = 4

if __name__ == '__main__':

    system = gt.System(box_size=[12.42, 12.42, 12.42])
    system.add_molecules(gt.Molecule('znh2o6.xyz', charge=2))
    system.add_solvent('h2o', n=58)

    intra_h2o_gap = gt.gap.SolventIntraGAP(name='water_intra_gap',
                                           system=system)

    intra_znh2o6_gap = gt.gap.SoluteIntraGAP(
        name='intra_znh2o6', system=system, molecule=gt.Molecule('znh2o6.xyz'))
    inter_gap = gt.InterGAP(name='inter', system=system, default_params=False)

    # Run 30 ps of dynamics from an equilibrated point
    traj = gt.md.run_gapmd(configuration=gt.Data('eqm_final_frame.xyz')[0],
                           gap=gt.gap.SSGAP(solute_intra=intra_znh2o6_gap,
                                            solvent_intra=intra_h2o_gap,
                                            inter=inter_gap),
                           temp=300,
                           dt=0.5,
                           interval=5,
                           ps=30,
                           n_cores=4)

    traj.save(filename=f'zn_h2o_traj')

Example #7

0

Show file

# Run 1 ps molecular dynamics using the GAP at 300 K using a 0.5 fs time-step.
# The initial configuration is methane located at a random position in the box
traj = gt.md.run_gapmd(
    configuration=methane.random(),
    gap=gap,
    temp=500,  # Kelvin
    dt=0.5,  # fs
    interval=1,  # frames
    fs=50,
    n_cores=4)

# save the trajectory with no energies
traj.save(filename='traj.xyz')

# create sets of data from the trajectory containing predicted & true energies
pred = gt.Data('traj.xyz')
pred.parallel_gap(gap=gap)

true = gt.Data('traj.xyz')
true.parallel_orca()

# and plot the energies over time ---------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

plt.plot(
    np.linspace(0, 50, len(pred)),  # 0 -> 50 fs
    pred.energies() - np.min(true.energies()),  # rel energies
    label='GAP',
    lw=2)

Example #8

0

Show file

File: active.py Project: duartegroup/gap-train

def train(system: gt.System,
          method_name: str,
          gap=None,
          max_time_active_fs=1000,
          min_time_active_fs=0,
          n_configs_iter=10,
          temp=300,
          active_e_thresh=None,
          active_method='diff',
          max_energy_threshold=None,
          validate=False,
          tau=None,
          tau_max=None,
          val_interval=None,
          max_active_iters=50,
          n_init_configs=10,
          init_configs=None,
          remove_intra_init_configs=True,
          fix_init_config=False,
          bbond_energy=None,
          fbond_energy=None,
          init_active_temp=None):
    """
    Train a system using active learning, by propagating dynamics using ML
    driven molecular dynamics (MD) and adding configurations where the error
    is above a threshold. Loop looks something like

    Generate configurations -> train a GAP -> run GAP-MD -> frames with error
                                   ^                               |
                                   |________ calc true  ___________


    Active learning will loop until either (1) the iteration > max_active_iters
    or (2) no configurations are found to add or (3) if calculated τ = max(τ)
    where the loop will break out

    --------------------------------------------------------------------------
    :param system: (gt.system.System)

    :param method_name: (str) Name of a method to use as the ground truth e.g.
                        dftb, orca, gpaw

    :param gap: (gt.gap.GAP) GAP to train with the active learnt data, if
                None then one will be initialised by placing SOAPs on each
                heavy atom and defining the 'other' atom types included in the
                neighbour density by their proximity. Distance cutoffs default
                to 3.5 Å for all atoms

    :param max_time_active_fs: (float) Maximum propagation time in the active
                               learning loop. Default = 1 ps

    :param min_time_active_fs: (float) Minimum propagation time for an
                               active learnt configuration. Will be updated
                               so the error is only calculated where the GAP
                               is unlikely to be accurate

    :param n_configs_iter: (int) Number of configurations to generate per
                           active learning cycle

    :param temp: (float) Temperature in K to propagate active learning at -
                 higher is better for stability but requires more training


    :param active_method: (str) Method used to generate active learnt
                          configurations. One of ['diff', 'qbc', 'gp_var']

    :param active_e_thresh: (float) Threshold in eV (E_t) above which a
                            configuration is added to the potential. If None
                            then will use 1 kcal mol-1 molecule-1

                            1. active_method='diff': |E_0 - E_GAP| > E_t

                            2. active_method='qbc': σ(E_GAP1, E_GAP2...) > E_t

                            3. active_method='gp_var': σ^2_GAP(predicted) > E_t

    :param max_energy_threshold: (float) Maximum relative energy threshold for
                                 configurations to be added to the training
                                 data

    :param validate: (bool) Whether or not to validate the potential during
                     the training. Will, by default run a τ calculation with
                     an interval max_time_active_fs / 100, so that a maximum of
                     50 calculations are run and a maximum time of
                     max(τ) = 5 x max_time_active_fs

    :param tau: (gt.loss.Tau) A instance of the τ error metric, unused if no
                validation is performed. Otherwise

    :param tau_max: (float | None) Maximum τ_acc in fs if float, will break out
                    of the active learning loop if this value is reached. If
                    None then won't break out

    :param val_interval: (int) Interval in the active training loop at which to
                         run the validation. Defaults to max_active_iters // 10
                         if validation is requested

    :param max_active_iters: (int) Maximum number of active learning
                             iterations to perform. Will break if we hit the
                             early stopping criteria

    :param n_init_configs: (int) Number of initial configurations to generate,
                           will be ignored if init_configs is not None

    :param init_configs: (gt.ConfigurationSet) A set of configurations from
                         which to start the active learning from

    :param remove_intra_init_configs: (bool) Whether the intramolecular
                                      component of the energy/force needs to
                                      be removed prior to training with
                                      init_configs. only applies for IIGAP
                                      and init_configs != None

    :param fix_init_config: (bool) Always start from the same initial
                            configuration for the active learning loop, if
                            False then the minimum energy structure is used.
                            Useful for TS learning, where dynamics should be
                            propagated from a saddle point not the minimum

    :param bbond_energy: (dict | None) Additional energy to add to a breaking
                         bond. e.g. bbond_energy={(0, 1), 0.1} Adds 0.1 eV
                         to the 'bond' between atoms 0 and 1 as velocities
                        shared between the atoms in the breaking bond direction

    :param fbond_energy: (dict | None) As bbond_energy but in the direction to
                         form a bond


    :param init_active_temp: (float | None) Initial temperature for velocities
                             in the 'active' MD search for configurations

    :return: (gt.Data, gt.GAP)
    """
    init_configs = get_init_configs(init_configs=init_configs,
                                    n=n_init_configs,
                                    method_name=method_name,
                                    system=system)

    # Remove the intra-molecular energy if an intra+inter (II) GAP is being
    # trained
    do_remove_intra = isinstance(gap, gt.IIGAP)
    if do_remove_intra and remove_intra_init_configs:
        remove_intra(init_configs, gap=gap)

    # Initial configuration must have energies
    assert all(cfg.energy is not None for cfg in init_configs)

    if gap is None:
        gap = gt.GAP(name=unique_name('active_gap'), system=system)

    # Initialise a τ metric with default parameters
    if validate and tau is None:
        # 1 ps default maximum tau
        tau = gt.loss.Tau(configs=get_init_configs(system, n=5),
                          e_lower=0.043363 * len(system.molecules),
                          max_fs=tau_max if tau_max is not None else 1000)

    # Default to validating 10 times through the training
    if validate and val_interval is None:
        val_interval = max(max_active_iters // 10, 1)

    # Initialise training data
    train_data = gt.Data(name=gap.name)
    train_data += init_configs

    # and train an initial GAP
    gap.train(init_configs)

    if active_e_thresh is None:
        if active_method.lower() == 'diff':
            #                 1 kcal mol-1 molecule-1
            active_e_thresh = 0.043363 * len(system.molecules)

        if active_method.lower() == 'qbc':
            # optimised on a small box of water. std dev. for total energy
            active_e_thresh = 1E-6 * len(system.molecules)

        if active_method.lower() == 'gp_var':
            # Threshold for maximum per-atom GP variance (eV atom^-1)
            active_e_thresh = 5E-5

    # Initialise the validation output file
    if validate:
        tau_file = open(f'{gap.name}_tau.txt', 'w')
        print('Iteration    n_evals      τ_acc / fs', file=tau_file)

    # Run the active learning loop, running iterative GAP-MD
    for iteration in range(max_active_iters):

        # Set the configuration from which GAP-MD will be run
        min_idx = int(np.argmin(train_data.energies()))
        init_config = train_data[0] if fix_init_config else train_data[min_idx]

        configs = get_active_configs(init_config,
                                     gap=gap,
                                     ref_method_name=method_name,
                                     method=str(active_method),
                                     n_configs=n_configs_iter,
                                     temp=temp,
                                     e_thresh=active_e_thresh,
                                     max_time_fs=max_time_active_fs,
                                     min_time_fs=min_time_active_fs,
                                     bbond_energy=bbond_energy,
                                     fbond_energy=fbond_energy,
                                     init_temp=init_active_temp)

        # Active learning finds no configurations,,
        if len(configs) == 0:
            # Calculate the final tau if we're running with validation
            if validate:
                tau.calculate(gap=gap, method_name=method_name)
                print(iteration, tau.value, sep='\t\t\t', file=tau_file)

            logger.info('No configs to add. Active learning = DONE')
            break

        min_time_active_fs = min(config.t0 for config in configs)
        logger.info(f'All active configurations reached t = '
                    f'{min_time_active_fs} fs before an error exceeded the '
                    f'threshold of {active_e_thresh:.3f} eV')

        if do_remove_intra:
            remove_intra(configs, gap=gap)

        train_data += configs

        # If required remove high-lying energy configuration from the data
        if max_energy_threshold is not None:
            train_data.remove_above_e(max_energy_threshold)

        # Retrain on these new data
        gap.train(train_data)

        # Print the accuracy
        if validate and iteration % val_interval == 0:

            tau.calculate(gap=gap, method_name=method_name)
            print(f'{iteration:<13g}'
                  f'{sum(config.n_evals for config in train_data):<13g}'
                  f'{tau.value}', sep='\t', file=tau_file)

            if np.abs(tau.value - tau.max_time) < 1:
                logger.info('Reached the maximum tau. Active learning = DONE')
                break

    return train_data, gap

Example #9

0

Show file

File: active.py Project: duartegroup/gap-train

def get_active_configs(config, gap, ref_method_name, method='diff',
                       max_time_fs=1000, n_configs=10, temp=300, e_thresh=0.1,
                       min_time_fs=0, **kwargs):
    """
    Generate n_configs using on-the-fly active learning parallelised over
    GTConfig.n_cores

    --------------------------------------------------------------------------
    :param config: (gt.Configuration) Initial configuration to propagate from

    :param gap: (gt.gap.GAP) GAP to run MD with

    :param ref_method_name: (str) Name of the method to use as the ground truth

    :param method: (str) Name of the strategy used to generate new configurations

    :param max_time_fs: (float) Maximum propagation time in the active learning
                        loop. Default = 1 ps

    :param n_configs: (int) Number of configurations to generate

    :param temp: (float) Temperature in K to run the intermediate MD with

    :param e_thresh: (float) Energy threshold in eV above which the MD frame
                     is returned by the active learning function i.e
                     E_t < |E_GAP - E_true|  method='diff'

    :param min_time_fs: (float) Minimum propagation time in the active learning
                        loop. If non-zero then will run this amount of time
                        initially then look for a configuration with a
                        |E_0 - E_GAP| > e_thresh

    :param kwargs: Additional keyword arguments passed to the GAP MD function

    :return:(gt.ConfigurationSet)
    """
    if int(n_configs) < int(gt.GTConfig.n_cores):
        raise NotImplementedError('Active learning is only implemented using '
                                  'one core for each process. Please use '
                                  'n_configs >= gt.GTConfig.n_cores')
    results = []
    configs = gt.Data()
    logger.info('Searching for "active" configurations with a threshold of '
                f'{e_thresh:.6f} eV')

    if method.lower() == 'diff':
        function = get_active_config_diff
        args = (config, gap, temp, e_thresh, max_time_fs, ref_method_name,
                0, 0, min_time_fs)

    elif method.lower() == 'qbc':
        function = get_active_config_qbc
        # Train a few GAPs on the same data
        gap = gt.gap.GAPEnsemble(name=f'{gap.name}_ensemble', gap=gap)
        gap.train()

        args = (config, gap, temp, e_thresh, max_time_fs)

    elif method.lower() == 'gp_var':
        function = get_active_config_gp_var
        args = (config, gap, temp, e_thresh, max_time_fs)

    else:
        raise ValueError('Unsupported active method')

    logger.info(f'Using {gt.GTConfig.n_cores} processes')
    with Pool(processes=int(gt.GTConfig.n_cores)) as pool:

        for _ in range(n_configs):
            result = pool.apply_async(func=function, args=args, kwds=kwargs)
            results.append(result)

        for result in results:

            try:
                config = result.get(timeout=None)
                if config is not None and config.energy is not None:
                    configs.add(config)

            # Lots of different exceptions can be raised when trying to
            # generate an active config, continue regardless..
            except Exception as err:
                logger.error(f'Raised an exception in calculating the energy\n'
                             f'{err}')
                continue

    if method.lower() != 'diff':
        logger.info('Running reference calculations on configurations '
                    f'generated by {method}')
        configs.single_point(method_name=ref_method_name)

        # Set the number of ground truth function calls for each iteration
        for config in configs:
            config.n_evals = 1

    return configs