Exemple #1
0
def test_subset_of_frame_by_element():
    """Exercise subset_of_frame_by_element on a six-atom H/H/O/O/O/C frame."""
    elements = ["H", "H", "O", "O", "O", "C"]
    n_atoms = len(elements)
    frame = Structure(
        cell=np.eye(3),
        species=elements,
        positions=np.zeros(shape=(n_atoms, 3)),
    )
    every_atom = list(range(len(frame)))

    # An empty request, a request matching the per-element counts, and a
    # request exceeding them are all expected to return every atom index.
    for request in ({}, {"H": 2, "O": 3}, {"H": 2, "O": 15}):
        assert np.array_equal(subset_of_frame_by_element(frame, request), every_atom)

    # A partial request must only ever yield valid atom indices ...
    partial = subset_of_frame_by_element(frame, {"H": 1, "O": 1})
    assert set(partial).issubset(range(n_atoms))

    # ... and one atom per element gives exactly three indices.
    one_of_each = subset_of_frame_by_element(frame, {"H": 1, "O": 1, "C": 1})
    assert len(one_of_each) == 3

    # Zero of everything selects nothing.
    assert subset_of_frame_by_element(frame, {"H": 0, "O": 0, "C": 0}) == []

    # Only the single carbon (index 5) is left when H and O are zeroed out.
    assert subset_of_frame_by_element(frame, {"H": 0, "O": 0, "C": 1}) == [5]
Exemple #2
0
def test_subset_of_frame_by_element():
    """Verify the atom indices chosen by subset_of_frame_by_element."""
    species = ['H', 'H', 'O', 'O', 'O', 'C']
    coords = np.zeros(shape=(len(species), 3))
    structure = Structure(cell=np.eye(3), species=species, positions=coords)

    def pick(limits):
        # Shorthand for querying the one structure used throughout this test.
        return subset_of_frame_by_element(structure, limits)

    whole_frame = list(range(len(structure)))

    # No limits at all: the full index list is expected back.
    assert np.array_equal(pick({}), whole_frame)

    # Limits equal to, or larger than, the available atoms of the listed
    # elements are also expected to give the full index list.
    assert np.array_equal(pick({'H': 2, 'O': 3}), whole_frame)
    assert np.array_equal(pick({'H': 2, 'O': 15}), whole_frame)

    # Restricting H and O must still produce indices inside the frame.
    restricted = pick({'H': 1, 'O': 1})
    assert set(restricted).issubset(range(len(species)))

    # One atom per element: three indices in total.
    assert len(pick({'H': 1, 'O': 1, 'C': 1})) == 3

    # Nothing requested: empty selection.
    assert pick({'H': 0, 'O': 0, 'C': 0}) == []

    # Only carbon requested: the lone C atom sits at index 5.
    assert pick({'H': 0, 'O': 0, 'C': 1}) == [5]
    def run(self):
        """
        UPDATE: SOON TO BE DEPRECATED, CIRCA SEPTEMBER 2020

        Walk the strided trajectory, compare the model's predicted forces
        with the reference forces stored on each frame, and log the
        comparison. On training frames, atoms flagged by the uncertainty or
        force-error checks are added to the GP, which is then either
        retrained or has its L / alpha updated. Checkpoints, the final
        statistics, the optional training plan and the final model are
        written along the way.

        :return: None
        """

        logger = logging.getLogger(self.logger_name)
        logger.debug("Commencing run with pre-run...")

        # The pre-run is only performed when the GP is not mapped.
        if not self.gp_is_mapped:
            self.pre_run()

        sampled_frames = self.frames[::self.skip]

        # Frames at or past this index are predicted on, but never used to
        # extend the training set (they serve to validate the model).
        train_frame = int(len(sampled_frames) * (1 - self.validate_ratio))

        atoms_since_train = 0   # atoms added since the last (re)training
        atoms_since_write = 0   # atoms added since the last atom checkpoint
        trains_since_write = 0  # trainings since the last train checkpoint

        # frame index -> [(atom index, uncertainty, force error), ...]
        training_plan = {}

        for i, cur_frame in enumerate(sampled_frames):
            frame_start_time = time.time()
            logger.info(f"=====NOW ON FRAME {i}=====")

            # Atoms to predict on, chosen per element according to
            # self.predict_atoms_per_element.
            predict_atoms = subset_of_frame_by_element(
                cur_frame, self.predict_atoms_per_element)

            # Arguments common to every prediction flavour; atoms that are
            # skipped get NaN as their force / std values.
            shared_kwargs = dict(
                structure=cur_frame,
                write_to_structure=False,
                selective_atoms=predict_atoms,
                skipped_atom_value=np.nan,
            )

            # Only the mapped-GP and energy-enabled calls return local
            # energies; the plain GP call leaves this as None.
            local_energies = None
            if self.gp_is_mapped:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    mgp=self.gp, energy=True, **shared_kwargs)
            elif self.calculate_energy:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    gp=self.gp, n_cpus=self.n_cpus, **shared_kwargs)
            else:
                pred_forces, pred_stds = self.pred_func(
                    gp=self.gp, n_cpus=self.n_cpus, **shared_kwargs)

            # Reference values and absolute force error.
            dft_forces = cur_frame.forces
            dft_energy = cur_frame.energy
            error = np.abs(pred_forces - dft_forces)

            # The copy carries the predicted forces; it is taken before the
            # stds are attached to the original frame.
            dummy_frame = deepcopy(cur_frame)
            dummy_frame.forces = pred_forces
            dummy_frame.stds = pred_stds
            cur_frame.stds = pred_stds

            self.output.write_gp_dft_comparison(
                curr_step=i,
                frame=dummy_frame,
                start_time=frame_start_time,
                dft_forces=dft_forces,
                dft_energy=dft_energy,
                error=error,
                local_energies=local_energies,
                KE=0,
                cell=cur_frame.cell,
            )

            logger.debug(
                f"Single frame calculation time {time.time()-frame_start_time}"
            )

            # Validation frames: nothing more to do.
            if i >= train_frame:
                continue

            is_last_train_frame = (i + 1) == train_frame

            # A mapped GP uses zero noise; otherwise the noise is read from
            # the GP hyperparameters.
            if self.gp_is_mapped:
                noise = 0
            else:
                noise = Parameters.get_noise(
                    self.gp.hyps_mask, self.gp.hyps, constraint=False)

            # Atoms whose predictive uncertainty is out of bounds.
            std_in_bound, std_train_atoms = is_std_in_bound_per_species(
                rel_std_tolerance=self.rel_std_tolerance,
                abs_std_tolerance=self.abs_std_tolerance,
                noise=noise,
                structure=dummy_frame,
                max_atoms_added=self.max_atoms_from_frame,
                max_by_species=self.train_env_per_species,
            )

            # Atoms whose force error is out of bounds.
            force_in_bound, force_train_atoms = is_force_in_bound_per_species(
                abs_force_tolerance=self.abs_force_tolerance,
                predicted_forces=pred_forces,
                label_forces=dft_forces,
                structure=dummy_frame,
                max_atoms_added=self.max_atoms_from_frame,
                max_by_species=self.train_env_per_species,
                max_force_error=self.max_force_error,
            )

            if not (std_in_bound and force_in_bound):

                # Merge both selections, dropping duplicates and the -1
                # placeholder returned by the is_in_bound helpers.
                train_atoms = list(
                    set(std_train_atoms).union(force_train_atoms) - {-1})

                # Record which atoms were chosen, with their uncertainty
                # and force error.
                force_errors = list(np.abs(pred_forces - dft_forces))
                uncertainties = list(dummy_frame.stds)
                plan_entry = []
                for atom in train_atoms:
                    plan_entry.append(
                        (int(atom), uncertainties[atom], force_errors[atom]))
                training_plan[int(i)] = plan_entry

                # Add the selected atoms to the training set (no training
                # inside this call).
                self.update_gp_and_print(
                    cur_frame,
                    train_atoms=train_atoms,
                    uncertainties=pred_stds[train_atoms],
                    train=False,
                )
                n_added = len(train_atoms)
                atoms_since_train += n_added
                atoms_since_write += n_added

                # Retraining is due once enough atoms accumulated, or on
                # the last training frame; it only happens while the
                # training budget is not exhausted.
                retrain_due = (atoms_since_train >= self.min_atoms_per_train
                               or is_last_train_frame)
                if retrain_due and self.train_count < self.max_trains:
                    self.train_gp()
                    trains_since_write += 1
                else:
                    self.gp.update_L_alpha()
                if retrain_due:
                    atoms_since_train = 0

                # Decide whether a checkpoint model is written this
                # iteration; each counter resets when its interval is hit.
                train_interval = self.train_checkpoint_interval
                train_ckpt_hit = bool(
                    train_interval
                    and trains_since_write
                    and train_interval <= trains_since_write)
                if train_ckpt_hit:
                    trains_since_write = 0

                atom_interval = self.atom_checkpoint_interval
                atom_ckpt_hit = bool(
                    atom_interval
                    and atoms_since_write
                    and atom_interval <= atoms_since_write)
                if atom_ckpt_hit:
                    atoms_since_write = 0

                if self.model_format and (train_ckpt_hit or atom_ckpt_hit):
                    self.gp.write_model(f"{self.output_name}_checkpt",
                                        self.model_format)

            if is_last_train_frame and not self.gp_is_mapped:
                self.gp.check_L_alpha()

        # Report the training statistics of the GP that was used.
        summary = "Final GP statistics:" + json.dumps(
            self.gp.training_statistics)
        self.output.conclude_run([summary])

        if self.print_training_plan:
            plan_path = f"{self.output_name}_training_plan.json"
            with open(plan_path, "w") as plan_file:
                plan_file.write(json.dumps(training_plan, cls=NumpyEncoder))

        if self.model_format and not self.gp_is_mapped:
            self.gp.write_model(f"{self.output_name}_model", self.model_format)
    def run_active_learning(
        self,
        frames: Union[List[Structure], Trajectory] = (),
        rel_std_tolerance: float = 4,
        abs_std_tolerance: float = 0,
        abs_force_tolerance: float = 0.15,
        min_atoms_per_train: int = 200,
        max_force_error: float = inf,
        max_atoms_from_frame: int = inf,
        max_trains: int = inf,
        max_model_size: int = inf,
        max_elts_per_frame: Dict[str, int] = None,
        max_model_elts: Dict[str, int] = None,
        predict_atoms_per_elt: Dict[str, int] = None,
        write_model_train_interval: int = 1,
        write_model_atom_interval: int = 100,
        validate_ratio: float = 0,
        post_write: bool = True,
    ):
        """
        Loop through ``frames``, compare the model's predicted forces with
        the reference forces stored on each frame, and add atoms to the
        training set whenever ``evaluate_training_atoms`` reports the frame
        as out of bounds. The GP is retrained or has its L / alpha updated
        after atoms are added.

        :param frames: Frames to process. A plain list is deep-copied and
            wrapped in a ``Trajectory``.
        :param rel_std_tolerance: Forwarded to ``evaluate_training_atoms``.
        :param abs_std_tolerance: Forwarded to ``evaluate_training_atoms``.
        :param abs_force_tolerance: Forwarded to
            ``evaluate_training_atoms``.
        :param min_atoms_per_train: Retraining becomes due once at least
            this many atoms were selected since the last training.
        :param max_force_error: Forwarded to ``evaluate_training_atoms``.
        :param max_atoms_from_frame: Forwarded to
            ``evaluate_training_atoms``; set to 0 once the model would
            exceed ``max_model_size``.
        :param max_trains: Maximum number of ``train_gp`` calls made by
            this loop. Once reached, only L / alpha are updated.
        :param max_model_size: Atoms are only added while
            ``len(self.gp) + len(train_atoms)`` does not exceed this value.
        :param max_elts_per_frame: Forwarded to
            ``evaluate_training_atoms``.
        :param max_model_elts: Forwarded to ``evaluate_training_atoms``.
        :param predict_atoms_per_elt: Forwarded to
            ``subset_of_frame_by_element`` to choose the atoms predicted on.
        :param write_model_train_interval: Forwarded to
            ``self.write_model_decision``.
        :param write_model_atom_interval: Forwarded to
            ``self.write_model_decision``.
        :param validate_ratio: Frames with index at or beyond
            ``int(len(frames) * (1 - validate_ratio))`` are predicted on but
            never used for training.
        :param post_write: If true (and a model format is set and the GP is
            not mapped), write the final model at the end of the run.
        :return: None
        """

        logger = logging.getLogger(self.logger_name)

        # Seed an empty model with one atom of each element taken from the
        # first frame, via a passive-learning pass.
        if len(self.gp) == 0:
            logger.warning(
                "You are attempting active learning with an empty model. "
                "One atom of each element will be added from the first frame, "
                "but be warned: Hyperparameter optimzation on a very small "
                "subset of data can lead to suboptimal training set "
                "choices, as the hyperparameters will take time to become "
                "representative of their converged state relative to your data of "
                "interest.")
            self.run_passive_learning(
                frames[0:1],
                max_model_elts={elt: 1
                                for elt in frames[0].species_labels})

        if isinstance(frames, list):
            frames = Trajectory(deepcopy(frames))

        # Past this frame, stop adding atoms to the training set
        # (the remaining frames are used for validation of the model).
        train_frame = int(len(frames) * (1 - validate_ratio))

        train_model_atom_counter = 0  # Atoms selected since last training
        write_model_atom_counter = 0  # Atoms selected since last write
        train_counter = 0  # Number of times training was done

        # Keep track of which atoms trigger force / uncertainty condition:
        # frame index -> [(atom index, uncertainty, force error), ...]
        training_plan = {}

        # MAIN LOOP - Frames
        for i, cur_frame in enumerate(frames):
            frame_start_time = time.time()
            logger.info(f"=====NOW ON FRAME {i}=====")

            # Atoms to predict on, chosen per element according to
            # predict_atoms_per_elt.
            predict_atoms = subset_of_frame_by_element(cur_frame,
                                                       predict_atoms_per_elt)

            # Only the mapped-GP and energy-enabled calls below return
            # local energies.
            local_energies = None

            # Three different predictions: either MGP, GP with energy,
            # or GP without. Skipped atoms get NaN force / std values.
            if self.gp_is_mapped:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    mgp=self.gp,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                    energy=True,
                )
            elif self.calculate_energy:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )
            else:
                pred_forces, pred_stds = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )

            # Reference values and absolute force error
            dft_forces = cur_frame.forces
            dft_energy = cur_frame.energy
            force_error = np.abs(pred_forces - dft_forces)

            # Create dummy frame with the predicted forces written; the
            # copy is taken before stds are attached to the original frame.
            dummy_frame = deepcopy(cur_frame)
            dummy_frame.forces = pred_forces
            dummy_frame.stds = pred_stds
            cur_frame.stds = pred_stds

            self.output.write_gp_dft_comparison(
                curr_step=i,
                frame=dummy_frame,
                start_time=frame_start_time,
                dft_forces=dft_forces,
                dft_energy=dft_energy,
                error=force_error,
                local_energies=local_energies,
                KE=0,
                cell=cur_frame.cell,
            )

            logger.debug(
                f"Single frame calculation time {time.time()-frame_start_time}"
            )

            if i < train_frame:
                # A mapped GP uses zero noise; otherwise the noise is read
                # from the GP hyperparameters.
                if self.gp_is_mapped:
                    noise = 0
                else:
                    noise = Parameters.get_noise(self.gp.hyps_mask,
                                                 self.gp.hyps,
                                                 constraint=False)

                in_bound, train_atoms = evaluate_training_atoms(
                    rel_std_tolerance=rel_std_tolerance,
                    abs_std_tolerance=abs_std_tolerance,
                    noise=noise,
                    abs_force_tolerance=abs_force_tolerance,
                    max_force_error=max_force_error,
                    pred_forces=pred_forces,
                    dft_forces=dft_forces,
                    structure=dummy_frame,
                    max_model_elts=max_model_elts,
                    max_atoms_from_frame=max_atoms_from_frame,
                    max_elts_per_frame=max_elts_per_frame,
                    training_statistics=self.gp.training_statistics,
                )

                # Protocol for adding atoms to training set
                if not in_bound:

                    # Record frame and training atoms, uncertainty, error
                    force_errors = list(np.abs(pred_forces - dft_forces))
                    uncertainties = list(dummy_frame.stds)
                    training_plan[int(i)] = [(int(a), uncertainties[a],
                                              force_errors[a])
                                             for a in train_atoms]

                    # A mapped GP is only evaluated, never updated.
                    if self.gp_is_mapped:
                        continue

                    if len(self.gp) + len(train_atoms) <= max_model_size:
                        self.update_gp_and_print(
                            cur_frame,
                            train_atoms=train_atoms,
                            uncertainties=pred_stds[train_atoms],
                            train=False,
                        )
                    else:
                        logger.info(
                            f"GP is at maximum model size of {max_model_size}. "
                            f"No further atoms will be added for "
                            f"remainder of run, but predictions will still be "
                            f"made. Setting max_atoms_from_frame "
                            f"to 0.")
                        max_atoms_from_frame = 0
                        if self.model_format:
                            self.gp.write_model(
                                f"{self.output_name}_saturated",
                                self.model_format)
                    train_model_atom_counter += len(train_atoms)
                    write_model_atom_counter += len(train_atoms)

                    # Re-train if enough atoms were sampled or this is the
                    # last training frame, but only while the training
                    # budget allows it. The trigger is grouped explicitly:
                    # written as `A or B and C`, Python evaluates
                    # `A or (B and C)`, which would let the atom-count
                    # trigger bypass max_trains entirely.
                    retrain_due = (
                        train_model_atom_counter >= min_atoms_per_train
                        or (i + 1) == train_frame)
                    if retrain_due and train_counter < max_trains:
                        self.train_gp()
                        train_counter += 1
                        train_model_atom_counter = 0
                    else:
                        self.gp.update_L_alpha()
                    written = self.write_model_decision(
                        write_model_train_interval,
                        write_model_atom_counter,
                        write_model_atom_interval,
                        train_counter,
                    )
                    if written:
                        write_model_atom_counter = 0

        # Print training statistics for GP model used
        conclusion_strings = [
            "Final GP statistics:" + json.dumps(self.gp.training_statistics)
        ]
        self.output.conclude_run(conclusion_strings)

        if self.print_training_plan:
            with open(f"{self.output_name}_training_plan.json", "w") as f:
                f.write(json.dumps(training_plan, cls=NumpyEncoder))

        if self.model_format and post_write and not self.gp_is_mapped:
            self.gp.write_model(f"{self.output_name}_model", self.model_format)
Exemple #5
0
    def active_run(self):
        """
        Loop through frames and record the error between
        the GP predictions and the ground-truth forces. Train the GP and update
        the training set upon the triggering of the uncertainty or force error
        threshold.

        Frames are pulled one at a time from ``get_next_active_frame`` until
        it returns None. Whether a frame may extend the training set, whether
        to retrain, and whether to check L / alpha are delegated to
        ``decide_to_update_db``, ``decide_to_train`` and
        ``decide_to_checkLalpha`` respectively.

        :return: None
        """

        logger = logging.getLogger(self.logger_name)
        logger.debug("Commencing run with pre-run...")

        # Warn when starting from an empty (non-mapped) GP.
        if not self.mgp and len(self.gp) == 0:
            logger.warning("You are attempting to train a model with no "
                           "data in your Gausian Process; it is "
                           "recommended that you begin with "
                           "a passive training process.")

        self.preparation_for_active_run()

        # Loop through trajectory.
        self.cur_atoms_added_train = 0  # Track atoms added for training
        cur_atoms_added_write = 0  # Track atoms added for writing
        cur_trains_done_write = 0  # Track training done for writing

        self.curr_active_frame_index = -1
        cur_frame = self.get_next_active_frame()
        while cur_frame is not None:

            frame_start_time = time.time()
            logger.info(
                f"=====NOW ON FRAME {self.curr_active_frame_index}=====")

            # Atoms to predict on, chosen per element according to
            # self.predict_atoms_per_element.
            predict_atoms = subset_of_frame_by_element(
                cur_frame, self.predict_atoms_per_element)

            # Only the MGP and energy-enabled calls below return local
            # energies.
            local_energies = None

            # Three different predictions: either MGP, GP with energy,
            # or GP without. Skipped atoms get NaN force / std values.
            if self.mgp:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    mgp=self.gp,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                    energy=True,
                )
            elif self.calculate_energy:
                pred_forces, pred_stds, local_energies = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )
            else:
                pred_forces, pred_stds = self.pred_func(
                    structure=cur_frame,
                    gp=self.gp,
                    n_cpus=self.n_cpus,
                    write_to_structure=False,
                    selective_atoms=predict_atoms,
                    skipped_atom_value=np.nan,
                )

            # Reference values and absolute force error
            dft_forces = cur_frame.forces
            dft_energy = cur_frame.energy
            error = np.abs(pred_forces - dft_forces)

            # Create dummy frame with the predicted forces written
            dummy_frame = deepcopy(cur_frame)
            dummy_frame.forces = pred_forces
            dummy_frame.stds = pred_stds

            # Pass the time at which work on this frame began, so the
            # output is given the frame's start rather than "now".
            self.output.write_gp_dft_comparison(
                curr_step=self.curr_active_frame_index,
                frame=dummy_frame,
                start_time=frame_start_time,
                dft_forces=dft_forces,
                dft_energy=dft_energy,
                error=error,
                local_energies=local_energies,
                KE=0,
            )

            logger.debug(
                f"Single frame calculation time {time.time()-frame_start_time}"
            )

            if self.decide_to_update_db():

                # An MGP uses zero noise; otherwise the noise is read from
                # the GP hyperparameters.
                if self.mgp:
                    noise = 0
                else:
                    noise = Parameters.get_noise(self.gp.hyps_mask,
                                                 self.gp.hyps,
                                                 constraint=False)

                # Atoms whose predictive uncertainty is out of bounds
                std_in_bound, std_train_atoms = is_std_in_bound_per_species(
                    rel_std_tolerance=self.rel_std_tolerance,
                    abs_std_tolerance=self.abs_std_tolerance,
                    noise=noise,
                    structure=dummy_frame,
                    max_atoms_added=self.max_atoms_from_frame,
                    max_by_species=self.train_env_per_species,
                )

                # Atoms whose force error is out of bounds
                force_in_bound, force_train_atoms = is_force_in_bound_per_species(
                    abs_force_tolerance=self.abs_force_tolerance,
                    predicted_forces=pred_forces,
                    label_forces=dft_forces,
                    structure=dummy_frame,
                    max_atoms_added=self.max_atoms_from_frame,
                    max_by_species=self.train_env_per_species,
                    max_force_error=self.max_force_error,
                )

                if not std_in_bound or not force_in_bound:

                    # -1 is returned from the is_in_bound methods,
                    # so filter that out and then use sets to remove repeats
                    train_atoms = list(
                        set(std_train_atoms).union(force_train_atoms) - {-1})

                    # Add the selected atoms to the training set (no
                    # training inside this call).
                    self.update_gp_and_print(
                        cur_frame,
                        train_atoms=train_atoms,
                        uncertainties=pred_stds[train_atoms],
                        train=False,
                    )
                    self.cur_atoms_added_train += len(train_atoms)
                    cur_atoms_added_write += len(train_atoms)

                    # Re-train when decide_to_train says so; otherwise only
                    # refresh L / alpha for the enlarged training set.
                    if self.decide_to_train():
                        self.train_gp()
                        cur_trains_done_write += 1
                        self.cur_atoms_added_train = 0
                    else:
                        self.gp.update_L_alpha()

                    # Decide if a model should be written this iteration;
                    # each counter resets when its interval is hit.
                    will_write = False

                    if (self.train_checkpoint_interval
                            and cur_trains_done_write
                            and self.train_checkpoint_interval <=
                            cur_trains_done_write):
                        will_write = True
                        cur_trains_done_write = 0

                    if (self.atom_checkpoint_interval and cur_atoms_added_write
                            and self.atom_checkpoint_interval <=
                            cur_atoms_added_write):
                        will_write = True
                        cur_atoms_added_write = 0

                    if self.model_format and will_write:
                        self.gp.write_model(f"{self.output_name}_checkpt",
                                            self.model_format)

                if self.decide_to_checkLalpha():
                    self.gp.check_L_alpha()

            cur_frame = self.get_next_active_frame()

        self.output.conclude_run()

        if self.model_format and not self.mgp:
            self.gp.write_model(f"{self.output_name}_model", self.model_format)