def test_train_normalize_array():
    """Normalize transform maps each column onto [0, 1] and round-trips losslessly."""
    transformer = DataTransformer(transformations="normalize")
    transformer.train(data)
    scaled = transformer.transform(data)
    # column-wise minimum ~ 0 and maximum ~ 1 after normalization
    assert np.all(np.abs(np.amin(scaled, axis=0)) < 1e-7)
    assert np.all(np.abs(np.amax(scaled, axis=0) - 1.0) < 1e-7)
    # inverse transform recovers the original data exactly
    assert np.all(transformer.back_transform(scaled) == data)
def test_train_standardize_array():
    """Standardize transform yields zero-mean, unit-variance columns and round-trips losslessly."""
    transformer = DataTransformer(transformations="standardize")
    transformer.train(data)
    scaled = transformer.transform(data)
    col_mean = np.mean(scaled, axis=0)
    col_std = np.std(scaled, axis=0)
    assert np.all(np.abs(col_mean) < 1e-7)
    assert np.all(np.abs(col_std - 1.0) < 1e-7)
    # inverse transform recovers the original data exactly
    assert np.all(transformer.back_transform(scaled) == data)
def test_train_identity_array():
    """Training with the identity transform still records the data statistics.

    NOTE(review): a second function with this same name appears later in the
    file; in a single module the later definition shadows this one, so pytest
    would only collect the last one — confirm these live in separate files.
    """
    transformer = DataTransformer(transformations="identity")
    transformer.train(data)

    # training should cache per-column min/max/stddev/mean even for identity
    assert np.all(np.amin(data, axis=0) == transformer._min)
    assert np.all(np.amax(data, axis=0) == transformer._max)
    assert np.all(np.std(data, axis=0) == transformer._stddev)
    assert np.all(np.mean(data, axis=0) == transformer._mean)
# Example #4
    def __init__(
        self,
        dataset=None,
        model=None,
        feature_transform="identity",
        target_transform="identity",
    ):
        """Experiment emulator.

        Args:
            dataset (str, Dataset): dataset used to train a model. Either a string, in which case a standard dataset
                is loaded, or a Dataset object. To see the list of available datasets ...
            model (str, Model): the model used to create the emulator. Either a string, in which case a default model
                is loaded, or a Model object. To see the list of available models ...
            feature_transform (str, list): the data transform to be applied to the features. See DataTransformer for the
                available transformations.
            target_transform (str, list): the data transform to be applied to the targets. See DataTransformer for the
                available transformations.
        """

        # ------------------------------------------------------------
        # if dataset and model are strings ==> load emulator from file
        # ------------------------------------------------------------
        # NOTE: isinstance is the idiomatic type check (also accepts str subclasses)
        if isinstance(dataset, str) and isinstance(model, str):
            # validate the dataset string before attempting to load
            _validate_dataset_args(kind=dataset,
                                   data=None,
                                   columns=None,
                                   target_names=None)
            # validate the model string
            _validate_model_kind(model)
            Logger.log(
                f"Loading emulator using a {model} model for the dataset {dataset}...",
                "INFO",
            )
            # pre-trained emulators are stored under a fixed naming scheme
            self._load(f"{__emulator_path__}/emulator_{dataset}_{model}")

        # -----------------------------------------
        # otherwise, assume it is a custom emulator
        # -----------------------------------------
        else:
            # forward all constructor arguments (including self) to the base class
            Object.__init__(**locals())

            if dataset is not None:
                self._set_dataset(dataset)
            if model is not None:
                self._set_model(model)

            # other attributes we will use
            self._version = __version__
            # pristine copy of the model, used as a template for per-fold copies
            self._ghost_model = deepcopy(self.model)
            self.is_trained = False
            self.cross_val_performed = False
            self.cv_scores = None
            self.model_scores = None
            self.emulator_to_save = None
            self.feature_transformer = DataTransformer(
                transformations=self.feature_transform)
            self.target_transformer = DataTransformer(
                transformations=self.target_transform)

        # create tmp dir to store model files
        # also if we are loading a model (the user could call 'train' again)
        self._scratch_dir = TemporaryDirectory(dir=f"{__scratch__}",
                                               prefix="emulator_")
# Example #5
    def cross_validate(self, rerun=False, plot=False):
        # TODO: allow setting verbosity: verbose=True/False will be enough
        """Performs cross validation on the emulator dataset, using the emulator model. The number of folds used is
        defined in the Dataset object.

        Args:
            rerun (bool): whether to run cross validation again, in case it had already been performed.
            plot (bool): forwarded to the model's ``train`` method; presumably enables training plots —
                TODO confirm against the Model implementation.

        Returns:
            scores (dict): dictionary with the lists of train and validation R2 and RMSD scores for all folds.
        """
        # refuse to silently overwrite previous results unless explicitly asked to
        if self.cross_val_performed and not rerun:
            message = (
                "Cross validation has already been performed for this Emulator. You can see its results in "
                "`self.cv_scores`. If you would like to rerun cross validation and overwrite the previous "
                "results, set `rerun` to True")
            Logger.log(message, "FATAL")

        # per-fold score buffers
        training_r2_scores = np.empty(self.dataset.num_folds)
        valid_r2_scores = np.empty(self.dataset.num_folds)
        training_rmsd_scores = np.empty(self.dataset.num_folds)
        valid_rmsd_scores = np.empty(self.dataset.num_folds)

        # get scaled train/valid sets
        # NOTE: we do not want to use the self.transformers, because for 'run' we want to use the transformers
        # trained in 'train'. If we reset the Transformers here, then if a user calls 'cross_validate' after 'train'
        # we end up using the wrong transformers in 'run'
        feature_transformer = DataTransformer(
            transformations=self.feature_transform)
        target_transformer = DataTransformer(
            transformations=self.target_transform)

        # ---------------------------------------
        # Iterate over the cross validation folds
        # ---------------------------------------
        for fold in range(self.dataset.num_folds):
            # get the train/valid sets
            # NOTE: we keep the features as Dataset objects, as these are needed for possible periodic transformations
            # TODO: expend the above also to targets? Right now param_space does not describe what type of variable
            #  the targets are
            train_features = Dataset(
                data=self.dataset.cross_val_sets_features[fold][0])
            train_features.set_param_space(self.dataset.param_space)
            valid_features = self.dataset.cross_val_sets_features[fold][
                1].to_numpy()
            train_targets = self.dataset.cross_val_sets_targets[fold][
                0].to_numpy()
            valid_targets = self.dataset.cross_val_sets_targets[fold][
                1].to_numpy()

            # fit the transformers on the training split only, to avoid leakage
            feature_transformer.train(train_features)
            target_transformer.train(train_targets)

            train_features_scaled = feature_transformer.transform(
                train_features)
            valid_features_scaled = feature_transformer.transform(
                valid_features)
            train_targets_scaled = target_transformer.transform(train_targets)
            valid_targets_scaled = target_transformer.transform(valid_targets)

            # define scope and make a copy of the model for the cross validation
            model_fold = deepcopy(
                self._ghost_model
            )  # the model we will use for training the fold
            model_fold.scope = f"Fold_{fold}"
            model_path = f"{self._scratch_dir.name}/{model_fold.scope}"
            # TODO/QUESTION: in case we are overwriting the output of a previous call, should we first remove the folder
            #  to make sure to have a cleared path?
            # exist_ok avoids the check-then-create race of os.path.exists + makedirs
            os.makedirs(model_path, exist_ok=True)

            Logger.log(f">>> Training model on fold #{fold}...", "INFO")
            (
                mdl_train_r2,
                mdl_valid_r2,
                mdl_train_rmsd,
                mdl_test_rmsd,
            ) = model_fold.train(
                train_features=train_features_scaled,
                train_targets=train_targets_scaled,
                valid_features=valid_features_scaled,
                valid_targets=valid_targets_scaled,
                model_path=model_path,
                plot=plot,
            )

            # store performance of fold
            training_r2_scores[fold] = mdl_train_r2
            valid_r2_scores[fold] = mdl_valid_r2
            training_rmsd_scores[fold] = mdl_train_rmsd
            valid_rmsd_scores[fold] = mdl_test_rmsd
            # write file to indicate training is complete and add R2 in there
            with open(f"{model_path}/training_completed.info", "w") as content:
                content.write(
                    f"Train R2={mdl_train_r2}\nValidation R2={mdl_valid_r2}\n"
                    f"Train RMSD={mdl_train_rmsd}\nValidation RMSD={mdl_test_rmsd}\n"
                )

        # print some info to screen
        Logger.log(
            f"Performance statistics based on transformed data "
            f"[{self.feature_transform}, {self.target_transform}]:", "INFO")
        # standard error of the mean across folds (ddof-style n-1 in the sqrt)
        cv_r2_score_mean = np.mean(valid_r2_scores)
        cv_r2_score_stderr = np.std(valid_r2_scores) / np.sqrt(
            (len(valid_r2_scores) - 1))
        cv_rmsd_score_mean = np.mean(valid_rmsd_scores)
        cv_rmsd_score_stderr = np.std(valid_rmsd_scores) / np.sqrt(
            (len(valid_rmsd_scores) - 1))
        Logger.log(
            "Validation   R2: {0:.4f} +/- {1:.4f}".format(
                cv_r2_score_mean, cv_r2_score_stderr),
            "INFO",
        )
        Logger.log(
            "Validation RSMD: {0:.4f} +/- {1:.4f}".format(
                cv_rmsd_score_mean, cv_rmsd_score_stderr),
            "INFO",
        )

        self.cross_val_performed = True
        self.cv_scores = {
            "train_r2": training_r2_scores,
            "validate_r2": valid_r2_scores,
            "train_rmsd": training_rmsd_scores,
            "validate_rmsd": valid_rmsd_scores,
        }
        return self.cv_scores
def test_train_sqrt_mean_array():
    """sqrt_mean transform round-trips to within floating-point tolerance."""
    transformer = DataTransformer(transformations="sqrt_mean")
    transformer.train(data)
    scaled = transformer.transform(data)
    # total reconstruction error must be negligible
    recovered = transformer.back_transform(scaled)
    assert np.sum(np.abs(data - recovered)) < 1e-7
def test_train_log_mean_array():
    """log_mean transform round-trips exactly."""
    transformer = DataTransformer(transformations="log_mean")
    transformer.train(data)
    scaled = transformer.transform(data)
    assert np.all(transformer.back_transform(scaled) == data)
def test_train_identity_array_roundtrip():
    """Identity transform leaves the data untouched in both directions.

    Renamed from ``test_train_identity_array``: a function of that name is
    defined earlier in the file, and in a single module the later definition
    silently shadows the earlier one, so pytest would collect only one of the
    two. A unique name lets both tests run.
    """
    data_transformer = DataTransformer(transformations="identity")
    data_transformer.train(data)
    transformed = data_transformer.transform(data)
    # identity transform must be a no-op
    assert np.all(data == transformed)
    assert np.all(data == data_transformer.back_transform(transformed))
def test_train_mean_array():
    """Mean transform scales each column to unit mean and round-trips exactly."""
    transformer = DataTransformer(transformations="mean")
    transformer.train(data)
    scaled = transformer.transform(data)
    col_mean = np.mean(scaled, axis=0)
    assert np.all(np.abs(col_mean - 1.0) < 1e-7)
    # inverse transform recovers the original data exactly
    assert np.all(transformer.back_transform(scaled) == data)