Example #1
    def fit(
        self,
        training_data: MODData,
        n_jobs: int = 1,
        **kwargs,
    ) -> None:
        """Train the model on the passed training `MODData` object.

        Parameters:
            same as MODNetModel fit.
        """

        if self.bootstrap:
            LOG.info("Generating bootstrap data...")
            # Vary the seed per model so each bootstrap resample is distinct.
            train_datas = [
                training_data.split(
                    (resample(np.arange(len(training_data.df_targets)),
                              replace=True, random_state=2943 + i), [])
                )[0]
                for i in range(self.n_models)
            ]
        else:
            train_datas = [training_data for _ in range(self.n_models)]

        if n_jobs<=1:
            for i in range(self.n_models):
                LOG.info(f"Bootstrap fitting model #{i + 1}/{self.n_models}")
                self.model[i].fit(train_datas[i], **kwargs)
                model_summary = ""
                for k in self.model[i].history.keys():
                    model_summary += "{}: {:.4f}\t".format(k, self.model[i].history[k][-1])
                LOG.info(model_summary)
        else:
            ctx = multiprocessing.get_context("spawn")
            pool = ctx.Pool(processes=n_jobs)
            tasks = []
            for i, m in enumerate(self.model):
                m._make_picklable()
                tasks.append(
                    {"model": m, "training_data": train_datas[i], "model_id": i, **kwargs}
                )
            for res in tqdm.tqdm(
                pool.imap_unordered(_map_fit_MODNet, tasks, chunksize=1),
                total=self.n_models,
            ):
                model, model_id = res
                model._restore_model()  # deserialize the Keras model after pickling
                self.model[model_id] = model
                model_summary = f"Model #{model_id}\t" + "".join(
                    "{}: {:.4f}\t".format(k, v[-1]) for k, v in model.history.items()
                )
                LOG.info(model_summary)
            pool.close()
            pool.join()
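
For context, here is a minimal usage sketch for this ensemble `fit`. The class name `EnsembleMODNetModel`, its constructor arguments, and the data file are assumptions inferred from the attributes used above (`self.n_models`, `self.bootstrap`), not part of the snippet itself:

# Hypothetical usage sketch; class name, constructor arguments and the data
# file are assumptions, not taken from the snippet above.
from modnet.preprocessing import MODData
from modnet.models import EnsembleMODNetModel

train = MODData.load("train_moddata")  # previously featurized training data

ensemble = EnsembleMODNetModel(
    [[["eform"]]],     # hierarchical target definition
    {"eform": 1.0},    # target weights
    n_models=8,        # size of the ensemble
    bootstrap=True,    # give each model its own bootstrap resample
)

# n_jobs > 1 takes the multiprocessing branch above: one spawned process per model.
ensemble.fit(train, n_jobs=4, epochs=200)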
Example #2
def matbench_benchmark(
    data: MODData,
    target: List[str],
    target_weights: Dict[str, float],
    fit_settings: Optional[Dict[str, Any]] = None,
    classification: bool = False,
    model_type: Type[MODNetModel] = MODNetModel,
    save_folds: bool = False,
    save_models: bool = False,
    hp_optimization: bool = True,
    inner_feat_selection: bool = True,
    use_precomputed_cross_nmi: bool = True,
    presets: Optional[List[dict]] = None,
    fast: bool = False,
    n_jobs: Optional[int] = None,
    nested: bool = False,
    **model_init_kwargs,
) -> dict:
    """Train and cross-validate a model against Matbench data splits, optionally
    performing hyperparameter optimisation.

    Arguments:
        data: The entire dataset as a `MODData`.
        target: The list of target names to train on.
        target_weights: The target weights to use for the `MODNetModel`.
        fit_settings: Any settings to pass to `model.fit(...)` directly
            (typically when not performing hyperparameter optimisation).
        classification: Whether all tasks are classification rather than regression.
        model_type: The type of the model to create and benchmark.
        save_folds: Whether to save dataframes with pre-processed fold
            data (e.g. feature selection).
        save_models: Whether to pickle all trained models according to
            their fold index and performance.
        hp_optimization: Whether to perform hyperparameter optimisation.
        inner_feat_selection: Whether to perform split-level feature
            selection or try to use pre-computed values.
        use_precomputed_cross_nmi: Whether to use the precomputed cross NMI
            from the Materials Project dataset, or recompute it per fold.
        presets: Override the built-in hyperparameter grid with these presets.
        fast: Whether to perform debug training, i.e. reduced presets and epochs.
        n_jobs: Try to parallelize the inner `fit_preset` over this number of
            processes; maxes out at `number_of_presets * nested_folds`.
        nested: Whether to perform nested CV for hyperparameter optimisation.
        **model_init_kwargs: Additional arguments to pass to the model on creation.

    Returns:
        A dictionary containing all the results from the training, broken
            down by model and by fold.

    """

    if fit_settings is None:
        fit_settings = {}

    if not fit_settings.get("n_feat"):
        nf = len(data.df_featurized.columns)
        fit_settings["n_feat"] = nf
    if not fit_settings.get("num_neurons"):
        # Pass dummy network
        fit_settings["num_neurons"] = [[4], [4], [4], [4]]

    fold_data = []
    results = defaultdict(list)

    for ind, (train, test) in enumerate(matbench_kfold_splits(data)):
        train_data, test_data = data.split((train, test))
        if inner_feat_selection:
            path = "folds/train_moddata_f{}".format(ind + 1)
            if os.path.isfile(path):
                train_data = MODData.load(path)
            else:
                train_data.feature_selection(
                    n=-1, use_precomputed_cross_nmi=use_precomputed_cross_nmi
                )
                os.makedirs("folds", exist_ok=True)
                train_data.save(path)

        fold_data.append((train_data, test_data))

    args = (target, target_weights, fit_settings)

    model_kwargs = {
        "model_type": model_type,
        "hp_optimization": hp_optimization,
        "fast": fast,
        "classification": classification,
        "save_folds": save_folds,
        "presets": presets,
        "save_models": save_models,
        "nested": nested,
        "n_jobs": n_jobs,
    }

    model_kwargs.update(model_init_kwargs)

    fold_results = []
    # Each item passed to `train_fold` is a `(fold_index, (train_data, test_data))` tuple.
    for fold in enumerate(fold_data):
        fold_results.append(train_fold(fold, *args, **model_kwargs))

    for fold in fold_results:
        for key in fold:
            results[key].append(fold[key])

    return results
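
A minimal invocation sketch for `matbench_benchmark` follows; the import path matches the one used in Example #3 below, while the data file and target names are placeholder assumptions:

# Hypothetical invocation; the data file and target names are placeholders.
from modnet.preprocessing import MODData
from modnet.matbench.benchmark import matbench_benchmark

data = MODData.load("matbench_moddata")  # pre-featurized full dataset

results = matbench_benchmark(
    data,
    ["eform"],             # target column(s) of data.df_targets
    {"eform": 1.0},        # target weights
    hp_optimization=True,  # run hyperparameter optimisation inside each fold
    n_jobs=4,
)

# `results` maps each result key to a per-fold list.
for key, per_fold in results.items():
    print(key, len(per_fold))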
Example #3
    def fit_preset(
        self,
        data: MODData,
        presets: Optional[List[Dict[str, Any]]] = None,
        val_fraction: float = 0.15,
        verbose: int = 0,
        classification: bool = False,
        refit: bool = True,
        fast: bool = False,
        nested: int = 5,
        callbacks: Optional[List[Any]] = None,
        n_jobs: Optional[int] = None,
    ) -> Tuple[
        List[List[Any]],
        np.ndarray,
        Optional[List[float]],
        List[List[float]],
        Dict[str, Any],
    ]:
        """Chooses an optimal hyper-parametered MODNet model from different presets.

        This function implements the "inner loop" of a cross-validation workflow. By
        modifying the `nested` argument, it can be run in full nested mode (i.e.
        train n_fold * n_preset models) or just with a simple random hold-out set.

        The data is first fitted on several well working MODNet presets
        with a validation set (10% of the furnished data by default).

        Sets the `self.model` attribute to the model with the lowest mean validation loss across
        all folds.

        Args:
            data: `MODData` object containing the training and validation samples.
            presets: A list of dictionaries containing custom presets.
            val_fraction: The fraction of the data to use for validation.
            verbose: The verbosity level to pass to tf.keras.
            classification: Whether or not we are performing classification.
            refit: Whether or not to refit the final model for each fold with
                the best-performing settings.
            fast: Used for debugging. If `True`, only fit the first 2 presets and
                reduce the number of epochs.
            nested: Integer specifying whether or not to perform a full nested CV.
                If 0, a simple validation split is performed based on the
                `val_fraction` argument. If a positive integer, use this number of
                inner CV folds and ignore `val_fraction`.
                Note: if set to 1, the value will be overwritten with the default of 5 folds.
            callbacks: A list of tf.keras callbacks to use during fitting; if `None`,
                a single `EarlyStopping` callback is used.
            n_jobs: The number of processes to use for multiprocessing.

        Returns:
            - A nested list of trained MODNet models of shape
              `(num_presets, num_inner_folds)`.
            - An array of validation losses of shape `(num_presets, num_inner_folds)`
              (excluding any refit).
            - The learning curve of the best model found during validation.
            - A nested list of learning curves of shape `(num_presets, num_inner_folds)`.
            - The settings of the best-performing preset.

        """

        from modnet.matbench.benchmark import matbench_kfold_splits

        if callbacks is None:
            es = tf.keras.callbacks.EarlyStopping(
                monitor="loss",
                min_delta=0.001,
                patience=100,
                verbose=verbose,
                mode="auto",
                baseline=None,
                restore_best_weights=False,
            )
            callbacks = [es]

        if presets is None:
            from modnet.model_presets import gen_presets

            presets = gen_presets(
                len(data.optimal_features),
                len(data.df_targets),
                classification=classification,
            )

        if fast and len(presets) >= 2:
            presets = presets[:2]
            for k, _ in enumerate(presets):
                presets[k]["epochs"] = 100

        num_nested_folds = 5
        if nested:
            num_nested_folds = nested
        if num_nested_folds <= 1:
            num_nested_folds = 5

        # create tasks
        splits = matbench_kfold_splits(data,
                                       n_splits=num_nested_folds,
                                       classification=classification)
        if not nested:
            splits = [
                train_test_split(range(len(data.df_featurized)),
                                 test_size=val_fraction)
            ]
            n_splits = 1
        else:
            n_splits = num_nested_folds
        train_val_datas = []
        for train, val in splits:
            train_val_datas.append(data.split((train, val)))

        tasks = []
        for i, params in enumerate(presets):
            n_feat = min(len(data.get_optimal_descriptors()), params["n_feat"])

            for ind in range(n_splits):
                val_params = {}
                train_data, val_data = train_val_datas[ind]
                val_params["val_data"] = val_data

                tasks += [{
                    "train_data": train_data,
                    "targets": self.targets,
                    "weights": self.weights,
                    "num_classes": self.num_classes,
                    "n_feat": n_feat,
                    "num_neurons": params["num_neurons"],
                    "lr": params["lr"],
                    "batch_size": params["batch_size"],
                    "epochs": params["epochs"],
                    "loss": params["loss"],
                    "act": params["act"],
                    "out_act": self.out_act,
                    "callbacks": callbacks,
                    "preset_id": i,
                    "fold_id": ind,
                    "verbose": verbose,
                    **val_params,
                }]

        val_losses = 1e20 * np.ones((len(presets), n_splits))
        learning_curves = [[None for _ in range(n_splits)]
                           for _ in range(len(presets))]
        models = [[None for _ in range(n_splits)] for _ in range(len(presets))]

        ctx = multiprocessing.get_context("spawn")
        pool = ctx.Pool(processes=n_jobs)
        LOG.info(
            f"Multiprocessing on {n_jobs} cores. Total of {multiprocessing.cpu_count()} cores available."
        )

        for res in tqdm.tqdm(
                pool.imap_unordered(map_validate_model, tasks, chunksize=1),
                total=len(tasks),
        ):
            val_loss, learning_curve, model, preset_id, fold_id = res
            LOG.info(f"Preset #{preset_id} fitting finished, loss: {val_loss}")
            # reload the model object after serialization
            model._restore_model()

            val_losses[preset_id, fold_id] = val_loss
            learning_curves[preset_id][fold_id] = learning_curve
            models[preset_id][fold_id] = model

        pool.close()
        pool.join()

        val_loss_per_preset = np.mean(val_losses, axis=1)
        best_preset_idx = int(np.argmin(val_loss_per_preset))
        best_model_idx = int(np.argmin(val_losses[best_preset_idx, :]))
        best_preset = presets[best_preset_idx]
        best_learning_curve = learning_curves[best_preset_idx][best_model_idx]
        best_model = models[best_preset_idx][best_model_idx]

        LOG.info(
            "Preset #{} resulted in the lowest validation loss with params {}".format(
                best_preset_idx + 1, tasks[n_splits * best_preset_idx + best_model_idx]
            )
        )

        if refit:
            LOG.info("Refitting with all data and parameters: {}".format(
                best_preset))
            # Building final model

            n_feat = min(len(data.get_optimal_descriptors()),
                         best_preset["n_feat"])
            self.model = MODNetModel(
                self.targets,
                self.weights,
                num_neurons=best_preset["num_neurons"],
                n_feat=n_feat,
                act=best_preset["act"],
                out_act=self.out_act,
                num_classes=self.num_classes,
            ).model
            self.n_feat = n_feat
            self.fit(
                data,
                val_fraction=0,
                lr=best_preset["lr"],
                epochs=best_preset["epochs"],
                batch_size=best_preset["batch_size"],
                loss=best_preset["loss"],
                callbacks=callbacks,
                verbose=verbose,
            )
        else:
            self.n_feat = best_model.n_feat
            self.model = best_model.model
            self._scaler = best_model._scaler

        return models, val_losses, best_learning_curve, learning_curves, best_preset
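
Finally, a usage sketch for `fit_preset` itself; the target name, the data file, and the `MODNetModel` constructor call are assumptions consistent with the attributes used above, not details given in the snippet:

# Hypothetical usage; the target name and data file are placeholders.
from modnet.preprocessing import MODData
from modnet.models import MODNetModel

train = MODData.load("train_moddata_featselected")  # feature-selected MODData

model = MODNetModel([[["eform"]]], {"eform": 1.0}, n_feat=64)

models, val_losses, best_curve, curves, best_preset = model.fit_preset(
    train,
    nested=5,    # 5 inner CV folds; nested=0 uses a simple hold-out split
    refit=True,  # retrain on all the data with the best preset afterwards
    n_jobs=4,
)
print("Best preset:", best_preset)
print("Mean validation loss per preset:", val_losses.mean(axis=1))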