def fit(
    self,
    training_data: MODData,
    n_jobs=1,
    **kwargs,
) -> None:
    """Train every model of the ensemble on the passed `MODData` object.

    When `self.bootstrap` is set, each model is trained on its own
    resampled (with replacement) view of `training_data`; otherwise all
    models are trained on `training_data` itself.

    Parameters:
        training_data: The data to train on.
        n_jobs: Number of worker processes. Values `<= 1` fit the models
            one after another in the current process; larger values fit
            them in a `spawn`-context process pool.
        **kwargs: Same as MODNetModel fit. Passed to each model's `fit`
            in the sequential path and merged into the task dictionaries
            handed to `_map_fit_MODNet` in the parallel path.

    """
    if self.bootstrap:
        LOG.info("Generating bootstrap data...")
        sample_ids = np.arange(len(training_data.df_targets))
        # Use a distinct but reproducible seed per model: reusing a single
        # seed would hand every model the exact same bootstrap sample and
        # defeat the purpose of bootstrapping.
        train_datas = [
            training_data.split(
                (
                    resample(
                        sample_ids,
                        replace=True,
                        random_state=2943 + i,
                    ),
                    [],
                )
            )[0]
            for i in range(self.n_models)
        ]
    else:
        train_datas = [training_data for _ in range(self.n_models)]

    if n_jobs <= 1:
        for i in range(self.n_models):
            LOG.info(f"Bootstrap fitting model #{i + 1}/{self.n_models}")
            self.model[i].fit(train_datas[i], **kwargs)
            # Log the final value of every tracked metric.
            history = self.model[i].history
            model_summary = "".join(
                "{}: {:.4f}\t".format(k, history[k][-1]) for k in history.keys()
            )
            LOG.info(model_summary)
        return

    tasks = []
    for i, m in enumerate(self.model):
        # Models have to be made picklable before being sent to workers.
        m._make_picklable()
        tasks.append(
            {
                "model": m,
                "training_data": train_datas[i],
                "model_id": i,
                **kwargs,
            }
        )

    ctx = multiprocessing.get_context("spawn")
    # The context manager terminates the pool if a worker raises, so that
    # no child processes are left behind.
    with ctx.Pool(processes=n_jobs) as pool:
        for model, model_id in tqdm.tqdm(
            pool.imap_unordered(_map_fit_MODNet, tasks, chunksize=1),
            total=self.n_models,
        ):
            # Results arrive in arbitrary order; `model_id` says which
            # ensemble slot the fitted model belongs to.
            model._restore_model()
            self.model[model_id] = model
            model_summary = f"Model #{model_id}\t" + "".join(
                "{}: {:.4f}\t".format(k, model.history[k][-1])
                for k in model.history.keys()
            )
            LOG.info(model_summary)
        pool.close()
        pool.join()
def matbench_benchmark(
    data: MODData,
    target: List[str],
    target_weights: Dict[str, float],
    fit_settings: Optional[Dict[str, Any]] = None,
    classification: bool = False,
    model_type: Type[MODNetModel] = MODNetModel,
    save_folds: bool = False,
    save_models: bool = False,
    hp_optimization: bool = True,
    inner_feat_selection: bool = True,
    use_precomputed_cross_nmi: bool = True,
    presets: Optional[List[dict]] = None,
    fast: bool = False,
    n_jobs: Optional[int] = None,
    nested: bool = False,
    **model_init_kwargs,
) -> dict:
    """Train and cross-validate a model against Matbench data splits,
    optionally performing hyperparameter optimisation.

    Arguments:
        data: The entire dataset as a `MODData`.
        target: The list of target names to train on.
        target_weights: The target weights to use for the `MODNetModel`.
        fit_settings: Any settings to pass to `model.fit(...)` directly
            (typically when not performing hyperparameter optimisation).
            The passed dictionary is not modified; missing `n_feat` and
            `num_neurons` entries are filled in on an internal copy.
        classification: Whether all tasks are classification rather than
            regression.
        model_type: The type of the model to create and benchmark.
        save_folds: Whether to save dataframes with pre-processed fold
            data (e.g. feature selection).
        save_models: Whether to pickle all trained models according to
            their fold index and performance.
        hp_optimization: Whether to perform hyperparameter optimisation.
        inner_feat_selection: Whether to perform split-level feature
            selection or try to use pre-computed values. When enabled, the
            per-fold training data is loaded from
            `folds/train_moddata_f<fold>` if that file exists, and is
            otherwise computed and saved there.
        use_precomputed_cross_nmi: Whether to use the precomputed cross
            NMI from the Materials Project dataset, or recompute per fold.
        presets: Override the built-in hyperparameter grid with these
            presets.
        fast: Whether to perform debug training, i.e. reduced presets and
            epochs.
        n_jobs: Try to parallelize the inner fit_preset over this number
            of processes. Maxes out at number_of_presets*nested_folds
        nested: Whether to perform nested CV for hyperparameter
            optimisation.
        **model_init_kwargs: Additional arguments to pass to the model on
            creation.

    Returns:
        A dictionary containing all the results from the training, broken
        down by model and by fold: each key returned by `train_fold` maps
        to the list of that key's values, one entry per fold.

    """
    # Work on a copy so the defaults injected below never leak into the
    # dictionary owned by the caller.
    fit_settings = dict(fit_settings) if fit_settings else {}

    if not fit_settings.get("n_feat"):
        fit_settings["n_feat"] = len(data.df_featurized.columns)

    if not fit_settings.get("num_neurons"):
        # Pass dummy network
        fit_settings["num_neurons"] = [[4], [4], [4], [4]]

    fold_data = []
    for ind, (train, test) in enumerate(matbench_kfold_splits(data)):
        train_data, test_data = data.split((train, test))
        if inner_feat_selection:
            # Feature selection results are cached on disk per fold.
            path = "folds/train_moddata_f{}".format(ind + 1)
            if os.path.isfile(path):
                train_data = MODData.load(path)
            else:
                train_data.feature_selection(
                    n=-1,
                    use_precomputed_cross_nmi=use_precomputed_cross_nmi,
                )
                os.makedirs("folds", exist_ok=True)
                train_data.save(path)

        fold_data.append((train_data, test_data))

    args = (target, target_weights, fit_settings)

    model_kwargs = {
        "model_type": model_type,
        "hp_optimization": hp_optimization,
        "fast": fast,
        "classification": classification,
        "save_folds": save_folds,
        "presets": presets,
        "save_models": save_models,
        "nested": nested,
        "n_jobs": n_jobs,
    }
    model_kwargs.update(model_init_kwargs)

    # Each fold is handed over as an `(index, (train_data, test_data))` pair.
    fold_results = [
        train_fold(fold, *args, **model_kwargs) for fold in enumerate(fold_data)
    ]

    # Regroup the per-fold result dictionaries into one list per key.
    results = defaultdict(list)
    for fold in fold_results:
        for key in fold:
            results[key].append(fold[key])

    return results
def fit_preset(
    self,
    data: MODData,
    presets: Optional[List[Dict[str, Any]]] = None,
    val_fraction: float = 0.15,
    verbose: int = 0,
    classification: bool = False,
    refit: bool = True,
    fast: bool = False,
    nested: int = 5,
    callbacks: Optional[List[Any]] = None,
    n_jobs=None,
) -> Tuple[
    List[List[Any]],
    np.ndarray,
    Optional[List[float]],
    List[List[float]],
    Dict[str, Any],
]:
    """Chooses an optimal hyper-parametered MODNet model from different presets.

    This function implements the "inner loop" of a cross-validation
    workflow. By modifying the `nested` argument, it can be run in full
    nested mode (i.e. train n_fold * n_preset models) or just with a simple
    random hold-out set.

    Every preset is fitted on every train/validation split. The preset with
    the lowest mean validation loss across its splits is selected, and
    `self.model` is set either to a model refitted on all of `data` with
    that preset (`refit=True`) or to the best single model trained with
    that preset (`refit=False`).

    Args:
        data: MODData object contain training and validation samples.
        presets: A list of dictionaries containing custom presets. If
            `None`, presets are generated with `gen_presets`. The passed
            dictionaries are never modified.
        val_fraction: The fraction of the data to use for validation when
            `nested` is falsy.
        verbose: The verbosity level to pass to tf.keras
        classification: Whether or not we are performing classification.
        refit: Whether or not to refit the final model on all of `data`
            with the best-performing settings.
        fast: Used for debugging. If `True` and at least two presets are
            available, only fit the first 2 presets and reduce the number
            of epochs to 100.
        nested: integer specifying whether or not to perform a full nested
            CV. If 0, a simple validation split is performed based on
            val_fraction argument. If an integer, use this number of inner
            CV folds, ignoring the `val_fraction` argument. Note: If set
            to 1, the value will be overwritten to a default of 5 folds.
        callbacks: Callbacks used for every fit. If `None`, a single
            `EarlyStopping` callback monitoring `loss` is used.
        n_jobs: number of jobs for multiprocessing

    Returns:
        - A nested list of the trained models, indexed as
          `[preset][fold]`.
        - An array of shape `(n_presets, n_folds)` with the validation
          loss of each of those models.
        - The learning curve of the best model trained with the selected
          preset during validation.
        - A nested list of learning curves for each trained model,
          indexed as `[preset][fold]`.
        - The settings of the best-performing preset.

    """
    from modnet.matbench.benchmark import matbench_kfold_splits

    if callbacks is None:
        es = tf.keras.callbacks.EarlyStopping(
            monitor="loss",
            min_delta=0.001,
            patience=100,
            verbose=verbose,
            mode="auto",
            baseline=None,
            restore_best_weights=False,
        )
        callbacks = [es]

    if presets is None:
        from modnet.model_presets import gen_presets

        presets = gen_presets(
            len(data.optimal_features),
            len(data.df_targets),
            classification=classification,
        )

    if fast and len(presets) >= 2:
        # Copy the preset dictionaries so the reduced epoch count does not
        # leak into presets owned by the caller.
        presets = [{**preset, "epochs": 100} for preset in presets[:2]]

    # Build the train/validation splits.
    if nested:
        # Fold counts <= 1 (this includes `nested=True`) fall back to 5.
        n_splits = nested if nested > 1 else 5
        splits = matbench_kfold_splits(
            data, n_splits=n_splits, classification=classification
        )
    else:
        n_splits = 1
        splits = [
            train_test_split(
                range(len(data.df_featurized)), test_size=val_fraction
            )
        ]

    train_val_datas = [data.split((train, val)) for train, val in splits]

    # One task per (preset, fold) pair, ordered preset-major so that the
    # task of pair (p, f) sits at index `n_splits * p + f`.
    n_optimal = len(data.get_optimal_descriptors())
    tasks = []
    for preset_id, params in enumerate(presets):
        n_feat = min(n_optimal, params["n_feat"])
        for fold_id, (train_data, val_data) in enumerate(train_val_datas):
            tasks.append(
                {
                    "train_data": train_data,
                    "targets": self.targets,
                    "weights": self.weights,
                    "num_classes": self.num_classes,
                    "n_feat": n_feat,
                    "num_neurons": params["num_neurons"],
                    "lr": params["lr"],
                    "batch_size": params["batch_size"],
                    "epochs": params["epochs"],
                    "loss": params["loss"],
                    "act": params["act"],
                    "out_act": self.out_act,
                    "callbacks": callbacks,
                    "preset_id": preset_id,
                    "fold_id": fold_id,
                    "verbose": verbose,
                    "val_data": val_data,
                }
            )

    # Slots that never receive a result keep a huge loss and are therefore
    # never selected as best.
    val_losses = 1e20 * np.ones((len(presets), n_splits))
    learning_curves = [[None for _ in range(n_splits)] for _ in range(len(presets))]
    models = [[None for _ in range(n_splits)] for _ in range(len(presets))]

    ctx = multiprocessing.get_context("spawn")
    # The context manager terminates the pool if a worker raises, so that
    # no child processes are left behind.
    with ctx.Pool(processes=n_jobs) as pool:
        LOG.info(
            f"Multiprocessing on {n_jobs} cores. Total of {multiprocessing.cpu_count()} cores available."
        )

        for res in tqdm.tqdm(
            pool.imap_unordered(map_validate_model, tasks, chunksize=1),
            total=len(tasks),
        ):
            val_loss, learning_curve, model, preset_id, fold_id = res
            LOG.info(f"Preset #{preset_id} fitting finished, loss: {val_loss}")
            # reload the model object after serialization
            model._restore_model()

            val_losses[preset_id, fold_id] = val_loss
            learning_curves[preset_id][fold_id] = learning_curve
            models[preset_id][fold_id] = model

        pool.close()
        pool.join()

    # Pick the preset with the lowest mean loss, then the best fold within it.
    val_loss_per_preset = np.mean(val_losses, axis=1)
    best_preset_idx = int(np.argmin(val_loss_per_preset))
    best_model_idx = int(np.argmin(val_losses[best_preset_idx, :]))
    best_preset = presets[best_preset_idx]
    best_learning_curve = learning_curves[best_preset_idx][best_model_idx]
    best_model = models[best_preset_idx][best_model_idx]

    LOG.info(
        "Preset #{} resulted in lowest validation loss with params {}".format(
            best_preset_idx + 1,
            tasks[n_splits * best_preset_idx + best_model_idx],
        )
    )

    if refit:
        LOG.info(
            "Refitting with all data and parameters: {}".format(best_preset)
        )
        # Building final model
        n_feat = min(n_optimal, best_preset["n_feat"])
        self.model = MODNetModel(
            self.targets,
            self.weights,
            num_neurons=best_preset["num_neurons"],
            n_feat=n_feat,
            act=best_preset["act"],
            out_act=self.out_act,
            num_classes=self.num_classes,
        ).model
        self.n_feat = n_feat
        self.fit(
            data,
            val_fraction=0,
            lr=best_preset["lr"],
            epochs=best_preset["epochs"],
            batch_size=best_preset["batch_size"],
            loss=best_preset["loss"],
            callbacks=callbacks,
            verbose=verbose,
        )
    else:
        # Adopt the best already-trained model as is.
        self.n_feat = best_model.n_feat
        self.model = best_model.model
        self._scaler = best_model._scaler

    return models, val_losses, best_learning_curve, learning_curves, best_preset