def test_small_moddata_featurization():
    """ This test creates a new MODData from the MP 2018.6 structures. """
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_small.zip")

    # Loading pickles can be dangerous, so let's at least check that the SHA-512 matches
    # what it was when created
    assert (
        get_sha512_of_file(data_file)
        == "37bd4f8ce6f29c904a13e5670dd53af9a8779094727052ec85ccd6362b1b3765"
        "ac613426331811b3f626242896d87c3f6bc1884cc5545875b5ae66a712f9e218"
    )

    old = MODData.load(data_file)
    structures = old.structures
    targets = old.targets
    names = old.names

    new = MODData(structures, targets, target_names=names)
    new.featurize(fast=False)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    for i in range(len(old_cols)):
        print(new_cols[i], old_cols[i])
        assert new_cols[i] == old_cols[i]

    np.testing.assert_array_equal(old_cols, new_cols)

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
def test_small_moddata_featurization(small_moddata):
    """ This test creates a new MODData from the MP 2018.6 structures. """
    old = small_moddata
    structures = old.structures
    targets = old.targets
    names = old.names

    new = MODData(structures, targets, target_names=names)
    new.featurize(fast=False, n_jobs=1)

    new_cols = sorted(new.df_featurized.columns.tolist())
    old_cols = sorted(old.df_featurized.columns.tolist())

    for i in range(len(old_cols)):
        print(new_cols[i], old_cols[i])
        assert new_cols[i] == old_cols[i]

    np.testing.assert_array_equal(old_cols, new_cols)

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            old.df_featurized[col].to_numpy(),
        )
def test_load_precomputed():
    """Tries to load and unpack the dataset on figshare.

    Requires ~10 GB of memory.
    """
    MODData.load_precomputed("MP_2018.6")
def test_precomputed_cross_nmi(small_moddata):
    new = MODData(
        materials=small_moddata.structures,
        targets=small_moddata.targets,
        target_names=small_moddata.names,
        df_featurized=small_moddata.df_featurized,
    )
    new.feature_selection(5, use_precomputed_cross_nmi=True)
def fit(self, data: MODData, val_fraction=0.0, val_key=None, lr=0.001,
        epochs=200, batch_size=128, xscale='minmax', yscale=None):
    print('new')
    self.xscale = xscale
    self.target_names = data.names
    self.optimal_descriptors = data.get_optimal_descriptors()

    x = data.get_featurized_df()[self.optimal_descriptors[:self.n_feat]].values
    print(x.shape)
    y = data.get_target_df()[self.targets_flatten].values.transpose()
    print(y.shape)

    # Scale the input features:
    if self.xscale == 'minmax':
        self.xmin = x.min(axis=0)
        self.xmax = x.max(axis=0)
        x = (x - self.xmin) / (self.xmax - self.xmin) - 0.5
    elif self.xscale == 'standard':
        self.scaler = StandardScaler()
        x = self.scaler.fit_transform(x)

    x = np.nan_to_num(x)

    if val_fraction > 0:
        if self.PP:
            print_callback = LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    "epoch {}: loss: {:.3f}, val_loss:{:.3f} val_{}:{:.3f}".format(
                        epoch, logs['loss'], logs['val_loss'], val_key,
                        logs['val_{}_mae'.format(val_key)])))
        else:
            print_callback = LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    "epoch {}: loss: {:.3f}, val_loss:{:.3f} val_{}:{:.3f}".format(
                        epoch, logs['loss'], logs['val_loss'], val_key,
                        logs['val_mae'])))
    else:
        print_callback = LambdaCallback(
            on_epoch_end=lambda epoch, logs: print(
                "epoch {}: loss: {:.3f}".format(epoch, logs['loss'])))

    fit_params = {
        'x': x,
        'y': list(y),
        'epochs': epochs,
        'batch_size': batch_size,
        'verbose': 0,
        'validation_split': val_fraction,
        'callbacks': [print_callback],
    }

    print('compile', flush=True)
    self.model.compile(
        loss='mse',
        optimizer=keras.optimizers.Adam(lr=lr),
        metrics=['mae'],
        loss_weights=self.weights,
    )
    print('fit', flush=True)
    self.model.fit(**fit_params)
def predict(self, test_data: MODData) -> pd.DataFrame:
    """Predict the target values for the passed MODData.

    Parameters:
        test_data: A featurized and feature-selected `MODData`
            object containing the descriptors used in training.

    Returns:
        A `pandas.DataFrame` containing the predicted values of the targets.

    """
    x = test_data.get_featurized_df()[
        self.optimal_descriptors[:self.n_feat]
    ].values

    # Scale the input features:
    if self._scaler is not None:
        x = self._scaler.transform(x)

    x = np.nan_to_num(x)

    if self._multi_target:
        p = np.array(self.model.predict(x))[:, :, 0].transpose()
    else:
        p = np.array(self.model.predict(x))[:, 0].transpose()

    predictions = pd.DataFrame(p)
    predictions.columns = self.targets_flatten
    predictions.index = test_data.structure_ids

    return predictions
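# Hedged usage sketch (not part of the original source): calling `predict` on a
# held-out split. Assumes `data` is a featurized, feature-selected MODData with
# 100 entries and `model` is an already fitted MODNetModel; the 90/10 index split
# below is purely illustrative.
train_data, test_data = data.split((list(range(90)), list(range(90, 100))))
predictions = model.predict(test_data)  # DataFrame indexed by test_data.structure_ids
print(predictions.head())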
def evaluate(self, test_data: MODData) -> float:
    """Evaluates the target values for the passed MODData by returning the corresponding loss.

    Parameters:
        test_data: A featurized and feature-selected `MODData`
            object containing the descriptors used in training.

    Returns:
        Loss score

    """
    # prevents NaN predictions if some features are inf
    x = (
        test_data.get_featurized_df()
        .replace([np.inf, -np.inf, np.nan], 0)[self.optimal_descriptors[:self.n_feat]]
        .values
    )

    # Scale the input features:
    x = np.nan_to_num(x)
    if self._scaler is not None:
        x = self._scaler.transform(x)
        x = np.nan_to_num(x, nan=-1)

    y = []
    for targ in self.targets_flatten:
        if self.num_classes[targ] >= 2:  # Classification
            y_inner = tf.keras.utils.to_categorical(
                test_data.df_targets[targ].values,
                num_classes=self.num_classes[targ],
            )
        else:
            y_inner = test_data.df_targets[targ].values.astype(np.float, copy=False)
        y.append(y_inner)

    return self.model.evaluate(x, y)[0]
def test_small_moddata_feature_selection_classif(small_moddata):
    """ This test creates a classifier MODData and tests the feature selection method. """
    x1 = np.array([0] * 500 + [1] * 500 + [2] * 500, dtype='float')
    x2 = np.random.choice(2, 1500)
    x3 = x1 * x2
    x4 = x1 + (x2 * 0.5)
    targets = np.array(x1, dtype='int').reshape(-1, 1)
    features = np.array([x1, x2, x3, x4]).T
    names = ['my_classes']

    c_nmi = pd.DataFrame(
        [[1, 0, 0.5, 0.5], [0, 1, 0.5, 0.5], [0.5, 0.5, 1, 0.5], [0.5, 0.5, 0.5, 1]],
        columns=['f1', 'f2', 'f3', 'f4'],
        index=['f1', 'f2', 'f3', 'f4'],
    )

    classif_md = MODData(
        ['dummy'] * 1500, targets, target_names=names, num_classes={"my_classes": 3}
    )
    classif_md.df_featurized = pd.DataFrame(features, columns=['f1', 'f2', 'f3', 'f4'])
    classif_md.feature_selection(n=3, cross_nmi=c_nmi)

    assert len(classif_md.get_optimal_descriptors()) == 3
    assert classif_md.get_optimal_descriptors() == ['f1', 'f4', 'f3']
def test_small_moddata_composition_featurization(small_moddata_composition):
    """ This test creates a new MODData from the MP 2018.6 compositions. """
    reference = small_moddata_composition
    compositions = reference.compositions

    new = MODData(materials=compositions)
    new.featurize(fast=False, n_jobs=1)

    new_cols = sorted(new.df_featurized.columns.tolist())
    ref_cols = sorted(reference.df_featurized.columns.tolist())

    for i in range(len(ref_cols)):
        # print(new_cols[i], ref_cols[i])
        assert new_cols[i] == ref_cols[i]

    for col in new.df_featurized.columns:
        np.testing.assert_almost_equal(
            new.df_featurized[col].to_numpy(),
            reference.df_featurized[col].to_numpy(),
        )
def load_or_featurize(task, n_jobs=1):
    data_files = glob.glob("./precomputed/*.gz")
    if len(data_files) == 0:
        data = featurize(task, n_jobs=n_jobs)
    else:
        precomputed_moddata = data_files[0]
        if len(data_files) > 1:
            print(
                f"Found multiple data files {data_files}, loading the first {data_files[0]}"
            )

        data = MODData.load(precomputed_moddata)

    return data
def _load_moddata(filename):
    """Loads the pickled MODData from the test directory and checks its hash."""
    from modnet.preprocessing import MODData

    data_file = Path(__file__).parent.joinpath(f"data/{filename}")
    if filename not in _TEST_DATA_HASHES:
        raise RuntimeError(
            f"Cannot verify hash of {filename} as it was not provided, will not load pickle."
        )
    # Loading pickles can be dangerous, so let's at least check that the hash matches
    # what it was when created
    assert get_hash_of_file(data_file) == _TEST_DATA_HASHES[filename]

    return MODData.load(data_file)
def fit(
    self,
    training_data: MODData,
    n_jobs=1,
    **kwargs,
) -> None:
    """Train the model on the passed training `MODData` object.

    Parameters:
        same as MODNetModel fit.
    """
    if self.bootstrap:
        LOG.info("Generating bootstrap data...")
        train_datas = [
            training_data.split(
                (
                    resample(
                        np.arange(len(training_data.df_targets)),
                        replace=True,
                        random_state=2943,
                    ),
                    [],
                )
            )[0]
            for _ in range(self.n_models)
        ]
    else:
        train_datas = [training_data for _ in range(self.n_models)]

    if n_jobs <= 1:
        for i in range(self.n_models):
            LOG.info(f"Bootstrap fitting model #{i + 1}/{self.n_models}")
            self.model[i].fit(train_datas[i], **kwargs)
            model_summary = ""
            for k in self.model[i].history.keys():
                model_summary += "{}: {:.4f}\t".format(k, self.model[i].history[k][-1])
            LOG.info(model_summary)
    else:
        ctx = multiprocessing.get_context('spawn')
        pool = ctx.Pool(processes=n_jobs)
        tasks = []
        for i, m in enumerate(self.model):
            m._make_picklable()
            tasks.append(
                {'model': m, 'training_data': train_datas[i], 'model_id': i, **kwargs}
            )
        for res in tqdm.tqdm(
            pool.imap_unordered(_map_fit_MODNet, tasks, chunksize=1),
            total=self.n_models,
        ):
            model, model_id = res
            model._restore_model()
            self.model[model_id] = model
            model_summary = f"Model #{model_id}\t"
            for k in model.history.keys():
                model_summary += "{}: {:.4f}\t".format(k, model.history[k][-1])
            LOG.info(model_summary)
        pool.close()
        pool.join()
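# Hedged usage sketch (not part of the original source): fitting an ensemble in
# parallel. The target name "gap" and n_jobs=4 are placeholders; the nested
# target/weight format mirrors the EnsembleMODNetModel construction shown later
# in this file, and `train_data` is assumed to be a feature-selected MODData.
ensemble = EnsembleMODNetModel([[["gap"]]], {"gap": 1})
ensemble.fit(train_data, n_jobs=4)  # spawns a pool, one fit per member model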
def test_load_moddata_zip():
    """ This test checks that older MODData objects can still be loaded. """
    data_file = Path(__file__).parent.joinpath("data/MP_2018.6_subset.zip")

    # Loading pickles can be dangerous, so let's at least check that the SHA-512 matches
    # what it was when created
    assert (
        get_sha512_of_file(data_file)
        == "d7d75e646dbde539645c8c0b065fd82cbe93f81d3500809655bd13d0acf2027c"
        "1786091a73f53985b08868c5be431a3c700f7f1776002df28ebf3a12a79ab1a1"
    )

    data = MODData.load(data_file)
    assert len(data.structures) == 100
    assert len(data.mpids) == 100
    assert len(data.df_structure) == 100
    assert len(data.df_featurized) == 100
    assert len(data.df_targets) == 100
def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
    """Predict the target values for the passed MODData.

    Parameters:
        test_data: A featurized and feature-selected `MODData`
            object containing the descriptors used in training.
        return_prob: For classification tasks only: whether to return the
            probability of each class OR only return the most probable class.

    Returns:
        A `pandas.DataFrame` containing the predicted values of the targets.

    """
    # prevents NaN predictions if some features are inf
    x = (
        test_data.get_featurized_df()
        .replace([np.inf, -np.inf, np.nan], 0)[self.optimal_descriptors[:self.n_feat]]
        .values
    )

    # Scale the input features:
    x = np.nan_to_num(x)
    if self._scaler is not None:
        x = self._scaler.transform(x)
        x = np.nan_to_num(x, nan=-1)

    p = np.array(self.model.predict(x))
    if len(p.shape) == 2:
        p = np.array([p])

    p_dic = {}
    for i, name in enumerate(self.targets_flatten):
        if self.num_classes[name] >= 2:
            if return_prob:
                temp = p[i, :, :] / (p[i, :, :].sum(axis=1)).reshape((-1, 1))
                for j in range(temp.shape[-1]):
                    p_dic["{}_prob_{}".format(name, j)] = temp[:, j]
            else:
                p_dic[name] = np.argmax(p[i, :, :], axis=1)
        else:
            p_dic[name] = p[i, :, 0]

    predictions = pd.DataFrame(p_dic)
    predictions.index = test_data.structure_ids

    return predictions
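# Hedged usage sketch (not part of the original source): for a classification
# target, `return_prob=True` returns one normalised column per class (named
# "<target>_prob_<class>"), while the default returns the argmax class label.
# `model` and `test_data` are assumed to be a fitted classifier and its test set.
probabilities = model.predict(test_data, return_prob=True)
labels = model.predict(test_data)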
def featurize(task, n_jobs=1):
    import warnings

    warnings.filterwarnings("ignore", category=RuntimeWarning)

    from modnet.preprocessing import MODData
    from modnet.featurizers.presets import DeBreuck2020Featurizer
    from matminer.datasets import load_dataset

    if task == "matbench_elastic":
        df_g = load_dataset("matbench_log_gvrh")
        df_k = load_dataset("matbench_log_kvrh")
        df = df_g.join(df_k.drop("structure", axis=1))
    else:
        df = load_dataset(task)

    mapping = {
        col: col.replace(" ", "_").replace("(", "").replace(")", "")
        for ind, col in enumerate(df.columns)
    }
    df.rename(columns=mapping, inplace=True)

    targets = [
        col for col in df.columns if col not in ("id", "structure", "composition")
    ]

    if "structure" not in df.columns:
        featurizer = CompositionOnlyFeaturizer()
    else:
        featurizer = DeBreuck2020Featurizer(fast_oxid=True)

    try:
        materials = (
            df["structure"]
            if "structure" in df.columns
            else df["composition"].map(Composition)
        )
    except KeyError:
        raise RuntimeError(
            f"Could not find any materials data dataset for task {task!r}!"
        )

    data = MODData(
        materials=materials.tolist(),
        targets=df[targets].values,
        target_names=targets,
        featurizer=featurizer,
    )
    data.featurize(n_jobs=n_jobs)

    os.makedirs("./precomputed", exist_ok=True)
    data.save(f"./precomputed/{task}_moddata.pkl.gz")

    return data
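# Hedged usage sketch (not part of the original source): precomputing and caching
# the MODData for one task, then reusing the cache via load_or_featurize (defined
# above); "matbench_log_gvrh" is one of the tasks referenced in this script.
if __name__ == "__main__":
    data = featurize("matbench_log_gvrh", n_jobs=8)
    data = load_or_featurize("matbench_log_gvrh", n_jobs=8)  # later runs hit the cache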
def run_predict(data, final_model, settings, save_folds=False, dknn_only=False):
    """
    Runs the benchmark based on final_model without retraining everything.
    It also computes the kNN distance and stores it in the results pickle.
    Eventually, this should be integrated into the modnet benchmark.

    :param data: The `MODData` containing the full benchmark dataset.
    :param final_model: The `EnsembleMODNetModel` holding all trained sub-models.
    :param settings: The benchmark settings dictionary (must contain "task").
    :return: A dictionary of per-fold predictions, errors, scores and models.
    """
    task = settings["task"]

    # rebuild the EnsembleMODNetModels from the final model
    n_best_archs = 5  # change this (from 1 to 5 max) to adapt number of inner best archs chosen
    bootstrap_size = 5
    outer_fold_size = bootstrap_size * 5 * 5
    inner_fold_size = bootstrap_size * 5
    models = []

    multi_target = bool(len(data.df_targets.columns) - 1)

    for i in range(5):  # outer fold
        modnet_models = []
        for j in range(5):  # inner fold
            modnet_models += final_model.model[
                (i * outer_fold_size) + (j * inner_fold_size):
                (i * outer_fold_size) + (j * inner_fold_size) + (n_best_archs * bootstrap_size)
            ]
        model = EnsembleMODNetModel(modnet_models=modnet_models)
        models.append(model)

    if dknn_only:
        with open(f"results/{task}_results.pkl", "rb") as f:
            results = pickle.load(f)
        results["dknns"] = []
    else:
        results = defaultdict(list)

    for ind, (train, test) in enumerate(
        matbench_kfold_splits(data, classification=settings.get("classification", False))
    ):
        train_data, test_data = data.split((train, test))
        path = "folds/train_moddata_f{}".format(ind + 1)
        train_data = MODData.load(path)
        assert (
            len(set(train_data.df_targets.index).intersection(set(test_data.df_targets.index)))
            == 0
        )

        model = models[ind]

        # compute dkNN
        # TODO: test this quickly before submitting
        max_feat_model = np.argmax([m.n_feat for m in model.model])
        n_feat = model.model[max_feat_model].n_feat
        feature_names = model.model[max_feat_model].optimal_descriptors
        dknn = get_dknn(train_data, test_data, feature_names)
        results["dknns"].append(dknn)
        if dknn_only:
            continue

        predict_kwargs = {}
        if settings.get("classification"):
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results
            stds = None

        targets = test_data.df_targets

        if settings.get("classification"):
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            pred_bool = model.predict(test_data, return_prob=False)
            print(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            score = np.mean(np.abs(errors.values))

        if save_folds:
            opt_feat = train_data.optimal_features[:n_feat]
            df_train = train_data.df_featurized
            df_train = df_train[opt_feat]
            df_train.to_csv("folds/train_f{}.csv".format(ind + 1))
            df_test = test_data.df_featurized
            df_test = df_test[opt_feat]
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
            df_test.to_csv("folds/test_f{}.csv".format(ind + 1))

        results["predictions"].append(predictions)
        if stds is not None:
            results["stds"].append(stds)
        results["targets"].append(targets)
        results["errors"].append(errors)
        results["scores"].append(score)
        results["model"].append(model)

    return results
def fit_preset(
    self,
    data: MODData,
    presets: List[Dict[str, Any]] = None,
    val_fraction: float = 0.15,
    verbose: int = 0,
    classification: bool = False,
    refit: bool = True,
    fast: bool = False,
    nested: int = 5,
    callbacks: List[Any] = None,
    n_jobs=None,
) -> Tuple[
    List[List[Any]],
    np.ndarray,
    Optional[List[float]],
    List[List[float]],
    Dict[str, Any],
]:
    """Chooses an optimal hyper-parametered MODNet model from different presets.

    This function implements the "inner loop" of a cross-validation workflow. By
    modifying the `nested` argument, it can be run in full nested mode (i.e.
    train n_fold * n_preset models) or just with a simple random hold-out set.

    The data is first fitted on several well working MODNet presets
    with a validation set (15% of the furnished data by default).

    Sets the `self.model` attribute to the model with the lowest mean validation
    loss across all folds.

    Args:
        data: MODData object containing training and validation samples.
        presets: A list of dictionaries containing custom presets.
        verbose: The verbosity level to pass to tf.keras
        val_fraction: The fraction of the data to use for validation.
        classification: Whether or not we are performing classification.
        refit: Whether or not to refit the final model for each fold with
            the best-performing settings.
        fast: Used for debugging. If `True`, only fit the first 2 presets and
            reduce the number of epochs.
        nested: integer specifying whether or not to perform a full nested CV.
            If 0, a simple validation split is performed based on the
            val_fraction argument. If an integer, use this number of inner CV
            folds, ignoring the `val_fraction` argument.
            Note: If set to 1, the value will be overwritten to a default of 5 folds.
        n_jobs: number of jobs for multiprocessing

    Returns:
        - A list of length num_outer_folds containing lists of MODNet models of
          length num_inner_folds.
        - A list of validation losses achieved by the best model for each fold
          during validation (excluding refit).
        - The learning curve of the final (refitted) model (or `None` if
          `refit` is `False`)
        - A nested list of learning curves for each trained model of lengths
          (num_outer_folds, num_inner_folds).
        - The settings of the best-performing preset.

    """
    from modnet.matbench.benchmark import matbench_kfold_splits

    if callbacks is None:
        es = tf.keras.callbacks.EarlyStopping(
            monitor="loss",
            min_delta=0.001,
            patience=100,
            verbose=verbose,
            mode="auto",
            baseline=None,
            restore_best_weights=False,
        )
        callbacks = [es]

    if presets is None:
        from modnet.model_presets import gen_presets

        presets = gen_presets(
            len(data.optimal_features),
            len(data.df_targets),
            classification=classification,
        )

    if fast and len(presets) >= 2:
        presets = presets[:2]
        for k, _ in enumerate(presets):
            presets[k]["epochs"] = 100

    num_nested_folds = 5
    if nested:
        num_nested_folds = nested
    if num_nested_folds <= 1:
        num_nested_folds = 5

    # create tasks
    splits = matbench_kfold_splits(
        data, n_splits=num_nested_folds, classification=classification
    )
    if not nested:
        splits = [
            train_test_split(range(len(data.df_featurized)), test_size=val_fraction)
        ]
        n_splits = 1
    else:
        n_splits = num_nested_folds

    train_val_datas = []
    for train, val in splits:
        train_val_datas.append(data.split((train, val)))

    tasks = []
    for i, params in enumerate(presets):
        n_feat = min(len(data.get_optimal_descriptors()), params["n_feat"])

        for ind in range(n_splits):
            val_params = {}
            train_data, val_data = train_val_datas[ind]
            val_params["val_data"] = val_data

            tasks += [
                {
                    "train_data": train_data,
                    "targets": self.targets,
                    "weights": self.weights,
                    "num_classes": self.num_classes,
                    "n_feat": n_feat,
                    "num_neurons": params["num_neurons"],
                    "lr": params["lr"],
                    "batch_size": params["batch_size"],
                    "epochs": params["epochs"],
                    "loss": params["loss"],
                    "act": params["act"],
                    "out_act": self.out_act,
                    "callbacks": callbacks,
                    "preset_id": i,
                    "fold_id": ind,
                    "verbose": verbose,
                    **val_params,
                }
            ]

    val_losses = 1e20 * np.ones((len(presets), n_splits))
    learning_curves = [[None for _ in range(n_splits)] for _ in range(len(presets))]
    models = [[None for _ in range(n_splits)] for _ in range(len(presets))]

    ctx = multiprocessing.get_context("spawn")
    pool = ctx.Pool(processes=n_jobs)
    LOG.info(
        f"Multiprocessing on {n_jobs} cores. Total of {multiprocessing.cpu_count()} cores available."
    )

    for res in tqdm.tqdm(
        pool.imap_unordered(map_validate_model, tasks, chunksize=1),
        total=len(tasks),
    ):
        val_loss, learning_curve, model, preset_id, fold_id = res
        LOG.info(f"Preset #{preset_id} fitting finished, loss: {val_loss}")
        # reload the model object after serialization
        model._restore_model()

        val_losses[preset_id, fold_id] = val_loss
        learning_curves[preset_id][fold_id] = learning_curve
        models[preset_id][fold_id] = model

    pool.close()
    pool.join()

    val_loss_per_preset = np.mean(val_losses, axis=1)
    best_preset_idx = int(np.argmin(val_loss_per_preset))
    best_model_idx = int(np.argmin(val_losses[best_preset_idx, :]))
    best_preset = presets[best_preset_idx]
    best_learning_curve = learning_curves[best_preset_idx][best_model_idx]
    best_model = models[best_preset_idx][best_model_idx]

    LOG.info(
        "Preset #{} resulted in lowest validation loss with params {}".format(
            best_preset_idx + 1, tasks[n_splits * best_preset_idx + best_model_idx]
        )
    )

    if refit:
        LOG.info("Refitting with all data and parameters: {}".format(best_preset))
        # Building final model
        n_feat = min(len(data.get_optimal_descriptors()), best_preset["n_feat"])
        self.model = MODNetModel(
            self.targets,
            self.weights,
            num_neurons=best_preset["num_neurons"],
            n_feat=n_feat,
            act=best_preset["act"],
            out_act=self.out_act,
            num_classes=self.num_classes,
        ).model
        self.n_feat = n_feat
        self.fit(
            data,
            val_fraction=0,
            lr=best_preset["lr"],
            epochs=best_preset["epochs"],
            batch_size=best_preset["batch_size"],
            loss=best_preset["loss"],
            callbacks=callbacks,
            verbose=verbose,
        )
    else:
        self.n_feat = best_model.n_feat
        self.model = best_model.model
        self._scaler = best_model._scaler

    return models, val_losses, best_learning_curve, learning_curves, best_preset
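# Hedged usage sketch (not part of the original source): running the preset search
# with 5 inner folds and unpacking the return values exactly as listed in the
# return statement above; `train_data` is assumed to be a feature-selected MODData.
models, val_losses, best_curve, curves, best_preset = model.fit_preset(
    train_data, nested=5, n_jobs=4, verbose=0
)
print("Mean validation loss per preset:", val_losses.mean(axis=1))
print("Best preset:", best_preset)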
def fit(
    self,
    training_data: MODData,
    val_fraction: float = 0.0,
    val_key: Optional[str] = None,
    val_data: Optional[MODData] = None,
    lr: float = 0.001,
    epochs: int = 200,
    batch_size: int = 128,
    xscale: Optional[str] = "minmax",
    metrics: List[str] = ["mae"],
    callbacks: List[Callable] = None,
    verbose: int = 0,
    loss: str = "mse",
    **fit_params,
) -> None:
    """Train the model on the passed training `MODData` object.

    Parameters:
        training_data: A `MODData` that has been featurized and
            feature selected. The first `self.n_feat` entries in
            `training_data.get_optimal_descriptors()` will be used
            for training.
        val_fraction: The fraction of the training data to use as a
            validation set for tracking model performance during training.
        val_key: The target name to track on the validation set
            during training, if performing multi-target learning.
        lr: The learning rate.
        epochs: The maximum number of epochs to train for.
        batch_size: The batch size to use for training.
        xscale: The feature scaler to use, either `None`,
            `'minmax'` or `'standard'`.
        metrics: A list of tf.keras metrics to pass to `compile(...)`.
        loss: The built-in tf.keras loss to pass to `compile(...)`.
        fit_params: Any additional parameters to pass to `fit(...)`,
            these will be overwritten by the explicit keyword arguments above.

    """
    if self.n_feat > len(training_data.get_optimal_descriptors()):
        raise RuntimeError(
            "The model requires more features than computed in data. "
            f"Please reduce n_feat below or equal to {len(training_data.get_optimal_descriptors())}"
        )

    self.xscale = xscale
    self.target_names = list(self.weights.keys())
    self.optimal_descriptors = training_data.get_optimal_descriptors()

    x = training_data.get_featurized_df()[
        self.optimal_descriptors[:self.n_feat]
    ].values

    # For compatibility with MODNet 0.1.7; if there is only one target in the
    # training data, use that for the name of the target too.
    if (
        len(self.targets_flatten) == 1
        and len(training_data.df_targets.columns) == 1
    ):
        self.targets_flatten = list(training_data.df_targets.columns)

    y = []
    for targ in self.targets_flatten:
        if self.num_classes[targ] >= 2:  # Classification
            y_inner = tf.keras.utils.to_categorical(
                training_data.df_targets[targ].values,
                num_classes=self.num_classes[targ],
            )
            loss = "categorical_crossentropy"
        else:
            y_inner = training_data.df_targets[targ].values.astype(np.float, copy=False)
        y.append(y_inner)

    # Scale the input features:
    if self.xscale == "minmax":
        self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5))
    elif self.xscale == "standard":
        self._scaler = StandardScaler()

    x = self._scaler.fit_transform(x)
    x = np.nan_to_num(x, nan=-1)

    if val_data is not None:
        val_x = val_data.get_featurized_df()[
            self.optimal_descriptors[:self.n_feat]
        ].values
        val_x = self._scaler.transform(val_x)
        val_x = np.nan_to_num(val_x, nan=-1)
        try:
            val_y = list(
                val_data.get_target_df()[self.targets_flatten]
                .values.astype(np.float, copy=False)
                .transpose()
            )
        except Exception:
            val_y = list(
                val_data.get_target_df().values.astype(np.float, copy=False).transpose()
            )
        validation_data = (val_x, val_y)
    else:
        validation_data = None

    # Optionally set up print callback
    if verbose:
        if val_fraction > 0 or validation_data:
            if self._multi_target and val_key is not None:
                val_metric_key = f"val_{val_key}_mae"
            else:
                val_metric_key = "val_mae"
            print_callback = tf.keras.callbacks.LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    f"epoch {epoch}: loss: {logs['loss']:.3f}, "
                    f"val_loss:{logs['val_loss']:.3f} {val_metric_key}:{logs[val_metric_key]:.3f}"
                )
            )
        else:
            print_callback = tf.keras.callbacks.LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    f"epoch {epoch}: loss: {logs['loss']:.3f}"
                )
            )

        if callbacks is None:
            callbacks = [print_callback]
        else:
            callbacks.append(print_callback)

    fit_params = {
        "x": x,
        "y": y,
        "epochs": epochs,
        "batch_size": batch_size,
        "verbose": verbose,
        "validation_split": val_fraction,
        "validation_data": validation_data,
        "callbacks": callbacks,
    }

    self.model.compile(
        loss=loss,
        optimizer=tf.keras.optimizers.Adam(lr=lr),
        metrics=metrics,
        loss_weights=self.weights,
    )
    history = self.model.fit(**fit_params)
    self.history = history.history
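# Hedged usage sketch (not part of the original source): building and training a
# single-target model. The target name "gap", the nested target format, the
# architecture and n_feat are illustrative placeholders, and any constructor
# arguments not shown are assumed to have usable defaults.
model = MODNetModel(
    [[["gap"]]],
    {"gap": 1},
    num_neurons=[[64], [32], [16], [16]],
    n_feat=150,
)
model.fit(train_data, val_fraction=0.1, epochs=200, batch_size=64, verbose=1)
print("Final training loss:", model.history["loss"][-1])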
if col not in ("id", "structure", "composition") ] try: materials = train_df[ "structure"] if "structure" in train_df.columns else train_df[ "composition"].map(Composition) except KeyError: raise RuntimeError( f"Could not find any materials data dataset for task {task!r}!" ) fast_oxid_featurizer = DeBreuck2020Featurizer(fast_oxid=True) train_data = MODData( materials=materials.tolist(), targets=train_df[targets].values, target_names=targets, featurizer=fast_oxid_featurizer, ) train_data.featurize(n_jobs=32) train_data.feature_selection(n=-1, use_precomputed_cross_nmi=True) # create model targets_hierarchy = [[[field for field in targets]]] weights = {field: 1 for field in targets} model = EnsembleMODNetModel(targets_hierarchy, weights) # fit model if USE_GA: # you can either use a GA for hyper-parameter optimization or... from modnet.hyper_opt import FitGenetic
def predict(self, test_data: MODData, return_prob=False, return_unc=False) -> pd.DataFrame:
    """Predict the target values for the passed MODData.

    Parameters:
        test_data: A featurized and feature-selected `MODData`
            object containing the descriptors used in training.
        return_prob: For classification tasks only: whether to return the
            probability of each class OR only return the most probable class.
        return_unc: whether to return the standard deviation as a second dataframe

    Returns:
        A `pandas.DataFrame` containing the predicted values of the targets.
        If return_unc=True, two `pandas.DataFrame` : (predictions, std) containing
        the predicted values of the targets and the standard deviations of the
        epistemic uncertainty.

    """
    # prevents NaN predictions if some features are inf
    x = (
        test_data.get_featurized_df()
        .replace([np.inf, -np.inf, np.nan], 0)[self.optimal_descriptors[:self.n_feat]]
        .values
    )

    # Scale the input features:
    x = np.nan_to_num(x)
    if self._scaler is not None:
        x = self._scaler.transform(x)
        x = np.nan_to_num(x)

    # repeated forward passes used to estimate the epistemic uncertainty
    all_predictions = []
    for i in range(1000):
        p = self.model.predict(x)
        if len(self.targets_flatten) == 1:
            p = np.array([p])
        all_predictions.append(p)

    p_dic = {}
    unc_dic = {}
    for i, name in enumerate(self.targets_flatten):
        if self.num_classes[name] >= 2:
            if return_prob:
                preds = np.array([pred[i] for pred in all_predictions])
                probs = preds / preds.sum(axis=-1, keepdims=True)
                mean_prob = probs.mean(axis=0)
                std_prob = probs.std(axis=0)
                for j in range(mean_prob.shape[-1]):
                    p_dic["{}_prob_{}".format(name, j)] = mean_prob[:, j]
                    unc_dic["{}_prob_{}".format(name, j)] = std_prob[:, j]
            else:
                p_dic[name] = np.argmax(
                    np.array([pred[i] for pred in all_predictions]).mean(axis=0),
                    axis=1,
                )
                unc_dic[name] = np.max(
                    np.array([pred[i] for pred in all_predictions]).mean(axis=0),
                    axis=1,
                )
        else:
            mean_p = np.array([pred[i] for pred in all_predictions]).mean(axis=0)
            std_p = np.array([pred[i] for pred in all_predictions]).std(axis=0)
            p_dic[name] = mean_p[:, 0]
            unc_dic[name] = std_p[:, 0]

    predictions = pd.DataFrame(p_dic)
    unc = pd.DataFrame(unc_dic)
    predictions.index = test_data.structure_ids
    unc.index = test_data.structure_ids

    if return_unc:
        return predictions, unc
    else:
        return predictions
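# Hedged usage sketch (not part of the original source): requesting the epistemic
# uncertainty alongside the predictions; both DataFrames are indexed by
# test_data.structure_ids. `model` and `test_data` are assumed to exist.
predictions, stds = model.predict(test_data, return_unc=True)
print(stds.describe())  # per-target spread across the repeated forward passes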
def matbench_benchmark(
    data: MODData,
    target: List[str],
    target_weights: Dict[str, float],
    fit_settings: Optional[Dict[str, Any]] = None,
    classification: bool = False,
    model_type: Type[MODNetModel] = MODNetModel,
    save_folds: bool = False,
    save_models: bool = False,
    hp_optimization: bool = True,
    inner_feat_selection: bool = True,
    use_precomputed_cross_nmi: bool = True,
    presets: Optional[List[dict]] = None,
    fast: bool = False,
    n_jobs: Optional[int] = None,
    nested: bool = False,
    **model_init_kwargs,
) -> dict:
    """Train and cross-validate a model against Matbench data splits, optionally
    performing hyperparameter optimisation.

    Arguments:
        data: The entire dataset as a `MODData`.
        target: The list of target names to train on.
        target_weights: The target weights to use for the `MODNetModel`.
        fit_settings: Any settings to pass to `model.fit(...)` directly
            (typically when not performing hyperparameter optimisation).
        classification: Whether all tasks are classification rather than regression.
        model_type: The type of the model to create and benchmark.
        save_folds: Whether to save dataframes with pre-processed fold
            data (e.g. feature selection).
        save_models: Whether to pickle all trained models according to
            their fold index and performance.
        hp_optimization: Whether to perform hyperparameter optimisation.
        inner_feat_selection: Whether to perform split-level feature selection
            or try to use pre-computed values.
        use_precomputed_cross_nmi: Whether to use the precomputed cross NMI
            from the Materials Project dataset, or recompute per fold.
        presets: Override the built-in hyperparameter grid with these presets.
        fast: Whether to perform debug training, i.e. reduced presets and epochs.
        n_jobs: Try to parallelize the inner fit_preset over this number of
            processes. Maxes out at number_of_presets*nested_folds.
        nested: Whether to perform nested CV for hyperparameter optimisation.
        **model_init_kwargs: Additional arguments to pass to the model on creation.

    Returns:
        A dictionary containing all the results from the training, broken down by
        model and by fold.

    """
    if fit_settings is None:
        fit_settings = {}

    if not fit_settings.get("n_feat"):
        nf = len(data.df_featurized.columns)
        fit_settings["n_feat"] = nf

    if not fit_settings.get("num_neurons"):
        # Pass dummy network
        fit_settings["num_neurons"] = [[4], [4], [4], [4]]

    fold_data = []
    results = defaultdict(list)

    for ind, (train, test) in enumerate(matbench_kfold_splits(data)):
        train_data, test_data = data.split((train, test))
        if inner_feat_selection:
            path = "folds/train_moddata_f{}".format(ind + 1)
            if os.path.isfile(path):
                train_data = MODData.load(path)
            else:
                train_data.feature_selection(
                    n=-1, use_precomputed_cross_nmi=use_precomputed_cross_nmi
                )
            os.makedirs("folds", exist_ok=True)
            train_data.save(path)

        fold_data.append((train_data, test_data))

    args = (target, target_weights, fit_settings)

    model_kwargs = {
        "model_type": model_type,
        "hp_optimization": hp_optimization,
        "fast": fast,
        "classification": classification,
        "save_folds": save_folds,
        "presets": presets,
        "save_models": save_models,
        "nested": nested,
        "n_jobs": n_jobs,
    }

    model_kwargs.update(model_init_kwargs)

    fold_results = []
    for fold in enumerate(fold_data):
        fold_results.append(train_fold(fold, *args, **model_kwargs))

    for fold in fold_results:
        for key in fold:
            results[key].append(fold[key])

    return results
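# Hedged usage sketch (not part of the original source): benchmarking a single
# regression target without hyperparameter optimisation. The target name "gap"
# and the fit settings are placeholders, and the target format follows the
# docstring above (a plain list of names); adjust if the model type expects the
# nested hierarchy used elsewhere in this codebase.
results = matbench_benchmark(
    data,
    ["gap"],
    {"gap": 1},
    fit_settings={"epochs": 200, "batch_size": 64},
    hp_optimization=False,
)
for key, per_fold in results.items():
    print(key, len(per_fold))  # one entry per Matbench fold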
def fit_preset(
    self,
    data: MODData,
    presets: List[Dict[str, Any]] = None,
    val_fraction: float = 0.1,
    verbose: int = 0,
) -> None:
    """Chooses an optimal hyper-parametered MODNet model from different presets.

    The data is first fitted on several well working MODNet presets
    with a validation set (10% of the furnished data by default).

    Sets the `self.model` attribute to the model with the lowest loss.

    Args:
        data: MODData object containing training and validation samples.
        presets: A list of dictionaries containing custom presets.
        verbose: The verbosity level to pass to Keras
        val_fraction: The fraction of the data to use for validation.

    """
    rlr = keras.callbacks.ReduceLROnPlateau(
        monitor="loss",
        factor=0.5,
        patience=20,
        verbose=verbose,
        mode="auto",
        min_delta=0,
    )
    es = keras.callbacks.EarlyStopping(
        monitor="loss",
        min_delta=0.001,
        patience=300,
        verbose=verbose,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )
    callbacks = [rlr, es]

    if presets is None:
        from modnet.model_presets import MODNET_PRESETS

        presets = MODNET_PRESETS

    val_losses = 1e20 * np.ones((len(presets),))

    best_model = None
    best_n_feat = None

    for i, params in enumerate(presets):
        logging.info("Training preset #{}/{}".format(i + 1, len(presets)))
        n_feat = min(len(data.get_optimal_descriptors()), params["n_feat"])
        self.model = MODNetModel(
            self.targets,
            self.weights,
            num_neurons=params["num_neurons"],
            n_feat=n_feat,
            act=params["act"],
        ).model
        self.n_feat = n_feat
        self.fit(
            data,
            val_fraction=val_fraction,
            lr=params["lr"],
            epochs=params["epochs"],
            batch_size=params["batch_size"],
            loss=params["loss"],
            callbacks=callbacks,
            verbose=verbose,
        )

        val_loss = np.array(self.model.history.history["val_loss"])[-20:].mean()
        if val_loss < min(val_losses):
            best_model = self.model
            best_n_feat = n_feat
        val_losses[i] = val_loss
        logging.info("Validation loss: {:.3f}".format(val_loss))

    best_preset = val_losses.argmin()
    logging.info(
        "Preset #{} resulted in lowest validation loss.\nFitting all data...".format(
            best_preset + 1
        )
    )
    self.n_feat = best_n_feat
    self.model = best_model