def test_train_small_bootstrap_single_target_classif(small_moddata, tf_session):
    """Smoke-test single-target classification training on a bootstrapped ensemble."""
    from modnet.models import EnsembleMODNetModel

    data = small_moddata

    # Bypass feature selection: pick the ElementProperty columns as the
    # 'optimal' feature set by hand.
    data.optimal_features = [
        name
        for name in data.df_featurized.columns
        if name.startswith("ElementProperty")
    ]

    # Derive a binary classification target: metallic iff the gap is zero.
    data.df_targets["is_metal"] = data.df_targets["egap"].apply(
        lambda gap: 1 if gap == 0 else 0
    )

    model = EnsembleMODNetModel(
        [[["is_metal"]]],
        weights={"is_metal": 1},
        num_neurons=[[16], [8], [8], [4]],
        num_classes={"is_metal": 2},
        n_feat=10,
        n_models=3,
        bootstrap=True,
    )

    model.fit(data, epochs=5)
    # Both plain predictions and predictions with uncertainties must work.
    model.predict(data)
    model.predict(data, return_unc=True)
def test_train_small_bootstrap_presets(small_moddata, tf_session):
    """Exercise `fit_preset()` on a small bootstrapped ensemble."""
    from modnet.model_presets import gen_presets
    from modnet.models import EnsembleMODNetModel

    # Take two generated presets and shrink them so the test stays fast.
    modified_presets = gen_presets(100, 100)[:2]
    for preset in modified_presets:
        preset["epochs"] = 2

    data = small_moddata
    # Bypass feature selection: pick the ElementProperty columns as the
    # 'optimal' feature set by hand.
    data.optimal_features = [
        name
        for name in data.df_featurized.columns
        if name.startswith("ElementProperty")
    ]

    model = EnsembleMODNetModel(
        [[["eform", "egap"]]],
        weights={"eform": 1, "egap": 1},
        num_neurons=[[4], [2], [2], [2]],
        n_feat=3,
        n_models=2,
        bootstrap=True,
    )

    # nested=0/False -> no inner loop, so only 1 model
    # nested=1/True -> inner loop, but default n_folds so 5
    for expected_inner, nested_option in zip([2, 1], [2, 0]):
        results = model.fit_preset(
            data,
            presets=modified_presets,
            nested=nested_option,
            val_fraction=0.2,
            n_jobs=2,
        )
        models = results[0]
        # One entry per preset; inner count depends on the nesting option.
        assert len(models) == len(modified_presets)
        assert len(models[0]) == expected_inner
def test_train_small_bootstrap_multi_target(small_moddata, tf_session):
    """Smoke-test multi-target training on a bootstrapped ensemble."""
    from modnet.models import EnsembleMODNetModel

    data = small_moddata

    # Bypass feature selection: pick the ElementProperty columns as the
    # 'optimal' feature set by hand.
    data.optimal_features = [
        name
        for name in data.df_featurized.columns
        if name.startswith("ElementProperty")
    ]

    model = EnsembleMODNetModel(
        [[["eform", "egap"]]],
        weights={"eform": 1, "egap": 1},
        num_neurons=[[16], [8], [8], [4]],
        n_feat=10,
        n_models=3,
        bootstrap=True,
    )

    model.fit(data, epochs=5)
    # Prediction with uncertainties must work for multiple targets at once.
    model.predict(data, return_unc=True)
def run_predict(data, final_model, settings, save_folds=False, dknn_only=False):
    """
    Runs benchmark based on final_model without training everything again.

    It also computes the Knn distance and puts it in the results pickle.
    In fine, this should be integrated inside modnet benchmark.

    :param data: full MODData holding featurized structures and targets
        for every fold.
    :param final_model: flat EnsembleMODNetModel whose `.model` list holds
        all sub-models from the nested-CV run, ordered as
        outer fold -> inner fold -> architecture -> bootstrap member.
    :param settings: dict of benchmark settings; must contain "task" and may
        contain "classification".
    :param save_folds: when True, also dump per-fold train/test feature CSVs
        (with per-target error columns) under "folds/".
    :param dknn_only: when True, only recompute the kNN distances and append
        them to the previously pickled results; skip predictions/scoring.
    :return: dict (or defaultdict) of lists keyed by "dknns", "predictions",
        "stds", "targets", "errors", "scores" and 'model'.
    """
    task = settings["task"]

    # rebuild the EnsembleMODNetModels from the final model
    # The flat model list is assumed laid out as 5 outer folds x 5 inner
    # folds x (archs * bootstrap members) — TODO confirm against the
    # benchmark code that produced `final_model`.
    n_best_archs = 5  # change this (from 1 to 5 max) to adapt number of inner best archs chosen
    bootstrap_size = 5
    outer_fold_size = bootstrap_size * 5 * 5
    inner_fold_size = bootstrap_size * 5
    models = []

    # More than one target column (beyond the first) means multi-target mode.
    multi_target = bool(len(data.df_targets.columns) - 1)

    for i in range(5):  # outer fold
        modnet_models = []
        for j in range(5):  # inner fold
            # Take the first n_best_archs * bootstrap_size sub-models of this
            # inner fold's slice, i.e. the best architectures only.
            modnet_models += (
                final_model.model[(i * outer_fold_size) + (j * inner_fold_size):
                                  (i * outer_fold_size) + (j * inner_fold_size) + (n_best_archs * bootstrap_size)])
        # One ensemble per outer fold, built from its inner-fold sub-models.
        model = EnsembleMODNetModel(modnet_models=modnet_models)
        models.append(model)

    if dknn_only:
        # Reuse the previously saved results and only refresh the dkNN column.
        with open(f"results/{task}_results.pkl", "rb") as f:
            results = pickle.load(f)
        results["dknns"] = []
    else:
        results = defaultdict(list)

    for ind, (train, test) in enumerate(matbench_kfold_splits(data, classification=settings.get("classification", False))):
        train_data, test_data = data.split((train, test))
        # Prefer the persisted fold MODData so features/selection match the
        # original training run exactly.
        path = "folds/train_moddata_f{}".format(ind + 1)
        train_data = MODData.load(path)
        # Sanity check: train and test folds must not share any samples.
        assert len(set(train_data.df_targets.index).intersection(set(test_data.df_targets.index))) == 0
        model = models[ind]

        # compute dkNN
        # TODO: test this quickly before submitting
        # Use the sub-model with the largest feature count so the distance is
        # computed over the richest feature set available in this ensemble.
        max_feat_model = np.argmax([m.n_feat for m in model.model])
        n_feat = model.model[max_feat_model].n_feat
        feature_names = model.model[max_feat_model].optimal_descriptors
        dknn = get_dknn(train_data, test_data, feature_names)
        results["dknns"].append(dknn)
        if dknn_only:
            # Nothing else to recompute for this fold.
            continue

        predict_kwargs = {}
        if settings.get("classification"):
            predict_kwargs["return_prob"] = True
        if model.can_return_uncertainty:
            predict_kwargs["return_unc"] = True

        # With return_unc the model returns (predictions, stds); otherwise a
        # single predictions frame.
        pred_results = model.predict(test_data, **predict_kwargs)
        if isinstance(pred_results, tuple):
            predictions, stds = pred_results
        else:
            predictions = pred_results
            stds = None

        targets = test_data.df_targets

        if settings.get("classification"):
            from sklearn.metrics import roc_auc_score
            from sklearn.preprocessing import OneHotEncoder

            # One-hot encode the true labels to score probabilities with AUC.
            y_true = OneHotEncoder().fit_transform(targets.values).toarray()
            score = roc_auc_score(y_true, predictions.values)
            # Hard class predictions for the error table.
            pred_bool = model.predict(test_data, return_prob=False)
            print(f"ROC-AUC: {score}")
            errors = targets - pred_bool
        elif multi_target:
            errors = targets - predictions
            # Per-target MAE (one value per column).
            score = np.mean(np.abs(errors.values), axis=0)
        else:
            errors = targets - predictions
            # Scalar MAE.
            score = np.mean(np.abs(errors.values))

        if save_folds:
            # Dump the selected features plus per-target errors for offline
            # inspection of this fold.
            opt_feat = train_data.optimal_features[:n_feat]
            df_train = train_data.df_featurized
            df_train = df_train[opt_feat]
            df_train.to_csv("folds/train_f{}.csv".format(ind + 1))
            df_test = test_data.df_featurized
            df_test = df_test[opt_feat]
            errors.columns = [x + "_error" for x in errors.columns]
            df_test = df_test.join(errors)
            df_test.to_csv("folds/test_f{}.csv".format(ind + 1))

        results["predictions"].append(predictions)
        if stds is not None:
            results["stds"].append(stds)
        results["targets"].append(targets)
        results["errors"].append(errors)
        results["scores"].append(score)
        results['model'].append(model)
    return results
if not os.path.isdir(task): raise RuntimeError(f"No folder found for {task!r}.") os.chdir(task) print(f"Running on {n_jobs} jobs") settings = load_settings(task) settings["task"] = task if args.get("predict"): if not os.path.isfile(f"final_model/{task}_model"): raise RuntimeError("No model found for prediction, please run the benchmark first.") else: print("Loading data and model...") data = load_or_featurize(task) final_model = EnsembleMODNetModel.load(f"final_model/{task}_model") print("Running predictions...") results = run_predict(data, final_model, settings) print("Saving results...") try: save_results(results, task) except Exception: print_exc() if args.get("plot"): #make graphs only if not os.path.isfile(f"results/{task}_results.pkl"): raise RuntimeError("No results file, please run the benchmark before plotting.") else: print("Loading previous results.") with open(f"results/{task}_results.pkl", "rb") as f:
) fast_oxid_featurizer = DeBreuck2020Featurizer(fast_oxid=True) train_data = MODData( materials=materials.tolist(), targets=train_df[targets].values, target_names=targets, featurizer=fast_oxid_featurizer, ) train_data.featurize(n_jobs=32) train_data.feature_selection(n=-1, use_precomputed_cross_nmi=True) # create model targets_hierarchy = [[[field for field in targets]]] weights = {field: 1 for field in targets} model = EnsembleMODNetModel(targets_hierarchy, weights) # fit model if USE_GA: # you can either use a GA for hyper-parameter optimization or... from modnet.hyper_opt import FitGenetic ga = FitGenetic(train_data) model = ga.run( size_pop=20, num_generations=10, n_jobs=16, early_stopping=True, refit=True, ) else: