def _check_datasets(self, dataset=None, csv_dataset=None): if csv_dataset: if dataset: print( "Dataset and csv.dataset are given, hence dataset will be overwritten by csv.data." ) dataset = DataSet.read_csv(csv_dataset, index_col=None) return dataset
def test_train_experimental_emulator():
    """Train an ``ExperimentalEmulator``, then check scoring, plotting and save/load.

    The emulator is trained on the ``reizman_suzuki_case_1`` CSV, must reach a
    mean test R^2 above 0.8, is saved to the ``test_ee`` directory, reloaded,
    and the reloaded copy must expose the same metadata and reach the same
    R^2 threshold on the original train/test split.
    """
    model_name = "reizman_suzuki_case_1"
    domain = ReizmanSuzukiEmulator.setup_domain()
    ds = DataSet.read_csv(DATA_PATH / f"{model_name}.csv")
    exp = ExperimentalEmulator(model_name, domain, dataset=ds, regressor=ANNRegressor)

    # Test grid search cross validation and training
    # params = {
    #     "regressor__net__max_epochs": [1, 1000],
    # }
    params = None
    exp.train(
        cv_folds=5,
        max_epochs=1000,
        random_state=100,
        search_params=params,
        verbose=0,
    )

    # Testing
    res = exp.test()
    r2 = res["test_r2"].mean()
    assert r2 > 0.8

    # Test plotting (the unpacking also checks that a 2-tuple is returned)
    fig, ax = exp.parity_plot(output_variables="yld", include_test=True)

    # Test saving/loading
    try:
        exp.save("test_ee")
        exp_2 = ExperimentalEmulator.load(model_name, "test_ee")

        # Compare the sequences themselves. ``all(a) == all(b)`` would only
        # compare two booleans and pass for almost any pair of name lists.
        assert list(exp.descriptors_features) == list(exp_2.descriptors_features)
        assert exp.n_examples == exp_2.n_examples
        assert list(exp.output_variable_names) == list(exp_2.output_variable_names)
        assert exp.clip == exp_2.clip

        # Give the reloaded emulator the exact split used by the original so
        # both are scored on the same examples.
        exp_2.X_train, exp_2.y_train, exp_2.X_test, exp_2.y_test = (
            exp.X_train,
            exp.y_train,
            exp.X_test,
            exp.y_test,
        )
        res = exp_2.test(X_test=exp.X_test, y_test=exp.y_test)
        exp.parity_plot(output_variables="yld", include_test=True)
        r2 = res["test_r2"].mean()
        assert r2 > 0.8
    finally:
        # Always remove the scratch directory, even when an assertion above
        # fails; ignore_errors covers the case where save() never created it.
        shutil.rmtree("test_ee", ignore_errors=True)
def _train_baumgartner(use_descriptors=False, show_plots=False, save_plots=True):
    """Train, test and save the Baumgartner cross-coupling emulator.

    Parameters
    ----------
    use_descriptors : bool, optional
        When true, "catalyst" and "base" are passed as descriptor features
        and "_descriptors" is appended to the model name.
    show_plots : bool, optional
        When true, call ``plt.show()`` after building the parity plot.
    save_plots : bool, optional
        When true, write the parity plot to ``results/<model_name>.png``.

    Returns
    -------
    The object returned by ``exp.train`` after being updated in place with
    the results of ``exp.test``.
    """
    # Setup
    model_name = "baumgartner_aniline_cn_crosscoupling"
    domain = BaumgartnerCrossCouplingEmulator.setup_domain()
    ds = DataSet.read_csv(DATA_PATH / f"{model_name}.csv")

    # Create emulator and train. The CSV above is looked up under the base
    # name; only the saved model/plot carry the "_descriptors" suffix.
    if use_descriptors:
        model_name += "_descriptors"
    exp = ExperimentalEmulator(
        model_name,
        domain,
        dataset=ds,
        regressor=ANNRegressor,
        output_variable_names=["yield"],
        descriptors_features=["catalyst", "base"] if use_descriptors else [],
    )
    res = exp.train(
        max_epochs=MAX_EPOCHS,
        cv_folds=CV_FOLDS,
        random_state=100,
        test_size=0.2,
    )

    # Run test
    res_test = exp.test()
    res.update(res_test)

    # Save emulator. parents=True so a fresh checkout without the models
    # directory does not fail with FileNotFoundError.
    model_path = pathlib.Path(MODELS_PATH / model_name)
    model_path.mkdir(parents=True, exist_ok=True)
    exp.save(model_path)

    # Make plot for posterity's sake
    fig, ax = exp.parity_plot(include_test=True)
    if save_plots:
        # savefig does not create missing directories, so ensure it exists.
        pathlib.Path("results").mkdir(parents=True, exist_ok=True)
        fig.savefig(f"results/{model_name}.png", dpi=100)
    if show_plots:
        plt.show()
    return res
results_average = [{ f"avg_{score_name}": scores.mean() for score_name, scores in result.items() } for result in results] index = [f"case_{i}" for i in range(1, 5)] results_df = pd.DataFrame.from_records(results_average, index=index) results_df.index.rename("case", inplace=True) results_df.to_csv(f"results/reizman_suzuki_scores.csv") def train_one_reizman(case, show_plots=False, save_plots=True): # Setup model_name = f"reizman_suzuki_case_{case}" domain = ReizmanSuzukiEmulator.setup_domain() ds = DataSet.read_csv(DATA_PATH / f"{model_name}.csv") # Create emulator and train exp = ExperimentalEmulator( model_name, domain, dataset=ds, regressor=ANNRegressor, ) res = exp.train(max_epochs=MAX_EPOCHS, cv_folds=CV_FOLDS, random_state=100, test_size=0.2) # Run test res_test = exp.test()