### Fit GP Model to liquid density
# Inputs are the force-field parameters plus temperature
param_names = [*R125.param_names, "temperature"]
property_name = "sim_liq_density"
x_train, y_train, x_test, y_test = shuffle_and_split(
    df_all,
    param_names,
    property_name,
    shuffle_seed=gp_shuffle_seed,
    fraction_train=0.8,
)

# Fit one model per kernel family on the same training split. Each kernel
# gets its own lengthscale vector (one entry per input column).
models = {}
for kernel_label, kernel_cls in (
    ("RBF", gpflow.kernels.RBF),
    ("Matern32", gpflow.kernels.Matern32),
    ("Matern52", gpflow.kernels.Matern52),
):
    models[kernel_label] = run_gpflow_scipy(
        x_train,
        y_train,
        kernel_cls(lengthscales=np.ones(R125.n_params + 1)),
    )

# Plot model performance on train and test points
def main():
    """Train the surrogates and select parameter sets for the next iteration.

    The workflow is:

    1. Fit Matern32 GP models for ``uc_mean_distance`` (10 K rows of
       ``df_all`` below ``ucmd_clf_threshold``) and for ``lattice_ape`` (all
       rows, with ``scaled_temperature`` as an additional input column).
    2. Train an RBF-kernel SVM that labels 10 K parameter sets by whether
       ``uc_mean_distance`` is below ``ucmd_clf_threshold``; print its
       train/test accuracy and test confusion matrix.
    3. Evaluate the classifier and both GPs on the Latin hypercube stored in
       ``LHS_1e6x8.csv``, keep the samples the classifier accepts, and flag
       the Pareto-efficient ones with respect to (``ucmd``, ``lattice_mape``).
    4. Keep the non-dominated samples below both next-iteration thresholds,
       then add randomly drawn dominated samples, discarding every remaining
       candidate whose L1 distance to a drawn sample is below
       ``removal_distance``. The first 250 rows are written to
       ``csv_path + out_csv_name``.

    Figures are saved under ``figs/``. The function reads module-level
    configuration (``df_all``, ``AP``, seeds, thresholds, ``temperatures``,
    ``csv_path``, ``out_csv_name``) and returns nothing.
    """
    # Make sure we have a folder for figures
    os.makedirs("figs", exist_ok=True)

    param_cols = list(AP.param_names)

    ###########################################################
    ####################   Fit GP models    ###################
    ###########################################################

    ## UCMD model
    # Only train on 10 K data that meets the ucmd clf threshold
    df_ucmd = df_all.loc[
        (df_all["temperature"] == 10)
        & (df_all["uc_mean_distance"] < ucmd_clf_threshold)
    ]
    x_train, y_train, _, _ = shuffle_and_split(
        df_ucmd,
        param_cols,
        "uc_mean_distance",
        shuffle_seed=gp_shuffle_seed,
    )
    ucmd_gp = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.Matern32(lengthscales=np.ones(AP.n_params)),
    )

    ## Lattice APE model (temperature is an extra input dimension)
    x_train, y_train, _, _ = shuffle_and_split(
        df_all,
        param_cols + ["scaled_temperature"],
        "lattice_ape",
        shuffle_seed=gp_shuffle_seed,
    )
    lattice_gp = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.Matern32(lengthscales=np.ones(AP.n_params + 1)),
    )

    ###########################################################
    ##################   Train classifiers   ##################
    ###########################################################
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_all.loc[df_all["temperature"] == 10],
        param_cols,
        "uc_mean_distance",
        shuffle_seed=clf_shuffle_seed,
    )
    # Binary labels: 1 when the structure metric is below the threshold
    y_train = np.array(y_train < ucmd_clf_threshold, dtype=np.int32)
    y_test = np.array(y_test < ucmd_clf_threshold, dtype=np.int32)

    ucmd_clf = svm.SVC(kernel="rbf")
    ucmd_clf.fit(x_train, y_train)
    y_pred = ucmd_clf.predict(x_train)
    print("Training accuracy:", metrics.accuracy_score(y_train, y_pred))
    y_pred = ucmd_clf.predict(x_test)
    print("Testing accuracy:", metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

    ###########################################################
    ###################   Find new points   ###################
    ###########################################################

    # Load large hypercube
    latin_hypercube = np.loadtxt("LHS_1e6x8.csv", delimiter=",")
    n_samples = latin_hypercube.shape[0]

    # Apply UCMD classifier
    ucmd_pred = ucmd_clf.predict(latin_hypercube)

    # Predict UCMD with GP model (only the mean is used)
    gp_means_ucmd, _ = ucmd_gp.predict_f(latin_hypercube)

    # Predict lattice APE with GP model at each temperature; one column per
    # temperature
    all_errs = np.empty(shape=(n_samples, len(temperatures)))
    for col_idx, temperature in enumerate(temperatures):
        scaled_temperature = values_real_to_scaled(
            temperature, AP.temperature_bounds
        )
        xx = np.hstack(
            (latin_hypercube, np.tile(scaled_temperature, (n_samples, 1)))
        )
        gp_means_ape, _ = lattice_gp.predict_f(xx)
        all_errs[:, col_idx] = gp_means_ape[:, 0]

    # Compute the mean predicted APE across all temperatures
    mean_errs = np.mean(all_errs, axis=1)

    ## Save all the results to a dataframe
    LH_results = pd.DataFrame(latin_hypercube, columns=AP.param_names)
    LH_results["ucmd_clf"] = ucmd_pred.astype(np.bool_)
    LH_results["ucmd"] = gp_means_ucmd.numpy()
    LH_results["lattice_mape"] = mean_errs

    # Only take points where the structure classifier is satisfied. Copy so
    # that adding the "is_pareto" column below does not write into a slice
    # of LH_results.
    passing = LH_results.loc[LH_results.ucmd_clf == True].copy()
    costs = passing[["ucmd", "lattice_mape"]].to_numpy()

    # Find pareto efficient points
    result, _, _ = find_pareto_set(costs, is_pareto_efficient)
    passing["is_pareto"] = result

    # Plot pareto points vs. costs
    g = seaborn.pairplot(
        passing,
        vars=["ucmd", "lattice_mape"],
        hue="is_pareto",
    )
    g.savefig("figs/pareto-mses.png", dpi=300)

    # Plot pareto points vs. params
    g = seaborn.pairplot(passing, vars=param_cols, hue="is_pareto")
    g.set(xlim=(-0.1, 1.1), ylim=(-0.1, 1.1))
    g.savefig("figs/pareto-params.png", dpi=300)

    # For next iteration:
    #   1. All non-dominated points that meet the thresholds
    #   2. "Separated" dominated points that meet the thresholds
    meets_thresholds = (passing.ucmd < ucmd_next_itr_threshold) & (
        passing.lattice_mape < lattice_mape_next_itr_threshold
    )
    # Explicit comparisons are kept because the dtype returned by
    # find_pareto_set is not visible from this function.
    is_pareto = passing.is_pareto == True
    is_dominated = passing.is_pareto == False
    next_iteration_points = passing.loc[is_pareto & meets_thresholds]
    dominated_points = passing.loc[is_dominated & meets_thresholds]

    print(f"{len(passing)} points meet the ucmd classifier.")
    print(f"{len(passing[is_pareto])} are non-dominated.")
    print(
        f"{len(dominated_points)} are dominated with ucmd < {ucmd_next_itr_threshold} and lattice_mape < {lattice_mape_next_itr_threshold}"
    )

    removal_distance = 1.034495
    # DataFrame.sample draws from NumPy's global RNG when no random_state
    # is given, so seeding here makes the selection reproducible.
    np.random.seed(distance_seed)
    while len(dominated_points) > 0:
        # Shuffle the remaining candidates
        dominated_points = dominated_points.sample(frac=1)

        # Select one off the top
        # Note: here we use a random one rather than the "best" one; we don't
        # have confidence that the GP models are more accurate than our
        # thresholds
        chosen = dominated_points.iloc[[0]]
        next_iteration_points = pd.concat([next_iteration_points, chosen])

        # Remove anything within the given L1 distance of the chosen point.
        # The chosen point itself has distance 0, so the candidate set
        # shrinks on every pass.
        l1_norm = np.sum(
            np.abs(
                dominated_points[param_cols].values - chosen[param_cols].values
            ),
            axis=1,
        )
        dominated_points = dominated_points.loc[~(l1_norm < removal_distance)]

    print(
        f"After removing similar points, we are left with {len(next_iteration_points)} final top points."
    )

    # Cap the number of new points and keep only the parameter columns plus
    # the classifier flag
    next_iteration_points = next_iteration_points[:250].drop(
        columns=["ucmd", "lattice_mape", "is_pareto"]
    )

    # Plot new points
    g = seaborn.pairplot(next_iteration_points, vars=param_cols)
    g.set(xlim=(-0.1, 1.1), ylim=(-0.1, 1.1))
    g.savefig("figs/new-points-params.png", dpi=300)

    # Save the final new parameters
    next_iteration_points.to_csv(csv_path + out_csv_name)
test_score = classifier.score(x_test, y_test) print(f"Classifer is {test_score*100.0}% accurate on the test set.") ### Fit GP Model # Create training/test set param_names = list(R32.param_names) + ["temperature"] property_name = "md_density" x_train, y_train, x_test, y_test = shuffle_and_split( df_liquid, param_names, property_name, shuffle_seed=gp_shuffle_seed ) # Fit model model = run_gpflow_scipy( x_train, y_train, gpflow.kernels.RBF(lengthscales=np.ones(R32.n_params + 1)), ) ### Step 3: Find new parameters for MD simulations # SVM to classify hypercube regions as liquid or vapor latin_hypercube = np.loadtxt("LHS_1e6x6.csv", delimiter=",") liquid_samples, vapor_samples = classify_samples(latin_hypercube, classifier) # Find the lowest MSE points from the GP in both sets ranked_liquid_samples = rank_samples(liquid_samples, model, R32, "sim_liq_density") ranked_vapor_samples = rank_samples(vapor_samples, model, R32, "sim_liq_density") # Make a set of the lowest MSE parameter sets top_liquid_samples = ranked_liquid_samples[
### Fit GP models to VLE data
# Inputs are the force-field parameters plus temperature
param_names = [*R125.param_names, "temperature"]
property_names = [
    "sim_liq_density",
    "sim_vap_density",
    "sim_Pvap",
    "sim_Hvap",
]

# First pass: an RBF-kernel GP for every property
vle_models = {}
for property_name in property_names:
    # Get train/test
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_vle,
        param_names,
        property_name,
        shuffle_seed=gp_shuffle_seed,
    )
    # Fit model
    rbf_kernel = gpflow.kernels.RBF(lengthscales=np.ones(R125.n_params + 1))
    vle_models[property_name] = run_gpflow_scipy(x_train, y_train, rbf_kernel)

# Second pass: vapor density uses a Matern52 kernel instead.
# NOTE: this overwrites the RBF model fitted for this property in the loop.
property_name = "sim_vap_density"
# Get train/test
x_train, y_train, x_test, y_test = shuffle_and_split(
    df_vle,
    param_names,
    property_name,
    shuffle_seed=gp_shuffle_seed,
)
# Fit model
matern_kernel = gpflow.kernels.Matern52(
    lengthscales=np.ones(R125.n_params + 1)
)
vle_models[property_name] = run_gpflow_scipy(x_train, y_train, matern_kernel)
def _plot_surrogate_parity(
    property_name,
    property_bounds,
    *,
    train_alpha,
    test_alpha,
    xylim,
    major_ticks,
    minor_ticks,
    xlabel,
    ylabel,
    legend_kwargs,
    out_path,
):
    """Fit an RBF GP surrogate for one property and save its parity plot.

    The data come from the module-level ``df_all``, split 80/20 with
    ``gp_shuffle_seed``. Simulated values (x axis) are plotted against GP
    mean predictions (y axis) for the train and test sets, both converted
    from scaled to physical values with ``property_bounds``.

    Parameters
    ----------
    property_name : column of ``df_all`` to model.
    property_bounds : bounds passed to ``values_scaled_to_real``.
    train_alpha, test_alpha : marker transparency for each set.
    xylim : ``[low, high]`` limits applied to both axes.
    major_ticks, minor_ticks : tick locations applied to both axes.
    xlabel, ylabel : axis labels.
    legend_kwargs : extra keyword arguments for ``ax.legend``.
    out_path : file the figure is saved to.
    """
    param_names = list(R32.param_names) + ["temperature"]

    # Extract train/test data
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_all,
        param_names,
        property_name,
        shuffle_seed=gp_shuffle_seed,
        fraction_train=0.8,
    )

    # Fit model
    model = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.RBF(lengthscales=np.ones(R32.n_params + 1)),
    )

    # Use model to predict results (the predictive variance is not plotted)
    gp_mu_train, _ = model.predict_f(x_train)
    gp_mu_test, _ = model.predict_f(x_test)

    # Convert results to physical values
    y_train_physical = values_scaled_to_real(y_train, property_bounds)
    y_test_physical = values_scaled_to_real(y_test, property_bounds)
    gp_mu_train_physical = values_scaled_to_real(gp_mu_train, property_bounds)
    gp_mu_test_physical = values_scaled_to_real(gp_mu_test, property_bounds)

    # Plot
    fig, ax = plt.subplots()
    ax.scatter(
        y_train_physical,
        gp_mu_train_physical,
        label="Train",
        alpha=train_alpha,
        s=130,
        c="C1",
    )
    ax.scatter(
        y_test_physical,
        gp_mu_test_physical,
        marker="+",
        label="Test",
        alpha=test_alpha,
        s=170,
        c="C5",
    )

    # y = x reference line; it may extend past xylim and is clipped by the
    # axis limits set below
    diagonal = np.arange(xylim[0], xylim[1] + 100, 100)
    ax.plot(diagonal, diagonal, color="black", linewidth=3, alpha=0.6)

    ax.set_xlim(xylim[0], xylim[1])
    ax.set_ylim(xylim[0], xylim[1])
    ax.set_xticks(major_ticks)
    ax.set_yticks(major_ticks)
    ax.set_xticks(minor_ticks, minor=True)
    ax.set_yticks(minor_ticks, minor=True)

    ax.tick_params(
        "both", direction="in", which="both", length=4, labelsize=26, pad=10
    )
    ax.tick_params("both", which="major", length=8)
    ax.xaxis.set_ticks_position("both")
    ax.yaxis.set_ticks_position("both")

    ax.set_xlabel(xlabel, fontsize=28, labelpad=20)
    ax.set_ylabel(ylabel, fontsize=28, labelpad=10)
    ax.legend(fontsize=24, handletextpad=0.00, **legend_kwargs)
    ax.set_aspect("equal", "box")
    fig.tight_layout()
    fig.savefig(out_path)


def main():
    """Save parity plots for the liquid- and vapor-density GP surrogates.

    Writes ``pdfs/fig1-surrogate-liquiddensity.pdf`` and
    ``pdfs/fig1-surrogate-vapordensity.pdf``.
    """
    seaborn.set_palette("Paired")

    # Liquid density first
    _plot_surrogate_parity(
        "sim_liq_density",
        R32.liq_density_bounds,
        train_alpha=0.4,
        test_alpha=0.7,
        xylim=[750, 1250],
        major_ticks=[800, 1000, 1200],
        minor_ticks=[850, 900, 950, 1050, 1100, 1150],
        xlabel=r"$\mathregular{\rho_{liq}\ sim. (kg/m^3)}$",
        ylabel=r"$\mathregular{\rho_{liq}\ sur. (kg/m^3)}$",
        legend_kwargs={"loc": "lower right", "bbox_to_anchor": (1.01, -0.01)},
        out_path="pdfs/fig1-surrogate-liquiddensity.pdf",
    )

    # Vapor density next
    _plot_surrogate_parity(
        "sim_vap_density",
        R32.vap_density_bounds,
        train_alpha=0.6,
        test_alpha=0.8,
        xylim=[0, 125],
        major_ticks=[0, 50, 100],
        minor_ticks=[25, 75, 125],
        xlabel=r"$\mathregular{\rho_{vap}\ sim. (kg/m^3)}$",
        ylabel=r"$\mathregular{\rho_{vap}\ sur. (kg/m^3)}$",
        legend_kwargs={},
        out_path="pdfs/fig1-surrogate-vapordensity.pdf",
    )