def test_upper_bound_few_inducing_points():
    """
    Test for upper bound for regression marginal likelihood
    """
    model_vfe = gpflow.models.SGPR(
        (DatumUpper.X, DatumUpper.Y),
        gpflow.kernels.SquaredExponential(),
        inducing_variable=DatumUpper.X[:10, :].copy(),
        mean_function=Constant())
    opt = gpflow.optimizers.Scipy()

    @tf.function(autograph=False)
    def model_vfe_closure():
        return -model_vfe.log_marginal_likelihood()

    opt.minimize(model_vfe_closure,
                 variables=model_vfe.trainable_variables,
                 options=dict(maxiter=500))

    full_gp = gpflow.models.GPR((DatumUpper.X, DatumUpper.Y),
                                kernel=gpflow.kernels.SquaredExponential(),
                                mean_function=Constant())
    full_gp.kernel.lengthscale.assign(model_vfe.kernel.lengthscale)
    full_gp.kernel.variance.assign(model_vfe.kernel.variance)
    full_gp.likelihood.variance.assign(model_vfe.likelihood.variance)
    full_gp.mean_function.c.assign(model_vfe.mean_function.c)

    lml_upper = model_vfe.upper_bound()
    lml_vfe = model_vfe.log_marginal_likelihood()
    lml_full_gp = full_gp.log_marginal_likelihood()

    assert lml_vfe < lml_full_gp
    assert lml_full_gp < lml_upper
def make_single_layer_models(X, Y, Z):
    D = X.shape[1]
    Y_mean, Y_std = np.average(Y), np.std(Y)

    m_sgpr = SGPR(X, Y, RBF(D, variance=Y_std**2), Z.copy(), mean_function=Constant(Y_mean))
    m_svgp = SVGP(X, Y, RBF(D, variance=Y_std**2), Gaussian(), Z.copy(), mean_function=Constant(Y_mean))
    m_fitc = GPRFITC(X, Y, RBF(D, variance=Y_std**2), Z.copy(), mean_function=Constant(Y_mean))

    for m in [m_sgpr, m_svgp, m_fitc]:
        m.mean_function.fixed = True
        m.likelihood.variance = 0.1 * Y_std

    return m_sgpr, m_svgp, m_fitc
def _prepare_models():
    """
    Prepare models to make sure the coregionalized model with diagonal
    coregion kernel and with fixed lengthscale is equivalent to normal GP
    regression.
    """
    # 1. Two independent VGPs for two sets of data
    k0 = gpflow.kernels.SquaredExponential()
    k0.lengthscale.trainable = False
    k1 = gpflow.kernels.SquaredExponential()
    k1.lengthscale.trainable = False
    vgp0 = gpflow.models.VGP((Datum.X[0], Datum.Y[0]),
                             kernel=k0,
                             mean_function=Constant(),
                             likelihood=gpflow.likelihoods.Gaussian(),
                             num_latent=1)
    vgp1 = gpflow.models.VGP((Datum.X[1], Datum.Y[1]),
                             kernel=k1,
                             mean_function=Constant(),
                             likelihood=gpflow.likelihoods.Gaussian(),
                             num_latent=1)

    # 2. Coregionalized GPR
    kc = gpflow.kernels.SquaredExponential(active_dims=[0, 1])
    kc.lengthscale.trainable = False
    kc.variance.trainable = False  # variance is handled by the coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=2, rank=1, active_dims=[2])
    coreg.W.trainable = False
    lik = gpflow.likelihoods.SwitchedLikelihood(
        [gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])
    mean_c = gpflow.mean_functions.SwitchedMeanFunction(
        [gpflow.mean_functions.Constant(), gpflow.mean_functions.Constant()])
    cvgp = gpflow.models.VGP((Datum.X_augumented, Datum.Y_augumented),
                             kernel=kc * coreg,
                             mean_function=mean_c,
                             likelihood=lik,
                             num_latent=1)

    # Train them for a small number of iterations
    opt = gpflow.optimizers.Scipy()

    @tf.function(autograph=False)
    def vgp0_closure():
        return -vgp0.log_marginal_likelihood()

    @tf.function(autograph=False)
    def vgp1_closure():
        return -vgp1.log_marginal_likelihood()

    @tf.function(autograph=False)
    def cvgp_closure():
        return -cvgp.log_marginal_likelihood()

    opt.minimize(vgp0_closure,
                 variables=vgp0.trainable_variables,
                 options=dict(maxiter=1000),
                 method='BFGS')
    opt.minimize(vgp1_closure,
                 variables=vgp1.trainable_variables,
                 options=dict(maxiter=1000),
                 method='BFGS')
    opt.minimize(cvgp_closure,
                 variables=cvgp.trainable_variables,
                 options=dict(maxiter=1000),
                 method='BFGS')
    return vgp0, vgp1, cvgp
def _create_approximate_models():
    """
    1) Variational GP (with the likelihood set to Gaussian)
    2) Sparse variational GP (likelihood is Gaussian, inducing points at the data)
    3) Sparse variational GP (as above, but with the whitening rotation of the inducing variables)
    4) Sparse variational GP Regression (as above, but here the inducing variables are 'collapsed' out, as in Titsias 2009)
    5) FITC Sparse GP Regression
    """
    model_1 = gpflow.models.VGP(
        (Datum.X, Datum.Y),
        kernel=gpflow.kernels.SquaredExponential(),
        likelihood=gpflow.likelihoods.Gaussian(),
        mean_function=gpflow.mean_functions.Constant(),
    )
    model_2 = gpflow.models.SVGP(
        kernel=gpflow.kernels.SquaredExponential(),
        likelihood=gpflow.likelihoods.Gaussian(),
        inducing_variable=Datum.X.copy(),
        q_diag=False,
        whiten=False,
        mean_function=gpflow.mean_functions.Constant(),
        num_latent_gps=Datum.Y.shape[1],
    )
    gpflow.set_trainable(model_2.inducing_variable, False)
    model_3 = gpflow.models.SVGP(
        kernel=gpflow.kernels.SquaredExponential(),
        likelihood=gpflow.likelihoods.Gaussian(),
        inducing_variable=Datum.X.copy(),
        q_diag=False,
        whiten=True,
        mean_function=gpflow.mean_functions.Constant(),
        num_latent_gps=Datum.Y.shape[1],
    )
    gpflow.set_trainable(model_3.inducing_variable, False)
    model_4 = gpflow.models.SGPR(
        (Datum.X, Datum.Y),
        kernel=gpflow.kernels.SquaredExponential(),
        inducing_variable=Datum.X.copy(),
        mean_function=Constant(),
    )
    gpflow.set_trainable(model_4.inducing_variable, False)
    model_5 = gpflow.models.GPRFITC(
        (Datum.X, Datum.Y),
        kernel=gpflow.kernels.SquaredExponential(),
        inducing_variable=Datum.X.copy(),
        mean_function=Constant(),
    )
    gpflow.set_trainable(model_5.inducing_variable, False)
    return model_1, model_2, model_3, model_4, model_5
def test_switched_mean_function(N, D):
    """
    Test for the SwitchedMeanFunction.
    """
    X = np.hstack([rng.randn(N, D), 1.0 * rng.randint(0, 2, N).reshape(-1, 1)])
    zeros, ones = Constant(np.zeros(1)), Constant(np.ones(1))
    switched_mean = SwitchedMeanFunction([zeros, ones])

    np_list = np.array([0., 1.])
    result_ref = (np_list[X[:, D].astype(default_int())]).reshape(-1, 1)
    result = switched_mean(X)

    assert_allclose(result, result_ref)
def make_dgp(X, Y, Z, L):
    D = X.shape[1]
    Y_mean, Y_std = np.average(Y), np.std(Y)

    # the layer shapes are defined by the kernel dims, so here all hidden layers are D dimensional
    kernels = []
    for l in range(L):
        kernels.append(RBF(D, lengthscales=1., variance=1.))

    # between-layer noise (doesn't actually make much difference but we include it anyway);
    # rebind the list entries so the White kernel is actually added to each hidden-layer kernel
    for i, kernel in enumerate(kernels[:-1]):
        kernels[i] = kernel + White(D, variance=1e-5)

    mb = 10000 if X.shape[0] > 10000 else None
    model = DGP(X, Y, Z, kernels, Gaussian(), num_samples=1, minibatch_size=mb)

    # same final-layer inits we used for the single-layer model
    model.layers[-1].kern.variance = Y_std**2
    model.likelihood.variance = Y_std * 0.1
    model.layers[-1].mean_function = Constant(Y_mean)
    model.layers[-1].mean_function.fixed = True

    # start the inner layers almost deterministically
    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5

    return model
def mean_function_factory(mean_function_name, D_in, D_out):
    if mean_function_name == "Zero":
        return Zero(output_dim=D_out)
    elif mean_function_name == "Constant":
        return Constant(c=rng.rand(D_out))
    elif mean_function_name == "Linear":
        return Linear(A=rng.rand(D_in, D_out), b=rng.rand(D_out))
    else:
        return None
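# A minimal usage sketch for the factory above (not part of the original code):
# the chosen mean-function name, the D_in/D_out values, and the shape assertion
# are illustrative assumptions; the factory relies on the module-level `rng`
# random state defined elsewhere in this file.
def _example_mean_function_factory_usage():
    mf = mean_function_factory("Linear", D_in=3, D_out=2)
    X_demo = np.random.randn(5, 3)
    Y_demo = mf(X_demo)  # Linear mean: X_demo @ A + b, expected shape (5, 2)
    assert Y_demo.shape == (5, 2)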
def fit(self, X_train, y_train):
    y_train_scaled = self.y_scaler.fit_transform(y_train.reshape(-1, 1))

    k = Tanimoto()
    self.m = gpflow.models.GPR(
        data=(X_train.astype(np.float64), y_train_scaled),
        mean_function=Constant(np.mean(y_train_scaled)),
        kernel=k,
        noise_variance=1)

    opt = gpflow.optimizers.Scipy()
    opt.minimize(self.objective_closure,
                 self.m.trainable_variables,
                 options=dict(maxiter=self.maxiter))
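# The `fit` above optimises `self.objective_closure`, which is not shown in this
# snippet. A minimal sketch consistent with the -m.log_marginal_likelihood()
# pattern used in the other scripts in this section (the method name and
# attribute layout are assumptions, not confirmed by the original class):
def objective_closure(self):
    """Negative log marginal likelihood of the fitted GPR model."""
    return -self.m.log_marginal_likelihood()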
def _fit(self, X, F, data):
    if self.regr == 'constant':
        mf = Constant()
    elif self.regr == 'linear':
        mf = Linear(numpy.ones((X.shape[1], 1)), numpy.ones((1, 1)))

    if self.kernel == 'linear':
        kernel = gpflow.kernels.Linear(X.shape[1], ARD=self.ARD)
    elif self.kernel == 'rbf':
        kernel = gpflow.kernels.RBF(X.shape[1], ARD=self.ARD)
    elif self.kernel == 'polynomial':
        kernel = gpflow.kernels.Polynomial(X.shape[1], ARD=self.ARD)

    m = gpflow.gpr.GPR(X, numpy.array([F]).T, kern=kernel, mean_function=mf)
    m.optimize()
    self.model = m
def _prepare_models():
    """
    Prepare models to make sure the coregionalized model with diagonal
    coregion kernel and with fixed lengthscales is equivalent to normal GP
    regression.
    """
    # 1. Two independent VGPs for two sets of data
    k0 = gpflow.kernels.SquaredExponential()
    set_trainable(k0.lengthscales, False)
    k1 = gpflow.kernels.SquaredExponential()
    set_trainable(k1.lengthscales, False)
    vgp0 = gpflow.models.VGP(
        (Datum.X[0], Datum.Y[0]),
        kernel=k0,
        mean_function=Constant(),
        likelihood=gpflow.likelihoods.Gaussian(),
        num_latent_gps=1,
    )
    vgp1 = gpflow.models.VGP(
        (Datum.X[1], Datum.Y[1]),
        kernel=k1,
        mean_function=Constant(),
        likelihood=gpflow.likelihoods.Gaussian(),
        num_latent_gps=1,
    )

    # 2. Coregionalized GPR
    kc = gpflow.kernels.SquaredExponential(active_dims=[0, 1])
    set_trainable(kc.lengthscales, False)
    set_trainable(kc.variance, False)  # variance is handled by the Coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=2, rank=1, active_dims=[2])
    coreg.W.assign(np.zeros((2, 1)))  # zero correlation between outputs
    set_trainable(coreg.W, False)
    lik = gpflow.likelihoods.SwitchedLikelihood(
        [gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])
    mean_c = gpflow.mean_functions.SwitchedMeanFunction(
        [gpflow.mean_functions.Constant(), gpflow.mean_functions.Constant()])
    cvgp = gpflow.models.VGP(
        (Datum.X_augmented, Datum.Y_augmented),
        kernel=kc * coreg,
        mean_function=mean_c,
        likelihood=lik,
        num_latent_gps=1,
    )

    # Train them for a small number of iterations
    opt = gpflow.optimizers.Scipy()
    opt.minimize(
        vgp0.training_loss,
        variables=vgp0.trainable_variables,
        options=dict(maxiter=1000),
        method="BFGS",
    )
    opt.minimize(
        vgp1.training_loss,
        variables=vgp1.trainable_variables,
        options=dict(maxiter=1000),
        method="BFGS",
    )
    opt.minimize(
        cvgp.training_loss,
        variables=cvgp.trainable_variables,
        options=dict(maxiter=1000),
        method="BFGS",
    )
    return vgp0, vgp1, cvgp
def main(path, path_to_dft_dataset, task, representation, theory_level): """ :param path: str specifying path to photoswitches.csv file. :param path_to_dft_dataset: str specifying path to dft_comparison.csv file. :param task: str specifying the task. e_iso_pi only supported task for the TD-DFT comparison. :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0'] """ data_loader = TaskDataLoader(task, path) smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset) X = featurise_mols(smiles_list, representation) # Keep only non-duplicate entries because we're not considering effects of solvent non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) X = X[non_duplicate_indices, :] experimental_vals = experimental_vals[non_duplicate_indices] non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) pbe0_vals = pbe0_vals[non_dup_pbe0] cam_vals = cam_vals[non_dup_cam] # molecules with dft values to be split into train/test if theory_level == 'CAM-B3LYP': X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0) y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals))) # DFT values for the CAM-B3LYP level of theory dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals))) # molecules with no dft vals must go into the training set. X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0) y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals))) else: X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0) y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals))) # DFT values for the PBE0 level of theory dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals))) # molecules with no dft vals must go into the training set. 
X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0) y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals))) mae_list = [] dft_mae_list = [] # We define the Gaussian Process optimisation objective m = None def objective_closure(): return -m.log_marginal_likelihood() print('\nBeginning training loop...') for i in range(len(y_with_dft)): X_train = np.delete(X_with_dft, i, axis=0) y_train = np.delete(y_with_dft, i) X_test = X_with_dft[i].reshape(1, -1) y_test = y_with_dft[i] dft_test = dft_vals[i] X_train = np.concatenate((X_train, X_no_dft)) y_train = np.concatenate((y_train, y_no_dft)) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) k = Tanimoto() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100)) print_summary(m) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test) # Output MAE for this trial mae = abs(y_test - y_pred) print("MAE: {}".format(mae)) # Store values in order to compute the mean and standard error of the statistics across trials mae_list.append(mae) # DFT prediction scores on the same trial dft_mae = abs(y_test - dft_test) dft_mae_list.append(dft_mae) mae_list = np.array(mae_list) dft_mae_list = np.array(dft_mae_list) print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list)))) print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list))))
def _create_approximate_models():
    """
    1) Variational GP (with the likelihood set to Gaussian)
    2) Sparse variational GP (likelihood is Gaussian, inducing points at the data)
    3) Sparse variational GP (as above, but with the whitening rotation of the inducing variables)
    4) FITC Sparse GP Regression
    5) Sparse variational GP Regression (inducing variables are 'collapsed' out, as in Titsias 2009)
    """
    model_1 = gpflow.models.VGP((Datum.X, Datum.Y),
                                gpflow.kernels.SquaredExponential(),
                                likelihood=gpflow.likelihoods.Gaussian(),
                                mean_function=gpflow.mean_functions.Constant())
    model_2 = gpflow.models.SVGP(gpflow.kernels.SquaredExponential(),
                                 gpflow.likelihoods.Gaussian(),
                                 inducing_variable=Datum.X.copy(),
                                 q_diag=False,
                                 whiten=False,
                                 mean_function=gpflow.mean_functions.Constant(),
                                 num_latent=Datum.Y.shape[1])
    gpflow.utilities.set_trainable(model_2.inducing_variable, False)
    model_3 = gpflow.models.SVGP(kernel=gpflow.kernels.SquaredExponential(),
                                 likelihood=gpflow.likelihoods.Gaussian(),
                                 inducing_variable=Datum.X.copy(),
                                 q_diag=False,
                                 whiten=True,
                                 mean_function=gpflow.mean_functions.Constant(),
                                 num_latent=Datum.Y.shape[1])
    gpflow.utilities.set_trainable(model_3.inducing_variable, False)
    model_4 = gpflow.models.GPRFITC((Datum.X, Datum.Y),
                                    kernel=gpflow.kernels.SquaredExponential(),
                                    inducing_variable=Datum.X.copy(),
                                    mean_function=Constant())
    gpflow.utilities.set_trainable(model_4.inducing_variable, False)
    model_5 = gpflow.models.SGPR((Datum.X, Datum.Y),
                                 gpflow.kernels.SquaredExponential(),
                                 inducing_variable=Datum.X.copy(),
                                 mean_function=Constant())
    gpflow.utilities.set_trainable(model_5.inducing_variable, False)

    # Train models
    opt = gpflow.optimizers.Scipy()

    @tf.function(autograph=False)
    def model_1_closure():
        return -model_1.log_marginal_likelihood()

    @tf.function(autograph=False)
    def model_2_closure():
        return -model_2.elbo(Datum.data)

    @tf.function(autograph=False)
    def model_3_closure():
        return -model_3.elbo(Datum.data)

    @tf.function(autograph=False)
    def model_4_closure():
        return -model_4.log_marginal_likelihood()

    @tf.function(autograph=False)
    def model_5_closure():
        return -model_5.log_marginal_likelihood()

    opt.minimize(model_1_closure, variables=model_1.trainable_variables, options=dict(maxiter=300))
    opt.minimize(model_2_closure, variables=model_2.trainable_variables, options=dict(maxiter=300))
    opt.minimize(model_3_closure, variables=model_3.trainable_variables, options=dict(maxiter=300))
    opt.minimize(model_4_closure, variables=model_4.trainable_variables, options=dict(maxiter=300))
    opt.minimize(model_5_closure, variables=model_5.trainable_variables, options=dict(maxiter=300))
    return model_1, model_2, model_3, model_4, model_5
def main(path, task, representation, use_pca, n_trials, test_set_size): """ Train a multioutput GP simultaneously on all tasks of the photoswitch dataset. :param path: str specifying path to dataset. :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set """ # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None data_loader_e_iso_pi = TaskDataLoader('e_iso_pi', path) data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) smiles_list_e_iso_pi, y_e_iso_pi = data_loader_e_iso_pi.load_property_data( ) smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data( ) smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() y_e_iso_pi = y_e_iso_pi.reshape(-1, 1) y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) y_e_iso_n = y_e_iso_n.reshape(-1, 1) y_z_iso_n = y_z_iso_n.reshape(-1, 1) X_e_iso_pi = featurise_mols(smiles_list_e_iso_pi, representation) X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) output_dim = 4 # Number of outputs rank = 1 # Rank of W feature_dim = len(X_e_iso_pi[0, :]) tanimoto_active_dims = [i for i in range(feature_dim) ] # active dims for Tanimoto base kernel. 
r2_list = [] rmse_list = [] mae_list = [] print('\nBeginning training loop...') for i in range(0, n_trials): if task == 'e_iso_pi': X_task = X_e_iso_pi y_task = y_e_iso_pi elif task == 'z_iso_pi': X_task = X_z_iso_pi y_task = y_z_iso_pi elif task == 'e_iso_n': X_task = X_e_iso_n y_task = y_e_iso_n else: X_task = X_z_iso_n y_task = y_z_iso_n X_train, X_test, y_train, y_test = train_test_split( X_task, y_task, test_size=test_set_size, random_state=i) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) if task == 'e_iso_pi': # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1), np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), np.append(X_e_iso_n, np.ones( (len(X_e_iso_n), 1)) * 2, axis=1), np.append(X_z_iso_n, np.ones( (len(X_z_iso_n), 1)) * 3, axis=1))) X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1) X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_train, np.zeros_like(y_train))), np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) y_test = np.hstack((y_test, np.zeros_like(y_test))) elif task == 'z_iso_pi': # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), np.append(X_train, np.ones((len(X_train), 1)), axis=1), np.append(X_e_iso_n, np.ones( (len(X_e_iso_n), 1)) * 2, axis=1), np.append(X_z_iso_n, np.ones( (len(X_z_iso_n), 1)) * 3, axis=1))) X_test = np.append(X_test, np.ones((len(X_test), 1)), axis=1) X_train = np.append(X_train, np.ones((len(X_train), 1)), axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), np.hstack((y_train, np.ones_like(y_train))), np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) y_test = np.hstack((y_test, np.ones_like(y_test))) elif task == 'e_iso_n': # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1), np.append(X_z_iso_n, np.ones( (len(X_z_iso_n), 1)) * 3, axis=1))) X_test = np.append(X_test, np.ones((len(X_test), 1)) * 2, axis=1) X_train = np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), np.hstack((y_train, np.ones_like(y_train) * 2)), np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) y_test = np.hstack((y_test, np.ones_like(y_test) * 2)) else: # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), np.append(X_e_iso_n, 
np.ones( (len(X_e_iso_n), 1)) * 2, axis=1), np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1))) X_test = np.append(X_test, np.ones((len(X_test), 1)) * 3, axis=1) X_train = np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), np.hstack((y_train, np.ones_like(y_train) * 3)))) y_test = np.hstack((y_test, np.ones_like(y_test) * 3)) # Base kernel k = Tanimoto(active_dims=tanimoto_active_dims) #set_trainable(k.variance, False) # Coregion kernel coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim]) # Create product kernel kern = k * coreg # This likelihood switches between Gaussian noise with different variances for each f_i: lik = gpflow.likelihoods.SwitchedLikelihood([ gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian() ]) # now build the GP model as normal m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik) # fit the covariance function parameters maxiter = ci_niter(1000) gpflow.optimizers.Scipy().minimize( m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B", ) print_summary(m) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test[:, 0], y_pred) rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred)) mae = mean_absolute_error(y_test[:, 0], y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) B = coreg.output_covariance().numpy() print("B =", B) _ = plt.imshow(B) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
best_nlpd_kernel = ''
gapped_rates = np.reshape(gapped_flux[i, :], (-1, 1))
ground_truth_rates = ground_truth_flux_matrix[i, :]

# Standardize the count rates
flux_scaler = StandardScaler()
gapped_rates = flux_scaler.fit_transform(gapped_rates)

for k in kernel_list:
    name = kernel_dict[k]
    m = gpflow.models.GPR(data=(train_times, gapped_rates),
                          mean_function=Constant(np.mean(gapped_rates)),
                          kernel=k,
                          noise_variance=np.float64(0.001))

    if fix_noise:
        fixed_noise = np.float64(0.001)  # was 0.05 previously
        # We don't want to optimise the noise level in this case.
        set_trainable(m.likelihood.variance, False)
        m.likelihood.variance = fixed_noise

    opt = gpflow.optimizers.Scipy()

    # If Cholesky decomposition error, then skip
    try:
kernel_dict = {
    kernel_list[0]: 'RBF_Kernel',
    kernel_list[1]: 'Matern_12_Kernel',
    kernel_list[2]: 'Matern_32_Kernel',
    kernel_list[3]: 'Matern_52_Kernel',
    kernel_list[4]: 'Rational_Quadratic_Kernel'
}

for k in kernel_list:
    name = kernel_dict[k]

    # GP uses a constant mean function, where the constant is set to be the empirical average of the
    # standardised counts
    m = gpflow.models.GPR(data=(time, uv_band_flux),
                          mean_function=Constant(np.mean(uv_band_flux)),
                          kernel=k,
                          noise_variance=1)

    if fix_noise:
        # Fix the noise level to be the average experimental error observed in the dataset (0.037) for
        # magnitudes. The noise level is 2.0364e-15 for the flux values.
        # Standardisation destroys this information, so set the noise to be the mean of the standardised
        # values divided by the SNR in the original space.
        fixed_noise = np.mean(np.abs(uv_band_flux / snr))
        # We don't want to optimise the noise level in this case.
        set_trainable(m.likelihood.variance, False)
        m.likelihood.variance = fixed_noise
from gpflow.config import default_int
from gpflow.inducing_variables import InducingPoints
from gpflow.mean_functions import Additive, Constant, Linear, Product, SwitchedMeanFunction, Zero

rng = np.random.RandomState(99021)


class Datum:
    input_dim, output_dim = 3, 2
    N, Ntest, M = 20, 30, 10


_mean_functions = [
    Zero(),
    Linear(A=rng.randn(Datum.input_dim, Datum.output_dim),
           b=rng.randn(Datum.output_dim, 1).reshape(-1)),
    Constant(c=rng.randn(Datum.output_dim, 1).reshape(-1))
]


@pytest.mark.parametrize('mean_function_1', _mean_functions)
@pytest.mark.parametrize('mean_function_2', _mean_functions)
@pytest.mark.parametrize('operation', ['+', 'x'])
def test_mean_functions_output_shape(mean_function_1, mean_function_2, operation):
    """
    Test the output shape for basic and compositional mean functions; also
    check that the combination of mean functions returns the correct class.
    """
    X = np.random.randn(Datum.N, Datum.input_dim)
    Y = mean_function_1(X)
    # basic output shape check
    assert Y.shape in [(Datum.N, Datum.output_dim), (Datum.N, 1)]
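# The parametrized '+' / 'x' operations above map onto the compositional mean
# functions imported at the top (Additive, Product). A minimal sketch of that
# composition step; the exact assertions used in the remainder of the test are
# assumptions, not shown in the truncated snippet above.
def _example_mean_function_composition(mean_function_1, mean_function_2, operation):
    if operation == '+':
        composed = Additive(mean_function_1, mean_function_2)
    else:  # 'x'
        composed = Product(mean_function_1, mean_function_2)
    X = np.random.randn(Datum.N, Datum.input_dim)
    # a composed mean function evaluates like any other mean function
    return composed(X)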
    return -m.log_marginal_likelihood()

# We standardise the outputs but leave the inputs unchanged. Equivalent to transform_data used in other scripts.
y_train = y_train.reshape(-1, 1)
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train)

X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

# Fit GP
k = Tanimoto()
m = gpflow.models.GPR(data=(X_train, y_train),
                      mean_function=Constant(np.mean(y_train)),
                      kernel=k,
                      noise_variance=1)

# Optimise the kernel variance and noise level by the marginal likelihood
opt = gpflow.optimizers.Scipy()
opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
print_summary(m)

# mean and variance GP prediction
y_pred, y_var = m.predict_f(X_test)
y_pred = y_scaler.inverse_transform(y_pred)
kernel_dict = {
    kernel_list[0]: 'RBF_Kernel',
    kernel_list[1]: 'Matern_12_Kernel',
    kernel_list[2]: 'Matern_32_Kernel',
    kernel_list[3]: 'Matern_52_Kernel',
    kernel_list[4]: 'Rational_Quadratic_Kernel'
}

for k in kernel_list:
    name = kernel_dict[k]

    # GP uses a constant mean function, where the constant is set to be the empirical average of the
    # standardised counts
    m = gpflow.models.GPR(data=(time, counts),
                          mean_function=Constant(np.mean(counts)),
                          kernel=k,
                          noise_variance=1)

    if fix_noise:
        # Fix the noise level to be a jitter of 1e-4 because the log transform means we lose access to the
        # empirical values.
        # The SNR is ca. 16 in the original data, so it's possible to impose this in the standardised data
        # as well.
        fixed_noise = np.mean(np.abs(counts / snr))  # 0.05 currently; fixed_noise = np.float64(0.0001) previously
        # We don't want to optimise the noise level in this case.
        set_trainable(m.likelihood.variance, False)
        m.likelihood.variance = fixed_noise
def main(path, representation): """ :param path: str specifying path to dataset. :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] """ task = 'e_iso_pi' # Always e_iso_pi for human performance comparison data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # 5 test molecules test_smiles = [ 'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1', 'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1', 'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2', 'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC', 'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC' ] # and their indices in the loaded data test_smiles_indices = [116, 131, 168, 221, 229] X_train = np.delete(X, np.array(test_smiles_indices), axis=0) y_train = np.delete(y, np.array(test_smiles_indices)) X_test = X[[116, 131, 168, 221, 229]] # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was # under a different solvent y_test = y[[116, 131, 168, 221, 229]] y_test[2] = 407. y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) num_features = np.shape(X)[1] # We define the Gaussian Process Regression Model using the Tanimoto kernel m = None def objective_closure(): return -m.log_marginal_likelihood() # for plotting confidence-error curves rmse_confidence_list = [] mae_confidence_list = [] k = Tanimoto() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100)) print_summary(m) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt( mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) r2 = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) per_molecule = abs(y_pred - y_test) print("\n Averaged test statistics are") print("\nR^2: {:.3f}".format(r2)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) print("\nAbsolute error per molecule is {} ".format(per_molecule))
def main(path, task, n_trials, test_set_size, use_rmse_conf, kernel, N): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity'] :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence- error curves. True is the option for rmse. :param kernel: str specifying the kernel to be used. One of ['ShortestPath', ] """ start_time = time.time() data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() # List truncation for faster computation smiles_list = smiles_list[0:N] y = y[0:N] m = None def objective_closure(): return -m.log_marginal_likelihood() r2_list = [] rmse_list = [] mae_list = [] # We pre-allocate arrays for plotting confidence-error curves _, _, _, y_test = train_test_split( smiles_list, y, test_size=test_set_size) # To get test set size # Photoswitch dataset requires 80/20 splitting. Other datasets are 80/10/10. if task != 'Photoswitch': split_in_two = int(len(y_test) / 2) n_test = split_in_two else: n_test = len(y_test) rmse_confidence_list = np.zeros((n_trials, n_test)) mae_confidence_list = np.zeros((n_trials, n_test)) print('\nBeginning training loop...') for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split( smiles_list, y, test_size=test_set_size, random_state=i) if task != 'Photoswitch': # Artificially create a 80/10/10 train/validation/test split discarding the validation set. split_in_two = int(len(y_test) / 2) X_test = X_test[0:split_in_two] y_test = y_test[0:split_in_two] y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) X_train = np.asarray(X_train) X_test = np.asarray(X_test) print('kernel is ', kernel) if kernel == 'PUTH': k = GP.kernels.PUTH() elif kernel == 'CW': k = GP.kernels.CWgeo() elif kernel == 'MK': k = GP.kernels.MK() elif kernel == 'SP': k = GP.kernels.SP() elif kernel == 'SSP': k = GP.kernels.SSP() elif kernel == 'T': k = GP.kernels.T() elif kernel == 'WL': k = GP.kernels.WL() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # Optimise the kernel variance and noise level by the marginal likelihood optimizer = tf.optimizers.Adam(learning_rate=0.1) print_summary(m) with tf.GradientTape(watch_accessed_variables=False) as tape: tape.watch(m.trainable_variables) ll = m.maximum_log_likelihood_objective() objective = -ll gradients = tape.gradient(objective, m.trainable_variables) optimizer.apply_gradients(zip(gradients, m.trainable_variables)) print_summary(m) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) y_pred = y_pred.numpy() # Compute scores for confidence curve plotting. 
ranked_confidence_list = np.argsort(y_var, axis=0).flatten() for k in range(len(y_test)): # Construct the RMSE error for each level of confidence conf = ranked_confidence_list[0:k + 1] rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf])) rmse_confidence_list[i, k] = rmse # Construct the MAE error for each level of confidence mae = mean_absolute_error(y_test[conf], y_pred[conf]) mae_confidence_list[i, k] = mae # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error( y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nDataset: {}".format(task)) print("\nKernel: {}".format(kernel)) print("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list)))) #### modify to include kernel and dataset info in outputs, use {} command outF = open("results.txt", "a") outF.write("\n") outF.write("\nDataset: {}".format(task)) outF.write("\nKernel: {}".format(kernel)) outF.write("\nTime taken: {}".format(time.time() - start_time)) outF.write("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) outF.write("\nmean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) outF.write("\nmean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list)))) outF.close() # Plot confidence-error curves confidence_percentiles = np.arange( 1e-14, 100, 100 / len(y_test) ) # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29
# Base kernel
k = Tanimoto(active_dims=tanimoto_active_dims)
# set_trainable(k.variance, False)

# Coregion kernel
coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

# Create product kernel
kern = k * coreg

# This likelihood switches between Gaussian noise with different variances for each f_i:
lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(),
                                             gpflow.likelihoods.Gaussian(),
                                             gpflow.likelihoods.Gaussian(),
                                             gpflow.likelihoods.Gaussian()])

# now build the GP model as normal
m = gpflow.models.VGP((X_augmented, Y_augmented),
                      mean_function=Constant(np.mean(y_train[:, 0])),
                      kernel=kern,
                      likelihood=lik)

# fit the covariance function parameters
maxiter = ci_niter(1000)
gpflow.optimizers.Scipy().minimize(m.training_loss,
                                   m.trainable_variables,
                                   options=dict(maxiter=maxiter),
                                   method="L-BFGS-B")
print_summary(m)

# mean and variance GP prediction
y_pred, y_var = m.predict_f(X_test)

# Output Standardised RMSE and RMSE on Train Set
y_pred_train, _ = m.predict_f(X_train)
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence- error curves. True is the option for rmse. """ data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None # We define the Gaussian Process Regression Model using the Tanimoto kernel m = None def objective_closure(): return -m.log_marginal_likelihood() r2_list = [] rmse_list = [] mae_list = [] # We pre-allocate arrays for plotting confidence-error curves _, _, _, y_test = train_test_split(X, y, test_size=test_set_size) # To get test set size n_test = len(y_test) rmse_confidence_list = np.zeros((n_trials, n_test)) mae_confidence_list = np.zeros((n_trials, n_test)) print('\nBeginning training loop...') for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) k = Tanimoto() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # e_iso_pi best params: # {'learner': RandomForestRegressor(max_features=0.9348473830061558, n_estimators=381, # n_jobs=1, random_state=2, verbose=False)} # e_iso_n best params: # {'learner': RandomForestRegressor(bootstrap=False, max_features=0.09944870853556087, # min_samples_leaf=3, n_estimators=1295, n_jobs=1, # random_state=0, verbose=False)} # z_iso_pi best params: # {'learner': RandomForestRegressor(max_depth=4, max_features=0.33072121415416944, # n_estimators=2755, n_jobs=1, random_state=2, # verbose=False)} # z_iso_n best params: # {'learner': RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1, # random_state=3, verbose=False)} regr_rf = RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1, random_state=3, verbose=False) regr_rf.fit(X_train, y_train) # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100)) print_summary(m) # mean and variance GP prediction and RF prediction y_pred, y_var = m.predict_f(X_test) y_pred_rf = regr_rf.predict(X_test) y_pred_av = (y_pred + y_pred_rf.reshape(-1, 1)) / 2.0 y_pred = y_scaler.inverse_transform(y_pred_av) y_test = y_scaler.inverse_transform(y_test) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) y_pred_train_rf = regr_rf.predict(X_train) y_pred_train = (y_pred_train + 
y_pred_train_rf.reshape(-1, 1)) / 2.0 train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))
def test_models_with_mean_functions_changes(model_class):
    """
    Simply check that all models have a higher prediction with a constant mean
    function than with a zero mean function. For compositions of mean functions,
    check that multiplication by / addition of a constant results in a higher
    prediction, whereas addition of zero / multiplication by one does not.
    """
    data = rng.randn(Datum.N, Datum.input_dim), rng.randn(Datum.N, 1)
    predict_at = rng.randn(Datum.Ntest, Datum.input_dim)
    inducing_variable = InducingPoints(Z=rng.randn(Datum.M, Datum.input_dim))
    kernel = gpflow.kernels.Matern32()
    likelihood = gpflow.likelihoods.Gaussian()
    zero_mean = Zero()
    non_zero_mean = Constant(c=np.ones(1) * 10)

    if model_class in [gpflow.models.GPR]:
        model_zero_mean = model_class(data, kernel=kernel, mean_function=zero_mean)
        model_non_zero_mean = model_class(data, kernel=kernel, mean_function=non_zero_mean)
    elif model_class in [gpflow.models.VGP]:
        model_zero_mean = model_class(data, likelihood=likelihood, kernel=kernel, mean_function=zero_mean)
        model_non_zero_mean = model_class(data, likelihood=likelihood, kernel=kernel, mean_function=non_zero_mean)
    elif model_class in [gpflow.models.SVGP]:
        model_zero_mean = model_class(kernel=kernel,
                                      likelihood=likelihood,
                                      inducing_variable=inducing_variable,
                                      mean_function=zero_mean)
        model_non_zero_mean = model_class(kernel=kernel,
                                          likelihood=likelihood,
                                          inducing_variable=inducing_variable,
                                          mean_function=non_zero_mean)
    elif model_class in [gpflow.models.SGPR, gpflow.models.GPRFITC]:
        model_zero_mean = model_class(data,
                                      kernel=kernel,
                                      inducing_variable=inducing_variable,
                                      mean_function=zero_mean)
        model_non_zero_mean = model_class(data,
                                          kernel=kernel,
                                          inducing_variable=inducing_variable,
                                          mean_function=non_zero_mean)
    elif model_class in [gpflow.models.SGPMC]:
        model_zero_mean = model_class(data,
                                      kernel=kernel,
                                      likelihood=likelihood,
                                      inducing_variable=inducing_variable,
                                      mean_function=zero_mean)
        model_non_zero_mean = model_class(data,
                                          kernel=kernel,
                                          likelihood=likelihood,
                                          inducing_variable=inducing_variable,
                                          mean_function=non_zero_mean)
    elif model_class in [gpflow.models.GPMC]:
        model_zero_mean = model_class(data, kernel=kernel, likelihood=likelihood, mean_function=zero_mean)
        model_non_zero_mean = model_class(data, kernel=kernel, likelihood=likelihood, mean_function=non_zero_mean)
    else:
        raise NotImplementedError

    mu_zero, var_zero = model_zero_mean.predict_f(predict_at)
    mu_non_zero, var_non_zero = model_non_zero_mean.predict_f(predict_at)
    # predictive variance remains unchanged after modifying mean function
    assert np.all(var_zero.numpy() == var_non_zero.numpy())
    # predictive mean changes after modifying mean function
    assert not np.all(mu_zero.numpy() == mu_non_zero.numpy())
def main(path, representation): """ :param path: str specifying path to dataset. :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] """ task = 'e_iso_pi' # task always e_iso_pi with human performance comparison data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # 5 test molecules test_smiles = [ 'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1', 'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1', 'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2', 'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC', 'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC' ] # and their indices in the loaded data test_smiles_indices = [116, 131, 168, 221, 229] X_train = np.delete(X, np.array(test_smiles_indices), axis=0) y_train = np.delete(y, np.array(test_smiles_indices)) X_test = X[[116, 131, 168, 221, 229]] # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was # under a different solvent y_test = y[[116, 131, 168, 221, 229]] y_test[2] = 407. y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # # We standardise the outputs but leave the inputs unchanged # # _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data( ) smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) y_e_iso_n = y_e_iso_n.reshape(-1, 1) y_z_iso_n = y_z_iso_n.reshape(-1, 1) X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) output_dim = 4 # Number of outputs rank = 1 # Rank of W feature_dim = len(X_train[0, :]) tanimoto_active_dims = [i for i in range(feature_dim) ] # active dims for Tanimoto base kernel. 
# We define the Gaussian Process Regression Model using the Tanimoto kernel m = None def objective_closure(): return -m.log_marginal_likelihood() # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1), np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1) X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_train, np.zeros_like(y_train))), np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) y_test = np.hstack((y_test, np.zeros_like(y_test))) # Base kernel k = Tanimoto(active_dims=tanimoto_active_dims) # set_trainable(k.variance, False) # Coregion kernel coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim]) # Create product kernel kern = k * coreg # This likelihood switches between Gaussian noise with different variances for each f_i: lik = gpflow.likelihoods.SwitchedLikelihood([ gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian() ]) # now build the GP model as normal m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik) # fit the covariance function parameters maxiter = ci_niter(1000) gpflow.optimizers.Scipy().minimize( m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B", ) print_summary(m) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) r2 = r2_score(y_test[:, 0], y_pred) rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred)) mae = mean_absolute_error(y_test[:, 0], y_pred) per_molecule = np.diag(abs(y_pred - y_test[:, 0])) print("\n Averaged test statistics are") print("\nR^2: {:.3f}".format(r2)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) print("\nAbsolute error per molecule is {} ".format(per_molecule))
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence- error curves. True is the option for rmse. """ data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None # We define the Gaussian Process Regression Model using the Tanimoto kernel m = None def objective_closure(): return -m.log_marginal_likelihood() r2_list = [] rmse_list = [] mae_list = [] # We pre-allocate arrays for plotting confidence-error curves _, _, _, y_test = train_test_split( X, y, test_size=test_set_size) # To get test set size n_test = len(y_test) rmse_confidence_list = np.zeros((n_trials, n_test)) mae_confidence_list = np.zeros((n_trials, n_test)) print('\nBeginning training loop...') for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_set_size, random_state=i) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data( X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) k = Tanimoto() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=10000)) print_summary(m) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test) # Compute scores for confidence curve plotting. 
ranked_confidence_list = np.argsort(y_var, axis=0).flatten() for k in range(len(y_test)): # Construct the RMSE error for each level of confidence conf = ranked_confidence_list[0:k + 1] rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf])) rmse_confidence_list[i, k] = rmse # Construct the MAE error for each level of confidence mae = mean_absolute_error(y_test[conf], y_pred[conf]) mae_confidence_list[i, k] = mae # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt( mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list)))) # Plot confidence-error curves confidence_percentiles = np.arange( 1e-14, 100, 100 / len(y_test) ) # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29 if use_rmse_conf: rmse_mean = np.mean(rmse_confidence_list, axis=0) rmse_std = np.std(rmse_confidence_list, axis=0) # We flip because we want the most confident predictions on the right-hand side of the plot rmse_mean = np.flip(rmse_mean) rmse_std = np.flip(rmse_std) # One-sigma error bars lower = rmse_mean - rmse_std upper = rmse_mean + rmse_std plt.plot(confidence_percentiles, rmse_mean, label='mean') plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2) plt.xlabel('Confidence Percentile') plt.ylabel('RMSE (nm)') plt.ylim([0, np.max(upper) + 1]) plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))]) plt.yticks(np.arange(0, np.max(upper) + 1, 5.0)) plt.savefig( task + '/results/gpr/{}_confidence_curve_rmse.png'.format(representation)) plt.show() else: # We plot the Mean-absolute error confidence-error curves mae_mean = np.mean(mae_confidence_list, axis=0) mae_std = np.std(mae_confidence_list, axis=0) mae_mean = np.flip(mae_mean) mae_std = np.flip(mae_std) lower = mae_mean - mae_std upper = mae_mean + mae_std plt.plot(confidence_percentiles, mae_mean, label='mean') plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2) plt.xlabel('Confidence Percentile') plt.ylabel('MAE (nm)') plt.ylim([0, np.max(upper) + 1]) plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))]) plt.yticks(np.arange(0, np.max(upper) + 1, 5.0)) plt.savefig( task + '/results/gpr/{}_confidence_curve_mae.png'.format(representation)) plt.show()
def main(path, path_to_dft_dataset, representation, theory_level): """ :param path: str specifying path to photoswitches.csv file. :param path_to_dft_dataset: str specifying path to dft_comparison.csv file. :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0'] """ task = 'e_iso_pi' # e_iso_pi only task supported for TD-DFT comparison data_loader = TaskDataLoader(task, path) smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset) X = featurise_mols(smiles_list, representation) # Keep only non-duplicate entries because we're not considering effects of solvent non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) X = X[non_duplicate_indices, :] experimental_vals = experimental_vals[non_duplicate_indices] non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) pbe0_vals = pbe0_vals[non_dup_pbe0] cam_vals = cam_vals[non_dup_cam] # molecules with dft values to be split into train/test if theory_level == 'CAM-B3LYP': X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0) y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals))) # DFT values for the CAM-B3LYP level of theory dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals))) # molecules with no dft vals must go into the training set. X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0) y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals))) else: X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0) y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals))) # DFT values for the PBE0 level of theory dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals))) # molecules with no dft vals must go into the training set. X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0) y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals))) # Load in the other property values for multitask learning. e_iso_pi is a always the task in this instance. data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data() smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) y_e_iso_n = y_e_iso_n.reshape(-1, 1) y_z_iso_n = y_z_iso_n.reshape(-1, 1) X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) output_dim = 4 # Number of outputs rank = 1 # Rank of W feature_dim = len(X_no_dft[0, :]) tanimoto_active_dims = [i for i in range(feature_dim)] # active dims for Tanimoto base kernel. 
    mae_list = []
    dft_mae_list = []

    # We define the Gaussian Process optimisation objective
    # (not used below; the Scipy optimiser is given m.training_loss directly)
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')

    for i in range(len(y_with_dft)):

        X_train = np.delete(X_with_dft, i, axis=0)
        y_train = np.delete(y_with_dft, i)
        X_test = X_with_dft[i].reshape(1, -1)
        y_test = y_with_dft[i]
        dft_test = dft_vals[i]

        X_train = np.concatenate((X_train, X_no_dft))
        y_train = np.concatenate((y_train, y_no_dft))
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        # Augment the input with zeroes, ones, twos and threes to indicate the required output dimension
        X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1),
                                 np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                                 np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                                 np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

        X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
        X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

        # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
        Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))),
                                 np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                                 np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                                 np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

        y_test = np.hstack((y_test, np.zeros_like(y_test)))

        # Base kernel
        k = Tanimoto(active_dims=tanimoto_active_dims)
        # set_trainable(k.variance, False)

        # Coregion kernel
        coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

        # Create product kernel
        kern = k * coreg

        # This likelihood switches between Gaussian noise with different variances for each f_i:
        lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(),
                                                     gpflow.likelihoods.Gaussian(),
                                                     gpflow.likelihoods.Gaussian(),
                                                     gpflow.likelihoods.Gaussian()])

        # Now build the GP model as normal
        m = gpflow.models.VGP((X_augmented, Y_augmented),
                              mean_function=Constant(np.mean(y_train[:, 0])),
                              kernel=kern,
                              likelihood=lik)

        # Fit the covariance function parameters
        maxiter = ci_niter(1000)
        gpflow.optimizers.Scipy().minimize(m.training_loss,
                                           m.trainable_variables,
                                           options=dict(maxiter=maxiter),
                                           method="L-BFGS-B")
        print_summary(m)

        # Output Standardised RMSE and RMSE on Train Set
        # (the outputs are not standardised in this script, so the two quantities coincide)
        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # Mean and variance GP prediction
        y_pred, y_var = m.predict_f(X_test)

        # Output MAE for this trial
        mae = abs(y_test[:, 0] - y_pred)
        print("MAE: {}".format(mae))

        # Store values in order to compute the mean and standard error of the statistics across trials
        mae_list.append(mae)

        # DFT prediction scores on the same trial
        dft_mae = abs(y_test[:, 0] - dft_test)
        dft_mae_list.append(dft_mae)

    mae_list = np.array(mae_list)
    dft_mae_list = np.array(dft_mae_list)

    print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
    print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list) / np.sqrt(len(dft_mae_list))))
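# --- Illustrative sketch (not part of the original scripts) -------------------------------------
# The augmentation above is the standard GPflow coregionalisation pattern: a trailing integer
# column on X selects the output for the Coregion kernel (via active_dims), and the same integer
# on Y selects the noise model from the SwitchedLikelihood. A toy two-task version, with
# hypothetical shapes and variable names of our own choosing:
import numpy as np

X_task0, y_task0 = np.random.rand(5, 3), np.random.rand(5, 1)
X_task1, y_task1 = np.random.rand(4, 3), np.random.rand(4, 1)

# Append the task index as an extra input column; a Coregion kernel with active_dims=[3] would
# act on this final column while the base kernel acts on the first three features.
X_aug = np.vstack((np.append(X_task0, np.zeros((len(X_task0), 1)), axis=1),
                   np.append(X_task1, np.ones((len(X_task1), 1)), axis=1)))

# Append the same index to Y so a SwitchedLikelihood picks the matching Gaussian noise model
Y_aug = np.vstack((np.hstack((y_task0, np.zeros_like(y_task0))),
                   np.hstack((y_task1, np.ones_like(y_task1)))))

assert X_aug.shape == (9, 4) and Y_aug.shape == (9, 2)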
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf, precompute_repr):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity'].
    :param representation: str specifying the molecular representation.
                           One of ['SMILES', 'fingerprints', 'fragments', 'fragprints'].
    :param use_pca: bool. If True, apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use.
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set.
    :param use_rmse_conf: bool specifying whether to compute the RMSE confidence-error curves or the
                          MAE confidence-error curves. True is the option for RMSE.
    :param precompute_repr: bool indicating whether to precompute representations or not.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()

    print('\nBeginning augmentation...')
    start_time = time.time()
    x, smiles_card, y = augmentation(np.array(smiles_list), y, 15, canon=False, rotate=True)
    print('\nFinished augmentation after', time.time() - start_time)

    print('\nBeginning representation...')
    start_time = time.time()
    X = featurise_mols(x, representation)
    print('\nFinished representation after', time.time() - start_time)

    if precompute_repr:
        if representation == 'SMILES':
            with open(f'precomputed_representations/{task}_{representation}.txt', 'w') as f:
                for smiles in X:
                    f.write(smiles + '\n')
        else:
            np.savetxt(f'precomputed_representations/{task}_{representation}.txt', X)

    # If True we perform Principal Components Regression
    if use_pca:
        n_components = 100
    else:
        n_components = None

    # We define the Gaussian Process Regression Model using the Tanimoto kernel
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves
    _, _, _, y_test = train_test_split(X, y, test_size=test_set_size)  # To get the test set size

    # The Photoswitch dataset requires 80/20 splitting; the other datasets are split 80/10/10.
    if task != 'Photoswitch':
        split_in_two = int(len(y_test) / 2)
        n_test = split_in_two
    else:
        n_test = len(y_test)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i)

        if representation == 'SMILES':
            np.savetxt(f'fixed_train_test_splits/{task}/X_train_split_{i}.txt', X_train, fmt="%s")
            np.savetxt(f'fixed_train_test_splits/{task}/X_test_split_{i}.txt', X_test, fmt="%s")
            np.savetxt(f'fixed_train_test_splits/{task}/y_train_split_{i}.txt', y_train)
            np.savetxt(f'fixed_train_test_splits/{task}/y_test_split_{i}.txt', y_test)

        else:
            if task != 'Photoswitch':
                # Artificially create an 80/10/10 train/validation/test split, discarding the validation set.
                split_in_two = int(len(y_test) / 2)
                X_test = X_test[0:split_in_two]
                y_test = y_test[0:split_in_two]

            y_train = y_train.reshape(-1, 1)
            y_test = y_test.reshape(-1, 1)

            # We standardise the outputs but leave the inputs unchanged
            _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test,
                                                             n_components=n_components, use_pca=use_pca)

            X_train = X_train.astype(np.float64)
            X_test = X_test.astype(np.float64)

            k = Tanimoto()
            m = gpflow.models.GPR(data=(X_train, y_train),
                                  mean_function=Constant(np.mean(y_train)),
                                  kernel=k,
                                  noise_variance=1)

            # Optimise the kernel variance and noise level by the marginal likelihood
            opt = gpflow.optimizers.Scipy()
            opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
            print_summary(m)

            # Mean and variance GP prediction
            y_pred, y_var = m.predict_f(X_test)
            y_pred = y_scaler.inverse_transform(y_pred)
            y_test = y_scaler.inverse_transform(y_test)

            # Compute scores for confidence curve plotting.
            ranked_confidence_list = np.argsort(y_var, axis=0).flatten()

            # NB: the loop variable below shadows the Tanimoto kernel `k`, which is re-created each trial
            for k in range(len(y_test)):
                # Construct the RMSE error for each level of confidence
                conf = ranked_confidence_list[0:k + 1]
                rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf]))
                rmse_confidence_list[i, k] = rmse

                # Construct the MAE error for each level of confidence
                mae = mean_absolute_error(y_test[conf], y_pred[conf])
                mae_confidence_list[i, k] = mae

            # Output Standardised RMSE and RMSE on Train Set
            y_pred_train, _ = m.predict_f(X_train)
            train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
            train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train),
                                                    y_scaler.inverse_transform(y_pred_train)))
            print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
            print("Train RMSE: {:.3f}".format(train_rmse))

            score = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            mae = mean_absolute_error(y_test, y_pred)

            print("\nR^2: {:.3f}".format(score))
            print("RMSE: {:.3f}".format(rmse))
            print("MAE: {:.3f}".format(mae))

            r2_list.append(score)
            rmse_list.append(rmse)
            mae_list.append(mae)

    if representation != 'SMILES':

        r2_list = np.array(r2_list)
        rmse_list = np.array(rmse_list)
        mae_list = np.array(mae_list)

        print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
        print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))

        # Plot confidence-error curves
        confidence_percentiles = np.arange(1e-14, 100, 100 / len(y_test))  # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29

        if use_rmse_conf:

            rmse_mean = np.mean(rmse_confidence_list, axis=0)
            rmse_std = np.std(rmse_confidence_list, axis=0)

            # We flip because we want the most confident predictions on the right-hand side of the plot
            rmse_mean = np.flip(rmse_mean)
            rmse_std = np.flip(rmse_std)

            # One-sigma error bars
            lower = rmse_mean - rmse_std
            upper = rmse_mean + rmse_std

            plt.plot(confidence_percentiles, rmse_mean, label='mean')
            plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
            plt.xlabel('Confidence Percentile')
            plt.ylabel('RMSE')
            plt.ylim([0, np.max(upper) + 1])
            plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
            plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
            plt.savefig(task + '/results/tanimoto/{}_confidence_curve_rmse.png'.format(representation))
            plt.show()

        else:

            # We plot the mean-absolute-error confidence-error curves
            mae_mean = np.mean(mae_confidence_list, axis=0)
            mae_std = np.std(mae_confidence_list, axis=0)

            mae_mean = np.flip(mae_mean)
            mae_std = np.flip(mae_std)

            lower = mae_mean - mae_std
            upper = mae_mean + mae_std

            plt.plot(confidence_percentiles, mae_mean, label='mean')
            plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
            plt.xlabel('Confidence Percentile')
            plt.ylabel('MAE')
            plt.ylim([0, np.max(upper) + 1])
            plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
            plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
            plt.savefig(task + '/results/tanimoto/{}_confidence_curve_mae.png'.format(representation))
            plt.show()
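# --- Reference sketch (not part of the original scripts) ----------------------------------------
# Both scripts rely on the custom Tanimoto kernel imported from elsewhere in the repository. As a
# point of reference, a minimal NumPy sketch of the Tanimoto (Jaccard) similarity that such a
# kernel is typically built on; this is a simplified stand-in, not the repository's GPflow kernel
# class.
import numpy as np


def tanimoto_similarity(a, b):
    """<a, b> / (||a||^2 + ||b||^2 - <a, b>) for two fingerprint vectors."""
    dot = np.dot(a, b)
    return dot / (np.dot(a, a) + np.dot(b, b) - dot)


# For binary fingerprints this reduces to |intersection| / |union|
fp1 = np.array([1, 0, 1, 1, 0], dtype=float)
fp2 = np.array([1, 1, 1, 0, 0], dtype=float)
print(tanimoto_similarity(fp1, fp2))  # 2 / (3 + 3 - 2) = 0.5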