def main(path, task, aug_factor, n_trials, test_set_size):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity']
    :param aug_factor: Factor by which to augment the SMILES dataset.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()

    for i in range(n_trials):

        X_train, X_test, y_train, y_test = train_test_split(smiles_list, y, test_size=test_set_size, random_state=i)

        # Augment the train set SMILES by a factor equal to aug_factor
        X_train, smiles_card, y_train = augmentation(np.array(X_train), y_train, aug_factor, canon=False, rotate=True)

        # Augment the test set SMILES by a factor equal to aug_factor
        X_test_aug, smiles_test_card, y_test_aug = augmentation(np.array(X_test), y_test, aug_factor, canon=False, rotate=True)

        # Save the augmented train SMILES together with the fixed (unaugmented) test set.
        np.savetxt(f'enumerated_datasets/{task}/X_train_split_aug_x{aug_factor}_split_{i}.txt', X_train, fmt="%s")
        np.savetxt(f'enumerated_datasets/{task}/X_test_split_aug_x{aug_factor}_split_{i}.txt', X_test, fmt="%s")
        np.savetxt(f'enumerated_datasets/{task}/y_train_split_aug_x{aug_factor}_split_{i}.txt', y_train)
        np.savetxt(f'enumerated_datasets/{task}/y_test_split_aug_x{aug_factor}_split_{i}.txt', y_test)

        # Save the augmented test SMILES. aug in front of the filename denotes test set augmentation as well.
        np.savetxt(f'enumerated_datasets/{task}/aug_X_test_split_aug_x{aug_factor}_split_{i}.txt', X_test_aug, fmt="%s")
        np.savetxt(f'enumerated_datasets/{task}/aug_y_test_split_aug_x{aug_factor}_split_{i}.txt', y_test_aug)
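# The `augmentation` helper used above is imported from elsewhere in the repository.
# As a rough, hypothetical sketch of the idea (not the repository's implementation),
# SMILES enumeration with RDKit could look like the following; the real helper also
# exposes canonicalisation and atom-order rotation via the canon/rotate flags.
import numpy as np
from rdkit import Chem


def augment_smiles_sketch(smiles_array, y, aug_factor):
    """Return up to `aug_factor` random SMILES strings per molecule with repeated labels."""
    aug_smiles, aug_y, cardinality = [], [], []
    for smi, label in zip(smiles_array, y):
        mol = Chem.MolFromSmiles(smi)
        # A set removes accidental duplicates, so a molecule may get fewer than aug_factor variants.
        variants = {Chem.MolToSmiles(mol, doRandom=True) for _ in range(aug_factor)}
        aug_smiles.extend(variants)
        aug_y.extend([label] * len(variants))
        cardinality.append(len(variants))
    return np.array(aug_smiles), np.array(cardinality), np.array(aug_y)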
def main(path, task, representation, use_pca):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    if use_pca:
        n_components = 50
    else:
        n_components = None

    # Set random state to be different to the splits used for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    X_train, y_train, _, _, y_scaler = transform_data(X_train, y_train, X_test, y_test, n_components, use_pca)

    estim = HyperoptEstimator(regressor=random_forest_regression('my_RF'))
    estim.fit(X_train, y_train, valid_size=0.1, n_folds=5, cv_shuffle=True)

    print(estim.best_model())

    with open(f'saved_hypers/RF/tuning_for_{task}', 'w') as f:
        print(estim.best_model(), file=f)
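# A hypothetical invocation of the tuning routine above; the argument values are
# placeholders and the repository's actual command-line interface (if any) may differ.
if __name__ == '__main__':
    main(path='../dataset/photoswitches.csv',
         task='e_iso_pi',
         representation='fragprints',
         use_pca=False)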
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence- error curves. True is the option for rmse. """ data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None # We define the Gaussian Process Regression Model using the Tanimoto kernel m = None def objective_closure(): return -m.log_marginal_likelihood() r2_list = [] rmse_list = [] mae_list = [] # We pre-allocate arrays for plotting confidence-error curves _, _, _, y_test = train_test_split(X, y, test_size=test_set_size) # To get test set size n_test = len(y_test) rmse_confidence_list = np.zeros((n_trials, n_test)) mae_confidence_list = np.zeros((n_trials, n_test)) print('\nBeginning training loop...') for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) k = Tanimoto() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # e_iso_pi best params: # {'learner': RandomForestRegressor(max_features=0.9348473830061558, n_estimators=381, # n_jobs=1, random_state=2, verbose=False)} # e_iso_n best params: # {'learner': RandomForestRegressor(bootstrap=False, max_features=0.09944870853556087, # min_samples_leaf=3, n_estimators=1295, n_jobs=1, # random_state=0, verbose=False)} # z_iso_pi best params: # {'learner': RandomForestRegressor(max_depth=4, max_features=0.33072121415416944, # n_estimators=2755, n_jobs=1, random_state=2, # verbose=False)} # z_iso_n best params: # {'learner': RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1, # random_state=3, verbose=False)} regr_rf = RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1, random_state=3, verbose=False) regr_rf.fit(X_train, y_train) # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100)) print_summary(m) # mean and variance GP prediction and RF prediction y_pred, y_var = m.predict_f(X_test) y_pred_rf = regr_rf.predict(X_test) y_pred_av = (y_pred + y_pred_rf.reshape(-1, 1)) / 2.0 y_pred = y_scaler.inverse_transform(y_pred_av) y_test = y_scaler.inverse_transform(y_test) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) y_pred_train_rf = regr_rf.predict(X_train) y_pred_train = (y_pred_train + 
y_pred_train_rf.reshape(-1, 1)) / 2.0 train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))
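# The Tanimoto kernel used above is imported from the repository's `kernels` module.
# A minimal GPflow sketch of a Tanimoto (Jaccard) kernel over bit/count vectors,
# consistent with how it is used here (class name below is illustrative):
import tensorflow as tf
import gpflow
from gpflow.utilities import positive
from gpflow.utilities.ops import broadcasting_elementwise


class TanimotoSketch(gpflow.kernels.Kernel):
    """k(x, x') = sigma^2 * <x, x'> / (||x||^2 + ||x'||^2 - <x, x'>)."""

    def __init__(self):
        super().__init__()
        self.variance = gpflow.Parameter(1.0, transform=positive())

    def K(self, X, X2=None):
        if X2 is None:
            X2 = X
        Xs = tf.reduce_sum(tf.square(X), axis=-1)    # ||x||^2
        X2s = tf.reduce_sum(tf.square(X2), axis=-1)  # ||x'||^2
        outer = tf.tensordot(X, X2, [[-1], [-1]])    # <x, x'>
        denominator = -outer + broadcasting_elementwise(tf.add, Xs, X2s)
        return self.variance * outer / denominator

    def K_diag(self, X):
        # Tanimoto similarity of a vector with itself is 1, so the diagonal is the variance.
        return tf.fill(tf.shape(X)[:-1], tf.squeeze(self.variance))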
def main(path, path_to_dft_dataset, task, representation, theory_level): """ :param path: str specifying path to photoswitches.csv file. :param path_to_dft_dataset: str specifying path to dft_comparison.csv file. :param task: str specifying the task. e_iso_pi only supported task for the TD-DFT comparison. :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0'] """ data_loader = TaskDataLoader(task, path) smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset) X = featurise_mols(smiles_list, representation) # Keep only non-duplicate entries because we're not considering effects of solvent non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) X = X[non_duplicate_indices, :] experimental_vals = experimental_vals[non_duplicate_indices] non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) pbe0_vals = pbe0_vals[non_dup_pbe0] cam_vals = cam_vals[non_dup_cam] # molecules with dft values to be split into train/test if theory_level == 'CAM-B3LYP': X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0) y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals))) # DFT values for the CAM-B3LYP level of theory dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals))) # molecules with no dft vals must go into the training set. X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0) y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals))) else: X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0) y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals))) # DFT values for the PBE0 level of theory dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals))) # molecules with no dft vals must go into the training set. 
X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0) y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals))) mae_list = [] dft_mae_list = [] # We define the Gaussian Process optimisation objective m = None def objective_closure(): return -m.log_marginal_likelihood() print('\nBeginning training loop...') for i in range(len(y_with_dft)): X_train = np.delete(X_with_dft, i, axis=0) y_train = np.delete(y_with_dft, i) X_test = X_with_dft[i].reshape(1, -1) y_test = y_with_dft[i] dft_test = dft_vals[i] X_train = np.concatenate((X_train, X_no_dft)) y_train = np.concatenate((y_train, y_no_dft)) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) k = Tanimoto() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100)) print_summary(m) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test) # Output MAE for this trial mae = abs(y_test - y_pred) print("MAE: {}".format(mae)) # Store values in order to compute the mean and standard error of the statistics across trials mae_list.append(mae) # DFT prediction scores on the same trial dft_mae = abs(y_test - dft_test) dft_mae_list.append(dft_mae) mae_list = np.array(mae_list) dft_mae_list = np.array(dft_mae_list) print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list)))) print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list))))
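# `transform_data` is imported from `data_utils`. A sketch consistent with how it is
# called throughout these scripts (standardise the targets, optionally PCA-project the
# inputs, and return the fitted target scaler); the actual helper may differ in detail.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def transform_data_sketch(X_train, y_train, X_test, y_test, n_components=None, use_pca=False):
    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)
    if use_pca:
        pca = PCA(n_components=n_components)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    return X_train, y_train_scaled, X_test, y_test_scaled, y_scaler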
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence- error curves. True is the option for rmse. """ data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None # We define the Gaussian Process Regression Model using the Tanimoto kernel m = None def objective_closure(): return -m.log_marginal_likelihood() r2_list = [] rmse_list = [] mae_list = [] # We pre-allocate arrays for plotting confidence-error curves _, _, _, y_test = train_test_split( X, y, test_size=test_set_size) # To get test set size n_test = len(y_test) rmse_confidence_list = np.zeros((n_trials, n_test)) mae_confidence_list = np.zeros((n_trials, n_test)) print('\nBeginning training loop...') for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_set_size, random_state=i) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data( X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) k = Tanimoto() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=10000)) print_summary(m) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test) # Compute scores for confidence curve plotting. 
ranked_confidence_list = np.argsort(y_var, axis=0).flatten() for k in range(len(y_test)): # Construct the RMSE error for each level of confidence conf = ranked_confidence_list[0:k + 1] rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf])) rmse_confidence_list[i, k] = rmse # Construct the MAE error for each level of confidence mae = mean_absolute_error(y_test[conf], y_pred[conf]) mae_confidence_list[i, k] = mae # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt( mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list)))) # Plot confidence-error curves confidence_percentiles = np.arange( 1e-14, 100, 100 / len(y_test) ) # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29 if use_rmse_conf: rmse_mean = np.mean(rmse_confidence_list, axis=0) rmse_std = np.std(rmse_confidence_list, axis=0) # We flip because we want the most confident predictions on the right-hand side of the plot rmse_mean = np.flip(rmse_mean) rmse_std = np.flip(rmse_std) # One-sigma error bars lower = rmse_mean - rmse_std upper = rmse_mean + rmse_std plt.plot(confidence_percentiles, rmse_mean, label='mean') plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2) plt.xlabel('Confidence Percentile') plt.ylabel('RMSE (nm)') plt.ylim([0, np.max(upper) + 1]) plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))]) plt.yticks(np.arange(0, np.max(upper) + 1, 5.0)) plt.savefig( task + '/results/gpr/{}_confidence_curve_rmse.png'.format(representation)) plt.show() else: # We plot the Mean-absolute error confidence-error curves mae_mean = np.mean(mae_confidence_list, axis=0) mae_std = np.std(mae_confidence_list, axis=0) mae_mean = np.flip(mae_mean) mae_std = np.flip(mae_std) lower = mae_mean - mae_std upper = mae_mean + mae_std plt.plot(confidence_percentiles, mae_mean, label='mean') plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2) plt.xlabel('Confidence Percentile') plt.ylabel('MAE (nm)') plt.ylim([0, np.max(upper) + 1]) plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))]) plt.yticks(np.arange(0, np.max(upper) + 1, 5.0)) plt.savefig( task + '/results/gpr/{}_confidence_curve_mae.png'.format(representation)) plt.show()
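# The confidence-error curve above ranks test points by predictive variance (smallest
# variance = most confident) and accumulates the error over increasingly large
# most-confident subsets. A toy illustration of the ranking step:
import numpy as np

y_var_toy = np.array([[0.5], [0.1], [0.9]])        # predictive variances
ranked = np.argsort(y_var_toy, axis=0).flatten()   # [1, 0, 2]: most confident first
for k in range(len(ranked)):
    subset = ranked[0:k + 1]
    print(f'{k + 1} most-confident test indices:', subset)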
def main(task, path, representation, use_pca, test_set_size, r_size, det_encoder_n_hidden, lat_encoder_n_hidden, decoder_n_hidden): """ :param task: str specifying the task name. One of [Photoswitch, ESOL, FreeSolv, Lipophilicity]. :param path: str specifying the path to the photoswitches.csv file :param representation: str specifying the representation. One of [fingerprints, fragments, fragprints] :param use_pca: bool specifying whether or not to use PCA to perform Principal Components Regression :param test_set_size: float specifying the train/test split ratio. e.g. 0.2 is 80/20 train/test split :param r_size: Dimensionality of context encoding r. :param det_encoder_n_hidden: Number of deterministic encoder hidden layers. :param lat_encoder_n_hidden: Number of latent encoder hidden layers. :param decoder_n_hidden: Number of decoder hidden layers. :return: """ data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) y_size = 1 # If True we perform Principal Components Regression if use_pca: n_components = 50 else: n_components = None print('\nBeginning training loop...') j = 0 # index for saving results X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_set_size, random_state=42) if task != 'Photoswitch': # Artificially create a 80/10/10 train/validation/test split discarding the validation set. split_in_two = int(len(y_test) / 2) X_test = X_test[0:split_in_two] y_test = y_test[0:split_in_two] else: # We subdivide the train set in order to run cross-validation. X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged X_train, y_train, X_test, _, y_scaler = transform_data( X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = torch.from_numpy(X_train).float().unsqueeze(dim=0) X_test = torch.from_numpy(X_test).float().unsqueeze(dim=0) y_train = torch.from_numpy(y_train).float().unsqueeze(dim=0) det_encoder_hidden_sizes = [8, 16] lat_encoder_hidden_sizes = [8, 16] decoder_hidden_sizes = [8, 16] learning_rates = [0.01, 0.001] batch_sizes = [16, 32] iteration_numbers = [250, 500] best_rmse = 10000000 # a big number best_params = { 'det_encs': 0, 'lat_encs': 0, 'dec_hid': 0, 'lr': 0, 'batch_size': 0, 'iterations': 0 } for det_encs in det_encoder_hidden_sizes: for lat_encs in lat_encoder_hidden_sizes: for dec_hid in decoder_hidden_sizes: for l_rate in learning_rates: for batch_s in batch_sizes: for iter_num in iteration_numbers: m = AttentiveNP( x_size=X_train.shape[2], y_size=y_size, r_size=r_size, det_encoder_hidden_size=det_encs, det_encoder_n_hidden=det_encoder_n_hidden, lat_encoder_hidden_size=lat_encs, lat_encoder_n_hidden=lat_encoder_n_hidden, decoder_hidden_size=dec_hid, decoder_n_hidden=decoder_n_hidden, lr=l_rate, attention_type="multihead") print('...training.') m.train(X_train, y_train, batch_size=batch_s, iterations=iter_num, print_freq=None) # Now, the context set comprises the training x / y values, the target set comprises the test x values. 
y_pred, y_var = m.predict(X_train, y_train, X_test, n_samples=100) # Output Standardised RMSE and RMSE on Train Set score = r2_score( y_test, y_scaler.inverse_transform(y_pred)) rmse = np.sqrt( mean_squared_error( y_test, y_scaler.inverse_transform(y_pred))) mae = mean_absolute_error( y_test, y_scaler.inverse_transform(y_pred)) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) if rmse < best_rmse: best_rmse = rmse best_params['det_encs'] = det_encs best_params['lat_encs'] = lat_encs best_params['dec_hid'] = dec_hid best_params['lr'] = l_rate best_params['batch_size'] = batch_s best_params['iterations'] = iter_num print('Best parameters are \n') print(best_params) print('Final best parameters are \n') print(best_params) with open(f'cross_val_hypers/{task}/ANP/hypers_{representation}.txt', 'w') as f: f.write(str(best_params))
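# The six nested hyperparameter loops above can be expressed more compactly with
# itertools.product; a sketch of the equivalent iteration over the same grid:
from itertools import product

det_encoder_hidden_sizes = [8, 16]
lat_encoder_hidden_sizes = [8, 16]
decoder_hidden_sizes = [8, 16]
learning_rates = [0.01, 0.001]
batch_sizes = [16, 32]
iteration_numbers = [250, 500]

for det_encs, lat_encs, dec_hid, l_rate, batch_s, iter_num in product(
        det_encoder_hidden_sizes, lat_encoder_hidden_sizes, decoder_hidden_sizes,
        learning_rates, batch_sizes, iteration_numbers):
    pass  # build the AttentiveNP, train, evaluate and track best_rmse exactly as above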
def main(path, path_to_dft_dataset, representation, theory_level): """ :param path: str specifying path to photoswitches.csv file. :param path_to_dft_dataset: str specifying path to dft_comparison.csv file. :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0'] """ task = 'e_iso_pi' # e_iso_pi only task supported for TD-DFT comparison data_loader = TaskDataLoader(task, path) smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset) X = featurise_mols(smiles_list, representation) # Keep only non-duplicate entries because we're not considering effects of solvent non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) X = X[non_duplicate_indices, :] experimental_vals = experimental_vals[non_duplicate_indices] non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) pbe0_vals = pbe0_vals[non_dup_pbe0] cam_vals = cam_vals[non_dup_cam] # molecules with dft values to be split into train/test if theory_level == 'CAM-B3LYP': X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0) y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals))) # DFT values for the CAM-B3LYP level of theory dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals))) # molecules with no dft vals must go into the training set. X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0) y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals))) else: X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0) y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals))) # DFT values for the PBE0 level of theory dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals))) # molecules with no dft vals must go into the training set. X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0) y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals))) # Load in the other property values for multitask learning. e_iso_pi is a always the task in this instance. data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data() smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) y_e_iso_n = y_e_iso_n.reshape(-1, 1) y_z_iso_n = y_z_iso_n.reshape(-1, 1) X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) output_dim = 4 # Number of outputs rank = 1 # Rank of W feature_dim = len(X_no_dft[0, :]) tanimoto_active_dims = [i for i in range(feature_dim)] # active dims for Tanimoto base kernel. 
mae_list = [] dft_mae_list = [] # We define the Gaussian Process optimisation objective m = None def objective_closure(): return -m.log_marginal_likelihood() print('\nBeginning training loop...') for i in range(len(y_with_dft)): X_train = np.delete(X_with_dft, i, axis=0) y_train = np.delete(y_with_dft, i) X_test = X_with_dft[i].reshape(1, -1) y_test = y_with_dft[i] dft_test = dft_vals[i] X_train = np.concatenate((X_train, X_no_dft)) y_train = np.concatenate((y_train, y_no_dft)) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1), np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1) X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))), np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) y_test = np.hstack((y_test, np.zeros_like(y_test))) # Base kernel k = Tanimoto(active_dims=tanimoto_active_dims) #set_trainable(k.variance, False) # Coregion kernel coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim]) # Create product kernel kern = k * coreg # This likelihood switches between Gaussian noise with different variances for each f_i: lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()]) # now build the GP model as normal m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik) # fit the covariance function parameters maxiter = ci_niter(1000) gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B",) print_summary(m) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) # Output MAE for this trial mae = abs(y_test[:, 0] - y_pred) print("MAE: {}".format(mae)) # Store values in order to compute the mean and standard error of the statistics across trials mae_list.append(mae) # DFT prediction scores on the same trial dft_mae = abs(y_test[:, 0] - dft_test) dft_mae_list.append(dft_mae) mae_list = np.array(mae_list) dft_mae_list = np.array(dft_mae_list) print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list)))) print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list))))
def main(path, task, graph_type):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param graph_type: str. either 'bigraph' or 'complete'
    """

    data_loader = TaskDataLoader(task, path)
    X, y = data_loader.load_property_data()
    X = [Chem.MolFromSmiles(m) for m in X]

    # Collate Function for Dataloader
    def collate(sample):
        graphs, labels = map(list, zip(*sample))
        batched_graph = dgl.batch(graphs)
        batched_graph.set_n_initializer(dgl.init.zero_initializer)
        batched_graph.set_e_initializer(dgl.init.zero_initializer)
        return batched_graph, torch.tensor(labels)

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)

    X_full, _, y_full, _ = train_test_split(X, y, test_size=0.2, random_state=30)
    y_full = y_full.reshape(-1, 1)

    # We standardise the outputs but leave the inputs unchanged
    y_scaler = StandardScaler()
    y_full = torch.Tensor(y_scaler.fit_transform(y_full))

    # Set up cross-validation splits
    n_splits = 5
    kf = KFold(n_splits=n_splits)

    X_train_splits = []
    y_train_splits = []
    X_val_splits = []
    y_val_splits = []

    for train_index, test_index in kf.split(X_full):
        X_train, X_val = np.array(X_full)[train_index], np.array(X_full)[test_index]
        y_train, y_val = y_full[train_index], y_full[test_index]

        # Create graphs and labels
        if graph_type == 'complete':
            X_train = [mol_to_complete_graph(m, node_featurizer=atom_featurizer) for m in X_train]
            X_val = [mol_to_complete_graph(m, node_featurizer=atom_featurizer) for m in X_val]
        elif graph_type == 'bigraph':
            # Featurise the training molecules of this fold only (previously this iterated over the full list X)
            X_train = [mol_to_bigraph(m, node_featurizer=atom_featurizer) for m in X_train]
            X_val = [mol_to_bigraph(m, node_featurizer=atom_featurizer) for m in X_val]

        X_train_splits.append(X_train)
        X_val_splits.append(X_val)
        y_train_splits.append(y_train)
        y_val_splits.append(y_val)

    def lognuniform(low=1, high=5, size=None, base=10):
        return np.power(base, -np.random.uniform(low, high, size))

    best_rmse = 100000000

    for i in range(1000):

        num_layers = np.random.randint(1, 4)
        classifier_hidden_feats = np.random.randint(1, 128)
        hidden_feats = [np.random.choice([16, 32, 64])] * num_layers
        dropout = [np.random.uniform(0, 0.5)] * num_layers
        batchnorm = [np.random.choice([True, False])] * num_layers
        learning_rate = lognuniform()

        param_set = {'num_layers': num_layers,
                     'classifier_hidden_feats': classifier_hidden_feats,
                     'hidden_feats': hidden_feats,
                     'dropout': dropout,
                     'batchnorm': batchnorm,
                     'lr': learning_rate}

        print(f'\nParameter set in trial {i} is \n')
        print(param_set)
        print('\n')

        cv_rmse_list = []

        for j in range(n_splits):

            X_train = X_train_splits[j]
            y_train = y_train_splits[j]
            X_val = X_val_splits[j]
            y_val = y_val_splits[j]

            train_data = list(zip(X_train, y_train))
            test_data = list(zip(X_val, y_val))

            train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate, drop_last=False)
            test_loader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate, drop_last=False)

            gcn_net = GCNPredictor(in_feats=n_feats,
                                   hidden_feats=hidden_feats,
                                   batchnorm=batchnorm,
                                   dropout=dropout,
                                   classifier_hidden_feats=classifier_hidden_feats)
            gcn_net.to(device)

            loss_fn = MSELoss()
            optimizer = torch.optim.Adam(gcn_net.parameters(), lr=learning_rate)

            gcn_net.train()

            epoch_losses = []
            epoch_rmses = []
            for epoch in range(1, 501):
                epoch_loss = 0
                preds = []
                labs = []
                # Use a distinct loop variable so the hyperparameter trial index i is not shadowed
                for batch_id, (bg, labels) in enumerate(train_loader):
                    labels = labels.to(device)
                    atom_feats = bg.ndata.pop('h').to(device)
                    atom_feats, labels = atom_feats.to(device), labels.to(device)
                    y_pred = gcn_net(bg, atom_feats)
                    labels = labels.unsqueeze(dim=1)
                    loss = loss_fn(y_pred, labels)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.detach().item()

                    # Inverse transform to get RMSE
                    labels = y_scaler.inverse_transform(labels.reshape(-1, 1))
                    y_pred = y_scaler.inverse_transform(y_pred.detach().numpy().reshape(-1, 1))

                    # store labels and preds
                    preds.append(y_pred)
                    labs.append(labels)

                labs = np.concatenate(labs, axis=None)
                preds = np.concatenate(preds, axis=None)
                pearson, p = pearsonr(preds, labs)
                mae = mean_absolute_error(preds, labs)
                rmse = np.sqrt(mean_squared_error(preds, labs))
                r2 = r2_score(labs, preds)  # ground truth is the first argument
                epoch_loss /= (batch_id + 1)

                if epoch % 20 == 0:
                    print(f"epoch: {epoch}, "
                          f"LOSS: {epoch_loss:.3f}, "
                          f"RMSE: {rmse:.3f}, "
                          f"MAE: {mae:.3f}, "
                          f"R: {pearson:.3f}, "
                          f"R2: {r2:.3f}")

                epoch_losses.append(epoch_loss)
                epoch_rmses.append(rmse)

            # Evaluate
            gcn_net.eval()
            preds = []
            labs = []
            for batch_id, (bg, labels) in enumerate(test_loader):
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                atom_feats, labels = atom_feats.to(device), labels.to(device)
                y_pred = gcn_net(bg, atom_feats)
                labels = labels.unsqueeze(dim=1)

                # Inverse transform to get RMSE
                labels = y_scaler.inverse_transform(labels.reshape(-1, 1))
                y_pred = y_scaler.inverse_transform(y_pred.detach().numpy().reshape(-1, 1))

                preds.append(y_pred)
                labs.append(labels)

            preds = np.concatenate(preds, axis=None)
            labs = np.concatenate(labs, axis=None)

            pearson, p = pearsonr(preds, labs)
            mae = mean_absolute_error(preds, labs)
            rmse = np.sqrt(mean_squared_error(preds, labs))
            cv_rmse_list.append(rmse)
            r2 = r2_score(labs, preds)  # ground truth is the first argument

            print(f'Test RMSE: {rmse:.3f}, MAE: {mae:.3f}, R: {pearson:.3f}, R2: {r2:.3f}')

        param_rmse = np.mean(cv_rmse_list)

        if param_rmse < best_rmse:
            best_rmse = param_rmse
            best_params = param_set

    print('Best RMSE and best params \n')
    print(best_rmse)
    print(best_params)

    # np.savetxt cannot serialise a dict; write the parameter dict out as text instead
    with open('saved_hypers/GCN', 'w') as f:
        f.write(str(best_params))
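# lognuniform(low=1, high=5, base=10) above returns 10**(-u) with u ~ Uniform(1, 5),
# i.e. learning rates spread log-uniformly between 1e-5 and 1e-1. A quick check:
import numpy as np

samples = np.power(10.0, -np.random.uniform(1, 5, size=1000))
print(samples.min() >= 1e-5, samples.max() <= 1e-1)  # True True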
import pandas as pd

from sklearn.metrics import mean_squared_error

from data_utils import TaskDataLoader, featurise_mols
from kernels import Tanimoto

representation = 'fragprints'
task = 'e_iso_pi'
path = '../dataset/photoswitches.csv'
df = pd.read_csv('../dataset/purchasable_switch.csv')
candidate_list = df['SMILES'].to_list()

if __name__ == '__main__':

    X_test = featurise_mols(candidate_list, representation)

    data_loader_e_iso_pi = TaskDataLoader('e_iso_pi', path)
    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    smiles_list_e_iso_pi, y_e_iso_pi = data_loader_e_iso_pi.load_property_data()
    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_e_iso_pi = y_e_iso_pi.reshape(-1, 1)
    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_e_iso_pi = featurise_mols(smiles_list_e_iso_pi, representation)
def main(path, task, representation, use_pca, n_trials, test_set_size): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set """ data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None r2_list = [] rmse_list = [] mae_list = [] print('\nBeginning training loop...') for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_set_size, random_state=i) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data( X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) gp_kernel = TanimotoKernel() gpr = GaussianProcessRegressor(kernel=gp_kernel) gpr.fit(X_train, y_train) # mean GP prediction X_test = np.tile(X_test, (10000, 1)) import time start = time.time() y_pred = gpr.predict(X_test, return_std=False) end = time.time() print(f'time elapsed is {end - start}') y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test) # Output Standardised RMSE and RMSE on Train Set y_pred_train = gpr.predict(X_train, return_std=False) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt( mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
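# `TanimotoKernel` above is the scikit-learn-compatible kernel imported from the
# repository's `kernels` module (distinct from the GPflow Tanimoto kernel used in the
# other scripts). A minimal sketch of such a kernel, assuming binary fingerprint inputs
# and no trainable hyperparameters (the class name below is illustrative):
import numpy as np
from sklearn.gaussian_process.kernels import Kernel


class TanimotoKernelSketch(Kernel):
    """Tanimoto (Jaccard) similarity kernel for bit-vector inputs."""

    def __call__(self, X, Y=None, eval_gradient=False):
        if Y is None:
            Y = X
        outer = X @ Y.T
        denom = np.square(X).sum(axis=1)[:, None] + np.square(Y).sum(axis=1)[None, :] - outer
        K = outer / denom
        if eval_gradient:
            # No hyperparameters, so the gradient has zero trailing dimensions.
            return K, np.empty((X.shape[0], X.shape[0], 0))
        return K

    def diag(self, X):
        return np.ones(X.shape[0])

    def is_stationary(self):
        return False


# Example usage; optimizer=None since there are no kernel hyperparameters to tune:
# gpr_sketch = GaussianProcessRegressor(kernel=TanimotoKernelSketch(), optimizer=None)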
def main(path, task, representation, use_pca, n_trials, test_set_size): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set. """ data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) if use_pca: n_components = 50 else: n_components = None r2_list = [] rmse_list = [] mae_list = [] print('\nBeginning training loop...') for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_set_size, random_state=i) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) X_train, y_train, X_test, y_test, y_scaler = transform_data( X_train, y_train, X_test, y_test, n_components, use_pca) regr_rf = RandomForestRegressor(n_estimators=1519, random_state=4, max_features=0.086, bootstrap=False, min_samples_leaf=2) regr_rf.fit(X_train, y_train) # Output Standardised RMSE and RMSE on Train Set y_pred_train = regr_rf.predict(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt( mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) # Predict on new data y_rf = regr_rf.predict(X_test) y_rf = y_scaler.inverse_transform(y_rf) y_test = y_scaler.inverse_transform(y_test) score = r2_score(y_test, y_rf) rmse = np.sqrt(mean_squared_error(y_test, y_rf)) mae = mean_absolute_error(y_test, y_rf) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
def main(path, task, representation, use_pca, n_trials, test_set_size): """ Train a multioutput GP simultaneously on all tasks of the photoswitch dataset. :param path: str specifying path to dataset. :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set """ # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None data_loader_e_iso_pi = TaskDataLoader('e_iso_pi', path) data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) smiles_list_e_iso_pi, y_e_iso_pi = data_loader_e_iso_pi.load_property_data( ) smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data( ) smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() y_e_iso_pi = y_e_iso_pi.reshape(-1, 1) y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) y_e_iso_n = y_e_iso_n.reshape(-1, 1) y_z_iso_n = y_z_iso_n.reshape(-1, 1) X_e_iso_pi = featurise_mols(smiles_list_e_iso_pi, representation) X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) output_dim = 4 # Number of outputs rank = 1 # Rank of W feature_dim = len(X_e_iso_pi[0, :]) tanimoto_active_dims = [i for i in range(feature_dim) ] # active dims for Tanimoto base kernel. 
r2_list = [] rmse_list = [] mae_list = [] print('\nBeginning training loop...') for i in range(0, n_trials): if task == 'e_iso_pi': X_task = X_e_iso_pi y_task = y_e_iso_pi elif task == 'z_iso_pi': X_task = X_z_iso_pi y_task = y_z_iso_pi elif task == 'e_iso_n': X_task = X_e_iso_n y_task = y_e_iso_n else: X_task = X_z_iso_n y_task = y_z_iso_n X_train, X_test, y_train, y_test = train_test_split( X_task, y_task, test_size=test_set_size, random_state=i) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) if task == 'e_iso_pi': # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1), np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), np.append(X_e_iso_n, np.ones( (len(X_e_iso_n), 1)) * 2, axis=1), np.append(X_z_iso_n, np.ones( (len(X_z_iso_n), 1)) * 3, axis=1))) X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1) X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_train, np.zeros_like(y_train))), np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) y_test = np.hstack((y_test, np.zeros_like(y_test))) elif task == 'z_iso_pi': # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), np.append(X_train, np.ones((len(X_train), 1)), axis=1), np.append(X_e_iso_n, np.ones( (len(X_e_iso_n), 1)) * 2, axis=1), np.append(X_z_iso_n, np.ones( (len(X_z_iso_n), 1)) * 3, axis=1))) X_test = np.append(X_test, np.ones((len(X_test), 1)), axis=1) X_train = np.append(X_train, np.ones((len(X_train), 1)), axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), np.hstack((y_train, np.ones_like(y_train))), np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) y_test = np.hstack((y_test, np.ones_like(y_test))) elif task == 'e_iso_n': # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1), np.append(X_z_iso_n, np.ones( (len(X_z_iso_n), 1)) * 3, axis=1))) X_test = np.append(X_test, np.ones((len(X_test), 1)) * 2, axis=1) X_train = np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), np.hstack((y_train, np.ones_like(y_train) * 2)), np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) y_test = np.hstack((y_test, np.ones_like(y_test) * 2)) else: # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), np.append(X_e_iso_n, 
np.ones( (len(X_e_iso_n), 1)) * 2, axis=1), np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1))) X_test = np.append(X_test, np.ones((len(X_test), 1)) * 3, axis=1) X_train = np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1) # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods Y_augmented = np.vstack( (np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), np.hstack((y_train, np.ones_like(y_train) * 3)))) y_test = np.hstack((y_test, np.ones_like(y_test) * 3)) # Base kernel k = Tanimoto(active_dims=tanimoto_active_dims) #set_trainable(k.variance, False) # Coregion kernel coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim]) # Create product kernel kern = k * coreg # This likelihood switches between Gaussian noise with different variances for each f_i: lik = gpflow.likelihoods.SwitchedLikelihood([ gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian() ]) # now build the GP model as normal m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik) # fit the covariance function parameters maxiter = ci_niter(1000) gpflow.optimizers.Scipy().minimize( m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B", ) print_summary(m) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test[:, 0], y_pred) rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred)) mae = mean_absolute_error(y_test[:, 0], y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) B = coreg.output_covariance().numpy() print("B =", B) _ = plt.imshow(B) r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
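# The augmentation scheme above appends a task index as an extra input column (used by
# the Coregion kernel via active_dims) and as a second output column (used by the
# SwitchedLikelihood to select a noise model). A toy illustration of the data format,
# with placeholder features and label values:
import numpy as np

X_task0 = np.array([[1., 0., 1.], [0., 1., 1.]])   # two molecules from task 0
X_task1 = np.array([[1., 1., 0.]])                 # one molecule from task 1
y_task0 = np.array([[450.], [380.]])
y_task1 = np.array([[520.]])

X_aug = np.vstack((np.append(X_task0, np.zeros((len(X_task0), 1)), axis=1),
                   np.append(X_task1, np.ones((len(X_task1), 1)), axis=1)))
Y_aug = np.vstack((np.hstack((y_task0, np.zeros_like(y_task0))),
                   np.hstack((y_task1, np.ones_like(y_task1)))))

print(X_aug)  # last column is the task index, e.g. Coregion(..., active_dims=[3])
print(Y_aug)  # second column selects the Gaussian likelihood in SwitchedLikelihood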
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf, precompute_repr): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity'] :param representation: str specifying the molecular representation. One of ['SMILES, fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param n_trials: int specifying number of random train/test splits to use :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence- error curves. True is the option for rmse. :param precompute_repr: bool indicating whether to precompute representations or not. """ data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) if precompute_repr: if representation == 'SMILES': with open( f'precomputed_representations/{task}_{representation}.txt', 'w') as f: for smiles in X: f.write(smiles + '\n') else: np.savetxt( f'precomputed_representations/{task}_{representation}.txt', X) # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None r2_list = [] rmse_list = [] mae_list = [] # We pre-allocate arrays for plotting confidence-error curves _, _, _, y_test = train_test_split(X, y, test_size=test_set_size, random_state=42) # To get test set size # Photoswitch dataset requires 80/20 splitting. Other datasets are 80/10/10. if task != 'Photoswitch': split_in_two = int(len(y_test) / 2) n_test = split_in_two else: n_test = len(y_test) rmse_confidence_list = np.zeros((n_trials, n_test)) mae_confidence_list = np.zeros((n_trials, n_test)) # For Calibration curve prediction_prop = [[] for _ in range(n_trials)] print('\nBeginning training loop...') for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_set_size, random_state=i) if representation == 'SMILES': np.savetxt(f'fixed_train_test_splits/{task}/X_train_split_{i}.txt', X_train, fmt="%s") np.savetxt(f'fixed_train_test_splits/{task}/X_test_split_{i}.txt', X_test, fmt="%s") np.savetxt(f'fixed_train_test_splits/{task}/y_train_split_{i}.txt', y_train) np.savetxt(f'fixed_train_test_splits/{task}/y_test_split_{i}.txt', y_test) else: if task != 'Photoswitch': # Artificially create a 80/10/10 train/validation/test split discarding the validation set. split_in_two = int(len(y_test) / 2) X_test = X_test[0:split_in_two] y_test = y_test[0:split_in_two] y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data( X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) np.random.seed(42) datasets, n, d, mean_y_train, std_y_train = load_reg_data( X_train, y_train, X_test, y_test) train_set_x, train_set_y = datasets[0] test_set_x, test_set_y = datasets[1] N_train = train_set_x.get_value(borrow=True).shape[0] N_test = test_set_x.get_value(borrow=True).shape[0] layer_sizes = [d, 20, 20, len(mean_y_train)] n_samples = 100 alpha = 0.5 learning_rate = 0.01 v_prior = 1.0 batch_size = 32 print('... 
building model') sys.stdout.flush() bb_alpha = BB_alpha(layer_sizes, n_samples, alpha, learning_rate, v_prior, batch_size, train_set_x, train_set_y, N_train, test_set_x, test_set_y, N_test, mean_y_train, std_y_train) print('... training') sys.stdout.flush() test_error, test_ll = bb_alpha.train_ADAM(100) print('Test RMSE: ', test_error) print('Test ll: ', test_ll) samples = bb_alpha.sample_predictive_distribution(X_test) y_pred = np.mean(samples, axis=0) var = np.var(samples, axis=0) # For producing the calibration curve for k in [ 0.13, 0.26, 0.39, 0.53, 0.68, 0.85, 1.04, 1.15, 1.28, 1.44, 1.645, 1.96 ]: a = (y_scaler.inverse_transform(y_test) < y_scaler.inverse_transform(y_pred + k * np.sqrt(var))) b = (y_scaler.inverse_transform(y_test) > y_scaler.inverse_transform(y_pred - k * np.sqrt(var))) prediction_prop[i].append( np.argwhere((a == True) & (b == True)).shape[0] / len(y_test)) # We transform the standardised predictions back to the original data space y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test) # Compute scores for confidence curve plotting. ranked_confidence_list = np.argsort(var, axis=0).flatten() for k in range(len(y_test)): # Construct the RMSE error for each level of confidence conf = ranked_confidence_list[0:k + 1] rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf])) rmse_confidence_list[i, k] = rmse # Construct the MAE error for each level of confidence mae = mean_absolute_error(y_test[conf], y_pred[conf]) mae_confidence_list[i, k] = mae # Output Standardised RMSE and RMSE on Train Set train_samples = bb_alpha.sample_predictive_distribution(X_train) y_pred_train = np.mean(train_samples, axis=0) train_rmse_stan = np.sqrt(mean_squared_error( y_train, y_pred_train)) train_rmse = np.sqrt( mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) if representation != 'SMILES': r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list))) print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list))) print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list))) # Plot confidence-error curves confidence_percentiles = np.arange( 1e-14, 100, 100 / len(y_test) ) # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29 if use_rmse_conf: rmse_mean = np.mean(rmse_confidence_list, axis=0) rmse_std = np.std(rmse_confidence_list, axis=0) # We flip because we want the most confident predictions on the right-hand side of the plot rmse_mean = np.flip(rmse_mean) rmse_std = np.flip(rmse_std) # One-sigma error bars lower = rmse_mean - rmse_std upper = rmse_mean + rmse_std plt.plot(confidence_percentiles, rmse_mean, label='mean') plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2) plt.xlabel('Confidence Percentile') plt.ylabel('RMSE') plt.ylim([0, np.max(upper) + 1]) plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))]) plt.yticks(np.arange(0, np.max(upper) + 1, 5.0)) plt.savefig(task + 
'/results/BNN/{}_{}_confidence_curve_rmse.png'.format( representation, task)) plt.show() else: # We plot the Mean-absolute error confidence-error curves mae_mean = np.mean(mae_confidence_list, axis=0) mae_std = np.std(mae_confidence_list, axis=0) mae_mean = np.flip(mae_mean) mae_std = np.flip(mae_std) lower = mae_mean - mae_std upper = mae_mean + mae_std plt.plot(confidence_percentiles, mae_mean, label='mean') plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2) plt.xlabel('Confidence Percentile') plt.ylabel('MAE') plt.ylim([0, np.max(upper) + 1]) plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))]) plt.yticks(np.arange(0, np.max(upper) + 1, 5.0)) plt.savefig(task + '/results/BNN/{}_{}_confidence_curve_mae.png'.format( representation, task)) plt.show() # Plot the calibration curve mean_props = np.mean(prediction_prop, axis=0) sd_props = np.std(prediction_prop, axis=0) lower = mean_props - sd_props upper = mean_props + sd_props qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] plt.plot(qs, mean_props, label='mean') plt.fill_between(qs, lower, upper, alpha=0.2) plt.plot(qs, qs, color="red") plt.xlabel('q') plt.ylabel('C(q)') plt.savefig(task + '/results/BNN/{}_{}_calibration_curve.png'.format( representation, task)) plt.show() np.savetxt( task + '/results/BNN/{}_{}_mean_props'.format(representation, task), mean_props) np.savetxt( task + '/results/BNN/{}_{}_sd_props'.format(representation, task), sd_props)
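# The hard-coded k values in the calibration loop above correspond, up to rounding, to
# the two-sided Gaussian coverage levels for the quantiles qs used in the plot; they
# can be regenerated with scipy:
from scipy.stats import norm

qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
ks = [round(float(norm.ppf(0.5 + q / 2)), 3) for q in qs]
print(ks)  # approximately [0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.15, 1.28, 1.44, 1.645, 1.96]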
def main(path, path_to_large_dataset, task, representation, test_set_size, augment_photo_dataset, n_trials):
    """
    :param path: str giving path to the photoswitches.csv file.
    :param path_to_large_dataset: str giving path to paper_allDB.csv file
    :param task: str specifying the task. Always e_iso_pi for the generalization experiment
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param augment_photo_dataset: If True, augment the photoswitch dataset with the Beard et al. 2019 dataset
    :param n_trials: int specifying the number of random train/test splits.
    """

    data_loader = TaskDataLoader(task, path)
    photo_smiles_list, y_vals_photo = data_loader.load_property_data()
    beard_smiles_list, y_vals_beard = data_loader.load_large_comparison_data(path_to_large_dataset)

    r2_list = []
    rmse_list = []
    mae_list = []

    if not augment_photo_dataset:
        # The test set is now fixed, so a single trial suffices
        n_trials = 1
        # We train on the Beard dataset and test on the photoswitch dataset
        X_train = featurise_mols(beard_smiles_list, representation)
        X_test = featurise_mols(photo_smiles_list, representation)
        y_train = y_vals_beard
        y_test = y_vals_photo

    for i in range(0, n_trials):

        if augment_photo_dataset:
            # We add the Beard dataset as additional training data
            X_train, X_test, y_train, y_test = train_test_split(photo_smiles_list, y_vals_photo,
                                                                test_size=test_set_size, random_state=i)
            X_train = X_train + beard_smiles_list
            y_train = np.concatenate((y_train, y_vals_beard))
            X_train = featurise_mols(X_train, representation)
            X_test = featurise_mols(X_test, representation)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        X_train, y_train, X_test, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

        regr_rf = RandomForestRegressor(n_estimators=1000, max_depth=300, random_state=2)
        regr_rf.fit(X_train, y_train.ravel())  # ravel to pass a 1D target array to the random forest

        # Output Standardised RMSE and RMSE on Train Set
        y_pred_train = regr_rf.predict(X_train).reshape(-1, 1)  # reshape to 2D for inverse_transform
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train),
                                                y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # Predict on new data
        y_rf = regr_rf.predict(X_test).reshape(-1, 1)
        y_rf = y_scaler.inverse_transform(y_rf)
        y_test = y_scaler.inverse_transform(y_test)
        score = r2_score(y_test, y_rf)
        rmse = np.sqrt(mean_squared_error(y_test, y_rf))
        mae = mean_absolute_error(y_test, y_rf)

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
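# Note that the +- values printed above are standard errors of the mean across the n_trials splits
# (np.std / sqrt(n)), not raw standard deviations as in the earlier BNN evaluation script. A small helper
# that makes this explicit; the name report_mean_sem is illustrative and not part of the original scripts:

def report_mean_sem(name, values):
    """Print mean +- standard error of the mean for an array of per-trial scores."""
    values = np.asarray(values)
    sem = np.std(values) / np.sqrt(len(values))
    print("mean {}: {:.4f} +- {:.4f}".format(name, np.mean(values), sem))

# Example usage: report_mean_sem('RMSE', rmse_list)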
def main(path, representation): """ :param path: str specifying path to dataset. :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] """ task = 'e_iso_pi' # task always e_iso_pi with human performance comparison data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # 5 test molecules test_smiles = [ 'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1', 'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1', 'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2', 'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC', 'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC' ] # and their indices in the loaded data test_smiles_indices = [116, 131, 168, 221, 229] X_train = np.delete(X, np.array(test_smiles_indices), axis=0) y_train = np.delete(y, np.array(test_smiles_indices)) X_test = X[[116, 131, 168, 221, 229]] # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was # under a different solvent y_test = y[[116, 131, 168, 221, 229]] y_test[2] = 407. y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # # We standardise the outputs but leave the inputs unchanged # # _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data( ) smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) y_e_iso_n = y_e_iso_n.reshape(-1, 1) y_z_iso_n = y_z_iso_n.reshape(-1, 1) X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) output_dim = 4 # Number of outputs rank = 1 # Rank of W feature_dim = len(X_train[0, :]) tanimoto_active_dims = [i for i in range(feature_dim) ] # active dims for Tanimoto base kernel. 
    # We define the multioutput GP model using a product of the Tanimoto base kernel and a Coregion kernel

    # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
    X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1),
                             np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                             np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                             np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))

    X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
    X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

    # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
    Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))),
                             np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                             np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                             np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

    y_test = np.hstack((y_test, np.zeros_like(y_test)))

    # Base kernel
    k = Tanimoto(active_dims=tanimoto_active_dims)
    # set_trainable(k.variance, False)

    # Coregion kernel
    coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

    # Create product kernel
    kern = k * coreg

    # This likelihood switches between Gaussian noise with different variances for each f_i:
    lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
                                                 gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])

    # now build the GP model as normal
    m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])),
                          kernel=kern, likelihood=lik)

    # fit the covariance function parameters
    maxiter = ci_niter(1000)
    gpflow.optimizers.Scipy().minimize(m.training_loss,
                                       m.trainable_variables,
                                       options=dict(maxiter=maxiter),
                                       method="L-BFGS-B")
    print_summary(m)

    # mean and variance GP prediction
    y_pred, y_var = m.predict_f(X_test)

    # Output the Train RMSE. The outputs are left unstandardised in this script, so there is no separate
    # standardised train RMSE to report.
    y_pred_train, _ = m.predict_f(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    print("\nTrain RMSE: {:.3f}".format(train_rmse))

    r2 = r2_score(y_test[:, 0], y_pred)
    rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred))
    mae = mean_absolute_error(y_test[:, 0], y_pred)
    per_molecule = np.diag(abs(y_pred - y_test[:, 0]))

    print("\nAveraged test statistics are")
    print("\nR^2: {:.3f}".format(r2))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))
    print("\nAbsolute error per molecule is {} ".format(per_molecule))
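# The Tanimoto kernel used throughout these scripts is assumed to implement the Tanimoto (Jaccard) similarity
# between fingerprint vectors, scaled by a trainable variance:
#     k(x, x') = sigma^2 * <x, x'> / (||x||^2 + ||x'||^2 - <x, x'>)
# A minimal NumPy sketch of that base similarity for two fingerprint vectors (the GPflow kernel above
# additionally handles batches of inputs and the active_dims restriction):

def tanimoto_similarity(x, x_prime):
    """Tanimoto similarity between two fingerprint vectors (binary or count-based)."""
    cross = np.dot(x, x_prime)
    denominator = np.dot(x, x) + np.dot(x_prime, x_prime) - cross
    return cross / denominator

# In the multioutput model above this base similarity is multiplied by the Coregion kernel entry B[p, q],
# where p and q are the output indices appended as the final input column.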
def main(path, representation): """ :param path: str specifying path to dataset. :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] """ task = 'e_iso_pi' # Always e_iso_pi for human performance comparison data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # 5 test molecules test_smiles = [ 'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1', 'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1', 'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2', 'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC', 'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC' ] # and their indices in the loaded data test_smiles_indices = [116, 131, 168, 221, 229] X_train = np.delete(X, np.array(test_smiles_indices), axis=0) y_train = np.delete(y, np.array(test_smiles_indices)) X_test = X[[116, 131, 168, 221, 229]] # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was # under a different solvent y_test = y[[116, 131, 168, 221, 229]] y_test[2] = 407. y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) num_features = np.shape(X)[1] # We define the Gaussian Process Regression Model using the Tanimoto kernel m = None def objective_closure(): return -m.log_marginal_likelihood() # for plotting confidence-error curves rmse_confidence_list = [] mae_confidence_list = [] k = Tanimoto() m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100)) print_summary(m) # mean and variance GP prediction y_pred, y_var = m.predict_f(X_test) y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test) # Output Standardised RMSE and RMSE on Train Set y_pred_train, _ = m.predict_f(X_train) train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt( mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) r2 = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) per_molecule = abs(y_pred - y_test) print("\n Averaged test statistics are") print("\nR^2: {:.3f}".format(r2)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) print("\nAbsolute error per molecule is {} ".format(per_molecule))
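# The script above inverse-transforms only the predictive mean. If per-molecule error bars in the original
# units (nm) are wanted, the predictive variance can be rescaled as well; for a StandardScaler this amounts
# to multiplying the standard deviation by the scaler's scale_. A sketch, assuming y_scaler returned by
# transform_data is a scikit-learn StandardScaler, meant to sit inside main after y_pred and y_var are computed:

#     y_std_nm = np.sqrt(np.array(y_var)).flatten() * y_scaler.scale_[0]
#     for smiles, mean_nm, std_nm in zip(test_smiles, y_pred.flatten(), y_std_nm):
#         print('{}: predicted {:.1f} +- {:.1f} nm'.format(smiles, mean_nm, std_nm))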
def main(path, task, n_trials, test_set_size):
    """
    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = [Chem.MolFromSmiles(m) for m in smiles_list]

    # Collate Function for Dataloader
    def collate(sample):
        graphs, labels = map(list, zip(*sample))
        batched_graph = dgl.batch(graphs)
        batched_graph.set_n_initializer(dgl.init.zero_initializer)
        batched_graph.set_e_initializer(dgl.init.zero_initializer)
        return batched_graph, torch.tensor(labels)

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()

    e_feats = bond_featurizer.feat_size('e')
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)

    X = [mol_to_bigraph(m, node_featurizer=atom_featurizer, edge_featurizer=bond_featurizer) for m in X]

    r2_list = []
    rmse_list = []
    mae_list = []
    skipped_trials = 0

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i + 5)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        # We standardise the outputs but leave the inputs unchanged
        y_scaler = StandardScaler()
        y_train_scaled = torch.Tensor(y_scaler.fit_transform(y_train))
        y_test_scaled = torch.Tensor(y_scaler.transform(y_test))

        train_data = list(zip(X_train, y_train_scaled))
        test_data = list(zip(X_test, y_test_scaled))

        train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate, drop_last=False)
        test_loader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate, drop_last=False)

        gat_net = GATPredictor(in_feats=n_feats)
        gat_net.to(device)

        loss_fn = MSELoss()
        optimizer = torch.optim.Adam(gat_net.parameters(), lr=0.001)

        gat_net.train()

        epoch_losses = []
        epoch_rmses = []
        for epoch in range(1, 201):
            epoch_loss = 0
            preds = []
            labs = []
            # batch_num is used here so that the trial index i above is not shadowed
            for batch_num, (bg, labels) in enumerate(train_loader):
                labels = labels.to(device)
                atom_feats = bg.ndata.pop('h').to(device)
                bond_feats = bg.edata.pop('e').to(device)
                y_pred = gat_net(bg, atom_feats)
                labels = labels.unsqueeze(dim=1)
                loss = loss_fn(y_pred, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_loss += loss.detach().item()

                # Inverse transform to get RMSE in the original units
                labels = y_scaler.inverse_transform(labels.cpu().numpy().reshape(-1, 1))
                y_pred = y_scaler.inverse_transform(y_pred.detach().cpu().numpy().reshape(-1, 1))

                # store labels and preds
                preds.append(y_pred)
                labs.append(labels)

            labs = np.concatenate(labs, axis=None)
            preds = np.concatenate(preds, axis=None)

            pearson, p = pearsonr(preds, labs)
            mae = mean_absolute_error(labs, preds)
            rmse = np.sqrt(mean_squared_error(labs, preds))
            r2 = r2_score(labs, preds)  # r2_score expects (y_true, y_pred)

            epoch_loss /= (batch_num + 1)
            if epoch % 20 == 0:
                print(f"epoch: {epoch}, "
                      f"LOSS: {epoch_loss:.3f}, "
                      f"RMSE: {rmse:.3f}, "
                      f"MAE: {mae:.3f}, "
                      f"R: {pearson:.3f}, "
                      f"R2: {r2:.3f}")
            epoch_losses.append(epoch_loss)
            epoch_rmses.append(rmse)

        # Discount the trial if the train R^2 finishes negative (the optimiser failed to fit).
        if r2 < 0:
            skipped_trials += 1
            print('Skipped trials is {}'.format(skipped_trials))
            continue

        # Evaluate
        gat_net.eval()

        preds = []
        labs = []
        for batch_num, (bg, labels) in enumerate(test_loader):
            labels = labels.to(device)
            atom_feats = bg.ndata.pop('h').to(device)
            bond_feats = bg.edata.pop('e').to(device)
            y_pred = gat_net(bg, atom_feats)
            labels = labels.unsqueeze(dim=1)

            # Inverse transform to get RMSE in the original units
            labels = y_scaler.inverse_transform(labels.cpu().numpy().reshape(-1, 1))
            y_pred = y_scaler.inverse_transform(y_pred.detach().cpu().numpy().reshape(-1, 1))

            preds.append(y_pred)
            labs.append(labels)

        labs = np.concatenate(labs, axis=None)
        preds = np.concatenate(preds, axis=None)

        pearson, p = pearsonr(preds, labs)
        mae = mean_absolute_error(labs, preds)
        rmse = np.sqrt(mean_squared_error(labs, preds))
        r2 = r2_score(labs, preds)

        r2_list.append(r2)
        rmse_list.append(rmse)
        mae_list.append(mae)

        print(f'Test RMSE: {rmse:.3f}, MAE: {mae:.3f}, R: {pearson:.3f}, R2: {r2:.3f}')

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))

    print("\nSkipped trials is {}".format(skipped_trials))
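# A sketch of how a single new molecule could be scored with a trained gat_net from the loop above. It reuses
# the imports and the device, featuriser and scaler objects defined in that script; the helper name and the
# idea of single-molecule inference are illustrative additions, not part of the original experiment:

def predict_single_smiles(smiles, gat_net, y_scaler, atom_featurizer, bond_featurizer):
    """Featurise one SMILES string, run the GAT and return the prediction in original units."""
    mol = Chem.MolFromSmiles(smiles)
    graph = mol_to_bigraph(mol, node_featurizer=atom_featurizer, edge_featurizer=bond_featurizer)
    bg = dgl.batch([graph])
    atom_feats = bg.ndata.pop('h').to(device)
    gat_net.eval()
    with torch.no_grad():
        y_scaled = gat_net(bg, atom_feats)
    return y_scaler.inverse_transform(y_scaled.cpu().numpy().reshape(-1, 1))[0, 0]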
representation = 'fragprints' task = 'e_iso_pi' path = '../dataset/photoswitches.csv' # New candidates to predict wavelength values for # candidate_list = ['O=C(OC)C(C=C1)=CC=C1C2=C[N-][N+]3=C(C=CN32)/N=N/C4=CC=CC=C4', # 'O=C(OC)C(C=C1)=CC=C1C2=CN3[N+]([N-]2)=CC=C3/N=N/C4=CC=CC=C4'] df = pd.read_csv('../dataset/purchasable_switch.csv') candidate_list = df['SMILES'].to_list() if __name__ == '__main__': data_loader = TaskDataLoader(task, path) smiles_list, y_train = data_loader.load_property_data() X_train = featurise_mols(smiles_list, representation) X_test = featurise_mols(candidate_list, representation) num_features = np.shape(X_train)[1] # We define the Gaussian Process Regression Model using the Tanimoto kernel m = None def objective_closure(): return -m.log_marginal_likelihood() # We standardise the outputs but leave the inputs unchanged. Equivalent to transform data used in other scripts.
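    # A sketch of the standardisation step referred to in the comment above (the remainder of this script is
    # not shown here); it mirrors what transform_data does in the other scripts, assuming a scikit-learn
    # StandardScaler is acceptable:
    from sklearn.preprocessing import StandardScaler

    y_train = y_train.reshape(-1, 1)
    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)
    # Predictions made in the standardised space can then be mapped back with y_scaler.inverse_transform.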
def main(path, task, representation, use_pca, test_set_size): """ :param path: str specifying path to dataset. :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity'] :param representation: str specifying the molecular representation. One of [fingerprints, 'fragments', 'fragprints'] :param use_pca: bool. If True apply PCA to perform Principal Components Regression. :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set """ if representation == 'SMILES': raise Exception('SMILES is not a valid representation for the BNN') data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) # If True we perform Principal Components Regression if use_pca: n_components = 100 else: n_components = None X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_set_size, random_state=42) if task != 'Photoswitch': # Artificially create a 80/10/10 train/validation/test split discarding the validation set. split_in_two = int(len(y_test) / 2) X_test = X_test[0:split_in_two] y_test = y_test[0:split_in_two] else: # We subdivide the train set in order to run cross-validation. X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = X_train.astype(np.float64) X_test = X_test.astype(np.float64) datasets, n, d, mean_y_train, std_y_train = load_reg_data( X_train, y_train, X_test, y_test) train_set_x, train_set_y = datasets[0] test_set_x, test_set_y = datasets[1] layer_sizes = [10, 20] learning_rates = [0.01, 0.001] batch_sizes = [16, 32, 64] iters = [20, 50, 100] best_rmse = 10000000 # a large number best_params = {'layer_size': 0, 'lr': 0, 'batch_size': 0, 'iterations': 0} for layer_size in layer_sizes: for lr in learning_rates: for batch_size in batch_sizes: for iteration in iters: N_train = train_set_x.get_value(borrow=True).shape[0] N_test = test_set_x.get_value(borrow=True).shape[0] layer_sizes = [ d, layer_size, layer_size, len(mean_y_train) ] n_samples = 100 alpha = 0.5 learning_rate = lr v_prior = 1.0 batch_size = batch_size print('... building model') sys.stdout.flush() bb_alpha = BB_alpha(layer_sizes, n_samples, alpha, learning_rate, v_prior, batch_size, train_set_x, train_set_y, N_train, test_set_x, test_set_y, N_test, mean_y_train, std_y_train) print('... 
training') sys.stdout.flush() test_error, test_ll = bb_alpha.train_ADAM(iteration) print('Test RMSE: ', test_error) print('Test ll: ', test_ll) samples = bb_alpha.sample_predictive_distribution(X_test) y_pred = np.mean(samples, axis=0) # Output Standardised RMSE and RMSE on Train Set train_samples = bb_alpha.sample_predictive_distribution( X_train) y_pred_train = np.mean(train_samples, axis=0) train_rmse_stan = np.sqrt( mean_squared_error(y_train, y_pred_train)) train_rmse = np.sqrt( mean_squared_error( y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) print("\nStandardised Train RMSE: {:.3f}".format( train_rmse_stan)) print("Train RMSE: {:.3f}".format(train_rmse)) score = r2_score(y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(y_pred)) rmse = np.sqrt( mean_squared_error(y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(y_pred))) mae = mean_absolute_error( y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(y_pred)) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) if rmse < best_rmse: best_rmse = rmse best_params['lr'] = lr best_params['batch_size'] = batch_size best_params['iterations'] = iteration best_params['layer_size'] = layer_size print('Best parameters are \n') print(best_params) print('Final best parameters are \n') print(best_params) with open(f'cross_val_hypers/{task}/BNN/hypers_{representation}.txt', 'w') as f: f.write(str(best_params))
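# The grid search above writes best_params to disk as the string form of a Python dict. A sketch of how that
# file could be read back when configuring the final BNN run; ast.literal_eval is standard library, and how
# these hypers are actually consumed elsewhere in the repo is not shown here, so this is illustrative only:

import ast

with open(f'cross_val_hypers/{task}/BNN/hypers_{representation}.txt', 'r') as f:
    best_params = ast.literal_eval(f.read())

layer_size = best_params['layer_size']
learning_rate = best_params['lr']
batch_size = best_params['batch_size']
iterations = best_params['iterations']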
def main(task, path, representation, use_pca, n_trials, test_set_size, batch_size, lr, iterations, r_size, det_encoder_hidden_size, det_encoder_n_hidden, lat_encoder_hidden_size, lat_encoder_n_hidden, decoder_hidden_size, decoder_n_hidden): """ :param task: str specifying the task name. One of [e_iso_pi, e_iso_n, z_iso_pi, z_iso_n] :param path: str specifying the path to the photoswitches.csv file :param representation: str specifying the representation. One of [fingerprints, fragments, fragprints] :param use_pca: bool specifying whether or not to use PCA to perform Principal Components Regression :param n_trials: int specifying the number of random train/test splits. :param test_set_size: float specifying the train/test split ratio. e.g. 0.2 is 80/20 train/test split :param batch_size: int specifying the number of samples to take of the context set, given the number of context points that should be selected. :param lr: float specifying the learning rate. :param iterations: int specifying the number of training iterations :param r_size: Dimensionality of context encoding r. :param det_encoder_hidden_size: Dimensionality of deterministic encoder hidden layers. :param det_encoder_n_hidden: Number of deterministic encoder hidden layers. :param lat_encoder_hidden_size: Dimensionality of latent encoder hidden layers. :param lat_encoder_n_hidden: Number of latent encoder hidden layers. :param decoder_hidden_size: Dimensionality of decoder hidden layers. :param decoder_n_hidden: Number of decoder hidden layers. :return: """ path_to_save = task + '/results/anp/' data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() y_size = 1 if args.representation == 'fingerprints': X = featurise_mols(smiles_list, representation) elif args.representation == 'fragments': X = featurise_mols(smiles_list, representation) else: X = featurise_mols(smiles_list, representation) # If True we perform Principal Components Regression if use_pca: n_components = 50 else: n_components = None r2_list = [] rmse_list = [] mae_list = [] # We pre-allocate arrays for plotting confidence-error curves _, _, _, y_test = train_test_split( X, y, test_size=test_set_size) # To get test set size n_test = len(y_test) rmse_confidence_list = np.zeros((n_trials, n_test)) mae_confidence_list = np.zeros((n_trials, n_test)) print('\nBeginning training loop...') j = 0 # index for saving results for i in range(0, n_trials): X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_set_size, random_state=i) y_train = y_train.reshape(-1, 1) y_test = y_test.reshape(-1, 1) # We standardise the outputs but leave the inputs unchanged X_train, y_train, X_test, _, y_scaler = transform_data( X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) X_train = torch.from_numpy(X_train).float().unsqueeze(dim=0) X_test = torch.from_numpy(X_test).float().unsqueeze(dim=0) y_train = torch.from_numpy(y_train).float().unsqueeze(dim=0) m = AttentiveNP(x_size=X_train.shape[2], y_size=y_size, r_size=r_size, det_encoder_hidden_size=det_encoder_hidden_size, det_encoder_n_hidden=det_encoder_n_hidden, lat_encoder_hidden_size=lat_encoder_hidden_size, lat_encoder_n_hidden=lat_encoder_n_hidden, decoder_hidden_size=decoder_hidden_size, decoder_n_hidden=decoder_n_hidden, lr=lr, attention_type="multihead") print('...training.') m.train(X_train, y_train, batch_size=batch_size, iterations=iterations, print_freq=None) # Now, the context set comprises the training x / y values, the target set comprises the test x 
values. y_pred, y_var = m.predict(X_train, y_train, X_test, n_samples=100) y_pred = y_scaler.inverse_transform(y_pred) # Compute scores for confidence curve plotting. ranked_confidence_list = np.argsort(y_var.numpy(), axis=0).flatten() for k in range(len(y_test)): # Construct the RMSE error for each level of confidence conf = ranked_confidence_list[0:k + 1] rmse = np.sqrt(mean_squared_error(y_test[conf], y_pred[conf])) rmse_confidence_list[i, k] = rmse # Construct the MAE error for each level of confidence mae = mean_absolute_error(y_test[conf], y_pred[conf]) mae_confidence_list[i, k] = mae # Output Standardised RMSE and RMSE on Train Set score = r2_score(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) print("\nR^2: {:.3f}".format(score)) print("RMSE: {:.3f}".format(rmse)) print("MAE: {:.3f}".format(mae)) r2_list.append(score) rmse_list.append(rmse) mae_list.append(mae) np.savetxt( path_to_save + '_seed_' + str(j) + '_ypred_' + representation + '.txt', y_pred) np.savetxt(path_to_save + '_seed_' + str(j) + '_ytest.txt', y_test) np.savetxt( path_to_save + '_seed_' + str(j) + '_ystd_' + representation + '.txt', np.sqrt(y_var)) j += 1 r2_list = np.array(r2_list) rmse_list = np.array(rmse_list) mae_list = np.array(mae_list) print("\nmean R^2: {:.4f} +- {:.4f}".format( np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list)))) print("mean RMSE: {:.4f} +- {:.4f}".format( np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list)))) print("mean MAE: {:.4f} +- {:.4f}\n".format( np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list)))) with open(path_to_save + representation + '.txt', 'w+') as f: f.write('\n Representation = ' + str(representation)) f.write('\n Task = ' + str(task)) f.write('\n Use PCA? 
= ' + str(use_pca))
        f.write('\n Number of trials = {} \n'.format(n_trials))
        f.write('\n Deterministic encoder hidden size = ' + str(det_encoder_hidden_size))
        f.write('\n Deterministic encoder number of layers = ' + str(det_encoder_n_hidden))
        f.write('\n Latent encoder hidden size = ' + str(lat_encoder_hidden_size))
        f.write('\n Latent encoder number of layers = ' + str(lat_encoder_n_hidden))
        f.write('\n Decoder hidden size = ' + str(decoder_hidden_size))
        f.write('\n Decoder number of layers = ' + str(decoder_n_hidden))
        f.write('\n Latent variable size = ' + str(r_size))
        f.write('\n Batch size = {}'.format(batch_size))
        f.write('\n Learning rate = {}'.format(lr))
        f.write('\n Number of iterations = {} \n'.format(iterations))
        f.write("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
        f.write("\nmean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
        f.write("\nmean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
        f.flush()

    # Plot confidence-error curves
    # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29
    confidence_percentiles = np.arange(1e-14, 100, 100 / len(y_test))

    rmse_mean = np.mean(rmse_confidence_list, axis=0)
    rmse_std = np.std(rmse_confidence_list, axis=0)

    # We flip because we want the most confident predictions on the right-hand side of the plot
    rmse_mean = np.flip(rmse_mean)
    rmse_std = np.flip(rmse_std)

    # One-sigma error bars
    lower = rmse_mean - rmse_std
    upper = rmse_mean + rmse_std

    plt.plot(confidence_percentiles, rmse_mean, label='mean')
    plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
    plt.xlabel('Confidence Percentile')
    plt.ylabel('RMSE (nm)')
    plt.ylim([0, np.max(upper) + 1])
    plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
    plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
    plt.savefig(path_to_save + 'confidence_curve_rmse.png')

    # Start a fresh figure so the MAE curve is not drawn on top of the RMSE curve
    plt.figure()

    # We plot the Mean-absolute error confidence-error curves
    mae_mean = np.mean(mae_confidence_list, axis=0)
    mae_std = np.std(mae_confidence_list, axis=0)

    mae_mean = np.flip(mae_mean)
    mae_std = np.flip(mae_std)

    lower = mae_mean - mae_std
    upper = mae_mean + mae_std

    plt.plot(confidence_percentiles, mae_mean, label='mean')
    plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
    plt.xlabel('Confidence Percentile')
    plt.ylabel('MAE (nm)')
    plt.ylim([0, np.max(upper) + 1])
    plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
    plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
    plt.savefig(path_to_save + 'confidence_curve_mae.png')
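# The ANP script above saves per-seed predictions, targets and standard deviations as plain text files.
# A sketch of reloading one seed's outputs and recomputing the headline metrics from the saved arrays;
# the seed index 0 is illustrative, and path_to_save / representation are the values used in the script above:

import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

y_pred = np.loadtxt(path_to_save + '_seed_0_ypred_' + representation + '.txt')
y_test = np.loadtxt(path_to_save + '_seed_0_ytest.txt')

print('R^2: {:.3f}'.format(r2_score(y_test, y_pred)))
print('RMSE: {:.3f}'.format(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAE: {:.3f}'.format(mean_absolute_error(y_test, y_pred)))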