def main(task, samp, split, seed, n_iter):
    """Run an active-learning loop for a GP regression model on a molecular task.

    A GPR model with a precomputed-kernel Matern-3/2 covariance is retrained
    for ``n_iter + 1`` rounds; after each round, ``n_samp`` points chosen by
    ``suggest_sample`` are moved from the holdout pool into the training set.
    Test-set RMSE per round is saved to ``results/``.

    :param task: dataset name; must be a key of the module-level PATHS dict.
    :param samp: sampling-strategy name forwarded to suggest_sample.
    :param split: split-strategy name forwarded to initial_data_split.
    :param seed: random seed forwarded to initial_data_split.
    :param n_iter: number of acquisition rounds (the loop runs n_iter + 1 times).
    """
    # The nested kernel class reads these as module-level globals, so the
    # loaded kernel matrices must be published globally rather than kept local.
    global rem_mat, rem_diag
    warnings.filterwarnings('ignore')
    print(task + ' active learning with E3FP-GP, ' + samp + ' sampling, ' + split + ' splitting, seed = ' + str(seed))
    print('\nGenerating features...')
    if task in PATHS:
        smiles_list, y = parse_dataset(task, PATHS[task])  #NEED TO FIX MALARIA
        # Inputs are plain row indices; the kernel looks the actual
        # pairwise values up in the precomputed matrix by index.
        X = np.arange(len(smiles_list)).reshape(-1, 1)
    else:
        raise Exception('Must provide dataset')
    # Acquire 2.5% of the (80%) training pool per round.
    n_samp = round(len(X) * 0.8 * 0.025)
    X_init, y_init, X_holdout, y_holdout, X_test, y_test = initial_data_split(
        X, smiles_list, y, seed, split)
    rmse_list = []
    # Precomputed E3FP kernel matrix for this task (HPC filesystem path).
    rem_mat = np.load('/rds-d2/user/wjm41/hpc-work/kernels/e3fp/' + task + '_e3fp.npy')
    max_rem = rem_mat.max()  # NOTE(review): computed but unused in this function
    rem_diag = tf.constant(np.diag(rem_mat), dtype=tf.float64)
    rem_mat = tf.constant(rem_mat, dtype=tf.float64)
    from gpflow.utilities import positive

    class Matern32_rem(gpflow.kernels.Kernel):
        """Matern-3/2 kernel over a precomputed matrix, indexed by integer inputs.

        Inputs are index columns; K looks entries up in the global ``rem_mat``.
        ``rem_mat`` presumably holds precomputed pairwise squared distances —
        TODO confirm against the kernel-generation script.
        """

        def __init__(self):
            super().__init__(active_dims=[0])
            # var: inverse-lengthscale-like factor; mag: signal variance.
            self.var = gpflow.Parameter(1.0, transform=positive())
            self.mag = gpflow.Parameter(1.0, transform=positive())

        def K(self, X, X2=None, presliced=None):
            global rem_mat
            if X2 is None:
                X2 = X
            # Inputs are float index columns; flatten and cast to gather rows/cols.
            A = tf.cast(X, tf.int32)
            A = tf.reshape(A, [-1])
            A2 = tf.reshape(X2, [-1])
            A2 = tf.cast(A2, tf.int32)
            K_mat = tf.gather(rem_mat, A, axis=0)
            K_mat = tf.gather(K_mat, A2, axis=1)
            # Matern-3/2 form: mag * (1 + z) * exp(-z), z = sqrt(3 * d2) * var.
            z = tf.math.sqrt(3 * K_mat) * self.var
            K_final = self.mag * (1 + z) * tf.math.exp(-z)
            return K_final

        def K_diag(self, X, presliced=None):
            global rem_diag
            # X has shape (N, 1); gather_nd on the 1-D diagonal yields shape (N,).
            A = tf.cast(X, tf.int32)
            K_diag = tf.gather_nd(rem_diag, A)
            z = tf.math.sqrt(3 * K_diag) * self.var
            return self.mag * (1 + z) * tf.math.exp(-z)

    # `m` is rebound each iteration; the closure below reads the latest binding.
    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    opt = gpflow.optimizers.Scipy()

    # Active learning loop
    for i in range(n_iter + 1):
        # Rescale targets against the current training set each round.
        y_init_scaled, y_test_scaled, y_scaler = transform_data(y_init, y_test)
        X_init_tf = tf.convert_to_tensor(X_init, dtype=tf.float64)
        X_test_tf = tf.convert_to_tensor(X_test, dtype=tf.float64)
        # Fresh kernel + model per round (White noise stabilises the fit).
        k = Matern32_rem() + gpflow.kernels.White(0.1)
        m = gpflow.models.GPR(data=(X_init_tf, y_init_scaled), kernel=k,
                              mean_function=None, noise_variance=1)
        opt_logs = opt.minimize(objective_closure, m.trainable_variables,
                                options=dict(maxiter=10000))
        y_pred, _ = m.predict_f(X_test_tf)
        y_pred = y_scaler.inverse_transform(y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_list.append(rmse)
        r2 = r2_score(y_test, y_pred)
        print('\nIteration ' + str(i) + ' RMSE = ' + str(rmse))
        print('Iteration ' + str(i) + ' R2 = ' + str(r2))
        print('model training size = ' + str(len(X_init)))
        X_holdout_tf = tf.convert_to_tensor(X_holdout, dtype=tf.float64)
        # Re-fit the scaler on (init, holdout) so acquisition sees
        # consistently scaled targets.
        y_init_scaled, y_holdout_scaled, y_scaler = transform_data(
            y_init, y_holdout)
        #Find sample indices and update init and holdouting training sets
        sample_indices = suggest_sample(X_holdout_tf, m, samp, n_samp)
        X_init = np.vstack((X_init, X_holdout[sample_indices]))
        y_init = np.vstack((y_init, y_holdout[sample_indices]))
        X_holdout = np.delete(X_holdout, sample_indices, axis=0)
        y_holdout = np.delete(y_holdout, sample_indices, axis=0)
    # Saves rmse vs num acquisitions into a 'results' folder
    # NOTE(review): original indentation was ambiguous; saving once after the
    # loop yields the same final file as saving every round — confirm intent.
    np.save(
        'results/e3fp_' + task + '_samp_' + samp + '_split_' + split +
        '_seed_' + str(seed) + '.npy', rmse_list)
def main(task, split, n_runs, n_fold, n_bits):
    """Cross-validated training of an E3FP-GP regression model.

    For each fingerprint size in the bit list, runs ``n_runs`` repeats of
    either k-fold or scaffold splitting, fits a GPR with a precomputed-kernel
    Matern-3/2 covariance on each fold, and reports mean R^2 / RMSE / -logP.

    :param task: dataset name; must be a key of the module-level PATHS dict.
    :param split: 'random' (KFold) or 'scaffold' (scaffold_split).
    :param n_runs: number of repeated runs (seeds 0..n_runs-1).
    :param n_fold: number of folds for the random split.
    :param n_bits: fingerprint size; -1 sweeps [512, 1024, 2048, 4096, 8192].
    """
    # The nested kernel class reads these as module-level globals.
    global rem_mat, rem_diag, max_rem
    warnings.filterwarnings('ignore')
    print('\nTraining E3FP-GP on '+task+' dataset')
    print('\nGenerating features...')
    if task in PATHS:
        smiles_list, y = parse_dataset(task, PATHS[task]) #NEED TO FIX MALARIA
        # Inputs are row indices; the kernel indexes into the precomputed matrix.
        X = np.arange(len(smiles_list)).reshape(-1,1)
    else:
        raise Exception('Must provide dataset')
    # `m` is rebound per fold; the closure reads the latest binding.
    m = None
    def objective_closure():
        return -m.log_marginal_likelihood()
    print('\nBeginning training loop...')
    if n_bits==-1:
        bit_list = [512, 1024, 2048, 4096, 8192]
    else:
        bit_list = [n_bits]
    for bits in bit_list:
        r2_list = []
        rmse_list = []
        logP_list = []
        j=0  # global fold counter used to name the per-fold output files
        for i in range(n_runs):
            if split=='random':
                kf = KFold(n_splits=n_fold, random_state=i, shuffle=True)
                split_list = kf.split(X)
            elif split=='scaffold':
                train_ind, test_ind = scaffold_split(smiles_list, seed=i)
                split_list = [(train_ind, test_ind)]
            # NOTE(review): any other `split` value leaves split_list
            # undefined (NameError on first run) — consider an else-raise.
            for train_ind, test_ind in split_list:
                X_train, X_test = X[train_ind], X[test_ind]
                y_train, y_test = y[train_ind], y[test_ind]
                y_train, y_test, y_scaler = transform_data(y_train, y_test)
                X_train = tf.convert_to_tensor(X_train, dtype = tf.float64)
                X_test = tf.convert_to_tensor(X_test, dtype = tf.float64)
                #rem_mat = np.load('kernels/'+task+'_ecfp_'+str(bits)+'.npy')
                # NOTE(review): the e3fp path ignores `bits`, so the bit_list
                # sweep reloads the identical kernel; also this load is
                # fold-invariant and could be hoisted out of the loops.
                rem_mat = np.load('/rds-d2/user/wjm41/hpc-work/kernels/e3fp/'+task+'_e3fp.npy')
                rem_diag = tf.constant(np.diag(rem_mat),dtype=tf.float64)
                rem_mat = tf.constant(rem_mat,dtype=tf.float64)
                from gpflow.utilities import positive
                class Matern32_rem(gpflow.kernels.Kernel):
                    """Matern-3/2 kernel over the precomputed global matrix,
                    indexed by the integer inputs. ``rem_mat`` presumably holds
                    pairwise squared distances — TODO confirm."""
                    def __init__(self):
                        super().__init__(active_dims=[0])
                        # var: inverse-lengthscale-like factor; mag: signal variance.
                        self.var = gpflow.Parameter(1.0, transform=positive())
                        self.mag = gpflow.Parameter(1.0, transform=positive())
                    def K(self, X, X2=None, presliced=None):
                        global rem_mat
                        if X2 is None:
                            X2=X
                        # Flatten float index columns and cast for row/col gathers.
                        A = tf.cast(X,tf.int32)
                        A = tf.reshape(A,[-1])
                        A2 = tf.reshape(X2,[-1])
                        A2 = tf.cast(A2,tf.int32)
                        K_mat = tf.gather(rem_mat, A, axis=0)
                        K_mat = tf.gather(K_mat, A2, axis=1)
                        # Matern-3/2 form: mag * (1 + z) * exp(-z).
                        z = tf.math.sqrt(3*K_mat)*self.var
                        K_final = self.mag*(1+z)*tf.math.exp(-z)
                        return K_final
                    def K_diag(self, X, presliced=None):
                        global rem_diag
                        # X is (N, 1); gather_nd on the 1-D diagonal gives (N,).
                        A=tf.cast(X,tf.int32)
                        K_diag = tf.gather_nd(rem_diag, A)
                        z = tf.math.sqrt(3*K_diag)*self.var
                        return self.mag*(1+z)*tf.math.exp(-z)
                # White noise term stabilises the fit.
                k = Matern32_rem()+gpflow.kernels.White(0.1)
                m = gpflow.models.GPR(
                    data=(X_train, y_train), kernel=k)
                opt = gpflow.optimizers.Scipy()
                opt_logs = opt.minimize(objective_closure,
                                        m.trainable_variables,
                                        options=dict(maxiter=10000))
                #print_summary(m)
                y_pred, y_var = m.predict_f(X_test)
                # Undo target standardisation before scoring.
                y_pred = y_scaler.inverse_transform(y_pred)
                y_test = y_scaler.inverse_transform(y_test)
                y_var = y_scaler.var_ * y_var
                score = r2_score(y_test, y_pred)
                rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                # NOTE(review): objective uses log_marginal_likelihood() but
                # this calls log_likelihood() — gpflow-version dependent; verify.
                logP = -m.log_likelihood()
                #print("\nR^2: {:.3f}".format(score))
                #print("RMSE: {:.3f}".format(rmse))
                #print("-ve logP: {:.3f}".format(logP))
                r2_list.append(score)
                rmse_list.append(rmse)
                logP_list.append(logP)
                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ypred.txt', y_pred)
                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ytest.txt', y_test)
                np.savetxt('results/e3fp_'+task+'_split_'+split+'_run_'+str(j)+'_ystd.txt', np.sqrt(y_var))
                j+=1
        r2_list = np.array(r2_list)
        rmse_list = np.array(rmse_list)
        logP_list = np.array(logP_list)
        # Mean +- standard error over all folds of all runs for this bit size.
        print("\nbits: {}".format(bits))
        print("mean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list))))
        print("mean -ve logP: {:.4f} +- {:.4f}\n".format(np.mean(logP_list), np.std(logP_list)/np.sqrt(len(logP_list))))
def main(args):
    """Train and evaluate an MPNN on a molecular property-prediction task.

    Runs ``args.n_trials`` repeats of stratified ``args.n_folds``-fold CV.
    Regression mode (``args.reg``) trains with MSE and reports RMSE/MAE/R2;
    classification mode trains with BCE and reports ROC-AUC/PRC-AUC (stored
    in the ``rmse``/``r2`` variables so both modes share one code path).
    Per-epoch metrics are logged to TensorBoard via SummaryWriter.

    :param args: namespace with task, reg, n_trials, n_folds, n_epochs.
    """
    smiles_list, y = parse_dataset(args.task, PATHS[args.task], args.reg)
    mols = [Chem.MolFromSmiles(m) for m in smiles_list]

    # Initialise featurisers
    atom_featurizer = CanonicalAtomFeaturizer()
    bond_featurizer = CanonicalBondFeaturizer()
    e_feats = bond_featurizer.feat_size('e')
    n_feats = atom_featurizer.feat_size('h')
    print('Number of features: ', n_feats)
    X = [
        mol_to_bigraph(m,
                       node_featurizer=atom_featurizer,
                       edge_featurizer=bond_featurizer) for m in mols
    ]

    r2_list = []
    rmse_list = []
    mae_list = []
    skipped_trials = 0

    for i in range(args.n_trials):
        kf = StratifiedKFold(n_splits=args.n_folds, random_state=i,
                             shuffle=True)
        split_list = kf.split(X, y)
        j = 0
        for train_ind, test_ind in split_list:
            # Run/fold names use the trial index i and fold index j.
            if args.reg:
                writer = SummaryWriter('runs/' + args.task + '/mpnn/reg/run_' +
                                       str(i) + '_fold_' + str(j))
            else:
                writer = SummaryWriter('runs/' + args.task +
                                       '/mpnn/class/run_' + str(i) +
                                       '_fold_' + str(j))
            X_train, X_test = np.array(X)[train_ind], np.array(X)[test_ind]
            y_train, y_test = np.array(y)[train_ind], np.array(y)[test_ind]
            y_train = y_train.reshape(-1, 1)
            y_test = y_test.reshape(-1, 1)

            # We standardise the outputs but leave the inputs unchanged
            if args.reg:
                y_scaler = StandardScaler()
                y_train_scaled = torch.Tensor(y_scaler.fit_transform(y_train))
                y_test_scaled = torch.Tensor(y_scaler.transform(y_test))
            else:
                y_train_scaled = torch.Tensor(y_train)
                y_test_scaled = torch.Tensor(y_test)

            train_data = list(zip(X_train, y_train_scaled))
            test_data = list(zip(X_test, y_test_scaled))
            train_loader = DataLoader(train_data,
                                      batch_size=32,
                                      shuffle=True,
                                      collate_fn=collate,
                                      drop_last=False)
            test_loader = DataLoader(test_data,
                                     batch_size=32,
                                     shuffle=False,
                                     collate_fn=collate,
                                     drop_last=False)

            mpnn_net = MPNNPredictor(node_in_feats=n_feats,
                                     edge_in_feats=e_feats)
            mpnn_net.to(device)
            # NOTE(review): BCELoss expects probabilities; confirm
            # MPNNPredictor applies a sigmoid in classification mode.
            loss_fn = MSELoss() if args.reg else BCELoss()
            optimizer = torch.optim.Adam(mpnn_net.parameters(), lr=1e-4)

            mpnn_net.train()
            epoch_losses = []
            epoch_rmses = []
            # NOTE(review): range(1, n_epochs) trains n_epochs - 1 epochs;
            # kept as-is for reproducibility — confirm intent.
            for epoch in tqdm(range(1, args.n_epochs)):
                epoch_loss = 0
                preds = []
                labs = []
                # BUG FIX: this loop previously reused `i` as the batch index,
                # clobbering the outer trial index and mis-naming the
                # SummaryWriter run for every fold after the first.
                for batch_idx, (bg, labels) in tqdm(enumerate(train_loader)):
                    atom_feats = bg.ndata.pop('h').to(device)
                    bond_feats = bg.edata.pop('e').to(device)
                    labels = labels.to(device)
                    y_pred = mpnn_net(bg, atom_feats, bond_feats)
                    labels = labels.unsqueeze(dim=1)
                    loss = loss_fn(y_pred, labels)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.detach().item()

                    if args.reg:
                        # Inverse transform to get RMSE in original units
                        labels = y_scaler.inverse_transform(
                            labels.cpu().reshape(-1, 1))
                        y_pred = y_scaler.inverse_transform(
                            y_pred.detach().cpu().numpy().reshape(-1, 1))
                    else:
                        labels = labels.cpu().numpy()
                        y_pred = y_pred.detach().cpu().numpy()

                    # store labels and preds
                    preds.append(y_pred)
                    labs.append(labels)

                labs = np.concatenate(labs, axis=None)
                preds = np.concatenate(preds, axis=None)
                pearson, p = pearsonr(preds, labs)
                if args.reg:
                    mae = mean_absolute_error(preds, labs)
                    rmse = np.sqrt(mean_squared_error(preds, labs))
                    r2 = r2_score(preds, labs)
                else:
                    # Classification reuses r2 for ROC-AUC and rmse for PRC-AUC.
                    r2 = roc_auc_score(labs, preds)
                    precision, recall, thresholds = precision_recall_curve(
                        labs, preds)
                    rmse = auc(recall, precision)
                    mae = 0

                writer.add_scalar('Loss/train', epoch_loss, epoch)
                if args.reg:
                    writer.add_scalar('RMSE/train', rmse, epoch)
                    writer.add_scalar('R2/train', r2, epoch)
                else:
                    writer.add_scalar('ROC-AUC/train', r2, epoch)
                    writer.add_scalar('PRC-AUC/train', rmse, epoch)

                if epoch % 20 == 0:
                    if args.reg:
                        print(f"epoch: {epoch}, "
                              f"LOSS: {epoch_loss:.3f}, "
                              f"RMSE: {rmse:.3f}, "
                              f"MAE: {mae:.3f}, "
                              f"rho: {pearson:.3f}, "
                              f"R2: {r2:.3f}")
                    else:
                        print(f"epoch: {epoch}, "
                              f"LOSS: {epoch_loss:.3f}, "
                              f"ROC-AUC: {r2:.3f}, "
                              f"PRC-AUC: {rmse:.3f}, "
                              f"rho: {pearson:.3f}")
                epoch_losses.append(epoch_loss)
                epoch_rmses.append(rmse)

            # Discount the fold if the final train R2 indicates the optimiser
            # diverged (comment previously said "RMSE negative"; the check is
            # on r2).
            if r2 < -1:
                skipped_trials += 1
                print('Skipped trials is {}'.format(skipped_trials))
                continue

            # Evaluate
            mpnn_net.eval()
            preds = []
            labs = []
            for batch_idx, (bg, labels) in enumerate(test_loader):
                atom_feats = bg.ndata.pop('h').to(device)
                bond_feats = bg.edata.pop('e').to(device)
                labels = labels.to(device)
                y_pred = mpnn_net(bg, atom_feats, bond_feats)
                labels = labels.unsqueeze(dim=1)
                if args.reg:
                    # Inverse transform to get RMSE in original units
                    labels = y_scaler.inverse_transform(labels.cpu().reshape(
                        -1, 1))
                    y_pred = y_scaler.inverse_transform(
                        y_pred.detach().cpu().numpy().reshape(-1, 1))
                else:
                    labels = labels.cpu().numpy()
                    y_pred = y_pred.detach().cpu().numpy()
                preds.append(y_pred)
                labs.append(labels)

            labs = np.concatenate(labs, axis=None)
            preds = np.concatenate(preds, axis=None)
            pearson, p = pearsonr(preds, labs)
            if args.reg:
                mae = mean_absolute_error(preds, labs)
                rmse = np.sqrt(mean_squared_error(preds, labs))
                r2 = r2_score(preds, labs)
                writer.add_scalar('RMSE/test', rmse)
                writer.add_scalar('R2/test', r2)
                print(
                    f'Test RMSE: {rmse:.3f}, MAE: {mae:.3f}, R: {pearson:.3f}, R2: {r2:.3f}'
                )
            else:
                r2 = roc_auc_score(labs, preds)
                precision, recall, thresholds = precision_recall_curve(
                    labs, preds)
                rmse = auc(recall, precision)
                mae = 0
                writer.add_scalar('ROC-AUC/test', r2)
                writer.add_scalar('PRC-AUC/test', rmse)
                print(
                    f'Test ROC-AUC: {r2:.3f}, PRC-AUC: {rmse:.3f}, rho: {pearson:.3f}'
                )
            r2_list.append(r2)
            rmse_list.append(rmse)
            mae_list.append(mae)
            j += 1

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)
    # Mean +- standard error over all retained folds of all trials.
    if args.reg:
        print("\nmean R^2: {:.4f} +- {:.4f}".format(
            np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean RMSE: {:.4f} +- {:.4f}".format(
            np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
        print("mean MAE: {:.4f} +- {:.4f}\n".format(
            np.mean(mae_list), np.std(mae_list) / np.sqrt(len(mae_list))))
    else:
        print("mean ROC-AUC^2: {:.3f} +- {:.3f}".format(
            np.mean(r2_list), np.std(r2_list) / np.sqrt(len(r2_list))))
        print("mean PRC-AUC: {:.3f} +- {:.3f}".format(
            np.mean(rmse_list), np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("\nSkipped trials is {}".format(skipped_trials))
# Script entry: generate E3FP features for one dataset, sharded across MPI ranks.
MALARIA_PATH = 'data/Malaria/Malaria.csv'
# Dataset name -> CSV path (other *_PATH constants are defined elsewhere in the file).
PATHS = {
    'FreeSolv': FREESOLV_PATH,
    'esol': ESOL_PATH,
    'lipo': LIPO_PATH,
    'dls': DLS_PATH,
    'CatS': CATS_PATH,
    'bradley': BRADLEY_PATH,
    'Malaria': MALARIA_PATH
}

# Dataset is selected by the first command-line argument.
task = sys.argv[1]
#TASK_NAME = 'FreeSolv' # Change dataset. Options: ['ESOL', 'FreeSolv', 'dls', 'CEP', 'CatS', 'bradley', 'Malaria']
smiles_list, y = parse_dataset(task, PATHS[task]) #NEED TO FIX MALARIA
dat_size = len(smiles_list)

# Each MPI rank processes a contiguous slice of the SMILES list;
# return_borders computes this rank's [low, high) slice bounds.
mpi_comm = MPI.COMM_WORLD
mpi_rank = mpi_comm.Get_rank()
mpi_size = mpi_comm.Get_size()
my_border_low, my_border_high = return_borders(mpi_rank, dat_size, mpi_size)
my_list = smiles_list[my_border_low:my_border_high]

# Generate features at each fingerprint size for this rank's shard.
bit_list = [512, 1024, 2048, 4096, 8192]
for bits in bit_list:
    my_db = gen_e3fp_features(my_list, mpi_rank, mpi_size, bits)