def fingerprint(fingerprint_method, keyword_dict, smi):
    """Build a fingerprint for the molecule encoded by a SMILES string.

    ``smi`` is parsed with ``AllChem.MolFromSmiles``. When
    ``fingerprint_method`` equals ``"Morgan"`` the result of
    ``AllChem.GetMorganFingerprintAsBitVect`` is returned, with
    ``keyword_dict`` expanded as its keyword arguments. Any other method
    name falls back to ``RDKFingerprint``, in which case ``keyword_dict``
    is not used.
    """
    molecule = AllChem.MolFromSmiles(smi)
    wants_morgan = fingerprint_method == "Morgan"
    if not wants_morgan:
        return RDKFingerprint(molecule)
    return AllChem.GetMorganFingerprintAsBitVect(molecule, **keyword_dict)
def rdkit_fingerprint(mol, **kwargs):
    """Return the on bits of ``RDKFingerprint(mol, **kwargs)`` as a list.

    All keyword arguments are forwarded unchanged to ``RDKFingerprint``;
    the values produced by the fingerprint's ``GetOnBits()`` are
    materialised into a plain ``list``.
    """
    bit_vector = RDKFingerprint(mol, **kwargs)
    on_bits = bit_vector.GetOnBits()
    return list(on_bits)
def _large_ring_penalty(mol):
    """Return minus the number of atoms by which the largest ring exceeds six.

    Rings are taken from ``nx.cycle_basis`` of the graph built from the
    molecule's adjacency matrix. A molecule with no ring, or whose largest
    basis ring has six atoms or fewer, scores 0.
    """
    rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    largest = max((len(ring) for ring in rings), default=0)
    return -max(largest - 6, 0)


def _report_gp_fit(sgp, inputs, targets, label):
    """Print RMSE and mean Gaussian log-density of ``sgp`` predictions."""
    pred, uncert = sgp.predict(inputs, 0 * inputs)
    error = np.sqrt(np.mean((pred - targets)**2))
    loglik = np.mean(sps.norm.logpdf(pred - targets, scale=np.sqrt(uncert)))
    print(label + ' RMSE: ', error)
    print(label + ' ll: ', loglik)


def main_bo(vocab_path, model_path, save_dir, descriptor_path, sampling=60,
            iterations=2, epochs=2, hidden_size=450, latent_size=56,
            depthT=20, depthG=3, random_seed=1, pIC50_weight=0, QED_weight=0,
            logP_weight=1, SA_weight=1, cycle_weight=1, sim_weight=0):
    """Iteratively fit a sparse GP on latent vectors and decode new candidates.

    Each iteration fits a ``SparseGP`` on the current training set, asks it
    for ``sampling`` new latent points via ``batched_greedy_ei``, decodes
    them with the ``JTNNVAE`` model, scores the decoded SMILES as a weighted
    sum of normalised descriptors, and appends the decoded points and their
    (negated) scores to the training set.

    Args:
        vocab_path: Text file with one vocabulary entry per line.
        model_path: File passed to ``torch.load`` for the model state dict.
        save_dir: Output directory (created if missing). Receives
            ``valid_smiles{i}.pkl`` and ``scores{i}.pkl`` per iteration.
        descriptor_path: Directory containing ``latent_features.txt``,
            ``targets.txt``, ``logP_values.txt``, ``QED_values.txt``,
            ``pIC50_values.txt``, ``SA_scores.txt``, ``cycle_scores.txt``
            and ``sim_values.txt`` (all read with ``np.loadtxt``).
        sampling: Number of latent points requested per iteration.
        iterations: Number of fit/sample/score rounds.
        epochs: Not used by this function.
        hidden_size, latent_size, depthT, depthG: Forwarded to ``JTNNVAE``.
        random_seed: Seed for the train/test permutation; each iteration
            reseeds NumPy with ``iteration * random_seed``.
        pIC50_weight, QED_weight, logP_weight, SA_weight, cycle_weight,
        sim_weight: Weights of the normalised descriptor terms. The pIC50
            and similarity terms are only computed when their weight is
            non-zero.

    Returns:
        None. Results are written to ``save_dir`` and printed.

    Note:
        The model is moved to the GPU with ``model.cuda()``, so a CUDA
        device is required.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Close the vocabulary file deterministically instead of leaking it.
    with open(vocab_path) as vocab_file:
        vocab = Vocab([line.strip("\r\n ") for line in vocab_file])

    model = JTNNVAE(vocab, hidden_size, latent_size, depthT, depthG)
    model.load_state_dict(torch.load(model_path))
    model = model.cuda()

    fps_db = None
    if sim_weight != 0:
        # Reference set that decoded molecules are compared against.
        df_100 = pd.read_csv(
            '../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]

    np.random.seed(random_seed)

    # Paths of the descriptor files.
    latent_feature = os.path.join(descriptor_path, './latent_features.txt')
    target = os.path.join(descriptor_path, './targets.txt')
    logp_value = os.path.join(descriptor_path, './logP_values.txt')
    QED_value = os.path.join(descriptor_path, './QED_values.txt')
    pIC50_value = os.path.join(descriptor_path, './pIC50_values.txt')
    sa_score = os.path.join(descriptor_path, './SA_scores.txt')
    cycle_score = os.path.join(descriptor_path, './cycle_scores.txt')
    sim_score = os.path.join(descriptor_path, './sim_values.txt')

    # Load the data. The targets are negated, so the GP models "-score".
    X = np.loadtxt(latent_feature)
    y = -np.loadtxt(target)
    y = y.reshape((-1, 1))

    # Shuffled 90/10 train/test split. ``np.int`` no longer exists in
    # current NumPy releases, so the builtin ``int`` is used.
    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)
    n_train = int(np.round(0.9 * n))
    X_shuffled = X[permutation, :]
    y_shuffled = y[permutation]
    X_train, X_test = X_shuffled[:n_train, :], X_shuffled[n_train:, :]
    y_train, y_test = y_shuffled[:n_train], y_shuffled[n_train:]

    np.random.seed(random_seed)

    # The normalisation statistics never change, so compute them once.
    def _mean_std(values):
        return np.mean(values), np.std(values)

    pIC50_mean, pIC50_std = _mean_std(np.loadtxt(pIC50_value))
    QED_mean, QED_std = _mean_std(np.loadtxt(QED_value))
    logP_mean, logP_std = _mean_std(np.loadtxt(logp_value))
    SA_mean, SA_std = _mean_std(np.loadtxt(sa_score))
    cycle_mean, cycle_std = _mean_std(np.loadtxt(cycle_score))
    sim_mean, sim_std = _mean_std(np.loadtxt(sim_score))

    for iteration in range(iterations):
        # Fit the GP.
        np.random.seed(iteration * random_seed)
        M = 500
        sgp = SparseGP(X_train, 0 * X_train, y_train, M)
        # TODO: test hyperparameters
        sgp.train_via_ADAM(X_train, 0 * X_train, y_train,
                           X_test, X_test * 0, y_test,
                           minibatch_size=10 * M,
                           max_iterations=5,
                           learning_rate=0.001)

        _report_gp_fit(sgp, X_test, y_test, 'Test')
        _report_gp_fit(sgp, X_train, y_train, 'Train')

        # Pick the next ``sampling`` inputs.
        next_inputs = sgp.batched_greedy_ei(sampling,
                                            np.min(X_train, 0),
                                            np.max(X_train, 0))

        valid_smiles = []
        new_features = []
        for i in tqdm(range(sampling)):
            all_vec = next_inputs[i].reshape((1, -1))
            tree_vec, mol_vec = np.hsplit(all_vec, 2)
            tree_vec = create_var(torch.from_numpy(tree_vec).float())
            mol_vec = create_var(torch.from_numpy(mol_vec).float())
            tree_vecs, _ = model.rsample(tree_vec, model.T_mean, model.T_var)
            mol_vecs, _ = model.rsample(mol_vec, model.G_mean, model.G_var)
            s = model.decode(tree_vecs, mol_vecs, prob_decode=False)
            if s is not None:
                # Keep features and SMILES aligned: only latent points that
                # decoded successfully are added to the training set.
                valid_smiles.append(s)
                new_features.append(all_vec)

        print(len(valid_smiles), "molecules are found")
        save_object(
            valid_smiles,
            os.path.join(save_dir, "valid_smiles{}.pkl".format(iteration)))

        if pIC50_weight != 0:
            # Assumes calculate_pIC50 returns one value per SMILES --
            # TODO confirm against its definition.
            current_pIC50 = np.ravel(
                np.asarray(calculate_pIC50(valid_smiles), dtype=float))
            pIC50_normalized = (current_pIC50 - pIC50_mean) / pIC50_std
        else:
            pIC50_normalized = [0] * len(valid_smiles)

        scores = []
        for smi, current_pIC50_normalized in zip(valid_smiles,
                                                 pIC50_normalized):
            if sim_weight != 0:
                current_sim_value = similarity_search(fps_db, smi)
                current_sim_value_normalized = (
                    current_sim_value - sim_mean) / sim_std
            else:
                current_sim_value_normalized = 0

            # Parse once and reuse the molecule for every descriptor.
            mol = MolFromSmiles(smi)
            current_QED_value = QED.qed(mol)
            current_log_P_value = Descriptors.MolLogP(mol)
            current_SA_score = -sascorer.calculateScore(mol)
            current_cycle_score = _large_ring_penalty(mol)

            current_SA_score_normalized = (
                current_SA_score - SA_mean) / SA_std
            current_QED_value_normalized = (
                current_QED_value - QED_mean) / QED_std
            current_log_P_value_normalized = (
                current_log_P_value - logP_mean) / logP_std
            current_cycle_score_normalized = (
                current_cycle_score - cycle_mean) / cycle_std

            score = (SA_weight * current_SA_score_normalized +
                     QED_weight * current_QED_value_normalized +
                     logP_weight * current_log_P_value_normalized +
                     cycle_weight * current_cycle_score_normalized +
                     pIC50_weight * current_pIC50_normalized +
                     sim_weight * current_sim_value_normalized)
            scores.append(-score)  # the target is always negated

        print(valid_smiles)
        print(scores)
        save_object(scores,
                    os.path.join(save_dir, "scores{}.pkl".format(iteration)))

        # Grow the training set only with successfully decoded points so
        # that X_train and y_train keep the same number of rows.
        if valid_smiles:
            X_train = np.concatenate([X_train, np.vstack(new_features)], 0)
            y_train = np.concatenate([y_train, np.array(scores)[:, None]], 0)
def scorer(smiles, pIC50_weight, QED_weight, logP_weight, SA_weight,
           cycle_weight, sim_weight):
    """Compute per-molecule descriptors and their weighted, z-scored sum.

    Every input SMILES is canonicalised with RDKit, then logP, QED, the
    negated synthetic-accessibility score and a large-ring penalty are
    computed for each molecule. Each descriptor list is standardised with
    its own mean and standard deviation and combined using the weights.

    Args:
        smiles: Sequence of SMILES strings.
        pIC50_weight: Weight of the standardised IC50 values, which are
            always read from ``../data/covid/ic50-fulltrain.txt`` (not
            computed from ``smiles``), so that file must be
            broadcast-compatible with ``len(smiles)``.
        QED_weight, logP_weight, SA_weight, cycle_weight: Weights of the
            corresponding standardised descriptors.
        sim_weight: Weight of the similarity term. When it is 0 the
            similarity search is skipped and zeros are used instead.

    Returns:
        Tuple ``(IC50_scores, qed_values, logP_values, SA_scores,
        cycle_scores, sim_values, targets)``; the first six are lists of
        raw values and ``targets`` is the weighted sum as a NumPy array.

    Note:
        A descriptor that is constant over all molecules has zero standard
        deviation, so its standardised values (and hence ``targets``) are
        not finite.
    """
    def _standardize(values):
        # z-score a list of raw values against its own mean and std.
        return (np.array(values) - np.mean(values)) / np.std(values)

    def _cycle_score(mol):
        # Minus the number of atoms by which the largest ring exceeds six.
        rings = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        largest = max((len(ring) for ring in rings), default=0)
        return -max(largest - 6, 0)

    smiles_rdkit = [
        MolToSmiles(MolFromSmiles(smi), isomericSmiles=True) for smi in smiles
    ]

    # Read the IC50 values of the training set from the database file.
    IC50_scores = list(np.loadtxt('../data/covid/ic50-fulltrain.txt'))
    IC50_scores_normalized = _standardize(IC50_scores)

    if sim_weight != 0:
        # df_100 = list of molecules to match similarity against.
        df_100 = pd.read_csv(
            '../data/covid/MPro_6wqf_A_ProteaseData_smiles_top100.csv')
        ms_db = [MolFromSmiles(x) for x in df_100['SMILES'].tolist()]
        fps_db = [RDKFingerprint(x) for x in ms_db]
        sim_values = [similarity_search(fps_db, smi) for smi in smiles_rdkit]
        sim_values_normalized = _standardize(sim_values)
    else:
        sim_values = [0] * len(smiles)
        sim_values_normalized = np.array([0] * len(smiles))

    # Parse each canonical SMILES once and reuse the molecule for every
    # descriptor instead of re-parsing it per descriptor.
    mols = [MolFromSmiles(smi) for smi in smiles_rdkit]

    logP_values = [Descriptors.MolLogP(mol) for mol in mols]
    qed_values = [QED.qed(mol) for mol in mols]
    SA_scores = [-sascorer.calculateScore(mol) for mol in mols]
    cycle_scores = [_cycle_score(mol) for mol in mols]

    SA_scores_normalized = _standardize(SA_scores)
    qed_values_normalized = _standardize(qed_values)
    cycle_scores_normalized = _standardize(cycle_scores)
    logP_values_normalized = _standardize(logP_values)

    targets = (pIC50_weight * IC50_scores_normalized +
               logP_weight * logP_values_normalized +
               SA_weight * SA_scores_normalized +
               QED_weight * qed_values_normalized +
               cycle_weight * cycle_scores_normalized +
               sim_weight * sim_values_normalized)
    return (IC50_scores, qed_values, logP_values, SA_scores, cycle_scores,
            sim_values, targets)