def decode_test():
    """Round-trip check: decompose each stdin SMILES into a junction tree,
    re-assemble it, and count reconstructions that differ from the canonical
    input SMILES.

    Reads one SMILES per line from stdin; prints each mismatching pair and a
    running (wrong, total) tally.
    """
    wrong = 0
    for tot, s in enumerate(sys.stdin):
        s = s.split()[0]
        tree = MolTree(s)
        tree.recover()

        # Start assembly from the root clique; amap slot 0 is a dummy so that
        # node.nid (1-based) indexes global_amap directly.
        cur_mol = copy_edit_mol(tree.nodes[0].mol)
        global_amap = [{}] + [{} for node in tree.nodes]
        global_amap[1] = {atom.GetIdx(): atom.GetIdx() for atom in cur_mol.GetAtoms()}

        dfs_assemble(cur_mol, global_amap, [], tree.nodes[0], None)

        cur_mol = cur_mol.GetMol()
        # Round-trip through SMILES to canonicalize the assembled molecule.
        cur_mol = Chem.MolFromSmiles(Chem.MolToSmiles(cur_mol))
        set_atommap(cur_mol)
        dec_smiles = Chem.MolToSmiles(cur_mol)

        gold_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(s))
        if gold_smiles != dec_smiles:
            # Was a Python 2 print statement; file elsewhere uses print().
            print(gold_smiles, dec_smiles)
            wrong += 1
        print(wrong, tot + 1)
def optimize(self, smiles, sim_cutoff, lr=2.0, num_iter=20):
    """Gradient-ascend the latent code of `smiles` to improve the property
    predicted by self.propNN, then binary-search the visited steps for the
    furthest one whose decoded molecule keeps Tanimoto similarity >= sim_cutoff.

    Args:
        smiles: input molecule SMILES.
        sim_cutoff: minimum Morgan-fingerprint Tanimoto similarity to keep.
        lr: gradient-ascent step size.
        num_iter: number of ascent steps.

    Returns:
        (new_smiles, similarity) on success, or (smiles, 1.0) when no decoded
        candidate satisfies the cutoff.
    """
    mol_tree = MolTree(smiles)
    mol_tree.recover()
    _, tree_vec, mol_vec = self.encode([mol_tree])

    mol = Chem.MolFromSmiles(smiles)
    fp1 = AllChem.GetMorganFingerprint(mol, 2)

    tree_mean = self.T_mean(tree_vec)
    tree_log_var = -torch.abs(self.T_var(tree_vec))  # Following Mueller et al.
    mol_mean = self.G_mean(mol_vec)
    mol_log_var = -torch.abs(self.G_var(mol_vec))  # Following Mueller et al.
    mean = torch.cat([tree_mean, mol_mean], dim=1)
    # NOTE(review): log_var is computed but never used below (ascent starts
    # from the mean, not a sample); kept for parity with the sampling code.
    log_var = torch.cat([tree_log_var, mol_log_var], dim=1)

    cur_vec = create_var(mean.data, True)

    visited = []
    # Fixed: `xrange` does not exist in Python 3.
    for _ in range(num_iter):
        prop_val = self.propNN(cur_vec).squeeze()
        grad = torch.autograd.grad(prop_val, cur_vec)[0]
        cur_vec = cur_vec.data + lr * grad.data
        cur_vec = create_var(cur_vec, True)
        visited.append(cur_vec)

    # Binary search for the furthest ascent step still within sim_cutoff.
    l, r = 0, num_iter - 1
    while l < r - 1:
        # Fixed: `/` yields a float in Python 3 and cannot index `visited`.
        mid = (l + r) // 2
        new_vec = visited[mid]
        tree_vec, mol_vec = torch.chunk(new_vec, 2, dim=1)
        new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
        if new_smiles is None:
            r = mid - 1
            continue
        new_mol = Chem.MolFromSmiles(new_smiles)
        fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
        sim = DataStructs.TanimotoSimilarity(fp1, fp2)
        if sim < sim_cutoff:
            r = mid - 1
        else:
            l = mid

    # Decode the chosen step and re-validate against the cutoff.
    tree_vec, mol_vec = torch.chunk(visited[l], 2, dim=1)
    new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
    if new_smiles is None:
        return smiles, 1.0
    new_mol = Chem.MolFromSmiles(new_smiles)
    fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
    sim = DataStructs.TanimotoSimilarity(fp1, fp2)
    if sim >= sim_cutoff:
        return new_smiles, sim
    else:
        return smiles, 1.0
def __getitem__(self, idx):
    """Tensorize the idx-th batch of (source, target) tree pairs.

    Returns (x_batch, y_batch, source_trees, target_trees).
    """
    pairs = self.data[idx]
    src_trees = [pair[0] for pair in pairs]
    tgt_trees = [pair[1] for pair in pairs]
    x_batch = MolTree.tensorize(src_trees, self.vocab, self.avocab,
                                target=False, add_target=self.add_target)
    y_batch = MolTree.tensorize(tgt_trees, self.vocab, self.avocab,
                                target=True, add_target=self.add_target)
    return x_batch, y_batch, src_trees, tgt_trees
def reconstruct(self, smiles, prob_decode=False):
    """Encode `smiles`, sample one latent point per branch, and decode it.

    Args:
        smiles: input molecule SMILES.
        prob_decode: passed through to self.decode (stochastic decoding).

    Returns:
        The decoded SMILES string from self.decode.
    """
    mol_tree = MolTree(smiles)
    mol_tree.recover()
    _, tree_vec, mol_vec = self.encode([mol_tree])

    tree_mean = self.T_mean(tree_vec)
    tree_log_var = -torch.abs(self.T_var(tree_vec))  # Following Mueller et al.
    mol_mean = self.G_mean(mol_vec)
    mol_log_var = -torch.abs(self.G_var(mol_vec))  # Following Mueller et al.

    # Fixed: `self.latent_size / 2` is a float in Python 3 and torch.randn
    # requires integer sizes (sibling methods already cast to int).
    epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
    tree_vec = tree_mean + torch.exp(tree_log_var / 2) * epsilon
    epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
    mol_vec = mol_mean + torch.exp(mol_log_var / 2) * epsilon
    return self.decode(tree_vec, mol_vec, prob_decode)
def encode_from_smiles(self, smiles_list):
    """Encode SMILES strings into concatenated tree/graph embedding vectors."""
    trees = [MolTree(smi) for smi in smiles_list]
    _, jtenc_holder, mpn_holder = tensorize(trees, self.vocab, assm=False)
    tree_vecs, _, mol_vecs = self.encode(jtenc_holder, mpn_holder)
    return torch.cat([tree_vecs, mol_vecs], dim=-1)
def predict(smiles, lr, vocab, avocab, reselect, ori_smiles, iternum, output):
    """Run one model.test optimization step on `smiles` and log the outcome.

    Args:
        smiles: current molecule SMILES.
        lr: learning rate forwarded to model.test.
        vocab, avocab: tree / atom vocabularies for tensorization.
        reselect: reselect count forwarded to model.test.
        ori_smiles: original starting molecule (for similarity reporting).
        iternum: iteration index, for logging only.
        output: writable file-like object for the log line.

    Returns:
        (score1, score2, atomnum, new_smiles, sim, ori_sim, reselect); on
        failure returns the input molecule with zero improvement.
    """
    mol = Chem.MolFromSmiles(smiles)
    atomnum = mol.GetNumAtoms()
    tree = get_tree(smiles)
    score1 = penalized_logp(smiles)
    try:
        xbatch = MolTree.tensorize([tree], vocab, avocab, target=False)
        new_smiles, sim, reselect, score11, score2 = model.test(
            xbatch, tree, lr=lr, reselect_num=reselect)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # SystemExit and KeyboardInterrupt.
        ori_sim = similarity(smiles, ori_smiles)
        return score1, score1, atomnum, smiles, 1.0, ori_sim, 0

    if smiles == new_smiles:
        s = "iter: %d sim: 0.00 ori_sim: 0.00 imp: 0.00 cannot decode\n" % (iternum)
        ori_sim = similarity(smiles, ori_smiles)
    else:
        ori_sim = similarity(new_smiles, ori_smiles)
        # NOTE(review): `s` is only assigned for reselect in {0, 1}; any other
        # value would raise NameError at output.write — confirm model.test's
        # contract before hardening.
        if reselect == 0:
            s = "iter: %d sim: %.2f ori_sim: %.4f imp: %.2f decode molecule %s\n" % (iternum, sim, ori_sim, score2-score1, new_smiles)
        elif reselect == 1:
            s = "iter: %d sim: %.2f ori_sim: %.4f imp: %.2f decode molecule %s reselect\n" % (iternum, sim, ori_sim, score2-score1, new_smiles)
    output.write(s)
    print(s)
    return score1, score2, atomnum, new_smiles, sim, ori_sim, reselect
def reconstruct1(self, smiles, prob_decode=False):
    """Encode `smiles` and sample a latent pair, but do NOT decode.

    Returns (tree_vec, mol_vec, prob_decode) so the caller can decode later.
    """
    mol_tree = MolTree(smiles)
    mol_tree.recover()
    _, tree_vec, mol_vec = self.encode([mol_tree])

    tree_mean = self.T_mean(tree_vec)
    tree_log_var = -torch.abs(self.T_var(tree_vec))  # Following Mueller et al.
    mol_mean = self.G_mean(mol_vec)
    mol_log_var = -torch.abs(self.G_var(mol_vec))  # Following Mueller et al.

    half = int(self.latent_size / 2)
    noise = create_var(torch.randn(1, half), False)
    tree_vec = tree_mean + torch.exp(tree_log_var / 2) * noise
    noise = create_var(torch.randn(1, half), False)
    mol_vec = mol_mean + torch.exp(mol_log_var / 2) * noise
    return tree_vec, mol_vec, prob_decode
def tree_test():
    """Print the junction-tree decomposition of each SMILES read from stdin.

    For every input line, prints a separator, the SMILES, and each clique
    node with its neighbor cliques.
    """
    for s in sys.stdin:
        s = s.split()[0]
        tree = MolTree(s)
        # Were Python 2 print statements; file elsewhere uses print().
        print('-------------------------------------------')
        print(s)
        for node in tree.nodes:
            print(node.smiles, [x.smiles for x in node.neighbors])
def encode_latent_mean(self, smiles_list):
    """Return the mean (no sampling) latent vectors, tree ++ graph, for a
    list of SMILES strings."""
    trees = []
    for smi in smiles_list:
        tree = MolTree(smi)
        tree.recover()
        trees.append(tree)
    _, tree_vec, mol_vec = self.encode(trees)
    return torch.cat([self.T_mean(tree_vec), self.G_mean(mol_vec)], dim=1)
def recon_eval(self, smiles):
    """Decode 10x10 stochastic reconstructions of `smiles`.

    Draws 10 latent samples around the encoding; each sample is decoded 10
    times with prob_decode=True. Returns the flat list of 100 SMILES.
    """
    mol_tree = MolTree(smiles)
    mol_tree.recover()
    _, tree_vec, mol_vec = self.encode([mol_tree])

    tree_mean = self.T_mean(tree_vec)
    tree_log_var = -torch.abs(self.T_var(tree_vec))  # Following Mueller et al.
    mol_mean = self.G_mean(mol_vec)
    mol_log_var = -torch.abs(self.G_var(mol_vec))  # Following Mueller et al.

    half = int(self.latent_size / 2)
    all_smiles = []
    for _ in range(10):
        eps = create_var(torch.randn(1, half), False)
        tree_vec = tree_mean + torch.exp(tree_log_var / 2) * eps
        eps = create_var(torch.randn(1, half), False)
        mol_vec = mol_mean + torch.exp(mol_log_var / 2) * eps
        for _ in range(10):
            all_smiles.append(self.decode(tree_vec, mol_vec, prob_decode=True))
    return all_smiles
def encode_latent_from_smiles(self, smiles_list):
    """Encode SMILES strings into sampled latent vectors (tree ++ graph)."""
    trees = [MolTree(smi) for smi in smiles_list]
    _, jtenc_holder, mpn_holder = tensorize(trees, self.vocab, assm=False)
    tree_vecs, _, mol_vecs = self.encode(jtenc_holder, mpn_holder)
    z_tree, _ = self.rsample(tree_vecs, self.T_mean, self.T_var)
    z_mol, _ = self.rsample(mol_vecs, self.G_mean, self.G_var)
    return torch.cat([z_tree, z_mol], dim=-1)
def reconstruct(self, smiles, prob_decode=False, DataFrame=None):
    """Encode `smiles`, sample a latent point, optionally record it, decode.

    Args:
        smiles: input molecule SMILES.
        prob_decode: passed through to self.decode.
        DataFrame: optional pandas DataFrame; when given, the concatenated
            latent vector is stored at row `smiles`.

    Returns:
        The decoded SMILES string from self.decode.
    """
    mol_tree = MolTree(smiles)
    mol_tree.recover()
    _, tree_vec, mol_vec = self.encode([mol_tree])

    tree_mean = self.T_mean(tree_vec)
    tree_log_var = -torch.abs(self.T_var(tree_vec))  # Following Mueller et al.
    mol_mean = self.G_mean(mol_vec)
    mol_log_var = -torch.abs(self.G_var(mol_vec))  # Following Mueller et al.

    epsilon = create_var(torch.randn(1, int(self.latent_size / 2)), False)
    tree_vec = tree_mean + torch.exp(tree_log_var / 2) * epsilon
    epsilon = create_var(torch.randn(1, int(self.latent_size / 2)), False)
    mol_vec = mol_mean + torch.exp(mol_log_var / 2) * epsilon

    # Fixed: the default DataFrame=None was dereferenced unconditionally,
    # raising AttributeError whenever no DataFrame was supplied.
    if DataFrame is not None:
        thethird = torch.cat((tree_vec, mol_vec), 1)
        DataFrame.loc[smiles] = thethird.to('cpu').data.numpy()[0]
    return self.decode(tree_vec, mol_vec, prob_decode)
def __getitem__(self, item):
    """Build the junction tree for the item-th SMILES.

    Returns None when RDKit fails to parse the molecule; otherwise the
    recovered and assembled MolTree.
    """
    tree = MolTree(self.data[item])
    if tree.mol is None:
        return None
    tree.recover()
    tree.assemble()
    return tree
def __getitem__(self, idx):
    """Return the recovered and assembled junction tree for the idx-th SMILES."""
    tree = MolTree(self.data[idx])
    tree.recover()
    tree.assemble()
    return tree
def count():
    """Tally assembly-candidate statistics over stdin SMILES.

    Returns:
        (cnt, n): total candidate count across all clique nodes, and the
        total number of clique nodes. Previously both totals were computed
        and then silently discarded.
    """
    cnt, n = 0, 0
    for s in sys.stdin:
        s = s.split()[0]
        tree = MolTree(s)
        tree.recover()
        tree.assemble()
        for node in tree.nodes:
            cnt += len(node.cands)
        n += len(tree.nodes)
    return cnt, n
def enum_test():
    """Check that every node's ground-truth label is among its enumerated
    assembly candidates; print any node where enumeration missed the label.
    """
    for s in sys.stdin:
        s = s.split()[0]
        tree = MolTree(s)
        tree.recover()
        tree.assemble()
        for node in tree.nodes:
            if node.label not in node.cands:
                # Were Python 2 print statements; file elsewhere uses print().
                print(tree.smiles)
                print(node.smiles, [x.smiles for x in node.neighbors])
                print(node.label, len(node.cands))
def predict(smiles, lr, vocab, avocab, reselect, ori_smiles, iternum, output,
            prop="logp", sim_type="binary"):
    """Run one model.test optimization step on `smiles` and log the outcome.

    Args:
        smiles: current molecule SMILES.
        lr: NOTE(review) — accepted for signature compatibility but not
            forwarded to model.test in this variant.
        vocab, avocab: tree / atom vocabularies for tensorization.
        reselect: reselect count forwarded to model.test.
        ori_smiles: original starting molecule (for similarity reporting).
        iternum: iteration index, for logging only.
        output: writable file-like object for the log line.
        prop: property name scored by get_prop.
        sim_type: similarity kind forwarded to similarity().

    Returns:
        (score1, score2, atomnum, new_smiles, sim, ori_sim, reselect); on
        failure returns the input molecule with zero improvement.
    """
    mol = Chem.MolFromSmiles(smiles)
    atomnum = mol.GetNumAtoms()
    tree = get_tree(smiles)
    score1 = get_prop(smiles, prop=prop)
    try:
        xbatch = MolTree.tensorize([tree], vocab, avocab, target=False)
        new_smiles, sim, reselect, score11, score2 = model.test(
            xbatch, tree, reselect_num=reselect, prop=prop, sim_type=sim_type)
    except Exception as e:
        print(e)
        print("cannot process molecule %s at iteration %d" % (smiles, iternum))
        return score1, score1, atomnum, smiles, 1.0, 0.0, 0

    if smiles == new_smiles:
        s = "iter: %d sim: 0.00 ori_sim: 0.00 imp: 0.00 cannot decode\n" % (
            iternum)
        ori_sim = similarity(smiles, ori_smiles, sim_type=sim_type)
    else:
        ori_sim = similarity(new_smiles, ori_smiles, sim_type=sim_type)
        # NOTE(review): `s` is only assigned for reselect in {0, 1}; any other
        # value would raise NameError at output.write — confirm model.test's
        # contract before hardening.
        if reselect == 0:
            s = "iter: %d sim: %.2f ori_sim: %.4f prop1: %.2f prop2: %.2f imp: %.2f decode molecule %s\n" % (
                iternum, sim, ori_sim, score1, score2, score2 - score1, new_smiles)
        elif reselect == 1:
            s = "iter: %d sim: %.2f ori_sim: %.4f prop1: %.2f prop2: %.2f imp: %.2f decode molecule %s reselect\n" % (
                iternum, sim, ori_sim, score1, score2, score2 - score1, new_smiles)
    output.write(s)
    print(s)
    return score1, score2, atomnum, new_smiles, sim, ori_sim, reselect
def tensorize(smiles, assm=True):
    """Build a junction tree from `smiles`; optionally enumerate assembly
    candidates (ensuring the true label is among them), then drop the RDKit
    mol objects so the tree pickles compactly."""
    tree = MolTree(smiles)
    tree.recover()
    if assm:
        tree.assemble()
        for node in tree.nodes:
            if node.label not in node.cands:
                node.cands.append(node.label)
    del tree.mol
    for node in tree.nodes:
        del node.mol
    return tree
def predict(smiles, model, vocab, avocab, reselect, ori_smiles, iternum, prop="logp"):
    """Run one model.test optimization step and collect result statistics.

    Args:
        smiles: current molecule SMILES.
        model: model exposing .test(...).
        vocab, avocab: tree / atom vocabularies for tensorization.
        reselect: reselect count forwarded to model.test.
        ori_smiles: original starting molecule (for similarity reporting).
        iternum: iteration index (currently unused in this variant).
        prop: property name scored by get_prop.

    Returns:
        [smiles, new_smiles, sim, ori_sim, score1, score2, atomnum1, atomnum2];
        on failure, the input molecule is echoed with sim 1.0.
    """
    mol = Chem.MolFromSmiles(smiles)
    atomnum1 = mol.GetNumAtoms()
    tree = get_tree(smiles)
    score1 = get_prop(smiles, prop=prop)
    try:
        xbatch = MolTree.tensorize([tree], vocab, avocab, target=False)
        new_smiles, sim, reselect, score11, score2 = model.test(
            xbatch, tree, reselect_num=reselect, prop=prop)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # SystemExit and KeyboardInterrupt.
        ori_sim = similarity(smiles, ori_smiles, "binary")
        result = [smiles, smiles, 1.0, ori_sim, score1, score1, atomnum1, atomnum1]
        return result

    if smiles == new_smiles:
        ori_sim = similarity(smiles, ori_smiles, "binary")
    else:
        ori_sim = similarity(new_smiles, ori_smiles, "binary")
    atomnum2 = Chem.MolFromSmiles(new_smiles).GetNumAtoms()
    result = [smiles, new_smiles, sim, ori_sim, score1, score2, atomnum1, atomnum2]
    return result
def tensorize_pair(smiles_pair):
    """Build junction trees for both molecules of a pair and the edit path
    between them.

    Returns (source_tree, target_tree, path).
    """
    src_tree = MolTree(smiles_pair[0])
    tgt_tree = MolTree(smiles_pair[1])
    return (src_tree, tgt_tree, compute_path(src_tree, tgt_tree))
def __getitem__(self, idx):
    """Return (junction tree, property value) for the idx-th molecule."""
    tree = MolTree(self.data[idx])
    tree.recover()
    tree.assemble()
    return tree, self.prop_data[idx]
def get_tree(smiles, assm=True):
    """Canonicalize `smiles` via an RDKit round-trip and wrap it in a MolTree.

    NOTE(review): `assm` is accepted but never used here — kept for
    signature compatibility with callers.
    """
    canonical = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    return MolTree(canonical)
continue global_amap[nei_id][nei_atom] = global_amap[cur_node.nid][ctr_atom] cur_mol = attach_mols(cur_mol, children, [], global_amap) #father is already attached for nei_node in children: if not nei_node.is_leaf: dfs_assemble(cur_mol, global_amap, label_amap, nei_node, cur_node) if __name__ == "__main__": import sys from mol_tree import MolTree lg = rdkit.RDLogger.logger() lg.setLevel(rdkit.RDLogger.CRITICAL) smiles = ["O=C1[C@@H]2C=C[C@@H](C=CC2)C1(c1ccccc1)c1ccccc1","O=C([O-])CC[C@@]12CCCC[C@]1(O)OC(=O)CC2", "ON=C1C[C@H]2CC3(C[C@@H](C1)c1ccccc12)OCCO3", "C[C@H]1CC(=O)[C@H]2[C@@]3(O)C(=O)c4cccc(O)c4[C@@H]4O[C@@]43[C@@H](O)C[C@]2(O)C1", 'Cc1cc(NC(=O)CSc2nnc3c4ccccc4n(C)c3n2)ccc1Br', 'CC(C)(C)c1ccc(C(=O)N[C@H]2CCN3CCCc4cccc2c43)cc1', "O=c1c2ccc3c(=O)n(-c4nccs4)c(=O)c4ccc(c(=O)n1-c1nccs1)c2c34", "O=C(N1CCc2c(F)ccc(F)c2C1)C1(O)Cc2ccccc2C1"] mol_tree = MolTree("C") assert len(mol_tree.nodes) > 0 def tree_test(): for s in sys.stdin: s = s.split()[0] tree = MolTree(s) print ('-------------------------------------------') print (s) for node in tree.nodes: print(node.smiles, [x.smiles for x in node.neighbors]) def decode_test(): wrong = 0 for tot,s in enumerate(sys.stdin): s = s.split()[0]
key=lambda x: x.mol.GetNumAtoms(), reverse=True) singletons = [nei for nei in neis if nei.mol.GetNumAtoms() == 1] neighbors = singletons + neighbors cands, aroma_scores = enum_assemble(node_x, neighbors) return len(cands) > 0 # and sum(aroma_scores) >= 0 if __name__ == "__main__": smiles = [ "O=C1[C@@H]2C=C[C@@H](C=CC2)C1(c1ccccc1)c1ccccc1", "O=C([O-])CC[C@@]12CCCC[C@]1(O)OC(=O)CC2", "ON=C1C[C@H]2CC3(C[C@@H](C1)c1ccccc12)OCCO3", "C[C@H]1CC(=O)[C@H]2[C@@]3(O)C(=O)c4cccc(O)c4[C@@H]4O[C@@]43[C@@H](O)C[C@]2(O)C1", 'Cc1cc(NC(=O)CSc2nnc3c4ccccc4n(C)c3n2)ccc1Br', 'CC(C)(C)c1ccc(C(=O)N[C@H]2CCN3CCCc4cccc2c43)cc1', "O=c1c2ccc3c(=O)n(-c4nccs4)c(=O)c4ccc(c(=O)n1-c1nccs1)c2c34", "O=C(N1CCc2c(F)ccc(F)c2C1)C1(O)Cc2ccccc2C1" ] for s in smiles: print s tree = MolTree(s) for i, node in enumerate(tree.nodes): node.idx = i stack = [] dfs(stack, tree.nodes[0], -1) for x, y, d in stack: print x.smiles, y.smiles, d print '------------------------------'