Example 1
    def decode_test():
        wrong = 0
        for tot, s in enumerate(sys.stdin):
            s = s.split()[0]
            tree = MolTree(s)
            tree.recover()

            cur_mol = copy_edit_mol(tree.nodes[0].mol)
            global_amap = [{}] + [{} for node in tree.nodes]
            global_amap[1] = {
                atom.GetIdx(): atom.GetIdx()
                for atom in cur_mol.GetAtoms()
            }

            dfs_assemble(cur_mol, global_amap, [], tree.nodes[0], None)

            cur_mol = cur_mol.GetMol()
            cur_mol = Chem.MolFromSmiles(Chem.MolToSmiles(cur_mol))
            set_atommap(cur_mol)
            dec_smiles = Chem.MolToSmiles(cur_mol)

            gold_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(s))
            if gold_smiles != dec_smiles:
                print(gold_smiles, dec_smiles)
                wrong += 1
            print(wrong, tot + 1)
Example 2
    def optimize(self, smiles, sim_cutoff, lr=2.0, num_iter=20):
        mol_tree = MolTree(smiles)
        mol_tree.recover()
        _, tree_vec, mol_vec = self.encode([mol_tree])

        mol = Chem.MolFromSmiles(smiles)
        fp1 = AllChem.GetMorganFingerprint(mol, 2)

        tree_mean = self.T_mean(tree_vec)
        # Following Mueller et al.
        tree_log_var = -torch.abs(self.T_var(tree_vec))
        mol_mean = self.G_mean(mol_vec)
        # Following Mueller et al.
        mol_log_var = -torch.abs(self.G_var(mol_vec))
        mean = torch.cat([tree_mean, mol_mean], dim=1)
        log_var = torch.cat([tree_log_var, mol_log_var], dim=1)
        cur_vec = create_var(mean.data, True)

        visited = []
        for _ in range(num_iter):
            prop_val = self.propNN(cur_vec).squeeze()
            grad = torch.autograd.grad(prop_val, cur_vec)[0]
            cur_vec = cur_vec.data + lr * grad.data
            cur_vec = create_var(cur_vec, True)
            visited.append(cur_vec)

        l, r = 0, num_iter - 1
        while l < r - 1:
            mid = (l + r) // 2  # integer midpoint of the binary search over visited vectors
            new_vec = visited[mid]
            tree_vec, mol_vec = torch.chunk(new_vec, 2, dim=1)
            new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
            if new_smiles is None:
                r = mid - 1
                continue

            new_mol = Chem.MolFromSmiles(new_smiles)
            fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
            sim = DataStructs.TanimotoSimilarity(fp1, fp2)
            if sim < sim_cutoff:
                r = mid - 1
            else:
                l = mid

        tree_vec, mol_vec = torch.chunk(visited[l], 2, dim=1)
        new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
        if new_smiles is None:
            return smiles, 1.0
        new_mol = Chem.MolFromSmiles(new_smiles)
        fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
        sim = DataStructs.TanimotoSimilarity(fp1, fp2)
        if sim >= sim_cutoff:
            return new_smiles, sim
        else:
            return smiles, 1.0
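
A minimal driver sketch for the optimize method above, assuming a trained property-predicting JT-VAE instance (called model here); the input SMILES and hyperparameters are illustrative, not taken from the original code:

    # Hypothetical usage: gradient ascent in latent space on a single molecule.
    smiles = "CCOC(=O)c1ccc(O)cc1"  # illustrative input molecule
    new_smiles, sim = model.optimize(smiles, sim_cutoff=0.4, lr=2.0, num_iter=20)
    print(new_smiles, sim)  # modified molecule and the similarity score returned by optimize
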
Example 3
 def __getitem__(self, idx):
     batch_data = self.data[idx]
     
     tree1_batch = [dpair[0] for dpair in batch_data]
     tree2_batch = [dpair[1] for dpair in batch_data]
     
     
     x_batch = MolTree.tensorize(tree1_batch, self.vocab, self.avocab, target=False, add_target=self.add_target)
     y_batch = MolTree.tensorize(tree2_batch, self.vocab, self.avocab, target=True, add_target=self.add_target)
     
     return x_batch, y_batch, tree1_batch, tree2_batch
Example 4
    def reconstruct(self, smiles, prob_decode=False):
        mol_tree = MolTree(smiles)
        mol_tree.recover()
        _,tree_vec,mol_vec = self.encode([mol_tree])
        
        tree_mean = self.T_mean(tree_vec)
        tree_log_var = -torch.abs(self.T_var(tree_vec)) #Following Mueller et al.
        mol_mean = self.G_mean(mol_vec)
        mol_log_var = -torch.abs(self.G_var(mol_vec)) #Following Mueller et al.

        epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
        tree_vec = tree_mean + torch.exp(tree_log_var / 2) * epsilon
        epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
        mol_vec = mol_mean + torch.exp(mol_log_var / 2) * epsilon
        return self.decode(tree_vec, mol_vec, prob_decode)
Example 5
 def encode_from_smiles(self, smiles_list):
     tree_batch = [MolTree(s) for s in smiles_list]
     _, jtenc_holder, mpn_holder = tensorize(tree_batch,
                                             self.vocab,
                                             assm=False)
     tree_vecs, _, mol_vecs = self.encode(jtenc_holder, mpn_holder)
     return torch.cat([tree_vecs, mol_vecs], dim=-1)
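
A short usage sketch, assuming a trained model instance (model) that exposes encode_from_smiles; the molecules are illustrative:

    # Hypothetical usage: embed molecules into the concatenated tree/graph latent space.
    smiles_list = ["CCO", "c1ccccc1", "CC(=O)Nc1ccc(O)cc1"]
    latent_vecs = model.encode_from_smiles(smiles_list)
    print(latent_vecs.shape)  # one row per molecule, tree and graph vectors concatenated
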
Example 6
def predict(smiles, lr, vocab, avocab, reselect, ori_smiles, iternum, output):
    mol = Chem.MolFromSmiles(smiles)
    atomnum = mol.GetNumAtoms()
    tree = get_tree(smiles)
    score1 = penalized_logp(smiles)

    try:
        xbatch = MolTree.tensorize([tree], vocab, avocab, target=False)
        new_smiles, sim, reselect, score11, score2 = model.test(xbatch, tree, lr=lr, reselect_num=reselect)
    except Exception:
        ori_sim = similarity(smiles, ori_smiles)
        return score1, score1, atomnum, smiles, 1.0, ori_sim, 0
    
    if smiles == new_smiles:
        s = "iter: %d sim: 0.00 ori_sim: 0.00 imp: 0.00 cannot decode\n" % (iternum)
        ori_sim = similarity(smiles, ori_smiles)
    else:
        ori_sim = similarity(new_smiles, ori_smiles)
        if reselect == 0:
            s = "iter: %d sim: %.2f ori_sim: %.4f imp: %.2f decode molecule %s\n" % (iternum, sim, ori_sim, score2-score1, new_smiles)
        elif reselect == 1:
            s = "iter: %d sim: %.2f ori_sim: %.4f imp: %.2f decode molecule %s reselect\n" % (iternum, sim, ori_sim, score2-score1, new_smiles)

    output.write(s)
    print(s)
    return score1, score2, atomnum, new_smiles, sim, ori_sim, reselect
Example 7
    def reconstruct1(self, smiles, prob_decode=False):
        mol_tree = MolTree(smiles)
        mol_tree.recover()
        # print("tree built")
        _, tree_vec, mol_vec = self.encode([mol_tree])
        # print("encoded")
        tree_mean = self.T_mean(tree_vec)
        tree_log_var = -torch.abs(self.T_var(tree_vec))  # Following Mueller et al.
        mol_mean = self.G_mean(mol_vec)
        mol_log_var = -torch.abs(self.G_var(mol_vec))  # Following Mueller et al.

        epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
        tree_vec = tree_mean + torch.exp(tree_log_var / 2) * epsilon
        epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
        mol_vec = mol_mean + torch.exp(mol_log_var / 2) * epsilon

        return tree_vec, mol_vec, prob_decode
Example 8
 def tree_test():
     for s in sys.stdin:
         s = s.split()[0]
         tree = MolTree(s)
         print('-------------------------------------------')
         print(s)
         for node in tree.nodes:
             print(node.smiles, [x.smiles for x in node.neighbors])
Example 9
    def encode_latent_mean(self, smiles_list):
        mol_batch = [MolTree(s) for s in smiles_list]
        for mol_tree in mol_batch:
            mol_tree.recover()

        _, tree_vec, mol_vec = self.encode(mol_batch)
        tree_mean = self.T_mean(tree_vec)
        mol_mean = self.G_mean(mol_vec)
        return torch.cat([tree_mean, mol_mean], dim=1)
Example 10
 def recon_eval(self, smiles):
     mol_tree = MolTree(smiles)
     mol_tree.recover()
     _,tree_vec,mol_vec = self.encode([mol_tree])
     
     tree_mean = self.T_mean(tree_vec)
     tree_log_var = -torch.abs(self.T_var(tree_vec)) #Following Mueller et al.
     mol_mean = self.G_mean(mol_vec)
     mol_log_var = -torch.abs(self.G_var(mol_vec)) #Following Mueller et al.
     
     all_smiles = []
     for i in range(10):
         epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
         tree_vec = tree_mean + torch.exp(tree_log_var / 2) * epsilon
         epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
         mol_vec = mol_mean + torch.exp(mol_log_var / 2) * epsilon
         for j in range(10):
             new_smiles = self.decode(tree_vec, mol_vec, prob_decode=True)
             all_smiles.append(new_smiles)
     return all_smiles
Example 11
    def encode_latent_from_smiles(self, smiles_list):
        tree_batch = [MolTree(s) for s in smiles_list]
        _, jtenc_holder, mpn_holder = tensorize(tree_batch,
                                                self.vocab,
                                                assm=False)
        tree_vecs, _, mol_vecs = self.encode(jtenc_holder, mpn_holder)

        z_tree_vecs, _ = self.rsample(tree_vecs, self.T_mean, self.T_var)
        z_mol_vecs, _ = self.rsample(mol_vecs, self.G_mean, self.G_var)

        return torch.cat([z_tree_vecs, z_mol_vecs], dim=-1)
Example 12
    def reconstruct(self, smiles, prob_decode=False, DataFrame=None):
        mol_tree = MolTree(smiles)
        mol_tree.recover()
        # print("tree built")
        _, tree_vec, mol_vec = self.encode([mol_tree])
        # print("encoded")
        tree_mean = self.T_mean(tree_vec)
        tree_log_var = -torch.abs(self.T_var(tree_vec)) #Following Mueller et al.
        mol_mean = self.G_mean(mol_vec)
        mol_log_var = -torch.abs(self.G_var(mol_vec)) #Following Mueller et al.

        epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
        tree_vec = tree_mean + torch.exp(tree_log_var / 2) * epsilon
        epsilon = create_var(torch.randn(1, self.latent_size // 2), False)
        mol_vec = mol_mean + torch.exp(mol_log_var / 2) * epsilon
        thethird = torch.cat((tree_vec, mol_vec), 1)
        # print(thethird.to('cpu').data.numpy())
        DataFrame.loc[smiles] = thethird.to('cpu').data.numpy()[0]


        return self.decode(tree_vec, mol_vec, prob_decode)
Example 13
 def __getitem__(self, item):
     smiles = self.data[item]
     mol_tree = MolTree(smiles)
     if mol_tree.mol is None:
         return None
     mol_tree.recover()
     mol_tree.assemble()
     return mol_tree
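
Because this __getitem__ returns None when RDKit cannot parse a SMILES string, a DataLoader built on the dataset needs a collate function that drops those entries. A minimal sketch under that assumption (the collate function is not part of the original snippet):

    from torch.utils.data import DataLoader

    def skip_none_collate(batch):
        # Drop items whose MolTree construction failed and returned None.
        return [tree for tree in batch if tree is not None]

    # loader = DataLoader(dataset, batch_size=32, collate_fn=skip_none_collate)
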
Example 14
    def __getitem__(self, idx):
        smiles = self.data[idx]
        mol_tree = MolTree(smiles)
        #print(len(smiles))

        mol_tree.recover()
        mol_tree.assemble()
        return mol_tree
Example 15
 def count():
     cnt, n = 0, 0
     for s in sys.stdin:
         s = s.split()[0]
         tree = MolTree(s)
         tree.recover()
         tree.assemble()
         for node in tree.nodes:
             cnt += len(node.cands)
         n += len(tree.nodes)
Example 16
 def enum_test():
     for s in sys.stdin:
         s = s.split()[0]
         tree = MolTree(s)
         tree.recover()
         tree.assemble()
         for node in tree.nodes:
             if node.label not in node.cands:
                 print(tree.smiles)
                 print(node.smiles, [x.smiles for x in node.neighbors])
                 print(node.label, len(node.cands))
Example 17
def predict(smiles,
            lr,
            vocab,
            avocab,
            reselect,
            ori_smiles,
            iternum,
            output,
            prop="logp",
            sim_type="binary"):
    mol = Chem.MolFromSmiles(smiles)
    atomnum = mol.GetNumAtoms()
    tree = get_tree(smiles)
    score1 = get_prop(smiles, prop=prop)
    try:
        xbatch = MolTree.tensorize([tree], vocab, avocab, target=False)
        new_smiles, sim, reselect, score11, score2 = model.test(
            xbatch, tree, reselect_num=reselect, prop=prop, sim_type=sim_type)
    except Exception as e:
        print(e)
        print("cannot process molecule %s at iteration %d" % (smiles, iternum))
        return score1, score1, atomnum, smiles, 1.0, 0.0, 0
    #except:
    #    ori_sim = similarity(smiles, ori_smiles)
    #    return score1, score1, atomnum, smiles, 1.0, ori_sim, 0

    if smiles == new_smiles:
        s = "iter: %d sim: 0.00 ori_sim: 0.00 imp: 0.00 cannot decode\n" % (
            iternum)
        ori_sim = similarity(smiles, ori_smiles, sim_type=sim_type)
    else:
        ori_sim = similarity(new_smiles, ori_smiles, sim_type=sim_type)
        if reselect == 0:
            s = "iter: %d sim: %.2f ori_sim: %.4f prop1: %.2f prop2: %.2f imp: %.2f decode molecule %s\n" % (
                iternum, sim, ori_sim, score1, score2, score2 - score1,
                new_smiles)
        elif reselect == 1:
            s = "iter: %d sim: %.2f ori_sim: %.4f prop1: %.2f prop2: %.2f imp: %.2f decode molecule %s reselect\n" % (
                iternum, sim, ori_sim, score1, score2, score2 - score1,
                new_smiles)

    output.write(s)
    print(s)
    return score1, score2, atomnum, new_smiles, sim, ori_sim, reselect
Example 18
def tensorize(smiles, assm=True):
    mol_tree = MolTree(smiles)
    mol_tree.recover()
    if assm:
        mol_tree.assemble()
        for node in mol_tree.nodes:
            if node.label not in node.cands:
                node.cands.append(node.label)

    del mol_tree.mol
    for node in mol_tree.nodes:
        del node.mol

    return mol_tree
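
In preprocessing scripts, a tensorize function like the one above is typically mapped over a list of training SMILES and the resulting junction trees pickled to disk. A minimal single-process sketch with illustrative inputs and an illustrative output path:

    import pickle

    smiles_list = ["CCO", "c1ccccc1O", "CC(=O)Nc1ccc(O)cc1"]  # illustrative molecules
    trees = [tensorize(s) for s in smiles_list]               # one pruned MolTree per SMILES
    with open("trees.pkl", "wb") as fout:                     # illustrative output path
        pickle.dump(trees, fout)
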
Example 19
def predict(smiles, model, vocab, avocab, reselect, ori_smiles, iternum, prop="logp"):
    mol = Chem.MolFromSmiles(smiles)
    atomnum1 = mol.GetNumAtoms()
    tree = get_tree(smiles)
    score1 = get_prop(smiles, prop=prop)

    try:    
        xbatch = MolTree.tensorize([tree], vocab, avocab, target=False)
        new_smiles, sim, reselect, score11, score2 = model.test(xbatch, tree, reselect_num=reselect, prop=prop)
    except Exception:
        ori_sim = similarity(smiles, ori_smiles, "binary")
        result = [smiles, smiles, 1.0, ori_sim, score1, score1, atomnum1, atomnum1]
        return result
    
    if smiles == new_smiles:
        ori_sim = similarity(smiles, ori_smiles, "binary")
    else:
        ori_sim = similarity(new_smiles, ori_smiles, "binary")
    
    atomnum2 = Chem.MolFromSmiles(new_smiles).GetNumAtoms()
    result = [smiles, new_smiles, sim, ori_sim, score1, score2, atomnum1, atomnum2]
    return result
Example 20
def tensorize_pair(smiles_pair):
    mol_tree0 = MolTree(smiles_pair[0])
    mol_tree1 = MolTree(smiles_pair[1])
    path = compute_path(mol_tree0, mol_tree1)
    return (mol_tree0, mol_tree1, path)
Example 21
 def __getitem__(self, idx):
     smiles = self.data[idx]
     mol_tree = MolTree(smiles)
     mol_tree.recover()
     mol_tree.assemble()
     return mol_tree, self.prop_data[idx]
Example 22
def get_tree(smiles, assm=True):
    smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    mol_tree = MolTree(smiles)
    return mol_tree
Example 23
            continue
        global_amap[nei_id][nei_atom] = global_amap[cur_node.nid][ctr_atom]
    
    cur_mol = attach_mols(cur_mol, children, [], global_amap) #father is already attached
    for nei_node in children:
        if not nei_node.is_leaf:
            dfs_assemble(cur_mol, global_amap, label_amap, nei_node, cur_node)

if __name__ == "__main__":
    import sys
    from mol_tree import MolTree
    lg = rdkit.RDLogger.logger() 
    lg.setLevel(rdkit.RDLogger.CRITICAL)
    
    smiles = ["O=C1[C@@H]2C=C[C@@H](C=CC2)C1(c1ccccc1)c1ccccc1","O=C([O-])CC[C@@]12CCCC[C@]1(O)OC(=O)CC2", "ON=C1C[C@H]2CC3(C[C@@H](C1)c1ccccc12)OCCO3", "C[C@H]1CC(=O)[C@H]2[C@@]3(O)C(=O)c4cccc(O)c4[C@@H]4O[C@@]43[C@@H](O)C[C@]2(O)C1", 'Cc1cc(NC(=O)CSc2nnc3c4ccccc4n(C)c3n2)ccc1Br', 'CC(C)(C)c1ccc(C(=O)N[C@H]2CCN3CCCc4cccc2c43)cc1', "O=c1c2ccc3c(=O)n(-c4nccs4)c(=O)c4ccc(c(=O)n1-c1nccs1)c2c34", "O=C(N1CCc2c(F)ccc(F)c2C1)C1(O)Cc2ccccc2C1"]
    mol_tree = MolTree("C")
    assert len(mol_tree.nodes) > 0

    def tree_test():
        for s in sys.stdin:
            s = s.split()[0]
            tree = MolTree(s)
            print ('-------------------------------------------')
            print (s)
            for node in tree.nodes:
                print(node.smiles, [x.smiles for x in node.neighbors])

    def decode_test():
        wrong = 0
        for tot,s in enumerate(sys.stdin):
            s = s.split()[0]
Example 24
                       key=lambda x: x.mol.GetNumAtoms(),
                       reverse=True)
    singletons = [nei for nei in neis if nei.mol.GetNumAtoms() == 1]
    neighbors = singletons + neighbors
    cands, aroma_scores = enum_assemble(node_x, neighbors)
    return len(cands) > 0  # and sum(aroma_scores) >= 0


if __name__ == "__main__":
    smiles = [
        "O=C1[C@@H]2C=C[C@@H](C=CC2)C1(c1ccccc1)c1ccccc1",
        "O=C([O-])CC[C@@]12CCCC[C@]1(O)OC(=O)CC2",
        "ON=C1C[C@H]2CC3(C[C@@H](C1)c1ccccc12)OCCO3",
        "C[C@H]1CC(=O)[C@H]2[C@@]3(O)C(=O)c4cccc(O)c4[C@@H]4O[C@@]43[C@@H](O)C[C@]2(O)C1",
        'Cc1cc(NC(=O)CSc2nnc3c4ccccc4n(C)c3n2)ccc1Br',
        'CC(C)(C)c1ccc(C(=O)N[C@H]2CCN3CCCc4cccc2c43)cc1',
        "O=c1c2ccc3c(=O)n(-c4nccs4)c(=O)c4ccc(c(=O)n1-c1nccs1)c2c34",
        "O=C(N1CCc2c(F)ccc(F)c2C1)C1(O)Cc2ccccc2C1"
    ]
    for s in smiles:
        print(s)
        tree = MolTree(s)
        for i, node in enumerate(tree.nodes):
            node.idx = i

        stack = []
        dfs(stack, tree.nodes[0], -1)
        for x, y, d in stack:
            print(x.smiles, y.smiles, d)
        print('------------------------------')