Beispiel #1
0
def get_changed_bonds(rxn_smi):
    """Return the set of bond changes described by an atom-mapped reaction.

    Args:
        rxn_smi: reaction SMILES of the form ``reactants>agents>products``.
            Only the first and third ``>``-separated fields are parsed.
            Every atom that takes part in a bond must carry a
            ``molAtomMapNumber`` property.

    Returns:
        set: tuples ``(map_a, map_b, new_order)`` where ``map_a``/``map_b``
        are the ``molAtomMapNumber`` property values of the two atoms as
        returned by ``GetProp`` (sorted with ``sorted``; presumably strings,
        so the ordering is lexicographic -- TODO confirm) and ``new_order``
        is the product bond order from ``GetBondTypeAsDouble()``, or ``0.0``
        when the bond is absent from the products.
    """
    parts = rxn_smi.split('>')
    reactants = Chem.MolFromSmiles(parts[0])
    products = Chem.MolFromSmiles(parts[2])

    # A set gives O(1) membership tests inside the reactant-bond loop below.
    conserved_maps = {
        a.GetProp('molAtomMapNumber')
        for a in products.GetAtoms() if a.HasProp('molAtomMapNumber')
    }

    def _mapped_pair(bond):
        """Return the sorted pair of map-number values of a bond's atoms."""
        return tuple(sorted(
            [bond.GetBeginAtom().GetProp('molAtomMapNumber'),
             bond.GetEndAtom().GetProp('molAtomMapNumber')]))

    # Reactant bonds, ignoring those where neither atom appears in the
    # products (such bonds cannot be tracked across the reaction).
    bonds_prev = {}
    for bond in reactants.GetBonds():
        pair = _mapped_pair(bond)
        if pair[0] not in conserved_maps and pair[1] not in conserved_maps:
            continue
        bonds_prev[pair] = bond.GetBondTypeAsDouble()

    bonds_new = {}
    for bond in products.GetBonds():
        bonds_new[_mapped_pair(bond)] = bond.GetBondTypeAsDouble()

    bond_changes = set()
    for pair, old_order in bonds_prev.items():
        if pair not in bonds_new:
            bond_changes.add((pair[0], pair[1], 0.0))  # lost bond
        elif old_order != bonds_new[pair]:
            bond_changes.add((pair[0], pair[1], bonds_new[pair]))  # changed bond
    for pair, new_order in bonds_new.items():
        if pair not in bonds_prev:
            bond_changes.add((pair[0], pair[1], new_order))  # new bond

    return bond_changes
Beispiel #2
0
def edit_mol(rmol, edits, tatoms):
    """Apply bond edits to ``rmol`` and return the resulting product SMILES.

    Args:
        rmol: RDKit molecule whose atoms carry atom-map numbers.
        edits: iterable of ``(x, y, t, v)`` tuples. ``x`` and ``y`` are
            atom-map numbers minus one, ``t`` is the new bond order used as
            a key into ``BOND_FLOAT_TO_TYPE`` (``t <= 0`` only removes the
            bond) and ``v`` is not used here.
        tatoms: set of atom ids (map number minus one); a fragment of the
            edited molecule is kept only if it contains at least one of them.

    Returns:
        str: SMILES of the kept fragments with atom-map numbers reset to 0,
        sorted and joined with ``'.'``.
    """
    new_mol = Chem.RWMol(rmol)
    for atom in new_mol.GetAtoms():
        atom.SetNumExplicitHs(0)

    # (atom-map number - 1) -> atom index in rmol
    amap = {
        atom.GetAtomMapNum() - 1: atom.GetIdx()
        for atom in rmol.GetAtoms()
    }

    for x, y, t, _unused in edits:
        begin, end = amap[x], amap[y]
        # Drop whatever bond is currently there, then add the new one if any.
        if new_mol.GetBondBetweenAtoms(begin, end) is not None:
            new_mol.RemoveBond(begin, end)
        if t > 0:
            new_mol.AddBond(begin, end, BOND_FLOAT_TO_TYPE[t])

    kept = []
    for fragment_smiles in Chem.MolToSmiles(new_mol.GetMol()).split('.'):
        fragment = Chem.MolFromSmiles(fragment_smiles)
        if fragment is None:
            continue
        fragment_ids = {a.GetAtomMapNum() - 1 for a in fragment.GetAtoms()}
        if not (fragment_ids & tatoms):
            continue
        for a in fragment.GetAtoms():
            a.SetAtomMapNum(0)
        kept.append(fragment)

    return '.'.join(sorted([Chem.MolToSmiles(fragment) for fragment in kept]))
Beispiel #3
0
    def __init__(self, smiles=None, rdk=None, conv_enabled=False):
        """Constructor
        Keyword Arguments:
            smiles {str} -- SMILES representation of a molecule (default: {None})
            rdk {rdkit Mol} -- molecule as an RDKit object (default: {None})
            conv_enabled {bool} -- whether to set both smiles and rdk
               here (True) or store the arguments as given and defer the
               conversion until needed (default: {False})
        Raises:
            ValueError -- only when conv_enabled is True: if smiles is a
                string that cannot be parsed, or if neither a smiles
                string nor a rdkit mol is provided
        """
        if conv_enabled:
            if isinstance(smiles, str):
                # Parsing doubles as validation; a given rdk is overwritten.
                rdk = Chem.MolFromSmiles(smiles)
                if rdk is None:
                    # Explicit raise instead of assert: asserts vanish under
                    # -O and would raise AssertionError, not ValueError.
                    raise ValueError(f"Could not parse SMILES: {smiles!r}")
            elif rdk is not None:
                smiles = Chem.MolToSmiles(rdk)
            else:
                raise ValueError("Invalid arguments")

        self.smiles = smiles
        self.rdk = rdk
        self.graph = None  # should be obtained from rdk when needed
        self.synthesis_path = []  # list of Reactions
        self.begin_flag = True
Beispiel #4
0
def smiles2graph(smiles, idxfunc=lambda x: x.GetIdx()):
    """Convert a SMILES string into feature and neighbour arrays.

    Args:
        smiles: SMILES string to parse.
        idxfunc: callable mapping an atom to its row index
            (defaults to the atom's own index).

    Returns:
        tuple: ``(fatoms, fbonds, atom_nb, bond_nb, num_nbs)`` where
        ``fatoms`` is ``(n_atoms, atom_fdim)``, ``fbonds`` is
        ``(max(n_bonds, 1), bond_fdim)``, ``atom_nb``/``bond_nb`` are
        ``(n_atoms, max_nb)`` int32 tables of neighbouring atom rows and
        bond indices, and ``num_nbs`` counts the filled slots per atom.

    Raises:
        ValueError: if the SMILES cannot be parsed.
        Exception: if ``idxfunc`` yields a row ``>= n_atoms`` or an atom
            already has ``max_nb`` neighbours.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse smiles string:", smiles)

    atom_count = mol.GetNumAtoms()
    bond_rows = max(mol.GetNumBonds(), 1)  # at least one row, even with no bonds
    fatoms = np.zeros((atom_count, atom_fdim))
    fbonds = np.zeros((bond_rows, bond_fdim))
    atom_nb = np.zeros((atom_count, max_nb), dtype=np.int32)
    bond_nb = np.zeros((atom_count, max_nb), dtype=np.int32)
    num_nbs = np.zeros((atom_count, ), dtype=np.int32)

    for atom in mol.GetAtoms():
        row = idxfunc(atom)
        if row >= atom_count:
            raise Exception(smiles)
        fatoms[row] = atom_features(atom)

    for bond in mol.GetBonds():
        begin = idxfunc(bond.GetBeginAtom())
        end = idxfunc(bond.GetEndAtom())
        bond_idx = bond.GetIdx()
        # Next free neighbour slot of each endpoint, read before any write.
        begin_slot = num_nbs[begin]
        end_slot = num_nbs[end]
        if begin_slot == max_nb or end_slot == max_nb:
            raise Exception(smiles)
        atom_nb[begin, begin_slot] = end
        atom_nb[end, end_slot] = begin
        bond_nb[begin, begin_slot] = bond_idx
        bond_nb[end, end_slot] = bond_idx
        num_nbs[begin] += 1
        num_nbs[end] += 1
        fbonds[bond_idx] = bond_features(bond)

    return fatoms, fbonds, atom_nb, bond_nb, num_nbs
Beispiel #5
0
def get_bond_label(r, edits, max_natoms):
    """Build dense and sparse bond-change labels for one reactant set.

    Args:
        r: reactant SMILES; only its atom count is used here.
        edits: ``';'``-separated edits, each ``"a1-a2-bo"`` with 1-based
            atom numbers and a bond order that is a key of ``bo_to_index``.
        max_natoms: padded atom count of the batch.

    Returns:
        tuple: ``(labels, sp_labels)``. ``labels`` is a flat array with one
        entry per ``(i, j, k)`` for ``i, j < max_natoms`` and
        ``k < len(bo_to_index)``; entries on the diagonal or beyond the
        molecule's atom count hold ``INVALID_BOND``. ``sp_labels`` lists the
        flat indices ``i * max_natoms * nbos + j * nbos + k`` of the
        positive entries.
    """
    n_atoms = Chem.MolFromSmiles(r).GetNumAtoms()
    rmap = np.zeros((max_natoms, max_natoms, nbos))

    for edit in edits.split(';'):
        first, second, order = edit.split('-')
        lo, hi = sorted((int(first) - 1, int(second) - 1))
        channel = bo_to_index[float(order)]
        # Mark the edit symmetrically.
        rmap[lo, hi, channel] = rmap[hi, lo, channel] = 1

    n_orders = len(bo_to_index)
    labels = []
    sp_labels = []
    for i in range(max_natoms):
        for j in range(max_natoms):
            if i == j or i >= n_atoms or j >= n_atoms:
                # Padding or self-pair: mask every bond-order channel.
                labels.extend([INVALID_BOND] * n_orders)
                continue
            for k in range(n_orders):
                value = rmap[i, j, k]
                labels.append(value)
                if value == 1:
                    # TODO: check if this is consistent with how TF does flattening
                    sp_labels.append(i * max_natoms * nbos + j * nbos + k)
    return np.array(labels), sp_labels
Beispiel #6
0
def get_feature_batch(r_list):
    """Stack the pairwise binary features of a batch of reactant SMILES.

    Every entry is padded to the largest atom count found in ``r_list``.

    Args:
        r_list: sequence of reactant SMILES strings.

    Returns:
        numpy.ndarray: ``get_bin_feature`` outputs stacked along axis 0
        (an empty array for an empty batch).
    """
    max_natoms = max(
        (Chem.MolFromSmiles(r).GetNumAtoms() for r in r_list), default=0)
    return np.array([get_bin_feature(r, max_natoms) for r in r_list])
Beispiel #7
0
def get_bin_feature(r, max_natoms):
    '''
    Describe every ordered atom pair of the reactants: the bond between the
    two atoms (if any) and whether they belong to the same molecule.
    It is used in the global attention mechanism.

    Atoms are addressed by ``molAtomMapNumber - 1``. For each pair the
    vector holds: slot 0 = "no bond", slots ``1 .. bond_fdim`` = bond
    features, slot -4 = different molecules, slot -3 = same molecule,
    slot -2 = single-molecule input, slot -1 = multi-molecule input.
    Self-pairs and pairs involving padding atoms stay all-zero.

    Returns an array of shape ``(max_natoms, max_natoms, binary_fdim)``.
    '''
    fragments = r.split('.')
    # atom id -> index of the '.'-separated fragment it belongs to
    comp = {}
    for frag_id, frag_smiles in enumerate(fragments):
        for atom in Chem.MolFromSmiles(frag_smiles).GetAtoms():
            comp[atom.GetIntProp('molAtomMapNumber') - 1] = frag_id
    n_comp = len(fragments)

    rmol = Chem.MolFromSmiles(r)
    n_atoms = rmol.GetNumAtoms()
    # Bonds are registered under both atom orderings.
    bond_map = {}
    for bond in rmol.GetBonds():
        begin = bond.GetBeginAtom().GetIntProp('molAtomMapNumber') - 1
        end = bond.GetEndAtom().GetIntProp('molAtomMapNumber') - 1
        bond_map[(begin, end)] = bond_map[(end, begin)] = bond

    features = []
    for i in range(max_natoms):
        for j in range(max_natoms):
            f = np.zeros((binary_fdim, ))
            if i < n_atoms and j < n_atoms and i != j:
                if (i, j) in bond_map:
                    f[1:1 + bond_fdim] = bond_features(bond_map[(i, j)])
                else:
                    f[0] = 1.0
                f[-4] = 1.0 if comp[i] != comp[j] else 0.0
                f[-3] = 1.0 if comp[i] == comp[j] else 0.0
                f[-2] = 1.0 if n_comp == 1 else 0.0
                f[-1] = 1.0 if n_comp > 1 else 0.0
            features.append(f)
    return np.vstack(features).reshape((max_natoms, max_natoms, binary_fdim))
Beispiel #8
0
 def sanitize_smiles(smi, largest_fragment=False):
     """Best-effort standardisation of a SMILES string.

     Runs the ``standardizer`` steps in order: ``standardize`` (functional
     group representations), optionally ``largest_fragment`` (drop product
     counterions/salts/etc.) and ``uncharge`` (neutralise, e.g., carboxylic
     acids). If a step raises, the remaining steps are skipped and the
     result of the last successful step is serialised.

     Args:
         smi: input SMILES.
         largest_fragment: keep only the largest fragment when True.

     Returns:
         str: the sanitised SMILES, or ``smi`` unchanged if it cannot be
         parsed.
     """
     mol = Chem.MolFromSmiles(smi)
     if mol is None:
         return smi

     steps = ['standardize']
     if largest_fragment:
         steps.append('largest_fragment')
     steps.append('uncharge')

     try:
         for step in steps:
             # Look each step up lazily so a failure keeps earlier results.
             mol = getattr(standardizer, step)(mol)
     except Exception:
         pass
     return Chem.MolToSmiles(mol)
Beispiel #9
0
    def _test_sas(self):
        """Smoke-run a ``RandomExplorer`` that uses the SA score as objective.

        Prints the score of ethane, runs the explorer for 10 steps on a small
        seed pool, then prints the elapsed time, the final pool and the
        synthesis path of the best molecule. Nothing is asserted.
        """
        def sas_func(mol):
            return calculateSAScore(Chem.MolFromSmiles(mol.smiles))

        print(sas_func(Molecule("CC")))

        seed_smiles = ["CC", "O=C=O", "C#N", "CCN(CC)CC", "CC(=O)O", "C1CCCCC1", "c1ccccc1"]
        test_pool = [Molecule(smiles) for smiles in seed_smiles]
        exp = RandomExplorer(sas_func, initial_pool=test_pool)

        print("Starting SA score optimization")
        t0 = time()
        exp.run(10)

        # Report timing and the outcome of the run.
        print("Completed SA score optimization, time elapsed: %.3fs" % (time()-t0))
        print(exp.pool)
        best = exp.get_best(1)[0]
        print(best.get_synthesis_path())
Beispiel #10
0
def get_all_batch(re_list):
    """Build features and labels for a batch of (reactants, edits) pairs.

    All entries are padded to the largest atom count in the batch.

    Args:
        re_list: iterable of ``(reactant_smiles, edits)`` pairs.

    Returns:
        tuple: ``(features, labels, sp_labels)`` -- the stacked
        ``get_bin_feature`` outputs, the stacked dense labels from
        ``get_bond_label`` and the list of its sparse label lists.
    """
    mol_list = []
    atom_counts = []
    for r, e in re_list:
        atom_counts.append(Chem.MolFromSmiles(r).GetNumAtoms())
        mol_list.append((r, e))
    max_natoms = max(atom_counts, default=0)

    features, labels, sp_labels = [], [], []
    for r, e in mol_list:
        dense, sparse = get_bond_label(r, e, max_natoms)
        features.append(get_bin_feature(r, max_natoms))
        labels.append(dense)
        sp_labels.append(sparse)
    return np.array(features), np.array(labels), sp_labels
Beispiel #11
0
    def to_rdkit(self):
        """
        Return the molecule in RDKit format, which is used for
        computation of molecular properties and for synthesis.
        The RDKit object is built from ``self.smiles`` on first use
        and cached on ``self.rdk``.

        Returns:
            rdkit.Mol -- molecule in RDKit format

        Raises:
            ValueError -- if SMILES cannot be decoded
                        into a chemically valid molecule.
        """
        mol = self.rdk
        if mol is None:
            mol = self.rdk = Chem.MolFromSmiles(self.smiles)
            if mol is None:
                raise ValueError(f"Molecule {self.smiles} is not valid.")
        return mol
Beispiel #12
0
    def predict(self,
                react,
                top_cand_bonds,
                top_cand_scores=None,
                scores=True,
                top_n=100):
        '''Rank candidate products for an atom-mapped reactant SMILES.

        Args:
            react: atom mapped reactant smiles
            top_cand_bonds: list of strings "ai-aj-bo" (1-based atom map
                numbers and a bond order)
            top_cand_scores: optional scores aligned with top_cand_bonds;
                when empty or omitted every candidate bond gets 0.0
            scores: include 'score' and 'prob' in each outcome when True
            top_n: maximum number of outcomes to return

        Returns:
            list of dicts with keys 'rank' (1-based) and 'smiles' (the value
            returned by edit_mol), plus 'score' and 'prob' when scores is
            True, in the order of the candidates returned by the session.
        '''
        if not top_cand_scores:
            top_cand_scores = [0.0 for _ in top_cand_bonds]

        cand_bonds = []
        for i, b in enumerate(top_cand_bonds):
            x, y, t = b.split('-')
            # Atom ids become zero-based; the bond order stays a float.
            cand_bonds.append((int(float(x)) - 1, int(float(y)) - 1, float(t),
                               float(top_cand_scores[i])))

        # Shrink the core size until the number of enumerated candidates fits.
        # ncore must start from core_size and be the value actually passed
        # on; otherwise the decrement hits an unbound local and could never
        # change the result.
        ncore = core_size
        while True:
            src_tuple, conf = smiles2graph(react,
                                           None,
                                           cand_bonds,
                                           None,
                                           core_size=ncore,
                                           cutoff=MAX_NCAND,
                                           testing=True)
            if len(conf) <= MAX_NCAND:
                break
            ncore -= 1

        feed_map = dict(zip(self.src_holder, src_tuple))
        cur_scores, cur_probs, candidates = self.session.run(
            self.predict_vars, feed_dict=feed_map)

        bond_types = [
            Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
            Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC
        ]
        bond_types_as_double = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 1.5: 4}

        # Don't waste predictions on bond changes that aren't actually changes
        rmol = Chem.MolFromSmiles(react)
        rbonds = {}  # (low map num, high map num) -> 1-based index in bond_types
        for bond in rmol.GetBonds():
            a1 = bond.GetBeginAtom().GetAtomMapNum()
            a2 = bond.GetEndAtom().GetAtomMapNum()
            rbonds[(min(a1, a2), max(a1, a2))] = \
                bond_types.index(bond.GetBondType()) + 1

        cand_smiles = []
        cand_scores = []
        cand_probs = []
        for idx in candidates:
            cbonds = []
            # Define edits from prediction
            for x, y, t, v in conf[idx]:
                x, y = x + 1, y + 1  # back to atom map numbers
                # NOTE(review): rbonds holds 1-based bond_types indices
                # (aromatic == 4) while t is compared to them directly; if t
                # is a float bond order (1.5 for aromatic) an unchanged
                # aromatic bond still counts as a change -- confirm intended.
                if ((x, y) not in rbonds and t > 0) or (
                        (x, y) in rbonds and rbonds[(x, y)] != t):
                    cbonds.append((x, y, bond_types_as_double[t]))
            cand_smiles.append(edit_mol(rmol, cbonds))
            cand_scores.append(cur_scores[idx])
            cand_probs.append(cur_probs[idx])

        outcomes = []
        for i in range(min(len(cand_smiles), top_n)):
            outcome = {
                'rank': i + 1,
                'smiles': cand_smiles[i],
            }
            if scores:
                outcome['score'] = cand_scores[i]
                outcome['prob'] = cand_probs[i]
            outcomes.append(outcome)

        return outcomes
Beispiel #13
0
    restore_path = tf.train.latest_checkpoint(opts.model_path)
saver.restore(session, restore_path)
sys.stderr.write('restored')
sys.stderr.flush()

total = 0.0
idxfunc = lambda x: x.GetIntProp('molAtomMapNumber')
try:
    while not coord.should_stop():
        total += 1
        r, conf = queue.get(timeout=30)
        if r is None:  # reached end of data set
            break
        cur_pred = session.run(pred_topk)

        rmol = Chem.MolFromSmiles(r)
        rbonds = {}
        for bond in rmol.GetBonds():
            a1 = idxfunc(bond.GetBeginAtom())
            a2 = idxfunc(bond.GetEndAtom())
            t = bond_types.index(bond.GetBondType()) + 1
            a1, a2 = min(a1, a2), max(a1, a2)
            rbonds[(a1, a2)] = t

        if opts.verbose:
            for idx in cur_pred:
                # record the bond changes for this candidate
                for x, y, t, v in conf[idx]:
                    # convert ids to atom map numbers
                    x, y = x + 1, y + 1
                    # make sure this bond change is really a _change_
Beispiel #14
0
def edit_mol(rmol, edits):
    """Apply predicted bond edits to a reactant molecule and return products.

    Works on an editable copy of ``rmol``: removes/adds the bonds named in
    ``edits``, patches explicit hydrogens and formal charges with a set of
    hand-written heuristics, then round-trips the result through SMILES and
    runs the ``clean_rxns_postsani`` reactions on each fragment.

    Args:
        rmol: RDKit molecule whose atoms all carry an integer
            ``molAtomMapNumber`` property.
        edits: iterable of ``(x, y, t)``; ``x`` and ``y`` are atom map
            numbers and ``t`` is the new bond, used as a key/index into
            ``BOND_TYPE``. Any existing bond between the two atoms is
            removed first; a new bond is added only when ``t > 0``.

    Returns:
        list of str: one SMILES per ``'.'``-separated fragment of the edited
        molecule that could be parsed back; unparseable fragments are
        dropped (and logged at debug level).
    """
    new_mol = Chem.RWMol(rmol)

    # Keep track of aromatic nitrogens, might cause explicit hydrogen issues
    aromatic_nitrogen_idx = set()
    # aromatic C (with some double bond) index -> neighbouring aromatic [nH] index
    aromatic_carbonyl_adj_to_aromatic_nH = {}
    # aromatic C (with three bonds) index -> neighbouring aromatic n (no explicit H) index
    aromatic_carbondeg3_adj_to_aromatic_nH0 = {}
    for a in new_mol.GetAtoms():
        if a.GetIsAromatic() and a.GetSymbol() == 'N':
            aromatic_nitrogen_idx.add(a.GetIdx())
            for nbr in a.GetNeighbors():
                if a.GetNumExplicitHs() == 1 and nbr.GetSymbol(
                ) == 'C' and nbr.GetIsAromatic() and any(
                        b.GetBondTypeAsDouble() == 2 for b in nbr.GetBonds()):
                    aromatic_carbonyl_adj_to_aromatic_nH[
                        nbr.GetIdx()] = a.GetIdx()
                elif a.GetNumExplicitHs() == 0 and nbr.GetSymbol(
                ) == 'C' and nbr.GetIsAromatic() and len(nbr.GetBonds()) == 3:
                    aromatic_carbondeg3_adj_to_aromatic_nH0[
                        nbr.GetIdx()] = a.GetIdx()
        else:
            # Every atom that is not an aromatic nitrogen loses its explicit Hs
            a.SetNumExplicitHs(0)
    new_mol.UpdatePropertyCache()

    # atom map number -> atom index
    amap = {}
    for atom in rmol.GetAtoms():
        amap[atom.GetIntProp('molAtomMapNumber')] = atom.GetIdx()

    # Apply the edits as predicted
    for x, y, t in edits:
        bond = new_mol.GetBondBetweenAtoms(amap[x], amap[y])
        a1 = new_mol.GetAtomWithIdx(amap[x])
        a2 = new_mol.GetAtomWithIdx(amap[y])
        if bond is not None:
            new_mol.RemoveBond(amap[x], amap[y])

            # Are we losing a bond on an aromatic nitrogen?
            if bond.GetBondTypeAsDouble() == 1.0:
                if amap[x] in aromatic_nitrogen_idx:
                    if a1.GetTotalNumHs() == 0:
                        a1.SetNumExplicitHs(1)
                    elif a1.GetFormalCharge() == 1:
                        a1.SetFormalCharge(0)
                elif amap[y] in aromatic_nitrogen_idx:
                    if a2.GetTotalNumHs() == 0:
                        a2.SetNumExplicitHs(1)
                    elif a2.GetFormalCharge() == 1:
                        a2.SetFormalCharge(0)

            # Are we losing a c=O bond on an aromatic ring? If so, remove H from adjacent nH if appropriate
            if bond.GetBondTypeAsDouble() == 2.0:
                if amap[x] in aromatic_carbonyl_adj_to_aromatic_nH:
                    new_mol.GetAtomWithIdx(
                        aromatic_carbonyl_adj_to_aromatic_nH[
                            amap[x]]).SetNumExplicitHs(0)
                elif amap[y] in aromatic_carbonyl_adj_to_aromatic_nH:
                    new_mol.GetAtomWithIdx(
                        aromatic_carbonyl_adj_to_aromatic_nH[
                            amap[y]]).SetNumExplicitHs(0)

        # t <= 0 means "bond removed"; otherwise add the replacement bond
        if t > 0:
            new_mol.AddBond(amap[x], amap[y], BOND_TYPE[t])

            # Special alkylation case?
            if t == 1:
                if amap[x] in aromatic_nitrogen_idx:
                    if a1.GetTotalNumHs() == 1:
                        a1.SetNumExplicitHs(0)
                    else:
                        a1.SetFormalCharge(1)
                elif amap[y] in aromatic_nitrogen_idx:
                    if a2.GetTotalNumHs() == 1:
                        a2.SetNumExplicitHs(0)
                    else:
                        a2.SetFormalCharge(1)

            # Are we getting a c=O bond on an aromatic ring? If so, add H to adjacent nH0 if appropriate
            if t == 2:
                if amap[x] in aromatic_carbondeg3_adj_to_aromatic_nH0:
                    new_mol.GetAtomWithIdx(
                        aromatic_carbondeg3_adj_to_aromatic_nH0[
                            amap[x]]).SetNumExplicitHs(1)
                elif amap[y] in aromatic_carbondeg3_adj_to_aromatic_nH0:
                    new_mol.GetAtomWithIdx(
                        aromatic_carbondeg3_adj_to_aromatic_nH0[
                            amap[y]]).SetNumExplicitHs(1)

    # Tried:
    # bonds_to_remove.sort(key=lambda x: x[0], reverse=True)
    # for (idx, bond) in bonds_to_remove:
    #     start = bond.GetBeginAtomIdx()
    #     end = bond.GetEndAtomIdx()
    #     new_mol.RemoveBond(start, end)
    # pred_mol = new_mol.GetMol()

    pred_mol = new_mol.GetMol()

    # Clear formal charges to make molecules valid
    # Note: because S and P (among others) can change valence, be more flexible
    # Each branch sums the bond orders around the atom and resets the formal
    # charge (and sometimes the explicit H count) when that sum matches the
    # element-specific value checked below. Atom map numbers are dropped.
    for atom in pred_mol.GetAtoms():
        atom.ClearProp('molAtomMapNumber')
        if atom.GetSymbol() == 'N' and atom.GetFormalCharge(
        ) == 1:  # exclude negatively-charged azide
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals <= 3:
                atom.SetFormalCharge(0)
        elif atom.GetSymbol() == 'N' and atom.GetFormalCharge(
        ) == -1:  # handle negatively-charged azide addition
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals == 3 and any(
                [nbr.GetSymbol() == 'N' for nbr in atom.GetNeighbors()]):
                atom.SetFormalCharge(0)
        elif atom.GetSymbol() == 'N':
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals == 4 and not atom.GetIsAromatic(
            ):  # and atom.IsInRingSize(5)):
                atom.SetFormalCharge(1)
        elif atom.GetSymbol() == 'C' and atom.GetFormalCharge() != 0:
            atom.SetFormalCharge(0)
        elif atom.GetSymbol() == 'O' and atom.GetFormalCharge() != 0:
            bond_vals = sum(
                [bond.GetBondTypeAsDouble()
                 for bond in atom.GetBonds()]) + atom.GetNumExplicitHs()
            if bond_vals == 2:
                atom.SetFormalCharge(0)
        elif atom.GetSymbol() in ['Cl', 'Br', 'I', 'F'
                                  ] and atom.GetFormalCharge() != 0:
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals == 1:
                atom.SetFormalCharge(0)
        elif atom.GetSymbol() == 'S' and atom.GetFormalCharge() != 0:
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals in [2, 4, 6]:
                atom.SetFormalCharge(0)
        elif atom.GetSymbol(
        ) == 'P':  # quaternary phosphorus should be pos. charge with 0 H
            bond_vals = [
                bond.GetBondTypeAsDouble() for bond in atom.GetBonds()
            ]
            if sum(bond_vals) == 4 and len(bond_vals) == 4:
                atom.SetFormalCharge(1)
                atom.SetNumExplicitHs(0)
            elif sum(bond_vals) == 3 and len(
                    bond_vals) == 3:  # make sure neutral
                atom.SetFormalCharge(0)
        elif atom.GetSymbol(
        ) == 'B':  # quaternary boron should be neg. charge with 0 H
            bond_vals = [
                bond.GetBondTypeAsDouble() for bond in atom.GetBonds()
            ]
            if sum(bond_vals) == 4 and len(bond_vals) == 4:
                atom.SetFormalCharge(-1)
                atom.SetNumExplicitHs(0)
        elif atom.GetSymbol() in ['Mg', 'Zn']:
            bond_vals = [
                bond.GetBondTypeAsDouble() for bond in atom.GetBonds()
            ]
            if sum(bond_vals) == 1 and len(bond_vals) == 1:
                atom.SetFormalCharge(1)
        elif atom.GetSymbol() == 'Si':
            bond_vals = [
                bond.GetBondTypeAsDouble() for bond in atom.GetBonds()
            ]
            # Only single bonds: top up with explicit Hs to four substituents
            if sum(bond_vals) == len(bond_vals):
                atom.SetNumExplicitHs(max(0, 4 - len(bond_vals)))

    # Bounce to/from SMILES to try to sanitize
    pred_smiles = Chem.MolToSmiles(pred_mol)  # <--- TODO: error occurs here
    pred_list = pred_smiles.split('.')
    # Entries are None for fragments that fail to parse
    pred_mols = [Chem.MolFromSmiles(pred_smiles) for pred_smiles in pred_list]

    for i, mol in enumerate(pred_mols):
        # Check if we failed/succeeded in previous step
        if mol is None:
            logging.debug('##### Unparseable mol: {}'.format(pred_list[i]))
            continue

        # Else, try post-sanitization fixes in structure
        mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
        if mol is None:
            continue
        # Each clean-up reaction is run on the same `mol`; when one matches,
        # its first product overwrites pred_mols[i] (so the last matching
        # reaction wins).
        for rxn in clean_rxns_postsani:
            out = rxn.RunReactants((mol, ))
            if out:
                try:
                    Chem.SanitizeMol(out[0][0])
                    pred_mols[i] = Chem.MolFromSmiles(
                        Chem.MolToSmiles(out[0][0]))
                except Exception as e:
                    print(e)
                    print('Could not sanitize postsani reaction product: {}'.
                          format(Chem.MolToSmiles(out[0][0])))
                    print('Original molecule was: {}'.format(
                        Chem.MolToSmiles(mol)))
    # Skip fragments that never parsed (or whose clean-up re-parse gave None)
    pred_smiles = [
        Chem.MolToSmiles(pred_mol) for pred_mol in pred_mols
        if pred_mol is not None
    ]

    return pred_smiles
def smiles2graph(rsmiles,
                 psmiles,
                 core_bonds,
                 gold_bonds,
                 cutoff=500,
                 idxfunc=lambda x: x.GetIntProp('molAtomMapNumber') - 1,
                 core_size=20,
                 kmax=5,
                 return_found=False,
                 testing=False):
    '''This is the function that takes reactants, a true product (when defined), and the candidate bonds
    to generate all of the candidate products according to some bounds on the enumeration'''
    mol = Chem.MolFromSmiles(rsmiles)
    if not mol:
        raise ValueError("Could not parse smiles string:", rsmiles)

    if not testing:
        pmol = Chem.MolFromSmiles(psmiles)
        if not pmol:
            raise ValueError("Could not parse smiles string:", psmiles)

    n_atoms = mol.GetNumAtoms()
    n_bonds = max(mol.GetNumBonds(), 1)
    fatoms = np.zeros((n_atoms, atom_fdim))
    fbonds = np.zeros((n_bonds, bond_fdim))
    atom_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    bond_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    num_nbs = np.zeros((n_atoms, ), dtype=np.int32)
    raw_atom_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    raw_bond_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    raw_num_nbs = np.zeros((n_atoms, ), dtype=np.int32)
    free_vals = np.zeros((n_atoms, ))
    pfree_vals = np.zeros((n_atoms, ))

    is_c2_of_pyridine = np.zeros((n_atoms, ), dtype=bool)
    is_c = np.zeros((n_atoms, ), dtype=bool)
    is_p = np.zeros((n_atoms, ), dtype=bool)
    is_s = np.zeros((n_atoms, ), dtype=bool)
    is_o = np.zeros((n_atoms, ), dtype=bool)
    is_n = np.zeros((n_atoms, ), dtype=bool)

    # gbonds = {(x,y):0 for x,y in core_bonds}

    #Feature Extraction
    for atom in mol.GetAtoms():
        idx = idxfunc(atom)
        fatoms[idx] = atom_features(atom)
        free_vals[idx] += atom.GetTotalNumHs() + abs(atom.GetFormalCharge())

        # TODO: review these rules
        # Aromatic carbon next to an aromatic nitrogen can get a carbonyl b/c stupid bookkeeping of hydroxypyridines
        if atom.GetAtomicNum() == 6:
            is_c[idx] = True
            if atom.GetIsAromatic():
                for nbr in atom.GetNeighbors():
                    if nbr.GetAtomicNum() == 7 and nbr.GetDegree() == 2:
                        is_c2_of_pyridine[idx] = True
                        break
        #  Nitrogen should be allowed to become positively charged
        elif atom.GetAtomicNum() == 7:
            free_vals[idx] += 1 - atom.GetFormalCharge()
            is_n[idx] = True
        # Phosphorous can form a phosphonium
        elif atom.GetAtomicNum() == 15:
            free_vals[idx] += 1 - atom.GetFormalCharge()
            is_p[idx] = True
        elif atom.GetAtomicNum() == 8:
            is_o[idx] = True
        elif atom.GetAtomicNum() == 16:
            is_s[idx] = True

        # special information needed for valence filtering

    if not testing:
        tatoms = set()
        #Calculate free slots for each atom in product
        for bond in pmol.GetBonds():
            a1 = idxfunc(bond.GetBeginAtom())
            a2 = idxfunc(bond.GetEndAtom())
            t = bond_types.index(bond.GetBondType()) + 1
            a1, a2 = min(a1, a2), max(a1, a2)
            tatoms.add(a1)
            tatoms.add(a2)
            if (a1, a2) in core_bonds:
                # gbonds[(a1,a2)] = t
                tval = t if t < 4 else 1.5
                pfree_vals[a1] += tval
                pfree_vals[a2] += tval

    rbonds = {}
    rbond_vals = {}  # bond orders
    ring_bonds = set()
    #Calculate free slots for each atom in reactant
    for bond in mol.GetBonds():
        idx = bond.GetIdx()
        a1 = idxfunc(bond.GetBeginAtom())
        a2 = idxfunc(bond.GetEndAtom())
        t = bond_types.index(bond.GetBondType())
        a1, a2 = min(a1, a2), max(a1, a2)
        tval = t + 1 if t < 3 else 1.5
        rbonds[(a1, a2)] = t + 1
        rbond_vals[(a1, a2)] = tval
        if (a1, a2) in core_bonds:
            free_vals[a1] += tval
            free_vals[a2] += tval
        if bond.IsInRing():
            ring_bonds.add((a1, a2))

    # Get all possible core configurations - NEW IN DIRECT VERSION
    from itertools import combinations
    core_configs = [
    ]  # will be list of lists of (x, y, t, v) tuples, where t is the bond order and v is CoreFinder score

    # print('rbond_vals:')
    # print(rbond_vals)

    # Filter out core bonds that exactly match reactants
    prev_len = len(core_bonds)
    core_bonds = [(x, y, t, v) for (x, y, t, v) in core_bonds
                  if ((x, y) not in rbond_vals) or (rbond_vals[(x, y)] != t)]
    # print('{}/{} core bonds kept after filtering existing bonds'.format(prev_len, len(core_bonds)))

    # Pare down to top-core_size only
    core_bonds = core_bonds[:core_size]

    # Helper function to check if a combination is connected - this helps the number of valid combinations
    core_bonds_adj = np.eye(len(core_bonds), dtype=bool)
    for i in range(len(core_bonds)):
        a1, b1, t1, v1 = core_bonds[i]
        for j in range(i, len(core_bonds)):
            a2, b2, t2, v2 = core_bonds[j]
            if a1 == a2 or a1 == b2 or b1 == a2 or b1 == b2:
                core_bonds_adj[i, j] = core_bonds_adj[j, i] = True
    # print(core_bonds)
    # print('Calculated core bonds adj matrix: {}'.format(core_bonds_adj * 1.0))

    def check_if_connected(combo_i):
        '''Checks if a set of candidate edits (by indeces) are all connected'''
        if len(combo_i) == 1:
            return True  # only one change, always connected
        temp_adj_pow = np.linalg.matrix_power(
            core_bonds_adj[combo_i, :][:, combo_i],
            len(combo_i) - 1)
        return np.all(temp_adj_pow)

    # Helper function to check if a combiation is valid
    def check_if_valid(bond_change_combo):
        force_even_parity = np.zeros((n_atoms, ), dtype=bool)
        force_odd_parity = np.zeros((n_atoms, ), dtype=bool)
        seen = defaultdict(lambda: False)
        free_vals_temp = free_vals.copy()
        for x, y, t, v in bond_change_combo:
            x, y = tuple(sorted([x, y]))
            if seen[(x, y)]:
                # print('already seen this bond in the list of cand changes')
                return False  # can't have two distinct bond change types in same combo
            seen[(x, y)] = True

            # TODO: review these valence rules
            # Special rules:
            #  - if phosphorous or sulfur, don't count formation of =O toward valence but require odd/even
            #  - if c2 carbon in a pyridine ring, let it get a =O
            tx = ty = t
            if t == 2:
                if is_o[x]:
                    if is_c2_of_pyridine[y]:
                        ty = 1.  # pretend it's just a hydroxylation for the sake of valence
                    elif is_p[y]:
                        ty = 0.  # don't count toward valence
                        force_odd_parity[
                            y] = True  # but require odd valence parity
                    elif is_s[y]:
                        ty = 0.
                        force_even_parity[y] = True

                elif is_o[y]:
                    if is_c2_of_pyridine[x]:
                        tx = 1.
                    elif is_p[x]:
                        tx = 0.
                        force_odd_parity[x] = True
                    elif is_s[x]:
                        tx = 0.
                        force_even_parity[x] = True

                elif is_n[x] and is_p[y]:
                    ty = 0.
                    force_odd_parity[y] = True
                elif is_n[y] and is_p[x]:
                    tx = 0.
                    force_odd_parity[x] = True

                elif is_p[x] and is_c[y]:
                    tx = 0.
                    force_odd_parity[x] = True
                elif is_p[y] and is_c[x]:
                    ty = 0.
                    force_odd_parity[y] = True

            if (x, y) in rbond_vals:
                free_vals_temp[x] += rbond_vals[(x, y)] - tx
                free_vals_temp[y] += rbond_vals[(x, y)] - ty
            else:
                free_vals_temp[x] += -tx
                free_vals_temp[y] += -ty

        # too many connections? sulfur valence not even? phosphorous valence not odd?
        if any(free_vals_temp < 0) \
                or any(aval % 2 != 0 for aval in free_vals_temp[force_even_parity]) \
                or any(aval % 2 != 1 for aval in free_vals_temp[force_odd_parity]):
            # print('invalid valence?')
            # print(free_vals_temp)
            return False
        return True

    # N choose k combinatorics
    # up to 4 bond changes at once - only 0.19% of train examples have 5 bonds changed, we can take the hit...
    core_bonds_i = range(len(core_bonds))
    for k in range(1, kmax + 1):
        for bond_change_combo_i in combinations(core_bonds_i, k):
            # Check if connected
            if not check_if_connected(bond_change_combo_i):
                # print('This combination is not connected!')
                continue

            bond_change_combo = [core_bonds[i] for i in bond_change_combo_i]

            if check_if_valid(bond_change_combo):
                core_configs.append(bond_change_combo)
    # print('Found a total of {} core configs that seem valid'.format(len(core_configs)))

    if not testing:
        random.shuffle(core_configs)
        idx = -1
        for i, cand_bonds in enumerate(core_configs):
            if set([(x, y, t) for (x, y, t, v) in cand_bonds]) == gold_bonds:
                idx = i
                break

        # If we are training and did not find the true outcome, make sure it is the first entry
        if idx == -1:
            # print('Did not find true outcome')
            found_true = False
            core_configs = [[(x, y, t, 0.0)
                             for (x, y, t) in gold_bonds]] + core_configs
        else:
            # print('Found true outcome')
            found_true = True
            core_configs[0], core_configs[idx] = core_configs[
                idx], core_configs[0]  # swap order so true is first
    else:
        found_true = False

    if not testing:
        # If it is possible to recover the true smiles from the set of bonds using the edit_mol method,
        # remove duplicates from the list by converting each candidate into a smiles string
        # note: get_product_smiles is HIGHLY imperfect, but that's not a huge deal. training tries to pick the
        #       right bonds. The evaluation script has a more robust function to get product_smiles
        smiles0 = get_product_smiles(mol, core_configs[0], tatoms)
        if len(smiles0) > 0:  #
            cand_smiles = set([smiles0])
            new_core_configs = [core_configs[0]]

            for core_conf in core_configs[1:]:
                smiles = get_product_smiles(mol, core_conf, tatoms)
                # print('candidate smiles: {}'.format(smiles))
                if smiles in cand_smiles or len(smiles) == 0:
                    continue
                cand_smiles.add(smiles)
                new_core_configs.append(core_conf)
            core_configs = new_core_configs

        else:
            print('\nwarning! could not recover true smiles from gbonds: {}'.
                  format(psmiles))
            print('{}    {}'.format(rsmiles, gold_bonds))

    # print('After removing duplicates, {} core configs'.format(len(core_configs)))

    core_configs = core_configs[:cutoff]

    n_batch = len(core_configs) + 1
    if not testing:
        labels = np.zeros((n_batch - 1, ))
        labels[0] = 1

    # Calculate information that is the same for all candidates; do small updates based on specific changes later
    pending_reactant_neighbors = [
    ]  # reactant neighbors that *might* be over-ridden
    core_bonds_noScore = [(x, y, t) for (x, y, t, z) in core_bonds]
    for bond in mol.GetBonds():
        idx = bond.GetIdx()
        a1 = idxfunc(bond.GetBeginAtom())
        a2 = idxfunc(bond.GetEndAtom())
        a1, a2 = min(a1, a2), max(a1, a2)

        if (
                a1, a2, 0.0
        ) not in core_bonds_noScore:  # are a1 and a2 guaranteed to be neighbors?
            raw_atom_nb[a1, raw_num_nbs[a1]] = a2
            raw_atom_nb[a2, raw_num_nbs[a2]] = a1
            raw_bond_nb[a1, raw_num_nbs[a1]] = idx
            raw_bond_nb[a2, raw_num_nbs[a2]] = idx
            raw_num_nbs[a1] += 1
            raw_num_nbs[a2] += 1
        else:
            pending_reactant_neighbors.append(
                (a1, a2, bond.GetBondTypeAsDouble()))

        # Reactants have this bond...
        atom_nb[a1, num_nbs[a1]] = a2
        atom_nb[a2, num_nbs[a2]] = a1
        bond_nb[a1, num_nbs[a1]] = idx
        bond_nb[a2, num_nbs[a2]] = idx
        num_nbs[a1] += 1
        num_nbs[a2] += 1
        fbonds[idx] = bond_features(bond)

    # print('What is core_bonds here?: {}'.format(core_bonds))
    if not testing:
        num_newbonds = max(
            len(gold_bonds), len(core_bonds)
        ) * 2 + 1  # CC fixed in case where core_bonds isn't large enough
    else:
        num_newbonds = len(core_bonds) * 2 + 1
    new_fbonds = np.zeros(
        (n_bonds + num_newbonds + len(pending_reactant_neighbors),
         bond_fdim))  # CC added + len(pending_reactant_neighbors)
    new_fbonds[:n_bonds, :] = fbonds
    fbonds = new_fbonds
    batch_fbonds, batch_anb, batch_bnb, batch_nbs = [fbonds], [atom_nb], [
        bond_nb
    ], [num_nbs]  # first entry is reactants
    batch_corebias = []
    for core_bonds in core_configs:
        atom_nb2 = np.copy(raw_atom_nb)
        bond_nb2 = np.copy(raw_bond_nb)
        num_nbs2 = np.copy(raw_num_nbs)
        fbonds2 = np.copy(fbonds)
        n_bonds2 = n_bonds + 1

        # Add back reactant bonds?
        core_bonds_nobo = [(x, y) for (x, y, t, v) in core_bonds]
        for (x, y, t) in pending_reactant_neighbors:
            if (x, y) not in core_bonds_nobo:
                core_bonds.append((x, y, t, 0.0))

        for x, y, t, v in core_bonds:  # add new bond features to the "default" reactant ones
            if t == 0: continue

            atom_nb2[x, num_nbs2[x]] = y
            atom_nb2[y, num_nbs2[y]] = x
            bond_nb2[x, num_nbs2[x]] = n_bonds2
            bond_nb2[y, num_nbs2[y]] = n_bonds2
            num_nbs2[x] += 1
            num_nbs2[y] += 1
            fbonds2[n_bonds2] = onek_encoding_unk(t, [1.0, 2.0, 3.0, 1.5, -1])
            if (x, y) in ring_bonds:
                fbonds2[n_bonds2][4] = 1
            n_bonds2 += 1

        batch_fbonds.append(fbonds2)
        batch_anb.append(atom_nb2)
        batch_bnb.append(bond_nb2)
        batch_nbs.append(num_nbs2)
        batch_corebias.append(sum([v for (x, y, t, v) in core_bonds]))

    # TODO: change atom features for each candidate? Maybe update degree at least

    if return_found:
        return (np.array([fatoms] * n_batch), np.array(batch_fbonds),
                packnb(batch_anb), packnb(batch_bnb), np.array(batch_nbs),
                np.array(batch_corebias), labels), core_configs, found_true
    if not testing:
        return (np.array([fatoms] * n_batch), np.array(batch_fbonds),
                packnb(batch_anb), packnb(batch_bnb), np.array(batch_nbs),
                np.array(batch_corebias), labels), core_configs
    return (np.array([fatoms] * n_batch), np.array(batch_fbonds),
            packnb(batch_anb), packnb(batch_bnb), np.array(batch_nbs),
            np.array(batch_corebias)), core_configs
Beispiel #16
0
    a = np.zeros((len(arr_list), N))
    for i, arr in enumerate(arr_list):
        for j in range(arr.shape[0]):
            a[i][j] = 1
    return a


def smiles2graph_list(smiles_list, idxfunc=lambda x: x.GetIdx()):
    """Featurize several SMILES strings and pack the results into one batch.

    Every entry of ``smiles_list`` is handed to ``smiles2graph`` together
    with ``idxfunc``. The five per-molecule outputs (unpacked here as
    fatom, fbond, gatom, gbond and nb) are regrouped field by field and
    each group is passed to its packing helper.

    Returns:
        A 6-tuple, in order: ``pack2D`` of the fatom group, ``pack2D`` of
        the fbond group, ``pack2D_withidx`` of the gatom group,
        ``pack2D_withidx`` of the gbond group, ``pack1D`` of the nb group,
        and ``get_mask`` of the fatom group.
    """
    per_molecule = [smiles2graph(smiles, idxfunc) for smiles in smiles_list]

    # Transpose: one tuple per output field instead of one tuple per molecule.
    fatom_list, fbond_list, gatom_list, gbond_list, nb_list = zip(*per_molecule)

    packed_fatoms = pack2D(fatom_list)
    packed_fbonds = pack2D(fbond_list)
    packed_gatoms = pack2D_withidx(gatom_list)
    packed_gbonds = pack2D_withidx(gbond_list)
    packed_nbs = pack1D(nb_list)
    atom_mask = get_mask(fatom_list)

    return (packed_fatoms, packed_fbonds, packed_gatoms, packed_gbonds,
            packed_nbs, atom_mask)


# Import-time setup: build a small two-carbon molecule ('CC') and featurize
# its first atom and first bond once, so the lengths of the atom and bond
# feature vectors are available as module-level constants.
m = Chem.MolFromSmiles('CC')
# NOTE(review): assignProperties is defined elsewhere; presumably it attaches
# properties that atom_features/bond_features read -- confirm.
assignProperties(m)
atom = m.GetAtoms()[0]
bond = m.GetBonds()[0]
# Length of the vector returned by atom_features for a single atom.
atom_fdim = len(atom_features(atom))
# Length of the vector returned by bond_features for a single bond; used
# elsewhere in this file as the column count of bond feature arrays.
bond_fdim = len(bond_features(bond))

if __name__ == "__main__":
    # Manual smoke test: featurize two small aromatic heterocycles and dump
    # every array returned by smiles2graph_list.
    import sys

    # Print arrays in full instead of summarizing them with "...".
    # The threshold must be numeric: the old threshold='nan' string is
    # rejected by current NumPy, which recommends sys.maxsize instead.
    np.set_printoptions(threshold=sys.maxsize)

    packed_outputs = smiles2graph_list(["c1cccnc1", 'c1nccc2n1ccc2'])
    for packed in packed_outputs:
        print(packed)