Beispiel #1
0
    def test_atom_mapping_1(self):
        """
        Test that toRDKitMol returns correct indices and atom mappings.

        The conversion is exercised twice, first with ``removeHs=False`` and
        then with ``removeHs=True``. In both cases every relevant atom must be
        a key of the returned mapping, and every relevant bond must exist in
        the RDKit molecule with the same bond order as in the original
        molecule. With ``removeHs=True`` hydrogen atoms, and bonds to
        hydrogen atoms, are not checked.
        """
        bondOrderDict = {'SINGLE': 1, 'DOUBLE': 2, 'TRIPLE': 3, 'AROMATIC': 1.5}
        mol = Molecule().fromSMILES('C1CCC=C1C=O')

        def assert_mapping_matches(rdkitmol, rdAtomIndices, skipHydrogens):
            """Compare the atoms and bonds of `mol` against `rdkitmol` through `rdAtomIndices`."""
            for atom in mol.atoms:
                if skipHydrogens and atom.symbol == 'H':
                    continue
                # Check that the atom is found in the mapping
                self.assertIn(atom, rdAtomIndices)
                # Check that all bonds are in rdkitmol with correct mapping and order
                for connectedAtom, bond in atom.bonds.items():
                    if skipHydrogens and connectedAtom.symbol == 'H':
                        continue
                    rdBond = rdkitmol.GetBondBetweenAtoms(rdAtomIndices[atom], rdAtomIndices[connectedAtom])
                    # Fail with a clear assertion instead of an AttributeError
                    # on None when the bond is absent from the RDKit molecule
                    self.assertIsNotNone(rdBond)
                    rdkitBondOrder = bondOrderDict[str(rdBond.GetBondType())]
                    self.assertEqual(bond.order, rdkitBondOrder)

        # Test for removeHs = False
        rdkitmol, rdAtomIndices = toRDKitMol(mol, removeHs=False, returnMapping=True)
        assert_mapping_matches(rdkitmol, rdAtomIndices, skipHydrogens=False)

        # Test for removeHs = True
        rdkitmol2, rdAtomIndices2 = toRDKitMol(mol, removeHs=True, returnMapping=True)
        assert_mapping_matches(rdkitmol2, rdAtomIndices2, skipHydrogens=True)
Beispiel #2
0
def _rdkit_translator(input_object, identifier_type, mol=None):
    """
    Converts between formats using RDKit. If input is a :class:`Molecule`,
    the identifier_type is used to determine the output type. If the input is
    a `str`, then the identifier_type is used to identify the input, and the
    desired output is assumed to be a :class:`Molecule` object.

    Args:
        input_object: either molecule or string identifier
        identifier_type: format of string identifier
            'inchi'    -> InChI
            'inchikey' -> InChI Key
            'sma'      -> SMARTS
            'smi'      -> SMILES
        mol: molecule object for output (optional)
    """
    if identifier_type == 'inchi' and not Chem.inchi.INCHI_AVAILABLE:
        raise DependencyError("RDKit installed without InChI. Please reinstall to read and write InChI strings.")

    if isinstance(input_object, str):
        # We are converting from a string identifier to a molecule
        if identifier_type == 'inchi':
            rdkitmol = Chem.inchi.MolFromInchi(input_object, removeHs=False)
        elif identifier_type == 'sma':
            rdkitmol = Chem.MolFromSmarts(input_object)
        elif identifier_type == 'smi':
            rdkitmol = Chem.MolFromSmiles(input_object)
        else:
            raise ValueError('Identifier type {0} is not supported for reading using RDKit.'.format(identifier_type))
        if rdkitmol is None:
            raise ValueError("Could not interpret the identifier {0!r}".format(input_object))
        if mol is None:
            mol = mm.Molecule()
        output = fromRDKitMol(mol, rdkitmol)
    elif isinstance(input_object, mm.Molecule):
        # We are converting from a molecule to a string identifier
        if identifier_type == 'smi':
            rdkitmol = toRDKitMol(input_object, sanitize=False)
        else:
            rdkitmol = toRDKitMol(input_object, sanitize=True)
        if identifier_type == 'inchi':
            output = Chem.inchi.MolToInchi(rdkitmol, options='-SNon')
        elif identifier_type == 'inchikey':
            inchi = toInChI(input_object)
            output = Chem.inchi.InchiToInchiKey(inchi)
        elif identifier_type == 'sma':
            output = Chem.MolToSmarts(rdkitmol)
        elif identifier_type == 'smi':
            if input_object.isAromatic():
                output = Chem.MolToSmiles(rdkitmol)
            else:
                output = Chem.MolToSmiles(rdkitmol, kekuleSmiles=True)
        else:
            raise ValueError('Identifier type {0} is not supported for writing using RDKit.'.format(identifier_type))
    else:
        raise ValueError('Unexpected input format. Should be a Molecule or a string.')

    return output
def _rdkit_translator(input_object, identifier_type, mol=None):
    """
    Converts between formats using RDKit. If input is a :class:`Molecule`,
    the identifier_type is used to determine the output type. If the input is
    a `str`, then the identifier_type is used to identify the input, and the
    desired output is assumed to be a :class:`Molecule` object.

    Args:
        input_object: either molecule or string identifier
        identifier_type: format of string identifier
            'inchi'    -> InChI
            'inchikey' -> InChI Key
            'sma'      -> SMARTS
            'smi'      -> SMILES
        mol: molecule object for output (optional)
    """
    if identifier_type == 'inchi' and not Chem.inchi.INCHI_AVAILABLE:
        raise DependencyError("RDKit installed without InChI. Please reinstall to read and write InChI strings.")

    if isinstance(input_object, str):
        # We are converting from a string identifier to a molecule
        if identifier_type == 'inchi':
            rdkitmol = Chem.inchi.MolFromInchi(input_object, removeHs=False)
        elif identifier_type == 'sma':
            rdkitmol = Chem.MolFromSmarts(input_object)
        elif identifier_type == 'smi':
            rdkitmol = Chem.MolFromSmiles(input_object)
        else:
            raise ValueError('Identifier type {0} is not supported for reading using RDKit.'.format(identifier_type))
        if rdkitmol is None:
            raise ValueError("Could not interpret the identifier {0!r}".format(input_object))
        if mol is None:
            mol = mm.Molecule()
        output = fromRDKitMol(mol, rdkitmol)
    elif isinstance(input_object, mm.Molecule):
        # We are converting from a molecule to a string identifier
        if identifier_type == 'smi':
            rdkitmol = toRDKitMol(input_object, sanitize=False)
        else:
            rdkitmol = toRDKitMol(input_object, sanitize=True)
        if identifier_type == 'inchi':
            output = Chem.inchi.MolToInchi(rdkitmol, options='-SNon')
        elif identifier_type == 'inchikey':
            inchi = toInChI(input_object)
            output = Chem.inchi.InchiToInchiKey(inchi)
        elif identifier_type == 'sma':
            output = Chem.MolToSmarts(rdkitmol)
        elif identifier_type == 'smi':
            if input_object.isAromatic():
                output = Chem.MolToSmiles(rdkitmol)
            else:
                output = Chem.MolToSmiles(rdkitmol, kekuleSmiles=True)
        else:
            raise ValueError('Identifier type {0} is not supported for writing using RDKit.'.format(identifier_type))
    else:
        raise ValueError('Unexpected input format. Should be a Molecule or a string.')

    return output
Beispiel #4
0
    def test_atom_mapping_2(self):
        """Test that toRDKitMol returns correct indices and atom mappings when hydrogens are removed."""
        adjlist = """
1 H u0 p0 c0 {2,S}
2 C u0 p0 c0 {1,S} {3,S} {4,S} {5,S}
3 H u0 p0 c0 {2,S}
4 H u0 p0 c0 {2,S}
5 O u0 p2 c0 {2,S} {6,S}
6 H u0 p0 c0 {5,S}
        """

        mol = Molecule().fromAdjacencyList(adjlist)
        rdkitmol, rdAtomIndices = toRDKitMol(mol,
                                             removeHs=True,
                                             returnMapping=True)

        heavy_atoms = [at for at in mol.atoms if at.number != 1]
        for at1 in heavy_atoms:
            for at2 in heavy_atoms:
                if mol.hasBond(at1, at2):
                    try:
                        rdkitmol.GetBondBetweenAtoms(rdAtomIndices[at1],
                                                     rdAtomIndices[at2])
                    except RuntimeError:
                        self.fail(
                            "RDKit failed in finding the bond in the original atom!"
                        )
Beispiel #5
0
    def test_rdkit_round_trip(self):
        """Test conversion to and from RDKitMol"""
        for mol in self.test_mols:
            rdkit_mol = toRDKitMol(mol)
            new_mol = fromRDKitMol(Molecule(), rdkit_mol)

            self.assertTrue(mol.isIsomorphic(new_mol))
            self.assertEqual(mol.get_element_count(), new_mol.get_element_count())
Beispiel #6
0
    def test_lone_pair_retention(self):
        """Test that we don't lose any lone pairs on round trip RDKit conversion."""
        mol = Molecule().fromAdjacencyList("""
1 C u0 p0 c0 {2,D} {3,S} {4,S}
2 O u0 p2 c0 {1,D}
3 H u0 p0 c0 {1,S}
4 H u0 p0 c0 {1,S}
""")
        rdmol = toRDKitMol(mol)

        try:
            mol2 = fromRDKitMol(Molecule(), rdmol)
        except AtomTypeError as e:
            self.fail('Could not convert from RDKitMol: ' + e.message)
        else:
            self.assertTrue(mol.isIsomorphic(mol2))
Beispiel #7
0
def create_augmented_layers(mol):
    """
    Create the u-layer and p-layer strings for the given molecule.

    The indices in the string refer to the atom indices in the molecule, according to the atom order
    obtained by sorting the atoms using the InChI canonicalization algorithm.

    First a deep copy is created of the original molecule and hydrogen atoms are removed from the molecule.
    Next, the molecule is converted into an InChI string, and the auxiliary information of the inchification
    procedure is retrieved.

    The N-layer is parsed and used to sort the atoms of the original order according
    to the order in the InChI. In case, the molecule contains atoms that cannot be distinguished
    with the InChI algorithm ('equivalent atoms'), the position of the unpaired electrons is changed
    as to ensure the atoms with the lowest indices are used to compose the string.

    Returns:
        A tuple ``(ulayer, player)``:
        - ``(None, None)`` if the molecule has no radicals and no unexpected lone pairs
        - ``(U_LAYER_PREFIX + '1', None)`` if the formula of the molecule is 'H'
        - otherwise the results of `_create_U_layer` and `_create_P_layer`
          for the hydrogen-free, re-sorted copy of the molecule
    """

    if mol.getRadicalCount() == 0 and not _has_unexpected_lone_pairs(mol):
        return None, None
    if mol.getFormula() == 'H':
        return U_LAYER_PREFIX + '1', None

    molcopy = mol.copy(deep=True)

    # Collect the hydrogens into a real list before removing them: a lazy
    # filter() (Python 3) would be iterating molcopy.atoms while removeAtom()
    # presumably shrinks that same list, which skips atoms.
    hydrogens = [at for at in molcopy.atoms if at.number == 1]
    for h in hydrogens:
        molcopy.removeAtom(h)

    rdkitmol = toRDKitMol(molcopy)
    _, auxinfo = Chem.MolToInchiAndAuxInfo(
        rdkitmol, options='-SNon')  # suppress stereo warnings

    # extract the atom numbers from N-layer of auxiliary info:
    n_layer = _parse_N_layer(auxinfo)
    # position of each atom (1-based number in the N-layer) within the InChI order
    atom_indices = [n_layer.index(i + 1) for i in range(len(molcopy.atoms))]

    # sort the atoms based on the order of the atom indices
    molcopy.atoms = [
        atom for (position, atom) in sorted(zip(atom_indices, molcopy.atoms),
                                            key=lambda pair: pair[0])
    ]

    ulayer = _create_U_layer(molcopy, auxinfo)

    player = _create_P_layer(molcopy, auxinfo)

    return ulayer, player
    def toRDKitMol(self, removeHs=False, returnMapping=True):
        """
        Convert a molecular structure to a RDKit rdmol object.

        The fragment is first turned into its 'minimal' representative
        molecule with `get_representative_molecule`, and that molecule is
        converted with `converter.toRDKitMol` (sanitization enabled).

        As a side effect, `self.vertices` is reordered so that it follows the
        order of the atoms of the representative molecule after conversion.

        Args:
            removeHs: must be falsy. Cutting labels are replaced with
                hydrogens, so removing hydrogens is not supported.
            returnMapping: if True (default), also return the atom mapping.

        Returns:
            ``(rdmol, rdAtomIdx_frag)`` if `returnMapping` is true, where
            `rdAtomIdx_frag` maps each fragment vertex to its atom index in
            `rdmol`; otherwise only `rdmol`.

        Raises:
            ValueError: if `removeHs` is true.
        """
        if removeHs:
            # because we're replacing
            # cutting labels with hydrogens
            # so do not allow removeHs to be True
            raise ValueError("Currently fragment toRDKitMol only allows keeping all the hydrogens.")

        mol0, mapping = self.get_representative_molecule('minimal',
                                                         update=False)

        # The mapping is needed below regardless of what the caller asked for,
        # so it is always requested here. Forwarding returnMapping=False is
        # expected to make the converter return only the molecule, which
        # would break the tuple unpacking.
        rdmol, rdAtomIdx_mol0 = converter.toRDKitMol(
            mol0,
            removeHs=removeHs,
            returnMapping=True,
            sanitize=True)

        rdAtomIdx_frag = {
            frag_atom: rdAtomIdx_mol0[mol0_atom]
            for frag_atom, mol0_atom in mapping.items()
        }

        # sync the order of fragment vertices with the order
        # of mol0.atoms since mol0.atoms is changed/sorted in
        # converter.toRDKitMol().
        # Since the rdmol's atoms order is same as the order of mol0's atoms,
        # the synchronization between fragment.atoms order and mol0.atoms order
        # is necessary to make sure the order of fragment vertices
        # reflects the order of rdmol's atoms
        self.vertices = sorted(self.vertices,
                               key=lambda v: mol0.atoms.index(mapping[v]))

        if returnMapping:
            return rdmol, rdAtomIdx_frag
        return rdmol
def create_augmented_layers(mol):
    """
    Create the u-layer and p-layer strings for the given molecule.

    The indices in the string refer to the atom indices in the molecule, according to the atom order
    obtained by sorting the atoms using the InChI canonicalization algorithm.

    First a deep copy is created of the original molecule and hydrogen atoms are removed from the molecule.
    Next, the molecule is converted into an InChI string, and the auxiliary information of the inchification
    procedure is retrieved.

    The N-layer is parsed and used to sort the atoms of the original order according
    to the order in the InChI. In case, the molecule contains atoms that cannot be distinguished
    with the InChI algorithm ('equivalent atoms'), the position of the unpaired electrons is changed
    as to ensure the atoms with the lowest indices are used to compose the string.

    Returns:
        A tuple ``(ulayer, player)``:
        - ``(None, None)`` if the molecule has no radicals and no unexpected lone pairs
        - ``(U_LAYER_PREFIX + '1', None)`` if the formula of the molecule is 'H'
        - otherwise the results of `_create_U_layer` and `_create_P_layer`
          for the hydrogen-free, re-sorted copy of the molecule
    """

    if mol.getRadicalCount() == 0 and not _has_unexpected_lone_pairs(mol):
        return None, None
    if mol.getFormula() == 'H':
        return U_LAYER_PREFIX + '1', None

    molcopy = mol.copy(deep=True)

    # Collect the hydrogens into a real list before removing them: a lazy
    # filter() (Python 3) would be iterating molcopy.atoms while removeAtom()
    # presumably shrinks that same list, which skips atoms.
    hydrogens = [at for at in molcopy.atoms if at.number == 1]
    for h in hydrogens:
        molcopy.removeAtom(h)

    rdkitmol = toRDKitMol(molcopy)
    _, auxinfo = Chem.MolToInchiAndAuxInfo(rdkitmol, options='-SNon')  # suppress stereo warnings

    # extract the atom numbers from N-layer of auxiliary info:
    n_layer = _parse_N_layer(auxinfo)
    # position of each atom (1-based number in the N-layer) within the InChI order
    atom_indices = [n_layer.index(i + 1) for i in range(len(molcopy.atoms))]

    # sort the atoms based on the order of the atom indices
    molcopy.atoms = [atom for (position, atom) in sorted(zip(atom_indices, molcopy.atoms), key=lambda pair: pair[0])]

    ulayer = _create_U_layer(molcopy, auxinfo)

    player = _create_P_layer(molcopy, auxinfo)

    return ulayer, player