Exemple #1
0
class MAP4Calculator:
    """Build atom-pair shingle fingerprints for RDKit molecules.

    For every atom, one environment SMILES is generated per radius in
    ``1..radius``. Every unordered atom pair then contributes one shingle per
    radius of the form ``"<env>|<distance>|<env>"`` (environments sorted).
    The de-duplicated shingles are either MinHashed (``tm.Minhash``) or hashed
    and folded into a fixed-length vector (``MHFPEncoder``).
    """

    def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
        """
        MAP4 calculator class

        Arguments:
            dimensions -- size handed to the encoder; also the length of the
                          folded fingerprint when ``is_folded`` is True
            radius -- largest environment radius (radii 1..radius are used)
            is_counted -- if True, repeated shingles get an occurrence
                          counter suffix instead of collapsing into one
            is_folded -- if True, use MHFPEncoder hashing + folding instead
                         of tm.Minhash
        """
        # Kept so that _fold can request a vector of exactly this length.
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded

        if self.is_folded:
            self.encoder = MHFPEncoder(dimensions)
        else:
            self.encoder = tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mol -- rdkit mol object

        Returns:
            tmap VectorUint -- minhashed fingerprint, or the result of
            ``encoder.fold`` when ``is_folded`` is True
        """
        atom_env_pairs = self._calculate(mol)
        if self.is_folded:
            return self._fold(atom_env_pairs)
        return self.encoder.from_string_array(atom_env_pairs)

    def calculate_many(self, mols):
        """ Calculates the atom pair minhashed fingerprint

        Arguments:
            mols -- list of mols

        Returns:
            list of tmap VectorUint -- minhashed fingerprints list, or a list
            of folded fingerprints when ``is_folded`` is True
        """
        atom_env_pairs_list = [self._calculate(mol) for mol in mols]
        if self.is_folded:
            return [self._fold(pairs) for pairs in atom_env_pairs_list]
        return self.encoder.batch_from_string_array(atom_env_pairs_list)

    def _calculate(self, mol):
        """Return the de-duplicated, utf-8 encoded shingles of ``mol``."""
        return self._all_pairs(mol, self._get_atom_envs(mol))

    def _fold(self, pairs):
        """Hash the shingles and fold them to ``self.dimensions`` entries.

        The length is passed explicitly: without it the encoder falls back to
        its own default length and the ``dimensions`` constructor argument
        would be silently ignored for folded fingerprints.
        """
        fp_hash = self.encoder.hash(set(pairs))
        return self.encoder.fold(fp_hash, self.dimensions)

    def _get_atom_envs(self, mol):
        """Map each atom index to its environment SMILES for radii 1..radius.

        The list for an atom is ordered by increasing radius, so position
        ``i`` holds the environment of radius ``i + 1``.
        """
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            # Dispatch through self so a subclass override of _find_env is used.
            atoms_env[idx] = [
                self._find_env(mol, idx, radius)
                for radius in range(1, self.radius + 1)
            ]
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of ``radius`` around atom ``idx``.

        The SMILES is canonical, non-isomeric and rooted at the central atom.
        An empty string is returned when the central atom is not part of the
        extracted sub-molecule (``idx`` missing from the atom map).
        """
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}

        # atom_map is filled by PathToSubmol: original index -> submol index.
        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx not in atom_map:
            return ''
        return Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx], canonical=True, isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """Build the unique shingles for every unordered atom pair.

        Arguments:
            mol -- rdkit mol object
            atoms_env -- dict as produced by ``_get_atom_envs``

        Returns:
            list of bytes -- utf-8 encoded shingles without duplicates
            (order is not defined, it comes from a set)
        """
        distance_matrix = GetDistanceMatrix(mol)
        num_atoms = mol.GetNumAtoms()
        shingle_counts = defaultdict(int)
        unique_shingles = set()

        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
            dist = str(int(distance_matrix[idx1][idx2]))

            for i in range(self.radius):
                # Sorting makes the shingle independent of the atom order.
                first, second = sorted([atoms_env[idx1][i], atoms_env[idx2][i]])
                shingle = '{}|{}|{}'.format(first, dist, second)

                if self.is_counted:
                    # The counter is keyed on the bare shingle; the suffix
                    # turns the n-th repeat into its own distinct shingle.
                    shingle_counts[shingle] += 1
                    shingle += '|' + str(shingle_counts[shingle])

                unique_shingles.add(shingle.encode('utf-8'))
        return list(unique_shingles)
Exemple #2
0
class Map4Fingerprint:
    """Calculates the atom pair minhashed fingerprint for a given molecular object.
    Fingerprint is as described by `DOI: 10.1186/1758-2946-5-26` and implemented in the
    [corresponding repository](https://github.com/reymond-group/map4).

    NOTE(review): the DOI above may not be the MAP4 publication (believed to be
    10.1186/s13321-020-00445-4) -- confirm before relying on it.
    """
    def __init__(self,
                 dimensions=1024,
                 radius=2,
                 is_counted=False,
                 is_folded=False,
                 return_strings=False):
        """
        Parameters
        ----------
        dimensions : int
            (default = 1024)
            Number of entries in the output map4 fingerprint.

        radius : int
            (default = 2)
            Number of bonds away from atom centre to consider.

        is_counted : bool
            (default = False)
            If True then every repeat of a shingle gets an occurrence counter
            suffix, so repeats stay distinct instead of collapsing into one.

        is_folded : bool
            (default = False)
            If True then shingles are hashed and folded with `MHFPEncoder`
            instead of being minhashed with `tm.Minhash`.
            Takes precedence over `return_strings`.

        return_strings : bool
            (default = False)
            If True then returns substructure strings rather than hashed fingerprint.
        """
        self.dimensions = int(dimensions)
        self.radius = int(radius)
        self.is_counted = bool(is_counted)
        self.is_folded = bool(is_folded)
        self.return_strings = bool(return_strings)

        # Build the encoder from the coerced value so that it always agrees
        # with the length later passed to `fold` and never sees a float/str.
        if self.is_folded:
            self.encoder = MHFPEncoder(self.dimensions)
        else:
            self.encoder = tm.Minhash(self.dimensions)

    def __call__(self, mol):
        """Calculates the atom pair minhashed fingerprint for a given molecular object.

        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            `rdkit` mol object.

        Returns
        -------
        fp_arr : np.ndarray
            `np.asarray` of one of the following, checked in this order:
            the folded fingerprint (`is_folded`), the utf-8 encoded shingle
            strings (`return_strings`), or the encoder's minhash of the
            shingles.
        """
        atom_envs = self._get_atom_envs(mol)
        atom_env_pairs = self._all_pairs(mol, atom_envs)
        if self.is_folded:
            fp_arr = self._fold(atom_env_pairs)
        elif self.return_strings:
            fp_arr = atom_env_pairs
        else:
            fp_arr = self.encoder.from_string_array(atom_env_pairs)
        return np.asarray(fp_arr)

    def _fold(self, pairs):
        """Hash the shingles and fold them to `self.dimensions` entries."""
        fp_hash = self.encoder.hash(set(pairs))
        return self.encoder.fold(fp_hash, self.dimensions)

    def _get_atom_envs(self, mol):
        """Map each atom index to its environment SMILES for radii 1..radius.

        The list for an atom is ordered by increasing radius, so position
        `i` holds the environment of radius `i + 1`.
        """
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            # Dispatch through self so a subclass override of _find_env is used.
            atoms_env[idx] = [
                self._find_env(mol, idx, radius)
                for radius in range(1, self.radius + 1)
            ]
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of `radius` around atom `idx`.

        The SMILES is canonical, non-isomeric and rooted at the central atom.
        An empty string is returned when the central atom is not part of the
        extracted sub-molecule (`idx` missing from the atom map).
        """
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}

        # atom_map is filled by PathToSubmol: original index -> submol index.
        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx not in atom_map:
            return ''
        return Chem.MolToSmiles(submol,
                                rootedAtAtom=atom_map[idx],
                                canonical=True,
                                isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """Build the unique shingles for every unordered atom pair.

        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            `rdkit` mol object.

        atoms_env : dict
            Mapping as produced by `_get_atom_envs`.

        Returns
        -------
        shingles : list of bytes
            utf-8 encoded `"<env>|<distance>|<env>"` shingles without
            duplicates. Order is not defined (it comes from a set).
        """
        distance_matrix = GetDistanceMatrix(mol)
        num_atoms = mol.GetNumAtoms()
        shingle_counts = defaultdict(int)
        unique_shingles = set()

        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
            dist = str(int(distance_matrix[idx1][idx2]))

            for i in range(self.radius):
                # Sorting makes the shingle independent of the atom order.
                first, second = sorted([atoms_env[idx1][i], atoms_env[idx2][i]])
                shingle = '{}|{}|{}'.format(first, dist, second)

                if self.is_counted:
                    # The counter is keyed on the bare shingle; the suffix
                    # turns the n-th repeat into its own distinct shingle.
                    shingle_counts[shingle] += 1
                    shingle += '|' + str(shingle_counts[shingle])

                unique_shingles.add(shingle.encode('utf-8'))
        return list(unique_shingles)