Esempio n. 1
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """Featurize a single molecule into a 2D image.

        The molecule is laid out with RDKit 2D coordinates. Every bond is drawn
        as a line of pixels in channel 0 and every atom writes its properties
        into all channels of the pixel at its position.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit Mol object
        **kwargs
            Only the deprecated ``mol`` keyword is recognised. When present it
            replaces ``datapoint`` and a ``DeprecationWarning`` is emitted.

        Returns
        -------
        np.ndarray
            A 3D image array. The shape is `(img_size, img_size, 1)` when
            `img_spec` is "std" and `(img_size, img_size, 4)` otherwise.
            An empty array is returned if the SMILES string is longer than
            `max_len`, or (non-"std" mode only) if any Gasteiger partial
            charge is NaN.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        IndexError
            If any bond or atom pixel falls outside the image.
        """
        import warnings

        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError as err:
            raise ImportError(
                "This class requires RDKit to be installed.") from err
        if 'mol' in kwargs:
            datapoint = kwargs["mol"]
            # Warn instead of raising: raising made the assignment above dead
            # code and turned a deprecation notice into a hard failure.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2,
            )

        smile = Chem.MolToSmiles(datapoint)
        if len(smile) > self.max_len:
            return np.array([])

        # Work on a copy so charges and coordinates are not stored on the input.
        cmol = Chem.Mol(datapoint.ToBinary())
        cmol.ComputeGasteigerCharges()
        AllChem.Compute2DCoords(cmol)
        atom_coords = cmol.GetConformer(0).GetPositions()

        if self.img_spec == "std":
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 1))
            # Bond rows are [value, begin atom index, end atom index]; in this
            # mode every bond is drawn with the constant value 2.0.
            bond_rows = [[2.0, bond.GetBeginAtomIdx(),
                          bond.GetEndAtomIdx()]
                         for bond in datapoint.GetBonds()]
            # Compute atom properties
            atom_props = np.array([[atom.GetAtomicNum()]
                                   for atom in cmol.GetAtoms()])
            atom_props = atom_props.astype(np.float32)

        else:
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 4))
            # Bond rows are [bond order, begin atom index, end atom index].
            bond_rows = [[
                bond.GetBondTypeAsDouble(),
                bond.GetBeginAtomIdx(),
                bond.GetEndAtomIdx()
            ] for bond in datapoint.GetBonds()]
            # Compute atom properties
            atom_props = np.array([[
                atom.GetAtomicNum(),
                atom.GetProp("_GasteigerCharge"),
                atom.GetExplicitValence(),
                atom.GetHybridization().real,
            ] for atom in cmol.GetAtoms()])
            atom_props = atom_props.astype(np.float32)

            partial_charges = atom_props[:, 1]
            if np.any(np.isnan(partial_charges)):
                return np.array([])

        # reshape keeps the array 2-D when the molecule has no bonds, so the
        # column slices below stay valid instead of raising IndexError.
        bond_props = np.array(bond_rows, dtype=np.float32).reshape(-1, 3)

        frac = np.linspace(0, 1, int(1 / self.res * 2))
        # Reshape done for proper broadcast
        frac = frac.reshape(-1, 1, 1)

        bond_begin_idxs = bond_props[:, 1].astype(int)
        bond_end_idxs = bond_props[:, 2].astype(int)

        # Reshapes, and axes manipulations to facilitate vector processing.
        begin_coords = atom_coords[bond_begin_idxs]
        begin_coords = np.expand_dims(begin_coords.T, axis=0)
        end_coords = atom_coords[bond_end_idxs]
        end_coords = np.expand_dims(end_coords.T, axis=0)

        # Draw a line between the two atoms.
        # The coordinates of this line, are indicated in line_coords
        line_coords = frac * begin_coords + (1 - frac) * end_coords
        # Turn the line coordinates into image positions
        bond_line_idxs = np.ceil(
            (line_coords[:, 0] + self.embed) / self.res).astype(int)
        bond_line_idys = np.ceil(
            (line_coords[:, 1] + self.embed) / self.res).astype(int)
        # Turn atomic coordinates into image positions
        atom_idxs = np.round(
            (atom_coords[:, 0] + self.embed) / self.res).astype(int)
        atom_idys = np.round(
            (atom_coords[:, 1] + self.embed) / self.res).astype(int)

        # Validate explicitly: NumPy would only raise for indices past the
        # upper edge, while negative indices silently wrap around and paint
        # pixels on the opposite side of the image.
        pixel_idxs = np.concatenate([
            bond_line_idxs.ravel(),
            bond_line_idys.ravel(), atom_idxs, atom_idys
        ])
        if np.any((pixel_idxs < 0) | (pixel_idxs >= self.img_size)):
            # With fixed res and img_size some molecules (e.g. long chains) may not fit.
            raise IndexError(
                "The molecule does not fit into the image. Consider increasing img_size or res of the SmilesToImage featurizer."
            )

        # Set the bond line coordinates to the bond property used.
        img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]

        # Set the atom positions in image to different atomic properties in
        # channels. Written after the bonds, so atom pixels override bond
        # values in channel 0.
        img[atom_idxs, atom_idys, :] = atom_props
        return img
Esempio n. 2
0
    def _featurize(self, mol: RDKitMol) -> np.ndarray:
        """Featurize a single molecule into a 2D image.

        The molecule is laid out with RDKit 2D coordinates. Every bond is drawn
        as a line of pixels in channel 0 and every atom writes its properties
        into all channels of the pixel at its position.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object

        Returns
        -------
        np.ndarray
            A 3D image array. The shape is `(img_size, img_size, 1)` when
            `img_spec` is "std" and `(img_size, img_size, 4)` otherwise.
            An empty array is returned if the SMILES string is longer than
            `max_len`, or (non-"std" mode only) if any Gasteiger partial
            charge is NaN.

        Raises
        ------
        IndexError
            If any bond or atom pixel falls outside the image.
        """
        from rdkit import Chem
        from rdkit.Chem import AllChem

        smile = Chem.MolToSmiles(mol)
        if len(smile) > self.max_len:
            return np.array([])

        # Work on a copy so charges and coordinates are not stored on the input.
        cmol = Chem.Mol(mol.ToBinary())
        cmol.ComputeGasteigerCharges()
        AllChem.Compute2DCoords(cmol)
        atom_coords = cmol.GetConformer(0).GetPositions()

        if self.img_spec == "std":
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 1))
            # Bond rows are [value, begin atom index, end atom index]; in this
            # mode every bond is drawn with the constant value 2.0.
            bond_rows = [[2.0, bond.GetBeginAtomIdx(),
                          bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
            # Compute atom properties
            atom_props = np.array([[atom.GetAtomicNum()]
                                   for atom in cmol.GetAtoms()])
            atom_props = atom_props.astype(np.float32)

        else:
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 4))
            # Bond rows are [bond order, begin atom index, end atom index].
            bond_rows = [[
                bond.GetBondTypeAsDouble(),
                bond.GetBeginAtomIdx(),
                bond.GetEndAtomIdx()
            ] for bond in mol.GetBonds()]
            # Compute atom properties
            atom_props = np.array([[
                atom.GetAtomicNum(),
                atom.GetProp("_GasteigerCharge"),
                atom.GetExplicitValence(),
                atom.GetHybridization().real,
            ] for atom in cmol.GetAtoms()])
            atom_props = atom_props.astype(np.float32)

            partial_charges = atom_props[:, 1]
            if np.any(np.isnan(partial_charges)):
                return np.array([])

        # reshape keeps the array 2-D when the molecule has no bonds, so the
        # column slices below stay valid instead of raising IndexError.
        bond_props = np.array(bond_rows, dtype=np.float32).reshape(-1, 3)

        frac = np.linspace(0, 1, int(1 / self.res * 2))
        # Reshape done for proper broadcast
        frac = frac.reshape(-1, 1, 1)

        bond_begin_idxs = bond_props[:, 1].astype(int)
        bond_end_idxs = bond_props[:, 2].astype(int)

        # Reshapes, and axes manipulations to facilitate vector processing.
        begin_coords = atom_coords[bond_begin_idxs]
        begin_coords = np.expand_dims(begin_coords.T, axis=0)
        end_coords = atom_coords[bond_end_idxs]
        end_coords = np.expand_dims(end_coords.T, axis=0)

        # Draw a line between the two atoms.
        # The coordinates of this line, are indicated in line_coords
        line_coords = frac * begin_coords + (1 - frac) * end_coords
        # Turn the line coordinates into image positions
        bond_line_idxs = np.ceil(
            (line_coords[:, 0] + self.embed) / self.res).astype(int)
        bond_line_idys = np.ceil(
            (line_coords[:, 1] + self.embed) / self.res).astype(int)
        # Turn atomic coordinates into image positions
        atom_idxs = np.round(
            (atom_coords[:, 0] + self.embed) / self.res).astype(int)
        atom_idys = np.round(
            (atom_coords[:, 1] + self.embed) / self.res).astype(int)

        # Validate explicitly: NumPy would only raise for indices past the
        # upper edge, while negative indices silently wrap around and paint
        # pixels on the opposite side of the image.
        pixel_idxs = np.concatenate([
            bond_line_idxs.ravel(),
            bond_line_idys.ravel(), atom_idxs, atom_idys
        ])
        if np.any((pixel_idxs < 0) | (pixel_idxs >= self.img_size)):
            raise IndexError(
                "The molecule does not fit into the image. Consider increasing img_size or res of the SmilesToImage featurizer."
            )

        # Set the bond line coordinates to the bond property used.
        img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]

        # Set the atom positions in image to different atomic properties in
        # channels. Written after the bonds, so atom pixels override bond
        # values in channel 0.
        img[atom_idxs, atom_idys, :] = atom_props
        return img