def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """Featurize a single RDKit molecule into a 2D image.

    The molecule is laid out with RDKit 2D coordinates. Bonds are drawn as
    lines into channel 0 and atoms are written as per-channel property
    values at their pixel positions.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit Mol object
    **kwargs
        Only the deprecated ``mol`` keyword is recognised; when given it
        replaces ``datapoint`` and a ``DeprecationWarning`` is issued.

    Returns
    -------
    np.ndarray
        A 3D image array. The shape is `(img_size, img_size, 1)` when
        `img_spec` is "std" and `(img_size, img_size, 4)` otherwise.
        An empty array is returned if the SMILES string is longer than
        `max_len`, or (non-"std" spec only) if any Gasteiger partial
        charge is NaN.

    Raises
    ------
    ImportError
        If RDKit is not installed.
    IndexError
        If any atom or bond pixel falls outside the image.
    """
    import warnings

    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
    except ModuleNotFoundError as err:
        raise ImportError(
            "This class requires RDKit to be installed.") from err

    if 'mol' in kwargs:
        datapoint = kwargs.get("mol")
        # Warn instead of raising: raising here made the assignment above
        # dead code and the legacy keyword unusable.
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    smile = Chem.MolToSmiles(datapoint)
    if len(smile) > self.max_len:
        return np.array([])

    # Work on a copy so charge/coordinate computation does not touch the input.
    cmol = Chem.Mol(datapoint.ToBinary())
    cmol.ComputeGasteigerCharges()
    AllChem.Compute2DCoords(cmol)
    atom_coords = cmol.GetConformer(0).GetPositions()

    if self.img_spec == "std":
        # Setup image
        img = np.zeros((self.img_size, self.img_size, 1))
        # Compute bond properties: constant value 2.0 for every bond
        bond_props = np.array(
            [[2.0, bond.GetBeginAtomIdx(),
              bond.GetEndAtomIdx()] for bond in datapoint.GetBonds()])
        # Compute atom properties
        atom_props = np.array([[atom.GetAtomicNum()]
                               for atom in cmol.GetAtoms()])
        atom_props = atom_props.astype(np.float32)
    else:
        # Setup image
        img = np.zeros((self.img_size, self.img_size, 4))
        # Compute bond properties
        bond_props = np.array([[
            bond.GetBondTypeAsDouble(),
            bond.GetBeginAtomIdx(),
            bond.GetEndAtomIdx()
        ] for bond in datapoint.GetBonds()])
        # Compute atom properties
        atom_props = np.array([[
            atom.GetAtomicNum(),
            atom.GetProp("_GasteigerCharge"),
            atom.GetExplicitValence(),
            atom.GetHybridization().real,
        ] for atom in cmol.GetAtoms()])
        atom_props = atom_props.astype(np.float32)

        partial_charges = atom_props[:, 1]
        if np.any(np.isnan(partial_charges)):
            return np.array([])

    # Force a (n_bonds, 3) layout so a molecule without bonds yields a
    # (0, 3) array instead of a 1-D empty array that cannot be column-indexed.
    bond_props = bond_props.astype(np.float32).reshape(-1, 3)

    # Interpolation fractions along each bond, shaped (n_samples, 1, 1) so
    # they broadcast against the (1, n_dims, n_bonds) coordinate arrays.
    frac = np.linspace(0, 1, int(1 / self.res * 2))
    frac = frac.reshape(-1, 1, 1)

    bond_begin_idxs = bond_props[:, 1].astype(int)
    bond_end_idxs = bond_props[:, 2].astype(int)

    # Reshapes, and axes manipulations to facilitate vector processing.
    begin_coords = atom_coords[bond_begin_idxs]
    begin_coords = np.expand_dims(begin_coords.T, axis=0)
    end_coords = atom_coords[bond_end_idxs]
    end_coords = np.expand_dims(end_coords.T, axis=0)

    # Draw a line between the two atoms.
    # The coordinates of this line, are indicated in line_coords
    line_coords = frac * begin_coords + (1 - frac) * end_coords

    # Turn the line coordinates into image positions
    bond_line_idxs = np.ceil(
        (line_coords[:, 0] + self.embed) / self.res).astype(int)
    bond_line_idys = np.ceil(
        (line_coords[:, 1] + self.embed) / self.res).astype(int)

    # Turn atomic coordinates into image positions
    atom_idxs = np.round(
        (atom_coords[:, 0] + self.embed) / self.res).astype(int)
    atom_idys = np.round(
        (atom_coords[:, 1] + self.embed) / self.res).astype(int)

    # With fixed res and img_size some molecules (e.g. long chains) may not
    # fit. Check both edges explicitly: NumPy would silently wrap negative
    # indices around to the opposite side of the image.
    pixel_idxs = np.concatenate([
        bond_line_idxs.ravel(),
        bond_line_idys.ravel(), atom_idxs, atom_idys
    ])
    if np.any((pixel_idxs < 0) | (pixel_idxs >= self.img_size)):
        raise IndexError(
            "The molecule does not fit into the image. Consider increasing img_size or res of the SmilesToImage featurizer."
        )

    # Set the bond line coordinates to the bond property used.
    img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]
    # Set the atom positions in image to different atomic properties in channels
    img[atom_idxs, atom_idys, :] = atom_props

    return img
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Featurize a single RDKit molecule into a 2D image.

    The molecule is laid out with RDKit 2D coordinates. Bonds are drawn as
    lines into channel 0 and atoms are written as per-channel property
    values at their pixel positions.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        A 3D image array. The shape is `(img_size, img_size, 1)` when
        `img_spec` is "std" and `(img_size, img_size, 4)` otherwise.
        An empty array is returned if the SMILES string is longer than
        `max_len`, or (non-"std" spec only) if any Gasteiger partial
        charge is NaN.

    Raises
    ------
    ImportError
        If RDKit is not installed.
    IndexError
        If any atom or bond pixel falls outside the image.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
    except ModuleNotFoundError as err:
        raise ImportError(
            "This class requires RDKit to be installed.") from err

    smile = Chem.MolToSmiles(mol)
    if len(smile) > self.max_len:
        return np.array([])

    # Work on a copy so charge/coordinate computation does not touch the input.
    cmol = Chem.Mol(mol.ToBinary())
    cmol.ComputeGasteigerCharges()
    AllChem.Compute2DCoords(cmol)
    atom_coords = cmol.GetConformer(0).GetPositions()

    if self.img_spec == "std":
        # Setup image
        img = np.zeros((self.img_size, self.img_size, 1))
        # Compute bond properties: constant value 2.0 for every bond
        bond_props = np.array(
            [[2.0, bond.GetBeginAtomIdx(),
              bond.GetEndAtomIdx()] for bond in mol.GetBonds()])
        # Compute atom properties
        atom_props = np.array([[atom.GetAtomicNum()]
                               for atom in cmol.GetAtoms()])
        atom_props = atom_props.astype(np.float32)
    else:
        # Setup image
        img = np.zeros((self.img_size, self.img_size, 4))
        # Compute bond properties
        bond_props = np.array([[
            bond.GetBondTypeAsDouble(),
            bond.GetBeginAtomIdx(),
            bond.GetEndAtomIdx()
        ] for bond in mol.GetBonds()])
        # Compute atom properties
        atom_props = np.array([[
            atom.GetAtomicNum(),
            atom.GetProp("_GasteigerCharge"),
            atom.GetExplicitValence(),
            atom.GetHybridization().real,
        ] for atom in cmol.GetAtoms()])
        atom_props = atom_props.astype(np.float32)

        partial_charges = atom_props[:, 1]
        if np.any(np.isnan(partial_charges)):
            return np.array([])

    # Force a (n_bonds, 3) layout so a molecule without bonds yields a
    # (0, 3) array instead of a 1-D empty array that cannot be column-indexed.
    bond_props = bond_props.astype(np.float32).reshape(-1, 3)

    # Interpolation fractions along each bond, shaped (n_samples, 1, 1) so
    # they broadcast against the (1, n_dims, n_bonds) coordinate arrays.
    frac = np.linspace(0, 1, int(1 / self.res * 2))
    frac = frac.reshape(-1, 1, 1)

    bond_begin_idxs = bond_props[:, 1].astype(int)
    bond_end_idxs = bond_props[:, 2].astype(int)

    # Reshapes, and axes manipulations to facilitate vector processing.
    begin_coords = atom_coords[bond_begin_idxs]
    begin_coords = np.expand_dims(begin_coords.T, axis=0)
    end_coords = atom_coords[bond_end_idxs]
    end_coords = np.expand_dims(end_coords.T, axis=0)

    # Draw a line between the two atoms.
    # The coordinates of this line, are indicated in line_coords
    line_coords = frac * begin_coords + (1 - frac) * end_coords

    # Turn the line coordinates into image positions
    bond_line_idxs = np.ceil(
        (line_coords[:, 0] + self.embed) / self.res).astype(int)
    bond_line_idys = np.ceil(
        (line_coords[:, 1] + self.embed) / self.res).astype(int)

    # Turn atomic coordinates into image positions
    atom_idxs = np.round(
        (atom_coords[:, 0] + self.embed) / self.res).astype(int)
    atom_idys = np.round(
        (atom_coords[:, 1] + self.embed) / self.res).astype(int)

    # With fixed res and img_size some molecules (e.g. long chains) may not
    # fit. Check both edges explicitly: NumPy would silently wrap negative
    # indices around to the opposite side of the image.
    pixel_idxs = np.concatenate([
        bond_line_idxs.ravel(),
        bond_line_idys.ravel(), atom_idxs, atom_idys
    ])
    if np.any((pixel_idxs < 0) | (pixel_idxs >= self.img_size)):
        raise IndexError(
            "The molecule does not fit into the image. Consider increasing img_size or res of the SmilesToImage featurizer."
        )

    # Set the bond line coordinates to the bond property used.
    img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]
    # Set the atom positions in image to different atomic properties in channels
    img[atom_idxs, atom_idys, :] = atom_props

    return img