def mol_to_graph(mol: RDKitMol):
    """Convert an RDKit Mol into a NetworkX graph.

    Atoms become nodes (keyed by atom index) and bonds become undirected
    edges between the indices of their begin and end atoms. No atom or bond
    attributes are attached to the graph.

    Parameters
    ----------
    mol: RDKit Mol
        The molecule to convert into a graph.

    Returns
    -------
    graph: networkx.Graph
        Contains atom indices as nodes and bonds as edges.

    Raises
    ------
    ValueError
        If NetworkX is not installed.

    Note
    ----
    This function requires NetworkX to be installed.
    """
    try:
        import networkx as nx
    except ModuleNotFoundError as err:
        # ValueError (rather than ImportError) is kept so existing callers
        # that catch it keep working; the original cause is chained.
        raise ValueError(
            "This function requires NetworkX to be installed.") from err

    G = nx.Graph()
    # Register every atom first so atoms without bonds remain in the graph
    # as isolated nodes.
    G.add_nodes_from(range(mol.GetNumAtoms()))
    # Walk the bond sequence once instead of re-fetching it for every index.
    for bond in mol.GetBonds():
        G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
    return G
def _featurize(self, mol: RDKitMol) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Node features are built per atom, the edge index lists every bond in
    both directions, and edge features are only produced when
    ``self.use_edges`` is set.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object.

    Returns
    -------
    graph: GraphData
        A molecule graph with some features.

    Raises
    ------
    ImportError
        If partial charges have to be computed and RDKit is not installed.
    """
    if self.use_partial_charge:
        try:
            # Probe the first atom to see whether charges are already there.
            mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
        except Exception:
            # If partial charges were not computed.
            # `Exception` (not a bare except) so that KeyboardInterrupt and
            # SystemExit are never swallowed here.
            try:
                from rdkit.Chem import AllChem
            except ModuleNotFoundError as err:
                raise ImportError(
                    "This class requires RDKit to be installed.") from err
            AllChem.ComputeGasteigerCharges(mol)

    # construct atom (node) feature
    h_bond_infos = construct_hydrogen_bonding_info(mol)
    atom_features = np.asarray(
        [
            _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                    self.use_partial_charge)
            for atom in mol.GetAtoms()
        ],
        dtype=float,
    )

    # construct edge (bond) index and, optionally, edge (bond) features in a
    # single pass over the bonds
    src, dest, features = [], [], []
    for bond in mol.GetBonds():
        # add edge list considering a directed graph
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src += [start, end]
        dest += [end, start]
        if self.use_edges:
            # one feature row per direction, matching the edge index order
            features += 2 * [_construct_bond_feature(bond)]

    # default None when edge features are not requested
    bond_features = None
    if self.use_edges:
        bond_features = np.asarray(features, dtype=float)

    return GraphData(node_features=atom_features,
                     edge_index=np.asarray([src, dest], dtype=int),
                     edge_features=bond_features)
def _featurize(self, mol: RDKitMol) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Node features are built per atom, every bond is listed in both
    directions, and, when ``self.add_self_edges`` is set, one self-loop per
    atom is appended with an all-zero edge feature.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object.

    Returns
    -------
    graph: GraphData
        A molecule graph with some features.
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem

    # construct atom and bond features
    try:
        # Probe the first atom to see whether charges are already there.
        mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
    except Exception:
        # If partial charges were not computed.
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are never swallowed here.
        AllChem.ComputeGasteigerCharges(mol)

    h_bond_infos = construct_hydrogen_bonding_info(mol)
    sssr = Chem.GetSymmSSSR(mol)

    # construct atom (node) feature
    # NOTE: the builtin `float` / `int` replace the `np.float` / `np.int`
    # aliases, which no longer exist in current NumPy releases.
    atom_features = np.array(
        [
            _construct_atom_feature(atom, h_bond_infos, sssr)
            for atom in mol.GetAtoms()
        ],
        dtype=float,
    )

    # construct edge (bond) information
    src, dest, bond_features = [], [], []
    for bond in mol.GetBonds():
        # add edge list considering a directed graph
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src += [start, end]
        dest += [end, start]
        bond_features += 2 * [_construct_bond_feature(bond)]

    if self.add_self_edges:
        num_atoms = mol.GetNumAtoms()
        self_loops = list(range(num_atoms))
        src += self_loops
        dest += self_loops
        # add dummy edge features
        # NOTE: the feature width is read from the first real bond, so a
        # molecule without any bond raises IndexError here.
        bond_fea_length = len(bond_features[0])
        bond_features += num_atoms * [[0] * bond_fea_length]

    return GraphData(node_features=atom_features,
                     edge_index=np.array([src, dest], dtype=int),
                     edge_features=np.array(bond_features, dtype=float))
def _featurize(self, datapoint: RDKitMol,
               **kwargs) -> Optional[GraphMatrix]:
    """Calculate adjacency matrix and nodes features for RDKitMol.

    Only bond types (via ``self.bond_encoder``) and atomic numbers (via
    ``self.atom_encoder``) are encoded, so chirality and charges are not
    represented in the result.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit mol object.
    **kwargs
        The deprecated keyword ``mol`` is still accepted in place of
        ``datapoint`` and triggers a ``DeprecationWarning``.

    Returns
    -------
    graph: GraphMatrix
        A molecule graph with some features, or None when at least one atom
        has no positive entry in its adjacency row (e.g. an atom without
        bonds).

    Raises
    ------
    ImportError
        If RDKit is not installed.
    """
    import warnings

    try:
        from rdkit import Chem
    except ModuleNotFoundError as err:
        raise ImportError(
            "This method requires RDKit to be installed.") from err

    if 'mol' in kwargs:
        # Honour the deprecated keyword and warn. Raising here would make
        # the assignment dead and reject a keyword that is only "being
        # phased out".
        datapoint = kwargs.get("mol")
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2,
        )

    if self.kekulize:
        # NOTE: applied to the passed molecule itself, no copy is made.
        Chem.Kekulize(datapoint)

    # Square adjacency matrix padded to the maximum supported atom count.
    A = np.zeros(shape=(self.max_atom_count, self.max_atom_count),
                 dtype=np.float32)
    bonds = datapoint.GetBonds()

    begin = [b.GetBeginAtomIdx() for b in bonds]
    end = [b.GetEndAtomIdx() for b in bonds]
    bond_type = [self.bond_encoder[b.GetBondType()] for b in bonds]

    # Fill both triangles so the matrix is symmetric.
    A[begin, end] = bond_type
    A[end, begin] = bond_type

    num_atoms = datapoint.GetNumAtoms()
    # Row sums restricted to real atoms; padding rows are ignored.
    degree = np.sum(A[:num_atoms, :num_atoms], axis=-1)
    # Encoded atom types followed by zero padding up to max_atom_count.
    X = np.array(
        [
            self.atom_encoder[atom.GetAtomicNum()]
            for atom in datapoint.GetAtoms()
        ] + [0] * (self.max_atom_count - num_atoms),
        dtype=np.int32,
    )
    graph = GraphMatrix(A, X)

    return graph if (degree > 0).all() else None
def _featurize(self, mol: RDKitMol) -> Optional[GraphMatrix]:
    """Build the padded adjacency matrix and node vector of a molecule.

    Bond types are encoded through ``self.bond_encoder`` and atomic numbers
    through ``self.atom_encoder``; nothing else about the atoms or bonds is
    stored.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object.

    Returns
    -------
    graph: GraphMatrix
        A molecule graph with some features, or None when at least one atom
        has no positive entry in its adjacency row.
    """
    try:
        from rdkit import Chem
    except ModuleNotFoundError:
        raise ImportError("This method requires RDKit to be installed.")

    if self.kekulize:
        Chem.Kekulize(mol)

    size = self.max_atom_count
    adjacency = np.zeros(shape=(size, size), dtype=np.float32)

    # Gather bond endpoints first, then their encoded types.
    bond_seq = mol.GetBonds()
    rows, cols = [], []
    for bond in bond_seq:
        rows.append(bond.GetBeginAtomIdx())
        cols.append(bond.GetEndAtomIdx())
    codes = [self.bond_encoder[bond.GetBondType()] for bond in bond_seq]

    # Mirror every bond so the matrix is symmetric.
    adjacency[rows, cols] = codes
    adjacency[cols, rows] = codes

    n_atoms = mol.GetNumAtoms()
    # Per-atom row sums over the real (unpadded) part of the matrix.
    row_sums = adjacency[:n_atoms, :n_atoms].sum(axis=-1)

    encoded_atoms = [
        self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
    ]
    zero_padding = [0] * (size - n_atoms)
    node_codes = np.array(encoded_atoms + zero_padding, dtype=np.int32)

    graph = GraphMatrix(adjacency, node_codes)
    if (row_sums > 0).all():
        return graph
    return None
def _featurize(self, mol: RDKitMol) -> GraphMatrix:
    """Build the padded adjacency matrix and node vector of a molecule.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit mol object.

    Returns
    -------
    graph: GraphMatrix
        A molecule graph with some features. None is returned instead when
        at least one atom has no positive entry in its adjacency row.
    """
    if self.kekulize:
        Chem.Kekulize(mol)

    max_atoms = self.max_atom_count
    atom_total = mol.GetNumAtoms()

    # Endpoint index lists and the encoded type of every bond.
    all_bonds = mol.GetBonds()
    starts = [bond.GetBeginAtomIdx() for bond in all_bonds]
    stops = [bond.GetEndAtomIdx() for bond in all_bonds]
    encoded_bonds = [
        self.bond_encoder[bond.GetBondType()] for bond in all_bonds
    ]

    # Symmetric adjacency matrix, zero-padded up to max_atoms.
    adj = np.zeros(shape=(max_atoms, max_atoms), dtype=np.float32)
    adj[starts, stops] = encoded_bonds
    adj[stops, starts] = encoded_bonds

    # Only rows/columns of real atoms take part in the connectivity check.
    per_atom_sum = np.sum(adj[:atom_total, :atom_total], axis=-1)

    # Encoded atom types followed by zeros for the unused slots.
    node_values = [
        self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
    ]
    node_values = node_values + [0] * (max_atoms - atom_total)
    nodes = np.array(node_values, dtype=np.int32)

    result = GraphMatrix(adj, nodes)
    every_atom_bonded = (per_atom_sum > 0).all()
    return result if every_atom_bonded else None
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """Featurizes a single molecule into an image.

    2D coordinates are computed for a copy of the molecule; bonds are drawn
    as lines into channel 0 and atoms are written as pixels carrying their
    atomic properties.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit Mol object
    **kwargs
        The deprecated keyword ``mol`` is still accepted in place of
        ``datapoint`` and triggers a ``DeprecationWarning``.

    Returns
    -------
    np.ndarray
        A 3D array of image. The shape is `(img_size, img_size, 1)` when
        `img_spec` is "std" and `(img_size, img_size, 4)` otherwise.
        An empty array is returned if the SMILES is longer than `max_len`,
        or (non-"std" spec only) if any Gasteiger charge is NaN.

    Raises
    ------
    ImportError
        If RDKit is not installed.
    IndexError
        If the molecule does not fit into the image.
    """
    import warnings

    try:
        from rdkit import Chem
        from rdkit.Chem import AllChem
    except ModuleNotFoundError as err:
        raise ImportError(
            "This class requires RDKit to be installed.") from err

    if 'mol' in kwargs:
        # Honour the deprecated keyword and warn. Raising here would make
        # the assignment dead and reject a keyword that is only "being
        # phased out".
        datapoint = kwargs.get("mol")
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2,
        )

    smile = Chem.MolToSmiles(datapoint)
    if len(smile) > self.max_len:
        return np.array([])

    # Work on a copy so charges and coordinates are not written to the input.
    cmol = Chem.Mol(datapoint.ToBinary())
    cmol.ComputeGasteigerCharges()
    AllChem.Compute2DCoords(cmol)
    atom_coords = cmol.GetConformer(0).GetPositions()

    if self.img_spec == "std":
        # Setup image
        img = np.zeros((self.img_size, self.img_size, 1))
        # Compute bond properties: constant intensity for every bond
        bond_rows = [[2.0, bond.GetBeginAtomIdx(),
                      bond.GetEndAtomIdx()]
                     for bond in datapoint.GetBonds()]
        # Compute atom properties
        atom_props = np.array([[atom.GetAtomicNum()]
                               for atom in cmol.GetAtoms()])
        atom_props = atom_props.astype(np.float32)
    else:
        # Setup image
        img = np.zeros((self.img_size, self.img_size, 4))
        # Compute bond properties: bond order as intensity
        bond_rows = [[
            bond.GetBondTypeAsDouble(),
            bond.GetBeginAtomIdx(),
            bond.GetEndAtomIdx()
        ] for bond in datapoint.GetBonds()]
        # Compute atom properties
        atom_props = np.array([[
            atom.GetAtomicNum(),
            atom.GetProp("_GasteigerCharge"),
            atom.GetExplicitValence(),
            atom.GetHybridization().real,
        ] for atom in cmol.GetAtoms()])
        atom_props = atom_props.astype(np.float32)

        partial_charges = atom_props[:, 1]
        if np.any(np.isnan(partial_charges)):
            return np.array([])

    # Force an (n_bonds, 3) layout so that a molecule without bonds gives a
    # (0, 3) array instead of a 1-D one that breaks the column slicing below.
    bond_props = np.array(bond_rows, dtype=np.float32).reshape(-1, 3)

    frac = np.linspace(0, 1, int(1 / self.res * 2))
    # Reshape done for proper broadcast
    frac = frac.reshape(-1, 1, 1)

    bond_begin_idxs = bond_props[:, 1].astype(int)
    bond_end_idxs = bond_props[:, 2].astype(int)

    # Reshapes, and axes manipulations to facilitate vector processing.
    begin_coords = np.expand_dims(atom_coords[bond_begin_idxs].T, axis=0)
    end_coords = np.expand_dims(atom_coords[bond_end_idxs].T, axis=0)

    # Draw a line between the two atoms.
    # The coordinates of this line, are indicated in line_coords
    line_coords = frac * begin_coords + (1 - frac) * end_coords
    # Turn the line coordinates into image positions
    bond_line_idxs = np.ceil(
        (line_coords[:, 0] + self.embed) / self.res).astype(int)
    bond_line_idys = np.ceil(
        (line_coords[:, 1] + self.embed) / self.res).astype(int)

    # Turn atomic coordinates into image positions
    atom_idxs = np.round(
        (atom_coords[:, 0] + self.embed) / self.res).astype(int)
    atom_idys = np.round(
        (atom_coords[:, 1] + self.embed) / self.res).astype(int)

    # With fixed res and img_size some molecules (e.g. long chains) may not
    # fit. Check both bounds explicitly: NumPy would silently wrap negative
    # indices around to the opposite edge of the image.
    for pixels in (bond_line_idxs, bond_line_idys, atom_idxs, atom_idys):
        if pixels.size and (pixels.min() < 0 or
                            pixels.max() >= self.img_size):
            raise IndexError(
                "The molecule does not fit into the image. Consider increasing img_size or res of the SmilesToImage featurizer."
            )

    # Set the bond line coordinates to the bond property used.
    img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]
    # Set the atom positions in image to different atomic properties in channels
    img[atom_idxs, atom_idys, :] = atom_props

    return img
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Featurizes a single molecule into an image.

    2D coordinates are computed for a copy of the molecule; bonds are drawn
    as lines into channel 0 and atoms are written as pixels carrying their
    atomic properties.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        A 3D array of image. The shape is `(img_size, img_size, 1)` when
        `img_spec` is "std" and `(img_size, img_size, 4)` otherwise.
        An empty array is returned if the SMILES is longer than `max_len`,
        or (non-"std" spec only) if any Gasteiger charge is NaN.

    Raises
    ------
    IndexError
        If the molecule does not fit into the image.
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem

    smile = Chem.MolToSmiles(mol)
    if len(smile) > self.max_len:
        return np.array([])

    # Work on a copy so charges and coordinates are not written to the input.
    cmol = Chem.Mol(mol.ToBinary())
    cmol.ComputeGasteigerCharges()
    AllChem.Compute2DCoords(cmol)
    atom_coords = cmol.GetConformer(0).GetPositions()

    if self.img_spec == "std":
        # Setup image
        img = np.zeros((self.img_size, self.img_size, 1))
        # Compute bond properties: constant intensity for every bond
        bond_rows = [[2.0, bond.GetBeginAtomIdx(),
                      bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
        # Compute atom properties
        atom_props = np.array([[atom.GetAtomicNum()]
                               for atom in cmol.GetAtoms()])
        atom_props = atom_props.astype(np.float32)
    else:
        # Setup image
        img = np.zeros((self.img_size, self.img_size, 4))
        # Compute bond properties: bond order as intensity
        bond_rows = [[
            bond.GetBondTypeAsDouble(),
            bond.GetBeginAtomIdx(),
            bond.GetEndAtomIdx()
        ] for bond in mol.GetBonds()]
        # Compute atom properties
        atom_props = np.array([[
            atom.GetAtomicNum(),
            atom.GetProp("_GasteigerCharge"),
            atom.GetExplicitValence(),
            atom.GetHybridization().real,
        ] for atom in cmol.GetAtoms()])
        atom_props = atom_props.astype(np.float32)

        partial_charges = atom_props[:, 1]
        if np.any(np.isnan(partial_charges)):
            return np.array([])

    # Force an (n_bonds, 3) layout so that a molecule without bonds gives a
    # (0, 3) array instead of a 1-D one that breaks the column slicing below.
    bond_props = np.array(bond_rows, dtype=np.float32).reshape(-1, 3)

    frac = np.linspace(0, 1, int(1 / self.res * 2))
    # Reshape done for proper broadcast
    frac = frac.reshape(-1, 1, 1)

    bond_begin_idxs = bond_props[:, 1].astype(int)
    bond_end_idxs = bond_props[:, 2].astype(int)

    # Reshapes, and axes manipulations to facilitate vector processing.
    begin_coords = np.expand_dims(atom_coords[bond_begin_idxs].T, axis=0)
    end_coords = np.expand_dims(atom_coords[bond_end_idxs].T, axis=0)

    # Draw a line between the two atoms.
    # The coordinates of this line, are indicated in line_coords
    line_coords = frac * begin_coords + (1 - frac) * end_coords
    # Turn the line coordinates into image positions
    bond_line_idxs = np.ceil(
        (line_coords[:, 0] + self.embed) / self.res).astype(int)
    bond_line_idys = np.ceil(
        (line_coords[:, 1] + self.embed) / self.res).astype(int)

    # Turn atomic coordinates into image positions
    atom_idxs = np.round(
        (atom_coords[:, 0] + self.embed) / self.res).astype(int)
    atom_idys = np.round(
        (atom_coords[:, 1] + self.embed) / self.res).astype(int)

    # Check both bounds explicitly: NumPy would silently wrap negative
    # indices around to the opposite edge of the image instead of failing.
    for pixels in (bond_line_idxs, bond_line_idys, atom_idxs, atom_idys):
        if pixels.size and (pixels.min() < 0 or
                            pixels.max() >= self.img_size):
            raise IndexError(
                "The molecule does not fit into the image. "
                "Consider increasing img_size or res.")

    # Set the bond line coordinates to the bond property used.
    img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]
    # Set the atom positions in image to different atomic properties in channels
    img[atom_idxs, atom_idys, :] = atom_props

    return img
def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Node features are built per atom, the edge index lists every bond in
    both directions, and edge features are only produced when
    ``self.use_edges`` is set.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit mol object.
    **kwargs
        The deprecated keyword ``mol`` is still accepted in place of
        ``datapoint`` and triggers a ``DeprecationWarning``.

    Returns
    -------
    graph: GraphData
        A molecule graph with some features.

    Raises
    ------
    AssertionError
        If the molecule has fewer than two atoms.
    ImportError
        If partial charges have to be computed and RDKit is not installed.
    """
    import warnings

    if 'mol' in kwargs:
        # Honour the deprecated keyword and warn. Raising here would make
        # the assignment dead and reject a keyword that is only "being
        # phased out".
        datapoint = kwargs.get("mol")
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2,
        )

    # Validate the molecule that is actually featurized. An explicit raise
    # keeps the check active under `python -O`; AssertionError is kept so
    # existing callers see the same exception type.
    if datapoint.GetNumAtoms() <= 1:
        raise AssertionError(
            "More than one atom should be present in the molecule for this featurizer to work."
        )

    if self.use_partial_charge:
        try:
            # Probe the first atom to see whether charges are already there.
            datapoint.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
        except Exception:
            # If partial charges were not computed.
            # `Exception` (not a bare except) so that KeyboardInterrupt and
            # SystemExit are never swallowed here.
            try:
                from rdkit.Chem import AllChem
            except ModuleNotFoundError as err:
                raise ImportError(
                    "This class requires RDKit to be installed.") from err
            AllChem.ComputeGasteigerCharges(datapoint)

    # construct atom (node) feature
    h_bond_infos = construct_hydrogen_bonding_info(datapoint)
    atom_features = np.asarray(
        [
            _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                    self.use_partial_charge)
            for atom in datapoint.GetAtoms()
        ],
        dtype=float,
    )

    # construct edge (bond) index and, optionally, edge (bond) features in a
    # single pass over the bonds
    src, dest, features = [], [], []
    for bond in datapoint.GetBonds():
        # add edge list considering a directed graph
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src += [start, end]
        dest += [end, start]
        if self.use_edges:
            # one feature row per direction, matching the edge index order
            features += 2 * [_construct_bond_feature(bond)]

    # default None when edge features are not requested
    bond_features = None
    if self.use_edges:
        bond_features = np.asarray(features, dtype=float)

    return GraphData(node_features=atom_features,
                     edge_index=np.asarray([src, dest], dtype=int),
                     edge_features=bond_features)