def atom_features(self, atom: RDKitAtom) -> np.ndarray:
    """Build the MAT-specific feature vector for a single atom.

    DeepChem already ships a general ``atom_features`` function, but MAT
    needs a few features it does not provide (neighbour count, ring
    membership) and only a fraction of the ones it does, so a dedicated,
    cheaper implementation is used here.

    Parameters
    ----------
    atom: RDKitAtom
        RDKit Atom object.

    Returns
    -------
    np.ndarray
        ``float32`` array made of, in order: the one-hot atomic number, the
        one-hot neighbour count, the one-hot total hydrogen count, the
        one-hot formal charge, the in-ring flag and the aromaticity flag.
    """
    # Each one-hot group is computed in the order it appears in the output.
    element_part = one_hot_encode(atom.GetAtomicNum(),
                                  [5, 6, 7, 8, 9, 15, 16, 17, 35, 53, 999])
    neighbor_part = one_hot_encode(len(atom.GetNeighbors()),
                                   [0, 1, 2, 3, 4, 5])
    hydrogen_part = one_hot_encode(atom.GetTotalNumHs(), [0, 1, 2, 3, 4])
    charge_part = one_hot_encode(atom.GetFormalCharge(),
                                 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5])

    # Boolean flags are converted to 0.0 / 1.0 by the float32 cast below.
    flag_part = [atom.IsInRing(), atom.GetIsAromatic()]

    return np.array(
        [
            *element_part,
            *neighbor_part,
            *hydrogen_part,
            *charge_part,
            *flag_part,
        ],
        dtype=np.float32,
    )
def test_one_hot_encode(self):
    """Check `one_hot_encode` against string and numeric allowable sets."""
    # Each case is (positional call arguments, expected encoding). Every
    # case builds its own allowable-set list so no state is shared.
    cases = (
        # string set
        (("a", ["a", "b", "c"]), [1.0, 0.0, 0.0]),
        # integer set
        ((2, [0.0, 1, 2]), [0.0, 0.0, 1.0]),
        # include_unknown_set is False
        ((3, [0.0, 1, 2]), [0.0, 0.0, 0.0]),
        # include_unknown_set is True
        ((3, [0.0, 1, 2], True), [0.0, 0.0, 0.0, 1.0]),
    )
    for call_args, expected in cases:
        assert one_hot_encode(*call_args) == expected
def _edge_features(self, mol: RDKitMol, path_atoms: Tuple[int, ...],
                   ring_info) -> np.ndarray:
    """Compute the edge feature vector for a pair of nodes.

    Parameters
    ----------
    mol: RDKitMol
        RDKit molecule instance.
    path_atoms: tuple
        Atom indices of the shortest path between the pair of nodes.
    ring_info: list
        Different rings that contain the pair of atoms.

    Returns
    -------
    np.ndarray
        Concatenation of, in order: ``self.max_length`` per-bond blocks
        (bond type, conjugation and same-ring one-hots, or a length-6 zero
        vector once the path has run out of bonds), a one-hot position
        vector of length ``self.max_length + 2``, and a ring vector whose
        first entry flags whether ``ring_info`` is non-empty.
    """
    num_path_atoms = len(path_atoms)

    # Collect the bond joining each consecutive pair of atoms on the path.
    bonds_on_path = []
    for next_pos in range(1, num_path_atoms):
        bond = mol.GetBondBetweenAtoms(path_atoms[next_pos - 1],
                                       path_atoms[next_pos])
        if bond is None:
            import warnings
            warnings.warn('Valid idx of bonds must be passed')
        bonds_on_path.append(bond)

    # One block per slot up to max_length; slots beyond the path are zeros.
    num_bonds = len(bonds_on_path)
    blocks = []
    for slot in range(self.max_length):
        if slot >= num_bonds:
            blocks.append(np.zeros(6))
            continue
        current_bond = bonds_on_path[slot]
        type_one_hot = get_bond_type_one_hot(current_bond)
        conjugated_one_hot = get_bond_is_conjugated_one_hot(current_bond)
        same_ring_one_hot = get_bond_is_in_same_ring_one_hot(current_bond)
        blocks.append(
            np.concatenate(
                [type_one_hot, conjugated_one_hot, same_ring_one_hot]))

    # One-hot position: the atom count of the path, sent to the last index
    # whenever `atom count + 1` exceeds max_length.
    if num_path_atoms + 1 > self.max_length:
        hot_index = self.max_length + 1
    else:
        hot_index = num_path_atoms
    position_one_hot = np.zeros(self.max_length + 2)
    position_one_hot[hot_index] = 1
    blocks.append(position_one_hot)

    if not ring_info:
        # This will return a boolean vector with all entries False
        ring_block = [0.0] + one_hot_encode(ring_info,
                                            allowable_set=self.RING_TYPES)
    else:
        per_ring = [
            one_hot_encode(ring, allowable_set=self.RING_TYPES)
            for ring in ring_info
        ]
        # The 1.0 float value represents True Boolean
        ring_block = [1.0] + np.any(per_ring, axis=0).tolist()
    blocks.append(ring_block)

    return np.concatenate(blocks, axis=0)
def _featurize_string(self, string: str) -> np.ndarray:
    """Compute one-hot featurization of string.

    Parameters
    ----------
    string: str
        An arbitrary string to be featurized.

    Returns
    -------
    np.ndarray
        An one hot vector encoded from arbitrary input string.
        The shape is `(max_length, len(charset) + 1)`.
        The index of unknown character is `len(charset)`.

    Raises
    ------
    ValueError
        If `self.max_length` is an int and `string` is longer than it.
    """
    # Length validation and padding only apply when an integer limit is set.
    if isinstance(self.max_length, int):
        if len(string) > self.max_length:  # Validation
            # Fill in the placeholder so the error names the offending
            # string instead of showing a literal "{}".
            raise ValueError(
                "The length of {} is longer than `max_length`.".format(
                    string))
        string = self.pad_string(string)  # Padding

    return np.array([
        one_hot_encode(val, self.charset, include_unknown_set=True)
        for val in string
    ])
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Compute one-hot featurization of this molecule.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        An one hot vector encoded from SMILES.
        The shape is `(max_length, len(charset) + 1)`.
        The index of unknown character is `len(charset)`.
        An empty array is returned when the SMILES string is longer than
        `max_length`.

    Raises
    ------
    ImportError
        If RDKit is not installed.
    """
    try:
        from rdkit import Chem
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    smiles = Chem.MolToSmiles(mol)

    # validation
    if len(smiles) > self.max_length:
        # Fill in the placeholder so the log names the offending SMILES
        # instead of showing a literal "{}".
        logger.info(
            "The length of {} is longer than `max_length`. So we return an empty array."
            .format(smiles))
        return np.array([])

    smiles = self.pad_smile(smiles)
    return np.array([
        one_hot_encode(val, self.charset, include_unknown_set=True)
        for val in smiles
    ])
def get_atom_explicit_valence_one_hot(
        atom: RDKitAtom,
        allowable_set: List[int] = DEFAULT_ATOM_EXPLICIT_VALENCE_SET,
        include_unknown_set: bool = True) -> List[float]:
    """One-hot encode the explicit valence of an atom.

    Parameters
    ----------
    atom: rdkit.Chem.rdchem.Atom
        RDKit atom object
    allowable_set: List[int]
        Explicit valence values to consider. The default set is
        `[1, ..., 6]`.
    include_unknown_set: bool, default True
        If true, any valence not in `allowable_set` is mapped to the extra
        index `len(allowable_set)`.

    Returns
    -------
    List[float]
        A one-hot vector of the atom's explicit valence.
        Its length is `len(allowable_set)` when `include_unknown_set` is
        False, and `len(allowable_set) + 1` when it is True.
    """
    explicit_valence = atom.GetExplicitValence()
    return one_hot_encode(explicit_valence, allowable_set,
                          include_unknown_set)