Exemple #1
0
  def atom_features(self, atom: RDKitAtom) -> np.ndarray:
    """
    Build the MAT-specific feature vector for a single atom.

    Deepchem already ships an atom_features function, but MAT needs a
    different (and much smaller) set of features, including the neighbor
    count and ring membership. A dedicated function avoids computing
    features that MAT never uses.

    The vector is the concatenation, in this order, of one-hot encodings of
    the atomic number, the number of neighbors, the total number of
    hydrogens and the formal charge, followed by the in-ring flag and the
    aromaticity flag.

    Parameters
    ----------
    atom: RDKitAtom
      RDKit Atom object.

    Returns
    -------
    ndarray
      Numpy float32 array containing atom features.
    """
    # (value to encode, allowable set) pairs, in output order.
    encoded_properties = (
        (atom.GetAtomicNum(), [5, 6, 7, 8, 9, 15, 16, 17, 35, 53, 999]),
        (len(atom.GetNeighbors()), list(range(6))),
        (atom.GetTotalNumHs(), list(range(5))),
        (atom.GetFormalCharge(), list(range(-5, 6))),
    )

    feature_values = []
    for value, allowable in encoded_properties:
      feature_values.extend(one_hot_encode(value, allowable))

    # Boolean flags are converted to 0.0 / 1.0 by the float32 cast below.
    feature_values.extend([atom.IsInRing(), atom.GetIsAromatic()])

    return np.array(feature_values, dtype=np.float32)
Exemple #2
0
 def test_one_hot_encode(self):
     """Check `one_hot_encode` on string/integer sets and the unknown slot."""
     # Each case: (value, allowable set, extra positional args, expected).
     cases = [
         # string set
         ("a", ["a", "b", "c"], (), [1.0, 0.0, 0.0]),
         # integer set
         (2, [0.0, 1, 2], (), [0.0, 0.0, 1.0]),
         # include_unknown_set is False
         (3, [0.0, 1, 2], (), [0.0, 0.0, 0.0]),
         # include_unknown_set is True
         (3, [0.0, 1, 2], (True,), [0.0, 0.0, 0.0, 1.0]),
     ]
     for value, allowable, extra_args, expected in cases:
         assert one_hot_encode(value, allowable, *extra_args) == expected
    def _edge_features(self, mol: RDKitMol, path_atoms: Tuple[int, ...],
                       ring_info) -> np.ndarray:
        """Computes the edge features for a given pair of nodes.

        The result is a flat concatenation of:

        * `self.max_length` per-bond blocks (bond type, conjugation and
          same-ring one-hots), padded with `np.zeros(6)` for slots past the
          end of the path;
        * a one-hot position vector of length `self.max_length + 2`;
        * a ring block: a leading 1.0/0.0 flag telling whether `ring_info`
          is non-empty, followed by an encoding over `self.RING_TYPES`.

        Parameters
        ----------
        mol: RDKitMol
            RDKit molecule instance.
        path_atoms: tuple
            Shortest path between the given pair of nodes.
        ring_info: list
            Different rings that contain the pair of atoms

        Returns
        -------
        np.ndarray
            Concatenated edge feature vector.
        """
        n_path_atoms = len(path_atoms)

        # Look up the bond joining each consecutive pair of atoms on the path.
        bonds_on_path = []
        for pos in range(1, n_path_atoms):
            found = mol.GetBondBetweenAtoms(path_atoms[pos - 1],
                                            path_atoms[pos])
            if found is None:
                import warnings
                warnings.warn('Valid idx of bonds must be passed')
            # A missing bond is still recorded (as None) after the warning.
            bonds_on_path.append(found)

        # One fixed-size block per slot; slots beyond the path are zeros.
        blocks = []
        n_bonds = len(bonds_on_path)
        for slot in range(self.max_length):
            if slot >= n_bonds:
                blocks.append(np.zeros(6))
                continue
            current = bonds_on_path[slot]
            type_vec = get_bond_type_one_hot(current)
            conjugated_vec = get_bond_is_conjugated_one_hot(current)
            same_ring_vec = get_bond_is_in_same_ring_one_hot(current)
            blocks.append(
                np.concatenate([type_vec, conjugated_vec, same_ring_vec]))

        # Position index is the number of atoms on the path, replaced by
        # `max_length + 1` once `len(path_atoms) + 1` exceeds `max_length`.
        overflow = n_path_atoms + 1 > self.max_length
        position_idx = self.max_length + 1 if overflow else n_path_atoms
        position_vec = np.zeros(self.max_length + 2)
        position_vec[position_idx] = 1
        blocks.append(position_vec)

        if not ring_info:
            # This will return a boolean vector with all entries False
            ring_block = [0.0] + one_hot_encode(
                ring_info, allowable_set=self.RING_TYPES)
        else:
            per_ring = [
                one_hot_encode(ring, allowable_set=self.RING_TYPES)
                for ring in ring_info
            ]
            # The 1.0 float value represents True Boolean
            ring_block = [1.0] + np.any(per_ring, axis=0).tolist()
        blocks.append(ring_block)

        return np.concatenate(blocks, axis=0)
Exemple #4
0
    def _featurize_string(self, string: str) -> np.ndarray:
        """Compute one-hot featurization of string.

        When `self.max_length` is an int, the string is validated against it
        and then padded with `self.pad_string` before encoding. Otherwise the
        string is encoded as-is.

        Parameters
        ----------
        string: str
          An arbitrary string to be featurized.

        Returns
        -------
        np.ndarray
          An one hot vector encoded from arbitrary input string, one row per
          character of the (possibly padded) string.
          The shape is `(max_length, len(charset) + 1)`.
          The index of unknown character is `len(charset)`.

        Raises
        ------
        ValueError
          If `self.max_length` is an int and `string` is longer than it.
        """
        if isinstance(self.max_length, int):
            if len(string) > self.max_length:  # Validation
                # Fill the placeholder so the error names the offending string.
                raise ValueError(
                    "The length of {} is longer than `max_length`.".format(
                        string))
            string = self.pad_string(string)  # Padding

        return np.array([
            one_hot_encode(val, self.charset, include_unknown_set=True)
            for val in string
        ])
  def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Compute one-hot featurization of this molecule.

    The molecule is converted to a SMILES string with `Chem.MolToSmiles`,
    padded with `self.pad_smile`, and each character is one-hot encoded
    against `self.charset` with an extra slot for unknown characters.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit Mol object

    Returns
    -------
    np.ndarray
      An one hot vector encoded from SMILES.
      The shape is `(max_length, len(charset) + 1)`.
      The index of unknown character is `len(charset)`.
      An empty array is returned when the SMILES string is longer than
      `self.max_length`.

    Raises
    ------
    ImportError
      If RDKit is not installed.
    """
    try:
      from rdkit import Chem
    except ModuleNotFoundError:
      raise ImportError("This class requires RDKit to be installed.")

    smiles = Chem.MolToSmiles(mol)
    # validation
    if len(smiles) > self.max_length:
      # Pass the SMILES as a logging argument so the message identifies the
      # molecule that was skipped.
      logger.info(
          "The length of %s is longer than `max_length`. So we return an empty array.",
          smiles)
      return np.array([])

    smiles = self.pad_smile(smiles)
    return np.array([
        one_hot_encode(val, self.charset, include_unknown_set=True)
        for val in smiles
    ])
Exemple #6
0
def get_atom_explicit_valence_one_hot(
        atom: RDKitAtom,
        allowable_set: List[int] = DEFAULT_ATOM_EXPLICIT_VALENCE_SET,
        include_unknown_set: bool = True) -> List[float]:
    """Get an one-hot feature of explicit valence of an atom.

    Parameters
    ----------
    atom: rdkit.Chem.rdchem.Atom
      RDKit atom object
    allowable_set: List[int]
      Atom explicit valence to consider. The default set is `[1, ..., 6]`
    include_unknown_set: bool, default True
      If true, the index of all types not in `allowable_set` is `len(allowable_set)`.

    Returns
    -------
    List[float]
      A one-hot vector of explicit valence an atom has.
      If `include_unknown_set` is False, the length is `len(allowable_set)`.
      If `include_unknown_set` is True, the length is `len(allowable_set) + 1`.
    """
    explicit_valence = atom.GetExplicitValence()
    return one_hot_encode(explicit_valence,
                          allowable_set=allowable_set,
                          include_unknown_set=include_unknown_set)