Esempio n. 1
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """Calculate atomic coordinates.

        If the molecule carries no conformer, one is generated by adding
        hydrogens, embedding with ETKDG and removing the hydrogens again.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit Mol object
        **kwargs
            Only the deprecated ``mol`` keyword is recognised. When present
            it overrides ``datapoint`` and a ``DeprecationWarning`` is issued.

        Returns
        -------
        np.ndarray
            A numpy array of atomic coordinates. The shape is `(n_atoms, 3)`.
            Coordinates are divided by the bohr radius (in Angstrom) when
            ``self.use_bohr`` is truthy, otherwise they are returned as stored
            on conformer 0.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        import warnings

        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")
        if 'mol' in kwargs:
            datapoint = kwargs.get("mol")
            # Emit the deprecation as a warning. Raising it (as was done
            # previously) aborted the call and made the legacy keyword
            # unusable, leaving the assignment above as dead code.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        # Generate a conformer when the molecule does not have one yet.
        num_confs = len(datapoint.GetConformers())
        if num_confs == 0:
            datapoint = Chem.AddHs(datapoint)
            AllChem.EmbedMolecule(datapoint, AllChem.ETKDG())
            datapoint = Chem.RemoveHs(datapoint)

        N = datapoint.GetNumAtoms()
        coords = np.zeros((N, 3))

        # RDKit stores atomic coordinates in Angstrom. Atomic unit of length is the
        # bohr (1 bohr = 0.529177 Angstrom). Converting units makes gradient calculation
        # consistent with most QM software packages.
        convert_to_bohr = self.use_bohr
        for atom in range(N):
            position = datapoint.GetConformer(0).GetAtomPosition(atom)
            if convert_to_bohr:
                position = position.__idiv__(0.52917721092)
            coords[atom, 0] = position.x
            coords[atom, 1] = position.y
            coords[atom, 2] = position.z

        return coords
Esempio n. 2
0
    def _featurize(self, mol: RDKitMol) -> np.ndarray:
        """Return the Cartesian coordinates of every atom in ``mol``.

        A conformer is embedded with ETKDG (hydrogens are added for the
        embedding and stripped afterwards) when the molecule has none.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object

        Returns
        -------
        np.ndarray
            A numpy array of atomic coordinates. The shape is `(n_atoms, 3)`.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")

        # No conformer available yet: build one.
        if len(mol.GetConformers()) == 0:
            mol = Chem.AddHs(mol)
            AllChem.EmbedMolecule(mol, AllChem.ETKDG())
            mol = Chem.RemoveHs(mol)

        n_atoms = mol.GetNumAtoms()
        xyz = np.zeros((n_atoms, 3))

        # RDKit keeps positions in Angstrom; QM packages generally work in
        # bohr (1 bohr = 0.529177 Angstrom), so optionally rescale to keep
        # gradient calculations consistent with them.
        in_bohr = self.use_bohr
        for idx in range(n_atoms):
            point = mol.GetConformer(0).GetAtomPosition(idx)
            if in_bohr:
                point = point.__idiv__(0.52917721092)
            xyz[idx, 0] = point.x
            xyz[idx, 1] = point.y
            xyz[idx, 2] = point.z

        return xyz
Esempio n. 3
0
    def prune_conformers(self, mol: RDKitMol) -> RDKitMol:
        """Prune conformers from a molecule using an RMSD threshold.

        Conformers are visited in order of increasing energy. The
        lowest-energy conformer is always kept; each later conformer is kept
        only if its RMSD to every conformer kept so far is at least
        ``self.rmsd_threshold``, and selection stops once
        ``self.max_conformers`` conformers have been kept.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object

        Returns
        -------
        new_mol: rdkit.Chem.rdchem.Mol
            A new rdkit.Chem.rdchem.Mol containing the chosen conformers,
            sorted by increasing energy. The input ``mol`` itself is returned
            unchanged when ``self.rmsd_threshold`` is negative or the molecule
            has at most one conformer.

        Raises
        ------
        ValueError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
        except ModuleNotFoundError:
            raise ValueError("This function requires RDKit to be installed.")

        if self.rmsd_threshold < 0 or mol.GetNumConformers() <= 1:
            return mol
        energies = self.get_conformer_energies(mol)
        rmsd = self.get_conformer_rmsd(mol)

        # Positions (into the conformer list) of the conformers to retain,
        # in order of increasing energy.
        keep: List[int] = []
        for i in np.argsort(energies):
            # always keep lowest-energy conformer
            if not keep:
                keep.append(i)
                continue

            # nothing further can be selected once max_conformers is reached
            if len(keep) >= self.max_conformers:
                break

            # RMSD between this conformer and every conformer already kept
            this_rmsd = rmsd[i][np.asarray(keep, dtype=int)]

            # keep only conformers outside the RMSD threshold of all kept ones
            if np.all(this_rmsd >= self.rmsd_threshold):
                keep.append(i)

        # create a new molecule to hold the chosen conformers
        # this ensures proper conformer IDs and energy-based ordering
        new_mol = Chem.Mol(mol)
        new_mol.RemoveAllConformers()
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
        for i in keep:
            conf = mol.GetConformer(conf_ids[i])
            new_mol.AddConformer(conf, assignId=True)
        return new_mol
Esempio n. 4
0
def pair_features(mol: RDKitMol,
                  bond_features_map: dict,
                  bond_adj_list: List,
                  bt_len: int = 6,
                  graph_distance: bool = True,
                  max_pair_distance: Optional[int] = None) -> np.ndarray:
  """Helper method used to compute atom pair feature vectors.

  Many different featurization methods compute atom pair features
  such as WeaveFeaturizer. Note that atom pair features could be
  for pairs of atoms which aren't necessarily bonded to one
  another.

  Parameters
  ----------
  mol: RDKit Mol
    Molecule to compute features on.
  bond_features_map: dict
    Dictionary that maps pairs of atom ids (say `(2, 3)` for a bond between
    atoms 2 and 3) to the features for the bond between them. Keys are
    looked up as sorted tuples.
  bond_adj_list: list of lists
    `bond_adj_list[i]` is a list of the atom indices that atom `i` shares a
    bond with. This list is symmetrical so if `j in bond_adj_list[i]` then `i
    in bond_adj_list[j]`.
  bt_len: int, optional (default 6)
    The number of different bond types to consider.
  graph_distance: bool, optional (default True)
    If true, use graph distance between atoms. Else use euclidean
    distance, in which case the specified `mol` must have a conformer:
    atomic positions are retrieved from `mol.GetConformer(0)`.
  max_pair_distance: Optional[int], (default None)
    This value can be a positive integer or None. This
    parameter determines the maximum graph distance at which pair
    features are computed. For example, if `max_pair_distance==2`,
    then pair features are computed only for atoms at most graph
    distance 2 apart. If `max_pair_distance` is `None`, all pairs are
    considered (effectively infinite `max_pair_distance`)

  Note
  ----
  This method requires RDKit to be installed.

  Returns
  -------
  features: np.ndarray
    Of shape `(N_edges, bt_len + max_distance + 1)`, where `max_distance`
    is 7 when `graph_distance` is true and 1 otherwise. Row `n` holds the
    features of the pair in column `n` of `pair_edges`: the first `bt_len`
    entries are bond features, entry `bt_len` flags pairs of distinct atoms
    sharing a ring, and the remaining entries hold the distance encoding
    (a single euclidean distance when `graph_distance` is false).
  pair_edges: np.ndarray
    Of shape `(2, num_pairs)` where `num_pairs` is the total number of
    pairs within `max_pair_distance` of one another.

  Raises
  ------
  ValueError
    If a bonded pair of atoms is absent from `pair_edges`.
  """
  # NOTE(review): the return annotation says np.ndarray but a
  # (features, pair_edges) tuple is returned; the annotation is left as is to
  # avoid depending on names that may not be imported in this file.
  max_distance = 7 if graph_distance else 1
  num_atoms = mol.GetNumAtoms()
  pair_edges = max_pair_distance_pairs(mol, max_pair_distance)
  N_edges = pair_edges.shape[1]
  features = np.zeros((N_edges, bt_len + max_distance + 1))

  # Map every (a1, a2) pair to the row of `features` that describes it.
  mapping = {}
  for n in range(N_edges):
    a1, a2 = pair_edges[:, n]
    mapping[(int(a1), int(a2))] = n

  rings = mol.GetRingInfo().AtomRings()
  for a1 in range(num_atoms):
    for a2 in bond_adj_list[a1]:
      # first `bt_len` features are bond features(if applicable)
      if (int(a1), int(a2)) not in mapping:
        raise ValueError(
            "Malformed molecule with bonds not in specified graph distance.")
      n = mapping[(int(a1), int(a2))]
      features[n, :bt_len] = np.asarray(
          bond_features_map[tuple(sorted((a1, a2)))], dtype=float)
    for ring in rings:
      if a1 not in ring:
        continue
      for a2 in ring:
        n = mapping.get((int(a1), int(a2)))
        if n is None:
          # Ring pairs outside the max pair distance have no feature row
          continue
        # `bt_len`-th feature is if the pair of atoms are in the same ring
        features[n, bt_len] = 0 if a2 == a1 else 1
    # graph distance between two atoms
    if graph_distance:
      # per the helper's usage here, `distance[a2]` is the encoded distance
      # from a1 to a2 (presumably one-hot; confirm against find_distance)
      distance = find_distance(
          a1, num_atoms, bond_adj_list, max_distance=max_distance)
      for a2 in range(num_atoms):
        n = mapping.get((int(a1), int(a2)))
        if n is None:
          # Pairs outside the max pair distance have no feature row
          continue
        features[n, bt_len + 1:] = distance[a2]

  # Euclidean distance between atoms
  if not graph_distance:
    conformer = mol.GetConformer(0)
    coords = np.zeros((num_atoms, 3))
    for atom in range(num_atoms):
      pos = conformer.GetAtomPosition(atom)
      coords[atom, :] = [pos.x, pos.y, pos.z]
    # `features` has one row per pair, so compute the distance per edge and
    # store it in the last column (indexing it as a 3-D N x N array fails).
    first = np.asarray(pair_edges[0]).astype(int)
    second = np.asarray(pair_edges[1]).astype(int)
    features[:, -1] = np.linalg.norm(coords[first] - coords[second], axis=1)

  return features, pair_edges