Ejemplo n.º 1
0
    def get_conformer_rmsd(mol: RDKitMol) -> np.ndarray:
        """Compute the pairwise RMSD matrix between the conformers of `mol`.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object holding the conformers to compare.

        Returns
        -------
        rmsd: np.ndarray
            Symmetric float matrix of shape `(NumConformers, NumConformers)`.
            Entry `(i, j)` is the value returned by `AllChem.GetBestRMS` for
            the i-th and j-th conformers; the diagonal stays zero.

        Raises
        ------
        ValueError
            If RDKit cannot be imported.
        """
        try:
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ValueError("This function requires RDKit to be installed.")

        conformer_ids = [conformer.GetId() for conformer in mol.GetConformers()]
        n_conformers = len(conformer_ids)
        rmsd = np.zeros((n_conformers, n_conformers), dtype=float)

        # Only the strict upper triangle is computed; each value is mirrored.
        for ref_pos in range(n_conformers):
            for fit_pos in range(ref_pos + 1, n_conformers):
                value = AllChem.GetBestRMS(mol, mol, conformer_ids[ref_pos],
                                           conformer_ids[fit_pos])
                rmsd[ref_pos, fit_pos] = value
                rmsd[fit_pos, ref_pos] = rmsd[ref_pos, fit_pos]
        return rmsd
Ejemplo n.º 2
0
def mol_to_graph(mol: RDKitMol):
    """Convert an RDKit Mol into a NetworkX graph.

    Atoms become nodes (keyed by atom index) and bonds become undirected
    edges between the indices of their begin and end atoms.

    Parameters
    ----------
    mol: RDKit Mol
        The molecule to convert into a graph.

    Returns
    -------
    graph: networkx.Graph
        Graph with atom indices as nodes and bonds as edges.

    Raises
    ------
    ValueError
        If NetworkX cannot be imported.

    Note
    ----
    This function requires NetworkX to be installed.
    """
    try:
        import networkx as nx
    except ModuleNotFoundError:
        raise ValueError("This function requires NetworkX to be installed.")

    graph = nx.Graph()
    # Add every atom first so that atoms without bonds still appear as nodes.
    graph.add_nodes_from(range(mol.GetNumAtoms()))
    # Walk the bond sequence once instead of re-fetching and indexing it
    # twice per bond.
    graph.add_edges_from((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
                         for bond in mol.GetBonds())
    return graph
Ejemplo n.º 3
0
    def generate_conformers(self, mol: RDKitMol) -> RDKitMol:
        """Embed, minimize and prune conformers for a molecule.

        The molecule returned by `self.embed_molecule` is minimized with
        `self.minimize_conformers` and then handed to
        `self.prune_conformers`, whose result is returned.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object

        Returns
        -------
        mol: rdkit.Chem.rdchem.Mol
            The molecule produced by `self.prune_conformers`.

        Raises
        ------
        RuntimeError
            If embedding produced no conformers. The message includes the
            molecule's `_Name` property when it is set.
        """
        embedded = self.embed_molecule(mol)

        if embedded.GetNumConformers():
            # minimization and pruning
            self.minimize_conformers(embedded)
            return self.prune_conformers(embedded)

        # Embedding failed: build the error message, naming the molecule
        # when possible.
        if embedded.HasProp('_Name'):
            suffix = ' "{}".'.format(embedded.GetProp('_Name'))
        else:
            suffix = '.'
        raise RuntimeError('No conformers generated for molecule' + suffix)
Ejemplo n.º 4
0
    def prune_conformers(self, mol: RDKitMol) -> RDKitMol:
        """Prune conformers using an RMSD threshold, lowest energy first.

        Conformers are visited in order of increasing energy. The first one
        is always kept; each later one is kept only if its RMSD to every
        conformer kept so far is at least `self.rmsd_threshold`. Selection
        stops once `self.max_conformers` conformers have been kept.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object

        Returns
        -------
        new_mol: rdkit.Chem.rdchem.Mol
            A new rdkit.Chem.rdchem.Mol containing the chosen conformers,
            sorted by increasing energy. The input `mol` itself is returned
            unchanged when `self.rmsd_threshold` is negative or `mol` has at
            most one conformer.

        Raises
        ------
        ValueError
            If RDKit cannot be imported.
        """
        try:
            from rdkit import Chem
        except ModuleNotFoundError:
            raise ValueError("This function requires RDKit to be installed.")

        if self.rmsd_threshold < 0 or mol.GetNumConformers() <= 1:
            return mol
        energies = self.get_conformer_energies(mol)
        rmsd = self.get_conformer_rmsd(mol)

        # Positions (not conformer ids) of the selected conformers.
        keep: List[int] = []
        for i in np.argsort(energies):  # increasing energy
            # always keep lowest-energy conformer
            if not keep:
                keep.append(i)
                continue

            # Nothing further can be selected once the cap is reached, so
            # stop scanning instead of collecting rejected conformers.
            if len(keep) >= self.max_conformers:
                break

            # keep only conformers outside the RMSD threshold of every
            # conformer selected so far
            this_rmsd = rmsd[i][np.asarray(keep, dtype=int)]
            if np.all(this_rmsd >= self.rmsd_threshold):
                keep.append(i)

        # create a new molecule to hold the chosen conformers
        # this ensures proper conformer IDs and energy-based ordering
        new_mol = Chem.Mol(mol)
        new_mol.RemoveAllConformers()
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
        for i in keep:
            conf = mol.GetConformer(conf_ids[i])
            new_mol.AddConformer(conf, assignId=True)
        return new_mol
    def _featurize(self, mol: RDKitMol) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit mol object. Gasteiger charges are computed on it in place
            when the first atom does not carry a `_GasteigerCharge` property.

        Returns
        -------
        graph: GraphData
            A molecule graph with node features, a directed edge index (each
            bond appears in both directions) and edge features. When
            `self.add_self_edges` is set, one self edge with all-zero
            features is appended per atom.
        """
        from rdkit import Chem
        from rdkit.Chem import AllChem

        # construct atom and bond features
        try:
            mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
        except Exception:
            # If partial charges were not computed
            AllChem.ComputeGasteigerCharges(mol)

        h_bond_infos = construct_hydrogen_bonding_info(mol)
        sssr = Chem.GetSymmSSSR(mol)

        # construct atom (node) feature
        # NOTE: the builtin `float`/`int` replace the `np.float`/`np.int`
        # aliases, which no longer exist in current NumPy releases.
        atom_features = np.array(
            [
                _construct_atom_feature(atom, h_bond_infos, sssr)
                for atom in mol.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) information
        src, dest, bond_features = [], [], []
        for bond in mol.GetBonds():
            # add edge list considering a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]
            bond_features += 2 * [_construct_bond_feature(bond)]

        if self.add_self_edges:
            num_atoms = mol.GetNumAtoms()
            self_loops = list(range(num_atoms))
            src += self_loops
            dest += self_loops
            # add dummy edge features
            # NOTE(review): a molecule without bonds leaves `bond_features`
            # empty, so this lookup raises IndexError.
            bond_fea_length = len(bond_features[0])
            bond_features += num_atoms * [[0] * bond_fea_length]

        return GraphData(node_features=atom_features,
                         edge_index=np.array([src, dest], dtype=int),
                         edge_features=np.array(bond_features, dtype=float))
    def _featurize(self, mol: RDKitMol) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit mol object. When `self.use_partial_charge` is set and the
            first atom has no `_GasteigerCharge` property, Gasteiger charges
            are computed on it in place.

        Returns
        -------
        graph: GraphData
            A molecule graph with node features and a directed edge index
            (each bond appears in both directions). Edge features are
            included only when `self.use_edges` is set, otherwise they are
            `None`.

        Raises
        ------
        ImportError
            If partial charges must be computed and RDKit is not installed.
        """
        if self.use_partial_charge:
            try:
                mol.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
            except Exception:
                # If partial charges were not computed. `Exception` rather
                # than a bare `except` so that KeyboardInterrupt/SystemExit
                # still propagate.
                try:
                    from rdkit.Chem import AllChem
                    AllChem.ComputeGasteigerCharges(mol)
                except ModuleNotFoundError:
                    raise ImportError(
                        "This class requires RDKit to be installed.")

        # construct atom (node) feature
        h_bond_infos = construct_hydrogen_bonding_info(mol)
        atom_features = np.asarray(
            [
                _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                        self.use_partial_charge)
                for atom in mol.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) index
        src, dest = [], []
        for bond in mol.GetBonds():
            # add edge list considering a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]

        # construct edge (bond) feature
        bond_features = None  # default None
        if self.use_edges:
            features = []
            for bond in mol.GetBonds():
                # one copy per direction, matching the edge index above
                features += 2 * [_construct_bond_feature(bond)]
            bond_features = np.asarray(features, dtype=float)

        return GraphData(node_features=atom_features,
                         edge_index=np.asarray([src, dest], dtype=int),
                         edge_features=bond_features)
Ejemplo n.º 7
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """Calculate atomic coordinates.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit Mol object. If it has no conformer, one is embedded with
            ETKDG on a hydrogen-added copy and the hydrogens are removed
            again before the coordinates are read.
        **kwargs
            Only the deprecated `mol` keyword is inspected.

        Returns
        -------
        np.ndarray
            A numpy array of atomic coordinates. The shape is `(n_atoms, 3)`.
            Coordinates are divided by 0.52917721092 (Angstrom to bohr) when
            `self.use_bohr` is set.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        DeprecationWarning
            If the deprecated `mol` keyword argument is supplied.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")
        if 'mol' in kwargs:
            # NOTE(review): the exception below makes this assignment dead
            # code; `warnings.warn` may have been intended. Behavior is kept
            # as is so existing callers see the same exception.
            datapoint = kwargs.get("mol")
            raise DeprecationWarning(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.'
            )

        # Check whether num_confs >=1 or not
        if len(datapoint.GetConformers()) == 0:
            datapoint = Chem.AddHs(datapoint)
            AllChem.EmbedMolecule(datapoint, AllChem.ETKDG())
            datapoint = Chem.RemoveHs(datapoint)

        N = datapoint.GetNumAtoms()
        coords = np.zeros((N, 3))
        if N:
            # Fetch the conformer once rather than once per atom.
            conformer = datapoint.GetConformer(0)
            for atom_idx in range(N):
                position = conformer.GetAtomPosition(atom_idx)
                coords[atom_idx] = (position.x, position.y, position.z)

        # RDKit stores atomic coordinates in Angstrom. Atomic unit of length
        # is the bohr (1 bohr = 0.529177 Angstrom). Converting units makes
        # gradient calculation consistent with most QM software packages.
        if self.use_bohr:
            coords /= 0.52917721092

        return coords
Ejemplo n.º 8
0
    def _featurize(self, datapoint: RDKitMol,
                   **kwargs) -> Optional[GraphMatrix]:
        """Build the adjacency matrix and node feature vector of a molecule.

        Chirality and charges are not encoded. When `self.kekulize` is set
        the molecule is kekulized in place first.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit mol object.
        **kwargs
            Only the deprecated `mol` keyword is inspected; supplying it
            raises `DeprecationWarning`.

        Returns
        -------
        graph: GraphMatrix
            Adjacency matrix of shape `(max_atom_count, max_atom_count)`
            holding encoded bond types, plus a zero-padded vector of encoded
            atomic numbers. `None` is returned when any atom has no encoded
            bond weight (row sum not greater than zero).

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
        except ModuleNotFoundError:
            raise ImportError("This method requires RDKit to be installed.")
        if 'mol' in kwargs:
            datapoint = kwargs.get("mol")
            raise DeprecationWarning(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.'
            )

        if self.kekulize:
            Chem.Kekulize(datapoint)

        adjacency = np.zeros(shape=(self.max_atom_count, self.max_atom_count),
                             dtype=np.float32)

        # Gather bond end points and encoded bond types.
        mol_bonds = datapoint.GetBonds()
        begin = [bond.GetBeginAtomIdx() for bond in mol_bonds]
        end = [bond.GetEndAtomIdx() for bond in mol_bonds]
        bond_type = [self.bond_encoder[bond.GetBondType()] for bond in mol_bonds]

        # Symmetric fill: every bond is written in both directions.
        adjacency[begin, end] = bond_type
        adjacency[end, begin] = bond_type

        n_atoms = datapoint.GetNumAtoms()
        degree = np.sum(adjacency[:n_atoms, :n_atoms], axis=-1)

        encoded_atoms = [
            self.atom_encoder[atom.GetAtomicNum()]
            for atom in datapoint.GetAtoms()
        ]
        padding = [0] * (self.max_atom_count - datapoint.GetNumAtoms())
        node_features = np.array(encoded_atoms + padding, dtype=np.int32)
        graph = GraphMatrix(adjacency, node_features)

        # Reject molecules that contain an atom without any bond weight.
        if (degree > 0).all():
            return graph
        return None
Ejemplo n.º 9
0
    def _featurize(self, mol: RDKitMol) -> np.ndarray:
        """Calculate atomic coordinates.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object. If it has no conformer, one is embedded with
            ETKDG on a hydrogen-added copy and the hydrogens are removed
            again before the coordinates are read.

        Returns
        -------
        np.ndarray
            A numpy array of atomic coordinates. The shape is `(n_atoms, 3)`.
            Coordinates are divided by 0.52917721092 (Angstrom to bohr) when
            `self.use_bohr` is set.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")

        # Check whether num_confs >=1 or not
        if len(mol.GetConformers()) == 0:
            mol = Chem.AddHs(mol)
            AllChem.EmbedMolecule(mol, AllChem.ETKDG())
            mol = Chem.RemoveHs(mol)

        N = mol.GetNumAtoms()
        coords = np.zeros((N, 3))
        if N:
            # Fetch the conformer once rather than once per atom.
            conformer = mol.GetConformer(0)
            for atom_idx in range(N):
                position = conformer.GetAtomPosition(atom_idx)
                coords[atom_idx] = (position.x, position.y, position.z)

        # RDKit stores atomic coordinates in Angstrom. Atomic unit of length
        # is the bohr (1 bohr = 0.529177 Angstrom). Converting units makes
        # gradient calculation consistent with most QM software packages.
        if self.use_bohr:
            coords /= 0.52917721092

        return coords
Ejemplo n.º 10
0
def _create_component_map(mol: RDKitMol,
                          components: List[List[int]]) -> Dict[int, int]:
    """Creates a map from atom ids to disconnected component id

  For each atom in `mol`, maps it to the id of the component in the
  molecule. The intent is that this is used on a molecule whose
  rotatable bonds have been removed. `components` is a list of the
  connected components after this surgery.

  Parameters
  ----------
  mol: RDKit Mol
    molecule to find disconnected compontents in 
  components: List[List[int]]
    List of connected components

  Returns
  -------
  comp_map: Dict[int, int]
    Maps atom ids to component ides
  """
    comp_map = {}
    for i in range(mol.GetNumAtoms()):
        for j in range(len(components)):
            if i in components[j]:
                comp_map[i] = j
                break
    return comp_map
Ejemplo n.º 11
0
  def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
      RDKit mol object.
    **kwargs
      Only the deprecated `mol` keyword is inspected; supplying it raises
      `DeprecationWarning`.

    Returns
    -------
    graph: GraphData
      A molecule graph built from the per-atom features of
      `self._pagtn_atom_featurizer` and the edge index / edge features of
      `self._pagtn_edge_featurizer`.

    Raises
    ------
    DeprecationWarning
      If the deprecated `mol` keyword argument is supplied.
    """
    if 'mol' in kwargs:
      datapoint = kwargs.get("mol")
      raise DeprecationWarning(
          'Mol is being phased out as a parameter, please pass "datapoint" instead.'
      )

    # The builtin `float` replaces the `np.float` alias, which no longer
    # exists in current NumPy releases.
    node_features = np.asarray(
        [self._pagtn_atom_featurizer(atom) for atom in datapoint.GetAtoms()],
        dtype=float)
    edge_index, edge_features = self._pagtn_edge_featurizer(datapoint)
    graph = GraphData(node_features, edge_index, edge_features)
    return graph
Ejemplo n.º 12
0
def convert_protein_to_pdbqt(mol: RDKitMol, outfile: str) -> None:
    """Convert a protein PDB file into a pdbqt file, in place.

    Reads `outfile`, keeps the ROOT/ENDROOT/TORSDOF records and the ATOM
    records, appends the extra PDBQT terms (a `+0.000` charge column and
    the atom symbol) to each ATOM record, and writes the result back to
    `outfile`. All other lines are dropped.

    Parameters
    ----------
    mol: RDKit Mol
        Protein molecule. The serial number of each ATOM record (columns
        7-11, 1-based) is used to look up the atom in `mol`.
    outfile: str
        Filename which already has a valid pdb representation of mol.
    """
    # Read through a context manager so the handle is closed before the
    # same path is reopened for writing.
    with open(outfile) as fin:
        lines = [raw.strip() for raw in fin]

    out_lines = []
    for line in lines:
        # "ROOT" also matches "ENDROOT" records.
        if "ROOT" in line or "TORSDOF" in line:
            out_lines.append("%s\n" % line)
            continue
        if not line.startswith("ATOM"):
            continue
        # Keep only the first 66 columns, then append the PDBQT fields.
        line = line[:66]
        atom_index = int(line[6:11])
        atom = mol.GetAtoms()[atom_index - 1]
        out_lines.append("%s    +0.000 %s\n" %
                         (line, atom.GetSymbol().ljust(2)))

    with open(outfile, 'w') as fout:
        fout.writelines(out_lines)
Ejemplo n.º 13
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """Build the padded (atomic number, coordinates) feature array.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit Mol object
        **kwargs
            Only the deprecated `mol` keyword is inspected; supplying it
            raises `DeprecationWarning`.

        Returns
        -------
        np.ndarray
            The per-atom rows `[atomic_number, coordinates...]` passed
            through `pad_array` with target shape `(max_atoms, 4)`.
        """
        if 'mol' in kwargs:
            datapoint = kwargs.get("mol")
            raise DeprecationWarning(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.'
            )

        # Coordinates come from the wrapped coordinate featurizer.
        coordinates = self.coordfeat._featurize(datapoint)

        # Column vector of atomic numbers, one row per atom.
        numbers = [atom.GetAtomicNum() for atom in datapoint.GetAtoms()]
        atom_numbers = np.array(numbers)[:, np.newaxis]
        assert atom_numbers.shape[0] == coordinates.shape[0]

        stacked = np.concatenate([atom_numbers, coordinates], axis=1)
        return pad_array(stacked, (self.max_atoms, 4))
Ejemplo n.º 14
0
    def _featurize(self, mol: RDKitMol) -> Optional[GraphMatrix]:
        """Build the adjacency matrix and node feature vector of a molecule.

        Chirality and charges are not encoded. When `self.kekulize` is set
        the molecule is kekulized in place first.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit mol object.

        Returns
        -------
        graph: GraphMatrix
            Adjacency matrix of shape `(max_atom_count, max_atom_count)`
            holding encoded bond types, plus a zero-padded vector of encoded
            atomic numbers. `None` is returned when any atom has no encoded
            bond weight (row sum not greater than zero).

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
        except ModuleNotFoundError:
            raise ImportError("This method requires RDKit to be installed.")

        if self.kekulize:
            Chem.Kekulize(mol)

        adjacency = np.zeros(shape=(self.max_atom_count, self.max_atom_count),
                             dtype=np.float32)

        # Gather bond end points and encoded bond types.
        mol_bonds = mol.GetBonds()
        begin = [bond.GetBeginAtomIdx() for bond in mol_bonds]
        end = [bond.GetEndAtomIdx() for bond in mol_bonds]
        bond_type = [self.bond_encoder[bond.GetBondType()] for bond in mol_bonds]

        # Symmetric fill: every bond is written in both directions.
        adjacency[begin, end] = bond_type
        adjacency[end, begin] = bond_type

        n_atoms = mol.GetNumAtoms()
        degree = np.sum(adjacency[:n_atoms, :n_atoms], axis=-1)

        encoded_atoms = [
            self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
        ]
        padding = [0] * (self.max_atom_count - mol.GetNumAtoms())
        node_features = np.array(encoded_atoms + padding, dtype=np.int32)
        graph = GraphMatrix(adjacency, node_features)

        # Reject molecules that contain an atom without any bond weight.
        if (degree > 0).all():
            return graph
        return None
Ejemplo n.º 15
0
def max_pair_distance_pairs(mol: RDKitMol,
                            max_pair_distance: Optional[int]) -> np.ndarray:
  """Helper method which finds atom pairs within max_pair_distance graph distance.

  This helper method is used to find atoms which are within
  `max_pair_distance` graph distance of one another. It relies on the fact
  that powers of an adjacency matrix encode path connectivity: if `adj` is
  the adjacency matrix, then `adj**k` is nonzero at `(i, j)` if and only if
  there is a walk of length `k` between `i` and `j`. Only the zero/nonzero
  pattern matters here, so the powers are tracked as boolean matrices
  (walk of length `k` exists / does not exist). This gives the same pairs
  as the nonzero entries of `I + adj + adj**2 + ... + adj**max_pair_distance`
  without the walk counts growing large enough to overflow the integer
  dtype.

  Parameters
  ----------
  mol: rdkit.Chem.rdchem.Mol
    RDKit molecules
  max_pair_distance: Optional[int], (default None)
    This value can be a positive integer or None. This
    parameter determines the maximum graph distance at which pair
    features are computed. For example, if `max_pair_distance==2`,
    then pair features are computed only for atoms at most graph
    distance 2 apart. If `max_pair_distance` is `None`, all pairs are
    considered (effectively infinite `max_pair_distance`)

  Returns
  -------
  np.ndarray
    Of shape `(2, num_pairs)` where `num_pairs` is the total number of pairs
    within `max_pair_distance` of one another. Self pairs `(i, i)` are
    included. Pairs are listed in row-major order.

  Raises
  ------
  ValueError
    If `max_pair_distance` is smaller than the number of atoms and is not
    positive.
  """
  from rdkit.Chem import rdmolops
  N = len(mol.GetAtoms())
  if max_pair_distance is None or max_pair_distance >= N:
    max_distance = N
  elif max_pair_distance <= 0:
    raise ValueError(
        "max_pair_distance must either be a positive integer or None")
  else:
    max_distance = max_pair_distance

  # 0/1 adjacency in a wide integer type; a product of two 0/1 matrices has
  # entries bounded by N, so it cannot overflow.
  adj = (np.asarray(rdmolops.GetAdjacencyMatrix(mol)) != 0).astype(np.int64)

  # `walks` holds "a walk of exactly k steps exists"; it starts at k = 0
  # (the identity), which also handles the self-pairs (i, i).
  walks = np.eye(N, dtype=bool)
  reachable = walks.copy()
  for _ in range(max_distance):
    walks = (walks.astype(np.int64) @ adj) > 0
    reachable |= walks

  # np.nonzero returns (rows, cols); stacking them gives shape (2, num_pairs).
  return np.array(np.nonzero(reachable))
Ejemplo n.º 16
0
    def coulomb_matrix(self, mol: RDKitMol) -> np.ndarray:
        """Generate Coulomb matrices for each conformer of the given molecule.

        Off-diagonal entries are `z_i * z_j / d_ij` and diagonal entries are
        `0.5 * z_i ** 2.4`, where `z` are atomic numbers and `d` comes from
        `self.get_interatomic_distances`. Every matrix is passed through
        `pad_array` with `self.max_atoms`.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object. If it has no conformer, hydrogens are added and
            one conformer is embedded with ETKDG. Hydrogens are removed when
            `self.remove_hydrogens` is set.

        Returns
        -------
        np.ndarray
            The coulomb matrices of the given molecule: one per conformer, or
            the matrices yielded by `self.randomize_coulomb_matrix` for each
            conformer when `self.randomize` is set.

        Raises
        ------
        ImportError
            If RDKit is not installed.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")

        # Embed a conformer if the molecule does not carry one yet.
        if len(mol.GetConformers()) == 0:
            mol = Chem.AddHs(mol)
            AllChem.EmbedMolecule(mol, AllChem.ETKDG())

        if self.remove_hydrogens:
            mol = Chem.RemoveHs(mol)

        n_atoms = mol.GetNumAtoms()
        z = [atom.GetAtomicNum() for atom in mol.GetAtoms()]

        # These depend only on the atoms, not on the conformer.
        charge_products = np.outer(z, z)
        self_terms = 0.5 * np.array(z)**2.4
        diagonal = range(n_atoms)

        matrices = []
        for conformer in mol.GetConformers():
            distances = self.get_interatomic_distances(conformer)
            matrix = charge_products / distances
            matrix[diagonal, diagonal] = self_terms
            if not self.randomize:
                matrices.append(pad_array(matrix, self.max_atoms))
                continue
            for shuffled in self.randomize_coulomb_matrix(matrix):
                matrices.append(pad_array(shuffled, self.max_atoms))
        return np.asarray(matrices)
Ejemplo n.º 17
0
    def _edge_features(self, mol: RDKitMol, path_atoms: Tuple[int, ...],
                       ring_info) -> np.ndarray:
        """Compute the edge feature vector for a pair of nodes.

        The vector is the concatenation of: `self.max_length` per-bond
        blocks along the path (zero blocks of length 6 where the path is
        shorter), a one-hot position vector of length `self.max_length + 2`,
        and a ring block consisting of a leading flag followed by the ring
        type encoding.

        Parameters
        ----------
        mol : RDKitMol
            RDKit molecule instance.
        path_atoms: tuple
            Shortest path between the given pair of nodes.
        ring_info: list
            Different rings that contain the pair of atoms

        Returns
        -------
        np.ndarray
            One-dimensional array with the concatenated features.
        """
        n_path_atoms = len(path_atoms)

        # Bonds between consecutive atoms on the path.
        bonds_on_path = []
        for step in range(n_path_atoms - 1):
            first, second = path_atoms[step], path_atoms[step + 1]
            found = mol.GetBondBetweenAtoms(first, second)
            if found is None:
                import warnings
                warnings.warn('Valid idx of bonds must be passed')
            bonds_on_path.append(found)

        # One block per path slot; slots beyond the path are zero-filled.
        blocks = []
        n_bonds = len(bonds_on_path)
        for slot in range(self.max_length):
            if slot >= n_bonds:
                blocks.append(np.zeros(6))
                continue
            current = bonds_on_path[slot]
            bond_type = get_bond_type_one_hot(current)
            conjugacy = get_bond_is_conjugated_one_hot(current)
            ring_attach = get_bond_is_in_same_ring_one_hot(current)
            blocks.append(np.concatenate([bond_type, conjugacy, ring_attach]))

        # One-hot position; paths of `max_length` atoms or more share the
        # last slot.
        if n_path_atoms + 1 > self.max_length:
            hot_index = self.max_length + 1
        else:
            hot_index = n_path_atoms
        position_feature = np.zeros(self.max_length + 2)
        position_feature[hot_index] = 1
        blocks.append(position_feature)

        if not ring_info:
            # This will return a boolean vector with all entries False
            ring_block = [0.0] + one_hot_encode(
                ring_info, allowable_set=self.RING_TYPES)
        else:
            encoded_rings = [
                one_hot_encode(ring, allowable_set=self.RING_TYPES)
                for ring in ring_info
            ]
            # The 1.0 float value represents True Boolean
            ring_block = [1.0] + np.any(encoded_rings, axis=0).tolist()
        blocks.append(ring_block)

        return np.concatenate(blocks, axis=0)
Ejemplo n.º 18
0
    def minimize_conformers(self, mol: RDKitMol) -> None:
        """Minimize every conformer of a molecule in place.

        For each conformer a force field is obtained from
        `self.get_molecule_force_field` and its `Minimize` method is called.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit Mol object with embedded conformers.
        """
        for conformer in mol.GetConformers():
            conformer_id = conformer.GetId()
            force_field = self.get_molecule_force_field(mol,
                                                        conf_id=conformer_id)
            force_field.Minimize()
    def _featurize(self, mol: RDKitMol) -> GraphMatrix:
        """Build the adjacency matrix and node feature vector of a molecule.

        When `self.kekulize` is set the molecule is kekulized in place first.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit mol object.

        Returns
        -------
        graph: GraphMatrix
          Adjacency matrix of shape `(max_atom_count, max_atom_count)`
          holding encoded bond types, plus a zero-padded vector of encoded
          atomic numbers. Note that `None` is returned instead when any atom
          has no encoded bond weight (row sum not greater than zero).
        """
        if self.kekulize:
            Chem.Kekulize(mol)

        adjacency = np.zeros(shape=(self.max_atom_count, self.max_atom_count),
                             dtype=np.float32)

        # Gather bond end points and encoded bond types.
        mol_bonds = mol.GetBonds()
        begin = [bond.GetBeginAtomIdx() for bond in mol_bonds]
        end = [bond.GetEndAtomIdx() for bond in mol_bonds]
        bond_type = [self.bond_encoder[bond.GetBondType()] for bond in mol_bonds]

        # Symmetric fill: every bond is written in both directions.
        adjacency[begin, end] = bond_type
        adjacency[end, begin] = bond_type

        n_atoms = mol.GetNumAtoms()
        degree = np.sum(adjacency[:n_atoms, :n_atoms], axis=-1)

        encoded_atoms = [
            self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()
        ]
        padding = [0] * (self.max_atom_count - mol.GetNumAtoms())
        node_features = np.array(encoded_atoms + padding, dtype=np.int32)
        graph = GraphMatrix(adjacency, node_features)

        # Reject molecules that contain an atom without any bond weight.
        if (degree > 0).all():
            return graph
        return None
Ejemplo n.º 20
0
def compute_all_ecfp(mol: RDKitMol,
                     indices: Optional[Set[int]] = None,
                     degree: int = 2) -> Dict[int, str]:
    """Describe the fragment around each atom out to a given degree.

    For each selected atom, the atom environment of radius `degree` is
    extracted as a sub-molecule and written as SMILES. The value stored for
    the atom is the string `"<atomic number>,<fragment SMILES>"`.

    Parameters
    ----------
    mol: rdkit Molecule
        Molecule to compute ecfp fragments on
    indices: Optional[Set[int]]
        Atom indices to compute fragments for. Default (`None`) is all
        atoms of the molecule.
    degree: int
        Graph degree (environment radius) to use when extracting fragments.

    Returns
    -------
    dict
        Dictionary mapping atom index to its
        `"<atomic number>,<fragment SMILES>"` string.
    """
    from rdkit import Chem

    fragments = {}
    for atom_idx in range(mol.GetNumAtoms()):
        if indices is None or atom_idx in indices:
            environment = Chem.FindAtomEnvironmentOfRadiusN(mol,
                                                            degree,
                                                            atom_idx,
                                                            useHs=True)
            fragment_smiles = Chem.MolToSmiles(
                Chem.PathToSubmol(mol, environment))
            atomic_num = mol.GetAtoms()[atom_idx].GetAtomicNum()
            fragments[atom_idx] = "%s,%s" % (atomic_num, fragment_smiles)

    return fragments
Ejemplo n.º 21
0
  def construct_node_features_matrix(self, mol: RDKitMol) -> np.ndarray:
    """
    Stack the `self.atom_features` output of every atom of a molecule.

    Parameters
    ----------
    mol: RDKitMol
      RDKit Mol object.

    Returns
    -------
    Atom_features: ndarray
      Numpy array with one entry per atom, in `mol.GetAtoms()` order.
    """
    per_atom = []
    for atom in mol.GetAtoms():
      per_atom.append(self.atom_features(atom))
    return np.array(per_atom)
Ejemplo n.º 22
0
  def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Build the padded (atomic number, coordinates) feature array.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit Mol object

    Returns
    -------
    np.ndarray
      The per-atom rows `[atomic_number, coordinates...]` passed through
      `pad_array` with target shape `(max_atoms, 4)`.
    """
    # Coordinates come from the wrapped coordinate featurizer.
    coordinates = self.coordfeat._featurize(mol)

    # Column vector of atomic numbers, one row per atom.
    numbers = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    atom_numbers = np.array(numbers)[:, np.newaxis]
    assert atom_numbers.shape[0] == coordinates.shape[0]

    stacked = np.concatenate([atom_numbers, coordinates], axis=1)
    return pad_array(stacked, (self.max_atoms, 4))
Ejemplo n.º 23
0
    def _featurize(self, mol: RDKitMol) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit mol object.

        Returns
        -------
        graph: GraphData
            A molecule graph built from the per-atom features of
            `self._pagtn_atom_featurizer` and the edge index / edge features
            of `self._pagtn_edge_featurizer`.
        """
        # The builtin `float` replaces the `np.float` alias, which no longer
        # exists in current NumPy releases.
        node_features = np.asarray(
            [self._pagtn_atom_featurizer(atom) for atom in mol.GetAtoms()],
            dtype=float)
        edge_index, edge_features = self._pagtn_edge_featurizer(mol)
        graph = GraphData(node_features, edge_index, edge_features)
        return graph
Ejemplo n.º 24
0
  def get_conformer_energies(self, mol: RDKitMol) -> np.ndarray:
    """
    Evaluate the force-field energy of every conformer.

    For each conformer a force field is obtained from
    `self.get_molecule_force_field` and its `CalcEnergy` result is recorded.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit Mol object with embedded conformers.

    Returns
    -------
    energies : np.ndarray
      Float array with one energy per conformer, in `mol.GetConformers()`
      order.
    """
    return np.asarray(
        [
            self.get_molecule_force_field(
                mol, conf_id=conformer.GetId()).CalcEnergy()
            for conformer in mol.GetConformers()
        ],
        dtype=float,
    )
Ejemplo n.º 25
0
    def _pagtn_edge_featurizer(self,
                               mol: RDKitMol) -> Tuple[np.ndarray, np.ndarray]:
        """Calculate bond features from RDKit mol object.

        Every ordered atom pair `(i, j)`, including `i == j`, becomes an
        edge. Pairs that have an entry in the shortest-path table are
        featurized with `self._edge_features`; all other pairs get a zero
        vector of length `7 * self.max_length + 7`.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
            RDKit mol object.

        Returns
        -------
        np.ndarray
            Source and Destination node indexes of each edge, shape
            `(2, n_atoms ** 2)`.
        np.ndarray
            Edge feature vectors, one row per edge.
        """
        n_atoms = mol.GetNumAtoms()
        # To get the shortest paths between two nodes.
        paths_dict = compute_all_pairs_shortest_path(mol)
        # To get info if two nodes belong to the same ring.
        rings_dict = compute_pairwise_ring_info(mol)
        # Featurizer
        feats = []
        src = []
        dest = []
        for i in range(n_atoms):
            for j in range(n_atoms):
                src.append(i)
                dest.append(j)

                if (i, j) not in paths_dict:
                    feats.append(np.zeros(7 * self.max_length + 7))
                    continue
                ring_info = rings_dict.get(self.ordered_pair(i, j), [])
                feats.append(
                    self._edge_features(mol, paths_dict[(i, j)], ring_info))

        # The builtin `int`/`float` replace the `np.int`/`np.float` aliases,
        # which no longer exist in current NumPy releases.
        return np.array([src, dest], dtype=int), np.array(feats, dtype=float)
Ejemplo n.º 26
0
def get_rotatable_bonds(mol: RDKitMol) -> List[Tuple[int, int]]:
    """Locate the rotatable bonds of a molecule with a SMARTS query.

    The SMARTS pattern is taken from the RDKit source:
    https://github.com/rdkit/rdkit/blob/f4529c910e546af590c56eba01f96e9015c269a6/Code/GraphMol/Descriptors/Lipinski.cpp#L107

    Each match is reported as a (from_atom, to_atom) pair of atom indices.
    `rdmolops.FastFindRings` is run on `mol` before matching.

    Parameters
    ----------
    mol: RDKit Mol
      Ligand molecule

    Returns
    -------
    rotatable_bonds: List[List[int, int]]
      The substructure matches of the rotatable-bond pattern in `mol`, as
      returned by `mol.GetSubstructMatches`.

    Note
    ----
    This function requires RDKit to be installed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdmolops
    except ModuleNotFoundError:
        raise ValueError("This function requires RDKit to be installed.")

    rotatable_bond_query = Chem.MolFromSmarts(
        "[!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])("
        "[CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1])&!$([#7,O,S!D1]-!@[CD3]="
        "[N,O,S])&!$([CD3](=[N+])-!@[#7!D1])&!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#"
        "*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])"
        "[CH3])]")
    # The pattern uses ring-bond primitives (`!@`), so ring membership is
    # computed on the molecule before matching.
    rdmolops.FastFindRings(mol)
    return mol.GetSubstructMatches(rotatable_bond_query)
Ejemplo n.º 27
0
    def _featurize(self, mol: RDKitMol) -> np.ndarray:
        """Featurize a single molecule into an image.

        The molecule is laid out in 2D, every bond is rasterised as a line
        into channel 0, and the atom properties are written at the atom
        positions.

        Parameters
        ----------
        mol: rdkit.Chem.rdchem.Mol
          RDKit Mol object

        Returns
        -------
        np.ndarray
          A 3D image array. The shape is `(img_size, img_size, 1)` when
          `self.img_spec == "std"` and `(img_size, img_size, 4)` otherwise.
          An empty array is returned if the SMILES string is longer than
          `self.max_len`, or (non-"std" spec only) if any partial charge
          is NaN.

        Raises
        ------
        IndexError
          If the molecule does not fit into the image.
        """
        from rdkit import Chem
        from rdkit.Chem import AllChem

        smile = Chem.MolToSmiles(mol)
        if len(smile) > self.max_len:
            return np.array([])

        # Work on a copy so that charges and 2D coordinates are not written
        # onto the caller's molecule.
        cmol = Chem.Mol(mol.ToBinary())
        cmol.ComputeGasteigerCharges()
        AllChem.Compute2DCoords(cmol)
        atom_coords = cmol.GetConformer(0).GetPositions()

        if self.img_spec == "std":
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 1))
            # Compute bond properties
            bond_props = np.array(
                [[2.0, bond.GetBeginAtomIdx(),
                  bond.GetEndAtomIdx()] for bond in mol.GetBonds()])
            # Compute atom properties
            atom_props = np.array([[atom.GetAtomicNum()]
                                   for atom in cmol.GetAtoms()])

            bond_props = bond_props.astype(np.float32)
            atom_props = atom_props.astype(np.float32)

        else:
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 4))
            # Compute bond properties
            bond_props = np.array([[
                bond.GetBondTypeAsDouble(),
                bond.GetBeginAtomIdx(),
                bond.GetEndAtomIdx()
            ] for bond in mol.GetBonds()])
            # Compute atom properties
            atom_props = np.array([[
                atom.GetAtomicNum(),
                atom.GetProp("_GasteigerCharge"),
                atom.GetExplicitValence(),
                atom.GetHybridization().real,
            ] for atom in cmol.GetAtoms()])

            bond_props = bond_props.astype(np.float32)
            atom_props = atom_props.astype(np.float32)

            partial_charges = atom_props[:, 1]
            if np.any(np.isnan(partial_charges)):
                return np.array([])

        # A molecule without bonds yields an empty 1-D array; force the
        # (n_bonds, 3) layout so the column slicing below stays valid.
        bond_props = bond_props.reshape(-1, 3)

        frac = np.linspace(0, 1, int(1 / self.res * 2))
        # Reshape done for proper broadcast
        frac = frac.reshape(-1, 1, 1)

        bond_begin_idxs = bond_props[:, 1].astype(int)
        bond_end_idxs = bond_props[:, 2].astype(int)

        # Reshapes, and axes manipulations to facilitate vector processing.
        begin_coords = atom_coords[bond_begin_idxs]
        begin_coords = np.expand_dims(begin_coords.T, axis=0)
        end_coords = atom_coords[bond_end_idxs]
        end_coords = np.expand_dims(end_coords.T, axis=0)

        # Draw a line between the two atoms.
        # The coordinates of this line, are indicated in line_coords
        line_coords = frac * begin_coords + (1 - frac) * end_coords
        # Turn the line coordinates into image positions
        bond_line_idxs = np.ceil(
            (line_coords[:, 0] + self.embed) / self.res).astype(int)
        bond_line_idys = np.ceil(
            (line_coords[:, 1] + self.embed) / self.res).astype(int)
        # Turn atomic coordinates into image positions
        atom_idxs = np.round(
            (atom_coords[:, 0] + self.embed) / self.res).astype(int)
        atom_idys = np.round(
            (atom_coords[:, 1] + self.embed) / self.res).astype(int)

        try:
            # Set the bond line coordinates to the bond property used.
            img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]
            # Set the atom positions in image to different atomic properties
            # in channels
            img[atom_idxs, atom_idys, :] = atom_props
        except IndexError as err:
            # With fixed res and img_size some molecules (e.g. long chains)
            # may not fit.
            raise IndexError(
                "The molecule does not fit into the image. Consider increasing img_size or res."
            ) from err
        return img
Ejemplo n.º 28
0
def pair_features(
    mol: RDKitMol,
    bond_features_map: dict,
    bond_adj_list: List,
    bt_len: int = 6,
    graph_distance: bool = True,
    max_pair_distance: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray]:
  """Helper method used to compute atom pair feature vectors.

  Many different featurization methods compute atom pair features
  such as WeaveFeaturizer. Note that atom pair features could be
  for pairs of atoms which aren't necessarily bonded to one
  another.

  Each feature row is laid out as follows: the first `bt_len` entries are
  the bond features (zero for non-bonded pairs), entry `bt_len` flags
  whether the two distinct atoms share a ring, and the remaining entries
  hold the distance information (the graph-distance encoding, or the
  Euclidean distance in the last entry).

  Parameters
  ----------
  mol: RDKit Mol
    Molecule to compute features on.
  bond_features_map: dict
    Dictionary that maps pairs of atom ids (say `(2, 3)` for a bond between
    atoms 2 and 3) to the features for the bond between them. Keys are
    looked up as sorted tuples.
  bond_adj_list: list of lists
    `bond_adj_list[i]` is a list of the atom indices that atom `i` shares a
    bond with. This list is symmetrical so if `j in bond_adj_list[i]` then
    `i in bond_adj_list[j]`.
  bt_len: int, optional (default 6)
    The number of different bond types to consider.
  graph_distance: bool, optional (default True)
    If true, use graph distance between atoms. Else use euclidean
    distance. In the euclidean case `mol` must have a conformer; atomic
    positions are retrieved by calling `mol.GetConformer(0)`.
  max_pair_distance: Optional[int], (default None)
    This value can be a positive integer or None. This
    parameter determines the maximum graph distance at which pair
    features are computed. For example, if `max_pair_distance==2`,
    then pair features are computed only for atoms at most graph
    distance 2 apart. If `max_pair_distance` is `None`, all pairs are
    considered (effectively infinite `max_pair_distance`)

  Note
  ----
  This method requires RDKit to be installed.

  Returns
  -------
  features: np.ndarray
    Of shape `(N_edges, bt_len + max_distance + 1)`, where `max_distance`
    is 7 when `graph_distance` is true and 1 otherwise. This is the array
    of pairwise features for all atom pairs, where N_edges is the
    number of pairs within max_pair_distance of one another in this
    molecule.
  pair_edges: np.ndarray
    Of shape `(2, num_pairs)` where `num_pairs` is the total number of
    pairs within `max_pair_distance` of one another.

  Raises
  ------
  ValueError
    If a bonded pair from `bond_adj_list` is missing from `pair_edges`.
  """
  max_distance = 7 if graph_distance else 1
  num_atoms = mol.GetNumAtoms()
  pair_edges = max_pair_distance_pairs(mol, max_pair_distance)
  N_edges = pair_edges.shape[1]
  features = np.zeros((N_edges, bt_len + max_distance + 1))

  # Map every (a1, a2) atom pair to its row index in `features`.
  mapping = {}
  for n in range(N_edges):
    a1, a2 = pair_edges[:, n]
    mapping[(int(a1), int(a2))] = n

  rings = mol.GetRingInfo().AtomRings()
  for a1 in range(num_atoms):
    for a2 in bond_adj_list[a1]:
      # first `bt_len` features are bond features(if applicable)
      pair = (int(a1), int(a2))
      if pair not in mapping:
        raise ValueError(
            "Malformed molecule with bonds not in specified graph distance.")
      features[mapping[pair], :bt_len] = np.asarray(
          bond_features_map[tuple(sorted((a1, a2)))], dtype=float)
    for ring in rings:
      if a1 in ring:
        for a2 in ring:
          n = mapping.get((int(a1), int(a2)))
          if n is None:
            # For ring pairs outside max pairs distance continue
            continue
          # `bt_len`-th feature is if the pair of atoms are in the same ring
          features[n, bt_len] = 0 if a2 == a1 else 1
    # graph distance between two atoms
    if graph_distance:
      # distance is a matrix of 1-hot encoded distances for all atoms
      distance = find_distance(
          a1, num_atoms, bond_adj_list, max_distance=max_distance)
      for a2 in range(num_atoms):
        n = mapping.get((int(a1), int(a2)))
        if n is None:
          # For pairs outside max pairs distance continue
          continue
        features[n, bt_len + 1:] = distance[a2]

  # Euclidean distance between atoms
  if not graph_distance:
    conformer = mol.GetConformer(0)
    coords = np.zeros((num_atoms, 3))
    for atom in range(num_atoms):
      pos = conformer.GetAtomPosition(atom)
      coords[atom, :] = [pos.x, pos.y, pos.z]
    # `features` holds one row per pair, so the distance of each pair goes
    # into the last column of its own row (a 3-index assignment would fail
    # on this 2-D array).
    begin_atoms = np.asarray(pair_edges[0], dtype=int)
    end_atoms = np.asarray(pair_edges[1], dtype=int)
    features[:, -1] = np.linalg.norm(
        coords[begin_atoms] - coords[end_atoms], axis=1)

  return features, pair_edges
Ejemplo n.º 29
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """Featurize a single molecule into an image.

        The molecule is laid out in 2D, every bond is rasterised as a line
        into channel 0, and the atom properties are written at the atom
        positions.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
          RDKit Mol object
        **kwargs
          Only the deprecated `mol` keyword is read; when given it replaces
          `datapoint` and a `DeprecationWarning` is emitted.

        Returns
        -------
        np.ndarray
          A 3D image array. The shape is `(img_size, img_size, 1)` when
          `self.img_spec == "std"` and `(img_size, img_size, 4)` otherwise.
          An empty array is returned if the SMILES string is longer than
          `self.max_len`, or (non-"std" spec only) if any partial charge
          is NaN.

        Raises
        ------
        ImportError
          If RDKit is not installed.
        IndexError
          If the molecule does not fit into the image.
        """
        try:
            from rdkit import Chem
            from rdkit.Chem import AllChem
        except ModuleNotFoundError:
            raise ImportError("This class requires RDKit to be installed.")
        if 'mol' in kwargs:
            import warnings

            datapoint = kwargs.get("mol")
            # Warn instead of raising, so the deprecated keyword keeps
            # working while callers migrate.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        smile = Chem.MolToSmiles(datapoint)
        if len(smile) > self.max_len:
            return np.array([])

        # Work on a copy so that charges and 2D coordinates are not written
        # onto the caller's molecule.
        cmol = Chem.Mol(datapoint.ToBinary())
        cmol.ComputeGasteigerCharges()
        AllChem.Compute2DCoords(cmol)
        atom_coords = cmol.GetConformer(0).GetPositions()

        if self.img_spec == "std":
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 1))
            # Compute bond properties
            bond_props = np.array(
                [[2.0, bond.GetBeginAtomIdx(),
                  bond.GetEndAtomIdx()] for bond in datapoint.GetBonds()])
            # Compute atom properties
            atom_props = np.array([[atom.GetAtomicNum()]
                                   for atom in cmol.GetAtoms()])

            bond_props = bond_props.astype(np.float32)
            atom_props = atom_props.astype(np.float32)

        else:
            # Setup image
            img = np.zeros((self.img_size, self.img_size, 4))
            # Compute bond properties
            bond_props = np.array([[
                bond.GetBondTypeAsDouble(),
                bond.GetBeginAtomIdx(),
                bond.GetEndAtomIdx()
            ] for bond in datapoint.GetBonds()])
            # Compute atom properties
            atom_props = np.array([[
                atom.GetAtomicNum(),
                atom.GetProp("_GasteigerCharge"),
                atom.GetExplicitValence(),
                atom.GetHybridization().real,
            ] for atom in cmol.GetAtoms()])

            bond_props = bond_props.astype(np.float32)
            atom_props = atom_props.astype(np.float32)

            partial_charges = atom_props[:, 1]
            if np.any(np.isnan(partial_charges)):
                return np.array([])

        # A molecule without bonds yields an empty 1-D array; force the
        # (n_bonds, 3) layout so the column slicing below stays valid.
        bond_props = bond_props.reshape(-1, 3)

        frac = np.linspace(0, 1, int(1 / self.res * 2))
        # Reshape done for proper broadcast
        frac = frac.reshape(-1, 1, 1)

        bond_begin_idxs = bond_props[:, 1].astype(int)
        bond_end_idxs = bond_props[:, 2].astype(int)

        # Reshapes, and axes manipulations to facilitate vector processing.
        begin_coords = atom_coords[bond_begin_idxs]
        begin_coords = np.expand_dims(begin_coords.T, axis=0)
        end_coords = atom_coords[bond_end_idxs]
        end_coords = np.expand_dims(end_coords.T, axis=0)

        # Draw a line between the two atoms.
        # The coordinates of this line, are indicated in line_coords
        line_coords = frac * begin_coords + (1 - frac) * end_coords
        # Turn the line coordinates into image positions
        bond_line_idxs = np.ceil(
            (line_coords[:, 0] + self.embed) / self.res).astype(int)
        bond_line_idys = np.ceil(
            (line_coords[:, 1] + self.embed) / self.res).astype(int)
        # Turn atomic coordinates into image positions
        atom_idxs = np.round(
            (atom_coords[:, 0] + self.embed) / self.res).astype(int)
        atom_idys = np.round(
            (atom_coords[:, 1] + self.embed) / self.res).astype(int)

        try:
            # Set the bond line coordinates to the bond property used.
            img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]

            # Set the atom positions in image to different atomic properties in channels
            img[atom_idxs, atom_idys, :] = atom_props

        except IndexError:
            # With fixed res and img_size some molecules (e.g. long chains) may not fit.
            raise IndexError(
                "The molecule does not fit into the image. Consider increasing img_size or res of the SmilesToImage featurizer."
            )
        return img
Ejemplo n.º 30
0
    def _featurize(self, datapoint: RDKitMol, **kwargs) -> GraphData:
        """Calculate molecule graph features from RDKit mol object.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
          RDKit mol object.
        **kwargs
          Only the deprecated `mol` keyword is read; when given it replaces
          `datapoint` and a `DeprecationWarning` is emitted.

        Returns
        -------
        graph: GraphData
          A molecule graph with node features, a directed edge index (each
          bond contributes both directions) and, when `self.use_edges` is
          set, one edge feature row per directed edge.

        Raises
        ------
        AssertionError
          If the molecule has fewer than two atoms.
        ImportError
          If partial charges must be computed and RDKit is not installed.
        """
        if 'mol' in kwargs:
            import warnings

            datapoint = kwargs.get("mol")
            # Warn instead of raising, so the deprecated keyword keeps
            # working while callers migrate.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        # Validate the molecule that is actually featurized, i.e. after the
        # deprecated `mol` keyword has been resolved.
        assert datapoint.GetNumAtoms(
        ) > 1, "More than one atom should be present in the molecule for this featurizer to work."

        if self.use_partial_charge:
            try:
                datapoint.GetAtomWithIdx(0).GetProp('_GasteigerCharge')
            except KeyError:
                # If partial charges were not computed
                try:
                    from rdkit.Chem import AllChem
                    AllChem.ComputeGasteigerCharges(datapoint)
                except ModuleNotFoundError:
                    raise ImportError(
                        "This class requires RDKit to be installed.")

        # construct atom (node) feature
        h_bond_infos = construct_hydrogen_bonding_info(datapoint)
        atom_features = np.asarray(
            [
                _construct_atom_feature(atom, h_bond_infos, self.use_chirality,
                                        self.use_partial_charge)
                for atom in datapoint.GetAtoms()
            ],
            dtype=float,
        )

        # construct edge (bond) index
        src, dest = [], []
        for bond in datapoint.GetBonds():
            # add edge list considering a directed graph
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            src += [start, end]
            dest += [end, start]

        # construct edge (bond) feature
        bond_features = None  # default None
        if self.use_edges:
            features = []
            for bond in datapoint.GetBonds():
                # Each bond appears twice in the edge index (both directions),
                # so its feature vector is repeated to stay aligned.
                features += 2 * [_construct_bond_feature(bond)]
            bond_features = np.asarray(features, dtype=float)

        return GraphData(node_features=atom_features,
                         edge_index=np.asarray([src, dest], dtype=int),
                         edge_features=bond_features)