Esempio n. 1
0
    def test_pdbqt_to_pdb(self):
        """Round-trip a protein through PDBQT and compare it with the PDB form.

        The protein is written as PDBQT, converted back to a PDB block with
        `pdbqt_utils.pdbqt_to_pdb`, and the resulting molecule is compared
        atom by atom against the PDBQT file loaded through `rdkit_utils`.
        """
        _, protein = rdkit_utils.load_molecule(self.protein_file,
                                               calc_charges=False,
                                               add_hydrogens=False)
        with tempfile.TemporaryDirectory() as workdir:
            pdb_path = os.path.join(workdir, "mol.pdb")
            pdbqt_path = os.path.join(workdir, "mol.pdbqt")

            # Both formats are written; only the PDBQT file is read back.
            rdkit_utils.write_molecule(protein, pdb_path, is_protein=True)
            rdkit_utils.write_molecule(protein, pdbqt_path, is_protein=True)

            recovered_block = pdbqt_utils.pdbqt_to_pdb(pdbqt_path)
            from rdkit import Chem
            pdb_mol = Chem.MolFromPDBBlock(recovered_block,
                                           sanitize=False,
                                           removeHs=False)

            _, pdbqt_mol = rdkit_utils.load_molecule(pdbqt_path,
                                                     add_hydrogens=False,
                                                     calc_charges=False)

        # The molecules are held in memory, so they outlive the temp directory.
        assert pdb_mol.GetNumAtoms() == pdbqt_mol.GetNumAtoms()
        atom_pairs = zip(pdb_mol.GetAtoms(), pdbqt_mol.GetAtoms())
        for from_pdb, from_pdbqt in atom_pairs:
            assert from_pdb.GetAtomicNum() == from_pdbqt.GetAtomicNum()
Esempio n. 2
0
  def _featurize(self, mol_pdb_file, protein_pdb_file):
    """Featurize a ligand, a protein and their merged complex.

    Parameters
    ----------
    mol_pdb_file: str
      Filename of the ligand structure (loaded with `is_protein=False`).
    protein_pdb_file: str
      Filename of the protein structure (loaded with `is_protein=True`).

    Returns
    -------
    tuple or None
      A 9-tuple `(coords, neighbor_list, z)` for fragment 1, fragment 2 and
      the merged system, in that order. `None` is returned when a molecule
      cannot be loaded or when `featurize_mol` raises `ValueError`.
    """
    try:
      ligand_xyz, ligand = load_molecule(
          mol_pdb_file, is_protein=False, sanitize=True, add_hydrogens=False)
      protein_xyz, protein = load_molecule(
          protein_pdb_file, is_protein=True, sanitize=True, add_hydrogens=False)
    except MoleculeLoadException:
      # Loading failures are currently reported by returning None.
      # TODO: Is there a better handling procedure?
      logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
      return None

    # The complex is merged from the fragments before hydrogens are stripped.
    complex_mol = merge_molecules([ligand, protein])
    complex_xyz = get_xyz_from_mol(complex_mol)

    ligand_xyz, ligand = self._strip_hydrogens(ligand_xyz, ligand)
    protein_xyz, protein = self._strip_hydrogens(protein_xyz, protein)
    complex_xyz, complex_mol = self._strip_hydrogens(complex_xyz, complex_mol)

    try:
      ligand_xyz, ligand_nbrs, ligand_z = self.featurize_mol(
          ligand_xyz, ligand, self.frag1_num_atoms)
      protein_xyz, protein_nbrs, protein_z = self.featurize_mol(
          protein_xyz, protein, self.frag2_num_atoms)
      complex_xyz, complex_nbrs, complex_z = self.featurize_mol(
          complex_xyz, complex_mol, self.complex_num_atoms)
    except ValueError:
      logging.warning(
          "max_atoms was set too low. Some complexes too large and skipped")
      return None

    return (ligand_xyz, ligand_nbrs, ligand_z,
            protein_xyz, protein_nbrs, protein_z,
            complex_xyz, complex_nbrs, complex_z)
Esempio n. 3
0
 def test_merge_molecules(self):
   """Merging two ligands must give a molecule with the summed atom count."""
   here = os.path.dirname(os.path.realpath(__file__))
   first_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
   _, first = rdkit_utils.load_molecule(
       first_path, calc_charges=False, add_hydrogens=False)
   first_count = first.GetNumAtoms()

   # self.ligand_file is for 3ws9_ligand.sdf
   _, second = rdkit_utils.load_molecule(
       self.ligand_file, calc_charges=False, add_hydrogens=False)
   second_count = second.GetNumAtoms()

   combined = rdkit_utils.merge_molecules([first, second])
   assert combined.GetNumAtoms() == first_count + second_count
Esempio n. 4
0
 def test_load_molecule2(self):
   """Loading the 1jld ligand must yield both coordinates and a molecule."""
   here = os.path.dirname(os.path.realpath(__file__))
   sdf_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
   loaded = rdkit_utils.load_molecule(
       sdf_path, calc_charges=False, add_hydrogens=False)
   coords, rdkit_mol = loaded
   assert coords is not None
   assert rdkit_mol is not None
  def _featurize(self, complex: Tuple[str, str]):
    """
    Build the neighbor list of a ligand/protein complex.

    Parameters
    ----------
    complex: Tuple[str, str]
      Filenames for molecule and protein, in that order.

    Returns
    -------
    tuple
      `(system_coords, system_neighbor_list)` where the coordinates are the
      ligand coordinates merged with the protein coordinates.
    """
    ligand_path, protein_path = complex
    # Only the coordinates are needed; the loaded molecules are discarded.
    ligand_xyz, _ = load_molecule(ligand_path)
    protein_xyz, _ = load_molecule(protein_path)
    merged_xyz = merge_molecules_xyz([ligand_xyz, protein_xyz])

    neighbors = compute_neighbor_list(merged_xyz, self.neighbor_cutoff,
                                      self.max_num_neighbors, None)
    return merged_xyz, neighbors
Esempio n. 6
0
  def test_write_molecule(self):
    """Check that a ligand written to SDF and reloaded keeps its atoms.

    The molecule is written to a temporary SDF file, loaded back, and then
    compared with the original by atom count and per-atom atomic number.
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))
    ligand_file = os.path.join(current_dir, "../../dock/tests/1jld_ligand.sdf")
    _, mol = rdkit_utils.load_molecule(
        ligand_file, calc_charges=False, add_hydrogens=False)

    with tempfile.TemporaryDirectory() as tmp:
      outfile = os.path.join(tmp, "mol.sdf")
      rdkit_utils.write_molecule(mol, outfile)

      _, mol2 = rdkit_utils.load_molecule(
          outfile, calc_charges=False, add_hydrogens=False)

    assert mol.GetNumAtoms() == mol2.GetNumAtoms()
    # Compare the original against the *reloaded* molecule; reading both
    # atoms from `mol` would make this check trivially true.
    for atom1, atom2 in zip(mol.GetAtoms(), mol2.GetAtoms()):
      assert atom1.GetAtomicNum() == atom2.GetAtomicNum()
Esempio n. 7
0
  def test_get_xyz_from_mol(self):
    """Coordinates from `get_xyz_from_mol` must equal those from loading."""
    here = os.path.dirname(os.path.realpath(__file__))
    sdf_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")

    loaded_xyz, rdkit_mol = rdkit_utils.load_molecule(
        sdf_path, calc_charges=False, add_hydrogens=False)
    extracted_xyz = rdkit_utils.get_xyz_from_mol(rdkit_mol)

    # Exact element-wise equality is required, not approximate closeness.
    assert np.all(loaded_xyz == extracted_xyz)
Esempio n. 8
0
  def _featurize(self, mol_pdb_file, protein_pdb_file):
    """
    Build the neighbor list of a ligand/protein complex.

    Parameters
    ----------
    mol_pdb_file: str
      Filename for ligand pdb file.
    protein_pdb_file: str
      Filename for protein pdb file.

    Returns
    -------
    tuple
      `(system_coords, system_neighbor_list)` where the coordinates are the
      ligand coordinates merged with the protein coordinates.
    """
    # Only the coordinates are needed; the loaded molecules are discarded.
    ligand_xyz, _ = load_molecule(mol_pdb_file)
    protein_xyz, _ = load_molecule(protein_pdb_file)
    merged_xyz = merge_molecules_xyz([ligand_xyz, protein_xyz])

    neighbors = compute_neighbor_list(merged_xyz, self.neighbor_cutoff,
                                      self.max_num_neighbors, None)
    return merged_xyz, neighbors
Esempio n. 9
0
 def test_convert_protein_to_pdbqt(self):
     """Test a protein in a PDB can be converted to PDBQT.

     The protein is written as PDB to the target path, converted in place
     with `pdbqt_utils.convert_protein_to_pdbqt`, reloaded, and compared
     with the original molecule by atom count and atomic numbers.
     """
     from rdkit import Chem
     _, mol = rdkit_utils.load_molecule(self.protein_file,
                                        calc_charges=False,
                                        add_hydrogens=False)
     with tempfile.TemporaryDirectory() as tmp:
         outfile = os.path.join(tmp, "mol.pdbqt")
         writer = Chem.PDBWriter(outfile)
         writer.write(mol)
         writer.close()
         pdbqt_utils.convert_protein_to_pdbqt(mol, outfile)
         _, pdbqt_mol = rdkit_utils.load_molecule(outfile,
                                                  add_hydrogens=False,
                                                  calc_charges=False)
     # Compare the source molecule against the reloaded PDBQT molecule;
     # comparing `pdbqt_mol` with itself could never fail.
     assert mol.GetNumAtoms() == pdbqt_mol.GetNumAtoms()
     for atom1, atom2 in zip(mol.GetAtoms(), pdbqt_mol.GetAtoms()):
         assert atom1.GetAtomicNum() == atom2.GetAtomicNum()
Esempio n. 10
0
  def test_get_face_boxes_for_protein(self):
    """Face boxes of the 1jld protein come back as a list of CoordinateBox."""
    here = os.path.dirname(os.path.realpath(__file__))
    pdb_path = os.path.join(here, "1jld_protein.pdb")
    # Only the coordinate array (first element of the result) is needed.
    protein_xyz = rdkit_utils.load_molecule(pdb_path)[0]

    face_boxes = box_utils.get_face_boxes(protein_xyz)
    assert isinstance(face_boxes, list)
    for face_box in face_boxes:
      assert isinstance(face_box, box_utils.CoordinateBox)
Esempio n. 11
0
 def test_merge_molecules_xyz(self):
   """Merging a coordinate array with itself must stack two exact copies."""
   here = os.path.dirname(os.path.realpath(__file__))
   sdf_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
   coords, _ = rdkit_utils.load_molecule(
       sdf_path, calc_charges=False, add_hydrogens=False)
   stacked = rdkit_utils.merge_molecules_xyz([coords, coords])

   n_atoms = len(coords)
   for row in range(n_atoms):
     # Row `row` of the input must appear at `row` and at `row + n_atoms`.
     matches_first_copy = np.all(coords[row] == stacked[row])
     matches_second_copy = np.all(coords[row] == stacked[row + n_atoms])
     assert matches_first_copy
     assert matches_second_copy
Esempio n. 12
0
 def test_load_molecule(self):
   """Load the ligand under every add_hydrogens/calc_charges combination."""
   # adding hydrogens and charges is tested in dc.utils
   from rdkit.Chem.AllChem import Mol
   flag_pairs = [(hyd, chg) for hyd in (True, False) for chg in (True, False)]
   for hyd, chg in flag_pairs:
     # Flags are passed positionally, as the second and third arguments.
     coords, rdkit_mol = rdkit_utils.load_molecule(self.ligand_file, hyd, chg)
     atom_count = rdkit_mol.GetNumAtoms()
     self.assertIsInstance(coords, np.ndarray)
     self.assertIsInstance(rdkit_mol, Mol)
     self.assertEqual(coords.shape, (atom_count, 3))
Esempio n. 13
0
    def setUp(self):
        """Create the fixtures: a flat four-ring, a protein and a ligand."""
        base_dir = os.path.dirname(os.path.realpath(__file__))

        # Simple ring built from SMILES, given 2D coordinates in place.
        from rdkit.Chem import MolFromSmiles
        from rdkit.Chem.rdDepictor import Compute2DCoords
        ring = MolFromSmiles('C1CCC1')
        Compute2DCoords(ring)
        self.cycle4 = ring

        # Two real molecules, loaded with sanitization and nothing added.
        load_opts = dict(add_hydrogens=False, calc_charges=False, sanitize=True)

        protein_path = os.path.join(
            base_dir, '../../feat/tests/data/3ws9_protein_fixer_rdkit.pdb')
        _, self.prot = load_molecule(protein_path, **load_opts)

        ligand_path = os.path.join(base_dir,
                                   '../../feat//tests/data/3ws9_ligand.sdf')
        _, self.lig = load_molecule(ligand_path, **load_opts)
Esempio n. 14
0
  def test_compute_charges(self):
    """Check that `compute_charges` assigns at least one non-zero charge.

    The ligand is loaded with hydrogens but without charges, then
    `rdkit_utils.compute_charges` is applied and the `_GasteigerCharge`
    property of every atom is inspected.
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))
    ligand_file = os.path.join(current_dir, "../../dock/tests/1jld_ligand.sdf")
    _, mol = rdkit_utils.load_molecule(
        ligand_file, calc_charges=False, add_hydrogens=True)
    rdkit_utils.compute_charges(mol)

    # `GetProp` returns the property as a string, so it must be converted
    # before comparing with zero; a raw string never equals 0 and would
    # make this test pass unconditionally.
    has_a_charge = any(
        float(atom.GetProp(str("_GasteigerCharge"))) != 0
        for atom in mol.GetAtoms())
    assert has_a_charge
Esempio n. 15
0
def extract_active_site(
        protein_file: str,
        ligand_file: str,
        cutoff: float = 4.0) -> Tuple[CoordinateBox, np.ndarray]:
    """Build an integer-aligned bounding box around the active site.

    Protein atoms in contact with the ligand are selected with
    `get_contact_atom_indices`, and the box spans the floor of their minimum
    and the ceiling of their maximum coordinate along each axis.

    Parameters
    ----------
    protein_file : str
      Location of protein PDB
    ligand_file : str
      Location of ligand input file
    cutoff : float, optional (default 4.0)
      Contact cutoff passed to `get_contact_atom_indices`.

    Returns
    -------
    Tuple[CoordinateBox, np.ndarray]
      A tuple of `(CoordinateBox, np.ndarray)` where the second entry holds
      the coordinates of the protein atoms in contact with the ligand.
    """
    # Each load returns a (coords, mol) pair; the pairs are passed on whole.
    protein_pair = load_molecule(protein_file, add_hydrogens=False)
    ligand_pair = load_molecule(ligand_file,
                                add_hydrogens=True,
                                calc_charges=True)
    contact_idx, _ = get_contact_atom_indices([protein_pair, ligand_pair],
                                              cutoff=cutoff)
    pocket_coords = protein_pair[0][contact_idx]

    # One (min, max) pair per axis, rounded outward to whole numbers.
    axis_ranges = []
    for axis in range(3):
        along_axis = pocket_coords[:, axis]
        lower = int(np.floor(np.amin(along_axis)))
        upper = int(np.ceil(np.amax(along_axis)))
        axis_ranges.append((lower, upper))

    return CoordinateBox(*axis_ranges), pocket_coords
  def _featurize(self, datapoint, **kwargs):
    """
    Compute neighbor list for complex.

    Parameters
    ----------
    datapoint: Tuple[str, str]
      Filenames for molecule and protein.
    **kwargs
      Only the deprecated `complex` keyword is inspected. When present it
      replaces `datapoint` and a `DeprecationWarning` is issued.

    Returns
    -------
    tuple
      `(system_coords, system_neighbor_list)` where the coordinates are the
      ligand coordinates merged with the protein coordinates.
    """
    if 'complex' in kwargs:
      import warnings
      datapoint = kwargs.get("complex")
      # Warn instead of raising: raising here would discard the remapped
      # `datapoint` and make the backward-compatible path unreachable.
      warnings.warn(
          'Complex is being phased out as a parameter, please pass "datapoint" instead.',
          DeprecationWarning,
          stacklevel=2)

    mol_pdb_file, protein_pdb_file = datapoint
    # Only the coordinates are needed; the loaded molecules are discarded.
    mol_coords, _ = load_molecule(mol_pdb_file)
    protein_coords, _ = load_molecule(protein_pdb_file)
    system_coords = merge_molecules_xyz([mol_coords, protein_coords])

    system_neighbor_list = compute_neighbor_list(
        system_coords, self.neighbor_cutoff, self.max_num_neighbors, None)

    return (system_coords, system_neighbor_list)
Esempio n. 17
0
    def find_all_pockets(self, protein_file: str) -> List[CoordinateBox]:
        """Return every face box of the protein as a candidate pocket.

        Parameters
        ----------
        protein_file : str
          Protein to load in.

        Returns
        -------
        List[CoordinateBox]
          The boxes produced by `get_face_boxes` for the protein coordinates,
          using `self.pad` as the second argument.
        """
        # The loaded molecule object is not needed, only its coordinates.
        protein_xyz, _ = load_molecule(protein_file)
        face_boxes = get_face_boxes(protein_xyz, self.pad)
        return face_boxes
    def featurize(  # type: ignore[override]
            self, protein_file: str,
            pockets: List[CoordinateBox]) -> np.ndarray:
        """
        Count, per pocket, the atoms belonging to each known residue type.

        Parameters
        ----------
        protein_file: str
          Location of PDB file. Will be loaded by MDTraj
        pockets: List[CoordinateBox]
          List of `dc.utils.CoordinateBox` objects.

        Returns
        -------
        np.ndarray
          A numpy array of shape `(len(pockets), n_residues)`

        Raises
        ------
        ImportError
          If mdtraj is not installed.
        """
        try:
            import mdtraj
        except ModuleNotFoundError:
            raise ImportError("This class requires mdtraj to be installed.")

        coords = load_molecule(protein_file,
                               add_hydrogens=False,
                               calc_charges=False)[0]
        atoms_by_pocket = boxes_to_atoms(coords, pockets)
        trajectory = mdtraj.load(protein_file)

        residue_names = BindingPocketFeaturizer.residues
        n_residues = len(residue_names)
        # Column index of every residue name in the feature matrix.
        column_of = {name: col for col, name in enumerate(residue_names)}

        counts = np.zeros((len(pockets), n_residues))
        for row, pocket in enumerate(pockets):
            for atom_index in atoms_by_pocket[pocket]:
                # The atom label is of format RESX-ATOMTYPE where X is a
                # 1 to 4 digit number; its first three characters are
                # taken as the residue name.
                residue = str(trajectory.top.atom(atom_index))[:3]
                if residue in column_of:
                    counts[row, column_of[residue]] += 1
                else:
                    logger.info("Warning: Non-standard residue in PDB file")
        return counts
Esempio n. 19
0
  def test_apply_pdbfixer(self):
    """`apply_pdbfixer` with hydrogenation must not reduce the hydrogen count."""
    here = os.path.dirname(os.path.realpath(__file__))
    sdf_path = os.path.join(here, "../../dock/tests/1jld_ligand.sdf")
    _, mol = rdkit_utils.load_molecule(
        sdf_path, calc_charges=False, add_hydrogens=False)

    # Hydrogens are the atoms with atomic number 1.
    hydrogens_before = sum(
        1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 1)

    assert mol is not None
    mol = rdkit_utils.apply_pdbfixer(mol, hydrogenate=True, is_protein=False)
    assert mol is not None

    hydrogens_after = sum(
        1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 1)
    assert hydrogens_after >= hydrogens_before
Esempio n. 20
0
    def featurize(self, protein_file, pockets):
        """
        Count, per pocket, the atoms belonging to each known residue type.

        Parameters
        ----------
        protein_file: str
          Location of PDB file. Will be loaded by MDTraj
        pockets: list[CoordinateBox]
          List of `dc.utils.CoordinateBox` objects.

        Returns
        -------
        A numpy array of shape `(len(pockets), n_residues)`
        """
        import mdtraj
        protein_coords = load_molecule(protein_file,
                                       add_hydrogens=False,
                                       calc_charges=False)[0]
        mapping = boxes_to_atoms(protein_coords, pockets)
        protein = mdtraj.load(protein_file)
        n_pockets = len(pockets)
        n_residues = len(BindingPocketFeaturizer.residues)
        # Column index of every residue name in the feature matrix.
        res_map = dict(zip(BindingPocketFeaturizer.residues,
                           range(n_residues)))
        all_features = np.zeros((n_pockets, n_residues))
        for pocket_num, pocket in enumerate(pockets):
            for atom in mapping[pocket]:
                atom_name = str(protein.top.atom(atom))
                # atom_name is of format RESX-ATOMTYPE
                # where X is a 1 to 4 digit number
                residue = atom_name[:3]
                if residue not in res_map:
                    logger.info("Warning: Non-standard residue in PDB file")
                    continue
                # Only the residue is counted; the atom-type suffix is not
                # used, so it is no longer parsed out of the label.
                all_features[pocket_num, res_map[residue]] += 1
        return all_features
Esempio n. 21
0
    def find_pockets(self, macromolecule_file: str) -> List[CoordinateBox]:
        """Propose binding pockets for a macromolecule.

        The macromolecule coordinates are turned into face boxes by
        `get_face_boxes` (with `self.pad`), and overlapping boxes are then
        combined by `merge_overlapping_boxes`.

        Parameters
        ----------
        macromolecule_file : str
          Location of the macromolecule file to load

        Returns
        -------
        List[CoordinateBox]
          List of pockets. Each pocket is a `CoordinateBox`
        """
        # Hydrogens and charges are skipped; only coordinates are used.
        loaded = load_molecule(macromolecule_file,
                               add_hydrogens=False,
                               calc_charges=False)
        atom_xyz, _ = loaded
        face_boxes = get_face_boxes(atom_xyz, self.pad)
        return merge_overlapping_boxes(face_boxes)
Esempio n. 22
0
  def test_strip_hydrogens(self):
    """Smoke test: fragment construction and hydrogen stripping must not raise."""
    coords, rdkit_mol = rdkit_utils.load_molecule(self.ligand_file)

    # Building a fragment from the RDKit atoms; the result is not inspected.
    MolecularFragment(rdkit_mol.GetAtoms(), coords)

    # Test on RDKit
    strip_hydrogens(coords, rdkit_mol)
Esempio n. 23
0
    def generate_poses(
            self,
            molecular_complex: Tuple[str, str],
            centroid: Optional[np.ndarray] = None,
            box_dims: Optional[np.ndarray] = None,
            exhaustiveness: int = 10,
            num_modes: int = 9,
            num_pockets: Optional[int] = None,
            out_dir: Optional[str] = None,
            generate_scores: bool = True,
            **kwargs) -> Union[Tuple[DOCKED_POSES, np.ndarray], DOCKED_POSES]:
        """Generates the docked complex and outputs files for docked complex.

    A GNINA configuration file is written to `out_dir`, the GNINA executable
    (`self.gnina_cmd`) is run as a subprocess, and the docked ligands and the
    log are then read back from `out_dir`.

    Parameters
    ----------
    molecular_complex: Tuple[str, str]
      A representation of a molecular complex. This tuple is
      (protein_file, ligand_file). The protein file must end in `.pdb`
      and the ligand file in `.sdf`.
    centroid: np.ndarray, optional (default None)
      The centroid to dock against.
      NOTE(review): not referenced in this method body.
    box_dims: np.ndarray, optional (default None)
      A numpy array of shape `(3,)` holding the size of the box to dock.
      NOTE(review): not referenced in this method body.
    exhaustiveness: int (default 10)
      Tells GNINA how exhaustive it should be with pose
      generation.
    num_modes: int (default 9)
      Tells GNINA how many binding modes it should generate at
      each invocation.
    num_pockets: int, optional (default None)
      NOTE(review): not referenced in this method body.
    out_dir: str, optional
      If specified, write generated poses to this directory. The directory
      is created if it does not exist. If not specified, a temporary
      directory is created with `tempfile.mkdtemp()`.
    generate_scores: bool, optional (default True)
      If `True`, the pose generator will return scores for complexes.
      This is used typically when invoking external docking programs
      that compute scores.
    kwargs:
      Forwarded to `write_gnina_conf`. Any args supported by GNINA as
      documented https://github.com/gnina/gnina#usage

    Returns
    -------
    Tuple[`docked_poses`, `scores`] or `docked_poses`
      Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
      is a list of docked molecular complexes. Each entry in this list
      contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
      `scores` is an array of binding affinities (kcal/mol),
      CNN pose scores, and CNN affinities predicted by GNINA.

    Raises
    ------
    ValueError
      If `molecular_complex` has more than two entries, if the protein file
      does not end in `.pdb`, or if the ligand file does not end in `.sdf`.

    """

        # Fall back to a fresh temporary directory when none was given.
        if out_dir is None:
            out_dir = tempfile.mkdtemp()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # Parse complex
        if len(molecular_complex) > 2:
            raise ValueError(
                "GNINA can only dock protein-ligand complexes and not more general molecular complexes."
            )

        (protein_file, ligand_file) = molecular_complex

        # check filetypes
        if not protein_file.endswith('.pdb'):
            raise ValueError('Protein file must be in .pdb format.')
        if not ligand_file.endswith('.sdf'):
            raise ValueError('Ligand file must be in .sdf format.')

        # `load_molecule` returns a pair; element [1] (the molecule) is the
        # one paired with each docked ligand below.
        protein_mol = load_molecule(protein_file,
                                    calc_charges=True,
                                    add_hydrogens=True)
        # Text before the first "." of the ligand basename names the outputs.
        ligand_name = os.path.basename(ligand_file).split(".")[0]

        # Define locations of log and output files
        log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
        out_file = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
        logger.info("About to call GNINA.")

        # Write GNINA conf file
        conf_file = os.path.join(out_dir, "conf.txt")
        write_gnina_conf(protein_filename=protein_file,
                         ligand_filename=ligand_file,
                         conf_filename=conf_file,
                         num_modes=num_modes,
                         exhaustiveness=exhaustiveness,
                         **kwargs)

        # Run GNINA
        args = [
            self.gnina_cmd, "--config", conf_file, "--log", log_file, "--out",
            out_file
        ]
        # Arguments are passed as a list (no shell). The captured output and
        # the process return code are not inspected afterwards.
        process = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()

        # read output and log
        # The second value returned by `load_docked_ligands` is discarded;
        # scores are taken from the GNINA log instead.
        ligands, _ = load_docked_ligands(out_file)
        docked_complexes = [(protein_mol[1], ligand) for ligand in ligands]
        scores = read_gnina_log(log_file)

        if generate_scores:
            return docked_complexes, scores
        else:
            return docked_complexes
Esempio n. 24
0
    def generate_poses(
            self,
            molecular_complex: Tuple[str, str],
            centroid: Optional[np.ndarray] = None,
            box_dims: Optional[np.ndarray] = None,
            exhaustiveness: int = 10,
            num_modes: int = 9,
            num_pockets: Optional[int] = None,
            out_dir: Optional[str] = None,
            generate_scores: Optional[bool] = False,
            **kwargs) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
        """Generates the docked complex and outputs files for docked complex.

    The protein and ligand are loaded with hydrogens and charges, written to
    `out_dir` as PDBQT files, and docked with the `vina` Python package once
    per docking box (a single user-specified box, the whole protein, or each
    pocket proposed by `self.pocket_finder`).

    Parameters
    ----------
    molecular_complex: Tuple[str, str]
      A representation of a molecular complex. This tuple is
      (protein_file, ligand_file). The protein should be a pdb file
      and the ligand should be an sdf file.
    centroid: np.ndarray, optional
      The centroid to dock against. Is computed if not specified. Only
      used when `box_dims` is also given.
    box_dims: np.ndarray, optional
      A numpy array of shape `(3,)` holding the size of the box to dock. If not
      specified is set to size of molecular complex plus 5 angstroms. Only
      used when `centroid` is also given.
    exhaustiveness: int, optional (default 10)
      Tells Autodock Vina how exhaustive it should be with pose generation. A
      higher value of exhaustiveness implies more computation effort for the
      docking experiment.
    num_modes: int, optional (default 9)
      Tells Autodock Vina how many binding modes it should generate at
      each invocation.
    num_pockets: int, optional (default None)
      If specified, `self.pocket_finder` must be set. Will only
      generate poses for the first `num_pockets` returned by
      `self.pocket_finder`.
    out_dir: str, optional
      If specified, write generated poses to this directory. Otherwise a
      temporary directory is created with `tempfile.mkdtemp()`.
    generate_scores: bool, optional (default False)
      If `True`, the pose generator will return scores for complexes.
      This is used typically when invoking external docking programs
      that compute scores.
    kwargs:
      The kwargs - cpu, min_rmsd, max_evals, energy_range supported by VINA
      are as documented in https://autodock-vina.readthedocs.io/en/latest/vina.html
      Defaults used when absent: cpu=0, min_rmsd=1.0, max_evals=0,
      energy_range=3.0.

    Returns
    -------
    Tuple[`docked_poses`, `scores`] or `docked_poses`
      Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
      is a list of docked molecular complexes. Each entry in this list
      contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
      `scores` is a list of binding free energies predicted by Vina.

    Raises
    ------
    `ValueError` if `num_pockets` is set but `self.pocket_finder is None`,
    or if `molecular_complex` has more than two entries.
    `ImportError` if the `vina` package is not installed.
    """
        # Optional Vina settings, each with a fallback default.
        if "cpu" in kwargs:
            cpu = kwargs["cpu"]
        else:
            cpu = 0
        if "min_rmsd" in kwargs:
            min_rmsd = kwargs["min_rmsd"]
        else:
            min_rmsd = 1.0
        if "max_evals" in kwargs:
            max_evals = kwargs["max_evals"]
        else:
            max_evals = 0
        if "energy_range" in kwargs:
            energy_range = kwargs["energy_range"]
        else:
            energy_range = 3.0

        # Imported lazily so the module can be used without vina installed.
        try:
            from vina import Vina
        except ModuleNotFoundError:
            raise ImportError("This function requires vina to be installed")

        if out_dir is None:
            out_dir = tempfile.mkdtemp()

        if num_pockets is not None and self.pocket_finder is None:
            raise ValueError(
                "If num_pockets is specified, pocket_finder must have been provided at construction time."
            )

        # Parse complex
        if len(molecular_complex) > 2:
            raise ValueError(
                "Autodock Vina can only dock protein-ligand complexes and not more general molecular complexes."
            )

        (protein_file, ligand_file) = molecular_complex

        # Prepare protein
        # Text before the first "." of the basename names the output files.
        protein_name = os.path.basename(protein_file).split(".")[0]
        protein_hyd = os.path.join(out_dir, "%s_hyd.pdb" % protein_name)
        protein_pdbqt = os.path.join(out_dir, "%s.pdbqt" % protein_name)
        # `load_molecule` returns a pair: [0] is used as coordinates and
        # [1] as the molecule below.
        protein_mol = load_molecule(protein_file,
                                    calc_charges=True,
                                    add_hydrogens=True)
        write_molecule(protein_mol[1], protein_hyd, is_protein=True)
        write_molecule(protein_mol[1], protein_pdbqt, is_protein=True)

        # Get protein centroid and range
        # A user-supplied box is honoured only when BOTH centroid and
        # box_dims are given; otherwise both are ignored.
        if centroid is not None and box_dims is not None:
            centroids = [centroid]
            dimensions = [box_dims]
        else:
            if self.pocket_finder is None:
                logger.info(
                    "Pockets not specified. Will use whole protein to dock")
                centroids = [compute_centroid(protein_mol[0])]
                dimensions = [compute_protein_range(protein_mol[0]) + 5.0]
            else:
                logger.info("About to find putative binding pockets")
                pockets = self.pocket_finder.find_pockets(protein_file)
                logger.info("%d pockets found in total" % len(pockets))
                logger.info("Computing centroid and size of proposed pockets.")
                centroids, dimensions = [], []
                for pocket in pockets:
                    (x_min, x_max), (y_min, y_max), (
                        z_min,
                        z_max) = pocket.x_range, pocket.y_range, pocket.z_range
                    # TODO(rbharath): Does vina divide box dimensions by 2?
                    # Each box dimension is half of the pocket extent.
                    x_box = (x_max - x_min) / 2.
                    y_box = (y_max - y_min) / 2.
                    z_box = (z_max - z_min) / 2.
                    centroids.append(pocket.center())
                    dimensions.append(np.array((x_box, y_box, z_box)))

        # Truncation applies to whichever list of boxes was built above.
        if num_pockets is not None:
            logger.info(
                "num_pockets = %d so selecting this many pockets for docking."
                % num_pockets)
            centroids = centroids[:num_pockets]
            dimensions = dimensions[:num_pockets]

        # Prepare ligand
        ligand_name = os.path.basename(ligand_file).split(".")[0]
        ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

        ligand_mol = load_molecule(ligand_file,
                                   calc_charges=True,
                                   add_hydrogens=True)
        write_molecule(ligand_mol[1], ligand_pdbqt)

        docked_complexes = []
        all_scores = []
        # One Vina instance is reused for every docking box.
        vpg = Vina(sf_name='vina',
                   cpu=cpu,
                   seed=0,
                   no_refine=False,
                   verbosity=1)
        # NOTE: the loop variable `box_dims` rebinds the parameter of the
        # same name.
        for i, (protein_centroid,
                box_dims) in enumerate(zip(centroids, dimensions)):
            logger.info("Docking in pocket %d/%d" % (i + 1, len(centroids)))
            logger.info("Docking with center: %s" % str(protein_centroid))
            logger.info("Box dimensions: %s" % str(box_dims))
            # Write Vina conf file
            # The file is written for reference only; docking below is
            # driven through the Vina object and `conf_file` is not passed
            # to it.
            conf_file = os.path.join(out_dir, "conf.txt")
            write_vina_conf(protein_pdbqt,
                            ligand_pdbqt,
                            protein_centroid,
                            box_dims,
                            conf_file,
                            num_modes=num_modes,
                            exhaustiveness=exhaustiveness)

            # Define locations of output files
            # The same path is used for every box, so each iteration
            # overwrites the previous poses file (see `overwrite=True`).
            out_pdbqt = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
            logger.info("About to call Vina")

            vpg.set_receptor(protein_pdbqt)
            vpg.set_ligand_from_file(ligand_pdbqt)

            vpg.compute_vina_maps(center=protein_centroid, box_size=box_dims)
            vpg.dock(exhaustiveness=exhaustiveness,
                     n_poses=num_modes,
                     min_rmsd=min_rmsd,
                     max_evals=max_evals)
            vpg.write_poses(out_pdbqt,
                            n_poses=num_modes,
                            energy_range=energy_range,
                            overwrite=True)

            # Poses are read back before the next iteration overwrites them.
            ligands, scores = load_docked_ligands(out_pdbqt)
            docked_complexes += [(protein_mol[1], ligand)
                                 for ligand in ligands]
            all_scores += scores

        if generate_scores:
            return docked_complexes, all_scores
        else:
            return docked_complexes
Esempio n. 25
0
    def _featurize(self, complex):
        """Computes grid featurization of protein/ligand complex.

        Loads the ligand and the protein from file, computes the centroid of
        the ligand and subtracts it from the coordinates of both molecules.
        `self.nb_rotations` additional copies of the translated system are
        then produced with `rotate_molecules`, and every feature listed in
        `self.feature_types` is computed for each copy of the system.

        Parameters
        ----------
        complex: Tuple[str, str]
          Filenames for molecule and protein, in that order.

        Returns
        -------
        np.ndarray or None
          The features of all systems (original plus rotated copies) stacked
          into one array and squeezed. `None` is returned when either
          molecule raises `MoleculeLoadException` while loading.
        """
        try:
            mol_pdb_file, protein_pdb_file = complex

            start = time.time()
            protein_xyz, protein_rdk = load_molecule(protein_pdb_file,
                                                     calc_charges=True,
                                                     sanitize=self.sanitize)
            # The elapsed time is handed to the logger as its (only) lazy
            # format argument; any additional positional argument would make
            # the logging module fail when it renders the message.
            logger.info("TIMING: Loading protein coordinates took %0.3f s",
                        time.time() - start)

            start = time.time()
            ligand_xyz, ligand_rdk = load_molecule(mol_pdb_file,
                                                   calc_charges=True,
                                                   sanitize=self.sanitize)
            logger.info("TIMING: Loading ligand coordinates took %0.3f s",
                        time.time() - start)
        except MoleculeLoadException:
            logger.warning(
                "Some molecules cannot be loaded by Rdkit. Skipping")
            return None

        # Express both molecules relative to the ligand centroid.
        start = time.time()
        centroid = compute_centroid(ligand_xyz)
        ligand_xyz = subtract_centroid(ligand_xyz, centroid)
        protein_xyz = subtract_centroid(protein_xyz, centroid)
        logger.info("TIMING: Centroid processing took %0.3f s",
                    time.time() - start)

        # Computed once from the unrotated coordinates and reused for every
        # rotated system below.
        # NOTE(review): presumably valid because rotate_molecules applies the
        # same rigid rotation to both molecules -- confirm.
        pairwise_distances = compute_pairwise_distances(
            protein_xyz, ligand_xyz)

        # Key (0, 0) is the unrotated system; (i + 1, 0) is the i-th rotation.
        transformed_systems = {}
        transformed_systems[(0, 0)] = [protein_xyz, ligand_xyz]

        for i in range(self.nb_rotations):
            rotated_system = rotate_molecules([protein_xyz, ligand_xyz])
            transformed_systems[(i + 1, 0)] = rotated_system

        features_dict = {}
        for system_id, (system_protein_xyz,
                        system_ligand_xyz) in transformed_systems.items():
            feature_arrays = []
            for _is_flat, function_name in self.feature_types:
                feature_arrays += self._compute_feature(
                    function_name,
                    system_protein_xyz,
                    protein_rdk,
                    system_ligand_xyz,
                    ligand_rdk,
                    pairwise_distances,
                )

                # The entry is rebuilt after every feature type; the value
                # left after the last feature type is the one returned.
                if self.flatten:
                    features_dict[system_id] = np.concatenate([
                        feature_array.flatten()
                        for feature_array in feature_arrays
                    ])
                else:
                    features_dict[system_id] = np.concatenate(feature_arrays,
                                                              axis=-1)

        # TODO(rbharath): Is this squeeze OK?
        features = np.squeeze(np.array(list(features_dict.values())))
        return features
Esempio n. 26
0
    def generate_poses(
        self,
        molecular_complex: Tuple[str, str],
        centroid: Optional[np.ndarray] = None,
        box_dims: Optional[np.ndarray] = None,
        exhaustiveness: int = 10,
        num_modes: int = 9,
        num_pockets: Optional[int] = None,
        out_dir: Optional[str] = None,
        generate_scores: bool = False
    ) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
        """Generates the docked complex and outputs files for docked complex.

        TODO: How can this work on Windows? We need to install a .msi file and
        invoke it correctly from Python for this to work.

        Parameters
        ----------
        molecular_complex: Tuple[str, str]
          A representation of a molecular complex. This tuple is
          (protein_file, ligand_file).
        centroid: np.ndarray, optional
          The centroid to dock against. Is computed if not specified. Only
          used when `box_dims` is given as well.
        box_dims: np.ndarray, optional
          A numpy array of shape `(3,)` holding the size of the box to dock.
          If not specified is set to size of molecular complex plus 5
          angstroms. Only used when `centroid` is given as well.
        exhaustiveness: int, optional (default 10)
          Tells Autodock Vina how exhaustive it should be with pose
          generation.
        num_modes: int, optional (default 9)
          Tells Autodock Vina how many binding modes it should generate at
          each invocation.
        num_pockets: int, optional (default None)
          If specified, `self.pocket_finder` must be set. Will only
          generate poses for the first `num_pockets` returned by
          `self.pocket_finder`.
        out_dir: str, optional
          If specified, write generated poses to this directory. Otherwise a
          fresh temporary directory is created.
        generate_scores: bool, optional (default False)
          If `True`, the pose generator will return scores for complexes.
          This is used typically when invoking external docking programs
          that compute scores.

        Returns
        -------
        Tuple[`docked_poses`, `scores`] or `docked_poses`
          Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
          is a list of docked molecular complexes. Each entry in this list
          contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
          `scores` is a list of binding free energies predicted by Vina.

        Raises
        ------
        `ValueError` if `num_pockets` is set but `self.pocket_finder is None`,
        or if `molecular_complex` holds more than two entries.
        """
        if out_dir is None:
            out_dir = tempfile.mkdtemp()

        if num_pockets is not None and self.pocket_finder is None:
            raise ValueError(
                "If num_pockets is specified, pocket_finder must have been provided at construction time."
            )

        # Parse complex
        if len(molecular_complex) > 2:
            raise ValueError(
                "Autodock Vina can only dock protein-ligand complexes and not more general molecular complexes."
            )

        (protein_file, ligand_file) = molecular_complex

        # Prepare protein
        protein_name = os.path.basename(protein_file).split(".")[0]
        protein_hyd = os.path.join(out_dir, "%s_hyd.pdb" % protein_name)
        protein_pdbqt = os.path.join(out_dir, "%s.pdbqt" % protein_name)
        # load_molecule returns a (coordinates, rdkit_mol) pair.
        protein_mol = load_molecule(protein_file,
                                    calc_charges=True,
                                    add_hydrogens=True)
        write_molecule(protein_mol[1], protein_hyd, is_protein=True)
        write_molecule(protein_mol[1], protein_pdbqt, is_protein=True)

        # Get protein centroid and range
        if centroid is not None and box_dims is not None:
            centroids = [centroid]
            dimensions = [box_dims]
        else:
            if self.pocket_finder is None:
                logger.info(
                    "Pockets not specified. Will use whole protein to dock")
                protein_centroid = compute_centroid(protein_mol[0])
                protein_range = compute_protein_range(protein_mol[0])
                box_dims = protein_range + 5.0
                centroids, dimensions = [protein_centroid], [box_dims]
            else:
                logger.info("About to find putative binding pockets")
                pockets = self.pocket_finder.find_pockets(protein_file)
                logger.info("%d pockets found in total", len(pockets))
                logger.info("Computing centroid and size of proposed pockets.")
                centroids, dimensions = [], []
                for pocket in pockets:
                    protein_centroid = pocket.center()
                    (x_min, x_max), (y_min, y_max), (
                        z_min,
                        z_max) = pocket.x_range, pocket.y_range, pocket.z_range
                    # TODO(rbharath): Does vina divide box dimensions by 2?
                    x_box = (x_max - x_min) / 2.
                    y_box = (y_max - y_min) / 2.
                    z_box = (z_max - z_min) / 2.
                    box_dims = (x_box, y_box, z_box)
                    centroids.append(protein_centroid)
                    dimensions.append(box_dims)

        if num_pockets is not None:
            logger.info(
                "num_pockets = %d so selecting this many pockets for docking.",
                num_pockets)
            centroids = centroids[:num_pockets]
            dimensions = dimensions[:num_pockets]

        # Prepare ligand
        ligand_name = os.path.basename(ligand_file).split(".")[0]
        ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

        ligand_mol = load_molecule(ligand_file,
                                   calc_charges=True,
                                   add_hydrogens=True)
        write_molecule(ligand_mol[1], ligand_pdbqt)

        docked_complexes = []
        all_scores = []
        for i, (protein_centroid,
                box_dims) in enumerate(zip(centroids, dimensions)):
            logger.info("Docking in pocket %d/%d", i + 1, len(centroids))
            logger.info("Docking with center: %s", str(protein_centroid))
            logger.info("Box dimensions: %s", str(box_dims))
            # Write Vina conf file
            conf_file = os.path.join(out_dir, "conf.txt")
            write_vina_conf(protein_pdbqt,
                            ligand_pdbqt,
                            protein_centroid,
                            box_dims,
                            conf_file,
                            num_modes=num_modes,
                            exhaustiveness=exhaustiveness)

            # Define locations of log and output files. The same paths are
            # reused for every pocket, so the poses are read back right after
            # each Vina run, before the next run overwrites them.
            log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
            out_pdbqt = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
            logger.info("About to call Vina")
            # The command is passed as an argument list and run without a
            # shell. An argument list combined with `shell=True` is not
            # interpreted as one command on POSIX, and a shell string built
            # by interpolation breaks on paths that contain whitespace or
            # shell metacharacters.
            args = [
                self.vina_cmd, "--config", conf_file, "--log", log_file,
                "--out", out_pdbqt
            ]
            return_code = call(args)
            if return_code != 0:
                logger.warning("Vina exited with return code %d", return_code)
            ligands, scores = load_docked_ligands(out_pdbqt)
            docked_complexes += [(protein_mol[1], ligand)
                                 for ligand in ligands]
            all_scores += scores

        if generate_scores:
            return docked_complexes, all_scores
        else:
            return docked_complexes
Esempio n. 27
0
 def test_create_molecular_fragment(self):
   """Check that a MolecularFragment keeps the atom count and coordinates."""
   coords, rdkit_mol = rdkit_utils.load_molecule(self.ligand_file)
   fragment = MolecularFragment(rdkit_mol.GetAtoms(), coords)

   # The fragment must expose as many atoms as the molecule it was built from.
   expected_num_atoms = len(rdkit_mol.GetAtoms())
   actual_num_atoms = len(fragment.GetAtoms())
   assert expected_num_atoms == actual_num_atoms

   # The coordinates must be returned element-for-element unchanged.
   coords_match = fragment.GetCoords() == coords
   assert coords_match.all()
Esempio n. 28
0
 def test_merge_molecular_fragments(self):
   """Check that merging two copies of a fragment doubles the atom count."""
   coords, rdkit_mol = rdkit_utils.load_molecule(self.ligand_file)

   # Build two independent fragments from the same molecule.
   first = MolecularFragment(rdkit_mol.GetAtoms(), coords)
   second = MolecularFragment(rdkit_mol.GetAtoms(), coords)
   merged = merge_molecular_fragments([first, second])

   num_source_atoms = len(rdkit_mol.GetAtoms())
   num_merged_atoms = len(merged.GetAtoms())
   assert num_source_atoms * 2 == num_merged_atoms