Beispiel #1
0
def get_molecules_from_pdb(protein_pdb, ligand_pdb):
    """
    Load a protein, a ligand and their combined complex as RDKit molecules.

    The complex is produced by merging both structures with mdtraj, writing
    the result to a temporary .pdb file and parsing that file with RDKit.
    The temporary file is always removed, even when saving or parsing fails.

    Input:
        protein_pdb: str. Path to protein .pdb file
        ligand_pdb: str. Path to ligand .pdb file
    Output:
        protein: rdkit.Chem.rdchem.Mol
        ligand: rdkit.Chem.rdchem.Mol
        compl: rdkit.Chem.rdchem.Mol
    """

    # TODO Currently combining protein and ligand with
    # mdtraj. Verify this method.
    complex_traj = combine_mdtraj(md.load(protein_pdb), md.load(ligand_pdb))

    protein = rdmolfiles.MolFromPDBFile(protein_pdb)
    ligand = rdmolfiles.MolFromPDBFile(ligand_pdb)

    fd, comp_pdb = tempfile.mkstemp(suffix='.pdb')
    # Only the path is needed: the file is re-opened by name below, so the
    # descriptor is released right away instead of being held (and possibly
    # leaked) while the complex is written and parsed.
    os.close(fd)
    try:
        complex_traj.save(comp_pdb)
        compl = rdmolfiles.MolFromPDBFile(comp_pdb)
    finally:
        # Clean up on both the success and the failure path.
        os.remove(comp_pdb)

    return (protein, ligand, compl)
Beispiel #2
0
def pdb2graph(pdbid, data_dir='./data/pdbbind/v2018'):
    """
    Build graphs for the protein, the ligand and their complex.

    The files ``<data_dir>/<pdbid>/<pdbid>_protein.pdb`` and
    ``<data_dir>/<pdbid>/<pdbid>_ligand.pdb`` are read; the complex is
    obtained by combining both with mdtraj and round-tripping the result
    through a temporary .pdb file that is deleted before returning.

    Input:
        pdbid: str. protein code from PDBBind
        data_dir: str. Directory containing one sub-directory per pdbid
    Returns:
        tuple with the results of build_graph_from_molecule for the
        protein, the ligand and the complex (in that order)
    Raises:
        IOError: if the protein or the ligand .pdb file does not exist
    """

    protein_pdb_file = os.path.join(data_dir, pdbid,
                                    "{}_protein.pdb".format(pdbid))
    ligand_pdb_file = os.path.join(data_dir, pdbid,
                                   "{}_ligand.pdb".format(pdbid))

    if not os.path.exists(protein_pdb_file) or \
        not os.path.exists(ligand_pdb_file):
        raise IOError(".pdb file not found in {}".format(
            os.path.join(data_dir, pdbid)))

    # combining protein pdb file and ligand pdb file to one pdb file
    # (each input is loaded exactly once)
    complex_traj = combine_mdtraj(md.load(protein_pdb_file),
                                  md.load(ligand_pdb_file))

    protein = rdmolfiles.MolFromPDBFile(protein_pdb_file)
    ligand = rdmolfiles.MolFromPDBFile(ligand_pdb_file)

    # The context manager removes the scratch directory (and the complex
    # file inside it) even if saving or parsing raises.
    with tempfile.TemporaryDirectory() as tempdir:
        complex_pdb_file = os.path.join(tempdir, 'complex.pdb')
        complex_traj.save(complex_pdb_file)
        compl = rdmolfiles.MolFromPDBFile(complex_pdb_file)

    return (build_graph_from_molecule(protein),
            build_graph_from_molecule(ligand),
            build_graph_from_molecule(compl))
Beispiel #3
0
def pdb2graph(pdbid, data_dir='./data/pdbbind/v2018'):
    """
    Build graphs for the protein, the ligand and their complex.

    pdbid: str. protein code from PDBBind
    data_dir: str. Directory that holds one ``<pdbid>`` sub-directory with
        ``<pdbid>_protein.pdb`` and ``<pdbid>_ligand.pdb`` inside.

    Returns a tuple with the results of build_graph_from_molecule for the
    protein, the ligand and the complex (in that order).

    Raises IOError if either of the two .pdb files is missing.
    """

    protein_pdb_file = os.path.join(
        data_dir, pdbid, "{}_protein.pdb".format(pdbid))
    ligand_pdb_file = os.path.join(
        data_dir, pdbid, "{}_ligand.pdb".format(pdbid))

    if not os.path.exists(protein_pdb_file) or \
        not os.path.exists(ligand_pdb_file):
        raise IOError(".pdb file not found in {}".format(
            os.path.join(data_dir, pdbid)))

    # combining protein pdb file and ligand pdb file to one pdb file
    protein_traj = md.load(protein_pdb_file)
    ligand_traj = md.load(ligand_pdb_file)
    complex_traj = combine_mdtraj(protein_traj, ligand_traj)

    protein = rdmolfiles.MolFromPDBFile(protein_pdb_file)
    ligand = rdmolfiles.MolFromPDBFile(ligand_pdb_file)

    # The scratch directory only exists to round-trip the complex through a
    # .pdb file; the context manager deletes it on success and on failure.
    with tempfile.TemporaryDirectory() as tempdir:
        complex_pdb_file = os.path.join(tempdir, 'complex.pdb')
        complex_traj.save(complex_pdb_file)
        compl = rdmolfiles.MolFromPDBFile(complex_pdb_file)

    return (build_graph_from_molecule(protein),
            build_graph_from_molecule(ligand),
            build_graph_from_molecule(compl))
Beispiel #4
0
		def read_lig_pdb(lig_name):
			"""Parse ``../ligands/<lig_name>.pdb`` with RDKit, skipping sanitization."""
			pdb_path = "".join(("../ligands/", lig_name, ".pdb"))
			return rdmolfiles.MolFromPDBFile(pdb_path, sanitize=False)
Beispiel #5
0
def print_fragments_from_molecules(filename):
    """Print one SMILES string per fragment generated from a PDB file.

    The file is parsed with RDKit and handed to ``generate_fragments``; the
    second element of every item it yields is printed. This is best-effort:
    any ordinary error while parsing or fragmenting is reported as
    ``Failed <filename>`` on stdout instead of being raised.

    Args:
        filename: path of the .pdb file to fragment.
    """
    try:
        mol = rdmolfiles.MolFromPDBFile(filename)
        for _, smi in generate_fragments(mol):
            print(smi)
    except Exception:
        # Deliberately broad (best-effort batch helper), but no longer a bare
        # ``except`` so KeyboardInterrupt / SystemExit still propagate.
        print("Failed",  filename)
Beispiel #6
0
def retrieveMoleculePDB(ligand_path):
	"""
	Parse the PDB file at the given path with RDKit, sanitization enabled.

	-- args
	ligand_path (str): location of the molecule's pdb file

	-- returns
	the result of rdmolfiles.MolFromPDBFile for that file (RDKit molecule object)
	"""
	return rdmolfiles.MolFromPDBFile(ligand_path, sanitize=True)
Beispiel #7
0
    def load_dataset(self):
        """Load the molecules, cache them in ``dataset.pkl`` and split them.

        When ``self.dataset`` is None, every sub-directory of ``self.path``
        is treated as one target: its ``receptor.pdb`` is parsed and the
        molecules listed in its ``actives_final.ism`` / ``decoys_final.ism``
        are paired with that target (targets that fail to load are reported
        and skipped). Otherwise the two ``.ism`` files are read directly
        from ``self.path``. Actives are labelled True, decoys False.

        All paths are built by plain string concatenation, so ``self.path``
        has to end with a path separator.

        The unshuffled arrays are pickled once to
        ``self.path + "dataset.pkl"``. They are then shuffled with one random
        permutation and split into the first 10% (test), the next 10%
        (valid) and the remaining 80% (train). The splits are stored as
        dicts keyed by "train"/"valid"/"test" in ``self.x``, ``self.c`` and
        ``self.y`` (plus ``self.tx`` and ``self.tc`` when targets were
        loaded).

        An ``.ism`` file from which no molecule could be loaded is skipped.

        Raises:
            ValueError: when ``self.dataset`` is set and no molecule could be
                loaded at all (nothing to take the maximum size of).
        """

        def read_molecules(prefix, name):
            """Return ``(mols, coords)`` parsed from ``<prefix><name>_final.ism``.

            Both results are empty tuples when nothing could be loaded.
            """
            with open(prefix + name + '_final.ism', 'r') as f:
                # The SMILES string is the first space-separated token.
                smiless = [line.strip().split(" ")[0] for line in f]
            out = Parallel(n_jobs=4)(
                delayed(Dataset._load_molecule)(smile) for smile in smiless)
            # Drop falsy results (presumably molecules that failed to load;
            # verify against Dataset._load_molecule).
            out = [item for item in out if item]
            if not out:
                return (), ()
            mols, coords = zip(*out)
            return mols, coords

        # Load files
        x, c, y = [], [], []
        labelled_files = (("actives", True), ("decoys", False))

        if self.dataset is None:
            tx, tc = [], []
            for target_n in next(os.walk(self.path))[1]:
                try:
                    target = rdmolfiles.MolFromPDBFile(self.path + target_n +
                                                       '/receptor.pdb')
                    targetc = target.GetConformer().GetPositions()
                except AttributeError:
                    print("Failed to load target: " + target_n)
                    continue
                for name, value in labelled_files:
                    x_, c_ = read_molecules(self.path + target_n + '/', name)
                    x.extend(x_)
                    c.extend(c_)
                    # One target entry per molecule keeps the lists aligned.
                    tx.extend([target] * len(x_))
                    tc.extend([targetc] * len(x_))
                    y.extend([value] * len(x_))

            # The maxima only depend on the final lists, so they are computed
            # once here instead of after every target.
            if tx:
                self.target_size = max(t.GetNumAtoms() for t in tx)
                self.molecule_size = max(m.GetNumAtoms() for m in x)

        else:
            for name, value in labelled_files:
                x_, c_ = read_molecules(self.path, name)
                x.extend(x_)
                c.extend(c_)
                y.extend([value] * len(x_))

            self.molecule_size = max(m.GetNumAtoms() for m in x)

        self.mols, self.coords, self.target = np.array(x), np.array(
            c), np.array(y)
        pkl_dict = {
            'mols': self.mols,
            'coords': self.coords,
            'target': self.target
        }
        if self.dataset is None:
            self.targets, self.t_coords = np.array(tx), np.array(tc)
            pkl_dict['targets'] = self.targets
            pkl_dict['t_coords'] = self.t_coords

        # Cache the unshuffled data; the handle is closed deterministically.
        with open(self.path + "dataset.pkl", "wb") as f:
            pkl.dump(pkl_dict, f)

        # Shuffle data
        idx = np.random.permutation(len(self.mols))
        self.mols, self.coords, self.target = self.mols[idx], self.coords[
            idx], self.target[idx]

        # Split data
        spl1 = int(len(self.mols) * 0.2)
        spl2 = int(len(self.mols) * 0.1)

        def split(values):
            """Slice an already shuffled array into train/valid/test parts."""
            return {
                "train": values[spl1:],
                "valid": values[spl2:spl1],
                "test": values[:spl2]
            }

        self.x = split(self.mols)
        self.c = split(self.coords)
        self.y = split(self.target)

        if self.dataset is None:
            # Same permutation as the molecules so the pairs stay aligned.
            self.targets, self.t_coords = self.targets[idx], self.t_coords[idx]
            self.tx = split(self.targets)
            self.tc = split(self.t_coords)
Beispiel #8
0
    def mutate(
        self, pdbid: str,
        replace_with: Dict[int,
                           Optional[str]]) -> Union[List[str], rdchem.Mol]:
        """Modify amino acid residues at the defined positions.

        If the location indexes exceed the amount in data, then they
        will be ignored and a warning will be announced.

        The structure is loaded from ``<self.rootdir>/<pdbid>/<pdbid>.pdb``
        (fetched through PyMOL's ``cmd.fetch`` when that file is missing).
        The result is written to a file inside the same directory whose name
        encodes ``self.fmt`` and the requested replacements; an existing
        file of that name is reused instead of recomputing the mutation.
        The file is removed again before returning when ``self.cache`` is
        falsy.

        Params:
        -------
        pdbid: str
            PDB ID associated with the structure.

        replace_with: dict
            The index location(s) within the full protein to replace
            certain residue(s) with. If a residue associated with a
            index location is None, then the modified residue is chosen
            randomly. If a certain index exceeds the number of available
            residues in the protein, then those entries are simply
            ignored and the user is notified.
            NOTE: this dict is modified in place (out-of-range entries are
            popped, values are replaced by 3-letter residue names).

        Returns:
        --------
        protein: list of str or rdkit.Chem.rdchem.Mol
            Modified protein with residues. If fmt="primary", then list
            of string (peptide names) is returned. If fmt="tertiary",
            then 3D molecule structure is returned.

        Raises:
        -------
        DownloadError
            If ``cmd.fetch`` reports failure (returns -1).
        ValueError
            If a replacement is neither None nor accepted by ``is_aa``.
        NotImplementedError
            If ``self.fmt`` is neither "primary" nor "tertiary".
        """
        # Load PDB structure (download, if necessary)
        pdb_dir = maybe_create_dir(os.path.join(self.rootdir, pdbid))
        pdb_file = os.path.join(pdb_dir, f"{pdbid}.pdb")
        if not os.path.exists(pdb_file):
            is_successful = cmd.fetch(pdbid,
                                      name=pdbid,
                                      state=1,
                                      type="pdb",
                                      path=pdb_dir)
            if is_successful == -1:
                raise DownloadError(f"Unable to download '{pdbid}'.")
        else:
            cmd.load(pdb_file, object=pdbid, state=1, format="pdb")

        # Get all residue names, see: https://pymolwiki.org/index.php/List_Selection
        # One entry is collected per atom matched by the "name ca" selection.
        resnames_dict = {"names": []}
        cmd.iterate("(name ca)", "names.append(resn)", space=resnames_dict)
        residue_names = resnames_dict["names"]
        num_residues = len(residue_names)

        # Cleanup idxs: remove indices that exceed number of available residues
        # NOTE(review): only the upper bound is checked; an idx < 1 is not
        # filtered here -- confirm whether callers can pass such values.
        nonvalid_idxs = [
            idx for idx in replace_with.keys() if idx > num_residues
        ]
        for idx in nonvalid_idxs:
            print(
                f"OutOfRange: Removing idx {idx} (only {num_residues} residues)."
            )
            replace_with.pop(idx)

        # Randomly choose an amino acid (AA) to replace residue, if None is provided.
        # Additionally, format string such that it is a valid 3 letter amino acid.
        # Only existing keys are reassigned, so the dict size does not change
        # while it is being iterated.
        for idx, residue in replace_with.items():
            if residue is None:
                replace_with[idx] = np.random.choice(aa3)
            elif is_aa(residue):
                residue = residue.upper()
                if len(residue) == 1:
                    replace_with[idx] = one_to_three.get(residue)
                elif len(residue) == 3:
                    replace_with[idx] = residue
                # NOTE(review): a name accepted by is_aa() whose length is
                # neither 1 nor 3 is left in the dict unchanged.
            else:
                raise ValueError(
                    f"Invalid residue '{residue}'. Choose one from "
                    f"the following {aa1+aa3}.")

        # Determine save filepath name
        # e.g. "<fmt>_<idx><one-letter code>:<idx><one-letter code>.<ext>"
        modified_res_str = ":".join(
            [f"{k}{three_to_one.get(v)}" for k, v in replace_with.items()])
        filename = f"{self.fmt}_{modified_res_str}"
        filename += ".pdb" if self.fmt == "tertiary" else ".json"
        save_filepath = os.path.join(self.rootdir, pdbid, filename)

        # Replace primary structure, i.e. residue names (str)
        if self.fmt == "primary":
            # Load data from cache, if it exists
            protein = None
            if os.path.exists(save_filepath):
                with open(save_filepath) as json_file:
                    protein = json.load(json_file)
            if protein is None:
                for idx, residue in replace_with.items():
                    residue_names[
                        idx - 1] = residue  # since PDB starts with idx of 1
                protein = [three_to_one.get(name) for name in residue_names]

                # Save sequence temporarily
                # NOTE(review): presumably ensures the target directory
                # exists -- verify what maybe_create_dir does with a file path.
                _ = maybe_create_dir(save_filepath)
                with open(save_filepath, "w") as outfile:
                    json.dump(protein, outfile)
        # Replace tertiary structure, i.e. residue's 3D coordinates
        elif self.fmt == "tertiary":
            if not os.path.exists(save_filepath):
                # Split states so that we can optimize only on specific state(s).
                # NOTE: Might be useful to choose lowest energy state to mutate,
                # OR mutate rotamers for all positions, then choose one with
                # lowest energy.
                cmd.split_states(object=pdbid)

                # Delete all other objects other than one we want to mutate
                # NOTE: For now, keep only first object. This might change
                # depending on which state needs to be kept.
                objs = cmd.get_object_list()  # aka states
                keep_objs = [pdbid + "_0001"]
                for obj in objs:
                    if obj not in keep_objs:
                        cmd.delete(obj)
                assert keep_objs == cmd.get_object_list()

                # Mutate residues
                # The selection string below addresses chain "A" only.
                cmd.wizard("mutagenesis")
                wizard: Mutagenesis = cmd.get_wizard()
                for idx, res in replace_with.items():
                    selection = "{0:s}//A/{1:d}/".format(keep_objs[0], idx)
                    wizard.do_select(
                        selection)  # select which residue index to replace
                    wizard.set_mode(
                        res)  # choose name of residue to replace with
                    wizard.do_state(
                        1
                    )  # select rotamer with least strain (aka conflicts w/ other atoms)
                    wizard.apply()  # apply point mutation
                cmd.set_wizard(None)  # close wizard

                # Save PDB temporarily
                _ = maybe_create_dir(save_filepath)
                cmd.save(save_filepath, selection=pdbid, format="pdb")
                cmd.delete("all")  # remove all objects, clears workspace

            # Load + choose model/structure with lowest energy
            # NOTE: If sanitize=True, the function checks if Mol has the correct
            # hybridization/valence structure (aka is it chemically reasonable).
            # When converting from the PDB block, this sometimes results in
            # improper parsing. Instead, for now, we just check if the Mol is
            # syntactically valid (i.e. all rings/branches closed, no illegal
            # atom types, etc).
            protein = rdmolfiles.MolFromPDBFile(save_filepath,
                                                sanitize=False,
                                                removeHs=False)
            if protein.GetNumConformers() > 1:
                protein = _get_conformer(protein, conformer="min", algo="MMFF")
        else:
            raise NotImplementedError

        # Remove file, if not needed
        if not self.cache:
            os.remove(save_filepath)

        return protein