def get_molecules_from_pdb(protein_pdb, ligand_pdb):
    """Load a protein, a ligand and their combined complex as RDKit molecules.

    The complex is built by combining the two mdtraj trajectories with
    ``combine_mdtraj``, writing the result to a temporary ``.pdb`` file and
    parsing that file with RDKit. The temporary file is always removed,
    even when saving or parsing fails.

    Input:
        protein_pdb: str. Path to protein .pdb file
        ligand_pdb: str. Path to ligand .pdb file
    Output:
        protein: rdkit.Chem.rdchem.Mol
        ligand: rdkit.Chem.rdchem.Mol
        compl: rdkit.Chem.rdchem.Mol
    """
    # TODO Currently combining protein and ligand with
    # mdtraj. Verify this method.
    complex_traj = combine_mdtraj(md.load(protein_pdb), md.load(ligand_pdb))

    fd, comp_pdb = tempfile.mkstemp(suffix='.pdb')
    # Only the path is needed; close the descriptor right away so the file
    # is not held open while it is rewritten by path below.
    os.close(fd)
    try:
        complex_traj.save(comp_pdb)
        protein = rdmolfiles.MolFromPDBFile(protein_pdb)
        ligand = rdmolfiles.MolFromPDBFile(ligand_pdb)
        compl = rdmolfiles.MolFromPDBFile(comp_pdb)
    finally:
        # Clean up on both the success and the failure path.
        os.remove(comp_pdb)
    return (protein, ligand, compl)
def pdb2graph(pdbid, data_dir='./data/pdbbind/v2018'):
    """Build graphs for a PDBBind protein, its ligand and their complex.

    Input:
        pdbid: str. protein code from PDBBind
        data_dir: str. Directory containing one sub-directory per PDB id,
            each holding ``<pdbid>_protein.pdb`` and ``<pdbid>_ligand.pdb``.
    Returns:
        tuple of the three values returned by ``build_graph_from_molecule``
        for the protein, the ligand and the complex, in that order.
    Raises:
        IOError: if either the protein or the ligand .pdb file is missing.
    """
    protein_pdb_file = os.path.join(data_dir, pdbid,
                                    "{}_protein.pdb".format(pdbid))
    ligand_pdb_file = os.path.join(data_dir, pdbid,
                                   "{}_ligand.pdb".format(pdbid))
    if not (os.path.exists(protein_pdb_file)
            and os.path.exists(ligand_pdb_file)):
        raise IOError(".pdb file not found in {}".format(
            os.path.join(data_dir, pdbid)))

    # combining protein pdb file and ligand pdb file to one pdb file
    protein_traj = md.load(protein_pdb_file)
    ligand_traj = md.load(ligand_pdb_file)
    complex_traj = combine_mdtraj(protein_traj, ligand_traj)

    protein = rdmolfiles.MolFromPDBFile(protein_pdb_file)
    ligand = rdmolfiles.MolFromPDBFile(ligand_pdb_file)

    # The combined structure only needs to exist on disk long enough for
    # RDKit to parse it; the context manager removes the directory afterwards.
    with tempfile.TemporaryDirectory() as tempdir:
        complex_pdb_file = os.path.join(tempdir, 'complex.pdb')
        complex_traj.save(complex_pdb_file)
        compl = rdmolfiles.MolFromPDBFile(complex_pdb_file)

    return (build_graph_from_molecule(protein),
            build_graph_from_molecule(ligand),
            build_graph_from_molecule(compl))
def pdb2graph(pdbid, data_dir='./data/pdbbind/v2018'):
    """Build graphs for a PDBBind protein, its ligand and their complex.

    pdbid: str. protein code from PDBBind
    data_dir: str. Directory containing one sub-directory per PDB id, each
        holding ``<pdbid>_protein.pdb`` and ``<pdbid>_ligand.pdb``.

    Returns a 3-tuple with the results of ``build_graph_from_molecule`` for
    the protein, the ligand and the complex, in that order.

    Raises IOError if either .pdb file is missing.
    """
    # NOTE(review): `pdb2graph` is defined twice in this file; this later
    # definition is the one that stays bound to the name.
    complex_dir = os.path.join(data_dir, pdbid)
    protein_pdb_file = os.path.join(
        data_dir, pdbid, "{}_protein.pdb".format(pdbid))
    ligand_pdb_file = os.path.join(
        data_dir, pdbid, "{}_ligand.pdb".format(pdbid))
    if not (os.path.exists(protein_pdb_file)
            and os.path.exists(ligand_pdb_file)):
        raise IOError(".pdb file not found in {}".format(complex_dir))

    # combining protein pdb file and ligand pdb file to one pdb file
    protein_traj = md.load(protein_pdb_file)
    ligand_traj = md.load(ligand_pdb_file)
    complex_traj = combine_mdtraj(protein_traj, ligand_traj)

    protein = rdmolfiles.MolFromPDBFile(protein_pdb_file)
    ligand = rdmolfiles.MolFromPDBFile(ligand_pdb_file)

    # Write the complex to a scratch directory that is deleted as soon as
    # RDKit has parsed it, instead of leaving one directory behind per call.
    with tempfile.TemporaryDirectory() as tempdir:
        complex_pdb_file = os.path.join(tempdir, 'complex.pdb')
        complex_traj.save(complex_pdb_file)
        compl = rdmolfiles.MolFromPDBFile(complex_pdb_file)

    return (build_graph_from_molecule(protein),
            build_graph_from_molecule(ligand),
            build_graph_from_molecule(compl))
def read_lig_pdb(lig_name):
    """Parse ``../ligands/<lig_name>.pdb`` with RDKit, skipping sanitization.

    Returns whatever ``rdmolfiles.MolFromPDBFile`` returns for that path.
    """
    pdb_location = "".join(("../ligands/", lig_name, ".pdb"))
    return rdmolfiles.MolFromPDBFile(pdb_location, sanitize=False)
def print_fragments_from_molecules(filename):
    """Print one line per fragment generated from the molecule in a PDB file.

    The file is parsed with ``rdmolfiles.MolFromPDBFile`` and handed to
    ``generate_fragments``; the second element of every pair it yields
    (presumably a SMILES string) is printed.

    This is a best-effort helper: any ordinary error while parsing or
    fragmenting is reported as ``Failed <filename>`` instead of being raised.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed so
    a batch run can still be interrupted.
    """
    try:
        mol = rdmolfiles.MolFromPDBFile(filename)
        for _, smi in generate_fragments(mol):
            print(smi)
    except Exception:
        print("Failed", filename)
def retrieveMoleculePDB(ligand_path):
    """
    Parse the PDB file at the given path into an RDKit molecule, with
    sanitization enabled.
    -- args
    ligand_path (str): location of the molecule's pdb file

    -- returns
    the object produced by rdmolfiles.MolFromPDBFile for that file
    """
    return rdmolfiles.MolFromPDBFile(ligand_path, sanitize=True)
def load_dataset(self):
    """Load molecules from ``*_final.ism`` files, pickle, shuffle and split.

    When ``self.dataset`` is None every sub-directory of ``self.path`` is
    treated as one target: its ``receptor.pdb`` is parsed and the target is
    paired with each molecule read from that directory. Otherwise the
    ``actives``/``decoys`` files are read directly from ``self.path``.

    ``self.path`` is joined by plain string concatenation, so it is expected
    to end with a path separator.

    Sets ``self.mols``, ``self.coords``, ``self.target``,
    ``self.molecule_size`` and the train/valid/test dictionaries ``self.x``,
    ``self.c``, ``self.y``. In the multi-target case it also sets
    ``self.target_size``, ``self.targets``, ``self.t_coords``, ``self.tx``
    and ``self.tc``. The unshuffled arrays are written to
    ``<self.path>dataset.pkl``.
    """

    def read_ism(ism_path):
        # The first space-separated field of each line is passed to
        # Dataset._load_molecule; falsy results are dropped and the rest is
        # transposed into two sequences (unpacked by the caller as
        # molecules and coordinates).
        with open(ism_path, 'r') as f:
            smiles = [line.strip().split(" ")[0] for line in f]
        loaded = Parallel(n_jobs=4)(
            delayed(Dataset._load_molecule)(smile) for smile in smiles)
        return zip(*filter(None, loaded))

    labelled_files = (("actives", True), ("decoys", False))
    multi_target = self.dataset is None

    # Load files
    x, c, y = [], [], []
    if multi_target:
        tx, tc = [], []
        for target_n in next(os.walk(self.path))[1]:
            try:
                target = rdmolfiles.MolFromPDBFile(
                    self.path + target_n + '/receptor.pdb')
                targetc = target.GetConformer().GetPositions()
            except AttributeError:
                # Raised when `target` has no GetConformer attribute,
                # e.g. when the parser returned None.
                print("Failed to load target: " + target_n)
                continue
            for file_stem, value in labelled_files:
                x_, c_ = read_ism(
                    self.path + target_n + '/' + file_stem + '_final.ism')
                x.extend(x_)
                c.extend(c_)
                tx.extend([target] * len(x_))
                tc.extend([targetc] * len(x_))
                y.extend([value] * len(x_))
        self.target_size = max([t.GetNumAtoms() for t in tx])
        self.molecule_size = max([m.GetNumAtoms() for m in x])
    else:
        for file_stem, value in labelled_files:
            x_, c_ = read_ism(self.path + file_stem + '_final.ism')
            x.extend(x_)
            c.extend(c_)
            y.extend([value] * len(x_))
        self.molecule_size = max([m.GetNumAtoms() for m in x])

    self.mols, self.coords, self.target = np.array(x), np.array(
        c), np.array(y)
    pkl_dict = {
        'mols': self.mols,
        'coords': self.coords,
        'target': self.target
    }
    if multi_target:
        self.targets, self.t_coords = np.array(tx), np.array(tc)
        pkl_dict['targets'] = self.targets
        pkl_dict['t_coords'] = self.t_coords
    # Written once, before shuffling, through a context manager so the file
    # handle is closed deterministically.
    with open(self.path + "dataset.pkl", "wb") as pkl_file:
        pkl.dump(pkl_dict, pkl_file)

    # Shuffle data (one permutation shared by every parallel array)
    idx = np.random.permutation(len(self.mols))
    self.mols, self.coords, self.target = self.mols[idx], self.coords[
        idx], self.target[idx]

    # Split data: first 10% test, next 10% valid, remainder train
    spl1 = int(len(self.mols) * 0.2)
    spl2 = int(len(self.mols) * 0.1)

    def split(arr):
        return {
            "train": arr[spl1:],
            "valid": arr[spl2:spl1],
            "test": arr[:spl2]
        }

    self.x = split(self.mols)
    self.c = split(self.coords)
    self.y = split(self.target)

    if multi_target:
        self.targets, self.t_coords = self.targets[idx], self.t_coords[idx]
        self.tx = split(self.targets)
        self.tc = split(self.t_coords)
def mutate(
        self, pdbid: str,
        replace_with: Dict[int, Optional[str]]
) -> Union[List[str], rdchem.Mol]:
    """Modify amino acid residues at the defined positions.

    If the location indexes exceed the amount in data, then they will be
    ignored and a message will be printed.

    Params:
    -------
    pdbid: str
        PDB ID associated with the structure.

    replace_with: dict
        The index location(s) within the full protein to replace certain
        residue(s) with. If a residue associated with an index location is
        None, then the modified residue is chosen randomly. If a certain
        index exceeds the number of available residues in the protein, then
        those entries are simply ignored and the user is notified.
        NOTE: this dict is updated in place (out-of-range keys are removed
        and values are replaced by the 3-letter codes actually used).

    Returns:
    --------
    protein: list of str or rdkit.Chem.rdchem.Mol
        Modified protein with residues. If fmt="primary", then list of
        string (peptide names) is returned. If fmt="tertiary", then 3D
        molecule structure is returned.

    Raises:
    -------
    DownloadError
        If the structure is not on disk and fetching it fails.
    ValueError
        If a requested residue is not a valid 1- or 3-letter amino acid.
    NotImplementedError
        If ``self.fmt`` is neither "primary" nor "tertiary".
    """

    def invalid_residue(name):
        # Single place that builds the error for a rejected residue name.
        return ValueError(
            f"Invalid residue '{name}'. Choose one from "
            f"the following {aa1+aa3}.")

    # Load PDB structure (download, if necessary)
    pdb_dir = maybe_create_dir(os.path.join(self.rootdir, pdbid))
    pdb_file = os.path.join(pdb_dir, f"{pdbid}.pdb")
    if not os.path.exists(pdb_file):
        is_successful = cmd.fetch(pdbid,
                                  name=pdbid,
                                  state=1,
                                  type="pdb",
                                  path=pdb_dir)
        if is_successful == -1:
            raise DownloadError(f"Unable to download '{pdbid}'.")
    else:
        cmd.load(pdb_file, object=pdbid, state=1, format="pdb")

    # Get all residue names, see: https://pymolwiki.org/index.php/List_Selection
    resnames_dict = {"names": []}
    cmd.iterate("(name ca)", "names.append(resn)", space=resnames_dict)
    residue_names = resnames_dict["names"]
    num_residues = len(residue_names)

    # Cleanup idxs: remove indices that exceed number of available residues
    # NOTE(review): indices below 1 are not filtered; in the "primary" branch
    # idx 0 would address residue_names[-1] -- confirm whether callers can
    # pass such values.
    nonvalid_idxs = [
        idx for idx in replace_with.keys() if idx > num_residues
    ]
    for idx in nonvalid_idxs:
        print(
            f"OutOfRange: Removing idx {idx} (only {num_residues} residues)."
        )
        replace_with.pop(idx)

    # Randomly choose an amino acid (AA) to replace residue, if None is provided.
    # Additionally, format string such that it is a valid 3 letter amino acid.
    # Every non-None entry must pass is_aa AND be 1 or 3 characters long;
    # anything else is rejected instead of slipping through unvalidated.
    # Only values of existing keys are reassigned, so updating the dict
    # while iterating over it is safe.
    for idx, residue in replace_with.items():
        if residue is None:
            replace_with[idx] = np.random.choice(aa3)
            continue
        if not is_aa(residue):
            raise invalid_residue(residue)
        residue = residue.upper()
        if len(residue) == 1:
            replace_with[idx] = one_to_three.get(residue)
        elif len(residue) == 3:
            replace_with[idx] = residue
        else:
            raise invalid_residue(residue)

    # Determine save filepath name
    modified_res_str = ":".join(
        [f"{k}{three_to_one.get(v)}" for k, v in replace_with.items()])
    filename = f"{self.fmt}_{modified_res_str}"
    filename += ".pdb" if self.fmt == "tertiary" else ".json"
    save_filepath = os.path.join(self.rootdir, pdbid, filename)

    # Replace primary structure, i.e. residue names (str)
    if self.fmt == "primary":
        # Load data from cache, if it exists
        protein = None
        if os.path.exists(save_filepath):
            with open(save_filepath) as json_file:
                protein = json.load(json_file)

        if protein is None:
            for idx, residue in replace_with.items():
                residue_names[
                    idx - 1] = residue  # since PDB starts with idx of 1
            protein = [three_to_one.get(name) for name in residue_names]

            # Save sequence temporarily
            # NOTE(review): maybe_create_dir is given the file path itself;
            # presumably it creates the parent directory -- verify.
            _ = maybe_create_dir(save_filepath)
            with open(save_filepath, "w") as outfile:
                json.dump(protein, outfile)
    # Replace tertiary structure, i.e. residue's 3D coordinates
    elif self.fmt == "tertiary":
        if not os.path.exists(save_filepath):
            # Split states so that we can optimize only on specific state(s).
            # NOTE: Might be useful to choose lowest energy state to mutate,
            # OR mutate rotamers for all positions, then choose one with
            # lowest energy.
            cmd.split_states(object=pdbid)

            # Delete all other objects other than one we want to mutate
            # NOTE: For now, keep only first object. This might change
            # depending on which state needs to be kept.
            objs = cmd.get_object_list()  # aka states
            keep_objs = [pdbid + "_0001"]
            for obj in objs:
                if obj not in keep_objs:
                    cmd.delete(obj)
            assert keep_objs == cmd.get_object_list()

            # Mutate residues
            cmd.wizard("mutagenesis")
            wizard: Mutagenesis = cmd.get_wizard()
            for idx, res in replace_with.items():
                # The selection string hard-codes chain "A".
                selection = "{0:s}//A/{1:d}/".format(keep_objs[0], idx)
                wizard.do_select(
                    selection)  # select which residue index to replace
                wizard.set_mode(
                    res)  # choose name of residue to replace with
                wizard.do_state(
                    1
                )  # select rotamer with least strain (aka conflicts w/ other atoms)
                wizard.apply()  # apply point mutation
            cmd.set_wizard(None)  # close wizard

            # Save PDB temporarily
            _ = maybe_create_dir(save_filepath)
            cmd.save(save_filepath, selection=pdbid, format="pdb")
            cmd.delete("all")  # remove all objects, clears workspace

        # Load + choose model/structure with lowest energy
        # NOTE: If sanitize=True, the function checks if Mol has the correct
        # hybridization/valence structure (aka is it chemically reasonable).
        # When converting from the PDB block, this sometimes results in
        # improper parsing. Instead, for now, we just check if the Mol is
        # syntactically valid (i.e. all rings/branches closed, no illegal
        # atom types, etc).
        protein = rdmolfiles.MolFromPDBFile(save_filepath,
                                            sanitize=False,
                                            removeHs=False)
        if protein.GetNumConformers() > 1:
            protein = _get_conformer(protein, conformer="min", algo="MMFF")
    else:
        raise NotImplementedError

    # Remove file, if not needed
    if not self.cache:
        os.remove(save_filepath)
    return protein