def sql_cg(pdbname):
    """Run the external ball-covering binary at four levels of detail.

    A four-character ``pdbname`` is treated as a PDB ID: the entry is
    downloaded uncompressed and ``.pdb`` is appended to the name. For each of
    the first four entries of ``LOD_LEVELES`` the binary is invoked, its
    output table is loaded, column 3 is replaced by its square root and the
    table is written to ``<pdbname>_<level>.txt``. Finally every ``*.vmd``
    file in the working directory is moved into ``<pdbname>_sph``.
    """
    # NOTE(review): loop nesting was reconstructed from a flattened source;
    # confirm that the "rm" belongs inside the loop.
    if len(pdbname) == 4:  # PDBID
        prody.fetchPDB(pdbname, compressed=False)
        pdbname = pdbname + ".pdb"

    structure = prody.parsePDB(pdbname)
    atom_count = structure.numAtoms()
    print("num atoms is ", atom_count)

    # One ball count per level of detail.
    balls_per_level = (atom_count * LOD_LEVELES).astype(int)
    input_path = pdb_directory + pdbname
    sbl_output = "sbl-ballcovor-pdb-U__inner_approximation.txt"

    for level in range(4):
        command = (sbl_binary_U + sbl_arg_file + input_path + sbl_arg_nballs
                   + str(balls_per_level[level]) + sbl_arg_verbose)
        os.system(command)
        balls = np.loadtxt(sbl_output)
        if len(balls) == 0:
            banner = "********************************"
            print(banner)
            print(pdbname)
            print(banner)
            continue
        # sqrt the radius and save
        balls[:, 3] = np.sqrt(balls[:, 3])
        out_name = pdbname + "_" + str(LOD_LEVELES[level]) + '.txt'
        np.savetxt(out_name, balls, fmt='%g')
        print("success ", out_name)
        os.system("rm " + sbl_output)

    os.system("mkdir " + pdbname + "_sph")
    os.system("mv *.vmd " + pdbname + "_sph/.")
def get_PDB(args):
    """Return the PDB file to use, downloading it from rcsb.org if needed.

    ``args.PDB_file`` is returned when it is set. Otherwise the entry named
    by ``args.PDB_fetch`` is downloaded uncompressed with ProDy, a blank line
    is printed, and the value returned by ``fetchPDB`` is handed back.
    """
    if not args.PDB_file:
        fetched = prody.fetchPDB(args.PDB_fetch, compressed=False)
        print()
        return fetched
    return args.PDB_file
def voxelize(pdbname, spacing=10.0, padding=0.0):
    """Write the occupied voxel positions of a structure to ``<pdbname>.vox``.

    :arg pdbname: PDB file name, or a four-character PDB ID, which is
        downloaded uncompressed and given a ``.pdb`` suffix
    :arg spacing: voxel edge length used to bin the centered coordinates
    :arg padding: kept for signature compatibility; it only fed bounding-box
        values that were never used, so it does not influence the output
    """
    if len(pdbname) == 4:  # PDBID
        prody.fetchPDB(pdbname, compressed=False)
        pdbname += ".pdb"
    mol = prody.parsePDB(pdbname)
    coords = mol.getCoords()
    # Center the coordinates on their mean before binning.
    center_c = coords - np.average(coords, 0)
    # NOTE(review): astype(int) truncates toward zero, so the cells that
    # straddle the origin are twice as wide as the others -- confirm intended.
    ijk = (1 / spacing * center_c).astype(int)
    # One output row per distinct voxel index, scaled back to coordinates.
    out_coords = unique_rows(ijk) * spacing
    np.savetxt(pdbname + ".vox", out_coords, delimiter=' ', fmt='%f')
    print("success ", pdbname + ".vox")
def voxelize_avg(pdbname, spacing=20.0, padding=20.0):
    """Voxelize a structure and write voxel positions and per-voxel averages.

    Two files are written: ``<pdbname>.xyz`` with one row per occupied voxel
    (voxel index scaled by ``spacing`` and shifted by the lower bound), and
    ``<pdbname>_avg.xyz`` with the mean centered coordinate of the atoms that
    fall in each voxel, in order of first appearance of the voxel.

    :arg pdbname: PDB file name, or a four-character PDB ID, which is
        downloaded uncompressed and given a ``.pdb`` suffix
    :arg spacing: voxel edge length
    :arg padding: value added to the per-axis minimum to form the grid origin
    """
    if len(pdbname) == 4:  # PDBID
        prody.fetchPDB(pdbname, compressed=False)
        pdbname += ".pdb"
    mol = prody.parsePDB(pdbname)
    coords = mol.getCoords()
    center_c = coords - np.average(coords, 0)
    # NOTE(review): padding is *added* to the minimum, which moves the origin
    # inside the bounding box -- confirm this is intended.
    bot = np.min(center_c, 0) + padding

    ijk = (1 / spacing * (center_c - bot)).astype(int)
    out_coords = (unique_rows(ijk) * spacing) + bot
    np.savetxt(pdbname + ".xyz", out_coords, delimiter=' ', fmt='%f')

    # Group atom coordinates by voxel index. A dict keeps first-seen order,
    # so the rows come out in the same order as the former nested scan.
    members_by_voxel = {}
    for cell, xyz in zip(ijk.tolist(), center_c.tolist()):
        members_by_voxel.setdefault(tuple(cell), []).append(xyz)
    cavg = [np.average(members, 0).tolist()
            for members in members_by_voxel.values()]
    np.savetxt(pdbname + "_avg.xyz", cavg, delimiter=' ', fmt='%f')
    print("success ", pdbname + ".xyz")
def prody_fetch(opt):
    """Fetch PDB files from PDB FTP server.

    Identifiers come from ``opt.pdb``. When ``opt.listfn`` names an existing
    file, every four-character token in it (separated by commas and/or
    whitespace) is appended to that list; a missing file is reported through
    ``opt.subparser.error``. The files are fetched into ``opt.folder``.
    """
    import prody

    identifiers = opt.pdb
    list_path = opt.listfn
    if list_path:
        if not os.path.isfile(list_path):
            opt.subparser.error("No such file: '{0:s}'".format(list_path))
        else:
            handle = prody.openFile(list_path)
            for line in handle:
                identifiers.extend(
                    token
                    for chunk in line.strip().split(',')
                    for token in chunk.split()
                    if len(token) == 4)
            handle.close()
    prody.fetchPDB(identifiers, opt.folder, compressed=opt.gzip, copy=True)
def load_or_parse_residues(
        pdb1, chain_id1, repo_path=REPO_PATH,
        allowed_solving_methods=['SOLUTION NMR', 'X-RAY DIFFRACTION']):
    """Return the residues of one chain, from the store or by parsing.

    If ``tostr(pdb1, chain_id1)`` is already in ``COORDS`` the stored residues
    are loaded. Otherwise ``<repo_path>/<pdb1>.pdb.gz`` is parsed (after an
    attempted download when the file is missing) and the result is stored.

    Returns ``None`` when the file cannot be obtained, when parsing yields no
    structure or header, or when the header's ``experiment`` entry is not in
    ``allowed_solving_methods``.
    """
    if tostr(pdb1, chain_id1) in COORDS:
        return load_residues(pdb1, chain_id1)

    src_path = os.path.join(repo_path, '%s.pdb.gz' % pdb1)
    if not os.path.exists(src_path):
        fetchPDB(pdb1, folder=os.path.dirname(src_path))
        if not os.path.exists(src_path):
            return None

    structure, header = parsePDB(src_path, header=True, chain=chain_id1)
    if structure is None or header is None:
        return None
    if header['experiment'] not in allowed_solving_methods:
        return None

    return store_residues(structure, pdb1, chain_id1)
def prodyLoad(pdbname, biomt=False):
    """Load a structure and return its coordinates centered on their mean.

    A four-character ``pdbname`` is lower-cased, downloaded uncompressed and
    given a ``.pdb`` suffix; any other name gets ``.pdb`` appended when it is
    missing. With ``biomt`` set, the header is parsed as well and, if it has
    ``biomoltrans`` entries, the biomolecule is built from them. The centered
    coordinates are also saved to ``<pdb_directory>/<pdbname>_cl.txt``.
    """
    if len(pdbname) == 4:  # PDBID
        pdb_id = pdbname.lower()
        prody.fetchPDB(pdb_id, compressed=False)
        pdbname = pdb_id + ".pdb"
    elif not pdbname.endswith(".pdb"):
        pdbname = pdbname + ".pdb"

    if not biomt:
        mol = prody.parsePDB(pdbname, header=False)
    else:
        mol, header = prody.parsePDB(pdbname, header=True)
        if len(header['biomoltrans']):
            mol = prody.buildBiomolecules(header, mol)

    coords = mol.getCoords()
    center_c = coords - np.average(coords, 0)
    out_path = pdb_directory + os.sep + pdbname + "_cl.txt"
    np.savetxt(out_path, center_c, fmt='%f')
    return center_c
def doKmeans(pdbname, spacing=20.0, padding=20.0, percentile=0.001):
    """Cluster the centered atom coordinates of a structure with k-means.

    The cluster count starts as ``round(n_atoms * percentile)`` and is then
    adjusted by the rules below. Cluster centers are written to
    ``<pdbname>_kmeans2.txt`` and the centered coordinates to
    ``<pdbname>_cl.txt``. Nothing is returned; when the cluster count ends up
    at zero a message is printed and no file is written. ``spacing`` and
    ``padding`` are accepted but not used.
    """
    # NOTE(review): the adjustment rules are taken to be sequential; the
    # flattened source does not show whether any of them were nested.
    if len(pdbname) == 4:  # PDBID
        pdb_id = pdbname.lower()
        prody.fetchPDB(pdb_id, compressed=False)
        pdbname = pdb_id + ".pdb"
    elif not pdbname.endswith(".pdb"):
        pdbname = pdbname + ".pdb"

    mol = prody.parsePDB(pdbname, header=False)
    atom_total = mol.numAtoms()
    xyz = mol.getCoords()
    center_c = xyz - np.average(xyz, 0)

    ncluster = int(round(len(center_c) * percentile))
    # Every atom matches the 'ca' selection: use five times as many clusters.
    if mol.numAtoms('ca') == mol.numAtoms():
        ncluster = ncluster * 5
    if ncluster == 0:
        ncluster = int(atom_total / 10.0)
    if ncluster <= 6:
        ncluster = ncluster * 2
    if ncluster == 0:
        print(pdbname, "no cluster")
        return
    print(ncluster)

    estimator = KMeans(init='k-means++', n_clusters=ncluster, n_init=10)
    estimator.fit(center_c)
    np.savetxt(pdbname + "_kmeans2.txt", estimator.cluster_centers_, fmt='%f')
    np.savetxt(pdbname + "_cl.txt", center_c, fmt='%f')
def prody_fetch(*pdb, **kwargs):
    """Fetch PDB files from PDB FTP server.

    :arg pdbs: PDB identifier(s), or a single filename listing identifiers
        separated by whitespace and/or commas

    :arg folder: target directory for saving PDB file(s), default is ``'.'``

    :arg gzip: gzip fetched files or not, default is ``False``"""

    import prody

    # *pdb arrives as a tuple, which has no append(); work on a list instead.
    pdblist = list(pdb)
    if len(pdblist) == 1 and os.path.isfile(pdblist[0]):
        from prody.utilities import openFile
        with openFile(pdblist[0]) as inp:
            text = inp.read()
        # The single argument was a list file, not an identifier, so the
        # identifiers read from it replace it rather than join it.
        pdblist = [ident
                   for item in text.strip().split()
                   for ident in item.split(',')
                   if len(ident) == 4 and ident.isalnum()]
    prody.fetchPDB(*pdblist, folder=kwargs.get('folder', '.'),
                   compressed=kwargs.get('gzip', False), copy=True)
def parse_pdb_files(input_pdb):
    """Resolve *input_pdb* into a list of PDB files.

    Accepted inputs:

    * a directory (``str`` or ``Path``): every ``*.pdb`` file directly inside
      it is returned as ``Path`` objects;
    * a file (``str`` or ``Path``): read as a list with one entry per line,
      blank lines ignored;
    * any other string: split on newlines into entries;
    * any other iterable: used as the list of entries.

    If the first entry is not an existing file, all entries are treated as
    PDB identifiers and replaced by the results of ``pd.fetchPDB``. An empty
    input yields an empty list.
    """
    if isinstance(input_pdb, (str, Path)):
        input_pdb = Path(input_pdb)
        if input_pdb.is_dir():
            pdb_files = list(input_pdb.glob("*.pdb"))
        elif input_pdb.is_file():
            with open(input_pdb) as f:
                # Drop blank lines so an empty or padded list file does not
                # produce '' entries that would be sent to fetchPDB.
                pdb_files = [line.strip() for line in f.read().splitlines()
                             if line.strip()]
        else:
            pdb_files = str(input_pdb).split("\n")
    else:
        pdb_files = list(input_pdb)
    # Guard against an empty list before looking at the first entry.
    if pdb_files and not Path(pdb_files[0]).is_file():
        pdb_files = [pd.fetchPDB(pdb_name) for pdb_name in pdb_files]
    return pdb_files
def getPDB(pdbId):
    """
    Fetches a pdb from the Protein Data Bank (uncompressed) and strips every
    model but the first one.

    @param pdbId: A 4 letter pdb id

    @return: The parsed pdb data structure, holding a single model.
    """
    pdb_path = prody.fetchPDB(pdbId, compressed=False)
    structure = prody.parsePDB(pdb_path)

    # The model count is read from the protein selection.
    model_count = structure.select("protein").numCoordsets()

    # Index 1 is removed over and over: after each deletion the next model
    # moves into that slot, so only coordset 0 remains at the end.
    for _ in range(1, model_count):
        structure.delCoordset(1)
    return structure
def prody_blast(opt):
    """Blast search PDB based on command line arguments.

    Reads ``seq``, ``folder``, ``identity``, ``coverage`` and ``gzip`` from
    *opt*. ``seq`` may be a sequence or a FASTA file whose first sequence is
    used. One line per hit is printed, ordered by decreasing percent
    identity, and the hits are downloaded when ``opt.folder`` is set. Invalid
    input is reported through ``opt.subparser.error``.
    """
    import prody

    log = prody.LOGGER
    seq = opt.seq
    if os.path.isfile(seq):
        title, seq = readFirstSequenceFasta(seq)
        log.info("First sequence ({0:s}) is parsed from {1:s}."
                 .format(title, repr(seq)))
    if not (seq.isalpha() and seq.isupper()):
        opt.subparser.error("{0:s} is not a valid sequence or a file"
                            .format(repr(seq)))

    folder, identity, coverage = opt.folder, opt.identity, opt.coverage
    if not 0 < identity < 100:
        opt.subparser.error('identity must be between 0 and 100')
    if not 0 < coverage < 100:
        opt.subparser.error('overlap must be between 0 and 100')

    blast_results = prody.blastPDB(seq)
    hits = blast_results.getHits(percent_identity=identity,
                                 percent_coverage=coverage)

    # Negated identity makes an ascending sort list the best hits first.
    ranked = sorted((-hits[code]['percent_identity'], code) for code in hits)

    for _, code in ranked:
        chain = hits[code]['chain_id']
        percent_identity = hits[code]['percent_identity']
        title = hits[code]['title']
        print(code + ' ' + chain + ' ' + ('%5.1f%%' % (percent_identity))
              + ' ' + title)

    # download hits if --folder is given
    if opt.folder:
        log.info('Downloading hits to ' + opt.folder)
        codes = [code for _, code in ranked]
        prody.fetchPDB(codes, opt.folder, compressed=opt.gzip, copy=True)
def get_pdb(pdb_id, selection):
    """
    Fetches a pdb from the Protein Data Bank (uncompressed), keeps only the
    atoms matching a selection and strips every model but the first one.

    :param pdb_id: A 4 letter pdb id
    :param selection: selection string applied to the parsed structure

    :return: The selected prody data structure (single model) and the path
        to the downloaded file.
    """
    path = prody.fetchPDB(pdb_id, compressed=False)
    structure = prody.parsePDB(path).select(selection).copy()

    # Drop coordset 1 until only coordset 0 is left.
    extra_models = structure.numCoordsets() - 1
    while extra_models > 0:
        structure.delCoordset(1)
        extra_models -= 1
    return structure, path
def compare():
    """Fetch the PDB entry for each protein directory under ``challengedata``.

    Every directory name found while walking ``<wd>/challengedata`` is
    treated as a PDB identifier, except names containing ``'celpp'`` and the
    ``rdkit-scripts``, ``PDBfiles`` and ``answers`` folders. Each distinct
    name is fetched once. Nothing is returned.
    """
    # Point ProDy at the folder that holds the downloaded PDB files.
    prody.pathPDBFolder(wd + '/challengedata/PDBfiles')

    excluded = {'rdkit-scripts', 'PDBfiles', 'answers'}

    # Collect the names first and download afterwards, so that files written
    # by the downloads cannot influence the directory walk.
    proteins = []
    seen = set()
    for _, dirnames, _ in os.walk(wd + '/challengedata'):
        for name in dirnames:
            if name in seen:
                continue  # the same name can occur under several parents
            seen.add(name)
            if 'celpp' in name or name in excluded:
                continue
            proteins.append(name)

    # download pdb using prody
    for name in proteins:
        prody.fetchPDB(name)
def worker(args):
    """Download one PDB entry into its own directory.

    *args* is a ``(count, pdbid)`` pair. The directory ``<datadir>/<pdbid>``
    is created if needed (mode 0o755), made the current working directory,
    and the entry is fetched there. Returns a dict with the same ``count``
    and ``pdbid``. The working directory is left changed.
    """
    count, pdbid = args
    target_dir = f'{datadir}/{pdbid}'
    os.makedirs(target_dir, mode=0o755, exist_ok=True)
    os.chdir(target_dir)
    prody.fetchPDB(pdbid)
    return {'count': count, 'pdbid': pdbid}
def __init__(self, comb, pdb_acc_code, chain, **kwargs):
    """Load a structure and precompute its contact, SASA and DSSP data.

    The structure is taken, in order of preference, from a file in
    ``comb.input_dir_pdb`` whose name contains ``pdb_acc_code``, from
    ``kwargs['path_to_pdb']``, or from a fresh download that is run through
    the external ``reduce`` program.

    :comb: arg: instance of cls Comb with attributes pdbchain_dict, ifg_selection_info
    :pdb_acc_code: type: str: 4 character pdb accession code
    :chain: chain identifier, stored as ``self.pdb_chain``
    :param kwargs: path_to_pdb path_to_dssp
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm it against the upstream file.
    # search for acc code in input_dir_pdb from comb object.
    assert isinstance(pdb_acc_code, str), 'PDB accession code needs to be a string'
    pdb_file = [
        file.name for file in os.scandir(comb.input_dir_pdb)
        if pdb_acc_code in file.name
    ]
    try:
        if pdb_file:
            pdb_file = pdb_file[0]
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + pdb_file,
                                         altloc='A', model=1)
        elif 'path_to_pdb' in kwargs:
            self.prody_pdb = pr.parsePDB(kwargs.get('path_to_pdb'),
                                         altloc='A', model=1)
        else:
            # NEED TO UPDATE: note if going to fetch pdb, it should be sent through Reduce first...
            # Create each working directory independently: previously an
            # existing 'raw' directory made the first mkdir fail and 'reduce'
            # was then never created.
            for subdir in ('raw', 'reduce'):
                try:
                    os.makedirs(comb.input_dir_pdb + subdir, exist_ok=True)
                except OSError:
                    # Still best effort; a real problem surfaces below when
                    # the files are written or parsed.
                    pass
            pr.fetchPDB(pdb_acc_code, compressed=False,
                        folder=comb.input_dir_pdb + 'raw')
            # Run reduce on the downloaded file, redirecting its output to
            # reduce/<code>H.pdb.
            os.system(comb.path_to_reduce + comb.reduce +
                      ' -FLIP -Quiet -DB ' + comb.path_to_reduce +
                      'reduce_wwPDB_het_dict.txt ' + comb.input_dir_pdb +
                      'raw/' + pdb_acc_code.lower() + '.pdb > ' +
                      comb.input_dir_pdb + 'reduce/' +
                      pdb_acc_code.lower() + 'H.pdb')
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + 'reduce/' +
                                         pdb_acc_code.lower() + 'H.pdb',
                                         altloc='A', model=1)
    except NameError:
        raise NameError(
            'ParsePDB instance needs a pdb file path or a valid pdb accession code.'
        )
    self.pdb_acc_code = pdb_acc_code.lower()
    self.pdb_chain = chain
    # The analysis only runs when every atom matches 'icode _' and the
    # requested chain has protein atoms.
    if len(self.prody_pdb) == len(self.prody_pdb.select('icode _')) \
            and self.prody_pdb.select('protein and chain ' + self.pdb_chain) is not None:
        self.contacts = pr.Contacts(self.prody_pdb)
        self.set_bonds()
        # Build the freesasa structure from the same file that was parsed.
        if pdb_file:
            self.fs_struct = freesasa.Structure(comb.input_dir_pdb + pdb_file)
        elif 'path_to_pdb' in kwargs:
            self.fs_struct = freesasa.Structure(kwargs.get('path_to_pdb'))
        else:
            path = comb.input_dir_pdb + 'reduce/'
            self.fs_struct = freesasa.Structure(path + next(
                file.name for file in os.scandir(path)
                if self.pdb_acc_code in file.name))
        self.fs_result = freesasa.calc(self.fs_struct)
        self.fs_result_cb_3A = self.freesasa_cb(probe_radius=3)
        self.fs_result_cb_4A = self.freesasa_cb(probe_radius=4)
        self.fs_result_cb_5A = self.freesasa_cb(probe_radius=5)
        self.prody_pdb_bb_cb_atom_ind = self.prody_pdb.select(
            'protein and (backbone or name CB) '
            'and not element H D').getIndices()
        dssp_file = [
            file.name for file in os.scandir(comb.input_dir_dssp)
            if pdb_acc_code in file.name
        ]
        if dssp_file:
            dssp_file = dssp_file[0]
            self.dssp = pr.parseDSSP(comb.input_dir_dssp + dssp_file,
                                     self.prody_pdb)
        elif 'path_to_dssp' in kwargs:
            self.dssp = pr.parseDSSP(kwargs.get('path_to_dssp'),
                                     self.prody_pdb)
        else:
            # No DSSP output yet: run DSSP on the structure file, then parse
            # whatever file it left in input_dir_dssp.
            if pdb_file:
                pr.execDSSP(comb.input_dir_pdb + pdb_file,
                            outputdir=comb.input_dir_dssp)
            elif 'path_to_pdb' in kwargs:
                pr.execDSSP(kwargs.get('path_to_pdb'),
                            outputdir=comb.input_dir_dssp)
            else:
                path = comb.input_dir_pdb + 'reduce/' + next(
                    file.name
                    for file in os.scandir(comb.input_dir_pdb + 'reduce')
                    if pdb_acc_code in file.name)
                pr.execDSSP(path, outputdir=comb.input_dir_dssp)
            self.dssp = pr.parseDSSP(
                comb.input_dir_dssp +
                next(file.name for file in os.scandir(comb.input_dir_dssp)
                     if pdb_acc_code in file.name), self.prody_pdb)
        self.possible_ifgs = self.find_possible_ifgs(comb)
    else:
        self.possible_ifgs = None
    # valence and hydrogen bond data for vandermers and iFGs of ParsedPDB protein instance
    # iFG specific:
    self._ifg_pdb_info = []
    self._ifg_atom_density = []
    self._ifg_contact_water = []
    self._ifg_contact_ligand = []
    self._ifg_contact_metal = []
    # vdM specific:
    self._vdm_pdb_info = []
    self._vdm_sasa_info = []
    self._ifg_contact_vdm = []
    self._ifg_hbond_vdm = []
    self._ifg_hbond_water = []
    self._ifg_hbond_ligand = []
    self._ifg_ca_hbond_vdm = []
temp[0].setChids('A') temp[1].setChids('A') # build chain for i in range(len(sequence) - 1): neighborAA = None if (isNeighborDependent): if (isRightNeighbor): if (i + 2 < len(sequence)): neighborAA = sequence[i + 2] else: neighborAA = sequence[i] aa = sequence[i + 1] diamid = thedb.query(aa, neighborAA) chain = chainer.appendDiamid2Chain(chain, diamid, i + 2) chain = chain.select('not resnum 0').copy() chain = chain.select('not resnum ' + str(len(sequence) + 1)).copy() return chain if __name__ == '__main__': prody.fetchPDB('1d3z') pdb = prody.parsePDB('1d3z.pdb.gz') sequence = pdb.select('name CA').getSequence() thedb = db.DB('samples/TDRD_R_TCBIG.json', 'data/diamides', 'samples/NDRD_R_TCBIG_pretty.json') structure = structure_builder(sequence, thedb, True, True) prody.writePDB('1d3z_test_ndrd_out.pdb', structure) print(structure) print(structure.getCoords()) #print([aa.getResname() for aa in structure.iterResidues()])
def prody_blast(sequence, **kwargs):
    """Run a blast search against the PDB and optionally download the hits.

    Returns a list of ``(pdb, chain, percent identity, title)`` tuples sorted
    by decreasing identity, unless the *stdout* keyword is true, in which
    case the hits are written to ``sys.stdout`` and nothing is returned.

    :arg sequence: sequence or file in fasta format

    :arg identity: percent sequence identity for blast search, default is 90.0
    :type identity: float

    :arg overlap: percent sequence overlap between sequences, default is 90.0
    :type overlap: float

    :arg outdir: download uncompressed PDB files to given directory
    :type outdir: str

    :arg gzip: write compressed PDB file

    :arg stdout: write hits to standard output instead of returning them

    *Blast Parameters*

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    :arg hitlist_size: search parameters, default is 250
    :type hitlist_size: int

    :arg expect: search parameters, default is 1e-10
    :type expect: float

    :arg sleep: how long to wait to reconnect for results, default is 2
        sleep time is doubled when results are not ready.
    :type sleep: int

    :arg timeout: when to give up waiting for results. default is 30
    :type timeout: int"""

    import prody

    log = prody.LOGGER
    if os.path.isfile(sequence):
        title, sequence = readFirstSequenceFasta(sequence)
        log.info("First sequence ({0}) is parsed from {1}.".format(
            title, repr(sequence)))
    if not (sequence.isalpha() and sequence.isupper()):
        raise ValueError("{0} is not a valid sequence or a file".format(
            repr(sequence)))

    outdir = kwargs.get('outdir')
    identity = kwargs.get('identity', 90)
    overlap = kwargs.get('overlap', 90)
    if not 0 < identity < 100:
        raise ValueError('identity must be between 0 and 100')
    if not 0 < overlap < 100:
        raise ValueError('overlap must be between 0 and 100')

    blast_results = prody.blastPDB(
        sequence,
        filename=kwargs.get('filename', None),
        hitlist_size=kwargs.get('hitlist_size', 250),
        expect=kwargs.get('expect', 1e-10),
        sleep=kwargs.get('sleep', 2),
        timeout=kwargs.get('timeout', 30))
    if not blast_results.isSuccess:
        raise IOError('blast search timed out, please try again')
    hits = blast_results.getHits(percent_identity=identity,
                                 percent_overlap=overlap)

    # Negated identity makes an ascending sort list the best hits first.
    ranked = sorted((-hits[code]['percent_identity'], code) for code in hits)

    # `sink` is either a false value (collect and return the hits) or the
    # sys.stdout stream (write them out).
    sink = kwargs.get('stdout', False)
    if sink:
        from sys import stdout as sink
    else:
        collected = []

    for _, code in ranked:
        chain = hits[code]['chain_id']
        percent_identity = hits[code]['percent_identity']
        title = hits[code]['title']
        formatted = '%5.1f%%' % (percent_identity)
        if sink:
            sink.write(code + ' ' + chain + ' ' + formatted + ' ' + title)
        else:
            collected.append((code, chain, formatted, title))

    # download hits if --outdir is given
    if outdir:
        log.info('Downloading hits to ' + outdir)
        codes = [code for _, code in ranked]
        prody.fetchPDB(codes, outdir, compressed=kwargs.get('gzip'),
                       copy=True)

    if not sink:
        return collected
def prody_blast(sequence, **kwargs):
    """Blast-search the PDB for *sequence* and optionally download the hits.

    Hits are ordered by decreasing percent identity. They are returned as a
    list of ``(pdb, chain, percent identity, title)`` tuples, or written to
    ``sys.stdout`` (with no return value) when the *stdout* keyword is true.

    :arg sequence: sequence or file in fasta format

    :arg identity: percent sequence identity for blast search, default is 90.0
    :type identity: float

    :arg overlap: percent sequence overlap between sequences, default is 90.0
    :type overlap: float

    :arg outdir: download uncompressed PDB files to given directory
    :type outdir: str

    :arg gzip: write compressed PDB file

    :arg stdout: write hits to standard output instead of returning them

    *Blast Parameters*

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    :arg hitlist_size: search parameters, default is 250
    :type hitlist_size: int

    :arg expect: search parameters, default is 1e-10
    :type expect: float

    :arg sleep: how long to wait to reconnect for results, default is 2
        sleep time is doubled when results are not ready.
    :type sleep: int

    :arg timeout: when to give up waiting for results. default is 30
    :type timeout: int"""

    import prody

    logger = prody.LOGGER
    if os.path.isfile(sequence):
        title, sequence = readFirstSequenceFasta(sequence)
        logger.info("First sequence ({0}) is parsed from {1}."
                    .format(title, repr(sequence)))
    if not (sequence.isalpha() and sequence.isupper()):
        raise ValueError("{0} is not a valid sequence or a file"
                         .format(repr(sequence)))

    outdir = kwargs.get('outdir')
    identity, overlap = kwargs.get('identity', 90), kwargs.get('overlap', 90)
    if not 0 < identity < 100:
        raise ValueError('identity must be between 0 and 100')
    if not 0 < overlap < 100:
        raise ValueError('overlap must be between 0 and 100')

    search_options = {
        'filename': kwargs.get('filename', None),
        'hitlist_size': kwargs.get('hitlist_size', 250),
        'expect': kwargs.get('expect', 1e-10),
        'sleep': kwargs.get('sleep', 2),
        'timeout': kwargs.get('timeout', 30),
    }
    blast_results = prody.blastPDB(sequence, **search_options)
    if blast_results is None:
        raise IOError('blast search timed out, please try again')
    hits = blast_results.getHits(percent_identity=identity,
                                 percent_overlap=overlap)

    # sort hits by decreasing percent identity
    by_identity = [(-hits[entry]['percent_identity'], entry) for entry in hits]
    by_identity.sort()

    # `out` is either a false value (collect and return the hits) or the
    # sys.stdout stream (write them out).
    out = kwargs.get('stdout', False)
    if out:
        from sys import stdout as out
    else:
        results = []

    for _, entry in by_identity:
        chain = hits[entry]['chain_id']
        percent_identity = hits[entry]['percent_identity']
        title = hits[entry]['title']
        percent_text = '%5.1f%%' % (percent_identity)
        if out:
            out.write(entry + ' ' + chain + ' ' + percent_text + ' ' + title)
        else:
            results.append((entry, chain, percent_text, title))

    # download hits if --output-dir is given
    if outdir:
        logger.info('Downloading hits to ' + outdir)
        entries = [entry for _, entry in by_identity]
        prody.fetchPDB(entries, outdir, compressed=kwargs.get('gzip'),
                       copy=True)

    if not out:
        return results
def __init__(self, comb, pdb_acc_code, chain, **kwargs):
    """Load a structure and precompute its contact, SASA and DSSP data.

    The structure is taken, in order of preference, from a file in
    ``comb.input_dir_pdb`` whose name contains ``pdb_acc_code``, from
    ``kwargs['path_to_pdb']``, or from a fresh download that is run through
    the external ``reduce`` program.

    :comb: arg: instance of cls Comb with attributes pdbchain_dict, ifg_selection_info
    :pdb_acc_code: type: str: 4 character pdb accession code
    :chain: chain identifier, stored as ``self.pdb_chain``
    :param kwargs: path_to_pdb path_to_dssp
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # in particular confirm that parse_dssp() and the query_path branch sit
    # directly under the outer "if", and that alphahull/segnames are set
    # unconditionally.
    # search for acc code in input_dir_pdb from comb object.
    assert isinstance(pdb_acc_code, str), 'PDB accession code needs to be a string'
    pdb_file = [
        file.name for file in os.scandir(comb.input_dir_pdb)
        if pdb_acc_code in file.name
    ]
    try:
        if pdb_file:
            pdb_file = pdb_file[0]
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + pdb_file,
                                         altloc='A', model=1)
        elif 'path_to_pdb' in kwargs:
            self.prody_pdb = pr.parsePDB(kwargs.get('path_to_pdb'),
                                         altloc='A', model=1)
        else:
            # Create each working directory independently: previously an
            # existing 'raw' directory made the first mkdir fail and 'reduce'
            # was then never created.
            for subdir in ('raw', 'reduce'):
                try:
                    os.makedirs(comb.input_dir_pdb + subdir, exist_ok=True)
                except OSError:
                    # Still best effort; a real problem surfaces below when
                    # the files are written or parsed.
                    pass
            pr.fetchPDB(pdb_acc_code, compressed=False,
                        folder=comb.input_dir_pdb + 'raw')
            # Run reduce on the downloaded file, redirecting its output to
            # reduce/<code>H.pdb.
            os.system(comb.path_to_reduce + comb.reduce +
                      ' -FLIP -Quiet -DB ' + comb.path_to_reduce +
                      'reduce_wwPDB_het_dict.txt ' + comb.input_dir_pdb +
                      'raw/' + pdb_acc_code.lower() + '.pdb > ' +
                      comb.input_dir_pdb + 'reduce/' +
                      pdb_acc_code.lower() + 'H.pdb')
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + 'reduce/' +
                                         pdb_acc_code.lower() + 'H.pdb',
                                         altloc='A', model=1)
    except NameError:
        raise NameError(
            'ParsePDB instance needs a pdb file path or a valid pdb accession code.'
        )
    self.pdb_acc_code = pdb_acc_code.lower()
    self.pdb_chain = chain
    # The analysis only runs when every atom matches 'icode _' and the
    # requested chain has protein atoms.
    if len(self.prody_pdb) == len(self.prody_pdb.select('icode _')) \
            and self.prody_pdb.select('protein and chain ' + self.pdb_chain) is not None:
        self.contacts = pr.Contacts(self.prody_pdb)
        self.set_bonds()
        # Build the freesasa structure from the same file that was parsed.
        if pdb_file:
            self.fs_struct = freesasa.Structure(comb.input_dir_pdb + pdb_file)
        elif 'path_to_pdb' in kwargs:
            self.fs_struct = freesasa.Structure(kwargs.get('path_to_pdb'))
        else:
            path = comb.input_dir_pdb + 'reduce/'
            self.fs_struct = freesasa.Structure(path + next(
                file.name for file in os.scandir(path)
                if self.pdb_acc_code in file.name))
        self.fs_result = freesasa.calc(self.fs_struct)
        self.fs_result_cb_3A = self.freesasa_cb(probe_radius=3)
        self.fs_result_cb_4A = self.freesasa_cb(probe_radius=4)
        self.fs_result_cb_5A = self.freesasa_cb(probe_radius=5)
        self.prody_pdb_bb_cb_atom_ind = self.prody_pdb.select(
            'protein and (backbone or name CB) '
            'and not element H D').getIndices()
        dssp_file = [
            file.name for file in os.scandir(comb.input_dir_dssp)
            if pdb_acc_code in file.name
        ]
        if dssp_file:
            dssp_file = dssp_file[0]
        self.parse_dssp(dssp_file, comb)
        # A query path switches iFG detection to the RMSD-based search.
        if comb.query_path:
            self.possible_ifgs = self.find_possible_ifgs_rmsd(
                comb, rmsd_threshold=comb.rmsd_threshold)
        else:
            self.possible_ifgs = self.find_possible_ifgs(comb)
    else:
        self.possible_ifgs = None
    self.alphahull = self.set_alphahull()
    self.segnames = sorted(np.unique(self.prody_pdb.getSegnames()))
    # iFG specific:
    self._ifg_pdb_info = []
    self._ifg_atom_density = []
    self._ifg_contact_water = []
    self._ifg_contact_ligand = []
    self._ifg_contact_metal = []
    # vdM specific:
    self._vdm_pdb_info = []
    self._vdm_sasa_info = []
    self._ifg_contact_vdm = []
    self._ifg_hbond_vdm = []
    self._ifg_hbond_water = []
    self._ifg_hbond_ligand = []
    self._ifg_ca_hbond_vdm = []
def fetch_pdb(data=csv):
    """Fetch the PDB entries referenced by a tab-separated table.

    The table is read from *data*; the first four characters of every value
    in its ``IDs`` column are taken as an identifier, duplicates are dropped,
    and the result of fetching the identifiers is returned.
    """
    table = pd.read_csv(data, sep='\t')
    unique_ids = {entry[:4] for entry in table['IDs'].values}
    return pdy.fetchPDB(list(unique_ids))
def modelLoops(pdbid, chids, alnfile='temp.ali'):
    """Model each requested chain with MODELLER's loop modelling.

    Work happens inside a temporary directory, which is the current working
    directory for the duration of the call; the previous working directory is
    restored afterwards, also when an error occurs. For every chain id in
    *chids* the chain is aligned with the sequence returned by ``getSeqres``
    and a loop model is built, renumbered like the original and parsed. If
    modelling a chain fails, that chain of the original structure is used
    instead.

    :arg pdbid: PDB identifier, fetched with ProDy
    :arg chids: iterable of chain identifiers
    :arg alnfile: name of the alignment file written in the work directory
    :return: the per-chain structures added together, or ``[]`` when *chids*
        is empty
    """
    # NOTE(review): nesting was reconstructed from a flattened source.
    from modeller import environ, model, alignment
    from modeller.automodel import loopmodel
    import os
    import tempfile
    import prody

    prevdir = os.getcwd()
    pdb = []
    with tempfile.TemporaryDirectory() as tmpdir:
        os.chdir(os.path.expanduser(tmpdir))
        try:
            prody.fetchPDB(pdbid)
            e = environ()
            for chid in chids:
                knowns = pdbid + '_' + chid
                sequence = pdbid + '_' + chid + '_full'
                try:
                    # Try to model structure
                    aln = alignment(e)
                    m = model(e, file=pdbid,
                              model_segment=('FIRST:' + chid, 'LAST:' + chid))
                    aln.append_model(m, atom_files=pdbid, align_codes=knowns)
                    aln.append_sequence(getSeqres(pdbid, chid)[0])
                    aln[-1].code = sequence
                    aln.malign()
                    aln.write(file=alnfile, alignment_format='PIR')
                    a = loopmodel(e, alnfile=alnfile, knowns=knowns,
                                  sequence=sequence)
                    a.make()
                    pdbfile = a.outputs[0]['name']
                    h = model(e, file=pdbfile)
                    aln = alignment(e)
                    aln.append_model(m, atom_files=pdbid, align_codes=knowns)
                    aln.append_model(h, atom_files=pdbfile,
                                     align_codes=sequence)
                    # Restore old residue numbering and chain indexing
                    h.res_num_from(m, aln)
                    h.write(file=pdbfile)
                    if not pdb:
                        pdb = prody.parsePDB(pdbfile)
                    else:
                        pdb = pdb + prody.parsePDB(pdbfile)
                except Exception:
                    # If it fails, return original PDB file. Likely the
                    # original file has no gaps (i.e. NOT EFFICIENT).
                    # Exception, not a bare except, so that Ctrl-C and
                    # interpreter exit are not swallowed.
                    print('PDB %s chain %s could not be modelled' % (pdbid, chid))
                    ref = prody.parsePDB(pdbid)
                    sel = 'chain %s' % chid
                    atom = ref.select(sel)
                    reffile = knowns + '.pdb'
                    prody.writePDB(reffile, atom)
                    if not pdb:
                        pdb = prody.parsePDB(reffile)
                    else:
                        pdb = pdb + prody.parsePDB(reffile)
        finally:
            # Leave the temporary directory before it is removed, even when
            # an error is propagating.
            os.chdir(prevdir)
    return pdb
def align():
    """Dock every ligand under ``<wd>/challengedata`` and log the results.

    Steps visible in the code:

    * make sure ``<wd>/challengedata/answers`` exists and clone the
      ``rdkit-scripts`` repository when ``<wd>/challengedata/rdkit-scripts``
      is missing;
    * for each entry ``x`` of ``challengedata`` (bookkeeping names skipped)
      and each entry ``y`` inside it (readme/tsv names skipped, as are
      targets whose ``lmcss_docked.sdf`` already exists under ``answers``):
      extract the ATOM records of the ``LMCSS*.pdb`` receptor, convert the
      ``.smi`` file to ``lig.sdf`` with ``rdconf.py``, dock with ``smina``,
      run ``obrms`` and append a line to ``visual.txt``.

    The function changes the process working directory repeatedly and relies
    on the module-level ``wd``.

    NOTE(review): the nesting below was reconstructed from a flattened
    source, and the ``##`` lines are taken to be commented-out code; confirm
    both against the upstream file before relying on this layout.
    """
    global wd
    ans = wd + '/challengedata/answers'
    if os.path.isdir(ans) == False:  #if the answers directory isnt formed make it
        os.mkdir(wd + '/challengedata/answers')
    rddir = wd + '/challengedata/rdkit-scripts'
    if os.path.isdir(rddir) == False:
        # The clone runs in whatever the current working directory is.
        a = 'git clone https://github.com/dkoes/rdkit-scripts'
        os.system(a)
    data = os.listdir(wd + '/challengedata')
    for x in (data):  #for each weeks data
        if x == "readme.txt" or x == "latest.txt" or x == "answers" or x == "rdkit-scripts" or x == 'PDBfiles' or x == 'visual.txt':
            pass
        else:
            toDir = wd + '/challengedata/answers/' + x
            if os.path.isdir(toDir) == False:  #if the path to answers dir doesnt exist
                os.mkdir(toDir)  #make directory
            dock = os.listdir(wd + '/challengedata/' + x)
            for y in (dock):
                # Result file for this target; its existence means "done".
                # The path is built from the current working directory.
                a = str(os.getcwd() + '/answers/' + x + '/' + y +
                        '/lmcss_docked.sdf')
                if y == 'readme.txt' or y == 'new_release_structure_sequence_canonical.tsv' or y == 'new_release_structure_nonpolymer.tsv' or y == 'new_release_crystallization_pH.tsv' or y == 'new_release_structure_sequence.tsv':
                    pass
                elif (os.path.isfile(a) == True):
                    pass
                else:
                    input = os.listdir(wd + '/challengedata/' + x + '/' + y)
                    for z in (input):
                        if z.startswith("LMCSS") and z.endswith(".pdb"):
                            if (z.endswith("lig.pdb")):
                                pass
                            else:
                                # NOTE(review): str.strip('.pdb') removes any
                                # of the characters . p d b from both ends,
                                # not the literal suffix -- confirm intended.
                                id = z.strip('.pdb')
                                sts = str("grep ATOM " + z + " > lmcss_rec.pdb")  #creates receptor .pdb file
                                cd = wd + '/challengedata'
                                os.chdir(cd + '/' + x + '/' + y)  #change directory to week/ligand
                                os.system(sts)  #runs and creates receptor .pdb file
                                os.chdir(cd)  #back to challenge directory
                    # NOTE(review): `cd` and `id` are only bound if a
                    # receptor file was found in the loop above.
                    input = os.listdir(cd + '/' + x + '/' + y)  #lists files inside ligand in certain week
                    for z in (input):
                        if z.endswith(".smi"):  # changes .smi -> lig.sdf
                            cd = str(os.getcwd())
                            sts = str(" " + cd + '/' + x + '/' + y + '/' + z +
                                      " lig.sdf --maxconfs 1")
                            os.chdir(cd + '/' + x + '/' + y)
                            os.system(cd + '/rdkit-scripts/rdconf.py' + sts)
                            os.chdir(cd)
                    for z in (input):  # runs smina
                        if z.endswith("lig.pdb"):
                            sts = str(
                                "smina -r lmcss_rec.pdb -l lig.sdf --autobox_ligand "
                                + z + " -o " + id + "_docked.sdf")
                            cd = str(os.getcwd())  #ligand directory
                            os.chdir(cd + '/' + x + '/' + y)
                            #os.system(sts)
                            sts = str(
                                "smina -r lmcss_rec.pdb -l lig.sdf --autobox_ligand "
                                + z + " -o lmcss_docked.sdf")
                            # NOTE(review): `cd` is re-read after the chdir
                            # above, so the next chdir appends x/y a second
                            # time -- verify this path exists.
                            cd = str(os.getcwd())  #ligand directory
                            os.chdir(cd + '/' + x + '/' + y)
                            os.system(sts)
                            os.chdir(cd)
                    cur = str(os.getcwd() + '/answers/' + x + '/' + y)
                    if (os.path.isdir(cur) == True):
                        os.chdir(cd + '/' + x + '/' + y)
                        os.getcwd()
                        ## input = os.listdir(cd + '/' + x + '/' + y)
                        for i in (input):
                            if i.endswith(".txt") and i != "center.txt" and i != "visual.txt":
                                # The third line of the text file carries the
                                # ligand name after a "ligand, " prefix.
                                f = open(i)
                                lines = f.readlines()
                                ligand = lines[2].strip('ligand, ')
                                ligand = ligand.replace('\n', '')
                                ligand = str(ligand)  #gets the ligand from txt file
                            if i.endswith("lig.pdb"):  #see if pdb exists
                                prody.fetchPDB(y)
                                proteinPDB = prody.parsePDB(y)
                                ourPDB = prody.parsePDB('lmcss_rec.pdb')
                                a, b, seqid, overlap = prody.matchChains(
                                    proteinPDB, ourPDB)[0]
                                b, protein_sp = prody.superpose(b, a,
                                                                weights=None)
                                # NOTE(review): the selection result is
                                # discarded.
                                b.select(ligand + '_ligand.pdb')
                                sts = str("obrms -f " + i + ' ' + id +
                                          "_docked.sdf")  #run obrms
                                # parse results and output to the visualization txt file
                                os.system(sts)
                                f = open('visual.txt', 'ab+')
                                f.write(x + ' smina ' + y + '\n')
                                # NOTE(review): `f.close` lacks parentheses,
                                # so the file is not closed here.
                                f.close
                                curdir = str(cd + '/' + x + '/' + y + '/' +
                                             id + '_docked.sdf')
                            print(input)
                            ## for i in (input):
                            if i.endswith("lig.pdb"):  #see if pdb exists
                                protein = prody.fetchPDB(y)
                                #NEED NUMPY ARRAY
                                # NOTE(review): `array` is not defined in
                                # this function.
                                prody.writeArray('lmcss_docked_array.sdf',
                                                 array)
                                prody.superpose('lmcss_docked.sdf', protein,
                                                weights=None)
                                sts = str("obrms -f " + i +
                                          " lmcss_docked.sdf")  #run obrms
                                # parse results and output to the visualization txt file
                                os.system(sts)
                                os.chdir(wd + '/challengedata/')
                                f = open('visual.txt', 'ab+')
                                f.write(x + ' smina ' + y + '\n')
                                f.close
                                # Copy the docked pose into the answers tree.
                                curdir = str(cd + '/' + x + '/' + y +
                                             '/lmcss_docked.sdf')
                                todir = str(cd + '/answers/' + x + '/' + y +
                                            '/')
                                shutil.copy(curdir, todir)
                                print(curdir)
                                break
                        os.chdir(wd)
                    else:
                        os.mkdir(cur)
                        os.chdir(cd + '/' + x + '/' + y)
                        input = os.listdir(cd + '/' + x + '/' + y)
                        for i in (input):
                            if i.endswith(".txt") and i != "center.txt" and i != "visual.txt":
                                f = open(i)
                                lines = f.readlines()
                                ligand = lines[2].strip("ligand, ")
                                ligand = ligand.replace('\n', '')
                                ligand = str(ligand)  #gets ligand from txt file
                            if i.endswith("lig.pdb"):
                                prody.fetchPDB(y)
                                proteinPDB = prody.parsePDB(y)
                                ourPDB = prody.parsePDB('lmcss_rec.pdb')
                                prody.matchChains(proteinPDB, ourPDB)
                                protein_sp = prody.superpose(ourPDB,
                                                             proteinPDB,
                                                             weights=None)
                                protein_sp.select(ligand + '_ligand.pdb')
                                sts = str("obrms -f " + i + ' ' + id +
                                          "_docked.sdf")
                                os.system(sts)
                                f = open('visual.txt', 'ab+')
                                f.write(x + ' smina ' + y + '\n')
                                f.close
                                curdir = str(cd + '/' + x + '/' + y + '/' +
                                             id + '_docked.sdf')
                            if i.endswith("lig.pdb"):
                                protein = prody.fetchPDB(y)
                                # NOTE(review): `array` is not defined in
                                # this function.
                                prody.writeArray('lmcss_docked_array.sdf',
                                                 array)
                                prody.superpose('lmcss_docked.sdf', protein,
                                                weights=None)
                                sts = str("obrms -f " + i +
                                          " lmcss_docked.sdf")
                                os.system(sts)
                                os.chdir(wd + '/challengedata/')
                                f = open('visual.txt', 'ab+')
                                f.write(x + ' smina ' + y + '\n')
                                f.close
                                curdir = str(cd + '/' + x + '/' + y +
                                             '/lmcss_docked.sdf')
                                todir = str(cd + '/answers/' + x + '/' + y +
                                            '/')
                                shutil.copy(curdir, todir)
                                print(curdir)
                                break
                        os.chdir(wd)
def align():
    """Dock every ligand under ``<wd>/challengedata`` and record its scores.

    Steps visible in the code:

    * make sure ``<wd>/challengedata/answers`` exists and clone the
      ``rdkit-scripts`` repository when ``<wd>/challengedata/rdkit-scripts``
      is missing;
    * for each entry ``x`` of ``challengedata`` (bookkeeping names skipped)
      and each entry ``y`` inside it (readme/tsv names skipped, as are
      targets whose ``lmcss_docked.sdf`` already exists under ``answers``):
      extract the ATOM records of the ``LMCSS*.pdb`` receptor, convert the
      ``.smi`` file to ``lig.sdf`` with ``rdconf.py``, dock with ``smina``,
      then capture the output of ``sdsorter`` and ``obrms`` and append one
      line to ``visual.txt``.

    The function changes the process working directory repeatedly, relies on
    the module-level ``wd`` and uses Python 2 ``print`` statements.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm it against the upstream file.
    """
    global wd
    ans = wd + '/challengedata/answers'
    if os.path.isdir(ans) == False:  #if the answers directory isnt formed make it
        os.mkdir(wd + '/challengedata/answers')
    rddir = wd + '/challengedata/rdkit-scripts'
    if os.path.isdir(rddir) == False:
        # The clone runs in whatever the current working directory is.
        a = 'git clone https://github.com/dkoes/rdkit-scripts'
        os.system(a)
    data = os.listdir(wd + '/challengedata')
    for x in (data):  #for each weeks data
        if x == "readme.txt" or x == "latest.txt" or x == "answers" or x == "rdkit-scripts" or x == 'PDBfiles' or x == 'visual.txt':
            pass
        else:
            toDir = wd + '/challengedata/answers/' + x
            if os.path.isdir(toDir) == False:  #if the path to answers dir doesnt exist
                os.mkdir(toDir)  #make directory
            dock = os.listdir(wd + '/challengedata/' + x)
            for y in (dock):
                # Result file for this target; its existence means "done".
                # The path is built from the current working directory.
                a = str(os.getcwd() + '/answers/' + x + '/' + y +
                        '/lmcss_docked.sdf')
                if y == 'readme.txt' or y == 'new_release_structure_sequence_canonical.tsv' or y == 'new_release_structure_nonpolymer.tsv' or y == 'new_release_crystallization_pH.tsv' or y == 'new_release_structure_sequence.tsv':
                    pass
                elif (os.path.isfile(a) == True):
                    pass
                else:
                    input = os.listdir(wd + '/challengedata/' + x + '/' + y)
                    for z in (input):
                        if z.startswith("LMCSS") and z.endswith(".pdb"):
                            if (z.endswith("lig.pdb")):
                                pass
                            else:
                                # Keep only the ATOM records as the receptor.
                                sts = str("grep ATOM " + z + " > lmcss_rec.pdb")
                                cd = wd + '/challengedata'
                                os.chdir(cd + '/' + x + '/' + y)
                                os.system(sts)
                                os.chdir(cd)
                    # NOTE(review): `cd` is only bound if a receptor file was
                    # found in the loop above.
                    input = os.listdir(cd + '/' + x + '/' + y)
                    for z in (input):
                        if z.endswith(".smi"):
                            # Convert the SMILES file to lig.sdf.
                            cd = str(os.getcwd())
                            sts = str(" " + cd + '/' + x + '/' + y + '/' + z +
                                      " lig.sdf --maxconfs 1")
                            os.chdir(cd + '/' + x + '/' + y)
                            os.system(cd + '/rdkit-scripts/rdconf.py' + sts)
                            os.chdir(cd)
                    input = os.listdir(cd + '/' + x + '/' + y)
                    for z in (input):
                        if z.endswith("lig.pdb"):
                            # Dock lig.sdf, boxing the search around the
                            # reference ligand file.
                            sts = str(
                                "smina -r lmcss_rec.pdb -l lig.sdf --autobox_ligand "
                                + z + " -o lmcss_docked.sdf")
                            cd = str(os.getcwd())
                            os.chdir(cd + '/' + x + '/' + y)
                            os.system(sts)
                            os.chdir(cd)
                    cur = str(os.getcwd() + '/answers/' + x + '/' + y)
                    if (os.path.isdir(cur) == True):
                        os.chdir(cd + '/' + x + '/' + y)
                        input = os.listdir(cd + '/' + x + '/' + y)
                        for i in (input):
                            if i.endswith("lig.pdb"):  #see if pdb exists
                                protein = prody.fetchPDB(y)
                                # Append the sdsorter output to a file, then
                                # read the file back; the second line is
                                # parsed for the value stored in bind[0].
                                f = open('sdsorted.txt', 'ab+')
                                bind = subprocess.check_output(
                                    'sdsorter lmcss_docked.sdf -print',
                                    shell=True)
                                f.write(bind)
                                f.close()
                                k = open('sdsorted.txt')
                                lines = k.readlines()
                                bind = lines[1].strip('1 ')
                                bind = bind.split(" ", 1)
                                print(bind[0])
                                k.close()
                                # Same append-and-read-back pattern for the
                                # obrms output.
                                sts = str("obrms -f " + i +
                                          " lmcss_docked.sdf")
                                f = open('rmsd.txt', 'ab+')
                                rm = subprocess.check_output(sts, shell=True)
                                f.write(rm)
                                f.close()
                                j = open('rmsd.txt')
                                lines = j.readlines()
                                top = lines[1].strip('RMSD : ')
                                top = top.replace('\n', '')
                                j.close()
                                print top
                                #run obrms
                                # parse results and output to the visualization txt file
                                #os.system(sts)
                                f = open('visual.txt', 'ab+')
                                f.write(x + ' smina ' + y + ' ' + top + ' ' +
                                        bind[0] + '\n')
                                # NOTE(review): `f.close` lacks parentheses,
                                # so the file is not closed here.
                                f.close
                                os.chdir(wd + '/challengedata/')
                                print(x + ' ' + y)
                                break
                        os.chdir(wd)
                    else:
                        os.mkdir(cur)
                        os.chdir(cd + '/' + x + '/' + y)
                        input = os.listdir(cd + '/' + x + '/' + y)
                        for i in (input):
                            if i.endswith("lig.pdb"):
                                protein = prody.fetchPDB(y)
                                f = open('sdsorted.txt', 'ab+')
                                bind = subprocess.check_output(
                                    'sdsorter lmcss_docked.sdf -print',
                                    shell=True)
                                f.write(bind)
                                f.close()
                                k = open('sdsorted.txt')
                                lines = k.readlines()
                                bind = lines[1].strip('1 ')
                                bind = bind.split(" ", 1)
                                print(bind[0])
                                k.close()
                                sts = str("obrms -f " + i +
                                          " lmcss_docked.sdf")
                                f = open('rmsd.txt', 'ab+')
                                rm = subprocess.check_output(sts, shell=True)
                                f.write(rm)
                                f.close()
                                j = open('rmsd.txt')
                                lines = j.readlines()
                                top = lines[1].strip('RMSD : ')
                                top = top.replace('\n', '')
                                print top
                                j.close()
                                #os.system(sts)
                                f = open('visual.txt', 'ab+')
                                f.write(x + ' smina ' + y + ' ' + top + ' ' +
                                        bind[0] + '\n')
                                f.close()
                                os.chdir(wd + '/challengedata/')
                                print(x + ' ' + y)
                                break
                        os.chdir(wd)