def get_pdb_template(
    self,
    sequence,
):
    """
    Retrieve candidate template structures from the PDB via a BLAST search.

    Parameters
    ----------
    sequence: str
        A string of the protein sequence

    Returns
    -------
    hits: dict
        A dictionary generated from ProDy with PDB hits.

    Raises
    ------
    IOError
        If the BLAST search did not produce a usable result (for example
        because it timed out).
    """
    # Imported locally, as in the original, so ProDy is only required when
    # this method is actually called.
    from prody import blastPDB

    blast_record = blastPDB(sequence)

    # NOTE(review): depending on the ProDy release, a failed/timed-out search
    # is signalled either by a ``None`` return value or by a record whose
    # ``isSuccess`` attribute is false. Both are treated as failure here;
    # records without that attribute are assumed to be successful.
    if blast_record is None or not getattr(blast_record, "isSuccess", True):
        raise IOError("blast search timed out, please try again")

    # TODO add option based on sequence similarity cut off
    return blast_record.getHits()
def prody_blast(opt):
    """Blast search PDB based on command line arguments.

    *opt* is the parsed command line namespace.  The attributes read here
    are ``seq`` (a sequence string or the path of a FASTA file), ``identity``
    and ``coverage`` (percent cutoffs, each strictly between 0 and 100),
    ``folder`` (optional download directory), ``gzip`` (passed to
    :func:`prody.fetchPDB` as *compressed*) and ``subparser`` (used to report
    invalid arguments).

    Hits are printed one per line, sorted by decreasing percent identity,
    and downloaded to ``opt.folder`` when it is set.

    :raises IOError: when the BLAST search yields no usable result"""

    import prody
    LOGGER = prody.LOGGER

    seq = opt.seq
    if os.path.isfile(seq):
        # Keep the path: *seq* is rebound to the parsed sequence below, and
        # the log message must name the file, not the sequence.
        fasta_path = seq
        title, seq = readFirstSequenceFasta(fasta_path)
        LOGGER.info("First sequence ({0:s}) is parsed from {1:s}."
                    .format(title, repr(fasta_path)))
    if not seq.isalpha() or not seq.isupper():
        opt.subparser.error("{0:s} is not a valid sequence or a file"
                            .format(repr(seq)))

    identity, coverage = opt.identity, opt.coverage
    if not 0 < identity < 100:
        opt.subparser.error('identity must be between 0 and 100')
    if not 0 < coverage < 100:
        opt.subparser.error('overlap must be between 0 and 100')

    blast_results = prody.blastPDB(seq)
    # NOTE(review): a failed search is signalled by ``None`` or by a false
    # ``isSuccess`` attribute depending on the ProDy release -- confirm
    # against the installed version.
    if blast_results is None or not getattr(blast_results, 'isSuccess', True):
        raise IOError('blast search timed out, please try again')
    hits = blast_results.getHits(percent_identity=identity,
                                 percent_coverage=coverage)

    # Sort by decreasing percent identity; ties fall back to the hit key,
    # exactly as sorting (-identity, key) tuples would.
    ranked = sorted(hits,
                    key=lambda pdb: (-hits[pdb]['percent_identity'], pdb))

    for pdb in ranked:
        hit = hits[pdb]
        print(pdb + ' ' + hit['chain_id'] + ' ' +
              ('%5.1f%%' % (hit['percent_identity'])) + ' ' + hit['title'])

    # download hits if --folder is given
    if opt.folder:
        LOGGER.info('Downloading hits to ' + opt.folder)
        prody.fetchPDB(ranked, opt.folder, compressed=opt.gzip, copy=True)
def get_PDBs(pdbdir, output_root='/kortemmelab/home/james.lucas/Mutant_PDBs'):
    """BLAST every ``.pdb`` file in *pdbdir* and download the shared hits.

    For each ``*.pdb`` file a directory named after the file (without the
    extension) is created under *output_root*.  If that directory already
    exists the file is skipped.  Otherwise each sequence produced by
    ``generate_mut_seq`` is BLASTed against the PDB (95 percent identity,
    600 second timeout), the hits common to all sequences are downloaded
    into the new directory, and ``alignments`` is run on them.

    :arg pdbdir: directory containing the input PDB files
    :arg output_root: directory under which one output folder per input
        structure is created; defaults to the original hard-coded location

    :raises OSError: if an output directory cannot be created for a reason
        other than already existing
    """
    for pdb_file in os.listdir(pdbdir):
        if not pdb_file.endswith('.pdb'):
            continue

        print("\n***Now BLASTing %s***" % pdb_file)
        stem = pdb_file[:-4]
        output_dir = os.path.join(output_root, stem)

        # The skip decision is made per file.  A flag shared across
        # iterations would make one existing directory skip every
        # structure that follows it.
        try:
            os.makedirs(output_dir)
        except OSError:
            if not os.path.isdir(output_dir):
                # Not the "already exists" case (e.g. permissions).
                raise
            print('File directory %s already exists!!!' % stem)
            print('Blast skipped!')
            continue

        hv = prody.parsePDB(os.path.join(pdbdir, pdb_file)).getHierView()
        # presumably a mapping of identifier -> sequence; verify against
        # generate_mut_seq
        blast_dict = generate_mut_seq(pdbdir, hv)

        hit_sets = []
        for key in blast_dict:
            blast_record = prody.blastPDB(blast_dict[key], timeout=600)
            hit_sets.append(set(blast_record.getHits(percent_identity=95)))

        # Only hits found for every BLASTed sequence are kept; with no
        # sequences at all there is nothing in common.
        common_set = set.intersection(*hit_sets) if hit_sets else set()

        for hit in common_set:
            rcsb.download_pdb(hit, output_dir)
        alignments(pdb_file, common_set, pdbdir)
def prody_blast(sequence, **kwargs):
    """Blast search PDB and download hits.

    :arg sequence: sequence or file in fasta format

    :arg identity: percent sequence identity for blast search, default is 90.0
        (must be strictly between 0 and 100)
    :type identity: float

    :arg overlap: percent sequence overlap between sequences, default is 90.0
        (must be strictly between 0 and 100)
    :type overlap: float

    :arg outdir: download uncompressed PDB files to given directory
    :type outdir: str

    :arg gzip: write compressed PDB file

    :arg stdout: when true, write one line per hit to standard output and
        return **None**; otherwise (default) return the hits as a list
    :type stdout: bool

    *Blast Parameters*

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    :arg hitlist_size: search parameters, default is 250
    :type hitlist_size: int

    :arg expect: search parameters, default is 1e-10
    :type expect: float

    :arg sleep: how long to wait to reconnect for results, default is 2
        sleep time is doubled when results are not ready.
    :type sleep: int

    :arg timeout: when to give up waiting for results. default is 30
    :type timeout: int

    :returns: list of ``(pdb, chain, percent_identity_text, title)`` tuples
        sorted by decreasing percent identity, or **None** when *stdout* is
        true

    :raises ValueError: for an invalid sequence, *identity* or *overlap*
    :raises IOError: when the blast search yields no usable result"""

    import sys

    import prody
    LOGGER = prody.LOGGER

    if os.path.isfile(sequence):
        # Keep the path: *sequence* is rebound to the parsed sequence, and
        # the log message must name the file, not the sequence.
        fasta_path = sequence
        title, sequence = readFirstSequenceFasta(fasta_path)
        LOGGER.info("First sequence ({0}) is parsed from {1}."
                    .format(title, repr(fasta_path)))
    if not sequence.isalpha() or not sequence.isupper():
        raise ValueError("{0} is not a valid sequence or a file"
                         .format(repr(sequence)))

    outdir = kwargs.get('outdir')
    identity = kwargs.get('identity', 90)
    overlap = kwargs.get('overlap', 90)
    if not 0 < identity < 100:
        raise ValueError('identity must be between 0 and 100')
    if not 0 < overlap < 100:
        raise ValueError('overlap must be between 0 and 100')

    blast_results = prody.blastPDB(sequence,
                                   filename=kwargs.get('filename', None),
                                   hitlist_size=kwargs.get('hitlist_size', 250),
                                   expect=kwargs.get('expect', 1e-10),
                                   sleep=kwargs.get('sleep', 2),
                                   timeout=kwargs.get('timeout', 30))

    # NOTE(review): a failed search is signalled by ``None`` or by a false
    # ``isSuccess`` attribute depending on the ProDy release; accept both.
    if blast_results is None or not getattr(blast_results, 'isSuccess', True):
        raise IOError('blast search timed out, please try again')

    hits = blast_results.getHits(percent_identity=identity,
                                 percent_overlap=overlap)

    # Sort by decreasing percent identity; ties fall back to the hit key,
    # exactly as sorting (-identity, key) tuples would.
    ranked = sorted(hits,
                    key=lambda pdb: (-hits[pdb]['percent_identity'], pdb))

    to_stdout = kwargs.get('stdout', False)
    final_hits = []
    for pdb in ranked:
        hit = hits[pdb]
        chain = hit['chain_id']
        shown_identity = '%5.1f%%' % (hit['percent_identity'])
        if to_stdout:
            # One hit per line; without the newline all hits run together.
            sys.stdout.write(pdb + ' ' + chain + ' ' + shown_identity +
                             ' ' + hit['title'] + '\n')
        else:
            final_hits.append((pdb, chain, shown_identity, hit['title']))

    # download hits if --output-dir is given
    if outdir:
        LOGGER.info('Downloading hits to ' + outdir)
        prody.fetchPDB(ranked, outdir,
                       compressed=kwargs.get('gzip'), copy=True)

    if to_stdout:
        return None
    return final_hits
def prody_blast(sequence, **kwargs):
    """Blast search PDB and download hits.

    :arg sequence: sequence or file in fasta format

    :arg identity: percent sequence identity for blast search, default is 90.0
        (must be strictly between 0 and 100)
    :type identity: float

    :arg overlap: percent sequence overlap between sequences, default is 90.0
        (must be strictly between 0 and 100)
    :type overlap: float

    :arg outdir: download uncompressed PDB files to given directory
    :type outdir: str

    :arg gzip: write compressed PDB file

    :arg stdout: when true, write one line per hit to standard output and
        return **None**; otherwise (default) return the hits as a list
    :type stdout: bool

    *Blast Parameters*

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    :arg hitlist_size: search parameters, default is 250
    :type hitlist_size: int

    :arg expect: search parameters, default is 1e-10
    :type expect: float

    :arg sleep: how long to wait to reconnect for results, default is 2
        sleep time is doubled when results are not ready.
    :type sleep: int

    :arg timeout: when to give up waiting for results. default is 30
    :type timeout: int

    :returns: list of ``(pdb, chain, percent_identity_text, title)`` tuples
        sorted by decreasing percent identity, or **None** when *stdout* is
        true

    :raises ValueError: for an invalid sequence, *identity* or *overlap*
    :raises IOError: when the blast search yields no usable result"""

    import sys

    import prody
    LOGGER = prody.LOGGER

    if os.path.isfile(sequence):
        # Remember where the sequence came from before the name is reused
        # for the parsed residues, so the log reports the file path.
        source_file = sequence
        title, sequence = readFirstSequenceFasta(source_file)
        LOGGER.info("First sequence ({0}) is parsed from {1}.".format(
            title, repr(source_file)))
    if not sequence.isalpha() or not sequence.isupper():
        raise ValueError("{0} is not a valid sequence or a file".format(
            repr(sequence)))

    outdir = kwargs.get('outdir')
    identity = kwargs.get('identity', 90)
    overlap = kwargs.get('overlap', 90)
    if not 0 < identity < 100:
        raise ValueError('identity must be between 0 and 100')
    if not 0 < overlap < 100:
        raise ValueError('overlap must be between 0 and 100')

    blast_results = prody.blastPDB(
        sequence,
        filename=kwargs.get('filename', None),
        hitlist_size=kwargs.get('hitlist_size', 250),
        expect=kwargs.get('expect', 1e-10),
        sleep=kwargs.get('sleep', 2),
        timeout=kwargs.get('timeout', 30))

    # Guard against a ``None`` result as well as a record flagged as
    # unsuccessful, so a failed search is always reported as IOError.
    # NOTE(review): which of the two occurs depends on the ProDy release.
    if blast_results is None or not getattr(blast_results, 'isSuccess', True):
        raise IOError('blast search timed out, please try again')

    hits = blast_results.getHits(percent_identity=identity,
                                 percent_overlap=overlap)

    # Decreasing percent identity, ties broken by the hit key -- the same
    # order as sorting (-identity, key) tuples.
    ordered = sorted(hits,
                     key=lambda pdb: (-hits[pdb]['percent_identity'], pdb))

    write_to_stdout = kwargs.get('stdout', False)
    final_hits = []
    for pdb in ordered:
        record = hits[pdb]
        chain = record['chain_id']
        identity_text = '%5.1f%%' % (record['percent_identity'])
        if write_to_stdout:
            # Terminate each hit with a newline so hits do not run together.
            sys.stdout.write(pdb + ' ' + chain + ' ' + identity_text +
                             ' ' + record['title'] + '\n')
        else:
            final_hits.append((pdb, chain, identity_text, record['title']))

    # download hits if --outdir is given
    if outdir:
        LOGGER.info('Downloading hits to ' + outdir)
        prody.fetchPDB(ordered, outdir,
                       compressed=kwargs.get('gzip'), copy=True)

    if write_to_stdout:
        return None
    return final_hits