def parseResults(blastResultsPath, limitHits=MAX_HITS):
    '''
    Parse tabular blast results into the top hits for each query sequence.

    blastResultsPath: path of a whitespace-delimited tabular results file.
      Field 0 of each line is read as the query sequence name, field 1 as the
      subject (hit) sequence name, and field 10 as the evalue.
    limitHits: maximum number of hits stored per query sequence.  A false
      value (e.g. None or 0) stores every hit.
    returns: a map from query seq id to a list of tuples of
      (subject seq id, evalue) for the top hits of the query sequence in the
      subject genome.

    A line that cannot be parsed is logged (with its traceback) and skipped,
    so it can never contribute a hit built from a previous line's values.
    '''
    hitsMap = {}
    prevSeqId = None
    prevHitId = None
    # the with-statement closes the file even if an exception escapes the loop.
    with open(blastResultsPath) as fh:
        for line in fh:
            splits = line.split()
            try:
                seqId = fasta.idFromName(splits[0])  # remove namespace prefix, e.g. 'gi|'
                hitId = fasta.idFromName(splits[1])
                hitEvalue = float(splits[10])
            except Exception:
                logging.exception('parseResults(): prevSeqId: {}, prevHitId: {}, line: {}'.format(prevSeqId, prevHitId, line))
                # do not fall through: seqId, hitId or hitEvalue would be
                # stale (or unbound on the first line of the file).
                continue
            # results table reports multiple "alignments" per "hit" in
            # ascending order by evalue.  we only store the top hits, so only
            # the first alignment of each (query, hit) pair is considered.
            if prevSeqId != seqId or prevHitId != hitId:
                prevSeqId = seqId
                prevHitId = hitId
                hits = hitsMap.setdefault(seqId, [])
                if not limitHits or len(hits) < limitHits:
                    hits.append((hitId, hitEvalue))
    return hitsMap
def parseResults(blastResultsPath, limitHits=MAX_HITS):
    '''
    Parse tabular blast results into the top hits for each query sequence.

    blastResultsPath: path of a whitespace-delimited tabular results file.
      Field 0 of each line is read as the query sequence name, field 1 as the
      subject (hit) sequence name, and field 10 as the evalue.
    limitHits: maximum number of hits stored per query sequence.  A false
      value (e.g. None or 0) stores every hit.
    returns: a map from query seq id to a list of tuples of
      (subject seq id, evalue) for the top hits of the query sequence in the
      subject genome.

    A line that cannot be parsed is logged (with its traceback) and skipped,
    so it can never contribute a hit built from a previous line's values.
    '''
    hitsMap = {}
    prevSeqId = None
    prevHitId = None
    # the with-statement closes the file even if an exception escapes the loop.
    with open(blastResultsPath) as fh:
        for line in fh:
            splits = line.split()
            try:
                seqId = fasta.idFromName(
                    splits[0])  # remove namespace prefix, e.g. 'gi|'
                hitId = fasta.idFromName(splits[1])
                hitEvalue = float(splits[10])
            except Exception:
                logging.exception(
                    'parseResults(): prevSeqId: {}, prevHitId: {}, line: {}'.
                    format(prevSeqId, prevHitId, line))
                # do not fall through: seqId, hitId or hitEvalue would be
                # stale (or unbound on the first line of the file).
                continue
            # results table reports multiple "alignments" per "hit" in
            # ascending order by evalue.  we only store the top hits, so only
            # the first alignment of each (query, hit) pair is considered.
            if prevSeqId != seqId or prevHitId != hitId:
                prevSeqId = seqId
                prevHitId = hitId
                hits = hitsMap.setdefault(seqId, [])
                if not limitHits or len(hits) < limitHits:
                    hits.append((hitId, hitEvalue))
    return hitsMap
def getGoodDivergenceAlignedTrimmedSeqPair(seqId, seq, hitSeqId, hitSeq, workPath):
    '''
    Align a sequence to its hit, then trim both aligned sequences.

    seqId, seq: id and sequence of the query.
    hitSeqId, hitSeq: id and sequence of the hit.
    workPath: passed to the clustalw aligner when USE_CLUSTALW is set.
    returns: a 3-tuple of the (id, aligned trimmed sequence) pair for the
      query, the same pair for the hit, and a predicate function that, given
      a divergence threshold, says whether the divergence of the sequences
      exceeds the threshold.  e.g.
      ((seqId, alignedTrimmedSeq), (hitSeqId, alignedTrimmedHitSeq), divergencePredicateFunc)
    '''
    # ALIGN SEQ AND HIT
    # the sequences must be aligned so the rate of evolution per site can be studied.
    inputFasta = '>%s\n%s\n>%s\n%s\n' % (seqId, seq, hitSeqId, hitSeq)
    if USE_CLUSTALW:
        alignedFasta = alignFastaClustalw(inputFasta, workPath)
    else:
        alignedFasta = alignFastaKalign(inputFasta)
        # fasta alignment fails rarely and intermittently: sleep briefly and retry once.
        if not alignedFasta:
            logging.error('fasta alignment failed.\ninputFasta=%s\n' +
                          'alignedFasta=%s\nSleep and retry alignment.',
                          inputFasta, alignedFasta)
            time.sleep(0.1)
            alignedFasta = alignFastaKalign(inputFasta)

    # PARSE THE ALIGNMENT into exactly two (id, aligned sequence) pairs.
    try:
        parsedRecords = list(fasta.readFasta(cStringIO.StringIO(alignedFasta)))
        alignedPairs = [(fasta.idFromName(nameline), alignedSeq)
                        for nameline, alignedSeq in parsedRecords]
        alignedIdAndSeq, alignedHitIdAndSeq = alignedPairs
    except Exception as e:
        # attach the aligner input and output to the error to aid debugging.
        e.args += (inputFasta, alignedFasta)
        raise

    # CHECK FOR EXCESSIVE DIVERGENCE AND TRIMMING
    # rank the two aligned sequences by dash (gap) count, fewest dashes first.
    rankedByDashes = sorted([
        (alignedSeq.count('-'),
         alignedSeq.count('-') / float(len(alignedSeq)),
         alignedId,
         alignedSeq)
        for alignedId, alignedSeq in (alignedIdAndSeq, alignedHitIdAndSeq)
    ])
    leastDivergedDiv = rankedByDashes[0][1]
    leastDivergedSeq = rankedByDashes[0][3]
    mostDivergedSeq = rankedByDashes[1][3]
    # the trim offsets are computed from the sequence with the most dashes.
    startTrim, endTrim, trimDivergence = dashlen_check(mostDivergedSeq)

    def divergencePredicate(divergenceThreshold):
        '''Closure over the alignment statistics computed above.  Returns True
        when the alignment of the sequences is too diverged for the given
        threshold, False otherwise.'''
        leastDivergedTooFar = leastDivergedSeq and leastDivergedDiv > divergenceThreshold
        trimmedTooFar = (startTrim or endTrim) and trimDivergence >= divergenceThreshold
        return bool(leastDivergedTooFar or trimmedTooFar)

    # TRIM both aligned sequences with the same offsets.
    trimmedPairs = []
    for alignedId, alignedSeq in (alignedIdAndSeq, alignedHitIdAndSeq):
        trimmedPairs.append((alignedId, alignedSeq[startTrim:(len(alignedSeq) - endTrim)]))
    alignedTrimmedIdAndSeq, alignedTrimmedHitIdAndSeq = trimmedPairs
    return alignedTrimmedIdAndSeq, alignedTrimmedHitIdAndSeq, divergencePredicate
def makeGetSeqForId(genomeFastaPath):
    '''
    Build a lookup function from sequence id to sequence for a genome.

    genomeFastaPath: location of fasta file.  also location/name of blast
      formatted indexes of the fasta file.
    returns: a function taking a sequence id and returning its sequence from
      an in-memory dict (an unknown id raises KeyError).
    '''
    # read the whole fasta file into a dict up front.  an in-memory dict
    # performs much better than on-disk retrieval with xdget or fastacmd, and
    # genome fasta files do not take much space (on a modern computer).
    seqsById = dict(
        (fasta.idFromName(nameline), sequence)
        for nameline, sequence in fasta.readFasta(genomeFastaPath)
    )

    def getSeqForIdInMemory(seqId):
        return seqsById[seqId]

    return getSeqForIdInMemory
def findSeqIdWithFasta(fastaSeq, subjectIndexPath):
    '''
    Blast a fasta sequence against a subject database and return the first hit.

    fastaSeq: fasta text written to a temporary file and used as the blastp query.
    subjectIndexPath: value passed to blastp as its -db argument.
    returns: the id of the first hit in the tabular blastp output, or None if
      the output has no lines.
    '''
    # make the temp path before the try block, so the cleanup in the finally
    # clause never refers to an unbound name and hides the real error.
    path = nested.makeTempPath()
    try:
        util.writeToFile(fastaSeq, path)
        # NOTE(review): the command is built by string interpolation and run
        # with shell=True, so a path containing spaces or shell metacharacters
        # would break it.
        cmd = 'blastp -outfmt 6 -query %s -db %s'%(path, subjectIndexPath)
        results = util.run(cmd, shell=True)
    finally:
        # the file may not exist if writing it failed; removing a missing file
        # would raise and mask the original exception.
        if os.path.exists(path):
            os.remove(path)
    hitId = None
    for line in results.splitlines():
        # example line: foo sp|P39709|SEO1_YEAST 100.00 40 0 0 1 40 1 40 3e-18 84.7
        # the second field is from the hit nameline.
        hitId = fasta.idFromName(line.split()[1])
        break  # grab the first hit
    return hitId
def getGoodDivergenceAlignedTrimmedSeqPair(seqId, seq, hitSeqId, hitSeq, workPath):
    '''
    Align a sequence to its hit, then trim both aligned sequences.

    seqId, seq: id and sequence of the query.
    hitSeqId, hitSeq: id and sequence of the hit.
    workPath: passed to the clustalw aligner when USE_CLUSTALW is set.
    returns: a 3-tuple of the (id, aligned trimmed sequence) pair for the
      query, the same pair for the hit, and a predicate function that, given
      a divergence threshold, says whether the divergence of the sequences
      exceeds the threshold.  e.g.
      ((seqId, alignedTrimmedSeq), (hitSeqId, alignedTrimmedHitSeq), divergencePredicateFunc)
    '''
    # ALIGN SEQ AND HIT
    # the sequences must be aligned so the rate of evolution per site can be studied.
    inputFasta = '>%s\n%s\n>%s\n%s\n' % (seqId, seq, hitSeqId, hitSeq)
    if USE_CLUSTALW:
        alignedFasta = alignFastaClustalw(inputFasta, workPath)
    else:
        alignedFasta = alignFastaKalign(inputFasta)
        # fasta alignment fails rarely and intermittently: sleep briefly and retry once.
        if not alignedFasta:
            logging.error(
                'fasta alignment failed.\ninputFasta=%s\n' +
                'alignedFasta=%s\nSleep and retry alignment.', inputFasta,
                alignedFasta)
            time.sleep(0.1)
            alignedFasta = alignFastaKalign(inputFasta)

    # PARSE THE ALIGNMENT into exactly two (id, aligned sequence) pairs.
    try:
        parsedRecords = list(
            fasta.readFasta(cStringIO.StringIO(alignedFasta)))
        alignedPairs = [(fasta.idFromName(nameline), alignedSeq)
                        for nameline, alignedSeq in parsedRecords]
        alignedIdAndSeq, alignedHitIdAndSeq = alignedPairs
    except Exception as e:
        # attach the aligner input and output to the error to aid debugging.
        e.args += (inputFasta, alignedFasta)
        raise

    # CHECK FOR EXCESSIVE DIVERGENCE AND TRIMMING
    # rank the two aligned sequences by dash (gap) count, fewest dashes first.
    rankedByDashes = sorted([
        (alignedSeq.count('-'),
         alignedSeq.count('-') / float(len(alignedSeq)),
         alignedId,
         alignedSeq)
        for alignedId, alignedSeq in (alignedIdAndSeq, alignedHitIdAndSeq)
    ])
    leastDivergedDiv = rankedByDashes[0][1]
    leastDivergedSeq = rankedByDashes[0][3]
    mostDivergedSeq = rankedByDashes[1][3]
    # the trim offsets are computed from the sequence with the most dashes.
    startTrim, endTrim, trimDivergence = dashlen_check(mostDivergedSeq)

    def divergencePredicate(divergenceThreshold):
        '''Closure over the alignment statistics computed above.  Returns True
        when the alignment of the sequences is too diverged for the given
        threshold, False otherwise.'''
        leastDivergedTooFar = leastDivergedSeq and leastDivergedDiv > divergenceThreshold
        trimmedTooFar = (startTrim or endTrim) and trimDivergence >= divergenceThreshold
        return bool(leastDivergedTooFar or trimmedTooFar)

    # TRIM both aligned sequences with the same offsets.
    trimmedPairs = []
    for alignedId, alignedSeq in (alignedIdAndSeq, alignedHitIdAndSeq):
        trimmedPairs.append(
            (alignedId, alignedSeq[startTrim:(len(alignedSeq) - endTrim)]))
    alignedTrimmedIdAndSeq, alignedTrimmedHitIdAndSeq = trimmedPairs
    return alignedTrimmedIdAndSeq, alignedTrimmedHitIdAndSeq, divergencePredicate