def parseResults(blastResultsPath, limitHits=MAX_HITS):
    '''
    Parse tabular blast results into the top hits of each query sequence.

    blastResultsPath: path to a file of whitespace-delimited results, one alignment per line.
      Field 0 is the query nameline, field 1 is the hit nameline and field 10 is the evalue.
    limitHits: maximum number of hits stored per query sequence.  A false value (None, 0) stores every hit.
    returns: a map from query seq id to a list of tuples of (subject seq id, evalue) for the top hits of the query sequence in the subject genome

    A line that cannot be parsed (too few fields, non-numeric evalue, ...) is logged with its traceback and skipped.
    '''
    hitsMap = {}
    prevSeqId = None
    prevHitId = None
    # 'with' closes the file even when processing a line raises.
    with open(blastResultsPath) as fh:
        for line in fh:
            splits = line.split()
            try:
                seqId = fasta.idFromName(splits[0]) # remove namespace prefix, e.g. 'gi|'
                hitId = fasta.idFromName(splits[1])
                hitEvalue = float(splits[10])
            except Exception:
                logging.exception('parseResults(): prevSeqId: {}, prevHitId: {}, line: {}'.format(prevSeqId, prevHitId, line))
                # Skip the bad line.  Falling through would reuse values left over from an
                # earlier line (or hit unbound names if this is the first line of the file).
                continue
            # results table reports multiple "alignments" per "hit" in ascending order by evalue
            # we only store the top hits, so consecutive lines for the same (query, hit) pair
            # are collapsed into the first one.
            if prevSeqId == seqId and prevHitId == hitId:
                continue
            prevSeqId = seqId
            prevHitId = hitId
            hits = hitsMap.setdefault(seqId, [])
            # the length of the hit list is the number of hits stored so far for this query.
            if not limitHits or len(hits) < limitHits:
                hits.append((hitId, hitEvalue))
    return hitsMap
def parseResults(blastResultsPath, limitHits=MAX_HITS):
    '''
    Parse tabular blast results into the top hits of each query sequence.

    blastResultsPath: path to a file of whitespace-delimited results, one alignment per line.
      Field 0 is the query nameline, field 1 is the hit nameline and field 10 is the evalue.
    limitHits: maximum number of hits stored per query sequence.  A false value (None, 0) stores every hit.
    returns: a map from query seq id to a list of tuples of (subject seq id, evalue) for the top hits of the query sequence in the subject genome

    A line that cannot be parsed (too few fields, non-numeric evalue, ...) is logged with its traceback and skipped.
    '''
    hitsMap = {}
    prevSeqId = None
    prevHitId = None
    # 'with' closes the file even when processing a line raises.
    with open(blastResultsPath) as fh:
        for line in fh:
            splits = line.split()
            try:
                seqId = fasta.idFromName(
                    splits[0])  # remove namespace prefix, e.g. 'gi|'
                hitId = fasta.idFromName(splits[1])
                hitEvalue = float(splits[10])
            except Exception:
                logging.exception(
                    'parseResults(): prevSeqId: {}, prevHitId: {}, line: {}'.
                    format(prevSeqId, prevHitId, line))
                # Skip the bad line.  Falling through would reuse values left
                # over from an earlier line (or hit unbound names if this is
                # the first line of the file).
                continue
            # results table reports multiple "alignments" per "hit" in ascending order by evalue
            # we only store the top hits, so consecutive lines for the same
            # (query, hit) pair are collapsed into the first one.
            if prevSeqId == seqId and prevHitId == hitId:
                continue
            prevSeqId = seqId
            prevHitId = hitId
            hits = hitsMap.setdefault(seqId, [])
            # the length of the hit list is the number of hits stored so far
            # for this query.
            if not limitHits or len(hits) < limitHits:
                hits.append((hitId, hitEvalue))
    return hitsMap
def getGoodDivergenceAlignedTrimmedSeqPair(seqId, seq, hitSeqId, hitSeq, workPath):
    '''
    Align seq to hitSeq, then trim both aligned sequences by the same amounts.

    seqId, seq: id and sequence of the query.
    hitSeqId, hitSeq: id and sequence of the hit.
    workPath: passed to alignFastaClustalw; not used when aligning with kalign.
    returns: (seqId, alignedTrimmedSeq), (hitSeqId, alignedTrimmedHitSeq), divergencePredicateFunc
      where divergencePredicateFunc takes a divergence threshold and returns True
      if the alignment is considered too diverged for that threshold.
    raises: whatever parsing the aligned fasta raises, with the input fasta and
      aligned fasta appended to the exception args.
    '''
    # ALIGN SEQ AND HIT
    # the two sequences go to the aligner as a single two-record fasta string.
    inputFasta = '>%s\n%s\n>%s\n%s\n'%(seqId, seq, hitSeqId, hitSeq)
    if not USE_CLUSTALW:
        alignedFasta = alignFastaKalign(inputFasta)
        if not alignedFasta:
            # rare, intermittent failure of fasta alignment: pause briefly and try once more.
            logging.error('fasta alignment failed.\ninputFasta=%s\n' +
                          'alignedFasta=%s\nSleep and retry alignment.',
                          inputFasta, alignedFasta)
            time.sleep(0.1)
            alignedFasta = alignFastaKalign(inputFasta)
    else:
        alignedFasta = alignFastaClustalw(inputFasta, workPath)

    # PARSE THE ALIGNMENT
    try:
        records = list(fasta.readFasta(cStringIO.StringIO(alignedFasta)))
        parsedPairs = []
        for nameline, alignedSeq in records:
            parsedPairs.append((fasta.idFromName(nameline), alignedSeq))
        # exactly two records are expected; anything else fails the unpacking.
        alignedIdAndSeq, alignedHitIdAndSeq = parsedPairs
    except Exception as e:
        # attach the aligner input and output to the error to help debugging.
        e.args += (inputFasta, alignedFasta)
        raise

    # RANK THE ALIGNED SEQUENCES BY DASH COUNT
    def dashStats(idAndSeq):
        '''tuple of (dash count, fraction of dashes, id, aligned sequence).'''
        ident, alignedSeq = idAndSeq
        dashes = alignedSeq.count('-')
        return (dashes, dashes / float(len(alignedSeq)), ident, alignedSeq)

    ranked = sorted([dashStats(alignedIdAndSeq), dashStats(alignedHitIdAndSeq)])
    fewestDashes, mostDashes = ranked
    leastDivergedDiv = fewestDashes[1]
    leastDivergedSeq = fewestDashes[3]
    mostDivergedSeq = mostDashes[3]

    # the trim amounts come from the sequence with the most dashes.
    startTrim, endTrim, trimDivergence = dashlen_check(mostDivergedSeq)

    def divergencePredicate(divergenceThreshold):
        '''Closure over the alignment statistics.  True if the least diverged sequence has a dash fraction above the threshold, or if any trimming is applied and the trim divergence reaches the threshold.'''
        return bool((leastDivergedSeq and leastDivergedDiv > divergenceThreshold)
                    or ((startTrim or endTrim) and trimDivergence >= divergenceThreshold))

    # TRIM BOTH ALIGNED SEQUENCES
    def trimmed(idAndSeq):
        '''drop startTrim characters from the front and endTrim from the back.'''
        ident, alignedSeq = idAndSeq
        return (ident, alignedSeq[startTrim:(len(alignedSeq)-endTrim)])

    return trimmed(alignedIdAndSeq), trimmed(alignedHitIdAndSeq), divergencePredicate
def makeGetSeqForId(genomeFastaPath):
    '''
    Load a fasta file into memory and return a lookup function for its sequences.

    genomeFastaPath: location of fasta file.  also location/name of blast formatted indexes of the fasta file.
    returns: a function taking a sequence id and returning the sequence stored under that id.
      It raises KeyError for an id that was not in the fasta file.
    '''
    # The whole file is read up front into a dict keyed by sequence id: an in-memory dict
    # performs much better than on-disk retrieval with xdget or fastacmd, and genome fasta
    # files do not take much space (on a modern computer).
    # If an id occurs more than once, the last sequence read wins.
    fastaMap = {fasta.idFromName(nameline): sequence
                for nameline, sequence in fasta.readFasta(genomeFastaPath)}

    def getSeqForIdInMemory(seqId):
        return fastaMap[seqId]

    return getSeqForIdInMemory
# Example #5
# 0
def findSeqIdWithFasta(fastaSeq, subjectIndexPath):
    '''
    Run blastp with fastaSeq as the query and return the id of the first hit.

    fastaSeq: query text written to a temporary file and given to blastp as -query
      (presumably a fasta formatted sequence; confirm against callers).
    subjectIndexPath: value given to blastp as -db.
    returns: the id extracted from the second field of the first line of the
      blastp output, or None if the output has no lines.
    '''
    # Get the temp path before entering the try block.  If makeTempPath() fails there
    # is nothing to clean up, and the finally clause must not touch an unbound name.
    path = nested.makeTempPath()
    try:
        util.writeToFile(fastaSeq, path)
        cmd = 'blastp -outfmt 6 -query %s -db %s'%(path, subjectIndexPath)
        results = util.run(cmd, shell=True)
    finally:
        # only remove the file if it exists, so a failure before the file was created
        # is reported as itself and not masked by an error from os.remove().
        if os.path.exists(path):
            os.remove(path)
    hitId = None
    for line in results.splitlines():
        # example line: foo sp|P39709|SEO1_YEAST 100.00 40 0 0 1 40 1 40 3e-18 84.7
        # the second field is from the hit nameline.
        hitId = fasta.idFromName(line.split()[1])
        break # grab the first hit
    return hitId
def makeGetSeqForId(genomeFastaPath):
    '''
    Load a fasta file into memory and return a lookup function for its sequences.

    genomeFastaPath: location of fasta file.  also location/name of blast formatted indexes of the fasta file.
    returns: a function taking a sequence id and returning the sequence stored
      under that id.  It raises KeyError for an id that was not in the fasta file.
    '''
    # The whole file is read up front into a dict keyed by sequence id: an
    # in-memory dict performs much better than on-disk retrieval with xdget or
    # fastacmd, and genome fasta files do not take much space (on a modern
    # computer).  If an id occurs more than once, the last sequence read wins.
    fastaMap = {
        fasta.idFromName(nameline): sequence
        for nameline, sequence in fasta.readFasta(genomeFastaPath)
    }

    def getSeqForIdInMemory(seqId):
        return fastaMap[seqId]

    return getSeqForIdInMemory
def getGoodDivergenceAlignedTrimmedSeqPair(seqId, seq, hitSeqId, hitSeq,
                                           workPath):
    '''
    Align seq to hitSeq, then trim both aligned sequences by the same amounts.

    seqId, seq: id and sequence of the query.
    hitSeqId, hitSeq: id and sequence of the hit.
    workPath: passed to alignFastaClustalw; not used when aligning with kalign.
    returns: (seqId, alignedTrimmedSeq), (hitSeqId, alignedTrimmedHitSeq), divergencePredicateFunc
      where divergencePredicateFunc takes a divergence threshold and returns
      True if the alignment is considered too diverged for that threshold.
    raises: whatever parsing the aligned fasta raises, with the input fasta
      and aligned fasta appended to the exception args.
    '''
    # ALIGN SEQ AND HIT
    # the two sequences go to the aligner as a single two-record fasta string.
    inputFasta = '>%s\n%s\n>%s\n%s\n' % (seqId, seq, hitSeqId, hitSeq)
    if not USE_CLUSTALW:
        alignedFasta = alignFastaKalign(inputFasta)
        if not alignedFasta:
            # rare, intermittent failure of fasta alignment: pause briefly
            # and try once more.
            logging.error(
                'fasta alignment failed.\ninputFasta=%s\n' +
                'alignedFasta=%s\nSleep and retry alignment.', inputFasta,
                alignedFasta)
            time.sleep(0.1)
            alignedFasta = alignFastaKalign(inputFasta)
    else:
        alignedFasta = alignFastaClustalw(inputFasta, workPath)

    # PARSE THE ALIGNMENT
    try:
        records = list(fasta.readFasta(cStringIO.StringIO(alignedFasta)))
        parsedPairs = []
        for nameline, alignedSeq in records:
            parsedPairs.append((fasta.idFromName(nameline), alignedSeq))
        # exactly two records are expected; anything else fails the unpacking.
        alignedIdAndSeq, alignedHitIdAndSeq = parsedPairs
    except Exception as e:
        # attach the aligner input and output to the error to help debugging.
        e.args += (inputFasta, alignedFasta)
        raise

    # RANK THE ALIGNED SEQUENCES BY DASH COUNT
    def dashStats(idAndSeq):
        '''tuple of (dash count, fraction of dashes, id, aligned sequence).'''
        ident, alignedSeq = idAndSeq
        dashes = alignedSeq.count('-')
        return (dashes, dashes / float(len(alignedSeq)), ident, alignedSeq)

    ranked = sorted(
        [dashStats(alignedIdAndSeq),
         dashStats(alignedHitIdAndSeq)])
    fewestDashes, mostDashes = ranked
    leastDivergedDiv = fewestDashes[1]
    leastDivergedSeq = fewestDashes[3]
    mostDivergedSeq = mostDashes[3]

    # the trim amounts come from the sequence with the most dashes.
    startTrim, endTrim, trimDivergence = dashlen_check(mostDivergedSeq)

    def divergencePredicate(divergenceThreshold):
        '''Closure over the alignment statistics.  True if the least diverged sequence has a dash fraction above the threshold, or if any trimming is applied and the trim divergence reaches the threshold.'''
        return bool(
            (leastDivergedSeq and leastDivergedDiv > divergenceThreshold)
            or ((startTrim or endTrim)
                and trimDivergence >= divergenceThreshold))

    # TRIM BOTH ALIGNED SEQUENCES
    def trimmed(idAndSeq):
        '''drop startTrim characters from the front and endTrim from the back.'''
        ident, alignedSeq = idAndSeq
        return (ident, alignedSeq[startTrim:(len(alignedSeq) - endTrim)])

    return trimmed(alignedIdAndSeq), trimmed(
        alignedHitIdAndSeq), divergencePredicate