def getGoodDivergenceAlignedTrimmedSeqPair(seqId, seq, hitSeqId, hitSeq, workPath):
    '''
    Align seq against hitSeq, trim both aligned sequences and build a divergence test.

    The two sequences are written as a two-record fasta string and aligned with
    alignFastaClustalw (when USE_CLUSTALW is true; workPath is passed to it) or
    with alignFastaKalign.  An empty kalign result is logged and the alignment is
    retried once after a short sleep.

    returns: a 3-tuple
        ((id, alignedTrimmedSeq), (hitId, alignedTrimmedHitSeq), divergencePredicate)
    where the ids are the ones parsed back out of the aligned fasta and
    divergencePredicate(divergenceThreshold) returns True when the alignment is
    considered too diverged for that threshold.

    raises: any exception from parsing the aligned fasta (including the aligned
    fasta not containing exactly two records) is re-raised with the input and
    aligned fasta appended to its args.
    '''
    # ALIGN SEQ AND HIT
    # the sequences must be aligned so the rate of evolution per site can be studied
    pairFasta = '>%s\n%s\n>%s\n%s\n' % (seqId, seq, hitSeqId, hitSeq)
    if not USE_CLUSTALW:
        alignedFasta = alignFastaKalign(pairFasta)
        if not alignedFasta:
            # rare, intermittent alignment failure: log it, wait briefly, try once more
            logging.error(
                'fasta alignment failed.\ninputFasta=%s\n'
                'alignedFasta=%s\nSleep and retry alignment.',
                pairFasta, alignedFasta)
            time.sleep(0.1)
            alignedFasta = alignFastaKalign(pairFasta)
    else:
        alignedFasta = alignFastaClustalw(pairFasta, workPath)

    try:
        # turn the aligned fasta into (id, sequence) pairs; exactly two are expected
        records = list(fasta.readFasta(cStringIO.StringIO(alignedFasta)))
        parsedPairs = []
        for nameline, alignedSeq in records:
            parsedPairs.append((fasta.idFromName(nameline), alignedSeq))
        queryPair, hitPair = parsedPairs
    except Exception as err:
        # attach both fasta strings to the exception to make the failure diagnosable
        err.args += (pairFasta, alignedFasta)
        raise

    # CHECK FOR EXCESSIVE DIVERGENCE AND TRIMMING
    def _gapStats(idAndSeq):
        # (dash count, dash fraction, id, sequence) for one aligned record
        alnId, alnSeq = idAndSeq
        gaps = alnSeq.count('-')
        return (gaps, gaps / float(len(alnSeq)), alnId, alnSeq)

    # order the two records by dash count (ties fall through to the later tuple fields)
    ranked = sorted([_gapStats(queryPair), _gapStats(hitPair)])
    leastDiv, leastSeq = ranked[0][1], ranked[0][3]
    mostSeq = ranked[1][3]
    # the trim offsets come from the record with the most dashes
    startTrim, endTrim, trimDivergence = dashlen_check(mostSeq)

    def divergencePredicate(divergenceThreshold):
        '''Closure over the alignment statistics: True if the least gapped sequence's dash fraction is above the threshold, or if any trimming applies and the trim divergence is at or above the threshold.'''
        return bool(
            (leastSeq and leastDiv > divergenceThreshold)
            or ((startTrim or endTrim) and trimDivergence >= divergenceThreshold))

    def _trim(idAndSeq):
        # cut startTrim characters off the front and endTrim characters off the back
        alnId, alnSeq = idAndSeq
        return (alnId, alnSeq[startTrim:(len(alnSeq) - endTrim)])

    return _trim(queryPair), _trim(hitPair), divergencePredicate
def makeGetSeqForId(genomeFastaPath):
    '''
    Build an in-memory lookup function from sequence id to sequence.

    genomeFastaPath: location of fasta file.  also location/name of blast formatted indexes of the fasta file.
    returns: a function taking a sequence id and returning its sequence
    (raises KeyError for an id that is not in the fasta file).
    '''
    # The whole fasta file is loaded up front: the original author notes that an
    # in-memory dict performs much better than on-disk retrieval with xdget or
    # fastacmd, and that genome fasta files are small enough for this.
    # If an id occurs more than once, the last record wins.
    idToSeq = {
        fasta.idFromName(nameline): sequence
        for nameline, sequence in fasta.readFasta(genomeFastaPath)
    }

    def getSeqForIdInMemory(seqId):
        return idToSeq[seqId]

    return getSeqForIdInMemory
def makeGetSeqForId(genomeFastaPath):
    '''
    Read a genome fasta file into memory and return a lookup function for it.

    genomeFastaPath: location of fasta file.  also location/name of blast formatted indexes of the fasta file.
    returns: a function mapping a sequence id to its sequence; an unknown id
    raises KeyError.
    '''
    # Every record is read once and kept in a dict keyed by the id parsed from its
    # nameline.  Per the original author, in-memory lookup is much faster than
    # on-disk retrieval with xdget or fastacmd, and genome fasta files do not take
    # much space.  A repeated id keeps the last sequence seen.
    records = fasta.readFasta(genomeFastaPath)
    seqsById = dict(
        (fasta.idFromName(header), body) for header, body in records)

    def getSeqForIdInMemory(seqId):
        return seqsById[seqId]

    return getSeqForIdInMemory
def create_fv_files(processes=8):
    '''
    Compute a feature vector for every entry of the input fasta and write them out.

    Reads the module-level names inputFile (fasta input), profile (profiles
    input) and outputFile (feature vector output).  One form_feature_vector
    task per fasta entry is submitted to a multiprocessing pool, the
    (prot_id, feature_vector) results are collected into a dict, and the dict
    is handed to write_feature_vector.

    processes: number of worker processes in the pool (default 8, the value
    that used to be hard-coded).
    raises: any exception raised by a worker task is re-raised here by
    AsyncResult.get(); the pool is shut down in that case too.
    '''
    filename_fasta = inputFile
    filename_profile = profile
    filename_fv = outputFile
    fasta_dict = readFasta(filename_fasta)
    profile_dict = read_profiles(filename_profile)

    pool = mp.Pool(processes=processes)
    try:
        # submit everything first so the tasks run concurrently, then collect
        pending = [
            pool.apply_async(form_feature_vector,
                             args=(prot_id, fasta_dict[prot_id], profile_dict))
            for prot_id in fasta_dict
        ]

        fv_dict_raw = {}
        for task in pending:
            prot_id, fv = task.get()
            fv_dict_raw[prot_id] = fv
    finally:
        # Previously the pool was never shut down, leaking its worker processes.
        # At this point either every result has been collected or a task failed,
        # so the workers are no longer needed.
        pool.terminate()
        pool.join()

    write_feature_vector(filename_fv, fv_dict_raw)
Beispiel #5
0
        count[posdict[c]] += 1
    return ranks


# Gather arguments from the user.
# sys.argv[0] is the script name, so the four required arguments mean
# len(sys.argv) must be at least 5 (the old "< 4" check let a three-argument
# call through, which then failed with IndexError on sys.argv[4]).
if len(sys.argv) < 5:
    # If in the incorrect form, print an error message and stop.
    # NOTE(review): the exit status is left at 0 as before; a non-zero status
    # may be more appropriate for a usage error.
    print("Arguments must be of the form : referencefile, readsfile, k, dmax.")
    exit(0)
# referencefile and readsfile must be file names, k and dmax integers.
referencefile, readsfile = sys.argv[1], sys.argv[2]
kmerLength, dmax = int(sys.argv[3]), int(sys.argv[4])

# Initialization of the reference.
# Per the original note, readFasta only takes the sequence of bases; the "$" is
# appended for the BWT, to mark the end of the string.
reference = (fasta.readFasta(referencefile)).lower() + "$"

# Initialization of reads and readsBioPalind (the result of
# biologicalPalyndrome for each read, kept in the same order as reads).
reads, readsBioPalind = [], []
# "with" closes the reads file once it has been consumed.
with open(readsfile, "r") as readsHandle:
    for line in readsHandle:
        # Lines starting with ">" do not contain sequences, only comments about
        # the sequences.
        if line[0] != ">":
            # Strip only the trailing newline (slicing off the last character
            # would drop a base when the final line has no newline).  Lower
            # case for practical reasons when calling posdict.
            read = line.rstrip("\n").lower()
            reads.append(read)
            # We also store the biological palindromes.
            readsBioPalind.append(biologicalPalyndrome(read))

# We create SA, BWT, Rank and F from reference

print("generating SA")
def getGoodDivergenceAlignedTrimmedSeqPair(seqId, seq, hitSeqId, hitSeq,
                                           workPath):
    '''
    Pairwise-align seq and hitSeq, trim the aligned pair, and return a divergence test.

    A two-record fasta string is built from the arguments and aligned with
    alignFastaClustalw (USE_CLUSTALW true; workPath is forwarded to it) or with
    alignFastaKalign.  If kalign returns an empty result the failure is logged
    and the alignment is attempted one more time after a short sleep.

    returns: ((id, alignedTrimmedSeq), (hitId, alignedTrimmedHitSeq), divergencePredicate)
    The ids are those parsed from the aligned fasta.  divergencePredicate takes
    a divergence threshold and returns True when the alignment is judged too
    diverged for it.

    raises: exceptions from parsing the aligned fasta (or from it not holding
    exactly two records) propagate with the input and aligned fasta appended
    to their args.
    '''
    # ALIGN SEQ AND HIT
    # alignment is needed so the rate of evolution per site can be studied
    unalignedFasta = '>%s\n%s\n>%s\n%s\n' % (seqId, seq, hitSeqId, hitSeq)
    if not USE_CLUSTALW:
        alignedFasta = alignFastaKalign(unalignedFasta)
        if not alignedFasta:
            # recover from rare, intermittent alignment failures with one retry
            logging.error(
                'fasta alignment failed.\ninputFasta=%s\n'
                'alignedFasta=%s\nSleep and retry alignment.',
                unalignedFasta, alignedFasta)
            time.sleep(0.1)
            alignedFasta = alignFastaKalign(unalignedFasta)
    else:
        alignedFasta = alignFastaClustalw(unalignedFasta, workPath)

    try:
        # parse the aligned fasta; unpacking enforces that there are two records
        alignedRecords = list(
            fasta.readFasta(cStringIO.StringIO(alignedFasta)))
        idSeqPairs = []
        for header, gappedSeq in alignedRecords:
            idSeqPairs.append((fasta.idFromName(header), gappedSeq))
        firstPair, secondPair = idSeqPairs
    except Exception as parseError:
        # carry both fasta strings along with the exception for debugging
        parseError.args += (unalignedFasta, alignedFasta)
        raise

    # CHECK FOR EXCESSIVE DIVERGENCE AND TRIMMING
    def _dashSummary(idAndSeq):
        # (dash count, dash fraction, id, sequence) for one aligned record
        recordId, gappedSeq = idAndSeq
        dashes = gappedSeq.count('-')
        return (dashes, dashes / float(len(gappedSeq)), recordId, gappedSeq)

    # sort the two records by dash count; later tuple fields break ties
    byDashCount = sorted([_dashSummary(firstPair), _dashSummary(secondPair)])
    fewestDashes, mostDashes = byDashCount
    leastDivergedDiv, leastDivergedSeq = fewestDashes[1], fewestDashes[3]
    # trim offsets are derived from the record with the most dashes
    startTrim, endTrim, trimDivergence = dashlen_check(mostDashes[3])

    def divergencePredicate(divergenceThreshold):
        '''Closure over the alignment statistics.  True when the least gapped sequence's dash fraction exceeds the threshold, or when trimming applies and the trim divergence reaches the threshold.'''
        return bool(
            (leastDivergedSeq and leastDivergedDiv > divergenceThreshold)
            or ((startTrim or endTrim)
                and trimDivergence >= divergenceThreshold))

    def _trimmed(idAndSeq):
        # drop startTrim leading and endTrim trailing alignment columns
        recordId, gappedSeq = idAndSeq
        return (recordId, gappedSeq[startTrim:(len(gappedSeq) - endTrim)])

    return _trimmed(firstPair), _trimmed(secondPair), divergencePredicate