Exemple #1
0
def pair_hmm_align_unaligned_seqs(seqs, moltype, params={}):
    """
        Globally align exactly two unaligned sequences.

        seqs: sequence data handed to LoadSeqs; it must yield exactly two
         sequences.
        moltype: molecule type passed through to LoadSeqs.
        params: optional dict (only read, never modified) that may override
         'gap_open' (default 5), 'gap_extend' (default 2) and
         'score_matrix' (default: make_dna_scoring_dict with match=1,
         transition=-1, transversion=-1).

        Returns whatever global_pairwise returns for the two sequences.
        Raises ValueError if seqs does not hold exactly two sequences.

        This needs to be moved to cogent.align.align
    """

    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        # Call-style raise is valid on both Python 2 and Python 3; the old
        # "raise ValueError, msg" form is a syntax error on Python 3.
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        # Only build the default scoring dict when the caller gave none.
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return global_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
Exemple #2
0
 def test_pairwise_returns_score(self):
     """local and global pairwise aligners both hand back a score above 100"""
     scoring = make_dna_scoring_dict(10, -1, -8)
     # Same arguments for both aligners: gap open 10, gap extend 2.
     for aligner in (local_pairwise, global_pairwise):
         _, score = aligner(seq1, seq2, scoring, 10, 2, return_score=True)
         self.assertTrue(score > 100)
Exemple #3
0
def pair_hmm_align_unaligned_seqs(seqs,moltype,params={}):
    """
        Globally align exactly two unaligned sequences.

        seqs: sequence data handed to LoadSeqs; it must yield exactly two
         sequences.
        moltype: molecule type passed through to LoadSeqs.
        params: optional dict (only read, never modified). Recognised keys
         are 'gap_open' (default 5), 'gap_extend' (default 2) and
         'score_matrix' (default: make_dna_scoring_dict with match=1,
         transition=-1, transversion=-1).

        Returns the result of global_pairwise on the two sequences.
        Raises ValueError if seqs does not hold exactly two sequences.

        This needs to be moved to cogent.align.align
    """

    seqs = LoadSeqs(data=seqs,moltype=moltype,aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        # "raise ValueError, msg" only parses on Python 2; the call form
        # below behaves the same there and also parses on Python 3.
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        # Default scoring dict is created only when none was supplied.
        score_matrix = make_dna_scoring_dict(
            match=1,transition=-1,transversion=-1)

    return global_pairwise(s1,s2,score_matrix,gap_open,gap_extend)
Exemple #4
0
 def test_pairwise_returns_score(self):
     """pairwise local/global aligners return a score when asked to"""
     scoring = make_dna_scoring_dict(10, -1, -8)
     gap_open, gap_extend = 10, 2

     _, local_score = local_pairwise(
         seq1, seq2, scoring, gap_open, gap_extend, return_score=True)
     self.assertTrue(local_score > 100)

     _, global_score = global_pairwise(
         seq1, seq2, scoring, gap_open, gap_extend, return_score=True)
     self.assertTrue(global_score > 100)
Exemple #5
0
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params={}):
    """
        Locally align exactly two unaligned sequences.

        Entries missing from params fall back to gap_open=5, gap_extend=2
        and a DNA scoring dict (match=1, transition=-1, transversion=-1).
        Raises ValueError unless exactly two sequences are supplied.

        Code from Greg Caporaso.
    """

    loaded = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        first, second = loaded.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    # Resolve the two gap penalties in order, each with its own fallback.
    penalties = []
    for key, fallback in (('gap_open', 5), ('gap_extend', 2)):
        try:
            penalties.append(params[key])
        except KeyError:
            penalties.append(fallback)
    open_penalty, extend_penalty = penalties

    # The default scoring dict is only built when none was supplied.
    try:
        scoring = params['score_matrix']
    except KeyError:
        scoring = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return local_pairwise(first, second, scoring,
                          open_penalty, extend_penalty)
Exemple #6
0
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params={}):
    """
        Fills in default alignment parameters and returns the local
        pairwise alignment of the two given sequences.

        params may supply 'gap_open' (default 5), 'gap_extend' (default 2)
        and 'score_matrix' (default: match=1, transition=-1,
        transversion=-1). Raises ValueError unless exactly two sequences
        are given.

        Code from Greg Caporaso.
    """

    def setting(key, fallback):
        # Value stored under key in params, or fallback when key is absent.
        try:
            return params[key]
        except KeyError:
            return fallback

    collection = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        seq_a, seq_b = collection.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    open_cost = setting('gap_open', 5)
    extend_cost = setting('gap_extend', 2)

    # Kept as try/except so the default dict is built only when needed.
    try:
        matrix = params['score_matrix']
    except KeyError:
        matrix = make_dna_scoring_dict(match=1,
                                       transition=-1,
                                       transversion=-1)

    return local_pairwise(seq_a, seq_b, matrix, open_cost, extend_cost)
Exemple #7
0
 def test_local_tiebreak(self):
     """Of two equally good local hits the first one must be reported"""
     # Picking the first hit keeps the Pyrex and Python versions in
     # agreement.
     scoring = make_dna_scoring_dict(
         match=1, transition=-1, transversion=-1)
     query = DNA.makeSequence('cwc', Name='pattern')
     subject = DNA.makeSequence('cactc', Name='target')
     alignment = local_pairwise(query, subject, scoring, 5, 2)
     matched = str(alignment.NamedSeqs['target']).lower()
     self.assertEqual(matched, 'cac')
Exemple #8
0
 def test_local_tiebreak(self):
     """With two best-equal hits, the earlier hit wins over the later one"""
     # Required so that the Pyrex and Python versions give the same result.
     scores = make_dna_scoring_dict(match=1, transition=-1, transversion=-1)
     gap_open, gap_extend = 5, 2

     pattern_seq = DNA.makeSequence('cwc', Name='pattern')
     target_seq = DNA.makeSequence('cactc', Name='target')

     result = local_pairwise(
         pattern_seq, target_seq, scores, gap_open, gap_extend)
     aligned_target = result.NamedSeqs['target']
     self.assertEqual(str(aligned_target).lower(), 'cac')
def test(r=1, **kw):
    """Time one global classic_align_pairwise run and return its throughput.

    r: how many times each 10-base seed sequence is repeated.
    kw: extra keyword arguments forwarded to classic_align_pairwise.

    Returns the product of the two sequence lengths divided by the elapsed
    wall-clock seconds. Raises ZeroDivisionError if the measured time is 0.
    """
    S = make_dna_scoring_dict(10, -1, -8)

    seq2 = DNA.makeSequence("AAAATGCTTA" * r)
    seq1 = DNA.makeSequence("AATTTTGCTG" * r)

    t0 = time.time()
    # Only the run time matters here; the alignment itself is discarded.
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    return (len(seq1) * len(seq2)) / t
def test(r=1, **kw):
    """Benchmark a single global classic_align_pairwise call.

    r: repeat count applied to each 10-base seed sequence.
    kw: extra keyword arguments passed on to classic_align_pairwise.

    Returns len(seq1) * len(seq2) divided by the elapsed wall-clock
    seconds. Raises ZeroDivisionError if the elapsed time measures as 0.
    """
    S = make_dna_scoring_dict(10, -1, -8)

    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)

    t0 = time.time()
    # The returned alignment is not needed, only how long it took.
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    return (len(seq1) * len(seq2)) / t
Exemple #11
0
def makeSampleAlignment():
    """Build a two-sequence sample alignment carrying two annotations.

    Two overlapping slices of makeSampleSequence() are named 'FAKE01' and
    'FAKE02' and globally aligned. 'redline' and 'blueline' Variable
    annotations are then added to the alignment, which is returned.
    """
    # must be an easier way to make an alignment of annotated sequences!
    from cogent.align.align import global_pairwise, make_dna_scoring_dict
    # Named "scoring" so it does not hide a molecule type called DNA.
    scoring = make_dna_scoring_dict(10, -8, -8)
    seq1 = makeSampleSequence()[:-2]
    seq2 = makeSampleSequence()[2:]
    seq1.Name = 'FAKE01'
    seq2.Name = 'FAKE02'
    align = global_pairwise(seq1, seq2, scoring, 2, 1)
    align.addAnnotation(annotation.Variable, 'redline', 'align',
                        [((0,15),1),((15,30),2),((30,45),3)])
    align.addAnnotation(annotation.Variable, 'blueline', 'align',
                        [((0,15),1.5),((15,30),2.5),((30,45),3.5)])
    return align
def test(r=1, **kw):
    """Benchmark classic_align_pairwise without building the alignment.

    r: how many times each 10-base seed sequence is repeated.
    kw: extra keyword arguments forwarded to classic_align_pairwise.

    Returns '*' when the aligner raises ArithmeticError. Otherwise returns
    int(len(seq1) * len(seq2) / elapsed_seconds / 1000). A zero elapsed
    time raises ZeroDivisionError, which is not caught here.
    """
    S = make_dna_scoring_dict(10, -1, -8)

    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)

    # time.clock was removed in Python 3.8. Prefer perf_counter where it
    # exists and only fall back to clock on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    t0 = timer()
    try:
        # return_alignment is False in order to emphasise the quadratic part of the work.
        classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    else:
        t = timer() - t0
        return int((len(seq1) * len(seq2)) / t / 1000)
Exemple #13
0
def makeSampleAlignment():
    """Return a small annotated alignment built from makeSampleSequence().

    The sample sequence is sliced twice ([:-2] and [2:]), the slices are
    named 'FAKE01' and 'FAKE02' and aligned with global_pairwise. Two
    Variable annotations ('redline' and 'blueline') are then attached to
    the alignment.
    """
    # must be an easier way to make an alignment of annotated sequences!
    from cogent.align.align import global_pairwise, make_dna_scoring_dict
    # This is a scoring dict, so it is not called DNA like a molecule type.
    scoring = make_dna_scoring_dict(10, -8, -8)
    seq1 = makeSampleSequence()[:-2]
    seq2 = makeSampleSequence()[2:]
    seq1.Name = 'FAKE01'
    seq2.Name = 'FAKE02'
    align = global_pairwise(seq1, seq2, scoring, 2, 1)
    align.addAnnotation(annotation.Variable, 'redline', 'align',
                        [((0, 15), 1), ((15, 30), 2), ((30, 45), 3)])
    align.addAnnotation(annotation.Variable, 'blueline', 'align',
                        [((0, 15), 1.5), ((15, 30), 2.5), ((30, 45), 3.5)])
    return align
def pair_hmm_align_unaligned_seqs(seqs,
                                  moltype=DNA,
                                  params={}):
    """
        Handles pairwise (local) alignment of given sequence pair

        seqs: list of [primer, target sequence] in string format
        moltype: molecule type tested.  Only DNA supported.
        params: Used to set parameters for opening, extending gaps  and score
         matrix if something other than the default given in this function
         is desired. Recognised keys are 'gap_open' (default 5),
         'gap_extend' (default 2) and 'score_matrix' (default: match=1,
         transition=-1, transversion=-1). The dict is only read.

        Returns the result of local_pairwise on the two sequences.
        Raises AlphabetError if LoadSeqs rejects the sequence characters,
        and ValueError if seqs does not hold exactly two sequences.
    """

    try:
        loaded = LoadSeqs(data=seqs,moltype=moltype,aligned=False)
    except AlphabetError:
        # Call-style raise parses on both Python 2 and Python 3.
        # seqs is still the caller's input here, so report both entries.
        raise AlphabetError("Error in characters present in primer "
                            "%s and/or sequence %s." % (seqs[0], seqs[1]))
    try:
        s1, s2 = loaded.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        # Build the default scoring dict only when none was supplied.
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return local_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def test(r=1, **kw):
    """Time a score-only global classic_align_pairwise run.

    r: repeat count applied to each 10-base seed sequence.
    kw: extra keyword arguments passed on to classic_align_pairwise.

    Returns '*' if the aligner raises ArithmeticError, else
    int(len(seq1) * len(seq2) / elapsed_seconds / 1000). A zero elapsed
    time raises ZeroDivisionError, which is not caught here.
    """
    S = make_dna_scoring_dict(10, -1, -8)

    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)

    # time.clock no longer exists on Python 3.8+, so use perf_counter when
    # it is available and keep clock as the fallback for old interpreters.
    timer = getattr(time, 'perf_counter', None) or time.clock

    t0 = timer()
    try:
        # return_alignment is False in order to emphasise the quadratic part of the work.
        classic_align_pairwise(seq1,
                               seq2,
                               S,
                               10,
                               2,
                               local=False,
                               return_alignment=False,
                               **kw)
    except ArithmeticError:
        return '*'
    else:
        t = timer() - t0
        return int((len(seq1) * len(seq2)) / t / 1000)
Exemple #16
0
 def _aligned_both_ways(self, seq1, seq2, **kw):
     """Align the pair in both argument orders; return [forward, reverse]."""
     scoring = make_dna_scoring_dict(10, -1, -8)
     orderings = ((seq1, seq2), (seq2, seq1))
     return [classic_align_pairwise(first, second, scoring, 10, 2, **kw)
             for first, second in orderings]
Exemple #17
0
 def _aligned_both_ways(self, seq1, seq2, **kw):
     """Return [align(seq1, seq2), align(seq2, seq1)] for the same settings."""
     scoring = make_dna_scoring_dict(10, -1, -8)

     def run(left, right):
         # Fixed gap open 10 and gap extend 2; kw is forwarded untouched.
         return classic_align_pairwise(left, right, scoring, 10, 2, **kw)

     forward = run(seq1, seq2)
     backward = run(seq2, seq1)
     return [forward, backward]
Exemple #18
0
def RemoveError(log,seqs,seqsnp,sfreq,readerror,meanerror,ofracerr,indelprob,indelmax,pyroseq):
    """ Deblur the reads
    For every sequence that still has a positive read count, the expected
    number of erroneous reads is subtracted from each neighbor that is
    within the maximal Hamming distance.
    Input:
        log - an object with a log(*args) method (LogMe log module) to write the debluring info
        seqs - the list of sequences (strings, '-' marks a gap)
        seqsnp - a list of numpy arrays of the sequences (for faster comparison) - from SeqToArray()
                 the value 4 is treated as '-'
        sfreq - dictionary (based on the sequence) of the number of reads for each sequence
        readerror - the maximal read error expected (fraction - typically 0.01)
        meanerror - the mean read error used for peak spread normalization - typically 0.01
        ofracerr - the error distribution array, or 0 (or None) if use default. It is never modified.
        indelprob - the probability for an indel (currently constant for number of indels until max is reached)
        indelmax - the maximal number of indels expected by errors (error cutoff)
        pyroseq - if set, use pairwise alignment for pyrosequencing data
    Output:
        sfreq - the same dictionary, updated in place, with the deblurred number of
                reads for each sequence (can be <=0 if not present)
    Notes:
        meanerror is used only for normalizing the peak height before deblurring, whereas readerror
        is used for calculating the expected number of errors for each position
        error distribution array X should be of length >10, where Xi = max frequency of error hamming i
        if it is 0, we use the default distribution
        sequences are processed in list order, so counts already lowered by an
        earlier sequence are what later iterations see
    """
    # nothing to deblur (and seqs[0] below would fail)
    if len(seqs)==0:
        return sfreq

    # 0 (or None) asks for the default profile. list(0) would raise a
    # TypeError, so this must be decided before copying.
    use_default = ofracerr is None or (np.isscalar(ofracerr) and ofracerr == 0)

    # we assume all sequences are of equal length
    commonlen = len(seqs[0])
    for cseq in seqs:
        if len(cseq) != commonlen:
            print("Not all sequences are same length!!!!")
            print(commonlen)
            print(len(cseq))
            print(cseq)
    # single-argument print() calls behave the same on Python 2 and 3
    print("processing %d sequences" % len(seqs))

    # number of non-gap positions in the first sequence
    numreal = sum(1 for cchar in seqs[0] if cchar != '-')
    modfactor = pow((1-meanerror), numreal)

    # create the error profile from the read error
    # alternatives that were tried:
    # exponential independent
    #   fracerr=[pow(readerror,a)/modfactor for a in range(10)]
    # used for the 22 mock mixture
    #   fracerr=[1.0/modfactor,pow(readerror,1)/modfactor,0.01,0.01,0.01,0.005,0.005,0.005,0.005,0.005,0.005,0.001,0.001,0.001,0.001,0.001,0.001,0.0005,0.0001,0.0001]
    # used for the 44 mock mixture
    #   e1=pow(readerror,1)/modfactor
    #   fracerr=[1.0/modfactor,e1,e1/4,e1/5,e1/6,e1/8,e1/10,e1/15,e1/20,e1/30,e1/40,e1/50,e1/50,e1/50,e1/50,e1/50,e1/50,e1/100,e1/500,e1/500]

    # if fracerr not supplied, use the default (22 mock mixture setup)
    if use_default:
        log.log("original fracer parameter:", ofracerr)
        fracerr = [1.0/modfactor, pow(readerror,1)/modfactor,
                   0.01, 0.01, 0.01,
                   0.005, 0.005, 0.005, 0.005, 0.005, 0.005,
                   0.001, 0.001, 0.001, 0.001]
        log.log("modified fracerr because it was 0")
    else:
        # take the list values so the caller's array won't change
        fracerr = list(ofracerr)
        log.log("original fracer parameter:", fracerr)
        fracerr = [val/modfactor for val in fracerr]

    # largest Hamming distance the error profile has an entry for
    maxhdist = len(fracerr)-1

    print("fracerr")
    print(fracerr)
    print("readerror")
    print(readerror)
    print("modfactor")
    print(modfactor)

    log.log("indel prob:", indelprob)
    log.log("indel max:", indelmax)
    log.log("readerror:", readerror)
    log.log("meanerror:", meanerror)
    log.log("mod factor:", modfactor)
    log.log("fracerr:", fracerr)

    # scoring scheme for the pairwise alignment - only needed (and only
    # built) for pyrosequencing data
    DNAm = make_dna_scoring_dict(10, -8, -8) if pyroseq else None

    for idx, cseq in enumerate(seqs):
        csfreq = sfreq[cseq]
        # no need to remove neighbors if freq. is <=0
        if csfreq <= 0:
            continue
        # correct for the fact that many reads are expected to be mutated:
        # numerr[i] = expected number of error reads at i substitutions
        numerr = [frac*csfreq for frac in fracerr]

        # if it's low level, just continue
        if numerr[1] < 0.1:
            continue

        # compare to all other sequences and calculate hamming dist
        cseqnp = seqsnp[idx]
        # length of this sequence without its trailing gaps
        oseqlen = len(seqs[idx].rstrip('-'))
        for idxtmp, seqnptmp in enumerate(seqsnp):
            # don't compare to ourselves (dist=0)
            if idxtmp == idx:
                continue
            # calculate the hamming distance
            hdist = np.count_nonzero(np.not_equal(seqnptmp, cseqnp))
            # if far away, don't need to correct
            if hdist > maxhdist:
                continue
            # close, so lets calculate exact distance

            numsub = 0
            numindel = 0

            # experimental: pairwise align the sequences
            if pyroseq:
                s0 = DNA.makeSequence(seqs[idx]).degap()
                s1 = DNA.makeSequence(seqs[idxtmp]).degap()
                print(s0._seq)
                print(s1._seq)
                align = global_pairwise(s0, s1, DNAm, 10, 9)
                cseq1 = align.getGappedSeq('seq_0')._seq
                cseq2 = align.getGappedSeq('seq_1')._seq

                # gaps at or after these positions are trailing gaps and
                # are not counted as indels
                len1 = len(cseq1.rstrip('-'))
                len2 = len(cseq2.rstrip('-'))
                for cpos, (char1, char2) in enumerate(zip(cseq1, cseq2)):
                    if char1 == char2:
                        continue
                    if char1 == '-':
                        if cpos < len1:
                            numindel += 1
                    elif char2 == '-':
                        if cpos < len2:
                            numindel += 1
                    else:
                        numsub += 1

            # not pyrosequencing so compare the given columns directly
            # (faster than re-aligning)
            else:
                for cpos in range(oseqlen):
                    if cseqnp[cpos] != seqnptmp[cpos]:
                        # 4 is '-'
                        if seqnptmp[cpos] == 4 or cseqnp[cpos] == 4:
                            numindel += 1
                        else:
                            numsub += 1

            # after re-alignment the substitution count can exceed the
            # error profile; treat that like the "far away" case above
            # instead of indexing past the end of numerr
            if numsub > maxhdist:
                continue
            nerr = numerr[numsub]

            # remove errors due to (PCR?) indels (saw in 22 mock mixture)
            if numindel > 0:
                nerr = nerr*indelprob
            if numindel > indelmax:
                nerr = 0

            # if the effect is small - don't do anything
            if nerr < 0.1:
                continue
            # met all the criteria - so correct the frequency of the neighbor
            sfreq[seqs[idxtmp]] -= nerr
    return sfreq