def pair_hmm_align_unaligned_seqs(seqs, moltype, params=None):
    """Globally align exactly two unaligned sequences.

    This needs to be moved to cogent.align.align

    seqs: the two sequences, in any form LoadSeqs accepts as ``data``.
    moltype: molecule type handed to LoadSeqs.
    params: optional dict overriding the defaults used here:
        'gap_open' (default 5), 'gap_extend' (default 2) and
        'score_matrix' (default: match=1, transition=-1, transversion=-1).

    Returns the result of global_pairwise on the two sequences.
    Raises ValueError if seqs does not hold exactly two sequences.
    """
    if params is None:
        params = {}

    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    # Build the default scoring dict only when the caller supplied none.
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return global_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def test_pairwise_returns_score(self):
    """exercise pairwise local/global returns alignment score"""
    scoring = make_dna_scoring_dict(10, -1, -8)
    # Same expectation for both aligners: with return_score=True the call
    # yields an (alignment, score) pair.
    for aligner in (local_pairwise, global_pairwise):
        _, score = aligner(seq1, seq2, scoring, 10, 2, return_score=True)
        self.assertTrue(score > 100)
def pair_hmm_align_unaligned_seqs(seqs, moltype, params=None):
    """Globally align exactly two unaligned sequences.

    This needs to be moved to cogent.align.align

    seqs: the two sequences, in any form LoadSeqs accepts as ``data``.
    moltype: molecule type handed to LoadSeqs.
    params: optional dict overriding the defaults used here:
        'gap_open' (default 5), 'gap_extend' (default 2) and
        'score_matrix' (default: match=1, transition=-1, transversion=-1).

    Returns the result of global_pairwise on the two sequences.
    Raises ValueError if seqs does not hold exactly two sequences.
    """
    if params is None:
        params = {}

    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    # Build the default scoring dict only when the caller supplied none.
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return global_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params=None):
    """Locally align exactly two unaligned sequences.

    Checks parameters for pairwise alignment, returns alignment.
    Code from Greg Caporaso.

    seqs: the two sequences, in any form LoadSeqs accepts as ``data``.
    moltype: molecule type handed to LoadSeqs (default DNA).
    params: optional dict overriding the defaults used here:
        'gap_open' (default 5), 'gap_extend' (default 2) and
        'score_matrix' (default: match=1, transition=-1, transversion=-1).

    Returns the result of local_pairwise on the two sequences.
    Raises ValueError if seqs does not hold exactly two sequences.
    """
    if params is None:
        params = {}

    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    # Build the default scoring dict only when the caller supplied none.
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return local_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params=None):
    """Locally align exactly two unaligned sequences.

    Checks parameters for pairwise alignment, returns alignment.
    Code from Greg Caporaso.

    seqs: the two sequences, in any form LoadSeqs accepts as ``data``.
    moltype: molecule type handed to LoadSeqs (default DNA).
    params: optional dict overriding the defaults used here:
        'gap_open' (default 5), 'gap_extend' (default 2) and
        'score_matrix' (default: match=1, transition=-1, transversion=-1).

    Returns the result of local_pairwise on the two sequences.
    Raises ValueError if seqs does not hold exactly two sequences.
    """
    if params is None:
        params = {}

    seqs = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    try:
        s1, s2 = seqs.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    # Build the default scoring dict only when the caller supplied none.
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return local_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def test_local_tiebreak(self):
    """Should pick the first best-equal hit rather than the last one"""
    # so that the Pyrex and Python versions give the same result.
    query = DNA.makeSequence('cwc', Name='pattern')
    subject = DNA.makeSequence('cactc', Name='target')
    scoring = make_dna_scoring_dict(match=1, transition=-1, transversion=-1)
    alignment = local_pairwise(query, subject, scoring, 5, 2)
    # 'cactc' offers two candidate hits; the expected one is the first, 'cac'.
    matched = str(alignment.NamedSeqs['target']).lower()
    self.assertEqual(matched, 'cac')
def test_local_tiebreak(self):
    """Should pick the first best-equal hit rather than the last one"""
    # so that the Pyrex and Python versions give the same result.
    scoring = make_dna_scoring_dict(match=1, transition=-1, transversion=-1)
    sequences = {
        'pattern': DNA.makeSequence('cwc', Name='pattern'),
        'target': DNA.makeSequence('cactc', Name='target'),
    }
    result = local_pairwise(
        sequences['pattern'], sequences['target'], scoring, 5, 2)
    # Compare case-insensitively against the first of the two equal hits.
    found = str(result.NamedSeqs['target']).lower()
    self.assertEqual(found, 'cac')
def test(r=1, **kw):
    """Time one global classic_align_pairwise run and return its throughput.

    r: repeat count applied to both 10-base seed sequences.
    kw: extra keyword arguments forwarded to classic_align_pairwise.

    Returns len(seq1) * len(seq2) divided by the elapsed wall-clock seconds.
    Raises ZeroDivisionError if the elapsed time measures as zero.
    """
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence("AAAATGCTTA" * r)
    seq1 = DNA.makeSequence("AATTTTGCTG" * r)
    t0 = time.time()
    # Only the run time matters here; the alignment itself is discarded.
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    return (len(seq1) * len(seq2)) / t
def test(r=1, **kw):
    """Time one global classic_align_pairwise run and return its throughput.

    r: repeat count applied to both 10-base seed sequences.
    kw: extra keyword arguments forwarded to classic_align_pairwise.

    Returns len(seq1) * len(seq2) divided by the elapsed wall-clock seconds.
    Raises ZeroDivisionError if the elapsed time measures as zero.
    """
    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    t0 = time.time()
    # Only the run time matters here; the alignment itself is discarded.
    classic_align_pairwise(seq1, seq2, S, 10, 2, local=False, **kw)
    t = time.time() - t0
    return (len(seq1) * len(seq2)) / t
def makeSampleAlignment():
    """Return a two-sequence global alignment carrying sample annotations.

    The sequences are the [:-2] and [2:] slices of makeSampleSequence(),
    renamed 'FAKE01' and 'FAKE02'. The alignment gets two Variable
    annotations, 'redline' and 'blueline', each over the spans
    (0, 15), (15, 30) and (30, 45).
    """
    # must be an easier way to make an alignment of annotated sequences!
    from cogent.align.align import global_pairwise, make_dna_scoring_dict

    # Named for what it is, so it cannot be mistaken for the DNA moltype.
    scoring = make_dna_scoring_dict(10, -8, -8)
    seq1 = makeSampleSequence()[:-2]
    seq2 = makeSampleSequence()[2:]
    seq1.Name = 'FAKE01'
    seq2.Name = 'FAKE02'
    align = global_pairwise(seq1, seq2, scoring, 2, 1)
    align.addAnnotation(annotation.Variable, 'redline', 'align',
                        [((0, 15), 1), ((15, 30), 2), ((30, 45), 3)])
    align.addAnnotation(annotation.Variable, 'blueline', 'align',
                        [((0, 15), 1.5), ((15, 30), 2.5), ((30, 45), 3.5)])
    return align
def test(r=1, **kw):
    """Benchmark one global classic_align_pairwise run.

    r: repeat count applied to both 10-base seed sequences.
    kw: extra keyword arguments forwarded to classic_align_pairwise.

    Returns int(len(seq1) * len(seq2) / elapsed_seconds / 1000), or the
    string '*' if the alignment raises ArithmeticError.
    """
    # time.clock() was removed in Python 3.8; use perf_counter where it
    # exists and fall back to clock() on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    t0 = timer()
    try:
        # return_alignment is False in order to emphasise the quadratic
        # part of the work.
        classic_align_pairwise(seq1, seq2, S, 10, 2, local=False,
                               return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    else:
        t = timer() - t0
        return int((len(seq1) * len(seq2)) / t / 1000)
def makeSampleAlignment():
    """Return a two-sequence global alignment carrying sample annotations.

    The sequences are the [:-2] and [2:] slices of makeSampleSequence(),
    renamed 'FAKE01' and 'FAKE02'. The alignment gets two Variable
    annotations, 'redline' and 'blueline', each over the spans
    (0, 15), (15, 30) and (30, 45).
    """
    # must be an easier way to make an alignment of annotated sequences!
    from cogent.align.align import global_pairwise, make_dna_scoring_dict

    # Named for what it is, so it cannot be mistaken for the DNA moltype.
    scoring = make_dna_scoring_dict(10, -8, -8)
    seq1 = makeSampleSequence()[:-2]
    seq2 = makeSampleSequence()[2:]
    seq1.Name = 'FAKE01'
    seq2.Name = 'FAKE02'
    align = global_pairwise(seq1, seq2, scoring, 2, 1)
    align.addAnnotation(annotation.Variable, 'redline', 'align',
                        [((0, 15), 1), ((15, 30), 2), ((30, 45), 3)])
    align.addAnnotation(annotation.Variable, 'blueline', 'align',
                        [((0, 15), 1.5), ((15, 30), 2.5), ((30, 45), 3.5)])
    return align
def pair_hmm_align_unaligned_seqs(seqs, moltype=DNA, params=None):
    """Handles pairwise (local) alignment of given sequence pair

    seqs: list of [primer, target sequence] in string format
    moltype: molecule type tested.  Only DNA supported.
    params: Used to set parameters for opening, extending gaps and score
     matrix if something other than the default given in this function
     is desired. Recognised keys: 'gap_open' (default 5), 'gap_extend'
     (default 2), 'score_matrix' (default: match=1, transition=-1,
     transversion=-1).

    Returns the result of local_pairwise on the two sequences.
    Raises AlphabetError if LoadSeqs rejects the sequence characters, and
    ValueError if seqs does not hold exactly two sequences.
    """
    if params is None:
        params = {}

    try:
        loaded = LoadSeqs(data=seqs, moltype=moltype, aligned=False)
    except AlphabetError:
        # Report the raw inputs so the offending primer/sequence is visible.
        raise AlphabetError(
            "Error in characters present in primer %s and/or sequence %s."
            % (seqs[0], seqs[1]))

    try:
        s1, s2 = loaded.values()
    except ValueError:
        raise ValueError(
            "Pairwise aligning of seqs requires exactly two seqs.")

    gap_open = params.get('gap_open', 5)
    gap_extend = params.get('gap_extend', 2)
    # Build the default scoring dict only when the caller supplied none.
    try:
        score_matrix = params['score_matrix']
    except KeyError:
        score_matrix = make_dna_scoring_dict(
            match=1, transition=-1, transversion=-1)

    return local_pairwise(s1, s2, score_matrix, gap_open, gap_extend)
def test(r=1, **kw):
    """Benchmark one global classic_align_pairwise run.

    r: repeat count applied to both 10-base seed sequences.
    kw: extra keyword arguments forwarded to classic_align_pairwise.

    Returns int(len(seq1) * len(seq2) / elapsed_seconds / 1000), or the
    string '*' if the alignment raises ArithmeticError.
    """
    # time.clock() was removed in Python 3.8; use perf_counter where it
    # exists and fall back to clock() on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    S = make_dna_scoring_dict(10, -1, -8)
    seq2 = DNA.makeSequence('AAAATGCTTA' * r)
    seq1 = DNA.makeSequence('AATTTTGCTG' * r)
    t0 = timer()
    try:
        # return_alignment is False in order to emphasise the quadratic
        # part of the work.
        classic_align_pairwise(seq1, seq2, S, 10, 2, local=False,
                               return_alignment=False, **kw)
    except ArithmeticError:
        return '*'
    else:
        t = timer() - t0
        return int((len(seq1) * len(seq2)) / t / 1000)
def _aligned_both_ways(self, seq1, seq2, **kw):
    """Align the pair in both argument orders.

    Returns [alignment(seq1, seq2), alignment(seq2, seq1)]; extra keyword
    arguments are forwarded to classic_align_pairwise.
    """
    scoring = make_dna_scoring_dict(10, -1, -8)
    orderings = ((seq1, seq2), (seq2, seq1))
    return [
        classic_align_pairwise(first, second, scoring, 10, 2, **kw)
        for first, second in orderings
    ]
def _count_encoded_differences(ref, other, length):
    """Count (substitutions, indels) over the first *length* encoded positions.

    A mismatch where either sequence holds the gap code 4 ('-') counts as an
    indel; every other mismatch counts as a substitution.
    """
    ref = np.asarray(ref)[:length]
    other = np.asarray(other)[:length]
    mismatch = np.not_equal(ref, other)
    has_gap = np.logical_or(np.equal(ref, 4), np.equal(other, 4))
    numindel = int(np.count_nonzero(np.logical_and(mismatch, has_gap)))
    numsub = int(np.count_nonzero(mismatch)) - numindel
    return numsub, numindel


def _count_aligned_differences(cseq1, cseq2):
    """Count (substitutions, indels) between two gapped, aligned strings.

    A gap lying in the trailing run of '-' of its own sequence is ignored.
    """
    len1 = len(cseq1.rstrip('-'))
    len2 = len(cseq2.rstrip('-'))
    numsub = 0
    numindel = 0
    for cpos in range(len(cseq1)):
        if cseq1[cpos] == cseq2[cpos]:
            continue
        if cseq1[cpos] == '-':
            if cpos < len1:
                numindel += 1
        elif cseq2[cpos] == '-':
            if cpos < len2:
                numindel += 1
        else:
            numsub += 1
    return numsub, numindel


def RemoveError(log, seqs, seqsnp, sfreq, readerror, meanerror, ofracerr,
                indelprob, indelmax, pyroseq):
    """
    Deblur the reads
    Input:
    log - a LogMe log module to write the debluring info
    seqs - the list of sequences
    seqsnp - a list of numpy arrays of the sequences (for faster comparison) - from SeqToArray()
    sfreq - dictionary (based on the sequence) of the number of reads for each sequence
    readerror - the maximal read error expected (fraction - typically 0.01)
    meanerror - the mean read error used for peak spread normalization - typically 0.01
    ofracerr - the error distribution array, or 0 (or None) to use the default
    indelprob - the probability for an indel (currently constant for number of indels until max is reached)
    indelmax - the maximal number of indels expected by errors (error cutoff)
    pyroseq - if set, use pairwise alignment for pyrosequencing data

    Output:
    sfreq - the deblurred number of reads for each sequence (0 if not present).
            This is the same dictionary that was passed in, updated in place.

    Notes:
    meanerror is used only for normalizing the peak height before deblurring, whereas readerror is used for
    calculating the expected number of errors for each position.
    error distribution array X should be of length >10, where Xi = max frequency of error hamming i;
    if it is 0, we use the default distribution.
    """
    # we assume all sequences are of equal length
    commonlen = len(seqs[0])
    for cseq in seqs:
        if len(cseq) != commonlen:
            print("Not all sequences are same length!!!!")
            print(commonlen)
            print(len(cseq))
            print(cseq)

    print("processing %d sequences" % len(seqs))

    # normalisation factor from the number of non-gap positions in the
    # first sequence
    numreal = len(seqs[0]) - seqs[0].count('-')
    modfactor = pow((1 - meanerror), numreal)

    # A scalar 0 (or None) asks for the default profile; list(0) would raise
    # TypeError, so decide this before copying.
    use_default = ofracerr is None or (np.isscalar(ofracerr) and ofracerr == 0)
    # copy the supplied values so the caller's array is never modified
    fracerr = ofracerr if use_default else list(ofracerr)
    log.log("original fracer parameter:", fracerr)
    if use_default:
        # if fracerr not supplied, use the default (22 mock mixture setup)
        fracerr = ([1.0 / modfactor, pow(readerror, 1) / modfactor]
                   + [0.01] * 3 + [0.005] * 6 + [0.001] * 4)
        log.log("modified fracerr because it was 0")
    else:
        fracerr = [val / modfactor for val in fracerr]

    # sequences further apart than the profile covers are never corrected
    maxhdist = len(fracerr) - 1

    print("fracerr")
    print(fracerr)
    print("readerror")
    print(readerror)
    print("modfactor")
    print(modfactor)

    log.log("indel prob:", indelprob)
    log.log("indel max:", indelmax)
    log.log("readerror:", readerror)
    log.log("meanerror:", meanerror)
    log.log("mod factor:", modfactor)
    log.log("fracerr:", fracerr)

    # the scoring matrix is only needed for pairwise alignment
    DNAm = make_dna_scoring_dict(10, -8, -8) if pyroseq else None

    for idx, cseq in enumerate(seqs):
        csfreq = sfreq[cseq]
        # no need to remove neighbors if freq. is <=0
        if csfreq <= 0:
            continue

        # correct for the fact that many reads are expected to be mutated
        numerr = [frac * csfreq for frac in fracerr]

        # if it's low level, just continue
        if numerr[1] < 0.1:
            continue

        # compare to all other sequences and calculate hamming dist
        cseqnp = seqsnp[idx]
        oseqlen = len(cseq.rstrip('-'))
        for idxtmp, seqnptmp in enumerate(seqsnp):
            # don't compare to ourselves (dist=0)
            if idxtmp == idx:
                continue

            # if far away, don't need to correct
            hdist = np.count_nonzero(np.not_equal(seqnptmp, cseqnp))
            if hdist > maxhdist:
                continue

            # close, so lets calculate exact distance
            if pyroseq:
                # experimental: pairwise align the degapped sequences
                s0 = DNA.makeSequence(seqs[idx]).degap()
                s1 = DNA.makeSequence(seqs[idxtmp]).degap()
                print(s0._seq)
                print(s1._seq)
                align = global_pairwise(s0, s1, DNAm, 10, 9)
                numsub, numindel = _count_aligned_differences(
                    align.getGappedSeq('seq_0')._seq,
                    align.getGappedSeq('seq_1')._seq)
            else:
                # not pyrosequencing, so compare the encoded arrays directly
                numsub, numindel = _count_encoded_differences(
                    cseqnp, seqnptmp, oseqlen)

            # Realignment could in principle report more substitutions than
            # the profile covers; treat that like "far away".
            if numsub > maxhdist:
                continue

            nerr = numerr[numsub]
            # remove errors due to (PCR?) indels (saw in 22 mock mixture)
            if numindel > 0:
                nerr = nerr * indelprob
            if numindel > indelmax:
                nerr = 0

            # if the effect is small - don't do anything
            if nerr < 0.1:
                continue

            # met all the criteria - so correct the frequency of the neighbor
            sfreq[seqs[idxtmp]] -= nerr

    return sfreq