def mpat(tr, seqs):
    """Build an alignment profile bottom-up over a tree.

    Nodes are visited in the order produced by ``getPostOrder(tr)``.  Each
    visited node gets a ``data.seq`` pair ``(profile, sequence)``: tips are
    stored as ``(None, sequence)`` and internal nodes as ``(profile, None)``.

    tr   -- tree; every internal node must have exactly two successors
            (their two entries are unpacked into a pair).
    seqs -- anything ``dict()`` accepts, mapping taxon name -> sequence.

    Returns the profile slot of the last visited node, which is asserted to
    be the root of ``tr``.
    """
    by_taxon = dict(seqs)

    def as_profile(entry):
        # A truthy slot 1 means the entry still holds a raw sequence; wrap it
        # as a one-sequence profile.  Otherwise slot 0 is taken as the profile.
        if entry[1]:
            return calign.createProfile(entry[1:])
        return entry[0]

    for node in getPostOrder(tr):
        node_data = node.data
        if not node.succ:
            # Tip: remember the raw sequence only.
            node_data.seq = (None, by_taxon[node_data.taxon])
            continue

        left, right = [tr.node(child).data.seq for child in node.succ]
        if left[1] and right[1]:
            # Two raw sequences: pairwise alignment, stored without trimming.
            pairwise = calign.globalAlign(left[1], right[1])
            node_data.seq = (calign.createProfile(pairwise), None)
        else:
            # At least one side is already a profile: profile-profile
            # alignment, then the result is passed through trimp().
            left_profile = as_profile(left)
            right_profile = as_profile(right)
            merged = calign.prof2profAlign(left_profile, right_profile)
            node_data.seq = (trimp(merged), None)

    assert node.id == tr.root
    return node.data.seq[0]
def mpa(tr, seqs, scores = defaultMatchScores, trimEnd = None):
    """Build an alignment profile bottom-up over a tree, with custom scores.

    Nodes are visited in the order produced by ``getPostOrder(tr)``.  Each
    visited node gets a ``data.seq`` pair ``(profile, sequence)``: tips are
    stored as ``(None, sequence)`` and internal nodes as ``(profile, None)``.

    tr      -- tree; every internal node must have exactly two successors.
    seqs    -- anything ``dict()`` accepts, mapping taxon name -> sequence.
               Tip taxon names are looked up with surrounding single quotes
               stripped.
    scores  -- forwarded to ``calign.globalAlign`` and
               ``calign.prof2profAlign``.
    trimEnd -- when not None, every profile-profile result is passed through
               ``trimendsp(result, trimEnd)``; pairwise results are never
               trimmed.

    Returns the profile slot of the last visited node, which is asserted to
    be the root of ``tr``.
    """
    by_taxon = dict(seqs)

    def as_profile(entry):
        # A truthy slot 1 means the entry still holds a raw sequence; wrap it
        # as a one-sequence profile.  Otherwise slot 0 is taken as the profile.
        if entry[1]:
            return calign.createProfile(entry[1:])
        return entry[0]

    for node in getPostOrder(tr):
        node_data = node.data
        if not node.succ:
            node_data.seq = (None, by_taxon[node_data.taxon.strip("'")])
            continue

        left, right = [tr.node(child).data.seq for child in node.succ]
        if left[1] and right[1]:
            pairwise = calign.globalAlign(left[1], right[1], scores = scores)
            node_data.seq = (calign.createProfile(pairwise), None)
        else:
            left_profile = as_profile(left)
            right_profile = as_profile(right)
            merged = calign.prof2profAlign(left_profile, right_profile,
                                           scores = scores)
            if trimEnd is not None:
                merged = trimendsp(merged, trimEnd)
            node_data.seq = (merged, None)

    assert node.id == tr.root
    return node.data.seq[0]
def mpc(seqs, nRefines = 4, gapPenalty = defaultMatchScores.gap):
    """Multiple-align ``seqs`` and refine until the consensus settles.

    The sequences are aligned with ``seqMultiAlign`` (default scores with the
    gap entry replaced by ``gapPenalty``), refined once with ``refineSingle``,
    and then refined again for as long as the stripped consensus keeps
    changing, for at most ``nRefines`` additional rounds.

    Returns ``(consensus, alignment)`` for the last refinement performed,
    where the consensus is ``stripseq(cons(profile))`` of that alignment.
    """
    def consensus_of(alignment):
        return stripseq(cons(calign.createProfile(alignment)))

    custom_scores = defaultMatchScores._replace(gap = gapPenalty)
    initial = seqMultiAlign(seqs, scores = custom_scores)
    previous = consensus_of(initial)

    # The first refinement is unconditional.
    refined = refineSingle(initial, gapPenalty = gapPenalty)
    current = consensus_of(refined)

    rounds = 0
    while previous != current and rounds < nRefines:
        previous = current
        refined = refineSingle(refined, gapPenalty = gapPenalty)
        current = consensus_of(refined)
        rounds += 1

    return current, refined
def refineSingle(al, gapPenalty = defaultMatchScores.gap):
    """Re-align every row of an alignment against the profile of the others.

    The profile of the whole alignment is computed once.  For each row, that
    row's own single-sequence profile is subtracted from it entry by entry
    and the row is re-aligned to the remainder with ``calign.profileAlign``.
    The input rows are not modified; each row is realigned against the
    original alignment, not against earlier results of this call.

    al         -- iterable of aligned rows (copied to a list if it is not one)
    gapPenalty -- forwarded to ``calign.profileAlign``

    Returns a list with one ``profileAlign`` result per input row, in order.
    """
    rows = al if isinstance(al, list) else list(al)
    full_profile = calign.createProfile(rows)

    def realign(row):
        own_profile = calign.createProfile([row])
        # Profile of all other rows: total counts minus this row's counts.
        others = tuple([[total - mine for total, mine in zip(col, own_col)]
                        for col, own_col in zip(full_profile, own_profile)])
        return calign.profileAlign(row, others, gapPenalty = gapPenalty)

    return [realign(row) for row in rows]
def doTheCons(sqs, trh, lengthQuant = 40):
    """Pick the best consensus over up to three input orderings of ``sqs``.

    The sequences are multiple-aligned in up to three different orders:
      1. by length rounded down to a multiple of ``lengthQuant``, longest
         first (ties broken by the sequence itself, descending);
      2. the original order;
      3. a random shuffle.
    After each alignment the stripped consensus is spot-checked against a
    random sample of at most five input sequences: the score is half the mean
    of ``calign.allpairs(consensus, sample, report=calign.JCcorrection)``.
    The search stops early as soon as a score falls below ``trh``.

    Returns the consensus with the lowest score seen, or None if no score was
    below the initial bound of 100000.
    """
    def length_bucket_key(s):
        return (lengthQuant * (len(s) // lengthQuant), s)

    by_length = sorted(sqs, key = length_bucket_key, reverse = True)
    spot_sample = random.sample(sqs, min(5, len(sqs)))

    def orderings():
        # Lazy, so the shuffle only happens if the third attempt is needed.
        yield by_length
        yield sqs
        shuffled = list(sqs)
        random.shuffle(shuffled)
        yield shuffled

    best_consensus, best_score = None, 100000
    for ordering in orderings():
        alignment = seqMultiAlign(ordering)
        consensus = stripseq(cons(calign.createProfile(alignment)))
        # spot check
        score = mean(calign.allpairs(consensus, spot_sample,
                                     report = calign.JCcorrection))/2
        if score < best_score:
            best_consensus, best_score = consensus, score
        if score < trh:
            break

    return best_consensus
def refineAlignment(al, ci = None, drop = False, mx = -1, rev = False, verbose = False):
    """Re-align selected rows of an alignment against the remaining rows.

    Rows are selected by their content in the columns ``ci``:
      * ``rev`` false: rows with a non-gap character in at least one of them;
      * ``rev`` true:  rows that are a gap ('-') in all of them.
    The selected rows' counts are subtracted from the full profile and each
    selected row whose ungapped length fits within that profile is re-aligned
    to it with ``calign.profileAlign``.  If anything changed, columns that
    became all-gap are removed from every row.

    al      -- alignment rows (strings).  A list is modified IN PLACE; any
               other iterable is first copied to a list.
    ci      -- column indices used for selection; defaults to ``[0]``.
    drop    -- when true, the ``ci`` columns are removed from the profile
               before re-aligning (``removeColumns``) and restored in each
               result afterwards (``restoreColumns``).
    mx      -- when > 0 and more than ``mx`` rows are selected, nothing is
               done and None is returned.
    verbose -- write "<alignment length> <number selected>" to stdout.

    Returns ``(al, len(al[0]), changed)`` where ``changed`` counts the
    re-aligned rows that differ from ``sasn(original row)``, or None when
    the ``mx`` limit is exceeded.
    """
    # A fresh list per call: the previous shared ``[0]`` default was handed
    # to removeColumns(), so any mutation there would leak into later calls.
    if ci is None:
        ci = [0]

    if not isinstance(al, list):
        al = list(al)
    profile = calign.createProfile(al)

    if rev:
        candidates = [(k, s) for k, s in enumerate(al)
                      if all(s[x] == '-' for x in ci)]
    else:
        candidates = [(k, s) for k, s in enumerate(al)
                      if any(s[x] != '-' for x in ci)]

    if verbose:
        import sys
        # Same text the Python 2 print statement produced, but written in a
        # form that is valid under both Python 2 and Python 3.
        sys.stdout.write("%s %s\n" % (len(al[0]), len(candidates)))
        sys.stdout.flush()

    if mx > 0 and len(candidates) > mx:
        # Too many rows to refine: callers get None, not the usual tuple.
        return None

    changed = 0
    if candidates:
        selected_profile = calign.createProfile([s for _, s in candidates])
        # Profile of the rows that were NOT selected.
        rest = tuple([[a - b for a, b in zip(x, y)]
                      for x, y in zip(profile, selected_profile)])
        if drop:
            rest, removed = removeColumns(rest, ci)

        for n, s in candidates:
            if len(s.replace('-', '')) <= len(rest):
                ra = calign.profileAlign(s, rest)
                if drop:
                    ra = restoreColumns(list(ra), removed, calign.GAP)
                al[n] = iton(ra)
                if tuple(ra) != sasn(s):
                    changed += 1

    if changed:
        profile = calign.createProfile(al)
        # Columns whose gap count equals the number of rows.
        all_gap = [k for k, col in enumerate(profile)
                   if col[calign.GAP] == len(al)]
        ranges = toRanges(all_gap)
        if ranges:
            # Cut from the right so earlier offsets stay valid.
            for a, b in ranges[::-1]:
                for k, row in enumerate(al):
                    al[k] = row[:a] + row[b:]

    return al, len(al[0]), changed
def seqMultiAlign(seqs, scores = defaultMatchScores, report=False):
    """Align sequences one after another into a growing multiple alignment.

    The first two sequences are aligned with ``calign.globalAlign``; every
    further sequence is aligned to the running profile with
    ``calign.profileAlign`` (``chop=True``).  When ``profileAlign`` reports
    that the profile must be extended on the left and/or right, all-gap
    columns are added to the profile and '-' padding to the rows already
    aligned.

    seqs   -- indexable, sliceable collection of sequences.
    scores -- match scores.  Passed to ``globalAlign``; its ``gap`` entry is
              also used as the gap penalty for every ``profileAlign`` call.
    report -- when true, write "<count> <alignment length>" to stdout after
              every 1000th added sequence (space separated, on one line) and
              a final newline.

    Returns a tuple of aligned rows (``iton`` output), in input order.  With
    fewer than two sequences ``seqs`` itself is returned unchanged.
    """
    if len(seqs) < 2:
        return seqs

    # Previously the profile step always used defaultMatchScores.gap, so a
    # caller-supplied gap penalty only affected the first pairwise alignment.
    # Score objects without a ``gap`` attribute (e.g. plain tuples) keep the
    # old behaviour.
    gap_penalty = getattr(scores, 'gap', defaultMatchScores.gap)

    a = calign.globalAlign(seqs[0], seqs[1], scores=scores)
    ns = 2                      # number of sequences in the alignment so far
    p = calign.createProfile(a)
    a = tuple(iton(x) for x in a)

    if report:
        import sys
    separator = ''              # emulates the Python 2 print soft space

    for kk, s2 in enumerate(seqs[2:]):
        # Invariant: profile and rows have the same length and neither end
        # column has a gap count equal to the number of sequences.
        assert len(a[0]) == len(p) and \
               p[0][calign.GAP] != ns and p[-1][calign.GAP] != ns

        pad = 20
        if len(p) + 2*pad < len(s2):
            # enough for sequences start to align
            pad = len(s2) - len(p)
            assert len(p) + 2*pad >= len(s2)

        pa, extendLeft, extendRight = calign.profileAlign(
            s2, p, pad=pad, chop=True, gapPenalty=gap_penalty)

        if extendLeft > 0 or extendRight > 0:
            # New flanking columns: every existing sequence has a gap there.
            gapProfile = [0, 0, 0, 0, 0, ns]
            p = tuple(list(gapProfile) for _ in range(extendLeft)) + p \
                + tuple(list(gapProfile) for _ in range(extendRight))
            front = '-' * extendLeft
            back = '-' * extendRight
            a = tuple(front + x + back for x in a)

        # Add the new sequence's symbols to the profile counts (in place).
        for k, n in enumerate(pa):
            p[k][n] += 1

        a = a + (iton(pa),)
        ns += 1

        if report and (kk + 1) % 1000 == 0:
            sys.stdout.write("%s%s %s" % (separator, kk + 1, len(a[0])))
            separator = ' '
            sys.stdout.flush()

    if report:
        sys.stdout.write("\n")
    return a
def sortByEntropy(seqs):
    """Return the row indices of ``seqs`` sorted column-by-column.

    Every column of the profile of ``seqs`` gets the score
    ``sum(x*log(x) for counts x > 0)/N - log(N)`` with ``N = len(seqs)``.
    Columns are ranked by ascending score (ties by column index), and the
    rows are then sorted lexicographically by their symbols read in that
    column order (ties by original row index).

    If the first symbol of the first row is not one of the integers 0..5,
    symbols are mapped through ``ngtoi`` before comparing.

    Returns a list of indices into ``seqs``, not the sequences themselves.
    """
    n_rows = len(seqs)
    needs_translation = seqs[0][0] not in range(6)

    profile = calign.createProfile(seqs)
    column_scores = [
        sum([log(x) * x for x in column if x > 0])/n_rows - log(n_rows)
        for column in profile]
    column_order = sorted(range(len(column_scores)),
                          key = lambda k: (column_scores[k], k))

    if needs_translation:
        row_keys = [[ngtoi(row[k]) for k in column_order] for row in seqs]
    else:
        row_keys = [[row[k] for k in column_order] for row in seqs]

    return sorted(range(len(row_keys)), key = lambda i: (row_keys[i], i))
def createProfile(seqs) :
  """Return ``calign.createProfile(seqs)``.

  Module-level shorthand that forwards ``seqs`` unchanged to the ``calign``
  function of the same name and returns its result as is.
  """
  return calign.createProfile(seqs)