def treeFromSeqs(seqs, tax = None, matchScores = None, correction = True,
                 weights = None, asString = False, verbose = None, saveDists = None) :
    """ Build a tree for 'seqs' from their pairwise alignment distances.

    The sequences are sorted (keeping track of their original positions) and
    handed to calign.distances; the resulting distances go to treeFromDists.

    seqs        -- the sequences.
    tax         -- optional taxa names, one per sequence; passed to treeFromDists.
    matchScores -- passed to calign.distances as 'scores'.
    correction  -- when true distances are reported as calign.JCcorrection,
                   otherwise as calign.DIVERGENCE.
    weights, asString -- passed through to treeFromDists.
    verbose     -- optional stream for progress/timing output.
    saveDists   -- optional 3 item sequence. Items 0 and 1 are passed to
                   saveDistancesMatrix around the distances, item 2 as 'compress'.

    Return the pair (tree, distances).
    """
    assert tax is None or len(seqs) == len(tax), "args???"

    if verbose :
        # Presumably a run-time estimate derived from the sequence lengths
        # (confirm against sumPairs and alCellsPerSecond).
        nCells = sumPairs([len(s) for s in seqs])
        estimate = nCells * (1/alCellsPerSecond)
        # The trailing comma keeps the line open; the elapsed time is added below.
        print >> verbose, len(seqs),tohms(estimate),time.strftime("%T"),
        verbose.flush()
        started = time.clock()

    # Sort the sequences while remembering where each one came from.
    indexed = sorted(zip(seqs, count()))
    sortedSeqs,origPos = zip(*indexed)

    if correction :
        reportKind = calign.JCcorrection
    else :
        reportKind = calign.DIVERGENCE
    dists = calign.distances(sortedSeqs, align = True, scores = matchScores,
                             reorder = origPos, report = reportKind)

    if saveDists is not None :
        saveDistancesMatrix(saveDists[0], dists, saveDists[1], compress = saveDists[2])

    tree = treeFromDists(dists, tax = tax, weights = weights, asString = asString)

    if verbose :
        print >> verbose, tohms(time.clock() - started), time.strftime("%T")

    return tree,dists
def findDuplicates(allSeqs, verbose = None) :
    """ Locate duplicate sequences in 'allSeqs'.

    Return a list C with one entry for each unique sequence with more than one
    copy, where C[k] is a list with index numbers of all identical copies.

    allSeqs -- indexable collection of sequences. Each sequence must support
               len() and be hashable (it is used as a dictionary key).
    verbose -- optional stream; when given, the elapsed time is printed to it.
    """
    if verbose:
        tstart = time.clock()

    # First bucket the indices by sequence length, then group by sequence
    # content inside each bucket.
    byLen = defaultdict(list)
    for ns,s in enumerate(allSeqs):
        byLen[len(s)].append(ns)

    cc = []
    for sameLen in byLen.values():
        copies = defaultdict(list)
        for si in sameLen:
            copies[allSeqs[si]].append(si)
        # Keep only the groups that really contain duplicates.
        cc.extend(idx for idx in copies.values() if len(idx) > 1)

    if verbose:
        print >> verbose, "find duplicates in ", tohms(time.clock() - tstart)
    return cc
def assembleTree(trees, thFrom, thTo, getSeqForTaxon, nMaxReps = 20, maxPerCons = 100, lowDiversity = 0.02, refineFactor = 1.1, refineUpperLimit = .15, verbose = None) : cahelpers = dict() cahelper = lambda t : cahelpers.get(t.name) or \ (cahelpers.update([(t.name,CAhelper(t))]) or cahelpers.get(t.name)) if verbose: print >> verbose, "cutting",len(trees),"trees at %g" % thFrom # cut trees at thFrom pseudoTaxa = cutForestAt(trees, thFrom, cahelper) nReps = len(pseudoTaxa) reps = [None]*nReps def getReps(k) : if not reps[k] : t,n = pseudoTaxa[k] nc = len(n.data.terms) if nc > 2: nc = min(max(int(math.log(nc,3)), 2), nMaxReps) r = random.sample(n.data.terms, nc) else : r = n.data.terms reps[k] = [getSeqForTaxon(x.data.taxon) for x in r] return reps[k] cons = [None]*nReps def getCons(k) : if not cons[k] : t,n = pseudoTaxa[k] nc = len(n.data.terms) if nc > maxPerCons : i = random.sample(n.data.terms, maxPerCons) else : i = n.data.terms sq = [getSeqForTaxon(x.data.taxon) for x in i] # s, r = align.mpc(sq, nRefines=0) # del r s = doTheCons(sq, n.data.rh) #al = align.seqMultiAlign(sorted(sqs, reverse=1)) #s = align.stripseq(align.cons(calign.createProfile(al))) cons[k] = s return cons[k] mhs = [] for t,n in pseudoTaxa: cahelper(t) # populate rh mhs.append(n.data.rh) # if both low diversity - use consensus. If not valid or close to cluster height, do the # means thing. If not low diversity, use log representatives # low less then 4%?? 
## lowDiversity = 0.02 ## refineFactor = 1.1 ## refineUpperLimit = .15 # counts how many alignments done (for display) global acnt acnt = 0 def getDist(i,j) : mi,mj = mhs[i],mhs[j] anyCons = False if mi < lowDiversity : ri = [getCons(i)] anyCons = True else : ri = getReps(i) if mj < lowDiversity : rj = [getCons(j)] anyCons = True else : rj = getReps(j) nhs = len(ri)*len(rj) if nhs == 1 : h = calign.globalAlign(ri[0], rj[0], scores = defaultMatchScores, report = calign.JCcorrection) else : ap = calign.allpairs(ri, rj, align=True, scores = defaultMatchScores, report = calign.JCcorrection) h = sum([sum(x) for x in ap])/nhs global acnt acnt += nhs lowLim = 2*max(mi,mj) if anyCons and (h < lowLim or (h < refineUpperLimit and h < lowLim*refineFactor)) : xri = getReps(i) if len(ri) == 1 else ri xrj = getReps(j) if len(rj) == 1 else rj if ri != xri or rj != xrj : ap1 = calign.allpairs(xri, xrj, align=True, scores = defaultMatchScores, report = calign.JCcorrection) h1 = sum([sum(x) for x in ap1]) xnhs = (len(xri)*len(xrj)) acnt += xnhs h = (h * nhs + h1)/(nhs + xnhs) return max(h, lowLim) if verbose : print >> verbose, "assembling",nReps,"sub-trees into one tree",time.strftime("%T") print "n-sub-tree #pair-only-alignments #alignments time" verbose.flush() tnow = time.clock() # Use array. those can get big ds = array.array('f',repeat(0.0,nPairs(nReps))) pos = 0 for i in range(nReps-1) : for j in range(i+1, nReps) : ds[pos] = getDist(i,j) pos += 1 if verbose : dn = sum(range(nReps-1, nReps-i-2,-1)) print >> verbose, i, dn, "%4.3g%%" % ((100.*dn)/len(ds)), acnt, time.strftime("%T") if verbose : print >> verbose, tohms(time.clock() - tnow), time.strftime("%T") # Using correct weights can throw off the height guarantee, or not? 
wt = [len(n.data.terms) for t,n in pseudoTaxa] tnew = treeFromDists(ds, tax = [str(x) for x in range(nReps)], weights = wt) del ds for n in getPostOrder(tnew) : if not n.succ : t,nd = pseudoTaxa[int(n.data.taxon)] if len(nd.data.terms) == 1 : n.data.taxon = nd.data.taxon n.data.rtree = "%s:%f" % (n.data.taxon, n.data.branchlength) else : # Insure heights are there cahelper(t) s = t.toNewick(nd.id) d = n.data.branchlength - nd.data.rh if (d < -1e-10) : print "***** ERROR", d n.data.rtree = "%s:%f" % (s, max(d,0.0)) else : ch = [tnew.node(x).data.rtree for x in n.succ] n.data.rtree = "(%s,%s)" % (ch[0],ch[1]) if n.id != tnew.root : n.data.rtree = n.data.rtree + (":%f" % n.data.branchlength) trec = tnew.node(tnew.root).data.rtree trec = parseNewick(trec) return trec