def _taxOrder(trees) : dtops = Counter(toNewick(tree, topologyOnly=1) for tree in trees) averagedPosteriorDistances = defaultdict(lambda : 0) for tx in dtops: tree = parseNewick(tx) distances = dict() taxaDistance(tree, tree.root, distances) for k in distances: averagedPosteriorDistances[k] += distances[k] * dtops[tx]/len(trees) taxa = sorted(trees[0].get_taxa()) ntax = len(taxa) for j,k in allPairs(taxa) : averagedPosteriorDistances[k,j] = averagedPosteriorDistances[j,k] def groupDistance(g1,g2,dm) : return mean([dm[i] for i in itertools.product(g1,g2)]) ## def groupDistance(g1,g2,dm) : ## return min([dm[i] for i in itertools.product(g1,g2)]) groups = [[x] for x in taxa] dm = averagedPosteriorDistances while len(groups) > 1: # find the two closest groups. dists = [(groupDistance(groups[j],groups[k],dm), (j,k)) for j,k in allPairs(range(len(groups)))] dists = sorted(dists) d,(ij,ik) = dists[0] # 123 abc 0,0 0,1 1,0 02 11 20 # 321 abc # 123 cba # 321 cba # abc 123 # abc 321 # cba 123 # cba 321 g1,g2 = groups[ij],groups[ik] def gid(g1,g2,dm) : d = [] for n in range(len(g1)+len(g2) - 1) : for i in range(-1,max(-len(g1)-1,-(len(g2)-n+1)), -1) : #i > -(len(g2)-n+1) d.append(dm[g1[i], g2[n-(i+1)]]) return d dis = gid(g1,g2,dm),gid(g1,list(reversed(g2)),dm),gid(g2, g1,dm), gid(list(reversed(g2)),g1,dm) dis = sorted(zip(dis,range(4))) o = dis[0][1] if o & 1 : g2 = list(reversed(g2)) if o > 1 : g1,g2 = g2,g1 groups[ij] = g1 + g2 del groups[ik] otaxa = groups[0] return otaxa
def _getGTorderFixedStree(gtree, stree, gtax, gtx, tryPairs):
    """Search for a low-score ordering of gene-tree tips under a fixed
    species-tree layout.

    Gene-tree tips are first numbered consecutively, species by species, in
    the order the species leaves appear in ``getPostOrder(stree, stree.root)``.
    The numbering is then improved by hill climbing: candidate moves are
    applied, the score is recomputed, and a move is kept only when the score
    strictly decreases (otherwise the previous positions are restored).

    Score (see ``score`` below): for every internal gene-tree node, ``dd`` is
    the number of positions between the smallest position in its leftmost
    group and the largest position in its rightmost group (inclusive) minus
    the clade size.  The score is the tuple ``(sum of dd, -sum of dd * ht)``
    compared lexicographically; smaller is better.

    :param gtree: gene tree; must provide ``root`` and ``node(id)``.
    :param stree: species tree; must provide ``root``.
    :param gtax: iterable of gene-tree tip nodes whose final positions are
        reported (``data.o`` of each, in this order).
    :param gtx: mapping from species-leaf ``id`` to an indexable sequence of
        the gene-tree tip nodes belonging to that species.
    :param tryPairs: when true, additionally try exchanging the positions of
        every pair of tips within each species until no exchange helps.
    :return: ``(ms, mp)`` - the best score tuple found and the matching list
        of positions for ``gtax``.

    Side effects: sets ``data.grp``, ``data.grpnds``, ``data.allo`` and
    ``data.sz`` on gene-tree nodes and ``data.o`` on the tips; on return
    ``data.o`` holds the best arrangement found.  ``data.ht`` of internal
    nodes is read and must already be set.  The swap bookkeeping unpacks
    exactly two children per internal node.  Uses the global ``random`` state.
    """
    # `reduce` is a builtin only on Python 2; functools has it on 2.6+ and 3.
    from functools import reduce

    allsn = getPostOrder(stree, stree.root)
    # Species taxa in layout order, that is the order they are plotted.
    # Built as a list because it is iterated repeatedly below.
    stax = [sn for sn in allsn if not sn.succ]

    # For each gene tree node set
    #  data.grp: the leftmost/rightmost species of taxa in clade (as indices
    #    of plot positions)
    #  data.grpnds: gene taxa nodes in the leftmost/rightmost species (per
    #    above). Those are the only ones that can make a difference for the
    #    node score. The rest are always in.
    #  data.allo: one list per species (in stax order) holding the clade's
    #    gene tips that belong to that species
    #  data.sz: number of tips in the clade
    #
    # For terminal nodes set
    #  data.o: ordinal number in sequence

    no = 0
    for i, sn in enumerate(stax):
        for gn in gtx[sn.id]:
            gn.data.grp = [i, i]
            gn.data.grpnds = [[gn], [gn]]
            gn.data.allo = [[] if k != sn else [gn] for k in stax]
            gn.data.o = no
            no += 1
            gn.data.sz = 1

    gtpost = _getRandPostOrder(gtree, gtree.root)

    # Candidate moves: (node, [(left tips, right tips) per shared species],
    # number of species the clade touches).
    swaps = []
    for n in gtpost:
        if not n.succ:
            continue
        sns = [gtree.node(x) for x in n.succ]

        l, r = zip(*[x.data.grp for x in sns])
        n.data.grp = min(l), max(r)

        # Merge (via _plus, presumably concatenation - confirm) the boundary
        # tip lists of those children that reach the clade's extreme species.
        lg, rg = zip(*[x.data.grpnds for x in sns])
        n.data.grpnds = (
            reduce(_plus, [y for x, y in zip(l, lg) if n.data.grp[0] == x]),
            reduce(_plus, [y for x, y in zip(r, rg) if n.data.grp[1] == x]))

        n.data.sz = sum([x.data.sz for x in sns])

        # Per-species pairs (tips under first child, tips under second child).
        # Materialised because it is traversed up to three times.
        aa = list(zip(*[x.data.allo for x in sns]))
        n.data.allo = [cl + cr for cl, cr in aa]

        # Species in which both children have tips: exchanging the two tip
        # blocks there is a candidate move.
        sw = [(cl, cr) for cl, cr in aa if len(cr) and len(cl)]
        if len(sw):
            # Only recorded when the clade touches more than one species.
            s = sum([1 for cl, cr in aa if len(cr) or len(cl)])
            if s > 1:
                swaps.append((n, sw, s))

    # Internal nodes only; a list, since score() walks it on every call.
    nint = [n for n in gtpost if len(n.succ)]

    def score(nint):
        htot, tot = 0.0, 0
        for x in nint:
            l, r = [[z.data.o for z in u] for u in x.data.grpnds]
            # Positions spanned by the clade minus its own tips.
            dd = (max(r) - min(l) + 1) - x.data.sz
            if dd > 0:
                tot += dd
                htot += dd * x.data.ht
        return tot, -htot

    ms = score(nint)
    mp = [x.data.o for x in gtax]

    # Sentinel larger than any real score so the loop body runs at least once.
    msLast = (float("inf"), 0)
    while ms < msLast:
        msLast = ms
        random.shuffle(swaps)
        for swap in swaps:
            sv = []   # (previous position, tip) pairs, for undoing the move
            for l, r in swap[1]:
                lo, ro = [x.data.o for x in l], [x.data.o for x in r]
                mlo, mro = min(lo), min(ro)
                ll, lr = len(l), len(r)
                # The original carried a disabled sanity check here expecting
                # each block to occupy consecutive positions and the two
                # blocks to be adjacent.
                sv.extend([(t.data.o, t) for t in l + r])

                # Exchange the two blocks, keeping offsets inside each block:
                # whichever block came first now starts after the other one.
                if mlo < mro:
                    ls = mlo + lr
                    rs = mlo
                else:
                    ls = mro
                    rs = mro + ll
                for k, t in zip([x - mlo for x in lo], l):
                    t.data.o = ls + k
                for k, t in zip([x - mro for x in ro], r):
                    t.data.o = rs + k

            s = score(nint)
            if s < ms:
                ms = s
                mp = [x.data.o for x in gtax]
            else:
                # No improvement: put every moved tip back.
                for k, t in sv:
                    t.data.o = k

    if tryPairs:
        msLast = (float("inf"), 0)
        while ms < msLast:
            msLast = ms
            for kk in gtx:
                gtxkk = gtx[kk]
                for i0, i1 in allPairs(range(len(gtxkk))):
                    # Exchange the positions of two tips of the same species.
                    sw = [gtxkk[x].data.o for x in (i0, i1)]
                    gtxkk[i1].data.o, gtxkk[i0].data.o = sw
                    s = score(nint)
                    if s < ms:
                        ms = s
                        mp = [x.data.o for x in gtax]
                    else:
                        gtxkk[i0].data.o, gtxkk[i1].data.o = sw

    return ms, mp