Esempio n. 1
0
def _taxOrder(trees) :
  """Derive a linear ordering of the taxa from a collection of trees.

  Pairwise taxon distances are averaged over the trees (identical topologies
  are collapsed and weighted by their frequency).  Starting from one group per
  taxon, the two groups with the smallest mean pairwise distance are merged
  repeatedly; at each merge the orientation of the concatenation is chosen so
  that the taxa meeting at the junction are as close as possible.

  trees -- sequence of trees (must support len() and indexing); the taxa are
           taken from trees[0].get_taxa().

  Returns the list of taxa in the derived order.
  Raises IndexError when `trees` is empty (from trees[0]).
  """
  # Collapse identical topologies so each distinct one is parsed only once.
  dtops = Counter(toNewick(tree, topologyOnly=1) for tree in trees)

  # A float divisor guards against Python 2 floor division silently
  # truncating the weighted term should taxaDistance yield integer distances.
  ntrees = float(len(trees))

  averagedPosteriorDistances = defaultdict(lambda : 0)

  for tx, count in dtops.items():
    tree = parseNewick(tx)
    distances = dict()
    # taxaDistance fills `distances` in place (presumably keyed by taxon
    # pairs, since the result is looked up by pair below -- confirm).
    taxaDistance(tree, tree.root, distances)
    for k in distances:
      averagedPosteriorDistances[k] += distances[k] * count / ntrees

  taxa = sorted(trees[0].get_taxa())
  # Mirror every entry so a pair can be looked up in either order.
  for j,k in allPairs(taxa) :
    averagedPosteriorDistances[k,j] = averagedPosteriorDistances[j,k]

  def groupDistance(g1, g2, dm) :
    # Mean distance over all cross pairs of the two groups.
    return mean([dm[i] for i in itertools.product(g1,g2)])

  def gid(g1, g2, dm) :
    # Distances across the junction of the concatenation g1 + g2.  The first
    # entry is dm[g1[-1], g2[0]], the pair that becomes adjacent; later
    # entries pair elements progressively farther from the junction.  The
    # lists are compared lexicographically to rank candidate orientations.
    junction = []
    for n in range(len(g1)+len(g2) - 1) :
      for i in range(-1,max(-len(g1)-1,-(len(g2)-n+1)), -1) :
        junction.append(dm[g1[i], g2[n-(i+1)]])
    return junction

  groups = [[x] for x in taxa]
  dm = averagedPosteriorDistances
  while len(groups) > 1:
    # Find the two closest groups; ties are broken by the index pair.
    _, (ij,ik) = min((groupDistance(groups[j],groups[k],dm), (j,k))
                     for j,k in allPairs(range(len(groups))))
    g1,g2 = groups[ij],groups[ik]

    # Candidate orientations, by index:
    #   0: g1 + g2             1: g1 + reversed(g2)
    #   2: g2 + g1             3: reversed(g2) + g1
    # Reversing g1 as well only yields mirror images of these four.
    dis = (gid(g1,g2,dm), gid(g1,list(reversed(g2)),dm),
           gid(g2,g1,dm), gid(list(reversed(g2)),g1,dm))
    o = min(zip(dis,range(4)))[1]
    if o & 1 :
      g2 = list(reversed(g2))
    if o > 1 :
      g1,g2 = g2,g1
    groups[ij] = g1 + g2
    del groups[ik]
  otaxa = groups[0]
  return otaxa
Esempio n. 2
0
def _getGTorderFixedStree(gtree, stree, gtax, gtx, tryPairs) :
  """Search for a good ordering of gene-tree taxa under a fixed species-tree layout.

  Gene taxa are first numbered species by species, in species layout order.
  A randomised hill climb then swaps, within a species, the blocks of taxa
  that belong to the two children of a gene-tree node, keeping a swap only
  when it strictly improves the score.  With `tryPairs` set, a second hill
  climb swaps individual pairs of taxa of the same species.

  gtree    -- gene tree; gtree.root and gtree.node(id) are used.
  stree    -- species tree; stree.root is used.
  gtax     -- gene-tree terminal nodes whose positions are reported.
  gtx      -- mapping from species-node id to the list of gene-tree terminal
              nodes of that species.
  tryPairs -- when true, also run the pairwise swap phase.

  Returns (ms, mp): ms is the best score found, a tuple (tot, -htot) as
  computed by score() below; mp is the list of positions (data.o) of the
  nodes in `gtax` for that score.

  Side effects: sets data.grp, data.grpnds, data.allo, data.o and data.sz on
  gene-tree nodes.
  """
  # functools.reduce exists on Python 2.6+ and is the only spelling on
  # Python 3, where the builtin was removed.
  from functools import reduce

  allsn = getPostOrder(stree, stree.root)
  # Species taxa in layout order, that is the order they are plotted.
  # A real list is needed: it is iterated once per gene taxon below.
  stax = [sn for sn in allsn if not sn.succ]

  # For each gene tree node set
  # data.grp: the leftmost/rightmost species of taxa in clade (as indices of
  # plot positions)
  # data.grpnds: gene taxa nodes in the leftmost/rightmost species (per
  # above). Those are the only ones that can make a difference for the node
  # score. The rest are always in.
  # data.allo: per species (in layout order), the gene taxa of the clade
  # belonging to that species
  # data.sz: number of terminals in the clade
  #
  # for terminal nodes set
  # data.o: Ordinal number in sequence

  # all terminals contain data.snode
  no = 0
  for i,n in enumerate(stax):
    for gn in gtx[n.id] :
      gn.data.grp = [i,i]
      gn.data.grpnds = [[gn],[gn]]
      gn.data.allo = [[] if k != n else [gn] for k in stax]
      gn.data.o = no
      no += 1
      gn.data.sz = 1

  gtpost = _getRandPostOrder(gtree, gtree.root)
  # Candidate swaps: (node, [(left taxa, right taxa) per shared species],
  # number of species present in the clade).
  swaps = []
  for n in gtpost :
    if n.succ :
      sns = [gtree.node(x) for x in n.succ]
      l,r = zip(*[x.data.grp for x in sns])
      n.data.grp = min(l),max(r)
      lg,rg = zip(*[x.data.grpnds for x in sns])
      # Only children reaching the clade's extreme species contribute
      # boundary nodes.
      n.data.grpnds = reduce(_plus, [y for x,y in zip(l,lg) if n.data.grp[0] == x]),\
                       reduce(_plus, [y for x,y in zip(r,rg) if n.data.grp[1] == x])
      n.data.sz = sum([x.data.sz for x in sns])
      # Materialised because it is traversed up to three times; unpacking
      # each entry into two lists assumes exactly two children per node.
      aa = list(zip(*[x.data.allo for x in sns]))
      n.data.allo = [cl+cr for cl,cr in aa]

      # Species in which both children have taxa: only there does the
      # relative order of the two children matter.
      sw = [(cl,cr) for cl,cr in aa if len(cr) and len(cl)]

      if len(sw) :
        s = sum([1 for cl,cr in aa if len(cr) or len(cl)])
        if s > 1:
          swaps.append((n, sw, s))

  # Internal gene-tree nodes; a list, since score() walks it on every call.
  nint = [nd for nd in gtpost if len(nd.succ)]
  def score(nint) :
    # For every internal node, dd is the number of positions inside the span
    # of its boundary taxa that are not occupied by the clade itself.
    # Returns (total gap, -gap weighted by data.ht) so that tuple comparison
    # minimises the total first and then prefers the larger weighted sum.
    # NOTE(review): data.ht is presumably the node height -- confirm.
    htot, tot = 0.0, 0
    for x in nint :
      l,r = [[z.data.o for z in u] for u in x.data.grpnds]
      dd = ((max(r) - min(l) + 1) - x.data.sz)
      if dd > 0 :
        tot += dd
        htot += dd * x.data.ht
    return tot, -htot

  ms = score(nint)
  mp = [x.data.o for x in gtax]
  # Sentinel worse than any real score, so the first pass always runs.
  worst = (float("inf"), 0)
  msLast = worst

  # Phase 1: repeat full passes over the candidate swaps, in random order,
  # until a pass yields no improvement.
  while ms < msLast:
    msLast = ms
    random.shuffle(swaps)

    for swap in swaps :
      # Saved (position, node) pairs for rolling the swap back.
      sv = []
      for left,right in swap[1] :
        lo,ro = [x.data.o for x in left],  [x.data.o for x in right]

        mlo, mro = min(lo),min(ro)
        ll, lr = len(left),len(right)

        sv.extend([(n.data.o,n) for n in left+right])

        # Exchange the two blocks: whichever currently starts first moves
        # behind the other; offsets within each block are preserved.
        if mlo < mro :
          ls = mlo + lr
          rs = mlo
        else :
          ls = mro
          rs = mro + ll

        for k,n in zip([x - mlo for x in lo], left):
          n.data.o = ls + k
        for k,n in zip([x - mro for x in ro], right):
          n.data.o = rs + k

      s = score(nint)

      if s < ms :
        ms = s
        mp = [x.data.o for x in gtax]
      else :
        # No strict improvement: restore the previous positions.
        for k,n in sv :
          n.data.o = k

  if tryPairs:
    # Phase 2: swap single pairs of taxa within each species until a full
    # pass yields no improvement.
    msLast = worst
    while ms < msLast:
      msLast = ms
      for kk in gtx:
        gtxkk = gtx[kk]
        for i0,i1 in allPairs(range(len(gtxkk))) :
          sw = [gtxkk[x].data.o for x in (i0,i1)]
          gtxkk[i1].data.o, gtxkk[i0].data.o = sw
          s = score(nint)
          if s < ms :
            ms = s
            mp = [x.data.o for x in gtax]
          else :
            # Undo the exchange.
            gtxkk[i0].data.o, gtxkk[i1].data.o = sw

  return ms,mp