Example #1
0
def _taxOrder(trees) :
  """Return the taxa of `trees` in a linear order that keeps close taxa together.

  Every tree is reduced to its topology-only Newick string; for each distinct
  topology the pairwise taxa distances reported by taxaDistance() are
  accumulated, weighted by how often that topology occurs in `trees`.
  Starting from one group per taxon, the two groups with the smallest mean
  inter-group distance are merged until a single group is left. At each merge
  four arrangements of the two groups are compared (see the loop below).

  :param trees: non-empty sequence of trees. The taxa list is taken from
                trees[0] only (assumes all trees share the same taxa -- this
                is not checked here).
  :return: list of taxa in the computed order.
  :raises IndexError: if `trees` is empty (from trees[0]), as before.
  """
  topologyCounts = Counter(toNewick(tree, topologyOnly=1) for tree in trees)

  # Dividing by a float keeps the frequency-weighted average from being
  # truncated by integer division when the file runs under Python 2 without
  # "from __future__ import division" and the distances are integers.
  nTrees = float(len(trees))

  averagedPosteriorDistances = defaultdict(lambda : 0)

  for topology, count in topologyCounts.items() :
    tree = parseNewick(topology)
    distances = dict()
    taxaDistance(tree, tree.root, distances)
    for pair, dist in distances.items() :
      averagedPosteriorDistances[pair] += dist * count / nTrees

  taxa = sorted(trees[0].get_taxa())
  # Make the table symmetric so a pair can be looked up in either order.
  for j,k in allPairs(taxa) :
    averagedPosteriorDistances[k,j] = averagedPosteriorDistances[j,k]

  def groupDistance(g1, g2, dm) :
    # Mean distance over every (member of g1, member of g2) combination.
    return mean([dm[i] for i in itertools.product(g1,g2)])

  def gid(g1, g2, dm) :
    # Distances between members of g1 (indexed from its tail, i = -1, -2, ...)
    # and members of g2 (indexed from its head at offset n). The first entry
    # is dm[g1[-1], g2[0]], i.e. the two taxa that become neighbours when g1
    # is followed by g2. The lists are compared lexicographically by the
    # caller, so earlier entries dominate the comparison.
    # NOTE(review): the index arithmetic is reproduced unchanged from the
    # original; whether it enumerates every intended pair was not verified.
    d = []
    for n in range(len(g1)+len(g2) - 1) :
      for i in range(-1,max(-len(g1)-1,-(len(g2)-n+1)), -1) :
        d.append(dm[g1[i], g2[n-(i+1)]])
    return d

  groups = [[x] for x in taxa]
  dm = averagedPosteriorDistances
  while len(groups) > 1:
    # Find the two closest groups. The (j,k) index pair is unique per entry,
    # so min() selects exactly the element a full sort would place first.
    _, (ij, ik) = min((groupDistance(groups[j], groups[k], dm), (j,k))
                      for j,k in allPairs(range(len(groups))))

    g1,g2 = groups[ij],groups[ik]
    g2Reversed = list(reversed(g2))

    # Four candidate arrangements, numbered 0..3:
    #   0: g1 + g2    1: g1 + reversed(g2)    2: g2 + g1    3: reversed(g2) + g1
    # Bit 0 of the winning number means "reverse g2"; a number above 1 means
    # "g2 goes first".
    candidates = (gid(g1, g2, dm), gid(g1, g2Reversed, dm),
                  gid(g2, g1, dm), gid(g2Reversed, g1, dm))
    o = min(zip(candidates, range(4)))[1]
    if o & 1 :
      g2 = g2Reversed
    if o > 1 :
      g1,g2 = g2,g1

    # The merged group replaces the first one; the second is dropped.
    groups[ij] = g1 + g2
    del groups[ik]

  return groups[0]
Example #2
0
def treeFromDists(ds, tax = None, weights = None, asString = False) :
  """Build a tree from the distances `ds` via calign.upgma().

  Every merge distance returned by calign.upgma() is halved before the rows
  are handed to upgma2tree(). When `weights` is given and at least one weight
  exceeds 1, the fourth field of each row is recomputed from scratch,
  counting every original leaf as 1 (presumably a cluster size -- confirm
  against upgma2tree).

  :param ds: distances, passed to calign.upgma() unchanged.
  :param tax: taxa labels, passed to upgma2tree().
  :param weights: optional per-leaf weights, passed to calign.upgma().
  :param asString: when true return the text produced by upgma2tree(),
                   otherwise the result of parseNewick() on that text.
  """
  merges = calign.upgma(ds, weights = weights)

  if weights and any(w > 1 for w in weights) :
    nLeaves = len(weights)
    rebuilt = []

    def leafCount(idx) :
      # Indices below nLeaves are original leaves and count as one; larger
      # indices refer to a row that has already been appended to `rebuilt`.
      if idx < nLeaves :
        return 1
      return rebuilt[idx - nLeaves][3]

    for left, right, dist, _ in merges :
      total = leafCount(left) + leafCount(right)
      rebuilt.append([left, right, dist/2, total])
    merges = rebuilt
  else :
    # Only the distance changes here; the fourth field is kept as returned.
    merges = tuple((left, right, dist/2, size)
                   for left, right, dist, size in merges)

  newick = upgma2tree(merges, tax)
  if asString :
    return newick
  return parseNewick(newick)
def checkStr(s):
    """Score parseNewick.parseNewick against parseNewick_sol.parseNewick on `s`.

    Both parsers are run on `s` and their results are handed to compareTree().
    A 0 among the returned codes gives a score of 0. Otherwise the score
    starts at 1 and loses 0.15, 0.1, 0.25 and 0.25 for codes 1, 2, 3 and 4
    respectively. Any exception raised along the way also yields 0.
    """
    # Applied in this order so the floating point result is unchanged.
    penalties = ((1, 0.15), (2, 0.1), (3, 0.25), (4, 0.25))
    try:
        expected = parseNewick_sol.parseNewick(s)
        actual = parseNewick.parseNewick(s)
        codes = compareTree(expected, actual)

        if 0 in codes:
            return 0

        score = 1
        for code, cost in penalties:
            if code in codes:
                score -= cost
        return score
    except Exception:
        # Deliberate: a parser that crashes simply earns no credit.
        return 0
def checkStr(s):
    """Return a score in [0, 1] comparing two Newick parsers on the string `s`.

    parseNewick_sol.parseNewick(s) and parseNewick.parseNewick(s) are compared
    with compareTree(), which yields a collection of codes:
      * code 0 present -> score 0
      * code 1 costs 0.15, code 2 costs 0.1, codes 3 and 4 cost 0.25 each
    Any exception (from either parser or from the comparison) results in 0.
    """
    deductions = {1: 0.15, 2: 0.1, 3: 0.25, 4: 0.25}
    try:
        found = compareTree(parseNewick_sol.parseNewick(s),
                            parseNewick.parseNewick(s))
        if 0 in found:
            return 0

        score = 1
        # Fixed order 1..4 keeps the subtraction sequence, and therefore the
        # floating point result, the same as a chain of four if-statements.
        for code in (1, 2, 3, 4):
            if code in found:
                score -= deductions[code]
        return score
    except Exception:
        # Failures are intentionally scored as zero rather than propagated.
        return 0
Example #5
0
  def next(self) :
    """Return the tree described by the current token, then advance.

    The current token (self.treetok) must be truthy and start with 'tree';
    otherwise StopIteration is raised. Its second element has the form
    "name = [options] newick". The options 'R', 'U' and 'W' (as returned by
    _parseOptions) set rooted / unrooted / weight. When self.taxatable is
    set, every terminal's taxon is replaced by its entry in the table.

    :raises StopIteration: when there is no current 'tree' token.
    :raises RuntimeError: "translation failed" when a taxon is missing from
                          self.taxatable.
    """
    current = self.treetok
    if not current or current[0] != 'tree' :
      raise StopIteration

    definition = current[1]
    # The stream is advanced before the definition is parsed.
    self.treetok = next(self.itokens)

    # Split at the first '=' only: name on the left, everything else on the right.
    tname, separator, newick = definition.strip().partition('=')
    assert separator
    tname = tname.strip()
    newick = newick.strip()

    rooted, weight = False, 1.0
    if newick[0] == '[' :
      options, newick = _parseOptions(newick)
      # 'U' wins over 'R' when both are present.
      rooted = 'R' in options and 'U' not in options
      if 'W' in options :
        weight = float(options['W'])

    tree = parseNewick(newick.strip(), weight=weight, rooted=rooted,
                       name=tname.split()[0],
                       loadAttributes = self.withAttributes)

    table = self.taxatable
    if table :
      for nodeId in tree.get_terminals() :
        data = tree.node(nodeId).data
        try :
          data.taxon = table[data.taxon]
        except (ValueError, KeyError) :
          raise RuntimeError("translation failed")

    return tree
Example #6
0
def assembleTree(trees, thFrom, thTo, getSeqForTaxon,
                 nMaxReps = 20, maxPerCons = 100,
                 lowDiversity = 0.02, refineFactor = 1.1, refineUpperLimit = .15,
                 verbose = None) :
  """Assemble one tree from the sub-trees obtained by cutting `trees` at `thFrom`.

  Steps, as performed by the code below:
    1. cutForestAt(trees, thFrom, cahelper) yields (tree, node) pairs, called
       "pseudo taxa" here.
    2. A distance is computed for every pair of pseudo taxa by aligning
       consensus and/or representative sequences (see getDist).
    3. treeFromDists() builds a tree over the pseudo taxa, using the number
       of terminals of each pseudo taxon as its weight.
    4. Every leaf of that tree is replaced by the Newick text of the sub-tree
       it stands for, and the assembled text is parsed with parseNewick().

  :param trees: forest handed to cutForestAt().
  :param thFrom: height at which the trees are cut.
  :param thTo: NOTE(review): not referenced anywhere in this function.
  :param getSeqForTaxon: callable returning the sequence for a taxon.
  :param nMaxReps: upper bound on the number of randomly drawn
                   representatives per pseudo taxon.
  :param maxPerCons: maximum number of sequences fed into a consensus.
  :param lowDiversity: pseudo taxa whose `rh` is below this value are
                       represented by a single consensus sequence.
  :param refineFactor: together with refineUpperLimit, decides when a
                       consensus based distance is re-estimated using
                       representatives (see getDist).
  :param refineUpperLimit: see refineFactor.
  :param verbose: when truthy, an object usable with `print >>` and having
                  flush(); receives progress messages.
  :return: the tree parsed from the assembled Newick string.

  Side effects: resets the module-level global `acnt` to 0 and adds the
  number of sequence pairs aligned to it. Uses random.sample, so results
  depend on the state of the `random` module.
  """
  # One CAhelper per tree, keyed by tree name and created on first use.
  # dict.update() returns None, so the right-hand `or` falls through to the
  # freshly stored helper.
  cahelpers = dict()
  cahelper = lambda t : cahelpers.get(t.name) or \
             (cahelpers.update([(t.name,CAhelper(t))])
              or cahelpers.get(t.name))

  if verbose:
    print >> verbose, "cutting",len(trees),"trees at %g" % thFrom
  
  # cut trees at thFrom
  pseudoTaxa  = cutForestAt(trees, thFrom, cahelper)
  nReps = len(pseudoTaxa)

  # Lazily filled cache: representative sequences per pseudo taxon.
  reps = [None]*nReps
  def getReps(k) :
    # Representative sequences for pseudo taxon k. With more than two
    # terminals a random sample of min(max(int(log3(count)), 2), nMaxReps)
    # terminals is drawn; otherwise all terminals are used.
    if not reps[k] :
      t,n = pseudoTaxa[k]
      nc = len(n.data.terms)
      if nc > 2:
        nc = min(max(int(math.log(nc,3)), 2), nMaxReps)
        r = random.sample(n.data.terms, nc)
      else :
        r = n.data.terms
      reps[k] = [getSeqForTaxon(x.data.taxon) for x in r]
    return reps[k]

  # Lazily filled cache: consensus sequence per pseudo taxon.
  cons = [None]*nReps
  def getCons(k) :
    # Consensus for pseudo taxon k, built by doTheCons() from the sequences
    # of at most maxPerCons (randomly sampled) terminals.
    if not cons[k] :
      t,n = pseudoTaxa[k]
      nc = len(n.data.terms)
      if nc > maxPerCons :
        i = random.sample(n.data.terms, maxPerCons)
      else :
        i = n.data.terms
      sq = [getSeqForTaxon(x.data.taxon) for x in i]
      # s, r = align.mpc(sq, nRefines=0)
      # del r
      s = doTheCons(sq, n.data.rh)
      #al = align.seqMultiAlign(sorted(sqs, reverse=1))
      #s = align.stripseq(align.cons(calign.createProfile(al)))
      
      cons[k] = s
    return cons[k]

  # n.data.rh for every pseudo taxon, in pseudoTaxa order (presumably the
  # height of the sub-tree root -- confirm against CAhelper).
  mhs = []
  for t,n in pseudoTaxa:
    cahelper(t) # populate rh
    mhs.append(n.data.rh)

  # if both low diversity - use consensus. If not valid or close to cluster height, do the
  # means thing. If not low diversity, use log representatives
  # low less then 4%??
  ## lowDiversity = 0.02
  ## refineFactor = 1.1
  ## refineUpperLimit = .15
  
  # counts how many alignments done (for display)
  global acnt
  acnt = 0
  
  def getDist(i,j) :
    # Distance between pseudo taxa i and j; never less than 2*max(rh_i, rh_j).
    mi,mj = mhs[i],mhs[j]
    anyCons = False
    # Each side is either a single consensus (low diversity) or a list of
    # representatives.
    if mi < lowDiversity :
      ri = [getCons(i)]
      anyCons = True
    else :
      ri = getReps(i)
    if mj < lowDiversity :
      rj = [getCons(j)]
      anyCons = True
    else :
      rj = getReps(j)

    # Number of sequence pairs aligned for this estimate.
    nhs = len(ri)*len(rj)
    if nhs == 1 :
      h = calign.globalAlign(ri[0], rj[0], scores = defaultMatchScores,
                             report = calign.JCcorrection)
    else :
      ap = calign.allpairs(ri, rj, align=True, scores = defaultMatchScores,
                           report = calign.JCcorrection)
      # Mean over all pairs.
      h = sum([sum(x) for x in ap])/nhs
      
    global acnt
    acnt += nhs
    
    lowLim = 2*max(mi,mj)
    
    # A consensus based estimate that falls below the limit, or just above
    # it while still under refineUpperLimit, is refined using representatives
    # in place of the consensus.
    if anyCons and (h < lowLim or (h < refineUpperLimit and h < lowLim*refineFactor)) :
      xri = getReps(i) if len(ri) == 1 else ri
      xrj = getReps(j) if len(rj) == 1 else rj

      if ri != xri or rj != xrj :
        ap1 = calign.allpairs(xri, xrj, align=True, scores = defaultMatchScores,
                              report = calign.JCcorrection)
        # Sum (not mean) over the additional pairs; averaged below.
        h1 = sum([sum(x) for x in ap1])

        xnhs = (len(xri)*len(xrj))
        acnt += xnhs
      
        # Pooled mean over the first and the second batch of pairs.
        h = (h * nhs + h1)/(nhs + xnhs)
    return max(h, lowLim)

  if verbose :
    print >> verbose, "assembling",nReps,"sub-trees into one tree",time.strftime("%T")
    # NOTE(review): this header goes to stdout, not to `verbose`.
    print "n-sub-tree #pair-only-alignments #alignments time"
    verbose.flush()
    # NOTE(review): time.clock() no longer exists in Python 3.8+.
    tnow = time.clock()

  # Use array. those can get big
  ds = array.array('f',repeat(0.0,nPairs(nReps)))

  # Fill ds row by row with the distances for all pairs i < j.
  pos = 0
  for i in range(nReps-1) :
    for j in range(i+1, nReps) :
      ds[pos] = getDist(i,j)
      pos += 1

    if verbose :
      # Number of pairs completed so far (rows 0..i).
      dn = sum(range(nReps-1, nReps-i-2,-1))
      print >> verbose, i, dn, "%4.3g%%" % ((100.*dn)/len(ds)), acnt, time.strftime("%T")

  if verbose :
    print >> verbose, tohms(time.clock() - tnow), time.strftime("%T")

  # Using correct weights can throw off the height guarantee, or not?
  wt = [len(n.data.terms) for t,n in pseudoTaxa]

  # Leaves of the new tree are labelled "0".."nReps-1", the pseudo taxon index.
  tnew = treeFromDists(ds, tax = [str(x) for x in range(nReps)], weights = wt)

  del ds

  # Build the Newick text bottom-up in n.data.rtree; post-order guarantees
  # children are done before their parent.
  for n in getPostOrder(tnew) :
    if not n.succ :
      # Leaf: map the label back to its pseudo taxon.
      t,nd = pseudoTaxa[int(n.data.taxon)]
      if len(nd.data.terms) == 1 :
        n.data.taxon = nd.data.taxon
        n.data.rtree = "%s:%f" % (n.data.taxon, n.data.branchlength)
      else :
        # Insure heights are there
        cahelper(t)
        s = t.toNewick(nd.id)
        # The sub-tree already accounts for nd.data.rh, so only the remainder
        # of the branch is attached above it.
        d = n.data.branchlength - nd.data.rh
        if (d < -1e-10) :
          # NOTE(review): reported on stdout only; the value is clamped below.
          print "***** ERROR", d
        n.data.rtree = "%s:%f" % (s, max(d,0.0))
    else :
      # Internal node: join the two children; the root gets no branch length.
      ch = [tnew.node(x).data.rtree for x in n.succ]
      n.data.rtree = "(%s,%s)" % (ch[0],ch[1])
      if n.id != tnew.root :
        n.data.rtree = n.data.rtree + (":%f" % n.data.branchlength)

  trec = tnew.node(tnew.root).data.rtree
  trec = parseNewick(trec)
  return trec