Ejemplo n.º 1
0
def treeFromSeqs(seqs, tax = None, matchScores = None, correction = True,
                 weights = None, asString = False, verbose = None, saveDists = None) :
  """ Build a tree for 'seqs' from their pairwise distances.

  The sequences are sorted, handed to calign.distances() together with their
  original positions (as 'reorder'), and the resulting distances are passed to
  treeFromDists().

  seqs        -- the sequences; must have the same length as 'tax' when 'tax'
                 is given.
  tax         -- optional taxa names, forwarded to treeFromDists().
  matchScores -- forwarded to calign.distances() as 'scores'.
  correction  -- when true distances are reported using calign.JCcorrection,
                 otherwise using calign.DIVERGENCE.
  weights     -- forwarded to treeFromDists().
  asString    -- forwarded to treeFromDists().
  verbose     -- optional writable stream (needs flush()); when set, a
                 progress line is written to it.
  saveDists   -- optional indexable with at least three items; items 0 and 1
                 are passed positionally to saveDistancesMatrix() (before and
                 after the distances) and item 2 as 'compress'.

  Return the pair (tree, distances).
  """
  assert tax is None or len(seqs) == len(tax), "args???"

  if verbose :
    # Run time estimate from the combined sequence lengths.
    # NOTE(review): presumably sumPairs() yields the total number of alignment
    # cells over all pairs - confirm against its definition.
    nCells = sumPairs([len(s) for s in seqs])
    estimate = nCells * (1/alCellsPerSecond)
    # Trailing comma: no newline, so the elapsed time printed at the end lands
    # on the same line.
    print >> verbose, len(seqs),tohms(estimate),time.strftime("%T"),
    verbose.flush()
    started = time.clock()

  # Sort the sequences while remembering where each one came from.
  paired = zip(seqs,count())
  sortedSeqs,origIndex = zip(*sorted(paired))

  if correction :
    reportKind = calign.JCcorrection
  else :
    reportKind = calign.DIVERGENCE

  dists = calign.distances(sortedSeqs, align = True, scores = matchScores,
                           reorder = origIndex, report = reportKind)

  if saveDists is not None :
    target,extra,doCompress = saveDists[0],saveDists[1],saveDists[2]
    saveDistancesMatrix(target, dists, extra, compress = doCompress)

  tree = treeFromDists(dists, tax = tax, weights = weights, asString = asString)

  if verbose :
    elapsed = time.clock() - started
    print >> verbose, tohms(elapsed), time.strftime("%T")

  return tree,dists
Ejemplo n.º 2
0
def findDuplicates(allSeqs, verbose = None) :
  """ Locate duplicate sequences in 'allSeqs'.

  Return a list C with one entry for each unique sequence that has more than
  one copy, where C[k] is the list of index numbers (in increasing order) of
  all identical copies. Entries of C are ordered by the index of their first
  copy, so the result does not depend on dictionary ordering.

  allSeqs -- iterable of hashable sequences (e.g. strings).
  verbose -- optional writable stream; when set, the elapsed time is written
             to it.
  """

  if verbose:
    tstart = time.clock()

  # Equal sequences share one dictionary key, so a single grouping pass is
  # enough; no need to bucket by length first.
  copies = defaultdict(list)
  for ns,s in enumerate(allSeqs):
    copies[s].append(ns)

  cc = [si for si in copies.values() if len(si) > 1]
  # Every group starts with a distinct index, so this orders groups by their
  # first occurrence.
  cc.sort()

  if verbose:
    print >> verbose, "find duplicates in ", tohms(time.clock() - tstart)

  return cc
Ejemplo n.º 3
0
def assembleTree(trees, thFrom, thTo, getSeqForTaxon,
                 nMaxReps = 20, maxPerCons = 100,
                 lowDiversity = 0.02, refineFactor = 1.1, refineUpperLimit = .15,
                 verbose = None) :
  cahelpers = dict()
  cahelper = lambda t : cahelpers.get(t.name) or \
             (cahelpers.update([(t.name,CAhelper(t))])
              or cahelpers.get(t.name))

  if verbose:
    print >> verbose, "cutting",len(trees),"trees at %g" % thFrom
  
  # cut trees at thFrom
  pseudoTaxa  = cutForestAt(trees, thFrom, cahelper)
  nReps = len(pseudoTaxa)

  reps = [None]*nReps
  def getReps(k) :
    if not reps[k] :
      t,n = pseudoTaxa[k]
      nc = len(n.data.terms)
      if nc > 2:
        nc = min(max(int(math.log(nc,3)), 2), nMaxReps)
        r = random.sample(n.data.terms, nc)
      else :
        r = n.data.terms
      reps[k] = [getSeqForTaxon(x.data.taxon) for x in r]
    return reps[k]

  cons = [None]*nReps
  def getCons(k) :
    if not cons[k] :
      t,n = pseudoTaxa[k]
      nc = len(n.data.terms)
      if nc > maxPerCons :
        i = random.sample(n.data.terms, maxPerCons)
      else :
        i = n.data.terms
      sq = [getSeqForTaxon(x.data.taxon) for x in i]
      # s, r = align.mpc(sq, nRefines=0)
      # del r
      s = doTheCons(sq, n.data.rh)
      #al = align.seqMultiAlign(sorted(sqs, reverse=1))
      #s = align.stripseq(align.cons(calign.createProfile(al)))
      
      cons[k] = s
    return cons[k]

  mhs = []
  for t,n in pseudoTaxa:
    cahelper(t) # populate rh
    mhs.append(n.data.rh)

  # if both low diversity - use consensus. If not valid or close to cluster height, do the
  # means thing. If not low diversity, use log representatives
  # low less then 4%??
  ## lowDiversity = 0.02
  ## refineFactor = 1.1
  ## refineUpperLimit = .15
  
  # counts how many alignments done (for display)
  global acnt
  acnt = 0
  
  def getDist(i,j) :
    mi,mj = mhs[i],mhs[j]
    anyCons = False
    if mi < lowDiversity :
      ri = [getCons(i)]
      anyCons = True
    else :
      ri = getReps(i)
    if mj < lowDiversity :
      rj = [getCons(j)]
      anyCons = True
    else :
      rj = getReps(j)

    nhs = len(ri)*len(rj)
    if nhs == 1 :
      h = calign.globalAlign(ri[0], rj[0], scores = defaultMatchScores,
                             report = calign.JCcorrection)
    else :
      ap = calign.allpairs(ri, rj, align=True, scores = defaultMatchScores,
                           report = calign.JCcorrection)
      h = sum([sum(x) for x in ap])/nhs
      
    global acnt
    acnt += nhs
    
    lowLim = 2*max(mi,mj)
    
    if anyCons and (h < lowLim or (h < refineUpperLimit and h < lowLim*refineFactor)) :
      xri = getReps(i) if len(ri) == 1 else ri
      xrj = getReps(j) if len(rj) == 1 else rj

      if ri != xri or rj != xrj :
        ap1 = calign.allpairs(xri, xrj, align=True, scores = defaultMatchScores,
                              report = calign.JCcorrection)
        h1 = sum([sum(x) for x in ap1])

        xnhs = (len(xri)*len(xrj))
        acnt += xnhs
      
        h = (h * nhs + h1)/(nhs + xnhs)
    return max(h, lowLim)

  if verbose :
    print >> verbose, "assembling",nReps,"sub-trees into one tree",time.strftime("%T")
    print "n-sub-tree #pair-only-alignments #alignments time"
    verbose.flush()
    tnow = time.clock()

  # Use array. those can get big
  ds = array.array('f',repeat(0.0,nPairs(nReps)))

  pos = 0
  for i in range(nReps-1) :
    for j in range(i+1, nReps) :
      ds[pos] = getDist(i,j)
      pos += 1

    if verbose :
      dn = sum(range(nReps-1, nReps-i-2,-1))
      print >> verbose, i, dn, "%4.3g%%" % ((100.*dn)/len(ds)), acnt, time.strftime("%T")

  if verbose :
    print >> verbose, tohms(time.clock() - tnow), time.strftime("%T")

  # Using correct weights can throw off the height guarantee, or not?
  wt = [len(n.data.terms) for t,n in pseudoTaxa]

  tnew = treeFromDists(ds, tax = [str(x) for x in range(nReps)], weights = wt)

  del ds

  for n in getPostOrder(tnew) :
    if not n.succ :
      t,nd = pseudoTaxa[int(n.data.taxon)]
      if len(nd.data.terms) == 1 :
        n.data.taxon = nd.data.taxon
        n.data.rtree = "%s:%f" % (n.data.taxon, n.data.branchlength)
      else :
        # Insure heights are there
        cahelper(t)
        s = t.toNewick(nd.id)
        d = n.data.branchlength - nd.data.rh
        if (d < -1e-10) :
          print "***** ERROR", d
        n.data.rtree = "%s:%f" % (s, max(d,0.0))
    else :
      ch = [tnew.node(x).data.rtree for x in n.succ]
      n.data.rtree = "(%s,%s)" % (ch[0],ch[1])
      if n.id != tnew.root :
        n.data.rtree = n.data.rtree + (":%f" % n.data.branchlength)

  trec = tnew.node(tnew.root).data.rtree
  trec = parseNewick(trec)
  return trec