Esempio n. 1
0
def mpat(tr, seqs) :
  """Align the sequences on the tips of ``tr`` bottom-up and return the
  profile stored at the root.

  ``seqs`` is anything ``dict()`` accepts that maps a tip's ``data.taxon``
  to its sequence.  Nodes are visited in the order given by
  ``getPostOrder(tr)``; each visited node gets ``data.seq`` set to a pair
  ``(profile, None)`` (internal node) or ``(None, sequence)`` (tip).

  Every internal node must have exactly two successors.  Two tip sequences
  are merged with ``calign.globalAlign``; any other combination is merged
  with ``calign.prof2profAlign`` and the result is passed through ``trimp``.

  Raises ``KeyError`` when a tip's taxon is missing from ``seqs``.
  """
  taxonToSeq = dict(seqs)

  def profileOf(entry) :
    # A bare sequence becomes a one-row profile (entry[1:] is the one-element
    # tuple holding it); an existing profile is used as is.
    if entry[1] :
      return calign.createProfile(entry[1:])
    return entry[0]

  for node in getPostOrder(tr) :
    if not node.succ :
      # Tip: only the raw sequence is known so far.
      node.data.seq = (None, taxonToSeq[node.data.taxon])
      continue

    left, right = [tr.node(child).data.seq for child in node.succ]
    if left[1] and right[1] :
      # Both children are plain sequences: pairwise global alignment.
      merged = calign.createProfile(calign.globalAlign(left[1], right[1]))
    else :
      # At least one child already carries a profile.
      merged = trimp(calign.prof2profAlign(profileOf(left), profileOf(right)))
    node.data.seq = (merged, None)

  # The traversal is expected to finish on the root.
  assert node.id == tr.root
  return node.data.seq[0]
Esempio n. 2
0
def mpa(tr, seqs, scores = defaultMatchScores, trimEnd = None) :
  """Align the sequences on the tips of ``tr`` bottom-up and return the
  profile stored at the root.

  ``seqs`` is anything ``dict()`` accepts that maps a taxon name to its
  sequence; single quotes around a tip's ``data.taxon`` are stripped before
  the lookup.  ``scores`` is forwarded to ``calign.globalAlign`` and
  ``calign.prof2profAlign``.  When ``trimEnd`` is not ``None`` every
  profile/profile alignment is passed through ``trimendsp(pa, trimEnd)``.

  Nodes are visited in the order given by ``getPostOrder(tr)``; each
  visited node gets ``data.seq`` set to ``(profile, None)`` (internal node)
  or ``(None, sequence)`` (tip).  Every internal node must have exactly two
  successors.

  Raises ``KeyError`` when a tip's taxon is missing from ``seqs``.
  """
  taxonToSeq = dict(seqs)

  def profileOf(entry) :
    # A bare sequence becomes a one-row profile (entry[1:] is the one-element
    # tuple holding it); an existing profile is used as is.
    if entry[1] :
      return calign.createProfile(entry[1:])
    return entry[0]

  for node in getPostOrder(tr) :
    if not node.succ :
      # Tip: only the raw sequence is known so far.
      node.data.seq = (None, taxonToSeq[node.data.taxon.strip("'")])
      continue

    left, right = [tr.node(child).data.seq for child in node.succ]
    if left[1] and right[1] :
      # Both children are plain sequences: pairwise global alignment.
      pairwise = calign.globalAlign(left[1], right[1], scores = scores)
      merged = calign.createProfile(pairwise)
    else :
      # At least one child already carries a profile.
      merged = calign.prof2profAlign(profileOf(left), profileOf(right), scores = scores)
      if trimEnd is not None :
        merged = trimendsp(merged, trimEnd)
    node.data.seq = (merged, None)

  # The traversal is expected to finish on the root.
  assert node.id == tr.root
  return node.data.seq[0]
Esempio n. 3
0
def mpc(seqs, nRefines = 4, gapPenalty = defaultMatchScores.gap) :
  """Align ``seqs``, refine the alignment, and return
  ``(consensus, alignment)``.

  The sequences are aligned with ``seqMultiAlign`` using the default match
  scores with ``gap`` replaced by ``gapPenalty``.  The alignment is then
  refined once with ``refineSingle``, and refined again for as long as the
  stripped consensus keeps changing, up to ``nRefines`` further rounds.
  """
  def consensusOf(alignment) :
    # Stripped consensus of the alignment's profile.
    return stripseq(cons(calign.createProfile(alignment)))

  initial = seqMultiAlign(seqs, scores = defaultMatchScores._replace(gap = gapPenalty))
  previous = consensusOf(initial)

  # The first refinement always happens, even when nRefines is 0.
  refined = refineSingle(initial, gapPenalty = gapPenalty)
  current = consensusOf(refined)

  rounds = 0
  while previous != current and rounds < nRefines :
    previous = current
    refined = refineSingle(refined, gapPenalty = gapPenalty)
    current = consensusOf(refined)
    rounds += 1

  return current, refined
Esempio n. 4
0
def refineSingle(al, gapPenalty = defaultMatchScores.gap) :
  """Re-align every row of ``al`` against the profile of all the other rows.

  The profile of the whole input alignment is computed once; for each row
  its own single-row profile is subtracted and the row is aligned to the
  remainder with ``calign.profileAlign``.  Rows are therefore all refined
  against the *original* alignment, not against rows refined earlier in the
  same call.

  Returns the list of ``calign.profileAlign`` results, one per input row, in
  input order.
  """
  rows = al if isinstance(al, list) else list(al)
  whole = calign.createProfile(rows)

  def othersProfile(row) :
    # Column-wise counts of the whole alignment minus this row's counts.
    own = calign.createProfile([row])
    return tuple([[t - o for t, o in zip(colAll, colOwn)]
                  for colAll, colOwn in zip(whole, own)])

  return [calign.profileAlign(row, othersProfile(row), gapPenalty = gapPenalty)
          for row in rows]
Esempio n. 5
0
def doTheCons(sqs, trh, lengthQuant = 40) :
  """Return a consensus of ``sqs``, trying up to three input orders.

  The sequences are aligned with ``seqMultiAlign`` and the stripped
  consensus of the alignment is spot-checked against a random sample of at
  most five input sequences: the score is half the mean of
  ``calign.allpairs(consensus, sample, report = calign.JCcorrection)``.
  A score below ``trh`` ends the search early.

  Orders tried, in turn: by length bucket (``lengthQuant`` wide, longest
  first), the original order, then a random shuffle.

  Returns the consensus with the lowest score seen, or ``None`` if no
  attempt scored below 100000.
  """
  bucketed = sorted(((lengthQuant*(len(s)//lengthQuant), s) for s in sqs),
                    reverse=True)
  order = [s for _, s in bucketed]
  spotCheck = random.sample(sqs, min(5,len(sqs)))
  bestCons, bestScore = None, 100000

  for attempt in range(3) :
    consensus = stripseq(cons(calign.createProfile(seqMultiAlign(order))))
    score = mean(calign.allpairs(consensus, spotCheck, report = calign.JCcorrection))/2
    if score < bestScore :
      bestCons, bestScore = consensus, score

    if attempt == 2 or score < trh :
      break

    if attempt == 0 :
      # Second attempt: the caller's original order.
      order = sqs
    else :
      # Third attempt: a random order.
      order = list(sqs)
      random.shuffle(order)

  return bestCons
Esempio n. 6
0
def refineAlignment(al, ci = [0], drop = False, mx = -1, rev = False, verbose = False) :
  """Re-align a subset of the rows of ``al`` against the profile of the rest.

  al      -- the alignment (rows are strings using '-' for gaps).  A list
             is modified in place; any other iterable is copied to a list.
  ci      -- column indices used to select the candidate rows.
  drop    -- when true, the ``ci`` columns are removed from the target
             profile (``removeColumns``) before aligning and put back into
             each result (``restoreColumns``) as gaps.
  mx      -- when positive, give up if there are more than ``mx`` candidates.
  rev     -- False: candidates are rows with a non-gap in any ``ci`` column.
             True: candidates are rows with a gap in all ``ci`` columns.
  verbose -- print the alignment width and the number of candidates.

  Returns ``(al, width, changed)`` where ``changed`` counts the rows whose
  alignment differs from before.  Returns ``None`` (bare return) when the
  ``mx`` limit is exceeded.

  NOTE(review): ``ci`` has a mutable default.  This function does not
  modify it, but it is handed to ``removeColumns`` -- confirm that helper
  leaves it untouched.
  """
  if not isinstance(al, list) :
    al = list(al)

  # Profile of the complete alignment.
  p = calign.createProfile(al)
  # Candidate rows, kept as (row index, row) pairs.
  if rev:
    can = [(k,s) for k,s in enumerate(al) if all([s[x] == '-' for x in ci])]
  else :
    can = [(k,s) for k,s in enumerate(al) if any([s[x] != '-' for x in ci])]

  if verbose:
    print len(al[0]), len(can)
    import sys
    sys.stdout.flush()

  # Too many candidates: bail out without a result tuple.
  if mx > 0 and len(can) > mx:
    return

  changed = 0
  if len(can) :

    # Subtract the candidates' counts so p2 describes only the other rows.
    pr = calign.createProfile([s for n,s in can])
    p2 = tuple([[a-b for a,b in zip(x,y)] for x,y in zip(p,pr)])
    if drop:
      # rx is whatever restoreColumns needs to put the columns back.
      p2,rx = removeColumns(p2, ci)

    for n,s in can:
      # Only rows whose ungapped length fits into the target profile.
      if len(s.replace('-','')) <= len(p2) :
        ra = calign.profileAlign(s, p2)
        if drop :
          ra = restoreColumns(list(ra),rx, calign.GAP)
        al[n] = iton(ra)
        if tuple(ra) != sasn(s) :
          changed += 1

  if changed :
    # Re-aligning may leave columns that are gaps in every row; remove them.
    p = calign.createProfile(al)
    r = [k for k,i in enumerate(p) if i[calign.GAP] == len(al)]

    r1 = toRanges(r)
    if r1 :
      # Delete from the right so earlier (a,b) ranges keep their indices.
      for a,b in r1[::-1] :
        for k in range(len(al)) :
          x = al[k]
          al[k] = x[:a] + x[b:]

  return al,len(al[0]), changed
Esempio n. 7
0
def seqMultiAlign(seqs, scores = defaultMatchScores, report=False) :
  if len(seqs) < 2:
    return seqs
  
  a = calign.globalAlign(seqs[0], seqs[1], scores=scores)
  ns = 2
  p = calign.createProfile(a)

  a = tuple(iton(x) for x in a)
  
  for kk,s2 in enumerate(seqs[2:]) :
    #print ns
    # assert p == calign.createProfile(a)
    assert len(a[0]) == len(p) and \
           p[0][calign.GAP] != ns and p[-1][calign.GAP] != ns
    
    pad = 20
    if len(p)+2*pad < len(s2) :
      # enough for sequences start to align
      pad = (len(s2) - len(p))  ;                assert len(p)+2*pad >= len(s2)

    pa,extendLeft,extendRight = calign.profileAlign(s2, p, pad=pad, chop=True,
                                                    gapPenalty=defaultMatchScores.gap)

    if extendLeft > 0 or extendRight > 0 :
      gapProfile = [0,0,0,0,0,ns]
      p1 = tuple((list(gapProfile) for k in range(extendLeft))) + p \
           + tuple((list(gapProfile) for k in range(extendRight)))
    else :
      p1 = p
    
    for k,n in enumerate(pa) :
      p1[k][n] += 1
    p = p1
      
    if extendLeft > 0 or extendRight > 0 :
      fr = '-'*extendLeft
      bk = '-'*extendRight
      a = tuple((fr + x + bk for x in a))
      
    a = a + (iton(pa),)
    ns += 1

    if report and (kk+1) % 1000 == 0 :
      import sys
      print kk+1, len(a[0]),
      sys.stdout.flush()
  if report: print
  
  return a
Esempio n. 8
0
def sortByEntropy(seqs) :
  """Return the indices of ``seqs`` in column-wise sorted order.

  Each column of the profile of ``seqs`` is scored with
  ``sum(x*log(x))/N - log(N)`` over its positive counts ``x`` (``N`` is the
  number of sequences), and the columns are ordered by ascending score.
  The rows are then sorted lexicographically by their values read in that
  column order.  If the first element of the first row is not one of the
  integers 0..5, every element is first converted with ``ngtoi``.

  Note that the result is a list of row indices into ``seqs``, not the
  rows themselves.
  """
  needsTranslation = seqs[0][0] not in range(6)
  nSeqs = len(seqs)
  logN = log(nSeqs)

  profile = calign.createProfile(seqs)
  colScores = [sum([log(x) * x for x in col if x > 0])/nSeqs - logN
               for col in profile]
  # Ties between equal scores are broken by column index.
  colOrder = [k for _, k in sorted((score, k) for k, score in enumerate(colScores))]

  if needsTranslation :
    keyed = [([ngtoi(row[k]) for k in colOrder], idx) for idx, row in enumerate(seqs)]
  else :
    keyed = [([row[k] for k in colOrder], idx) for idx, row in enumerate(seqs)]
  keyed.sort()

  return [idx for _, idx in keyed]
Esempio n. 9
0
def createProfile(seqs) :
  """Return ``calign.createProfile(seqs)``; a thin module-level alias."""
  return calign.createProfile(seqs)