Example #1
def gba_map_taxa(dbname):
  # Map each Sequence row's source_taxon using the gbid -> taxid join in tax_gbs.
  cbdb = compbio.projects.cbdb
  dbi = cbdb.getName(dbname)
  tgb_dbi = cbdb.getName('tax_gbs')

  count = 0
  slc_n = 10000
  slc_ofs = 0
  max_ofs = dbi.S.q(func.max(dbi.Sequence.id)).scalar()
  while slc_ofs < max_ofs:
    # Walk the Sequence table in id slices of slc_n rows.
    for s in dbi.Session.query(dbi.Sequence).\
          filter(dbi.Sequence.id >= slc_ofs).\
          filter(dbi.Sequence.id < (slc_ofs + slc_n)):
      gbid = s.gb_id
      count += 1
      if gbid:
        join = tgb_dbi.Session.query(tgb_dbi.TaxGBJoin).\
            filter_by(gbid = gbid).first()
        if join is not None:
          s.source_taxon = join.taxid
      if count > 100:
        dbi.Session.commit()
        print 'committing'
        count = 0
    slc_ofs += slc_n
  dbi.Session.commit()
Example #2
def db_metadata(clade):
  '''
  Annotate a clade with ncbi metadata: give every node an integer id and, for
  each terminal, record its genbank accession, genbank id, and ncbi taxon id.

  inputs:
    clade: <biopython clade>

  outputs:
    none - metadata is written into each node's .m dict
           ('id' for all nodes; 'gbacc', 'gbid', 'taxid' for terminals)
  '''
  tax_dbi = cbdb.getName('taxdmp')
  tgb_dbi = cbdb.getName('tax_gbs')
  gba_dbi = cbdb.getName('gb_acc_idjoin')

  #terminal_gbaccs = map(lambda x: clade_gbacc(x), clade.get_terminals())

  for idx, t in enumerate(clade.get_terminals()):
    try:
      t.m['id'] = idx
      t.m['gbacc'] = clade_gbacc(t)
      t.m['gbid'] = gba_dbi.Session.query(gba_dbi.GBAcc).\
          filter_by(accession = t.m['gbacc']).one().gbid
      taxid = tgb_dbi.Session.query(tgb_dbi.TaxGBJoin).\
          filter_by(gbid = t.m['gbid']).one().taxid
      t.m['taxid'] = taxid
    except Exception: pass
  max_idx = idx
  # Continue the id sequence past the terminal ids for internal nodes.
  for idx, t in enumerate(clade.get_nonterminals()):
    t.m['id'] = max_idx + idx + 1
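A minimal usage sketch for db_metadata, assuming a Bio.Phylo tree whose clades carry the project's .m metadata dict (the .m attribute and the clade_gbacc helper are project conventions, not Biopython; the filename is illustrative):

from Bio import Phylo

tree = Phylo.read('16s.newick', 'newick')   # hypothetical input tree
for c in tree.find_clades():
  c.m = {}                                  # attach the metadata dict db_metadata expects
db_metadata(tree.root)
for t in tree.get_terminals():
  print t.name, t.m.get('taxid')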
Example #3
def gba_map_gbids(dbname):
  cbdb = compbio.projects.cbdb

  dbi = cbdb.getName(dbname)
  count = 0
  fail_count = 0
  sxs_count = 0

  gbacc_dbi = cbdb.getName('gb_acc_idjoin')
  slc_n = 10000
  slc_ofs = 0
  max_ofs = dbi.S.q(func.max(dbi.Sequence.id)).scalar()
  while slc_ofs < max_ofs:
    # Walk the Sequence table in id slices of slc_n rows.
    for s in dbi.Session.query(dbi.Sequence).\
          filter(dbi.Sequence.id >= slc_ofs).\
          filter(dbi.Sequence.id < (slc_ofs + slc_n)):
      hit = gbacc_dbi.Session.query(gbacc_dbi.GBAcc).\
          filter_by(accession = s.gb_accession).first()
      if hit is not None:
        s.gb_id = hit.gbid
        sxs_count += 1
      else:
        print 'failed!'
        fail_count += 1
      count += 1
      if count > 100:
        dbi.Session.commit()
        count = 0
        print 'adding'
    slc_ofs += slc_n

  dbi.Session.commit()

  print fail_count, sxs_count
Example #4
def fill_db( name = '16s', reset = True):
  # NOTE: `paths` is assumed to be defined at module scope as the list of
  # genbank files to load (see fill_all_rdb16s below for how such a list is built).
  dbi = cbdb.getName(name, tables = get_tables(),
                     reset = np.mod(reset, 2))

  count = 0
  for p in paths:
    fopen = open(p)
    a = dbi.Alignment()
    dbi.Session.add(a)
    dbi.Session.commit()
    for rec in SeqIO.parse(fopen, 'genbank'):
      f0 = rec.features[0]
      if f0.type == 'source':
        # db_xref entries look like 'taxon:NNNN'; strip the 'taxon:' prefix.
        source_taxon = f0.qualifiers['db_xref'][0][6:]
        source_organism = f0.qualifiers['organism'][0]
      else:
        source_taxon = None
        source_organism = None

      seq = dbi.Sequence(name = rec.name,
                         file_name = p,
                         file_offset = fopen.tell(),
                         sequence = str(rec.seq),
                         source_taxon = source_taxon,
                         source_organism = source_organism,
                         gb_accession = rec.id,
                         annotations = str(rec.annotations),
                         alignmentid = 0)

      dbi.Session.add(seq)
      if np.mod(count, 1000) == 0:
        print count, p, seq.source_organism
        dbi.Session.commit()
      count += 1
    dbi.Session.commit()
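The f0.qualifiers['db_xref'][0][6:] slice above assumes the first db_xref entry of the source feature is always of the form 'taxon:NNNN'. A slightly more defensive extraction, as a hedged sketch (not part of the original module):

def source_taxon_of(rec):
  # Return the ncbi taxon id from a genbank record's 'source' feature, or None.
  for f in rec.features:
    if f.type == 'source':
      for xref in f.qualifiers.get('db_xref', []):
        if xref.startswith('taxon:'):
          return xref[len('taxon:'):]
  return None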
Example #5
  def makeRank(self, rank = 'phylum', subtree = None):
    #Get the subtree and db connections to build meta for
    tree = subtree if subtree is not None else self.t
    dbi = cbdb.getName('taxdmp')

    print 'Fetching taxonomic nodes from the db'
    #Get the terminal nodes and corresponding ncbi taxa
    terms = [t for t in tree.get_terminals() if 'taxid' in t.m]
    nodes = [dbi.S.q(dbi.Node).filter_by(id = t.m['taxid']).scalar()
             for t in terms]

    #endpoints for parental iteration
    taxa = ncbi.get_rank(rank)
    root = ncbi.get_root()

    print 'Computing terminal node mappings for taxon: {0}'.format(rank)
    bar = pbar.simple(len(nodes)); bar.start()

    for idx, node in enumerate(nodes):
      bar.update(idx)
      # Climb parent links until we reach a node of the requested rank,
      # the taxonomy root, or run out of nodes entirely.
      cur = node
      while cur is not None and cur not in taxa and cur != root:
        cur = cur.parent
      terms[idx].m[rank] = cur.id if cur in taxa else None
    bar.finish()
    print 'Done!'
Example #6
def getGenealogy(node):
  '''Return the list of taxonomy nodes from the root down to `node`.'''
  root_node = taxRoot()
  dbi = cbdb.getName('taxdmp')

  path = []
  cur = node
  while cur != root_node:
    path.append(cur)
    cur = cur.parent
  path.append(cur)
  return path[::-1]
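A quick usage sketch combining getGenealogy with the get_node helper shown further down; 562 is the ncbi taxid for Escherichia coli and is only an example value:

# Print the rank and id of every node on the path from the root to E. coli.
for n in getGenealogy(get_node(562)):
  print n.rank, n.id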
Example #7
def fill_all_rdb16s(reset = True):
  paths = []
  for r, ds, fs in os.walk(config.dataPath('alignments/16s')):
    for f in fs:
      if '.gbk' in f:
        paths.append(os.path.join(r, f))
  cbdb = compbio.projects.cbdb
  dbi = cbdb.getName('16s',
                     tables = get_tables(),
                     reset = np.mod(reset, 2))
  for p in paths:
    fopen = open(p)
    a = dbi.Alignment(file_name = config.dataURL(p))
    dbi.Session.add(a)
    dbi.Session.commit()
    count = 0
    # Record offsets are tracked per file, so reset for each new path.
    last_ofs = 0

    for rec in SeqIO.parse(fopen, 'genbank'):
      try:
        src_taxon = rec.features[0].qualifiers['db_xref'][0][6:]
      except Exception:
        src_taxon = None

      ann = sjson.dumps(rec.annotations, default = lambda x: str(x))
      seq = dbi.Sequence(name = rec.name,
                         file_name = p,
                         file_offset = last_ofs,
                         sequence = str(rec.seq),
                         gb_accession = rec.id,
                         gb_accession_version = 1,
                         gb_id = None,
                         annotations = ann,
                         alignment = a,
                         source_taxon = src_taxon
                         )
      dbi.Session.add(seq)
      last_ofs = fopen.tell()
      if np.mod(count, 1000) == 0:
        print count, p, seq.source_organism
        dbi.Session.commit()
      count += 1
    dbi.Session.commit()
Example #8
def fill_db( reset = True):
  dbi = cbdb.getName('taxdmp', tables = get_tables(), reset = np.mod(reset, 2))
  filepath = config.dataPath('ncbi/taxdmp')
  maps = get_maps()

  # NCBI taxdump rows are '\t|\t'-delimited and terminated by '\t|\n'.
  record_sep = '\t|\n'
  col_sep = '\t|\t'
  colfun = lambda x: unicode(x, errors = 'replace').replace(record_sep, '').split(col_sep)
  record_iterfun = lambda x: x.xreadlines()

  fill_tables = {'Gencode': 'gencode.dmp',
                 'Node': 'nodes.dmp',
                 'Name': 'names.dmp',
                 'Citation': 'citations.dmp'}

  count = 0
  for k, v in fill_tables.iteritems():
    fopen = open(os.path.join(filepath, v))
    fsize = os.path.getsize(os.path.join(filepath, v))

    mapped_class = dbi.__dict__[k]
    mapped_columns = maps[k]
    l0 = ''
    for l in record_iterfun(fopen):
      count += 1
      # Accumulate physical lines until the record terminator is seen, since
      # a record's text fields may span multiple lines.
      l0 += l
      if l0[-3:] == record_sep:
        l = l0
        l0 = ''
      else: continue
      cols = colfun(l)
      cls = mapped_class(**dict(map(lambda (x, y): (x, cols[y]),
                                    mapped_columns.iteritems())))
      dbi.Session.merge(cls)
      if np.mod(count, 1000) == 0:
        dbi.Session.commit()
        print k, v, count, cols, '{0:.1f}%'.format(100 * float(fopen.tell()) / fsize)
    dbi.Session.commit()
  return
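For reference, a tiny standalone illustration of the split used above (trailing columns of a real nodes.dmp row are elided here):

line = '2\t|\t131567\t|\tsuperkingdom\t|\n'      # taxid, parent taxid, rank
cols = line.replace('\t|\n', '').split('\t|\t')
print cols                                       # ['2', '131567', 'superkingdom']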
Example #9
def fill_from_rfam_stk( p, reset = True):
  cbdb = compbio.projects.cbdb
  aname = os.path.basename(p)
  dbi = cbdb.getName(aname, tables = get_tables(),
                     reset = np.mod(reset, 2))

  fopen = open(p)
  a = dbi.Alignment(file_name = aname)
  dbi.Session.add(a)
  dbi.Session.commit()

  count = 0
  for rec in SeqIO.parse(fopen, 'stockholm'):
    # Rfam accessions look like 'ACCESSION.version/start-end'.
    acc = rec.annotations['accession']
    accidv, accrange = acc.split('/')
    acv_split = accidv.split('.')
    accid = acv_split[0]
    accid_version = acv_split[1] if len(acv_split) > 1 else 1

    ann = sjson.dumps(rec.annotations, default = lambda x: str(x))
    seq = dbi.Sequence(name = rec.name,
                       file_name = p,
                       file_offset = fopen.tell(),
                       sequence = str(rec.seq),
                       gb_accession = accid,
                       gb_accession_version = accid_version,
                       gb_accession_range = accrange,
                       gb_id = None,
                       annotations = ann,
                       alignment = a
                       )

    dbi.Session.add(seq)
    if np.mod(count, 100) == 0:
      print count, p, seq.source_organism
      dbi.Session.commit()
    count += 1
  dbi.Session.commit()
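The accession handling above expects Rfam-style annotations of the form ACCESSION.version/start-end; a small illustration with a made-up accession:

acc = 'AB001721.1/2735-4242'              # illustrative value only
accidv, accrange = acc.split('/')         # -> 'AB001721.1', '2735-4242'
parts = accidv.split('.')
accid = parts[0]                          # -> 'AB001721'
version = parts[1] if len(parts) > 1 else 1
print accid, version, accrange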
Example #10
  def investigatePhylum(self, 
                        aliname = 'group2.stk',
                        p_node = None, **kwargs):

    if not p_node: p_node = ncbi.taxon_with_name('phylum', 'Thermotogae')
    ali_seqs = ali.get_seqs(aliname, **mem.sr(kwargs))
    ali_nodes = array(ali.get_taxnodes(aliname, **mem.sr(kwargs)))
    ali_phyla = array(ali.get_taxon_forall(aliname, **mem.sr(kwargs, rank = 'phylum')))
    ali_inds = nonzero(equal(ali_phyla, p_node))[0]

    leaf_terminals = self.t.get_terminals()
    leaf_nodes = array(self.leafNodes(**mem.sr(kwargs)))
    leaf_phyla = array(self.getTaxon('phylum', **mem.sr(kwargs)))
    leaf_inds = nonzero(equal(leaf_phyla, p_node))[0]

    ap_sub = ali_phyla[ali_inds]
    lp_sub = leaf_phyla[leaf_inds]

    ag_sub = array(ali.get_taxon_forsome(ali_nodes[ali_inds], 'genus', 'thermo',
                                         **mem.sr(kwargs)))
    lg_sub = array(self.getTaxon('genus', **mem.sr(kwargs)))[leaf_inds]

    as_sub = array(ali.get_taxon_forsome(ali_nodes[ali_inds], 'species', 'thermo'))
    ls_sub = array(self.getTaxon('species', **mem.sr(kwargs)))[leaf_inds]

    db16 = cbdb.getName('16s')
    a_16s = [db16.S.q(db16.Sequence).
             filter_by(source_taxon = n.id).all() for n in ali_nodes[ali_inds]]
    l_16s = [db16.S.q(db16.Sequence).
             filter_by(source_taxon = n.id).all() for n in leaf_nodes[leaf_inds]]

    #fill any empty nodes (those lacking 16s rRNA) by climbing to parent taxa
    for idx, elt in enumerate(a_16s):
      cur_node = ali_nodes[ali_inds[idx]]
      while not elt:
        cur_node = cur_node.parent
        elt.extend(db16.S.q(db16.Sequence).filter_by(source_taxon = cur_node.id).all())

    for idx, elt in enumerate(l_16s):
      cur_node = leaf_nodes[leaf_inds[idx]]
      while not elt:
        cur_node = cur_node.parent
        elt.extend(db16.S.q(db16.Sequence).filter_by(source_taxon = cur_node.id).all())

    all_lens = dict([(k, [len(list(e)) for e in seqlist])
                     for seqlist, k in [[a_16s, 'a_16s'], [l_16s, 'l_16s']]])

    leaf_sns = [SeqNode(lg_sub[i], ls_sub[i], leaf_nodes[idx],
                        [(x.sequence, x.gb_id, x.source_taxon, ncbi.get_node(x.source_taxon).rank)
                         for x in l_16s[i]],
                        src = leaf_terminals[idx],
                        node_id = 'btol:default:{0}'.format(leaf_terminals[idx].m['id']))
                for i, idx in enumerate(leaf_inds)]
    ali_sns = [SeqNode(ag_sub[i], as_sub[i], ali_nodes[idx],
                       [(x.sequence, x.gb_id, x.source_taxon, ncbi.get_node(x.source_taxon).rank)
                        for x in a_16s[i]],
                       src = ali_seqs[idx],
                       node_id = 'ali:{0}:{1}'.format(aliname, ali_seqs[idx].id))
               for i, idx in enumerate(ali_inds)]

    return list(it.chain(leaf_sns, ali_sns))
Example #11
  def setLeafNodes(self, **kwargs):
    all_leaves = self.t.get_terminals()
    dbi = cbdb.getName('taxdmp')
    all_nodes = [ncbi.get_node(l.m['taxid'], dbi)
                 if 'taxid' in l.m else None for l in all_leaves]
    return all_nodes
Example #12
def taxRoot():
  # Resolve the ncbi taxonomy root via its Name row ('root').
  tax_dbi = cbdb.getName('taxdmp')
  root_node = tax_dbi.Session.query(tax_dbi.Name).filter_by(name_txt = 'root').one().node
  return root_node
Example #13
def rna4gbid(gbid, dbname = '16s'):
  #print 'giving a random RNA because the taxonomy database is not yet created!'
  # Placeholder: ignores gbid and returns a randomly chosen sequence from the db.
  dbi = cbdb.getName(dbname)
  seq_num = int(floor(1000 * random.random()))
  seq = dbi.Session.query(dbi.Sequence)[seq_num].sequence
  return seq
Example #14
def get_root():
  dbi = cbdb.getName('taxdmp')
  # The ncbi taxonomy root node has id 1.
  return dbi.S.q(dbi.Node).\
      filter_by(id = 1).one()
Example #15
def get_rank(rankname = 'phylum'):
  dbi = cbdb.getName('taxdmp')
  rank = dbi.S.q(dbi.Node).\
      filter_by(rank = rankname).all()
  return rank
Example #16
def get_node(nodeid, dbi = None):
  if dbi is None: dbi = cbdb.getName('taxdmp')
  return dbi.S.q(dbi.Node).filter_by(id = nodeid).scalar()
Example #17
  def set_seqs(**kwargs):
    cbdb = compbio.projects.cbdb
    dbname = kwargs['dbname']
    dbi = cbdb.getName(dbname)
    nodes = dbi.S.q(dbi.Sequence).all()
    return nodes
Example #18
def fill_db( name = 'bacterial_genomes', reset = False,
              postgres = False, host = 'broad'):
    dbi = cbdb.getName(
                       name,
                       postgres = postgres,
                       tables = get_tables(),
                       reset = np.mod(reset, 2), 
                       host = host)


    paths = []
    for r, ds, fs in os.walk('/Volumes/ganymede/all.gbk/'):
      for f in fs:
        if 'gbk' in f: paths.append(os.path.join(r, f))
    # Initialize the genome counter once, after collecting paths.
    count = 0

    for p in paths:
      # Skip files already loaded in a previous, partially completed run.
      if count < 1668:
        count += 1
        continue
      count += 1
      fopen = open(p)
      for rec in SeqIO.parse(fopen, 'genbank'):
        f0 = rec.features[0]
        if f0.type == 'source':
          source_taxon = f0.qualifiers['db_xref'][0][6:]
          source_organism=f0.qualifiers['organism'][0]
        else:
          source_taxon = None
          source_organism = None
          
        fa_seqpath = 'genomes/' + rec.id + '.fa'
        fa_sequrl = config.dataURL(fa_seqpath)
        fa_seqfile = config.dataPath(fa_sequrl)
        # Write the record out as fasta on a separate handle so the genbank
        # input handle (fopen) is not shadowed.
        fa_out = open(fa_seqfile, 'w')
        SeqIO.write(rec, fa_out, 'fasta')
        fa_out.close()

        adds = []
        genome = dbi.Genome(name = rec.name,
                            seq_url = fa_sequrl,
                            source_taxon = source_taxon,
                            source_organism = source_organism,
                            gb_accession = rec.id,
                            annotations = str(rec.annotations))

        #adds.append(genome)
        print 'adding genome', source_organism
        dbi.Session.add(genome)
        print 'committing update'
        dbi.Session.commit()
        print 'genome added!'
        for f in rec.features:
          feature = dbi.Feature(type = f.type,
                                start = f.location.start.position,
                                start_ext = f.location.start.extension,
                                end = f.location.end.position,
                                end_ext = f.location.end.extension,
                                strand = f.strand,
                                genomeobj = genome)
          #print 'adding feature ' + f.type
          #dbi.Session.add(feature)
          adds.append(feature)
          for k, v in f.qualifiers.iteritems():
            q = dbi.Qualifier(key = k,
                              value = str(v),
                              featureobj = feature)
            #dbi.Session.add(q)
            adds.append(q)
          for sf in f.sub_features:
            sub = dbi.SubFeature(type = sf.type,
                                 start = sf.location.start.position,
                                 start_ext = sf.location.start.extension,
                                 end = sf.location.end.position,
                                 end_ext = sf.location.end.extension,
                                 strand = sf.strand,
                                 featureobj = feature)
            adds.append(sub)
            #dbi.Session.add(sub)
            for k, v in sf.qualifiers.iteritems():
              q = dbi.Qualifier(key = k,
                                value = str(v),
                                subfeatureobj = sub)
              #Session.add(q)
              adds.append(q)
        dbi.Session.add_all(adds)

        if np.mod(count, 2) == 0:
          print count
          #print count, p, seq.source_organism
          print 'committing update'
          dbi.Session.commit()
          print 'update committed!'
      dbi.Session.commit()
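A hedged query sketch against the tables filled above, using the same cbdb session shorthand seen throughout these examples (class and attribute names follow the Genome/Feature constructors used in fill_db):

dbi = cbdb.getName('bacterial_genomes')
g = dbi.Session.query(dbi.Genome).first()
# List the first few features recorded for that genome.
for feat in dbi.Session.query(dbi.Feature).filter_by(genomeobj = g)[:10]:
  print feat.type, feat.start, feat.end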