Example #1
0
 def _loadNodes(self,ncbiDumpFile):
     """Parse the NCBI taxonomy nodes dump file into ``self.nodes``.

     Every record read from ``ncbiDumpFile`` is split on the
     tab-pipe-tab delimiter and turned into a ``TaxaNode`` with:

     - ``id``    -- field 0, converted to int
     - ``idpar`` -- field 1, converted to int (0 for a self-parented node)
     - ``rank``  -- field 2, with spaces replaced by underscores
     - ``divid`` -- field 4, converted to int

     On success ``self.nodes`` is set to a dict mapping node id to node.
     ``self.nodes`` is left untouched if parsing fails; the input file is
     closed in either case.

     Raises:
         ValueError: if a numeric field cannot be converted to int.
         IndexError: if a record has fewer than five fields.
         AssertionError: if a taxonomy id is not below ``ncbiTaxidMax``.
     """
     inp = openCompressed(ncbiDumpFile,'r')
     nodes = {}
     try:
         for rec in inp:
             # On profiling, a plain str.split outperformed both a compiled
             # regex split and a split followed by stripping every field.
             # No special handling is done for the last field, because it
             # is never used.
             values = rec.split("\t|\t")
             node = TaxaNode()
             node.id = int(values[0])
             node.idpar = int(values[1])
             node.rank = values[2].replace(' ','_')
             node.divid = int(values[4])
             #in NCBI file, root node points to itself as a parent.
             #We replace it with 0 for consistency with our SQL DB representation, where
             #circular self-reference would be inconvenient.
             if node.idpar == node.id:
                 node.idpar = 0
             assert node.id < ncbiTaxidMax, "We assume that dump file is pristine NCBI file and "+\
                     "assert that taxonomy ID (%s) < our max limit (%s)" % (node.id,ncbiTaxidMax)
             nodes[node.id] = node
     finally:
         # Release the file handle even when a record fails to parse.
         inp.close()
     self.nodes = nodes
Example #2
0
 def load(self):
     """Read every row of the ``self.tblNodes`` table into ``TaxaNode`` objects.

     Rows are fetched through a bulk reader obtained from ``self.db`` and
     consumed chunk by chunk. For each row the columns ``id``, ``idpar``,
     ``rank``, ``lnest``, ``rnest``, ``depth``, ``seq_len``,
     ``seq_len_tot``, ``idlevel`` and ``divid`` are copied onto a new node.

     Returns:
         A dict with two keys: ``"nodes"``, mapping node id to node, and
         ``"merged"``, which is always an empty dict here.

     The reader is closed even if reading or converting a row raises.
     """
     db = self.db
     reader = db.makeBulkReader(sql="select * from %s" % (self.tblNodes,),bufLen=100000)
     nodes = {}
     try:
         for chunk in reader.chunks():
             for rec in chunk:
                 node = TaxaNode()
                 node.id = rec['id']
                 node.idpar = rec['idpar']
                 node.rank = rec['rank']
                 node.lnest = rec['lnest']
                 node.rnest = rec['rnest']
                 node.depth = rec['depth']
                 node.seq_len = rec['seq_len']
                 node.seq_len_tot = rec['seq_len_tot']
                 node.idlevel = rec['idlevel']
                 node.divid = rec['divid']
                 nodes[node.id] = node
     finally:
         # Do not leak the reader when a chunk or a row lookup fails.
         reader.close()
     return dict(nodes=nodes,merged={})