def assign_taxonomy(self):
    """Give every input genome a taxId, then find their common ancestor.

    Assignments supplied in ``self.input_tax`` are parsed with
    ``dict_from_param`` and checked against ``self.taxdump``. Every genome
    in ``self.data`` that still lacks a taxId gets one from
    ``self.infer_genome_tax``. Afterwards ``self.taxdump`` is passed to
    ``refine_taxdump`` and ``add_children``, and the lowest common ancestor
    (LCA) of all assigned taxIds is stored in ``self.lca``. Progress is
    reported through ``print``.

    Raises
    ------
    ValueError
        If the user-supplied taxonomy cannot be parsed while more than one
        genome is present, if a supplied taxId is absent from
        ``self.taxdump``, or if no taxId can be inferred for a genome.
    """
    if not self.input_tax:
        # nothing supplied by the user; everything will be auto-inferred
        self.input_tax = {}
    else:
        # take user-defined taxIds of input genomes
        try:
            self.input_tax = dict_from_param(self.input_tax)
        except ValueError:
            if len(self.data) <= 1:
                # for single-sample analysis, the parameter may be a bare
                # taxId, which is then attached to that sample
                self.input_tax = {max(self.data): self.input_tax}
            else:
                raise ValueError('Invalid input taxonomy format.')

        print('User-specified TaxIDs of input genomes:')
        for genome, taxid in sorted(self.input_tax.items()):
            if taxid not in self.taxdump:
                # TODO: read from both temp and master taxdump
                raise ValueError('TaxID {} is not present in taxonomy '
                                 'database.'.format(taxid))
            taxon_name = self.taxdump[taxid]['name']
            print(' {}: {} ({}).'.format(genome, taxid, taxon_name))

    # auto-infer taxIds of the genomes the user did not cover
    pending = sorted(g for g in self.data if g not in self.input_tax)
    if pending:
        print('Auto-inferring plausible taxIds for input genomes based on '
              'taxonomy of search results...')
    for genome in pending:
        try:
            taxid, cov = self.infer_genome_tax(
                self.data[genome], self.taxdump, self.input_cov)
            self.input_tax[genome] = taxid
        except ValueError:
            raise ValueError('Cannot auto-infer taxonomy for {}. '
                             'Please specify manually.'.format(genome))
        taxon_name = self.taxdump[taxid]['name']
        print(' {}: {} ({}) (covering {:2g}% best hits).'.format(
            genome, taxid, taxon_name, cov))

    # refine taxonomy database
    print('Refining taxonomy database...')
    refine_taxdump(self.sum_taxids(), self.taxdump)
    add_children(self.taxdump)
    retained = len(self.taxdump)
    print('Done. Retained {} taxa.'.format(retained))

    # find lowest common ancestor (LCA) of all genomes
    self.lca = find_lca(self.input_tax.values(), self.taxdump)
    lca_description = describe_taxon(self.lca, self.taxdump)
    print('All input genomes belong to {} ({}).'.format(
        self.lca, lca_description))
def test_refine_taxdump(self):
    """Check the set of taxIds obtained from ``refine_taxdump``.

    The Archaea sample taxdump is refined with Bacteria and Archaea, and
    the result must be exactly those two taxa plus cellular organisms and
    root.
    """
    parsed = taxdump_from_text(taxdump_archaea)

    # '2' is Bacteria, '2157' is Archaea
    refined = refine_taxdump(['2', '2157'], parsed)
    observed = set(refined)

    # the two requested taxa, plus cellular organisms and root
    expected = {'1', '131567', '2', '2157'}
    self.assertSetEqual(observed, expected)