Exemple #1
0
    def assign_taxonomy(self):
        """Assign a taxId to every input genome and find their LCA.

        The work is done in four steps:

        1. If ``self.input_tax`` is truthy, parse it with ``dict_from_param``.
           When parsing raises ``ValueError`` and ``self.data`` holds at most
           one sample, the raw value is used as the taxId of that sample.
           Every user-specified taxId must be present in ``self.taxdump``.
        2. For each sample in ``self.data`` that has no taxId yet, call
           ``self.infer_genome_tax`` and record the returned taxId.
        3. Call ``refine_taxdump`` and ``add_children`` on ``self.taxdump``.
        4. Store the result of ``find_lca`` over all assigned taxIds in
           ``self.lca``.

        Progress is reported through ``print``. ``self.input_tax`` ends up as
        a mapping of sample ID to taxId.

        Raises
        ------
        ValueError
            If the user-specified taxonomy cannot be parsed while there are
            multiple samples, if a user-specified taxId is absent from
            ``self.taxdump``, or if ``ValueError`` is raised while inferring
            the taxId of a sample.
        """
        # step 1: taxIds supplied by the user
        if not self.input_tax:
            self.input_tax = {}
        else:
            try:
                self.input_tax = dict_from_param(self.input_tax)
            except ValueError:
                # a bare taxId is only acceptable in single-sample analysis
                if len(self.data) > 1:
                    raise ValueError('Invalid input taxonomy format.')
                only_sid = max(self.data.keys())
                self.input_tax = {only_sid: self.input_tax}
            print('User-specified TaxIDs of input genomes:')
            absent_msg = 'TaxID {} is not present in taxonomy database.'
            entry_line = '  {}: {} ({}).'
            for sid, tid in sorted(self.input_tax.items()):
                # TODO: read from both temp and master taxdump
                if tid not in self.taxdump:
                    raise ValueError(absent_msg.format(tid))
                name = self.taxdump[tid]['name']
                print(entry_line.format(sid, tid, name))

        # step 2: samples still lacking a taxId get one inferred
        pending = sorted(
            sid for sid in self.data if sid not in self.input_tax)
        if pending:
            print('Auto-inferring plausible taxIds for input genomes based on '
                  'taxonomy of search results...')
        inferred_line = '  {}: {} ({}) (covering {:2g}% best hits).'
        for sid in pending:
            try:
                # unpacking and assignment stay inside the try so that any
                # ValueError they raise is reported the same way
                inferred = self.infer_genome_tax(
                    self.data[sid], self.taxdump, self.input_cov)
                tid, cov = inferred
                self.input_tax[sid] = tid
            except ValueError:
                raise ValueError('Cannot auto-infer taxonomy for {}. '
                                 'Please specify manually.'.format(sid))
            name = self.taxdump[tid]['name']
            print(inferred_line.format(sid, tid, name, cov))

        # step 3: refine taxonomy database
        print('Refining taxonomy database...')
        refine_taxdump(self.sum_taxids(), self.taxdump)
        add_children(self.taxdump)
        n_taxa = len(self.taxdump)
        print('Done. Retained {} taxa.'.format(n_taxa))

        # step 4: lowest common ancestor (LCA) of all genomes
        self.lca = find_lca(self.input_tax.values(), self.taxdump)
        lca_desc = describe_taxon(self.lca, self.taxdump)
        print('All input genomes belong to {} ({}).'.format(
            self.lca, lca_desc))
 def test_refine_taxdump(self):
     """Check the taxIds yielded by ``refine_taxdump`` for two input taxa.

     Feeds taxIds '2' and '2157' (Bacteria and Archaea) and expects the
     result, viewed as a set, to hold those two plus '131567' and '1'
     (cellular organisms and root).
     """
     db = taxdump_from_text(taxdump_archaea)
     # Bacteria and Archaea
     query = ['2', '2157']
     retained = refine_taxdump(query, db)
     # the two queries plus cellular organisms and root
     expected = {'1', '131567', '2', '2157'}
     self.assertSetEqual(set(retained), expected)