Ejemplo n.º 1
0
    def genome_lineages(self):
        """Generate lineage information for genomes.

        For every row of ``self.df`` the taxid in the ``taxid`` column is
        resolved to its ancestors at the standard ranks above species, the
        number of distinct taxa at each rank is printed, superkingdom is
        folded into kingdom, and a lineage string is stored in a new
        ``lineage`` column and written to ``lineages.txt`` under
        ``self.output``.

        The lineage string has one ``<initial>__<name>`` field per rank from
        kingdom to species, joined by ``'; '``; a rank without a taxid yields
        an empty name (e.g. ``c__``).

        Notes
        -----
        The ``species`` column is read but never populated here, so it must
        already be present in ``self.df`` when this method is called.
        """
        # identify taxa at standard ranks
        ranks = [
            'superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family',
            'genus', 'species'
        ]
        # species is not resolved here; that column is only read below
        upper_ranks = ranks[:-1]
        self.df[upper_ranks] = self.df['taxid'].apply(
            lambda x: pd.Series(taxids_at_ranks(x, upper_ranks, self.taxdump)))

        # report number of taxa represented at each rank
        # (done before the merge below, so kingdom counts are pre-merge)
        print('Number of taxonomic groups represented:')
        for rank in ranks:
            print('  {}: {}.'.format(rank, self.df[rank].nunique()))

        # merge superkingdom and kingdom: keep kingdom when present,
        # otherwise fall back to superkingdom
        def pick_kingdom(row):
            # access by label: integer keys on a label-indexed Series are
            # deprecated in pandas; NaN is truthy, hence the explicit check
            kingdom = row['kingdom']
            if pd.notna(kingdom) and kingdom:
                return kingdom
            return row['superkingdom']

        self.df['kingdom'] = self.df[['superkingdom', 'kingdom']].apply(
            pick_kingdom, axis=1)

        # generate lineage string
        def label(rank, taxid):
            # first letter of the rank, then the taxon name (empty if none)
            name = self.taxdump[taxid]['name'] if taxid else ''
            return '{}__{}'.format(rank[0], name)

        # missing taxids become '' so that they format as empty names
        labels = self.df[ranks[1:]].fillna('').apply(
            lambda col: col.apply(lambda taxid: label(col.name, taxid)))
        self.df['lineage'] = labels.apply('; '.join, axis=1)

        # write table (tab-separated, no header row)
        fname = 'lineages.txt'
        self.df['lineage'].to_csv(join(self.output, fname),
                                  sep='\t',
                                  header=False)
        print('Genome lineages written to {}.'.format(fname))
Ejemplo n.º 2
0
 def test_taxids_at_ranks(self):
     """Check the rank-to-taxid mapping returned for taxid 1538547."""
     ranks = ['phylum', 'class', 'genus', 'species']
     # expected taxid per rank, in the same order; None where the
     # function is expected to report no taxid for that rank
     taxids = ['1655434', None, '1655637', '1538547']
     expected = dict(zip(ranks, taxids))

     tree = taxdump_from_text(taxdump_archaea)
     observed = taxids_at_ranks('1538547', ranks, tree)
     self.assertDictEqual(observed, expected)