Esempio n. 1
0
 def test_genome_lineages(self):
     me = Database()
     me.output = self.tmpdir
     me.taxdump = taxdump_from_text(taxdump_proteo)
     data = (
         ('G1', '1224', ''),  # Proteobacteria
         ('G2', '562', '562'),  # Escherichia coli
         ('G3', '622', '622'),  # Shigella dysenteriae
         ('G4', '548', '548'))  # Klebsiella aerogenes
     me.df = pd.DataFrame(data, columns=['genome', 'taxid',
                                         'species']).set_index('genome')
     for rank in [
             'superkingdom', 'kingdom', 'phylum', 'class', 'order',
             'family', 'genus'
     ]:
         me.df[rank] = ''
     me.genome_lineages()
     with open(join(self.tmpdir, 'lineages.txt'), 'r') as f:
         obs = dict(x.split('\t') for x in f.read().splitlines())
     proteo = 'k__Bacteria; p__Proteobacteria;'
     self.assertEqual(obs['G1'], proteo + ' c__; o__; f__; g__; s__')
     entero = proteo + ' c__Gammaproteobacteria; o__Enterobacterales;' +\
         ' f__Enterobacteriaceae;'
     self.assertEqual(obs['G2'],
                      entero + ' g__Escherichia; s__Escherichia coli')
     self.assertEqual(obs['G3'],
                      entero + ' g__Shigella; s__Shigella dysenteriae')
     self.assertEqual(obs['G4'],
                      entero + ' g__Klebsiella; s__Klebsiella aerogenes')
     remove(join(self.tmpdir, 'lineages.txt'))
Esempio n. 2
0
    def test_filter_genomes(self):
        me = Database()
        header = ('# assembly_accession', 'assembly_level')
        data = (('GCF_000000001.1', 'Chromosome'), ('GCF_000000002.1',
                                                    'Complete Genome'),
                ('GCF_000000003.2', 'Scaffold'), ('GCF_000000004.1', 'Contig'),
                ('GCA_000000004.1', 'Contig'))
        df = pd.DataFrame(data, columns=header)
        me.complete = False
        me.genoids = None
        me.exclude = False

        # drop duplicates
        me.df = df.copy()
        me.filter_genomes()
        self.assertEqual(me.df.shape[0], 4)
        self.assertListEqual(
            me.df['genome'].tolist(),
            ['G000000001', 'G000000002', 'G000000003', 'G000000004'])
        self.assertEqual(
            me.df.query('accession == "GCF_000000004.1"').shape[0], 1)

        # complete genomes only
        me.complete = True
        me.df = df.copy()
        me.filter_genomes()
        self.assertListEqual(me.df['accnov'].tolist(),
                             ['GCF_000000001', 'GCF_000000002'])

        # include certain genomes
        me.complete = False
        me.genoids = 'G000000001,G000000003'
        me.df = df.copy()
        me.filter_genomes()
        self.assertListEqual(me.df['accession'].tolist(),
                             ['GCF_000000001.1', 'GCF_000000003.2'])

        # exclude certain genomes
        me.genoids = ['GCF_000000002.1', 'GCF_000000004']
        me.exclude = True
        me.df = df.copy()
        me.filter_genomes()
        self.assertListEqual(me.df['accession'].tolist(),
                             ['GCF_000000001.1', 'GCF_000000003.2'])
Esempio n. 3
0
    def test_sample_by_taxonomy(self):
        me = Database()

        # do nothing
        me.sample = None
        self.assertIsNone(me.sample_by_taxonomy())

        # xxx
        header = ('genome', 'taxid', 'refseq_category', 'assembly_level')
        data = (
            ('G1', '585056', '', 'Chromosome'),  # E. coli UMN026
            ('G2', '1038927', 'representative genome', 'Chromosome'),
            # E. coli O104:H4 (rep. genome to be prioritized over G1)
            ('G3', '2580236', '', 'Contig'),  # sync E. coli
            ('G4', '622', '', 'Scaffold'),  # Shigella
            ('G5', '548', '', 'Scaffold'),  # Klebsiella
            ('G6', '126792', 'reference genome', 'Contig'))  # plasmid
        df = pd.DataFrame(data, columns=header)
        me.reference = False
        me.representative = False
        me.taxdump = taxdump_from_text(taxdump_proteo)

        # up to one genome per genus
        me.rank = 'genus'
        me.sample = 1
        me.df = df.copy()
        me.sample_by_taxonomy()
        self.assertListEqual(me.df.columns.tolist(), list(header) + ['genus'])
        self.assertListEqual(me.df['genome'].tolist(), ['G2', 'G4', 'G5'])

        # include reference genome (plasmid)
        me.reference = True
        me.df = df.copy()
        me.sample_by_taxonomy()
        self.assertEqual(me.df['genome'].tolist()[-1], 'G6')

        # up to two genomes for entire cellular life
        me.rank = 'superkingdom'
        me.sample = 2
        me.reference = False
        me.df = df.copy()
        me.sample_by_taxonomy()
        self.assertListEqual(me.df['genome'].tolist(), ['G1', 'G2'])
Esempio n. 4
0
    def test_identify_taxonomy(self):
        me = Database()
        header = ('organism_name', 'taxid', 'species', 'species_taxid')
        data = (('Escherichia coli UMN026', '585056', 'E. coli', '562'),
                ('Escherichia coli O104:H4', '1038927', 'E. coli', '562'),
                ('Klebsiella aerogenes', '548', 'Klebsiella aerogenes',
                 '548'), ('unclassified Gammaproteobacteria', '118884', '',
                          ''), ('Plasmid pPY113', '126792', '', ''))
        df = pd.DataFrame(data, columns=header)

        # organism names must be capital and latinate
        me.capital = True
        me.block = None
        me.latin = True
        me.taxids = None
        me.exclude = False
        me.taxdump = taxdump_from_text(taxdump_proteo)
        me.df = df.copy()
        me.identify_taxonomy()
        self.assertNotIn('species_taxid', me.df.columns)
        self.assertListEqual(me.df.index.tolist(), [0, 1, 2])
        self.assertListEqual(me.df['species'].tolist(), ['562', '562', '548'])

        # block word
        me.block = 'plasmid'
        me.latin = False
        me.df = df.copy()
        me.identify_taxonomy()
        self.assertListEqual(me.df.index.tolist(), [0, 1, 2])

        # no Escherichia
        me.taxids = '561'
        me.exclude = True
        me.df = df.copy()
        me.identify_taxonomy()
        self.assertListEqual(me.df.index.tolist(), [2])
Esempio n. 5
0
 def test_genome_metadata(self):
     me = Database()
     me.output = self.tmpdir
     me.df = pd.Series({
         'genome':
         'G1',
         'accession':
         'GCF_000123456.1',
         'asm_name':
         'ASM123v1',
         'bioproject':
         'PRJNA123456',
         'biosample':
         'SAMN00123456',
         'assembly_level':
         'Chromosome',
         'organism_name':
         'hypothetical organism',
         'infraspecific_name':
         '',
         'isolate':
         '',
         'taxid':
         '12345',
         'ftp_path': ('ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/123/'
                      '456/GCF_000123456.1_ASM123v1'),
         'proteins':
         100,
         'residues':
         12500,
         'whatever':
         'nonsense'
     }).to_frame().T
     me.genome_metadata()
     with open(join(self.tmpdir, 'genomes.tsv'), 'r') as f:
         obs = f.read().splitlines()
     exp = ('genome', 'proteins', 'residues', 'assembly_level', 'accession',
            'bioproject', 'biosample', 'asm_name', 'organism_name',
            'infraspecific_name', 'isolate', 'taxid', 'ftp_path')
     self.assertEqual(obs[0], '\t'.join(exp))
     exp = ('G1', '100', '12500', 'Chromosome', 'GCF_000123456.1',
            'PRJNA123456', 'SAMN00123456', 'ASM123v1',
            'hypothetical organism', '', '', '12345',
            ('ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/123/456/'
             'GCF_000123456.1_ASM123v1'))
     self.assertEqual(obs[1], '\t'.join(exp))
     remove(join(self.tmpdir, 'genomes.tsv'))
Esempio n. 6
0
 def test_build_taxdump(self):
     me = Database()
     me.output = self.tmpdir
     me.tmpdir = join(self.datadir, 'DnaK', 'taxdump')
     me.taxdump = taxdump_from_text(taxdump_proteo)
     data = (
         ('G1', '1224'),  # Proteobacteria
         ('G2', '562'),  # Escherichia coli
         ('G3', '585056'),  # E. coli UMN026
         ('G4', '1038927'))  # E. coli O104:H4
     me.df = pd.DataFrame(data, columns=['genome',
                                         'taxid']).set_index('genome')
     me.build_taxdump()
     with open(join(self.tmpdir, 'taxdump', 'nodes.dmp'), 'r') as f:
         obs = set(x.split('\t')[0] for x in f.read().splitlines())
     exp = {
         '1', '131567', '2', '1224', '1236', '91347', '543', '561', '562',
         '585056', '1038927'
     }
     self.assertSetEqual(obs, exp)
     rmtree(join(self.tmpdir, 'taxdump'))