def test_is_latin(self):
    # Names this test expects is_latin to accept: plain two-word names.
    accepted = (
        'Escherichia coli',
        'Rickettsia felis',
    )
    # Names this test expects is_latin to reject. The trailing notes only
    # describe the input string, not how is_latin reaches its verdict.
    rejected = (
        'Enterobacteriaceae',                   # single word
        'Escherichia coli O157:H7',             # extra strain label
        'Citrobacter sp. A293',                 # "sp." plus a label
        'bacterium LF-3',                       # lowercase first word
        'Firmicutes bacterium CAG:129',         # three tokens with a label
        'unidentified enterobacterium',         # lowercase first word
        'Staphylococcus aureus subsp. aureus',  # four tokens with "subsp."
    )
    for name in accepted:
        self.assertTrue(is_latin(name))
    for name in rejected:
        self.assertFalse(is_latin(name))
def identify_taxonomy(self):
    """Identify taxonomy of genomes.

    Filters ``self.df`` step by step, printing how many genomes each step
    removed, and adds a ``species`` column computed from ``taxid``.

    Steps, in order:

    1. If ``self.capital`` is truthy, keep rows whose ``organism_name``
       passes ``is_capital``.
    2. If ``self.block`` is truthy, parse it with ``list_from_param``
       (the parsed list is stored back on ``self.block``) and drop rows
       for which ``contain_words`` is true on ``organism_name``.
    3. Drop the ``species_taxid`` column.
    4. Drop rows with missing ``taxid``, cast ``taxid`` to ``str`` and
       keep only rows whose ``taxid`` is a key of ``self.taxdump``.
    5. Set ``species`` to ``taxid_at_rank(taxid, rank='species', ...)``
       and drop rows where that result is missing.
    6. If ``self.latin`` is truthy, keep rows whose species entry in
       ``self.taxdump`` has a ``'name'`` accepted by ``is_latin``.
    7. If ``self.taxids`` is truthy, parse it into a set (stored back on
       ``self.taxids``) and keep rows where the result of
       ``is_ancestral`` differs from the ``self.exclude`` flag.

    Side effects: rebinds ``self.df``; may rebind ``self.block`` and
    ``self.taxids``; prints progress to stdout.
    """
    print('Identifying taxonomy of genomes...')
    taxdump = self.taxdump
    n = self.df.shape[0]

    def report_diff(msg):
        # Print msg with the number of rows lost since the previous report;
        # stays silent when nothing was dropped.
        nonlocal n
        n_ = self.df.shape[0]
        if n_ < n:
            print(' ' + msg.format(n - n_))
            n = n_

    # remove non-capitalized organism names
    if self.capital:
        self.df = self.df[self.df['organism_name'].apply(is_capital)]
        report_diff('Dropped {} genomes without capitalized organism name.')

    # block certain words in organism names
    if self.block:
        self.block = list_from_param(self.block)
        blocked = self.df['organism_name'].apply(
            contain_words, args=(self.block,))
        self.df = self.df[~blocked]
        report_diff('Dropped {} genomes with one or more blocked words in '
                    'organism name.')

    # From here on the frame may be the result of a boolean filter, so it is
    # rebound to new objects instead of being mutated in place; in-place
    # edits on such a frame are what pandas flags as chained assignment.

    # remove original species information
    self.df = self.df.drop(columns=['species_taxid'])

    # drop genomes whose taxIds are not in taxdump
    self.df = self.df.dropna(subset=['taxid'])
    self.df = self.df.assign(taxid=self.df['taxid'].astype(str))
    self.df = self.df[self.df['taxid'].isin(taxdump)]
    report_diff('Dropped {} genomes without valid taxId.')

    # assign genomes to species (represented by taxId not name)
    species = self.df['taxid'].apply(
        taxid_at_rank, rank='species', taxdump=taxdump)
    self.df = self.df.assign(species=species)

    # drop genomes without species taxId
    self.df = self.df.dropna(subset=['species'])
    report_diff('Dropped {} genomes without valid species taxId.')

    # drop genomes without Latinate species name
    if self.latin:
        self.df = self.df[self.df['species'].apply(
            lambda x: is_latin(taxdump[x]['name']))]
        report_diff('Dropped {} genomes without Latinate species name.')
    print('Done.')

    # include/exclude taxIds
    if self.taxids:
        self.taxids = set(list_from_param(self.taxids))
        print(f'{"Ex" if self.exclude else "In"}cluding '
              f'{len(self.taxids)} custom TaxIDs...')
        # Coerce the flag so the filter follows the same truthiness as the
        # message above even when exclude is a non-bool falsy value.
        # NOTE(review): assumes is_ancestral returns a bool - confirm.
        exclude = bool(self.exclude)
        self.df = self.df[self.df['taxid'].apply(
            lambda x: is_ancestral(x, self.taxids, taxdump) != exclude)]
        report_diff('Dropped {} genomes.')