def identify_taxonomy(self):
    """Assign genomes to species and drop those with unusable taxonomy.

    Notes
    -----
    Filters `self.df` step by step and prints how many genomes each step
    removed:

    1. (if `capital`) organism name must pass `is_capital`.
    2. (if `block`) organism name must not pass `contain_words` against
       the blocked words.
    3. `taxid` must be non-null and found in `taxdump`.
    4. `taxid_at_rank` must yield a species taxId, which is stored in a
       new `species` column.
    5. (if `latin`) the species name in `taxdump` must pass `is_latin`.
    6. (if `taxids`) a genome is kept when the result of `is_ancestral`
       for its taxId differs from `exclude`.

    The original `species_taxid` column is discarded. `self.block` and
    `self.taxids` are overwritten with their parsed forms (a list and a
    set, respectively).
    """
    print('Identifying taxonomy of genomes...')
    remaining = self.df.shape[0]

    def log_dropped(template):
        """Print the number of rows lost since the previous call, if any."""
        nonlocal remaining
        current = self.df.shape[0]
        if current < remaining:
            print(' ' + template.format(remaining - current))
        remaining = current

    # organism names must be capitalized
    if self.capital:
        capitalized = self.df['organism_name'].apply(is_capital)
        self.df = self.df[capitalized]
        log_dropped('Dropped {} genomes without captalized organism name.')

    # organism names must not contain blocked words
    if self.block:
        self.block = list_from_param(self.block)
        blocked = self.df['organism_name'].apply(
            contain_words, args=(self.block,))
        self.df = self.df[~blocked]
        log_dropped('Dropped {} genomes with one or more blocked words in '
                    'organism name.')

    # the species column shipped with the metadata is replaced further down
    self.df.drop(columns=['species_taxid'], inplace=True)

    # taxIds must be present and known to taxdump
    self.df.dropna(subset=['taxid'], inplace=True)
    self.df['taxid'] = self.df['taxid'].astype(str)
    known = self.df['taxid'].isin(self.taxdump)
    self.df = self.df[known]
    log_dropped('Dropped {} genomes without valid taxId.')

    # species are represented by taxId rather than by name
    self.df['species'] = self.df['taxid'].apply(
        taxid_at_rank, rank='species', taxdump=self.taxdump)
    self.df.dropna(subset=['species'], inplace=True)
    log_dropped('Dropped {} genomes without valid species taxId.')

    # species names must be Latinate
    if self.latin:
        def has_latin_name(species):
            return is_latin(self.taxdump[species]['name'])

        self.df = self.df[self.df['species'].apply(has_latin_name)]
        log_dropped('Dropped {} genomes without Latinate species name.')
    print('Done.')

    # custom taxIds to include or exclude
    if self.taxids:
        self.taxids = set(list_from_param(self.taxids))
        prefix = 'Ex' if self.exclude else 'In'
        print(f'{prefix}cluding {len(self.taxids)} custom TaxIDs...')

        def keep(taxid):
            under = is_ancestral(taxid, self.taxids, self.taxdump)
            return under != self.exclude

        self.df = self.df[self.df['taxid'].apply(keep)]
        log_dropped('Dropped {} genomes.')
def define_groups(self):
    """Build the "self" and "close" taxon groups.

    For each group, the top-level taxIds come from the `<group>_tax`
    attribute when it is set (parsed with `list_from_param` and written
    back), otherwise from calling `infer_<group>_group`. Unless the group
    is already present in `self.groups` after that step, its members are
    the top-level taxIds plus everything `get_descendants` returns for
    them. Members of the self group are removed from the close group.
    A summary of each group is printed.
    """
    self.groups = {}
    for name in ('self', 'close'):
        attr = f'{name}_tax'

        if getattr(self, attr):
            # user-defined group
            setattr(self, attr, list_from_param(getattr(self, attr)))
            print(f'User-defined {name} group:')
        else:
            # auto-infer group
            getattr(self, f'infer_{name}_group')()
            print(f'Auto-inferred {name} group:')

        # re-read: both branches above may have replaced the attribute
        top_level = getattr(self, attr)

        # collect taxIds that belong to group
        if name not in self.groups:
            self.groups[name] = set().union(*(
                [tid] + get_descendants(tid, self.taxdump)
                for tid in top_level))

        # subtract self group from close group
        if name == 'close':
            own = self.groups['self']
            self.groups['close'] = self.groups['close'].difference(own)

        # report group content
        for tid in top_level:
            print(f' {tid} ({describe_taxon(tid, self.taxdump)})')
        size = len(self.groups[name])
        print(f'{name.capitalize()} group has {size} taxa.')
def define_groups(self):
    """Define the "self" and "close" groups.

    Notes
    -----
    Assign these attributes:

    1. `self_tax`: top-level taxId(s) of the self group.
    2. `close_tax`: top-level taxId(s) of the close group.
    3. `groups`: dict with keys `self` and `close`, each holding all
       taxIds under that group. No `distal` key is populated here.

    A group given by the user is parsed with `list_from_param`; otherwise
    the matching `infer_<group>_group` method is called. The close group
    never overlaps the self group.
    """
    self.groups = {}
    for key in ('self', 'close'):
        attr_name = f'{key}_tax'
        given = getattr(self, attr_name)

        if not given:
            # nothing supplied: let the inference method fill it in
            infer = getattr(self, f'infer_{key}_group')
            infer()
            print(f'Auto-inferred {key} group:')
        else:
            # supplied by the user: normalize to a list
            setattr(self, attr_name, list_from_param(given))
            print(f'User-defined {key} group:')

        roots = getattr(self, attr_name)

        # expand top-level taxIds to every taxId beneath them, unless the
        # group content is already in place
        if key not in self.groups:
            members = set()
            for root in roots:
                members.update([root] + get_descendants(root, self.taxdump))
            self.groups[key] = members

        # close group must not overlap the self group
        if key == 'close':
            close = self.groups['close']
            self.groups['close'] = close.difference(self.groups['self'])

        # report group content
        for root in roots:
            desc = describe_taxon(root, self.taxdump)
            print(f' {root} ({desc})')
        print(f'{key.capitalize()} group has {len(self.groups[key])} taxa.')
def filter_genomes(self):
    """Filter genomes based on genome metadata.

    Notes
    -----
    Operates on `self.df` and performs, in order:

    1. (if `complete`) keep only rows whose `assembly_level` is
       "Complete Genome" or "Chromosome".
    2. Rename column `# assembly_accession` to `accession`, then derive:
       `accnov`, the accession up to its first "."; and `genome`, "G"
       followed by the part of `accnov` after its first "_". Rows with a
       duplicated `genome` are dropped, keeping the first.
    3. (if `genoids`) parse `genoids` with `list_from_param` into a set
       (written back to `self.genoids`) and keep the rows for which
       "matches `accession`, `accnov` or `genome`" differs from
       `exclude`.

    The number of genomes dropped by steps 1 and 3 is printed.
    """
    print('Filtering genomes...')
    n = self.df.shape[0]

    def report_diff(msg):
        """Print how many rows were lost since the previous call, if any."""
        nonlocal n
        n_ = self.df.shape[0]
        if n_ < n:
            print(' ' + msg.format(n - n_))
        n = n_

    # complete genomes only
    if self.complete:
        self.df = self.df[self.df['assembly_level'].isin(
            {'Complete Genome', 'Chromosome'})]
        report_diff('Dropped {} non-complete genomes.')

    # non-redundant genome IDs
    # typically not necessary, just in case
    self.df.rename(columns={'# assembly_accession': 'accession'},
                   inplace=True)
    # `n` must be passed by keyword: pandas >= 2.0 accepts only `pat`
    # positionally in `Series.str.split`
    self.df['accnov'] = self.df['accession'].str.split('.', n=1).str[0]
    self.df['genome'] = 'G' + self.df['accnov'].str.split('_', n=1).str[-1]
    self.df.drop_duplicates(subset=['genome'], inplace=True)

    # include/exclude genome Ids
    if self.genoids:
        self.genoids = set(list_from_param(self.genoids))
        print(f'{"Ex" if self.exclude else "In"}cluding '
              f'{len(self.genoids)} custom genome IDs...')
        # an ID may be given in any of the three forms
        matched = (self.df['accession'].isin(self.genoids) |
                   self.df['accnov'].isin(self.genoids) |
                   self.df['genome'].isin(self.genoids))
        self.df = self.df[matched != self.exclude]
        report_diff('Dropped {} genomes.')
    print('Done.')
def test_list_from_param(self):
    """Check `list_from_param` on empty values, lists, strings and files.

    Expected behavior pinned here: `None` and the empty string give an
    empty list; a list is returned as is; a string is split on commas;
    a path to an existing file gives its lines.
    """
    # nothing
    self.assertEqual(list_from_param(None), [])
    self.assertEqual(list_from_param(''), [])

    # already list
    self.assertListEqual(list_from_param([1, 2, 3]), [1, 2, 3])

    # list string
    self.assertListEqual(list_from_param('test'), ['test'])
    self.assertListEqual(list_from_param('a,b,c'), ['a', 'b', 'c'])

    # list file
    exp = ['this', 'is', 'a', 'list']
    fp = join(self.tmpdir, 'test.txt')
    with open(fp, 'w') as f:
        for e in exp:
            print(e, file=f)
    # remove the file even when the check below fails, so that a failure
    # does not leave it behind in the temporary directory
    try:
        obs = list_from_param(fp)
        self.assertListEqual(obs, exp)
    finally:
        remove(fp)