def diff(self, options):
    """Print the disagreements between two taxonomy files at one rank.

    Both files are read with ``Taxonomy().read``; the results are used as
    mappings from a taxon/genome ID to a sequence of per-rank taxon
    strings. Every ID present in either file is visited.

    Fields read from `options`:

    input_taxonomy1, input_taxonomy2
        Paths of the taxonomy files to compare.
    rank
        Rank label to compare; must be an entry of ``Taxonomy.rank_labels``.
    report_missing_taxa
        If true, print IDs that are present in only one of the files.
    report_missing_ranks
        If true, report every differing pair. Otherwise a pair is only
        reported when both taxon strings have text after their first
        three characters (presumably the rank prefix such as ``g__``).
    """
    check_file_exists(options.input_taxonomy1)
    check_file_exists(options.input_taxonomy2)

    first_taxonomy = Taxonomy().read(options.input_taxonomy1)
    second_taxonomy = Taxonomy().read(options.input_taxonomy2)

    ids_in_either = set(first_taxonomy.keys()).union(second_taxonomy.keys())
    rank_index = Taxonomy.rank_labels.index(options.rank)

    for taxon_id in ids_in_either:
        in_first = taxon_id in first_taxonomy
        in_second = taxon_id in second_taxonomy

        if options.report_missing_taxa:
            # An ID from the union is absent from at most one taxonomy.
            if not in_first:
                print('Missing in taxonomy 1: %s' % taxon_id)
            elif not in_second:
                print('Missing in taxonomy 2: %s' % taxon_id)

        # A rank-level comparison needs the ID in both taxonomies.
        if not (in_first and in_second):
            continue

        first_taxon = first_taxonomy[taxon_id][rank_index]
        second_taxon = second_taxonomy[taxon_id][rank_index]
        if first_taxon == second_taxon:
            continue

        if options.report_missing_ranks or (first_taxon[3:] and second_taxon[3:]):
            print('Different taxon for %s: %s %s' % (taxon_id, first_taxon, second_taxon))

    print('Done.')
def run(self, input_taxonomy, genome_path_file, metadata_file, max_genomes, min_comp, max_cont, min_quality, max_contigs, min_N50, max_ambiguous, max_gap_length, output_dir):
    """Calculate ANI for named species.

    Genomes are first filtered with ``filter_genomes``; genomes in the
    taxonomy that do not pass are discarded. Each named species left with
    more than one genome is queued for the worker processes, whose results
    are consumed by a single writer process.

    Parameters
    ----------
    input_taxonomy : str
        Taxonomy file read with ``Taxonomy().read``.
    genome_path_file : str
        Tab-separated file whose first column is a genome ID and whose
        second column is that genome's directory. The nucleotide file is
        taken to be ``<dir>/prodigal/<id>_protein.fna``, where ``<id>`` is
        the genome ID with any ``GB_``/``RS_`` text removed.
    metadata_file : str
        Metadata file handed to ``filter_genomes`` and to each worker.
    max_genomes
        Passed through unchanged to each worker.
    min_comp, max_cont, min_quality, max_contigs, min_N50, max_ambiguous, max_gap_length
        Filtering thresholds passed through unchanged to ``filter_genomes``.
    output_dir : str
        Passed through unchanged to the writer process.

    Raises
    ------
    Any exception (including KeyboardInterrupt) raised while the child
    processes are running is re-raised after the children that are still
    alive have been terminated.
    """
    # get genomes passing filtering criteria
    filtered_genome_ids = filter_genomes(metadata_file,
                                         min_comp,
                                         max_cont,
                                         min_quality,
                                         max_contigs,
                                         min_N50,
                                         max_ambiguous,
                                         max_gap_length)

    # get genomes in each named species, restricted to genomes passing filtering
    taxonomy = Taxonomy().read(input_taxonomy)
    genome_ids_to_remove = set(taxonomy.keys()) - filtered_genome_ids
    for genome_id in genome_ids_to_remove:
        del taxonomy[genome_id]

    named_species = Taxonomy().extant_taxa_for_rank('species', taxonomy)

    # get path to nucleotide files
    nt_files = {}
    with open(genome_path_file) as path_file:
        for line in path_file:
            line = line.strip()
            if not line:
                # tolerate blank (e.g. trailing) lines instead of failing on line_split[1]
                continue

            line_split = line.split('\t')
            gtdb_id = line_split[0]
            genome_id = gtdb_id.replace('GB_', '').replace('RS_', '')
            genome_dir = line_split[1]
            nt_files[gtdb_id] = os.path.join(genome_dir, 'prodigal', genome_id + '_protein.fna')

    # populate worker queue with data to process
    worker_queue = mp.Queue()
    writer_queue = mp.Queue()

    num_species = 0
    for species, genome_ids in named_species.items():
        if len(genome_ids) > 1:
            worker_queue.put((species, genome_ids))
            num_species += 1

    # one (None, None) item per worker; presumably the end-of-work marker
    # expected by the worker method -- confirm against its implementation
    for _ in range(self.cpus):
        worker_queue.put((None, None))

    # Build the process objects before entering the try block so the cleanup
    # handler can always refer to them.
    worker_proc = [mp.Process(target=self.__worker, args=(metadata_file,
                                                          nt_files,
                                                          max_genomes,
                                                          worker_queue,
                                                          writer_queue)) for _ in range(self.cpus)]
    write_proc = mp.Process(target=self.__writer, args=(num_species,
                                                        output_dir,
                                                        writer_queue))

    try:
        write_proc.start()

        for p in worker_proc:
            p.start()

        for p in worker_proc:
            p.join()

        # all workers are done; queue the all-None item for the writer
        writer_queue.put((None, None, None, None, None))
        write_proc.join()
    except BaseException:
        # Deliberately broad so Ctrl-C also cleans up the children. Only
        # processes that are actually running can be terminated.
        for p in worker_proc + [write_proc]:
            if p.is_alive():
                p.terminate()

        # do not hide the failure from the caller
        raise
def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
    """Tabulate differences between two taxonomies.

    Two tab-separated files are written to `output_dir`, both named after
    the base name of `tax1_file`: ``<name>.tax_diff.tsv`` holds one row for
    every rank that differs in a genome shared by both taxonomies, and
    ``<name>.tax_diff_summary.tsv`` holds per-rank counts and percentages.
    The per-rank counts are also printed to stdout.

    Parameters
    ----------
    tax1_file : str
        First taxonomy file.
    tax2_file : str
        Second taxonomy file.
    include_user_taxa : boolean
        Flag indicating if User genomes should be considered. When false,
        genome IDs starting with ``U_`` are removed from both taxonomies.
    output_dir : str
        Output directory.
    """

    tax1 = Taxonomy().read(tax1_file)
    tax2 = Taxonomy().read(tax2_file)

    if not include_user_taxa:
        tax1 = {genome_id: taxonomy
                for genome_id, taxonomy in tax1.items()
                if not genome_id.startswith('U_')}
        tax2 = {genome_id: taxonomy
                for genome_id, taxonomy in tax2.items()
                if not genome_id.startswith('U_')}

    common_taxa = set(tax1.keys()).intersection(tax2.keys())

    self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
    self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
    self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

    # identify differences between taxonomies
    tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
    tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]

    # per-rank tallies, keyed by rank index
    unchanged = defaultdict(int)          # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
    active_change = defaultdict(int)      # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
    passive_change = defaultdict(int)     # T2 = g__??? -> T1 = g__Jane
    unresolved_change = defaultdict(int)  # T2 = g__Box -> T1 = g__???

    output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)
    with open(output_table, 'w') as fout:
        fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

        for taxa in common_taxa:
            t1 = tax1[taxa]
            t2 = tax2[taxa]

            for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                if taxon1 == taxon2:
                    unchanged[rank] += 1
                    continue

                # a taxon equal to the bare rank prefix is an unnamed rank
                rank_prefix = Taxonomy.rank_prefixes[rank]
                if taxon1 != rank_prefix and taxon2 != rank_prefix:
                    change = 'active'
                    active_change[rank] += 1
                elif taxon2 == rank_prefix:
                    change = 'passive'
                    passive_change[rank] += 1
                else:
                    # only remaining case: taxon1 is the bare rank prefix
                    change = 'unresolved'
                    unresolved_change[rank] += 1

                fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa,
                                                     change,
                                                     Taxonomy.rank_labels[rank],
                                                     ';'.join(t1),
                                                     ';'.join(t2)))

    # report results
    output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)
    with open(output_table, 'w') as fout:
        fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
        print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')

        for rank in range(0, len(Taxonomy.rank_prefixes)):
            total = unchanged[rank] + active_change[rank] + passive_change[rank] + unresolved_change[rank]
            if total != 0:
                fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                           (Taxonomy.rank_labels[rank],
                            unchanged[rank], unchanged[rank] * 100.0 / total,
                            active_change[rank], active_change[rank] * 100.0 / total,
                            passive_change[rank], passive_change[rank] * 100.0 / total,
                            unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
                print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                                  unchanged[rank],
                                                  active_change[rank],
                                                  passive_change[rank],
                                                  unresolved_change[rank],
                                                  total))
def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
    """Tabulate differences between two taxonomies.

    For every genome present in both taxonomies, the taxa are compared rank
    by rank and each rank is counted as unchanged, active, passive or
    unresolved (see the comments on the tallies below). Differing ranks are
    written to ``<name>.tax_diff.tsv`` and per-rank counts with percentages
    to ``<name>.tax_diff_summary.tsv``, where ``<name>`` is the base name of
    `tax1_file` without its extension. The per-rank counts are also printed
    to stdout.

    Parameters
    ----------
    tax1_file : str
        First taxonomy file.
    tax2_file : str
        Second taxonomy file.
    include_user_taxa : boolean
        Flag indicating if User genomes should be considered. When false,
        genome IDs starting with ``U_`` are ignored.
    output_dir : str
        Output directory.
    """

    def without_user_genomes(taxonomy_map):
        # keep only genomes whose ID lacks the 'U_' prefix
        return {genome_id: taxonomy
                for genome_id, taxonomy in taxonomy_map.items()
                if not genome_id.startswith('U_')}

    tax1 = Taxonomy().read(tax1_file)
    tax2 = Taxonomy().read(tax2_file)

    if not include_user_taxa:
        tax1 = without_user_genomes(tax1)
        tax2 = without_user_genomes(tax2)

    common_taxa = set(tax1.keys()).intersection(tax2.keys())

    self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
    self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
    self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

    # identify differences between taxonomies
    tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
    tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]

    # per-rank tallies, keyed by rank index
    unchanged = defaultdict(int)          # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
    active_change = defaultdict(int)      # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
    passive_change = defaultdict(int)     # T2 = g__??? -> T1 = g__Jane
    unresolved_change = defaultdict(int)  # T2 = g__Box -> T1 = g__???
    tallies = {'active': active_change,
               'passive': passive_change,
               'unresolved': unresolved_change}

    output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)
    # the context manager closes the table even if a row fails to format
    with open(output_table, 'w') as fout:
        fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

        for taxa in common_taxa:
            t1 = tax1[taxa]
            t2 = tax2[taxa]

            for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                if taxon1 == taxon2:
                    unchanged[rank] += 1
                    continue

                # a taxon equal to the bare rank prefix is an unnamed rank
                rank_prefix = Taxonomy.rank_prefixes[rank]
                if taxon1 != rank_prefix and taxon2 != rank_prefix:
                    change = 'active'
                elif taxon2 == rank_prefix:
                    change = 'passive'
                else:
                    # only remaining case: taxon1 is the bare rank prefix
                    change = 'unresolved'

                tallies[change][rank] += 1
                fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa,
                                                     change,
                                                     Taxonomy.rank_labels[rank],
                                                     ';'.join(t1),
                                                     ';'.join(t2)))

    # report results
    output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)
    # previously this handle was never closed, risking unflushed summary rows
    with open(output_table, 'w') as fout:
        fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
        print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')

        for rank in range(0, len(Taxonomy.rank_prefixes)):
            total = unchanged[rank] + active_change[rank] + passive_change[rank] + unresolved_change[rank]
            if total == 0:
                # rank never compared; skip it to avoid dividing by zero
                continue

            fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                       (Taxonomy.rank_labels[rank],
                        unchanged[rank], unchanged[rank] * 100.0 / total,
                        active_change[rank], active_change[rank] * 100.0 / total,
                        passive_change[rank], passive_change[rank] * 100.0 / total,
                        unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
            print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                              unchanged[rank],
                                              active_change[rank],
                                              passive_change[rank],
                                              unresolved_change[rank],
                                              total))