def setUp(self):
    """Create the object under test, a scratch directory and reference paths."""
    self.classify = Classify()

    # Scratch directory that receives any files written by a test.
    self.out_dir = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
    self.prefix = 'gtdbtk'

    # Locations of the reference fixtures used by the tests.
    pplacer_reference = 'tests/data/pplacer_dir_reference'
    align_reference = 'tests/data/align_dir_reference/align'
    self.pplacer_dir_reference = pplacer_reference
    self.aln_dir_ref = align_reference
    self.user_msa_file = os.path.join(align_reference,
                                      'gtdbtk.ar122.user_msa.fasta')

    # Load the taxonomy once so individual tests can reuse it.
    taxonomy_path = Config.TAXONOMY_FILE
    self.taxonomy_file = taxonomy_path
    self.gtdb_taxonomy = Taxonomy().read(taxonomy_path)
def test_read(self):
    """Taxonomy.read returns the genome -> ranks mapping stored in a TSV file."""
    expected = {
        'GCF_005435136.1': ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S1'],
        '2': ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S2'],
    }
    path_tax = os.path.join(self.dir_tmp, 'tax_file.tsv')

    # One row per genome: <genome id> TAB <semicolon separated ranks>.
    rows = [f'{k}\t{";".join(v)}\n' for k, v in expected.items()]
    with open(path_tax, 'w') as f:
        f.writelines(rows)

    result = Taxonomy().read(path_tax)
    self.assertDictEqual(expected, result)
def root(self, options):
    """Root tree using outgroup.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.

    Raises
    ------
    GenomeMarkerSetUnknown
        If ``options.suffix`` is present but is neither 'bac120' nor 'ar122'.
    """
    self.logger.warning("Tree rooting is still under development!")

    check_file_exists(options.input_tree)

    # A user supplied taxonomy takes precedence over the GTDB taxonomy.
    if options.custom_taxonomy_file:
        check_file_exists(options.custom_taxonomy_file)
        taxonomy = Taxonomy().read(options.custom_taxonomy_file)
    else:
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    self.logger.info('Identifying genomes from the specified outgroup.')
    outgroup = {genome_id
                for genome_id, taxa in taxonomy.items()
                if options.outgroup_taxon in taxa}

    reroot = RerootTree()
    reroot.root_with_outgroup(options.input_tree, options.output_tree,
                              outgroup)

    # Symlink to the tree summary file, if not run independently
    if hasattr(options, 'suffix'):
        if options.suffix == 'bac120':
            rooted_tree = PATH_BAC120_ROOTED_TREE.format(prefix=options.prefix)
        elif options.suffix == 'ar122':
            rooted_tree = PATH_AR122_ROOTED_TREE.format(prefix=options.prefix)
        else:
            raise GenomeMarkerSetUnknown(
                'There was an error determining the marker set.')

        # The link name is derived from the same path as the link target so
        # that the bac120 tree is not linked under the ar122 file name.
        symlink_f(rooted_tree,
                  os.path.join(options.out_dir,
                               os.path.basename(rooted_tree)))

    self.logger.info('Done.')
def setUp(self):
    """Create the object under test and a randomly named output location."""
    self.classify = Classify()

    # Results are written below a shared folder, in a sub-folder whose name
    # is made of 10 random upper-case letters and digits.
    results_root = 'tests/data/results'
    self.generic_out_path = results_root
    alphabet = string.ascii_uppercase + string.digits
    tmp_folder = ''.join(random.choice(alphabet) for _ in range(10))
    self.out_dir = os.path.join(results_root, tmp_folder)
    if not os.path.exists(results_root):
        os.makedirs(results_root)

    self.prefix = 'gtdbtk'

    # Reference fixtures used by the tests.
    pplacer_reference = 'tests/data/pplacer_dir_reference'
    self.pplacer_dir_reference = pplacer_reference
    self.user_msa_file = os.path.join(pplacer_reference,
                                      'gtdbtk.ar122.user_msa.fasta')

    taxonomy_path = Config.TAXONOMY_FILE
    self.taxonomy_file = taxonomy_path
    self.gtdb_taxonomy = Taxonomy().read(taxonomy_path)
def _get_ingroup_domain(self, ingroup_taxon) -> str:
    """Return the domain of the lineage(s) that contain the ingroup taxon.

    Raises
    ------
    GTDBTkExit
        If no lineage in the GTDB taxonomy contains ``ingroup_taxon``.
    """
    # The GTDB taxonomy is read to find the lineages holding the taxon.
    lineages = Taxonomy().read(TAXONOMY_FILE).values()
    domains = [lineage[Taxonomy.DOMAIN_IDX]
               for lineage in lineages
               if ingroup_taxon in lineage]

    # When several lineages match, the domain of the last one is reported.
    ingroup_domain = domains[-1] if domains else None
    if ingroup_domain is None:
        raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                         f'the GTDB taxonomy.')
    return ingroup_domain
def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v,
        mash_s, min_af, mash_db):
    """Runs the pipeline.

    Parameters
    ----------
    genomes : dict[str, str]
        Dict[genome_id] = fasta_path
    no_mash : bool
        True if the Mash pre-filtering step should be skipped (every query
        genome is then compared against every reference genome), False if
        Mash should be used for pre-filtering.
    max_d : float
         maximum distance to keep [0-1]
    out_dir : str
        The directory to write the output files to.
    prefix : str
        The prefix to use when writing output files.
    mash_k : int
        k-mer size [1-32]
    mash_v : float
        maximum p-value to keep [0-1]
    mash_s : int
        maximum number of non-redundant hashes
    min_af : float
        alignment fraction to consider closest genome
    mash_db : Optional[str]
        The path to read/write the pre-computed Mash reference sketch database.
    """
    self.check_dependencies(no_mash)

    self.logger.info('Loading reference genomes.')
    ref_genomes = self._get_ref_genomes()

    # d_compare: query genome id -> set of reference genome ids to compare.
    d_compare = defaultdict(set)
    # d_paths: genome id -> FASTA path, for both query and reference genomes.
    d_paths = {**genomes, **ref_genomes}

    # Pre-filter using Mash if specified.
    if not no_mash:
        dir_mash = os.path.join(out_dir, DIR_ANI_REP_INT_MASH)

        mash = Mash(self.cpus, dir_mash, prefix)
        self.logger.info(f'Using Mash version {mash.version()}')
        mash_results = mash.run(genomes, ref_genomes, max_d, mash_k, mash_v,
                                mash_s, mash_db)
        for qry_gid, ref_hits in mash_results.items():
            d_compare[qry_gid].update(ref_hits.keys())

    # Compare against all reference genomes.
    else:
        for qry_gid in genomes:
            d_compare[qry_gid] = set(ref_genomes.keys())

    self.logger.info(
        f'Calculating ANI with FastANI v{FastANI._get_version()}.')
    fastani = FastANI(self.cpus, force_single=True)
    fastani_results = fastani.run(d_compare, d_paths)

    taxonomy = Taxonomy().read(TAXONOMY_FILE, canonical_ids=True)
    # Constructed for their side effect only; the instances are not kept.
    # NOTE(review): presumably the constructors write the summary files.
    ANISummaryFile(out_dir, prefix, fastani_results, taxonomy)
    ANIClosestFile(out_dir, prefix, fastani_results, genomes, min_af,
                   taxonomy)
def _assign_taxon_labels(self, fmeasure_for_taxa):
    """Assign taxon labels to nodes.

    Only taxa with exactly one candidate node are placed; the taxon is
    appended to any taxon label already present on that node.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
      Node with highest F-measure for each taxon.

    Returns
    -------
    set
      Taxon labels placed in tree.
    """
    placed_taxon = set()
    for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
        # Skip taxa without a single, unambiguous position.
        if len(fmeasure_for_taxa[taxon]) != 1:
            continue

        placed_taxon.add(taxon)
        node = fmeasure_for_taxa[taxon][0].node

        support, taxon_label, aux_info = parse_label(node.label)
        if taxon_label:
            # A node can carry several taxa, separated by '; '.
            taxon_label += '; ' + taxon
        else:
            taxon_label = taxon
        node.label = create_label(support, taxon_label, aux_info)

    return placed_taxon
def test_read_canonical(self):
    """Taxonomy.read(canonical_ids=True) keys the result by canonical genome id."""
    lineage_1 = ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S1']
    lineage_2 = ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S2']
    to_write = {
        'GCF_005435136.1': lineage_1,
        'RS_GCF_005435135.1': lineage_2,
    }
    # Expected keys are the canonical forms of the accessions written above.
    expected = {
        'G005435136': lineage_1,
        'G005435135': lineage_2,
    }
    path_tax = os.path.join(self.dir_tmp, 'tax_file.tsv')

    rows = [f'{k}\t{";".join(v)}\n' for k, v in to_write.items()]
    with open(path_tax, 'w') as f:
        f.writelines(rows)

    result = Taxonomy().read(path_tax, canonical_ids=True)
    self.assertDictEqual(expected, result)
def test_read_error(self):
    """Taxonomy.read raises GTDBTkExit for a comma separated taxonomy file."""
    expected = {
        'GCF_005435136.1': ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S'],
    }
    path_tax = os.path.join(self.dir_tmp, 'tax_file.tsv')

    # Rows use ',' between the genome id and the lineage instead of a tab.
    rows = [f'{k},{";".join(v)}\n' for k, v in expected.items()]
    with open(path_tax, 'w') as f:
        f.writelines(rows)

    t = Taxonomy()
    with self.assertRaises(GTDBTkExit):
        t.read(path_tax)
def _write_statistics_table(self, fmeasure_for_taxa, taxonomy, out_table):
    """Write table containing statistics for each taxon.

    The process exits (``sys.exit()``) if any taxon has a number of
    positions other than one; rows written before that point remain in
    the output file.

    Parameters
    ----------
    fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
      Node with highest F-measure for each taxon.
    taxonomy : d[unique_id] -> [d__<taxon>; ...; s__<taxon>]
      Taxonomic information for taxa in tree of interest.
    out_table : str
      Output table to write statistics for assigned labels.
    """
    # get extant taxa
    extant_taxa = Taxonomy().extant_taxa(taxonomy)

    # The context manager guarantees the table is flushed and closed even
    # when sys.exit() is triggered part-way through the loop.
    with open(out_table, 'w') as fout_table:
        fout_table.write(
            'Taxon\tNo. Expected in Tree\tF-measure\tPrecision\tRecall')
        fout_table.write('\tNo. Genomes from Taxon\tNo. Genome In Lineage')
        fout_table.write('\tRogue out\tRogue in\n')

        for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
            if len(fmeasure_for_taxa[taxon]) != 1:
                self.logger.error(
                    'Multiple positions specified for taxon label.')
                sys.exit()

            num_genomes = len(extant_taxa[taxon])

            stat_table = fmeasure_for_taxa[taxon][0]
            fout_table.write(
                '%s\t%d\t%.4f\t%.4f\t%.4f\t%d\t%d\t%s\t%s\n' %
                (taxon, num_genomes, stat_table.fmeasure,
                 stat_table.precision, stat_table.recall,
                 stat_table.taxa_in_lineage,
                 stat_table.num_leaves_with_taxa,
                 ','.join(stat_table.rogue_out),
                 ','.join(stat_table.rogue_in)))
def _read_taxonomy_files(self, options) -> Dict[str, Tuple[str, str, str, str, str, str, str]]:
    """Read and merge taxonomy files.

    The GTDB taxonomy is read first. Entries from the GTDB-Tk
    classification file and then from the custom taxonomy file (when
    given in ``options``) are added on top, replacing the entry of any
    genome that is already present.

    Parameters
    ----------
    options : argparse.Namespace
        Must provide ``gtdbtk_classification_file`` and
        ``custom_taxonomy_file`` (either may be falsy).

    Returns
    -------
    dict
        Genome id -> taxonomy, after merging all sources.

    Raises
    ------
    GTDBTkExit
        If the classification and custom taxonomy files both specify a
        taxonomy for the same genome.
    """
    self.logger.info('Reading GTDB taxonomy for representative genomes.')
    taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    def _overlay(source):
        """Copy source into taxonomy; return how many existing ids were replaced."""
        replaced = sum(1 for gid in source if gid in taxonomy)
        taxonomy.update(source)
        return replaced

    if options.gtdbtk_classification_file:
        # add and overwrite taxonomy for genomes specified in the
        # GTDB-Tk classification file
        check_file_exists(options.gtdbtk_classification_file)

        self.logger.info('Reading GTDB-Tk classification file.')
        gtdbtk_taxonomy = Taxonomy().read(options.gtdbtk_classification_file)
        # Drop the 'user_genome' entry (presumably produced by the header
        # row of the classification file); tolerate files that lack it.
        gtdbtk_taxonomy.pop('user_genome', None)

        num_reassigned = _overlay(gtdbtk_taxonomy)

        self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
        self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')

    if options.custom_taxonomy_file:
        # add and overwrite taxonomy for genomes specified in the
        # custom taxonomy file
        check_file_exists(options.custom_taxonomy_file)

        self.logger.info('Reading custom taxonomy file.')
        custom_taxonomy = Taxonomy().read(options.custom_taxonomy_file)

        num_reassigned = _overlay(custom_taxonomy)

        self.logger.info(f'Read custom taxonomy for {len(custom_taxonomy):,} genomes.')
        self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')

    if options.gtdbtk_classification_file and options.custom_taxonomy_file:
        # The two user supplied sources must describe disjoint genome sets.
        dup_genomes = set(gtdbtk_taxonomy).intersection(custom_taxonomy)
        if len(dup_genomes) > 0:
            self.logger.error('GTDB-Tk classification and custom taxonomy '
                              'files must not specify taxonomies for the '
                              'same genomes.')
            self.logger.error(f'These files have {len(dup_genomes):,} genomes in common.')
            self.logger.error(f'Example duplicate genome: {dup_genomes.pop()}')
            raise GTDBTkExit('Duplicated taxonomy information.')

    self.logger.info(f'Read taxonomy for {len(taxonomy):,} genomes.')

    return taxonomy
def align(self,
          identify_dir,
          skip_gtdb_refs,
          taxa_filter,
          min_perc_aa,
          custom_msa_filters,
          skip_trimming,
          rnd_seed,
          cols_per_gene,
          min_consensus,
          max_consensus,
          min_per_taxa,
          out_dir,
          prefix,
          outgroup_taxon,
          genomes_to_process=None):
    """Align marker genes in genomes.

    For each domain (bacterial, archaeal) with at least one genome, the
    user genomes are aligned, the alignment is trimmed (canonical mask,
    custom filtering, or no trimming) and the resulting MSAs plus a list
    of filtered genomes are written below ``out_dir``.

    Parameters
    ----------
    identify_dir : str
        Directory holding the output of the identify step.
    skip_gtdb_refs : bool
        If True, GTDB reference sequences are left out and the combined
        GTDB + user MSA is not written.
    taxa_filter, outgroup_taxon
        Forwarded to ``_msa_filter_by_taxa`` when selecting GTDB
        reference sequences.
    min_perc_aa : float
        Percentage (0-100); divided by 100 before being used for
        filtering genomes.
    custom_msa_filters : bool
        If True (and ``skip_trimming`` is False), columns are selected
        with ``TrimMSA`` instead of the canonical mask.
    skip_trimming : bool
        If True, no column filtering is applied.
    rnd_seed, cols_per_gene
        Forwarded to ``TrimMSA``.
    min_consensus, max_consensus, min_per_taxa : float
        Percentages (0-100) forwarded to ``TrimMSA`` after division by 100.
    out_dir : str
        Directory the output files are written to.
    prefix : str
        Prefix used in output file names.
    genomes_to_process : Optional[dict]
        Genomes supplied by the user, keyed by genome id. When given, it
        is used to check that the identify directory is consistent.

    Raises
    ------
    InconsistentGenomeBatch
        If the identify directory contains genomes that are neither in
        ``genomes_to_process`` nor listed as failed genomes.
    GTDBTkExit
        If no genome could be assigned to a domain.
    """
    # read genomes that failed identify steps to skip them
    failed_genomes_file = os.path.join(identify_dir,
                                       PATH_FAILS.format(prefix=prefix))
    if os.path.isfile(failed_genomes_file):
        with open(failed_genomes_file) as fgf:
            # Blank lines are ignored so they cannot raise an IndexError.
            failed_genomes = [row.split()[0] for row in fgf if row.strip()]
    else:
        failed_genomes = list()

    # If the user is re-running this step, check if the identify step is consistent.
    genomic_files = self._path_to_identify_data(identify_dir,
                                                identify_dir != out_dir)
    if genomes_to_process is not None and len(genomic_files) != len(
            genomes_to_process):
        # list.sort() returns None, so comparing the results of two
        # .sort() calls is always "equal" and can never detect a mismatch.
        # Instead, look for genomes in the identify directory that are
        # explained neither by the input list nor by the failed genomes.
        unexpected_genomes = (set(genomic_files.keys()) -
                              set(genomes_to_process.keys()) -
                              set(failed_genomes))
        if unexpected_genomes:
            self.logger.error(
                '{} are not present in the input list of genome to process.'
                .format(sorted(unexpected_genomes)))
            raise InconsistentGenomeBatch(
                'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                'genomes not present in your initial identify directory. Remove them, or run '
                'GTDB-Tk on a new directory.')

    # If this is being run as a part of classify_wf, copy the required files.
    if identify_dir != out_dir:
        identify_path = os.path.join(out_dir, DIR_IDENTIFY)
        make_sure_path_exists(identify_path)
        copy(CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
        copy(CopyNumberFileAR53(identify_dir, prefix).path, identify_path)
        copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)

    # Create the align intermediate directory.
    make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

    # Write out files with marker information
    ar53_marker_info_file = MarkerInfoFileAR53(out_dir, prefix)
    ar53_marker_info_file.write()
    bac120_marker_info_file = MarkerInfoFileBAC120(out_dir, prefix)
    bac120_marker_info_file.write()

    # Determine what domain each genome belongs to.
    bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
        identify_dir, prefix)
    if len(bac_gids) + len(ar_gids) == 0:
        raise GTDBTkExit(f'Unable to assign a domain to any genomes, '
                         f'please check the identify marker summary file, '
                         f'and verify genome quality.')

    self.logger.info(
        f'Aligning markers in {len(genomic_files):,} genomes with {self.cpus} CPUs.'
    )
    # One entry per domain: (genome ids, reference MSA, mask file,
    # marker set id, domain name for messages, copy number file class).
    dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120,
                 "bac120", 'bacterial', CopyNumberFileBAC120),
                (ar_gids, Config.CONCAT_AR53, Config.MASK_AR53, "ar53",
                 'archaeal', CopyNumberFileAR53))
    gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
    for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

        # No genomes identified as this domain.
        if len(gids) == 0:
            continue

        self.logger.info(
            f'Processing {len(gids):,} genomes identified as {domain_str}.'
        )
        if marker_set_id == 'bac120':
            marker_info_file = bac120_marker_info_file
            marker_filtered_genomes = os.path.join(
                out_dir, PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
            marker_msa_path = os.path.join(
                out_dir, PATH_BAC120_MSA.format(prefix=prefix))
            marker_user_msa_path = os.path.join(
                out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
        else:
            marker_info_file = ar53_marker_info_file
            marker_filtered_genomes = os.path.join(
                out_dir, PATH_AR53_FILTERED_GENOMES.format(prefix=prefix))
            marker_msa_path = os.path.join(
                out_dir, PATH_AR53_MSA.format(prefix=prefix))
            marker_user_msa_path = os.path.join(
                out_dir, PATH_AR53_USER_MSA.format(prefix=prefix))

        cur_genome_files = {
            gid: f
            for gid, f in genomic_files.items() if gid in gids
        }

        if skip_gtdb_refs:
            gtdb_msa = {}
        else:
            gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                taxa_filter,
                                                outgroup_taxon)
        gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

        # Generate the user MSA.
        user_msa = align.align_marker_set(cur_genome_files,
                                          marker_info_file, copy_number_f,
                                          self.cpus)
        if len(user_msa) == 0:
            self.logger.warning(
                f'Identified {len(user_msa):,} single copy {domain_str} hits.'
            )
            continue

        # Write the individual marker alignments to disk
        if self.debug:
            self._write_individual_markers(user_msa, marker_set_id,
                                           marker_info_file.path, out_dir,
                                           prefix)

        # filter columns without sufficient representation across taxa
        if skip_trimming:
            self.logger.info(
                'Skipping custom filtering and selection of columns.')
            pruned_seqs = {}
            trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

        elif custom_msa_filters:
            aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
            self.logger.info(
                'Performing custom filtering and selection of columns.')

            trim_msa = TrimMSA(
                cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                os.path.join(out_dir, f'filter_{marker_set_id}'))

            trimmed_seqs, pruned_seqs = trim_msa.trim(
                aligned_genomes, marker_info_file.path)

            if trimmed_seqs:
                self.logger.info(
                    'Filtered MSA from {:,} to {:,} AAs.'.format(
                        len(list(aligned_genomes.values())[0]),
                        len(list(trimmed_seqs.values())[0])))

            self.logger.info(
                'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                .format(len(pruned_seqs), min_perc_aa))

            filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
            if len(filtered_user_genomes):
                self.logger.info(
                    f'Filtered genomes include {len(filtered_user_genomes)} user submitted genomes.'
                )
        else:
            self.logger.log(
                Config.LOG_TASK,
                f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
            )
            trimmed_seqs, pruned_seqs = self._apply_mask(
                gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
            # Same guard as the custom filtering branch: there is no
            # sequence to measure when every genome has been pruned.
            if trimmed_seqs:
                self.logger.info(
                    'Masked {} alignment from {:,} to {:,} AAs.'.format(
                        domain_str, len(list(user_msa.values())[0]),
                        len(list(trimmed_seqs.values())[0])))

            if min_perc_aa > 0:
                self.logger.info(
                    '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), domain_str, min_perc_aa))

        # write out filtering information
        with open(marker_filtered_genomes, 'w') as fout:
            for pruned_seq_id, pruned_seq in pruned_seqs.items():
                if len(pruned_seq) == 0:
                    perc_alignment = 0
                else:
                    # Alphabetic characters are counted as aligned residues.
                    valid_bases = sum(
                        [1 for c in pruned_seq if c.isalpha()])
                    perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                fout.write(
                    f'{pruned_seq_id}\tInsufficient number of amino acids in MSA ({perc_alignment:.1f}%)\n'
                )

        # write out MSAs
        if not skip_gtdb_refs:
            self.logger.info(
                f'Creating concatenated alignment for {len(trimmed_seqs):,} '
                f'{domain_str} GTDB and user genomes.')
            self._write_msa(trimmed_seqs,
                            marker_msa_path,
                            gtdb_taxonomy,
                            zip_output=True)

        trimmed_user_msa = {
            k: v
            for k, v in trimmed_seqs.items() if k in user_msa
        }
        if len(trimmed_user_msa) > 0:
            self.logger.info(
                f'Creating concatenated alignment for {len(trimmed_user_msa):,} '
                f'{domain_str} user genomes.')
            self._write_msa(trimmed_user_msa,
                            marker_user_msa_path,
                            gtdb_taxonomy,
                            zip_output=True)
        else:
            self.logger.info(
                f'All {domain_str} user genomes have been filtered out.')
def align(self,
          identify_dir,
          skip_gtdb_refs,
          taxa_filter,
          min_perc_aa,
          custom_msa_filters,
          skip_trimming,
          rnd_seed,
          cols_per_gene,
          min_consensus,
          max_consensus,
          min_per_taxa,
          out_dir,
          prefix,
          outgroup_taxon,
          genomes_to_process=None):
    """Align marker genes in genomes.

    For each marker set (bac120, ar122) with at least one genome, the
    user genomes are aligned, the alignment is trimmed (canonical mask,
    custom filtering, or no trimming), the resulting MSAs plus a list of
    filtered genomes are written below ``out_dir`` and symlinks to these
    files are created directly in ``out_dir``.

    Parameters
    ----------
    identify_dir : str
        Directory holding the output of the identify step.
    skip_gtdb_refs : bool
        If True, GTDB reference sequences are left out and the combined
        GTDB + user MSA is not written.
    taxa_filter, outgroup_taxon
        Forwarded to ``_msa_filter_by_taxa`` when selecting GTDB
        reference sequences.
    min_perc_aa : float
        Percentage (0-100); divided by 100 before being used for
        filtering genomes.
    custom_msa_filters : bool
        If True (and ``skip_trimming`` is False), columns are selected
        with ``TrimMSA`` instead of the canonical mask.
    skip_trimming : bool
        If True, no column filtering is applied.
    rnd_seed, cols_per_gene
        Forwarded to ``TrimMSA``.
    min_consensus, max_consensus, min_per_taxa : float
        Percentages (0-100) forwarded to ``TrimMSA`` after division by 100.
    out_dir : str
        Directory the output files are written to.
    prefix : str
        Prefix used in output file names.
    genomes_to_process : Optional[dict]
        Genomes supplied by the user, keyed by genome id. When given,
        its size must match the number of genomes found in the identify
        directory.

    Raises
    ------
    InconsistentGenomeBatch
        If ``genomes_to_process`` is given and its size differs from the
        number of genomes found in the identify directory.
    GenomeMarkerSetUnknown
        If the marker set is neither 'bac120' nor 'ar122' when the
        symlinks are created.
    """
    # When run as part of a workflow, copy the identify summaries over.
    if identify_dir != out_dir:
        if not os.path.isdir(os.path.join(out_dir, DIR_IDENTIFY)):
            os.makedirs(os.path.join(out_dir, DIR_IDENTIFY))

        copy(
            os.path.join(identify_dir,
                         PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)),
            os.path.join(out_dir, DIR_IDENTIFY))
        copy(
            os.path.join(identify_dir,
                         PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)),
            os.path.join(out_dir, DIR_IDENTIFY))

        identify_gene_file = os.path.join(
            identify_dir, PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))
        copy(identify_gene_file, os.path.join(out_dir, DIR_IDENTIFY))

    if not os.path.exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE)):
        os.makedirs(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

    # write out files with marker information
    bac120_marker_info_file = os.path.join(
        out_dir, PATH_BAC120_MARKER_INFO.format(prefix=prefix))
    self._write_marker_info(Config.BAC120_MARKERS, bac120_marker_info_file)
    ar122_marker_info_file = os.path.join(
        out_dir, PATH_AR122_MARKER_INFO.format(prefix=prefix))
    self._write_marker_info(Config.AR122_MARKERS, ar122_marker_info_file)

    genomic_files = self._path_to_identify_data(identify_dir,
                                                identify_dir != out_dir)
    if genomes_to_process is not None and len(genomic_files) != len(
            genomes_to_process):
        self.logger.error(
            '{} are not present in the input list of genome to process.'.
            format(
                list(
                    set(genomic_files.keys()) -
                    set(genomes_to_process.keys()))))
        raise InconsistentGenomeBatch(
            'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
            'genomes not present in your initial identify directory. Remove them, or run '
            'GTDB-Tk on a new directory.')

    self.logger.info('Aligning markers in %d genomes with %d threads.' %
                     (len(genomic_files), self.cpus))

    # determine marker set for each user genome
    bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
        identify_dir, prefix)

    # align user genomes
    gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
    for gids, msa_file, mask_file, marker_set_id in ((bac_gids,
                                                      Config.CONCAT_BAC120,
                                                      Config.MASK_BAC120,
                                                      "bac120"),
                                                     (ar_gids,
                                                      Config.CONCAT_AR122,
                                                      Config.MASK_AR122,
                                                      "ar122")):

        domain_str = 'archaeal'
        if marker_set_id == 'bac120':
            domain_str = 'bacterial'

        # No genomes identified as this domain.
        if len(gids) == 0:
            continue

        self.logger.info(
            'Processing {:,} genomes identified as {}.'.format(
                len(gids), domain_str))
        if marker_set_id == 'bac120':
            marker_info_file = bac120_marker_info_file
            marker_filtered_genomes = os.path.join(
                out_dir, PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
            marker_msa_path = os.path.join(
                out_dir, PATH_BAC120_MSA.format(prefix=prefix))
            marker_user_msa_path = os.path.join(
                out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
        else:
            marker_info_file = ar122_marker_info_file
            marker_filtered_genomes = os.path.join(
                out_dir, PATH_AR122_FILTERED_GENOMES.format(prefix=prefix))
            marker_msa_path = os.path.join(
                out_dir, PATH_AR122_MSA.format(prefix=prefix))
            marker_user_msa_path = os.path.join(
                out_dir, PATH_AR122_USER_MSA.format(prefix=prefix))

        cur_genome_files = {
            gid: f
            for gid, f in genomic_files.items() if gid in gids
        }

        if skip_gtdb_refs:
            gtdb_msa = {}
        else:
            gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                taxa_filter,
                                                outgroup_taxon)
        gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

        hmm_aligner = HmmAligner(self.cpus, self.pfam_top_hit_suffix,
                                 self.tigrfam_top_hit_suffix,
                                 self.protein_file_suffix,
                                 self.pfam_hmm_dir, self.tigrfam_hmms,
                                 Config.BAC120_MARKERS,
                                 Config.AR122_MARKERS)
        user_msa = hmm_aligner.align_marker_set(cur_genome_files,
                                                marker_set_id)

        # Write the individual marker alignments to disk
        if self.debug:
            self._write_individual_markers(user_msa, marker_set_id,
                                           marker_info_file, out_dir,
                                           prefix)

        # filter columns without sufficient representation across taxa
        if skip_trimming:
            self.logger.info(
                'Skipping custom filtering and selection of columns.')
            pruned_seqs = {}
            trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

        elif custom_msa_filters:
            aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
            self.logger.info(
                'Performing custom filtering and selection of columns.')

            trim_msa = TrimMSA(
                cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                os.path.join(out_dir, 'filter_%s' % marker_set_id))

            trimmed_seqs, pruned_seqs = trim_msa.trim(
                aligned_genomes, marker_info_file)

            if trimmed_seqs:
                self.logger.info(
                    'Filtered MSA from {:,} to {:,} AAs.'.format(
                        len(list(aligned_genomes.values())[0]),
                        len(list(trimmed_seqs.values())[0])))

            self.logger.info(
                'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                .format(len(pruned_seqs), min_perc_aa))

            filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
            if len(filtered_user_genomes):
                # '{:,}' (thousands separator) replaces the invalid format
                # spec '{:.}', which raised ValueError when this message
                # was formatted.
                self.logger.info(
                    'Filtered genomes include {:,} user submitted genomes.'
                    .format(len(filtered_user_genomes)))
        else:
            self.logger.info(
                f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
            )
            trimmed_seqs, pruned_seqs = self._apply_mask(
                gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
            self.logger.info(
                'Masked {} alignment from {:,} to {:,} AAs.'.format(
                    domain_str, len(list(user_msa.values())[0]),
                    len(list(trimmed_seqs.values())[0])))

            if min_perc_aa > 0:
                self.logger.info(
                    '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), domain_str, min_perc_aa))

        # write out filtering information
        with open(marker_filtered_genomes, 'w') as fout:
            for pruned_seq_id, pruned_seq in pruned_seqs.items():
                if len(pruned_seq) == 0:
                    perc_alignment = 0
                else:
                    # Alphabetic characters are counted as aligned residues.
                    valid_bases = sum(
                        [1 for c in pruned_seq if c.isalpha()])
                    perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                fout.write(
                    '%s\t%s\n' %
                    (pruned_seq_id,
                     'Insufficient number of amino acids in MSA ({:.1f}%)'.
                     format(perc_alignment)))

        # write out MSAs
        if not skip_gtdb_refs:
            self.logger.info(
                'Creating concatenated alignment for {:,} {} GTDB and user genomes.'
                .format(len(trimmed_seqs), domain_str))
            self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)

        trimmed_user_msa = {
            k: v
            for k, v in trimmed_seqs.items() if k in user_msa
        }
        if len(trimmed_user_msa) > 0:
            self.logger.info(
                'Creating concatenated alignment for {:,} {} user genomes.'
                .format(len(trimmed_user_msa), domain_str))
            self._write_msa(trimmed_user_msa, marker_user_msa_path,
                            gtdb_taxonomy)
        else:
            self.logger.info(
                f'All {domain_str} user genomes have been filtered out.')

        # Create symlinks to the summary files
        if marker_set_id == 'bac120':
            symlink_f(
                PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix),
                os.path.join(
                    out_dir,
                    os.path.basename(
                        PATH_BAC120_FILTERED_GENOMES.format(
                            prefix=prefix))))
            if len(trimmed_user_msa) > 0:
                symlink_f(
                    PATH_BAC120_USER_MSA.format(prefix=prefix),
                    os.path.join(
                        out_dir,
                        os.path.basename(
                            PATH_BAC120_USER_MSA.format(prefix=prefix))))
            if not skip_gtdb_refs:
                symlink_f(
                    PATH_BAC120_MSA.format(prefix=prefix),
                    os.path.join(
                        out_dir,
                        os.path.basename(
                            PATH_BAC120_MSA.format(prefix=prefix))))
        elif marker_set_id == 'ar122':
            symlink_f(
                PATH_AR122_FILTERED_GENOMES.format(prefix=prefix),
                os.path.join(
                    out_dir,
                    os.path.basename(
                        PATH_AR122_FILTERED_GENOMES.format(
                            prefix=prefix))))
            if len(trimmed_user_msa) > 0:
                symlink_f(
                    PATH_AR122_USER_MSA.format(prefix=prefix),
                    os.path.join(
                        out_dir,
                        os.path.basename(
                            PATH_AR122_USER_MSA.format(prefix=prefix))))
            if not skip_gtdb_refs:
                symlink_f(
                    PATH_AR122_MSA.format(prefix=prefix),
                    os.path.join(
                        out_dir,
                        os.path.basename(
                            PATH_AR122_MSA.format(prefix=prefix))))
        else:
            self.logger.error(
                'There was an error determining the marker set.')
            raise GenomeMarkerSetUnknown
class TestClassify(unittest.TestCase):
    """Tests for the Classify class, run against the reference test data."""

    def setUp(self):
        """Create the object under test and a randomly named output location."""
        self.classify = Classify()
        self.generic_out_path = 'tests/data/results'
        # Sub-folder name: 10 random upper-case letters and digits.
        tmp_folder = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(10))
        self.out_dir = os.path.join(self.generic_out_path, tmp_folder)
        if not os.path.exists(self.generic_out_path):
            os.makedirs(self.generic_out_path)
        self.prefix = 'gtdbtk'
        self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
        self.aln_dir_ref = 'tests/data/align_dir_reference/align'
        self.user_msa_file = os.path.join(self.aln_dir_ref,
                                          'gtdbtk.ar122.user_msa.fasta')
        self.taxonomy_file = Config.TAXONOMY_FILE
        self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

    def test_standardise_taxonomy(self):
        """standardise_taxonomy prepends the domain and pads missing ranks."""
        taxstring = 'p__phylum1;c_class1'
        marker_set = 'bac120'
        new_taxstring = self.classify.standardise_taxonomy(
            taxstring, marker_set)
        self.assertEqual(new_taxstring,
                         'd__Bacteria;p__phylum1;c_class1;o__;f__;g__;s__')

        # Test that the correct domain is returned.
        self.assertEqual(
            self.classify.standardise_taxonomy('p__P;c__C;o__O;f__F;g__G;s__S',
                                               'bac120'),
            'd__Bacteria;p__P;c__C;o__O;f__F;g__G;s__S')
        self.assertEqual(
            self.classify.standardise_taxonomy('p__P;c__C;o__O;f__F;g__G;s__S',
                                               'ar122'),
            'd__Archaea;p__P;c__C;o__O;f__F;g__G;s__S')

        # Remove ranks and check
        rank_order = {'p': 0, 'c': 1, 'o': 2, 'f': 3, 'g': 4, 's': 5}
        rank_lst = ['p__P', 'c__C', 'o__O', 'f__F', 'g__G', 's__S']
        ranks = {'p': 'P', 'c': 'C', 'o': 'O', 'f': 'F', 'g': 'G', 's': 'S'}
        dom_info = {'d__Bacteria': 'bac120', 'd__Archaea': 'ar122'}

        for k in range(1, len(ranks) - 1):
            for cur_domain in ('d__Bacteria', 'd__Archaea'):
                # Drop the k lowest ranks from the input lineage.
                ranks_selected = rank_lst[0:-k]
                expected = list()
                test_lst = list()
                for cur_rank, _ in sorted(rank_order.items(),
                                          key=lambda x: x[1]):
                    # Compare the full taxon string: the bare rank prefix
                    # ('p') never appears in ranks_selected ('p__P', ...),
                    # which would make this loop always test an empty lineage.
                    cur_taxon = f'{cur_rank}__{ranks[cur_rank]}'
                    if cur_taxon in ranks_selected:
                        test_lst.append(cur_taxon)
                        expected.append(cur_taxon)
                    else:
                        expected.append(f'{cur_rank}__')

                expected_str = f'{cur_domain};{";".join(expected)}'
                test_str = ";".join(test_lst)

                cur_dom = dom_info[cur_domain]
                test_value = self.classify.standardise_taxonomy(
                    test_str, cur_dom)
                self.assertEqual(expected_str, test_value)

    def test_write_red_dict(self):
        """_write_red_dict returns an entry for each rank from domain to genus."""
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        marker_dict = self.classify._write_red_dict(self.out_dir, self.prefix,
                                                    'bac120')
        self.assertEqual(len(marker_dict), 6)
        self.assertIn('d__', marker_dict)
        self.assertEqual(marker_dict.get('d__'), 0)
        self.assertIn('p__', marker_dict)
        self.assertIn('c__', marker_dict)
        self.assertIn('o__', marker_dict)
        self.assertIn('f__', marker_dict)
        self.assertIn('g__', marker_dict)

    def test_get_pplacer_taxonomy(self):
        """_get_pplacer_taxonomy writes one classification per user genome."""
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        tree = dendropy.Tree.get_from_path(os.path.join(
            os.getcwd(), self.pplacer_dir_reference,
            'gtdbtk.ar122.classify.tree'),
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)
        self.classify._get_pplacer_taxonomy(self.out_dir, self.prefix, 'ar122',
                                            self.user_msa_file, tree)
        # Parse the written file: <genome id> TAB <taxonomy string>.
        results = {}
        with open(
                os.path.join(
                    self.out_dir,
                    PATH_AR122_PPLACER_CLASS.format(prefix=self.prefix)),
                'r') as f:
            for line in f:
                infos = line.strip().split('\t')
                results[infos[0]] = infos[1]
        self.assertEqual(len(results), 3)
        self.assertIn('genome_1', results)
        self.assertIn('genome_2', results)
        self.assertIn('genome_3', results)
        self.assertEqual(
            results.get('genome_1'),
            'd__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter;s__'
        )

    def test_place_genomes(self):
        """place_genomes returns the path of a tree file ending at the Archaea root."""
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        tree_file = self.classify.place_genomes(self.user_msa_file, 'ar122',
                                                self.out_dir, self.prefix)
        with open(tree_file, 'r') as f:
            lines = f.read().splitlines()
            last_line = lines[-1]
        self.assertTrue(last_line.startswith('('))
        self.assertTrue(last_line.endswith('d__Archaea;'))

    def test_formatnote(self):
        """_formatnote reports the genomes that are not among the given labels."""
        first3genomes = list(self.gtdb_taxonomy.keys())[:3]
        sorted_dict = ((first3genomes[0], {
            'ani': 98.5,
            'af': 1.0
        }), (first3genomes[1], {
            'ani': 92.6,
            'af': 1.0
        }), (first3genomes[2], {
            'ani': 90.3,
            'af': 1.3
        }))
        labels = [first3genomes[0]]
        note_list = self.classify._formatnote(sorted_dict, labels)
        self.assertIn(first3genomes[1], note_list[0])
        self.assertIn(first3genomes[2], note_list[1])
        self.assertTrue(note_list[0].endswith(', 92.6, 1.0'))
        self.assertTrue(note_list[1].endswith(', 90.3, 1.3'))

    def test_calculate_red_distances(self):
        """Mean edge length of the tree from _calculate_red_distances is < 0.1."""
        tree = os.path.join(self.pplacer_dir_reference,
                            'gtdbtk.ar122.classify.tree')
        result_tree = self.classify._calculate_red_distances(
            tree, self.out_dir)
        egs2 = [
            eg.length for eg in result_tree.postorder_edge_iter()
            if eg.length is not None
        ]

        self.assertTrue(sum(egs2) / len(egs2) < 0.1)

    def tearDown(self):
        """Remove the shared results folder and everything written below it."""
        shutil.rmtree(self.generic_out_path)
class TestClassify(unittest.TestCase):
    """Tests for the ``Classify`` helpers.

    Every test gets a ``Classify`` instance, a randomly named output
    directory below ``tests/data/results`` and the taxonomy loaded from
    ``Config.TAXONOMY_FILE``.
    """

    def setUp(self):
        """Create the fixture shared by all tests."""
        self.classify = Classify()
        self.generic_out_path = 'tests/data/results'
        # random 10-character folder name so every test has its own out_dir
        tmp_folder = ''.join(random.choice(
            string.ascii_uppercase + string.digits) for _ in range(10))
        self.out_dir = os.path.join(self.generic_out_path, tmp_folder)
        # only the parent is created here; tests create out_dir on demand
        os.makedirs(self.generic_out_path, exist_ok=True)
        self.prefix = 'gtdbtk'
        self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
        self.user_msa_file = os.path.join(
            self.pplacer_dir_reference, 'gtdbtk.ar122.user_msa.fasta')
        self.taxonomy_file = Config.TAXONOMY_FILE
        self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

    def test_standardise_taxonomy(self):
        """A partial taxonomy string is expanded to all seven ranks."""
        taxstring = 'p__phylum1;c_class1'
        marker_set = 'bac120'
        new_taxstring = self.classify.standardise_taxonomy(
            taxstring, marker_set)
        self.assertEqual(
            new_taxstring, 'd__Bacteria;p__phylum1;c_class1;o__;f__;g__;s__')

    def test_write_red_dict(self):
        """``_write_red_dict`` returns six entries keyed ``d__`` to ``g__``."""
        os.makedirs(self.out_dir, exist_ok=True)
        marker_dict = self.classify._write_red_dict(
            self.out_dir, self.prefix, 'bac120')
        self.assertEqual(len(marker_dict), 6)
        self.assertIn('d__', marker_dict)
        self.assertEqual(marker_dict.get('d__'), 0)
        for rank_prefix in ('p__', 'c__', 'o__', 'f__', 'g__'):
            self.assertIn(rank_prefix, marker_dict)

    def test_get_pplacer_taxonomy(self):
        """``_get_pplacer_taxonomy`` writes one taxonomy line per genome."""
        os.makedirs(self.out_dir, exist_ok=True)
        # the reference tree path is resolved relative to the current
        # working directory
        tree = dendropy.Tree.get_from_path(
            os.path.join(os.getcwd(),
                         self.pplacer_dir_reference,
                         'gtdbtk.ar122.classify.tree'),
            schema='newick',
            rooting='force-rooted',
            preserve_underscores=True)
        self.classify._get_pplacer_taxonomy(
            self.out_dir, self.prefix, 'ar122', self.user_msa_file, tree)

        # map first column -> second column of every line in the output file
        results = {}
        out_file = os.path.join(
            self.out_dir,
            PATH_AR122_PPLACER_CLASS.format(prefix=self.prefix))
        with open(out_file, 'r') as f:
            for line in f:
                infos = line.strip().split('\t')
                results[infos[0]] = infos[1]

        self.assertEqual(len(results), 3)
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, results)
        self.assertEqual(
            results.get('genome_1'),
            'd__Archaea;p__Thermoplasmatota;c__MGII;o__MGIII;f__CG-Epi1;g__UBA8886;s__')

    def test_place_genomes(self):
        """The tree written by ``place_genomes`` ends with the Archaea root."""
        os.makedirs(self.out_dir, exist_ok=True)
        tree_file = self.classify.place_genomes(
            self.user_msa_file, 'ar122', self.out_dir, self.prefix)
        with open(tree_file, 'r') as f:
            lines = f.read().splitlines()
        last_line = lines[-1]
        self.assertTrue(last_line.startswith('('))
        self.assertTrue(last_line.endswith('d__Archaea;'))

    def test_formatnote(self):
        """``_formatnote`` reports the genomes that are not in ``labels``."""
        # dict views cannot be sliced on Python 3, so materialise the keys
        first3genomes = list(self.gtdb_taxonomy.keys())[:3]
        sorted_dict = ((first3genomes[0], {'ani': 98.5, 'af': 1.0}),
                       (first3genomes[1], {'ani': 92.6, 'af': 1.0}),
                       (first3genomes[2], {'ani': 90.3, 'af': 1.3}))
        labels = [first3genomes[0]]
        note_list = self.classify._formatnote(sorted_dict, labels)
        self.assertIn(first3genomes[1], note_list[0])
        self.assertIn(first3genomes[2], note_list[1])
        self.assertTrue(note_list[0].endswith(', 92.6, 1.0'))
        self.assertTrue(note_list[1].endswith(', 90.3, 1.3'))

    def test_calculate_red_distances(self):
        """The mean edge length of the RED-scaled tree stays below 0.1."""
        tree = os.path.join(self.pplacer_dir_reference,
                            'gtdbtk.ar122.classify.tree')
        result_tree = self.classify._calculate_red_distances(
            tree, self.out_dir)
        # edges without a length are left out of the average
        egs2 = [eg.length for eg in result_tree.postorder_edge_iter()
                if eg.length is not None]
        self.assertLess(sum(egs2) / len(egs2), 0.1)

    def tearDown(self):
        """Remove the results directory created in ``setUp``."""
        # NOTE(review): this deletes all of 'tests/data/results', not just
        # this test's out_dir - confirm nothing else is stored there.
        shutil.rmtree(self.generic_out_path)
def _fmeasure(self, tree, taxonomy):
    """Find node with highest F-measure for each taxon.

    Finds the best placement for each taxon label by calculating the
    F-measure of the taxon at the nodes below its (already placed) parent
    taxon, or at every node of the tree when that lineage contains fewer
    than half of the taxon's genomes.

    As a side effect every node of `tree` gets two new attributes,
    `num_leaves` and `taxa_count`.

    Parameters
    ----------
    tree : Tree
      Dendropy Tree.
    taxonomy : d[extant_taxon_id] -> taxa list
      Taxon labels for extant taxa; must contain the label of every leaf
      in `tree`.

    Returns
    -------
    d[taxon] -> [StatsTable, ...]
        Node(s) with highest F-measure for each taxon. Each entry carries
        node, fmeasure, precision, recall, taxa_in_lineage, total_taxa,
        num_leaves_with_taxa, rogue_out and rogue_in. Taxa whose parent
        taxon was not placed are absent.
    """

    # get named lineages/taxa at each taxonomic rank
    taxa_at_rank = Taxonomy().named_lineages_at_rank(taxonomy)

    # get extant taxa for each taxon label
    extent_taxa_with_label = {}
    for i, rank in enumerate(Taxonomy.rank_labels):
        extent_taxa_with_label[i] = Taxonomy().extant_taxa_for_rank(
            rank, taxonomy)

    # get parent taxon for each taxon:
    taxon_parents = Taxonomy().parents(taxonomy)

    # get number of leaves and taxon in each lineage
    self.logger.info('Calculating taxa within each lineage.')
    for node in tree.preorder_node_iter():
        num_leaves = 0
        # taxa_count[rank_index][taxon] -> number of leaves below this node
        # labelled with that taxon
        taxa_count = defaultdict(lambda: defaultdict(int))
        for leaf in node.leaf_iter():
            num_leaves += 1
            for rank_index, taxon in enumerate(taxonomy[leaf.taxon.label]):
                # labels equal to the bare rank prefix are not counted
                if taxon != Taxonomy.rank_prefixes[rank_index]:
                    taxa_count[rank_index][taxon] += 1

        node.num_leaves = num_leaves
        node.taxa_count = taxa_count

    # number of leaves in the whole tree carrying each taxon label
    taxa_in_tree = defaultdict(int)
    for leaf in tree.leaf_node_iter():
        for taxon in taxonomy[leaf.taxon.label]:
            taxa_in_tree[taxon] += 1

    # find node with best F-measure for each taxon
    fmeasure_for_taxa = {}
    # ranks are processed in order so that the placement of a parent taxon
    # is already known when its children are handled
    for rank_index in range(0, len(Taxonomy.rank_labels)):
        # if rank_index == 6: #*** skip species
        #    continue
        self.logger.info('Processing {:,} taxa at {} rank.'.format(
            len(taxa_at_rank[rank_index]),
            Taxonomy.rank_labels[rank_index].capitalize()))

        for taxon in taxa_at_rank[rank_index]:
            if rank_index == 0:
                # processing taxa at the domain is a special case
                taxon_parent_node = tree.seed_node
            else:
                # find first named parent
                # e.g., Cyanobacteria for Synechococcales in d__Bacteria;p__Cyanobacteria;c__;o__Synechococcales
                # a three character label (such as the 'x__' placeholder or
                # 'c__' above) counts as unnamed and the search moves on to
                # the next higher rank
                parent_taxon = 'x__'
                parent_index = rank_index - 1
                while len(parent_taxon) == 3 and parent_index != -1:
                    parent_taxon = taxon_parents[taxon][parent_index]
                    parent_index -= 1

                if parent_taxon in fmeasure_for_taxa:
                    # only need to process the lineage below the parent node,
                    # but must take the MRCA if the placement of the parent
                    # taxon is unresolved
                    parent_nodes = []
                    for stat_table in fmeasure_for_taxa[parent_taxon]:
                        parent_nodes.append(stat_table.node)

                    if len(parent_nodes) == 1:
                        taxon_parent_node = parent_nodes[0]
                    else:
                        taxa = []
                        for p in parent_nodes:
                            taxa += [leaf.taxon for leaf in p.leaf_iter()]
                        taxon_parent_node = tree.mrca(taxa=taxa)

                    if taxon_parent_node.taxa_count[rank_index][
                            taxon] < 0.5 * taxa_in_tree[taxon]:
                        # substantial portion of genomes for taxon fall outside
                        # the parent lineages so best search the entire tree
                        taxon_parent_node = tree.seed_node
                else:
                    # the parent for this taxon was not placed so
                    # it can be ignored (e.g., bacterial phylum in archaeal tree)
                    continue

            # -1 is below any F-measure computed in the loop, so the first
            # candidate node always becomes the current best
            cur_taxon_fmeasure = -1
            cur_taxa = set(extent_taxa_with_label[rank_index][taxon])
            total_taxa = len(cur_taxa)

            for node in taxon_parent_node.preorder_iter():
                taxa_in_lineage = node.taxa_count[rank_index][taxon]
                # leaves below the node with any named taxon at this rank
                num_leaves_with_taxa = sum(
                    node.taxa_count[rank_index].values())

                if taxa_in_lineage != 0 and num_leaves_with_taxa != 0:
                    precision = float(
                        taxa_in_lineage) / num_leaves_with_taxa
                    recall = float(taxa_in_lineage) / total_taxa
                    fmeasure = (2 * precision * recall) / (precision +
                                                           recall)

                    if fmeasure >= cur_taxon_fmeasure:
                        node_taxa = set(
                            [l.taxon.label for l in node.leaf_iter()])
                        # genomes with this taxon that lie outside the node
                        rogue_out = cur_taxa - node_taxa
                        # genomes below the node that carry a different,
                        # named taxon at this rank
                        rogue_in = []
                        for gid in node_taxa - cur_taxa:
                            if taxonomy[gid][
                                    rank_index] != Taxonomy.rank_prefixes[
                                        rank_index]:
                                rogue_in.append(gid)

                        stat_table = self.StatsTable(
                            node=node,
                            fmeasure=fmeasure,
                            precision=precision,
                            recall=recall,
                            taxa_in_lineage=taxa_in_lineage,
                            total_taxa=total_taxa,
                            num_leaves_with_taxa=num_leaves_with_taxa,
                            rogue_out=rogue_out,
                            rogue_in=rogue_in)

                        # a strictly better node replaces earlier results,
                        # an equally good node is kept alongside them
                        if fmeasure > cur_taxon_fmeasure:
                            cur_taxon_fmeasure = fmeasure
                            fmeasure_for_taxa[taxon] = [stat_table]
                        elif fmeasure == cur_taxon_fmeasure:
                            fmeasure_for_taxa[taxon].append(stat_table)

    return fmeasure_for_taxa