def infer(self, options):
    """Infer a tree from a user specified MSA.

    The output tree, tree log and FastTree log paths are built beneath
    ``options.out_dir`` and handed to ``FastTree.run``. When invoked via the
    ``infer`` sub-command, a symlink to the tree is also created directly
    inside ``options.out_dir``.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.
    """
    check_file_exists(options.msa_file)
    make_sure_path_exists(options.out_dir)

    # The multi-threaded binary is only required when more than one CPU is requested.
    if options.cpus > 1:
        check_dependencies(['FastTreeMP'])
    else:
        check_dependencies(['FastTree'])

    # Build the paths relative to the output directory first; the relative
    # tree path is reused below as the symlink target.
    if hasattr(options, 'suffix'):
        rel_tree = PATH_MARKER_UNROOTED_TREE.format(prefix=options.prefix,
                                                    marker=options.suffix)
        rel_tree_log = PATH_MARKER_TREE_LOG.format(prefix=options.prefix,
                                                   marker=options.suffix)
        rel_fasttree_log = PATH_MARKER_FASTTREE_LOG.format(prefix=options.prefix,
                                                           marker=options.suffix)
    else:
        rel_tree = PATH_UNROOTED_TREE.format(prefix=options.prefix)
        rel_tree_log = PATH_TREE_LOG.format(prefix=options.prefix)
        rel_fasttree_log = PATH_FASTTREE_LOG.format(prefix=options.prefix)

    output_tree = os.path.join(options.out_dir, rel_tree)
    tree_log = os.path.join(options.out_dir, rel_tree_log)
    fasttree_log = os.path.join(options.out_dir, rel_fasttree_log)

    fasttree = FastTree()
    fasttree.run(output_tree, tree_log, fasttree_log, options.prot_model,
                 options.no_support, options.no_gamma, options.msa_file,
                 options.cpus)
    self.logger.info(f'FastTree version: {fasttree.version}')

    if hasattr(options, 'subparser_name') and options.subparser_name == 'infer':
        # Use the relative path directly rather than slicing
        # ``len(out_dir) + 1`` characters off the joined path: the slice is
        # wrong when out_dir ends with a path separator or is empty.
        symlink_f(rel_tree,
                  os.path.join(options.out_dir, os.path.basename(rel_tree)))

    self.logger.info('Done.')
def root(self, options):
    """Root tree using outgroup.

    Genomes whose taxonomy contains ``options.outgroup_taxon`` form the
    outgroup passed to ``RerootTree.root_with_outgroup``.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.

    Raises
    ------
    GenomeMarkerSetUnknown
        If ``options.suffix`` is present but is neither 'bac120' nor 'ar122'.
    """
    self.logger.warning("Tree rooting is still under development!")

    check_file_exists(options.input_tree)

    if options.custom_taxonomy_file:
        check_file_exists(options.custom_taxonomy_file)
        taxonomy = Taxonomy().read(options.custom_taxonomy_file)
    else:
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    self.logger.info('Identifying genomes from the specified outgroup.')
    outgroup = {genome_id
                for genome_id, taxa in taxonomy.items()
                if options.outgroup_taxon in taxa}

    reroot = RerootTree()
    reroot.root_with_outgroup(options.input_tree, options.output_tree,
                              outgroup)

    # Symlink to the tree summary file, if not run independently
    if hasattr(options, 'suffix'):
        if options.suffix == 'bac120':
            rooted_tree = PATH_BAC120_ROOTED_TREE.format(prefix=options.prefix)
        elif options.suffix == 'ar122':
            rooted_tree = PATH_AR122_ROOTED_TREE.format(prefix=options.prefix)
        else:
            raise GenomeMarkerSetUnknown(
                'There was an error determining the marker set.')

        # The link name is derived from the same path as the link target, so
        # the bac120 tree is no longer linked under the ar122 file name.
        symlink_f(rooted_tree,
                  os.path.join(options.out_dir, os.path.basename(rooted_tree)))

    self.logger.info('Done.')
def decorate(self, options):
    """Decorate tree with GTDB taxonomy.

    Parameters
    ----------
    options : argparse.Namespace
        The CLI arguments input by the user.

    Raises
    ------
    GenomeMarkerSetUnknown
        If ``options.suffix`` is present but is neither 'bac120' nor 'ar122'.
    """
    check_file_exists(options.input_tree)

    taxonomy = self._read_taxonomy_files(options)

    decorator = Decorate()
    decorator.run(options.input_tree, taxonomy, options.output_tree)

    self.logger.info('Done.')

    # Nothing to link when the command is run independently.
    if not hasattr(options, 'suffix'):
        return

    # Pick the decorated-tree path template for the marker set.
    if options.suffix == 'bac120':
        tree_template = PATH_BAC120_DECORATED_TREE
    elif options.suffix == 'ar122':
        tree_template = PATH_AR122_DECORATED_TREE
    else:
        raise GenomeMarkerSetUnknown(
            'There was an error determining the marker set.')

    # Symlink the decorated tree and its companion '-table' file.
    decorated_tree = tree_template.format(prefix=options.prefix)
    for rel_path in (decorated_tree, decorated_tree + '-table'):
        link_path = os.path.join(options.out_dir, os.path.basename(rel_path))
        symlink_f(rel_path, link_path)
def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
    """Report statistics for identified marker genes.

    Builds the translation table summary and the AR122 / BAC120 copy number
    summaries from every genome in ``gene_dict``, writes them, and symlinks
    the three summary files into ``outdir``.

    Parameters
    ----------
    gene_dict : dict
        Maps a genome id to a mapping that is queried for the keys
        "aa_gene_path" and "best_translation_table".
    outdir : str
        The output directory.
    prefix : str
        The prefix substituted into the output path templates.
    """
    tln_summary = TlnTableSummaryFile(outdir, prefix)
    ar122_copy_numbers = CopyNumberFileAR122(outdir, prefix)
    bac120_copy_numbers = CopyNumberFileBAC120(outdir, prefix)

    # Genomes are visited in sorted id order.
    for genome_id in sorted(gene_dict):
        genome_info = gene_dict[genome_id]
        marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

        # Load the Pfam and TIGRFAM top hits for this genome.
        pfam_hits = TopHitPfamFile(marker_dir, genome_id)
        tigr_hits = TopHitTigrFile(marker_dir, genome_id)
        pfam_hits.read()
        tigr_hits.read()

        # Record the genome in both domain-specific copy number summaries.
        aa_gene_path = genome_info.get("aa_gene_path")
        ar122_copy_numbers.add_genome(genome_id, aa_gene_path,
                                      pfam_hits, tigr_hits)
        bac120_copy_numbers.add_genome(genome_id, aa_gene_path,
                                       pfam_hits, tigr_hits)

        # Record the best translation table for this genome.
        tln_summary.add_genome(genome_id,
                               genome_info.get("best_translation_table"))

    # Flush each summary to disk.
    ar122_copy_numbers.write()
    bac120_copy_numbers.write()
    tln_summary.write()

    # Symlink the summary files into the root of the output directory.
    summary_templates = (PATH_BAC120_MARKER_SUMMARY,
                         PATH_AR122_MARKER_SUMMARY,
                         PATH_TLN_TABLE_SUMMARY)
    for template in summary_templates:
        rel_path = template.format(prefix=prefix)
        symlink_f(rel_path, os.path.join(outdir, os.path.basename(rel_path)))
def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                    write_single_copy_genes):
    """Report statistics for identified marker genes.

    Builds and writes the translation table summary and the AR53 / BAC120
    copy number summaries, symlinks the ``PATH_FAILS`` file into ``outdir``
    and, optionally, writes one unaligned FASTA file per marker containing
    the single-copy hit of every genome that has one.

    Parameters
    ----------
    gene_dict : dict
        Maps a genome id to a mapping that is queried for the keys
        "aa_gene_path" and "best_translation_table".
    outdir : str
        The output directory.
    prefix : str
        The prefix substituted into the output path templates.
    write_single_copy_genes : bool
        If true, write the per-marker single-copy gene FASTA files.
    """
    # Summarise the copy number of each AR53 and BAC120 markers.
    tln_summary_file = TlnTableSummaryFile(outdir, prefix)
    ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
    bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

    # The marker gene directory is the same for every genome.
    cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

    # Process each genome.
    for db_genome_id, info in tqdm_log(sorted(gene_dict.items()),
                                       unit='genome'):
        pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
        tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
        pfam_tophit_file.read()
        tigr_tophit_file.read()

        # Summarise each of the markers for this genome.
        aa_gene_path = info.get("aa_gene_path")
        ar53_copy_number_file.add_genome(db_genome_id, aa_gene_path,
                                         pfam_tophit_file, tigr_tophit_file)
        bac120_copy_number_file.add_genome(db_genome_id, aa_gene_path,
                                           pfam_tophit_file, tigr_tophit_file)

        # Record the best translation table for this genome.
        tln_summary_file.add_genome(db_genome_id,
                                    info.get("best_translation_table"))

    # Write each of the summary files to disk.
    ar53_copy_number_file.write()
    bac120_copy_number_file.write()
    tln_summary_file.write()

    # Only the PATH_FAILS file is symlinked into the root of the output
    # directory; the copy number and translation table summaries are not.
    fails_rel_path = PATH_FAILS.format(prefix=prefix)
    symlink_f(fails_rel_path,
              os.path.join(outdir, os.path.basename(fails_rel_path)))

    if not write_single_copy_genes:
        return

    # Write the single copy AR53/BAC120 FASTA files to disk.
    fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
    self.logger.info(f'Writing unaligned single-copy genes to: {fasta_dir}')

    # (marker HMM names, copy number file, sub-directory name) per domain.
    marker_doms = [
        (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
         ar53_copy_number_file, 'ar53'),
        (Config.BAC120_MARKERS['PFAM'] + Config.BAC120_MARKERS['TIGRFAM'],
         bac120_copy_number_file, 'bac120'),
    ]

    hmm_extensions = ('.hmm', '.HMM')
    for marker_names, marker_file, marker_d in marker_doms:

        # Create the domain-specific subdirectory.
        fasta_d_dir = os.path.join(fasta_dir, marker_d)
        make_sure_path_exists(fasta_d_dir)

        # Iterate over each marker.
        for marker_name in marker_names:
            # Remove only a literal trailing '.hmm' / '.HMM'. str.rstrip()
            # takes a *set of characters*, so the previous
            # rstrip(r'\.[HMMhmm]') would also eat trailing 'h', 'm', 'H',
            # 'M' or '.' characters belonging to the marker name itself
            # (e.g. 'Ribosom.hmm' -> 'Riboso').
            if marker_name.endswith(hmm_extensions):
                marker_name = marker_name[:-len('.hmm')]
            marker_path = os.path.join(fasta_d_dir, f'{marker_name}.fa')

            # Collect a FASTA record for every genome with a single-copy hit.
            to_write = []
            for genome_id in sorted(gene_dict):
                unq_hits = marker_file.get_single_copy_hits(genome_id)
                if marker_name in unq_hits:
                    to_write.append(f'>{genome_id}')
                    to_write.append(unq_hits[marker_name]['seq'])

            # No file is created for markers without any single-copy hit.
            if to_write:
                with open(marker_path, 'w') as fh:
                    fh.write('\n'.join(to_write))
def align(self, identify_dir, skip_gtdb_refs, taxa_filter, min_perc_aa,
          custom_msa_filters, skip_trimming, rnd_seed, cols_per_gene,
          min_consensus, max_consensus, min_per_taxa, out_dir, prefix,
          outgroup_taxon, genomes_to_process=None):
    """Align marker genes in genomes.

    For each domain (bacterial / archaeal) with at least one genome, the user
    genomes are aligned with ``HmmAligner``, the alignment is optionally
    merged with the GTDB reference MSA, columns are trimmed or masked, and
    the filtered-genome list and resulting MSAs are written beneath
    ``out_dir`` and symlinked into it.

    Parameters
    ----------
    identify_dir : str
        Directory containing the output of the identify step.
    skip_gtdb_refs : bool
        If true, the GTDB reference MSA is not loaded or written.
    taxa_filter
        Passed to ``self._msa_filter_by_taxa`` when loading the GTDB MSA.
    min_perc_aa : float
        Minimum percentage (0-100) of amino acids per genome in the MSA.
    custom_msa_filters : bool
        If true (and ``skip_trimming`` is false), columns are selected with
        ``TrimMSA`` instead of the canonical mask.
    skip_trimming : bool
        If true, no column filtering is performed.
    rnd_seed, cols_per_gene, min_consensus, max_consensus, min_per_taxa
        Passed to ``TrimMSA``; the consensus / per-taxa values are given as
        percentages and converted to fractions here.
    out_dir : str
        The output directory.
    prefix : str
        The prefix substituted into the output path templates.
    outgroup_taxon
        Passed to ``self._msa_filter_by_taxa`` when loading the GTDB MSA.
    genomes_to_process : dict, optional
        If given, its size must match the number of genomes found in the
        identify data.

    Raises
    ------
    InconsistentGenomeBatch
        If ``genomes_to_process`` is given and its size differs from the
        number of genomes found in the identify directory.
    GenomeMarkerSetUnknown
        If a marker set other than 'bac120' or 'ar122' is encountered.
    """

    def _msa_width(msa):
        # Length of the first sequence, or 0 when the alignment is empty.
        return len(next(iter(msa.values()), ''))

    def _link_into_out_dir(rel_path):
        # Symlink a file (path relative to out_dir) into the root of out_dir.
        symlink_f(rel_path, os.path.join(out_dir, os.path.basename(rel_path)))

    # Copy the identify summaries when aligning into a different directory.
    if identify_dir != out_dir:
        identify_out_dir = os.path.join(out_dir, DIR_IDENTIFY)
        if not os.path.isdir(identify_out_dir):
            os.makedirs(identify_out_dir)

        copy(os.path.join(identify_dir,
                          PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)),
             identify_out_dir)
        copy(os.path.join(identify_dir,
                          PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)),
             identify_out_dir)
        identify_gene_file = os.path.join(
            identify_dir, PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))
        copy(identify_gene_file, identify_out_dir)

    if not os.path.exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE)):
        os.makedirs(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

    # write out files with marker information
    bac120_marker_info_file = os.path.join(
        out_dir, PATH_BAC120_MARKER_INFO.format(prefix=prefix))
    self._write_marker_info(Config.BAC120_MARKERS, bac120_marker_info_file)
    ar122_marker_info_file = os.path.join(
        out_dir, PATH_AR122_MARKER_INFO.format(prefix=prefix))
    self._write_marker_info(Config.AR122_MARKERS, ar122_marker_info_file)

    genomic_files = self._path_to_identify_data(identify_dir,
                                                identify_dir != out_dir)
    if (genomes_to_process is not None
            and len(genomic_files) != len(genomes_to_process)):
        self.logger.error(
            '{} are not present in the input list of genome to process.'.format(
                list(set(genomic_files.keys())
                     - set(genomes_to_process.keys()))))
        raise InconsistentGenomeBatch(
            'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
            'genomes not present in your initial identify directory. Remove them, or run '
            'GTDB-Tk on a new directory.')

    self.logger.info('Aligning markers in %d genomes with %d threads.'
                     % (len(genomic_files), self.cpus))

    # determine marker set for each user genome
    bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(identify_dir, prefix)

    # align user genomes
    gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
    domains = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120, "bac120"),
               (ar_gids, Config.CONCAT_AR122, Config.MASK_AR122, "ar122"))
    for gids, msa_file, mask_file, marker_set_id in domains:

        domain_str = 'bacterial' if marker_set_id == 'bac120' else 'archaeal'

        if len(gids) == 0:
            continue

        self.logger.info('Processing {:,} genomes identified as {}.'.format(
            len(gids), domain_str))

        # Resolve the per-domain files once; the relative paths are reused
        # as symlink targets at the end of the iteration.
        if marker_set_id == 'bac120':
            marker_info_file = bac120_marker_info_file
            rel_filtered_genomes = PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix)
            rel_msa = PATH_BAC120_MSA.format(prefix=prefix)
            rel_user_msa = PATH_BAC120_USER_MSA.format(prefix=prefix)
        elif marker_set_id == 'ar122':
            marker_info_file = ar122_marker_info_file
            rel_filtered_genomes = PATH_AR122_FILTERED_GENOMES.format(prefix=prefix)
            rel_msa = PATH_AR122_MSA.format(prefix=prefix)
            rel_user_msa = PATH_AR122_USER_MSA.format(prefix=prefix)
        else:
            self.logger.error('There was an error determining the marker set.')
            raise GenomeMarkerSetUnknown
        marker_filtered_genomes = os.path.join(out_dir, rel_filtered_genomes)
        marker_msa_path = os.path.join(out_dir, rel_msa)
        marker_user_msa_path = os.path.join(out_dir, rel_user_msa)

        cur_genome_files = {
            gid: f for gid, f in genomic_files.items() if gid in gids
        }

        if skip_gtdb_refs:
            gtdb_msa = {}
        else:
            gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                taxa_filter, outgroup_taxon)
        gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

        hmm_aligner = HmmAligner(self.cpus,
                                 self.pfam_top_hit_suffix,
                                 self.tigrfam_top_hit_suffix,
                                 self.protein_file_suffix,
                                 self.pfam_hmm_dir,
                                 self.tigrfam_hmms,
                                 Config.BAC120_MARKERS,
                                 Config.AR122_MARKERS)
        user_msa = hmm_aligner.align_marker_set(cur_genome_files,
                                                marker_set_id)

        # Write the individual marker alignments to disk
        if self.debug:
            self._write_individual_markers(user_msa, marker_set_id,
                                           marker_info_file, out_dir, prefix)

        # filter columns without sufficient representation across taxa
        if skip_trimming:
            self.logger.info(
                'Skipping custom filtering and selection of columns.')
            pruned_seqs = {}
            trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

        elif custom_msa_filters:
            aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
            self.logger.info(
                'Performing custom filtering and selection of columns.')

            trim_msa = TrimMSA(cols_per_gene,
                               min_perc_aa / 100.0,
                               min_consensus / 100.0,
                               max_consensus / 100.0,
                               min_per_taxa / 100.0,
                               rnd_seed,
                               os.path.join(out_dir,
                                            'filter_%s' % marker_set_id))

            trimmed_seqs, pruned_seqs = trim_msa.trim(aligned_genomes,
                                                      marker_info_file)

            if trimmed_seqs:
                self.logger.info('Filtered MSA from {:,} to {:,} AAs.'.format(
                    _msa_width(aligned_genomes), _msa_width(trimmed_seqs)))

            self.logger.info(
                'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                .format(len(pruned_seqs), min_perc_aa))

            filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
            if len(filtered_user_genomes):
                # '{:,}' (thousands separator); the previous '{:.}' spec is
                # invalid and raised ValueError whenever this line was hit.
                self.logger.info(
                    'Filtered genomes include {:,} user submitted genomes.'
                    .format(len(filtered_user_genomes)))
        else:
            self.logger.info(
                f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
            )
            trimmed_seqs, pruned_seqs = self._apply_mask(
                gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
            # _msa_width() reports 0 instead of raising IndexError when an
            # alignment is empty (e.g. every genome was pruned).
            self.logger.info(
                'Masked {} alignment from {:,} to {:,} AAs.'.format(
                    domain_str, _msa_width(user_msa),
                    _msa_width(trimmed_seqs)))

            if min_perc_aa > 0:
                self.logger.info(
                    '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), domain_str, min_perc_aa))

        # write out filtering information
        with open(marker_filtered_genomes, 'w') as fout:
            for pruned_seq_id, pruned_seq in pruned_seqs.items():
                if len(pruned_seq) == 0:
                    perc_alignment = 0
                else:
                    valid_bases = sum(1 for c in pruned_seq if c.isalpha())
                    perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                fout.write(
                    f'{pruned_seq_id}\tInsufficient number of amino acids in MSA ({perc_alignment:.1f}%)\n'
                )

        # write out MSAs
        if not skip_gtdb_refs:
            self.logger.info(
                'Creating concatenated alignment for {:,} {} GTDB and user genomes.'
                .format(len(trimmed_seqs), domain_str))
            self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)

        trimmed_user_msa = {
            k: v for k, v in trimmed_seqs.items() if k in user_msa
        }
        if len(trimmed_user_msa) > 0:
            self.logger.info(
                'Creating concatenated alignment for {:,} {} user genomes.'
                .format(len(trimmed_user_msa), domain_str))
            self._write_msa(trimmed_user_msa, marker_user_msa_path,
                            gtdb_taxonomy)
        else:
            self.logger.info(
                f'All {domain_str} user genomes have been filtered out.')

        # Create symlinks to the summary files (only for files that were
        # written above).
        _link_into_out_dir(rel_filtered_genomes)
        if len(trimmed_user_msa) > 0:
            _link_into_out_dir(rel_user_msa)
        if not skip_gtdb_refs:
            _link_into_out_dir(rel_msa)
def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
    """Report statistics for identified marker genes.

    For every genome, the Pfam and TIGRFAM top hit files are parsed and each
    BAC120 / AR122 marker is classified as unique (hit by one gene), multiple
    (hit by more than one gene) or missing. One row per genome is written to
    the bacterial and archaeal marker summaries, and the best translation
    table is written to the translation table summary. The three summary
    files are then symlinked into ``outdir``.

    Parameters
    ----------
    gene_dict : dict
        Maps a genome id to a mapping that is queried for the keys
        "aa_gene_path" and "best_translation_table".
    outdir : str
        The output directory.
    prefix : str
        The prefix substituted into the output path templates.
    """

    def _marker_ids(marker_config):
        # Flatten a {db: [hmm file names]} mapping into marker ids without
        # the HMM extension, preserving order.
        ids = []
        for db_markers in marker_config.values():
            ids.extend(marker.replace(".HMM", "").replace(".hmm", "")
                       for marker in db_markers)
        return ids

    def _summary_row(genome_id, marker_ids, hits):
        # Classify each marker and format the tab-separated summary row.
        unique_genes, multi_hits, missing_genes = [], [], []
        for mid in marker_ids:
            if mid not in hits:
                missing_genes.append(mid)
            elif hits[mid]["multihit"]:
                multi_hits.append(mid)
            else:
                unique_genes.append(mid)
        return "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
            genome_id, len(unique_genes), len(multi_hits),
            len(missing_genes), ','.join(unique_genes),
            ','.join(multi_hits), ','.join(missing_genes))

    tln_rel_path = PATH_TLN_TABLE_SUMMARY.format(prefix=prefix)
    bac_rel_path = PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)
    arc_rel_path = PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)

    header = "Name\tnumber_unique_genes\tnumber_multiple_genes\tnumber_missing_genes\tlist_unique_genes\tlist_multiple_genes\tlist_missing_genes\n"

    # gather information for all marker genes
    marker_dbs = {
        "PFAM": PFAM_TOP_HIT_SUFFIX,
        "TIGR": TIGRFAM_TOP_HIT_SUFFIX
    }

    # Ordered lists drive the report order; sets give O(1) membership tests
    # while parsing the top hit files.
    marker_bac_list_original = _marker_ids(Config.BAC120_MARKERS)
    marker_arc_list_original = _marker_ids(Config.AR122_MARKERS)
    marker_bac_set = set(marker_bac_list_original)
    marker_arc_set = set(marker_arc_list_original)

    # Context managers guarantee the summary files are closed even if
    # parsing a genome raises.
    with open(os.path.join(outdir, tln_rel_path), "w") as translation_table_file, \
            open(os.path.join(outdir, bac_rel_path), "w") as bac_outfile, \
            open(os.path.join(outdir, arc_rel_path), "w") as arc_outfile:
        bac_outfile.write(header)
        arc_outfile.write(header)

        for db_genome_id, info in gene_dict.items():
            gene_bac_dict, gene_arc_dict = {}, {}
            marker_hits = ((marker_bac_set, gene_bac_dict),
                           (marker_arc_set, gene_arc_dict))

            # we load the list of all the genes detected in the genome
            # (once per genome; the path does not depend on the marker db).
            protein_file = str(info.get("aa_gene_path"))
            all_genes_dict = read_fasta(protein_file, False)

            # Prodigal adds an asterisks at the end of each called genes.
            # These asterisks sometimes appear in the MSA, which can be
            # an issue for some downstream software
            # NOTE(review): the cleaned sequences are not used further in
            # this method; the load is kept so that any error read_fasta
            # raises for this file still surfaces here - confirm whether
            # it can be dropped.
            for seq_id, seq in all_genes_dict.items():
                # endswith() is safe for empty sequences, unlike seq[-1].
                if seq.endswith('*'):
                    all_genes_dict[seq_id] = seq[:-1]

            for marker_suffix in marker_dbs.values():
                tophit_path = os.path.join(
                    outdir, DIR_MARKER_GENE, db_genome_id,
                    '{}{}'.format(db_genome_id, marker_suffix))

                # we store the tophit file line by line and store the
                # information in a dictionary
                with open(tophit_path) as tp:
                    # first line is header line
                    tp.readline()

                    for line_tp in tp:
                        linelist = line_tp.split("\t")
                        genename = linelist[0]
                        # Second column: ';'-separated hits, each a
                        # ','-separated record starting with the marker id.
                        for each_mark in linelist[1].split(";"):
                            markerid = each_mark.split(",")[0]
                            for marker_set, hits in marker_hits:
                                if markerid not in marker_set:
                                    continue
                                if markerid in hits:
                                    hits[markerid]["multihit"] = True
                                else:
                                    hits[markerid] = {
                                        "gene": genename,
                                        "multihit": False
                                    }

            bac_outfile.write(_summary_row(db_genome_id,
                                           marker_bac_list_original,
                                           gene_bac_dict))
            arc_outfile.write(_summary_row(db_genome_id,
                                           marker_arc_list_original,
                                           gene_arc_dict))
            translation_table_file.write('{}\t{}\n'.format(
                db_genome_id, info.get("best_translation_table")))

    # Create a symlink to store the summary files in the root.
    for rel_path in (bac_rel_path, arc_rel_path, tln_rel_path):
        symlink_f(rel_path, os.path.join(outdir, os.path.basename(rel_path)))
def align(self, identify_dir, skip_gtdb_refs, taxa_filter, min_perc_aa,
          custom_msa_filters, skip_trimming, rnd_seed, cols_per_gene,
          min_consensus, max_consensus, min_per_taxa, out_dir, prefix,
          outgroup_taxon, genomes_to_process=None):
    """Align marker genes in genomes.

    For each domain (bacterial / archaeal) with at least one genome, the user
    genomes are aligned with ``align.align_marker_set``, the alignment is
    optionally merged with the GTDB reference MSA, columns are trimmed or
    masked, and the filtered-genome list and resulting MSAs are written
    beneath ``out_dir`` and symlinked into it.

    Parameters
    ----------
    identify_dir : str
        Directory containing the output of the identify step.
    skip_gtdb_refs : bool
        If true, the GTDB reference MSA is not loaded or written.
    taxa_filter
        Passed to ``self._msa_filter_by_taxa`` when loading the GTDB MSA.
    min_perc_aa : float
        Minimum percentage (0-100) of amino acids per genome in the MSA.
    custom_msa_filters : bool
        If true (and ``skip_trimming`` is false), columns are selected with
        ``TrimMSA`` instead of the canonical mask.
    skip_trimming : bool
        If true, no column filtering is performed.
    rnd_seed, cols_per_gene, min_consensus, max_consensus, min_per_taxa
        Passed to ``TrimMSA``; the consensus / per-taxa values are given as
        percentages and converted to fractions here.
    out_dir : str
        The output directory.
    prefix : str
        The prefix substituted into the output path templates.
    outgroup_taxon
        Passed to ``self._msa_filter_by_taxa`` when loading the GTDB MSA.
    genomes_to_process : dict, optional
        If given, its size must match the number of genomes found in the
        identify data.

    Raises
    ------
    InconsistentGenomeBatch
        If ``genomes_to_process`` is given and its size differs from the
        number of genomes found in the identify directory.
    GenomeMarkerSetUnknown
        If a marker set other than 'bac120' or 'ar122' is encountered.
    """

    def _msa_width(msa):
        # Length of the first sequence, or 0 when the alignment is empty.
        return len(next(iter(msa.values()), ''))

    def _link_into_out_dir(rel_path):
        # Symlink a file (path relative to out_dir) into the root of out_dir.
        symlink_f(rel_path, os.path.join(out_dir, os.path.basename(rel_path)))

    # If the user is re-running this step, check if the identify step is consistent.
    genomic_files = self._path_to_identify_data(identify_dir,
                                                identify_dir != out_dir)
    if (genomes_to_process is not None
            and len(genomic_files) != len(genomes_to_process)):
        self.logger.error(
            '{} are not present in the input list of genome to process.'.format(
                list(set(genomic_files.keys())
                     - set(genomes_to_process.keys()))))
        raise InconsistentGenomeBatch(
            'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
            'genomes not present in your initial identify directory. Remove them, or run '
            'GTDB-Tk on a new directory.')

    # If this is being run as a part of classify_wf, copy the required files.
    if identify_dir != out_dir:
        identify_path = os.path.join(out_dir, DIR_IDENTIFY)
        make_sure_path_exists(identify_path)
        copy(CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
        copy(CopyNumberFileAR122(identify_dir, prefix).path, identify_path)
        copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)

    # Create the align intermediate directory.
    make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

    # Write out files with marker information
    ar122_marker_info_file = MarkerInfoFileAR122(out_dir, prefix)
    ar122_marker_info_file.write()
    bac120_marker_info_file = MarkerInfoFileBAC120(out_dir, prefix)
    bac120_marker_info_file.write()

    # Determine what domain each genome belongs to.
    bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(identify_dir, prefix)

    self.logger.info(
        f'Aligning markers in {len(genomic_files):,} genomes with {self.cpus} CPUs.'
    )
    dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120,
                 "bac120", 'bacterial', CopyNumberFileBAC120),
                (ar_gids, Config.CONCAT_AR122, Config.MASK_AR122,
                 "ar122", 'archaeal', CopyNumberFileAR122))
    gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
    for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

        # No genomes identified as this domain.
        if len(gids) == 0:
            continue

        self.logger.info(
            f'Processing {len(gids):,} genomes identified as {domain_str}.')

        # Resolve the per-domain files once; the relative paths are reused
        # as symlink targets at the end of the iteration.
        if marker_set_id == 'bac120':
            marker_info_file = bac120_marker_info_file
            rel_filtered_genomes = PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix)
            rel_msa = PATH_BAC120_MSA.format(prefix=prefix)
            rel_user_msa = PATH_BAC120_USER_MSA.format(prefix=prefix)
        elif marker_set_id == 'ar122':
            marker_info_file = ar122_marker_info_file
            rel_filtered_genomes = PATH_AR122_FILTERED_GENOMES.format(prefix=prefix)
            rel_msa = PATH_AR122_MSA.format(prefix=prefix)
            rel_user_msa = PATH_AR122_USER_MSA.format(prefix=prefix)
        else:
            raise GenomeMarkerSetUnknown(
                'There was an error determining the marker set.')
        marker_filtered_genomes = os.path.join(out_dir, rel_filtered_genomes)
        marker_msa_path = os.path.join(out_dir, rel_msa)
        marker_user_msa_path = os.path.join(out_dir, rel_user_msa)

        cur_genome_files = {
            gid: f for gid, f in genomic_files.items() if gid in gids
        }

        if skip_gtdb_refs:
            gtdb_msa = {}
        else:
            gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                taxa_filter, outgroup_taxon)
        gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

        # Generate the user MSA.
        user_msa = align.align_marker_set(cur_genome_files, marker_info_file,
                                          copy_number_f, self.cpus)

        # Write the individual marker alignments to disk
        if self.debug:
            self._write_individual_markers(user_msa, marker_set_id,
                                           marker_info_file.path, out_dir,
                                           prefix)

        # filter columns without sufficient representation across taxa
        if skip_trimming:
            self.logger.info(
                'Skipping custom filtering and selection of columns.')
            pruned_seqs = {}
            trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

        elif custom_msa_filters:
            aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
            self.logger.info(
                'Performing custom filtering and selection of columns.')

            trim_msa = TrimMSA(cols_per_gene,
                               min_perc_aa / 100.0,
                               min_consensus / 100.0,
                               max_consensus / 100.0,
                               min_per_taxa / 100.0,
                               rnd_seed,
                               os.path.join(out_dir,
                                            f'filter_{marker_set_id}'))

            trimmed_seqs, pruned_seqs = trim_msa.trim(aligned_genomes,
                                                      marker_info_file.path)

            if trimmed_seqs:
                self.logger.info('Filtered MSA from {:,} to {:,} AAs.'.format(
                    _msa_width(aligned_genomes), _msa_width(trimmed_seqs)))

            self.logger.info(
                'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                .format(len(pruned_seqs), min_perc_aa))

            filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
            if len(filtered_user_genomes):
                # '{:,}' (thousands separator); the previous '{:.}' spec is
                # invalid and raised ValueError whenever this line was hit.
                self.logger.info(
                    'Filtered genomes include {:,} user submitted genomes.'
                    .format(len(filtered_user_genomes)))
        else:
            self.logger.log(
                Config.LOG_TASK,
                f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
            )
            trimmed_seqs, pruned_seqs = self._apply_mask(
                gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
            # _msa_width() reports 0 instead of raising IndexError when an
            # alignment is empty (e.g. every genome was pruned).
            self.logger.info(
                'Masked {} alignment from {:,} to {:,} AAs.'.format(
                    domain_str, _msa_width(user_msa),
                    _msa_width(trimmed_seqs)))

            if min_perc_aa > 0:
                self.logger.info(
                    '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), domain_str, min_perc_aa))

        # write out filtering information
        with open(marker_filtered_genomes, 'w') as fout:
            for pruned_seq_id, pruned_seq in pruned_seqs.items():
                if len(pruned_seq) == 0:
                    perc_alignment = 0
                else:
                    valid_bases = sum(1 for c in pruned_seq if c.isalpha())
                    perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                fout.write(
                    f'{pruned_seq_id}\tInsufficient number of amino acids in MSA ({perc_alignment:.1f}%)\n'
                )

        # write out MSAs
        if not skip_gtdb_refs:
            self.logger.info(
                f'Creating concatenated alignment for {len(trimmed_seqs):,} '
                f'{domain_str} GTDB and user genomes.')
            self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)

        trimmed_user_msa = {
            k: v for k, v in trimmed_seqs.items() if k in user_msa
        }
        if len(trimmed_user_msa) > 0:
            self.logger.info(
                f'Creating concatenated alignment for {len(trimmed_user_msa):,} '
                f'{domain_str} user genomes.')
            self._write_msa(trimmed_user_msa, marker_user_msa_path,
                            gtdb_taxonomy)
        else:
            self.logger.info(
                f'All {domain_str} user genomes have been filtered out.')

        # Create symlinks to the summary files (only for files that were
        # written above).
        _link_into_out_dir(rel_filtered_genomes)
        if len(trimmed_user_msa) > 0:
            _link_into_out_dir(rel_user_msa)
        if not skip_gtdb_refs:
            _link_into_out_dir(rel_msa)