Beispiel #1
0
    def infer(self, options):
        """Infer a tree from a user specified MSA.

        Runs FastTree on ``options.msa_file`` and writes the unrooted tree
        and log files below ``options.out_dir``. When invoked through the
        ``infer`` sub-command, a symlink to the tree is also created in the
        root of the output directory.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user. The attributes read here are
            ``msa_file``, ``out_dir``, ``cpus``, ``prefix``, ``prot_model``,
            ``no_support``, ``no_gamma`` and, optionally, ``suffix`` and
            ``subparser_name``.
        """

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # The multi-threaded binary is only required when several CPUs are
        # requested.
        if options.cpus > 1:
            check_dependencies(['FastTreeMP'])
        else:
            check_dependencies(['FastTree'])

        # A marker specific naming scheme is used when a suffix is supplied.
        if hasattr(options, 'suffix'):
            output_tree = os.path.join(
                options.out_dir,
                PATH_MARKER_UNROOTED_TREE.format(prefix=options.prefix,
                                                 marker=options.suffix))
            tree_log = os.path.join(
                options.out_dir,
                PATH_MARKER_TREE_LOG.format(prefix=options.prefix,
                                            marker=options.suffix))
            fasttree_log = os.path.join(
                options.out_dir,
                PATH_MARKER_FASTTREE_LOG.format(prefix=options.prefix,
                                                marker=options.suffix))
        else:
            output_tree = os.path.join(
                options.out_dir,
                PATH_UNROOTED_TREE.format(prefix=options.prefix))
            tree_log = os.path.join(
                options.out_dir, PATH_TREE_LOG.format(prefix=options.prefix))
            fasttree_log = os.path.join(
                options.out_dir,
                PATH_FASTTREE_LOG.format(prefix=options.prefix))

        fasttree = FastTree()
        fasttree.run(output_tree, tree_log, fasttree_log, options.prot_model,
                     options.no_support, options.no_gamma, options.msa_file,
                     options.cpus)
        self.logger.info(f'FastTree version: {fasttree.version}')

        if hasattr(options,
                   'subparser_name') and options.subparser_name == 'infer':
            # The link target is expressed relative to the output directory.
            # relpath is used instead of slicing off len(out_dir) + 1
            # characters, which yields a wrong target when out_dir carries a
            # trailing path separator.
            symlink_f(
                os.path.relpath(output_tree, options.out_dir),
                os.path.join(options.out_dir, os.path.basename(output_tree)))

        self.logger.info('Done.')
Beispiel #2
0
    def root(self, options):
        """Root tree using outgroup.

        Genomes whose taxonomy contains ``options.outgroup_taxon`` form the
        outgroup used to re-root ``options.input_tree``; the result is
        written to ``options.output_tree``.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.

        Raises
        ------
        GenomeMarkerSetUnknown
            If ``options.suffix`` is present but is neither ``bac120`` nor
            ``ar122``.
        """
        self.logger.warning("Tree rooting is still under development!")

        check_file_exists(options.input_tree)

        if options.custom_taxonomy_file:
            check_file_exists(options.custom_taxonomy_file)
            taxonomy = Taxonomy().read(options.custom_taxonomy_file)
        else:
            taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        self.logger.info('Identifying genomes from the specified outgroup.')
        outgroup = {
            genome_id
            for genome_id, taxa in taxonomy.items()
            if options.outgroup_taxon in taxa
        }

        reroot = RerootTree()
        reroot.root_with_outgroup(options.input_tree, options.output_tree,
                                  outgroup)

        # Symlink to the rooted tree file, if not run independently
        if hasattr(options, 'suffix'):
            if options.suffix == 'bac120':
                rooted_tree = PATH_BAC120_ROOTED_TREE.format(
                    prefix=options.prefix)
            elif options.suffix == 'ar122':
                rooted_tree = PATH_AR122_ROOTED_TREE.format(
                    prefix=options.prefix)
            else:
                raise GenomeMarkerSetUnknown(
                    'There was an error determining the marker set.')

            # The link is named after the tree it points to. Previously the
            # bac120 link was named after the ar122 tree.
            symlink_f(
                rooted_tree,
                os.path.join(options.out_dir, os.path.basename(rooted_tree)))

        self.logger.info('Done.')
Beispiel #3
0
    def decorate(self, options):
        """Decorate tree with GTDB taxonomy.

        The decorated tree is written to ``options.output_tree``. When a
        marker set suffix is present on ``options``, the decorated tree and
        its companion ``-table`` file are also symlinked into the root of
        the output directory.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.

        Raises
        ------
        GenomeMarkerSetUnknown
            If ``options.suffix`` is present but is neither ``bac120`` nor
            ``ar122``.
        """

        check_file_exists(options.input_tree)

        taxonomy = self._read_taxonomy_files(options)

        decorator = Decorate()
        decorator.run(options.input_tree, taxonomy, options.output_tree)

        self.logger.info('Done.')

        # Nothing to link when run independently (no marker set suffix).
        if not hasattr(options, 'suffix'):
            return

        # Pick the path template that matches the marker set.
        if options.suffix == 'bac120':
            tree_template = PATH_BAC120_DECORATED_TREE
        elif options.suffix == 'ar122':
            tree_template = PATH_AR122_DECORATED_TREE
        else:
            raise GenomeMarkerSetUnknown(
                'There was an error determining the marker set.')

        # Link the decorated tree first, then its '-table' companion.
        decorated_tree = tree_template.format(prefix=options.prefix)
        for link_target in (decorated_tree, decorated_tree + '-table'):
            link_path = os.path.join(options.out_dir,
                                     os.path.basename(link_target))
            symlink_f(link_target, link_path)
Beispiel #4
0
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
        """Write the marker gene summary files and link them into `outdir`.

        For every genome in `gene_dict` the Pfam and TIGRFAM top hit files
        are read and added to the AR122 and BAC120 copy number summaries,
        and the genome's best translation table is recorded. The three
        summary files are then written and symlinked into the root of
        `outdir`.
        """

        # Summary files covering translation tables and marker copy numbers.
        tln_table = TlnTableSummaryFile(outdir, prefix)
        ar122_counts = CopyNumberFileAR122(outdir, prefix)
        bac120_counts = CopyNumberFileBAC120(outdir, prefix)

        # Genomes are visited in sorted order.
        for genome_id, genome_info in sorted(gene_dict.items()):
            marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
            pfam_hits = TopHitPfamFile(marker_dir, genome_id)
            tigr_hits = TopHitTigrFile(marker_dir, genome_id)
            pfam_hits.read()
            tigr_hits.read()

            # Register the genome with both marker sets (AR122, then BAC120).
            for copy_number_file in (ar122_counts, bac120_counts):
                copy_number_file.add_genome(genome_id,
                                            genome_info.get("aa_gene_path"),
                                            pfam_hits, tigr_hits)

            # Record the best translation table of this genome.
            tln_table.add_genome(genome_id,
                                 genome_info.get("best_translation_table"))

        # Flush every summary file to disk.
        for summary_file in (ar122_counts, bac120_counts, tln_table):
            summary_file.write()

        # Expose the summary files in the root of the output directory.
        for path_template in (PATH_BAC120_MARKER_SUMMARY,
                              PATH_AR122_MARKER_SUMMARY,
                              PATH_TLN_TABLE_SUMMARY):
            link_target = path_template.format(prefix=prefix)
            symlink_f(link_target,
                      os.path.join(outdir, os.path.basename(link_target)))
Beispiel #5
0
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                        write_single_copy_genes):
        """Report statistics for identified marker genes.

        Writes the AR53 and BAC120 copy number summaries and the translation
        table summary for every genome in `gene_dict`, symlinks the
        `PATH_FAILS` file into the root of `outdir` and, if requested, writes
        one unaligned FASTA file per marker holding the single-copy hits.

        Parameters
        ----------
        gene_dict : dict
            Maps each genome id to a dict from which ``aa_gene_path`` and
            ``best_translation_table`` are read.
        outdir : str
            The output directory.
        prefix : str
            Prefix used to format the output file names.
        write_single_copy_genes : bool
            If true, write the single copy AR53/BAC120 FASTA files.
        """

        # Summarise the copy number of each AR53 and BAC120 markers.
        tln_summary_file = TlnTableSummaryFile(outdir, prefix)
        ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
        bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

        # Process each genome.
        for db_genome_id, info in tqdm_log(sorted(gene_dict.items()),
                                           unit='genome'):
            cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
            pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
            tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
            pfam_tophit_file.read()
            tigr_tophit_file.read()

            # Summarise each of the markers for this genome.
            ar53_copy_number_file.add_genome(db_genome_id,
                                             info.get("aa_gene_path"),
                                             pfam_tophit_file,
                                             tigr_tophit_file)
            bac120_copy_number_file.add_genome(db_genome_id,
                                               info.get("aa_gene_path"),
                                               pfam_tophit_file,
                                               tigr_tophit_file)

            # Write the best translation table to disk for this genome.
            tln_summary_file.add_genome(db_genome_id,
                                        info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar53_copy_number_file.write()
        bac120_copy_number_file.write()
        tln_summary_file.write()

        # Symlink the PATH_FAILS file into the root of the output directory.
        symlink_f(
            PATH_FAILS.format(prefix=prefix),
            os.path.join(outdir,
                         os.path.basename(PATH_FAILS.format(prefix=prefix))))

        # Write the single copy AR53/BAC120 FASTA files to disk.
        if write_single_copy_genes:
            fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
            self.logger.info(
                f'Writing unaligned single-copy genes to: {fasta_dir}')

            # Iterate over each domain.
            marker_doms = [
                (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
                 ar53_copy_number_file, 'ar53'),
                (Config.BAC120_MARKERS['PFAM'] +
                 Config.BAC120_MARKERS['TIGRFAM'],
                 bac120_copy_number_file, 'bac120'),
            ]
            hmm_suffix = '.hmm'
            for marker_names, marker_file, marker_d in marker_doms:

                # Create the domain-specific subdirectory.
                fasta_d_dir = os.path.join(fasta_dir, marker_d)
                make_sure_path_exists(fasta_d_dir)

                # Iterate over each marker.
                for marker_name in marker_names:
                    # Drop a trailing '.hmm' / '.HMM' extension. str.rstrip
                    # is deliberately not used: it strips a *set of
                    # characters* rather than a suffix and could remove
                    # legitimate trailing characters of the marker name.
                    if marker_name.lower().endswith(hmm_suffix):
                        marker_name = marker_name[:-len(hmm_suffix)]
                    marker_path = os.path.join(fasta_d_dir,
                                               f'{marker_name}.fa')

                    # Collect a FASTA record for every genome with a
                    # single-copy hit to this marker.
                    to_write = []
                    for genome_id in sorted(gene_dict):
                        unq_hits = marker_file.get_single_copy_hits(genome_id)
                        if marker_name in unq_hits:
                            to_write.append(f'>{genome_id}')
                            to_write.append(unq_hits[marker_name]['seq'])

                    # No file is created for a marker without any hit.
                    if to_write:
                        with open(marker_path, 'w') as fh:
                            fh.write('\n'.join(to_write))
Beispiel #6
0
    def align(self,
              identify_dir,
              skip_gtdb_refs,
              taxa_filter,
              min_perc_aa,
              custom_msa_filters,
              skip_trimming,
              rnd_seed,
              cols_per_gene,
              min_consensus,
              max_consensus,
              min_per_taxa,
              out_dir,
              prefix,
              outgroup_taxon,
              genomes_to_process=None):
        """Align marker genes in genomes.

        For each domain (bac120, ar122) with at least one user genome, the
        user genomes are aligned with `HmmAligner`, optionally merged with
        the GTDB reference MSA, trimmed or masked, and written to `out_dir`
        together with a list of the genomes that were filtered out.

        Parameters
        ----------
        identify_dir : str
            Directory holding the output of the identify step.
        skip_gtdb_refs : bool
            If true, no GTDB reference sequences are loaded and the combined
            GTDB + user MSA is not written.
        taxa_filter
            Passed to `_msa_filter_by_taxa` when loading the reference MSA.
        min_perc_aa : float
            Percentage (it is divided by 100 before use) used to filter
            genomes by their amino acid content in the MSA.
        custom_msa_filters : bool
            If true (and `skip_trimming` is false), columns are selected
            with `TrimMSA` instead of the canonical mask.
        skip_trimming : bool
            If true, no column filtering is performed at all.
        rnd_seed
            Passed to `TrimMSA`.
        cols_per_gene
            Passed to `TrimMSA`.
        min_consensus, max_consensus, min_per_taxa : float
            Percentages, divided by 100 and passed to `TrimMSA`.
        out_dir : str
            The output directory.
        prefix : str
            Prefix used to format the output file names.
        outgroup_taxon
            Passed to `_msa_filter_by_taxa` when loading the reference MSA.
        genomes_to_process : dict, optional
            The genomes expected to be present in the identify directory;
            only its size and keys are used here.

        Raises
        ------
        InconsistentGenomeBatch
            If the number of genomes found in the identify directory differs
            from the number of genomes in `genomes_to_process`.
        GenomeMarkerSetUnknown
            If the marker set is neither bac120 nor ar122.
        """

        # Bring the identify summaries into out_dir when it is a different
        # directory from the identify directory.
        if identify_dir != out_dir:
            if not os.path.isdir(os.path.join(out_dir, DIR_IDENTIFY)):
                os.makedirs(os.path.join(out_dir, DIR_IDENTIFY))

            copy(
                os.path.join(identify_dir,
                             PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)),
                os.path.join(out_dir, DIR_IDENTIFY))
            copy(
                os.path.join(identify_dir,
                             PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)),
                os.path.join(out_dir, DIR_IDENTIFY))

            identify_gene_file = os.path.join(
                identify_dir, PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))
            copy(identify_gene_file, os.path.join(out_dir, DIR_IDENTIFY))

        if not os.path.exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE)):
            os.makedirs(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

        # write out files with marker information
        bac120_marker_info_file = os.path.join(
            out_dir, PATH_BAC120_MARKER_INFO.format(prefix=prefix))
        self._write_marker_info(Config.BAC120_MARKERS, bac120_marker_info_file)
        ar122_marker_info_file = os.path.join(
            out_dir, PATH_AR122_MARKER_INFO.format(prefix=prefix))
        self._write_marker_info(Config.AR122_MARKERS, ar122_marker_info_file)

        # Check that the identify directory matches the requested batch.
        genomic_files = self._path_to_identify_data(identify_dir,
                                                    identify_dir != out_dir)
        if genomes_to_process is not None and len(genomic_files) != len(
                genomes_to_process):
            self.logger.error(
                '{} are not present in the input list of genome to process.'.
                format(
                    list(
                        set(genomic_files.keys()) -
                        set(genomes_to_process.keys()))))
            raise InconsistentGenomeBatch(
                'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                'genomes not present in your initial identify directory. Remove them, or run '
                'GTDB-Tk on a new directory.')

        self.logger.info('Aligning markers in %d genomes with %d threads.' %
                         (len(genomic_files), self.cpus))

        # determine marker set for each user genome
        bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
            identify_dir, prefix)

        # align user genomes
        gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
        for gids, msa_file, mask_file, marker_set_id in ((bac_gids,
                                                          Config.CONCAT_BAC120,
                                                          Config.MASK_BAC120,
                                                          "bac120"),
                                                         (ar_gids,
                                                          Config.CONCAT_AR122,
                                                          Config.MASK_AR122,
                                                          "ar122")):

            domain_str = 'archaeal'
            if marker_set_id == 'bac120':
                domain_str = 'bacterial'

            # Nothing to do for a domain without user genomes.
            if len(gids) == 0:
                continue

            self.logger.info(
                'Processing {:,} genomes identified as {}.'.format(
                    len(gids), domain_str))
            if marker_set_id == 'bac120':
                marker_info_file = bac120_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir,
                    PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_BAC120_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
            else:
                marker_info_file = ar122_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir, PATH_AR122_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_AR122_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_AR122_USER_MSA.format(prefix=prefix))

            # Restrict the identify data to the genomes of this domain.
            cur_genome_files = {
                gid: f
                for gid, f in genomic_files.items() if gid in gids
            }

            if skip_gtdb_refs:
                gtdb_msa = {}
            else:
                gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                    taxa_filter,
                                                    outgroup_taxon)
            gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

            hmm_aligner = HmmAligner(self.cpus, self.pfam_top_hit_suffix,
                                     self.tigrfam_top_hit_suffix,
                                     self.protein_file_suffix,
                                     self.pfam_hmm_dir, self.tigrfam_hmms,
                                     Config.BAC120_MARKERS,
                                     Config.AR122_MARKERS)
            user_msa = hmm_aligner.align_marker_set(cur_genome_files,
                                                    marker_set_id)

            # Write the individual marker alignments to disk
            if self.debug:
                self._write_individual_markers(user_msa, marker_set_id,
                                               marker_info_file, out_dir,
                                               prefix)

            # filter columns without sufficient representation across taxa
            if skip_trimming:
                self.logger.info(
                    'Skipping custom filtering and selection of columns.')
                pruned_seqs = {}
                trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

            elif custom_msa_filters:
                aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
                self.logger.info(
                    'Performing custom filtering and selection of columns.')

                trim_msa = TrimMSA(
                    cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                    max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                    os.path.join(out_dir, 'filter_%s' % marker_set_id))

                trimmed_seqs, pruned_seqs = trim_msa.trim(
                    aligned_genomes, marker_info_file)

                if trimmed_seqs:
                    self.logger.info(
                        'Filtered MSA from {:,} to {:,} AAs.'.format(
                            len(list(aligned_genomes.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                self.logger.info(
                    'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), min_perc_aa))

                filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
                if len(filtered_user_genomes):
                    # '{:,}' (thousands separator); the previous '{:.}' spec
                    # is invalid and raised ValueError when formatted.
                    self.logger.info(
                        'Filtered genomes include {:,} user submitted genomes.'
                        .format(len(filtered_user_genomes)))
            else:
                self.logger.info(
                    f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
                )
                trimmed_seqs, pruned_seqs = self._apply_mask(
                    gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
                # Only report alignment lengths when there is a sequence to
                # measure; indexing [0] on an empty collection would raise
                # IndexError (same guard as in the custom filtering branch).
                if trimmed_seqs and user_msa:
                    self.logger.info(
                        'Masked {} alignment from {:,} to {:,} AAs.'.format(
                            domain_str, len(list(user_msa.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                if min_perc_aa > 0:
                    self.logger.info(
                        '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                        .format(len(pruned_seqs), domain_str, min_perc_aa))

            # write out filtering information
            with open(marker_filtered_genomes, 'w') as fout:
                for pruned_seq_id, pruned_seq in pruned_seqs.items():
                    if len(pruned_seq) == 0:
                        perc_alignment = 0
                    else:
                        # Alphabetic characters are counted as amino acids.
                        valid_bases = sum(
                            [1 for c in pruned_seq if c.isalpha()])
                        perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                    fout.write(
                        '%s\t%s\n' %
                        (pruned_seq_id,
                         'Insufficient number of amino acids in MSA ({:.1f}%)'.
                         format(perc_alignment)))

            # write out MSAs
            if not skip_gtdb_refs:
                self.logger.info(
                    'Creating concatenated alignment for {:,} {} GTDB and user genomes.'
                    .format(len(trimmed_seqs), domain_str))
                self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)

            trimmed_user_msa = {
                k: v
                for k, v in trimmed_seqs.items() if k in user_msa
            }
            if len(trimmed_user_msa) > 0:
                self.logger.info(
                    'Creating concatenated alignment for {:,} {} user genomes.'
                    .format(len(trimmed_user_msa), domain_str))
                self._write_msa(trimmed_user_msa, marker_user_msa_path,
                                gtdb_taxonomy)
            else:
                self.logger.info(
                    f'All {domain_str} user genomes have been filtered out.')

            # Create symlinks to the summary files
            if marker_set_id == 'bac120':
                symlink_f(
                    PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix),
                    os.path.join(
                        out_dir,
                        os.path.basename(
                            PATH_BAC120_FILTERED_GENOMES.format(
                                prefix=prefix))))
                if len(trimmed_user_msa) > 0:
                    symlink_f(
                        PATH_BAC120_USER_MSA.format(prefix=prefix),
                        os.path.join(
                            out_dir,
                            os.path.basename(
                                PATH_BAC120_USER_MSA.format(prefix=prefix))))
                if not skip_gtdb_refs:
                    symlink_f(
                        PATH_BAC120_MSA.format(prefix=prefix),
                        os.path.join(
                            out_dir,
                            os.path.basename(
                                PATH_BAC120_MSA.format(prefix=prefix))))
            elif marker_set_id == 'ar122':
                symlink_f(
                    PATH_AR122_FILTERED_GENOMES.format(prefix=prefix),
                    os.path.join(
                        out_dir,
                        os.path.basename(
                            PATH_AR122_FILTERED_GENOMES.format(
                                prefix=prefix))))
                if len(trimmed_user_msa) > 0:
                    symlink_f(
                        PATH_AR122_USER_MSA.format(prefix=prefix),
                        os.path.join(
                            out_dir,
                            os.path.basename(
                                PATH_AR122_USER_MSA.format(prefix=prefix))))
                if not skip_gtdb_refs:
                    symlink_f(
                        PATH_AR122_MSA.format(prefix=prefix),
                        os.path.join(
                            out_dir,
                            os.path.basename(
                                PATH_AR122_MSA.format(prefix=prefix))))
            else:
                self.logger.error(
                    'There was an error determining the marker set.')
                raise GenomeMarkerSetUnknown
Beispiel #7
0
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
        """Report statistics for identified marker genes.

        For every genome in `gene_dict` the Pfam and TIGRFAM top hit files
        are parsed and each BAC120 / AR122 marker is classified as unique
        (hit by one gene), multiple (hit by several genes) or missing. One
        row per genome is written to the bacterial and archaeal marker
        summaries, the best translation table is written to the translation
        table summary, and all three files are symlinked into the root of
        `outdir`.

        Parameters
        ----------
        gene_dict : dict
            Maps each genome id to a dict from which ``aa_gene_path`` and
            ``best_translation_table`` are read.
        outdir : str
            The output directory.
        prefix : str
            Prefix used to format the output file names.
        """

        def record_hit(hits, markerid, genename):
            """Register a hit; a marker seen more than once is a multi-hit."""
            if markerid in hits:
                hits[markerid]["multihit"] = True
            else:
                hits[markerid] = {"gene": genename, "multihit": False}

        def summary_row(genome_id, marker_ids, hits):
            """Format the summary line of one genome for one marker set."""
            unique_genes, multi_hits, missing_genes = [], [], []
            for mid in marker_ids:
                if mid not in hits:
                    missing_genes.append(mid)
                elif hits[mid]["multihit"]:
                    multi_hits.append(mid)
                else:
                    unique_genes.append(mid)
            return "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                genome_id, len(unique_genes), len(multi_hits),
                len(missing_genes), ','.join(unique_genes),
                ','.join(multi_hits), ','.join(missing_genes))

        header = "Name\tnumber_unique_genes\tnumber_multiple_genes\tnumber_missing_genes\tlist_unique_genes\tlist_multiple_genes\tlist_missing_genes\n"

        # gather information for all marker genes
        marker_dbs = {
            "PFAM": PFAM_TOP_HIT_SUFFIX,
            "TIGR": TIGRFAM_TOP_HIT_SUFFIX
        }

        # Marker ids with the HMM file extension removed, in config order.
        marker_bac_list_original = [
            marker.replace(".HMM", "").replace(".hmm", "")
            for db_markers in Config.BAC120_MARKERS.values()
            for marker in db_markers
        ]
        marker_arc_list_original = [
            marker.replace(".HMM", "").replace(".hmm", "")
            for db_markers in Config.AR122_MARKERS.values()
            for marker in db_markers
        ]

        # Sets give constant time membership tests while parsing the hits.
        bac_marker_set = set(marker_bac_list_original)
        arc_marker_set = set(marker_arc_list_original)

        tln_path = os.path.join(outdir,
                                PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))
        bac_path = os.path.join(
            outdir, PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))
        arc_path = os.path.join(
            outdir, PATH_AR122_MARKER_SUMMARY.format(prefix=prefix))

        # The context manager closes all three files even if parsing a
        # genome raises.
        with open(tln_path, "w") as translation_table_file, \
                open(bac_path, "w") as bac_outfile, \
                open(arc_path, "w") as arc_outfile:
            bac_outfile.write(header)
            arc_outfile.write(header)

            for db_genome_id, info in gene_dict.items():
                gene_bac_dict, gene_arc_dict = {}, {}

                # NOTE(review): the parsed sequences are not used by this
                # method. The call is kept so that any error it raises for a
                # bad protein file is preserved - confirm whether it can be
                # removed.
                protein_file = str(info.get("aa_gene_path"))
                read_fasta(protein_file, False)

                for marker_suffix in marker_dbs.values():
                    tophit_path = os.path.join(
                        outdir, DIR_MARKER_GENE, db_genome_id,
                        '{}{}'.format(db_genome_id, marker_suffix))

                    with open(tophit_path) as tp:
                        # first line is header line
                        tp.readline()

                        # Each line holds the gene name followed by a
                        # ';'-separated list of hits; the marker id is the
                        # first ','-separated field of a hit.
                        for line_tp in tp:
                            linelist = line_tp.split("\t")
                            genename = linelist[0]
                            for each_mark in linelist[1].split(";"):
                                markerid = each_mark.split(",")[0]
                                if markerid in bac_marker_set:
                                    record_hit(gene_bac_dict, markerid,
                                               genename)
                                if markerid in arc_marker_set:
                                    record_hit(gene_arc_dict, markerid,
                                               genename)

                bac_outfile.write(
                    summary_row(db_genome_id, marker_bac_list_original,
                                gene_bac_dict))
                arc_outfile.write(
                    summary_row(db_genome_id, marker_arc_list_original,
                                gene_arc_dict))
                translation_table_file.write('{}\t{}\n'.format(
                    db_genome_id, info.get("best_translation_table")))

        # Create a symlink to store the summary files in the root.
        for path_template in (PATH_BAC120_MARKER_SUMMARY,
                              PATH_AR122_MARKER_SUMMARY,
                              PATH_TLN_TABLE_SUMMARY):
            link_target = path_template.format(prefix=prefix)
            symlink_f(link_target,
                      os.path.join(outdir, os.path.basename(link_target)))
Beispiel #8
0
    def align(self,
              identify_dir,
              skip_gtdb_refs,
              taxa_filter,
              min_perc_aa,
              custom_msa_filters,
              skip_trimming,
              rnd_seed,
              cols_per_gene,
              min_consensus,
              max_consensus,
              min_per_taxa,
              out_dir,
              prefix,
              outgroup_taxon,
              genomes_to_process=None):
        """Align marker genes in genomes.

        For each domain (bacterial, then archaeal) that has at least one
        genome assigned to it, the user MSA is generated, columns are
        trimmed/masked, and the filtered-genome report and MSA file(s) are
        written below ``out_dir`` and symlinked into its root.

        Parameters
        ----------
        identify_dir
            Directory holding the output of the identify step.
        skip_gtdb_refs
            If truthy, GTDB reference sequences are left out of the MSA and
            only the user MSA is written.
        taxa_filter
            Forwarded to ``_msa_filter_by_taxa`` when reference sequences
            are included.
        min_perc_aa
            Percentage (0-100) threshold; divided by 100 before being handed
            to the trimming/masking routines.
        custom_msa_filters
            If truthy (and ``skip_trimming`` is falsy), columns are selected
            with ``TrimMSA`` instead of the canonical mask.
        skip_trimming
            If truthy, no column filtering is done at all.
        rnd_seed
            Forwarded to ``TrimMSA``.
        cols_per_gene
            Forwarded to ``TrimMSA``.
        min_consensus, max_consensus, min_per_taxa
            Percentages (0-100); divided by 100 and forwarded to ``TrimMSA``.
        out_dir
            Root output directory.
        prefix
            Prefix substituted into every output path template.
        outgroup_taxon
            Forwarded to ``_msa_filter_by_taxa``.
        genomes_to_process
            Optional mapping keyed by genome id. When given, its size must
            match the number of genomes found in ``identify_dir``.

        Raises
        ------
        InconsistentGenomeBatch
            If ``genomes_to_process`` is given and its size differs from the
            number of genomes found in the identify directory.
        GenomeMarkerSetUnknown
            If the marker set identifier is neither ``bac120`` nor ``ar122``.
        """

        # If the user is re-running this step, check if the identify step is consistent.
        genomic_files = self._path_to_identify_data(identify_dir,
                                                    identify_dir != out_dir)
        if genomes_to_process is not None and len(genomic_files) != len(
                genomes_to_process):
            self.logger.error(
                '{} are not present in the input list of genome to process.'.
                format(
                    list(
                        set(genomic_files.keys()) -
                        set(genomes_to_process.keys()))))
            raise InconsistentGenomeBatch(
                'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                'genomes not present in your initial identify directory. Remove them, or run '
                'GTDB-Tk on a new directory.')

        # If this is being run as a part of classify_wf, copy the required files.
        if identify_dir != out_dir:
            identify_path = os.path.join(out_dir, DIR_IDENTIFY)
            make_sure_path_exists(identify_path)
            copy(
                CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
            copy(CopyNumberFileAR122(identify_dir, prefix).path, identify_path)
            copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)

        # Create the align intermediate directory.
        make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

        # Write out files with marker information
        ar122_marker_info_file = MarkerInfoFileAR122(out_dir, prefix)
        ar122_marker_info_file.write()
        bac120_marker_info_file = MarkerInfoFileBAC120(out_dir, prefix)
        bac120_marker_info_file.write()

        # Determine what domain each genome belongs to.
        bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
            identify_dir, prefix)

        self.logger.info(
            f'Aligning markers in {len(genomic_files):,} genomes with {self.cpus} CPUs.'
        )
        dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120,
                     "bac120", 'bacterial', CopyNumberFileBAC120),
                    (ar_gids, Config.CONCAT_AR122, Config.MASK_AR122, "ar122",
                     'archaeal', CopyNumberFileAR122))
        gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
        for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

            # No genomes identified as this domain.
            if len(gids) == 0:
                continue

            self.logger.info(
                f'Processing {len(gids):,} genomes identified as {domain_str}.'
            )

            # Resolve the per-domain marker info and the output paths
            # (relative to out_dir); the relative form is reused as the
            # symlink target at the end of the iteration.
            if marker_set_id == 'bac120':
                marker_info_file = bac120_marker_info_file
                rel_filtered_genomes = PATH_BAC120_FILTERED_GENOMES.format(
                    prefix=prefix)
                rel_msa = PATH_BAC120_MSA.format(prefix=prefix)
                rel_user_msa = PATH_BAC120_USER_MSA.format(prefix=prefix)
            elif marker_set_id == 'ar122':
                marker_info_file = ar122_marker_info_file
                rel_filtered_genomes = PATH_AR122_FILTERED_GENOMES.format(
                    prefix=prefix)
                rel_msa = PATH_AR122_MSA.format(prefix=prefix)
                rel_user_msa = PATH_AR122_USER_MSA.format(prefix=prefix)
            else:
                raise GenomeMarkerSetUnknown(
                    'There was an error determining the marker set.')

            marker_filtered_genomes = os.path.join(out_dir,
                                                   rel_filtered_genomes)
            marker_msa_path = os.path.join(out_dir, rel_msa)
            marker_user_msa_path = os.path.join(out_dir, rel_user_msa)

            cur_genome_files = {
                gid: f
                for gid, f in genomic_files.items() if gid in gids
            }

            if skip_gtdb_refs:
                gtdb_msa = {}
            else:
                gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                    taxa_filter,
                                                    outgroup_taxon)
            gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

            # Generate the user MSA.
            user_msa = align.align_marker_set(cur_genome_files,
                                              marker_info_file, copy_number_f,
                                              self.cpus)

            # Write the individual marker alignments to disk
            if self.debug:
                self._write_individual_markers(user_msa, marker_set_id,
                                               marker_info_file.path, out_dir,
                                               prefix)

            # filter columns without sufficient representation across taxa
            if skip_trimming:
                self.logger.info(
                    'Skipping custom filtering and selection of columns.')
                pruned_seqs = {}
                trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

            elif custom_msa_filters:
                aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
                self.logger.info(
                    'Performing custom filtering and selection of columns.')

                trim_msa = TrimMSA(
                    cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                    max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                    os.path.join(out_dir, f'filter_{marker_set_id}'))

                trimmed_seqs, pruned_seqs = trim_msa.trim(
                    aligned_genomes, marker_info_file.path)

                if trimmed_seqs:
                    self.logger.info(
                        'Filtered MSA from {:,} to {:,} AAs.'.format(
                            len(list(aligned_genomes.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                self.logger.info(
                    'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), min_perc_aa))

                filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
                if len(filtered_user_genomes):
                    # '{:,}' (thousands separator); the former '{:.}' spec is
                    # invalid and raised ValueError as soon as a user genome
                    # was filtered out.
                    self.logger.info(
                        'Filtered genomes include {:,} user submitted genomes.'
                        .format(len(filtered_user_genomes)))
            else:
                self.logger.log(
                    Config.LOG_TASK,
                    f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
                )
                trimmed_seqs, pruned_seqs = self._apply_mask(
                    gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)

                # Only report alignment lengths when there is a sequence to
                # measure; an empty result must not crash on the [0] lookup
                # (same guard as the custom-filter branch above).
                if user_msa and trimmed_seqs:
                    self.logger.info(
                        'Masked {} alignment from {:,} to {:,} AAs.'.format(
                            domain_str, len(list(user_msa.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                if min_perc_aa > 0:
                    self.logger.info(
                        '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                        .format(len(pruned_seqs), domain_str, min_perc_aa))

            # write out filtering information
            with open(marker_filtered_genomes, 'w') as fout:
                for pruned_seq_id, pruned_seq in pruned_seqs.items():
                    if len(pruned_seq) == 0:
                        perc_alignment = 0
                    else:
                        # Alphabetic characters are counted as aligned
                        # residues; everything else is treated as a gap.
                        valid_bases = sum(1 for c in pruned_seq
                                          if c.isalpha())
                        perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                    fout.write(
                        f'{pruned_seq_id}\tInsufficient number of amino acids in MSA ({perc_alignment:.1f}%)\n'
                    )

            # write out MSAs
            if not skip_gtdb_refs:
                self.logger.info(
                    f'Creating concatenated alignment for {len(trimmed_seqs):,} '
                    f'{domain_str} GTDB and user genomes.')
                self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)

            trimmed_user_msa = {
                k: v
                for k, v in trimmed_seqs.items() if k in user_msa
            }
            if len(trimmed_user_msa) > 0:
                self.logger.info(
                    f'Creating concatenated alignment for {len(trimmed_user_msa):,} '
                    f'{domain_str} user genomes.')
                self._write_msa(trimmed_user_msa, marker_user_msa_path,
                                gtdb_taxonomy)
            else:
                self.logger.info(
                    f'All {domain_str} user genomes have been filtered out.')

            # Create symlinks to the summary files in the root of out_dir.
            # Only files that were actually written above are linked.
            paths_to_link = [rel_filtered_genomes]
            if len(trimmed_user_msa) > 0:
                paths_to_link.append(rel_user_msa)
            if not skip_gtdb_refs:
                paths_to_link.append(rel_msa)
            for rel_path in paths_to_link:
                symlink_f(
                    rel_path,
                    os.path.join(out_dir, os.path.basename(rel_path)))