예제 #1
0
    def root(self, options):
        """Re-root a tree using the genomes assigned to an outgroup taxon.

        Parameters
        ----------
        options : object
            Must expose ``input_tree`` (path of the tree to root; checked
            for existence first), ``output_tree`` (path handed to the
            rooting routine as the destination) and ``outgroup_taxon``
            (taxon tested for membership in each genome's taxonomy).
        """
        self.logger.warning("Tree rooting is still under development!")

        check_file_exists(options.input_tree)

        # Presumably maps genome ID -> collection of taxa; verify against
        # Taxonomy.read().
        taxonomy_by_genome = Taxonomy().read(Config.TAXONOMY_FILE)

        self.logger.info('Identifying genomes from the specified outgroup.')
        outgroup_genome_ids = {
            gid
            for gid, lineage in taxonomy_by_genome.iteritems()
            if options.outgroup_taxon in lineage
        }

        RerootTree().root_with_outgroup(options.input_tree,
                                        options.output_tree,
                                        outgroup_genome_ids)

        self.logger.info('Done.')
예제 #2
0
    def align(self,
              identify_dir,
              skip_gtdb_refs,
              taxa_filter,
              min_perc_aa,
              custom_msa_filters,
              skip_trimming,
              rnd_seed,
              cols_per_gene,
              min_consensus,
              max_consensus,
              min_per_taxa,
              out_dir,
              prefix,
              outgroup_taxon):
        """Align marker genes in genomes.

        For each marker set (bac120, then ar122) that has at least one user
        genome, this aligns the user genomes, optionally merges them with
        the GTDB reference alignment, trims or masks the columns, and
        writes the filtered-genome report and the concatenated MSAs under
        ``out_dir``. Convenience symlinks to the summary files are created
        directly in ``out_dir``.

        Parameters
        ----------
        identify_dir : str
            Directory holding the results of the identify step. When it
            differs from ``out_dir`` the marker/translation-table summary
            files are copied into ``out_dir``.
        skip_gtdb_refs : bool
            If true, no GTDB reference sequences are added and the combined
            GTDB + user MSA is not written.
        taxa_filter
            Passed to ``_msa_filter_by_taxa`` to select reference genomes.
        min_perc_aa : number
            Minimum percentage (0-100) of amino acids a genome needs in
            the alignment; converted to a fraction before use.
        custom_msa_filters : bool
            If true (and ``skip_trimming`` is false), trim with ``TrimMSA``
            instead of applying the canonical mask.
        skip_trimming : bool
            If true, neither trimming nor masking is performed.
        rnd_seed, cols_per_gene
            Passed unchanged to ``TrimMSA``.
        min_consensus, max_consensus, min_per_taxa : number
            Percentages (0-100) passed to ``TrimMSA`` as fractions.
        out_dir : str
            Output directory.
        prefix : str
            Value substituted into the ``PATH_*`` file name templates.
        outgroup_taxon
            Passed to ``_msa_filter_by_taxa``.

        Raises
        ------
        Exception
            If the marker set identifier is neither 'bac120' nor 'ar122'.

        Notes
        -----
        ``IOError`` raised anywhere in the body is logged and swallowed;
        the method then returns ``None`` without finishing.
        """

        try:
            if identify_dir != out_dir:
                identify_out_dir = os.path.join(out_dir, DIR_IDENTIFY)
                if not os.path.isdir(identify_out_dir):
                    os.makedirs(identify_out_dir)

                # Carry the identify summaries over so out_dir is
                # self-contained.
                for summary_template in (PATH_BAC120_MARKER_SUMMARY,
                                         PATH_AR122_MARKER_SUMMARY,
                                         PATH_TLN_TABLE_SUMMARY):
                    copy(os.path.join(identify_dir,
                                      summary_template.format(prefix=prefix)),
                         identify_out_dir)

            intermediate_dir = os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE)
            if not os.path.exists(intermediate_dir):
                os.makedirs(intermediate_dir)

            # write out files with marker information
            bac120_marker_info_file = os.path.join(out_dir, PATH_BAC120_MARKER_INFO.format(prefix=prefix))
            self._write_marker_info(Config.BAC120_MARKERS, bac120_marker_info_file)
            ar122_marker_info_file = os.path.join(out_dir, PATH_AR122_MARKER_INFO.format(prefix=prefix))
            self._write_marker_info(Config.AR122_MARKERS, ar122_marker_info_file)

            genomic_files = self._path_to_identify_data(identify_dir)
            self.logger.info('Aligning markers in %d genomes with %d threads.' % (len(genomic_files),
                                                                                  self.cpus))

            # determine marker set for each user genome
            bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(identify_dir, prefix)

            # align user genomes
            gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
            for gids, msa_file, mask_file, marker_set_id in ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120, "bac120"),
                                                             (ar_gids, Config.CONCAT_AR122, Config.MASK_AR122, "ar122")):

                if not gids:
                    continue

                if marker_set_id == 'bac120':
                    self.logger.info('Processing %d genomes identified as bacterial.' % len(gids))
                    marker_info_file = bac120_marker_info_file
                    marker_filtered_genomes = os.path.join(out_dir, PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
                    marker_msa_path = os.path.join(out_dir, PATH_BAC120_MSA.format(prefix=prefix))
                    marker_user_msa_path = os.path.join(out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
                else:
                    self.logger.info('Processing %d genomes identified as archaeal.' % len(gids))
                    marker_info_file = ar122_marker_info_file
                    marker_filtered_genomes = os.path.join(out_dir, PATH_AR122_FILTERED_GENOMES.format(prefix=prefix))
                    marker_msa_path = os.path.join(out_dir, PATH_AR122_MSA.format(prefix=prefix))
                    marker_user_msa_path = os.path.join(out_dir, PATH_AR122_USER_MSA.format(prefix=prefix))

                cur_genome_files = {
                    gid: f for gid, f in genomic_files.items() if gid in gids}

                if skip_gtdb_refs:
                    gtdb_msa = {}
                else:
                    gtdb_msa = self._msa_filter_by_taxa(msa_file,
                                                        gtdb_taxonomy,
                                                        taxa_filter,
                                                        outgroup_taxon)
                gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

                hmm_aligner = HmmAligner(self.cpus,
                                         self.pfam_top_hit_suffix,
                                         self.tigrfam_top_hit_suffix,
                                         self.protein_file_suffix,
                                         self.pfam_hmm_dir,
                                         self.tigrfam_hmms,
                                         Config.BAC120_MARKERS,
                                         Config.AR122_MARKERS,
                                         Config.RPS23_MARKERS)
                user_msa = hmm_aligner.align_marker_set(cur_genome_files,
                                                        marker_set_id)

                # filter columns without sufficient representation across taxa
                if skip_trimming:
                    self.logger.info(
                        'Skipping custom filtering and selection of columns.')
                    pruned_seqs = {}
                    trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

                elif custom_msa_filters:
                    aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
                    self.logger.info(
                        'Performing custom filtering and selection of columns.')

                    trim_msa = TrimMSA(cols_per_gene,
                                       min_perc_aa / 100.0,
                                       min_consensus / 100.0,
                                       max_consensus / 100.0,
                                       min_per_taxa / 100.0,
                                       rnd_seed,
                                       os.path.join(out_dir, 'filter_%s' % marker_set_id))

                    trimmed_seqs, pruned_seqs = trim_msa.trim(aligned_genomes,
                                                              marker_info_file)

                    if trimmed_seqs:
                        self.logger.info('Filtered MSA from %d to %d AAs.' % (
                            len(next(iter(aligned_genomes.values()))),
                            len(next(iter(trimmed_seqs.values())))))

                    self.logger.info('Filtered %d genomes with amino acids in <%.1f%% of columns in filtered MSA.' % (
                        len(pruned_seqs),
                        min_perc_aa))

                    filtered_user_genomes = set(
                        pruned_seqs).intersection(user_msa)
                    if filtered_user_genomes:
                        self.logger.info('Filtered genomes include %d user submitted genomes.' % len(
                            filtered_user_genomes))
                else:
                    self.logger.info(
                        'Masking columns of multiple sequence alignment using canonical mask.')
                    trimmed_seqs, pruned_seqs = self._apply_mask(gtdb_msa,
                                                                 user_msa,
                                                                 gtdb_msa_mask,
                                                                 min_perc_aa / 100.0)

                    # Every genome may have been pruned (e.g. when GTDB
                    # references are skipped); only report lengths when
                    # there is a sequence to measure.
                    if user_msa and trimmed_seqs:
                        self.logger.info('Masked alignment from %d to %d AA.' % (
                            len(next(iter(user_msa.values()))),
                            len(next(iter(trimmed_seqs.values())))))

                    if min_perc_aa > 0:
                        self.logger.info('%d user genomes have amino acids in <%.1f%% of columns in filtered MSA.' % (
                            len(pruned_seqs),
                            min_perc_aa))

                # write out filtering information
                # marker_filtered_genomes already includes out_dir; joining
                # it with out_dir a second time would duplicate a relative
                # out_dir in the path.
                with open(marker_filtered_genomes, 'w') as fout:
                    for pruned_seq_id, pruned_seq in pruned_seqs.items():
                        if len(pruned_seq) == 0:
                            perc_alignment = 0
                        else:
                            valid_bases = sum(
                                1 for c in pruned_seq if c.isalpha())
                            perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                        fout.write('%s\t%s\n' % (pruned_seq_id,
                                                 'Insufficient number of amino acids in MSA (%.1f%%)' % perc_alignment))

                # write out MSAs
                if not skip_gtdb_refs:
                    self.logger.info('Creating concatenated alignment for %d GTDB and user genomes.' % len(trimmed_seqs))
                    self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)

                trimmed_user_msa = {
                    k: v for k, v in trimmed_seqs.items() if k in user_msa}
                if trimmed_user_msa:
                    self.logger.info('Creating concatenated alignment for %d user genomes.' % len(trimmed_user_msa))
                    self._write_msa(trimmed_user_msa, marker_user_msa_path, gtdb_taxonomy)
                elif marker_set_id == 'bac120':
                    self.logger.info('All bacterial user genomes have been filtered out.')
                else:
                    self.logger.info('All archaeal user genomes have been filtered out.')

                # Create symlinks to the summary files
                if marker_set_id == 'bac120':
                    summary_templates = (PATH_BAC120_FILTERED_GENOMES,
                                         PATH_BAC120_USER_MSA,
                                         PATH_BAC120_MSA)
                elif marker_set_id == 'ar122':
                    summary_templates = (PATH_AR122_FILTERED_GENOMES,
                                         PATH_AR122_USER_MSA,
                                         PATH_AR122_MSA)
                else:
                    self.logger.error('There was an error determining the marker set.')
                    raise Exception('There was an error determining the marker set.')

                # NOTE(review): os.symlink raises if the link name already
                # exists, so re-running into a populated out_dir fails here.
                # The user/full MSA links are also created when those files
                # were not written above. Confirm both are intended.
                for summary_template in summary_templates:
                    link_target = summary_template.format(prefix=prefix)
                    os.symlink(link_target,
                               os.path.join(out_dir, os.path.basename(link_target)))

        except IOError as e:
            self.logger.error(str(e))
            self.logger.error("GTDB-Tk has encountered an error.")