Ejemplo n.º 1
0
 def setUp(self):
     """Build the fixtures shared by the tests of this case.

     Creates a ``Classify`` instance, a fresh temporary output directory,
     the relative paths to the reference test data, and the GTDB taxonomy
     read from ``Config.TAXONOMY_FILE``.
     """
     self.classify = Classify()
     self.out_dir = tempfile.mkdtemp(prefix='gtdbtk_tmp_')
     self.prefix = 'gtdbtk'

     # Locations of the reference data, relative to the working directory.
     align_reference = 'tests/data/align_dir_reference/align'
     self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
     self.aln_dir_ref = align_reference
     self.user_msa_file = os.path.join(align_reference,
                                       'gtdbtk.ar122.user_msa.fasta')

     # The taxonomy is loaded once here and reused by the tests.
     taxonomy_path = Config.TAXONOMY_FILE
     self.taxonomy_file = taxonomy_path
     self.gtdb_taxonomy = Taxonomy().read(taxonomy_path)
Ejemplo n.º 2
0
 def test_read(self):
     """A tab-separated taxonomy file is read back as id -> list of ranks."""
     expected = {
         'GCF_005435136.1':
         ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S1'],
         '2': ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S2']
     }

     # One row per genome: "<id>\t<rank;rank;...>".
     taxonomy_path = os.path.join(self.dir_tmp, 'tax_file.tsv')
     with open(taxonomy_path, 'w') as handle:
         handle.writelines(f'{gid}\t{";".join(ranks)}\n'
                           for gid, ranks in expected.items())

     observed = Taxonomy().read(taxonomy_path)
     self.assertDictEqual(expected, observed)
Ejemplo n.º 3
0
Archivo: main.py Proyecto: 31380/GTDBTk
    def root(self, options):
        """Root tree using outgroup.

        The outgroup consists of every genome whose taxonomy contains
        ``options.outgroup_taxon``. The taxonomy is read from
        ``options.custom_taxonomy_file`` when given, otherwise from
        ``Config.TAXONOMY_FILE``.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.

        Raises
        ------
        GenomeMarkerSetUnknown
            If ``options.suffix`` is present but is neither ``bac120`` nor
            ``ar122``.
        """
        self.logger.warning("Tree rooting is still under development!")

        check_file_exists(options.input_tree)

        if options.custom_taxonomy_file:
            check_file_exists(options.custom_taxonomy_file)
            taxonomy = Taxonomy().read(options.custom_taxonomy_file)
        else:
            taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        self.logger.info('Identifying genomes from the specified outgroup.')
        outgroup = {genome_id
                    for genome_id, taxa in taxonomy.items()
                    if options.outgroup_taxon in taxa}

        reroot = RerootTree()
        reroot.root_with_outgroup(options.input_tree, options.output_tree,
                                  outgroup)

        # Symlink to the tree summary file, if not run independently
        if hasattr(options, 'suffix'):
            if options.suffix == 'bac120':
                rooted_tree = PATH_BAC120_ROOTED_TREE.format(
                    prefix=options.prefix)
            elif options.suffix == 'ar122':
                rooted_tree = PATH_AR122_ROOTED_TREE.format(
                    prefix=options.prefix)
            else:
                raise GenomeMarkerSetUnknown(
                    'There was an error determining the marker set.')

            # The link name is derived from the same path as the link
            # target, so the bacterial tree is no longer linked under the
            # archaeal file name.
            symlink_f(
                rooted_tree,
                os.path.join(options.out_dir, os.path.basename(rooted_tree)))

        self.logger.info('Done.')
Ejemplo n.º 4
0
    def setUp(self):
        """Build the fixtures shared by the tests of this case.

        Picks a randomly named output directory below ``tests/data/results``
        (creating only that parent directory), records the reference data
        paths and loads the GTDB taxonomy from ``Config.TAXONOMY_FILE``.
        """
        self.classify = Classify()

        results_root = 'tests/data/results'
        self.generic_out_path = results_root

        # Ten random upper-case letters / digits name this run's directory.
        alphabet = string.ascii_uppercase + string.digits
        random_name = ''.join([random.choice(alphabet) for _ in range(10)])
        self.out_dir = os.path.join(results_root, random_name)

        # Only the parent is created here; out_dir itself is not.
        if not os.path.exists(results_root):
            os.makedirs(results_root)

        self.prefix = 'gtdbtk'
        pplacer_reference = 'tests/data/pplacer_dir_reference'
        self.pplacer_dir_reference = pplacer_reference
        self.user_msa_file = os.path.join(pplacer_reference,
                                          'gtdbtk.ar122.user_msa.fasta')

        taxonomy_path = Config.TAXONOMY_FILE
        self.taxonomy_file = taxonomy_path
        self.gtdb_taxonomy = Taxonomy().read(taxonomy_path)
Ejemplo n.º 5
0
    def _get_ingroup_domain(self, ingroup_taxon) -> str:
        """Return the domain of the lineages that contain ``ingroup_taxon``.

        The GTDB taxonomy is read from ``TAXONOMY_FILE`` and every lineage
        is scanned. When several lineages contain the taxon, the domain of
        the last one encountered is returned.

        Raises
        ------
        GTDBTkExit
            If no lineage in the GTDB taxonomy contains ``ingroup_taxon``.
        """
        reference_taxonomy = Taxonomy().read(TAXONOMY_FILE)

        # Domain entry of every lineage that mentions the ingroup taxon.
        matching_domains = [lineage[Taxonomy.DOMAIN_IDX]
                            for lineage in reference_taxonomy.values()
                            if ingroup_taxon in lineage]
        ingroup_domain = matching_domains[-1] if matching_domains else None

        if ingroup_domain is None:
            raise GTDBTkExit(f'Ingroup taxon {ingroup_taxon} was not found in '
                             f'the GTDB taxonomy.')

        return ingroup_domain
Ejemplo n.º 6
0
    def run(self, genomes, no_mash, max_d, out_dir, prefix, mash_k, mash_v,
            mash_s, min_af, mash_db):
        """Run the ANI pipeline for the query genomes.

        Query genomes are compared with FastANI against reference genomes.
        Unless ``no_mash`` is set, Mash first selects which references each
        query is compared to; otherwise every query is compared against
        every reference. The ANI summary and closest-hit files are then
        created from the FastANI results.

        Parameters
        ----------
        genomes : dict[str, str]
            Dict[genome_id] = fasta_path
        no_mash : bool
            True to skip the Mash pre-filter and compare each query against
            all reference genomes, False to pre-filter with Mash.
        max_d : float
            maximum distance to keep [0-1]
        out_dir : str
            The directory to write the output files to.
        prefix : str
            The prefix to use when writing output files.
        mash_k : int
            k-mer size [1-32]
        mash_v : float
            maximum p-value to keep [0-1]
        mash_s : int
            maximum number of non-redundant hashes
        min_af : float
            alignment fraction to consider closest genome
        mash_db : Optional[str]
            The path to read/write the pre-computed Mash reference sketch database.
        """
        self.check_dependencies(no_mash)

        self.logger.info('Loading reference genomes.')
        ref_genomes = self._get_ref_genomes()

        # Reference paths take precedence over query paths on an id clash.
        d_paths = {**genomes, **ref_genomes}
        # query genome id -> reference genome ids to compare against
        d_compare = defaultdict(set)

        if no_mash:
            # Compare against all reference genomes.
            for qry_gid in genomes:
                d_compare[qry_gid] = set(ref_genomes.keys())
        else:
            # Pre-filter using Mash.
            dir_mash = os.path.join(out_dir, DIR_ANI_REP_INT_MASH)

            mash = Mash(self.cpus, dir_mash, prefix)
            self.logger.info(f'Using Mash version {mash.version()}')
            mash_results = mash.run(genomes, ref_genomes, max_d, mash_k,
                                    mash_v, mash_s, mash_db)
            for qry_gid, ref_hits in mash_results.items():
                d_compare[qry_gid] = d_compare[qry_gid] | set(ref_hits.keys())

        self.logger.info(
            f'Calculating ANI with FastANI v{FastANI._get_version()}.')
        fastani = FastANI(self.cpus, force_single=True)
        fastani_results = fastani.run(d_compare, d_paths)

        # Both output objects are constructed for their side effects only.
        taxonomy = Taxonomy().read(TAXONOMY_FILE, canonical_ids=True)
        ANISummaryFile(out_dir, prefix, fastani_results, taxonomy)
        ANIClosestFile(out_dir, prefix, fastani_results, genomes, min_af,
                       taxonomy)
Ejemplo n.º 7
0
    def _assign_taxon_labels(self, fmeasure_for_taxa):
        """Assign taxon labels to nodes.

        Only taxa with exactly one candidate node are placed; taxa with
        zero or several candidates are left untouched. The taxon is
        appended to any taxon label already present on the node, separated
        by ``'; '``.

        Parameters
        ----------
        fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall), ...]
          Node with highest F-measure for each taxon.

        Returns
        -------
        set
            Taxon labels placed in tree.
        """

        placed_taxon = set()
        for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
            candidates = fmeasure_for_taxa[taxon]
            if len(candidates) != 1:
                # ambiguous or missing placement: do not label any node
                continue

            placed_taxon.add(taxon)
            node = candidates[0].node

            # keep the node's existing support value and auxiliary info
            support, taxon_label, aux_info = parse_label(node.label)
            if taxon_label:
                taxon_label += '; ' + taxon
            else:
                taxon_label = taxon
            node.label = create_label(support, taxon_label, aux_info)

        return placed_taxon
Ejemplo n.º 8
0
 def test_read_canonical(self):
     """With ``canonical_ids=True`` accessions are keyed in canonical form.

     The expected keys drop the ``RS_``/``GCF_`` prefixes and the version
     suffix, e.g. ``GCF_005435136.1`` -> ``G005435136``.
     """
     lineage_s1 = ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S1']
     lineage_s2 = ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S2']
     to_write = {
         'GCF_005435136.1': lineage_s1,
         'RS_GCF_005435135.1': lineage_s2,
     }
     expected = {
         'G005435136': lineage_s1,
         'G005435135': lineage_s2,
     }

     # One row per genome: "<id>\t<rank;rank;...>".
     taxonomy_path = os.path.join(self.dir_tmp, 'tax_file.tsv')
     with open(taxonomy_path, 'w') as handle:
         handle.writelines(f'{gid}\t{";".join(ranks)}\n'
                           for gid, ranks in to_write.items())

     observed = Taxonomy().read(taxonomy_path, canonical_ids=True)
     self.assertDictEqual(expected, observed)
Ejemplo n.º 9
0
 def test_read_error(self):
     """A comma-separated (not tab-separated) file makes ``read`` raise."""
     rows = {
         'GCF_005435136.1':
         ['d__D', 'p__P', 'c__C', 'f__F', 'g__G', 's__S']
     }

     # Deliberately written with ',' between the id and the lineage.
     taxonomy_path = os.path.join(self.dir_tmp, 'tax_file.tsv')
     with open(taxonomy_path, 'w') as handle:
         handle.writelines(f'{gid},{";".join(ranks)}\n'
                           for gid, ranks in rows.items())

     reader = Taxonomy()
     with self.assertRaises(GTDBTkExit):
         reader.read(taxonomy_path)
Ejemplo n.º 10
0
    def _write_statistics_table(self, fmeasure_for_taxa, taxonomy, out_table):
        """Write table containing statistics for each taxon.

        The process is terminated (``SystemExit`` with a non-zero status)
        if any taxon does not have exactly one entry in
        ``fmeasure_for_taxa``; rows written before that point remain in the
        output file.

        Parameters
        ----------
        fmeasure_for_taxa : d[taxon] -> [(Node, F-measure, precision, recall)]
          Node with highest F-measure for each taxon.
        taxonomy : d[unique_id] -> [d__<taxon>; ...; s__<taxon>]
          Taxonomic information for taxa in tree of interest.
        out_table : str
          Output table to write statistics for assigned labels.
        """

        # get extant taxa
        extant_taxa = Taxonomy().extant_taxa(taxonomy)

        # The context manager closes the file on every exit path, including
        # the sys.exit() below and any formatting error.
        with open(out_table, 'w') as fout_table:
            fout_table.write(
                'Taxon\tNo. Expected in Tree\tF-measure\tPrecision\tRecall')
            fout_table.write('\tNo. Genomes from Taxon\tNo. Genome In Lineage')
            fout_table.write('\tRogue out\tRogue in\n')

            for taxon in Taxonomy().sort_taxa(fmeasure_for_taxa.keys()):
                if len(fmeasure_for_taxa[taxon]) != 1:
                    self.logger.error(
                        'Multiple positions specified for taxon label.')
                    # An error must not be reported as a successful exit.
                    sys.exit(1)

                num_genomes = len(extant_taxa[taxon])

                stat_table = fmeasure_for_taxa[taxon][0]
                # NOTE(review): the header order ("Genomes from Taxon",
                # "Genome In Lineage") versus the value order
                # (taxa_in_lineage, num_leaves_with_taxa) looks possibly
                # swapped - confirm against the stat table definition.
                fout_table.write(
                    '%s\t%d\t%.4f\t%.4f\t%.4f\t%d\t%d\t%s\t%s\n' %
                    (taxon, num_genomes, stat_table.fmeasure,
                     stat_table.precision, stat_table.recall,
                     stat_table.taxa_in_lineage,
                     stat_table.num_leaves_with_taxa,
                     ','.join(stat_table.rogue_out),
                     ','.join(stat_table.rogue_in)))
Ejemplo n.º 11
0
    def _read_taxonomy_files(self, options) -> Dict[str, Tuple[str, str, str, str, str, str, str]]:
        """Read and merge taxonomy files.

        Starts from the GTDB taxonomy in ``Config.TAXONOMY_FILE`` and then
        adds or overwrites entries from, in this order, the GTDB-Tk
        classification file and the custom taxonomy file (each only when
        set on ``options``).

        Parameters
        ----------
        options : argparse.Namespace
            Reads ``gtdbtk_classification_file`` and
            ``custom_taxonomy_file``.

        Returns
        -------
        dict
            Genome ID -> taxonomy, after merging.

        Raises
        ------
        GTDBTkExit
            If the GTDB-Tk classification file and the custom taxonomy file
            both specify a taxonomy for the same genome.
        """

        self.logger.info('Reading GTDB taxonomy for representative genomes.')
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        if options.gtdbtk_classification_file:
            # add and overwrite taxonomy for genomes specified in the
            # GTDB-Tk classification file
            check_file_exists(options.gtdbtk_classification_file)

            self.logger.info('Reading GTDB-Tk classification file.')
            gtdbtk_taxonomy = Taxonomy().read(options.gtdbtk_classification_file)
            # Drop the header row if it was parsed as a genome; a file
            # without that header is tolerated instead of raising KeyError.
            gtdbtk_taxonomy.pop('user_genome', None)
            num_reassigned = 0
            for gid, taxa in gtdbtk_taxonomy.items():
                if gid in taxonomy:
                    num_reassigned += 1
                taxonomy[gid] = taxa

            self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
            self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')

        if options.custom_taxonomy_file:
            # add and overwrite taxonomy for genomes specified in the
            # custom taxonomy file
            check_file_exists(options.custom_taxonomy_file)

            self.logger.info('Reading custom taxonomy file.')
            custom_taxonomy = Taxonomy().read(options.custom_taxonomy_file)
            num_reassigned = 0
            for gid, taxa in custom_taxonomy.items():
                if gid in taxonomy:
                    num_reassigned += 1
                taxonomy[gid] = taxa

            self.logger.info(f'Read custom taxonomy for {len(custom_taxonomy):,} genomes.')
            self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')

        if options.gtdbtk_classification_file and options.custom_taxonomy_file:
            # The two user-supplied sources must not overlap, otherwise the
            # merged result would depend on the order they were applied in.
            dup_genomes = set(gtdbtk_taxonomy).intersection(custom_taxonomy)
            if dup_genomes:
                self.logger.error('GTDB-Tk classification and custom taxonomy '
                                  'files must not specify taxonomies for the '
                                  'same genomes.')
                self.logger.error('These files have {:,} genomes in common.'.format(len(dup_genomes)))
                self.logger.error('Example duplicate genome: {}'.format(dup_genomes.pop()))
                raise GTDBTkExit('Duplicated taxonomy information.')

        self.logger.info(f'Read taxonomy for {len(taxonomy):,} genomes.')

        return taxonomy
Ejemplo n.º 12
0
    def align(self,
              identify_dir,
              skip_gtdb_refs,
              taxa_filter,
              min_perc_aa,
              custom_msa_filters,
              skip_trimming,
              rnd_seed,
              cols_per_gene,
              min_consensus,
              max_consensus,
              min_per_taxa,
              out_dir,
              prefix,
              outgroup_taxon,
              genomes_to_process=None):
        """Align marker genes in genomes.

        For each domain (bacterial, archaeal) with at least one genome, the
        user MSA is generated, optionally merged with the GTDB reference
        MSA, then trimmed by one of three strategies (no trimming, custom
        filters via ``TrimMSA``, or the canonical mask). The list of
        filtered genomes and the resulting MSAs are written below
        ``out_dir``.

        Parameters
        ----------
        identify_dir : str
            Directory holding the output of the identify step.
        skip_gtdb_refs : bool
            If true, the GTDB reference MSA is not loaded and the combined
            GTDB + user MSA is not written.
        taxa_filter
            Passed to ``_msa_filter_by_taxa`` with the GTDB reference MSA.
        min_perc_aa : float
            Minimum percentage of columns with an amino acid, given as a
            percentage (it is divided by 100 before use).
        custom_msa_filters : bool
            If true (and ``skip_trimming`` is false), trim with ``TrimMSA``
            rather than the canonical mask.
        skip_trimming : bool
            If true, no column filtering is applied at all.
        rnd_seed, cols_per_gene, min_consensus, max_consensus, min_per_taxa
            Forwarded to ``TrimMSA``; the three percentages are divided by
            100 before use.
        out_dir : str
            Directory the output files are written to.
        prefix : str
            Prefix used in the output file names.
        outgroup_taxon
            Passed to ``_msa_filter_by_taxa`` with the GTDB reference MSA.
        genomes_to_process : Optional[dict]
            Genomes of the current batch, keyed by genome ID. When given
            and its size differs from the identify data, the identify
            directory is checked for unexpected extra genomes.

        Raises
        ------
        InconsistentGenomeBatch
            If the identify directory contains genomes that are neither in
            ``genomes_to_process`` nor listed as failed by identify.
        GTDBTkExit
            If no genome could be assigned to a domain.
        """

        # read genomes that failed identify steps to skip them
        failed_genomes_file = os.path.join(identify_dir,
                                           PATH_FAILS.format(prefix=prefix))
        failed_genomes = []
        if os.path.isfile(failed_genomes_file):
            with open(failed_genomes_file) as fgf:
                # first whitespace-delimited field; blank lines are skipped
                # so they cannot raise IndexError
                failed_genomes = [
                    row.split()[0] for row in fgf if row.strip()
                ]

        # If the user is re-running this step, check if the identify step is consistent.
        genomic_files = self._path_to_identify_data(identify_dir,
                                                    identify_dir != out_dir)
        if genomes_to_process is not None and len(genomic_files) != len(
                genomes_to_process):
            # Genomes found in the identify directory that are neither part
            # of this batch nor explained by a failed identify step. The
            # previous check compared the results of two list.sort() calls,
            # which are both None, so it could never fire.
            unexpected_genomes = sorted(
                set(genomic_files.keys()) - set(genomes_to_process.keys()) -
                set(failed_genomes))
            if unexpected_genomes:
                self.logger.error(
                    '{} are not present in the input list of genome to process.'
                    .format(unexpected_genomes))
                raise InconsistentGenomeBatch(
                    'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                    'genomes not present in your initial identify directory. Remove them, or run '
                    'GTDB-Tk on a new directory.')

        # If this is being run as a part of classify_wf, copy the required files.
        if identify_dir != out_dir:
            identify_path = os.path.join(out_dir, DIR_IDENTIFY)
            make_sure_path_exists(identify_path)
            copy(
                CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
            copy(CopyNumberFileAR53(identify_dir, prefix).path, identify_path)
            copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)

        # Create the align intermediate directory.
        make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

        # Write out files with marker information
        ar53_marker_info_file = MarkerInfoFileAR53(out_dir, prefix)
        ar53_marker_info_file.write()
        bac120_marker_info_file = MarkerInfoFileBAC120(out_dir, prefix)
        bac120_marker_info_file.write()

        # Determine what domain each genome belongs to.
        bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
            identify_dir, prefix)
        if len(bac_gids) + len(ar_gids) == 0:
            raise GTDBTkExit('Unable to assign a domain to any genomes, '
                             'please check the identify marker summary file, '
                             'and verify genome quality.')

        self.logger.info(
            f'Aligning markers in {len(genomic_files):,} genomes with {self.cpus} CPUs.'
        )
        dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120,
                     "bac120", 'bacterial', CopyNumberFileBAC120),
                    (ar_gids, Config.CONCAT_AR53, Config.MASK_AR53, "ar53",
                     'archaeal', CopyNumberFileAR53))
        gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
        for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

            # No genomes identified as this domain.
            if len(gids) == 0:
                continue

            self.logger.info(
                f'Processing {len(gids):,} genomes identified as {domain_str}.'
            )
            if marker_set_id == 'bac120':
                marker_info_file = bac120_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir,
                    PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_BAC120_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
            else:
                marker_info_file = ar53_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir, PATH_AR53_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_AR53_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_AR53_USER_MSA.format(prefix=prefix))

            cur_genome_files = {
                gid: f
                for gid, f in genomic_files.items() if gid in gids
            }

            if skip_gtdb_refs:
                gtdb_msa = {}
            else:
                gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                    taxa_filter,
                                                    outgroup_taxon)
            gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

            # Generate the user MSA.
            user_msa = align.align_marker_set(cur_genome_files,
                                              marker_info_file, copy_number_f,
                                              self.cpus)
            if len(user_msa) == 0:
                self.logger.warning(
                    f'Identified {len(user_msa):,} single copy {domain_str} hits.'
                )
                continue

            # Write the individual marker alignments to disk
            if self.debug:
                self._write_individual_markers(user_msa, marker_set_id,
                                               marker_info_file.path, out_dir,
                                               prefix)

            # filter columns without sufficient representation across taxa
            if skip_trimming:
                self.logger.info(
                    'Skipping custom filtering and selection of columns.')
                pruned_seqs = {}
                trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

            elif custom_msa_filters:
                aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
                self.logger.info(
                    'Performing custom filtering and selection of columns.')

                trim_msa = TrimMSA(
                    cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                    max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                    os.path.join(out_dir, f'filter_{marker_set_id}'))

                trimmed_seqs, pruned_seqs = trim_msa.trim(
                    aligned_genomes, marker_info_file.path)

                if trimmed_seqs:
                    self.logger.info(
                        'Filtered MSA from {:,} to {:,} AAs.'.format(
                            len(list(aligned_genomes.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                self.logger.info(
                    'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), min_perc_aa))

                filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
                if len(filtered_user_genomes):
                    self.logger.info(
                        f'Filtered genomes include {len(filtered_user_genomes)} user submitted genomes.'
                    )
            else:
                self.logger.log(
                    Config.LOG_TASK,
                    f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
                )
                trimmed_seqs, pruned_seqs = self._apply_mask(
                    gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
                # Same guard as the custom-filter branch: with nothing left
                # after masking there is no sequence to measure.
                if trimmed_seqs:
                    self.logger.info(
                        'Masked {} alignment from {:,} to {:,} AAs.'.format(
                            domain_str, len(list(user_msa.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                if min_perc_aa > 0:
                    self.logger.info(
                        '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                        .format(len(pruned_seqs), domain_str, min_perc_aa))

            # write out filtering information
            with open(marker_filtered_genomes, 'w') as fout:
                for pruned_seq_id, pruned_seq in pruned_seqs.items():
                    if len(pruned_seq) == 0:
                        perc_alignment = 0
                    else:
                        valid_bases = sum(1 for c in pruned_seq
                                          if c.isalpha())
                        perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                    fout.write(
                        f'{pruned_seq_id}\tInsufficient number of amino acids in MSA ({perc_alignment:.1f}%)\n'
                    )

            # write out MSAs
            if not skip_gtdb_refs:
                self.logger.info(
                    f'Creating concatenated alignment for {len(trimmed_seqs):,} '
                    f'{domain_str} GTDB and user genomes.')
                self._write_msa(trimmed_seqs,
                                marker_msa_path,
                                gtdb_taxonomy,
                                zip_output=True)

            trimmed_user_msa = {
                k: v
                for k, v in trimmed_seqs.items() if k in user_msa
            }
            if len(trimmed_user_msa) > 0:
                self.logger.info(
                    f'Creating concatenated alignment for {len(trimmed_user_msa):,} '
                    f'{domain_str} user genomes.')
                self._write_msa(trimmed_user_msa,
                                marker_user_msa_path,
                                gtdb_taxonomy,
                                zip_output=True)
            else:
                self.logger.info(
                    f'All {domain_str} user genomes have been filtered out.')
Ejemplo n.º 13
0
    def align(self,
              identify_dir,
              skip_gtdb_refs,
              taxa_filter,
              min_perc_aa,
              custom_msa_filters,
              skip_trimming,
              rnd_seed,
              cols_per_gene,
              min_consensus,
              max_consensus,
              min_per_taxa,
              out_dir,
              prefix,
              outgroup_taxon,
              genomes_to_process=None):
        """Align marker genes in genomes."""

        if identify_dir != out_dir:
            if not os.path.isdir(os.path.join(out_dir, DIR_IDENTIFY)):
                os.makedirs(os.path.join(out_dir, DIR_IDENTIFY))

            copy(
                os.path.join(identify_dir,
                             PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix)),
                os.path.join(out_dir, DIR_IDENTIFY))
            copy(
                os.path.join(identify_dir,
                             PATH_AR122_MARKER_SUMMARY.format(prefix=prefix)),
                os.path.join(out_dir, DIR_IDENTIFY))

            identify_gene_file = os.path.join(
                identify_dir, PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))
            copy(identify_gene_file, os.path.join(out_dir, DIR_IDENTIFY))

        if not os.path.exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE)):
            os.makedirs(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))

        # write out files with marker information
        bac120_marker_info_file = os.path.join(
            out_dir, PATH_BAC120_MARKER_INFO.format(prefix=prefix))
        self._write_marker_info(Config.BAC120_MARKERS, bac120_marker_info_file)
        ar122_marker_info_file = os.path.join(
            out_dir, PATH_AR122_MARKER_INFO.format(prefix=prefix))
        self._write_marker_info(Config.AR122_MARKERS, ar122_marker_info_file)

        genomic_files = self._path_to_identify_data(identify_dir,
                                                    identify_dir != out_dir)
        if genomes_to_process is not None and len(genomic_files) != len(
                genomes_to_process):
            self.logger.error(
                '{} are not present in the input list of genome to process.'.
                format(
                    list(
                        set(genomic_files.keys()) -
                        set(genomes_to_process.keys()))))
            raise InconsistentGenomeBatch(
                'You are attempting to run GTDB-Tk on a non-empty directory that contains extra '
                'genomes not present in your initial identify directory. Remove them, or run '
                'GTDB-Tk on a new directory.')

        self.logger.info('Aligning markers in %d genomes with %d threads.' %
                         (len(genomic_files), self.cpus))

        # determine marker set for each user genome
        bac_gids, ar_gids, _bac_ar_diff = self.genome_domain(
            identify_dir, prefix)

        # align user genomes
        gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
        for gids, msa_file, mask_file, marker_set_id in ((bac_gids,
                                                          Config.CONCAT_BAC120,
                                                          Config.MASK_BAC120,
                                                          "bac120"),
                                                         (ar_gids,
                                                          Config.CONCAT_AR122,
                                                          Config.MASK_AR122,
                                                          "ar122")):

            domain_str = 'archaeal'
            if marker_set_id == 'bac120':
                domain_str = 'bacterial'

            if len(gids) == 0:
                continue

            self.logger.info(
                'Processing {:,} genomes identified as {}.'.format(
                    len(gids), domain_str))
            if marker_set_id == 'bac120':
                marker_info_file = bac120_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir,
                    PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_BAC120_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_BAC120_USER_MSA.format(prefix=prefix))
            else:
                marker_info_file = ar122_marker_info_file
                marker_filtered_genomes = os.path.join(
                    out_dir, PATH_AR122_FILTERED_GENOMES.format(prefix=prefix))
                marker_msa_path = os.path.join(
                    out_dir, PATH_AR122_MSA.format(prefix=prefix))
                marker_user_msa_path = os.path.join(
                    out_dir, PATH_AR122_USER_MSA.format(prefix=prefix))

            cur_genome_files = {
                gid: f
                for gid, f in genomic_files.items() if gid in gids
            }

            if skip_gtdb_refs:
                gtdb_msa = {}
            else:
                gtdb_msa = self._msa_filter_by_taxa(msa_file, gtdb_taxonomy,
                                                    taxa_filter,
                                                    outgroup_taxon)
            gtdb_msa_mask = os.path.join(Config.MASK_DIR, mask_file)

            hmm_aligner = HmmAligner(self.cpus, self.pfam_top_hit_suffix,
                                     self.tigrfam_top_hit_suffix,
                                     self.protein_file_suffix,
                                     self.pfam_hmm_dir, self.tigrfam_hmms,
                                     Config.BAC120_MARKERS,
                                     Config.AR122_MARKERS)
            user_msa = hmm_aligner.align_marker_set(cur_genome_files,
                                                    marker_set_id)

            # Write the individual marker alignments to disk
            if self.debug:
                self._write_individual_markers(user_msa, marker_set_id,
                                               marker_info_file, out_dir,
                                               prefix)

            # filter columns without sufficient representation across taxa
            if skip_trimming:
                self.logger.info(
                    'Skipping custom filtering and selection of columns.')
                pruned_seqs = {}
                trimmed_seqs = merge_two_dicts(gtdb_msa, user_msa)

            elif custom_msa_filters:
                aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)
                self.logger.info(
                    'Performing custom filtering and selection of columns.')

                trim_msa = TrimMSA(
                    cols_per_gene, min_perc_aa / 100.0, min_consensus / 100.0,
                    max_consensus / 100.0, min_per_taxa / 100.0, rnd_seed,
                    os.path.join(out_dir, 'filter_%s' % marker_set_id))

                trimmed_seqs, pruned_seqs = trim_msa.trim(
                    aligned_genomes, marker_info_file)

                if trimmed_seqs:
                    self.logger.info(
                        'Filtered MSA from {:,} to {:,} AAs.'.format(
                            len(list(aligned_genomes.values())[0]),
                            len(list(trimmed_seqs.values())[0])))

                self.logger.info(
                    'Filtered {:,} genomes with amino acids in <{:.1f}% of columns in filtered MSA.'
                    .format(len(pruned_seqs), min_perc_aa))

                filtered_user_genomes = set(pruned_seqs).intersection(user_msa)
                if len(filtered_user_genomes):
                    self.logger.info(
                        'Filtered genomes include {:.} user submitted genomes.'
                        .format(len(filtered_user_genomes)))
            else:
                self.logger.info(
                    f'Masking columns of {domain_str} multiple sequence alignment using canonical mask.'
                )
                trimmed_seqs, pruned_seqs = self._apply_mask(
                    gtdb_msa, user_msa, gtdb_msa_mask, min_perc_aa / 100.0)
                self.logger.info(
                    'Masked {} alignment from {:,} to {:,} AAs.'.format(
                        domain_str, len(list(user_msa.values())[0]),
                        len(list(trimmed_seqs.values())[0])))

                if min_perc_aa > 0:
                    self.logger.info(
                        '{:,} {} user genomes have amino acids in <{:.1f}% of columns in filtered MSA.'
                        .format(len(pruned_seqs), domain_str, min_perc_aa))

            # write out filtering information
            with open(marker_filtered_genomes, 'w') as fout:
                for pruned_seq_id, pruned_seq in pruned_seqs.items():
                    if len(pruned_seq) == 0:
                        perc_alignment = 0
                    else:
                        valid_bases = sum(
                            [1 for c in pruned_seq if c.isalpha()])
                        perc_alignment = valid_bases * 100.0 / len(pruned_seq)
                    fout.write(
                        '%s\t%s\n' %
                        (pruned_seq_id,
                         'Insufficient number of amino acids in MSA ({:.1f}%)'.
                         format(perc_alignment)))

            # write out MSAs
            if not skip_gtdb_refs:
                self.logger.info(
                    'Creating concatenated alignment for {:,} {} GTDB and user genomes.'
                    .format(len(trimmed_seqs), domain_str))
                self._write_msa(trimmed_seqs, marker_msa_path, gtdb_taxonomy)

            trimmed_user_msa = {
                k: v
                for k, v in trimmed_seqs.items() if k in user_msa
            }
            if len(trimmed_user_msa) > 0:
                self.logger.info(
                    'Creating concatenated alignment for {:,} {} user genomes.'
                    .format(len(trimmed_user_msa), domain_str))
                self._write_msa(trimmed_user_msa, marker_user_msa_path,
                                gtdb_taxonomy)
            else:
                self.logger.info(
                    f'All {domain_str} user genomes have been filtered out.')

            # Create symlinks to the summary files
            if marker_set_id == 'bac120':
                symlink_f(
                    PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix),
                    os.path.join(
                        out_dir,
                        os.path.basename(
                            PATH_BAC120_FILTERED_GENOMES.format(
                                prefix=prefix))))
                if len(trimmed_user_msa) > 0:
                    symlink_f(
                        PATH_BAC120_USER_MSA.format(prefix=prefix),
                        os.path.join(
                            out_dir,
                            os.path.basename(
                                PATH_BAC120_USER_MSA.format(prefix=prefix))))
                if not skip_gtdb_refs:
                    symlink_f(
                        PATH_BAC120_MSA.format(prefix=prefix),
                        os.path.join(
                            out_dir,
                            os.path.basename(
                                PATH_BAC120_MSA.format(prefix=prefix))))
            elif marker_set_id == 'ar122':
                symlink_f(
                    PATH_AR122_FILTERED_GENOMES.format(prefix=prefix),
                    os.path.join(
                        out_dir,
                        os.path.basename(
                            PATH_AR122_FILTERED_GENOMES.format(
                                prefix=prefix))))
                if len(trimmed_user_msa) > 0:
                    symlink_f(
                        PATH_AR122_USER_MSA.format(prefix=prefix),
                        os.path.join(
                            out_dir,
                            os.path.basename(
                                PATH_AR122_USER_MSA.format(prefix=prefix))))
                if not skip_gtdb_refs:
                    symlink_f(
                        PATH_AR122_MSA.format(prefix=prefix),
                        os.path.join(
                            out_dir,
                            os.path.basename(
                                PATH_AR122_MSA.format(prefix=prefix))))
            else:
                self.logger.error(
                    'There was an error determining the marker set.')
                raise GenomeMarkerSetUnknown
Ejemplo n.º 14
0
class TestClassify(unittest.TestCase):
    """Unit tests for the helper methods of ``Classify``.

    Every test works inside a randomly named directory below
    ``tests/data/results``; that whole results directory is removed again
    in ``tearDown``.
    """

    def setUp(self):
        """Build the ``Classify`` instance and the paths shared by the tests."""
        self.classify = Classify()

        self.generic_out_path = 'tests/data/results'
        tmp_folder = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(10))
        self.out_dir = os.path.join(self.generic_out_path, tmp_folder)
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.generic_out_path, exist_ok=True)
        self.prefix = 'gtdbtk'
        self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
        self.aln_dir_ref = 'tests/data/align_dir_reference/align'
        self.user_msa_file = os.path.join(self.aln_dir_ref,
                                          'gtdbtk.ar122.user_msa.fasta')
        self.taxonomy_file = Config.TAXONOMY_FILE
        self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

    def test_standardise_taxonomy(self):
        """Partial taxonomy strings are padded out to all seven ranks."""
        taxstring = 'p__phylum1;c_class1'
        marker_set = 'bac120'
        new_taxstring = self.classify.standardise_taxonomy(
            taxstring, marker_set)
        self.assertEqual(new_taxstring,
                         'd__Bacteria;p__phylum1;c_class1;o__;f__;g__;s__')

        # Test that the correct domain is returned.
        self.assertEqual(
            self.classify.standardise_taxonomy('p__P;c__C;o__O;f__F;g__G;s__S',
                                               'bac120'),
            'd__Bacteria;p__P;c__C;o__O;f__F;g__G;s__S')
        self.assertEqual(
            self.classify.standardise_taxonomy('p__P;c__C;o__O;f__F;g__G;s__S',
                                               'ar122'),
            'd__Archaea;p__P;c__C;o__O;f__F;g__G;s__S')

        # Remove trailing ranks one at a time and check that they are
        # re-added as empty placeholders.
        rank_order = {'p': 0, 'c': 1, 'o': 2, 'f': 3, 'g': 4, 's': 5}
        ranks = {'p': 'P', 'c': 'C', 'o': 'O', 'f': 'F', 'g': 'G', 's': 'S'}
        dom_info = {'d__Bacteria': 'bac120', 'd__Archaea': 'ar122'}

        # Rank prefixes from phylum to species, ordered by their rank index.
        ordered_ranks = sorted(rank_order, key=rank_order.get)

        for k in range(1, len(ranks) - 1):
            # Compare rank *prefixes* ('p', 'c', ...); comparing a prefix
            # against full 'p__P' style labels never matches and would leave
            # the input string empty on every iteration.
            ranks_selected = set(ordered_ranks[:-k])
            for cur_domain, cur_dom in dom_info.items():
                expected = []
                test_lst = []
                for cur_rank in ordered_ranks:
                    if cur_rank in ranks_selected:
                        test_lst.append(f'{cur_rank}__{ranks[cur_rank]}')
                        expected.append(f'{cur_rank}__{ranks[cur_rank]}')
                    else:
                        expected.append(f'{cur_rank}__')

                expected_str = f'{cur_domain};{";".join(expected)}'
                test_str = ";".join(test_lst)

                with self.subTest(domain=cur_domain, ranks_removed=k):
                    test_value = self.classify.standardise_taxonomy(
                        test_str, cur_dom)
                    self.assertEqual(expected_str, test_value)

    def test_write_red_dict(self):
        """The RED dictionary has six rank prefixes and ``d__`` maps to 0."""
        os.makedirs(self.out_dir, exist_ok=True)
        marker_dict = self.classify._write_red_dict(self.out_dir, self.prefix,
                                                    'bac120')
        self.assertEqual(len(marker_dict), 6)
        self.assertIn('d__', marker_dict)
        self.assertEqual(marker_dict.get('d__'), 0)
        for rank_prefix in ('p__', 'c__', 'o__', 'f__', 'g__'):
            self.assertIn(rank_prefix, marker_dict)

    def test_get_pplacer_taxonomy(self):
        """The pplacer classification file lists the three test genomes."""
        os.makedirs(self.out_dir, exist_ok=True)
        tree = dendropy.Tree.get_from_path(os.path.join(
            os.getcwd(), self.pplacer_dir_reference,
            'gtdbtk.ar122.classify.tree'),
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)
        self.classify._get_pplacer_taxonomy(self.out_dir, self.prefix, 'ar122',
                                            self.user_msa_file, tree)
        results = {}

        # The output is tab separated: genome id, then taxonomy string.
        with open(
                os.path.join(
                    self.out_dir,
                    PATH_AR122_PPLACER_CLASS.format(prefix=self.prefix)),
                'r') as f:
            for line in f:
                infos = line.strip().split('\t')
                results[infos[0]] = infos[1]
        self.assertEqual(len(results), 3)
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, results)
        self.assertEqual(
            results.get('genome_1'),
            'd__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter;s__'
        )

    def test_place_genomes(self):
        """The tree written by ``place_genomes`` ends at the Archaea root."""
        os.makedirs(self.out_dir, exist_ok=True)
        tree_file = self.classify.place_genomes(self.user_msa_file, 'ar122',
                                                self.out_dir, self.prefix)
        with open(tree_file, 'r') as f:
            lines = f.read().splitlines()
            last_line = lines[-1]
        self.assertTrue(last_line.startswith('('))
        self.assertTrue(last_line.endswith('d__Archaea;'))

    def test_formatnote(self):
        """Genomes already in ``labels`` are left out of the formatted notes."""
        first3genomes = list(self.gtdb_taxonomy.keys())[:3]
        sorted_dict = ((first3genomes[0], {
            'ani': 98.5,
            'af': 1.0
        }), (first3genomes[1], {
            'ani': 92.6,
            'af': 1.0
        }), (first3genomes[2], {
            'ani': 90.3,
            'af': 1.3
        }))
        labels = [first3genomes[0]]
        note_list = self.classify._formatnote(sorted_dict, labels)
        self.assertIn(first3genomes[1], note_list[0])
        self.assertIn(first3genomes[2], note_list[1])
        self.assertTrue(note_list[0].endswith(', 92.6, 1.0'))
        self.assertTrue(note_list[1].endswith(', 90.3, 1.3'))

    def test_calculate_red_distances(self):
        """Mean edge length of the RED-scaled reference tree stays below 0.1."""
        tree = os.path.join(self.pplacer_dir_reference,
                            'gtdbtk.ar122.classify.tree')
        result_tree = self.classify._calculate_red_distances(
            tree, self.out_dir)
        egs2 = [
            eg.length for eg in result_tree.postorder_edge_iter()
            if eg.length is not None
        ]

        self.assertLess(sum(egs2) / len(egs2), 0.1)

    def tearDown(self):
        """Delete the shared results directory created in ``setUp``."""
        shutil.rmtree(self.generic_out_path)
Ejemplo n.º 15
0
class TestClassify(unittest.TestCase):
    """Unit tests for the helper methods of ``Classify``.

    Every test works inside a randomly named directory below
    ``tests/data/results``; that whole results directory is removed again
    in ``tearDown``.
    """

    def setUp(self):
        """Build the ``Classify`` instance and the paths shared by the tests."""
        self.classify = Classify()

        self.generic_out_path = 'tests/data/results'
        tmp_folder = ''.join(random.choice(
            string.ascii_uppercase + string.digits) for _ in range(10))
        self.out_dir = os.path.join(self.generic_out_path, tmp_folder)
        if not os.path.exists(self.generic_out_path):
            os.makedirs(self.generic_out_path)
        self.prefix = 'gtdbtk'
        self.pplacer_dir_reference = 'tests/data/pplacer_dir_reference'
        self.user_msa_file = os.path.join(
            self.pplacer_dir_reference, 'gtdbtk.ar122.user_msa.fasta')
        self.taxonomy_file = Config.TAXONOMY_FILE
        self.gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

    def test_standardise_taxonomy(self):
        """A partial taxonomy string is padded out to all seven ranks."""
        taxstring = 'p__phylum1;c_class1'
        marker_set = 'bac120'
        new_taxstring = self.classify.standardise_taxonomy(
            taxstring, marker_set)
        self.assertEqual(
            new_taxstring, 'd__Bacteria;p__phylum1;c_class1;o__;f__;g__;s__')

    def test_write_red_dict(self):
        """The RED dictionary has six rank prefixes and ``d__`` maps to 0."""
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        marker_dict = self.classify._write_red_dict(
            self.out_dir, self.prefix, 'bac120')
        self.assertEqual(len(marker_dict), 6)
        self.assertIn('d__', marker_dict)
        self.assertEqual(marker_dict.get('d__'), 0)
        for rank_prefix in ('p__', 'c__', 'o__', 'f__', 'g__'):
            self.assertIn(rank_prefix, marker_dict)

    def test_get_pplacer_taxonomy(self):
        """The pplacer classification file lists the three test genomes."""
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        tree = dendropy.Tree.get_from_path(os.path.join(os.getcwd(), self.pplacer_dir_reference,
                                                        'gtdbtk.ar122.classify.tree'),
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)
        self.classify._get_pplacer_taxonomy(
            self.out_dir, self.prefix, 'ar122', self.user_msa_file, tree)
        results = {}

        # The output is tab separated: genome id, then taxonomy string.
        with open(os.path.join(self.out_dir, PATH_AR122_PPLACER_CLASS.format(prefix=self.prefix)), 'r') as f:
            for line in f:
                infos = line.strip().split('\t')
                results[infos[0]] = infos[1]
        self.assertEqual(len(results), 3)
        for genome_id in ('genome_1', 'genome_2', 'genome_3'):
            self.assertIn(genome_id, results)
        self.assertEqual(results.get(
            'genome_1'), 'd__Archaea;p__Thermoplasmatota;c__MGII;o__MGIII;f__CG-Epi1;g__UBA8886;s__')

    def test_place_genomes(self):
        """The tree written by ``place_genomes`` ends at the Archaea root."""
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        tree_file = self.classify.place_genomes(
            self.user_msa_file, 'ar122', self.out_dir, self.prefix)
        with open(tree_file, 'r') as f:
            lines = f.read().splitlines()
            last_line = lines[-1]
        self.assertTrue(last_line.startswith('('))
        self.assertTrue(last_line.endswith('d__Archaea;'))

    def test_formatnote(self):
        """Genomes already in ``labels`` are left out of the formatted notes."""
        # dict.keys() is a non-subscriptable view on Python 3, so it must be
        # materialised as a list before slicing.
        first3genomes = list(self.gtdb_taxonomy.keys())[:3]
        sorted_dict = ((first3genomes[0], {'ani': 98.5, 'af': 1.0}),
                       (first3genomes[1], {'ani': 92.6, 'af': 1.0}),
                       (first3genomes[2], {'ani': 90.3, 'af': 1.3}))
        labels = [first3genomes[0]]
        note_list = self.classify._formatnote(sorted_dict, labels)
        self.assertIn(first3genomes[1], note_list[0])
        self.assertIn(first3genomes[2], note_list[1])
        self.assertTrue(note_list[0].endswith(', 92.6, 1.0'))
        self.assertTrue(note_list[1].endswith(', 90.3, 1.3'))

    def test_calculate_red_distances(self):
        """Mean edge length of the RED-scaled reference tree stays below 0.1."""
        tree = os.path.join(self.pplacer_dir_reference,
                            'gtdbtk.ar122.classify.tree')
        result_tree = self.classify._calculate_red_distances(
            tree, self.out_dir)
        egs2 = [eg.length for eg in result_tree.postorder_edge_iter()
                if eg.length is not None]

        self.assertLess(sum(egs2) / len(egs2), 0.1)

    def tearDown(self):
        """Delete the shared results directory created in ``setUp``."""
        shutil.rmtree(self.generic_out_path)
Ejemplo n.º 16
0
    def _fmeasure(self, tree, taxonomy):
        """Locate the best-scoring node(s) for every named taxon.

        The F-measure of each taxon is evaluated at the nodes below the
        placement of its closest named parent taxon (or over the whole tree
        when that placement holds too few of the taxon's genomes), and the
        node(s) with the highest value are kept.

        As a side effect every node of ``tree`` is annotated with two
        attributes: ``num_leaves`` and ``taxa_count``.

        Parameters
        ----------
        tree : Tree
          Dendropy Tree.
        taxonomy : d[extant_taxon_id] -> taxa list
          Taxon labels for extant taxa.

        Returns
        -------
        d[taxon] -> [StatsTable, ...]
            One StatsTable (node, F-measure, precision, recall, ...) per node
            tied for the highest F-measure of that taxon.
        """

        # named lineages/taxa present at each taxonomic rank
        taxa_at_rank = Taxonomy().named_lineages_at_rank(taxonomy)

        # extant taxa carrying each taxon label, keyed by rank index
        extant_by_rank = {
            idx: Taxonomy().extant_taxa_for_rank(label, taxonomy)
            for idx, label in enumerate(Taxonomy.rank_labels)
        }

        # parent taxa of every taxon
        taxon_parents = Taxonomy().parents(taxonomy)

        # annotate each node with its leaf count and per-rank taxon tallies
        self.logger.info('Calculating taxa within each lineage.')
        for node in tree.preorder_node_iter():
            leaf_total = 0
            tallies = defaultdict(lambda: defaultdict(int))
            for leaf in node.leaf_iter():
                leaf_total += 1
                for idx, name in enumerate(taxonomy[leaf.taxon.label]):
                    # a bare rank prefix means the rank is unnamed
                    if name == Taxonomy.rank_prefixes[idx]:
                        continue
                    tallies[idx][name] += 1

            node.num_leaves = leaf_total
            node.taxa_count = tallies

        # number of leaves in the whole tree carrying each taxon label
        taxa_in_tree = defaultdict(int)
        for leaf in tree.leaf_node_iter():
            for name in taxonomy[leaf.taxon.label]:
                taxa_in_tree[name] += 1

        # search for the highest F-measure node(s) of each taxon, rank by rank
        best_for_taxon = {}
        for rank_idx, rank_label in enumerate(Taxonomy.rank_labels):
            self.logger.info('Processing {:,} taxa at {} rank.'.format(
                len(taxa_at_rank[rank_idx]), rank_label.capitalize()))

            for taxon in taxa_at_rank[rank_idx]:
                if rank_idx == 0:
                    # domains have no parent: search from the root
                    search_root = tree.seed_node
                else:
                    # walk up to the closest named ancestor, e.g.
                    # Cyanobacteria for Synechococcales in
                    # d__Bacteria;p__Cyanobacteria;c__;o__Synechococcales
                    named_parent = 'x__'
                    ancestor_idx = rank_idx - 1
                    while len(named_parent) == 3 and ancestor_idx != -1:
                        named_parent = taxon_parents[taxon][ancestor_idx]
                        ancestor_idx -= 1

                    if named_parent not in best_for_taxon:
                        # parent was never placed, so this taxon is skipped
                        # (e.g., bacterial phylum in archaeal tree)
                        continue

                    # restrict the search to the parent's lineage; when the
                    # parent has several tied placements use their MRCA
                    placements = [
                        entry.node for entry in best_for_taxon[named_parent]
                    ]
                    if len(placements) == 1:
                        search_root = placements[0]
                    else:
                        descendant_taxa = [
                            leaf.taxon for placed in placements
                            for leaf in placed.leaf_iter()
                        ]
                        search_root = tree.mrca(taxa=descendant_taxa)

                    covered = search_root.taxa_count[rank_idx][taxon]
                    if covered < 0.5 * taxa_in_tree[taxon]:
                        # too many genomes of this taxon sit outside the
                        # parent lineage: fall back to the entire tree
                        search_root = tree.seed_node

                top_score = -1
                members = set(extant_by_rank[rank_idx][taxon])
                member_total = len(members)

                for node in search_root.preorder_iter():
                    in_lineage = node.taxa_count[rank_idx][taxon]
                    labelled_leaves = sum(node.taxa_count[rank_idx].values())

                    if in_lineage == 0 or labelled_leaves == 0:
                        continue

                    precision = float(in_lineage) / labelled_leaves
                    recall = float(in_lineage) / member_total
                    score = (2 * precision * recall) / (precision + recall)

                    if score >= top_score:
                        leaf_labels = {
                            leaf.taxon.label for leaf in node.leaf_iter()
                        }
                        # members of the taxon that fall outside this node
                        missing = members - leaf_labels
                        # genomes under this node named as a different taxon
                        # at this rank (unnamed ones are not counted)
                        intruders = [
                            gid for gid in leaf_labels - members
                            if taxonomy[gid][rank_idx] !=
                            Taxonomy.rank_prefixes[rank_idx]
                        ]

                        entry = self.StatsTable(
                            node=node,
                            fmeasure=score,
                            precision=precision,
                            recall=recall,
                            taxa_in_lineage=in_lineage,
                            total_taxa=member_total,
                            num_leaves_with_taxa=labelled_leaves,
                            rogue_out=missing,
                            rogue_in=intruders)

                        if score > top_score:
                            # strictly better: restart the list of winners
                            top_score = score
                            best_for_taxon[taxon] = [entry]
                        elif score == top_score:
                            # tie: keep every equally good node
                            best_for_taxon[taxon].append(entry)

        return best_for_taxon