Beispiel #1
0
    def _read_taxonomy_files(self, options) -> Dict[str, Tuple[str, str, str, str, str, str, str]]:
        """Read and merge taxonomy files."""

        self.logger.info('Reading GTDB taxonomy for representative genomes.')
        taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        if options.gtdbtk_classification_file:
            # add and overwrite taxonomy for genomes specified in the
            # GTDB-Tk classification file
            check_file_exists(options.gtdbtk_classification_file)

            self.logger.info('Reading GTDB-Tk classification file.')
            gtdbtk_taxonomy = Taxonomy().read(options.gtdbtk_classification_file)
            del gtdbtk_taxonomy['user_genome']
            num_reassigned = 0
            for gid, taxa in gtdbtk_taxonomy.items():
                if gid in taxonomy:
                    num_reassigned += 1
                taxonomy[gid] = taxa

            self.logger.info(f'Read GTDB-Tk classifications for {len(gtdbtk_taxonomy):,} genomes.')
            self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')

        if options.custom_taxonomy_file:
            # add and overwrite taxonomy for genomes specified in the
            # custom taxonomy file
            check_file_exists(options.custom_taxonomy_file)

            self.logger.info('Reading custom taxonomy file.')
            custom_taxonomy = Taxonomy().read(options.custom_taxonomy_file)
            num_reassigned = 0
            for gid, taxa in custom_taxonomy.items():
                if gid in taxonomy:
                    num_reassigned += 1
                taxonomy[gid] = taxa

            self.logger.info(f'Read custom taxonomy for {len(custom_taxonomy):,} genomes.')
            self.logger.info(f'Reassigned taxonomy for {num_reassigned:,} GTDB representative genomes.')

        if options.gtdbtk_classification_file and options.custom_taxonomy_file:
            dup_genomes = set(gtdbtk_taxonomy).intersection(custom_taxonomy)
            if len(dup_genomes) > 0:
                self.logger.error('GTDB-Tk classification and custom taxonomy '
                                  'files must not specify taxonomies for the '
                                  'same genomes.')
                self.logger.error('These files have {:,} genomes in common.'.format(len(dup_genomes)))
                self.logger.error('Example duplicate genome: {}'.format(dup_genomes.pop()))
                raise GTDBTkExit('Duplicated taxonomy information.')

        self.logger.info(f'Read taxonomy for {len(taxonomy):,} genomes.')

        return taxonomy
Beispiel #2
0
    def root(self, options):
        """Root tree using outgroup.

        Parameters
        ----------
        options : argparse.Namespace
            The CLI arguments input by the user.
        """
        self.logger.warning("Tree rooting is still under development!")

        check_file_exists(options.input_tree)

        if options.custom_taxonomy_file:
            check_file_exists(options.custom_taxonomy_file)
            taxonomy = Taxonomy().read(options.custom_taxonomy_file)
        else:
            taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

        self.logger.info('Identifying genomes from the specified outgroup.')
        outgroup = set()
        for genome_id, taxa in taxonomy.items():
            if options.outgroup_taxon in taxa:
                outgroup.add(genome_id)

        reroot = RerootTree()
        reroot.root_with_outgroup(options.input_tree, options.output_tree,
                                  outgroup)

        # Symlink to the tree summary file, if not run independently
        if hasattr(options, 'suffix'):
            if options.suffix == 'bac120':
                symlink_f(
                    PATH_BAC120_ROOTED_TREE.format(prefix=options.prefix),
                    os.path.join(
                        options.out_dir,
                        os.path.basename(
                            PATH_AR122_ROOTED_TREE.format(
                                prefix=options.prefix))))
            elif options.suffix == 'ar122':
                symlink_f(
                    PATH_AR122_ROOTED_TREE.format(prefix=options.prefix),
                    os.path.join(
                        options.out_dir,
                        os.path.basename(
                            PATH_AR122_ROOTED_TREE.format(
                                prefix=options.prefix))))
            else:
                raise GenomeMarkerSetUnknown(
                    'There was an error determining the marker set.')

        self.logger.info('Done.')