Example #1
    def run(self, gtdb_clusters_file,
                    cur_gtdb_metadata_file,
                    uba_genome_paths,
                    qc_passed_file,
                    ncbi_misclassified_file,
                    ncbi_genbank_assembly_file,
                    untrustworthy_type_file,
                    ani_af_rep_vs_nonrep,
                    gtdb_type_strains_ledger,
                    sp_priority_ledger,
                    genus_priority_ledger,
                    dsmz_bacnames_file):
        """Cluster genomes to selected GTDB representatives."""
        
        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=uba_genome_paths,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')
        
        # read named GTDB species clusters
        self.logger.info('Reading named and previous placeholder GTDB species clusters.')
        cur_clusters, rep_radius = read_clusters(gtdb_clusters_file)
        self.logger.info(' ... identified {:,} clusters spanning {:,} genomes.'.format(
                            len(cur_clusters),
                            sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # identify genomes with misclassified species assignments at NCBI
        self.logger.info('Identify genomes with misclassified NCBI species assignments.')
        ncbi_species_mngr = NCBI_SpeciesManager(cur_genomes, cur_clusters, self.output_dir)
        ncbi_misclassified_gids = ncbi_species_mngr.parse_ncbi_misclassified_table(ncbi_misclassified_file)
        self.logger.info(' - identified {:,} genomes with erroneous NCBI species assignments'.format(
                            len(ncbi_misclassified_gids)))
                            
        # identify NCBI species considered to be synonyms under the GTDB
        type_strain_synonyms = ncbi_species_mngr.identify_type_strain_synonyms(ncbi_misclassified_gids)
        consensus_synonyms = ncbi_species_mngr.identify_consensus_synonyms(ncbi_misclassified_gids)

        # read ANI and AF between representatives and non-representative genomes
        self.logger.info('Reading ANI and AF between representative and non-representative genomes.')
        ani_af = pickle.load(open(ani_af_rep_vs_nonrep, 'rb'))
        
        # write out synonyms
        ncbi_species_mngr.write_synonym_table(type_strain_synonyms,
                                                consensus_synonyms,
                                                ani_af,
                                                sp_priority_ledger,
                                                genus_priority_ledger,
                                                dsmz_bacnames_file)
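These examples repeatedly call `read_clusters`, which returns a mapping from each representative genome ID to the set of genomes clustered with it (hence the `len(gids) + 1` when counting genomes, to include the representative) together with a per-representative ANI radius. Below is a minimal illustrative parser under that assumption; the column names are hypothetical and not taken from the GTDB code base:

import csv

def read_clusters_sketch(clusters_file):
    """Illustrative sketch: representative ID -> clustered genome IDs, plus ANI radius."""
    clusters = {}
    rep_radius = {}
    with open(clusters_file) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            rid = row['Representative']                 # assumed column name
            cids = row['Clustered genomes']             # assumed column name
            clusters[rid] = set(cids.split(',')) if cids else set()
            rep_radius[rid] = float(row['ANI radius'])  # assumed column name
    return clusters, rep_radius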
    def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
            cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file):
        """Perform initial actions required for changed representatives."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file,
                                             uba_genome_file=uba_genome_paths)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            create_sp_clusters=False,
                                            uba_genome_file=uba_genome_paths,
                                            qc_passed_file=qc_passed_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # read named GTDB species clusters
        self.logger.info('Reading GTDB species clusters.')
        cur_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # create curation tree and table indicating new NCBI taxa as these
        # should be considered by GTDB curators
        self.new_ncbi_taxa(prev_genomes, cur_genomes, cur_clusters)
Example #3
    def type_status(self,
                    cur_gtdb_metadata_file,
                    qc_passed_file,
                    ncbi_genbank_assembly_file,
                    untrustworthy_type_file,
                    gtdb_type_strains_ledger,
                    ncbi_env_bioproject_ledger,
                    genome_ids):
        """Report information related to a genome being type material."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                            create_sp_clusters=False,
                                            qc_passed_file=qc_passed_file,
                                            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                            untrustworthy_type_ledger=untrustworthy_type_file,
                                            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            f' - current genome set contains {len(cur_genomes):,} genomes.')

        # report information
        pt = PrettyTable()
        pt.field_names = ['Genome ID', 'GTDB representative', 'GTDB type strain', 'GTDB untrustworthy as type',
                          'NCBI type strain', 'NCBI untrustworthy as type', 'GTDB species', 'NCBI species', 'NCBI strain IDs']
        for gid in genome_ids:
            gid = canonical_gid(gid)
            if gid not in cur_genomes:
                self.logger.warning(f'Genome {gid} not in current genome set.')
                continue

            pt.add_row([gid,
                        cur_genomes[gid].is_gtdb_sp_rep(),
                        cur_genomes[gid].is_gtdb_type_strain(),
                        cur_genomes[gid].is_gtdb_untrustworthy_as_type(),
                        cur_genomes[gid].is_ncbi_type_strain(),
                        cur_genomes[gid].is_ncbi_untrustworthy_as_type(),
                        cur_genomes[gid].gtdb_taxa.species,
                        cur_genomes[gid].ncbi_taxa.species,
                        cur_genomes[gid].ncbi_strain_identifiers])

        print(pt)
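`type_status` normalizes user-supplied genome IDs with `canonical_gid` before looking them up. A rough sketch of what such a normalization might do, assuming it strips GTDB-style `RS_`/`GB_` prefixes and the assembly version (an illustration, not the library's actual implementation):

def canonical_gid_sketch(gid):
    """Illustrative only: normalize an assembly accession to a canonical genome ID."""
    gid = gid.strip()
    if gid.startswith(('RS_', 'GB_')):        # GTDB-style source prefixes (assumed)
        gid = gid[3:]
    if gid.startswith(('GCF_', 'GCA_')):
        gid = 'G' + gid[4:].split('.', 1)[0]  # drop accession prefix and version suffix
    return gid

# e.g. canonical_gid_sketch('RS_GCF_000005845.2') -> 'G000005845'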
    def run(self, gtdb_metadata_file, genomic_path_file):
        """Dereplicate GTDB species clusters using ANI/AF criteria."""

        # create GTDB genome sets
        self.logger.info('Creating GTDB genome set.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file)
        genomes.load_genomic_file_paths(genomic_path_file)
        self.logger.info(
            ' - genome set has {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # dereplicate each species cluster
        self.logger.info(
            'Performing dereplication with ANI={:.1f}, AF={:.2f}, Mash ANI={:.2f}, max genomes={:,}.'
            .format(self.derep_ani, self.derep_af, self.min_mash_intra_sp_ani,
                    self.max_genomes_per_sp))
        derep_genomes = self.derep_sp_clusters(genomes)

        # write out `subspecies` clusters
        out_file = os.path.join(self.output_dir, 'subsp_clusters.tsv')
        fout = open(out_file, 'w')
        fout.write(
            'Genome ID\tGTDB Species\tGTDB Taxonomy\tPriority score\tNo. clustered genomes\tClustered genomes\n'
        )
        for species, subsp_clusters in derep_genomes.items():
            for rid, cids in subsp_clusters.items():
                assert species == genomes[rid].gtdb_taxa.species
                fout.write('{}\t{}\t{}\t{:.3f}\t{}\t{}\n'.format(
                    rid, genomes[rid].gtdb_taxa.species,
                    genomes[rid].gtdb_taxa, self.priority_score(rid, genomes),
                    len(cids), ','.join(cids)))
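The `subsp_clusters.tsv` file written above can be post-processed with a few lines of standard-library code. A short sketch assuming the six columns written by this method:

import csv
from collections import defaultdict

def read_subsp_clusters(path):
    """Illustrative reader for the subsp_clusters.tsv file written above."""
    clusters = defaultdict(dict)
    with open(path) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            cids = row['Clustered genomes']
            clusters[row['GTDB Species']][row['Genome ID']] = cids.split(',') if cids else []
    return clusters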
    def run(self, gtdb_clusters_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            gtdb_type_strains_ledger, sp_priority_ledger,
            genus_priority_ledger, dsmz_bacnames_file):
        """Cluster genomes to selected GTDB representatives."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to previous and current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # read named GTDB species clusters
        self.logger.info(
            'Reading named and previous placeholder GTDB species clusters.')
        cur_clusters, rep_radius = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # identify genomes with erroneous NCBI species assignments
        self.logger.info(
            'Identifying genomes with erroneous NCBI species assignments as established by ANI type strain genomes.'
        )
        self.identify_misclassified_genomes_ani(cur_genomes, cur_clusters)

        self.logger.info(
            'Identifying genomes with erroneous NCBI species assignments as established by GTDB cluster of type strain genomes.'
        )
        self.identify_misclassified_genomes_cluster(cur_genomes, cur_clusters)
Example #6
    def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
            cur_gtdb_metadata_file, qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger):
        """Perform initial actions required for changed representatives."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            ' - previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # read named GTDB species clusters
        self.logger.info('Reading GTDB species clusters.')
        cur_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' - identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # create curation tree and table indicating new NCBI taxa as these
        # should be considered by GTDB curators
        self.new_ncbi_taxa(prev_genomes, cur_genomes, cur_clusters)
    def run(self, gtdb_clusters_file, gtdb_metadata_file, genomic_path_file,
            uba_gid_table):
        """Dereplicate GTDB species clusters using ANI/AF criteria."""

        # map user IDs to UBA IDs
        with open(uba_gid_table) as f:
            for line in f:
                tokens = line.strip().split('\t')

                if len(tokens) == 3:
                    self.user_id_map[tokens[0]] = tokens[2]
                else:
                    self.user_id_map[tokens[0]] = tokens[1]

        # create previous and current GTDB genome sets
        self.logger.info('Creating GTDB genome set.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file,
                                        uba_genome_file=uba_gid_table)
        genomes.load_genomic_file_paths(genomic_path_file)
        self.logger.info(
            ' - genome set has {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # dereplicate each species cluster
        self.logger.info(
            'Performing dereplication with ANI={:.1f}, AF={:.2f}, Mash ANI={:.2f}, max genomes={:,}.'
            .format(self.derep_ani, self.derep_af, self.min_mash_intra_sp_ani,
                    self.max_genomes_per_sp))
        derep_genomes = self.derep_sp_clusters(genomes)

        # write out `subspecies` clusters
        out_file = os.path.join(self.output_dir, 'subsp_clusters.tsv')
        fout = open(out_file, 'w')
        fout.write(
            'Genome ID\tGTDB Species\tGTDB Taxonomy\tPriority score\tNo. clustered genomes\tClustered genomes\n'
        )
        for species, subsp_clusters in derep_genomes.items():
            for rid, cids in subsp_clusters.items():
                assert species == genomes[rid].gtdb_taxa.species
                fout.write('{}\t{}\t{}\t{:.3f}\t{}\t{}\n'.format(
                    rid, genomes[rid].gtdb_taxa.species,
                    genomes[rid].gtdb_taxa, self.priority_score(rid, genomes),
                    len(cids), ','.join(cids)))
    def run(self, target_genus, gtdb_metadata_file, genomic_path_file):
        """Dereplicate GTDB species clusters using ANI/AF criteria."""

        # create GTDB genome sets
        self.logger.info('Creating GTDB genome set.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file)
        genomes.load_genomic_file_paths(genomic_path_file)
        self.logger.info(
            ' - genome set has {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # identify GTDB representatives from target genus
        self.logger.info('Identifying GTDB representatives from target genus.')
        target_gids = set()
        for gid in genomes:
            if genomes[gid].is_gtdb_sp_rep(
            ) and genomes[gid].gtdb_taxa.genus == target_genus:
                target_gids.add(gid)
        self.logger.info(' - identified {:,} genomes.'.format(
            len(target_gids)))

        # calculate FastANI ANI/AF between target genomes
        self.logger.info('Calculating pairwise ANI between target genomes.')
        ani_af = self.fastani.pairwise(target_gids,
                                       genomes.genomic_files,
                                       check_cache=True)
        self.fastani.write_cache(silence=True)

        # write out results
        genus_label = target_genus.replace('g__', '').lower()
        fout = open(
            os.path.join(self.output_dir,
                         '{}_rep_ani.tsv'.format(genus_label)), 'w')
        fout.write(
            'Query ID\tQuery species\tTarget ID\tTarget species\tANI\tAF\n')
        for qid in target_gids:
            for rid in target_gids:
                ani, af = FastANI.symmetric_ani(ani_af, qid, rid)

                fout.write('{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\n'.format(
                    qid, genomes[qid].gtdb_taxa.species, rid,
                    genomes[rid].gtdb_taxa.species, ani, af))
        fout.close()
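`FastANI.symmetric_ani` collapses the two directional comparisons between a genome pair into a single (ANI, AF) value. A plausible sketch, assuming `ani_af` is the nested dict of (ANI, AF) tuples produced by the pairwise calls above and that the direction with the higher ANI is reported (an assumption, not the library's confirmed rule):

def symmetric_ani_sketch(ani_af, gid1, gid2):
    """Illustrative: return the (ANI, AF) of the higher-ANI direction between two genomes."""
    ani1, af1 = ani_af.get(gid1, {}).get(gid2, (0.0, 0.0))
    ani2, af2 = ani_af.get(gid2, {}).get(gid1, (0.0, 0.0))
    return (ani1, af1) if ani1 >= ani2 else (ani2, af2)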
    def run(self, gtdb_metadata_file, genome_path_file, species1, species2):
        """Produce information relevant to merging two sister species."""

        # read GTDB species clusters
        self.logger.info('Reading GTDB species clusters.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file)
        genomes.load_genomic_file_paths(genome_path_file)
        self.logger.info(
            ' - identified {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # find species of interest
        gid1 = None
        gid2 = None
        for gid, species in genomes.sp_clusters.species():
            if species == species1:
                gid1 = gid
            elif species == species2:
                gid2 = gid

        if gid1 is None:
            self.logger.error(
                f'Unable to find representative genome for {species1}.')
            sys.exit(-1)

        if gid2 is None:
            self.logger.error(
                f'Unable to find representative genome for {species2}.')
            sys.exit(-1)

        self.logger.info(' - identified {:,} genomes in {}.'.format(
            len(genomes.sp_clusters[gid1]), species1))
        self.logger.info(' - identified {:,} genomes in {}.'.format(
            len(genomes.sp_clusters[gid2]), species2))

        # calculate ANI between all genomes in the genus
        genus1 = genomes[gid1].gtdb_genus
        genus2 = genomes[gid2].gtdb_genus
        if genus1 != genus2:
            self.logger.error(
                f'Genomes must be from same genus: {genus1} {genus2}')
            sys.exit(-1)

        self.logger.info(f'Identifying {genus1} species representatives.')
        reps_in_genera = set()
        for rid in genomes.sp_clusters:
            if genomes[rid].gtdb_genus == genus1:
                reps_in_genera.add(rid)

        self.logger.info(
            f' - identified {len(reps_in_genera):,} representatives.')

        # calculate ANI between genomes
        self.logger.info(f'Calculating ANI to {species1}.')
        gid_pairs = []
        for gid in reps_in_genera:
            if gid != gid1:
                gid_pairs.append((gid1, gid))
                gid_pairs.append((gid, gid1))
        ani_af1 = self.fastani.pairs(gid_pairs, genomes.genomic_files)

        self.logger.info(f'Calculating ANI to {species2}.')
        gid_pairs = []
        for gid in reps_in_genera:
            if gid != gid2:
                gid_pairs.append((gid2, gid))
                gid_pairs.append((gid, gid2))
        ani_af2 = self.fastani.pairs(gid_pairs, genomes.genomic_files)

        # report results
        ani12, af12 = ani_af1[gid1][gid2]
        ani21, af21 = ani_af2[gid2][gid1]
        ani, af = FastANI.symmetric_ani(ani_af1, gid1, gid2)

        self.logger.info(
            f'{species1} ({gid1}) -> {species2} ({gid2}): ANI={ani12:.1f}%, AF={af12:.2f}'
        )
        self.logger.info(
            f'{species2} ({gid2}) -> {species1} ({gid1}): ANI={ani21:.1f}%, AF={af21:.2f}'
        )
        self.logger.info(f'Max. ANI={ani:.1f}%, Max. AF={af:.2f}')

        # report top hits
        self.top_hits(species1, gid1, ani_af1, genomes)
        self.top_hits(species2, gid2, ani_af2, genomes)

        # calculate ANI from species to all genomes in merged species cluster
        merged_sp_cluster = genomes.sp_clusters[gid1].union(
            genomes.sp_clusters[gid2])
        self.merge_ani_radius(species1, gid1, merged_sp_cluster,
                              genomes.genomic_files)
        self.merge_ani_radius(species2, gid2, merged_sp_cluster,
                              genomes.genomic_files)
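`merge_ani_radius` is not shown in these examples; conceptually it reports what the ANI radius of a representative would be if the two species clusters were merged, i.e. the lowest ANI from that representative to any genome in the merged cluster. A minimal sketch under that assumption, reusing the pairs-style FastANI call seen above and the hypothetical symmetric-ANI helper sketched after the previous example:

def merge_ani_radius_sketch(fastani, rid, merged_cluster, genomic_files):
    """Illustrative: minimum ANI/AF from a representative to all genomes in a merged cluster."""
    gid_pairs = [(rid, gid) for gid in merged_cluster if gid != rid]
    gid_pairs += [(gid, rid) for gid in merged_cluster if gid != rid]
    ani_af = fastani.pairs(gid_pairs, genomic_files)

    min_ani, min_af = 100.0, 1.0
    for gid in merged_cluster:
        if gid == rid:
            continue
        ani, af = symmetric_ani_sketch(ani_af, rid, gid)  # hypothetical helper sketched earlier
        if ani < min_ani:
            min_ani, min_af = ani, af
    return min_ani, min_af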
Example #10
    def run(self, updated_sp_rep_file, gtdb_clusters_file,
            prev_gtdb_metadata_file, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger):
        """Summary statistics indicating changes to GTDB species clusters."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # update current genomes with GTDB-Tk classifications
        self.logger.info(
            'Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(
            gtdbtk_classify_file, prev_genomes)
        self.logger.info(
            f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.'
        )

        # report changes in genome sets
        self.logger.info('Comparing previous and current genome sets.')
        prev_gids = set(prev_genomes)
        new_gids = set(cur_genomes)
        num_same_genomes = len(prev_gids.intersection(new_gids))
        num_lost_genomes = len(prev_gids - new_gids)
        num_new_genomes = len(new_gids - prev_gids)
        self.logger.info(
            f' ... identified {num_same_genomes:,} genomes as being present in both genome sets.'
        )
        self.logger.info(
            f' ... identified {num_lost_genomes:,} genomes as being lost from the previous genome set.'
        )
        self.logger.info(
            f' ... identified {num_new_genomes:,} genomes as being new to the current genome set.'
        )

        # get changes to representatives of previous GTDB species clusters
        updated_rids = self._parse_updated_sp_reps(updated_sp_rep_file)

        # get new GTDB species clusters
        self.logger.info('Reading current GTDB clusters.')
        new_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... current genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(new_clusters),
                    sum(len(cids) for cids in new_clusters.values())))

        new_rid_map = {}
        for rid, cids in new_clusters.items():
            for cid in cids:
                new_rid_map[cid] = rid

        # UBA genome sanity check
        prev_uba_count = 0
        for gid in prev_genomes:
            if gid.startswith('UBA'):
                prev_uba_count += 1

        cur_uba_count = 0
        for gid in cur_genomes:
            if gid.startswith('UBA'):
                cur_uba_count += 1

        new_uba_count = 0
        for rid, cids in new_clusters.items():
            for cid in cids:
                if cid.startswith('UBA'):
                    new_uba_count += 1

        self.logger.info(
            f'Verified all genome / cluster sets contain the same number of UBA genomes: {prev_uba_count:,}'
        )
        assert prev_uba_count == cur_uba_count == new_uba_count

        # tabulate changes in GTDB species clusters
        self.logger.info('Calculating statistics of GTDB species clusters.')

        fout = open(
            os.path.join(self.output_dir, 'gtdb_sp_clusters_change_stats.tsv'),
            'w')
        fout.write(
            'Previous representative\tPrevious name\tNew representative\tNew name\tRepresentative status\tName status'
        )
        fout.write(
            '\tNo. previous genomes\tNo. current genomes\tNo. same\tNo. lost\tNo. new\tNo. migrated in\tNo. migrated out\tNote\n'
        )

        rep_lost_count = 0
        rep_changed_count = 0
        rep_unchanged_count = 0
        rep_merged_count = 0

        name_lost_count = 0
        name_changed_count = 0
        name_unchanged_count = 0
        name_merged_count = 0

        prev_cluster_ids = set()
        total_num_same = 0
        total_num_lost = 0
        total_num_new = 0
        total_num_migrated_in = 0
        total_num_migrated_out = 0
        for prev_rid, prev_cids in prev_genomes.sp_clusters.items():
            prev_gtdb_sp = prev_genomes[prev_rid].gtdb_taxa.species

            new_rid = updated_rids[prev_rid]
            prev_cluster_ids.add(new_rid)
            note = ''
            if new_rid is None:
                new_rid = 'none'
                new_gtdb_sp = 'none'
                rep_status = 'LOST'
                name_status = 'LOST'  # unclear what this means; presumably a species name can be recycled elsewhere

                new_cluster = set()

                rep_lost_count += 1
                name_lost_count += 1
            elif new_rid not in new_clusters:
                # representative must have been merged when selecting
                # representatives for NCBI species
                merged_rid = new_rid_map[new_rid]
                merged_sp = cur_genomes[merged_rid].gtdb_taxa.species
                note = 'merged with {} with representative {}'.format(
                    merged_sp, merged_rid)

                new_rid = 'none'
                new_gtdb_sp = 'none'
                rep_status = 'MERGED'
                name_status = 'MERGED'

                new_cluster = set()

                rep_merged_count += 1
                name_merged_count += 1
            else:
                new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
                new_cluster = new_clusters[new_rid]

                if prev_rid == new_rid:
                    rep_status = 'UNCHANGED'
                    rep_unchanged_count += 1
                else:
                    rep_status = 'CHANGED'
                    rep_changed_count += 1

                if prev_gtdb_sp == new_gtdb_sp:
                    name_status = 'UNCHANGED'
                    name_unchanged_count += 1
                else:
                    name_status = 'CHANGED'
                    name_changed_count += 1

            fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format(prev_rid, prev_gtdb_sp,
                                                       new_rid, new_gtdb_sp,
                                                       rep_status,
                                                       name_status))

            num_same = len(new_cluster.intersection(prev_cids))
            num_lost = len(prev_cids - new_gids)
            num_new = len(new_cluster - prev_gids)
            num_migrated_in = len(
                (new_cluster - prev_cids).intersection(prev_gids))
            num_migrated_out = len(
                (prev_cids - new_cluster).intersection(new_gids))
            assert len(new_cluster) == len(
                prev_cids
            ) - num_lost + num_new + num_migrated_in - num_migrated_out
            assert len(prev_cids) == num_same + num_lost + num_migrated_out

            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                len(prev_cids), len(new_cluster), num_same, num_lost, num_new,
                num_migrated_in, num_migrated_out, note))

            total_num_same += num_same
            total_num_lost += num_lost
            total_num_new += num_new
            total_num_migrated_in += num_migrated_in
            total_num_migrated_out += num_migrated_out

        # add in new GTDB species clusters
        new_cluster_count = 0
        for new_rid in new_clusters:
            if new_rid in prev_cluster_ids:
                continue

            new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
            rep_status = 'NEW'
            name_status = 'NEW'
            new_cluster_count += 1

            fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format('n/a', 'n/a', new_rid,
                                                       new_gtdb_sp, rep_status,
                                                       name_status))

            num_new = len(new_clusters[new_rid] - prev_gids)
            num_migrated_in = len(
                new_clusters[new_rid].intersection(prev_gids))
            assert len(new_clusters[new_rid]) == num_new + num_migrated_in
            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                0, len(new_clusters[new_rid]), 0, 0, num_new, num_migrated_in,
                0, ''))

            total_num_new += num_new
            total_num_migrated_in += num_migrated_in

        # report genome statistics
        num_union = len(new_gids.union(prev_gids))
        assert len(
            new_gids.union(prev_gids)
        ) == total_num_same + total_num_lost + total_num_new + total_num_migrated_in
        assert total_num_migrated_in == total_num_migrated_out
        self.logger.info(
            f'There were {len(prev_gids):,} genomes in the previous genome sets.'
        )
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that were assigned to same species cluster.'
            .format(total_num_same, total_num_same * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that were lost from the species cluster.'
            .format(total_num_lost, total_num_lost * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that migrated between species clusters.'
            .format(total_num_migrated_in,
                    total_num_migrated_in * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} new genomes which is a {:.2f}% increase.'.
            format(total_num_new,
                   len(new_gids) * 100.0 / len(prev_gids) - 100))

        # report representative statistics
        assert len(new_clusters) == len(
            prev_genomes.sp_clusters
        ) + new_cluster_count - rep_lost_count - rep_merged_count
        self.logger.info(
            f'There are {len(new_clusters):,} total GTDB species representatives.'
        )
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) unchanged representatives.'.format(
                rep_unchanged_count,
                rep_unchanged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) changed representatives.'.format(
                rep_changed_count,
                rep_changed_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) lost representatives.'.format(
                rep_lost_count,
                rep_lost_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) merged representatives.'.format(
                rep_merged_count,
                rep_merged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} new representatives which is a {:.2f}% increase.'
            .format(
                new_cluster_count,
                len(new_clusters) * 100.0 / len(prev_genomes.sp_clusters) -
                100))

        self.logger.info(
            ' ... identified {:,} ({:.2f}%) unchanged cluster names.'.format(
                name_unchanged_count,
                name_unchanged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) changed cluster names.'.format(
                name_changed_count,
                name_changed_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) lost cluster names.'.format(
                name_lost_count,
                name_lost_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) merged cluster names.'.format(
                name_merged_count,
                name_merged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) new cluster names.'.format(
                new_cluster_count,
                new_cluster_count * 100.0 / len(prev_genomes.sp_clusters)))
Example #11
    def run(self,
            gtdb_clusters_file,
            prev_gtdb_metadata_file,
            cur_gtdb_metadata_file,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger):
        """Summary statistics indicating changes to GTDB species cluster membership."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file,
                                             gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                             ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                             untrustworthy_type_ledger=untrustworthy_type_file,
                                             ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(' - previous genome set has {:,} species clusters spanning {:,} genomes.'.format(
            len(prev_genomes.sp_clusters),
            prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                            create_sp_clusters=False,
                                            qc_passed_file=qc_passed_file,
                                            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                            untrustworthy_type_ledger=untrustworthy_type_file,
                                            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # report changes in genome sets
        self.logger.info('Comparing previous and current genome sets.')
        prev_gids = set(prev_genomes)
        new_gids = set(cur_genomes)
        num_same_genomes = len(prev_gids.intersection(new_gids))
        num_lost_genomes = len(prev_gids - new_gids)
        num_new_genomes = len(new_gids - prev_gids)
        self.logger.info(
            f' - identified {num_same_genomes:,} genomes as being present in both genome sets.')
        self.logger.info(
            f' - identified {num_lost_genomes:,} genomes as being lost from the previous genome set.')
        self.logger.info(
            f' - identified {num_new_genomes:,} genomes as being new to the current genome set.')

        # get new GTDB species clusters
        self.logger.info('Reading current GTDB clusters.')
        new_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(' - current genome set has {:,} species clusters spanning {:,} genomes.'.format(
            len(new_clusters),
            sum(len(cids) for cids in new_clusters.values())))

        new_rid_map = {}
        for rid, cids in new_clusters.items():
            for cid in cids:
                new_rid_map[cid] = rid

        # get mapping of previous GTDB representatives to new GTDB species clusters
        self.logger.info(
            'Mapping previous GTDB representatives to new representatives.')
        prev_to_new_rid = prev_genomes.sp_clusters.updated_representatives(
            new_clusters)
        self.logger.info(
            ' - mapped {:,} previous representatives.'.format(len(prev_to_new_rid)))

        new_to_prev_rids = defaultdict(list)
        for prev_rid, new_rid in prev_to_new_rid.items():
            new_to_prev_rids[new_rid].append(prev_rid)

        # tabulate changes in GTDB species clusters
        self.logger.info('Calculating statistics of GTDB species clusters.')

        fout = open(os.path.join(self.output_dir,
                                 'gtdb_sp_clusters_change_stats.tsv'), 'w')
        fout.write(
            'New representative\tPrevious representative(s)\tPrevious name(s)\tRepresentative status')
        fout.write(
            '\tNo. previous genomes\tNo. current genomes\tNo. same\tNo. lost\tNo. new\tNo. migrated in\tNo. migrated out\n')

        rep_lost_count = 0
        rep_changed_count = 0
        rep_unchanged_count = 0
        rep_merger_count = 0

        prev_cluster_ids = set()
        total_num_same = 0
        total_num_lost = 0
        total_num_new = 0
        total_num_migrated_in = 0
        total_num_migrated_out = 0

        for new_rid, prev_rids in new_to_prev_rids.items():
            prev_cluster_ids.add(new_rid)

            prev_gtdb_sp = [
                prev_genomes[prev_rid].gtdb_taxa.species for prev_rid in prev_rids]

            prev_cids = set()
            for prev_rid in prev_rids:
                prev_cids.update(prev_genomes.sp_clusters[prev_rid])

            if new_rid is None:
                new_rid = 'none'
                rep_status = 'LOST'
                new_cluster = set()
                rep_lost_count += len(prev_rids)
            else:
                new_cluster = new_clusters[new_rid]

                if len(prev_rids) == 1:
                    if prev_rids[0] == new_rid:
                        rep_status = 'UNCHANGED'
                        rep_unchanged_count += 1
                    else:
                        rep_status = 'CHANGED'
                        rep_changed_count += 1
                else:
                    rep_status = 'MERGER'
                    rep_merger_count += len(prev_rids)

            fout.write('{}\t{}\t{}\t{}'.format(
                new_rid,
                ', '.join(prev_rids),
                ', '.join(prev_gtdb_sp),
                rep_status))

            num_same = len(new_cluster.intersection(prev_cids))
            num_new = len(new_cluster - prev_gids)
            num_lost = len(prev_cids - new_gids)

            num_migrated_in = len(
                (new_cluster - prev_cids).intersection(prev_gids))
            num_migrated_out = len(
                (prev_cids - new_cluster).intersection(new_gids))

            assert len(new_cluster) == len(prev_cids) - num_lost + \
                num_new + num_migrated_in - num_migrated_out
            assert len(prev_cids) == num_same + num_lost + num_migrated_out

            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                len(prev_cids),
                len(new_cluster),
                num_same,
                num_lost,
                num_new,
                num_migrated_in,
                num_migrated_out))

            total_num_same += num_same
            total_num_lost += num_lost
            total_num_new += num_new
            total_num_migrated_in += num_migrated_in
            total_num_migrated_out += num_migrated_out

        assert len(prev_genomes.sp_clusters) == rep_unchanged_count + \
            rep_changed_count + rep_merger_count + rep_lost_count

        # add in new GTDB species clusters
        new_cluster_count = 0
        for new_rid in new_clusters:
            if new_rid in prev_cluster_ids:
                continue

            new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
            rep_status = 'NEW'
            new_cluster_count += 1

            fout.write('{}\t{}\t{}\t{}'.format(
                new_rid,
                'n/a',
                'n/a',
                rep_status))

            num_new = len(new_clusters[new_rid] - prev_gids)
            num_migrated_in = len(
                new_clusters[new_rid].intersection(prev_gids))
            assert len(new_clusters[new_rid]) == num_new + num_migrated_in
            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                0,
                len(new_clusters[new_rid]),
                0,
                0,
                num_new,
                num_migrated_in,
                0))

            total_num_new += num_new
            total_num_migrated_in += num_migrated_in

        assert len(new_gids.union(prev_gids)) == total_num_same + \
            total_num_lost + total_num_new + total_num_migrated_in
        assert total_num_migrated_in == total_num_migrated_out

        # report genome statistics
        assert len(prev_gids) == total_num_same + \
            total_num_lost + total_num_migrated_in
        self.logger.info(
            f'There were {len(prev_gids):,} genomes in the previous release.')
        self.logger.info(' - identified {:,} ({:.2f}%) genomes that were assigned to same species cluster.'.format(
            total_num_same,
            total_num_same*100.0/len(prev_gids)))
        self.logger.info(' - identified {:,} ({:.2f}%) genomes that were lost from the species cluster.'.format(
            total_num_lost,
            total_num_lost*100.0/len(prev_gids)))
        self.logger.info(' - identified {:,} ({:.2f}%) genomes that migrated between species clusters.'.format(
            total_num_migrated_in,
            total_num_migrated_in*100.0/len(prev_gids)))
        self.logger.info('Identified {:,} new genomes which is a {:.2f}% increase.'.format(
            total_num_new,
            len(new_gids)*100.0/len(prev_gids) - 100))

        # report representative statistics
        assert len(prev_genomes.sp_clusters) == rep_unchanged_count + \
            rep_changed_count + rep_lost_count + rep_merger_count
        self.logger.info(
            f'There were {len(prev_genomes.sp_clusters):,} previous GTDB species representatives.')
        self.logger.info(' - identified {:,} ({:.2f}%) unchanged representatives.'.format(
            rep_unchanged_count,
            rep_unchanged_count*100.0/len(prev_genomes.sp_clusters)))
        self.logger.info(' - identified {:,} ({:.2f}%) changed representatives.'.format(
            rep_changed_count,
            rep_changed_count*100.0/len(prev_genomes.sp_clusters)))
        self.logger.info(' - identified {:,} ({:.2f}%) lost representatives.'.format(
            rep_lost_count,
            rep_lost_count*100.0/len(prev_genomes.sp_clusters)))
        self.logger.info(' - identified {:,} ({:.2f}%) merged representatives.'.format(
            rep_merger_count,
            rep_merger_count*100.0/len(prev_genomes.sp_clusters)))
        self.logger.info('Identified {:,} new representatives which is a {:.2f}% increase.'.format(
            new_cluster_count,
            len(new_clusters)*100.0/len(prev_genomes.sp_clusters) - 100))
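The `gtdb_sp_clusters_change_stats.tsv` table written by this method can be summarized afterwards with the standard library; a short sketch assuming the column names written above:

import csv
from collections import Counter

def summarize_rep_status(path):
    """Illustrative summary of the change-stats table written above."""
    status_counts = Counter()
    migrated_in = 0
    with open(path) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            status_counts[row['Representative status']] += 1
            migrated_in += int(row['No. migrated in'])
    return status_counts, migrated_in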
    def run(self, 
                cur_gtdb_metadata_file,
                cur_genomic_path_file,
                qc_passed_file,
                ncbi_genbank_assembly_file,
                ltp_taxonomy_file,
                gtdb_type_strains_ledger,
                untrustworthy_type_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain."""
        
        # get species in LTP reference database
        self.logger.info('Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(f' ... identified {len(ltp_defined_species):,} species.')
        
        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=None,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')
        
        # update current genomes with GTDB-Tk classifications; note that
        # gtdbtk_classify_file and prev_genomes are not parameters of this method
        # and are assumed to be supplied by the calling context
        self.logger.info('Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(gtdbtk_classify_file, prev_genomes)
        self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.')
        
        # parsing genomes manually established to be untrustworthy as type
        self.logger.info('Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = {}
        with open(untrustworthy_type_ledger) as f:
            header = f.readline().strip().split('\t')
            
            ncbi_sp_index = header.index('NCBI species')
            reason_index = header.index('Reason for declaring untrustworthy')
            
            for line in f:
                tokens = line.strip().split('\t')
                
                gid = canonical_gid(tokens[0])
                manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index], tokens[reason_index])
        self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.')

        # identify NCBI species with multiple genomes assembled from type strain of species
        self.logger.info('Determining number of type strain genomes in each NCBI species.')
        sp_type_strain_genomes = defaultdict(set)
        for gid in cur_genomes:
            if cur_genomes[gid].is_effective_type_strain():
                ncbi_sp = cur_genomes[gid].ncbi_taxa.species
                if ncbi_sp != 's__':
                    # yes, NCBI has genomes marked as assembled from type material
                    # that do not actually have a binomial species name
                    sp_type_strain_genomes[ncbi_sp].add(gid)

        multi_type_strains_sp = [ncbi_sp for ncbi_sp, gids in sp_type_strain_genomes.items() if len(gids) > 1]
        self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.')
        
        # sort by number of genome assemblies
        self.logger.info('Calculating ANI between type strain genomes in each species.')
        
        fout = open(os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w')
        fout.write('NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n')
        
        fout_genomes = open(os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment')
        fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_unresolved = open(os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_high_divergence = open(os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_untrustworthy = open(os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w')
        fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n')
        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                                        gid,
                                        ncbi_sp,
                                        cur_genomes[gid].gtdb_taxa.species,
                                        '<not tested>',
                                        'Manual curation: ' + reason))
        
        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0
        
        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0
        
        use_pickled_results = False #***
        if use_pickled_results:
            self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir))
        
        prev_gtdb_sp_conflicts = 0
        for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(), key=lambda kv: len(kv[1])):
            if len(type_gids) == 1:
                continue
                
            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                                ncbi_sp, 
                                len(type_gids),
                                processed+1, 
                                len(multi_type_strains_sp),
                                (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_')
            if not use_pickled_results: #***
                ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files)
                pickle.dump(ani_af, open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'wb'))
            else:
                ani_af = pickle.load(open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'rb'))
            
            anis = []
            afs = []
            gid_anis = defaultdict(lambda: {})
            gid_afs = defaultdict(lambda: {})
            all_similar = True
            for gid1, gid2 in combinations(type_gids, 2):
                ani, af = symmetric_ani(ani_af, gid1, gid2)
                if ani < 99 or af < 0.65:
                    all_similar = False
                    
                anis.append(ani)
                afs.append(af)
                
                gid_anis[gid1][gid2] = ani
                gid_anis[gid2][gid1] = ani
                
                gid_afs[gid1][gid2] = af
                gid_afs[gid2][gid1] = af
                
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            unresolved_species = False
            
            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            if not all_similar:
                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True
                
                # write out highly divergent cases for manual inspection; 
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                                        gid,
                                                        ncbi_sp,
                                                        cur_genomes[gid].gtdb_taxa.genus,
                                                        cur_genomes[gid].gtdb_taxa.species,
                                                        ' / '.join(ltp_species),
                                                        np_mean(list(gid_anis[gid].values())),
                                                        np_std(list(gid_anis[gid].values())),
                                                        np_mean(list(gid_afs[gid].values())),
                                                        np_std(list(gid_afs[gid].values())),
                                                        cur_genomes[gid].excluded_from_refseq_note,
                                                        cur_genomes[gid].ncbi_taxa,
                                                        cur_genomes[gid].gtdb_taxa))
                
                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggests the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(gid_anis, 
                                                                                                    ncbi_sp, 
                                                                                                    type_gids, 
                                                                                                    ltp_metadata, 
                                                                                                    ltp_defined_species,
                                                                                                    cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or missing LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1
                
                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1
                           
                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1
                        
                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled-from-type metadata'
                        ncbi_type_resolved += 1

                if resolved:
                    unresolved_species = False
                    
                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(specific_epithet(cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True

                    # write results to file
                    for gid, reason in untrustworthy_gids.items():
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        
                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                            reason += "; considered `untrustworthy as type` at NCBI"
                        fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(gid,
                                                                                ncbi_sp,
                                                                                cur_genomes[gid].gtdb_taxa.species,
                                                                                ' / '.join(ltp_species),
                                                                                reason))
                                                                                
                        # Sanity check: if the untrustworthy genome has an LTP hit to only the
                        # expected species, all other genomes should also hit the expected
                        # species (or have no hit at all). Otherwise, more consideration
                        # should be given to the genome with the conflicting LTP hit.
                        if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                            other_sp = set()
                            for test_gid in type_gids:
                                test_ltp_species = self.ltp_species(test_gid, ltp_metadata)
                                if test_ltp_species and ncbi_sp not in test_ltp_species:
                                    other_sp.update(test_ltp_species)
                                
                            if other_sp:
                                self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.')
                                
                    num_ncbi_untrustworthy = sum([1 for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note])
                    if num_ncbi_untrustworthy != len(type_gids):
                        for gid in type_gids:
                            if (gid not in untrustworthy_gids 
                                and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note):
                                self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format(
                                                        gid, 
                                                        ncbi_sp,
                                                        num_ncbi_untrustworthy,
                                                        len(type_gids)))
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1
                    
                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                    gid,
                                    ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)
                    
                fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            gid in untrustworthy_gids,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

            fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                        ncbi_sp,
                        len(type_gids),
                        all_similar,
                        np_mean(anis),
                        np_std(anis),
                        np_mean(afs),
                        np_std(afs),
                        note,
                        ', '.join(type_gids)))

        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()
        
        self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.')
        self.logger.info(f' ... resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.')
        self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.')
        self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.')
        self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.')
        self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.')

        if unresolved_sp_count > 0:
            self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.')
            self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.')
            self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.")
        
        self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where the resolved type strain conflicts with the prior GTDB assignment.')
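
Each of the resolve_* helpers called in the cascade above follows the same contract: it returns a boolean indicating whether the species conflict was resolved, together with a dict mapping untrustworthy genome IDs to the reason they were flagged. Below is a minimal sketch of one such helper under that assumed contract; the threshold and flagging logic are illustrative, not the actual GTDB implementation.

    def resolve_by_intra_specific_ani_sketch(self, gid_anis, min_mean_ani=99.0):
        """Hypothetical sketch: flag type strain genomes whose mean ANI to the
        other type strain genomes of the species falls below an assumed
        intra-specific threshold (min_mean_ani is illustrative)."""

        untrustworthy_gids = {}
        for gid, anis in gid_anis.items():
            if not anis:
                continue

            mean_ani = sum(anis.values()) / len(anis)
            if mean_ani < min_mean_ani:
                untrustworthy_gids[gid] = 'mean intra-specific ANI of {:.2f}% is below {:.2f}%'.format(
                                                mean_ani, min_mean_ani)

        # resolved only if at least one genome remains trusted
        resolved = 0 < len(untrustworthy_gids) < len(gid_anis)
        return resolved, untrustworthy_gids
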
Exemple #13
0
    def run(self, prev_gtdb_metadata_file, cur_gtdb_metadata_file,
            ncbi_genbank_assembly_file, gtdb_domain_report,
            gtdb_type_strains_ledger, qc_exception_file,
            ncbi_env_bioproject_ledger, min_comp, max_cont, min_quality,
            sh_exception, min_perc_markers, max_contigs, min_N50,
            max_ambiguous):
        """Quality check all potential GTDB genomes."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            f' - previous genome set contains {len(prev_genomes):,} genomes.')
        self.logger.info(
            ' - previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # parse genomes flagged as exceptions from QC
        qc_exceptions = self.parse_qc_exception_file(qc_exception_file)
        self.logger.info(
            f'Identified {len(qc_exceptions):,} genomes flagged as exceptions from QC.'
        )

        # get percentage of bac120 or ar122 marker genes
        marker_perc = self.parse_marker_percentages(gtdb_domain_report)

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(
            ncbi_genbank_assembly_file)

        # QC all genomes
        self.logger.info('Validating genomes.')
        passed_qc_gids, failed_qc_gids = self.qc_genomes(
            cur_genomes, marker_perc, qc_exceptions, excluded_from_refseq_note,
            min_comp, max_cont, min_quality, sh_exception, min_perc_markers,
            max_contigs, min_N50, max_ambiguous)

        # check domain assignment of genomes passing QC
        # and report potential issues
        self.check_domain_assignments(gtdb_domain_report, passed_qc_gids)

        # report results of QC on genomes from each NCBI species
        self.check_qc_of_ncbi_species(cur_genomes, marker_perc, qc_exceptions,
                                      excluded_from_refseq_note, min_comp,
                                      max_cont, min_quality, sh_exception,
                                      min_perc_markers, max_contigs, min_N50,
                                      max_ambiguous)

        # sanity check QC results by identifying any genomes that passed QC last release, but
        # have now been flagged as failing QC. This should rarely, if ever, happen unless the
        # genomic data of the assembly has been updated.
        unexpected_qc_fail = []
        for gid in prev_genomes:
            if gid in cur_genomes:
                if not same_assembly_version(prev_genomes[gid].ncbi_accn,
                                             cur_genomes[gid].ncbi_accn):
                    # genome assembly has changed so QC status is not expected to be the same
                    continue

                if gid in failed_qc_gids:
                    unexpected_qc_fail.append(gid)

        if len(unexpected_qc_fail) > 0:
            self.logger.warning(
                'Identified {:,} genomes that passed QC in the previous GTDB release, but failed QC in this release.'
                .format(len(unexpected_qc_fail)))
            self.logger.warning(' - examples: {}'.format(','.join(
                unexpected_qc_fail[0:10])))
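
The sanity check above relies on same_assembly_version to decide whether the underlying assembly changed between releases. A minimal sketch, assuming NCBI accessions carry a trailing '.N' version suffix (e.g. GCF_000005845.2); the helper name matches the call above, but this body is illustrative rather than the actual implementation.

def same_assembly_version(prev_ncbi_accn, cur_ncbi_accn):
    """Hypothetical sketch: two accessions refer to the same assembly version
    if the digits after the final '.' match (assumed accession format)."""
    prev_version = prev_ncbi_accn.rsplit('.', 1)[-1]
    cur_version = cur_ncbi_accn.rsplit('.', 1)[-1]
    return prev_version == cur_version
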
    def run(self, manual_taxonomy, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger,
            sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file):
        """Finalize species names based on results of manual curation."""

        # initialize species priority manager
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  dsmz_bacnames_file)

        # identify species and genus names updated during manual curation
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_taxonomy)))

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get all GTDB species represented by a type strain:
        gtdb_type_species = set()
        for rid in mc_taxonomy:
            if cur_genomes[rid].is_effective_type_strain():
                gtdb_type_species.add(mc_taxonomy[rid][Taxonomy.SPECIES_INDEX])

        # identify type strain genomes with incongruent GTDB species assignments
        self.logger.info(
            'Identifying type strain genomes with incongruent GTDB species assignments.'
        )
        fout = open(
            os.path.join(self.output_dir, 'type_strains_incongruencies.tsv'),
            'w')
        fout.write(
            'Genome ID\tGTDB species\tNCBI species\tGTDB type strain\tNCBI type strain\tNCBI RefSeq note\n'
        )
        num_incongruent = 0
        for rid, taxa in mc_taxonomy.items():
            if cur_genomes[rid].is_effective_type_strain():
                gtdb_sp = taxa[Taxonomy.SPECIES_INDEX]
                gtdb_generic = generic_name(gtdb_sp)

                ncbi_sp = cur_genomes[rid].ncbi_taxa.species
                ncbi_generic = generic_name(ncbi_sp)

                if ncbi_sp == 's__':
                    # NCBI taxonomy is sometimes behind the genome annotation pages
                    # and may lack a species assignment even for a type strain genome
                    continue

                # check if genome is a valid genus transfer into a genus
                # that already contains a species with the specific
                # name which results in a polyphyletic suffix being required
                # e.g. G002240355 is Prauserella marina at NCBI and is
                # transferred into Saccharomonospora under the GTDB. However,
                # Saccharomonospora marina already exists so this genome
                # needs to be S. marina_A.
                if (is_placeholder_taxon(gtdb_sp)
                        and gtdb_generic != ncbi_generic
                        and canonical_species(gtdb_sp) in gtdb_type_species):
                    continue

                if not test_same_epithet(specific_epithet(gtdb_sp),
                                         specific_epithet(ncbi_sp)):
                    num_incongruent += 1
                    fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        rid, gtdb_sp, ncbi_sp,
                        cur_genomes[rid].is_gtdb_type_strain(),
                        cur_genomes[rid].is_ncbi_type_strain(),
                        cur_genomes[rid].excluded_from_refseq_note))

        self.logger.info(
            ' - identified {:,} genomes with incongruent species assignments.'.
            format(num_incongruent))
        fout.close()
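
The incongruence test above hinges on test_same_epithet, which is expected to tolerate Latin gender changes to the specific epithet when a species is transferred between genera (e.g. marinus vs. marina). A minimal sketch assuming a small fixed set of suffix equivalences; the actual GTDB implementation may handle more cases.

def test_same_epithet(epithet1, epithet2):
    """Hypothetical sketch: treat two specific epithets as equivalent if they
    are identical after stripping common Latin gender suffixes (assumed list)."""
    def stem(epithet):
        for suffix in ('us', 'um', 'is', 'a', 'e'):
            if epithet.endswith(suffix):
                return epithet[:-len(suffix)]
        return epithet

    return stem(epithet1) == stem(epithet2)
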
Exemple #15
0
    def run(self, 
            prev_gtdb_metadata_file,
            cur_gtdb_metadata_file,
            cur_uba_gid_file,
            genomes_new_updated_file,
            qc_passed_file,
            gtdbtk_classify_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            gtdb_type_strains_ledger):
        """Identify species representatives that have changed from previous release."""
        
        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                uba_genome_file=cur_uba_gid_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(f' ...previous genome set contains {len(prev_genomes):,} genomes.')
        self.logger.info(' ...previous genome set has {:,} species clusters spanning {:,} genomes.'.format(
                            len(prev_genomes.sp_clusters),
                            prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=cur_uba_gid_file,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(f' ...current genome set contains {len(cur_genomes):,} genomes.')

        # get previous and current genomes from type strains
        self.logger.info('Determining genomes identified as being assembled from type strain.')
        prev_type_strain_gids = prev_genomes.gtdb_type_strain_genomes()
        cur_type_strain_gids = cur_genomes.gtdb_type_strain_genomes()
        new_type_strain_gids = cur_type_strain_gids - prev_type_strain_gids
        self.logger.info(' ...identified {:,} previous and {:,} current genomes from type strain.'.format(
                            len(prev_type_strain_gids),
                            len(cur_type_strain_gids)))
        self.logger.info(' ...{:,} type strain genomes are new to the current genome set.'.format(
                            len(new_type_strain_gids)))

        # create expanded previous GTDB species clusters
        self.logger.info('Creating species clusters of new and updated genomes based on GTDB-Tk classifications.')
        new_updated_sp_clusters = SpeciesClusters()
        new_updated_sp_clusters.create_expanded_clusters(prev_genomes.sp_clusters,
                                                            genomes_new_updated_file,
                                                            qc_passed_file,
                                                            gtdbtk_classify_file)
        self.logger.info('Identified {:,} expanded species clusters spanning {:,} genomes.'.format(
                            len(new_updated_sp_clusters),
                            new_updated_sp_clusters.total_num_genomes()))

        # determine status of each previous GTDB representative
        self.logger.info('Determining status of each previous GTDB representative.')
        
        fout_summary = open(os.path.join(self.output_dir, 'rep_change_summary.tsv'), 'w')
        fout_summary.write('Genome ID\tPrevious GTDB species\tNo. genomes in cluster')
        fout_summary.write('\tGENOMIC_CHANGE\tNCBI_SPECIES_CHANGE\tTYPE_STRAIN_CHANGE\tDOMAIN_CHECK')
        fout_summary.write('\tNew type strains\tRepresentative changed\n')
        
        fout_detailed = open(os.path.join(self.output_dir, 'rep_change_detailed.tsv'), 'w')
        fout_detailed.write('Genome ID\tPrevious GTDB species\tChange type\tChange\n')
        
        unchanged_genome = set()
        updated_genome = set()
        lost_genome = set()
        user_genome = set()
        unchanged_sp = set()
        reassigned_sp = set()
        unchanged_type_strain = set()
        lost_type_strain = set()
        gain_type_strain = set()
        new_type_strain = set()
        changed_domain = set()
        unchanged_domain = set()
        num_rep_changes = 0
        first_type_strain = set()
        for prev_rid, prev_gtdb_sp in prev_genomes.sp_clusters.species():
            fout_summary.write(f'{prev_rid}\t{prev_gtdb_sp}\t{len(prev_genomes.sp_clusters[prev_rid])}')
            if prev_rid in cur_genomes:

                # check if genome assembly has been updated
                if prev_rid in new_updated_sp_clusters.updated_gids:
                    updated_genome.add(prev_rid)
                    fout_summary.write('\tUPDATED')
                    prev_ncbi_accn = prev_genomes[prev_rid].ncbi_accn
                    cur_ncbi_accn = cur_genomes[prev_rid].ncbi_accn
                    assert prev_ncbi_accn != cur_ncbi_accn
                    fout_detailed.write((f'{prev_rid}\t{prev_gtdb_sp}\tGENOMIC_CHANGE:UPDATED\tNCBI accession updated from '
                                            f'{prev_genomes[prev_rid].ncbi_accn} to {cur_genomes[prev_rid].ncbi_accn}\n'))
                else:
                    unchanged_genome.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')
                    
                # check if NCBI species assignment has changed
                prev_ncbi_sp = prev_genomes[prev_rid].ncbi_taxa.species
                cur_ncbi_sp = cur_genomes[prev_rid].ncbi_taxa.species
                if prev_genomes[prev_rid].ncbi_taxa.specific_epithet == cur_genomes[prev_rid].ncbi_taxa.specific_epithet:
                    unchanged_sp.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')
                else:
                    reassigned_sp.add(prev_rid)
                    fout_summary.write('\tREASSIGNED')
                    fout_detailed.write(f'{prev_rid}\t{prev_gtdb_sp}\tNCBI_SPECIES_CHANGE:REASSIGNED\tNCBI species reassigned from {prev_ncbi_sp} to {cur_ncbi_sp}\n')

                # check if type material status has changed
                if prev_rid in prev_type_strain_gids and prev_rid in cur_type_strain_gids:
                    unchanged_type_strain.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')
                elif prev_rid not in prev_type_strain_gids and prev_rid not in cur_type_strain_gids:
                    unchanged_type_strain.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')
                elif prev_rid in prev_type_strain_gids and prev_rid not in cur_type_strain_gids:
                    lost_type_strain.add(prev_rid)
                    fout_summary.write('\tLOST')
                    fout_detailed.write(f'{prev_rid}\t{prev_gtdb_sp}\tTYPE_STRAIN_CHANGE:LOST\tNo longer considered a genome from type strain\n')
                elif prev_rid not in prev_type_strain_gids and prev_rid in cur_type_strain_gids:
                    gain_type_strain.add(prev_rid)
                    fout_summary.write('\tGAINED')
                    fout_detailed.write(f'{prev_rid}\t{prev_gtdb_sp}\tTYPE_STRAIN_CHANGE:GAINED\tNow considered a genome from type strain\n')
                else:
                    assert False

                # check if domain assignment has changed
                if prev_genomes[prev_rid].gtdb_taxa.domain != cur_genomes[prev_rid].gtdb_taxa.domain:
                    changed_domain.add(prev_rid)
                    fout_detailed.write('{}\t{}\tDOMAIN_CHECK:REASSIGNED\tRepresentative changed from {} to {}\n'.format(
                                            prev_rid,
                                            prev_gtdb_sp,
                                            prev_genomes[prev_rid].gtdb_taxa.domain,
                                            cur_genomes[prev_rid].gtdb_taxa.domain))
                    fout_summary.write('\tREASSIGNED')
                else:
                    unchanged_domain.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')
                    
                # check if genome cluster has new genomes assembled from the type strain of the species
                sp_gids = prev_genomes.sp_clusters[prev_rid]
                if prev_rid in new_updated_sp_clusters:
                    sp_gids = sp_gids.union(new_updated_sp_clusters[prev_rid])
                new_ts = new_type_strain_gids.intersection(sp_gids)

                if new_ts:
                    if not prev_type_strain_gids.intersection(sp_gids) and not new_ts.intersection(gain_type_strain):
                        first_type_strain.add(prev_gtdb_sp)
                        
                    new_type_strain.add(prev_rid)
                    fout_detailed.write('{}\t{}\tNEW_TYPE_STRAINS:NEW\tSpecies cluster has {:,} new genomes from type strain: {}\n'.format(
                                            prev_rid,
                                            prev_gtdb_sp,
                                            len(new_ts),
                                            ','.join(new_ts)))
                    
                fout_summary.write(f'\t{len(new_ts)}')
                
                if (prev_rid in unchanged_genome 
                    and prev_rid in unchanged_sp
                    and prev_rid in unchanged_type_strain
                    and prev_rid in unchanged_domain):
                    fout_summary.write('\tNO')
                else:
                    fout_summary.write('\tYES')
                    num_rep_changes += 1

                fout_summary.write('\n')
            else:
                lost_genome.add(prev_rid)
                fout_summary.write('\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('LOST', 'N/A', 'N/A', 'N/A', 'N/A', 'YES'))
                fout_detailed.write(f'{prev_rid}\t{prev_gtdb_sp}\tGENOMIC_CHANGE:LOST\tGenome not present in current GTDB release\n')
                num_rep_changes += 1
                
                
        fout_summary.close()
        fout_detailed.close()
        
        num_prev_sp_clusters = len(prev_genomes.sp_clusters)
        num_rep_changes_perc = num_rep_changes*100.0/num_prev_sp_clusters
        self.logger.info(f' ... identified {num_rep_changes:,} ({num_rep_changes_perc:.1f}%) species with a change to the representative genome.')

        self.logger.info('Genomic changes:')
        unchanged_perc = len(unchanged_genome)*100.0 / num_prev_sp_clusters
        updated_perc = len(updated_genome)*100.0 / num_prev_sp_clusters
        lost_perc = len(lost_genome)*100.0 / num_prev_sp_clusters
        user_perc = len(user_genome)*100.0 / num_prev_sp_clusters
        self.logger.info(f'  unchanged_genome: {len(unchanged_genome):,} ({unchanged_perc:.1f}%)')
        self.logger.info(f'  updated_genome: {len(updated_genome):,} ({updated_perc:.1f}%)')
        self.logger.info(f'  lost_genome: {len(lost_genome):,} ({lost_perc:.1f}%)')
        self.logger.info(f'  user_genome: {len(user_genome):,} ({user_perc:.1f}%)')
        
        self.logger.info('NCBI species assignment changes:')
        cur_sp_count = len(unchanged_genome) + len(updated_genome)
        unchanged_sp_perc = len(unchanged_sp)*100.0 / cur_sp_count
        reassigned_sp_perc = len(reassigned_sp)*100.0 / cur_sp_count
        self.logger.info(f'  unchanged_sp: {len(unchanged_sp):,} ({unchanged_sp_perc:.1f}%)')
        self.logger.info(f'  reassigned_sp: {len(reassigned_sp):,} ({reassigned_sp_perc:.1f}%)')
        
        self.logger.info('Status of type strain genome declarations:')
        prev_ts_count = len(unchanged_type_strain) + len(lost_type_strain)
        unchanged_type_strain_perc = len(unchanged_type_strain)*100.0 / prev_ts_count
        lost_type_strain_perc = len(lost_type_strain)*100.0 / prev_ts_count
        gain_type_strain_perc = len(gain_type_strain)*100.0 / prev_ts_count
        new_type_strain_perc = len(new_type_strain)*100.0 / prev_ts_count
        self.logger.info(f'  unchanged_type_strain: {len(unchanged_type_strain):,} ({unchanged_type_strain_perc:.1f}%)')
        self.logger.info(f'  lost_type_strain: {len(lost_type_strain):,} ({lost_type_strain_perc:.1f}%)')
        self.logger.info(f'  gain_type_strain: {len(gain_type_strain):,} ({gain_type_strain_perc:.1f}%)')
        self.logger.info(f'  new_type_strain: {len(new_type_strain):,} ({new_type_strain_perc:.1f}%)')

        self.logger.info('GTDB domain assignment change:')
        unchanged_domain_perc = len(unchanged_domain)*100.0 / num_prev_sp_clusters
        changed_domain_perc = len(changed_domain)*100.0 / num_prev_sp_clusters
        self.logger.info(f'  unchanged: {len(unchanged_domain):,} ({unchanged_domain_perc:.1f}%)')
        self.logger.info(f'  reassigned: {len(changed_domain):,} ({changed_domain_perc:.1f}%)')
        
        self.logger.info(f'Identified {len(first_type_strain):,} species clusters gaining their first type strain genome.')
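
create_expanded_clusters, used above to place new and updated genomes into previous species clusters, is driven by the GTDB-Tk classifications. A rough sketch of the idea, assuming the classification file is a tab-separated table with 'user_genome' and 'classification' columns (column names and the mapping logic are assumptions, not the actual implementation):

def expanded_clusters_sketch(prev_sp_clusters, gtdbtk_classify_file):
    """Hypothetical sketch: assign each new/updated genome to the previous
    species cluster whose GTDB species matches its GTDB-Tk species call."""
    import csv
    from collections import defaultdict

    # map previous GTDB species name -> previous representative ID
    sp_to_rid = {sp: rid for rid, sp in prev_sp_clusters.species()}

    expanded = defaultdict(set)
    with open(gtdbtk_classify_file) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            gtdb_sp = row['classification'].split(';')[-1].strip()
            if gtdb_sp in sp_to_rid:
                expanded[sp_to_rid[gtdb_sp]].add(row['user_genome'])

    return expanded
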
        
    def run(self, rep_change_summary_file, prev_gtdb_metadata_file,
            prev_genomic_path_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, genomes_new_updated_file,
            qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            sp_priority_ledger):
        """Perform initial actions required for changed representatives."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to previous and current genomic FASTA files
        self.logger.info(
            'Reading paths to previous and current genomic FASTA files.')
        prev_genomes.load_genomic_file_paths(prev_genomic_path_file)
        prev_genomes.load_genomic_file_paths(uba_genome_paths)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # create expanded previous GTDB species clusters
        new_updated_sp_clusters = SpeciesClusters()

        self.logger.info(
            'Creating species clusters of new and updated genomes based on GTDB-Tk classifications.'
        )
        new_updated_sp_clusters.create_expanded_clusters(
            prev_genomes.sp_clusters, genomes_new_updated_file, qc_passed_file,
            gtdbtk_classify_file)

        self.logger.info(
            'Identified {:,} expanded species clusters spanning {:,} genomes.'.
            format(len(new_updated_sp_clusters),
                   new_updated_sp_clusters.total_num_genomes()))

        # initialize species priority manager
        self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger)

        # take required action for each changed representative
        self.action_genomic_lost(rep_change_summary_file, prev_genomes,
                                 cur_genomes, new_updated_sp_clusters)

        self.action_genomic_update(rep_change_summary_file, prev_genomes,
                                   cur_genomes, new_updated_sp_clusters)

        self.action_type_strain_lost(rep_change_summary_file, prev_genomes,
                                     cur_genomes, new_updated_sp_clusters)

        self.action_domain_change(rep_change_summary_file, prev_genomes,
                                  cur_genomes)

        if True:  #*** set to False to read cached improved_reps (debugging only)
            improved_reps = self.action_improved_rep(prev_genomes, cur_genomes,
                                                     new_updated_sp_clusters)

            pickle.dump(
                improved_reps,
                open(os.path.join(self.output_dir, 'improved_reps.pkl'), 'wb'))
        else:
            self.logger.warning(
                'Reading improved_reps from pre-cached file. Generally used only for debugging.'
            )
            improved_reps = pickle.load(
                open(os.path.join(self.output_dir, 'improved_reps.pkl'), 'rb'))

        for prev_rid, (new_rid, action) in improved_reps.items():
            self.update_rep(prev_rid, new_rid, action)

        self.action_naming_priority(prev_genomes, cur_genomes,
                                    new_updated_sp_clusters)

        # report basic statistics
        num_retired_sp = sum(
            [1 for v in self.new_reps.values() if v[0] is None])
        num_replaced_rids = sum(
            [1 for v in self.new_reps.values() if v[0] is not None])
        self.logger.info(f'Identified {num_retired_sp:,} retired species.')
        self.logger.info(
            f'Identified {num_replaced_rids:,} species with a modified representative genome.'
        )

        self.action_log.close()

        # write out representatives for existing species clusters
        fout = open(os.path.join(self.output_dir, 'updated_species_reps.tsv'),
                    'w')
        fout.write(
            'Previous representative ID\tNew representative ID\tAction\tRepresentative status\n'
        )
        for rid in prev_genomes.sp_clusters:
            if rid in self.new_reps:
                new_rid, action = self.new_reps[rid]
                if new_rid is not None:
                    fout.write(f'{rid}\t{new_rid}\t{action}\tREPLACED\n')
                else:
                    fout.write(f'{rid}\t{new_rid}\t{action}\tLOST\n')
            else:
                fout.write(f'{rid}\t{rid}\tNONE\tUNCHANGED\n')

        fout.close()

        # write out updated species clusters
        out_file = os.path.join(self.output_dir, 'updated_sp_clusters.tsv')
        self.write_updated_clusters(prev_genomes, cur_genomes, self.new_reps,
                                    new_updated_sp_clusters, out_file)
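
update_rep and the self.new_reps bookkeeping above follow a simple convention: each previous representative maps to a (new representative or None, action) pair, with None marking a retired species. A minimal sketch of that convention under the attribute names used above; the body is illustrative.

    def update_rep(self, prev_rid, new_rid, action):
        """Hypothetical sketch: record the new representative (or retirement
        when new_rid is None) of the cluster previously represented by prev_rid."""
        if prev_rid in self.new_reps and self.new_reps[prev_rid][0] != new_rid:
            self.logger.warning('Representative of {} assigned multiple times: {} and {}.'.format(
                                    prev_rid, self.new_reps[prev_rid][0], new_rid))

        self.new_reps[prev_rid] = (new_rid, action)
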
    def run(self, prev_gtdb_metadata_file, cur_gtdb_metadata_file,
            genomes_new_updated_file, qc_passed_file, gtdbtk_classify_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            disband_cluster_ledger, gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger):
        """Identify species representatives that have changed from previous release."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            f' - previous genome set contains {len(prev_genomes):,} genomes.')
        self.logger.info(
            ' - previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # get previous and current genomes from type strains
        self.logger.info(
            'Determining genomes identified as being assembled from type strain.'
        )
        prev_type_strain_gids = prev_genomes.gtdb_type_strain_genomes()
        cur_type_strain_gids = cur_genomes.gtdb_type_strain_genomes()
        new_type_strain_gids = cur_type_strain_gids - prev_type_strain_gids
        self.logger.info(
            ' - identified {:,} previous and {:,} current genomes from type strain.'
            .format(len(prev_type_strain_gids), len(cur_type_strain_gids)))
        self.logger.info(
            ' - {:,} type strain genomes are new to the current genome set.'.
            format(len(new_type_strain_gids)))

        # create expanded previous GTDB species clusters
        self.logger.info(
            'Creating species clusters of new and updated genomes based on GTDB-Tk classifications.'
        )
        new_updated_sp_clusters = SpeciesClusters()
        new_updated_sp_clusters.create_expanded_clusters(
            prev_genomes, genomes_new_updated_file, qc_passed_file,
            gtdbtk_classify_file)
        self.logger.info(
            'Identified {:,} expanded species clusters spanning {:,} genomes.'.
            format(len(new_updated_sp_clusters),
                   new_updated_sp_clusters.total_num_genomes()))

        # read GTDB clusters to be disbanded
        self.logger.info(
            'Parsing ledger indicating GTDB clusters to be disbanded.')
        disbanded_rids = parse_disbanded_cluster_ledger(disband_cluster_ledger)
        self.logger.info(' - identified {:,} clusters to be disbanded.'.format(
            len(disbanded_rids)))

        # determine status of each previous GTDB representative
        self.logger.info(
            'Determining status of each previous GTDB representative.')

        fout_summary = open(
            os.path.join(self.output_dir, 'rep_change_summary.tsv'), 'w')
        fout_summary.write(
            'Genome ID\tPrevious GTDB species\tNo. genomes in cluster')
        fout_summary.write(
            '\tGENOMIC_CHANGE\tNCBI_SPECIES_CHANGE\tTYPE_STRAIN_CHANGE\tDOMAIN_CHECK\tNCBI_ASSEMBLY_QUALITY\tDISBANDED_CHECK'
        )
        fout_summary.write('\tNew type strains\tRepresentative changed\n')

        fout_detailed = open(
            os.path.join(self.output_dir, 'rep_change_detailed.tsv'), 'w')
        fout_detailed.write(
            'Genome ID\tPrevious GTDB species\tChange type\tChange\n')

        unchanged_genome = set()
        updated_genome = set()
        lost_genome = set()

        unchanged_sp = set()
        reassigned_sp = set()

        unchanged_type_strain = set()
        lost_type_strain = set()
        gain_type_strain = set()
        new_type_strain = set()
        ncbi_anomalous_assembly = set()

        changed_domain = set()
        unchanged_domain = set()

        num_rep_changes = 0
        disbanded_count = 0

        for prev_rid, prev_gtdb_sp in prev_genomes.sp_clusters.species():
            fout_summary.write(
                f'{prev_rid}\t{prev_gtdb_sp}\t{len(prev_genomes.sp_clusters[prev_rid])}'
            )

            if prev_rid in cur_genomes:
                # check if genome assembly has been updated
                if prev_rid in new_updated_sp_clusters.updated_gids:
                    updated_genome.add(prev_rid)
                    fout_summary.write('\tUPDATED')
                    prev_ncbi_accn = prev_genomes[prev_rid].ncbi_accn
                    cur_ncbi_accn = cur_genomes[prev_rid].ncbi_accn
                    assert prev_ncbi_accn != cur_ncbi_accn
                    fout_detailed.write((
                        f'{prev_rid}\t{prev_gtdb_sp}\tGENOMIC_CHANGE:UPDATED\tNCBI accession updated from '
                        f'{prev_genomes[prev_rid].ncbi_accn} to {cur_genomes[prev_rid].ncbi_accn}\n'
                    ))
                else:
                    unchanged_genome.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')

                # check if NCBI species assignment has changed
                prev_ncbi_sp = prev_genomes[prev_rid].ncbi_taxa.species
                cur_ncbi_sp = cur_genomes[prev_rid].ncbi_taxa.species
                if (prev_genomes[prev_rid].ncbi_taxa.specific_epithet ==
                        cur_genomes[prev_rid].ncbi_taxa.specific_epithet):
                    unchanged_sp.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')
                else:
                    reassigned_sp.add(prev_rid)
                    fout_summary.write('\tREASSIGNED')
                    fout_detailed.write(
                        f'{prev_rid}\t{prev_gtdb_sp}\tNCBI_SPECIES_CHANGE:REASSIGNED\tNCBI species reassigned from {prev_ncbi_sp} to {cur_ncbi_sp}\n'
                    )

                # check if type material status has changed
                if prev_rid in prev_type_strain_gids and prev_rid in cur_type_strain_gids:
                    unchanged_type_strain.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')
                elif prev_rid not in prev_type_strain_gids and prev_rid not in cur_type_strain_gids:
                    unchanged_type_strain.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')
                elif prev_rid in prev_type_strain_gids and prev_rid not in cur_type_strain_gids:
                    lost_type_strain.add(prev_rid)
                    fout_summary.write('\tLOST')
                    fout_detailed.write(
                        f'{prev_rid}\t{prev_gtdb_sp}\tTYPE_STRAIN_CHANGE:LOST\tNo longer considered a genome from type strain\n'
                    )
                elif prev_rid not in prev_type_strain_gids and prev_rid in cur_type_strain_gids:
                    gain_type_strain.add(prev_rid)
                    fout_summary.write('\tGAINED')
                    fout_detailed.write(
                        f'{prev_rid}\t{prev_gtdb_sp}\tTYPE_STRAIN_CHANGE:GAINED\tNow considered a genome from type strain\n'
                    )
                else:
                    assert False

                # check if domain assignment has changed
                if (prev_genomes[prev_rid].gtdb_taxa.domain !=
                        cur_genomes[prev_rid].gtdb_taxa.domain):
                    changed_domain.add(prev_rid)
                    fout_detailed.write(
                        '{}\t{}\tDOMAIN_CHECK:REASSIGNED\tRepresentative changed from {} to {}\n'
                        .format(prev_rid, prev_gtdb_sp,
                                prev_genomes[prev_rid].gtdb_taxa.domain,
                                cur_genomes[prev_rid].gtdb_taxa.domain))
                    fout_summary.write('\tREASSIGNED')
                else:
                    unchanged_domain.add(prev_rid)
                    fout_summary.write('\tUNCHANGED')

                # check if NCBI has marked genome assembly as problematic
                if (cur_genomes[prev_rid].is_ncbi_many_frameshifted_proteins()
                        or cur_genomes[prev_rid].is_ncbi_anomalous_assembly()):
                    ncbi_anomalous_assembly.add(prev_rid)
                    fout_summary.write('\tNCBI_ANOMALOUS_ASSEMBLY')
                    fout_detailed.write(
                        '{}\t{}\tNCBI_ASSEMBLY_METADATA:NCBI_ANOMALOUS_ASSEMBLY\tExcluded = {}\n'
                        .format(
                            prev_rid, prev_gtdb_sp,
                            cur_genomes[prev_rid].excluded_from_refseq_note))
                else:
                    fout_summary.write('\tNCBI_GOOD_ASSEMBLY')

                # check if GTDB species cluster is flagged to be disbanded
                if prev_rid in disbanded_rids:
                    disbanded_count += 1
                    fout_summary.write('\tTRUE')
                    fout_detailed.write(
                        '{}\t{}\tEXPLICIT_UPDATE:DISBANDED\t\n'.format(
                            prev_rid, prev_gtdb_sp))
                else:
                    fout_summary.write('\tFALSE')

                # check if genome cluster has new genomes assembled from the type strain of the species
                sp_gids = prev_genomes.sp_clusters[prev_rid]
                if prev_rid in new_updated_sp_clusters:
                    sp_gids = sp_gids.union(new_updated_sp_clusters[prev_rid])
                new_ts = new_type_strain_gids.intersection(sp_gids)

                if new_ts:
                    new_type_strain.add(prev_rid)
                    fout_detailed.write(
                        '{}\t{}\tNEW_TYPE_STRAINS:NEW\tSpecies cluster has {:,} new genomes from type strain: {}\n'
                        .format(prev_rid, prev_gtdb_sp, len(new_ts),
                                ','.join(new_ts)))

                fout_summary.write(f'\t{len(new_ts)}')

                # check if representative has changed
                if (prev_rid in unchanged_genome and prev_rid in unchanged_sp
                        and prev_rid in unchanged_type_strain
                        and prev_rid in unchanged_domain
                        and prev_rid not in disbanded_rids):
                    fout_summary.write('\tNO')
                else:
                    fout_summary.write('\tYES')
                    num_rep_changes += 1

                fout_summary.write('\n')
            else:
                lost_genome.add(prev_rid)
                fout_summary.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    'LOST', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'YES'))
                fout_detailed.write(
                    f'{prev_rid}\t{prev_gtdb_sp}\tGENOMIC_CHANGE:LOST\tGenome not present in current GTDB release\n'
                )
                num_rep_changes += 1

        fout_summary.close()
        fout_detailed.close()

        num_prev_sp_clusters = len(prev_genomes.sp_clusters)
        num_rep_changes_perc = num_rep_changes * 100.0 / num_prev_sp_clusters
        self.logger.info(
            f' - identified {num_rep_changes:,} ({num_rep_changes_perc:.1f}%) species with a change to the representative genome.'
        )

        self.logger.info('Genomic changes:')
        unchanged_perc = len(unchanged_genome) * 100.0 / num_prev_sp_clusters
        updated_perc = len(updated_genome) * 100.0 / num_prev_sp_clusters
        lost_perc = len(lost_genome) * 100.0 / num_prev_sp_clusters
        self.logger.info(
            f'  unchanged_genome: {len(unchanged_genome):,} ({unchanged_perc:.1f}%)'
        )
        self.logger.info(
            f'  updated_genome: {len(updated_genome):,} ({updated_perc:.1f}%)')
        self.logger.info(
            f'  lost_genome: {len(lost_genome):,} ({lost_perc:.1f}%)')

        self.logger.info('NCBI species assignment changes:')
        cur_sp_count = len(unchanged_genome) + len(updated_genome)
        unchanged_sp_perc = len(unchanged_sp) * 100.0 / cur_sp_count
        reassigned_sp_perc = len(reassigned_sp) * 100.0 / cur_sp_count
        self.logger.info(
            f'  unchanged_sp: {len(unchanged_sp):,} ({unchanged_sp_perc:.1f}%)'
        )
        self.logger.info(
            f'  reassigned_sp: {len(reassigned_sp):,} ({reassigned_sp_perc:.1f}%)'
        )

        self.logger.info('Status of type strain genome declarations:')
        prev_ts_count = len(unchanged_type_strain) + len(lost_type_strain)
        unchanged_type_strain_perc = len(
            unchanged_type_strain) * 100.0 / prev_ts_count
        lost_type_strain_perc = len(lost_type_strain) * 100.0 / prev_ts_count
        gain_type_strain_perc = len(gain_type_strain) * 100.0 / prev_ts_count
        new_type_strain_perc = len(new_type_strain) * 100.0 / prev_ts_count
        self.logger.info(
            f'  unchanged_type_strain: {len(unchanged_type_strain):,} ({unchanged_type_strain_perc:.1f}%)'
        )
        self.logger.info(
            f'  lost_type_strain: {len(lost_type_strain):,} ({lost_type_strain_perc:.1f}%)'
        )
        self.logger.info(
            f'  gain_type_strain: {len(gain_type_strain):,} ({gain_type_strain_perc:.1f}%)'
        )
        self.logger.info(
            f'  new_type_strain: {len(new_type_strain):,} ({new_type_strain_perc:.1f}%)'
        )

        self.logger.info('GTDB domain assignment change:')
        unchanged_domain_perc = len(
            unchanged_domain) * 100.0 / num_prev_sp_clusters
        changed_domain_perc = len(
            changed_domain) * 100.0 / num_prev_sp_clusters
        self.logger.info(
            f'  unchanged: {len(unchanged_domain):,} ({unchanged_domain_perc:.1f}%)'
        )
        self.logger.info(
            f'  reassigned: {len(changed_domain):,} ({changed_domain_perc:.1f}%)'
        )

        self.logger.info(
            'Identified {:,} representatives marked as anomalous assemblies at NCBI.'
            .format(len(ncbi_anomalous_assembly)))

        self.logger.info(
            'Identified {:,} GTDB clusters to be disbanded.'.format(
                disbanded_count))
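
parse_disbanded_cluster_ledger is assumed to read a simple ledger of representative genome IDs flagged for disbanding. A minimal sketch, assuming one ID per line in the first column with '#' comments allowed; the actual file layout may differ.

def parse_disbanded_cluster_ledger(disband_cluster_ledger):
    """Hypothetical sketch: return the set of representative genome IDs to
    disband, taken from the first column of the ledger file (assumed layout)."""
    disbanded_rids = set()
    with open(disband_cluster_ledger) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#'):
                disbanded_rids.add(line.split('\t')[0])

    return disbanded_rids
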
    def run(self, lpsn_metadata_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, qc_passed_file, ncbi_genbank_assembly_file,
            gtdb_type_strains_ledger, untrustworthy_type_ledger):
        """Identify type genomes based on type 16S rRNA sequences indicated at LPSN."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # get LPSN species names with specified sequence type material
        self.logger.info('Parsing LPSN type 16S rRNA data.')
        lpsn_sp_type_ssu = self.parse_lpsn_ssu_metadata(lpsn_metadata_file)
        self.logger.info(
            f' - identified {len(lpsn_sp_type_ssu):,} species with type 16S rRNA sequence.'
        )

        # get NCBI species assignments for genomes and identify genomes
        # marked as being type strain genomes
        ncbi_candidatus = set()
        ncbi_sp_gids = defaultdict(set)
        ncbi_assem_report = {}
        gtdb_type_strains = defaultdict(set)
        for gid in cur_genomes:
            ncbi_sp = cur_genomes[gid].ncbi_taxa.species
            ncbi_sp_gids[ncbi_sp].add(gid)

            if 'Candidatus' in cur_genomes[gid].ncbi_unfiltered_taxa.species:
                ncbi_candidatus.add(gid)

            if cur_genomes[gid].is_gtdb_type_strain():
                gtdb_type_strains[ncbi_sp].add(gid)

            ncbi_assem_report[gid] = cur_genomes.genomic_files[gid].replace(
                '_genomic.fna', '_assembly_report.txt')

        # match LPSN species with type rRNA sequences to genomes
        # with the same NCBI species classification
        self.logger.info(
            'Identifying type genomes through LPSN type 16S rRNA sequences.')

        worker_queue = mp.Queue()
        writer_queue = mp.Queue()

        for lpsn_sp, rRNA in lpsn_sp_type_ssu.items():
            worker_queue.put((lpsn_sp, rRNA))

        for _ in range(self.cpus):
            worker_queue.put(None)

        try:
            worker_proc = [
                mp.Process(target=self._worker,
                           args=(cur_genomes, ncbi_sp_gids, ncbi_candidatus,
                                 ncbi_assem_report, worker_queue,
                                 writer_queue)) for _ in range(self.cpus)
            ]
            write_proc = mp.Process(target=self._writer,
                                    args=(cur_genomes, gtdb_type_strains,
                                          len(lpsn_sp_type_ssu), writer_queue))

            write_proc.start()

            for p in worker_proc:
                p.start()

            for p in worker_proc:
                p.join()

            writer_queue.put(None)
            write_proc.join()
        except:  # terminate all processes on any failure, including keyboard interrupts
            for p in worker_proc:
                p.terminate()
            write_proc.terminate()

        self.logger.info(
            "[IMPORTANT]: add genomes where `Is GTDB type genome` is FALSE to the `gtdb_type_strains` ledger."
        )
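
The multiprocessing block above assumes _worker and _writer follow the usual queue/sentinel contract: workers consume (lpsn_sp, rRNA) items until they read a None sentinel, and the writer runs until it receives None after all workers have joined. A minimal sketch of the worker side only; the 16S matching logic is omitted and the result payload is illustrative.

    def _worker(self, cur_genomes, ncbi_sp_gids, ncbi_candidatus,
                ncbi_assem_report, worker_queue, writer_queue):
        """Hypothetical sketch of the worker contract implied above."""
        while True:
            item = worker_queue.get(block=True)
            if item is None:
                break

            lpsn_sp, rRNA = item
            # compare the type 16S rRNA sequence against genomes assigned to
            # this species at NCBI (matching logic intentionally omitted)
            matched_gids = []
            writer_queue.put((lpsn_sp, matched_gids))
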
    def run(self, manual_taxonomy, cur_gtdb_metadata_file, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            gtdb_type_strains_ledger, sp_priority_ledger,
            genus_priority_ledger, ncbi_env_bioproject_ledger, lpsn_gss_file):
        """Finalize species names based on results of manual curation."""

        # initialize species priority manager
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  lpsn_gss_file,
                                                  self.output_dir)

        # identify species and genus names updated during manual curation
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_taxonomy)))

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            f' - current genome set contains {len(cur_genomes):,} genomes.')

        # establish appropriate species names for GTDB clusters with new representatives
        self.logger.info(
            'Identifying type species genomes with incongruent GTDB genus assignments.'
        )
        fout = open(
            os.path.join(self.output_dir, 'type_species_incongruencies.tsv'),
            'w')
        fout.write(
            'Genome ID\tGTDB genus\tNCBI genus\tGTDB genus priority date\tNCBI genus priority date\tPriority status\tNCBI RefSeq note\n'
        )
        num_incongruent = 0
        for rid, taxa in mc_taxonomy.items():
            if cur_genomes[rid].is_gtdb_type_species():
                gtdb_genus = taxa[Taxonomy.GENUS_INDEX]
                ncbi_genus = cur_genomes[rid].ncbi_taxa.genus

                if gtdb_genus != ncbi_genus:
                    priority_genus = sp_priority_mngr.genus_priority(
                        gtdb_genus, ncbi_genus)

                    if priority_genus != gtdb_genus:
                        num_incongruent += 1

                        if priority_genus == ncbi_genus:
                            priority_status = 'NCBI genus name has priority'
                        else:
                            priority_status = 'Genus with priority must be manually established'

                        fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            rid, gtdb_genus, ncbi_genus,
                            sp_priority_mngr.genus_priority_year(gtdb_genus),
                            sp_priority_mngr.genus_priority_year(ncbi_genus),
                            priority_status,
                            cur_genomes[rid].excluded_from_refseq_note))

        self.logger.info(
            ' - identified {:,} genomes with incongruent genus assignments.'.
            format(num_incongruent))
        fout.close()
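
# A minimal sketch of the genus priority comparison driving the report above,
# assuming a hypothetical priority_year dict keyed by genus name (the real
# decision is made by SpeciesPriorityManager.genus_priority, which also
# consults manually curated ledgers).
def genus_priority_sketch(gtdb_genus, ncbi_genus, priority_year):
    """Return the genus with nomenclatural priority, or None if it must be set manually."""
    gtdb_year = priority_year.get(gtdb_genus)
    ncbi_year = priority_year.get(ncbi_genus)

    if gtdb_year is None or ncbi_year is None or gtdb_year == ncbi_year:
        return None  # priority must be established manually

    return gtdb_genus if gtdb_year < ncbi_year else ncbi_genus

# e.g. genus_priority_sketch('g__Bacillus', 'g__Priestia',
#                            {'g__Bacillus': 1872, 'g__Priestia': 2020})
# returns 'g__Bacillus' since the earlier publication year has priority
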
Example #20
0
    def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
            cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file,
            gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            sp_priority_ledger, gtdb_taxa_updates_ledger, dsmz_bacnames_file):
        """Perform initial actions required for changed representatives."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            gtdbtk_classify_file=gtdbtk_classify_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # read named GTDB species clusters
        self.logger.info('Reading GTDB species clusters.')
        cur_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # set current genomes to have same GTDB assignments as in previous
        # GTDB release. This is necessary, since genomes may have different
        # NCBI accession numbers between releases and thus the previous GTDB
        # taxonomy will not be reflected in the latest GTDB database. The
        # exception is if a genome has changed domains, in which case the
        # previous assignment is invalid.
        self.logger.info(
            'Setting GTDB taxonomy of genomes in current genome set.')
        update_count = 0
        conflicting_domain_count = 0
        for prev_gid in prev_genomes:
            if prev_gid in cur_genomes:
                if prev_genomes[prev_gid].gtdb_taxa != cur_genomes[
                        prev_gid].gtdb_taxa:
                    if prev_genomes[prev_gid].gtdb_taxa.domain == cur_genomes[
                            prev_gid].gtdb_taxa.domain:
                        update_count += 1
                        cur_genomes[prev_gid].gtdb_taxa.update_taxa(
                            prev_genomes[prev_gid].gtdb_taxa)
                    else:
                        conflicting_domain_count += 1
        self.logger.info(f' ... updated {update_count:,} genomes.')
        self.logger.info(
            f' ... identified {conflicting_domain_count:,} genomes with conflicting domain assignments.'
        )

        # get explicit updates to previous GTDB taxa
        self.logger.info('Reading explicit taxa updates.')
        explicit_taxon_updates = self._parse_explicit_taxa_updates(
            gtdb_taxa_updates_ledger)
        self.logger.info(
            f' ... identified {len(explicit_taxon_updates):,} updates.')

        self.logger.info(
            'Updating current genomes to reflect explicit taxa updates.')
        update_count = 0
        for cur_taxon, new_taxon in explicit_taxon_updates.items():
            rank_prefix = cur_taxon[0:3]
            rank_index = Taxonomy.rank_prefixes.index(rank_prefix)

            for gid in cur_genomes:
                if cur_genomes[gid].gtdb_taxa.get_taxa(
                        rank_index) == cur_taxon:
                    update_count += 1
                    cur_genomes[gid].gtdb_taxa.set_taxa(rank_index, new_taxon)

                    if rank_prefix == 'g__':
                        # should also update the species name
                        new_sp = cur_genomes[gid].gtdb_taxa.species.replace(
                            cur_taxon[3:], new_taxon[3:])
                        cur_genomes[gid].gtdb_taxa.set_taxa(
                            rank_index + 1, new_sp)

        self.logger.info(f' ... updated {update_count:,} genomes.')

        # initialize species priority manager
        self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                       dsmz_bacnames_file)

        # create table with new NCBI genera that likely need to be incorporated into
        # this release of the GTDB
        self.new_ncbi_genera(prev_genomes, cur_genomes, cur_clusters,
                             gtdbtk_classify_file)

        self.new_ncbi_families(prev_genomes, cur_genomes, cur_clusters,
                               gtdbtk_classify_file)
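
# A minimal sketch of parsing the taxa updates ledger consumed by
# _parse_explicit_taxa_updates, assuming a tab-separated file with a header
# line and columns <current taxon> and <new taxon> (the real ledger format
# may include additional columns).
def parse_explicit_taxa_updates_sketch(ledger_file):
    """Read explicit taxon renames, e.g. 'g__Foo' -> 'g__Bar'."""
    updates = {}
    with open(ledger_file, encoding='utf-8') as f:
        f.readline()  # skip header
        for line in f:
            tokens = [t.strip() for t in line.split('\t')]
            if len(tokens) >= 2 and tokens[0] and tokens[1]:
                updates[tokens[0]] = tokens[1]
    return updates
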
    def run(self, 
                metadata_file,
                cur_uba_gid_file,
                ncbi_genbank_assembly_file,
                gtdb_domain_report,
                qc_exception_file,
                min_comp,
                max_cont,
                min_quality,
                sh_exception,
                min_perc_markers,
                max_contigs,
                min_N50,
                max_ambiguous,
                output_dir):
        """Quality check all potential GTDB genomes."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(metadata_file,
                                                create_sp_clusters=False,
                                                uba_genome_file=cur_uba_gid_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # parse genomes flagged as exceptions from QC
        qc_exceptions = set()
        with open(qc_exception_file, encoding='utf-8') as f:
            f.readline()
            for line in f:
                gid = canonical_gid(line.split('\t')[0].strip())
                qc_exceptions.add(gid)
        self.logger.info(f'Identified {len(qc_exceptions):,} genomes flagged as exceptions from QC.')
        
        # get percentage of bac120 or ar122 marker genes
        marker_perc = self.read_marker_percentages(gtdb_domain_report, 
                                                    cur_genomes)

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

        # QC all genomes
        self.logger.info('Validating genomes.')
        fout_retained = open(os.path.join(output_dir, 'qc_passed.tsv'), 'w')
        fout_failed = open(os.path.join(output_dir, 'qc_failed.tsv'), 'w')
        
        header = 'Accession\tNCBI species\tGTDB taxonomy'
        header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'
        
        fout_retained.write(header + '\tNote\n')
        fout_failed.write(header)
        fout_failed.write('\tFailed completeness\tFailed contamination\tFailed quality')
        fout_failed.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n')

        pass_qc_gids = set()
        failed_qc_gids = set()
        for gid in cur_genomes:
            failed_tests = defaultdict(int)
            passed_qc = cur_genomes[gid].pass_qc(marker_perc[gid],
                                                    min_comp,
                                                    max_cont,
                                                    min_quality,
                                                    sh_exception,
                                                    min_perc_markers,
                                                    max_contigs,
                                                    min_N50,
                                                    max_ambiguous,
                                                    failed_tests)

            if passed_qc or gid in qc_exceptions:
                pass_qc_gids.add(gid)
                fout_retained.write('%s\t%s\t%s' % (gid, cur_genomes[gid].ncbi_taxa.species, cur_genomes[gid].gtdb_taxa))
                fout_retained.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\t%s\n' % (
                                        cur_genomes[gid].comp,
                                        cur_genomes[gid].cont,
                                        cur_genomes[gid].comp-5*cur_genomes[gid].cont,
                                        ('%.2f' % cur_genomes[gid].strain_heterogeneity_100) if cur_genomes[gid].strain_heterogeneity_100 else '-',
                                        marker_perc[gid],
                                        cur_genomes[gid].contig_count,
                                        cur_genomes[gid].contig_n50,
                                        cur_genomes[gid].ambiguous_bases,
                                        'Passed QC' if passed_qc else 'Flagged as exception'))
            else:
                failed_qc_gids.add(gid) 
                fout_failed.write('%s\t%s\t%s' % (gid, cur_genomes[gid].ncbi_taxa.species, cur_genomes[gid].gtdb_taxa))
                fout_failed.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                                        cur_genomes[gid].comp,
                                        cur_genomes[gid].cont,
                                        cur_genomes[gid].comp-5*cur_genomes[gid].cont,
                                        ('%.2f' % cur_genomes[gid].strain_heterogeneity_100) if cur_genomes[gid].strain_heterogeneity_100 else '-',
                                        marker_perc[gid],
                                        cur_genomes[gid].contig_count,
                                        cur_genomes[gid].contig_n50,
                                        cur_genomes[gid].ambiguous_bases))
                fout_failed.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                                    failed_tests['comp'],
                                    failed_tests['cont'],
                                    failed_tests['qual'],
                                    failed_tests['marker_perc'],
                                    failed_tests['contig_count'],
                                    failed_tests['N50'],
                                    failed_tests['ambig']))
        fout_retained.close()
        fout_failed.close()
        
        self.logger.info('Retained {:,} ({:.2f}%) genomes and filtered {:,} ({:.2f}%) genomes.'.format(
                            len(pass_qc_gids),
                            len(pass_qc_gids)*100.0/len(cur_genomes),
                            len(failed_qc_gids),
                            len(failed_qc_gids)*100.0/len(cur_genomes)))
        
        # check domain assignment of genomes passing QC
        # report potential issues
        self.check_domain_assignments(gtdb_domain_report, 
                                        cur_genomes,
                                        pass_qc_gids)
                                                                
        # QC genomes in each named species
        named_ncbi_species = cur_genomes.named_ncbi_species()
        self.logger.info(f'Performing QC of type genome for each of the {len(named_ncbi_species):,} NCBI species.')
        
        fout_type_fail = open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w')
        fout_type_fail.write('NCBI species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (Mbp)')
        fout_type_fail.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
        fout_type_fail.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n')
        
        fout_fail_sp = open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w')
        fout_fail_sp.write('NCBI species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (Mbp)')
        fout_fail_sp.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
        fout_fail_sp.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
        fout_fail_sp.write('\tFailed completeness\tFailed contamination\tFailed quality')
        fout_fail_sp.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases')
        fout_fail_sp.write('\tNCBI exclude from RefSeq\n')
        
        fout_sp_lost = open(os.path.join(output_dir, 'species_lost.tsv'), 'w')
        fout_sp_lost.write('NCBI species\tNo. genomes\tNo. type genomes')
        fout_sp_lost.write('\tFail completeness\tFail contamination\tFail quality\tFailed percent markers')
        fout_sp_lost.write('\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')
        
        lost_type = 0
        lost_sp = 0
        filtered_genomes = 0
        failed_tests_cumulative = defaultdict(int)
        for sp, gids in named_ncbi_species.items():
            type_pass = set()
            type_fail = set()
            other_pass = set()
            other_fail = set()
            
            failed_tests_gids = {}
            for gid in gids:
                failed_tests = defaultdict(int)
                passed_qc = cur_genomes[gid].pass_qc(marker_perc[gid],
                                                        min_comp,
                                                        max_cont,
                                                        min_quality,
                                                        sh_exception,
                                                        min_perc_markers,
                                                        max_contigs,
                                                        min_N50,
                                                        max_ambiguous,
                                                        failed_tests)
                                    
                failed_tests_gids[gid] = failed_tests

                if cur_genomes[gid].is_gtdb_type_strain() or cur_genomes[gid].is_ncbi_type_strain():
                    if passed_qc or gid in qc_exceptions:
                        type_pass.add(gid)
                    else:
                        type_fail.add(gid)
                        filtered_genomes += 1
                else:
                    if passed_qc or gid in qc_exceptions:
                        other_pass.add(gid)
                    else:
                        other_fail.add(gid)
                        filtered_genomes += 1
                        
                # tally failed QC tests across all genomes of this species
                for test, count in failed_tests.items():
                    failed_tests_cumulative[test] += count

            if len(type_pass) >= 1:
                # great: one or more type genomes pass QC and will be selected as the type genome
                continue 
            
            if len(type_fail):
                # all potential type genomes for species failed QC so report these for manual inspection
                lost_type += 1
                for gid in type_fail:
                    fout_type_fail.write('%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n' % (
                                            sp,
                                            gid,
                                            cur_genomes[gid].gtdb_taxa,
                                            cur_genomes[gid].ncbi_taxa,
                                            cur_genomes[gid].gtdb_type_designation_sources,
                                            cur_genomes[gid].ncbi_type_material,
                                            float(cur_genomes[gid].length)/1e6,
                                            cur_genomes[gid].comp,
                                            cur_genomes[gid].cont,
                                            cur_genomes[gid].comp-5*cur_genomes[gid].cont,
                                            cur_genomes[gid].strain_heterogeneity_100,
                                            marker_perc[gid],
                                            cur_genomes[gid].contig_count,
                                            cur_genomes[gid].contig_n50,
                                            cur_genomes[gid].ambiguous_bases,
                                            excluded_from_refseq_note[gid],
                                            len(other_pass) == 0))
                
            if len(other_pass) == 0:
                # no genomes for species pass QC so report loss of species
                lost_sp += 1
                fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
                fout_sp_lost.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                                    sum([failed_tests_gids[gid]['comp'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['cont'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['qual'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['marker_perc'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['contig_count'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['N50'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['ambig'] for gid in gids])))
                                    
                for gid in type_fail.union(other_fail):
                    fout_fail_sp.write('%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d' % (
                                            sp,
                                            gid,
                                            cur_genomes[gid].gtdb_taxa,
                                            cur_genomes[gid].ncbi_taxa,
                                            gid in type_fail,
                                            float(cur_genomes[gid].length)/1e6,
                                            cur_genomes[gid].comp,
                                            cur_genomes[gid].cont,
                                            cur_genomes[gid].comp-5*cur_genomes[gid].cont,
                                            cur_genomes[gid].strain_heterogeneity_100,
                                            marker_perc[gid],
                                            cur_genomes[gid].contig_count,
                                            cur_genomes[gid].contig_n50,
                                            cur_genomes[gid].ambiguous_bases))
                    fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                                        failed_tests_gids[gid]['comp'],
                                        failed_tests_gids[gid]['cont'],
                                        failed_tests_gids[gid]['qual'],
                                        failed_tests_gids[gid]['marker_perc'],
                                        failed_tests_gids[gid]['contig_count'],
                                        failed_tests_gids[gid]['N50'],
                                        failed_tests_gids[gid]['ambig']))
                    fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

        fout_type_fail.close()
        fout_fail_sp.close()
        fout_sp_lost.close()
        
        self.logger.info(f'Filtered {filtered_genomes:,} genomes assigned to NCBI species.')
        self.logger.info(f'Identified {lost_type:,} species with type genomes failing QC and {lost_sp:,} total species failing QC.')
        self.logger.info('Genomes from NCBI species filtered by each criterion:')
        for test in sorted(failed_tests_cumulative):
            self.logger.info(f'{test}: {failed_tests_cumulative[test]:,}')
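
# A minimal sketch of the QC thresholds evaluated by pass_qc, assuming plain
# numeric inputs (the real method reads these values from genome metadata and
# also applies the strain heterogeneity exception, which is omitted here).
# Failed criteria are tallied in failed_tests using the same keys reported above.
from collections import defaultdict

def pass_qc_sketch(comp, cont, marker_perc, contig_count, n50, ambiguous_bases,
                   min_comp, max_cont, min_quality, min_perc_markers,
                   max_contigs, min_n50, max_ambiguous, failed_tests):
    """Return True if a genome passes all QC thresholds."""
    passed = True
    if comp < min_comp:
        failed_tests['comp'] += 1
        passed = False
    if cont > max_cont:
        failed_tests['cont'] += 1
        passed = False
    if comp - 5 * cont < min_quality:
        failed_tests['qual'] += 1
        passed = False
    if marker_perc < min_perc_markers:
        failed_tests['marker_perc'] += 1
        passed = False
    if contig_count > max_contigs:
        failed_tests['contig_count'] += 1
        passed = False
    if n50 < min_n50:
        failed_tests['N50'] += 1
        passed = False
    if ambiguous_bases > max_ambiguous:
        failed_tests['ambig'] += 1
        passed = False
    return passed

if __name__ == '__main__':
    failed = defaultdict(int)
    print(pass_qc_sketch(98.5, 1.2, 95.0, 150, 45000, 10000,
                         50, 10, 50, 40, 1000, 5000, 100000, failed))  # True
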
Example #22
0
    def run(self, named_cluster_file,
            cur_gtdb_metadata_file,
            cur_genomic_path_file,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            ani_af_rep_vs_nonrep,
            gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger):
        """Infer de novo species clusters and representatives for remaining genomes."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                            create_sp_clusters=False,
                                            qc_passed_file=qc_passed_file,
                                            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                            untrustworthy_type_ledger=untrustworthy_type_file,
                                            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # get path to previous and current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # determine representatives and genomes clustered to each representative
        self.logger.info('Reading named GTDB species clusters.')
        named_rep_gids, rep_clustered_gids, rep_radius = self.parse_named_clusters(
            named_cluster_file)
        self.logger.info(
            ' - identified {:,} representative genomes.'.format(len(named_rep_gids)))
        self.logger.info(
            ' - identified {:,} clustered genomes.'.format(len(rep_clustered_gids)))

        # determine genomes left to be clustered
        unclustered_gids = set(cur_genomes.genomes.keys()) - \
            named_rep_gids - rep_clustered_gids
        self.logger.info('Identified {:,} unclustered genomes passing QC.'.format(
            len(unclustered_gids)))

        # establish closest representative for each unclustered genome
        self.logger.info('Determining ANI circumscription for {:,} unclustered genomes.'.format(
            len(unclustered_gids)))
        nonrep_radius = self.nonrep_radius(
            unclustered_gids, named_rep_gids, ani_af_rep_vs_nonrep)

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info(
            'Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self.mash_ani_unclustered(cur_genomes, unclustered_gids)

        # select de novo species representatives in a greedy fashion based on genome quality
        de_novo_rep_gids = self.selected_rep_genomes(cur_genomes,
                                                     nonrep_radius,
                                                     unclustered_gids,
                                                     mash_anis)

        # cluster all non-representative genomes to representative genomes
        final_cluster_radius = rep_radius.copy()
        final_cluster_radius.update(nonrep_radius)

        final_clusters, _ani_af = self.cluster_genomes(cur_genomes,
                                                       de_novo_rep_gids,
                                                       named_rep_gids,
                                                       final_cluster_radius)

        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]

        self.logger.info(
            'Writing {:,} species clusters to file.'.format(len(final_clusters)))
        self.logger.info('Writing {:,} cluster radius information to file.'.format(
            len(final_cluster_radius)))

        write_clusters(final_clusters,
                       final_cluster_radius,
                       cur_genomes,
                       os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv'))

        write_rep_radius(final_cluster_radius,
                         cur_genomes,
                         os.path.join(self.output_dir, 'gtdb_ani_radius_de_novo.tsv'))

        # write out archaeal and bacterial GTDB representatives
        fout_ar = open(os.path.join(self.output_dir, 'gtdb_reps_ar.lst'), 'w')
        fout_bac = open(os.path.join(
            self.output_dir, 'gtdb_reps_bac.lst'), 'w')
        for rid in final_clusters:
            if cur_genomes[rid].gtdb_taxa.domain == 'd__Bacteria':
                fout_bac.write('{}\n'.format(cur_genomes[rid].ncbi_accn))
            elif cur_genomes[rid].gtdb_taxa.domain == 'd__Archaea':
                fout_ar.write('{}\n'.format(cur_genomes[rid].ncbi_accn))
            else:
                self.logger.error(
                    'GTDB representative has unassigned domain: {}'.format(rid))

        fout_ar.close()
        fout_bac.close()
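
# A minimal sketch of greedy representative selection in the spirit of
# selected_rep_genomes, assuming a precomputed quality score per genome and a
# symmetric dict of ANI estimates between unclustered genomes (the real method
# works from Mash ANI estimates and per-genome circumscription radii).
def select_reps_sketch(unclustered_gids, quality, anis, ani_radius=95.0):
    """Greedily pick representatives from highest- to lowest-quality genomes."""
    reps = []
    for gid in sorted(unclustered_gids, key=lambda g: quality[g], reverse=True):
        # skip genomes already within the ANI radius of a selected representative
        if any(anis.get(gid, {}).get(rid, 0) >= ani_radius for rid in reps):
            continue
        reps.append(gid)
    return reps
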
    def run(self, named_cluster_file, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, ani_af_rep_vs_nonrep,
            gtdb_type_strains_ledger, sp_priority_ledger):
        """Cluster genomes to selected GTDB representatives."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # read named GTDB species clusters
        self.logger.info(
            'Reading named and previous placeholder GTDB species clusters.')
        clusters, rep_radius = read_clusters(named_cluster_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(clusters),
                sum([len(gids) + 1 for gids in clusters.values()])))

        # identify NCBI species with multiple genomes assembled from type strain of species
        self.logger.info(
            'Determining effective type strain genomes in each NCBI species.')
        ncbi_sp_type_strain_genomes = cur_genomes.ncbi_sp_effective_type_genomes(
        )
        self.logger.info(
            ' ... identified effective type strain genomes for {:,} NCBI species.'
            .format(len(ncbi_sp_type_strain_genomes)))

        # verify that type genomes for a species are contained in a
        # single GTDB species cluster
        rid_map = {}
        for rid, gids in clusters.items():
            rid_map[rid] = rid
            for gid in gids:
                rid_map[gid] = rid

        for ncbi_sp, type_gids in ncbi_sp_type_strain_genomes.items():
            gtdb_rids = set(
                [rid_map[gid] for gid in type_gids if gid in rid_map])
            if len(gtdb_rids) > 1:
                self.logger.warning(
                    'Type strain genomes from NCBI species {} were assigned to {:,} GTDB species clusters: {}.'
                    .format(ncbi_sp, len(gtdb_rids),
                            [(rid, cur_genomes[rid].gtdb_taxa.species)
                             for rid in gtdb_rids]))

        # identify synonyms
        self.logger.info('Identifying synonyms.')
        synonyms = defaultdict(list)
        failed_type_strain_priority = 0
        for rid, gids in clusters.items():
            rep_ncbi_sp = cur_genomes[rid].ncbi_taxa.species

            # find species that are a synonym to the current representative,
            # using the best quality genome for each species to establish
            # synonym statistics such as ANI and AF
            type_gids = [
                gid for gid in gids
                if cur_genomes[gid].is_effective_type_strain()
            ]
            if not cur_genomes[rid].is_effective_type_strain() and len(
                    type_gids) > 0:
                failed_type_strain_priority += 1
                continue

            q = {
                gid: cur_genomes[gid].score_type_strain()
                for gid in type_gids
            }
            q_sorted = sorted(q.items(),
                              key=lambda kv: (kv[1], kv[0]),
                              reverse=True)
            processed_sp = set()
            for gid, _quality in q_sorted:
                cur_ncbi_sp = cur_genomes[gid].ncbi_taxa.species

                if cur_ncbi_sp in processed_sp:
                    continue

                if cur_ncbi_sp != rep_ncbi_sp:
                    synonyms[rid].append(gid)
                    processed_sp.add(cur_ncbi_sp)

        self.logger.info(
            ' ... identified {:,} GTDB representatives resulting in {:,} synonyms.'
            .format(len(synonyms),
                    sum([len(gids) for gids in synonyms.values()])))

        if failed_type_strain_priority:
            self.logger.warning(
                f'Identified {failed_type_strain_priority:,} non-type strain representatives that failed to prioritize an effective type strain genome.'
            )

        # read ANI and AF between representatives and non-representative genomes
        self.logger.info(
            'Reading ANI and AF between representative and non-representative genomes.'
        )
        ani_af = pickle.load(open(ani_af_rep_vs_nonrep, 'rb'))

        # write out synonyms
        self.write_synonym_table(synonyms, cur_genomes, ani_af,
                                 sp_priority_ledger)
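
# A minimal sketch of how a single genome per NCBI species is selected when
# declaring synonyms above: genomes in a cluster are ranked by a type strain
# quality score and only the first genome seen for each species is kept
# (the accessors here are stand-ins for the Genomes properties used above).
def pick_synonym_genomes_sketch(gids, ncbi_species_of, score_of, rep_ncbi_sp):
    """Return the highest-scoring genome for each non-representative NCBI species."""
    selected = []
    processed_sp = set()
    for gid in sorted(gids, key=lambda g: (score_of[g], g), reverse=True):
        sp = ncbi_species_of[gid]
        if sp == rep_ncbi_sp or sp in processed_sp:
            continue
        selected.append(gid)
        processed_sp.add(sp)
    return selected
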
    def run(self, named_cluster_file,
                    cur_gtdb_metadata_file,
                    cur_genomic_path_file,
                    uba_genome_paths,
                    qc_passed_file,
                    ncbi_genbank_assembly_file,
                    untrustworthy_type_file,
                    ani_af_rep_vs_nonrep,
                    gtdb_type_strains_ledger):
        """Infer de novo species clusters and representatives for remaining genomes."""
        
        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=uba_genome_paths,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to previous and current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # determine representatives and genomes clustered to each representative
        self.logger.info('Reading named GTDB species clusters.')
        named_rep_gids, rep_clustered_gids, rep_radius = self._parse_named_clusters(named_cluster_file)
        self.logger.info(' ... identified {:,} representative genomes.'.format(len(named_rep_gids)))
        self.logger.info(' ... identified {:,} clustered genomes.'.format(len(rep_clustered_gids)))
        
        # determine genomes left to be clustered
        unclustered_gids = set(cur_genomes.genomes.keys()) - named_rep_gids - rep_clustered_gids
        self.logger.info('Identified {:,} unclustered genomes passing QC.'.format(len(unclustered_gids)))

        # establish closest representative for each unclustered genome
        self.logger.info('Determining ANI circumscription for {:,} unclustered genomes.'.format(len(unclustered_gids)))
        nonrep_radius = self._nonrep_radius(unclustered_gids, named_rep_gids, ani_af_rep_vs_nonrep)

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(cur_genomes, unclustered_gids)

        # select de novo species representatives in a greedy fashion based on genome quality
        de_novo_rep_gids = self._selected_rep_genomes(cur_genomes,
                                                        nonrep_radius, 
                                                        unclustered_gids, 
                                                        mash_anis)

        # cluster all non-representative genomes to representative genomes
        final_cluster_radius = rep_radius.copy()
        final_cluster_radius.update(nonrep_radius)
        
        final_clusters, ani_af = self._cluster_genomes(cur_genomes,
                                                        de_novo_rep_gids,
                                                        named_rep_gids, 
                                                        final_cluster_radius)

        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]

        self.logger.info('Writing {:,} species clusters to file.'.format(len(final_clusters)))
        self.logger.info('Writing {:,} cluster radius information to file.'.format(len(final_cluster_radius)))
        
        write_clusters(final_clusters, 
                        final_cluster_radius, 
                        cur_genomes,
                        os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv'))

        write_rep_radius(final_cluster_radius, 
                            cur_genomes,
                            os.path.join(self.output_dir, 'gtdb_ani_radius_de_novo.tsv'))
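
# A minimal sketch of estimating a circumscription radius for unclustered
# genomes in the spirit of _nonrep_radius, assuming ani_af maps each query
# genome to {rep_gid: (ani, af)} and that the radius starts at a default of
# 95% ANI and tightens to the ANI of the closest representative when that ANI
# is higher (an assumption about the policy; consult the real method for the
# exact rule).
def nonrep_radius_sketch(unclustered_gids, rep_gids, ani_af, default_ani=95.0):
    """Estimate an ANI circumscription radius for each unclustered genome."""
    radius = {}
    for gid in unclustered_gids:
        radius[gid] = default_ani
        for rid, (ani, _af) in ani_af.get(gid, {}).items():
            if rid in rep_gids and ani > radius[gid]:
                radius[gid] = ani
    return radius
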
    def run(self, named_rep_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            rep_mash_sketch_file, rep_ani_file, gtdb_type_strains_ledger):
        """Cluster genomes to selected GTDB representatives."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to previous and current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # get representative genomes
        rep_gids = set()
        with open(named_rep_file) as f:
            header = f.readline().strip().split('\t')
            rep_index = header.index('Representative')
            sp_index = header.index('Proposed species')

            for line in f:
                line_split = line.strip().split('\t')
                gid = line_split[rep_index]
                assert gid in cur_genomes
                rep_gids.add(gid)

        self.logger.info(
            'Identified representative genomes for {:,} species.'.format(
                len(rep_gids)))

        # calculate circumscription radius for representative genomes
        self.logger.info(
            'Determining ANI species circumscription for {:,} representative genomes.'
            .format(len(rep_gids)))
        rep_radius = self._rep_radius(rep_gids, rep_ani_file)
        write_rep_radius(
            rep_radius, cur_genomes,
            os.path.join(self.output_dir, 'gtdb_rep_ani_radius.tsv'))

        # calculate ANI between representative and non-representative genomes
        self.logger.info(
            'Calculating ANI between representative and non-representative genomes.'
        )
        ani_af = self._calculate_ani(cur_genomes, rep_gids,
                                     rep_mash_sketch_file)
        self.logger.info(
            ' ... ANI values determined for {:,} query genomes.'.format(
                len(ani_af)))
        self.logger.info(
            ' ... ANI values determined for {:,} genome pairs.'.format(
                sum([len(ani_af[qid]) for qid in ani_af])))

        # cluster remaining genomes to representatives
        non_reps = set(cur_genomes.genomes) - set(rep_radius)
        self.logger.info(
            'Clustering {:,} non-representatives to {:,} representatives using species-specific ANI radii.'
            .format(len(non_reps), len(rep_radius)))
        clusters = self._cluster(ani_af, non_reps, rep_radius)

        # write out clusters
        write_clusters(
            clusters, rep_radius, cur_genomes,
            os.path.join(self.output_dir, 'gtdb_named_rep_clusters.tsv'))
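
# A minimal sketch of clustering non-representatives to representatives in the
# spirit of _cluster, assuming ani_af maps each query genome to
# {rep_gid: (ani, af)}, rep_radius maps each representative to a plain ANI
# radius (the real code stores richer per-representative radius records), and
# an assumed minimum alignment fraction is applied here for illustration.
def cluster_sketch(ani_af, non_reps, rep_radius, min_af=0.65):
    """Assign each non-representative genome to at most one representative."""
    clusters = {rid: [] for rid in rep_radius}
    for gid in non_reps:
        best_rid, best_ani = None, 0.0
        for rid, (ani, af) in ani_af.get(gid, {}).items():
            if (rid in rep_radius and ani >= rep_radius[rid]
                    and af >= min_af and ani > best_ani):
                best_rid, best_ani = rid, ani
        if best_rid is not None:
            clusters[best_rid].append(gid)
    return clusters
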
Example #26
0
    def run(self, cur_gtdb_metadata_file, cur_genomic_path_file,
            qc_passed_file, ncbi_genbank_assembly_file, ltp_taxonomy_file,
            gtdb_type_strains_ledger, untrustworthy_type_ledger,
            ncbi_env_bioproject_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain."""

        # get species in LTP reference database
        self.logger.info(
            'Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(
            f' - identified {len(ltp_defined_species):,} species.')

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_ledger,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # parsing genomes manually established to be untrustworthy as type
        self.logger.info(
            'Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = self.parse_untrustworthy_type_ledger(
            untrustworthy_type_ledger)
        self.logger.info(
            f' - identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.'
        )

        # Identify NCBI species with multiple genomes assembled from type strain of species. This
        # is done using a series of heuristics that aim to ensure that the selected type strain
        # genome is reliable. More formal evaluation and a manuscript describing this selection
        # process is ultimately required. Ideally, the community will eventually adopt a
        # database that indicates a single `type genome assembly` for each species instead
        # of just indicating a type strain from which many (sometimes dissimilar) assemblies exist.
        self.logger.info(
            'Determining number of type strain genomes in each NCBI species.')
        multi_type_strains_sp = self.sp_with_mult_type_strains(cur_genomes)
        self.logger.info(
            f' - identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.'
        )

        # resolve species with multiple type strain genomes
        fout = open(
            os.path.join(self.output_dir, 'multi_type_strain_species.tsv'),
            'w')
        fout.write(
            'NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n'
        )

        fout_genomes = open(
            os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write(
            'Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment'
        )
        fout_genomes.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\tReason for GTDB untrustworthy as type\n'
        )

        fout_unresolved = open(
            os.path.join(self.output_dir,
                         'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        fout_high_divergence = open(
            os.path.join(self.output_dir,
                         'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        fout_untrustworthy = open(
            os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'),
            'w')
        fout_untrustworthy.write(
            'Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n'
        )

        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                '<not tested>', 'Manual curation: ' + reason))

        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0

        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        ncbi_rep_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0

        # *** Perhaps should be an external flag, but used right now to speed up debugging
        use_pickled_results = False
        if use_pickled_results:
            self.logger.warning(
                'Using previously calculated ANI results in: {}'.format(
                    self.ani_pickle_dir))

        prev_gtdb_sp_conflicts = 0

        self.logger.info(
            'Resolving species with multiple type strain genomes:')
        for ncbi_sp, type_gids in sorted(multi_type_strains_sp.items(),
                                         key=lambda kv: len(kv[1])):
            assert len(type_gids) > 1

            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                ncbi_sp, len(type_gids), processed + 1,
                len(multi_type_strains_sp), (processed + 1) * 100.0 /
                len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            all_similar, anis, afs, gid_anis, gid_afs = self.calculate_type_strain_ani(
                ncbi_sp, type_gids, cur_genomes, use_pickled_results)

            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            unresolved_species = False
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            if not all_similar:
                note = ''

                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True

                # write out highly divergent cases for manual inspection;
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        fout_high_divergence.write(
                            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                            .format(gid, ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggests the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(
                    gid_anis, ncbi_sp, type_gids, ltp_metadata,
                    ltp_defined_species, cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(
                        gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(
                        gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1

                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1

                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1

                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                # try to resolve by considering genomes annotated as representative genomes at NCBI
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_reps(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by considering NCBI representative genomes'
                        ncbi_rep_resolved += 1

                if resolved:
                    unresolved_species = False

                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(
                            specific_epithet(
                                cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1

                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        fout_unresolved.write(
                            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                            .format(gid, ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            # remove genomes marked as untrustworthy as type at NCBI if one or more potential type strain genomes remain
            ncbi_untrustworthy_gids = set([
                gid for gid in type_gids if 'untrustworthy as type' in
                cur_genomes[gid].excluded_from_refseq_note
            ])
            if len(type_gids - set(untrustworthy_gids) -
                   ncbi_untrustworthy_gids) >= 1:
                for gid in ncbi_untrustworthy_gids:
                    untrustworthy_gids[
                        gid] = "Genome annotated as `untrustworthy as type` at NCBI and there are other potential type strain genomes available"

            # report cases where genomes marked as untrustworthy as type at NCBI are being retained as potential type strain genomes
            num_ncbi_untrustworthy = len(ncbi_untrustworthy_gids)
            for gid in type_gids:
                if (gid not in untrustworthy_gids and 'untrustworthy as type'
                        in cur_genomes[gid].excluded_from_refseq_note):
                    self.logger.warning(
                        "Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy]."
                        .format(gid, ncbi_sp, num_ncbi_untrustworthy,
                                len(type_gids)))

            # write out genomes identified as being untrustworthy
            for gid, reason in untrustworthy_gids.items():
                ltp_species = self.ltp_species(gid, ltp_metadata)

                if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                    reason += "; considered `untrustworthy as type` at NCBI"
                fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                    ' / '.join(ltp_species), reason))

                # Sanity check: if the untrustworthy genome has an LTP hit to only the
                # expected species, then all other genomes should also hit the expected
                # species (or have no hit at all). Otherwise, more consideration should
                # be given to the genome with the conflicting LTP hit.
                if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                    other_sp = set()
                    for test_gid in type_gids:
                        test_ltp_species = self.ltp_species(test_gid, ltp_metadata)
                        if test_ltp_species and ncbi_sp not in test_ltp_species:
                            other_sp.update(test_ltp_species)

                    if other_sp:
                        self.logger.warning(
                            f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.'
                        )

            # write out information about all type genomes
            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)

                fout_genomes.write(
                    '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\t{}\n'
                    .format(gid, gid in untrustworthy_gids, ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species), gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa,
                            untrustworthy_gids.get(gid, '')))

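            # write per-species summary of the type strain genome comparisons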
            fout.write(
                '{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                    ncbi_sp, len(type_gids), all_similar, np_mean(anis),
                    np_std(anis), np_mean(afs), np_std(afs), note,
                    ', '.join(type_gids)))

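        # terminate any in-place progress output and close all result tables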
        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()

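        # summarize how the divergent species were resolved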
        self.logger.info(
            f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.'
        )
        self.logger.info(
            f' - resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with conflicting LTP 16S rRNA classifications.'
        )
        self.logger.info(
            f' - resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.'
        )
        self.logger.info(
            f' - resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.'
        )
        self.logger.info(
            f' - resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.'
        )
        self.logger.info(
            f' - resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.'
        )
        self.logger.info(
            f' - resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.'
        )
        self.logger.info(
            f' - resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.'
        )
        self.logger.info(
            f' - resolved {ncbi_rep_resolved:,} species by considering RefSeq reference and representative designations at NCBI.'
        )

        if unresolved_sp_count > 0:
            self.logger.warning(
                f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.'
            )
            self.logger.warning(
                'These should be handled before proceeding with the next step of GTDB species updating.'
            )
            self.logger.warning(
                "This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'."
            )

        self.logger.info(
            f'Identified {prev_gtdb_sp_conflicts:,} cases where the resolved type strain genomes conflict with the prior GTDB species assignment.'
        )
    def run(self, gtdb_metadata_file, genomic_path_file):
        """Dereplicate GTDB species clusters using ANI/AF criteria."""

        # create GTDB genome sets
        self.logger.info('Creating GTDB genome set.')
        genomes = Genomes()
        genomes.load_from_metadata_file(gtdb_metadata_file)
        genomes.load_genomic_file_paths(genomic_path_file)
        self.logger.info(
            ' - genome set has {:,} species clusters spanning {:,} genomes.'.
            format(len(genomes.sp_clusters),
                   genomes.sp_clusters.total_num_genomes()))

        # get GTDB representatives from same genus
        self.logger.info('Identifying GTDB representatives in the same genus.')
        genus_gids = defaultdict(list)
        num_reps = 0
        for gid in genomes:
            if not genomes[gid].gtdb_is_rep:
                continue

            gtdb_genus = genomes[gid].gtdb_taxa.genus
            genus_gids[gtdb_genus].append(gid)
            num_reps += 1
        self.logger.info(
            f' - identified {len(genus_gids):,} genera spanning {num_reps:,} representatives'
        )

        # get all intragenus comparisons
        self.logger.info('Determining all intragenus comparisons.')
        gid_pairs = []
        for gids in genus_gids.values():
            if len(gids) < 2:
                continue

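            # include both orderings of each pair since FastANI comparisons are directional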
            for g1, g2 in permutations(gids, 2):
                gid_pairs.append((g1, g2))
        self.logger.info(
            f' - identified {len(gid_pairs):,} intragenus comparisons')

        # calculate FastANI ANI/AF between all intragenus representative pairs
        self.logger.info('Calculating ANI between intragenus pairs.')
        ani_af = self.fastani.pairs(gid_pairs,
                                    genomes.genomic_files,
                                    report_progress=True,
                                    check_cache=True)
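        # persist newly computed comparisons to the FastANI cache for reuse on subsequent runs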
        self.fastani.write_cache(silence=True)

        # write out results
        fout = open(
            os.path.join(self.output_dir, 'intragenus_ani_af_reps.tsv'), 'w')
        fout.write(
            'Query ID\tQuery species\tTarget ID\tTarget species\tANI\tAF\n')
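        # FastANI comparisons are directional, so reconcile the two directions into a
        # single symmetric ANI/AF value for each representative pair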
        for qid in ani_af:
            for rid in ani_af[qid]:
                ani, af = FastANI.symmetric_ani(ani_af, qid, rid)

                fout.write('{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\n'.format(
                    qid, genomes[qid].gtdb_taxa.species, rid,
                    genomes[rid].gtdb_taxa.species, ani, af))
        fout.close()
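
The table written above can be post-processed downstream. As a minimal sketch (not part of the original example), the snippet below reads intragenus_ani_af_reps.tsv and flags representative pairs above species-level thresholds; the flag_similar_reps helper and the 95% ANI / 0.65 AF cut-offs are illustrative assumptions, not values taken from this code.

import csv

def flag_similar_reps(tsv_path, min_ani=95.0, min_af=0.65):
    """Yield representative pairs whose ANI and AF meet the (assumed) thresholds."""
    with open(tsv_path) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            # column names match the header written by run() above
            ani = float(row['ANI'])
            af = float(row['AF'])
            if ani >= min_ani and af >= min_af:
                yield row['Query ID'], row['Target ID'], ani, af

# example usage
for qid, rid, ani, af in flag_similar_reps('intragenus_ani_af_reps.tsv'):
    print(f'{qid} vs {rid}: ANI={ani:.2f}, AF={af:.3f}')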