Exemple #1
0
    def resolve_gtdb_species(self, gid_anis, ncbi_sp, type_gids, cur_genomes):
        """Try to resolve a species via conflicting GTDB species assignments.

        A type strain genome whose GTDB specific epithet differs from the
        epithet of `ncbi_sp` is flagged as untrustworthy when the GTDB species
        it is assigned to is represented by an effective type strain genome.

        Returns a `(resolved, untrustworthy_gids)` tuple. `untrustworthy_gids`
        maps genome ID to a textual reason and is empty whenever `resolved`
        is False.
        """

        expected_epithet = specific_epithet(ncbi_sp)

        flagged = {}
        num_matching = 0
        for gid in type_gids:
            genome = cur_genomes[gid]
            if expected_epithet == genome.gtdb_taxa.specific_epithet:
                num_matching += 1
                continue

            # genome sits under a different epithet; only a GTDB species
            # anchored by a type strain genome counts as a conflict
            assigned_sp = genome.gtdb_taxa.species
            if assigned_sp == 's__':
                continue

            rep_id = cur_genomes.gtdb_sp_rep(assigned_sp)
            rep_genome = cur_genomes[rep_id]
            if not rep_genome.is_effective_type_strain():
                continue

            ani, af = self.fastani.symmetric_ani_cached(
                gid, rep_id, genome.genomic_file, rep_genome.genomic_file)
            flagged[gid] = f'Conflicting GTDB species assignment of {assigned_sp} [ANI={ani:.2f}%; AF={af:.2f}%]'

        all_similar = self.check_strain_ani(gid_anis, flagged)

        # resolved only if the remaining genomes pass the ANI similarity test,
        # at least one genome was flagged, and at least one genome carries
        # the expected epithet
        if not (all_similar and flagged and num_matching > 0):
            return False, {}

        return True, flagged
    def add_suffixed_species(self, species):
        """Account for species names when generating further placeholder names.

        This is required as species can be transferred between genera and will
        retain their existing suffix. As such, we must track that this genera
        now has a species name with a given suffix.

        (e.g., Lactobacillus_G kunkeei_A is transferred to Apilactobacillus
               as A. kunkeei_A, so the next suffixed A. kunkeei representative
               must have a 'B' suffix.)

        Updates `self.taxon_suffix`, keyed by the canonical species name, in
        place and returns nothing.
        """

        canonical_species = canonical_taxon(species)
        if canonical_species not in self.taxon_suffix:
            # NOTE(review): the first record for a canonical species is always
            # 'A', whatever suffix `species` itself carries -- confirm intended.
            self.taxon_suffix[canonical_species] = 'A'
            return

        # canonical species already tracked: keep whichever suffix is higher
        suffix = taxon_suffix(specific_epithet(species))
        if self.is_higher_suffix(suffix, self.taxon_suffix[canonical_species]):
            self.taxon_suffix[canonical_species] = suffix
    def identify_consensus_synonyms(self, ncbi_misclassified_gids):
        """Identify synonyms arising from all genomes with an NCBI species classification
            which lack a type strain genome being contained in a GTDB cluster defined by
            a type strain genome.

        Args:
            ncbi_misclassified_gids: container of genome IDs to ignore when
                collecting the genomes of each NCBI species.

        Returns:
            defaultdict(list) mapping a GTDB representative ID to one genome ID
            per NCBI species found to be a consensus synonym of that cluster.
            The genome reported is the one with the highest
            `score_type_strain()`, ties broken by the larger genome ID.
        """

        # get genomes with NCBI species assignment
        ncbi_species_gids = defaultdict(list)
        for gid in self.cur_genomes:
            if gid in ncbi_misclassified_gids:
                continue

            ncbi_species = self.cur_genomes[gid].ncbi_taxa.species
            ncbi_specific = specific_epithet(ncbi_species)
            if ncbi_species != 's__' and ncbi_specific not in self.forbidden_specific_names:
                ncbi_species_gids[ncbi_species].append(gid)

        # identify consensus synonyms
        consensus_synonyms = defaultdict(list)

        for ncbi_species, ncbi_sp_gids in ncbi_species_gids.items():
            if ncbi_species in self.ncbi_sp_type_strain_genomes:
                # NCBI species defined by type strain genome so should
                # be either the representative of a GTDB species cluster
                # or a type strain synonym
                continue

            # determine if all genomes in NCBI species are contained
            # in a single GTDB species represented by a type strain genome
            for gtdb_rid, cids in self.cur_clusters.items():
                assert gtdb_rid in cids

                if not self.cur_genomes[gtdb_rid].is_effective_type_strain():
                    continue

                # every genome of the NCBI species must be in this cluster;
                # compared as integer counts rather than a float percentage
                if len(cids.intersection(ncbi_sp_gids)) != len(ncbi_sp_gids):
                    continue

                # using the best quality genome in NCBI species to establish
                # synonym statistics such as ANI and AF
                best_gid = max(
                    ncbi_sp_gids,
                    key=lambda gid: (self.cur_genomes[gid].score_type_strain(),
                                     gid))
                consensus_synonyms[gtdb_rid].append(best_gid)
                break

        self.logger.info(
            ' - identified {:,} GTDB representatives resulting in {:,} majority vote synonyms.'
            .format(len(consensus_synonyms),
                    sum(len(gids) for gids in consensus_synonyms.values())))

        return consensus_synonyms
    def validate_type_strain_clustering(self, mc_species):
        """Validate that all type strain genomes for an NCBI species occur in a single GTDB cluster.

        Args:
            mc_species: mapping from representative genome ID to a species
                name. A representative found here whose specific epithet
                differs from that of its own NCBI species is skipped.

        Logs an error and terminates the process with `sys.exit(-1)` on the
        first NCBI species whose type strain genomes fall in more than one
        (non-skipped) GTDB species cluster.
        """

        self.logger.info(
            'Verifying that all type strain genomes for a NCBI species occur in a single GTDB cluster.'
        )

        # map every clustered genome, and each representative itself,
        # to the representative of its cluster
        rid_map = {}
        for rid, gids in self.cur_clusters.items():
            rid_map[rid] = rid
            for gid in gids:
                rid_map[gid] = rid

        for ncbi_sp, type_gids in self.ncbi_sp_type_strain_genomes.items():
            gtdb_rids = set()
            for gid in type_gids:
                rid = rid_map[gid]
                ncbi_specific = specific_epithet(
                    self.cur_genomes[rid].ncbi_taxa.species)

                if (rid in mc_species and
                        specific_epithet(mc_species[rid]) != ncbi_specific):
                    # skip this genome as it has been manually changed, likely
                    # to resolve this NCBI species having multiple type strain genomes
                    continue

                gtdb_rids.add(rid)

            if len(gtdb_rids) > 1:
                self.logger.error(
                    'Type strain genomes from NCBI species {} were assigned to {:,} GTDB species clusters: {}.'
                    .format(ncbi_sp, len(gtdb_rids),
                            [(rid, self.cur_genomes[rid].gtdb_taxa.species)
                             for rid in gtdb_rids]))
                sys.exit(-1)
    def ncbi_sp_gtdb_cluster_table(self, final_taxonomy):
        """Create table indicating GTDB species clusters for each NCBI species.

        Args:
            final_taxonomy: mapping from representative genome ID to a taxon
                list indexable by `Taxonomy.SPECIES_INDEX`. Clusters whose
                representative is absent from this mapping are ignored.

        Writes `ncbi_sp_to_gtdb_cluster_map.tsv` to `self.output_dir`; returns
        nothing.
        """

        # get map between genomes and representatives
        gid_to_rid = {}
        for rid, cids in self.cur_clusters.items():
            for cid in cids:
                gid_to_rid[cid] = rid

        # get genomes with NCBI species assignment
        ncbi_species_gids = defaultdict(list)
        for rid, cids in self.cur_clusters.items():
            if rid not in final_taxonomy:
                continue  # other domain

            for cid in cids:
                ncbi_species = self.cur_genomes[cid].ncbi_taxa.species
                ncbi_specific = specific_epithet(ncbi_species)
                if ncbi_species != 's__' and ncbi_specific not in self.forbidden_specific_names:
                    ncbi_species_gids[ncbi_species].append(cid)

        # write out table; the context manager closes the file even if a
        # lookup below raises
        table_path = os.path.join(self.output_dir,
                                  'ncbi_sp_to_gtdb_cluster_map.tsv')
        with open(table_path, 'w') as fout:
            fout.write(
                'NCBI species\tNo. genomes\tNo. GTDB clusters\tHighest prevalence (%)\tGTDB species clusters\n'
            )

            for ncbi_sp, gids in ncbi_species_gids.items():
                gtdb_rep_count = Counter(gid_to_rid[gid] for gid in gids)

                # clusters ordered from most to least genomes of this species
                ranked = gtdb_rep_count.most_common()
                gtdb_rep_str = [
                    '{} ({}): {}'.format(
                        final_taxonomy[rid][Taxonomy.SPECIES_INDEX], rid,
                        count) for rid, count in ranked
                ]

                # every species listed has at least one genome, so the
                # ranking is never empty
                highest_prevalence = ranked[0][1] * 100.0 / len(gids)

                fout.write('{}\t{}\t{}\t{:.2f}\t{}\n'.format(
                    ncbi_sp, len(gids), len(gtdb_rep_count),
                    highest_prevalence, '; '.join(gtdb_rep_str)))
Exemple #6
0
    def run(self, cur_gtdb_metadata_file, cur_genomic_path_file,
            qc_passed_file, ncbi_genbank_assembly_file, ltp_taxonomy_file,
            gtdb_type_strains_ledger, untrustworthy_type_ledger,
            ncbi_env_bioproject_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain.

        For every NCBI species reported by `self.sp_with_mult_type_strains`,
        the ANI between its type strain genomes is calculated. When the
        genomes are not all similar, a fixed sequence of heuristics is tried
        in order and the first one that succeeds determines which genomes are
        flagged as untrustworthy as type.

        Five TSV files are written to `self.output_dir`:
        `multi_type_strain_species.tsv`, `type_strain_genomes.tsv`,
        `unresolved_type_strain_genomes.tsv`,
        `highly_divergent_type_strain_genomes.tsv` and
        `untrustworthy_type_material.tsv`. Progress is written to stdout and
        summary counts are logged. Nothing is returned.

        Args:
            cur_gtdb_metadata_file: passed to `Genomes.load_from_metadata_file`.
            cur_genomic_path_file: passed to `Genomes.load_genomic_file_paths`.
            qc_passed_file: forwarded to `load_from_metadata_file`.
            ncbi_genbank_assembly_file: forwarded to `load_from_metadata_file`.
            ltp_taxonomy_file: passed to `self.ltp_defined_species`.
            gtdb_type_strains_ledger: forwarded to `load_from_metadata_file`.
            untrustworthy_type_ledger: forwarded to `load_from_metadata_file`
                and parsed by `self.parse_untrustworthy_type_ledger`.
            ncbi_env_bioproject_ledger: forwarded to `load_from_metadata_file`.
        """

        # get species in LTP reference database
        self.logger.info(
            'Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(
            f' - identified {len(ltp_defined_species):,} species.')

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_ledger,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # parsing genomes manually established to be untrustworthy as type
        self.logger.info(
            'Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = self.parse_untrustworthy_type_ledger(
            untrustworthy_type_ledger)
        self.logger.info(
            f' - identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.'
        )

        # Identify NCBI species with multiple genomes assembled from type strain of species. This
        # is done using a series of heuristics that aim to ensure that the selected type strain
        # genome is reliable. More formal evaluation and a manuscript describing this selection
        # process is ultimately required. Ideally, the community will eventually adopt a
        # database that indicates a single `type genome assembly` for each species instead
        # of just indicating a type strain from which many (sometimes dissimilar) assemblies exist.
        self.logger.info(
            'Determining number of type strain genomes in each NCBI species.')
        multi_type_strains_sp = self.sp_with_mult_type_strains(cur_genomes)
        self.logger.info(
            f' - identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.'
        )

        # resolve species with multiple type strain genomes
        # NOTE(review): the five output handles below are opened without
        # context managers; an exception raised before the close() calls at
        # the end of this method leaves them unclosed.

        # one row per NCBI species summarising ANI/AF and how it was resolved
        fout = open(
            os.path.join(self.output_dir, 'multi_type_strain_species.tsv'),
            'w')
        fout.write(
            'NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n'
        )

        # one row per type strain genome of every processed species
        fout_genomes = open(
            os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write(
            'Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment'
        )
        fout_genomes.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\tReason for GTDB untrustworthy as type\n'
        )

        # genomes of species that none of the heuristics could resolve
        fout_unresolved = open(
            os.path.join(self.output_dir,
                         'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write(
            '\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        # genomes of species whose mean ANI between type strain genomes is below 95
        fout_high_divergence = open(
            os.path.join(self.output_dir,
                         'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write(
            'Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n'
        )

        # genomes declared untrustworthy as type, with the reason
        fout_untrustworthy = open(
            os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'),
            'w')
        fout_untrustworthy.write(
            'Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n'
        )

        # NOTE(review): each row written here has six tab-separated fields
        # while the header above declares five columns, so the
        # 'Manual curation: ...' reason lands in an extra, unnamed column --
        # confirm which field layout is intended.
        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                '<not tested>', 'n/a', 'Manual curation: ' + reason))

        # progress and summary counters
        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0

        # number of species resolved by each heuristic
        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        ncbi_rep_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0

        # *** Perhaps should be an external flag, but used right now to speed up debugging
        use_pickled_results = False
        if use_pickled_results:
            self.logger.warning(
                'Using previously calculated ANI results in: {}'.format(
                    self.ani_pickle_dir))

        prev_gtdb_sp_conflicts = 0

        self.logger.info(
            'Resolving species with multiple type strain genomes:')
        # species are processed in order of increasing number of type strain genomes
        for ncbi_sp, type_gids in sorted(multi_type_strains_sp.items(),
                                         key=lambda kv: len(kv[1])):
            assert len(type_gids) > 1

            # single-line progress indicator, overwritten in place via '\r'
            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                ncbi_sp, len(type_gids), processed + 1,
                len(multi_type_strains_sp), (processed + 1) * 100.0 /
                len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            all_similar, anis, afs, gid_anis, gid_afs = self.calculate_type_strain_ani(
                ncbi_sp, type_gids, cur_genomes, use_pickled_results)

            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            unresolved_species = False
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            if not all_similar:
                note = ''

                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True

                # write out highly divergent cases for manual inspection;
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        fout_high_divergence.write(
                            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                            .format(gid, ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

                # The heuristics below are tried in this fixed order; each one
                # runs only if no earlier heuristic resolved the species, and
                # each overwrites `untrustworthy_gids` with its own result.

                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggest the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(
                    gid_anis, ncbi_sp, type_gids, ltp_metadata,
                    ltp_defined_species, cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(
                        gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(
                        gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1

                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1

                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(
                        gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1

                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                # try to resolve by considering genomes annotated as representative genomes at NCBI
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_reps(
                        gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by considering NCBI representative genomes'
                        ncbi_rep_resolved += 1

                if resolved:
                    unresolved_species = False

                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(
                            specific_epithet(
                                cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    # conflict: only genomes now deemed untrustworthy carry
                    # the NCBI epithet in their current GTDB assignment
                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1

                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)

                        fout_unresolved.write(
                            '{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'
                            .format(gid, ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            # remove genomes marked as untrustworthy as type at NCBI if one or more
            # potential type strain genomes remain; this runs for every species,
            # including those whose genomes were all similar
            # (the set difference below requires `type_gids` to be a set)
            ncbi_untrustworthy_gids = set([
                gid for gid in type_gids if 'untrustworthy as type' in
                cur_genomes[gid].excluded_from_refseq_note
            ])
            if len(type_gids - set(untrustworthy_gids) -
                   ncbi_untrustworthy_gids) >= 1:
                for gid in ncbi_untrustworthy_gids:
                    untrustworthy_gids[
                        gid] = "Genome annotated as `untrustworthy as type` at NCBI and there are other potential type strain genomes available"

            # report cases where genomes marked as untrustworthy as type at NCBI are being retained as potential type strain genomes
            num_ncbi_untrustworthy = len(ncbi_untrustworthy_gids)
            for gid in type_gids:
                if (gid not in untrustworthy_gids and 'untrustworthy as type'
                        in cur_genomes[gid].excluded_from_refseq_note):
                    self.logger.warning(
                        "Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy]."
                        .format(gid, ncbi_sp, num_ncbi_untrustworthy,
                                len(type_gids)))

            # write out genomes identified as being untrustworthy
            for gid, reason in untrustworthy_gids.items():
                ltp_species = self.ltp_species(gid, ltp_metadata)

                if 'untrustworthy as type' in cur_genomes[
                        gid].excluded_from_refseq_note:
                    reason += "; considered `untrustworthy as type` at NCBI"
                fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    gid, ncbi_sp, cur_genomes[gid].gtdb_taxa.species,
                    ' / '.join(ltp_species), reason))

                # Sanity check that if the untrustworthy genome has an LTP to only the
                # expected species, that all other genomes also have a hit to the
                # expected species (or potentially no hit). Otherwise, more consideration
                # should be given to the genome with the conflicting LTP hit.
                if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                    other_sp = set()
                    for test_gid in type_gids:
                        # NOTE(review): this rebinds the outer `ltp_species`;
                        # harmless, as the outer value is not read again in
                        # this iteration.
                        ltp_species = self.ltp_species(test_gid, ltp_metadata)
                        if ltp_species and ncbi_sp not in ltp_species:
                            other_sp.update(ltp_species)

                    if other_sp:
                        self.logger.warning(
                            f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.'
                        )

            # write out information about all type genomes
            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)

                fout_genomes.write(
                    '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\t{}\n'
                    .format(gid, gid in untrustworthy_gids, ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species), gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa,
                            untrustworthy_gids.get(gid, '')))

            # per-species summary row
            fout.write(
                '{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                    ncbi_sp, len(type_gids), all_similar, np_mean(anis),
                    np_std(anis), np_mean(afs), np_std(afs), note,
                    ', '.join(type_gids)))

        # terminate the in-place progress line
        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()

        self.logger.info(
            f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.'
        )
        self.logger.info(
            f' - resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.'
        )
        self.logger.info(
            f' - resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.'
        )
        self.logger.info(
            f' - resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.'
        )
        self.logger.info(
            f' - resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.'
        )
        self.logger.info(
            f' - resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.'
        )
        self.logger.info(
            f' - resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.'
        )
        self.logger.info(
            f' - resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.'
        )
        self.logger.info(
            f' - resolved {ncbi_rep_resolved:,} species by considering RefSeq reference and representative designations at NCBI.'
        )

        if unresolved_sp_count > 0:
            self.logger.warning(
                f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.'
            )
            self.logger.warning(
                'These should be handled before proceeding with the next step of GTDB species updating.'
            )
            self.logger.warning(
                "This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'."
            )

        self.logger.info(
            f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.'
        )
    def infer_epithet_map(self, gids_of_interest, mc_species, cur_genomes,
                          cur_clusters):
        """Infer mapping of NCBI epithet to GTDB epithet which may differ due to gender of genus.

        Representatives in `cur_clusters` that are also in `gids_of_interest`
        are grouped by GTDB genus, with `mc_species` taking precedence over the
        genome's own GTDB assignment. Results are recorded in
        `self.gtdb_ncbi_generic_map` and `self.sp_epithet_map`; nothing is
        returned.
        """

        # **************************************
        # This should be updated so it only includes valid transfers, and not
        # results due to misclassifications at NCBI. For example, right now this
        # code reports Enterobacter cancerogenus being transferred to Pantoea, but
        # really this is just a misclassified NCBI genome.

        # group representatives of interest by their GTDB genus
        rids_by_genus = defaultdict(list)
        for rid in cur_clusters:
            if rid in gids_of_interest:
                genus = cur_genomes[rid].gtdb_taxa.genus.replace('g__', '')
                if rid in mc_species:
                    genus = generic_name(mc_species[rid])

                rids_by_genus[genus].append(rid)

        # establish epithets that are nearly identical
        # except for small change to suffix which is
        # assumed to be due to a gender change
        for gtdb_generic, rids in rids_by_genus.items():
            epithet_matches = defaultdict(list)
            for rid in rids:
                ncbi_species = cur_genomes[rid].ncbi_taxa.species
                if ncbi_species != 's__':
                    ncbi_generic = generic_name(ncbi_species)
                    ncbi_specific = specific_epithet(ncbi_species)

                    gtdb_species = (mc_species[rid] if rid in mc_species else
                                    cur_genomes[rid].gtdb_taxa.species)
                    gtdb_specific = canonical_taxon(
                        specific_epithet(gtdb_species))

                    self.gtdb_ncbi_generic_map[gtdb_generic][
                        gtdb_specific].append(ncbi_generic)

                    if test_same_epithet(ncbi_specific, gtdb_specific):
                        epithet_matches[ncbi_specific].append(gtdb_specific)

            for ncbi_specific, gtdb_specifics in epithet_matches.items():
                # most frequent GTDB epithet paired with this NCBI epithet
                top_gtdb_specific, count = Counter(gtdb_specifics).most_common(
                    1)[0]
                map_perc = count * 100.0 / len(gtdb_specifics)

                if map_perc >= 50:
                    self.sp_epithet_map[gtdb_generic][
                        ncbi_specific] = top_gtdb_specific

                if map_perc != 100:
                    self.logger.warning(
                        'Imperfect suffix mapping between from {} {} to {} at {:.1f}%.'
                        .format(gtdb_generic, top_gtdb_specific, ncbi_specific,
                                map_perc))
Exemple #8
0
    def classify_ncbi_species_consensus(self, ncbi_synonyms, misclassified_gids):
        """Classify each NCBI species as either unambiguously, ambiguously, or synonym.

        An NCBI species is treated as unambiguous when exactly one of its
        effective type strain genomes is a GTDB representative
        ('TYPE_STRAIN_GENOME'), or, when it has no such genome, when the best
        matching GTDB cluster contains 100% of the genomes of the species and
        100% of the species-assigned genomes in that cluster belong to it
        ('UNANIMOUS_CONSENSUS'). All other non-synonym species are ambiguous.

        One row per species is written to 'ncbi_sp_classification.tsv' in
        self.output_dir.

        Parameters
        ----------
        ncbi_synonyms
            Mapping from NCBI species to the GTDB species it is a synonym of;
            these species are only reported, never classified.
        misclassified_gids
            Genome IDs to exclude when tallying the genomes of each species.

        Returns
        -------
        tuple
            (set of all NCBI species considered,
             dict of NCBI species -> (GTDB representative, assignment type),
             set of ambiguous NCBI species).

        The process is terminated with sys.exit(-1) if a species has more than
        one type strain representative, or if a GTDB representative ends up
        assigned to more than one unambiguous NCBI species.
        """

        def cluster_ncbi_species_str(rid):
            """Summarize NCBI species of genomes in cluster `rid`, most common first."""

            if rid is None:
                # no cluster was selected, so there is nothing to summarize
                return ''

            sp_counts = Counter(self.cur_genomes[gid].ncbi_taxa.species
                                for gid in self.cur_clusters[rid])
            return '; '.join('{} ({:,})'.format(sp, count)
                             for sp, count in sp_counts.most_common())

        # get genomes with NCBI species assignment
        ncbi_all_species = set()
        ncbi_species_gids = defaultdict(list)
        ncbi_all_species_gids = set()
        for gid in self.cur_genomes:
            ncbi_species = self.cur_genomes[gid].ncbi_taxa.species
            ncbi_specific = specific_epithet(ncbi_species)
            if ncbi_species != 's__' and ncbi_specific not in self.forbidden_specific_names:
                ncbi_all_species.add(ncbi_species)

                if gid not in misclassified_gids:
                    ncbi_species_gids[ncbi_species].append(gid)
                    ncbi_all_species_gids.add(gid)

        # establish if NCBI species can be unambiguously assigned to a GTDB species cluster
        unambiguous_ncbi_sp = {}
        ambiguous_ncbi_sp = set()
        row_fmt = '{}\t{}\t{}\t{}\t{:.2f}\t{:.2f}\t{}\t{}\n'

        # the context manager guarantees the table is flushed and closed even
        # when the method bails out through sys.exit() or an exception
        out_file = os.path.join(self.output_dir, 'ncbi_sp_classification.tsv')
        with open(out_file, 'w') as fout:
            fout.write('NCBI species\tClassification\tAssignment type\tGTDB representative\t% NCBI species in cluster\t% cluster with NCBI species\tNCBI classifications in cluster\tNote\n')

            for ncbi_species, ncbi_sp_gids in ncbi_species_gids.items():
                if ncbi_species in ncbi_synonyms:
                    continue

                # get type strains for NCBI species that are GTDB species representatives
                type_strain_rids = [gid for gid in ncbi_sp_gids
                                    if (self.cur_genomes[gid].is_effective_type_strain()
                                        and gid in self.cur_clusters)]

                if len(type_strain_rids) == 0:
                    # no type strain representative: find the cluster holding
                    # the largest fraction of the genomes from this species
                    highest_sp_in_cluster_perc = 0
                    highest_cluster_perc = 0
                    highest_gtdb_rid = None
                    for gtdb_rid, cids in self.cur_clusters.items():
                        assert gtdb_rid in cids

                        ncbi_cur_sp_in_cluster = cids.intersection(ncbi_sp_gids)
                        ncbi_any_sp_in_cluster = cids.intersection(ncbi_all_species_gids)

                        if len(ncbi_any_sp_in_cluster) > 0:
                            ncbi_cur_sp_in_cluster_perc = len(ncbi_cur_sp_in_cluster) * 100.0 / len(ncbi_sp_gids)

                            if ncbi_cur_sp_in_cluster_perc > highest_sp_in_cluster_perc:
                                highest_sp_in_cluster_perc = ncbi_cur_sp_in_cluster_perc
                                highest_cluster_perc = len(ncbi_cur_sp_in_cluster) * 100.0 / len(ncbi_any_sp_in_cluster)
                                highest_gtdb_rid = gtdb_rid

                        # a cluster containing every genome cannot be beaten
                        if highest_sp_in_cluster_perc == 100:
                            break

                    if highest_sp_in_cluster_perc == 100 and highest_cluster_perc == 100:
                        unambiguous_ncbi_sp[ncbi_species] = (highest_gtdb_rid, 'UNANIMOUS_CONSENSUS')
                    else:
                        ambiguous_ncbi_sp.add(ncbi_species)

                    is_unambiguous = ncbi_species in unambiguous_ncbi_sp
                    fout.write(row_fmt.format(
                        ncbi_species,
                        'UNAMBIGUOUS' if is_unambiguous else 'AMBIGUOUS',
                        'UNANIMOUS_CONSENSUS' if is_unambiguous else 'AMBIGUOUS',
                        highest_gtdb_rid,
                        highest_sp_in_cluster_perc,
                        highest_cluster_perc,
                        cluster_ncbi_species_str(highest_gtdb_rid),
                        ''))

                elif len(type_strain_rids) == 1:
                    gtdb_rid = type_strain_rids[0]
                    unambiguous_ncbi_sp[ncbi_species] = (gtdb_rid, 'TYPE_STRAIN_GENOME')

                    cids = self.cur_clusters[gtdb_rid]
                    ncbi_cur_sp_in_cluster = cids.intersection(ncbi_sp_gids)
                    ncbi_any_sp_in_cluster = cids.intersection(ncbi_all_species_gids)
                    cur_sp_in_cluster_perc = len(ncbi_cur_sp_in_cluster) * 100.0 / len(ncbi_sp_gids)
                    cluster_perc = len(ncbi_cur_sp_in_cluster) * 100.0 / len(ncbi_any_sp_in_cluster)

                    fout.write(row_fmt.format(
                        ncbi_species,
                        'UNAMBIGUOUS',
                        'TYPE_STRAIN_GENOME',
                        gtdb_rid,
                        cur_sp_in_cluster_perc,
                        cluster_perc,
                        cluster_ncbi_species_str(gtdb_rid),
                        ''))
                else:
                    self.logger.error('Multiple GTDB species clusters represented by type strain genomes of {}: {}'.format(
                        ncbi_species, type_strain_rids))
                    sys.exit(-1)

            # get NCBI synonyms
            for ncbi_species in ncbi_species_gids:
                if ncbi_species not in ncbi_synonyms:
                    continue

                fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    ncbi_species,
                    'SYNONYM',
                    'SYNONYM',
                    'n/a',
                    'n/a',
                    'n/a',
                    'n/a',
                    'Synonym of {} under GTDB'.format(ncbi_synonyms[ncbi_species])))

        # sanity check results
        assert set(unambiguous_ncbi_sp).intersection(ncbi_synonyms) == set()
        assert set(ambiguous_ncbi_sp).intersection(ncbi_synonyms) == set()
        assert set(unambiguous_ncbi_sp).intersection(ambiguous_ncbi_sp) == set()
        assert set(ncbi_all_species) - set(unambiguous_ncbi_sp) - set(ambiguous_ncbi_sp) - set(ncbi_synonyms) == set()

        # each GTDB representative may anchor at most one unambiguous NCBI species
        gtdb_rep_ncbi_sp_assignments = {}
        for ncbi_species, (gtdb_rid, _classification) in unambiguous_ncbi_sp.items():
            if gtdb_rid in gtdb_rep_ncbi_sp_assignments:
                self.logger.error('GTDB representative {} assigned to multiple unambiguous NCBI species: {} {}'.format(
                                    gtdb_rid,
                                    gtdb_rep_ncbi_sp_assignments[gtdb_rid],
                                    ncbi_species))
                sys.exit(-1)
            gtdb_rep_ncbi_sp_assignments[gtdb_rid] = ncbi_species

        return ncbi_all_species, unambiguous_ncbi_sp, ambiguous_ncbi_sp
    def run(self, manual_taxonomy, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger,
            sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file):
        """Report type strain genomes whose curated GTDB species disagrees with NCBI.

        Reads the manually curated taxonomy and the current GTDB genome set,
        then writes every effective type strain genome whose GTDB specific
        epithet does not match its NCBI specific epithet to
        'type_strains_incongruencies.tsv' in self.output_dir.

        Parameters
        ----------
        manual_taxonomy
            Taxonomy file produced by manual curation.
        cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file,
        ncbi_genbank_assembly_file, untrustworthy_type_file,
        gtdb_type_strains_ledger
            Inputs forwarded to Genomes.load_from_metadata_file().
        synonym_file
            Not used by this method.
        sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file
            Inputs forwarded to SpeciesPriorityManager.
        """

        # initialize species priority manager
        # NOTE(review): the manager is not referenced again in this method; it
        # is still constructed in case its initialization validates the ledgers.
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  dsmz_bacnames_file)

        # identify species and genus names updated during manual curation
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_taxonomy)))

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get all GTDB species represented by a type strain:
        gtdb_type_species = {
            taxa[Taxonomy.SPECIES_INDEX]
            for rid, taxa in mc_taxonomy.items()
            if cur_genomes[rid].is_effective_type_strain()
        }

        # report type strain genomes whose GTDB and NCBI specific epithets differ
        self.logger.info(
            'Identifying type strain genomes with incongruent GTDB species assignments.'
        )
        num_incongruent = 0
        out_file = os.path.join(self.output_dir,
                                'type_strains_incongruencies.tsv')

        # the context manager closes the report even if a lookup below raises
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tGTDB species\tNCBI species\tGTDB type strain\tNCBI type strain\tNCBI RefSeq note\n'
            )

            for rid, taxa in mc_taxonomy.items():
                if not cur_genomes[rid].is_effective_type_strain():
                    continue

                ncbi_sp = cur_genomes[rid].ncbi_taxa.species
                if ncbi_sp == 's__':
                    # NCBI taxonomy is sometimes behind the genome annotation pages,
                    # and do not have a species assignment even for type strain genome
                    continue

                gtdb_sp = taxa[Taxonomy.SPECIES_INDEX]
                gtdb_generic = generic_name(gtdb_sp)
                ncbi_generic = generic_name(ncbi_sp)

                # check if genome is a valid genus transfer into a genus
                # that already contains a species with the specific
                # name which results in a polyphyletic suffix being required
                # e.g. G002240355 is Prauserella marina at NCBI and is
                # transferred into Saccharomonospora under the GTDB. However,
                # Saccharomonospora marina already exists so this genome
                # needs to be S. marina_A.
                if (is_placeholder_taxon(gtdb_sp)
                        and gtdb_generic != ncbi_generic
                        and canonical_species(gtdb_sp) in gtdb_type_species):
                    continue

                if not test_same_epithet(specific_epithet(gtdb_sp),
                                         specific_epithet(ncbi_sp)):
                    num_incongruent += 1
                    fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        rid, gtdb_sp, ncbi_sp,
                        cur_genomes[rid].is_gtdb_type_strain(),
                        cur_genomes[rid].is_ncbi_type_strain(),
                        cur_genomes[rid].excluded_from_refseq_note))

        self.logger.info(
            ' - identified {:,} genomes with incongruent species assignments.'.
            format(num_incongruent))
    def identify_misclassified_genomes_ani(self, cur_genomes, cur_clusters):
        """Identify genomes with erroneous NCBI species assignments, based on ANI to type strain genomes.

        For each NCBI species anchored by a GTDB representative that is an
        effective type strain genome, every genome with that NCBI species name
        that sits in a different GTDB cluster is compared to the type strain
        representative. Genomes whose symmetric ANI is below
        self.ani_ncbi_erroneous are reported in
        'ncbi_misclassified_sp.ani_<threshold>.tsv' in self.output_dir.

        Parameters
        ----------
        cur_genomes
            Genome collection indexed by genome ID; also provides the
            genomic_files lookup handed to FastANI.
        cur_clusters
            Mapping from GTDB representative ID to the genome IDs in its cluster.

        Returns
        -------
        set
            IDs of genomes considered to have a misclassified NCBI species.
        """

        forbidden_names = {'cyanobacterium'}

        # get mapping from genomes to their representatives
        gid_to_rid = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                gid_to_rid[cid] = rid

        # get genomes with NCBI species assignment
        ncbi_sp_gids = defaultdict(list)
        for gid in cur_genomes:
            ncbi_species = cur_genomes[gid].ncbi_taxa.species
            ncbi_specific = specific_epithet(ncbi_species)

            if ncbi_species != 's__' and ncbi_specific not in forbidden_names:
                ncbi_sp_gids[ncbi_species].append(gid)

        # get NCBI species anchored by a type strain genome
        ncbi_type_anchored_species = {}
        for rid, _cids in cur_clusters.items():
            if cur_genomes[rid].is_effective_type_strain():
                ncbi_type_species = cur_genomes[rid].ncbi_taxa.species
                if ncbi_type_species != 's__':
                    ncbi_type_anchored_species[ncbi_type_species] = rid
        self.logger.info(
            ' - identified {:,} NCBI species anchored by a type strain genome.'
            .format(len(ncbi_type_anchored_species)))

        # identify genomes with erroneous NCBI species assignments
        misclassified_gids = set()
        out_file = os.path.join(
            self.output_dir,
            'ncbi_misclassified_sp.ani_{}.tsv'.format(self.ani_ncbi_erroneous))

        # the context manager closes the table even if an ANI calculation or
        # a genome lookup raises part way through
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tNCBI species\tGenome cluster\tType species cluster\tANI to type strain\tAF to type strain\n'
            )

            for idx, (ncbi_species,
                      species_gids) in enumerate(ncbi_sp_gids.items()):
                if ncbi_species not in ncbi_type_anchored_species:
                    continue

                # genomes need to be checked when they have the same NCBI
                # species name as a type strain genome, but reside in a
                # different GTDB species cluster
                type_rid = ncbi_type_anchored_species[ncbi_species]
                gids_to_check = [gid for gid in species_gids
                                 if gid_to_rid[gid] != type_rid]
                if not gids_to_check:
                    continue

                # ANI is requested in both directions for each pair
                gid_pairs = []
                for gid in gids_to_check:
                    gid_pairs.append((type_rid, gid))
                    gid_pairs.append((gid, type_rid))

                status_str = '-> Establishing erroneous assignments for {} [ANI pairs: {:,}; {:,} of {:,} species].'.format(
                    ncbi_species, len(gid_pairs), idx + 1,
                    len(ncbi_sp_gids)).ljust(96)
                sys.stdout.write('{}\r'.format(status_str))
                sys.stdout.flush()

                ani_af = self.fastani.pairs(gid_pairs,
                                            cur_genomes.genomic_files,
                                            report_progress=False,
                                            check_cache=True)

                for gid in gids_to_check:
                    ani, af = symmetric_ani(ani_af, type_rid, gid)
                    if ani < self.ani_ncbi_erroneous:
                        misclassified_gids.add(gid)
                        fout.write('{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\n'.format(
                            gid, ncbi_species, gid_to_rid[gid], type_rid, ani,
                            af))

            # terminate the carriage-return style progress line
            sys.stdout.write('\n')

        misclassified_species = {
            cur_genomes[gid].ncbi_taxa.species for gid in misclassified_gids
        }
        self.logger.info(
            ' - identified {:,} genomes from {:,} species as having misclassified NCBI species assignments.'
            .format(len(misclassified_gids), len(misclassified_species)))

        return misclassified_gids
    def identify_misclassified_genomes_cluster(self, cur_genomes,
                                               cur_clusters):
        """Identify genomes with erroneous NCBI species assignments, based on GTDB clustering of type strain genomes.

        An NCBI species is anchored to the GTDB cluster containing its
        effective type strain genome(s). Any genome carrying that NCBI species
        name but residing in a different GTDB cluster is considered
        misclassified and is reported in
        'ncbi_misclassified_sp.gtdb_clustering.tsv' in self.output_dir.

        Parameters
        ----------
        cur_genomes
            Genome collection indexed by genome ID.
        cur_clusters
            Mapping from GTDB representative ID to the genome IDs in its cluster.

        Returns
        -------
        set
            IDs of genomes considered to have a misclassified NCBI species.

        The process is terminated with sys.exit(-1) if effective type strain
        genomes of one NCBI species are found in different clusters.
        """

        forbidden_names = {'cyanobacterium'}

        # get mapping from genomes to their representatives
        gid_to_rid = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                gid_to_rid[cid] = rid

        # get genomes with NCBI species assignment
        ncbi_sp_gids = defaultdict(list)
        for gid in cur_genomes:
            ncbi_species = cur_genomes[gid].ncbi_taxa.species
            ncbi_specific = specific_epithet(ncbi_species)

            if ncbi_species != 's__' and ncbi_specific not in forbidden_names:
                ncbi_sp_gids[ncbi_species].append(gid)

        # get NCBI species anchored by a type strain genome
        ncbi_type_anchored_species = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                if not cur_genomes[cid].is_effective_type_strain():
                    continue

                ncbi_type_species = cur_genomes[cid].ncbi_taxa.species

                # the epithet must come from the type strain's own species;
                # using the leftover variable of the previous loop would test
                # an unrelated species against the forbidden names
                ncbi_type_specific = specific_epithet(ncbi_type_species)
                if (ncbi_type_species == 's__'
                        or ncbi_type_specific in forbidden_names):
                    continue

                if (ncbi_type_species in ncbi_type_anchored_species
                        and rid !=
                        ncbi_type_anchored_species[ncbi_type_species]):
                    self.logger.error(
                        'NCBI species {} has multiple effective type strain genomes in different clusters.'
                        .format(ncbi_type_species))
                    sys.exit(-1)

                ncbi_type_anchored_species[ncbi_type_species] = rid
        self.logger.info(
            ' - identified {:,} NCBI species anchored by a type strain genome.'
            .format(len(ncbi_type_anchored_species)))

        # identify genomes with erroneous NCBI species assignments
        misclassified_gids = set()
        out_file = os.path.join(self.output_dir,
                                'ncbi_misclassified_sp.gtdb_clustering.tsv')

        # the context manager closes the table even if a genome lookup raises
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tNCBI species\tGenome cluster\tType species cluster\n')

            for ncbi_species, species_gids in ncbi_sp_gids.items():
                if ncbi_species not in ncbi_type_anchored_species:
                    continue

                # find genomes with NCBI species assignments that are in a
                # different cluster than the type strain genome
                type_rid = ncbi_type_anchored_species[ncbi_species]
                for gid in species_gids:
                    cur_rid = gid_to_rid[gid]
                    if type_rid != cur_rid:
                        misclassified_gids.add(gid)
                        fout.write('{}\t{}\t{}\t{}\t\n'.format(
                            gid, ncbi_species, cur_rid, type_rid))

            sys.stdout.write('\n')

        misclassified_species = {
            cur_genomes[gid].ncbi_taxa.species for gid in misclassified_gids
        }
        self.logger.info(
            ' - identified {:,} genomes from {:,} species as having misclassified NCBI species assignments.'
            .format(len(misclassified_gids), len(misclassified_species)))

        return misclassified_gids
    def classify_ncbi_species(self, ncbi_synonyms, misclassified_gids):
        """Classify each NCBI species as either unambiguously, ambiguously, or synonym.

        Species with an effective type strain genome are processed first. A
        species is unambiguous when exactly one of its effective type strain
        genomes is a GTDB representative (TYPE_STRAIN_GENOME), or, when it has
        no such genome, when its best matching GTDB cluster is not already
        claimed by a type strain assignment, holds more than 50% of the
        genomes of the species, and more than 50% of the species-assigned
        genomes in that cluster belong to it (MAJORITY_VOTE). All other
        non-synonym species are ambiguous.

        One row per species is written to 'ncbi_sp_classification.tsv' in
        self.output_dir.

        Parameters
        ----------
        ncbi_synonyms
            Mapping from NCBI species to the GTDB species it is a synonym of;
            these species are only reported, never classified.
        misclassified_gids
            Genome IDs to ignore entirely.

        Returns
        -------
        tuple
            (set of all NCBI species considered,
             dict of NCBI species -> (GTDB representative, assignment type),
             set of ambiguous NCBI species).

        The process is terminated with sys.exit(-1) if a species has more than
        one type strain representative.
        """

        def cluster_ncbi_species_str(rid):
            """Summarize NCBI species of genomes in cluster `rid`, most common first."""

            if rid is None:
                # no cluster was selected, so there is nothing to summarize
                return ''

            sp_counts = Counter(self.cur_genomes[gid].ncbi_taxa.species
                                for gid in self.cur_clusters[rid])
            return '; '.join('{} ({:,})'.format(sp, count)
                             for sp, count in sp_counts.most_common())

        # get genomes with NCBI species assignment
        ncbi_all_species = set()
        ncbi_species_gids = defaultdict(list)
        ncbi_all_species_gids = set()
        for gid in self.cur_genomes:
            if gid in misclassified_gids:
                continue

            ncbi_species = self.cur_genomes[gid].ncbi_taxa.species
            ncbi_specific = specific_epithet(ncbi_species)
            if ncbi_species != 's__' and ncbi_specific not in self.forbidden_specific_names:
                ncbi_all_species.add(ncbi_species)
                ncbi_species_gids[ncbi_species].append(gid)
                ncbi_all_species_gids.add(gid)

        # process NCBI species with type strain genomes followed by
        # remaining NCBI species
        ncbi_type_strain_species = []
        ncbi_nontype_species = []
        for ncbi_species, ncbi_sp_gids in ncbi_species_gids.items():
            if any(self.cur_genomes[gid].is_effective_type_strain()
                   for gid in ncbi_sp_gids):
                ncbi_type_strain_species.append(ncbi_species)
            else:
                ncbi_nontype_species.append(ncbi_species)
        self.logger.info(
            ' - identified {:,} NCBI species with effective type strain genome and {:,} NCBI species without a type strain genome.'
            .format(len(ncbi_type_strain_species), len(ncbi_nontype_species)))

        # establish if NCBI species can be unambiguously assigned to a GTDB species cluster
        unambiguous_ncbi_sp = {}
        ambiguous_ncbi_sp = set()
        unambiguous_rids = set()
        sp_in_cluster_threshold = 50
        cluster_threshold = 50
        row_fmt = '{}\t{}\t{}\t{}\t{:.2f}\t{:.2f}\t{}\t{}\n'

        # the context manager guarantees the table is flushed and closed even
        # when the method bails out through sys.exit() or an exception
        out_file = os.path.join(self.output_dir, 'ncbi_sp_classification.tsv')
        with open(out_file, 'w') as fout:
            fout.write(
                'NCBI taxon\tClassification\tAssignment type\tGTDB representative\t% NCBI species in cluster\t% cluster with NCBI species\tNCBI classifications in cluster\tNote\n'
            )

            for ncbi_species in ncbi_type_strain_species + ncbi_nontype_species:
                if ncbi_species in ncbi_synonyms:
                    continue

                ncbi_sp_gids = ncbi_species_gids[ncbi_species]

                # get type strains for NCBI species that are GTDB species representatives
                type_strain_rids = [
                    gid for gid in ncbi_sp_gids
                    if (self.cur_genomes[gid].is_effective_type_strain()
                        and gid in self.cur_clusters)
                ]

                if len(type_strain_rids) == 0:
                    # no type strain representative: pick the cluster holding
                    # the largest fraction of the species, breaking ties by
                    # how much of the cluster the species occupies
                    highest_sp_in_cluster_perc = 0
                    highest_cluster_perc = 0
                    highest_gtdb_rid = None
                    for gtdb_rid, cids in self.cur_clusters.items():
                        assert gtdb_rid in cids

                        ncbi_cur_sp_in_cluster = cids.intersection(ncbi_sp_gids)
                        ncbi_any_sp_in_cluster = cids.intersection(
                            ncbi_all_species_gids)

                        if len(ncbi_any_sp_in_cluster) > 0:
                            ncbi_cur_sp_in_cluster_perc = len(
                                ncbi_cur_sp_in_cluster) * 100.0 / len(ncbi_sp_gids)
                            cluster_perc = len(
                                ncbi_cur_sp_in_cluster) * 100.0 / len(
                                    ncbi_any_sp_in_cluster)

                            if (ncbi_cur_sp_in_cluster_perc >
                                    highest_sp_in_cluster_perc
                                    or (ncbi_cur_sp_in_cluster_perc
                                        == highest_sp_in_cluster_perc
                                        and cluster_perc > highest_cluster_perc)):
                                highest_sp_in_cluster_perc = ncbi_cur_sp_in_cluster_perc
                                highest_cluster_perc = cluster_perc
                                highest_gtdb_rid = gtdb_rid

                    if (highest_gtdb_rid not in unambiguous_rids and
                            highest_sp_in_cluster_perc > sp_in_cluster_threshold
                            and highest_cluster_perc > cluster_threshold):
                        unambiguous_ncbi_sp[ncbi_species] = (
                            highest_gtdb_rid, NCBI_SpeciesManager.MAJORITY_VOTE)
                    else:
                        ambiguous_ncbi_sp.add(ncbi_species)

                    is_unambiguous = ncbi_species in unambiguous_ncbi_sp
                    fout.write(row_fmt.format(
                        ncbi_species,
                        NCBI_SpeciesManager.UNAMBIGUOUS
                        if is_unambiguous else NCBI_SpeciesManager.AMBIGUOUS,
                        NCBI_SpeciesManager.MAJORITY_VOTE
                        if is_unambiguous else NCBI_SpeciesManager.AMBIGUOUS,
                        highest_gtdb_rid, highest_sp_in_cluster_perc,
                        highest_cluster_perc,
                        cluster_ncbi_species_str(highest_gtdb_rid), ''))

                elif len(type_strain_rids) == 1:
                    gtdb_rid = type_strain_rids[0]
                    unambiguous_ncbi_sp[ncbi_species] = (
                        gtdb_rid, NCBI_SpeciesManager.TYPE_STRAIN_GENOME)
                    unambiguous_rids.add(gtdb_rid)

                    cids = self.cur_clusters[gtdb_rid]
                    ncbi_cur_sp_in_cluster = cids.intersection(ncbi_sp_gids)
                    ncbi_any_sp_in_cluster = cids.intersection(
                        ncbi_all_species_gids)
                    cur_sp_in_cluster_perc = len(
                        ncbi_cur_sp_in_cluster) * 100.0 / len(ncbi_sp_gids)
                    cluster_perc = len(ncbi_cur_sp_in_cluster) * \
                        100.0 / len(ncbi_any_sp_in_cluster)

                    fout.write(row_fmt.format(
                        ncbi_species, NCBI_SpeciesManager.UNAMBIGUOUS,
                        NCBI_SpeciesManager.TYPE_STRAIN_GENOME, gtdb_rid,
                        cur_sp_in_cluster_perc, cluster_perc,
                        cluster_ncbi_species_str(gtdb_rid), ''))
                else:
                    self.logger.error(
                        'Multiple GTDB species clusters represented by type strain genomes of {}: {}'
                        .format(ncbi_species, type_strain_rids))
                    sys.exit(-1)

            # get NCBI synonyms
            for ncbi_species in ncbi_species_gids:
                if ncbi_species not in ncbi_synonyms:
                    continue

                fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    ncbi_species, 'SYNONYM', 'SYNONYM', 'n/a', 'n/a', 'n/a', 'n/a',
                    'Synonym of {} under GTDB'.format(
                        ncbi_synonyms[ncbi_species])))

        # sanity check results
        assert set(unambiguous_ncbi_sp).intersection(ncbi_synonyms) == set()
        assert set(ambiguous_ncbi_sp).intersection(ncbi_synonyms) == set()
        assert set(unambiguous_ncbi_sp).intersection(
            ambiguous_ncbi_sp) == set()
        assert set(ncbi_all_species) - set(unambiguous_ncbi_sp) - \
            set(ambiguous_ncbi_sp) - set(ncbi_synonyms) == set()

        gtdb_rep_ncbi_sp_assignments = {}
        for ncbi_species, (gtdb_rid,
                           _classification) in unambiguous_ncbi_sp.items():
            if gtdb_rid in gtdb_rep_ncbi_sp_assignments:
                # NOTE(review): unlike the multiple-type-strain case above, this
                # conflict is only logged; the exit call was disabled in the
                # original code -- confirm this is intended
                self.logger.error(
                    'GTDB representative {} assigned to multiple unambiguous NCBI species: {} {}'
                    .format(gtdb_rid, gtdb_rep_ncbi_sp_assignments[gtdb_rid],
                            ncbi_species))
            gtdb_rep_ncbi_sp_assignments[gtdb_rid] = ncbi_species

        return ncbi_all_species, unambiguous_ncbi_sp, ambiguous_ncbi_sp
    def run(self, 
                cur_gtdb_metadata_file,
                cur_genomic_path_file,
                qc_passed_file,
                ncbi_genbank_assembly_file,
                ltp_taxonomy_file,
                gtdb_type_strains_ledger,
                untrustworthy_type_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain."""
        
        # get species in LTP reference database
        self.logger.info('Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(f' ... identified {len(ltp_defined_species):,} species.')
        
        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=None,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')
        
        # update current genomes with GTDB-Tk classifications
        self.logger.info('Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(gtdbtk_classify_file, prev_genomes)
        self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.')
        
        # parsing genomes manually established to be untrustworthy as type
        self.logger.info('Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = {}
        with open(untrustworthy_type_ledger) as f:
            header = f.readline().strip().split('\t')
            
            ncbi_sp_index = header.index('NCBI species')
            reason_index = header.index('Reason for declaring untrustworthy')
            
            for line in f:
                tokens = line.strip().split('\t')
                
                gid = canonical_gid(tokens[0])
                manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index], tokens[reason_index])
        self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.')

        # identify NCBI species with multiple genomes assembled from type strain of species
        self.logger.info('Determining number of type strain genomes in each NCBI species.')
        sp_type_strain_genomes = defaultdict(set)
        for gid in cur_genomes:
            if cur_genomes[gid].is_effective_type_strain():
                ncbi_sp = cur_genomes[gid].ncbi_taxa.species
                if ncbi_sp != 's__':
                    # yes, NCBI has genomes marked as assembled from type material
                    # that do not actually have a binomial species name
                    sp_type_strain_genomes[ncbi_sp].add(gid)

        multi_type_strains_sp = [ncbi_sp for ncbi_sp, gids in sp_type_strain_genomes.items() if len(gids) > 1]
        self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.')
        
        # sort by number of genome assemblies
        self.logger.info('Calculating ANI between type strain genomes in each species.')
        
        fout = open(os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w')
        fout.write('NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n')
        
        fout_genomes = open(os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment')
        fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_unresolved = open(os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_high_divergence = open(os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_untrustworthy = open(os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w')
        fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n')
        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                                        gid, 
                                        ncbi_sp, 
                                        cur_genomes[gid].gtdb_taxa.species,
                                        '<not tested>',
                                        'n/a',
                                        'Manual curation: ' + reason))
        
        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0
        
        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0
        
        use_pickled_results = False #***
        if use_pickled_results:
            self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir))
        
        prev_gtdb_sp_conflicts = 0
        for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(), key=lambda kv: len(kv[1])):
            if len(type_gids) == 1:
                continue
                
            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                                ncbi_sp, 
                                len(type_gids),
                                processed+1, 
                                len(multi_type_strains_sp),
                                (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_')
            if not use_pickled_results: #***
                ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files)
                pickle.dump(ani_af, open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'wb'))
            else:
                ani_af = pickle.load(open(os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl'), 'rb'))
            
            anis = []
            afs = []
            gid_anis = defaultdict(lambda: {})
            gid_afs = defaultdict(lambda: {})
            all_similar = True
            for gid1, gid2 in combinations(type_gids, 2):
                ani, af = symmetric_ani(ani_af, gid1, gid2)
                if ani < 99 or af < 0.65:
                    all_similar = False
                    
                anis.append(ani)
                afs.append(af)
                
                gid_anis[gid1][gid2] = ani
                gid_anis[gid2][gid1] = ani
                
                gid_afs[gid1][gid2] = af
                gid_afs[gid2][gid1] = af
                
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            unresolved_species = False
            
            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            if not all_similar:
                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True
                
                # write out highly divergent cases for manual inspection; 
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                                        gid,
                                                        ncbi_sp,
                                                        cur_genomes[gid].gtdb_taxa.genus,
                                                        cur_genomes[gid].gtdb_taxa.species,
                                                        ' / '.join(ltp_species),
                                                        np_mean(list(gid_anis[gid].values())),
                                                        np_std(list(gid_anis[gid].values())),
                                                        np_mean(list(gid_afs[gid].values())),
                                                        np_std(list(gid_afs[gid].values())),
                                                        cur_genomes[gid].excluded_from_refseq_note,
                                                        cur_genomes[gid].ncbi_taxa,
                                                        cur_genomes[gid].gtdb_taxa))
                
                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggests that the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(gid_anis, 
                                                                                                    ncbi_sp, 
                                                                                                    type_gids, 
                                                                                                    ltp_metadata, 
                                                                                                    ltp_defined_species,
                                                                                                    cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1
                
                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1
                           
                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1
                        
                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                if resolved:
                    unresolved_species = False
                    
                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(specific_epithet(cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True

                    # write results to file
                    for gid, reason in untrustworthy_gids.items():
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        
                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                            reason += "; considered `untrustworthy as type` at NCBI"
                        fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(gid,
                                                                                ncbi_sp,
                                                                                cur_genomes[gid].gtdb_taxa.species,
                                                                                ' / '.join(ltp_species),
                                                                                reason))
                                                                                
                        # Sanity check: if the untrustworthy genome has an LTP hit to only the
                        # expected species, then all other genomes should also have a hit to the
                        # expected species (or potentially no hit). Otherwise, more consideration
                        # should be given to the genome with the conflicting LTP hit.
                        if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                            other_sp = set()
                            for test_gid in type_gids:
                                ltp_species = self.ltp_species(test_gid, ltp_metadata)
                                if ltp_species and ncbi_sp not in ltp_species:
                                    other_sp.update(ltp_species)
                                
                            if other_sp:
                                self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.')
                                
                    num_ncbi_untrustworthy = sum([1 for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note])
                    if num_ncbi_untrustworthy != len(type_gids):
                        for gid in type_gids:
                            if (gid not in untrustworthy_gids 
                                and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note):
                                self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format(
                                                        gid, 
                                                        ncbi_sp,
                                                        num_ncbi_untrustworthy,
                                                        len(type_gids)))
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1
                    
                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                    gid,
                                    ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)
                    
                fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            gid in untrustworthy_gids,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

            fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                        ncbi_sp,
                        len(type_gids),
                        all_similar,
                        np_mean(anis),
                        np_std(anis),
                        np_mean(afs),
                        np_std(afs),
                        note,
                        ', '.join(type_gids)))

        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()
        
        self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.')
        self.logger.info(f' ... resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.')
        self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.')
        self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.')
        self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.')
        self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.')

        if unresolved_sp_count > 0:
            self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.')
            self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.')
            self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.")
        
        self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.')