Ejemplo n.º 1
0
    def resolve_gtdb_family(self, gid_anis, ncbi_sp, type_gids, cur_genomes):
        """Resolve by flagging type genomes assigned to an unexpected GTDB family.

        The expected family is the GTDB family of the genome returned by
        `cur_genomes.gtdb_type_species_of_genus` for the genus named in
        `ncbi_sp`. Every genome in `type_gids` assigned to a different
        family is marked as untrustworthy.

        Returns `(True, {gid: reason})` only when `self.check_strain_ani`
        passes, at least one genome was flagged, and at least one genome
        matched the expected family. In every other case, including when
        no type species genome is found for the genus, returns `(False, {})`.
        """

        type_species_gid = cur_genomes.gtdb_type_species_of_genus(
            'g__' + generic_name(ncbi_sp))
        if not type_species_gid:
            # no reference genome to define the expected family
            return False, {}

        expected_gtdb_family = cur_genomes[type_species_gid].gtdb_taxa.family

        conflicts = {}
        num_congruent = 0
        for gid in type_gids:
            if cur_genomes[gid].gtdb_taxa.family != expected_gtdb_family:
                # genome sits in a different GTDB family than the one
                # expected for this species
                conflicts[
                    gid] = f'Conflicting GTDB family assignment of {cur_genomes[gid].gtdb_taxa.family}, expected {expected_gtdb_family}'
                continue

            num_congruent += 1

        # the ANI test is always run, even when nothing was flagged
        ani_ok = self.check_strain_ani(gid_anis, conflicts)

        # unresolved unless the ANI test passes, something was flagged,
        # and at least one genome agrees with the expected family
        if not ani_ok or not conflicts or num_congruent == 0:
            return False, {}

        return True, conflicts
    def resolve_gtdb_genus(self, gid_anis, ncbi_sp, type_gids, cur_genomes):
        """Resolve by flagging type genomes with a conflicting GTDB genus assignment.

        The expected genus is the generic name of `ncbi_sp` prefixed with
        'g__'. Each genome in `type_gids` whose canonical GTDB genus
        differs from it is marked as untrustworthy.

        Returns `(True, {gid: reason})` only when `self.check_strain_ani`
        passes, at least one genome was flagged, and at least one genome
        matched the expected genus. Otherwise returns `(False, {})`.
        """

        ncbi_genus = 'g__' + generic_name(ncbi_sp)

        conflicts = {}
        num_congruent = 0
        for gid in type_gids:
            # compare on the canonical genus, but report the genus as assigned
            if canonical_taxon(cur_genomes[gid].gtdb_taxa.genus) != ncbi_genus:
                conflicts[gid] = f'Conflicting GTDB genus assignment of {cur_genomes[gid].gtdb_taxa.genus}, expected {ncbi_genus}'
                continue

            num_congruent += 1

        # the ANI test is always run, even when nothing was flagged
        ani_ok = self.check_strain_ani(gid_anis, conflicts)

        if not ani_ok or not conflicts or num_congruent == 0:
            return False, {}

        return True, conflicts
    def infer_epithet_map(self, gids_of_interest, mc_species, cur_genomes,
                          cur_clusters):
        """Infer mapping of NCBI epithet to GTDB epithet which may be different due to gender of genus.

        Only representatives in `cur_clusters` that are also in
        `gids_of_interest` are considered. A species name in `mc_species`
        (keyed by representative ID) takes precedence over the GTDB
        assignment held in `cur_genomes`.

        Results are stored on the instance rather than returned:

        * `self.gtdb_ncbi_generic_map[gtdb_generic][gtdb_specific]` gets the
          NCBI generic name of every considered representative appended.
        * `self.sp_epithet_map[gtdb_generic][ncbi_specific]` is set to the
          most common matching GTDB epithet when it accounts for at least
          50% of the matches.

        Both attributes must already exist and auto-create nested entries
        (presumably nested defaultdicts -- confirm against the initializer).
        A warning is logged whenever the most common GTDB epithet does not
        account for 100% of the matches.
        """

        # **************************************
        # This should be updated so it only includes valid transfers, and not
        # results due to misclassifications at NCBI. For example, right now this
        # code reports Enterobacter cancerogenus being transferred to Pantoea, but
        # really this is just a misclassified NCBI genome.

        # get species in GTDB genus
        generic_rids = defaultdict(list)
        for rid in cur_clusters:
            if rid not in gids_of_interest:
                continue

            gtdb_generic = cur_genomes[rid].gtdb_taxa.genus.replace('g__', '')
            if rid in mc_species:
                # manually curated name overrides the GTDB genus
                gtdb_generic = generic_name(mc_species[rid])

            generic_rids[gtdb_generic].append(rid)

        # establish epithets that are nearly identical
        # except for small change to suffix which is
        # assumed to be due to a gender change
        for gtdb_generic, rids in generic_rids.items():
            # NCBI epithet -> GTDB epithets judged to be the same epithet
            ncbi_sp_epithet_list = defaultdict(list)
            for rid in rids:
                ncbi_species = cur_genomes[rid].ncbi_taxa.species
                if ncbi_species == 's__':
                    # no NCBI species assignment to map from
                    continue

                ncbi_generic = generic_name(ncbi_species)
                ncbi_specific = specific_epithet(ncbi_species)

                if rid in mc_species:
                    gtdb_species = mc_species[rid]
                else:
                    gtdb_species = cur_genomes[rid].gtdb_taxa.species

                gtdb_specific = canonical_taxon(specific_epithet(gtdb_species))

                self.gtdb_ncbi_generic_map[gtdb_generic][gtdb_specific].append(
                    ncbi_generic)

                if test_same_epithet(ncbi_specific, gtdb_specific):
                    ncbi_sp_epithet_list[ncbi_specific].append(gtdb_specific)

            for ncbi_specific, gtdb_specific_list in ncbi_sp_epithet_list.items():
                # each list has at least one entry since keys are only
                # created by the append above
                top_gtdb_specific, count = Counter(
                    gtdb_specific_list).most_common(1)[0]

                map_perc = count * 100.0 / len(gtdb_specific_list)
                if map_perc >= 50:
                    self.sp_epithet_map[gtdb_generic][
                        ncbi_specific] = top_gtdb_specific

                if map_perc != 100:
                    self.logger.warning(
                        f'Imperfect suffix mapping from {gtdb_generic} {top_gtdb_specific} to {ncbi_specific} at {map_perc:.1f}%.'
                    )
    def run(self, manual_taxonomy, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger,
            sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file):
        """Finalize species names based on results of manual curation.

        Reads the manually curated taxonomy and the current GTDB genome
        set, then writes `type_strains_incongruencies.tsv` to
        `self.output_dir`. The file lists effective type strain genomes
        whose curated GTDB specific epithet does not match their NCBI
        specific epithet.

        `manual_taxonomy` is read with `Taxonomy().read`. The metadata,
        UBA, QC, GenBank assembly, untrustworthy-type and type-strain-ledger
        files are passed to `Genomes.load_from_metadata_file`. The three
        priority/DSMZ files are passed to `SpeciesPriorityManager`.
        `synonym_file` is not used by the code shown here.
        """

        # initialize species priority manager
        # NOTE(review): the manager is not referenced again in this method;
        # construction is kept in case it validates the ledgers -- confirm.
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  dsmz_bacnames_file)

        # identify species and genus names updated during manual curation
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(
            f' - read taxonomy for {len(mc_taxonomy):,} genomes.')

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get all GTDB species represented by a type strain:
        gtdb_type_species = {
            taxa[Taxonomy.SPECIES_INDEX]
            for rid, taxa in mc_taxonomy.items()
            if cur_genomes[rid].is_effective_type_strain()
        }

        # establish appropriate species names for GTDB clusters with new representatives
        self.logger.info(
            'Identifying type strain genomes with incongruent GTDB species assignments.'
        )
        out_path = os.path.join(self.output_dir,
                                'type_strains_incongruencies.tsv')
        num_incongruent = 0

        # the context manager closes the file even if a lookup or write
        # below raises part-way through the loop
        with open(out_path, 'w') as fout:
            fout.write(
                'Genome ID\tGTDB species\tNCBI species\tGTDB type strain\tNCBI type strain\tNCBI RefSeq note\n'
            )

            for rid, taxa in mc_taxonomy.items():
                if not cur_genomes[rid].is_effective_type_strain():
                    continue

                gtdb_sp = taxa[Taxonomy.SPECIES_INDEX]
                gtdb_generic = generic_name(gtdb_sp)

                ncbi_sp = cur_genomes[rid].ncbi_taxa.species
                ncbi_generic = generic_name(ncbi_sp)

                if ncbi_sp == 's__':
                    # NCBI taxonomy is sometimes behind the genome annotation pages,
                    # and does not have a species assignment even for type strain genomes
                    continue

                # check if genome is a valid genus transfer into a genus
                # that already contains a species with the specific
                # name which results in a polyphyletic suffix being required
                # e.g. G002240355 is Prauserella marina at NCBI and is
                # transferred into Saccharomonospora under the GTDB. However,
                # Saccharomonospora marina already exists so this genome
                # needs to be S. marina_A.
                if (is_placeholder_taxon(gtdb_sp)
                        and gtdb_generic != ncbi_generic
                        and canonical_species(gtdb_sp) in gtdb_type_species):
                    continue

                if not test_same_epithet(specific_epithet(gtdb_sp),
                                         specific_epithet(ncbi_sp)):
                    num_incongruent += 1
                    fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        rid, gtdb_sp, ncbi_sp,
                        cur_genomes[rid].is_gtdb_type_strain(),
                        cur_genomes[rid].is_ncbi_type_strain(),
                        cur_genomes[rid].excluded_from_refseq_note))

        self.logger.info(
            f' - identified {num_incongruent:,} genomes with incongruent species assignments.'
        )