Example #1
    def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
            cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file):
        """Perform initial actions required for changed representatives."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file,
                                             uba_genome_file=uba_genome_paths)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            create_sp_clusters=False,
                                            uba_genome_file=uba_genome_paths,
                                            qc_passed_file=qc_passed_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # read named GTDB species clusters
        self.logger.info('Reading GTDB species clusters.')
        cur_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # create curation tree and table indicating new NCBI taxa as these
        # should be considered by GTDB curators
        self.new_ncbi_taxa(prev_genomes, cur_genomes, cur_clusters)
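
The cluster counts logged above follow a convention where the representative genome is not a member of its own cluster set, hence the `len(gids) + 1` per cluster. The actual `read_clusters` implementation (which in some of the examples below also returns per-representative ANI radii) is not shown on this page; the sketch below is a minimal, assumed parser illustrating only the cluster mapping, with the column names being hypothetical.

# Minimal sketch of a read_clusters-style parser. The real column layout of the
# clusters file is not shown on this page; this assumes a headered TSV with the
# representative ID in one column and a comma-separated list of clustered
# (non-representative) genome IDs in another. Column names are hypothetical.
import csv

def read_clusters_sketch(clusters_file, rep_col='Representative', gids_col='Clustered genomes'):
    clusters = {}
    with open(clusters_file) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            gids = row.get(gids_col, '').strip()
            clusters[row[rep_col]] = set(gids.split(',')) if gids else set()
    return clusters

# Counting convention used in the logging above: the representative is excluded
# from its own cluster set, so each cluster spans len(gids) + 1 genomes.
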
Example #2
    def run(self, gtdb_clusters_file,
                    cur_gtdb_metadata_file,
                    uba_genome_paths,
                    qc_passed_file,
                    ncbi_misclassified_file,
                    ncbi_genbank_assembly_file,
                    untrustworthy_type_file,
                    ani_af_rep_vs_nonrep,
                    gtdb_type_strains_ledger,
                    sp_priority_ledger,
                    genus_priority_ledger,
                    dsmz_bacnames_file):
        """Cluster genomes to selected GTDB representatives."""
        
        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=uba_genome_paths,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')
        
        # read named GTDB species clusters
        self.logger.info('Reading named and previous placeholder GTDB species clusters.')
        cur_clusters, rep_radius = read_clusters(gtdb_clusters_file)
        self.logger.info(' ... identified {:,} clusters spanning {:,} genomes.'.format(
                            len(cur_clusters),
                            sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # identify genomes with misclassified species assignments at NCBI
        self.logger.info('Identify genomes with misclassified NCBI species assignments.')
        ncbi_species_mngr = NCBI_SpeciesManager(cur_genomes, cur_clusters, self.output_dir)
        ncbi_misclassified_gids = ncbi_species_mngr.parse_ncbi_misclassified_table(ncbi_misclassified_file)
        self.logger.info(' - identified {:,} genomes with erroneous NCBI species assignments'.format(
                            len(ncbi_misclassified_gids)))
                            
        # identify NCBI species considered to be synonyms under the GTDB
        type_strain_synonyms = ncbi_species_mngr.identify_type_strain_synonyms(ncbi_misclassified_gids)
        consensus_synonyms = ncbi_species_mngr.identify_consensus_synonyms(ncbi_misclassified_gids)

        # read ANI and AF between representatives and non-representative genomes
        self.logger.info('Reading ANI and AF between representative and non-representative genomes.')
        with open(ani_af_rep_vs_nonrep, 'rb') as f:
            ani_af = pickle.load(f)
        
        # write out synonyms
        ncbi_species_mngr.write_synonym_table(type_strain_synonyms,
                                                consensus_synonyms,
                                                ani_af,
                                                sp_priority_ledger,
                                                genus_priority_ledger,
                                                dsmz_bacnames_file)
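
Writing the synonym table requires pairwise ANI and alignment fraction (AF) values between representatives and non-representative genomes, loaded above from a pickle. The structure written by the upstream comparison step is not shown on this page; the sketch below assumes a nested-dict layout purely for illustration.

# Sketch of how the pickled ANI/AF values might be queried. The nested-dict
# layout ani_af[rep_id][gid] = (ani, af) is an assumption for illustration;
# the structure actually produced upstream is not shown here.
def lookup_ani_af(ani_af, rep_id, gid):
    ani, af = ani_af.get(rep_id, {}).get(gid, (0.0, 0.0))
    return ani, af
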
Example #3
    def run(self, gtdb_clusters_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            gtdb_type_strains_ledger, sp_priority_ledger,
            genus_priority_ledger, dsmz_bacnames_file):
        """Cluster genomes to selected GTDB representatives."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # read named GTDB species clusters
        self.logger.info(
            'Reading named and previous placeholder GTDB species clusters.')
        cur_clusters, rep_radius = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # identify genomes with erroneous NCBI species assignments
        self.logger.info(
            'Identifying genomes with erroneous NCBI species assignments as established by ANI type strain genomes.'
        )
        self.identify_misclassified_genomes_ani(cur_genomes, cur_clusters)

        self.logger.info(
            'Identifying genomes with erroneous NCBI species assignments as established by GTDB cluster of type strain genomes.'
        )
        self.identify_misclassified_genomes_cluster(cur_genomes, cur_clusters)
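
Both checks above rest on the same idea: a genome is flagged when its NCBI species assignment disagrees with the type strain genome(s) it is most similar to, either directly by ANI or via the GTDB cluster it was placed in. The sketch below is illustrative only; the helper, its inputs, and the 95% ANI threshold are assumptions, not the values used by `identify_misclassified_genomes_ani`.

# Illustrative-only sketch of the ANI-based check: flag a genome whose closest
# type strain genome (by ANI) belongs to a different NCBI species. The 95% ANI
# circumscription threshold is an assumption for illustration.
def flag_misclassified(gid_ncbi_sp, ani_to_type_strains, type_strain_sp, ani_threshold=95.0):
    """ani_to_type_strains: type strain genome ID -> ANI to the query genome."""
    if not ani_to_type_strains:
        return False
    closest_gid, ani = max(ani_to_type_strains.items(), key=lambda kv: kv[1])
    return ani >= ani_threshold and type_strain_sp[closest_gid] != gid_ncbi_sp
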
Example #4
    def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
            cur_gtdb_metadata_file, qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger):
        """Perform initial actions required for changed representatives."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            ' - previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # read named GTDB species clusters
        self.logger.info('Reading GTDB species clusters.')
        cur_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' - identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # create curation tree and table indicating new NCBI taxa as these
        # should be considered by GTDB curators
        self.new_ncbi_taxa(prev_genomes, cur_genomes, cur_clusters)
Example #5
    def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
            cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file,
            gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            sp_priority_ledger, gtdb_taxa_updates_ledger, dsmz_bacnames_file):
        """Perform initial actions required for changed representatives."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            gtdbtk_classify_file=gtdbtk_classify_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # read named GTDB species clusters
        self.logger.info('Reading GTDB species clusters.')
        cur_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # set current genomes to have same GTDB assignments as in previous
        # GTDB release. This is necessary, since genomes may have different
        # NCBI accession numbers between releases and thus the previous GTDB
        # taxonomy will not be reflected in the latest GTDB database. The
        # exception is if a genome has changed domains, in which case the
        # previous assignment is invalid.
        self.logger.info(
            'Setting GTDB taxonomy of genomes in current genome set.')
        update_count = 0
        conflicting_domain_count = 0
        for prev_gid in prev_genomes:
            if prev_gid in cur_genomes:
                if prev_genomes[prev_gid].gtdb_taxa != cur_genomes[
                        prev_gid].gtdb_taxa:
                    if prev_genomes[prev_gid].gtdb_taxa.domain == cur_genomes[
                            prev_gid].gtdb_taxa.domain:
                        update_count += 1
                        cur_genomes[prev_gid].gtdb_taxa.update_taxa(
                            prev_genomes[prev_gid].gtdb_taxa)
                    else:
                        conflicting_domain_count += 1
        self.logger.info(f' ... updated {update_count:,} genomes.')
        self.logger.info(
            f' ... identified {conflicting_domain_count:,} genomes with conflicting domain assignments.'
        )

        # get explicit updates to previous GTDB taxa
        self.logger.info('Reading explicit taxa updates.')
        explicit_taxon_updates = self._parse_explicit_taxa_updates(
            gtdb_taxa_updates_ledger)
        self.logger.info(
            f' ... identified {len(explicit_taxon_updates):,} updates.')

        self.logger.info(
            'Updating current genomes to reflect explicit taxa updates.')
        update_count = 0
        for cur_taxon, new_taxon in explicit_taxon_updates.items():
            rank_prefix = cur_taxon[0:3]
            rank_index = Taxonomy.rank_prefixes.index(rank_prefix)

            for gid in cur_genomes:
                if cur_genomes[gid].gtdb_taxa.get_taxa(
                        rank_index) == cur_taxon:
                    update_count += 1
                    cur_genomes[gid].gtdb_taxa.set_taxa(rank_index, new_taxon)

                    if rank_prefix == 'g__':
                        # should also update the species name
                        new_sp = cur_genomes[gid].gtdb_taxa.species.replace(
                            cur_taxon[3:], new_taxon[3:])
                        cur_genomes[gid].gtdb_taxa.set_taxa(
                            rank_index + 1, new_sp)

        self.logger.info(f' ... updated {update_count:,} genomes.')

        # initialize species priority manager
        self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                       dsmz_bacnames_file)

        # create table with new NCBI genera that likely need to be incorporated into
        # this release of the GTDB
        self.new_ncbi_genera(prev_genomes, cur_genomes, cur_clusters,
                             gtdbtk_classify_file)

        self.new_ncbi_families(prev_genomes, cur_genomes, cur_clusters,
                               gtdbtk_classify_file)
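
The explicit-update loop above applies curator-supplied taxon renames and, for genus renames, also rewrites the species binomial so it stays consistent with the new genus. The toy function below illustrates that cascade in isolation; it assumes the ledger has already been reduced to a plain old-name to new-name mapping (the real parsing lives in `_parse_explicit_taxa_updates`, not shown here).

# Toy illustration of the genus-rename cascade above, assuming the ledger is a
# plain {old_taxon: new_taxon} dict. Purely illustrative and self-contained.
def apply_taxon_update(gtdb_taxa, cur_taxon, new_taxon):
    """gtdb_taxa: list of 7 rank-prefixed taxa, e.g. ['d__...', ..., 'g__Foo', 's__Foo bar']."""
    rank_prefixes = ('d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__')
    rank_index = rank_prefixes.index(cur_taxon[0:3])
    if gtdb_taxa[rank_index] == cur_taxon:
        gtdb_taxa[rank_index] = new_taxon
        if cur_taxon.startswith('g__'):
            # keep the species binomial consistent with the renamed genus
            gtdb_taxa[rank_index + 1] = gtdb_taxa[rank_index + 1].replace(
                cur_taxon[3:], new_taxon[3:])
    return gtdb_taxa

# e.g. apply_taxon_update([..., 'g__Foo', 's__Foo bar'], 'g__Foo', 'g__Baz')
#      -> [..., 'g__Baz', 's__Baz bar']
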
Example #6
    def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters,
            species_exception_file, output_dir):
        """Quality check all potential GTDB genomes."""

        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info(
            'Reading NCBI and GTDB taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            gtdb_metadata_file, species_exception_file)
        prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
        self.logger.info(
            'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
            % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' %
                         len(prev_gtdb_taxonomy))

        # get GTDB metadata
        type_metadata = read_gtdb_metadata(gtdb_metadata_file, [
            'gtdb_type_designation', 'gtdb_type_designation_sources',
            'gtdb_type_species_of_genus'
        ])

        quality_metadata = read_quality_metadata(gtdb_metadata_file)

        # read species clusters
        sp_clusters, species, _rep_radius = read_clusters(gtdb_final_clusters)
        self.logger.info('Read %d species clusters.' % len(sp_clusters))

        # sanity check species clusters all defined by genomes passing QC
        for gid in sp_clusters:
            if gid not in passed_qc:
                self.logger.error(
                    'Genome %s defines a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        # modify GTDB taxonomy to reflect new species clustering and report incongruencies
        self.logger.info(
            'Identifying species with incongruent specific names.')
        self._incongruent_specific_names(species, ncbi_taxonomy,
                                         prev_gtdb_taxonomy, type_metadata,
                                         output_dir)

        self._incongruent_genus_names(species, ncbi_taxonomy,
                                      prev_gtdb_taxonomy, type_metadata,
                                      output_dir)

        # get GIDs for canonical and validation trees
        fout_bac_can_gtdb = open(
            os.path.join(output_dir, 'bac_can_taxonomy.tsv'), 'w')
        fout_bac_val_gtdb = open(
            os.path.join(output_dir, 'bac_val_taxonomy.tsv'), 'w')
        fout_ar_can_gtdb = open(
            os.path.join(output_dir, 'ar_can_taxonomy.tsv'), 'w')
        fout_ar_val_gtdb = open(
            os.path.join(output_dir, 'ar_val_taxonomy.tsv'), 'w')

        fout_bac_val = open(
            os.path.join(output_dir, 'gids_bac_validation.lst'), 'w')
        fout_ar_val = open(os.path.join(output_dir, 'gids_ar_validation.lst'),
                           'w')
        fout_bac_can = open(os.path.join(output_dir, 'gids_bac_canonical.lst'),
                            'w')
        fout_ar_can = open(os.path.join(output_dir, 'gids_ar_canonical.lst'),
                           'w')
        fout_bac_val.write('#Accession\tSpecies\tNote\n')
        fout_ar_val.write('#Accession\tSpecies\tNote\n')
        fout_bac_can.write('#Accession\tSpecies\tNote\n')
        fout_ar_can.write('#Accession\tSpecies\tNote\n')

        for rid in sp_clusters:
            domain = prev_gtdb_taxonomy[rid][0]
            if domain == 'd__Bacteria':
                fout_val = fout_bac_val
                fout_can = fout_bac_can

                fout_can_gtdb = fout_bac_can_gtdb
                fout_val_gtdb = fout_bac_val_gtdb
            elif domain == 'd__Archaea':
                fout_val = fout_ar_val
                fout_can = fout_ar_can
                fout_can_gtdb = fout_ar_can_gtdb
                fout_val_gtdb = fout_ar_val_gtdb
            else:
                self.logger.error('Genome %s has no GTDB domain assignment.' %
                                  rid)
                sys.exit(-1)

            # substitute proposed species name into GTDB taxonomy
            taxa = prev_gtdb_taxonomy[rid][0:6] + [species[rid]]
            new_gtdb_str = '; '.join(taxa)
            fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
            fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

            fout_val.write(
                '%s\t%s\t%s\n' %
                (rid, species[rid], 'GTDB type or representative genome'))
            fout_can.write(
                '%s\t%s\t%s\n' %
                (rid, species[rid], 'GTDB type or representative genome'))

            cluster_gids = set(sp_clusters[rid])
            for gid in cluster_gids:
                if gid not in passed_qc:
                    self.logger.error(
                        'Genome %s is in a species cluster, but fails QC.' %
                        gid)
                    sys.exit(-1)

            if len(cluster_gids) > 0:
                # select highest-quality genome
                q = quality_score(cluster_gids, quality_metadata)
                gid = max(q.items(), key=operator.itemgetter(1))[0]

                taxa = prev_gtdb_taxonomy[gid][0:6] + [species[rid]]
                new_gtdb_str = '; '.join(taxa)

                fout_val.write(
                    '%s\t%s\t%s\n' %
                    (gid, species[rid],
                     'selected highest-quality genome (Q=%.2f)' % q[gid]))
                fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))

        fout_bac_val.close()
        fout_ar_val.close()
        fout_bac_can.close()
        fout_ar_can.close()

        fout_bac_can_gtdb.close()
        fout_bac_val_gtdb.close()
        fout_ar_can_gtdb.close()
        fout_ar_val_gtdb.close()
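
When a cluster has additional members, the highest-quality genome is chosen via `quality_score`, whose implementation is not reproduced on this page. Completeness minus a contamination penalty is a widely used MAG-quality heuristic, so the sketch below assumes that form purely for illustration; it reuses the checkm_completeness and checkm_contamination fields seen elsewhere in these examples.

# Hypothetical stand-in for quality_score(): completeness minus a 5x
# contamination penalty, assumed here for illustration only.
def quality_score_sketch(gids, quality_metadata):
    return {gid: quality_metadata[gid].checkm_completeness
                 - 5.0 * quality_metadata[gid].checkm_contamination
            for gid in gids}

# selecting the top-scoring genome, as done above:
# q = quality_score_sketch(cluster_gids, quality_metadata)
# best_gid = max(q.items(), key=operator.itemgetter(1))[0]
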
Example #7
    def run(self, updated_sp_rep_file, gtdb_clusters_file,
            prev_gtdb_metadata_file, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger):
        """Summary statistics indicating changes to GTDB species clusters."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # update current genomes with GTDB-Tk classifications
        self.logger.info(
            'Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(
            gtdbtk_classify_file, prev_genomes)
        self.logger.info(
            f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.'
        )

        # report changes in genome sets
        self.logger.info('Comparing previous and current genome sets.')
        prev_gids = set(prev_genomes)
        new_gids = set(cur_genomes)
        num_same_genomes = len(prev_gids.intersection(new_gids))
        num_lost_genomes = len(prev_gids - new_gids)
        num_new_genomes = len(new_gids - prev_gids)
        self.logger.info(
            f' ... identified {num_same_genomes:,} genomes as being present in both genome sets.'
        )
        self.logger.info(
            f' ... identified {num_lost_genomes:,} genomes as being lost from the previous genome set.'
        )
        self.logger.info(
            f' ... identified {num_new_genomes:,} genomes as being new to the current genome set.'
        )

        # get changes to representatives of previous GTDB species clusters
        updated_rids = self._parse_updated_sp_reps(updated_sp_rep_file)

        # get new GTDB species clusters
        self.logger.info('Reading current GTDB clusters.')
        new_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... current genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(new_clusters),
                    sum(len(cids) for cids in new_clusters.values())))

        new_rid_map = {}
        for rid, cids in new_clusters.items():
            for cid in cids:
                new_rid_map[cid] = rid

        # UBA genome sanity check
        prev_uba_count = 0
        for gid in prev_genomes:
            if gid.startswith('UBA'):
                prev_uba_count += 1

        cur_uba_count = 0
        for gid in cur_genomes:
            if gid.startswith('UBA'):
                cur_uba_count += 1

        new_uba_count = 0
        for rid, cids in new_clusters.items():
            for cid in cids:
                if cid.startswith('UBA'):
                    new_uba_count += 1

        assert prev_uba_count == cur_uba_count == new_uba_count
        self.logger.info(
            f'Verified all genome / cluster sets contain the same number of UBA genomes: {prev_uba_count:,}'
        )

        # tabulate changes in GTDB species clusters
        self.logger.info('Calculating statistics of GTDB species clusters.')

        fout = open(
            os.path.join(self.output_dir, 'gtdb_sp_clusters_change_stats.tsv'),
            'w')
        fout.write(
            'Previous representative\tPrevious name\tNew representative\tNew name\tRepresentative status\tName status'
        )
        fout.write(
            '\tNo. previous genomes\tNo. current genomes\tNo. same\tNo. lost\tNo. new\tNo. migrated in\tNo. migrated out\tNote\n'
        )

        rep_lost_count = 0
        rep_changed_count = 0
        rep_unchanged_count = 0
        rep_merged_count = 0

        name_lost_count = 0
        name_changed_count = 0
        name_unchanged_count = 0
        name_merged_count = 0

        prev_cluster_ids = set()
        total_num_same = 0
        total_num_lost = 0
        total_num_new = 0
        total_num_migrated_in = 0
        total_num_migrated_out = 0
        for prev_rid, prev_cids in prev_genomes.sp_clusters.items():
            prev_gtdb_sp = prev_genomes[prev_rid].gtdb_taxa.species

            new_rid = updated_rids[prev_rid]
            prev_cluster_ids.add(new_rid)
            note = ''
            if new_rid is None:
                new_rid = 'none'
                new_gtdb_sp = 'none'
                rep_status = 'LOST'
                name_status = 'LOST'  # unclear what this means; presumably a species name can be recycled elsewhere

                new_cluster = set()

                rep_lost_count += 1
                name_lost_count += 1
            elif new_rid not in new_clusters:
                # representative must have been merged when selecting
                # representatives for NCBI species
                merged_rid = new_rid_map[new_rid]
                merged_sp = cur_genomes[merged_rid].gtdb_taxa.species
                note = 'merged with {} with representative {}'.format(
                    merged_sp, merged_rid)

                new_rid = 'none'
                new_gtdb_sp = 'none'
                rep_status = 'MERGED'
                name_status = 'MERGED'

                new_cluster = set()

                rep_merged_count += 1
                name_merged_count += 1
            else:
                new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
                new_cluster = new_clusters[new_rid]

                if prev_rid == new_rid:
                    rep_status = 'UNCHANGED'
                    rep_unchanged_count += 1
                else:
                    rep_status = 'CHANGED'
                    rep_changed_count += 1

                if prev_gtdb_sp == new_gtdb_sp:
                    name_status = 'UNCHANGED'
                    name_unchanged_count += 1
                else:
                    name_status = 'CHANGED'
                    name_changed_count += 1

            fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format(prev_rid, prev_gtdb_sp,
                                                       new_rid, new_gtdb_sp,
                                                       rep_status,
                                                       name_status))

            num_same = len(new_cluster.intersection(prev_cids))
            num_lost = len(prev_cids - new_gids)
            num_new = len(new_cluster - prev_gids)
            num_migrated_in = len(
                (new_cluster - prev_cids).intersection(prev_gids))
            num_migrated_out = len(
                (prev_cids - new_cluster).intersection(new_gids))
            assert len(new_cluster) == len(
                prev_cids
            ) - num_lost + num_new + num_migrated_in - num_migrated_out
            assert len(prev_cids) == num_same + num_lost + num_migrated_out

            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                len(prev_cids), len(new_cluster), num_same, num_lost, num_new,
                num_migrated_in, num_migrated_out, note))

            total_num_same += num_same
            total_num_lost += num_lost
            total_num_new += num_new
            total_num_migrated_in += num_migrated_in
            total_num_migrated_out += num_migrated_out

        # add in new GTDB species clusters
        new_cluster_count = 0
        for new_rid in new_clusters:
            if new_rid in prev_cluster_ids:
                continue

            new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
            rep_status = 'NEW'
            name_status = 'NEW'
            new_cluster_count += 1

            fout.write('{}\t{}\t{}\t{}\t{}\t{}'.format('n/a', 'n/a', new_rid,
                                                       new_gtdb_sp, rep_status,
                                                       name_status))

            num_new = len(new_clusters[new_rid] - prev_gids)
            num_migrated_in = len(
                new_clusters[new_rid].intersection(prev_gids))
            assert len(new_clusters[new_rid]) == num_new + num_migrated_in
            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                0, len(new_clusters[new_rid]), 0, 0, num_new, num_migrated_in,
                0, ''))

            total_num_new += num_new
            total_num_migrated_in += num_migrated_in

        # report genome statistics
        num_union = len(new_gids.union(prev_gids))
        assert num_union == total_num_same + total_num_lost + total_num_new + total_num_migrated_in
        assert total_num_migrated_in == total_num_migrated_out
        self.logger.info(
            f'There were {len(prev_gids):,} genomes in the previous genome set.'
        )
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that were assigned to the same species cluster.'
            .format(total_num_same, total_num_same * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that were lost from the species cluster.'
            .format(total_num_lost, total_num_lost * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) genomes that migrated between species clusters.'
            .format(total_num_migrated_in,
                    total_num_migrated_in * 100.0 / len(prev_gids)))
        self.logger.info(
            ' ... identified {:,} new genomes which is a {:.2f}% increase.'.
            format(total_num_new,
                   len(new_gids) * 100.0 / len(prev_gids) - 100))

        # report representative statistics
        assert len(new_clusters) == len(
            prev_genomes.sp_clusters
        ) + new_cluster_count - rep_lost_count - rep_merged_count
        self.logger.info(
            f'There are {len(new_clusters):,} total GTDB species representatives.'
        )
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) unchanged representatives.'.format(
                rep_unchanged_count,
                rep_unchanged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) changed representatives.'.format(
                rep_changed_count,
                rep_changed_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) lost representatives.'.format(
                rep_lost_count,
                rep_lost_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) merged representatives.'.format(
                rep_merged_count,
                rep_merged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} new representatives which is a {:.2f}% increase.'
            .format(
                new_cluster_count,
                len(new_clusters) * 100.0 / len(prev_genomes.sp_clusters) -
                100))

        self.logger.info(
            ' ... identified {:,} ({:.2f}%) unchanged cluster names.'.format(
                name_unchanged_count,
                name_unchanged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) changed cluster names.'.format(
                name_changed_count,
                name_changed_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) lost cluster names.'.format(
                name_lost_count,
                name_lost_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) merged cluster names.'.format(
                name_merged_count,
                name_merged_count * 100.0 / len(prev_genomes.sp_clusters)))
        self.logger.info(
            ' ... identified {:,} ({:.2f}%) new cluster names.'.format(
                new_cluster_count,
                new_cluster_count * 100.0 / len(prev_genomes.sp_clusters)))
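
The per-cluster bookkeeping above relies on two set-arithmetic invariants that the asserts enforce: cluster size is conserved after accounting for lost, new, and migrated genomes, and every previous member is either retained, lost, or migrated out. A toy worked example makes them concrete.

# Toy check of the membership invariants asserted above.
prev_cids = {'A', 'B', 'C', 'D'}          # previous cluster members
prev_gids = {'A', 'B', 'C', 'D', 'X'}     # all previous genomes
new_cluster = {'A', 'B', 'X', 'N'}        # current cluster members
new_gids = {'A', 'B', 'X', 'N', 'C'}      # all current genomes ('D' was lost)

num_same = len(new_cluster & prev_cids)                          # A, B -> 2
num_lost = len(prev_cids - new_gids)                             # D -> 1
num_new = len(new_cluster - prev_gids)                           # N -> 1
num_migrated_in = len((new_cluster - prev_cids) & prev_gids)     # X -> 1
num_migrated_out = len((prev_cids - new_cluster) & new_gids)     # C -> 1

assert len(new_cluster) == len(prev_cids) - num_lost + num_new + num_migrated_in - num_migrated_out
assert len(prev_cids) == num_same + num_lost + num_migrated_out
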
Example #8
    def run(self,
            gtdb_clusters_file,
            prev_gtdb_metadata_file,
            cur_gtdb_metadata_file,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger):
        """Summary statistics indicating changes to GTDB species cluster membership."""

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file,
                                             gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                             ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                             untrustworthy_type_ledger=untrustworthy_type_file,
                                             ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(' - previous genome set has {:,} species clusters spanning {:,} genomes.'.format(
            len(prev_genomes.sp_clusters),
            prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                            create_sp_clusters=False,
                                            qc_passed_file=qc_passed_file,
                                            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                            untrustworthy_type_ledger=untrustworthy_type_file,
                                            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # report changes in genome sets
        self.logger.info('Comparing previous and current genome sets.')
        prev_gids = set(prev_genomes)
        new_gids = set(cur_genomes)
        num_same_genomes = len(prev_gids.intersection(new_gids))
        num_lost_genomes = len(prev_gids - new_gids)
        num_new_genomes = len(new_gids - prev_gids)
        self.logger.info(
            f' - identified {num_same_genomes:,} genomes as being present in both genome sets.')
        self.logger.info(
            f' - identified {num_lost_genomes:,} genomes as being lost from the previous genome set.')
        self.logger.info(
            f' - identified {num_new_genomes:,} genomes as being new to the current genome set.')

        # get new GTDB species clusters
        self.logger.info('Reading current GTDB clusters.')
        new_clusters, _ = read_clusters(gtdb_clusters_file)
        self.logger.info(' - current genome set has {:,} species clusters spanning {:,} genomes.'.format(
            len(new_clusters),
            sum(len(cids) for cids in new_clusters.values())))

        new_rid_map = {}
        for rid, cids in new_clusters.items():
            for cid in cids:
                new_rid_map[cid] = rid

        # get mapping of previous GTDB representatives to new GTDB species clusters
        self.logger.info(
            'Mapping previous GTDB representatives to new representatives.')
        prev_to_new_rid = prev_genomes.sp_clusters.updated_representatives(
            new_clusters)
        self.logger.info(
            ' - mapped {:,} previous representatives.'.format(len(prev_to_new_rid)))

        new_to_prev_rids = defaultdict(list)
        for prev_rid, new_rid in prev_to_new_rid.items():
            new_to_prev_rids[new_rid].append(prev_rid)

        # tabulate changes in GTDB species clusters
        self.logger.info('Calculating statistics of GTDB species clusters.')

        fout = open(os.path.join(self.output_dir,
                                 'gtdb_sp_clusters_change_stats.tsv'), 'w')
        fout.write(
            'New representative\tPrevious representative(s)\tPrevious name(s)\tRepresentative status')
        fout.write(
            '\tNo. previous genomes\tNo. current genomes\tNo. same\tNo. lost\tNo. new\tNo. migrated in\tNo. migrated out\n')

        rep_lost_count = 0
        rep_changed_count = 0
        rep_unchanged_count = 0
        rep_merger_count = 0

        prev_cluster_ids = set()
        total_num_same = 0
        total_num_lost = 0
        total_num_new = 0
        total_num_migrated_in = 0
        total_num_migrated_out = 0

        for new_rid, prev_rids in new_to_prev_rids.items():
            prev_cluster_ids.add(new_rid)

            prev_gtdb_sp = [
                prev_genomes[prev_rid].gtdb_taxa.species for prev_rid in prev_rids]

            prev_cids = set()
            for prev_rid in prev_rids:
                prev_cids.update(prev_genomes.sp_clusters[prev_rid])

            if new_rid is None:
                new_rid = 'none'
                rep_status = 'LOST'
                new_cluster = set()
                rep_lost_count += len(prev_rids)
            else:
                new_cluster = new_clusters[new_rid]

                if len(prev_rids) == 1:
                    if prev_rids[0] == new_rid:
                        rep_status = 'UNCHANGED'
                        rep_unchanged_count += 1
                    else:
                        rep_status = 'CHANGED'
                        rep_changed_count += 1
                else:
                    rep_status = 'MERGER'
                    rep_merger_count += len(prev_rids)

            fout.write('{}\t{}\t{}\t{}'.format(
                new_rid,
                ', '.join(prev_rids),
                ', '.join(prev_gtdb_sp),
                rep_status))

            num_same = len(new_cluster.intersection(prev_cids))
            num_new = len(new_cluster - prev_gids)
            num_lost = len(prev_cids - new_gids)

            num_migrated_in = len(
                (new_cluster - prev_cids).intersection(prev_gids))
            num_migrated_out = len(
                (prev_cids - new_cluster).intersection(new_gids))

            assert len(new_cluster) == len(prev_cids) - num_lost + \
                num_new + num_migrated_in - num_migrated_out
            assert len(prev_cids) == num_same + num_lost + num_migrated_out

            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                len(prev_cids),
                len(new_cluster),
                num_same,
                num_lost,
                num_new,
                num_migrated_in,
                num_migrated_out))

            total_num_same += num_same
            total_num_lost += num_lost
            total_num_new += num_new
            total_num_migrated_in += num_migrated_in
            total_num_migrated_out += num_migrated_out

        assert len(prev_genomes.sp_clusters) == rep_unchanged_count + \
            rep_changed_count + rep_merger_count + rep_lost_count

        # add in new GTDB species clusters
        new_cluster_count = 0
        for new_rid in new_clusters:
            if new_rid in prev_cluster_ids:
                continue

            new_gtdb_sp = cur_genomes[new_rid].gtdb_taxa.species
            rep_status = 'NEW'
            new_cluster_count += 1

            fout.write('{}\t{}\t{}\t{}'.format(
                new_rid,
                'n/a',
                'n/a',
                rep_status))

            num_new = len(new_clusters[new_rid] - prev_gids)
            num_migrated_in = len(
                new_clusters[new_rid].intersection(prev_gids))
            assert len(new_clusters[new_rid]) == num_new + num_migrated_in
            fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                0,
                len(new_clusters[new_rid]),
                0,
                0,
                num_new,
                num_migrated_in,
                0))

            total_num_new += num_new
            total_num_migrated_in += num_migrated_in

        assert len(new_gids.union(prev_gids)) == total_num_same + \
            total_num_lost + total_num_new + total_num_migrated_in
        assert total_num_migrated_in == total_num_migrated_out

        # report genome statistics
        assert len(prev_gids) == total_num_same + \
            total_num_lost + total_num_migrated_in
        self.logger.info(
            f'There were {len(prev_gids):,} genomes in the previous release.')
        self.logger.info(' - identified {:,} ({:.2f}%) genomes that were assigned to the same species cluster.'.format(
            total_num_same,
            total_num_same*100.0/len(prev_gids)))
        self.logger.info(' - identified {:,} ({:.2f}%) genomes that were lost from the species cluster.'.format(
            total_num_lost,
            total_num_lost*100.0/len(prev_gids)))
        self.logger.info(' - identified {:,} ({:.2f}%) genomes that migrated between species clusters.'.format(
            total_num_migrated_in,
            total_num_migrated_in*100.0/len(prev_gids)))
        self.logger.info('Identified {:,} new genomes which is a {:.2f}% increase.'.format(
            total_num_new,
            len(new_gids)*100.0/len(prev_gids) - 100))

        # report representative statistics
        assert len(prev_genomes.sp_clusters) == rep_unchanged_count + \
            rep_changed_count + rep_lost_count + rep_merger_count
        self.logger.info(
            f'There were {len(prev_genomes.sp_clusters):,} previous GTDB species representatives.')
        self.logger.info(' - identified {:,} ({:.2f}%) unchanged representatives.'.format(
            rep_unchanged_count,
            rep_unchanged_count*100.0/len(prev_genomes.sp_clusters)))
        self.logger.info(' - identified {:,} ({:.2f}%) changed representatives.'.format(
            rep_changed_count,
            rep_changed_count*100.0/len(prev_genomes.sp_clusters)))
        self.logger.info(' - identified {:,} ({:.2f}%) lost representatives.'.format(
            rep_lost_count,
            rep_lost_count*100.0/len(prev_genomes.sp_clusters)))
        self.logger.info(' - identified {:,} ({:.2f}%) merged representatives.'.format(
            rep_merger_count,
            rep_merger_count*100.0/len(prev_genomes.sp_clusters)))
        self.logger.info('Identified {:,} new representatives which is a {:.2f}% increase.'.format(
            new_cluster_count,
            len(new_clusters)*100.0/len(prev_genomes.sp_clusters) - 100))
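
The table above is driven by the previous-representative to new-representative mapping returned by `updated_representatives` (the previous example reads an equivalent mapping from file via `_parse_updated_sp_reps`). Its implementation is not shown on this page; the sketch below is one assumed reading, mapping each previous representative to whichever new cluster now contains it, or to None when it is absent.

# Minimal sketch, assuming a previous representative maps to the new cluster
# that now contains it (itself included), and to None when it no longer appears.
# This is an assumed reading of updated_representatives(), not its real code.
def updated_representatives_sketch(prev_rids, new_clusters):
    genome_to_new_rid = {}
    for new_rid, cids in new_clusters.items():
        genome_to_new_rid[new_rid] = new_rid
        for cid in cids:
            genome_to_new_rid[cid] = new_rid
    return {prev_rid: genome_to_new_rid.get(prev_rid) for prev_rid in prev_rids}
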
Example #9
    def run(self, named_cluster_file, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, ani_af_rep_vs_nonrep,
            gtdb_type_strains_ledger, sp_priority_ledger):
        """Cluster genomes to selected GTDB representatives."""

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # read named GTDB species clusters
        self.logger.info(
            'Reading named and previous placeholder GTDB species clusters.')
        clusters, rep_radius = read_clusters(named_cluster_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(clusters),
                sum([len(gids) + 1 for gids in clusters.values()])))

        # identify NCBI species with multiple genomes assembled from type strain of species
        self.logger.info(
            'Determining effective type strain genomes in each NCBI species.')
        ncbi_sp_type_strain_genomes = cur_genomes.ncbi_sp_effective_type_genomes(
        )
        self.logger.info(
            ' ... identified effective type strain genomes for {:,} NCBI species.'
            .format(len(ncbi_sp_type_strain_genomes)))

        # verify that type genomes for a species are contained in a
        # single GTDB species cluster
        rid_map = {}
        for rid, gids in clusters.items():
            rid_map[rid] = rid
            for gid in gids:
                rid_map[gid] = rid

        for ncbi_sp, type_gids in ncbi_sp_type_strain_genomes.items():
            gtdb_rids = set(
                [rid_map[gid] for gid in type_gids if gid in rid_map])
            if len(gtdb_rids) > 1:
                self.logger.warning(
                    'Type strain genomes from NCBI species {} were assigned to {:,} GTDB species clusters: {}.'
                    .format(ncbi_sp, len(gtdb_rids),
                            [(rid, cur_genomes[rid].gtdb_taxa.species)
                             for rid in gtdb_rids]))

        # identify synonyms
        self.logger.info('Identifying synonyms.')
        synonyms = defaultdict(list)
        failed_type_strain_priority = 0
        for rid, gids in clusters.items():
            rep_ncbi_sp = cur_genomes[rid].ncbi_taxa.species

            # find species that are a synonym to the current representative,
            # using the best quality genome for each species to establish
            # synonym statistics such as ANI and AF
            type_gids = [
                gid for gid in gids
                if cur_genomes[gid].is_effective_type_strain()
            ]
            if not cur_genomes[rid].is_effective_type_strain() and len(
                    type_gids) > 0:
                failed_type_strain_priority += 1
                continue

            q = {
                gid: cur_genomes[gid].score_type_strain()
                for gid in type_gids
            }
            q_sorted = sorted(q.items(),
                              key=lambda kv: (kv[1], kv[0]),
                              reverse=True)
            processed_sp = set()
            for gid, _quality in q_sorted:
                cur_ncbi_sp = cur_genomes[gid].ncbi_taxa.species

                if cur_ncbi_sp in processed_sp:
                    continue

                if cur_ncbi_sp != rep_ncbi_sp:
                    synonyms[rid].append(gid)
                    processed_sp.add(cur_ncbi_sp)

        self.logger.info(
            ' ... identified {:,} GTDB representatives resulting in {:,} synonyms.'
            .format(len(synonyms),
                    sum([len(gids) for gids in synonyms.values()])))

        if failed_type_strain_priority:
            self.logger.warning(
                f'Identified {failed_type_strain_priority:,} non-type strain representatives that failed to prioritize an effective type strain genome.'
            )

        # read ANI and AF between representatives and non-representative genomes
        self.logger.info(
            'Reading ANI and AF between representative and non-representative genomes.'
        )
        with open(ani_af_rep_vs_nonrep, 'rb') as f:
            ani_af = pickle.load(f)

        # write out synonyms
        self.write_synonym_table(synonyms, cur_genomes, ani_af,
                                 sp_priority_ledger)
Example #10
    def run(self, metadata_file, genome_path_file, final_cluster_file):
        """Cluster User genomes to GTDB species clusters."""

        # get path to genome FASTA files
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)

        # read existing cluster information
        self.logger.info('Reading already established species clusters.')
        sp_clusters, species, rep_radius = read_clusters(final_cluster_file)

        clustered_genomes = set()
        for rep_id in sp_clusters:
            clustered_genomes.add(rep_id)
            clustered_genomes.update(sp_clusters[rep_id])

        self.logger.info(
            'Identified %d species clusters spanning %d genomes.' %
            (len(sp_clusters), len(clustered_genomes)))

        # get User genomes to cluster
        self.logger.info('Parsing quality statistics for all genomes.')
        quality_metadata = read_quality_metadata(metadata_file)

        user_genomes = set()
        for gid in quality_metadata:
            if gid in clustered_genomes:
                continue

            if (quality_metadata[gid].checkm_completeness > 50
                    and quality_metadata[gid].checkm_contamination < 10):
                user_genomes.add(gid)

        self.logger.info('Identified %d User genomes to cluster.' %
                         len(user_genomes))

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info(
            'Calculating Mash ANI estimates between User genomes and species clusters.'
        )
        mash_anis = self._mash_ani(genome_files, user_genomes, sp_clusters)

        # cluster User genomes to species clusters
        self.logger.info('Assigning User genomes to closest species cluster.')
        self._cluster(genome_files, sp_clusters, rep_radius, user_genomes,
                      mash_anis)

        num_clustered_genomes = 0
        for rep_id in sp_clusters:
            num_clustered_genomes += 1
            num_clustered_genomes += len(sp_clusters[rep_id])

        self.logger.info(
            'The %d species clusters span %d genomes, including User genomes.'
            % (len(sp_clusters), num_clustered_genomes))

        # report clustering
        user_cluster_file = os.path.join(self.output_dir,
                                         'gtdb_user_clusters.tsv')
        fout = open(user_cluster_file, 'w')
        fout.write('Type genome\tNo. clustered genomes\tClustered genomes\n')
        for rep_id in sp_clusters:
            fout.write('%s\t%d\t%s\n' % (rep_id, len(
                sp_clusters[rep_id]), ','.join(sp_clusters[rep_id])))
        fout.close()
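
`_cluster` assigns each User genome to its closest established species cluster; that method is not reproduced on this page. The sketch below shows one plausible assignment rule (highest-ANI representative whose ANI falls within that representative's circumscription radius), hedged as an assumption about the logic rather than the actual implementation, with rep_radius assumed to be a per-representative ANI threshold and 95% used as a fallback.

# Sketch of one plausible assignment rule for _cluster(): place a User genome
# in the cluster of the representative with the highest ANI, provided that ANI
# is within the representative's circumscription radius. The rep_radius layout
# (dict of rep ID -> ANI threshold) and the 95.0 fallback are assumptions.
def assign_to_closest_cluster(user_gid, anis_to_reps, rep_radius):
    """anis_to_reps: representative ID -> ANI between user_gid and that representative."""
    candidates = [(ani, rid) for rid, ani in anis_to_reps.items()
                  if ani >= rep_radius.get(rid, 95.0)]
    return max(candidates)[1] if candidates else None
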