def run(self, named_rep_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            rep_mash_sketch_file, rep_ani_file, gtdb_type_strains_ledger):
        """Assign non-representative genomes to the selected GTDB representatives.

        Steps, in order: load the current genome set and its genomic file
        paths, read representative genome IDs from `named_rep_file`, obtain
        a radius per representative via `self._rep_radius`, obtain ANI
        results via `self._calculate_ani`, and cluster the remaining
        genomes via `self._cluster`.

        Writes 'gtdb_rep_ani_radius.tsv' and 'gtdb_named_rep_clusters.tsv'
        into `self.output_dir`.
        """

        info = self.logger.info

        # build the genome set for the current release
        info('Creating current GTDB genome set.')
        genome_set = Genomes()
        genome_set.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        info(f' ... current genome set contains {len(genome_set):,} genomes.')

        # register genomic FASTA paths from both path files
        info('Reading path to current genomic FASTA files.')
        for path_file in (cur_genomic_path_file, uba_genome_paths):
            genome_set.load_genomic_file_paths(path_file)

        # read the representative genome IDs from the tab-separated table
        rep_gids = set()
        with open(named_rep_file) as fin:
            columns = fin.readline().strip().split('\t')
            rep_col = columns.index('Representative')
            # value is not used below, but the lookup still raises
            # ValueError when the header lacks this column
            _sp_col = columns.index('Proposed species')

            for row in fin:
                rep_gid = row.strip().split('\t')[rep_col]
                assert rep_gid in genome_set
                rep_gids.add(rep_gid)

        rep_count = len(rep_gids)
        info('Identified representative genomes for {:,} species.'.format(
            rep_count))

        # per-representative circumscription radius
        info('Determining ANI species circumscription for {:,} representative genomes.'
             .format(rep_count))
        rep_radius = self._rep_radius(rep_gids, rep_ani_file)
        radius_path = os.path.join(self.output_dir, 'gtdb_rep_ani_radius.tsv')
        write_rep_radius(rep_radius, genome_set, radius_path)

        # ANI between representatives and everything else
        info('Calculating ANI between representative and non-representative genomes.')
        ani_af = self._calculate_ani(genome_set, rep_gids,
                                     rep_mash_sketch_file)
        query_count = len(ani_af)
        info(' ... ANI values determined for {:,} query genomes.'.format(
            query_count))
        pair_count = sum(len(ani_af[query_gid]) for query_gid in ani_af)
        info(' ... ANI values determined for {:,} genome pairs.'.format(
            pair_count))

        # every genome without a radius entry is a candidate for clustering
        non_reps = set(genome_set.genomes) - set(rep_radius)
        info('Clustering {:,} non-representatives to {:,} representatives using species-specific ANI radii.'
             .format(len(non_reps), len(rep_radius)))
        clusters = self._cluster(ani_af, non_reps, rep_radius)

        # persist the resulting clusters
        cluster_path = os.path.join(self.output_dir,
                                    'gtdb_named_rep_clusters.tsv')
        write_clusters(clusters, rep_radius, genome_set, cluster_path)
    def run(self, named_cluster_file,
            cur_gtdb_metadata_file,
            cur_genomic_path_file,
            uba_genome_paths,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            ani_af_rep_vs_nonrep,
            gtdb_type_strains_ledger):
        """Build de novo species clusters for genomes outside the named clusters.

        Loads the current genome set, reads the named clusters from
        `named_cluster_file`, and treats every genome that is neither a
        named representative nor clustered to one as unclustered. De novo
        representatives are chosen from the unclustered genomes, all
        genomes are clustered, and the results are written to
        'gtdb_clusters_de_novo.tsv' and 'gtdb_ani_radius_de_novo.tsv' in
        `self.output_dir`.
        """

        info = self.logger.info

        # load the genome set for the current release
        info('Creating current GTDB genome set.')
        genome_set = Genomes()
        genome_set.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        info(f' ... current genome set contains {len(genome_set):,} genomes.')

        # register genomic FASTA paths from both path files
        info('Reading path to current genomic FASTA files.')
        for path_file in (cur_genomic_path_file, uba_genome_paths):
            genome_set.load_genomic_file_paths(path_file)

        # named representatives, their members, and their radii
        info('Reading named GTDB species clusters.')
        parsed = self._parse_named_clusters(named_cluster_file)
        named_rep_gids, rep_clustered_gids, rep_radius = parsed
        info(' ... identified {:,} representative genomes.'.format(
            len(named_rep_gids)))
        info(' ... identified {:,} clustered genomes.'.format(
            len(rep_clustered_gids)))

        # whatever is neither a representative nor clustered is left over
        unclustered_gids = set(genome_set.genomes.keys())
        unclustered_gids -= named_rep_gids
        unclustered_gids -= rep_clustered_gids
        unclustered_count = len(unclustered_gids)
        info('Identified {:,} unclustered genomes passing QC.'.format(
            unclustered_count))

        # radius information for each unclustered genome
        info('Determining ANI circumscription for {:,} unclustered genomes.'.format(
            unclustered_count))
        nonrep_radius = self._nonrep_radius(unclustered_gids,
                                            named_rep_gids,
                                            ani_af_rep_vs_nonrep)

        # Mash-based ANI estimates among the unclustered genomes
        info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(genome_set, unclustered_gids)

        # choose the de novo representatives
        de_novo_rep_gids = self._selected_rep_genomes(
            genome_set, nonrep_radius, unclustered_gids, mash_anis)

        # merged radius table: named representatives first, then overrides
        # and additions from the unclustered genomes
        final_cluster_radius = rep_radius.copy()
        final_cluster_radius.update(nonrep_radius)

        final_clusters, _ani_af = self._cluster_genomes(
            genome_set, de_novo_rep_gids, named_rep_gids,
            final_cluster_radius)

        # keep radius entries only for genomes that head a cluster
        not_reps = set(final_cluster_radius).difference(set(final_clusters))
        for gid in not_reps:
            del final_cluster_radius[gid]

        info('Writing {:,} species clusters to file.'.format(
            len(final_clusters)))
        info('Writing {:,} cluster radius information to file.'.format(
            len(final_cluster_radius)))

        cluster_path = os.path.join(self.output_dir,
                                    'gtdb_clusters_de_novo.tsv')
        write_clusters(final_clusters, final_cluster_radius, genome_set,
                       cluster_path)

        radius_path = os.path.join(self.output_dir,
                                   'gtdb_ani_radius_de_novo.tsv')
        write_rep_radius(final_cluster_radius, genome_set, radius_path)
Example #3
0
    def run(self, qc_file,
            metadata_file,
            gtdb_user_genomes_file,
            genome_path_file,
            type_genome_cluster_file,
            type_genome_synonym_file,
            ncbi_refseq_assembly_file,
            ncbi_genbank_assembly_file,
            ani_af_nontype_vs_type,
            species_exception_file,
            rnd_type_genome):
        """Form de novo species clusters for genomes not tied to a type genome.

        Genomes that passed QC and are neither type genomes nor clustered
        to one are treated as unclustered. Representatives are selected
        from them, all genomes are clustered, species names are assigned
        to the new clusters, and the results are written to
        'gtdb_rep_genome_info.tsv', 'gtdb_clusters_final.tsv' and
        'gtdb_ani_radius_final.tsv' in `self.output_dir`.
        """

        info = self.logger.info

        # genomes that passed quality control
        info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        info('Identified %d genomes passing QC.' % len(passed_qc))

        # NCBI and GTDB taxonomy per genome
        info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            metadata_file, species_exception_file)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        info('Read NCBI taxonomy for %d genomes with %d manually defined updates.'
             % (len(ncbi_taxonomy), ncbi_update_count))
        info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

        # notes taken from the NCBI assembly files
        info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(
            ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # genome FASTA paths, restricted to genomes that passed QC
        info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)
        info('Read path for %d genomes.' % len(genome_files))
        for failed_gid in set(genome_files).difference(passed_qc):
            genome_files.pop(failed_gid)
        info('Considering %d genomes as potential representatives after removing unwanted User genomes.'
             % len(genome_files))
        assert len(genome_files) == len(passed_qc)

        # type genomes, their cluster members, and their radii
        parsed = self._parse_type_clusters(type_genome_cluster_file)
        (type_species, species_type_gid, type_gids,
         type_clustered_gids, type_radius) = parsed
        assert len(type_species) == len(type_gids)
        info('Identified %d type genomes.' % len(type_gids))
        info('Identified %d clustered genomes.' % len(type_clustered_gids))

        # quality statistics and the score derived from them
        info('Parse quality statistics for all genomes.')
        quality_metadata = read_quality_metadata(metadata_file)

        info('Calculating genome quality score.')
        genome_quality = quality_score(quality_metadata.keys(),
                                       quality_metadata)

        # QC-passing genomes not yet tied to any type genome
        unclustered_gids = passed_qc - type_gids - type_clustered_gids
        unclustered_count = len(unclustered_gids)
        info('Identified %d unclustered genomes passing QC.' % unclustered_count)

        # radius information for each unclustered genome
        info('Determining ANI circumscription for %d unclustered genomes.'
             % unclustered_count)
        nontype_radius = self._nontype_radius(unclustered_gids,
                                              type_gids,
                                              ani_af_nontype_vs_type)

        # Mash-based ANI estimates among the unclustered genomes
        info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

        # choose representatives among the unclustered genomes
        rep_genomes = self._selected_rep_genomes(
            genome_files, nontype_radius, unclustered_gids, mash_anis,
            quality_metadata, rnd_type_genome)

        # merged radius table: type genomes first, then non-type entries
        final_cluster_radius = type_radius.copy()
        final_cluster_radius.update(nontype_radius)

        final_clusters, ani_af = self._cluster_genomes(
            genome_files, rep_genomes, type_gids, passed_qc,
            final_cluster_radius)

        # clusters headed by the newly selected representatives
        rep_clusters = {gid: final_clusters[gid] for gid in rep_genomes}

        # synonyms restrict which species names may be used
        synonyms = self._parse_synonyms(type_genome_synonym_file)
        info('Identified %d synonyms.' % len(synonyms))

        # User genomes that carry an NCBI accession
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file,
                                                       metadata_file)
        info('Identified %d GTDB User genomes with NCBI accessions.'
             % len(gtdb_user_to_genbank))

        # name each de novo cluster
        names_in_use = synonyms.union(type_species)
        info('Identified %d species names already in use.' % len(names_in_use))
        info('Assigning species name to each de novo species cluster.')
        cluster_sp_names = self._assign_species_names(
            rep_clusters, names_in_use, gtdb_taxonomy, gtdb_user_to_genbank)

        # details about the selected representative genomes
        rep_info_path = os.path.join(self.output_dir,
                                     'gtdb_rep_genome_info.tsv')
        self._write_rep_info(rep_clusters,
                             cluster_sp_names,
                             quality_metadata,
                             genome_quality,
                             excluded_from_refseq_note,
                             ani_af,
                             rep_info_path)

        # keep radius entries only for genomes that head a cluster
        not_reps = set(final_cluster_radius).difference(set(final_clusters))
        for gid in not_reps:
            del final_cluster_radius[gid]

        # the name table is extended in place with the type genome entries
        cluster_sp_names.update(species_type_gid)
        all_species = cluster_sp_names

        info('Writing %d species clusters to file.' % len(all_species))
        info('Writing %d cluster radius information to file.'
             % len(final_cluster_radius))

        cluster_path = os.path.join(self.output_dir, 'gtdb_clusters_final.tsv')
        write_clusters(final_clusters, final_cluster_radius, all_species,
                       cluster_path)

        radius_path = os.path.join(self.output_dir,
                                   'gtdb_ani_radius_final.tsv')
        write_rep_radius(final_cluster_radius, all_species, radius_path)
        
Example #4
0
    def run(self, qc_file, metadata_file, genome_path_file,
            named_type_genome_file, type_genome_ani_file, mash_sketch_file,
            species_exception_file):
        """Assign non-type genomes to the selected GTDB type genomes.

        Reads the type genomes from `named_type_genome_file`, obtains a
        radius for each via `self._type_genome_radius`, obtains ANI results
        via `self._calculate_ani`, and clusters the remaining QC-passing
        genomes via `self._cluster`.

        Writes 'gtdb_type_genome_ani_radius.tsv' and
        'gtdb_type_genome_clusters.tsv' into `self.output_dir`.
        """

        info = self.logger.info

        # genomes that passed quality control
        info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        info('Identified %d genomes passing QC.' % len(passed_qc))

        # map each type genome to the species named in the same row
        species_type_gid = {}
        with open(named_type_genome_file) as fin:
            columns = fin.readline().strip().split('\t')
            gid_col = columns.index('Type genome')
            species_col = columns.index('NCBI species')

            for row in fin:
                fields = row.strip().split('\t')
                species_type_gid[fields[gid_col]] = fields[species_col]
        type_gids = set(species_type_gid)
        info('Identified type genomes for %d species.' % len(species_type_gid))

        # per-type-genome circumscription radius
        info('Determining ANI species circumscription for %d type genomes.'
             % len(type_gids))
        type_radius = self._type_genome_radius(type_gids, type_genome_ani_file)
        assert len(type_radius) == len(species_type_gid)

        radius_path = os.path.join(self.output_dir,
                                   'gtdb_type_genome_ani_radius.tsv')
        write_rep_radius(type_radius, species_type_gid, radius_path)

        # genome FASTA paths, restricted to genomes that passed QC
        info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)
        info('Read path for %d genomes.' % len(genome_files))
        for failed_gid in set(genome_files).difference(passed_qc):
            genome_files.pop(failed_gid)
        info('Considering %d genomes after removing unwanted User genomes.'
             % len(genome_files))
        assert len(genome_files) == len(passed_qc)

        # NCBI taxonomy per genome
        info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            metadata_file, species_exception_file)
        info('Read NCBI taxonomy for %d genomes with %d manually defined updates.'
             % (len(ncbi_taxonomy), ncbi_update_count))

        # ANI between type genomes and everything else
        info('Calculating ANI between type and non-type genomes.')
        ani_af = self._calculate_ani(type_gids, genome_files, ncbi_taxonomy,
                                     mash_sketch_file)

        # every genome without a radius entry is a candidate for clustering
        nontype_gids = set(genome_files) - set(type_radius)
        info('Clustering %d non-type genomes to type genomes using species specific ANI radii.'
             % len(nontype_gids))
        clusters = self._cluster(ani_af, nontype_gids, type_radius)

        # persist the resulting clusters
        cluster_path = os.path.join(self.output_dir,
                                    'gtdb_type_genome_clusters.tsv')
        write_clusters(clusters, type_radius, species_type_gid, cluster_path)
Example #5
0
    def run(self, named_cluster_file,
            cur_gtdb_metadata_file,
            cur_genomic_path_file,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            ani_af_rep_vs_nonrep,
            gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger):
        """Infer de novo species clusters and representatives for remaining genomes.

        Genomes in the current set that are neither named representatives
        nor clustered to one are treated as unclustered. De novo
        representatives are selected from them and all genomes are then
        clustered.

        Args:
            named_cluster_file: parsed by `self.parse_named_clusters` to
                obtain named representatives, their members, and radii.
            cur_gtdb_metadata_file: metadata file the genome set is loaded from.
            cur_genomic_path_file: file handed to
                `Genomes.load_genomic_file_paths`.
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger: forwarded unchanged to
                `Genomes.load_from_metadata_file`.
            ani_af_rep_vs_nonrep: forwarded to `self.nonrep_radius`.

        Files written to `self.output_dir`:
            gtdb_clusters_de_novo.tsv, gtdb_ani_radius_de_novo.tsv,
            gtdb_reps_ar.lst, gtdb_reps_bac.lst
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                            create_sp_clusters=False,
                                            qc_passed_file=qc_passed_file,
                                            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                            untrustworthy_type_ledger=untrustworthy_type_file,
                                            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)

        # get path to current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)

        # determine representatives and genomes clustered to each representative
        self.logger.info('Reading named GTDB species clusters.')
        named_rep_gids, rep_clustered_gids, rep_radius = self.parse_named_clusters(
            named_cluster_file)
        self.logger.info(
            ' - identified {:,} representative genomes.'.format(len(named_rep_gids)))
        self.logger.info(
            ' - identified {:,} clustered genomes.'.format(len(rep_clustered_gids)))

        # determine genomes left to be clustered
        unclustered_gids = set(cur_genomes.genomes.keys()) - \
            named_rep_gids - rep_clustered_gids
        self.logger.info('Identified {:,} unclustered genomes passing QC.'.format(
            len(unclustered_gids)))

        # establish closest representative for each unclustered genome
        self.logger.info('Determining ANI circumscription for {:,} unclustered genomes.'.format(
            len(unclustered_gids)))
        nonrep_radius = self.nonrep_radius(
            unclustered_gids, named_rep_gids, ani_af_rep_vs_nonrep)

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info(
            'Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self.mash_ani_unclustered(cur_genomes, unclustered_gids)

        # select de novo species representatives in a greedy fashion based on genome quality
        de_novo_rep_gids = self.selected_rep_genomes(cur_genomes,
                                                     nonrep_radius,
                                                     unclustered_gids,
                                                     mash_anis)

        # cluster all non-representative genomes to representative genomes;
        # the merged table starts from the named radii and is then updated
        # with the entries for the previously unclustered genomes
        final_cluster_radius = rep_radius.copy()
        final_cluster_radius.update(nonrep_radius)

        final_clusters, _ani_af = self.cluster_genomes(cur_genomes,
                                                       de_novo_rep_gids,
                                                       named_rep_gids,
                                                       final_cluster_radius)

        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]

        self.logger.info(
            'Writing {:,} species clusters to file.'.format(len(final_clusters)))
        self.logger.info('Writing {:,} cluster radius information to file.'.format(
            len(final_cluster_radius)))

        write_clusters(final_clusters,
                       final_cluster_radius,
                       cur_genomes,
                       os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv'))

        write_rep_radius(final_cluster_radius,
                         cur_genomes,
                         os.path.join(self.output_dir, 'gtdb_ani_radius_de_novo.tsv'))

        # write out archaeal and bacterial GTDB representatives; the context
        # manager closes (and flushes) both files even if a lookup in the
        # loop raises
        ar_path = os.path.join(self.output_dir, 'gtdb_reps_ar.lst')
        bac_path = os.path.join(self.output_dir, 'gtdb_reps_bac.lst')
        with open(ar_path, 'w') as fout_ar, open(bac_path, 'w') as fout_bac:
            for rid in final_clusters:
                genome = cur_genomes[rid]
                domain = genome.gtdb_taxa.domain
                if domain == 'd__Bacteria':
                    fout_bac.write('{}\n'.format(genome.ncbi_accn))
                elif domain == 'd__Archaea':
                    fout_ar.write('{}\n'.format(genome.ncbi_accn))
                else:
                    # representative is reported and left out of both lists
                    self.logger.error(
                        'GTDB representative has unassigned domain: {}'.format(rid))