def run(self, manual_taxonomy, cur_gtdb_metadata_file, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            gtdb_type_strains_ledger, sp_priority_ledger,
            genus_priority_ledger, ncbi_env_bioproject_ledger, lpsn_gss_file):
        """Report type species genomes whose GTDB genus does not have priority.

        Reads the manually curated taxonomy, loads the current genome set and,
        for every curated genome flagged as a GTDB type species whose GTDB
        genus differs from its NCBI genus, asks the species priority manager
        which genus has priority. Each genome whose GTDB genus is not the
        priority genus is written as one row of
        ``type_species_incongruencies.tsv`` in ``self.output_dir``.

        Args:
            manual_taxonomy: Taxonomy file read with ``Taxonomy().read`` using
                canonical genome IDs.
            cur_gtdb_metadata_file: Metadata file used to build the current
                genome set.
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            ncbi_env_bioproject_ledger: Forwarded unchanged to
                ``Genomes.load_from_metadata_file``.
            sp_priority_ledger, genus_priority_ledger, lpsn_gss_file:
                Forwarded unchanged to ``SpeciesPriorityManager``.
        """

        # initialize species priority manager
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  lpsn_gss_file,
                                                  self.output_dir)

        # read the genus/species assignments produced by manual curation
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_taxonomy)))

        # create current GTDB genome set
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            f' - current genome set contains {len(cur_genomes):,} genomes.')

        # report type species genomes where the GTDB genus is not the genus
        # with naming priority
        self.logger.info(
            'Identifying type species genomes with incongruent GTDB genus assignments.'
        )
        out_file = os.path.join(self.output_dir,
                                'type_species_incongruencies.tsv')
        num_incongruent = 0

        # the context manager guarantees the table is flushed and closed even
        # if a genome lookup or priority query raises part way through
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tGTDB genus\tNCBI genus\tGTDB genus priority date\tNCBI genus priority date\tPriority status\tNCBI RefSeq note\n'
            )

            for rid, taxa in mc_taxonomy.items():
                genome = cur_genomes[rid]
                if not genome.is_gtdb_type_species():
                    continue

                gtdb_genus = taxa[Taxonomy.GENUS_INDEX]
                ncbi_genus = genome.ncbi_taxa.genus
                if gtdb_genus == ncbi_genus:
                    continue

                priority_genus = sp_priority_mngr.genus_priority(
                    gtdb_genus, ncbi_genus)
                if priority_genus == gtdb_genus:
                    # GTDB already uses the genus name with priority
                    continue

                num_incongruent += 1
                if priority_genus == ncbi_genus:
                    priority_status = 'NCBI genus name has priority'
                else:
                    # the manager returned neither of the two names
                    priority_status = 'Genus with priority must be manually established'

                fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    rid, gtdb_genus, ncbi_genus,
                    sp_priority_mngr.genus_priority_year(gtdb_genus),
                    sp_priority_mngr.genus_priority_year(ncbi_genus),
                    priority_status,
                    genome.excluded_from_refseq_note))

        self.logger.info(
            ' - identified {:,} genomes with incongruent genus assignments.'.
            format(num_incongruent))
Example #2
0
    def run(self, gtdb_clusters_file, prev_gtdb_metadata_file,
            cur_gtdb_metadata_file, uba_genome_paths, qc_passed_file,
            gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            sp_priority_ledger, gtdb_taxa_updates_ledger, dsmz_bacnames_file):
        """Perform initial actions required for changed representatives.

        Steps, in order:
          1. Load the previous and current GTDB genome sets.
          2. Read the GTDB species clusters from ``gtdb_clusters_file``.
          3. Copy the previous GTDB taxonomy onto genomes present in both
             sets, unless the domain assignment differs.
          4. Apply the explicit taxon renames parsed from
             ``gtdb_taxa_updates_ledger``.
          5. Create ``self.sp_priority_mngr`` and call ``new_ncbi_genera``
             and ``new_ncbi_families``.
        """

        log = self.logger

        # keyword arguments common to loading both genome sets
        shared_kwargs = dict(
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)

        # previous GTDB genome set
        log.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file,
                                             **shared_kwargs)
        num_prev_clusters = len(prev_genomes.sp_clusters)
        num_prev_genomes = prev_genomes.sp_clusters.total_num_genomes()
        log.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(num_prev_clusters, num_prev_genomes))

        # current GTDB genome set
        log.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            gtdbtk_classify_file=gtdbtk_classify_file,
            **shared_kwargs)
        log.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # named GTDB species clusters; each cluster contributes the genomes
        # in its value plus one for its key
        log.info('Reading GTDB species clusters.')
        cur_clusters, _ = read_clusters(gtdb_clusters_file)
        num_clustered = sum(len(gids) + 1 for gids in cur_clusters.values())
        log.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters), num_clustered))

        # Give genomes in the current set the GTDB assignments they had in
        # the previous release. Accession numbers can change between
        # releases, so the previous taxonomy is not necessarily reflected in
        # the latest database. A genome that has changed domain is skipped
        # because its previous assignment is no longer valid.
        log.info('Setting GTDB taxonomy of genomes in current genome set.')
        update_count = 0
        conflicting_domain_count = 0
        for prev_gid in prev_genomes:
            if prev_gid not in cur_genomes:
                continue

            prev_taxa = prev_genomes[prev_gid].gtdb_taxa
            cur_taxa = cur_genomes[prev_gid].gtdb_taxa
            if prev_taxa != cur_taxa:
                if prev_taxa.domain == cur_taxa.domain:
                    update_count += 1
                    cur_taxa.update_taxa(prev_taxa)
                else:
                    conflicting_domain_count += 1

        log.info(f' ... updated {update_count:,} genomes.')
        log.info(
            f' ... identified {conflicting_domain_count:,} genomes with conflicting domain assignments.'
        )

        # explicit renames of previous GTDB taxa
        log.info('Reading explicit taxa updates.')
        explicit_taxon_updates = self._parse_explicit_taxa_updates(
            gtdb_taxa_updates_ledger)
        log.info(f' ... identified {len(explicit_taxon_updates):,} updates.')

        log.info('Updating current genomes to reflect explicit taxa updates.')
        update_count = 0
        for cur_taxon, new_taxon in explicit_taxon_updates.items():
            rank_prefix = cur_taxon[:3]
            rank_index = Taxonomy.rank_prefixes.index(rank_prefix)
            is_genus_rename = rank_prefix == 'g__'

            for gid in cur_genomes:
                taxa = cur_genomes[gid].gtdb_taxa
                if taxa.get_taxa(rank_index) == cur_taxon:
                    update_count += 1
                    taxa.set_taxa(rank_index, new_taxon)

                    if is_genus_rename:
                        # a renamed genus must also be reflected in the
                        # species name, stored at the next rank index
                        renamed_sp = taxa.species.replace(
                            cur_taxon[3:], new_taxon[3:])
                        taxa.set_taxa(rank_index + 1, renamed_sp)

        log.info(f' ... updated {update_count:,} genomes.')

        # initialize species priority manager
        self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                       dsmz_bacnames_file)

        # tabulate new NCBI genera and families that likely need to be
        # incorporated into this release of the GTDB
        self.new_ncbi_genera(prev_genomes, cur_genomes, cur_clusters,
                             gtdbtk_classify_file)
        self.new_ncbi_families(prev_genomes, cur_genomes, cur_clusters,
                               gtdbtk_classify_file)
Example #3
0
    def write_synonym_table(self,
                            type_strain_synonyms,
                            consensus_synonyms,
                            ani_af,
                            sp_priority_ledger,
                            genus_priority_ledger,
                            dsmz_bacnames_file):
        """Create table indicating species names that should be considered synonyms.

        Writes ``synonyms.tsv`` to ``self.output_dir`` with one row per
        (representative, synonym) genome pair. Rows from
        ``type_strain_synonyms`` are labelled ``TYPE_STRAIN_SYNONYM`` and rows
        from ``consensus_synonyms`` are labelled ``CONSENSUS_SYNONYM``. Genome
        details are looked up in ``self.cur_genomes``.

        Args:
            type_strain_synonyms: Mapping from representative genome ID to an
                iterable of synonym genome IDs.
            consensus_synonyms: Mapping with the same layout as
                ``type_strain_synonyms``.
            ani_af: Passed to ``symmetric_ani`` to obtain the ANI and AF of
                each pair.
            sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file:
                Forwarded unchanged to ``SpeciesPriorityManager``.
        """

        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  dsmz_bacnames_file)

        def genome_columns(gid, priority_year):
            """Return the eight tab-prefixed columns describing a genome."""
            genome = self.cur_genomes[gid]
            type_sources = ','.join(sorted(genome.gtdb_type_sources()))
            type_sources = type_sources.upper().replace('STRAININFO',
                                                        'StrainInfo')
            return '\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                genome.ncbi_taxa.species,
                gid,
                ','.join(sorted(genome.strain_ids())),
                type_sources,
                priority_year,
                genome.is_gtdb_type_species(),
                genome.is_gtdb_type_strain(),
                genome.ncbi_type_material)

        header = (
            'Synonym type\tNCBI species\tGTDB representative\tStrain IDs\tType sources\tPriority year'
            '\tGTDB type species\tGTDB type strain\tNCBI assembly type'
            '\tNCBI synonym\tHighest-quality synonym genome\tSynonym strain IDs\tSynonym type sources\tSynonym priority year'
            '\tSynonym GTDB type species\tSynonym GTDB type strain\tSynonym NCBI assembly type'
            '\tANI\tAF\tWarnings\n')

        incorrect_priority = 0
        failed_type_strain_priority = 0
        out_file = os.path.join(self.output_dir, 'synonyms.tsv')

        # the context manager ensures the table is flushed and closed; the
        # handle was previously never closed
        with open(out_file, 'w') as fout:
            fout.write(header)

            for synonyms, synonym_type in [
                    (type_strain_synonyms, 'TYPE_STRAIN_SYNONYM'),
                    (consensus_synonyms, 'CONSENSUS_SYNONYM')]:
                for rid, synonym_ids in synonyms.items():
                    for gid in synonym_ids:
                        ani, af = symmetric_ani(ani_af, rid, gid)

                        row = [synonym_type]
                        row.append(genome_columns(
                            rid,
                            sp_priority_mngr.species_priority_year(
                                self.cur_genomes, rid)))

                        # only the synonym's priority year is reported as
                        # 'n/a' when the genome has no priority year
                        synonym_priority_year = sp_priority_mngr.species_priority_year(
                            self.cur_genomes, gid)
                        if synonym_priority_year == Genome.NO_PRIORITY_YEAR:
                            synonym_priority_year = 'n/a'
                        row.append(genome_columns(gid, synonym_priority_year))

                        row.append('\t{:.3f}\t{:.4f}'.format(ani, af))

                        rep = self.cur_genomes[rid]
                        syn = self.cur_genomes[gid]
                        if rep.is_effective_type_strain() and syn.is_effective_type_strain():
                            # both are effective type strains, so the
                            # representative should be the one with priority
                            priority_gid, note = sp_priority_mngr.species_priority(
                                self.cur_genomes, rid, gid)
                            if priority_gid != rid:
                                incorrect_priority += 1
                                row.append(
                                    '\tIncorrect priority: {}'.format(note))
                        elif not rep.is_gtdb_type_strain() and syn.is_gtdb_type_strain():
                            failed_type_strain_priority += 1
                            row.append(
                                '\tFailed to prioritize type strain of species')

                        row.append('\n')
                        fout.write(''.join(row))

        if incorrect_priority:
            self.logger.warning(f' - identified {incorrect_priority:,} synonyms with incorrect priority.')

        if failed_type_strain_priority:
            self.logger.warning(f' - identified {failed_type_strain_priority:,} synonyms that failed to priotize the type strain of the species.')
    def run(self, manual_taxonomy, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger,
            sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file):
        """Report type strain genomes with incongruent GTDB species names.

        Reads the manually curated taxonomy, loads the current genome set and
        writes ``type_strains_incongruencies.tsv`` to ``self.output_dir``.
        A row is written for each curated genome that is an effective type
        strain and whose GTDB specific epithet fails ``test_same_epithet``
        against its NCBI specific epithet.

        Args:
            manual_taxonomy: Taxonomy file read with ``Taxonomy().read`` using
                canonical genome IDs.
            cur_gtdb_metadata_file: Metadata file used to build the current
                genome set.
            uba_genome_paths, qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger: Forwarded
                unchanged to ``Genomes.load_from_metadata_file``.
            synonym_file: Not used by this method.
            sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file:
                Forwarded unchanged to ``SpeciesPriorityManager``.
        """

        # initialize species priority manager (constructed as before, though
        # not queried in this method)
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  dsmz_bacnames_file)

        # read the species assignments produced by manual curation
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_taxonomy)))

        # create current GTDB genome set
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get all GTDB species represented by a type strain
        gtdb_type_species = {
            taxa[Taxonomy.SPECIES_INDEX]
            for rid, taxa in mc_taxonomy.items()
            if cur_genomes[rid].is_effective_type_strain()
        }

        # report type strain genomes whose GTDB and NCBI specific epithets
        # disagree
        self.logger.info(
            'Identifying type strain genomes with incongruent GTDB species assignments.'
        )
        out_file = os.path.join(self.output_dir,
                                'type_strains_incongruencies.tsv')
        num_incongruent = 0

        # the context manager guarantees the table is closed even if a
        # lookup or name helper raises part way through
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tGTDB species\tNCBI species\tGTDB type strain\tNCBI type strain\tNCBI RefSeq note\n'
            )

            for rid, taxa in mc_taxonomy.items():
                genome = cur_genomes[rid]
                if not genome.is_effective_type_strain():
                    continue

                gtdb_sp = taxa[Taxonomy.SPECIES_INDEX]
                gtdb_generic = generic_name(gtdb_sp)

                ncbi_sp = genome.ncbi_taxa.species
                ncbi_generic = generic_name(ncbi_sp)

                if ncbi_sp == 's__':
                    # NCBI taxonomy is sometimes behind the genome annotation
                    # pages, and does not have a species assignment even for
                    # a type strain genome
                    continue

                # check if genome is a valid genus transfer into a genus
                # that already contains a species with the specific
                # name which results in a polyphyletic suffix being required
                # e.g. G002240355 is Prauserella marina at NCBI and is
                # transferred into Saccharomonospora under the GTDB. However,
                # Saccharomonospora marina already exists so this genome
                # needs to be S. marina_A.
                if (is_placeholder_taxon(gtdb_sp)
                        and gtdb_generic != ncbi_generic
                        and canonical_species(gtdb_sp) in gtdb_type_species):
                    continue

                if test_same_epithet(specific_epithet(gtdb_sp),
                                     specific_epithet(ncbi_sp)):
                    continue

                num_incongruent += 1
                fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    rid, gtdb_sp, ncbi_sp,
                    genome.is_gtdb_type_strain(),
                    genome.is_ncbi_type_strain(),
                    genome.excluded_from_refseq_note))

        self.logger.info(
            ' - identified {:,} genomes with incongruent species assignments.'.
            format(num_incongruent))
    def write_synonym_table(self, synonyms, cur_genomes, ani_af,
                            sp_priority_ledger):
        """Create table indicating species names that should be considered synonyms.

        Writes ``synonyms.tsv`` to ``self.output_dir`` with one row per
        (representative, synonym) genome pair, followed by the pair's ANI and
        AF and an optional warning.

        Args:
            synonyms: Mapping from representative genome ID to an iterable of
                synonym genome IDs.
            cur_genomes: Genome collection indexed by genome ID; provides the
                taxonomy, strain and type information written for each genome.
            ani_af: Passed to ``symmetric_ani`` to obtain the ANI and AF of
                each pair.
            sp_priority_ledger: Forwarded unchanged to
                ``SpeciesPriorityManager``.
        """

        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger)

        def genome_columns(gid):
            """Return the nine tab-separated columns describing a genome."""
            genome = cur_genomes[gid]
            type_sources = ','.join(sorted(genome.gtdb_type_sources()))
            type_sources = type_sources.upper().replace('STRAININFO',
                                                        'StrainInfo')
            return '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                genome.ncbi_taxa.species,
                genome.gtdb_taxa.species,
                gid,
                ','.join(sorted(genome.strain_ids())),
                type_sources,
                genome.year_of_priority(),
                genome.is_gtdb_type_species(),
                genome.is_gtdb_type_strain(),
                genome.ncbi_type_material)

        header = (
            'NCBI species\tGTDB species\tRepresentative\tStrain IDs\tRepresentative type sources\tPriority year\tGTDB type species\tGTDB type strain\tNCBI assembly type'
            '\tNCBI synonym\tGTDB synonym\tSynonym genome\tSynonym strain IDs\tSynonym type sources\tPriority year\tGTDB type species\tGTDB type strain\tSynonym NCBI assembly type'
            '\tANI\tAF\tWarnings\n')

        incorrect_priority = 0
        failed_type_strain_priority = 0
        out_file = os.path.join(self.output_dir, 'synonyms.tsv')

        # the context manager ensures the table is flushed and closed; the
        # handle was previously never closed
        with open(out_file, 'w') as fout:
            fout.write(header)

            for rid, synonym_ids in synonyms.items():
                for gid in synonym_ids:
                    ani, af = symmetric_ani(ani_af, rid, gid)

                    row = [
                        genome_columns(rid),
                        '\t',
                        genome_columns(gid),
                        '\t{:.3f}\t{:.4f}'.format(ani, af),
                    ]

                    rep_is_type = cur_genomes[rid].is_gtdb_type_strain()
                    syn_is_type = cur_genomes[gid].is_gtdb_type_strain()
                    if rep_is_type and syn_is_type:
                        # both are type strains, so the representative
                        # should be the one with naming priority
                        priority_gid, note = sp_priority_mngr.priority(
                            cur_genomes, rid, gid)
                        if priority_gid != rid:
                            incorrect_priority += 1
                            row.append(
                                '\tIncorrect priority: {}'.format(note))
                    elif not rep_is_type and syn_is_type:
                        failed_type_strain_priority += 1
                        row.append(
                            '\tFailed to prioritize type strain of species')

                    row.append('\n')
                    fout.write(''.join(row))

        if incorrect_priority:
            self.logger.warning(
                f'Identified {incorrect_priority:,} synonyms with incorrect priority.'
            )

        if failed_type_strain_priority:
            self.logger.warning(
                f'Identified {failed_type_strain_priority:,} synonyms that failed to priotize the type strain of the species.'
            )
    def run(self, rep_change_summary_file, prev_gtdb_metadata_file,
            prev_genomic_path_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, genomes_new_updated_file,
            qc_passed_file, gtdbtk_classify_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, gtdb_type_strains_ledger,
            sp_priority_ledger):
        """Perform initial actions required for changed representatives.

        Steps, in order:
          1. Load the previous and current GTDB genome sets and the paths to
             their genomic FASTA files.
          2. Build expanded species clusters from the previous clusters and
             the new/updated genomes.
          3. Run the ``action_*`` methods, applying each improved
             representative through ``self.update_rep``.
          4. Log how many species were retired or given a new representative,
             based on ``self.new_reps``.
          5. Write ``improved_reps.pkl``, ``updated_species_reps.tsv`` and
             ``updated_sp_clusters.tsv`` to ``self.output_dir``.

        ``self.new_reps`` is read as a mapping from previous representative
        ID to a ``(new_rid, action)`` pair, where ``new_rid`` is ``None`` for
        a retired species.
        """

        # create previous and current GTDB genome sets
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(
            prev_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            uba_genome_file=uba_genome_paths,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            ' ... previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_genomes.sp_clusters),
                    prev_genomes.sp_clusters.total_num_genomes()))

        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to previous and current genomic FASTA files
        self.logger.info(
            'Reading path to previous and current genomic FASTA files.')
        prev_genomes.load_genomic_file_paths(prev_genomic_path_file)
        prev_genomes.load_genomic_file_paths(uba_genome_paths)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # create expanded previous GTDB species clusters
        new_updated_sp_clusters = SpeciesClusters()

        self.logger.info(
            'Creating species clusters of new and updated genomes based on GTDB-Tk classifications.'
        )
        new_updated_sp_clusters.create_expanded_clusters(
            prev_genomes.sp_clusters, genomes_new_updated_file, qc_passed_file,
            gtdbtk_classify_file)

        self.logger.info(
            'Identified {:,} expanded species clusters spanning {:,} genomes.'.
            format(len(new_updated_sp_clusters),
                   new_updated_sp_clusters.total_num_genomes()))

        # initialize species priority manager
        self.sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger)

        # take required action for each changed representative
        self.action_genomic_lost(rep_change_summary_file, prev_genomes,
                                 cur_genomes, new_updated_sp_clusters)

        self.action_genomic_update(rep_change_summary_file, prev_genomes,
                                   cur_genomes, new_updated_sp_clusters)

        self.action_type_strain_lost(rep_change_summary_file, prev_genomes,
                                     cur_genomes, new_updated_sp_clusters)

        self.action_domain_change(rep_change_summary_file, prev_genomes,
                                  cur_genomes)

        # Debugging switch: set to True to reuse the improved_reps.pkl
        # written by an earlier run instead of recomputing it.
        use_cached_improved_reps = False
        improved_reps_file = os.path.join(self.output_dir,
                                          'improved_reps.pkl')
        if not use_cached_improved_reps:
            improved_reps = self.action_improved_rep(prev_genomes, cur_genomes,
                                                     new_updated_sp_clusters)

            # close the cache file deterministically instead of relying on
            # garbage collection of an anonymous handle
            with open(improved_reps_file, 'wb') as f:
                pickle.dump(improved_reps, f)
        else:
            self.logger.warning(
                'Reading improved_reps for pre-cached file. Generally used only for debugging.'
            )
            # NOTE: pickle can execute arbitrary code on load; only read a
            # cache file produced by this pipeline.
            with open(improved_reps_file, 'rb') as f:
                improved_reps = pickle.load(f)

        for prev_rid, (new_rid, action) in improved_reps.items():
            self.update_rep(prev_rid, new_rid, action)

        self.action_naming_priority(prev_genomes, cur_genomes,
                                    new_updated_sp_clusters)

        # report basic statistics
        num_retired_sp = sum(
            1 for v in self.new_reps.values() if v[0] is None)
        num_replaced_rids = sum(
            1 for v in self.new_reps.values() if v[0] is not None)
        self.logger.info(f'Identified {num_retired_sp:,} retired species.')
        self.logger.info(
            f'Identified {num_replaced_rids:,} species with a modified representative genome.'
        )

        self.action_log.close()

        # write out representatives for existing species clusters
        reps_file = os.path.join(self.output_dir, 'updated_species_reps.tsv')
        with open(reps_file, 'w') as fout:
            fout.write(
                'Previous representative ID\tNew representative ID\tAction\tRepresentative status\n'
            )
            for rid in prev_genomes.sp_clusters:
                if rid not in self.new_reps:
                    fout.write(f'{rid}\t{rid}\tNONE\tUNCHANGED\n')
                    continue

                new_rid, action = self.new_reps[rid]
                status = 'REPLACED' if new_rid is not None else 'LOST'
                fout.write(f'{rid}\t{new_rid}\t{action}\t{status}\n')

        # write out updated species clusters
        out_file = os.path.join(self.output_dir, 'updated_sp_clusters.tsv')
        self.write_updated_clusters(prev_genomes, cur_genomes, self.new_reps,
                                    new_updated_sp_clusters, out_file)