コード例 #1
0
    def __init__(self, ani_ncbi_erroneous, ani_cache_file, cpus, output_dir):
        """Record run settings and create the FastANI helper.

        ani_ncbi_erroneous -- stored unchanged on the instance; its meaning
                              is not visible here (presumably an ANI
                              threshold -- TODO confirm against callers)
        ani_cache_file     -- forwarded to the FastANI constructor
        cpus               -- forwarded to the FastANI constructor
        output_dir         -- stored unchanged on the instance
        """

        # logger registered under the name 'timestamp'
        self.logger = logging.getLogger('timestamp')

        self.ani_ncbi_erroneous = ani_ncbi_erroneous
        self.output_dir = output_dir

        self.fastani = FastANI(ani_cache_file, cpus)
コード例 #2
0
    def nonrep_radius(self, unclustered_gids, rep_gids, ani_af_rep_vs_nonrep):
        """Calculate circumscription radius for unclustered, nontype genomes.

        Every genome in `unclustered_gids` starts with a radius of
        `self.ani_sp` and no neighbour. The radius is then raised to the
        highest symmetric ANI observed against any genome in `rep_gids`,
        provided that ANI exceeds the current radius and the alignment
        fraction is at least `self.af_sp`.

        unclustered_gids     -- iterable of genome IDs to assign a radius to
        rep_gids             -- iterable of representative genome IDs
        ani_af_rep_vs_nonrep -- path to a pickle file; the loaded object is
                                indexed as ani_af[nonrep_gid][rep_gid] and
                                passed to FastANI.symmetric_ani

        Returns a dict mapping each unclustered genome ID to a GenomeRadius.
        An empty dict is returned (and nothing is loaded) when there are no
        unclustered genomes.
        """

        # set radius for genomes to default values
        radius_by_gid = {gid: GenomeRadius(ani=self.ani_sp,
                                           af=None,
                                           neighbour_gid=None)
                         for gid in unclustered_gids}

        if not radius_by_gid:
            # nothing to restrict; min()/max() below would fail on empty input
            self.logger.warning(
                'No unclustered genomes; ANI circumscription radius not calculated.')
            return radius_by_gid

        # NOTE: pickle can execute arbitrary code on load; this file must
        # come from a trusted, earlier step of the same workflow.
        with open(ani_af_rep_vs_nonrep, 'rb') as f:
            ani_af = pickle.load(f)

        # determine closest type ANI neighbour and restrict ANI radius as necessary
        for nonrep_gid in unclustered_gids:
            if nonrep_gid not in ani_af:
                continue

            for rep_gid in rep_gids:
                if rep_gid not in ani_af[nonrep_gid]:
                    continue

                ani, af = FastANI.symmetric_ani(ani_af, nonrep_gid, rep_gid)

                if ani > radius_by_gid[nonrep_gid].ani and af >= self.af_sp:
                    radius_by_gid[nonrep_gid] = GenomeRadius(ani=ani,
                                                             af=af,
                                                             neighbour_gid=rep_gid)

        # collect the radii once rather than rebuilding the list per statistic
        ani_values = [d.ani for d in radius_by_gid.values()]
        self.logger.info('ANI circumscription radius: min={:.2f}, mean={:.2f}, max={:.2f}'.format(
            min(ani_values),
            np_mean(ani_values),
            max(ani_values)))

        return radius_by_gid
コード例 #3
0
    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Check external tools and record clustering settings.

        ani_sp         -- stored unchanged as self.ani_sp
        af_sp          -- stored unchanged as self.af_sp
        ani_cache_file -- forwarded to the FastANI constructor
        cpus           -- stored on the instance and forwarded to FastANI
        output_dir     -- stored unchanged as self.output_dir
        """

        # runs first, before any attribute is set
        check_dependencies(['fastANI', 'mash'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus

        # species criteria supplied by the caller
        self.af_sp = af_sp
        self.ani_sp = ani_sp

        # fixed settings
        self.min_mash_ani = 90.0
        self.true_str = ['t', 'T', 'true', 'True']

        self.fastani = FastANI(ani_cache_file, cpus)
コード例 #4
0
    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Check external tools and record clustering settings.

        ani_sp         -- stored unchanged as self.ani_sp
        af_sp          -- stored unchanged as self.af_sp
        ani_cache_file -- forwarded to the FastANI constructor
        cpus           -- stored on the instance and forwarded to FastANI
        output_dir     -- stored unchanged as self.output_dir
        """

        # runs first, before any attribute is set
        check_dependencies(['fastANI', 'mash'])

        self.logger = logging.getLogger('timestamp')

        self.output_dir = output_dir
        self.cpus = cpus

        # species criteria supplied by the caller
        self.af_sp = af_sp
        self.ani_sp = ani_sp

        # fixed thresholds
        self.min_mash_ani = 90.0
        self.max_ani_neighbour = 97.0

        # record type with fields ani, af and gid
        self.ClusteredGenome = namedtuple('ClusteredGenome', 'ani af gid')

        self.fastani = FastANI(ani_cache_file, cpus)
コード例 #5
0
    def _cluster(self, ani_af, non_reps, rep_radius):
        """Assign non-representative genomes to their closest representative.

        For each genome in `non_reps`, the representative with the highest
        symmetric ANI (ties broken by higher AF) among those with an AF of
        at least `self.af_sp` is selected. The genome joins that cluster
        only when the ANI is strictly greater than the representative's
        ANI radius. Progress is written to stdout.

        ani_af     -- nested mapping indexed as ani_af[non_rep][rep] and
                      passed to FastANI.symmetric_ani
        non_reps   -- sized collection of non-representative genome IDs
        rep_radius -- mapping of representative ID to an object with an
                      `ani` attribute

        Returns a dict mapping every representative ID to a list of
        ClusteredGenome records (possibly empty).
        """

        progress_fmt = '==> Processed {:,} of {:,} genomes [no. clustered = {:,}].\r'
        total = len(non_reps)

        clusters = {rep_id: [] for rep_id in rep_radius}

        num_clustered = 0
        for idx, query_gid in enumerate(non_reps):
            if idx % 100 == 0:
                sys.stdout.write(progress_fmt.format(idx+1, total, num_clustered))
                sys.stdout.flush()

            if query_gid not in ani_af:
                continue

            # best representative seen so far for this genome
            best_rid, best_ani, best_af = None, 0, 0
            for rid in rep_radius:
                if rid not in ani_af[query_gid]:
                    continue

                ani, af = FastANI.symmetric_ani(ani_af, rid, query_gid)

                is_better = ani > best_ani or (ani == best_ani and af > best_af)
                if af >= self.af_sp and is_better:
                    best_rid, best_ani, best_af = rid, ani, af

            # only cluster when strictly inside the representative's radius
            if best_rid and best_ani > rep_radius[best_rid].ani:
                num_clustered += 1
                member = self.ClusteredGenome(gid=query_gid,
                                              ani=best_ani,
                                              af=best_af)
                clusters[best_rid].append(member)

        # final progress line, then move off the carriage-return line
        sys.stdout.write(progress_fmt.format(total, total, num_clustered))
        sys.stdout.flush()
        sys.stdout.write('\n')

        num_assigned = sum(len(members) for members in clusters.values())
        self.logger.info('Assigned {:,} genomes to {:,} representatives; {:,} genomes remain unclustered.'.format(
            num_assigned,
            len(clusters),
            total - num_clustered))

        return clusters
コード例 #6
0
 def __init__(self, ani_cache_file, cpus, output_dir):
     """Initialization.

     Records LTP file names and filtering thresholds, creates the FastANI
     helper, and ensures the 'ani_pickles' directory exists under
     `output_dir`.

     ani_cache_file -- forwarded to the FastANI constructor
     cpus           -- stored on the instance and forwarded to FastANI
     output_dir     -- stored on the instance; parent of 'ani_pickles'

     Raises FileExistsError if 'ani_pickles' exists but is not a directory.
     """

     self.ltp_dir = 'rna_ltp_132'
     self.ltp_results_file = 'ssu.taxonomy.tsv'
     self.LTP_METADATA = namedtuple('LTP_METADATA', 'taxonomy taxa species ssu_len evalue bitscore aln_len perc_iden perc_aln')

     # thresholds applied to LTP hits; 'pi'/'pa' presumably correspond to
     # the perc_iden/perc_aln fields above -- TODO confirm at point of use
     self.ltp_pi_threshold = 99.0
     self.ltp_pa_threshold = 90.0
     self.ltp_ssu_len_threshold = 900
     self.ltp_evalue_threshold = 1e-10

     self.output_dir = output_dir
     self.logger = logging.getLogger('timestamp')
     self.cpus = cpus

     self.fastani = FastANI(ani_cache_file, cpus)

     self.ani_pickle_dir = os.path.join(self.output_dir, 'ani_pickles')
     # exist_ok avoids the check-then-create race when several processes
     # start with the same output directory
     os.makedirs(self.ani_pickle_dir, exist_ok=True)
コード例 #7
0
    def __init__(self, ani_cache_file, cpus, output_dir):
        """Set up thresholds, the FastANI helper and the action log.

        Opens 'action_log.tsv' in `output_dir` for writing and writes its
        header row. The file handle is kept on `self.action_log` and is
        not closed by this method.

        ani_cache_file -- forwarded to the FastANI constructor
        cpus           -- forwarded to the FastANI constructor
        output_dir     -- stored on the instance; location of the action log
        """

        self.logger = logging.getLogger('timestamp')
        self.output_dir = output_dir

        self.fastani = FastANI(ani_cache_file, cpus)

        # action parameters
        self.genomic_update_af = 0.80
        self.genomic_update_ani = 99.0

        self.new_rep_af = 0.80
        self.new_rep_ani = 99.0

        # increase in score required to select a new representative; the
        # original note says "ANI score" while the name suggests a quality
        # score -- TODO confirm at point of use
        self.new_rep_qs_threshold = 10

        action_log_path = os.path.join(self.output_dir, 'action_log.tsv')
        self.action_log = open(action_log_path, 'w')
        header_row = 'Genome ID\tPrevious GTDB species\tAction\tParameters\n'
        self.action_log.write(header_row)

        self.new_reps = {}
コード例 #8
0
class UpdateClusterDeNovo(object):
    """Infer de novo species clusters and representatives for remaining genomes.

    The workflow in `run` reads the named species clusters, works out an
    ANI radius for every genome not yet clustered, greedily picks de novo
    representatives among them, and finally assigns all non-representative
    genomes to the closest representative.
    """

    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Initialization.

        ani_sp         -- default ANI radius given to unclustered genomes
        af_sp          -- minimum alignment fraction for an ANI hit to count
        ani_cache_file -- forwarded to the FastANI constructor
        cpus           -- passed to Mash and FastANI
        output_dir     -- directory where intermediate and result files
                          are written
        """

        check_dependencies(['fastANI', 'mash'])

        self.cpus = cpus
        self.output_dir = output_dir

        self.logger = logging.getLogger('timestamp')

        self.true_str = ['t', 'T', 'true', 'True']

        self.ani_sp = ani_sp
        self.af_sp = af_sp

        # Mash ANI a pair must reach before FastANI is run on it
        self.min_mash_ani = 90.0

        self.fastani = FastANI(ani_cache_file, cpus)

    def _parse_named_clusters(self, named_cluster_file):
        """Parse named GTDB species clusters.

        Reads a tab-separated file whose header row must contain the
        columns 'Representative', 'No. clustered genomes',
        'Clustered genomes', 'Closest representative', 'ANI radius' and
        'AF closest'.

        Returns a tuple (rep_gids, rep_clustered_gids, rep_radius):
        the set of representative IDs, the set of genome IDs listed as
        clustered to any representative, and a dict mapping each
        representative ID to its GenomeRadius.
        """

        rep_gids = set()
        rep_clustered_gids = set()
        rep_radius = {}
        with open(named_cluster_file) as f:
            headers = f.readline().strip().split('\t')

            rep_index = headers.index('Representative')
            num_clustered_index = headers.index('No. clustered genomes')
            clustered_genomes_index = headers.index('Clustered genomes')
            closest_type_index = headers.index('Closest representative')
            ani_radius_index = headers.index('ANI radius')
            af_index = headers.index('AF closest')

            for line in f:
                line_split = line.strip().split('\t')

                rep_gid = line_split[rep_index]
                rep_gids.add(rep_gid)

                # the genome list column is only read when the count says
                # it holds at least one genome
                num_clustered = int(line_split[num_clustered_index])
                if num_clustered > 0:
                    rep_clustered_gids.update(
                        g.strip() for g in line_split[clustered_genomes_index].split(','))

                rep_radius[rep_gid] = GenomeRadius(ani=float(line_split[ani_radius_index]),
                                                   af=float(line_split[af_index]),
                                                   neighbour_gid=line_split[closest_type_index])

        return rep_gids, rep_clustered_gids, rep_radius

    def _nonrep_radius(self, unclustered_gids, rep_gids, ani_af_rep_vs_nonrep):
        """Calculate circumscription radius for unclustered, nontype genomes.

        Every genome in `unclustered_gids` starts with a radius of
        `self.ani_sp` and no neighbour. The radius is raised to the highest
        symmetric ANI seen against any genome in `rep_gids`, provided the
        ANI exceeds the current radius and the AF is at least `self.af_sp`.

        ani_af_rep_vs_nonrep is the path to a pickle file; the loaded
        object is indexed as ani_af[nonrep_gid][rep_gid].

        Returns a dict mapping genome ID to GenomeRadius. An empty dict is
        returned (and nothing is loaded) when there are no unclustered
        genomes.
        """

        # set radius for genomes to default values
        nonrep_radius = {gid: GenomeRadius(ani=self.ani_sp,
                                           af=None,
                                           neighbour_gid=None)
                         for gid in unclustered_gids}

        if not nonrep_radius:
            # nothing to restrict; min()/max() below would fail on empty input
            self.logger.warning(
                'No unclustered genomes; ANI circumscription radius not calculated.')
            return nonrep_radius

        # NOTE: pickle can execute arbitrary code on load; this file must
        # come from a trusted, earlier step of the same workflow.
        with open(ani_af_rep_vs_nonrep, 'rb') as f:
            ani_af = pickle.load(f)

        # determine closest type ANI neighbour and restrict ANI radius as necessary
        for nonrep_gid in unclustered_gids:
            if nonrep_gid not in ani_af:
                continue

            for rep_gid in rep_gids:
                if rep_gid not in ani_af[nonrep_gid]:
                    continue

                ani, af = symmetric_ani(ani_af, nonrep_gid, rep_gid)

                if ani > nonrep_radius[nonrep_gid].ani and af >= self.af_sp:
                    nonrep_radius[nonrep_gid] = GenomeRadius(ani=ani,
                                                             af=af,
                                                             neighbour_gid=rep_gid)

        # collect the radii once rather than rebuilding the list per statistic
        ani_values = [d.ani for d in nonrep_radius.values()]
        self.logger.info('ANI circumscription radius: min={:.2f}, mean={:.2f}, max={:.2f}'.format(
            min(ani_values),
            np_mean(ani_values),
            max(ani_values)))

        return nonrep_radius

    def _mash_ani_unclustered(self, cur_genomes, gids):
        """Calculate pairwise Mash ANI estimates between genomes.

        Writes the Mash sketch, genome list and distance files into
        `self.output_dir`, then reads the distances back as ANI values.

        Returns the object produced by `Mash.read_ani`, which this class
        indexes as mash_ani[query_id][reference_id].
        """

        mash = Mash(self.cpus)

        # create Mash sketch for potential representative genomes
        mash_nontype_sketch_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.msh')
        genome_list_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.lst')
        mash.sketch(gids, cur_genomes.genomic_files, genome_list_file, mash_nontype_sketch_file)

        # get Mash distances; the ANI threshold is converted to a distance
        mash_dist_file = os.path.join(self.output_dir, 'gtdb_unclustered_genomes.dst')
        mash.dist_pairwise(float(100 - self.min_mash_ani)/100, mash_nontype_sketch_file, mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # report pairs above Mash threshold; each qualifying pair is
        # counted in both directions
        num_mash_pairs = 0
        for qid in mash_ani:
            for rid in mash_ani[qid]:
                if mash_ani[qid][rid] >= self.min_mash_ani:
                    n_qid = cur_genomes.user_uba_id_map.get(qid, qid)
                    n_rid = cur_genomes.user_uba_id_map.get(rid, rid)
                    if n_qid != n_rid:
                        num_mash_pairs += 2

        self.logger.info('Identified {:,} genome pairs with a Mash ANI >= {:.1f}%.'.format(
            num_mash_pairs,
            self.min_mash_ani))

        return mash_ani

    def _selected_rep_genomes(self,
                              cur_genomes,
                              nonrep_radius,
                              unclustered_qc_gids,
                              mash_ani):
        """Select de novo representatives for species clusters in a greedy fashion using species-specific ANI thresholds.

        Genomes are visited from highest to lowest `score_type_strain()`.
        A genome becomes a new representative unless its closest already
        selected representative (AF >= `self.af_sp`) has an ANI greater
        than that representative's radius. `nonrep_radius` is updated in
        place as closer representatives are found.

        The selection is written to 'cluster_reps.tsv' in
        `self.output_dir`; if that file already exists it is read back
        instead of recomputing.

        Returns the set of representative genome IDs.
        """

        # sort genomes by quality score
        self.logger.info('Selecting de novo representatives in a greedy manner based on quality.')
        q = {gid: cur_genomes[gid].score_type_strain() for gid in unclustered_qc_gids}
        q_sorted = sorted(q.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)

        # greedily determine representatives for new species clusters
        cluster_rep_file = os.path.join(self.output_dir, 'cluster_reps.tsv')
        clusters = set()
        if not os.path.exists(cluster_rep_file):
            clustered_genomes = 0
            max_ani_pairs = 0
            for idx, (cur_gid, _score) in enumerate(q_sorted):

                # determine reference genomes to calculate ANI between
                ani_pairs = []
                if cur_gid in mash_ani:
                    for rep_gid in clusters:
                        if mash_ani[cur_gid].get(rep_gid, 0) >= self.min_mash_ani:
                            ani_pairs.append((cur_gid, rep_gid))
                            ani_pairs.append((rep_gid, cur_gid))

                # determine if genome clusters with representative
                clustered = False
                if ani_pairs:
                    if len(ani_pairs) > max_ani_pairs:
                        max_ani_pairs = len(ani_pairs)

                    ani_af = self.fastani.pairs(ani_pairs, cur_genomes.genomic_files, report_progress=False)

                    closest_rep_gid = None
                    closest_rep_ani = 0
                    closest_rep_af = 0
                    for rep_gid in clusters:
                        ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)

                        if af >= self.af_sp:
                            if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                                closest_rep_gid = rep_gid
                                closest_rep_ani = ani
                                closest_rep_af = af

                        # tighten this genome's own radius in case it
                        # ends up being a representative itself
                        if ani > nonrep_radius[cur_gid].ani and af >= self.af_sp:
                            nonrep_radius[cur_gid] = GenomeRadius(ani=ani,
                                                                  af=af,
                                                                  neighbour_gid=rep_gid)

                    if closest_rep_gid and closest_rep_ani > nonrep_radius[closest_rep_gid].ani:
                        clustered = True

                if not clustered:
                    # genome is a new species cluster representative
                    clusters.add(cur_gid)
                else:
                    clustered_genomes += 1

                if (idx+1) % 10 == 0 or idx+1 == len(q_sorted):
                    statusStr = '-> Clustered {:,} of {:,} ({:.2f}%) genomes [ANI pairs: {:,}; clustered genomes: {:,}; clusters: {:,}].'.format(
                        idx+1,
                        len(q_sorted),
                        float(idx+1)*100/len(q_sorted),
                        max_ani_pairs,
                        clustered_genomes,
                        len(clusters)).ljust(96)
                    sys.stdout.write('{}\r'.format(statusStr))
                    sys.stdout.flush()
                    # the pair count is reported per progress window
                    max_ani_pairs = 0
            sys.stdout.write('\n')

            # write out selected cluster representative
            with open(cluster_rep_file, 'w') as fout:
                for gid in clusters:
                    fout.write('{}\n'.format(gid))
        else:
            # read cluster reps from file
            self.logger.warning('Using previously determined cluster representatives.')
            with open(cluster_rep_file) as f:
                for line in f:
                    clusters.add(line.strip())

        self.logger.info('Selected {:,} representative genomes for de novo species clusters.'.format(len(clusters)))

        return clusters

    def _cluster_genomes(self,
                         cur_genomes,
                         de_novo_rep_gids,
                         named_rep_gids,
                         final_cluster_radius):
        """Cluster new representatives to representatives of named GTDB species clusters.

        Every genome in `cur_genomes.genomes` that is not a named or de
        novo representative is assigned to the representative with the
        highest symmetric ANI (ties broken by AF) among those where the
        ANI is at least the representative's radius and the AF is at
        least `self.af_sp`. Genomes that cannot be assigned are reported
        with a warning and left out of every cluster.

        The resulting clusters and the ANI/AF table are also pickled to
        'clusters.pkl' and 'ani_af_rep_vs_nonrep.de_novo.pkl' in
        `self.output_dir`.

        Returns a tuple (clusters, ani_af) where `clusters` maps each
        representative ID to a list of ClusteredGenome records.
        """

        all_reps = de_novo_rep_gids.union(named_rep_gids)
        nonrep_gids = set(cur_genomes.genomes.keys()) - all_reps
        self.logger.info('Clustering {:,} genomes to {:,} named and de novo representatives.'.format(
            len(nonrep_gids), len(all_reps)))

        # calculate MASH distance between non-representatives and representatives genomes
        mash = Mash(self.cpus)

        mash_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.msh')
        rep_genome_list_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.lst')
        mash.sketch(all_reps, cur_genomes.genomic_files, rep_genome_list_file, mash_rep_sketch_file)

        mash_none_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.msh')
        non_rep_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.lst')
        mash.sketch(nonrep_gids, cur_genomes.genomic_files, non_rep_file, mash_none_rep_sketch_file)

        # get Mash distances
        mash_dist_file = os.path.join(self.output_dir, 'gtdb_rep_vs_nonrep_genomes.dst')
        mash.dist(float(100 - self.min_mash_ani)/100,
                  mash_rep_sketch_file,
                  mash_none_rep_sketch_file,
                  mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # calculate ANI between non-representatives and representatives genomes
        clusters = {gid: [] for gid in all_reps}

        mash_ani_pairs = []
        for qid in mash_ani:
            n_qid = cur_genomes.user_uba_id_map.get(qid, qid)
            assert n_qid in nonrep_gids

            for rid in mash_ani[qid]:
                n_rid = cur_genomes.user_uba_id_map.get(rid, rid)
                assert n_rid in all_reps

                if (mash_ani[qid][rid] >= self.min_mash_ani
                        and n_qid != n_rid):
                    mash_ani_pairs.append((n_qid, n_rid))
                    mash_ani_pairs.append((n_rid, n_qid))

        self.logger.info('Calculating ANI between {:,} species clusters and {:,} unclustered genomes ({:,} pairs):'.format(
            len(clusters),
            len(nonrep_gids),
            len(mash_ani_pairs)))
        ani_af = self.fastani.pairs(mash_ani_pairs, cur_genomes.genomic_files)

        # assign genomes to closest representatives
        # that is within the representatives ANI radius
        self.logger.info('Assigning genomes to closest representative.')
        for idx, cur_gid in enumerate(nonrep_gids):
            closest_rep_gid = None
            closest_rep_ani = 0
            closest_rep_af = 0

            # nearest representative meeting the AF criterion regardless
            # of its ANI radius; used only to explain a failed assignment
            nearest_rep_gid = None
            nearest_rep_ani = 0
            nearest_rep_af = 0

            for rep_gid in clusters:
                ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)

                if ani >= final_cluster_radius[rep_gid].ani and af >= self.af_sp:
                    if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                        closest_rep_gid = rep_gid
                        closest_rep_ani = ani
                        closest_rep_af = af

                if af >= self.af_sp:
                    if ani > nearest_rep_ani or (ani == nearest_rep_ani and af > nearest_rep_af):
                        nearest_rep_gid = rep_gid
                        nearest_rep_ani = ani
                        nearest_rep_af = af

            if closest_rep_gid:
                clusters[closest_rep_gid].append(ClusteredGenome(gid=cur_gid,
                                                                 ani=closest_rep_ani,
                                                                 af=closest_rep_af))
            else:
                self.logger.warning('Failed to assign genome {} to representative.'.format(cur_gid))
                if nearest_rep_gid:
                    # a representative passed the AF test but the genome
                    # falls outside its ANI radius
                    self.logger.warning(' ...closest_rep_gid = {}'.format(nearest_rep_gid))
                    self.logger.warning(' ...closest_rep_ani = {:.2f}'.format(nearest_rep_ani))
                    self.logger.warning(' ...closest_rep_af = {:.2f}'.format(nearest_rep_af))
                    self.logger.warning(' ...closest rep radius = {:.2f}'.format(final_cluster_radius[nearest_rep_gid].ani))
                else:
                    self.logger.warning(' ...no representative with an AF >{:.2f} identified.'.format(self.af_sp))

            statusStr = '-> Assigned {:,} of {:,} ({:.2f}%) genomes.'.format(idx+1,
                                                                             len(nonrep_gids),
                                                                             float(idx+1)*100/len(nonrep_gids)).ljust(86)
            sys.stdout.write('{}\r'.format(statusStr))
            sys.stdout.flush()
        sys.stdout.write('\n')

        # keep a copy of the results on disk
        with open(os.path.join(self.output_dir, 'clusters.pkl'), 'wb') as f:
            pickle.dump(clusters, f)
        with open(os.path.join(self.output_dir, 'ani_af_rep_vs_nonrep.de_novo.pkl'), 'wb') as f:
            pickle.dump(ani_af, f)

        return clusters, ani_af

    def run(self, named_cluster_file,
            cur_gtdb_metadata_file,
            cur_genomic_path_file,
            uba_genome_paths,
            qc_passed_file,
            ncbi_genbank_assembly_file,
            untrustworthy_type_file,
            ani_af_rep_vs_nonrep,
            gtdb_type_strains_ledger):
        """Infer de novo species clusters and representatives for remaining genomes.

        Loads the current genome set and the named species clusters,
        selects de novo representatives among the genomes not yet
        clustered, assigns every non-representative genome to a
        representative, and writes 'gtdb_clusters_de_novo.tsv' and
        'gtdb_ani_radius_de_novo.tsv' to `self.output_dir`.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                            create_sp_clusters=False,
                                            uba_genome_file=uba_genome_paths,
                                            qc_passed_file=qc_passed_file,
                                            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # determine representatives and genomes clustered to each representative
        self.logger.info('Reading named GTDB species clusters.')
        named_rep_gids, rep_clustered_gids, rep_radius = self._parse_named_clusters(named_cluster_file)
        self.logger.info(' ... identified {:,} representative genomes.'.format(len(named_rep_gids)))
        self.logger.info(' ... identified {:,} clustered genomes.'.format(len(rep_clustered_gids)))

        # determine genomes left to be clustered
        unclustered_gids = set(cur_genomes.genomes.keys()) - named_rep_gids - rep_clustered_gids
        self.logger.info('Identified {:,} unclustered genomes passing QC.'.format(len(unclustered_gids)))

        # establish closest representative for each unclustered genome
        self.logger.info('Determining ANI circumscription for {:,} unclustered genomes.'.format(len(unclustered_gids)))
        nonrep_radius = self._nonrep_radius(unclustered_gids, named_rep_gids, ani_af_rep_vs_nonrep)

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(cur_genomes, unclustered_gids)

        # select de novo species representatives in a greedy fashion based on genome quality
        de_novo_rep_gids = self._selected_rep_genomes(cur_genomes,
                                                      nonrep_radius,
                                                      unclustered_gids,
                                                      mash_anis)

        # cluster all non-representative genomes to representative genomes
        final_cluster_radius = rep_radius.copy()
        final_cluster_radius.update(nonrep_radius)

        final_clusters, ani_af = self._cluster_genomes(cur_genomes,
                                                       de_novo_rep_gids,
                                                       named_rep_gids,
                                                       final_cluster_radius)

        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]

        self.logger.info('Writing {:,} species clusters to file.'.format(len(final_clusters)))
        self.logger.info('Writing {:,} cluster radius information to file.'.format(len(final_cluster_radius)))

        write_clusters(final_clusters,
                       final_cluster_radius,
                       cur_genomes,
                       os.path.join(self.output_dir, 'gtdb_clusters_de_novo.tsv'))

        write_rep_radius(final_cluster_radius,
                         cur_genomes,
                         os.path.join(self.output_dir, 'gtdb_ani_radius_de_novo.tsv'))
コード例 #9
0
class ClusterNamedTypes(object):
    """Cluster genomes to selected GTDB type genomes."""

    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Initialization.

        ani_sp: default ANI circumscription radius given to each type genome.
        af_sp: minimum alignment fraction (AF) a genome pair must reach to be
            considered when setting radii or clustering.
        ani_cache_file: passed through to the FastANI wrapper.
        cpus: passed to the Mash and FastANI wrappers.
        output_dir: directory receiving all intermediate and result files.
        """

        check_dependencies(['fastANI', 'mash'])

        self.cpus = cpus
        self.output_dir = output_dir

        self.logger = logging.getLogger('timestamp')

        self.ani_sp = ani_sp
        self.af_sp = af_sp

        # type genome neighbours above this ANI are reported as errors
        self.max_ani_neighbour = 97.0

        # Mash ANI pre-filter applied before calculating ANI with FastANI
        self.min_mash_ani = 90.0

        self.ClusteredGenome = namedtuple('ClusteredGenome', 'ani af gid')

        self.fastani = FastANI(ani_cache_file, cpus)

    def _type_genome_radius(self, type_gids, type_genome_ani_file):
        """Calculate circumscription radius for type genomes.

        Each type genome starts with the default radius `self.ani_sp`. For
        every row of the ANI file, the radius of 'Type genome 1' is raised to
        the ANI with 'Type genome 2' when that ANI exceeds the current radius
        and the AF is at least `self.af_sp`.

        type_gids: collection of type genome IDs.
        type_genome_ani_file: tab-separated file with a header row containing
            the columns 'Type genome 1', 'Type genome 2', 'ANI', and 'AF'.

        Returns a dict mapping type genome ID to GenomeRadius.
        """

        # set type radius for all type genomes to default values
        type_radius = {}
        for gid in type_gids:
            type_radius[gid] = GenomeRadius(ani=self.ani_sp,
                                            af=None,
                                            neighbour_gid=None)

        # determine closest ANI neighbour and restrict ANI radius as necessary
        # NOTE(review): only the radius of 'Type genome 1' is updated per row,
        # so the file presumably lists each pair in both orientations - confirm
        with open(type_genome_ani_file) as f:
            header = f.readline().strip().split('\t')

            type_gid1_index = header.index('Type genome 1')
            type_gid2_index = header.index('Type genome 2')
            ani_index = header.index('ANI')
            af_index = header.index('AF')

            for line in f:
                line_split = line.strip().split('\t')

                type_gid1 = line_split[type_gid1_index]
                type_gid2 = line_split[type_gid2_index]

                if type_gid1 not in type_gids or type_gid2 not in type_gids:
                    continue

                ani = float(line_split[ani_index])
                af = float(line_split[af_index])

                if ani > type_radius[type_gid1].ani:
                    if af < self.af_sp:
                        # pair fails the AF criterion; only worth a warning
                        # if it would otherwise be within the species radius
                        if ani >= self.ani_sp:
                            self.logger.warning(
                                'ANI for %s and %s is >%.2f, but AF <%.2f [pair skipped].'
                                % (type_gid1, type_gid2, ani, af))
                        continue

                    if ani > self.max_ani_neighbour:
                        self.logger.error('ANI neighbour %s is >%.2f for %s.' %
                                          (type_gid2, ani, type_gid1))

                    type_radius[type_gid1] = GenomeRadius(
                        ani=ani, af=af, neighbour_gid=type_gid2)

        # min()/max() raise on an empty sequence, so only report statistics
        # when at least one type genome was provided
        if type_radius:
            radii = [d.ani for d in type_radius.values()]
            self.logger.info(
                'ANI circumscription radius: min=%.2f, mean=%.2f, max=%.2f' %
                (min(radii), np_mean(radii), max(radii)))
        else:
            self.logger.warning(
                'No type genomes provided; ANI circumscription radius not calculated.')

        return type_radius

    def _calculate_ani(self, type_gids, genome_files, ncbi_taxonomy,
                       type_genome_sketch_file):
        """Calculate ANI between type and non-type genomes.

        Mash is used to find type/non-type genome pairs with an estimated ANI
        of at least `self.min_mash_ani`; ANI and AF are then calculated for
        these pairs, in both directions, with the FastANI wrapper. The result
        is also pickled to 'ani_af_type_vs_nontype.pkl' in the output
        directory.

        type_gids: collection of type genome IDs.
        genome_files: mapping from genome ID to genome file; every key that
            is not a type genome is treated as a non-type genome.
        ncbi_taxonomy: unused; retained for interface compatibility.
        type_genome_sketch_file: existing Mash sketch of the type genomes; a
            new sketch is created when this is empty or does not exist.

        Returns the ANI/AF results produced by `self.fastani.pairs`.
        """

        mash = Mash(self.cpus)

        # create Mash sketch for type genomes
        if not type_genome_sketch_file or not os.path.exists(
                type_genome_sketch_file):
            type_genome_list_file = os.path.join(self.output_dir,
                                                 'gtdb_type_genomes.lst')
            type_genome_sketch_file = os.path.join(self.output_dir,
                                                   'gtdb_type_genomes.msh')
            mash.sketch(type_gids, genome_files, type_genome_list_file,
                        type_genome_sketch_file)

        # create Mash sketch for non-type genomes
        nontype_gids = {gid for gid in genome_files if gid not in type_gids}

        nontype_genome_list_file = os.path.join(self.output_dir,
                                                'gtdb_nontype_genomes.lst')
        nontype_genome_sketch_file = os.path.join(self.output_dir,
                                                  'gtdb_nontype_genomes.msh')
        mash.sketch(nontype_gids, genome_files, nontype_genome_list_file,
                    nontype_genome_sketch_file)

        # get Mash distances
        mash_dist_file = os.path.join(self.output_dir,
                                      'gtdb_type_vs_nontype_genomes.dst')
        mash.dist(
            float(100 - self.min_mash_ani) / 100, type_genome_sketch_file,
            nontype_genome_sketch_file, mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # get pairs above Mash threshold, in both directions
        mash_ani_pairs = []
        for qid in mash_ani:
            for rid in mash_ani[qid]:
                if mash_ani[qid][rid] >= self.min_mash_ani:
                    if qid != rid:
                        mash_ani_pairs.append((qid, rid))
                        mash_ani_pairs.append((rid, qid))

        self.logger.info(
            'Identified %d genome pairs with a Mash ANI >= %.1f%%.' %
            (len(mash_ani_pairs), self.min_mash_ani))

        # calculate ANI between pairs
        self.logger.info('Calculating ANI between %d genome pairs:' %
                         len(mash_ani_pairs))
        ani_af = self.fastani.pairs(mash_ani_pairs, genome_files)

        # keep a copy of the results on disk; the context manager ensures the
        # file is flushed and closed
        ani_af_file = os.path.join(self.output_dir,
                                   'ani_af_type_vs_nontype.pkl')
        with open(ani_af_file, 'wb') as f:
            pickle.dump(ani_af, f)

        return ani_af

    def _cluster(self, ani_af, nontype_gids, type_radius):
        """Cluster non-type genomes to type genomes using species specific ANI thresholds.

        A non-type genome is assigned to the type genome with the highest ANI
        (ties broken by highest AF) among those satisfying the AF criterion,
        provided this ANI is greater than the radius of that type genome.

        ani_af: nested mapping indexed by non-type genome ID and then type
            genome ID; pairs absent from the mapping are ignored.
        nontype_gids: sized collection of non-type genome IDs.
        type_radius: dict mapping type genome ID to GenomeRadius.

        Returns a dict mapping each type genome ID to a list of
        ClusteredGenome tuples.
        """

        clusters = {rep_id: [] for rep_id in type_radius}

        num_genomes = len(nontype_gids)
        for idx, nontype_gid in enumerate(nontype_gids):
            if idx % 100 == 0:
                sys.stdout.write('==> Processed %d of %d genomes.\r' %
                                 (idx + 1, num_genomes))
                sys.stdout.flush()

            if nontype_gid not in ani_af:
                continue

            closest_type_gid = None
            closest_ani = 0
            closest_af = 0
            for type_gid in type_radius:
                if type_gid not in ani_af[nontype_gid]:
                    continue

                ani, af = symmetric_ani(ani_af, type_gid, nontype_gid)

                if af >= self.af_sp:
                    if ani > closest_ani or (ani == closest_ani
                                             and af > closest_af):
                        closest_type_gid = type_gid
                        closest_ani = ani
                        closest_af = af

            if closest_type_gid:
                if closest_ani > type_radius[closest_type_gid].ani:
                    clusters[closest_type_gid].append(
                        self.ClusteredGenome(gid=nontype_gid,
                                             ani=closest_ani,
                                             af=closest_af))

        # report the total directly instead of the loop index, which is
        # undefined when there are no genomes to process
        sys.stdout.write('==> Processed %d of %d genomes.\r' %
                         (num_genomes, num_genomes))
        sys.stdout.flush()
        sys.stdout.write('\n')

        self.logger.info(
            'Assigned %d genomes to representatives.' %
            sum([len(clusters[type_gid]) for type_gid in clusters]))

        return clusters

    def run(self, qc_file, metadata_file, genome_path_file,
            named_type_genome_file, type_genome_ani_file, mash_sketch_file,
            species_exception_file):
        """Cluster genomes to selected GTDB type genomes.

        Writes 'gtdb_type_genome_ani_radius.tsv' and
        'gtdb_type_genome_clusters.tsv' to the output directory.

        qc_file: read with read_qc_file() to get the genomes passing QC.
        metadata_file, species_exception_file: passed to
            read_gtdb_ncbi_taxonomy().
        genome_path_file: read with read_genome_path() to get genome files.
        named_type_genome_file: tab-separated file with a header row
            containing the columns 'Type genome' and 'NCBI species'.
        type_genome_ani_file: pairwise ANI/AF table for type genomes.
        mash_sketch_file: optional existing Mash sketch of the type genomes.
        """

        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get type genomes
        type_gids = set()
        species_type_gid = {}
        with open(named_type_genome_file) as f:
            header = f.readline().strip().split('\t')
            type_gid_index = header.index('Type genome')
            sp_index = header.index('NCBI species')

            for line in f:
                line_split = line.strip().split('\t')
                type_gid = line_split[type_gid_index]
                type_gids.add(type_gid)
                species_type_gid[type_gid] = line_split[sp_index]
        self.logger.info('Identified type genomes for %d species.' %
                         len(species_type_gid))

        # calculate circumscription radius for type genomes
        self.logger.info(
            'Determining ANI species circumscription for %d type genomes.' %
            len(type_gids))
        type_radius = self._type_genome_radius(type_gids, type_genome_ani_file)
        assert (len(type_radius) == len(species_type_gid))

        write_rep_radius(
            type_radius, species_type_gid,
            os.path.join(self.output_dir, 'gtdb_type_genome_ani_radius.tsv'))

        # get path to genome FASTA files, restricted to genomes passing QC
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_files))
        for gid in set(genome_files):
            if gid not in passed_qc:
                genome_files.pop(gid)
        self.logger.info(
            'Considering %d genomes after removing unwanted User genomes.' %
            len(genome_files))
        assert (len(genome_files) == len(passed_qc))

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            metadata_file, species_exception_file)
        self.logger.info(
            'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
            % (len(ncbi_taxonomy), ncbi_update_count))

        # calculate ANI between type and non-type genomes
        self.logger.info('Calculating ANI between type and non-type genomes.')
        ani_af = self._calculate_ani(type_gids, genome_files, ncbi_taxonomy,
                                     mash_sketch_file)

        # cluster remaining genomes to type genomes
        nontype_gids = set(genome_files) - set(type_radius)
        self.logger.info(
            'Clustering %d non-type genomes to type genomes using species specific ANI radii.'
            % len(nontype_gids))
        clusters = self._cluster(ani_af, nontype_gids, type_radius)

        # write out clusters
        write_clusters(
            clusters, type_radius, species_type_gid,
            os.path.join(self.output_dir, 'gtdb_type_genome_clusters.tsv'))
コード例 #10
0
class ClusterDeNovo(object):
    """Infer de novo species clusters and type genomes for remaining genomes."""

    def __init__(self, ani_sp, af_sp, ani_cache_file, cpus, output_dir):
        """Store clustering thresholds and set up logging and the FastANI wrapper.

        ani_sp: default ANI circumscription radius for a genome.
        af_sp: minimum alignment fraction (AF) for a genome pair to be considered.
        ani_cache_file: passed through to the FastANI wrapper.
        cpus: passed to the Mash and FastANI wrappers.
        output_dir: directory receiving all intermediate and result files.
        """

        # NOTE(review): check_dependencies is defined elsewhere; presumably it
        # verifies these external programs are available - confirm
        check_dependencies(['fastANI', 'mash'])

        self.logger = logging.getLogger('timestamp')

        # run configuration
        self.output_dir = output_dir
        self.cpus = cpus

        # species circumscription criteria
        self.af_sp = af_sp
        self.ani_sp = ani_sp

        # Mash ANI pre-filter applied before calculating ANI with FastANI
        self.min_mash_ani = 90.0

        # string values interpreted as boolean true
        self.true_str = ['t', 'T', 'true', 'True']

        # record describing a genome assigned to a species cluster
        self.ClusteredGenome = namedtuple('ClusteredGenome', 'ani af gid')

        self.fastani = FastANI(ani_cache_file, cpus)
        
    def _parse_type_clusters(self, type_genome_cluster_file):
        """Parse type genomes clustering information."""
        
        type_species = set()
        species_type_gid = {}
        type_gids = set()
        type_clustered_gids = set()
        type_radius = {}
        with open(type_genome_cluster_file) as f:
            headers = f.readline().strip().split('\t')
            
            type_sp_index = headers.index('NCBI species')
            type_genome_index = headers.index('Type genome')
            num_clustered_index = headers.index('No. clustered genomes')
            clustered_genomes_index = headers.index('Clustered genomes')
            closest_type_index = headers.index('Closest type genome')
            ani_radius_index = headers.index('ANI radius')
            af_index = headers.index('AF closest')

            for line in f:
                line_split = line.strip().split('\t')
                
                type_sp = line_split[type_sp_index]
                type_species.add(type_sp)
                
                type_gid = line_split[type_genome_index]
                type_gids.add(type_gid)
                
                species_type_gid[type_gid] = type_sp
                
                num_clustered = int(line_split[num_clustered_index])
                if num_clustered > 0:
                    for gid in [g.strip() for g in line_split[clustered_genomes_index].split(',')]:
                        type_clustered_gids.add(gid)
                        
                type_radius[type_gid] = GenomeRadius(ani = float(line_split[ani_radius_index]), 
                                                     af = float(line_split[af_index]),
                                                     neighbour_gid = line_split[closest_type_index])
                        
        return type_species, species_type_gid, type_gids, type_clustered_gids, type_radius
        
    def _parse_synonyms(self, type_genome_synonym_file):
        """Parse synonyms."""
        
        synonyms = set()
        with open(type_genome_synonym_file) as f:
            headers = f.readline().strip().split('\t')
            
            synonym_index = headers.index('Synonym')
            
            for line in f:
                line_split = line.strip().split('\t')
                
                synonym = line_split[synonym_index]
                synonyms.add(synonym)
                
        return synonyms
        
    def _nontype_radius(self, unclustered_gids, type_gids, ani_af_nontype_vs_type):
        """Calculate circumscription radius for unclustered, nontype genomes."""
        
        # set type radius for all type genomes to default values
        nontype_radius = {}
        for gid in unclustered_gids:
            nontype_radius[gid] = GenomeRadius(ani = self.ani_sp, 
                                                     af = None,
                                                     neighbour_gid = None)

        # determine closest type ANI neighbour and restrict ANI radius as necessary
        ani_af = pickle.load(open(ani_af_nontype_vs_type, 'rb'))
        for nontype_gid in unclustered_gids:
            if nontype_gid not in ani_af:
                continue
                    
            for type_gid in type_gids:
                if type_gid not in ani_af[nontype_gid]:
                    continue
                    
                ani, af = symmetric_ani(ani_af, nontype_gid, type_gid)

                if ani > nontype_radius[nontype_gid].ani and af >= self.af_sp:
                    nontype_radius[nontype_gid] = GenomeRadius(ani = ani, 
                                                                 af = af,
                                                                 neighbour_gid = type_gid)
                    
        self.logger.info('ANI circumscription radius: min=%.2f, mean=%.2f, max=%.2f' % (
                                min([d.ani for d in nontype_radius.values()]), 
                                np_mean([d.ani for d in nontype_radius.values()]), 
                                max([d.ani for d in nontype_radius.values()])))
                        
        return nontype_radius
        
    def _mash_ani_unclustered(self, genome_files, gids):
        """Estimate all-vs-all Mash ANI for the given genomes.

        A Mash sketch is built for `gids`, pairwise distances are computed
        with a maximum distance derived from `self.min_mash_ani`, and the
        number of genome pairs meeting that threshold is logged.

        Returns the result of `mash.read_ani()` on the distance file.
        """

        def output_path(extension):
            # all files for this step share a common prefix in the output directory
            return os.path.join(self.output_dir, 'gtdb_unclustered_genomes.' + extension)

        sketch_file = output_path('msh')
        list_file = output_path('lst')
        dist_file = output_path('dst')

        mash = Mash(self.cpus)

        # sketch genomes and calculate pairwise distances between them
        mash.sketch(gids, genome_files, list_file, sketch_file)
        max_dist = float(100 - self.min_mash_ani)/100
        mash.dist_pairwise(max_dist, sketch_file, dist_file)

        mash_ani = mash.read_ani(dist_file)

        # count pairs at or above the Mash threshold; each qualifying
        # (query, reference) entry is counted in both directions
        num_pairs = 0
        for qid in mash_ani:
            hits = mash_ani[qid]
            for rid in hits:
                if qid != rid and hits[rid] >= self.min_mash_ani:
                    num_pairs += 2

        self.logger.info('Identified %d genome pairs with a Mash ANI >= %.1f%%.' % (num_pairs, self.min_mash_ani))

        return mash_ani
        
    def _selected_rep_genomes(self,
                                genome_files,
                                nontype_radius, 
                                unclustered_qc_gids, 
                                mash_ani,
                                quality_metadata,
                                rnd_type_genome):
        """Select representative genomes for species clusters in a  greedy fashion using species-specific ANI thresholds."""

        # sort genomes by quality score
        if rnd_type_genome:
            self.logger.info('Selecting random de novo type genomes.')
            sorted_gids = []
            for gid in random.sample(unclustered_qc_gids, len(unclustered_qc_gids)):
                sorted_gids.append((gid, 0))
        else:
            self.logger.info('Selecting de novo type genomes in a greedy manner based on quality.')
            qscore = quality_score(unclustered_qc_gids, quality_metadata)
            sorted_gids = sorted(qscore.items(), key=operator.itemgetter(1), reverse=True)

        # greedily determine representatives for new species clusters
        cluster_rep_file = os.path.join(self.output_dir, 'cluster_reps.tsv')
        clusters = set()
        if not os.path.exists(cluster_rep_file):
            self.logger.info('Clustering genomes to identify representatives.')
            clustered_genomes = 0
            max_ani_pairs = 0
            for idx, (cur_gid, _score) in enumerate(sorted_gids):

                # determine reference genomes to calculate ANI between
                ani_pairs = []
                if cur_gid in mash_ani:
                    for rep_gid in clusters:
                        if mash_ani[cur_gid].get(rep_gid, 0) >= self.min_mash_ani:
                            ani_pairs.append((cur_gid, rep_gid))
                            ani_pairs.append((rep_gid, cur_gid))

                # determine if genome clusters with representative
                clustered = False
                if ani_pairs:
                    if len(ani_pairs) > max_ani_pairs:
                        max_ani_pairs = len(ani_pairs)
                    
                    ani_af = self.fastani.pairs(ani_pairs, genome_files, report_progress=False)

                    closest_rep_gid = None
                    closest_rep_ani = 0
                    closest_rep_af = 0
                    for rep_gid in clusters:
                        ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)

                        if af >= self.af_sp:
                            if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                                closest_rep_gid = rep_gid
                                closest_rep_ani = ani
                                closest_rep_af = af

                        if ani > nontype_radius[cur_gid].ani and af >= self.af_sp:
                            nontype_radius[cur_gid] = GenomeRadius(ani = ani, 
                                                                         af = af,
                                                                         neighbour_gid = rep_gid)
                                                                         
                    if closest_rep_gid and closest_rep_ani > nontype_radius[closest_rep_gid].ani:
                        clustered = True
                    
                if not clustered:
                    # genome is a new species cluster representative
                    clusters.add(cur_gid)
                else:
                    clustered_genomes += 1
                
                if (idx+1) % 10 == 0 or idx+1 == len(sorted_gids):
                    statusStr = '-> Clustered %d of %d (%.2f%%) genomes [ANI pairs: %d; clustered genomes: %d; clusters: %d].'.ljust(96) % (
                                    idx+1, 
                                    len(sorted_gids), 
                                    float(idx+1)*100/len(sorted_gids),
                                    max_ani_pairs,
                                    clustered_genomes,
                                    len(clusters))
                    sys.stdout.write('%s\r' % statusStr)
                    sys.stdout.flush()
                    max_ani_pairs = 0
            sys.stdout.write('\n')
            
            # write out selected cluster representative
            fout = open(cluster_rep_file, 'w')
            for gid in clusters:
                fout.write('%s\n' % gid)
            fout.close()
        else:
            # read cluster reps from file
            self.logger.warning('Using previously determined cluster representatives.')
            for line in open(cluster_rep_file):
                gid = line.strip()
                clusters.add(gid)
                
        self.logger.info('Selected %d representative genomes for de novo species clusters.' % len(clusters))
        
        return clusters
        
    def _cluster_genomes(self, 
                            genome_files,
                            rep_genomes,
                            type_gids, 
                            passed_qc,
                            final_cluster_radius):
        """Cluster all non-type/representative genomes to selected type/representatives genomes.

        Mash is used to find genome/representative pairs with an estimated
        ANI of at least `self.min_mash_ani`; ANI and AF are then calculated
        for these pairs with the FastANI wrapper. Each genome is assigned to
        the representative with the highest ANI (ties broken by AF) among
        those where the ANI is within the representative's radius and the AF
        is at least `self.af_sp`. Genomes that cannot be assigned are
        reported with a warning.

        genome_files: passed to the Mash and FastANI wrappers.
        rep_genomes: set of de novo representative genome IDs.
        type_gids: collection of type genome IDs.
        passed_qc: set of all genome IDs under consideration.
        final_cluster_radius: dict mapping representative ID to GenomeRadius.

        Returns a tuple (clusters, ani_af) where clusters maps each
        representative ID to a list of ClusteredGenome tuples and ani_af is
        the result of `self.fastani.pairs`.
        """

        all_reps = rep_genomes.union(type_gids)
        
        # calculate MASH distance between non-type/representative genomes and selected type/representatives genomes
        mash = Mash(self.cpus)
        
        mash_type_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.msh')
        type_rep_genome_list_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.lst')
        mash.sketch(all_reps, genome_files, type_rep_genome_list_file, mash_type_rep_sketch_file)
        
        mash_none_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.msh')
        type_none_rep_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.lst')
        mash.sketch(passed_qc - all_reps, genome_files, type_none_rep_file, mash_none_rep_sketch_file)

        # get Mash distances
        mash_dist_file = os.path.join(self.output_dir, 'gtdb_rep_vs_nonrep_genomes.dst')
        mash.dist(float(100 - self.min_mash_ani)/100, mash_type_rep_sketch_file, mash_none_rep_sketch_file, mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)
        
        # calculate ANI between non-type/representative genomes and selected type/representatives genomes
        clusters = {}
        for gid in all_reps:
            clusters[gid] = []
        
        genomes_to_cluster = passed_qc - set(clusters)
        ani_pairs = []
        for gid in genomes_to_cluster:
            if gid in mash_ani:
                for rep_gid in clusters:
                    if mash_ani[gid].get(rep_gid, 0) >= self.min_mash_ani:
                        ani_pairs.append((gid, rep_gid))
                        ani_pairs.append((rep_gid, gid))
                        
        self.logger.info('Calculating ANI between %d species clusters and %d unclustered genomes (%d pairs):' % (
                            len(clusters), 
                            len(genomes_to_cluster),
                            len(ani_pairs)))
        ani_af = self.fastani.pairs(ani_pairs, genome_files)

        # assign genomes to closest representatives 
        # that is within the representatives ANI radius
        self.logger.info('Assigning genomes to closest representative.')
        for idx, cur_gid in enumerate(genomes_to_cluster):
            closest_rep_gid = None
            closest_rep_ani = 0
            closest_rep_af = 0
            
            # closest representative satisfying the AF criterion irrespective
            # of its ANI radius; only used to explain failed assignments
            nearest_rep_gid = None
            nearest_rep_ani = 0
            nearest_rep_af = 0
            
            for rep_gid in clusters:
                ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)
                
                if af >= self.af_sp:
                    if ani > nearest_rep_ani or (ani == nearest_rep_ani and af > nearest_rep_af):
                        nearest_rep_gid = rep_gid
                        nearest_rep_ani = ani
                        nearest_rep_af = af
                        
                    if ani >= final_cluster_radius[rep_gid].ani:
                        if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                            closest_rep_gid = rep_gid
                            closest_rep_ani = ani
                            closest_rep_af = af
                
            if closest_rep_gid:
                clusters[closest_rep_gid].append(self.ClusteredGenome(gid=cur_gid, 
                                                                        ani=closest_rep_ani, 
                                                                        af=closest_rep_af))
            else:
                self.logger.warning('Failed to assign genome %s to representative.' % cur_gid)
                if nearest_rep_gid:
                    # a representative met the AF criterion, but the genome
                    # is outside its ANI radius
                    self.logger.warning(' ...closest_rep_gid = %s' % nearest_rep_gid)
                    self.logger.warning(' ...closest_rep_ani = %.2f' % nearest_rep_ani)
                    self.logger.warning(' ...closest_rep_af = %.2f' % nearest_rep_af)
                    self.logger.warning(' ...closest rep radius = %.2f' % final_cluster_radius[nearest_rep_gid].ani)
                else:
                    self.logger.warning(' ...no representative with an AF >%.2f identified.' % self.af_sp)
             
            statusStr = '-> Assigned %d of %d (%.2f%%) genomes.'.ljust(86) % (idx+1, 
                                                                                len(genomes_to_cluster), 
                                                                                float(idx+1)*100/len(genomes_to_cluster))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
        sys.stdout.write('\n')

        return clusters, ani_af
        
    def _assign_species_names(self, clusters, names_in_use, gtdb_taxonomy, gtdb_user_to_genbank):
        """Assign a species name to each species cluster."""
        
        orig_names_in_use = set(names_in_use)

        fout = open(os.path.join(self.output_dir, 'gtdb_assigned_sp.tsv'), 'w')
        fout.write('Representative genome\tAssigned species\tGTDB taxonomy\tNo. clustered genomes\tClustered GTDB genera\tClustered GTDB species\tSpecies name in use\tMost common name in use\tClustered genomes\n')
        cluster_sp_names = {}
        for rid in sorted(clusters, key=lambda x: len(clusters[x]), reverse=True):
            clustered_gids = [c.gid for c in clusters[rid]]
            
            # find most common genus name in cluster
            gtdb_genera = [gtdb_taxonomy[gid][5] for gid in clustered_gids] + [gtdb_taxonomy[rid][5]]
            gtdb_genus_counter = Counter(gtdb_genera)
            gtdb_common_genus = None 
            gtdb_common_genus_count = 0
            for genus, count in gtdb_genus_counter.most_common(): 
                if genus != 'g__':
                    gtdb_common_genus = genus
                    gtdb_common_genus_count = count
                    break
                    
            # in case of ties involving genus of representative genome, 
            # defer to classification of representative
            rep_genus = gtdb_taxonomy[rid][5]
            if gtdb_genus_counter[rep_genus] == gtdb_common_genus_count and rep_genus != 'g__':
                gtdb_common_genus = rep_genus
            
            # get most common GTDB species name 
            gtdb_sp = [gtdb_taxonomy[gid][6] for gid in clustered_gids] + [gtdb_taxonomy[rid][6]]
            gtdb_sp_counter = Counter(gtdb_sp)
            gtdb_common_sp = None
            gtdb_common_sp_count = 0
            for sp, count in gtdb_sp_counter.most_common(): 
                if sp != 's__':
                    gtdb_common_sp = sp
                    gtdb_common_sp_count = count
                    break
                    
            most_common_in_use = gtdb_common_sp in names_in_use

            min_req_genomes = 0.5*(sum(gtdb_sp_counter.values()) - gtdb_sp_counter.get('s__', 0))
            if gtdb_common_sp_count >= min_req_genomes and not most_common_in_use:
                # assign common species if it occurs in >=50% of the clustered genomes,
                # excluding genomes with no species assignment
                names_in_use.add(gtdb_common_sp)
                cluster_sp_names[rid] = gtdb_common_sp
            else:
                # derive new species name from genus, if possible, 
                # and accession number of representative genome
                genus = '{unresolved}'
                if gtdb_common_genus and gtdb_common_genus != 'g__':
                    genus = gtdb_common_genus[3:]
                
                acc = rid
                if rid.startswith('U_'):
                    if rid in gtdb_user_to_genbank:
                        acc = gtdb_user_to_genbank[rid]
                    else:
                        # create accession from GTDB User ID of the form:
                        # U_<number>u.0 which will give 'sp<number>u'
                        acc = 'U_' + rid.replace('U_', '') + 'u.0'

                derived_sp = 's__' + '%s sp%s' % (genus, acc[acc.rfind('_')+1:acc.rfind('.')])
                if derived_sp in names_in_use:
                    self.logger.error('Derived species name already in use: %s, %s' % (derived_sp, acc))
                    sys.exit(-1)

                names_in_use.add(derived_sp)
                cluster_sp_names[rid] = derived_sp
                
            fout.write('%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' % (
                        rid, 
                        cluster_sp_names[rid],
                        '; '.join(gtdb_taxonomy[rid]),
                        len(clustered_gids),
                        ', '.join("%s=%r" % (genus, count) for (genus, count) in gtdb_genus_counter.most_common()),
                        ', '.join("%s=%r" % (sp, count) for (sp, count) in gtdb_sp_counter.most_common()),
                        ', '.join("%s=%s" % (sp, sp in names_in_use) for sp, _count in gtdb_sp_counter.most_common()),
                        '%s=%d' % (gtdb_common_sp, gtdb_common_sp_count) if most_common_in_use else 'n/a',
                        ', '.join(clustered_gids)))
                
        fout.close()
        
        return cluster_sp_names
        
    def _write_rep_info(self, 
                        clusters, 
                        cluster_sp_names, 
                        quality_metadata, 
                        genome_quality,
                        excluded_from_refseq_note,
                        ani_af,
                        output_file):
        """Write out information about selected representative genomes.

        One tab-separated row is written per key of `clusters`, preceded by
        a header row.

        Parameters
        ----------
        clusters : mapping of representative genome ID -> clustered genome IDs
        cluster_sp_names : mapping of representative genome ID -> species name
        quality_metadata : mapping of genome ID -> record exposing the
            attributes read below (ncbi_assembly_level, genome_size, ...)
        genome_quality : mapping of genome ID -> quality score
        excluded_from_refseq_note : mapping of genome ID -> note; genomes
            without an entry get an empty final column
        ani_af : pairwise ANI/AF results in whatever form `symmetric_ani`
            expects
        output_file : path of the TSV file to create (overwritten if present)
        """

        # the context manager guarantees the file is flushed and closed even
        # if a metadata lookup or format conversion raises part way through
        with open(output_file, 'w') as fout:
            fout.write('Species\tType genome\tNCBI assembly level\tNCBI genome category')
            fout.write('\tGenome size (bp)\tQuality score\tCompleteness (%)\tContamination (%)\tNo. scaffolds\tNo. contigs\tN50 contigs\tAmbiguous bases\tSSU count\tSSU length (bp)')
            fout.write('\tNo. genomes in cluster\tMean ANI\tMean AF\tMin ANI\tMin AF\tNCBI exclude from RefSeq\n')

            for gid in clusters:
                cluster_gids = clusters[gid]
                qm = quality_metadata[gid]

                fout.write('%s\t%s\t%s\t%s' % (
                            cluster_sp_names[gid],
                            gid,
                            qm.ncbi_assembly_level,
                            qm.ncbi_genome_category))

                fout.write('\t%d\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.1f\t%d\t%d\t%d' % (
                            qm.genome_size,
                            genome_quality[gid],
                            qm.checkm_completeness,
                            qm.checkm_contamination,
                            qm.scaffold_count,
                            qm.contig_count,
                            qm.n50_contigs,
                            qm.ambiguous_bases,
                            qm.ssu_count,
                            # a falsy SSU length (e.g. None) is reported as 0
                            qm.ssu_length if qm.ssu_length else 0))

                # ANI/AF between the representative and each clustered genome
                anis = []
                afs = []
                for cluster_id in cluster_gids:
                    ani, af = symmetric_ani(ani_af, gid, cluster_id)
                    anis.append(ani)
                    afs.append(af)

                refseq_note = excluded_from_refseq_note.get(gid, '')
                if anis:
                    fout.write('\t%d\t%.1f\t%.2f\t%.1f\t%.2f\t%s\n' % (
                                len(cluster_gids),
                                np_mean(anis), np_mean(afs),
                                min(anis), min(afs),
                                refseq_note))
                else:
                    # representative without clustered genomes: no statistics
                    fout.write('\t%d\t%s\t%s\t%s\t%s\t%s\n' % (
                                len(cluster_gids),
                                'n/a', 'n/a', 'n/a', 'n/a',
                                refseq_note))
        
    def _gtdb_user_genomes(self, gtdb_user_genomes_file, metadata_file):
        """Get map between GTDB User genomes and GenBank accessions.

        Parameters
        ----------
        gtdb_user_genomes_file : path to a tab-separated file; column 0 is
            read as the GenBank accession and column 4 as the UBA identifier
        metadata_file : GTDB metadata file from which `organism_name` is read

        Returns
        -------
        dict mapping genome ID -> GenBank accession for every genome whose
        organism name contains '(UBA' and whose extracted UBA identifier
        appears in `gtdb_user_genomes_file`.
        """

        uba_to_genbank = {}
        # use a context manager so the file handle is released promptly
        with open(gtdb_user_genomes_file) as f:
            for line in f:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing newline at EOF)
                    continue

                line_split = line.strip().split('\t')
                gb_acc = line_split[0]
                uba_id = line_split[4]
                uba_to_genbank[uba_id] = gb_acc

        user_to_genbank = {}
        m = read_gtdb_metadata(metadata_file, ['organism_name'])
        for gid, metadata in m.items():
            if '(UBA' in str(metadata.organism_name):
                # identifier is the text between the first '(' and the final
                # character, which is assumed to be the closing ')'
                organism_name = metadata.organism_name
                uba_id = organism_name[organism_name.find('(')+1:-1]
                if uba_id in uba_to_genbank:
                    user_to_genbank[gid] = uba_to_genbank[uba_id]

        return user_to_genbank

    def run(self, qc_file,
                metadata_file,
                gtdb_user_genomes_file,
                genome_path_file,
                type_genome_cluster_file,
                type_genome_synonym_file,
                ncbi_refseq_assembly_file,
                ncbi_genbank_assembly_file,
                ani_af_nontype_vs_type,
                species_exception_file,
                rnd_type_genome):
        """Infer de novo species clusters and type genomes for remaining genomes.

        Genomes that pass QC and are neither type genomes nor already
        clustered to a type genome are grouped into de novo species clusters.
        Three files are written to `self.output_dir`:
        `gtdb_rep_genome_info.tsv`, `gtdb_clusters_final.tsv` and
        `gtdb_ani_radius_final.tsv`.

        Parameters
        ----------
        qc_file : file read with `read_qc_file` to get genomes passing QC
        metadata_file : GTDB metadata file; source of the NCBI and GTDB
            taxonomies, quality metadata and organism names
        gtdb_user_genomes_file : file passed to `_gtdb_user_genomes`
        genome_path_file : file read with `read_genome_path`
        type_genome_cluster_file : file parsed by `_parse_type_clusters`
        type_genome_synonym_file : file parsed by `_parse_synonyms`
        ncbi_refseq_assembly_file, ncbi_genbank_assembly_file : NCBI assembly
            files passed to `exclude_from_refseq`
        ani_af_nontype_vs_type : passed unchanged to `_nontype_radius`
        species_exception_file : passed to `read_gtdb_ncbi_taxonomy`
        rnd_type_genome : passed unchanged to `_selected_rep_genomes`
            (presumably a flag requesting random representative selection;
            confirm against that method)
        """
        
        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))
        
        # get NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))
        
        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # get path to genome FASTA files
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = read_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_files))
        # iterate over a copy of the keys so entries can be removed while looping;
        # only genomes that passed QC are retained
        for gid in set(genome_files):
            if gid not in passed_qc:
                genome_files.pop(gid)
        # NOTE(review): the message below mentions User genomes, but the filter
        # above is based solely on QC status
        self.logger.info('Considering %d genomes as potential representatives after removing unwanted User genomes.' % len(genome_files))
        # sanity check: every genome passing QC must have a genome file path
        assert(len(genome_files) == len(passed_qc))
        
        # determine type genomes and genomes clustered to type genomes
        type_species, species_type_gid, type_gids, type_clustered_gids, type_radius = self._parse_type_clusters(type_genome_cluster_file)
        assert(len(type_species) == len(type_gids))
        self.logger.info('Identified %d type genomes.' % len(type_gids))
        self.logger.info('Identified %d clustered genomes.' % len(type_clustered_gids))
        
        # read quality statistics for genomes
        self.logger.info('Parse quality statistics for all genomes.')
        quality_metadata = read_quality_metadata(metadata_file)
        
        # calculate genome quality score
        self.logger.info('Calculating genome quality score.')
        genome_quality = quality_score(quality_metadata.keys(), quality_metadata)

        # determine genomes left to be clustered: those passing QC that are
        # neither type genomes nor already clustered to a type genome
        unclustered_gids = passed_qc - type_gids - type_clustered_gids
        self.logger.info('Identified %d unclustered genomes passing QC.' % len(unclustered_gids))

        # establish closest type genome for each unclustered genome
        self.logger.info('Determining ANI circumscription for %d unclustered genomes.' % len(unclustered_gids))
        nontype_radius = self._nontype_radius(unclustered_gids, type_gids, ani_af_nontype_vs_type)
        
        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

        # select species representatives genomes in a greedy fashion based on genome quality
        rep_genomes = self._selected_rep_genomes(genome_files,
                                                    nontype_radius, 
                                                    unclustered_gids, 
                                                    mash_anis,
                                                    quality_metadata,
                                                    rnd_type_genome)
        
        # cluster all non-type/non-rep genomes to species type/rep genomes;
        # the combined radius map starts from a copy so type_radius is not modified
        final_cluster_radius = type_radius.copy()
        final_cluster_radius.update(nontype_radius)
        
        final_clusters, ani_af = self._cluster_genomes(genome_files,
                                                        rep_genomes,
                                                        type_gids, 
                                                        passed_qc,
                                                        final_cluster_radius)
        # restrict clusters to those of the de novo representatives
        rep_clusters = {}
        for gid in rep_genomes:
            rep_clusters[gid] = final_clusters[gid]

        # get list of synonyms in order to restrict usage of species names
        synonyms = self._parse_synonyms(type_genome_synonym_file)
        self.logger.info('Identified %d synonyms.' % len(synonyms))
        
        # determine User genomes with NCBI accession number that may form species names
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
        self.logger.info('Identified %d GTDB User genomes with NCBI accessions.' % len(gtdb_user_to_genbank))
        
        # assign species names to de novo species clusters; names of synonyms
        # and of type species are treated as already taken
        names_in_use = synonyms.union(type_species)
        self.logger.info('Identified %d species names already in use.' % len(names_in_use))
        self.logger.info('Assigning species name to each de novo species cluster.')
        cluster_sp_names = self._assign_species_names(rep_clusters, 
                                                        names_in_use, 
                                                        gtdb_taxonomy,
                                                        gtdb_user_to_genbank)
        
        # write out file with details about selected representative genomes
        self._write_rep_info(rep_clusters, 
                                cluster_sp_names,
                                quality_metadata,
                                genome_quality,
                                excluded_from_refseq_note,
                                ani_af,
                                os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv'))
                                             
        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]
            
        # NOTE: all_species is the same dict object as cluster_sp_names (no copy
        # is made), so the update below also adds the type species entries to
        # cluster_sp_names
        all_species = cluster_sp_names
        all_species.update(species_type_gid)

        self.logger.info('Writing %d species clusters to file.' % len(all_species))
        self.logger.info('Writing %d cluster radius information to file.' % len(final_cluster_radius))
        
        write_clusters(final_clusters, 
                        final_cluster_radius, 
                        all_species, 
                        os.path.join(self.output_dir, 'gtdb_clusters_final.tsv'))

        write_rep_radius(final_cluster_radius, 
                            all_species, 
                            os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv'))
        
コード例 #11
0
class UpdateErroneousNCBI(object):
    """Identify genomes with erroneous NCBI species assignments.

    Two checks are provided. Both compare the GTDB species cluster of a
    genome with the cluster that anchors its NCBI species name via an
    effective type strain genome:

    * `identify_misclassified_genomes_ani` additionally requires the ANI to
      the type strain representative to be below `ani_ncbi_erroneous`.
    * `identify_misclassified_genomes_cluster` relies on cluster membership
      alone.
    """

    def __init__(self, ani_ncbi_erroneous, ani_cache_file, cpus, output_dir):
        """Initialization.

        Parameters
        ----------
        ani_ncbi_erroneous : ANI threshold; a genome is reported by the ANI
            check when its ANI to the type strain representative is below
            this value
        ani_cache_file, cpus : forwarded to the `FastANI` constructor
        output_dir : directory in which the TSV reports are written
        """

        self.output_dir = output_dir
        self.logger = logging.getLogger('timestamp')

        self.ani_ncbi_erroneous = ani_ncbi_erroneous
        self.fastani = FastANI(ani_cache_file, cpus)

    @staticmethod
    def _genome_to_rep(cur_clusters):
        """Map each clustered genome ID to the ID of its representative."""

        gid_to_rid = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                gid_to_rid[cid] = rid

        return gid_to_rid

    @staticmethod
    def _ncbi_species_genomes(cur_genomes, forbidden_names):
        """Group genome IDs by NCBI species, skipping unassigned and forbidden names."""

        ncbi_sp_gids = defaultdict(list)
        for gid in cur_genomes:
            ncbi_species = cur_genomes[gid].ncbi_taxa.species
            ncbi_specific = specific_epithet(ncbi_species)

            if ncbi_species != 's__' and ncbi_specific not in forbidden_names:
                ncbi_sp_gids[ncbi_species].append(gid)

        return ncbi_sp_gids

    def identify_misclassified_genomes_ani(self, cur_genomes, cur_clusters):
        """Identify genomes with erroneous NCBI species assignments, based on ANI to type strain genomes.

        A genome is reported when it carries the NCBI species name of a
        representative that is an effective type strain, resides in a
        different GTDB cluster, and has an ANI to that representative below
        `self.ani_ncbi_erroneous`. Results are written to
        `ncbi_misclassified_sp.ani_<threshold>.tsv` in `self.output_dir`.

        Returns
        -------
        set of genome IDs identified as misclassified.
        """

        forbidden_names = {'cyanobacterium'}

        # get mapping from genomes to their representatives
        gid_to_rid = self._genome_to_rep(cur_clusters)

        # get genomes with NCBI species assignment
        ncbi_sp_gids = self._ncbi_species_genomes(cur_genomes, forbidden_names)

        # get NCBI species anchored by a type strain genome; only the
        # representative of each cluster is considered here
        ncbi_type_anchored_species = {}
        for rid in cur_clusters:
            if cur_genomes[rid].is_effective_type_strain():
                ncbi_type_species = cur_genomes[rid].ncbi_taxa.species
                if ncbi_type_species != 's__':
                    ncbi_type_anchored_species[ncbi_type_species] = rid
        self.logger.info(
            ' - identified {:,} NCBI species anchored by a type strain genome.'
            .format(len(ncbi_type_anchored_species)))

        # identify genomes with erroneous NCBI species assignments
        out_file = os.path.join(
            self.output_dir,
            'ncbi_misclassified_sp.ani_{}.tsv'.format(self.ani_ncbi_erroneous))

        misclassified_gids = set()
        # context manager ensures the report is closed even if FastANI fails
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tNCBI species\tGenome cluster\tType species cluster\tANI to type strain\tAF to type strain\n'
            )

            for idx, (ncbi_species,
                      species_gids) in enumerate(ncbi_sp_gids.items()):
                if ncbi_species not in ncbi_type_anchored_species:
                    continue

                # genomes need to be checked when they have the same NCBI
                # species name as a type strain genome, but reside in a
                # different GTDB species cluster
                type_rid = ncbi_type_anchored_species[ncbi_species]
                gids_to_check = [gid for gid in species_gids
                                 if gid_to_rid[gid] != type_rid]
                if not gids_to_check:
                    continue

                # request both directions of each pair as required for a
                # symmetric ANI/AF value
                gid_pairs = []
                for gid in gids_to_check:
                    gid_pairs.append((type_rid, gid))
                    gid_pairs.append((gid, type_rid))

                status_str = '-> Establishing erroneous assignments for {} [ANI pairs: {:,}; {:,} of {:,} species].'.format(
                    ncbi_species, len(gid_pairs), idx + 1,
                    len(ncbi_sp_gids)).ljust(96)
                sys.stdout.write('{}\r'.format(status_str))
                sys.stdout.flush()

                ani_af = self.fastani.pairs(gid_pairs,
                                            cur_genomes.genomic_files,
                                            report_progress=False,
                                            check_cache=True)

                for gid in gids_to_check:
                    ani, af = symmetric_ani(ani_af, type_rid, gid)
                    if ani < self.ani_ncbi_erroneous:
                        misclassified_gids.add(gid)
                        fout.write('{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\n'.format(
                            gid, ncbi_species, gid_to_rid[gid], type_rid, ani,
                            af))

            # terminate the in-place progress line
            sys.stdout.write('\n')

        misclassified_species = set(
            [cur_genomes[gid].ncbi_taxa.species for gid in misclassified_gids])
        self.logger.info(
            ' - identified {:,} genomes from {:,} species as having misclassified NCBI species assignments.'
            .format(len(misclassified_gids), len(misclassified_species)))

        return misclassified_gids

    def identify_misclassified_genomes_cluster(self, cur_genomes,
                                               cur_clusters):
        """Identify genomes with erroneous NCBI species assignments, based on GTDB clustering of type strain genomes.

        A genome is reported when its NCBI species is anchored by an
        effective type strain genome that belongs to a different GTDB
        cluster. Results are written to
        `ncbi_misclassified_sp.gtdb_clustering.tsv` in `self.output_dir`.
        The process exits with status -1 if one NCBI species has effective
        type strain genomes in more than one cluster.

        Returns
        -------
        set of genome IDs identified as misclassified.
        """

        forbidden_names = {'cyanobacterium'}

        # get mapping from genomes to their representatives
        gid_to_rid = self._genome_to_rep(cur_clusters)

        # get genomes with NCBI species assignment
        ncbi_sp_gids = self._ncbi_species_genomes(cur_genomes, forbidden_names)

        # get NCBI species anchored by a type strain genome; every member of
        # a cluster is considered, not just the representative
        ncbi_type_anchored_species = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                if not cur_genomes[cid].is_effective_type_strain():
                    continue

                ncbi_type_species = cur_genomes[cid].ncbi_taxa.species
                # the epithet must be derived from the type strain's own
                # species name; previously a stale variable left over from
                # the genome loop was used, so the forbidden-name filter was
                # applied to an unrelated species
                ncbi_specific = specific_epithet(ncbi_type_species)
                if ncbi_type_species == 's__' or ncbi_specific in forbidden_names:
                    continue

                if (ncbi_type_species in ncbi_type_anchored_species
                        and rid !=
                        ncbi_type_anchored_species[ncbi_type_species]):
                    self.logger.error(
                        'NCBI species {} has multiple effective type strain genomes in different clusters.'
                        .format(ncbi_type_species))
                    sys.exit(-1)

                ncbi_type_anchored_species[ncbi_type_species] = rid
        self.logger.info(
            ' - identified {:,} NCBI species anchored by a type strain genome.'
            .format(len(ncbi_type_anchored_species)))

        # identify genomes with erroneous NCBI species assignments
        out_file = os.path.join(self.output_dir,
                                'ncbi_misclassified_sp.gtdb_clustering.tsv')

        misclassified_gids = set()
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tNCBI species\tGenome cluster\tType species cluster\n')

            for ncbi_species, species_gids in ncbi_sp_gids.items():
                if ncbi_species not in ncbi_type_anchored_species:
                    continue

                # find genomes with NCBI species assignments that are in a
                # different cluster than the type strain genome
                type_rid = ncbi_type_anchored_species[ncbi_species]
                for gid in species_gids:
                    cur_rid = gid_to_rid[gid]
                    if type_rid != cur_rid:
                        misclassified_gids.add(gid)
                        fout.write('{}\t{}\t{}\t{}\t\n'.format(
                            gid, ncbi_species, cur_rid, type_rid))

            sys.stdout.write('\n')

        misclassified_species = set(
            [cur_genomes[gid].ncbi_taxa.species for gid in misclassified_gids])
        self.logger.info(
            ' - identified {:,} genomes from {:,} species as having misclassified NCBI species assignments.'
            .format(len(misclassified_gids), len(misclassified_species)))

        return misclassified_gids

    def run(self, gtdb_clusters_file, cur_gtdb_metadata_file,
            cur_genomic_path_file, uba_genome_paths, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            gtdb_type_strains_ledger, sp_priority_ledger,
            genus_priority_ledger, dsmz_bacnames_file):
        """Report genomes with erroneous NCBI species assignments.

        Loads the current genome set and the GTDB species clusters, then
        runs the ANI-based and the cluster-based misclassification checks,
        each of which writes its own TSV report to `self.output_dir`.

        `sp_priority_ledger`, `genus_priority_ledger` and
        `dsmz_bacnames_file` are accepted for interface compatibility but
        are not used by this method.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get path to current genomic FASTA files
        self.logger.info('Reading path to current genomic FASTA files.')
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        cur_genomes.load_genomic_file_paths(uba_genome_paths)

        # read named GTDB species clusters; the radius information returned
        # alongside the clusters is not needed here
        self.logger.info(
            'Reading named and previous placeholder GTDB species clusters.')
        cur_clusters, _rep_radius = read_clusters(gtdb_clusters_file)
        self.logger.info(
            ' ... identified {:,} clusters spanning {:,} genomes.'.format(
                len(cur_clusters),
                sum([len(gids) + 1 for gids in cur_clusters.values()])))

        # identify genomes with erroneous NCBI species assignments
        self.logger.info(
            'Identifying genomes with erroneous NCBI species assignments as established by ANI type strain genomes.'
        )
        self.identify_misclassified_genomes_ani(cur_genomes, cur_clusters)

        self.logger.info(
            'Identifying genomes with erroneous NCBI species assignments as established by GTDB cluster of type strain genomes.'
        )
        self.identify_misclassified_genomes_cluster(cur_genomes, cur_clusters)
コード例 #12
0
    def write_synonym_table(self, type_strain_synonyms, consensus_synonyms,
                            ani_af, sp_priority_ledger, genus_priority_ledger,
                            lpsn_gss_file):
        """Create table indicating species names that should be considered synonyms.

        Writes `synonyms.tsv` to `self.output_dir` with one row per
        (representative, synonym genome) pair. Rows for
        `type_strain_synonyms` are labelled TYPE_STRAIN_SYNONYM and rows for
        `consensus_synonyms` are labelled MAJORITY_VOTE_SYNONYM.

        Parameters
        ----------
        type_strain_synonyms, consensus_synonyms : mappings of
            representative genome ID -> iterable of synonym genome IDs
        ani_af : pairwise ANI/AF results in the form expected by
            `FastANI.symmetric_ani`
        sp_priority_ledger, genus_priority_ledger, lpsn_gss_file : forwarded
            to `SpeciesPriorityManager`
        """

        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  lpsn_gss_file,
                                                  self.output_dir)

        out_file = os.path.join(self.output_dir, 'synonyms.tsv')

        incorrect_priority = 0
        failed_type_strain_priority = 0

        # the file was previously opened without ever being closed; the
        # context manager guarantees all rows are flushed to disk
        with open(out_file, 'w') as fout:
            fout.write(
                'Synonym type\tNCBI species\tGTDB representative\tStrain IDs\tType sources\tPriority year'
            )
            fout.write('\tGTDB type species\tGTDB type strain\tNCBI assembly type')
            fout.write(
                '\tNCBI synonym\tHighest-quality synonym genome\tSynonym strain IDs\tSynonym type sources\tSynonym priority year'
            )
            fout.write(
                '\tSynonym GTDB type species\tSynonym GTDB type strain\tSynonym NCBI assembly type'
            )
            fout.write('\tANI\tAF\tWarnings\n')

            for synonyms, synonym_type in [
                (type_strain_synonyms, 'TYPE_STRAIN_SYNONYM'),
                (consensus_synonyms, 'MAJORITY_VOTE_SYNONYM')
            ]:
                for rid, synonym_ids in synonyms.items():
                    for gid in synonym_ids:
                        ani, af = FastANI.symmetric_ani(ani_af, rid, gid)

                        rep = self.cur_genomes[rid]
                        syn = self.cur_genomes[gid]

                        # columns describing the GTDB representative
                        fout.write(synonym_type)
                        fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                            rep.ncbi_taxa.species, rid,
                            ','.join(sorted(rep.strain_ids())),
                            ','.join(sorted(rep.gtdb_type_sources())
                                     ).upper().replace('STRAININFO',
                                                       'StrainInfo'),
                            sp_priority_mngr.species_priority_year(
                                self.cur_genomes, rid),
                            rep.is_gtdb_type_species(),
                            rep.is_gtdb_type_strain(),
                            rep.ncbi_type_material))

                        # columns describing the synonym genome; a missing
                        # priority year is reported as 'n/a'
                        synonym_priority_year = sp_priority_mngr.species_priority_year(
                            self.cur_genomes, gid)
                        if synonym_priority_year == Genome.NO_PRIORITY_YEAR:
                            synonym_priority_year = 'n/a'

                        fout.write('\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                            syn.ncbi_taxa.species, gid,
                            ','.join(sorted(syn.strain_ids())),
                            ','.join(sorted(syn.gtdb_type_sources())
                                     ).upper().replace('STRAININFO',
                                                       'StrainInfo'),
                            synonym_priority_year,
                            syn.is_gtdb_type_species(),
                            syn.is_gtdb_type_strain(),
                            syn.ncbi_type_material))
                        fout.write('\t{:.3f}\t{:.4f}'.format(ani, af))

                        # optional warnings column
                        if (rep.is_effective_type_strain()
                                and syn.is_effective_type_strain()):
                            # both are type strains, so the representative
                            # should be the one with naming priority
                            priority_gid, note = sp_priority_mngr.species_priority(
                                self.cur_genomes, rid, gid)
                            if priority_gid != rid:
                                incorrect_priority += 1
                                fout.write('\tIncorrect priority: {}'.format(note))
                        elif (not rep.is_gtdb_type_strain()
                                and syn.is_gtdb_type_strain()):
                            failed_type_strain_priority += 1
                            fout.write(
                                '\tFailed to prioritize type strain of species')

                        fout.write('\n')

        if incorrect_priority:
            self.logger.warning(
                f' - identified {incorrect_priority:,} synonyms with incorrect priority.'
            )

        if failed_type_strain_priority:
            self.logger.warning(
                f' - identified {failed_type_strain_priority:,} synonyms that failed to priotize the type strain of the species.'
            )