Ejemplo n.º 1
0
    def _mash_ani(self, genome_files, user_genomes, sp_clusters):
        """Estimate Mash ANI between User genomes and species cluster genomes.

        Both genome sets are sketched with Mash, the two sketches are
        compared, and the number of genome pairs meeting the minimum Mash
        ANI (self.min_mash_ani) is logged. All intermediate files are
        written to self.output_dir.

        Parameters
        ----------
        genome_files
            Passed to Mash.sketch alongside each genome set; presumably maps
            genome IDs to genomic FASTA files (verify against Mash.sketch).
        user_genomes
            User genomes to sketch.
        sp_clusters
            Species cluster genomes to sketch.

        Returns
        -------
        The result of Mash.read_ani, indexed as mash_ani[query_id][ref_id].
        """

        out_dir = self.output_dir
        mash = Mash(self.cpus)

        # sketch the User genomes
        user_sketch_file = os.path.join(out_dir, 'gtdb_user_genomes.msh')
        user_list_file = os.path.join(out_dir, 'gtdb_user_genomes.lst')
        mash.sketch(user_genomes, genome_files, user_list_file,
                    user_sketch_file)

        # sketch the species cluster genomes
        sp_sketch_file = os.path.join(out_dir, 'gtdb_sp_genomes.msh')
        sp_list_file = os.path.join(out_dir, 'gtdb_sp_genomes.lst')
        mash.sketch(sp_clusters, genome_files, sp_list_file, sp_sketch_file)

        # compare the sketches using the distance threshold derived from
        # the minimum Mash ANI (a percentage)
        dist_file = os.path.join(out_dir, 'gtdb_user_vs_sp.dst')
        dist_threshold = float(100 - self.min_mash_ani) / 100
        mash.dist(dist_threshold, sp_sketch_file, user_sketch_file, dist_file)

        mash_ani = mash.read_ani(dist_file)

        # count hits above the Mash threshold; each hit is reported as two
        # pairs since it is considered in both directions
        num_hits = 0
        for qid in mash_ani:
            hits = mash_ani[qid]
            for rid in hits:
                if hits[rid] >= self.min_mash_ani and qid != rid:
                    num_hits += 1
        num_pairs = 2 * num_hits

        self.logger.info(
            'Identified %d genome pairs with a Mash ANI >= %.1f%%.' %
            (num_pairs, self.min_mash_ani))

        return mash_ani
    def _calculate_ani(self, cur_genomes, rep_gids, rep_mash_sketch_file):
        """Calculate ANI between representative and non-representative genomes.

        Mash is used as a pre-filter: representative and non-representative
        genomes are sketched and compared, and only genome pairs with a Mash
        ANI >= self.min_mash_ani are passed on to self.fastani.pairs. The
        result is also pickled to 'ani_af_rep_vs_nonrep.pkl' in
        self.output_dir.

        Parameters
        ----------
        cur_genomes
            Iterating it yields genome IDs; its genomic_files attribute is
            handed to Mash and FastANI and its user_uba_id_map attribute is
            used to translate Mash IDs (IDs without an entry are kept as-is).
        rep_gids
            Genome IDs of representatives (membership-tested with `in`).
        rep_mash_sketch_file
            Existing Mash sketch of the representatives. If falsy or not
            present on disk, a new sketch is written to 'gtdb_reps.msh'.

        Returns
        -------
        The result of self.fastani.pairs for the selected genome pairs.
        """

        out_dir = self.output_dir
        ani_af_file = os.path.join(out_dir, 'ani_af_rep_vs_nonrep.pkl')

        # *** developer switch (previously a hard-coded `if True:`); set to
        # False to reuse the pickled results of an earlier run
        recalculate = True
        if not recalculate:
            self.logger.warning(
                'Using previously calculated results in: {}'.format(
                    'ani_af_rep_vs_nonrep.pkl'))
            with open(ani_af_file, 'rb') as f:
                return pickle.load(f)

        mash = Mash(self.cpus)

        # create Mash sketch for representative genomes unless a usable
        # sketch was supplied by the caller
        if not rep_mash_sketch_file or not os.path.exists(
                rep_mash_sketch_file):
            rep_genome_list_file = os.path.join(out_dir, 'gtdb_reps.lst')
            rep_mash_sketch_file = os.path.join(out_dir, 'gtdb_reps.msh')
            mash.sketch(rep_gids, cur_genomes.genomic_files,
                        rep_genome_list_file, rep_mash_sketch_file)

        # create Mash sketch for non-representative genomes
        nonrep_gids = {gid for gid in cur_genomes if gid not in rep_gids}

        nonrep_genome_list_file = os.path.join(out_dir, 'gtdb_nonreps.lst')
        nonrep_genome_sketch_file = os.path.join(out_dir, 'gtdb_nonreps.msh')
        mash.sketch(nonrep_gids, cur_genomes.genomic_files,
                    nonrep_genome_list_file, nonrep_genome_sketch_file)

        # get Mash distances using the distance threshold derived from the
        # minimum Mash ANI (a percentage)
        mash_dist_file = os.path.join(out_dir, 'gtdb_reps_vs_nonreps.dst')
        mash.dist(
            float(100 - self.min_mash_ani) / 100, rep_mash_sketch_file,
            nonrep_genome_sketch_file, mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # get pairs above Mash threshold; each pair is added in both
        # directions
        mash_ani_pairs = []
        for qid in mash_ani:
            n_qid = cur_genomes.user_uba_id_map.get(qid, qid)
            for rid in mash_ani[qid]:
                if mash_ani[qid][rid] < self.min_mash_ani:
                    continue

                n_rid = cur_genomes.user_uba_id_map.get(rid, rid)
                if n_qid != n_rid:
                    mash_ani_pairs.append((n_qid, n_rid))
                    mash_ani_pairs.append((n_rid, n_qid))

        self.logger.info(
            'Identified {:,} genome pairs with a Mash ANI >= {:.1f}%.'.
            format(len(mash_ani_pairs), self.min_mash_ani))

        # calculate ANI between pairs
        self.logger.info(
            'Calculating ANI between {:,} genome pairs:'.format(
                len(mash_ani_pairs)))
        ani_af = self.fastani.pairs(mash_ani_pairs,
                                    cur_genomes.genomic_files)

        # use a context manager so the file handle is closed (and the data
        # flushed) deterministically
        with open(ani_af_file, 'wb') as f:
            pickle.dump(ani_af, f)

        return ani_af
    def _cluster_genomes(self,
                         cur_genomes,
                         de_novo_rep_gids,
                         named_rep_gids,
                         final_cluster_radius):
        """Cluster non-representative genomes to named and de novo representatives.

        Mash is used as a pre-filter to select (genome, representative)
        pairs with a Mash ANI >= self.min_mash_ani. ANI/AF is then computed
        for these pairs with self.fastani.pairs, and each non-representative
        genome is assigned to the representative with the highest ANI (ties
        broken by AF) among those for which the ANI reaches the
        representative's radius and the AF reaches self.af_sp. The results
        are pickled to 'clusters.pkl' and 'ani_af_rep_vs_nonrep.de_novo.pkl'
        in self.output_dir.

        Parameters
        ----------
        cur_genomes
            Object whose genomes attribute is keyed by genome ID, whose
            genomic_files attribute is handed to Mash and FastANI, and whose
            user_uba_id_map attribute translates Mash IDs (IDs without an
            entry are kept as-is).
        de_novo_rep_gids : set
            Genome IDs of de novo representatives.
        named_rep_gids : set
            Genome IDs of representatives of named species clusters.
        final_cluster_radius
            Indexed by representative genome ID; the ani attribute of each
            entry is the ANI radius of that representative.

        Returns
        -------
        clusters : dict
            Representative genome ID -> list of ClusteredGenome(gid, ani, af).
        ani_af
            The result of self.fastani.pairs for the selected pairs.
        """

        all_reps = de_novo_rep_gids.union(named_rep_gids)
        nonrep_gids = set(cur_genomes.genomes.keys()) - all_reps
        self.logger.info('Clustering {:,} genomes to {:,} named and de novo representatives.'.format(
            len(nonrep_gids), len(all_reps)))

        clusters_file = os.path.join(self.output_dir, 'clusters.pkl')
        ani_af_file = os.path.join(self.output_dir,
                                   'ani_af_rep_vs_nonrep.de_novo.pkl')

        # *** developer switch (previously a hard-coded `if True:`); set to
        # False to reuse the pickled results of an earlier run
        recalculate = True
        if not recalculate:
            self.logger.warning('Using previously calculated results in: {}'.format('clusters.pkl'))
            with open(clusters_file, 'rb') as f:
                clusters = pickle.load(f)

            self.logger.warning('Using previously calculated results in: {}'.format('ani_af_rep_vs_nonrep.de_novo.pkl'))
            with open(ani_af_file, 'rb') as f:
                ani_af = pickle.load(f)

            return clusters, ani_af

        # calculate MASH distance between non-representatives and representatives genomes
        mash = Mash(self.cpus)

        mash_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.msh')
        rep_genome_list_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.lst')
        mash.sketch(all_reps, cur_genomes.genomic_files, rep_genome_list_file, mash_rep_sketch_file)

        mash_none_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.msh')
        non_rep_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.lst')
        mash.sketch(nonrep_gids, cur_genomes.genomic_files, non_rep_file, mash_none_rep_sketch_file)

        # get Mash distances using the distance threshold derived from the
        # minimum Mash ANI (a percentage)
        mash_dist_file = os.path.join(self.output_dir, 'gtdb_rep_vs_nonrep_genomes.dst')
        mash.dist(float(100 - self.min_mash_ani)/100,
                  mash_rep_sketch_file,
                  mash_none_rep_sketch_file,
                  mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # every representative starts with an empty cluster
        clusters = {gid: [] for gid in all_reps}

        # select pairs above the Mash threshold; each pair is added in both
        # directions
        mash_ani_pairs = []
        for qid in mash_ani:
            n_qid = cur_genomes.user_uba_id_map.get(qid, qid)
            assert n_qid in nonrep_gids

            for rid in mash_ani[qid]:
                n_rid = cur_genomes.user_uba_id_map.get(rid, rid)
                assert n_rid in all_reps

                if (mash_ani[qid][rid] >= self.min_mash_ani
                        and n_qid != n_rid):
                    mash_ani_pairs.append((n_qid, n_rid))
                    mash_ani_pairs.append((n_rid, n_qid))

        self.logger.info('Calculating ANI between {:,} species clusters and {:,} unclustered genomes ({:,} pairs):'.format(
            len(clusters),
            len(nonrep_gids),
            len(mash_ani_pairs)))
        ani_af = self.fastani.pairs(mash_ani_pairs, cur_genomes.genomic_files)

        # assign genomes to closest representatives
        # that is within the representatives ANI radius
        self.logger.info('Assigning genomes to closest representative.')
        num_nonreps = len(nonrep_gids)
        for idx, cur_gid in enumerate(nonrep_gids):
            # best representative satisfying both the ANI radius and AF criteria
            closest_rep_gid = None
            closest_rep_ani = 0
            closest_rep_af = 0

            # best representative satisfying the AF criterion only; used
            # solely to report why a genome could not be assigned
            nearest_rep_gid = None
            nearest_rep_ani = 0
            nearest_rep_af = 0

            for rep_gid in clusters:
                ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)
                meets_af = af >= self.af_sp

                if meets_af and (ani > nearest_rep_ani
                                 or (ani == nearest_rep_ani and af > nearest_rep_af)):
                    nearest_rep_gid = rep_gid
                    nearest_rep_ani = ani
                    nearest_rep_af = af

                if ani >= final_cluster_radius[rep_gid].ani and meets_af:
                    if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                        closest_rep_gid = rep_gid
                        closest_rep_ani = ani
                        closest_rep_af = af

            if closest_rep_gid is not None:
                clusters[closest_rep_gid].append(ClusteredGenome(gid=cur_gid,
                                                                 ani=closest_rep_ani,
                                                                 af=closest_rep_af))
            else:
                self.logger.warning('Failed to assign genome {} to representative.'.format(cur_gid))
                if nearest_rep_gid is not None:
                    # a representative met the AF criterion, but the genome
                    # falls outside its ANI radius
                    self.logger.warning(' ...closest_rep_gid = {}'.format(nearest_rep_gid))
                    self.logger.warning(' ...closest_rep_ani = {:.2f}'.format(nearest_rep_ani))
                    self.logger.warning(' ...closest_rep_af = {:.2f}'.format(nearest_rep_af))
                    self.logger.warning(' ...closest rep radius = {:.2f}'.format(final_cluster_radius[nearest_rep_gid].ani))
                else:
                    self.logger.warning(' ...no representative with an AF >{:.2f} identified.'.format(self.af_sp))

            # progress line, rewritten in place via the carriage return
            statusStr = '-> Assigned {:,} of {:,} ({:.2f}%) genomes.'.format(idx+1,
                                                                             num_nonreps,
                                                                             float(idx+1)*100/num_nonreps).ljust(86)
            sys.stdout.write('{}\r'.format(statusStr))
            sys.stdout.flush()
        sys.stdout.write('\n')

        # use context managers so the file handles are closed (and the data
        # flushed) deterministically
        with open(clusters_file, 'wb') as f:
            pickle.dump(clusters, f)
        with open(ani_af_file, 'wb') as f:
            pickle.dump(ani_af, f)

        return clusters, ani_af
Ejemplo n.º 4
0
    def _calculate_ani(self, type_gids, genome_files, ncbi_taxonomy,
                       type_genome_sketch_file):
        """Calculate ANI between type and non-type genomes.

        Mash is used as a pre-filter: type and non-type genomes are sketched
        and compared, and only genome pairs with a Mash ANI >=
        self.min_mash_ani are passed on to self.fastani.pairs. The result is
        also pickled to 'ani_af_type_vs_nontype.pkl' in self.output_dir.

        Parameters
        ----------
        type_gids
            Genome IDs of type genomes (membership-tested with `in`).
        genome_files
            Iterating it yields the IDs of all genomes; it is also handed
            to Mash and FastANI, presumably as a genome ID -> genomic file
            mapping (verify against Mash.sketch).
        ncbi_taxonomy
            Not used by this method.
        type_genome_sketch_file
            Existing Mash sketch of the type genomes. If falsy or not present
            on disk, a new sketch is written to 'gtdb_type_genomes.msh'.

        Returns
        -------
        The result of self.fastani.pairs for the selected genome pairs.
        """

        out_dir = self.output_dir
        mash = Mash(self.cpus)

        # create Mash sketch for type genomes unless a usable sketch was
        # supplied by the caller
        if not type_genome_sketch_file or not os.path.exists(
                type_genome_sketch_file):
            type_genome_list_file = os.path.join(out_dir,
                                                 'gtdb_type_genomes.lst')
            type_genome_sketch_file = os.path.join(out_dir,
                                                   'gtdb_type_genomes.msh')
            mash.sketch(type_gids, genome_files, type_genome_list_file,
                        type_genome_sketch_file)

        # create Mash sketch for non-type genomes
        nontype_gids = {gid for gid in genome_files if gid not in type_gids}

        nontype_genome_list_file = os.path.join(out_dir,
                                                'gtdb_nontype_genomes.lst')
        nontype_genome_sketch_file = os.path.join(out_dir,
                                                  'gtdb_nontype_genomes.msh')
        mash.sketch(nontype_gids, genome_files, nontype_genome_list_file,
                    nontype_genome_sketch_file)

        # get Mash distances using the distance threshold derived from the
        # minimum Mash ANI (a percentage)
        mash_dist_file = os.path.join(out_dir,
                                      'gtdb_type_vs_nontype_genomes.dst')
        mash.dist(
            float(100 - self.min_mash_ani) / 100, type_genome_sketch_file,
            nontype_genome_sketch_file, mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # get pairs above Mash threshold; each pair is added in both
        # directions
        mash_ani_pairs = []
        for qid in mash_ani:
            for rid in mash_ani[qid]:
                if mash_ani[qid][rid] >= self.min_mash_ani and qid != rid:
                    mash_ani_pairs.append((qid, rid))
                    mash_ani_pairs.append((rid, qid))

        self.logger.info(
            'Identified %d genome pairs with a Mash ANI >= %.1f%%.' %
            (len(mash_ani_pairs), self.min_mash_ani))

        # calculate ANI between pairs
        self.logger.info('Calculating ANI between %d genome pairs:' %
                         len(mash_ani_pairs))

        ani_af_file = os.path.join(out_dir, 'ani_af_type_vs_nontype.pkl')

        # *** developer switch (previously a hard-coded `if True:`); set to
        # False to reuse the pickled FastANI results of an earlier run
        recalculate = True
        if recalculate:
            ani_af = self.fastani.pairs(mash_ani_pairs, genome_files)

            # use a context manager so the file handle is closed (and the
            # data flushed) deterministically
            with open(ani_af_file, 'wb') as f:
                pickle.dump(ani_af, f)
        else:
            with open(ani_af_file, 'rb') as f:
                ani_af = pickle.load(f)

        return ani_af
Ejemplo n.º 5
0
    def _cluster_genomes(self,
                         genome_files,
                         rep_genomes,
                         type_gids,
                         passed_qc,
                         final_cluster_radius):
        """Cluster all non-type/representative genomes to selected type/representatives genomes.

        Mash is used as a pre-filter to select (genome, representative)
        pairs with a Mash ANI >= self.min_mash_ani. ANI/AF is then computed
        for these pairs with self.fastani.pairs, and each genome is assigned
        to the representative with the highest ANI (ties broken by AF) among
        those for which the ANI reaches the representative's radius and the
        AF reaches self.af_sp.

        Parameters
        ----------
        genome_files
            Handed to Mash and FastANI; presumably maps genome IDs to
            genomic files (verify against Mash.sketch).
        rep_genomes : set
            Genome IDs of selected representative genomes.
        type_gids : set
            Genome IDs of selected type genomes.
        passed_qc : set
            Genome IDs of all genomes to consider; those that are not
            representatives are clustered.
        final_cluster_radius
            Indexed by representative genome ID; the ani attribute of each
            entry is the ANI radius of that representative.

        Returns
        -------
        clusters : dict
            Representative genome ID -> list of ClusteredGenome(gid, ani, af).
        ani_af
            The result of self.fastani.pairs for the selected pairs.
        """

        all_reps = rep_genomes.union(type_gids)
        genomes_to_cluster = passed_qc - all_reps

        # calculate MASH distance between non-type/representative genomes and selected type/representatives genomes
        mash = Mash(self.cpus)

        mash_type_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.msh')
        type_rep_genome_list_file = os.path.join(self.output_dir, 'gtdb_rep_genomes.lst')
        mash.sketch(all_reps, genome_files, type_rep_genome_list_file, mash_type_rep_sketch_file)

        mash_none_rep_sketch_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.msh')
        type_none_rep_file = os.path.join(self.output_dir, 'gtdb_nonrep_genomes.lst')
        mash.sketch(genomes_to_cluster, genome_files, type_none_rep_file, mash_none_rep_sketch_file)

        # get Mash distances using the distance threshold derived from the
        # minimum Mash ANI (a percentage)
        mash_dist_file = os.path.join(self.output_dir, 'gtdb_rep_vs_nonrep_genomes.dst')
        mash.dist(float(100 - self.min_mash_ani)/100,
                  mash_type_rep_sketch_file,
                  mash_none_rep_sketch_file,
                  mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # every representative starts with an empty cluster
        clusters = {gid: [] for gid in all_reps}

        # select pairs above the Mash threshold, each added in both
        # directions. Only the Mash hits of a genome are walked rather than
        # probing every representative; this selects the same pairs as long
        # as self.min_mash_ani > 0.
        ani_pairs = []
        for gid in genomes_to_cluster:
            if gid not in mash_ani:
                continue

            for rep_gid, cur_mash_ani in mash_ani[gid].items():
                if rep_gid in clusters and cur_mash_ani >= self.min_mash_ani:
                    ani_pairs.append((gid, rep_gid))
                    ani_pairs.append((rep_gid, gid))

        self.logger.info('Calculating ANI between %d species clusters and %d unclustered genomes (%d pairs):' % (
            len(clusters),
            len(genomes_to_cluster),
            len(ani_pairs)))
        ani_af = self.fastani.pairs(ani_pairs, genome_files)

        # assign genomes to closest representatives
        # that is within the representatives ANI radius
        self.logger.info('Assigning genomes to closest representative.')
        num_to_cluster = len(genomes_to_cluster)
        for idx, cur_gid in enumerate(genomes_to_cluster):
            # best representative satisfying both the ANI radius and AF criteria
            closest_rep_gid = None
            closest_rep_ani = 0
            closest_rep_af = 0

            # best representative satisfying the AF criterion only; used
            # solely to report why a genome could not be assigned
            nearest_rep_gid = None
            nearest_rep_ani = 0
            nearest_rep_af = 0

            for rep_gid in clusters:
                ani, af = symmetric_ani(ani_af, cur_gid, rep_gid)
                meets_af = af >= self.af_sp

                if meets_af and (ani > nearest_rep_ani
                                 or (ani == nearest_rep_ani and af > nearest_rep_af)):
                    nearest_rep_gid = rep_gid
                    nearest_rep_ani = ani
                    nearest_rep_af = af

                if ani >= final_cluster_radius[rep_gid].ani and meets_af:
                    if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                        closest_rep_gid = rep_gid
                        closest_rep_ani = ani
                        closest_rep_af = af

            if closest_rep_gid is not None:
                clusters[closest_rep_gid].append(self.ClusteredGenome(gid=cur_gid,
                                                                      ani=closest_rep_ani,
                                                                      af=closest_rep_af))
            else:
                self.logger.warning('Failed to assign genome %s to representative.' % cur_gid)
                if nearest_rep_gid is not None:
                    # a representative met the AF criterion, but the genome
                    # falls outside its ANI radius
                    self.logger.warning(' ...closest_rep_gid = %s' % nearest_rep_gid)
                    self.logger.warning(' ...closest_rep_ani = %.2f' % nearest_rep_ani)
                    self.logger.warning(' ...closest_rep_af = %.2f' % nearest_rep_af)
                    self.logger.warning(' ...closest rep radius = %.2f' % final_cluster_radius[nearest_rep_gid].ani)
                else:
                    self.logger.warning(' ...no representative with an AF >%.2f identified.' % self.af_sp)

            # progress line, rewritten in place via the carriage return; pad
            # the formatted text (not the template) so the width is constant
            statusStr = ('-> Assigned %d of %d (%.2f%%) genomes.' % (idx+1,
                                                                     num_to_cluster,
                                                                     float(idx+1)*100/num_to_cluster)).ljust(86)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
        sys.stdout.write('\n')

        return clusters, ani_af
Ejemplo n.º 6
0
    def cluster_genomes(self,
                        cur_genomes,
                        de_novo_rep_gids,
                        named_rep_gids,
                        final_cluster_radius):
        """Cluster non-representative genomes to named and de novo representatives.

        Mash is used as a pre-filter to select (genome, representative)
        pairs with a Mash ANI >= self.min_mash_ani. ANI/AF is then computed
        for these pairs with self.fastani.pairs, and each non-representative
        genome is assigned to the representative with the highest ANI (ties
        broken by AF) among those for which the ANI reaches the
        representative's radius and the AF reaches self.af_sp, both within
        a small absolute tolerance. The results are pickled to
        'clusters.pkl' and 'ani_af_rep_vs_nonrep.de_novo.pkl' in
        self.output_dir.

        Parameters
        ----------
        cur_genomes
            Object whose genomes attribute is keyed by genome ID and whose
            genomic_files attribute is handed to Mash and FastANI.
        de_novo_rep_gids : set
            Genome IDs of de novo representatives.
        named_rep_gids : set
            Genome IDs of representatives of named species clusters.
        final_cluster_radius
            Indexed by representative genome ID; the ani attribute of each
            entry is the ANI radius of that representative.

        Returns
        -------
        clusters : dict
            Representative genome ID -> list of ClusteredGenome(gid, ani, af).
        ani_af
            The result of self.fastani.pairs for the selected pairs.
        """

        all_reps = de_novo_rep_gids.union(named_rep_gids)
        nonrep_gids = set(cur_genomes.genomes.keys()) - all_reps
        self.logger.info('Clustering {:,} genomes to {:,} named and de novo representatives.'.format(
            len(nonrep_gids), len(all_reps)))

        clusters_file = os.path.join(self.output_dir, 'clusters.pkl')
        ani_af_file = os.path.join(self.output_dir,
                                   'ani_af_rep_vs_nonrep.de_novo.pkl')

        # *** developer switch (previously a hard-coded `if True:`); set to
        # False to reuse the pickled results of an earlier run
        recalculate = True
        if not recalculate:
            self.logger.warning(
                'Using previously calculated results in: {}'.format('clusters.pkl'))
            with open(clusters_file, 'rb') as f:
                clusters = pickle.load(f)

            self.logger.warning('Using previously calculated results in: {}'.format(
                'ani_af_rep_vs_nonrep.de_novo.pkl'))
            with open(ani_af_file, 'rb') as f:
                ani_af = pickle.load(f)

            return clusters, ani_af

        # calculate MASH distance between non-representatives and representatives genomes
        mash = Mash(self.cpus)

        mash_rep_sketch_file = os.path.join(
            self.output_dir, 'gtdb_rep_genomes.msh')
        rep_genome_list_file = os.path.join(
            self.output_dir, 'gtdb_rep_genomes.lst')
        mash.sketch(all_reps, cur_genomes.genomic_files,
                    rep_genome_list_file, mash_rep_sketch_file)

        mash_none_rep_sketch_file = os.path.join(
            self.output_dir, 'gtdb_nonrep_genomes.msh')
        non_rep_file = os.path.join(
            self.output_dir, 'gtdb_nonrep_genomes.lst')
        mash.sketch(nonrep_gids, cur_genomes.genomic_files,
                    non_rep_file, mash_none_rep_sketch_file)

        # get Mash distances using the distance threshold derived from the
        # minimum Mash ANI (a percentage)
        mash_dist_file = os.path.join(
            self.output_dir, 'gtdb_rep_vs_nonrep_genomes.dst')
        mash.dist(float(100 - self.min_mash_ani)/100,
                  mash_rep_sketch_file,
                  mash_none_rep_sketch_file,
                  mash_dist_file)

        # read Mash distances
        mash_ani = mash.read_ani(mash_dist_file)

        # every representative starts with an empty cluster
        clusters = {gid: [] for gid in all_reps}

        # select pairs above the Mash threshold; each pair is added in both
        # directions
        mash_ani_pairs = []
        for qid in mash_ani:
            assert qid in nonrep_gids

            for rid in mash_ani[qid]:
                assert rid in all_reps

                if (mash_ani[qid][rid] >= self.min_mash_ani
                        and qid != rid):
                    mash_ani_pairs.append((qid, rid))
                    mash_ani_pairs.append((rid, qid))

        self.logger.info('Calculating ANI between {:,} species clusters and {:,} unclustered genomes ({:,} pairs):'.format(
            len(clusters),
            len(nonrep_gids),
            len(mash_ani_pairs)))
        ani_af = self.fastani.pairs(
            mash_ani_pairs, cur_genomes.genomic_files)

        # the isclose_abs_tol factor is used in order to avoid missing genomes due to
        # small rounding errors when comparing floating point values. In particular,
        # the ANI radius for named GTDB representatives is read from file so small
        # rounding errors could occur. This has only been observed once, but seems
        # like good practice to use isclose here.
        isclose_abs_tol = 1e-4

        # assign genomes to closest representatives
        # that is within the representatives ANI radius
        self.logger.info('Assigning genomes to closest representative.')
        num_nonreps = len(nonrep_gids)
        for idx, cur_gid in enumerate(nonrep_gids):
            # best representative satisfying both the ANI radius and AF criteria
            closest_rep_gid = None
            closest_rep_ani = 0
            closest_rep_af = 0

            # best representative satisfying the AF criterion only; used
            # solely to report why a genome could not be assigned
            nearest_rep_gid = None
            nearest_rep_ani = 0
            nearest_rep_af = 0

            for rep_gid in clusters:
                ani, af = FastANI.symmetric_ani(ani_af, cur_gid, rep_gid)
                meets_af = af >= self.af_sp - isclose_abs_tol

                if meets_af and (ani > nearest_rep_ani
                                 or (ani == nearest_rep_ani and af > nearest_rep_af)):
                    nearest_rep_gid = rep_gid
                    nearest_rep_ani = ani
                    nearest_rep_af = af

                if (ani >= final_cluster_radius[rep_gid].ani - isclose_abs_tol
                        and meets_af):
                    if ani > closest_rep_ani or (ani == closest_rep_ani and af > closest_rep_af):
                        closest_rep_gid = rep_gid
                        closest_rep_ani = ani
                        closest_rep_af = af

            if closest_rep_gid is not None:
                clusters[closest_rep_gid].append(ClusteredGenome(gid=cur_gid,
                                                                 ani=closest_rep_ani,
                                                                 af=closest_rep_af))
            else:
                self.logger.warning(
                    'Failed to assign genome {} to representative.'.format(cur_gid))
                if nearest_rep_gid is not None:
                    # a representative met the AF criterion, but the genome
                    # falls outside its ANI radius
                    self.logger.warning(
                        ' - closest_rep_gid = {}'.format(nearest_rep_gid))
                    self.logger.warning(
                        ' - closest_rep_ani = {:.2f}'.format(nearest_rep_ani))
                    self.logger.warning(
                        ' - closest_rep_af = {:.2f}'.format(nearest_rep_af))
                    self.logger.warning(
                        ' - closest rep radius = {:.2f}'.format(final_cluster_radius[nearest_rep_gid].ani))
                else:
                    self.logger.warning(
                        ' - no representative with an AF >{:.2f} identified.'.format(self.af_sp))

            # progress line, rewritten in place via the carriage return
            statusStr = '-> Assigned {:,} of {:,} ({:.2f}%) genomes.'.format(idx+1,
                                                                             num_nonreps,
                                                                             float(idx+1)*100/num_nonreps).ljust(86)
            sys.stdout.write('{}\r'.format(statusStr))
            sys.stdout.flush()
        sys.stdout.write('\n')

        # use context managers so the file handles are closed (and the data
        # flushed) deterministically
        with open(clusters_file, 'wb') as f:
            pickle.dump(clusters, f)
        with open(ani_af_file, 'wb') as f:
            pickle.dump(ani_af, f)

        return clusters, ani_af