Example #1
    def run(self, metadata_file,
                gtdb_user_genomes_file,
                gtdb_user_reps,
                ncbi_refseq_assembly_file,
                ncbi_genbank_assembly_file,
                gtdb_domain_report,
                qc_exception_file,
                species_exception_file,
                min_comp,
                max_cont,
                min_quality,
                sh_exception,
                min_perc_markers,
                max_contigs,
                min_N50,
                max_ambiguous,
                output_dir):
        """Quality check all potential GTDB genomes."""

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
        ncbi_species = binomial_species(ncbi_taxonomy)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))
        
        # determine User genomes to retain for consideration
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
        self.logger.info('Identified %d GTDB User genomes with GenBank accessions to retain for potential inclusion in GTDB.' % len(gtdb_user_to_genbank))
        
        user_genomes = 0
        for line in open(gtdb_user_reps):
            line_split = line.strip().split('\t')
            gid, taxonomy = line_split
            if gid not in gtdb_user_to_genbank:
                if 'd__Bacteria' in taxonomy:
                    self.logger.warning('Bacterial genome %s has no NCBI accession and is being skipped.' % gid)
                else:
                    gtdb_user_to_genbank[gid] = gid
                    user_genomes += 1
        self.logger.info('Identified %d archaeal GTDB User genomes without GenBank accessions to retain for potential inclusion in GTDB.' % user_genomes)

        # parse genomes flagged as exceptions from QC
        qc_exceptions = set()
        for line in open(qc_exception_file):
            qc_exceptions.add(line.split('\t')[0].strip())
        self.logger.info('Identified %d genomes flagged as exceptions from QC.' % len(qc_exceptions))
        
        # parse QC statistics for each genome
        self.logger.info('Parsing QC statistics for each genome.')
        quality_metadata = read_gtdb_metadata(metadata_file, ['checkm_completeness',
                                                                'checkm_contamination',
                                                                'checkm_strain_heterogeneity_100',
                                                                'contig_count',
                                                                'n50_contigs',
                                                                'ambiguous_bases',
                                                                'genome_size'])
                                                                
        marker_perc = parse_marker_percentages(gtdb_domain_report)
                                                                
        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # get type material designations for each genome
        self.logger.info('Reading type material designations for genomes from GTDB metadata file.')
        type_metadata = read_gtdb_metadata(metadata_file, ['ncbi_type_material_designation',
                                                                'gtdb_type_designation',
                                                                'gtdb_type_designation_sources'])
                                                                
        ncbi_tsp = ncbi_type_strain_of_species(type_metadata)
        gtdb_tsp = gtdb_type_strain_of_species(type_metadata)
        
        # QC all genomes
        self.logger.info('Validating genomes.')
        fout_retained = open(os.path.join(output_dir, 'qc_passed.tsv'), 'w')
        fout_failed = open(os.path.join(output_dir, 'qc_failed.tsv'), 'w')
        
        header = 'Accession\tNCBI species'
        header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'
        
        fout_retained.write(header + '\tNote\n')
        fout_failed.write(header)
        fout_failed.write('\tFailed completeness\tFailed contamination\tFailed quality')
        fout_failed.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n')

        num_retained = 0
        num_filtered = 0
        for gid in quality_metadata:
            if gid.startswith('U_') and gid not in gtdb_user_to_genbank:
                # skip user genomes not marked for retention
                continue

            failed_tests = defaultdict(int)
            passed_qc = pass_qc(quality_metadata[gid], 
                                    marker_perc[gid],
                                    min_comp,
                                    max_cont,
                                    min_quality,
                                    sh_exception,
                                    min_perc_markers,
                                    max_contigs,
                                    min_N50,
                                    max_ambiguous,
                                    failed_tests)

            if passed_qc or gid in qc_exceptions:
                num_retained += 1
                fout_retained.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
                fout_retained.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\t%s\n' % (
                                        quality_metadata[gid].checkm_completeness,
                                        quality_metadata[gid].checkm_contamination,
                                        quality_metadata[gid].checkm_completeness-5*quality_metadata[gid].checkm_contamination,
                                        ('%.2f' % quality_metadata[gid].checkm_strain_heterogeneity_100) if quality_metadata[gid].checkm_strain_heterogeneity_100 else '-',
                                        marker_perc[gid],
                                        quality_metadata[gid].contig_count,
                                        quality_metadata[gid].n50_contigs,
                                        quality_metadata[gid].ambiguous_bases,
                                        'Passed QC' if passed_qc else 'Flagged as exception'))
            else:
                num_filtered += 1 
                fout_failed.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
                fout_failed.write('\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                                        quality_metadata[gid].checkm_completeness,
                                        quality_metadata[gid].checkm_contamination,
                                        quality_metadata[gid].checkm_completeness-5*quality_metadata[gid].checkm_contamination,
                                        ('%.2f' % quality_metadata[gid].checkm_strain_heterogeneity_100) if quality_metadata[gid].checkm_strain_heterogeneity_100 else '-',
                                        marker_perc[gid],
                                        quality_metadata[gid].contig_count,
                                        quality_metadata[gid].n50_contigs,
                                        quality_metadata[gid].ambiguous_bases))
                fout_failed.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                                    failed_tests['comp'],
                                    failed_tests['cont'],
                                    failed_tests['qual'],
                                    failed_tests['marker_perc'],
                                    failed_tests['contig_count'],
                                    failed_tests['N50'],
                                    failed_tests['ambig']))
        fout_retained.close()
        fout_failed.close()
        
        self.logger.info('Retained %d genomes and filtered %d genomes.' % (num_retained, num_filtered))
                                                                
        # QC genomes in each named species
        self.logger.info('Performing QC of type genomes for each of the %d NCBI species.' % len(ncbi_species))
        
        fout_type_fail = open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w')
        fout_type_fail.write('Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)')
        fout_type_fail.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
        fout_type_fail.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n')
        
        fout_fail_sp = open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w')
        fout_fail_sp.write('Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)')
        fout_fail_sp.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
        fout_fail_sp.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
        fout_fail_sp.write('\tFailed completeness\tFailed contamination\tFailed quality')
        fout_fail_sp.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases')
        fout_fail_sp.write('\tNCBI exclude from RefSeq\n')
        
        fout_sp_lost = open(os.path.join(output_dir, 'species_lost.tsv'), 'w')
        fout_sp_lost.write('Species\tNo. genomes\tNo. type genomes')
        fout_sp_lost.write('\tFail completeness\tFail contamination\tFail quality\tFailed percent markers')
        fout_sp_lost.write('\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')
        
        lost_type = 0
        lost_sp = 0
        filtered_genomes = 0
        failed_tests_cumulative = defaultdict(int)
        for sp, gids in ncbi_species.items():
            type_pass = set()
            type_fail = set()
            other_pass = set()
            other_fail = set()
            
            failed_tests_gids = {}
            for gid in gids:
                failed_tests = defaultdict(int)
                passed_qc = pass_qc(quality_metadata[gid], 
                                    marker_perc[gid],
                                    min_comp,
                                    max_cont,
                                    min_quality,
                                    sh_exception,
                                    min_perc_markers,
                                    max_contigs,
                                    min_N50,
                                    max_ambiguous,
                                    failed_tests)
                                    
                failed_tests_gids[gid] = failed_tests

                if gid in gtdb_tsp or gid in ncbi_tsp:
                    if passed_qc:
                        type_pass.add(gid)
                    else:
                        type_fail.add(gid)
                        filtered_genomes += 1
                else:
                    if passed_qc:
                        other_pass.add(gid)
                    else:
                        other_fail.add(gid)
                        filtered_genomes += 1
                        
            # tally failed QC tests across all genomes in this species
            for gid_failed_tests in failed_tests_gids.values():
                for test, count in gid_failed_tests.items():
                    failed_tests_cumulative[test] += count

            if len(type_pass) >= 1:
                # great: one or more type genomes pass QC and will be selected as the type genome
                continue 
            
            if len(type_fail):
                # all potential type genomes for species failed QC so report these for manual inspection
                lost_type += 1
                for gid in type_fail:
                    fout_type_fail.write('%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n' % (
                                            sp,
                                            gid,
                                            '; '.join(gtdb_taxonomy[gid]),
                                            '; '.join(ncbi_taxonomy[gid]),
                                            type_metadata[gid].gtdb_type_designation_sources,
                                            type_metadata[gid].ncbi_type_material_designation,
                                            float(quality_metadata[gid].genome_size)/1e6,
                                            quality_metadata[gid].checkm_completeness,
                                            quality_metadata[gid].checkm_contamination,
                                            quality_metadata[gid].checkm_completeness-5*quality_metadata[gid].checkm_contamination,
                                            quality_metadata[gid].checkm_strain_heterogeneity_100,
                                            marker_perc[gid],
                                            quality_metadata[gid].contig_count,
                                            quality_metadata[gid].n50_contigs,
                                            quality_metadata[gid].ambiguous_bases,
                                            excluded_from_refseq_note[gid],
                                            len(other_pass) == 0))
                
            if len(other_pass) == 0:
                # no genomes for species pass QC so report loss of species
                lost_sp += 1
                fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
                fout_sp_lost.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                                    sum([failed_tests_gids[gid]['comp'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['cont'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['qual'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['marker_perc'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['contig_count'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['N50'] for gid in gids]),
                                    sum([failed_tests_gids[gid]['ambig'] for gid in gids])))
                                    
                for gid in type_fail.union(other_fail):
                    fout_fail_sp.write('%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d' % (
                                            sp,
                                            gid,
                                            '; '.join(gtdb_taxonomy[gid]),
                                            '; '.join(ncbi_taxonomy[gid]),
                                            gid in type_fail,
                                            float(quality_metadata[gid].genome_size)/1e6,
                                            quality_metadata[gid].checkm_completeness,
                                            quality_metadata[gid].checkm_contamination,
                                            quality_metadata[gid].checkm_completeness-5*quality_metadata[gid].checkm_contamination,
                                            quality_metadata[gid].checkm_strain_heterogeneity_100,
                                            marker_perc[gid],
                                            quality_metadata[gid].contig_count,
                                            quality_metadata[gid].n50_contigs,
                                            quality_metadata[gid].ambiguous_bases))
                    fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                                        failed_tests_gids[gid]['comp'],
                                        failed_tests_gids[gid]['cont'],
                                        failed_tests_gids[gid]['qual'],
                                        failed_tests_gids[gid]['marker_perc'],
                                        failed_tests_gids[gid]['contig_count'],
                                        failed_tests_gids[gid]['N50'],
                                        failed_tests_gids[gid]['ambig']))
                    fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

        fout_type_fail.close()
        fout_fail_sp.close()
        fout_sp_lost.close()
        
        self.logger.info('Genomes filtered for each criterion:')
        for test in sorted(failed_tests_cumulative):
            self.logger.info('%s: %d' % (test, failed_tests_cumulative[test]))

        self.logger.info('Filtered %d genomes assigned to NCBI species.' % filtered_genomes)
        self.logger.info('Identified %d species with type genomes failing QC and %d total species failing QC.' % (lost_type, lost_sp))
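The QC logic above delegates the per-genome checks to a pass_qc helper that is not shown in this example. A minimal sketch of what such a helper might look like is given below; it is inferred only from how pass_qc is called and from the failed-test keys used when writing qc_failed.tsv ('comp', 'cont', 'qual', 'marker_perc', 'contig_count', 'N50', 'ambig'). The handling of the strain-heterogeneity exception (sh_exception) is omitted, so this is an illustrative approximation rather than the actual implementation.

def pass_qc_sketch(stats, marker_perc, min_comp, max_cont, min_quality,
                   sh_exception, min_perc_markers, max_contigs, min_N50,
                   max_ambiguous, failed_tests):
    """Hypothetical stand-in for the pass_qc helper called above.

    'stats' is assumed to expose the attributes read from the GTDB metadata
    file (checkm_completeness, checkm_contamination, contig_count,
    n50_contigs, ambiguous_bases). Failed criteria are tallied in the
    supplied failed_tests dictionary using the same keys as qc_failed.tsv.
    NOTE: the real helper also appears to relax checks when CheckM strain
    heterogeneity exceeds sh_exception; that handling is omitted here.
    """
    passed = True

    if stats.checkm_completeness < min_comp:
        failed_tests['comp'] += 1
        passed = False

    if stats.checkm_contamination > max_cont:
        failed_tests['cont'] += 1
        passed = False

    # quality = completeness - 5 * contamination, as used elsewhere in this example
    if stats.checkm_completeness - 5 * stats.checkm_contamination < min_quality:
        failed_tests['qual'] += 1
        passed = False

    if marker_perc < min_perc_markers:
        failed_tests['marker_perc'] += 1
        passed = False

    if stats.contig_count > max_contigs:
        failed_tests['contig_count'] += 1
        passed = False

    if stats.n50_contigs < min_N50:
        failed_tests['N50'] += 1
        passed = False

    if stats.ambiguous_bases > max_ambiguous:
        failed_tests['ambig'] += 1
        passed = False

    return passed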
Example #2
    def run(self, qc_file, gtdb_metadata_file, gtdb_final_clusters,
            output_dir):
        """Quality check all potential GTDB genomes."""

        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info(
            'Reading NCBI and GTDB taxonomy from GTDB metadata file.')
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(gtdb_metadata_file)
        prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes.' %
                         len(ncbi_taxonomy))
        self.logger.info('Read GTDB taxonomy for %d genomes.' %
                         len(prev_gtdb_taxonomy))

        # get GTDB metadata
        type_metadata = read_gtdb_metadata(gtdb_metadata_file, [
            'gtdb_type_designation', 'gtdb_type_designation_sources',
            'gtdb_type_species_of_genus'
        ])

        quality_metadata = read_quality_metadata(gtdb_metadata_file)

        # read species clusters
        sp_clusters, species = read_clusters(gtdb_final_clusters)
        self.logger.info('Read %d species clusters.' % len(sp_clusters))

        # sanity check species clusters all defined by genomes passing QC
        for gid in sp_clusters:
            if gid not in passed_qc:
                self.logger.error(
                    'Genome %s defines a species cluster, but fails QC.' % gid)
                sys.exit(-1)

        # modify GTDB taxonomy to reflect new species clustering and report incongruencies
        self.logger.info(
            'Identifying species with incongruent specific names.')
        self._incongruent_specific_names(species, ncbi_taxonomy,
                                         prev_gtdb_taxonomy, type_metadata,
                                         output_dir)

        self._incongruent_genus_names(species, ncbi_taxonomy,
                                      prev_gtdb_taxonomy, type_metadata,
                                      output_dir)

        # get GIDs for canonical and validation trees
        fout_bac_can_gtdb = open(
            os.path.join(output_dir, 'bac_can_taxonomy.tsv'), 'w')
        fout_bac_val_gtdb = open(
            os.path.join(output_dir, 'bac_val_taxonomy.tsv'), 'w')
        fout_ar_can_gtdb = open(
            os.path.join(output_dir, 'ar_can_taxonomy.tsv'), 'w')
        fout_ar_val_gtdb = open(
            os.path.join(output_dir, 'ar_val_taxonomy.tsv'), 'w')

        fout_bac_val = open(
            os.path.join(output_dir, 'gids_bac_validation.lst'), 'w')
        fout_ar_val = open(os.path.join(output_dir, 'gids_ar_validation.lst'),
                           'w')
        fout_bac_can = open(os.path.join(output_dir, 'gids_bac_canonical.lst'),
                            'w')
        fout_ar_can = open(os.path.join(output_dir, 'gids_ar_canonical.lst'),
                           'w')
        fout_bac_val.write('#Accession\tSpecies\tNote\n')
        fout_ar_val.write('#Accession\tSpecies\tNote\n')
        fout_bac_can.write('#Accession\tSpecies\tNote\n')
        fout_ar_can.write('#Accession\tSpecies\tNote\n')

        for rid in sp_clusters:
            domain = prev_gtdb_taxonomy[rid][0]
            if domain == 'd__Bacteria':
                fout_val = fout_bac_val
                fout_can = fout_bac_can

                fout_can_gtdb = fout_bac_can_gtdb
                fout_val_gtdb = fout_bac_val_gtdb
            elif domain == 'd__Archaea':
                fout_val = fout_ar_val
                fout_can = fout_ar_can
                fout_can_gtdb = fout_ar_can_gtdb
                fout_val_gtdb = fout_ar_val_gtdb
            else:
                self.logger.error('Genome %s has no GTDB domain assignment.' %
                                  rid)
                sys.exit(-1)

            # substitute proposed species name into GTDB taxonomy
            sp = species[rid]
            canonical_sp = parse_canonical_sp(sp)
            taxa = prev_gtdb_taxonomy[rid][0:6] + [canonical_sp]
            new_gtdb_str = '; '.join(taxa)
            fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
            fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))

            fout_val.write('%s\t%s\t%s\n' %
                           (rid, sp, 'GTDB type or representative genome'))
            fout_can.write('%s\t%s\t%s\n' %
                           (rid, sp, 'GTDB type or representative genome'))

            cluster_gids = set(sp_clusters[rid])
            for gid in cluster_gids:
                if gid not in passed_qc:
                    self.logger.error(
                        'Genome %s is in a species cluster, but fails QC.' %
                        gid)
                    sys.exit(-1)

            if len(cluster_gids) > 0:
                # select highest-quality genome
                q = quality_score(cluster_gids, quality_metadata)
                gid = max(q.items(), key=operator.itemgetter(1))[0]

                fout_val.write(
                    '%s\t%s\t%s\n' %
                    (gid, sp,
                     'selected highest-quality genome (Q=%.2f)' % q[gid]))
                fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))

        fout_bac_val.close()
        fout_ar_val.close()
        fout_bac_can.close()
        fout_ar_can.close()

        fout_bac_can_gtdb.close()
        fout_bac_val_gtdb.close()
        fout_ar_can_gtdb.close()
        fout_ar_val_gtdb.close()
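Both this example and the next call parse_canonical_sp to strip the proposed species name down to its canonical form before substituting it into the GTDB taxonomy string. The helper itself is not shown; the sketch below assumes the canonical name is obtained by dropping GTDB placeholder suffixes such as '_A' from the genus and specific epithet, which may not match every detail of the real implementation.

import re

def parse_canonical_sp_sketch(sp):
    """Hypothetical version of the parse_canonical_sp helper used above.

    Assumes canonical species names are formed by removing placeholder
    suffixes, e.g. 's__Bacillus subtilis_A' -> 's__Bacillus subtilis'.
    """
    name = sp[3:] if sp.startswith('s__') else sp
    name = name.replace('Candidatus ', '')
    tokens = [re.sub(r'_[A-Z]+$', '', t) for t in name.split()]
    return 's__' + ' '.join(tokens)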
Example #3
    def run(self, 
                qc_file,
                gtdb_metadata_file,
                gtdb_final_clusters,
                species_exception_file,
                output_dir):
        """Quality check all potential GTDB genomes."""
        
        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))
        
        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI and GTDB taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(gtdb_metadata_file, species_exception_file)
        prev_gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' % len(prev_gtdb_taxonomy))
        
        # get GTDB metadata
        type_metadata = read_gtdb_metadata(gtdb_metadata_file, ['gtdb_type_designation',
                                                                    'gtdb_type_designation_sources',
                                                                    'gtdb_type_species_of_genus'])
                                                                    
        quality_metadata = read_quality_metadata(gtdb_metadata_file)

        # read species clusters
        sp_clusters, species, _rep_radius = read_clusters(gtdb_final_clusters)
        self.logger.info('Read %d species clusters.' % len(sp_clusters))
        
        # sanity check species clusters all defined by genomes passing QC
        for gid in sp_clusters:
            if gid not in passed_qc:
                self.logger.error('Genome %s defines a species cluster, but fails QC.' % gid)
                sys.exit(-1)
                
        # modify GTDB taxonomy to reflect new species clustering and report incongruencies
        self.logger.info('Identifying species with incongruent specific names.')
        self._incongruent_specific_names(species, 
                                            ncbi_taxonomy,
                                            prev_gtdb_taxonomy, 
                                            type_metadata, 
                                            output_dir)
        
        self._incongruent_genus_names(species, 
                                            ncbi_taxonomy,
                                            prev_gtdb_taxonomy, 
                                            type_metadata, 
                                            output_dir)
                                            

        # get GIDs for canonical and validation trees
        fout_bac_can_gtdb = open(os.path.join(output_dir, 'bac_can_taxonomy.tsv'), 'w')
        fout_bac_val_gtdb = open(os.path.join(output_dir, 'bac_val_taxonomy.tsv'), 'w')
        fout_ar_can_gtdb = open(os.path.join(output_dir, 'ar_can_taxonomy.tsv'), 'w')
        fout_ar_val_gtdb = open(os.path.join(output_dir, 'ar_val_taxonomy.tsv'), 'w')
            
        fout_bac_val = open(os.path.join(output_dir, 'gids_bac_validation.lst'), 'w')
        fout_ar_val = open(os.path.join(output_dir, 'gids_ar_validation.lst'), 'w')
        fout_bac_can = open(os.path.join(output_dir, 'gids_bac_canonical.lst'), 'w')
        fout_ar_can = open(os.path.join(output_dir, 'gids_ar_canonical.lst'), 'w')
        fout_bac_val.write('#Accession\tSpecies\tNote\n')
        fout_ar_val.write('#Accession\tSpecies\tNote\n')
        fout_bac_can.write('#Accession\tSpecies\tNote\n')
        fout_ar_can.write('#Accession\tSpecies\tNote\n')
   
        for rid in sp_clusters:
            domain = prev_gtdb_taxonomy[rid][0]
            if domain == 'd__Bacteria':
                fout_val = fout_bac_val
                fout_can = fout_bac_can
                
                fout_can_gtdb = fout_bac_can_gtdb
                fout_val_gtdb = fout_bac_val_gtdb
            elif domain == 'd__Archaea':
                fout_val = fout_ar_val
                fout_can = fout_ar_can
                fout_can_gtdb = fout_ar_can_gtdb
                fout_val_gtdb = fout_ar_val_gtdb
            else:
                self.logger.error('Genome %s has no GTDB domain assignment.' % rid)
                sys.exit(-1)
            
            # substitute proposed species name into GTDB taxonomy
            sp = species[rid]
            canonical_sp = parse_canonical_sp(sp)
            taxa = prev_gtdb_taxonomy[rid][0:6] + [canonical_sp]
            new_gtdb_str = '; '.join(taxa)
            fout_can_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
            fout_val_gtdb.write('%s\t%s\n' % (rid, new_gtdb_str))
            
            fout_val.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))
            fout_can.write('%s\t%s\t%s\n' % (rid, sp, 'GTDB type or representative genome'))
            
            cluster_gids = set(sp_clusters[rid])
            for gid in cluster_gids:
                if gid not in passed_qc:
                    self.logger.error('Genome %s is in a species cluster, but fails QC.' % gid)
                    sys.exit(-1)
                    
            if len(cluster_gids) > 0:
                # select highest-quality genome
                q = quality_score(cluster_gids, quality_metadata)
                gid = max(q.items(), key=operator.itemgetter(1))[0]
                
                taxa = prev_gtdb_taxonomy[gid][0:6] + [canonical_sp]
                new_gtdb_str = '; '.join(taxa)
    
                fout_val.write('%s\t%s\t%s\n' % (gid, sp, 'selected highest-quality genome (Q=%.2f)' % q[gid]))
                fout_val_gtdb.write('%s\t%s\n' % (gid, new_gtdb_str))
                    
        fout_bac_val.close()
        fout_ar_val.close()
        fout_bac_can.close()
        fout_ar_can.close()
        
        fout_bac_can_gtdb.close()
        fout_bac_val_gtdb.close()
        fout_ar_can_gtdb.close()
        fout_ar_val_gtdb.close()
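The quality_score helper used above to pick the highest-quality genome within each cluster is not included in these examples. Given that genome quality is computed elsewhere in this collection as completeness minus five times contamination, a minimal sketch consistent with that formula could be the following (the real helper may include additional terms):

def quality_score_sketch(gids, quality_metadata):
    """Hypothetical quality_score helper: map each genome ID to an
    estimated quality of completeness - 5 * contamination."""
    q = {}
    for gid in gids:
        stats = quality_metadata[gid]
        q[gid] = stats.checkm_completeness - 5 * stats.checkm_contamination
    return q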
        
    def representatives(self,
                        species_derep_file,
                        metadata_file,
                        prev_rep_file,
                        mash_pairwise_file,
                        trusted_user_file,
                        min_rep_comp,
                        max_rep_cont,
                        min_quality,
                        max_contigs,
                        min_N50,
                        max_ambiguous,
                        max_gap_length,
                        output_file):
        """Identify additional representatives.
        
        Additional representatives are selected in a greedy fashion by
        ordering genomes according to database source and estimated
        genome quality. A slight quality boost is given to genomes that
        were previously selected as representatives in order to retain
        stability between releases. A genome is only added as a new
        representative if it cannot be clustered with an existing
        representative. Clustering is based on a conservative Mash
        distance threshold that reflects the 95% ANI species criterion.

        Parameters
        ----------
        species_derep_file : str
            File listing selected representatives from named species.
        metadata_file : str
            Metadata, including CheckM estimates, for all genomes.
        prev_rep_file : str
            File indicating previous representatives to favour during selection.
        mash_pairwise_file : str
            File with pairwise Mash distances.
        trusted_user_file : str
            File listing trusted User genomes that should be treated as if they are in GenBank.
        min_rep_comp : float [0, 100]
            Minimum completeness for a genome to be a representative.
        max_rep_cont : float [0, 100]
            Maximum contamination for a genome to be a representative.
        min_quality : float [0, 100]
            Minimum quality (comp - 5*cont) for a genome to be a representative.
        max_contigs : int
            Maximum number of contigs for a genome to be a representative.
        min_N50 : int
            Minimum N50 of scaffolds for a genome to be a representative.
        max_ambiguous : int
            Maximum number of ambiguous bases within contigs for a genome to be a representative.
        max_gap_length : int
            Maximum number of ambiguous bases between contigs for a genome to be a representative.
        output_file : str
            Output file containing all genomes identified as representatives.
        """
        
        # read previous representatives and trusted user genomes
        prev_gtdb_reps = self._read_genome_list(prev_rep_file)
        trusted_user_genomes = self._read_genome_list(trusted_user_file)
        
        self.logger.info('Identified %d trusted User genomes.' % len(trusted_user_genomes))
        self.logger.info('Identified %d previous GTDB representatives.' % len(prev_gtdb_reps))

        # get genome and assembly quality
        genome_stats = self._genome_stats(metadata_file)

        # read initial representatives
        init_rep_genomes = set()
        for line in open(species_derep_file):
            if line[0] == '#':
                continue

            genome_id = line.strip().split('\t')[0]
            init_rep_genomes.add(genome_id)

        self.logger.info('Identified %d initial representatives.' % len(init_rep_genomes))

        # remove existing representative genomes and genomes
        # of insufficient quality to be a representative
        genome_quality = {}
        potential_reps = set()
        for genome_id, stats in genome_stats.items():
            if genome_id in init_rep_genomes:
                continue
                
            if genome_id.startswith('U_') and genome_id not in trusted_user_genomes:
                continue
                
            if (stats.checkm_completeness >= min_rep_comp 
                and stats.checkm_contamination <= max_rep_cont 
                and (stats.checkm_completeness - 5*stats.checkm_contamination) >= min_quality
                and stats.contig_count <= max_contigs
                and stats.n50_scaffolds >= min_N50
                and stats.ambiguous_bases <= max_ambiguous
                and stats.total_gap_length <= max_gap_length):
                    potential_reps.add(genome_id)
                    genome_quality[genome_id] = stats.checkm_completeness - 5*stats.checkm_contamination

        # perform greedy identification of new representatives
        ordered_genomes = self._order_genomes(potential_reps, 
                                                genome_quality, 
                                                trusted_user_genomes, 
                                                prev_gtdb_reps)
        self.logger.info('Comparing %d genomes to %d initial representatives.' % (len(ordered_genomes),
                                                                                    len(init_rep_genomes)))
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
        representatives = self._greedy_representatives(init_rep_genomes,
                                                        ordered_genomes,
                                                        gtdb_taxonomy,
                                                        ncbi_taxonomy,
                                                        mash_pairwise_file)

        self.logger.info('Identified %d representatives.' % len(representatives))

        # read metadata for genomes
        (refseq_genomes, 
            complete_genomes, 
            representative_genomes) = ncbi.read_refseq_metadata(metadata_file)
        ncbi_type_strains = read_gtdb_ncbi_type_strain(metadata_file)
        
            
        # write out information for representative genomes
        fout = open(output_file, 'w')

        fout.write('# Selection criteria:\n')
        fout.write('# Species dereplication file: %s\n' % species_derep_file)
        fout.write('# Previous representative file: %s\n' % prev_rep_file)
        fout.write('# Trusted user genomes file: %s\n' % trusted_user_file)
        fout.write('# Genome quality metadata file: %s\n' % str(metadata_file))
        fout.write('# Min. representative completeness: %.2f\n' % min_rep_comp)
        fout.write('# Max. representative contamination: %.2f\n' % max_rep_cont)
        fout.write('# Mash strict threshold: %.3f\n' % self.mash_strict_threshold)
        fout.write('# Mash GTDB species threshold: %.3f\n' % self.mash_gtdb_species_threshold)
        fout.write('# Mash NCBI species threshold: %.3f\n' % self.mash_ncbi_species_threshold)
        fout.write('#\n')

        fout.write('# Genome Id\tGTDB Taxonomy\tNCBI Taxonomy\tNCBI Organism Name\tNCBI Type strain\tComplete\tRepresentative\n')
        for genome_id in representatives:
            representative = 'yes' if genome_id in representative_genomes else 'no'
            complete = 'yes' if genome_id in complete_genomes else 'no'
            ts = 'yes' if genome_id in ncbi_type_strains else 'no'
            gtdb_taxa_str = ';'.join(gtdb_taxonomy.get(genome_id, Taxonomy.rank_prefixes))
            ncbi_taxa_str = ';'.join(ncbi_taxonomy.get(genome_id, Taxonomy.rank_prefixes))

            fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (genome_id,
                                                            gtdb_taxa_str,
                                                            ncbi_taxa_str,
                                                            genome_stats[genome_id].ncbi_organism_name,
                                                            ts,
                                                            complete,
                                                            representative))

        fout.close()
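The greedy selection above depends on self._order_genomes, which is not shown. Based only on the docstring (genomes ordered by database source and estimated quality, with a boost for previous GTDB representatives), one plausible sketch is given below; the boost value and the exact source ordering are assumptions.

def order_genomes_sketch(potential_reps, genome_quality,
                         trusted_user_genomes, prev_gtdb_reps,
                         prev_rep_boost=5.0):
    """Hypothetical ordering of candidate representatives: NCBI genomes
    before trusted User genomes, previous GTDB representatives boosted
    (prev_rep_boost is an assumed value), then descending quality."""

    def sort_key(gid):
        source_rank = 1 if gid in trusted_user_genomes else 0
        q = genome_quality.get(gid, 0.0)
        if gid in prev_gtdb_reps:
            q += prev_rep_boost  # favour stability between releases
        return (source_rank, -q)

    return sorted(potential_reps, key=sort_key)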
    def cluster(self,
                rep_genome_file,
                metadata_file,
                mash_pairwise_file,
                output_file):
        """Cluster genomes based on Mash distances.
        
        Genomes are assigned to their closest representative
        that is below the species cutoff. However, genomes
        assigned to different GTDB species are never clustered
        together. This allows species assignments to be refined
        using alternative methods and ensures these refinements
        are respected.
        
        Parameters
        ----------
        rep_genome_file : str
          File indicating representative genomes.
        metadata_file : str
          Metadata, including CheckM estimates, for all genomes.
        mash_pairwise_file : str
          File with pairwise Mash distances.
        output_file : str
          Output file indicating genome clusters.
        """
        
        # read representative genomes
        representatives = self._read_genome_list(rep_genome_file)
        self.logger.info('Identified %d representative genomes.' % len(representatives))
        
        # get genome and assembly quality
        genome_stats = self._genome_stats(metadata_file)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
        
        # read Mash distance between genomes
        self.logger.info('Reading pairwise Mash distances between genomes.')
        mash_dists = self._read_mash_dists(mash_pairwise_file)
        
        # cluster genomes
        self.logger.info('Clustering genomes.')
        clusters = {}
        for rep_id in representatives:
            clusters[rep_id] = []
        
        remaining_genomes = set(genome_stats) - representatives
        for i, genome_id in enumerate(remaining_genomes):
            if i % 100 == 0:
                sys.stdout.write('==> Processed %d of %d genomes.\r' % (i+1, len(remaining_genomes)))
                sys.stdout.flush()
                
            query_dists = mash_dists[genome_id]
            
            query_gtdb_sp = gtdb_taxonomy[genome_id][6]
            query_ncbi_sp = ncbi_taxonomy[genome_id][6]
            
            assigned_rep = None
            min_d = 1.0
            for ref_id in representatives:
                d = query_dists.get(ref_id, 1.0)
                if d >= min_d:
                    continue
                
                ref_gtdb_sp = gtdb_taxonomy[ref_id][6]
                
                if (d <= self.mash_strict_threshold
                        and (query_gtdb_sp == 's__'
                             or ref_gtdb_sp == query_gtdb_sp)):
                    # genomes meet the strict threshold for clustering
                    # and do not conflict in their assigned GTDB species
                    assigned_rep = ref_id
                    min_d = d
                    continue

                if ref_gtdb_sp == 's__' or ref_gtdb_sp != query_gtdb_sp:
                    continue

                if d <= self.mash_gtdb_species_threshold:
                    # genomes are from the same named GTDB species and
                    # meet the GTDB species threshold for clustering
                    assigned_rep = ref_id
                    min_d = d
                elif (d <= self.mash_ncbi_species_threshold
                        and self._canonical_species_name(ref_gtdb_sp) == query_ncbi_sp):
                    # representative's canonical species name matches the query's
                    # NCBI species and the NCBI species threshold for clustering is met
                    assigned_rep = ref_id
                    min_d = d
            
            if assigned_rep:
                clusters[assigned_rep].append(genome_id)
                
        sys.stdout.write('==> Processed %d of %d genomes.\r' % (len(remaining_genomes), 
                                                                len(remaining_genomes)))
        sys.stdout.flush()
        sys.stdout.write('\n')
                
        # write out clusters
        fout = open(output_file, 'w')
        clustered_genomes = 0
        for c, cluster_rep in enumerate(sorted(clusters, key=lambda x: len(clusters[x]), reverse=True)):   
            cluster_str = 'cluster_%d' % (c + 1)
            cluster = clusters[cluster_rep]
            clustered_genomes += len(cluster)
            fout.write('%s\t%s\t%d\t%s\n' % (cluster_rep, cluster_str, len(cluster) + 1, ','.join(cluster)))

        fout.close()
        
        self.logger.info('Assigned %d genomes to representatives.' % clustered_genomes)
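The cluster() method reads the pairwise distances through self._read_mash_dists, which is not part of this example. Assuming the file follows the standard tab-separated 'mash dist' output (query, reference, distance, p-value, shared hashes), a minimal reader might look like this:

from collections import defaultdict

def read_mash_dists_sketch(mash_pairwise_file):
    """Hypothetical reader for the pairwise Mash distance file, returning
    dists[query_id][reference_id] = Mash distance."""
    dists = defaultdict(dict)
    with open(mash_pairwise_file) as f:
        for line in f:
            tokens = line.strip().split('\t')
            if len(tokens) < 3:
                continue
            dists[tokens[0]][tokens[1]] = float(tokens[2])
    return dists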
    def run(self, qc_file,
                metadata_file,
                gtdb_user_genomes_file,
                genome_path_file,
                type_genome_cluster_file,
                type_genome_synonym_file,
                ncbi_refseq_assembly_file,
                ncbi_genbank_assembly_file,
                ani_af_nontype_vs_type,
                species_exception_file,
                rnd_type_genome):
        """Infer de novo species clusters and type genomes for remaining genomes."""
        
        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))
        
        # get NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(metadata_file, species_exception_file)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes with %d manually defined updates.' % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))
        
        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # get path to genome FASTA files
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = parse_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_files))
        for gid in set(genome_files):
            if gid not in passed_qc:
                genome_files.pop(gid)
        self.logger.info('Considering %d genomes as potential representatives after removing unwanted User genomes.' % len(genome_files))
        assert(len(genome_files) == len(passed_qc))
        
        # determine type genomes and genomes clustered to type genomes
        type_species, species_type_gid, type_gids, type_clustered_gids, type_radius = self._parse_type_clusters(type_genome_cluster_file)
        assert(len(type_species) == len(type_gids))
        self.logger.info('Identified %d type genomes.' % len(type_gids))
        self.logger.info('Identified %d clustered genomes.' % len(type_clustered_gids))
        
        # parse quality statistics for all genomes
        self.logger.info('Parsing quality statistics for all genomes.')
        quality_metadata = read_quality_metadata(metadata_file)
        
        # calculate genome quality score
        self.logger.info('Calculating genome quality score.')
        genome_quality = quality_score(quality_metadata.keys(), quality_metadata)

        # determine genomes left to be clustered
        unclustered_gids = passed_qc - type_gids - type_clustered_gids
        self.logger.info('Identified %d unclustered genomes passing QC.' % len(unclustered_gids))

        # establish closest type genome for each unclustered genome
        self.logger.info('Determining ANI circumscription for %d unclustered genomes.' % len(unclustered_gids))
        nontype_radius = self._nontype_radius(unclustered_gids, type_gids, ani_af_nontype_vs_type)
        
        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

        # select species representative genomes in a greedy fashion based on genome quality
        rep_genomes = self._selected_rep_genomes(genome_files,
                                                    nontype_radius, 
                                                    unclustered_gids, 
                                                    mash_anis,
                                                    quality_metadata,
                                                    rnd_type_genome)
        
        # cluster all non-type/non-rep genomes to species type/rep genomes
        final_cluster_radius = type_radius.copy()
        final_cluster_radius.update(nontype_radius)
        
        final_clusters, ani_af = self._cluster_genomes(genome_files,
                                                        rep_genomes,
                                                        type_gids, 
                                                        passed_qc,
                                                        final_cluster_radius)
        rep_clusters = {}
        for gid in rep_genomes:
            rep_clusters[gid] = final_clusters[gid]

        # get list of synonyms in order to restrict usage of species names
        synonyms = self._parse_synonyms(type_genome_synonym_file)
        self.logger.info('Identified %d synonyms.' % len(synonyms))
        
        # determine User genomes with NCBI accession number that may form species names
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file, metadata_file)
        self.logger.info('Identified %d GTDB User genomes with NCBI accessions.' % len(gtdb_user_to_genbank))
        
        # assign species names to de novo species clusters
        names_in_use = synonyms.union(type_species)
        self.logger.info('Identified %d species names already in use.' % len(names_in_use))
        self.logger.info('Assigning species name to each de novo species cluster.')
        cluster_sp_names = self._assign_species_names(rep_clusters, 
                                                        names_in_use, 
                                                        gtdb_taxonomy,
                                                        gtdb_user_to_genbank)
        
        # write out file with details about selected representative genomes
        self._write_rep_info(rep_clusters, 
                                cluster_sp_names,
                                quality_metadata,
                                genome_quality,
                                excluded_from_refseq_note,
                                ani_af,
                                os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv'))
                                             
        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(final_cluster_radius) - set(final_clusters):
            del final_cluster_radius[gid]
            
        all_species = cluster_sp_names
        all_species.update(species_type_gid)

        self.logger.info('Writing %d species clusters to file.' % len(all_species))
        self.logger.info('Writing ANI radius information for %d clusters to file.' % len(final_cluster_radius))
        
        write_clusters(final_clusters, 
                        final_cluster_radius, 
                        all_species, 
                        os.path.join(self.output_dir, 'gtdb_clusters_final.tsv'))

        write_type_radius(final_cluster_radius, 
                            all_species, 
                            os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv'))
        
    def dereplicate(self, metadata_file,
                    prev_rep_file,
                    exceptions_file,
                    trusted_user_file,
                    max_species,
                    min_rep_comp,
                    max_rep_cont,
                    min_quality,
                    max_contigs,
                    min_N50,
                    max_ambiguous,
                    max_gap_length,
                    strict_filtering,
                    output_file):
        """Select representative genomes from named species.
        
        Each named species is dereplicated to a fixed number of
        representatives, taking care to retain all genomes marked as a
        'reference' or 'representative' at NCBI. Preference
        is then given to genomes marked as type strains at
        NCBI. Finally, genomes are selected based on estimated quality.

        Parameters
        ----------
        max_species : int
            Maximum number of genomes of the same species to retain.
        prev_rep_file : str
            File indicating previous representatives to favour during selection.
        trusted_user_file : str
            File containing list of genomes to retain regardless of filtering criteria.
        metadata_file : str
            Metadata, including CheckM estimates, for all genomes.
        min_rep_comp : float [0, 100]
            Minimum completeness for a genome to be a representative.
        max_rep_cont : float [0, 100]
            Maximum contamination for a genome to be a representative.
        min_quality : float [0, 100]
            Minimum genome quality (comp-5*cont) for a genome to be a representative.
        max_contigs : int
            Maximum number of contigs for a genome to be a representative.
        min_N50 : int
            Minimum N50 of scaffolds for a genome to be a representative.
        max_ambiguous : int
            Maximum number of ambiguous bases for a genome to be a representative.
        max_gap_length : int
            Maximum number of ambiguous bases between contigs for a genome to be a representative.
        strict_filtering : boolean
            If True apply filtering to all genomes, otherwise apply lenient
            filtering to genomes where the chromosome and plasmids are reported 
            as complete.
        output_file : str
            Output file to contain list of dereplicated genomes.
        """
        
        # identify previous reps, genomes to treat as exceptions, 
        # and user genomes to process
        prev_gtdb_reps = self._read_genome_list(prev_rep_file)
        exception_genomes = self._read_genome_list(exceptions_file)
        trusted_user_genomes = self._read_genome_list(trusted_user_file)

        (refseq_genomes, 
            complete_genomes, 
            representative_genomes) = ncbi.read_refseq_metadata(metadata_file)
        self.logger.info('Identified %d RefSeq genomes.' % len(refseq_genomes))
        self.logger.info('Identified %d representative or reference genomes.' % len(representative_genomes))
        self.logger.info('Identified %d complete genomes.' % len(complete_genomes))
        self.logger.info('Identified %d genomes in exception list.' % len(exception_genomes))
        self.logger.info('Identified %d trusted user genomes.' % len(trusted_user_genomes))
        self.logger.info('Identified %d previous GTDB representatives.' % len(prev_gtdb_reps))
        
        # get genome and assembly quality
        genome_stats = self._genome_stats(metadata_file)
        
        # get genomes in each named GTDB species
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
        
        species = {}
        species_index = Taxonomy.rank_index['s__']
        for genome_id, taxa in gtdb_taxonomy.items():
            sp = taxa[species_index]
            if sp != 's__':
                species[genome_id] = sp

        self.logger.info('Identified %d genomes with a GTDB species name.' % len(species))

        # identify genomes passing filtering criteria
        filtered_reps_file = output_file + '.filtered_reps'
        fout = open(filtered_reps_file, 'w')
        fout.write('Genome ID\tCompleteness\tContamination')
        fout.write('\tContig Count\tN50\tAmbiguous Bases\tTotal Gap Length')
        fout.write('\tNote\tNCBI Organism Name\n')

        lpsn_type_strains = defaultdict(set)
        genomes_to_consider = []
        genome_quality = {}
        filtered_reps = 0
        lack_ncbi_taxonomy = 0
        for genome_id in list(genome_stats.keys()):
            if genome_id.startswith('U_') and genome_id not in trusted_user_genomes:
                continue
                
            stats = genome_stats[genome_id]
            comp = stats.checkm_completeness
            cont = stats.checkm_contamination
            
            keep = False
            if genome_id in exception_genomes:
                keep = True
            elif (comp >= min_rep_comp
                    and cont <= max_rep_cont
                    and (comp - 5*cont) >= min_quality
                    and stats.contig_count <= max_contigs
                    and stats.n50_scaffolds >= min_N50
                    and stats.ambiguous_bases <= max_ambiguous
                    and stats.total_gap_length <= max_gap_length):
                        keep = True
            elif not strict_filtering:
                # check if genome appears to consist of only an unspanned
                # chromosome and unspanned plasmids and thus can be 
                # subjected to a more lenient quality check
                if (stats.ncbi_assembly_level in ['Complete Genome', 'Chromosome']
                    and stats.ncbi_genome_representation == 'full'
                    and stats.scaffold_count == stats.ncbi_molecule_count
                    and stats.ncbi_unspanned_gaps == 0
                    and stats.ncbi_spanned_gaps <= 10
                    and stats.ambiguous_bases <= max_ambiguous
                    and stats.total_gap_length <= max_gap_length
                    and stats.ssu_count >= 1):
                    
                    # apply lenient quality check that should pick
                    # up the vast majority (if not all) even highly
                    # reduced genomes and those with substantial genome
                    # duplication leading to high estimated contamination
                    if comp >= 40 and cont <= 15:
                        keep = True
                        
            if keep:
                genomes_to_consider.append(genome_id)
                genome_quality[genome_id] = comp - 5*cont
                if stats.lpsn_strain:
                    gtdb_species = gtdb_taxonomy[genome_id][species_index]
                    if gtdb_species != 's__':
                        lpsn_type_strains[gtdb_species].add(genome_id)
            
            # check if a representative at NCBI is being filtered
            if genome_id in representative_genomes:
                if genome_id not in genomes_to_consider:
                    if comp < min_rep_comp:
                        note = 'failed completeness criteria'
                    elif cont > max_rep_cont:
                        note = 'failed contamination criteria'
                    elif (comp - 5*cont) < min_quality:
                        note = 'failed genome quality criteria'
                    elif stats.contig_count > max_contigs:
                        note = 'failed contig count criteria'
                    elif stats.n50_scaffolds < min_N50:
                        note = 'failed scaffold N50 criteria'
                    elif stats.ambiguous_bases > max_ambiguous:
                        note = 'failed ambiguous bases criteria'
                    elif stats.total_gap_length > max_gap_length:
                        note = 'failed total gap length criteria'
                        
                    fout.write('%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\t%s\n' % (
                                genome_id, 
                                comp, 
                                cont, 
                                stats.contig_count, 
                                stats.n50_scaffolds, 
                                stats.ambiguous_bases,
                                stats.total_gap_length,
                                note,
                                stats.ncbi_organism_name
                                ))

                    warning = ('Filtered RefSeq rep %s with comp=%.2f, cont=%.2f, contigs=%d, N50=%d'
                                    % (genome_id, comp, cont, stats.contig_count, stats.n50_scaffolds))
                    self.logger.warning(warning)
                    
                    filtered_reps += 1
                    
            if genome_id in refseq_genomes and not stats.ncbi_taxonomy:
                # this should never happen, but the NCBI taxonomy is occasionally
                # missing for a genome, most likely because the NCBI taxonomy
                # database is updated out of sync with RefSeq
                lack_ncbi_taxonomy += 1
                self.logger.warning('RefSeq representative %s has no assigned NCBI taxonomy.' % genome_id)

        fout.close()

        self.logger.info('Identified %d RefSeq representatives without an assigned NCBI taxonomy.' % lack_ncbi_taxonomy)
        self.logger.info('Filtered %d RefSeq representatives based on genome or assembly quality.' % filtered_reps)
        self.logger.info('Filtered RefSeq representatives written to %s' % filtered_reps_file)
        self.logger.info('Considering %d genomes after filtering for genome quality.' % (len(genomes_to_consider)))

        ncbi_type_strains = read_gtdb_ncbi_type_strain(metadata_file)
        self.logger.info('Identified %d genomes marked as type strains at NCBI.' % len(ncbi_type_strains))
        self.logger.info('Identified %d genomes marked as type strains at LPSN.' % sum([len(x) for x in list(lpsn_type_strains.values())]))

        # dereplicate named species
        genomes_to_retain = self._dereplicate_species(genomes_to_consider,
                                                        max_species,
                                                        species,
                                                        representative_genomes,
                                                        complete_genomes,
                                                        ncbi_type_strains,
                                                        lpsn_type_strains,
                                                        prev_gtdb_reps,
                                                        genome_quality)

        self.logger.info('Retained %d genomes.' % len(genomes_to_retain))

        # write results
        if not exceptions_file:
            exceptions_file = ''

        fout = open(output_file, 'w')
        fout.write('# Selection criteria:\n')
        fout.write('# Maximum species: %d\n' % max_species)
        fout.write('# Exception file: %s\n' % exceptions_file)
        fout.write('# Trusted user genomes file: %s\n' % trusted_user_file)
        fout.write('# Genome quality metadata file: %s\n' % str(metadata_file))
        fout.write('# Min. representative completeness: %s\n' % str(min_rep_comp))
        fout.write('# Max. representative contamination: %s\n' % str(max_rep_cont))
        fout.write('#\n')
        fout.write('# Genome Id\tGTDB Taxonomy\tNCBI Taxonomy\tNCBI Organism Name\tNCBI Type strain\tComplete\tRepresentative\n')
        for genome_id in genomes_to_retain:
            representative = 'yes' if genome_id in representative_genomes else 'no'
            complete = 'yes' if genome_id in complete_genomes else 'no'
            ts = 'yes' if genome_id in ncbi_type_strains else 'no'
            gtdb_taxa_str = ';'.join(gtdb_taxonomy.get(genome_id, Taxonomy.rank_prefixes))
            ncbi_taxa_str = ';'.join(ncbi_taxonomy.get(genome_id, Taxonomy.rank_prefixes))

            fout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (genome_id,
                                                            gtdb_taxa_str,
                                                            ncbi_taxa_str,
                                                            genome_stats[genome_id].ncbi_organism_name,
                                                            ts,
                                                            complete,
                                                            representative))
        fout.close()
    def run(self, max_species,
                prev_rep_file,
                trusted_genomes_file,
                metadata_file,
                min_rep_comp,
                max_rep_cont,
                min_quality,
                max_contigs,
                min_N50,
                max_ambiguous,
                max_gap_length,
                strict_filtering,
                output_file):
        """Dereplicate genomes to a specific number per named species.

        Parameters
        ----------
        max_species : int
            Maximum number of genomes of the same species to retain.
        prev_rep_file : str
            File indicating previous representatives to favour during selection.
        trusted_genomes_file : str
            File containing list of genomes to retain regardless of filtering criteria.
        metadata_file : str
            Metadata, including CheckM estimates, for all genomes.
        min_rep_comp : float [0, 100]
            Minimum completeness for a genome to be a representative.
        max_rep_cont : float [0, 100]
            Maximum contamination for a genome to be a representative.
        min_quality : float [0, 100]
            Minimum genome quality (comp-5*cont) for a genome to be a representative.
        max_contigs : int
            Maximum number of contigs for a genome to be a representative.
        min_N50 : int
            Minimum N50 of scaffolds for a genome to be a representative.
        max_ambiguous : int
            Maximum number of ambiguous bases for a genome to be a representative.
        max_gap_length : int
            Maximum total length of gaps (ambiguous bases between contigs) for a genome to be a representative.
        strict_filtering : boolean
            If True, apply filtering to all genomes; otherwise apply lenient 
            filtering to genomes where the chromosome and plasmids are reported 
            as complete.
        output_file : str
            Output file to contain list of dereplicated genomes.
        """
        
        trusted_accessions = set()
        if trusted_genomes_file:
            for line in open(trusted_genomes_file):
                line_split = line.split('\t')
                trusted_accessions.add(line_split[0].strip())

        accession_to_taxid, complete_genomes, representative_genomes = ncbi.read_refseq_metadata(metadata_file, keep_db_prefix=True)
        self.logger.info('Identified %d RefSeq genomes.' % len(accession_to_taxid))
        self.logger.info('Identified %d representative or reference genomes.' % len(representative_genomes))
        self.logger.info('Identified %d complete genomes.' % len(complete_genomes))
        self.logger.info('Identified %d genomes in exception list.' % len(trusted_accessions))
        
        if trusted_accessions.difference(representative_genomes):
            self.logger.error('There are genomes in the exception list which are not representatives.')
            sys.exit()
        
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
        ncbi_organism_names = read_gtdb_ncbi_organism_name(metadata_file)
        species = species_label(gtdb_taxonomy, ncbi_taxonomy, ncbi_organism_names)
        self.logger.info('Identified %d genomes with a GTDB or NCBI species name.' % len(species))

        # get previous representatives
        prev_gtdb_reps = set()
        for line in open(prev_rep_file):
            prev_gtdb_reps.add(line.strip().split('\t')[0])
            
        self.logger.info('Identified %d previous GTDB representatives.' % len(prev_gtdb_reps))
        
        # get genome quality
        genomes_to_consider = list(accession_to_taxid.keys())
        genome_stats = read_gtdb_metadata(metadata_file, ['checkm_completeness',
                                                            'checkm_contamination',
                                                            'contig_count',
                                                            'n50_scaffolds',
                                                            'ambiguous_bases',
                                                            'total_gap_length',
                                                            'scaffold_count',
                                                            'ssu_count',
                                                            'ncbi_molecule_count',
                                                            'ncbi_unspanned_gaps',
                                                            'ncbi_genome_representation',
                                                            'ncbi_spanned_gaps',
                                                            'ncbi_assembly_level',
                                                            'ncbi_taxonomy',
                                                            'ncbi_organism_name',
                                                            'lpsn_strain'])
        missing_quality = set(accession_to_taxid.keys()) - set(genome_stats.keys())
        if missing_quality:
            self.logger.error('There are %d genomes without metadata information.' % len(missing_quality))
            sys.exit(-1)
            
        filtered_reps_file = output_file + '.filtered_reps'
        fout = open(filtered_reps_file, 'w')
        fout.write('Genome ID\tCompleteness\tContamination\tContig Count\tN50\tAmbiguous Bases\tTotal Gap Length\tNote\n')

        lpsn_type_strains = defaultdict(set)
        new_genomes_to_consider = []
        genome_quality = {}
        filtered_reps = 0
        lack_ncbi_taxonomy = 0
        contig_filter_count = 0
        for genome_id in list(accession_to_taxid.keys()):
            stats = genome_stats[genome_id]
            comp = stats.checkm_completeness
            cont = stats.checkm_contamination
            
            if not stats.ncbi_taxonomy:
                lack_ncbi_taxonomy += 1
                fout.write('%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\n' % (genome_id, 
                                                                        comp, 
                                                                        cont, 
                                                                        stats.contig_count, 
                                                                        stats.n50_scaffolds, 
                                                                        stats.ambiguous_bases,
                                                                        stats.total_gap_length,
                                                                        'no NCBI taxonomy'))
                self.logger.warning('Skipping %s as it has no assigned NCBI taxonomy.' % genome_id)
                continue
            
            keep = False
            if genome_id in trusted_accessions:
                keep = True
            elif (comp >= min_rep_comp
                    and cont <= max_rep_cont
                    and (comp - 5*cont) >= min_quality
                    and stats.contig_count <= max_contigs
                    and stats.n50_scaffolds >= min_N50
                    and stats.ambiguous_bases <= max_ambiguous
                    and stats.total_gap_length <= max_gap_length):
                        keep = True
            elif not strict_filtering:
                # check if genome appears to consist of only an unspanned
                # chromosome and unspanned plasmids and thus can be 
                # subjected to a more lenient quality check
                if (stats.ncbi_assembly_level in ['Complete Genome', 'Chromosome']
                    and stats.ncbi_genome_representation == 'full'
                    and stats.scaffold_count == stats.ncbi_molecule_count
                    and stats.ncbi_unspanned_gaps == 0
                    and stats.ncbi_spanned_gaps <= 10
                    and stats.ambiguous_bases <= 1000
                    and stats.total_gap_length <= 100000
                    and stats.ssu_count >= 1):
                    
                    # apply lenient quality check 
                    if comp >= 50 and cont <= 15:
                        keep = True
                        
            if keep:
                new_genomes_to_consider.append(genome_id)
                genome_quality[genome_id] = comp - 5*cont
                if stats.lpsn_strain:
                    ncbi_species = stats.ncbi_taxonomy.split(';')[6].strip()
                    lpsn_type_strains[ncbi_species].add(genome_id)
            
            # check if a representative at NCBI is being filtered
            if genome_id in representative_genomes and genome_id not in new_genomes_to_consider:
                fout.write('%s\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%s\n' % (genome_id, 
                                                                        comp, 
                                                                        cont, 
                                                                        stats.contig_count, 
                                                                        stats.n50_scaffolds, 
                                                                        stats.ambiguous_bases,
                                                                        stats.total_gap_length,
                                                                        stats.ncbi_organism_name))
                                                                        
                if stats.contig_count > 300:
                    contig_filter_count += 1 

                self.logger.warning('Filtered RefSeq representative %s with comp=%.2f, cont=%.2f, contigs=%d, N50=%d' % (genome_id, 
                                                                                                                            comp, 
                                                                                                                            cont, 
                                                                                                                            stats.contig_count, 
                                                                                                                            stats.n50_scaffolds))
                filtered_reps += 1
                
        fout.close()
        
        self.logger.info('Filtered representative genomes with >300 contigs: %d' % contig_filter_count)

        genomes_to_consider = new_genomes_to_consider
        self.logger.info('Skipped %d genomes without an assigned NCBI taxonomy.' % lack_ncbi_taxonomy)
        self.logger.info('Filtered %d representative or reference genomes based on genome or assembly quality.' % filtered_reps)
        self.logger.info('Filtered representative or reference genomes written to %s' % filtered_reps_file)
        self.logger.info('Considering %d genomes after filtering for genome quality.' % (len(genomes_to_consider)))

        ncbi_type_strains = read_gtdb_ncbi_type_strain(metadata_file)
        self.logger.info('Identified %d genomes marked as type strains at NCBI.' % len(ncbi_type_strains))
        self.logger.info('Identified %d genomes marked as type strains at LPSN.' % sum([len(x) for x in list(lpsn_type_strains.values())]))

        genomes_to_retain = self._dereplicate(genomes_to_consider,
                                                max_species,
                                                species,
                                                representative_genomes,
                                                complete_genomes,
                                                ncbi_type_strains,
                                                lpsn_type_strains,
                                                prev_gtdb_reps,
                                                genome_quality)

        self.logger.info('Retained %d genomes.' % len(genomes_to_retain))

        if not trusted_genomes_file:
            trusted_genomes_file = ''

        fout = open(output_file, 'w')
        fout.write('# Selection criteria:\n')
        fout.write('# Maximum species: %d\n' % max_species)
        fout.write('# Trusted genomes file: %s\n' % trusted_genomes_file)
        fout.write('# Genome quality metadata file: %s\n' % str(metadata_file))
        fout.write('# Min. representative completeness: %s\n' % str(min_rep_comp))
        fout.write('# Max. representative contamination: %s\n' % str(max_rep_cont))
        fout.write('#\n')
        fout.write('# Genome Id\tGTDB Taxonomy\tNCBI Taxonomy\tType strain\tComplete\tRepresentative\n')
        for assembly_accession in genomes_to_retain:
            representative = 'yes' if assembly_accession in representative_genomes else 'no'
            complete = 'yes' if assembly_accession in complete_genomes else 'no'
            ts = 'yes' if assembly_accession in ncbi_type_strains else 'no'
            gtdb_taxa_str = ';'.join(gtdb_taxonomy.get(assembly_accession, Taxonomy.rank_prefixes))
            ncbi_taxa_str = ';'.join(ncbi_taxonomy.get(assembly_accession, Taxonomy.rank_prefixes))

            if assembly_accession.startswith('GCF_'):
                assembly_accession = 'RS_' + assembly_accession
            elif assembly_accession.startswith('GCA_'):
                assembly_accession = 'GB_' + assembly_accession

            fout.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (assembly_accession,
                                                         gtdb_taxa_str,
                                                         ncbi_taxa_str,
                                                         ts,
                                                         complete,
                                                         representative))
        fout.close()
    def run(self, qc_file, metadata_file, gtdb_user_genomes_file,
            genome_path_file, type_genome_cluster_file,
            type_genome_synonym_file, ncbi_refseq_assembly_file,
            ncbi_genbank_assembly_file, ani_af_nontype_vs_type):
        """Infer de novo species clusters and type genomes for remaining genomes."""

        # identify genomes failing quality criteria
        self.logger.info('Reading QC file.')
        passed_qc = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(passed_qc))

        # get NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes.' %
                         len(ncbi_taxonomy))
        self.logger.info('Read GTDB taxonomy for %d genomes.' %
                         len(gtdb_taxonomy))

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(
            ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # get path to genome FASTA files
        self.logger.info('Reading path to genome FASTA files.')
        genome_files = parse_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_files))
        for gid in set(genome_files):
            if gid not in passed_qc:
                genome_files.pop(gid)
        self.logger.info(
            'Considering %d genomes as potential representatives after removing unwanted User genomes.'
            % len(genome_files))
        assert (len(genome_files) == len(passed_qc))

        # determine type genomes and genomes clustered to type genomes
        type_species, species_type_gid, type_gids, type_clustered_gids = self._parse_type_clusters(
            type_genome_cluster_file)
        assert (len(type_species) == len(type_gids))
        self.logger.info('Identified %d type genomes.' % len(type_gids))
        self.logger.info('Identified %d clustered genomes.' %
                         len(type_clustered_gids))

        # calculate quality score for genomes
        self.logger.info('Parsing quality statistics for all genomes.')
        quality_metadata = read_quality_metadata(metadata_file)

        # calculate genome quality score
        self.logger.info('Calculating genome quality score.')
        genome_quality = quality_score(quality_metadata.keys(),
                                       quality_metadata)

        # determine genomes left to be clustered
        unclustered_gids = passed_qc - type_gids - type_clustered_gids
        #***unclustered_gids = set(list(unclustered_gids)[0:2000]) #***DEBUG
        self.logger.info('Identified %d unclustered genomes passing QC.' %
                         len(unclustered_gids))

        # establish closest type genome for each unclustered genome
        self.logger.info(
            'Determining ANI circumscription for %d unclustered genomes.' %
            len(unclustered_gids))
        nontype_radius = self._nontype_radius(unclustered_gids, type_gids,
                                              ani_af_nontype_vs_type)

        # calculate Mash ANI estimates between unclustered genomes
        self.logger.info(
            'Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(genome_files, unclustered_gids)

        # de novo cluster genomes in a greedy fashion based on genome quality
        clusters, ani_af = self._cluster_de_novo(genome_files, nontype_radius,
                                                 unclustered_gids, mash_anis,
                                                 quality_metadata)

        # get list of synonyms in order to restrict usage of species names
        synonyms = self._parse_synonyms(type_genome_synonym_file)
        self.logger.info('Identified %d synonyms.' % len(synonyms))

        # determine User genomes with NCBI accession number that may form species names
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file,
                                                       metadata_file)
        self.logger.info(
            'Identified %d GTDB User genomes with NCBI accessions.' %
            len(gtdb_user_to_genbank))

        # assign species names to de novo species clusters
        names_in_use = synonyms.union(type_species)
        self.logger.info('Identified %d species names already in use.' %
                         len(names_in_use))
        self.logger.info('Assigning species name to each species cluster.')
        cluster_sp_names = self._assign_species_names(clusters, names_in_use,
                                                      ncbi_taxonomy,
                                                      gtdb_taxonomy,
                                                      gtdb_user_to_genbank)

        # write out file with details about selected representative genomes
        self._write_rep_info(
            clusters, cluster_sp_names, quality_metadata, genome_quality,
            excluded_from_refseq_note, ani_af,
            os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv'))

        # report clustering
        write_clusters(
            clusters, cluster_sp_names,
            os.path.join(self.output_dir, 'gtdb_rep_genome_clusters.tsv'))

        # remove genomes that are not representatives of a species cluster and then write out representative ANI radius
        for gid in set(nontype_radius) - set(clusters):
            del nontype_radius[gid]

        all_species = cluster_sp_names
        all_species.update(species_type_gid)
        write_type_radius(
            nontype_radius, all_species,
            os.path.join(self.output_dir, 'gtdb_rep_genome_ani_radius.tsv'))

        # create single file specifying all GTDB clusters
        self._concat_cluster_files(
            type_genome_cluster_file,
            os.path.join(self.output_dir, 'gtdb_rep_genome_clusters.tsv'),
            os.path.join(self.output_dir, 'gtdb_clusters_final.tsv'))
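
# The quality_score() helper used above is imported from elsewhere in the
# codebase and its exact weighting is not shown in this example. As a rough,
# hypothetical sketch (not the real implementation), a score consistent with
# the 'completeness - 5*contamination' criterion applied throughout these
# examples could look like the following; the function name and the assumption
# that only the CheckM estimates contribute are illustrative.
def quality_score_sketch(gids, quality_metadata):
    """Sketch: score each genome as CheckM completeness - 5 * contamination."""
    score = {}
    for gid in gids:
        stats = quality_metadata[gid]
        score[gid] = stats.checkm_completeness - 5 * stats.checkm_contamination
    return score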
Example #10
0
    def run(self,
            rna_name,
            gtdb_metadata_file,
            rna_file,
            min_rna_length,
            min_scaffold_length,
            min_quality,
            max_contigs,
            min_N50,
            tax_filter,
            genome_list,
            output_dir,
            align_method='ssu_align'):
        """Infer rRNA gene tree spanning select GTDB genomes.

        Parameters
        ----------
        rna_name : str
            Name of rRNA gene.
        gtdb_metadata_file : str
            File specifying GTDB metadata for each genome.
        rna_file : str
            File with rRNA gene sequences in FASTA format.
        min_rna_length : int
            Minimum required length of rRNA gene sequences.
        min_scaffold_length : int
            Minimum required length of scaffold containing rRNA gene sequence.
        min_quality : float [0, 100]
            Minimum genome quality for a genome to be included in the tree.
        max_contigs : int
            Maximum number of contigs for a genome to be included.
        min_N50 : int
            Minimum scaffold N50 for a genome to be included.
        tax_filter : boolean
            Filter sequences based on incongruent taxonomy classification.
        genome_list : str
            Explicit list of genomes to use (ignores --ncbi_rep_only and --user_genomes).
        output_dir : str
            Directory to store results.
        """

        if rna_name not in ['ssu', 'lsu']:
            self.logger.error('Unrecognized rRNA gene type: %s' % rna_name)
            sys.exit(-1)

        genome_metadata = read_gtdb_metadata(gtdb_metadata_file, [
            'checkm_completeness', 'checkm_contamination', 'scaffold_count',
            'n50_scaffolds', 'organism_name', 'gtdb_representative'
        ])

        gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)

        user_genomes = set()
        uba_genomes = set()
        ncbi_genomes = set()
        rep_genomes = set()
        for genome_id in genome_metadata:
            org_name = str(genome_metadata[genome_id][4])
            if genome_id.startswith('U_'):
                if '(UBA' in org_name:
                    uba_genomes.add(genome_id)
                else:
                    user_genomes.add(genome_id)
            elif genome_id.startswith('RS_') or genome_id.startswith('GB_'):
                ncbi_genomes.add(genome_id)
            else:
                self.logger.warning('Unrecognized genome prefix: %s' %
                                    genome_id)

            rep = genome_metadata[genome_id][5] == 't'
            if rep:
                rep_genomes.add(genome_id)

        self.logger.info(
            'Initially considering %d genomes (%d NCBI, %d UBA, %d User).' %
            (len(genome_metadata), len(ncbi_genomes), len(uba_genomes),
             len(user_genomes)))
        self.logger.info('Identified %d representative genomes.' %
                         len(rep_genomes))

        # get genomes specified in genome list by user
        genomes_to_consider = set()
        if genome_list:
            for line in open(genome_list):
                gid = line.rstrip().split('\t')[0]
                if gid.startswith('RS_') or gid.startswith(
                        'GB_') or gid.startswith('U_'):
                    genomes_to_consider.add(gid)
            self.logger.info(
                'Restricting genomes to the %d in the genome list.' %
                len(genomes_to_consider))
        else:
            # filter genomes based on quality and database source
            self.logger.info('Filtering genomes based on specified criteria.')
            self.logger.info('Filtering genomes with quality < %d.' % min_quality)
            self.logger.info('Filtering genomes with > %d contigs.' %
                             max_contigs)
            self.logger.info('Filtering genomes with scaffold N50 < %d.' % min_N50)

            new_genomes_to_consider = []
            filtered_genomes = 0
            gt = 0
            gq = 0
            sc = 0
            n50 = 0
            for genome_id in genome_metadata:
                if genome_id not in rep_genomes:
                    gt += 1
                    filtered_genomes += 1
                    continue

                if genome_id not in ncbi_genomes and genome_id not in uba_genomes:
                    gt += 1
                    filtered_genomes += 1
                    continue

                comp, cont, scaffold_count, n50_contigs, _org_name, _rep = genome_metadata[
                    genome_id]
                q = float(comp) - 5 * float(cont)
                if q < min_quality or int(scaffold_count) > max_contigs or int(
                        n50_contigs) < min_N50:
                    if q < min_quality:
                        gq += 1

                    if int(scaffold_count) > max_contigs:
                        sc += 1

                    if int(n50_contigs) < min_N50:
                        n50 += 1

                    filtered_genomes += 1
                    continue

                new_genomes_to_consider.append(genome_id)

            genomes_to_consider = new_genomes_to_consider
            self.logger.info(
                'Filtered %d genomes (%d on genome type, %d on genome quality, %d on number of contigs, %d on N50).'
                % (filtered_genomes, gt, gq, sc, n50))
            self.logger.info('Considering %d genomes after filtering.' %
                             len(genomes_to_consider))

        # limit taxonomy to genomes being considered
        cur_gtdb_taxonomy = {}
        for gid in genomes_to_consider:
            cur_gtdb_taxonomy[gid] = gtdb_taxonomy[gid]

        # get rRNA gene sequences for each genome
        rna_output_file = self._get_rna_seqs(rna_name, rna_file,
                                             min_rna_length,
                                             min_scaffold_length,
                                             cur_gtdb_taxonomy,
                                             genomes_to_consider, output_dir)

        # identify erroneous rRNA gene sequences
        if tax_filter:
            self.logger.info(
                'Filtering sequences with incongruent taxonomy strings.')
            filtered_seqs = self._tax_filter(rna_output_file, cur_gtdb_taxonomy,
                                             output_dir)

            self.logger.info('Filtered %d sequences.' % len(filtered_seqs))
            if len(filtered_seqs) > 0:
                rna_filtered_output = os.path.join(
                    output_dir, 'gtdb_%s.tax_filter.fna' % rna_name)
                fout = open(rna_filtered_output, 'w')
                for seq_id, seq, annotation in seq_io.read_seq(
                        rna_output_file, keep_annotation=True):
                    if seq_id not in filtered_seqs:
                        fout.write('>' + seq_id + ' ' + annotation + '\n')
                        fout.write(seq + '\n')
                fout.close()

                rna_output_file = rna_filtered_output

        # align sequences with ssu-align or mothur
        if rna_name == 'ssu':
            if align_method == 'ssu_align':
                self.logger.info('Aligning sequences with ssu-align.')
                align_dir = os.path.join(output_dir, '%s_align' % rna_name)
                os.system('ssu-align --dna %s %s' %
                          (rna_output_file, align_dir))
                os.system('ssu-mask --afa %s' % align_dir)
            elif align_method == 'mothur':
                self.logger.info('Aligning sequences with mothur.')
                align_dir = os.path.join(output_dir, 'mothur')
                if not os.path.exists(align_dir):
                    os.makedirs(align_dir)

                mothur_cmd = 'mothur "#set.dir(output=%s, blastdir=/srv/sw/Mothur/1.39.5)' % align_dir
                mothur_cmd += '; align.seqs(candidate=%s, template=/srv/db/mothur/silva_128/silva.seed_v128.align, search=blast, flip=t, processors=%d)' % (
                    rna_output_file, self.cpus)
                input_prefix = remove_extension(rna_output_file)
                align_file = os.path.join(align_dir, input_prefix + '.align')
                mothur_cmd += '; filter.seqs(fasta=%s, hard=/srv/db/mothur/silva_128/Lane1349.silva.filter, processors=%d);"' % (
                    align_file, self.cpus)
                os.system(mothur_cmd)
                input_msa = os.path.join(align_dir,
                                         input_prefix + '.filter.fasta')
        elif rna_name == 'lsu':
            self.logger.info('Aligning sequences with ssu-align.')
            align_dir = os.path.join(output_dir, '%s_align' % rna_name)
            if not os.path.exists(align_dir):
                os.makedirs(align_dir)

            os.system('esl-sfetch --index %s' % rna_output_file)

            # search for sequences using domain-specific LSU HMMs
            for domain in ['archaea', 'bacteria', 'eukaryote']:
                self.logger.info(
                    'Matching LSU rRNA genes to %s-specific HMM.' % domain)
                table_out = os.path.join(
                    align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
                cm_dir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)), 'cm_files')
                cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                log_file = os.path.join(
                    align_dir, 'cmsearch.%s.%s.out' % (rna_name, domain))
                os.system(
                    'cmsearch --hmmonly --cpu %d --noali --tblout %s %s %s > %s'
                    %
                    (self.cpus, table_out, cm_file, rna_output_file, log_file))

            # identify top hits for each domain
            self.logger.info(
                'Identifying best domain-specific HMM for each LSU rRNA gene.')
            top_hits = {}
            for domain in ['archaea', 'bacteria', 'eukaryote']:
                table_out = os.path.join(
                    align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
                for line in open(table_out):
                    if line[0] == '#':
                        continue

                    line_split = line.split()
                    seq_id = line_split[0]
                    start_seq = int(line_split[7])
                    end_seq = int(line_split[8])
                    bitscore = float(line_split[14])

                    prev_bitscore = top_hits.get(seq_id, [None, 0, 0, 0, 0])[4]
                    if bitscore > prev_bitscore:
                        top_hits[seq_id] = [
                            domain, seq_id, start_seq, end_seq, bitscore
                        ]

            # create an MSA for bacteria and archaea
            for domain in ['archaea', 'bacteria']:
                # create file of top hits
                top_hits_out = os.path.join(
                    align_dir, 'top_hits.%s.%s.tsv' % (rna_name, domain))
                fout = open(top_hits_out, 'w')
                num_hits = 0
                for top_domain, seq_id, start_seq, end_seq, bitscore in top_hits.values(
                ):
                    if top_domain == domain:
                        fout.write('%s\t%d\t%d\t%f\n' %
                                   (seq_id, start_seq, end_seq, bitscore))
                        num_hits += 1
                fout.close()

                # align top hits
                self.logger.info(
                    'Creating MSA for %s LSU rRNA genes (%d sequences).' %
                    (domain, num_hits))

                if num_hits > 0:
                    seq_file = os.path.join(
                        align_dir, 'cmsearch.%s.%s.fna' % (rna_name, domain))
                    os.system(
                        "grep -v '^#' %s | awk '{print $1, $2, $3, $1}' | esl-sfetch -Cf %s - > %s"
                        % (top_hits_out, rna_output_file, seq_file))

                    align_file = os.path.join(
                        align_dir, 'cmalign.%s.%s.stk' % (rna_name, domain))
                    # use the CM for the current domain; cm_file otherwise
                    # retains its value from the last search iteration (eukaryote)
                    cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                    os.system('cmalign --dnaout --outformat Pfam %s %s > %s' %
                              (cm_file, seq_file, align_file))

                    masked_file = os.path.join(
                        align_dir,
                        'cmalign.%s.%s.mask.afa' % (rna_name, domain))
                    os.system('esl-alimask -p --outformat AFA %s > %s' %
                              (align_file, masked_file))

        # trim sequences and infer tree
        if align_method == 'ssu_align':
            for domain in ['archaea', 'bacteria']:
                if rna_name == 'ssu':
                    input_msa = os.path.join(
                        align_dir, 'ssu_align.' + domain + '.mask.afa')
                elif rna_name == 'lsu':
                    input_msa = os.path.join(
                        align_dir,
                        'cmalign.%s.%s.mask.afa' % (rna_name, domain))

                if not os.path.exists(input_msa):
                    continue

                trimmed_msa = os.path.join(output_dir, domain + '.trimmed.fna')
                self._trim_seqs(input_msa, trimmed_msa)

                # infer tree
                self.logger.info('Inferring tree for %s genes.' % domain)
                output_tree = os.path.join(output_dir, domain + '.tree')
                os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                          (trimmed_msa, output_tree))
        elif align_method == 'mothur':
            trimmed_msa = os.path.join(output_dir,
                                       input_prefix + '.trimmed.fna')
            self._trim_seqs(input_msa, trimmed_msa)

            # infer tree
            self.logger.info('Inferring tree for %s genes.' % input_prefix)
            output_tree = os.path.join(output_dir, input_prefix + '.tree')
            os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                      (trimmed_msa, output_tree))
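
    # The _trim_seqs() method called above is not shown in this example. The
    # following is a minimal, hypothetical sketch of one way such a trimming
    # step could work (dropping alignment columns that are mostly gaps); the
    # method name, the 0.5 gap fraction, and treating '.' as a gap character
    # are assumptions, not the actual implementation.
    @staticmethod
    def _trim_seqs_sketch(input_msa, trimmed_msa, max_gap_frac=0.5):
        """Sketch: drop MSA columns where more than max_gap_frac of sequences are gaps."""
        # read aligned sequences from a FASTA file
        seqs = {}
        seq_id = None
        for line in open(input_msa):
            line = line.rstrip()
            if line.startswith('>'):
                seq_id = line[1:].split()[0]
                seqs[seq_id] = []
            elif seq_id:
                seqs[seq_id].append(line)
        seqs = {sid: ''.join(parts) for sid, parts in seqs.items()}

        if not seqs:
            return

        # identify columns with an acceptable fraction of gap characters
        num_seqs = len(seqs)
        aln_len = len(next(iter(seqs.values())))
        keep_cols = [i for i in range(aln_len)
                     if sum(1 for s in seqs.values() if s[i] in '-.') <= max_gap_frac * num_seqs]

        # write trimmed alignment
        fout = open(trimmed_msa, 'w')
        for sid, seq in seqs.items():
            fout.write('>%s\n' % sid)
            fout.write(''.join(seq[i] for i in keep_cols) + '\n')
        fout.close()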
    def run(self, metadata_file, trusted_comp, trusted_cont, max_contigs,
            min_N50, refseq_rep, output_file):
        """Determine trusted genomes based on genome statistics.

        Parameters
        ----------
        metadata_file : str
            Metadata, including CheckM estimates, for all genomes.
        trusted_comp : float [0, 100]
            Minimum completeness to trust genome for marker set inference.
        trusted_cont : float [0, 100]
            Maximum contamination to trust genome for marker set inference.
        max_contigs : int
            Maximum number of contigs within trusted genomes.
        min_N50 : int
            Minimum N50 of trusted genomes.
        refseq_rep : boolean
            If true, consider only RefSeq representative and reference genomes.
        output_file : str
            Output file to contain list of trusted genomes.
        """

        representative_genomes = None
        if refseq_rep:
            _accession_to_taxid, complete_genomes, representative_genomes = ncbi.read_refseq_metadata(
                metadata_file, keep_db_prefix=True)

        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)

        trusted_genomes_stats = self._trusted_genomes(metadata_file,
                                                      trusted_comp,
                                                      trusted_cont,
                                                      max_contigs, min_N50)
        if representative_genomes:
            self.logger.info('Limiting genomes to RefSeq representative and reference genomes.')
            for genome_id in list(trusted_genomes_stats.keys()):
                if genome_id not in representative_genomes:
                    del trusted_genomes_stats[genome_id]

        self.logger.info('Identified %d trusted genomes.' %
                         len(trusted_genomes_stats))

        fout = open(output_file, 'w')
        fout.write('# Selection criteria:\n')
        fout.write('# Trusted completeness: %f\n' % trusted_comp)
        fout.write('# Trusted contamination: %f\n' % trusted_cont)
        fout.write('# Maximum contigs: %d\n' % max_contigs)
        fout.write('# Minimum N50: %d\n' % min_N50)
        fout.write('#\n')
        fout.write(
            '# Genome Id\tCompleteness,Contamination,Contig count,N50\tGTDB Taxonomy\tNCBI Taxonomy\n'
        )

        for assembly_accession, stats in trusted_genomes_stats.items():
            fout.write(
                '%s\t%s\t%s\t%s\n' %
                (assembly_accession, ','.join(map(str, stats)), ';'.join(
                    gtdb_taxonomy.get(assembly_accession, ['none'])), ';'.join(
                        ncbi_taxonomy.get(assembly_accession, ['none']))))

        fout.close()
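
# The trusted genome list written above mirrors the format expected by the
# trusted_genomes_file / exception-list parsing earlier in these examples
# (accession in the first tab-separated column, '#' lines as comments). A
# minimal sketch of reading it back might look like this; the function name
# is illustrative only.
def read_trusted_genomes_sketch(trusted_genomes_file):
    """Sketch: return the set of accessions listed in a trusted genomes file."""
    trusted = set()
    for line in open(trusted_genomes_file):
        if line.startswith('#') or not line.strip():
            continue
        trusted.add(line.split('\t')[0].strip())
    return trusted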
Example #12
0
    def run(self, metadata_file, gtdb_user_genomes_file, gtdb_user_reps,
            ncbi_refseq_assembly_file, ncbi_genbank_assembly_file,
            gtdb_domain_report, min_comp, max_cont, min_quality, sh_exception,
            min_perc_markers, max_contigs, min_N50, max_ambiguous, output_dir):
        """Quality check all potential GTDB genomes."""

        # get GTDB and NCBI taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy = read_gtdb_ncbi_taxonomy(metadata_file)
        ncbi_species = binomial_species(ncbi_taxonomy)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info('Read NCBI taxonomy for %d genomes.' %
                         len(ncbi_taxonomy))
        self.logger.info('Read GTDB taxonomy for %d genomes.' %
                         len(gtdb_taxonomy))

        # determine User genomes to retain for consideration
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file,
                                                       metadata_file)
        self.logger.info(
            'Identified %d GTDB User genomes with GenBank accessions to retain for potential inclusion in GTDB.'
            % len(gtdb_user_to_genbank))

        user_genomes = 0
        for line in open(gtdb_user_reps):
            line_split = line.strip().split('\t')
            gid, taxonomy = line_split
            if gid not in gtdb_user_to_genbank:
                if 'd__Bacteria' in taxonomy:
                    self.logger.warning(
                        'Bacterial genome %s has no NCBI accession and is being skipped.'
                        % gid)
                else:
                    gtdb_user_to_genbank[gid] = gid
                    user_genomes += 1
        self.logger.info(
            'Identified %d archaeal GTDB User genomes without GenBank accessions to retain for potential inclusion in GTDB.'
            % user_genomes)

        # calculate quality score for genomes
        self.logger.info('Parsing QC statistics for each genome.')
        quality_metadata = read_gtdb_metadata(metadata_file, [
            'checkm_completeness', 'checkm_contamination',
            'checkm_strain_heterogeneity_100', 'contig_count', 'n50_contigs',
            'ambiguous_bases', 'genome_size'
        ])

        marker_perc = parse_marker_percentages(gtdb_domain_report)

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(
            ncbi_refseq_assembly_file, ncbi_genbank_assembly_file)

        # get type material designations for each genome
        self.logger.info(
            'Reading type material designations for genomes from GTDB metadata file.'
        )
        type_metadata = read_gtdb_metadata(metadata_file, [
            'ncbi_type_material_designation', 'gtdb_type_designation',
            'gtdb_type_designation_sources'
        ])

        ncbi_tsp = ncbi_type_strain_of_species(type_metadata)
        gtdb_tsp = gtdb_type_strain_of_species(type_metadata)

        # QC all genomes
        self.logger.info('Validating genomes.')
        fout_retained = open(os.path.join(output_dir, 'qc_passed.tsv'), 'w')
        fout_failed = open(os.path.join(output_dir, 'qc_failed.tsv'), 'w')

        header = 'Accession\tNCBI species'
        header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'

        fout_retained.write(header + '\n')
        fout_failed.write(header)
        fout_failed.write(
            '\tFailed completeness\tFailed contamination\tFailed quality')
        fout_failed.write(
            '\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n'
        )

        num_retained = 0
        num_filtered = 0
        for gid in quality_metadata:
            if gid.startswith('U_') and gid not in gtdb_user_to_genbank:
                # skip user genomes not marked for retention
                continue

            failed_tests = defaultdict(int)
            passed_qc = pass_qc(quality_metadata[gid], marker_perc[gid],
                                min_comp, max_cont, min_quality, sh_exception,
                                min_perc_markers, max_contigs, min_N50,
                                max_ambiguous, failed_tests)

            if passed_qc:
                num_retained += 1
                fout_retained.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
                fout_retained.write(
                    '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\n' %
                    (quality_metadata[gid].checkm_completeness,
                     quality_metadata[gid].checkm_contamination,
                     quality_metadata[gid].checkm_completeness -
                     5 * quality_metadata[gid].checkm_contamination,
                     ('%.2f' %
                      quality_metadata[gid].checkm_strain_heterogeneity_100) if
                     quality_metadata[gid].checkm_strain_heterogeneity_100 else
                     '-', marker_perc[gid], quality_metadata[gid].contig_count,
                     quality_metadata[gid].n50_contigs,
                     quality_metadata[gid].ambiguous_bases))
            else:
                num_filtered += 1
                fout_failed.write('%s\t%s' % (gid, ncbi_taxonomy[gid][6]))
                fout_failed.write(
                    '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' %
                    (quality_metadata[gid].checkm_completeness,
                     quality_metadata[gid].checkm_contamination,
                     quality_metadata[gid].checkm_completeness -
                     5 * quality_metadata[gid].checkm_contamination,
                     ('%.2f' %
                      quality_metadata[gid].checkm_strain_heterogeneity_100) if
                     quality_metadata[gid].checkm_strain_heterogeneity_100 else
                     '-', marker_perc[gid], quality_metadata[gid].contig_count,
                     quality_metadata[gid].n50_contigs,
                     quality_metadata[gid].ambiguous_bases))
                fout_failed.write(
                    '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' %
                    (failed_tests['comp'], failed_tests['cont'],
                     failed_tests['qual'], failed_tests['marker_perc'],
                     failed_tests['contig_count'], failed_tests['N50'],
                     failed_tests['ambig']))
        fout_retained.close()
        fout_failed.close()

        self.logger.info('Retained %d genomes and filtered %d genomes.' %
                         (num_retained, num_filtered))

        # QC genomes in each named species
        self.logger.info(
            'Performing QC of type genome for each of the %d NCBI species.' %
            len(ncbi_species))

        fout_type_fail = open(
            os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w')
        fout_type_fail.write(
            'Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tGTDB type sources\tNCBI type material designation\tGenome size (Mbp)'
        )
        fout_type_fail.write(
            '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        )
        fout_type_fail.write(
            '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n'
        )

        fout_fail_sp = open(os.path.join(output_dir, 'species_fail_qc.tsv'),
                            'w')
        fout_fail_sp.write(
            'Species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (Mbp)'
        )
        fout_fail_sp.write(
            '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        )
        fout_fail_sp.write(
            '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
        fout_fail_sp.write(
            '\tFailed completeness\tFailed contamination\tFailed quality')
        fout_fail_sp.write(
            '\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases'
        )
        fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

        fout_sp_lost = open(os.path.join(output_dir, 'species_lost.tsv'), 'w')
        fout_sp_lost.write('Species\tNo. genomes\tNo. type genomes')
        fout_sp_lost.write(
            '\tFail completeness\tFail contamination\tFail quality\tFailed percent markers'
        )
        fout_sp_lost.write(
            '\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

        lost_type = 0
        lost_sp = 0
        filtered_genomes = 0
        failed_tests_cumulative = defaultdict(int)
        for sp, gids in ncbi_species.items():
            type_pass = set()
            type_fail = set()
            other_pass = set()
            other_fail = set()

            failed_tests_gids = {}
            for gid in gids:
                failed_tests = defaultdict(int)
                passed_qc = pass_qc(quality_metadata[gid], marker_perc[gid],
                                    min_comp, max_cont, min_quality,
                                    sh_exception, min_perc_markers,
                                    max_contigs, min_N50, max_ambiguous,
                                    failed_tests)

                failed_tests_gids[gid] = failed_tests

                if gid in gtdb_tsp or gid in ncbi_tsp:
                    if passed_qc:
                        type_pass.add(gid)
                    else:
                        type_fail.add(gid)
                        filtered_genomes += 1
                else:
                    if passed_qc:
                        other_pass.add(gid)
                    else:
                        other_fail.add(gid)
                        filtered_genomes += 1

            # tally genomes failing each QC criterion across this species
            for gid_tests in failed_tests_gids.values():
                for test, count in gid_tests.items():
                    failed_tests_cumulative[test] += count

            if len(type_pass) >= 1:
                # great: one or more type genomes pass QC and will be selected as the type genome
                continue

            if len(type_fail):
                # all potential type genomes for species failed QC so report these for manual inspection
                lost_type += 1
                for gid in type_fail:
                    fout_type_fail.write(
                        '%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n'
                        %
                        (sp, gid, '; '.join(gtdb_taxonomy[gid]), '; '.join(
                            ncbi_taxonomy[gid]),
                         type_metadata[gid].gtdb_type_designation_sources,
                         type_metadata[gid].ncbi_type_material_designation,
                         float(quality_metadata[gid].genome_size) / 1e6,
                         quality_metadata[gid].checkm_completeness,
                         quality_metadata[gid].checkm_contamination,
                         quality_metadata[gid].checkm_completeness -
                         5 * quality_metadata[gid].checkm_contamination,
                         quality_metadata[gid].checkm_strain_heterogeneity_100,
                         marker_perc[gid], quality_metadata[gid].contig_count,
                         quality_metadata[gid].n50_contigs,
                         quality_metadata[gid].ambiguous_bases,
                         excluded_from_refseq_note[gid], len(other_pass) == 0))

            if len(other_pass) == 0:
                # no genomes for species pass QC so report loss of species
                lost_sp += 1
                fout_sp_lost.write('%s\t%d\t%d' %
                                   (sp, len(gids), len(type_fail)))
                fout_sp_lost.write(
                    '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' %
                    (sum([failed_tests_gids[gid]['comp'] for gid in gids]),
                     sum([failed_tests_gids[gid]['cont'] for gid in gids]),
                     sum([failed_tests_gids[gid]['qual'] for gid in gids]),
                     sum([
                         failed_tests_gids[gid]['marker_perc'] for gid in gids
                     ]),
                     sum([
                         failed_tests_gids[gid]['contig_count'] for gid in gids
                     ]), sum([failed_tests_gids[gid]['N50'] for gid in gids]),
                     sum([failed_tests_gids[gid]['ambig'] for gid in gids])))

                for gid in type_fail.union(other_fail):
                    fout_fail_sp.write(
                        '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d'
                        %
                        (sp, gid, '; '.join(gtdb_taxonomy[gid]), '; '.join(
                            ncbi_taxonomy[gid]), gid in type_fail,
                         float(quality_metadata[gid].genome_size) / 1e6,
                         quality_metadata[gid].checkm_completeness,
                         quality_metadata[gid].checkm_contamination,
                         quality_metadata[gid].checkm_completeness -
                         5 * quality_metadata[gid].checkm_contamination,
                         quality_metadata[gid].checkm_strain_heterogeneity_100,
                         marker_perc[gid], quality_metadata[gid].contig_count,
                         quality_metadata[gid].n50_contigs,
                         quality_metadata[gid].ambiguous_bases))
                    fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' %
                                       (failed_tests_gids[gid]['comp'],
                                        failed_tests_gids[gid]['cont'],
                                        failed_tests_gids[gid]['qual'],
                                        failed_tests_gids[gid]['marker_perc'],
                                        failed_tests_gids[gid]['contig_count'],
                                        failed_tests_gids[gid]['N50'],
                                        failed_tests_gids[gid]['ambig']))
                    fout_fail_sp.write('\t%s\n' %
                                       excluded_from_refseq_note[gid])

        fout_type_fail.close()
        fout_fail_sp.close()
        fout_sp_lost.close()

        self.logger.info('Genomes filtered for each criterion:')
        for test in sorted(failed_tests_cumulative):
            self.logger.info('%s: %d' % (test, failed_tests_cumulative[test]))

        self.logger.info('Filtered %d genomes assigned to NCBI species.' %
                         filtered_genomes)
        self.logger.info(
            'Identified %d species with type genomes failing QC and %d total species failing QC.'
            % (lost_type, lost_sp))
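
# pass_qc() is imported from elsewhere in the codebase; only its call signature
# is visible above. The following is a hypothetical sketch of a QC test that is
# consistent with that call and with the failed_tests keys tallied above
# ('comp', 'cont', 'qual', 'marker_perc', 'contig_count', 'N50', 'ambig').
# In particular, the handling of sh_exception (tolerating contamination above
# max_cont when strain heterogeneity at 100% is at least sh_exception) is an
# assumption, not the real implementation.
def pass_qc_sketch(stats, marker_perc, min_comp, max_cont, min_quality,
                   sh_exception, min_perc_markers, max_contigs, min_N50,
                   max_ambiguous, failed_tests):
    """Sketch: return True if a genome passes all QC criteria, recording failed tests."""
    failed = False

    comp = stats.checkm_completeness
    cont = stats.checkm_contamination
    sh = stats.checkm_strain_heterogeneity_100

    if comp < min_comp:
        failed_tests['comp'] += 1
        failed = True

    if cont > max_cont and not (sh is not None and sh >= sh_exception):
        failed_tests['cont'] += 1
        failed = True

    if comp - 5 * cont < min_quality:
        failed_tests['qual'] += 1
        failed = True

    if marker_perc < min_perc_markers:
        failed_tests['marker_perc'] += 1
        failed = True

    if stats.contig_count > max_contigs:
        failed_tests['contig_count'] += 1
        failed = True

    if stats.n50_contigs < min_N50:
        failed_tests['N50'] += 1
        failed = True

    if stats.ambiguous_bases > max_ambiguous:
        failed_tests['ambig'] += 1
        failed = True

    return not failed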