def load_from_metadata_file(self,
                                metadata_file,
                                species_exception_file=None,
                                genus_exception_file=None,
                                gtdb_type_strains_ledger=None,
                                create_sp_clusters=True,
                                uba_genome_file=None,
                                qc_passed_file=None,
                                ncbi_genbank_assembly_file=None,
                                untrustworthy_type_ledger=None):
        """Create genome set from file(s).

        Reads the tab-separated GTDB metadata file and stores one Genome
        object per retained row in self.genomes, keyed by genome ID.

        Parameters
        ----------
        metadata_file
            Tab-separated metadata table with a header line; read as UTF-8.
        species_exception_file, genus_exception_file
            Passed unchanged to self._apply_ncbi_taxonomy_ledgers() once all
            genomes have been loaded.
        gtdb_type_strains_ledger
            Optional file with a header line; the genome in the first column
            of each row is forced to 'type strain of species' with source
            'GTDB curator'.
        create_sp_clusters
            If True, every retained genome is added to self.sp_clusters
            under its GTDB representative.
        uba_genome_file
            Optional file without a header line; the first column lists the
            UBA identifiers to retain.
        qc_passed_file
            Optional file with a header line; the first column lists the
            genomes that passed QC. When at least one genome is listed,
            genomes not in this list are skipped.
        ncbi_genbank_assembly_file
            Optional file handed to exclude_from_refseq() to obtain the
            per-genome 'excluded from RefSeq' note.
        untrustworthy_type_ledger
            Optional file handed to self.parse_untrustworthy_type_ledger().

        Raises
        ------
        ValueError
            If a required column is missing from the metadata header.
        """

        # genomes passing QC: first column, header line skipped
        pass_qc_gids = set()
        if qc_passed_file:
            with open(qc_passed_file) as f:
                f.readline()
                pass_qc_gids = {line.strip().split('\t')[0].strip()
                                for line in f}
            self.logger.info(f' - identified {len(pass_qc_gids):,} genomes passing QC.')

        # UBA genomes to retain: first column, file has no header line
        valid_uba_ids = set()
        if uba_genome_file:
            with open(uba_genome_file) as f:
                valid_uba_ids = {line.strip().split('\t')[0].strip()
                                 for line in f}
            self.logger.info(f' - identified {len(valid_uba_ids):,} UBA genomes to retain.')

        # manually curated type strains: first column, header line skipped
        gtdb_type_strains = set()
        if gtdb_type_strains_ledger:
            with open(gtdb_type_strains_ledger) as f:
                f.readline()
                for line in f:
                    tokens = line.strip().split('\t')
                    gtdb_type_strains.add(canonical_gid(tokens[0].strip()))
            self.logger.info(f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.')

        excluded_from_refseq_note = {}
        if ncbi_genbank_assembly_file:
            excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

        untrustworthy_as_type = set()
        if untrustworthy_type_ledger:
            untrustworthy_as_type = self.parse_untrustworthy_type_ledger(untrustworthy_type_ledger)
            self.logger.info(f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type.')

        with open(metadata_file, encoding='utf-8') as f:
            headers = f.readline().strip().split('\t')

            genome_index = headers.index('accession')

            gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
            ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
            ncbi_taxonomy_unfiltered_index = headers.index('ncbi_taxonomy_unfiltered')

            gtdb_type_index = headers.index('gtdb_type_designation')
            gtdb_type_sources_index = headers.index('gtdb_type_designation_sources')
            gtdb_type_species_of_genus_index = headers.index('gtdb_type_species_of_genus')
            ncbi_strain_identifiers_index = headers.index('ncbi_strain_identifiers')
            ncbi_type_index = headers.index('ncbi_type_material_designation')
            ncbi_asm_level_index = headers.index('ncbi_assembly_level')
            ncbi_genome_representation_index = headers.index('ncbi_genome_representation')
            ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
            ncbi_genome_cat_index = headers.index('ncbi_genome_category')

            comp_index = headers.index('checkm_completeness')
            cont_index = headers.index('checkm_contamination')
            # strain heterogeneity is an optional column
            sh_100_index = None
            if 'checkm_strain_heterogeneity_100' in headers:
                sh_100_index = headers.index('checkm_strain_heterogeneity_100')
            gs_index = headers.index('genome_size')
            contig_count_index = headers.index('contig_count')
            n50_index = headers.index('n50_contigs')
            scaffold_count_index = headers.index('scaffold_count')
            ambiguous_bases_index = headers.index('ambiguous_bases')
            total_gap_len_index = headers.index('total_gap_length')
            ssu_count_index = headers.index('ssu_count')
            ssu_length_index = headers.index('ssu_length')
            ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
            ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
            ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

            gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
            gtdb_rep_index = headers.index('gtdb_representative')

            # evaluated once here rather than once per row
            has_priority_years = 'lpsn_priority_year' in headers
            if has_priority_years:
                # this information will be missing from the previous
                # GTDB metadata file as we strip this out due to
                # concerns over republishing this information
                lpsn_priority_index = headers.index('lpsn_priority_year')
                dsmz_priority_index = headers.index('dsmz_priority_year')
                straininfo_priority_index = headers.index('straininfo_priority_year')

            # resolved on first use so that a missing 'organism_name'
            # column is only an error when a user genome is encountered
            org_name_index = None

            for line in f:
                line_split = line.strip().split('\t')

                ncbi_accn = line_split[genome_index]
                gid = canonical_gid(ncbi_accn)

                if gid.startswith('U_'):
                    # check if genome has a UBA identifier
                    if org_name_index is None:
                        org_name_index = headers.index('organism_name')
                    org_name = line_split[org_name_index]
                    if '(UBA' not in org_name:
                        continue # skip non-UBA user genomes

                    # identifier is the text after the first '(' with the
                    # final character of the organism name dropped
                    uba_id = org_name[org_name.find('(')+1:-1]
                    if uba_id not in valid_uba_ids:
                        continue # retain only valid UBA genomes

                    self.user_uba_id_map[gid] = uba_id
                    self.uba_user_id_map[uba_id] = gid
                    gid = uba_id

                # QC filter is only applied when the QC file listed genomes
                if pass_qc_gids and gid not in pass_qc_gids:
                    continue

                gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])

                ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
                ncbi_taxonomy_unfiltered = Taxa(line_split[ncbi_taxonomy_unfiltered_index])

                gtdb_type = line_split[gtdb_type_index]
                gtdb_type_sources = line_split[gtdb_type_sources_index]
                if gid in gtdb_type_strains:
                    # curated ledger overrides the metadata file
                    gtdb_type = 'type strain of species'
                    gtdb_type_sources = 'GTDB curator'
                gtdb_type_species_of_genus = line_split[gtdb_type_species_of_genus_index] == 't'

                ncbi_type = line_split[ncbi_type_index]
                ncbi_strain_identifiers = line_split[ncbi_strain_identifiers_index]
                ncbi_asm_level = line_split[ncbi_asm_level_index]
                ncbi_genome_representation = line_split[ncbi_genome_representation_index]
                ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
                ncbi_genome_cat = line_split[ncbi_genome_cat_index]

                comp = float(line_split[comp_index])
                cont = float(line_split[cont_index])
                sh_100 = 0
                # explicit None test: a column index of 0 is valid
                if sh_100_index is not None:
                    sh_100 = self._convert_float(line_split[sh_100_index])
                gs = int(line_split[gs_index])
                contig_count = int(line_split[contig_count_index])
                n50 = int(line_split[n50_index])
                scaffold_count = int(line_split[scaffold_count_index])
                ambiguous_bases = int(line_split[ambiguous_bases_index])
                total_gap_len = int(line_split[total_gap_len_index])
                ssu_count = int(line_split[ssu_count_index])
                ssu_length = self._convert_int(line_split[ssu_length_index])
                ncbi_molecule_count = self._convert_int(line_split[ncbi_molecule_count_index])
                ncbi_unspanned_gaps = self._convert_int(line_split[ncbi_unspanned_gaps_index])
                ncbi_spanned_gaps = self._convert_int(line_split[ncbi_spanned_gaps_index])

                gtdb_is_rep = line_split[gtdb_rep_index] == 't'
                gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
                if create_sp_clusters:
                    self.sp_clusters.update_sp_cluster(gtdb_rid, gid, gtdb_taxonomy.species)

                if has_priority_years:
                    lpsn_priority_year = self._convert_int(line_split[lpsn_priority_index], Genome.NO_PRIORITY_YEAR)
                    dsmz_priority_year = self._convert_int(line_split[dsmz_priority_index], Genome.NO_PRIORITY_YEAR)
                    straininfo_priority_year = self._convert_int(line_split[straininfo_priority_index], Genome.NO_PRIORITY_YEAR)
                else:
                    lpsn_priority_year = Genome.NO_PRIORITY_YEAR
                    dsmz_priority_year = Genome.NO_PRIORITY_YEAR
                    straininfo_priority_year = Genome.NO_PRIORITY_YEAR

                self.genomes[gid] = Genome(gid,
                                            ncbi_accn,
                                            gtdb_rid,
                                            gtdb_is_rep,
                                            gtdb_taxonomy,
                                            ncbi_taxonomy,
                                            ncbi_taxonomy_unfiltered,
                                            gtdb_type,
                                            gtdb_type_sources,
                                            gtdb_type_species_of_genus,
                                            gid in untrustworthy_as_type,
                                            ncbi_type,
                                            ncbi_strain_identifiers,
                                            ncbi_asm_level,
                                            ncbi_genome_representation,
                                            ncbi_refseq_cat,
                                            ncbi_genome_cat,
                                            excluded_from_refseq_note.get(gid, ''),
                                            comp,
                                            cont,
                                            sh_100,
                                            gs,
                                            contig_count,
                                            n50,
                                            scaffold_count,
                                            ambiguous_bases,
                                            total_gap_len,
                                            ssu_count,
                                            ssu_length,
                                            ncbi_molecule_count,
                                            ncbi_unspanned_gaps,
                                            ncbi_spanned_gaps,
                                            lpsn_priority_year,
                                            dsmz_priority_year,
                                            straininfo_priority_year)

        self._apply_ncbi_taxonomy_ledgers(species_exception_file,
                                            genus_exception_file)
Example #2
0
    def run(self, prev_gtdb_metadata_file, cur_gtdb_metadata_file,
            ncbi_genbank_assembly_file, gtdb_domain_report,
            gtdb_type_strains_ledger, qc_exception_file,
            ncbi_env_bioproject_ledger, min_comp, max_cont, min_quality,
            sh_exception, min_perc_markers, max_contigs, min_N50,
            max_ambiguous):
        """Quality check all potential GTDB genomes.

        Loads the previous and current GTDB genome sets, applies QC to the
        current set, checks domain assignments of passing genomes, reports
        QC results per NCBI species, and finally warns about genomes with
        an unchanged assembly version that were present in the previous
        release but fail QC now.
        """

        # ledger/assembly inputs given to both genome sets
        shared_inputs = {
            'gtdb_type_strains_ledger': gtdb_type_strains_ledger,
            'ncbi_genbank_assembly_file': ncbi_genbank_assembly_file,
            'ncbi_env_bioproject_ledger': ncbi_env_bioproject_ledger,
        }

        # QC thresholds, in the order they are passed positionally to
        # qc_genomes() and check_qc_of_ncbi_species()
        qc_thresholds = (min_comp, max_cont, min_quality, sh_exception,
                         min_perc_markers, max_contigs, min_N50,
                         max_ambiguous)

        # previous release: species clusters are built (loader default)
        self.logger.info('Creating previous GTDB genome set.')
        prev_genomes = Genomes()
        prev_genomes.load_from_metadata_file(prev_gtdb_metadata_file,
                                             **shared_inputs)
        self.logger.info(
            f' - previous genome set contains {len(prev_genomes):,} genomes.')
        prev_clusters = prev_genomes.sp_clusters
        self.logger.info(
            ' - previous genome set has {:,} species clusters spanning {:,} genomes.'
            .format(len(prev_clusters), prev_clusters.total_num_genomes()))

        # current release: no species clusters requested
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                            create_sp_clusters=False,
                                            **shared_inputs)

        # genomes exempted from QC
        qc_exceptions = self.parse_qc_exception_file(qc_exception_file)
        self.logger.info(
            f'Identified {len(qc_exceptions):,} genomes flagged as exceptions from QC.'
        )

        # percentage of bac120 or ar122 marker genes per genome
        marker_perc = self.parse_marker_percentages(gtdb_domain_report)

        # 'excluded from RefSeq' notes from the NCBI assembly file
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(
            ncbi_genbank_assembly_file)

        # apply QC to every current genome
        self.logger.info('Validating genomes.')
        passed_qc_gids, failed_qc_gids = self.qc_genomes(
            cur_genomes, marker_perc, qc_exceptions,
            excluded_from_refseq_note, *qc_thresholds)

        # report potential domain assignment issues among passing genomes
        self.check_domain_assignments(gtdb_domain_report, passed_qc_gids)

        # report QC results for the genomes of each NCBI species
        self.check_qc_of_ncbi_species(cur_genomes, marker_perc,
                                      qc_exceptions,
                                      excluded_from_refseq_note,
                                      *qc_thresholds)

        # Sanity check: a genome present in the previous release whose
        # assembly version is unchanged should rarely, if ever, fail QC
        # now. Genomes with a changed assembly are ignored since their QC
        # status is not expected to be the same.
        unexpected_qc_fail = [
            gid for gid in prev_genomes
            if gid in cur_genomes
            and same_assembly_version(prev_genomes[gid].ncbi_accn,
                                      cur_genomes[gid].ncbi_accn)
            and gid in failed_qc_gids
        ]

        if unexpected_qc_fail:
            num_unexpected = len(unexpected_qc_fail)
            examples = ','.join(unexpected_qc_fail[0:10])
            self.logger.warning(
                'Identified {:,} genomes that passed QC in previous GTDB release, that failed QC in this release.'
                .format(num_unexpected))
            self.logger.warning(' - examples: {}'.format(examples))
    def run(self,
                metadata_file,
                cur_uba_gid_file,
                ncbi_genbank_assembly_file,
                gtdb_domain_report,
                qc_exception_file,
                min_comp,
                max_cont,
                min_quality,
                sh_exception,
                min_perc_markers,
                max_contigs,
                min_N50,
                max_ambiguous,
                output_dir):
        """Quality check all potential GTDB genomes.

        Loads the current genome set, applies the QC criteria to every
        genome and writes these tab-separated reports to output_dir:

          qc_passed.tsv            genomes passing QC or flagged as exceptions
          qc_failed.tsv            genomes failing QC with per-test results
          type_genomes_fail_qc.tsv type genomes of species with no passing type genome
          species_fail_qc.tsv      genomes of species where no genome passes
          species_lost.tsv         per-species summary for those species

        Parameters
        ----------
        metadata_file
            Metadata file given to Genomes.load_from_metadata_file().
        cur_uba_gid_file
            File given to the loader as its uba_genome_file.
        ncbi_genbank_assembly_file
            File given to exclude_from_refseq().
        gtdb_domain_report
            File given to self.read_marker_percentages() and
            self.check_domain_assignments().
        qc_exception_file
            UTF-8 file with a header line; genomes in the first column are
            retained even if they fail QC.
        min_comp, max_cont, min_quality, sh_exception, min_perc_markers,
        max_contigs, min_N50, max_ambiguous
            QC criteria passed positionally, in this order, to
            Genome.pass_qc().
        output_dir
            Existing directory in which the reports are created.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(metadata_file,
                                                create_sp_clusters=False,
                                                uba_genome_file=cur_uba_gid_file)
        self.logger.info(f' ...current genome set contains {len(cur_genomes):,} genomes.')

        # parse genomes flagged as exceptions from QC
        qc_exceptions = set()
        with open(qc_exception_file, encoding='utf-8') as f:
            f.readline()
            for line in f:
                gid = canonical_gid(line.split('\t')[0].strip())
                qc_exceptions.add(gid)
        self.logger.info(f'Identified {len(qc_exceptions):,} genomes flagged as exceptions from QC.')

        # get percentage of bac120 or ar122 marker genes
        marker_perc = self.read_marker_percentages(gtdb_domain_report,
                                                    cur_genomes)

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

        # QC criteria in the positional order used for Genome.pass_qc()
        qc_criteria = (min_comp,
                        max_cont,
                        min_quality,
                        sh_exception,
                        min_perc_markers,
                        max_contigs,
                        min_N50,
                        max_ambiguous)

        # QC all genomes
        self.logger.info('Validating genomes.')
        pass_qc_gids, failed_qc_gids = self._write_genome_qc_tables(cur_genomes,
                                                                    marker_perc,
                                                                    qc_exceptions,
                                                                    qc_criteria,
                                                                    output_dir)

        # guard against an empty genome set when reporting percentages
        num_genomes = len(cur_genomes)
        pass_perc = len(pass_qc_gids)*100.0/num_genomes if num_genomes else 0.0
        failed_perc = len(failed_qc_gids)*100.0/num_genomes if num_genomes else 0.0
        self.logger.info('Retained {:,} ({:.2f}%) genomes and filtered {:,} ({:.2f}%) genomes.'.format(
                            len(pass_qc_gids),
                            pass_perc,
                            len(failed_qc_gids),
                            failed_perc))

        # check domain assignment of genomes passing QC
        # report potential issues
        self.check_domain_assignments(gtdb_domain_report,
                                        cur_genomes,
                                        pass_qc_gids)

        # QC genomes in each named species
        self._write_ncbi_species_qc_tables(cur_genomes,
                                            marker_perc,
                                            qc_exceptions,
                                            excluded_from_refseq_note,
                                            qc_criteria,
                                            output_dir)

    def _write_genome_qc_tables(self,
                                cur_genomes,
                                marker_perc,
                                qc_exceptions,
                                qc_criteria,
                                output_dir):
        """Apply QC to every genome and write qc_passed.tsv and qc_failed.tsv.

        A genome is retained if it passes QC or is listed in qc_exceptions.
        Returns the tuple (pass_qc_gids, failed_qc_gids) of genome ID sets.
        """

        header = 'Accession\tNCBI species\tGTDB taxonomy'
        header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'

        pass_qc_gids = set()
        failed_qc_gids = set()

        # context managers close both files even if a row fails to format
        with open(os.path.join(output_dir, 'qc_passed.tsv'), 'w') as fout_retained, \
                open(os.path.join(output_dir, 'qc_failed.tsv'), 'w') as fout_failed:
            fout_retained.write(header + '\tNote\n')
            fout_failed.write(header)
            fout_failed.write('\tFailed completeness\tFailed contamination\tFailed quality')
            fout_failed.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n')

            for gid in cur_genomes:
                genome = cur_genomes[gid]
                failed_tests = defaultdict(int)
                passed_qc = genome.pass_qc(marker_perc[gid],
                                            *qc_criteria,
                                            failed_tests)

                # columns shared by the passed and failed tables; a falsy
                # strain heterogeneity is reported as '-'
                sh_100 = genome.strain_heterogeneity_100
                row = '%s\t%s\t%s' % (gid, genome.ncbi_taxa.species, genome.gtdb_taxa)
                row += '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                            genome.comp,
                            genome.cont,
                            genome.comp-5*genome.cont,
                            ('%.2f' % sh_100) if sh_100 else '-',
                            marker_perc[gid],
                            genome.contig_count,
                            genome.contig_n50,
                            genome.ambiguous_bases)

                if passed_qc or gid in qc_exceptions:
                    pass_qc_gids.add(gid)
                    fout_retained.write(row)
                    fout_retained.write('\t%s\n' % ('Passed QC' if passed_qc else 'Flagged as exception'))
                else:
                    failed_qc_gids.add(gid)
                    fout_failed.write(row)
                    fout_failed.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                                        failed_tests['comp'],
                                        failed_tests['cont'],
                                        failed_tests['qual'],
                                        failed_tests['marker_perc'],
                                        failed_tests['contig_count'],
                                        failed_tests['N50'],
                                        failed_tests['ambig']))

        return pass_qc_gids, failed_qc_gids

    def _write_ncbi_species_qc_tables(self,
                                        cur_genomes,
                                        marker_perc,
                                        qc_exceptions,
                                        excluded_from_refseq_note,
                                        qc_criteria,
                                        output_dir):
        """Report named NCBI species that lose their type genome or all genomes to QC.

        Writes type_genomes_fail_qc.tsv, species_fail_qc.tsv and
        species_lost.tsv to output_dir and logs summary counts.
        """

        named_ncbi_species = cur_genomes.named_ncbi_species()
        self.logger.info(f'Performing QC of type genome for each of the {len(named_ncbi_species):,} NCBI species.')

        lost_type = 0
        lost_sp = 0
        filtered_genomes = 0
        failed_tests_cumulative = defaultdict(int)

        # context managers close all three files even if a row fails to format
        with open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w') as fout_type_fail, \
                open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w') as fout_fail_sp, \
                open(os.path.join(output_dir, 'species_lost.tsv'), 'w') as fout_sp_lost:
            # NOTE(review): the 'Genome size (bp)' columns below are filled
            # with length/1e6; header text kept as-is since downstream
            # consumers may depend on it
            fout_type_fail.write('NCBI species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)')
            fout_type_fail.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
            fout_type_fail.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n')

            fout_fail_sp.write('NCBI species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)')
            fout_fail_sp.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
            fout_fail_sp.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
            fout_fail_sp.write('\tFailed completeness\tFailed contamination\tFailed quality')
            fout_fail_sp.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases')
            fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

            fout_sp_lost.write('NCBI species\tNo. genomes\tNo. type genomes')
            fout_sp_lost.write('\tFail completeness\tFail contamination\tFail quality\tFailed percent markers')
            fout_sp_lost.write('\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

            for sp, gids in named_ncbi_species.items():
                type_pass = set()
                type_fail = set()
                other_pass = set()
                other_fail = set()

                failed_tests_gids = {}
                for gid in gids:
                    genome = cur_genomes[gid]
                    failed_tests = defaultdict(int)
                    passed_qc = genome.pass_qc(marker_perc[gid],
                                                *qc_criteria,
                                                failed_tests)

                    failed_tests_gids[gid] = failed_tests

                    retained = passed_qc or gid in qc_exceptions
                    if genome.is_gtdb_type_strain() or genome.is_ncbi_type_strain():
                        (type_pass if retained else type_fail).add(gid)
                    else:
                        (other_pass if retained else other_fail).add(gid)

                    if not retained:
                        filtered_genomes += 1

                    # tally failed tests across all genomes in named species
                    for test, count in failed_tests.items():
                        failed_tests_cumulative[test] += count

                if type_pass:
                    # great: one or more type genomes pass QC and will be selected as the type genome
                    continue

                if type_fail:
                    # all potential type genomes for species failed QC so report these for manual inspection
                    lost_type += 1
                    for gid in type_fail:
                        genome = cur_genomes[gid]
                        fout_type_fail.write('%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%s\t%s\n' % (
                                                sp,
                                                gid,
                                                genome.gtdb_taxa,
                                                genome.ncbi_taxa,
                                                genome.gtdb_type_designation_sources,
                                                genome.ncbi_type_material,
                                                float(genome.length)/1e6,
                                                genome.comp,
                                                genome.cont,
                                                genome.comp-5*genome.cont,
                                                genome.strain_heterogeneity_100,
                                                marker_perc[gid],
                                                genome.contig_count,
                                                genome.contig_n50,
                                                genome.ambiguous_bases,
                                                # genome may be absent from the assembly file
                                                excluded_from_refseq_note.get(gid, ''),
                                                not other_pass))

                if not other_pass:
                    # no genomes for species pass QC so report loss of species
                    lost_sp += 1
                    fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
                    fout_sp_lost.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                                        sum(failed_tests_gids[gid]['comp'] for gid in gids),
                                        sum(failed_tests_gids[gid]['cont'] for gid in gids),
                                        sum(failed_tests_gids[gid]['qual'] for gid in gids),
                                        sum(failed_tests_gids[gid]['marker_perc'] for gid in gids),
                                        sum(failed_tests_gids[gid]['contig_count'] for gid in gids),
                                        sum(failed_tests_gids[gid]['N50'] for gid in gids),
                                        sum(failed_tests_gids[gid]['ambig'] for gid in gids)))

                    for gid in type_fail.union(other_fail):
                        genome = cur_genomes[gid]
                        fout_fail_sp.write('%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d' % (
                                                sp,
                                                gid,
                                                genome.gtdb_taxa,
                                                genome.ncbi_taxa,
                                                gid in type_fail,
                                                float(genome.length)/1e6,
                                                genome.comp,
                                                genome.cont,
                                                genome.comp-5*genome.cont,
                                                genome.strain_heterogeneity_100,
                                                marker_perc[gid],
                                                genome.contig_count,
                                                genome.contig_n50,
                                                genome.ambiguous_bases))
                        fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                                            failed_tests_gids[gid]['comp'],
                                            failed_tests_gids[gid]['cont'],
                                            failed_tests_gids[gid]['qual'],
                                            failed_tests_gids[gid]['marker_perc'],
                                            failed_tests_gids[gid]['contig_count'],
                                            failed_tests_gids[gid]['N50'],
                                            failed_tests_gids[gid]['ambig']))
                        fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note.get(gid, ''))

        self.logger.info(f'Filtered {filtered_genomes:,} genomes assigned to NCBI species.')
        self.logger.info(f'Identified {lost_type:,} species with type genomes failing QC and {lost_sp:,} total species failing QC.')
        self.logger.info('Genomes from NCBI species filtered by each criterion:')
        for test in sorted(failed_tests_cumulative):
            self.logger.info(f'{test}: {failed_tests_cumulative[test]:,}')
Example #4
0
    def load_from_metadata_file(self,
                                metadata_file,
                                species_exception_file=None,
                                genus_exception_file=None,
                                gtdb_type_strains_ledger=None,
                                create_sp_clusters=True,
                                qc_passed_file=None,
                                ncbi_genbank_assembly_file=None,
                                untrustworthy_type_ledger=None,
                                ncbi_untrustworthy_sp_ledger=None,
                                ncbi_env_bioproject_ledger=None):
        """Create genome set from file(s).

        Populates `self.genomes` (and `self.full_gid`, plus `self.sp_clusters`
        when requested) from a tab-separated metadata file, then applies the
        NCBI taxonomy ledgers via `self._apply_ncbi_taxonomy_ledgers`.

        Parameters
        ----------
        metadata_file
            Tab-separated file with a header row. Every column looked up with
            `headers.index(...)` below must be present;
            'checkm_strain_heterogeneity_100' and 'lpsn_priority_year' are
            optional.
        species_exception_file, genus_exception_file
            Passed unchanged to `self._apply_ncbi_taxonomy_ledgers`.
        gtdb_type_strains_ledger
            Tab-separated file with a header row; genomes named in the first
            column are forced to 'type strain of species' with source
            'GTDB curator'.
        create_sp_clusters
            When true, each retained genome is registered with
            `self.sp_clusters` under its GTDB representative.
        qc_passed_file
            Tab-separated file with a header row; when it yields at least one
            ID, genomes whose canonical ID is not in its first column are
            skipped.
        ncbi_genbank_assembly_file
            Passed to `parse_ncbi_bioproject` and `exclude_from_refseq`.
        untrustworthy_type_ledger, ncbi_untrustworthy_sp_ledger
            Parsed by `self.parse_untrustworthy_type_ledger` and
            `self.parse_ncbi_untrustworthy_sp_ledger` respectively; membership
            in the results is passed to `Genome` as boolean flags.
        ncbi_env_bioproject_ledger
            Tab-separated file with a header row; genomes whose entry in the
            `parse_ncbi_bioproject` result appears in its first column have
            their NCBI genome category overridden.

        Raises
        ------
        ValueError
            If a required column is missing from the metadata header, or a
            numeric field cannot be converted.
        """

        def read_first_column(path):
            """Yield the stripped first tab-separated field of each row after the header."""
            with open(path) as f:
                f.readline()
                for line in f:
                    yield line.strip().split('\t')[0].strip()

        pass_qc_gids = set()
        if qc_passed_file:
            pass_qc_gids = set(read_first_column(qc_passed_file))
            self.logger.info(
                f' - identified {len(pass_qc_gids):,} genomes passing QC.')

        gtdb_type_strains = set()
        if gtdb_type_strains_ledger:
            gtdb_type_strains = {
                canonical_gid(token)
                for token in read_first_column(gtdb_type_strains_ledger)
            }
            self.logger.info(
                f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.'
            )

        excluded_from_refseq_note = {}
        ncbi_bioproject = {}
        if ncbi_genbank_assembly_file:
            ncbi_bioproject = parse_ncbi_bioproject(ncbi_genbank_assembly_file)
            excluded_from_refseq_note = exclude_from_refseq(
                ncbi_genbank_assembly_file)

        ncbi_env_bioproject = set()
        if ncbi_env_bioproject_ledger:
            ncbi_env_bioproject = set(
                read_first_column(ncbi_env_bioproject_ledger))

        untrustworthy_as_type = set()
        if untrustworthy_type_ledger:
            untrustworthy_as_type = self.parse_untrustworthy_type_ledger(
                untrustworthy_type_ledger)
            self.logger.info(
                f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type by GTDB.'
            )

        untrustworthy_ncbi_sp = set()
        if ncbi_untrustworthy_sp_ledger:
            untrustworthy_ncbi_sp = self.parse_ncbi_untrustworthy_sp_ledger(
                ncbi_untrustworthy_sp_ledger)
            self.logger.info(
                f' - identified {len(untrustworthy_ncbi_sp):,} genomes annotated as having untrustworthy NCBI species assignments.'
            )

        with open(metadata_file, encoding='utf-8') as f:
            headers = f.readline().strip().split('\t')

            genome_index = headers.index('accession')

            gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
            ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
            ncbi_taxonomy_unfiltered_index = headers.index(
                'ncbi_taxonomy_unfiltered')

            gtdb_type_index = headers.index('gtdb_type_designation')
            gtdb_type_sources_index = headers.index(
                'gtdb_type_designation_sources')
            gtdb_type_species_of_genus_index = headers.index(
                'gtdb_type_species_of_genus')
            ncbi_strain_identifiers_index = headers.index(
                'ncbi_strain_identifiers')
            ncbi_type_index = headers.index('ncbi_type_material_designation')
            ncbi_asm_level_index = headers.index('ncbi_assembly_level')
            ncbi_genome_representation_index = headers.index(
                'ncbi_genome_representation')
            ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
            ncbi_genome_cat_index = headers.index('ncbi_genome_category')

            comp_index = headers.index('checkm_completeness')
            cont_index = headers.index('checkm_contamination')

            # optional column: None means "not present in this file"
            sh_100_index = None
            if 'checkm_strain_heterogeneity_100' in headers:
                sh_100_index = headers.index('checkm_strain_heterogeneity_100')

            gs_index = headers.index('genome_size')
            contig_count_index = headers.index('contig_count')
            n50_index = headers.index('n50_contigs')
            scaffold_count_index = headers.index('scaffold_count')
            ambiguous_bases_index = headers.index('ambiguous_bases')
            total_gap_len_index = headers.index('total_gap_length')
            ssu_count_index = headers.index('ssu_count')
            ssu_length_index = headers.index('ssu_length')
            ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
            ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
            ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

            gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
            gtdb_rep_index = headers.index('gtdb_representative')

            # this information will be missing from the previous
            # GTDB metadata file as we strip this out due to
            # concerns over republishing this information;
            # resolved once here rather than re-scanning the header per row
            lpsn_priority_index = None
            if 'lpsn_priority_year' in headers:
                lpsn_priority_index = headers.index('lpsn_priority_year')

            for line in f:
                line_split = line.strip().split('\t')

                ncbi_accn = line_split[genome_index]
                gid = canonical_gid(ncbi_accn)

                # the full accession is recorded even for genomes skipped below
                self.full_gid[gid] = ncbi_accn

                if gid.startswith('U_'):
                    continue

                # an empty QC set means no QC filtering was requested
                if pass_qc_gids and gid not in pass_qc_gids:
                    continue

                gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])

                ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
                ncbi_taxonomy_unfiltered = Taxa(
                    line_split[ncbi_taxonomy_unfiltered_index], filtered=False)

                gtdb_type = line_split[gtdb_type_index]
                gtdb_type_sources = line_split[gtdb_type_sources_index]
                if gid in gtdb_type_strains:
                    # curator ledger overrides the metadata file
                    gtdb_type = 'type strain of species'
                    gtdb_type_sources = 'GTDB curator'
                gtdb_type_species_of_genus = line_split[
                    gtdb_type_species_of_genus_index] == 't'

                ncbi_type = line_split[ncbi_type_index]
                ncbi_strain_identifiers = line_split[
                    ncbi_strain_identifiers_index]
                ncbi_asm_level = line_split[ncbi_asm_level_index]
                ncbi_genome_representation = line_split[
                    ncbi_genome_representation_index]
                ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
                ncbi_genome_cat = line_split[ncbi_genome_cat_index]

                if ncbi_bioproject.get(gid, None) in ncbi_env_bioproject:
                    # HACK to force genomes from MAG mining projects
                    # to be indicated as MAGs which are currently
                    # not correctly annotated at NCBI
                    ncbi_genome_cat = 'derived from environmental source'

                comp = float(line_split[comp_index])
                cont = float(line_split[cont_index])
                sh_100 = 0
                # compare against None: a column at position 0 is a valid
                # index and must not be mistaken for "column absent"
                if sh_100_index is not None:
                    sh_100 = self._convert_float(line_split[sh_100_index])
                gs = int(line_split[gs_index])
                contig_count = int(line_split[contig_count_index])
                n50 = int(line_split[n50_index])
                scaffold_count = int(line_split[scaffold_count_index])
                ambiguous_bases = int(line_split[ambiguous_bases_index])
                total_gap_len = int(line_split[total_gap_len_index])
                ssu_count = int(line_split[ssu_count_index])
                ssu_length = self._convert_int(line_split[ssu_length_index])
                ncbi_molecule_count = self._convert_int(
                    line_split[ncbi_molecule_count_index])
                ncbi_unspanned_gaps = self._convert_int(
                    line_split[ncbi_unspanned_gaps_index])
                ncbi_spanned_gaps = self._convert_int(
                    line_split[ncbi_spanned_gaps_index])

                gtdb_is_rep = line_split[gtdb_rep_index] == 't'
                gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
                if create_sp_clusters:
                    self.sp_clusters.update_sp_cluster(gtdb_rid, gid,
                                                       gtdb_taxonomy.species)

                lpsn_priority_year = Genome.NO_PRIORITY_YEAR
                if lpsn_priority_index is not None:
                    lpsn_priority_year = self._convert_int(
                        line_split[lpsn_priority_index],
                        Genome.NO_PRIORITY_YEAR)

                self.genomes[gid] = Genome(
                    gid,
                    ncbi_accn,
                    gtdb_rid,
                    gtdb_is_rep,
                    gtdb_taxonomy,
                    ncbi_taxonomy,
                    ncbi_taxonomy_unfiltered,
                    gtdb_type,
                    gtdb_type_sources,
                    gtdb_type_species_of_genus,
                    gid in untrustworthy_as_type,
                    gid in untrustworthy_ncbi_sp,
                    ncbi_type,
                    ncbi_strain_identifiers,
                    ncbi_asm_level,
                    ncbi_genome_representation,
                    ncbi_refseq_cat,
                    ncbi_genome_cat,
                    excluded_from_refseq_note.get(gid, ''),
                    comp,
                    cont,
                    sh_100,
                    gs,
                    contig_count,
                    n50,
                    scaffold_count,
                    ambiguous_bases,
                    total_gap_len,
                    ssu_count,
                    ssu_length,
                    ncbi_molecule_count,
                    ncbi_unspanned_gaps,
                    ncbi_spanned_gaps,
                    lpsn_priority_year)

        self._apply_ncbi_taxonomy_ledgers(species_exception_file,
                                          genus_exception_file)
Example #5
0
    def run(self,
            qc_file,
            metadata_file,
            gtdb_user_genomes_file,
            genome_path_file,
            type_genome_cluster_file,
            type_genome_synonym_file,
            ncbi_refseq_assembly_file,
            ncbi_genbank_assembly_file,
            ani_af_nontype_vs_type,
            species_exception_file,
            rnd_type_genome):
        """Infer de novo species clusters and type genomes for remaining genomes.

        Genomes passing QC that are neither type genomes nor already clustered
        to one are grouped around newly selected representatives, the new
        clusters are named, and three tables are written to `self.output_dir`:
        'gtdb_rep_genome_info.tsv', 'gtdb_clusters_final.tsv' and
        'gtdb_ani_radius_final.tsv'.
        """

        # --- gather inputs -------------------------------------------------

        # genomes that passed the quality criteria
        self.logger.info('Reading QC file.')
        qc_pass_gids = read_qc_file(qc_file)
        self.logger.info('Identified %d genomes passing QC.' % len(qc_pass_gids))

        # NCBI and GTDB taxonomy strings for each genome
        self.logger.info('Reading NCBI taxonomy from GTDB metadata file.')
        ncbi_taxonomy, ncbi_update_count = read_gtdb_ncbi_taxonomy(
            metadata_file, species_exception_file)
        gtdb_taxonomy = read_gtdb_taxonomy(metadata_file)
        self.logger.info(
            'Read NCBI taxonomy for %d genomes with %d manually defined updates.'
            % (len(ncbi_taxonomy), ncbi_update_count))
        self.logger.info('Read GTDB taxonomy for %d genomes.' % len(gtdb_taxonomy))

        # NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        refseq_exclusion_notes = exclude_from_refseq(ncbi_refseq_assembly_file,
                                                     ncbi_genbank_assembly_file)

        # genome FASTA paths, restricted in place to genomes passing QC
        self.logger.info('Reading path to genome FASTA files.')
        genome_paths = read_genome_path(genome_path_file)
        self.logger.info('Read path for %d genomes.' % len(genome_paths))
        failed_qc = [gid for gid in genome_paths if gid not in qc_pass_gids]
        for gid in failed_qc:
            del genome_paths[gid]
        self.logger.info(
            'Considering %d genomes as potential representatives after removing unwanted User genomes.'
            % len(genome_paths))
        assert len(genome_paths) == len(qc_pass_gids)

        # type genomes and the genomes already clustered to them
        (type_species,
         species_type_gid,
         type_gids,
         type_clustered_gids,
         type_radius) = self._parse_type_clusters(type_genome_cluster_file)
        assert len(type_species) == len(type_gids)
        self.logger.info('Identified %d type genomes.' % len(type_gids))
        self.logger.info('Identified %d clustered genomes.' % len(type_clustered_gids))

        # per-genome quality statistics and the score derived from them
        self.logger.info('Parse quality statistics for all genomes.')
        quality_metadata = read_quality_metadata(metadata_file)

        self.logger.info('Calculating genome quality score.')
        genome_quality = quality_score(quality_metadata.keys(), quality_metadata)

        # --- select representatives and cluster ----------------------------

        # genomes still needing a cluster
        remaining_gids = qc_pass_gids - type_gids - type_clustered_gids
        self.logger.info('Identified %d unclustered genomes passing QC.'
                         % len(remaining_gids))

        # closest type genome for each unclustered genome
        self.logger.info('Determining ANI circumscription for %d unclustered genomes.'
                         % len(remaining_gids))
        nontype_radius = self._nontype_radius(remaining_gids,
                                              type_gids,
                                              ani_af_nontype_vs_type)

        # Mash ANI estimates between unclustered genomes
        self.logger.info('Calculating Mash ANI estimates between unclustered genomes.')
        mash_anis = self._mash_ani_unclustered(genome_paths, remaining_gids)

        # greedy, quality-ordered choice of species representatives
        rep_gids = self._selected_rep_genomes(genome_paths,
                                              nontype_radius,
                                              remaining_gids,
                                              mash_anis,
                                              quality_metadata,
                                              rnd_type_genome)

        # assign all non-type/non-rep genomes to a type or representative genome;
        # non-type radii take precedence over type radii on key collisions
        cluster_radius = type_radius.copy()
        cluster_radius.update(nontype_radius)

        clusters, ani_af = self._cluster_genomes(genome_paths,
                                                 rep_gids,
                                                 type_gids,
                                                 qc_pass_gids,
                                                 cluster_radius)
        rep_clusters = {gid: clusters[gid] for gid in rep_gids}

        # --- name the new clusters -----------------------------------------

        # synonyms restrict which species names may be used
        synonyms = self._parse_synonyms(type_genome_synonym_file)
        self.logger.info('Identified %d synonyms.' % len(synonyms))

        # User genomes with NCBI accessions that may form species names
        gtdb_user_to_genbank = self._gtdb_user_genomes(gtdb_user_genomes_file,
                                                       metadata_file)
        self.logger.info('Identified %d GTDB User genomes with NCBI accessions.'
                         % len(gtdb_user_to_genbank))

        used_names = synonyms.union(type_species)
        self.logger.info('Identified %d species names already in use.' % len(used_names))
        self.logger.info('Assigning species name to each de novo species cluster.')
        cluster_sp_names = self._assign_species_names(rep_clusters,
                                                      used_names,
                                                      gtdb_taxonomy,
                                                      gtdb_user_to_genbank)

        # --- write results -------------------------------------------------

        # details about the selected representative genomes
        rep_info_path = os.path.join(self.output_dir, 'gtdb_rep_genome_info.tsv')
        self._write_rep_info(rep_clusters,
                             cluster_sp_names,
                             quality_metadata,
                             genome_quality,
                             refseq_exclusion_notes,
                             ani_af,
                             rep_info_path)

        # keep radius entries only for genomes that head a final cluster
        stale_gids = [gid for gid in cluster_radius if gid not in clusters]
        for gid in stale_gids:
            del cluster_radius[gid]

        # the de novo name table is extended in place with the type species
        cluster_sp_names.update(species_type_gid)
        all_species = cluster_sp_names

        self.logger.info('Writing %d species clusters to file.' % len(all_species))
        self.logger.info('Writing %d cluster radius information to file.'
                         % len(cluster_radius))

        clusters_path = os.path.join(self.output_dir, 'gtdb_clusters_final.tsv')
        write_clusters(clusters, cluster_radius, all_species, clusters_path)

        radius_path = os.path.join(self.output_dir, 'gtdb_ani_radius_final.tsv')
        write_rep_radius(cluster_radius, all_species, radius_path)