def test_ncbi_type_material(self):
        """Check NCBI type strain and type subspecies calls for each trust scenario."""

        genome = copy.copy(test_genome)

        # taxonomy string carrying a genuine subspecies assignment; reused below
        subsp_taxonomy = "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri;sb__Escherichia flexneri subsp. testus"

        # scenario 1: GTDB itself marks the genome as untrustworthy as type;
        # the strain check runs before the subspecies taxonomy is assigned
        genome.excluded_from_refseq_note = ""
        genome.gtdb_untrustworthy_as_type = True
        genome.ncbi_type_material = 'assembly from type material'
        assert not genome.is_ncbi_type_strain()
        genome.ncbi_unfiltered_taxa = Taxa(subsp_taxonomy)
        assert not genome.is_ncbi_type_subspecies()

        # scenario 2: NCBI's RefSeq exclusion note marks it as untrustworthy
        genome.excluded_from_refseq_note = "untrustworthy as type"
        genome.gtdb_untrustworthy_as_type = False
        genome.ncbi_type_material = 'assembly from type material'
        assert not genome.is_ncbi_type_strain()
        genome.ncbi_unfiltered_taxa = Taxa(subsp_taxonomy)
        assert not genome.is_ncbi_type_subspecies()

        # scenario 3: trustworthy type material assigned to the species itself
        genome.excluded_from_refseq_note = ""
        genome.gtdb_untrustworthy_as_type = False
        genome.ncbi_unfiltered_taxa = Taxa(
            "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri;sb__Escherichia flexneri"
        )
        genome.ncbi_type_material = 'assembly from type material'
        assert genome.is_ncbi_type_strain()
        assert not genome.is_ncbi_type_subspecies()

        # scenario 4: trustworthy type material assigned to an NCBI subspecies
        genome.ncbi_unfiltered_taxa = Taxa(subsp_taxonomy)
        genome.ncbi_type_material = 'assembly from type material'
        assert not genome.is_ncbi_type_strain()
        assert genome.is_ncbi_type_subspecies()
Beispiel #2
0
    def set_gtdbtk_classification(self, gtdbtk_classify_file, prev_genomes):
        """Apply GTDB-Tk classifications to genomes, deferring to new NCBI species.

        Returns a tuple with the number of genomes that received a GTDB-Tk
        classification and the number whose genus and species were then
        replaced by their NCBI assignment.
        """

        # collect every species name used in the previous GTDB release
        known_ncbi_species = set()
        known_gtdb_species = set()
        for prev_gid in prev_genomes:
            prev_genome = prev_genomes[prev_gid]
            known_ncbi_species.add(prev_genome.ncbi_taxa.species)
            known_gtdb_species.add(prev_genome.gtdb_taxa.species)

        # Each genome takes its GTDB-Tk classification, except that a genome
        # with an NCBI species name not seen in the previous release keeps the
        # NCBI genus and species. GTDB-Tk has no knowledge of such names, which
        # makes them awkward for curation: a typical case is a new genome that
        # is the most basal member of a genus and is placed in that genus by
        # GTDB-Tk although it comes from a newly proposed genus, which should
        # be preferred so the GTDB reflects community opinion. New NCBI names
        # above the rank of genus are ignored here since they are relatively
        # rare and are caught for manual curation by the curation trees that
        # highlight previously unseen NCBI names (see update_curation_trees).
        classifications = read_gtdbtk_classifications(gtdbtk_classify_file)
        updated_count = 0
        ncbi_sp_count = 0
        for gid in self.genomes:
            if gid not in classifications:
                continue

            updated_count += 1
            genome = self.genomes[gid]
            genome.gtdb_taxa.update_taxa(Taxa(';'.join(classifications[gid])))

            ncbi_species = genome.ncbi_taxa.species
            previously_seen = (ncbi_species in known_ncbi_species
                               or ncbi_species in known_gtdb_species)
            if ncbi_species != 's__' and not previously_seen:
                # ranks 5 and 6 are the genus and species slots
                genome.gtdb_taxa.set_taxa(5, genome.ncbi_taxa.genus)
                genome.gtdb_taxa.set_taxa(6, ncbi_species)
                ncbi_sp_count += 1

        return updated_count, ncbi_sp_count
    def test_ncbi_subspecies(self):
        """Check which unfiltered NCBI taxonomies count as a subspecies assignment."""

        genome = copy.copy(test_genome)

        # each entry: (unfiltered NCBI taxonomy, is it expected to be a subspecies?)
        cases = (
            # taxonomy stops at species, so no subspecies is defined
            ("d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri",
             False),
            # explicit 'subsp.' assignment
            ("d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri;sb__Escherichia flexneri subsp. testus",
             True),
            # a variety is not a subspecies
            ("d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri;sb__Escherichia flexneri var. testus",
             False),
            # neither is a strain designation
            ("d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri;sb__Escherichia flexneri str testus",
             False),
            # variety checked once more, after the strain case
            ("d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri;sb__Escherichia flexneri var. testus",
             False),
            # when the subspecies epithet repeats the specific name the genome
            # belongs to the species itself; this matters for recognising the
            # type strain of the species rather than of a subspecies
            ("d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri;sb__Escherichia flexneri subsp. flexneri",
             False),
        )

        for taxonomy, expected in cases:
            genome.ncbi_unfiltered_taxa = Taxa(taxonomy)
            assert bool(genome.is_ncbi_subspecies()) is expected
    def load_from_metadata_file(self,
                                metadata_file,
                                species_exception_file=None,
                                genus_exception_file=None,
                                gtdb_type_strains_ledger=None,
                                create_sp_clusters=True,
                                uba_genome_file=None,
                                qc_passed_file=None,
                                ncbi_genbank_assembly_file=None,
                                untrustworthy_type_ledger=None):
        """Create genome set from file(s).

        Reads the tab-separated GTDB metadata table and stores one Genome per
        retained row in self.genomes, then applies the NCBI taxonomy ledgers.

        Args:
            metadata_file: Tab-separated metadata table with a header row; it
                is opened as UTF-8 and columns are located by header name.
            species_exception_file: Forwarded unchanged to
                self._apply_ncbi_taxonomy_ledgers.
            genus_exception_file: Forwarded unchanged to
                self._apply_ncbi_taxonomy_ledgers.
            gtdb_type_strains_ledger: Optional tab-separated file (first line
                skipped) whose first column lists genomes to be treated as
                'type strain of species' with source 'GTDB curator'.
            create_sp_clusters: When true, every retained genome is registered
                with self.sp_clusters under its representative genome.
            uba_genome_file: Optional tab-separated file (no line skipped)
                whose first column lists the UBA identifiers to retain.
            qc_passed_file: Optional tab-separated file (first line skipped)
                whose first column lists genome IDs passing QC; when it yields
                any IDs, genomes outside this set are skipped.
            ncbi_genbank_assembly_file: Optional file handed to
                exclude_from_refseq; the result is queried per genome ID for
                the 'excluded from RefSeq' note.
            untrustworthy_type_ledger: Optional file handed to
                self.parse_untrustworthy_type_ledger; membership of a genome
                ID in the result is passed to the Genome constructor.

        Raises:
            ValueError: If a required column name is absent from the header
                row (raised by list.index), or a numeric field cannot be
                parsed by int()/float().
        """

        pass_qc_gids = set()
        if qc_passed_file:
            with open(qc_passed_file) as f:
                f.readline()
                for line in f:
                    line_split = line.strip().split('\t')
                    pass_qc_gids.add(line_split[0].strip())
            self.logger.info(f' - identified {len(pass_qc_gids):,} genomes passing QC.')

        valid_uba_ids = set()
        if uba_genome_file:
            with open(uba_genome_file) as f:
                for line in f:
                    line_split = line.strip().split('\t')
                    valid_uba_ids.add(line_split[0].strip())
            self.logger.info(f' - identified {len(valid_uba_ids):,} UBA genomes to retain.')

        gtdb_type_strains = set()
        if gtdb_type_strains_ledger:
            with open(gtdb_type_strains_ledger) as f:
                f.readline()
                for line in f:
                    tokens = line.strip().split('\t')
                    gid = canonical_gid(tokens[0].strip())
                    gtdb_type_strains.add(gid)
            self.logger.info(f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.')

        excluded_from_refseq_note = {}
        if ncbi_genbank_assembly_file:
            excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

        untrustworthy_as_type = set()
        if untrustworthy_type_ledger:
            untrustworthy_as_type = self.parse_untrustworthy_type_ledger(untrustworthy_type_ledger)
            self.logger.info(f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type.')

        with open(metadata_file, encoding='utf-8') as f:
            headers = f.readline().strip().split('\t')

            genome_index = headers.index('accession')

            gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
            ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
            ncbi_taxonomy_unfiltered_index = headers.index('ncbi_taxonomy_unfiltered')

            gtdb_type_index = headers.index('gtdb_type_designation')
            gtdb_type_sources_index = headers.index('gtdb_type_designation_sources')
            gtdb_type_species_of_genus_index = headers.index('gtdb_type_species_of_genus')
            ncbi_strain_identifiers_index = headers.index('ncbi_strain_identifiers')
            ncbi_type_index = headers.index('ncbi_type_material_designation')
            ncbi_asm_level_index = headers.index('ncbi_assembly_level')
            ncbi_genome_representation_index = headers.index('ncbi_genome_representation')
            ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
            ncbi_genome_cat_index = headers.index('ncbi_genome_category')

            comp_index = headers.index('checkm_completeness')
            cont_index = headers.index('checkm_contamination')
            sh_100_index = None
            if 'checkm_strain_heterogeneity_100' in headers:
                sh_100_index = headers.index('checkm_strain_heterogeneity_100')
            gs_index = headers.index('genome_size')
            contig_count_index = headers.index('contig_count')
            n50_index = headers.index('n50_contigs')
            scaffold_count_index = headers.index('scaffold_count')
            ambiguous_bases_index = headers.index('ambiguous_bases')
            total_gap_len_index = headers.index('total_gap_length')
            ssu_count_index = headers.index('ssu_count')
            ssu_length_index = headers.index('ssu_length')
            ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
            ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
            ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

            gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
            gtdb_rep_index = headers.index('gtdb_representative')

            # evaluated once here rather than once per data row
            has_priority_years = 'lpsn_priority_year' in headers
            if has_priority_years:
                # this information will be missing from the previous
                # GTDB metadata file as we strip this out due to
                # concerns over republishing this information
                lpsn_priority_index = headers.index('lpsn_priority_year')
                dsmz_priority_index = headers.index('dsmz_priority_year')
                straininfo_priority_index = headers.index('straininfo_priority_year')

            # looked up on the first user ('U_') genome and cached, so the
            # column is only required when such genomes are present
            org_name_index = None

            for line in f:
                # remove only the line terminator: a full strip() would also
                # discard trailing tabs and so drop empty fields at the end
                # of the row, leaving later column indices out of range
                line_split = line.rstrip('\r\n').split('\t')

                ncbi_accn = line_split[genome_index]
                gid = canonical_gid(ncbi_accn)

                if gid.startswith('U_'):
                    # check if genome has a UBA identifier
                    if org_name_index is None:
                        org_name_index = headers.index('organism_name')
                    org_name = line_split[org_name_index]
                    if '(UBA' in org_name:
                        # text between the first '(' and the final character
                        uba_id = org_name[org_name.find('(')+1:-1]
                        if uba_id in valid_uba_ids:
                            self.user_uba_id_map[gid] = uba_id
                            self.uba_user_id_map[uba_id] = gid
                            gid = uba_id
                        else:
                            continue # retain only valid UBA genomes
                    else:
                        continue # skip non-UBA user genomes

                # an empty QC set means no QC filtering is applied
                if pass_qc_gids and gid not in pass_qc_gids:
                    continue

                gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])

                ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
                ncbi_taxonomy_unfiltered = Taxa(line_split[ncbi_taxonomy_unfiltered_index])

                gtdb_type = line_split[gtdb_type_index]
                gtdb_type_sources = line_split[gtdb_type_sources_index]
                if gid in gtdb_type_strains:
                    # the curator ledger overrides the metadata table
                    gtdb_type = 'type strain of species'
                    gtdb_type_sources = 'GTDB curator'
                gtdb_type_species_of_genus = line_split[gtdb_type_species_of_genus_index] == 't'

                ncbi_type = line_split[ncbi_type_index]
                ncbi_strain_identifiers = line_split[ncbi_strain_identifiers_index]
                ncbi_asm_level = line_split[ncbi_asm_level_index]
                ncbi_genome_representation = line_split[ncbi_genome_representation_index]
                ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
                ncbi_genome_cat = line_split[ncbi_genome_cat_index]

                comp = float(line_split[comp_index])
                cont = float(line_split[cont_index])
                sh_100 = 0
                # compare against None so a column at index 0 is still read
                if sh_100_index is not None:
                    sh_100 = self._convert_float(line_split[sh_100_index])
                gs = int(line_split[gs_index])
                contig_count = int(line_split[contig_count_index])
                n50 = int(line_split[n50_index])
                scaffold_count = int(line_split[scaffold_count_index])
                ambiguous_bases = int(line_split[ambiguous_bases_index])
                total_gap_len = int(line_split[total_gap_len_index])
                ssu_count = int(line_split[ssu_count_index])
                ssu_length = self._convert_int(line_split[ssu_length_index])
                ncbi_molecule_count = self._convert_int(line_split[ncbi_molecule_count_index])
                ncbi_unspanned_gaps = self._convert_int(line_split[ncbi_unspanned_gaps_index])
                ncbi_spanned_gaps = self._convert_int(line_split[ncbi_spanned_gaps_index])

                gtdb_is_rep = line_split[gtdb_rep_index] == 't'
                gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
                if create_sp_clusters:
                    self.sp_clusters.update_sp_cluster(gtdb_rid, gid, gtdb_taxonomy.species)

                if has_priority_years:
                    lpsn_priority_year = self._convert_int(line_split[lpsn_priority_index], Genome.NO_PRIORITY_YEAR)
                    dsmz_priority_year = self._convert_int(line_split[dsmz_priority_index], Genome.NO_PRIORITY_YEAR)
                    straininfo_priority_year = self._convert_int(line_split[straininfo_priority_index], Genome.NO_PRIORITY_YEAR)
                else:
                    lpsn_priority_year = Genome.NO_PRIORITY_YEAR
                    dsmz_priority_year = Genome.NO_PRIORITY_YEAR
                    straininfo_priority_year = Genome.NO_PRIORITY_YEAR

                self.genomes[gid] = Genome(gid,
                                            ncbi_accn,
                                            gtdb_rid,
                                            gtdb_is_rep,
                                            gtdb_taxonomy,
                                            ncbi_taxonomy,
                                            ncbi_taxonomy_unfiltered,
                                            gtdb_type,
                                            gtdb_type_sources,
                                            gtdb_type_species_of_genus,
                                            gid in untrustworthy_as_type,
                                            ncbi_type,
                                            ncbi_strain_identifiers,
                                            ncbi_asm_level,
                                            ncbi_genome_representation,
                                            ncbi_refseq_cat,
                                            ncbi_genome_cat,
                                            excluded_from_refseq_note.get(gid, ''),
                                            comp,
                                            cont,
                                            sh_100,
                                            gs,
                                            contig_count,
                                            n50,
                                            scaffold_count,
                                            ambiguous_bases,
                                            total_gap_len,
                                            ssu_count,
                                            ssu_length,
                                            ncbi_molecule_count,
                                            ncbi_unspanned_gaps,
                                            ncbi_spanned_gaps,
                                            lpsn_priority_year,
                                            dsmz_priority_year,
                                            straininfo_priority_year)

        self._apply_ncbi_taxonomy_ledgers(species_exception_file,
                                            genus_exception_file)
# Test methods in Genome class

from collections import defaultdict
import copy

from gtdb_species_clusters.genome import Genome
from gtdb_species_clusters.taxa import Taxa

# Shared fixture used by the tests in this module: a single Genome built from
# literal values. Tests take a shallow copy of it (copy.copy) and then assign
# the attributes relevant to the case being checked.
# NOTE(review): every argument is positional; the order presumably follows
# Genome.__init__ (identifiers, representative flag, the three taxonomies,
# type-material fields, assembly statistics, priority year) -- confirm against
# the Genome class before adding or reordering values.
test_genome = Genome(
    "G012345678", "GCA_012345678.1", "G012345678", True,
    Taxa(
        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri"
    ),
    Taxa(
        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri"
    ),
    Taxa(
        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia flexneri"
    ), "type strain of species", "LPSN", True, False, False,
    'assembly from type material', "K-12", "Complete genome", "Full",
    "Reference genome", None, "", 100, 0, 0, 3500000, 1, 3500000, 1, 0, 0, 1,
    1600, 1, 0, 0, "1900")


class TestGenome:
    """Test Genome class."""
    def test_ncbi_subspecies(self):
        """Test identification of genomes classified as a subspecies at NCBI."""

        # work on a shallow copy so attribute assignments made by the test
        # do not rebind attributes on the module-level test_genome fixture
        g = copy.copy(test_genome)
        # NOTE(review): no assertions are visible after this point in the
        # excerpt; the remainder of the test body appears to be cut off.
Beispiel #6
0
    def load_from_metadata_file(self,
                                metadata_file,
                                species_exception_file=None,
                                genus_exception_file=None,
                                gtdb_type_strains_ledger=None,
                                create_sp_clusters=True,
                                qc_passed_file=None,
                                ncbi_genbank_assembly_file=None,
                                untrustworthy_type_ledger=None,
                                ncbi_untrustworthy_sp_ledger=None,
                                ncbi_env_bioproject_ledger=None):
        """Create genome set from file(s).

        Reads the tab-separated GTDB metadata table and stores one Genome per
        retained row in self.genomes, then applies the NCBI taxonomy ledgers.
        The full accession of every row is recorded in self.full_gid, including
        rows that are subsequently skipped.

        Args:
            metadata_file: Tab-separated metadata table with a header row; it
                is opened as UTF-8 and columns are located by header name.
            species_exception_file: Forwarded unchanged to
                self._apply_ncbi_taxonomy_ledgers.
            genus_exception_file: Forwarded unchanged to
                self._apply_ncbi_taxonomy_ledgers.
            gtdb_type_strains_ledger: Optional tab-separated file (first line
                skipped) whose first column lists genomes to be treated as
                'type strain of species' with source 'GTDB curator'.
            create_sp_clusters: When true, every retained genome is registered
                with self.sp_clusters under its representative genome.
            qc_passed_file: Optional tab-separated file (first line skipped)
                whose first column lists genome IDs passing QC; when it yields
                any IDs, genomes outside this set are skipped.
            ncbi_genbank_assembly_file: Optional file handed to both
                parse_ncbi_bioproject and exclude_from_refseq; the results are
                queried per genome ID.
            untrustworthy_type_ledger: Optional file handed to
                self.parse_untrustworthy_type_ledger; membership of a genome
                ID in the result is passed to the Genome constructor.
            ncbi_untrustworthy_sp_ledger: Optional file handed to
                self.parse_ncbi_untrustworthy_sp_ledger; membership of a
                genome ID in the result is passed to the Genome constructor.
            ncbi_env_bioproject_ledger: Optional tab-separated file (first
                line skipped) whose first column lists BioProjects; genomes
                from these have their NCBI genome category replaced with
                'derived from environmental source'.

        Raises:
            ValueError: If a required column name is absent from the header
                row (raised by list.index), or a numeric field cannot be
                parsed by int()/float().
        """

        pass_qc_gids = set()
        if qc_passed_file:
            with open(qc_passed_file) as f:
                f.readline()
                for line in f:
                    line_split = line.strip().split('\t')
                    pass_qc_gids.add(line_split[0].strip())
            self.logger.info(
                f' - identified {len(pass_qc_gids):,} genomes passing QC.')

        gtdb_type_strains = set()
        if gtdb_type_strains_ledger:
            with open(gtdb_type_strains_ledger) as f:
                f.readline()
                for line in f:
                    tokens = line.strip().split('\t')
                    gid = canonical_gid(tokens[0].strip())
                    gtdb_type_strains.add(gid)
            self.logger.info(
                f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.'
            )

        excluded_from_refseq_note = {}
        ncbi_bioproject = {}
        if ncbi_genbank_assembly_file:
            ncbi_bioproject = parse_ncbi_bioproject(ncbi_genbank_assembly_file)
            excluded_from_refseq_note = exclude_from_refseq(
                ncbi_genbank_assembly_file)

        ncbi_env_bioproject = set()
        if ncbi_env_bioproject_ledger:
            with open(ncbi_env_bioproject_ledger) as f:
                f.readline()
                for line in f:
                    tokens = line.strip().split('\t')
                    ncbi_env_bioproject.add(tokens[0].strip())

        untrustworthy_as_type = set()
        if untrustworthy_type_ledger:
            untrustworthy_as_type = self.parse_untrustworthy_type_ledger(
                untrustworthy_type_ledger)
            self.logger.info(
                f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type by GTDB.'
            )

        untrustworthy_ncbi_sp = set()
        if ncbi_untrustworthy_sp_ledger:
            untrustworthy_ncbi_sp = self.parse_ncbi_untrustworthy_sp_ledger(
                ncbi_untrustworthy_sp_ledger)
            self.logger.info(
                f' - identified {len(untrustworthy_ncbi_sp):,} genomes annotated as having untrustworthy NCBI species assignments.'
            )

        with open(metadata_file, encoding='utf-8') as f:
            headers = f.readline().strip().split('\t')

            genome_index = headers.index('accession')

            gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
            ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
            ncbi_taxonomy_unfiltered_index = headers.index(
                'ncbi_taxonomy_unfiltered')

            gtdb_type_index = headers.index('gtdb_type_designation')
            gtdb_type_sources_index = headers.index(
                'gtdb_type_designation_sources')
            gtdb_type_species_of_genus_index = headers.index(
                'gtdb_type_species_of_genus')
            ncbi_strain_identifiers_index = headers.index(
                'ncbi_strain_identifiers')
            ncbi_type_index = headers.index('ncbi_type_material_designation')
            ncbi_asm_level_index = headers.index('ncbi_assembly_level')
            ncbi_genome_representation_index = headers.index(
                'ncbi_genome_representation')
            ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
            ncbi_genome_cat_index = headers.index('ncbi_genome_category')

            comp_index = headers.index('checkm_completeness')
            cont_index = headers.index('checkm_contamination')
            sh_100_index = None
            if 'checkm_strain_heterogeneity_100' in headers:
                sh_100_index = headers.index('checkm_strain_heterogeneity_100')
            gs_index = headers.index('genome_size')
            contig_count_index = headers.index('contig_count')
            n50_index = headers.index('n50_contigs')
            scaffold_count_index = headers.index('scaffold_count')
            ambiguous_bases_index = headers.index('ambiguous_bases')
            total_gap_len_index = headers.index('total_gap_length')
            ssu_count_index = headers.index('ssu_count')
            ssu_length_index = headers.index('ssu_length')
            ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
            ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
            ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

            gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
            gtdb_rep_index = headers.index('gtdb_representative')

            # evaluated once here rather than once per data row
            has_lpsn_priority = 'lpsn_priority_year' in headers
            if has_lpsn_priority:
                # this information will be missing from the previous
                # GTDB metadata file as we strip this out due to
                # concerns over republishing this information
                lpsn_priority_index = headers.index('lpsn_priority_year')

            for line in f:
                # remove only the line terminator: a full strip() would also
                # discard trailing tabs and so drop empty fields at the end
                # of the row, leaving later column indices out of range
                line_split = line.rstrip('\r\n').split('\t')

                ncbi_accn = line_split[genome_index]
                gid = canonical_gid(ncbi_accn)
                # recorded before any filtering, so skipped genomes are kept
                self.full_gid[gid] = ncbi_accn

                # user genomes are never loaded
                if gid.startswith('U_'):
                    continue

                # an empty QC set means no QC filtering is applied
                if pass_qc_gids and gid not in pass_qc_gids:
                    continue

                gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])

                ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
                ncbi_taxonomy_unfiltered = Taxa(
                    line_split[ncbi_taxonomy_unfiltered_index], filtered=False)

                gtdb_type = line_split[gtdb_type_index]
                gtdb_type_sources = line_split[gtdb_type_sources_index]
                if gid in gtdb_type_strains:
                    # the curator ledger overrides the metadata table
                    gtdb_type = 'type strain of species'
                    gtdb_type_sources = 'GTDB curator'
                gtdb_type_species_of_genus = line_split[
                    gtdb_type_species_of_genus_index] == 't'

                ncbi_type = line_split[ncbi_type_index]
                ncbi_strain_identifiers = line_split[
                    ncbi_strain_identifiers_index]
                ncbi_asm_level = line_split[ncbi_asm_level_index]
                ncbi_genome_representation = line_split[
                    ncbi_genome_representation_index]
                ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
                ncbi_genome_cat = line_split[ncbi_genome_cat_index]

                if ncbi_bioproject.get(gid,
                                       None) in ncbi_env_bioproject:  # ***
                    # HACK to force genomes from MAG mining projects
                    # to be indicated as MAGs which are currently
                    # not correctly annotated at NCBI
                    ncbi_genome_cat = 'derived from environmental source'

                comp = float(line_split[comp_index])
                cont = float(line_split[cont_index])
                sh_100 = 0
                # compare against None so a column at index 0 is still read
                if sh_100_index is not None:
                    sh_100 = self._convert_float(line_split[sh_100_index])
                gs = int(line_split[gs_index])
                contig_count = int(line_split[contig_count_index])
                n50 = int(line_split[n50_index])
                scaffold_count = int(line_split[scaffold_count_index])
                ambiguous_bases = int(line_split[ambiguous_bases_index])
                total_gap_len = int(line_split[total_gap_len_index])
                ssu_count = int(line_split[ssu_count_index])
                ssu_length = self._convert_int(line_split[ssu_length_index])
                ncbi_molecule_count = self._convert_int(
                    line_split[ncbi_molecule_count_index])
                ncbi_unspanned_gaps = self._convert_int(
                    line_split[ncbi_unspanned_gaps_index])
                ncbi_spanned_gaps = self._convert_int(
                    line_split[ncbi_spanned_gaps_index])

                gtdb_is_rep = line_split[gtdb_rep_index] == 't'
                gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
                if create_sp_clusters:
                    self.sp_clusters.update_sp_cluster(gtdb_rid, gid,
                                                       gtdb_taxonomy.species)

                lpsn_priority_year = Genome.NO_PRIORITY_YEAR
                if has_lpsn_priority:
                    lpsn_priority_year = self._convert_int(
                        line_split[lpsn_priority_index],
                        Genome.NO_PRIORITY_YEAR)

                self.genomes[gid] = Genome(
                    gid, ncbi_accn, gtdb_rid, gtdb_is_rep, gtdb_taxonomy,
                    ncbi_taxonomy, ncbi_taxonomy_unfiltered, gtdb_type,
                    gtdb_type_sources, gtdb_type_species_of_genus, gid
                    in untrustworthy_as_type, gid in untrustworthy_ncbi_sp,
                    ncbi_type, ncbi_strain_identifiers, ncbi_asm_level,
                    ncbi_genome_representation,
                    ncbi_refseq_cat, ncbi_genome_cat,
                    excluded_from_refseq_note.get(gid, ''), comp, cont, sh_100,
                    gs, contig_count, n50, scaffold_count, ambiguous_bases,
                    total_gap_len, ssu_count, ssu_length, ncbi_molecule_count,
                    ncbi_unspanned_gaps, ncbi_spanned_gaps, lpsn_priority_year)

        self._apply_ncbi_taxonomy_ledgers(species_exception_file,
                                          genus_exception_file)