Example #1
0
    def load_from_sp_cluster_file(self, cluster_file):
        """Populate species clusters from a tab-separated cluster file.

        The header row must provide the columns 'GTDB species',
        'Representative genome', 'No. clustered genomes' and
        'Clustered genomes'. Each data row registers its representative in
        `self.genome_rid`, `self.sp_clusters` and `self.species_names`, and
        maps every comma-separated clustered genome to that representative.
        """

        with open(cluster_file) as fin:
            header = fin.readline().strip().split('\t')

            species_col = header.index('GTDB species')
            rep_col = header.index('Representative genome')

            count_col = header.index('No. clustered genomes')
            members_col = header.index('Clustered genomes')

            for row in fin:
                fields = row.strip().split('\t')

                species = fields[species_col]
                rep_id = canonical_gid(fields[rep_col])
                self.genome_rid[rep_id] = rep_id

                # a representative always belongs to its own cluster
                self.sp_clusters[rep_id] = {rep_id}
                self.species_names[rep_id] = species

                # the member column is only consulted when the row reports
                # at least one clustered genome
                if int(fields[count_col]) <= 0:
                    continue

                member_ids = [
                    canonical_gid(member.strip())
                    for member in fields[members_col].split(',')
                ]
                self.sp_clusters[rep_id].update(member_ids)
                for member_id in member_ids:
                    self.genome_rid[member_id] = rep_id
Example #2
0
    def _apply_ncbi_taxonomy_ledgers(self, species_exception_file,
                                     genus_exception_file):
        """Override NCBI species and genus assignments using ledger files.

        Both ledgers are tab-separated files with a header line; column one
        holds a genome ID and column two the corrected taxon. Either file may
        be falsy, in which case it is skipped. Genomes missing from
        `self.genomes` are reported with a warning and ignored. The program
        exits if the genus ledger names a genus that does not occur in the
        species assigned to the same genome by the species ledger.
        """

        species_updates = {}
        if species_exception_file:
            with open(species_exception_file, encoding='utf-8') as fin:
                fin.readline()  # skip header
                for row in fin:
                    fields = [tok.strip() for tok in row.strip().split('\t')]
                    gid = canonical_gid(fields[0].strip())
                    sp = fields[1].strip().replace('Candidatus ', '')

                    if gid not in self.genomes:
                        self.logger.warning(
                            f'Genome {gid} in species exception list not defined in genome set.'
                        )
                        continue

                    # ledger entries may or may not carry the rank prefix
                    sp = sp if sp.startswith('s__') else 's__' + sp

                    self.genomes[gid].ncbi_taxa.species = sp
                    species_updates[gid] = sp

        if genus_exception_file:
            with open(genus_exception_file, encoding='utf-8') as fin:
                fin.readline()  # skip header
                for row in fin:
                    fields = [tok.strip() for tok in row.strip().split('\t')]
                    gid = canonical_gid(fields[0].strip())
                    genus = fields[1].strip()

                    if gid not in self.genomes:
                        self.logger.warning(
                            f'Genome {gid} in genus exception list not defined in genome set.'
                        )
                        continue

                    # work with the bare genus name from here on
                    if genus.startswith('g__'):
                        genus = genus[3:]

                    ncbi_taxa = self.genomes[gid].ncbi_taxa
                    ncbi_taxa.genus = f'g__{genus}'

                    # keep the species name in step with the corrected genus
                    if ncbi_taxa.species != 's__':
                        specific = ncbi_taxa.specific_epithet
                        ncbi_taxa.species = f's__{genus} {specific}'

                    # sanity check ledgers
                    ledger_species = species_updates.get(gid)
                    if ledger_species is not None and genus not in ledger_species:
                        self.logger.error(
                            f'Species and genus ledgers have conflicting assignments for {gid}.'
                        )
                        sys.exit(-1)
Example #3
0
def read_gtdb_taxonomy(metadata_file):
    """Read the GTDB taxonomy of each genome from a GTDB metadata file.

    Parameters
    ----------
    metadata_file : str
        Tab-separated metadata file whose header contains the columns
        'accession' and 'gtdb_taxonomy'.

    Return
    ------
    dict : d[genome_id] -> taxonomy list
        Taxa are split on ';'. Genomes with an empty or 'none' taxonomy
        string receive a copy of Taxonomy.rank_prefixes.
    """

    with open(metadata_file, encoding='utf-8') as fin:
        header = fin.readline().strip().split('\t')
        gid_col = header.index('accession')
        taxonomy_col = header.index('gtdb_taxonomy')

        taxonomy = {}
        for row in fin:
            fields = row.strip().split('\t')
            gid = canonical_gid(fields[gid_col])
            taxa_str = fields[taxonomy_col].strip()

            if not taxa_str or taxa_str == 'none':
                # no taxonomy defined: fall back to the bare rank prefixes
                taxonomy[gid] = list(Taxonomy.rank_prefixes)
                continue

            taxonomy[gid] = [taxon.strip() for taxon in taxa_str.split(';')]

    return taxonomy
    def check_domain_assignments(self, gtdb_domain_report, cur_genomes, pass_qc_gids):
        """Print warnings for genomes with inconsistent domain assignments.

        Reads the tab-separated domain report and, for every genome in
        `pass_qc_gids` whose ID does not start with 'U', prints a warning when
        the NCBI domain or the predicted domain differs from the GTDB domain.
        A value of 'None' for the NCBI or predicted domain is not reported.
        Genome IDs are translated through `cur_genomes.user_uba_id_map`
        before being checked.
        """

        with open(gtdb_domain_report, encoding='utf-8') as fin:
            columns = fin.readline().rstrip().split('\t')

            predicted_col = columns.index('Predicted domain')
            bac_col = columns.index('Bacterial Marker Percentage')
            ar_col = columns.index('Archaeal Marker Percentage')
            ncbi_col = columns.index('NCBI taxonomy')
            gtdb_col = columns.index('GTDB taxonomy')

            for row in fin:
                fields = row.strip().split('\t')

                gid = canonical_gid(fields[0])
                gid = cur_genomes.user_uba_id_map.get(gid, gid)
                if gid not in pass_qc_gids:
                    continue

                domain = fields[predicted_col]
                bac_perc = float(fields[bac_col])
                ar_perc = float(fields[ar_col])

                # the domain is the first rank of a ';'-separated taxonomy
                ncbi_domain = fields[ncbi_col].split(';')[0].strip()
                gtdb_domain = fields[gtdb_col].split(';')[0].strip()

                if gid.startswith('U'):
                    continue

                if ncbi_domain not in (gtdb_domain, 'None'):
                    print(f'[WARNING] NCBI ({ncbi_domain}) and GTDB ({gtdb_domain}) domains disagree in domain report (Bac = {bac_perc:.1f}%; Ar = {ar_perc:.1f}%): {gid}')

                if domain not in (gtdb_domain, 'None'):
                    print(f'[WARNING] GTDB and predicted domain (Bac = {bac_perc:.1f}%; Ar = {ar_perc:.1f}%) disagree in domain report: {gid}')
    def _mash_genome_id(self, mash_genome_id):
        """Extract canonical GTDB genome ID from Mash results.

        Parameters
        ----------
        mash_genome_id : str
            Genome identifier reported by Mash; the path to a genomic file.

        Return
        ------
        Result of canonical_gid() applied to the file name truncated at the
        first underscore found at or after index 4.
        """

        # get filename and remove information past genome accession
        # (e.g., GCA_002498385.1_ASM249838v1_genomic.fna => GCA_002498385.1)
        genome_file = ntpath.basename(mash_genome_id)

        # str.find() returns -1 when there is no further underscore; slicing
        # with that value would silently drop the last character of the name,
        # so fall back to the complete file name in that case
        accn_end = genome_file.find('_', 4)
        gid = genome_file if accn_end == -1 else genome_file[:accn_end]

        return canonical_gid(gid)
 def parse_untrustworthy_type_ledger(self, untrustworth_type_ledger):
     """Collect genome IDs flagged as untrustworthy as type material.

     The ledger is a tab-separated file with a header line; the first
     column of each remaining line is converted with canonical_gid().
     Returns the set of resulting genome IDs.
     """

     with open(untrustworth_type_ledger) as fin:
         fin.readline()  # skip header
         return {
             canonical_gid(row.strip().split('\t')[0])
             for row in fin
         }
Example #7
0
    def parse_qc_exception_file(self, qc_exception_file):
        """Read the genomes listed as exceptions from QC.

        The file is tab-separated with a header line. Returns the set of
        canonical genome IDs taken from the first column of each data line.
        """

        with open(qc_exception_file, encoding='utf-8') as fin:
            fin.readline()  # skip header
            return {
                canonical_gid(row.split('\t')[0].strip())
                for row in fin
            }
Example #8
0
    def parse_ncbi_untrustworthy_sp_ledger(self, ncbi_untrustworthy_sp_ledger):
        """Read genomes whose NCBI species assignment is flagged as untrustworthy.

        The ledger is tab-separated with a header line. Returns the set of
        canonical genome IDs found in the first column.
        """

        gids = set()
        with open(ncbi_untrustworthy_sp_ledger) as fin:
            fin.readline()  # skip header
            for row in fin:
                first_field = row.strip().split('\t')[0]
                gids.add(canonical_gid(first_field))

        return gids
    def same_genome_accn(self, accn1, accn2, identical_accns):
        """Report whether two NCBI genome accessions denote the same genome.

        Identical strings are the same genome. Otherwise both accessions must
        share a canonical genome ID (the program exits if they do not), and
        they are considered the same only when `identical_accns` maps the
        first accession to the second once 'RS_' and 'GB_' have been removed.
        """

        if accn1 == accn2:
            return True

        gid1 = canonical_gid(accn1)
        gid2 = canonical_gid(accn2)
        if gid1 != gid2:
            self.logger.error(
                'Genomes have different canonical genome IDs: {}, {}, {}, {}'.
                format(accn1, gid1, accn2, gid2))
            sys.exit(-1)

        # compare accessions without their database markers
        bare_accn1, bare_accn2 = (
            accn.replace('RS_', '').replace('GB_', '')
            for accn in (accn1, accn2)
        )

        return identical_accns.get(bare_accn1) == bare_accn2
Example #10
0
def parse_disbanded_cluster_ledger(disbanded_cluster_ledger):
    """Read the GTDB species clusters that are to be disbanded.

    The ledger is tab-separated with a header line. Returns the set of
    canonical genome IDs found in the first column.
    """

    with open(disbanded_cluster_ledger) as fin:
        fin.readline()  # skip header
        first_fields = (row.strip().split('\t')[0] for row in fin)
        return {canonical_gid(field) for field in first_fields}
    def mash_sp_ani(self, gids, genomes, output_prefix):
        """Estimate pairwise ANI between genomes with Mash.

        Writes the sketch, genome list and distance files using
        `output_prefix` with the extensions .msh, .lst and .dst. Returns a
        dictionary d[query_id][reference_id] -> ANI with self-comparisons
        removed and IDs translated through `self.user_id_map` and
        canonical_gid().
        """

        INIT_MASH_ANI_FILTER = 95.0

        # sketch all genomes
        sketch_path = f'{output_prefix}.msh'
        list_path = f'{output_prefix}.lst'
        self.mash.sketch(gids,
                         genomes.genomic_files,
                         list_path,
                         sketch_path,
                         silence=True)

        # compute pairwise distances; the ANI filter is passed as a distance
        dist_path = f'{output_prefix}.dst'
        max_dist = float(100 - INIT_MASH_ANI_FILTER) / 100
        self.mash.dist_pairwise(max_dist,
                                sketch_path,
                                dist_path,
                                silence=True)

        raw_ani = self.mash.read_ani(dist_path)

        # re-key results by canonical genome ID, dropping self hits
        revised_mash_ani = defaultdict(dict)
        num_pairs = 0
        for qid, hits in raw_ani.items():
            for rid, ani in hits.items():
                if qid == rid:
                    continue

                canonical_qid = canonical_gid(self.user_id_map.get(qid, qid))
                canonical_rid = canonical_gid(self.user_id_map.get(rid, rid))
                revised_mash_ani[canonical_qid][canonical_rid] = ani
                num_pairs += 1

        self.logger.info(
            ' - identified {:,} pairs passing Mash filtering of ANI >= {:.1f}%.'
            .format(num_pairs, INIT_MASH_ANI_FILTER))

        return revised_mash_ani
    def parse_clusters(self, cluster_file):
        """Read species clusters, species names and cluster radii from file.

        The tab-separated file must have a header with the columns
        'NCBI species', 'Type genome', 'No. clustered genomes',
        'Clustered genomes', 'Closest type genome', 'ANI radius' and
        'AF closest'.

        Returns a tuple (clusters, species, cluster_radius), each keyed by the
        canonical ID of the type genome: the set of clustered genome IDs, the
        NCBI species string, and a GenomeRadius respectively.
        """

        species = {}
        clusters = {}
        cluster_radius = {}
        with open(cluster_file) as fin:
            header = fin.readline().strip().split('\t')

            species_col = header.index('NCBI species')
            type_genome_col = header.index('Type genome')
            count_col = header.index('No. clustered genomes')
            members_col = header.index('Clustered genomes')
            closest_col = header.index('Closest type genome')
            ani_col = header.index('ANI radius')
            af_col = header.index('AF closest')

            for row in fin:
                fields = row.strip().split('\t')

                rid = canonical_gid(fields[type_genome_col])
                species[rid] = fields[species_col]

                members = set()
                clusters[rid] = members
                if int(fields[count_col]) > 0:
                    for member in fields[members_col].split(','):
                        members.add(canonical_gid(member.strip()))

                ani_radius = float(fields[ani_col])
                af_closest = float(fields[af_col])
                cluster_radius[rid] = GenomeRadius(ani=ani_radius,
                                                   af=af_closest,
                                                   neighbour_gid=fields[closest_col])

        return clusters, species, cluster_radius
    def _get_genome_id(self, genome_path):
        """Derive the canonical genome ID from the path to a genomic file.

        Only the first two underscore-separated tokens of the file name are
        kept. Names starting with 'GCA_' are prefixed with 'GB_' and names
        starting with 'GCF_' with 'RS_' before being passed to
        canonical_gid().
        """

        file_name = ntpath.basename(genome_path)
        accession = '_'.join(file_name.split('_')[:2])

        if file_name.startswith('GCA_'):
            accession = 'GB_' + accession
        elif file_name.startswith('GCF_'):
            accession = 'RS_' + accession

        return canonical_gid(accession)
Example #14
0
def read_gtdb_metadata(metadata_file, fields):
    """Read selected metadata fields for each genome in a GTDB metadata file.

    Parameters
    ----------
    metadata_file : str
        Tab-separated metadata file with a header line that includes the
        column 'accession'.
    fields : iterable
        Names of the header columns to read.

    Return
    ------
    dict : d[genome_id] -> namedtuple
        Values of the requested fields, keyed by canonical genome ID. A value
        is stored as a float when it parses as one; otherwise '' and 'none'
        become None, 'f'/'false' become False, 't'/'true' become True, and
        any other value is kept as a string.
    """

    def _convert(raw):
        """Convert a raw field to float, None, bool, or leave it as a string."""
        try:
            return float(raw)
        except ValueError:
            pass

        if raw in ('', 'none'):
            return None
        if raw == 'f' or raw.lower() == 'false':
            return False
        if raw == 't' or raw.lower() == 'true':
            return True
        return raw

    gtdb_metadata = namedtuple('gtdb_metadata', ' '.join(fields))
    metadata = {}

    with open(metadata_file, encoding='utf-8') as fin:
        header = fin.readline().strip().split('\t')

        gid_col = header.index('accession')
        field_cols = [header.index(field) for field in fields]

        for row in fin:
            cells = row.strip().split('\t')
            gid = canonical_gid(cells[gid_col])
            metadata[gid] = gtdb_metadata._make(
                [_convert(cells[col]) for col in field_cols])

    return metadata
    def load_genomic_file_paths(self, genome_path_file):
        """Determine path to genomic FASTA file for each genome.

        Each line of `genome_path_file` is tab-separated: a genome ID followed
        by the directory holding that genome. The FASTA path is built as
        <directory>/<directory name>_genomic.fna and is recorded on the
        matching entry in `self.genomes` and in `self.genomic_files`. Lines
        for genomes not present in `self.genomes` are ignored.
        """

        # open via a context manager so the file handle is always closed,
        # including when a malformed line raises part-way through
        with open(genome_path_file) as f:
            for line in f:
                line_split = line.strip().split('\t')

                gid = canonical_gid(line_split[0])
                if gid not in self.genomes:
                    continue

                genome_path = line_split[1]
                # normpath drops any trailing separator so that basename
                # yields the directory name
                accession = os.path.basename(os.path.normpath(genome_path))
                genomic_file = os.path.join(genome_path, accession + '_genomic.fna')
                self.genomes[gid].genomic_file = genomic_file
                self.genomic_files[gid] = genomic_file

                #*** Need to handle UBA genomes that can reference via their U_ ID when run through MASH
                self.genomic_files[accession] = genomic_file
Example #16
0
    def parse_untrustworthy_type_ledger(self, untrustworthy_type_ledger):
        """Read genomes manually declared untrustworthy as type material.

        The ledger is tab-separated; its header must contain the columns
        'NCBI species' and 'Reason for declaring untrustworthy', and the
        genome ID is taken from the first column.

        Returns a dictionary d[genome_id] -> (NCBI species, reason).
        """

        with open(untrustworthy_type_ledger) as fin:
            columns = fin.readline().strip().split('\t')

            species_col = columns.index('NCBI species')
            reason_col = columns.index('Reason for declaring untrustworthy')

            ledger = {}
            for row in fin:
                fields = row.strip().split('\t')
                gid = canonical_gid(fields[0])
                ledger[gid] = (fields[species_col], fields[reason_col])

        return ledger
Example #17
0
    def type_status(self,
                    cur_gtdb_metadata_file,
                    qc_passed_file,
                    ncbi_genbank_assembly_file,
                    untrustworthy_type_file,
                    gtdb_type_strains_ledger,
                    ncbi_env_bioproject_ledger,
                    genome_ids):
        """Print a table describing the type material status of genomes.

        Loads the current GTDB genome set from the given files (without
        creating species clusters) and prints one table row per genome in
        `genome_ids`. Genomes missing from the genome set are reported with a
        warning and left out of the table.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            f' - current genome set contains {len(cur_genomes):,} genomes.')

        # report information
        table = PrettyTable()
        table.field_names = [
            'Genome ID',
            'GTDB representative',
            'GTDB type strain',
            'GTDB untrustworthy as type',
            'NCBI type strain',
            'NCBI untrustworthy as type',
            'GTDB species',
            'NCBI species',
            'NCBI strain IDs\n',
        ]

        for raw_gid in genome_ids:
            gid = canonical_gid(raw_gid)
            if gid not in cur_genomes:
                self.logger.warning(f'Genome {gid} not in current genome set.')
                continue

            genome = cur_genomes[gid]
            table.add_row([
                gid,
                genome.is_gtdb_sp_rep(),
                genome.is_gtdb_type_strain(),
                genome.is_gtdb_untrustworthy_as_type(),
                genome.is_ncbi_type_strain(),
                genome.is_ncbi_untrustworthy_as_type(),
                genome.gtdb_taxa.species,
                genome.ncbi_taxa.species,
                genome.ncbi_strain_identifiers,
            ])

        print(table)
Example #18
0
    def parse_marker_percentages(self, gtdb_domain_report):
        """Read the marker gene percentage of each genome in a domain report.

        The tab-separated report must have the header columns
        'Bacterial Marker Percentage' and 'Archaeal Marker Percentage'; the
        genome ID is taken from the first column.

        Returns a dictionary d[genome_id] -> the larger of the two percentages.
        """

        with open(gtdb_domain_report, encoding='utf-8') as fin:
            columns = fin.readline().rstrip().split('\t')

            bac_col = columns.index('Bacterial Marker Percentage')
            ar_col = columns.index('Archaeal Marker Percentage')

            marker_perc = {}
            for row in fin:
                fields = row.strip().split('\t')

                gid = canonical_gid(fields[0])
                bac_perc = float(fields[bac_col])
                ar_perc = float(fields[ar_col])
                marker_perc[gid] = bac_perc if bac_perc >= ar_perc else ar_perc

        return marker_perc
Example #19
0
    def check_domain_assignments(self, gtdb_domain_report, passed_qc_gids):
        """Log genomes with inconsistent domain assignments.

        Reads the tab-separated domain report and, for every genome in
        `passed_qc_gids` whose ID does not start with 'U', logs a warning when
        the NCBI domain differs from the GTDB domain and an error when the
        predicted domain differs from the GTDB domain. A value of 'None' for
        the NCBI or predicted domain is not reported.
        """

        with open(gtdb_domain_report, encoding='utf-8') as fin:
            columns = fin.readline().rstrip().split('\t')

            predicted_col = columns.index('Predicted domain')
            bac_col = columns.index('Bacterial Marker Percentage')
            ar_col = columns.index('Archaeal Marker Percentage')
            ncbi_col = columns.index('NCBI taxonomy')
            gtdb_col = columns.index('GTDB taxonomy')

            for row in fin:
                fields = row.strip().split('\t')

                gid = canonical_gid(fields[0])
                if gid not in passed_qc_gids:
                    continue

                domain = fields[predicted_col]
                bac_perc = float(fields[bac_col])
                ar_perc = float(fields[ar_col])

                # the domain is the first rank of a ';'-separated taxonomy
                ncbi_domain = fields[ncbi_col].split(';')[0].strip()
                gtdb_domain = fields[gtdb_col].split(';')[0].strip()

                if gid.startswith('U'):
                    continue

                if ncbi_domain not in (gtdb_domain, 'None'):
                    self.logger.warning(
                        f'NCBI ({ncbi_domain}) and GTDB ({gtdb_domain}) domains disagree in domain report (Bac = {bac_perc:.1f}%; Ar = {ar_perc:.1f}%): {gid}'
                    )

                if domain not in (gtdb_domain, 'None'):
                    self.logger.error(
                        f'GTDB and predicted domain (Bac = {bac_perc:.1f}%; Ar = {ar_perc:.1f}%) disagree in domain report: {gid} [THIS MUST BE FIXED BEFORE PROCEEDING].'
                    )
    def _get_genome_id(self, genome_path):
        """Return the canonical genome ID for the file name in `genome_path`."""

        return canonical_gid(ntpath.basename(genome_path))
    def load_from_metadata_file(self,
                                metadata_file,
                                species_exception_file=None,
                                genus_exception_file=None,
                                gtdb_type_strains_ledger=None,
                                create_sp_clusters=True,
                                uba_genome_file=None,
                                qc_passed_file=None,
                                ncbi_genbank_assembly_file=None,
                                untrustworthy_type_ledger=None):
        """Create genome set from file(s).

        Parameters
        ----------
        metadata_file : str
            Tab-separated GTDB metadata file with a header line.
        species_exception_file, genus_exception_file : str, optional
            Ledgers handed to _apply_ncbi_taxonomy_ledgers() once all genomes
            have been loaded.
        gtdb_type_strains_ledger : str, optional
            File with a header line whose first column lists genomes to be
            treated as 'type strain of species' with source 'GTDB curator'.
        create_sp_clusters : bool
            Whether each genome is added to `self.sp_clusters`.
        uba_genome_file : str, optional
            File without a header line whose first column lists the UBA IDs
            to retain.
        qc_passed_file : str, optional
            File with a header line whose first column lists genomes passing
            QC. When it yields at least one ID, all other genomes are skipped.
        ncbi_genbank_assembly_file : str, optional
            File passed to exclude_from_refseq().
        untrustworthy_type_ledger : str, optional
            File passed to parse_untrustworthy_type_ledger().
        """

        pass_qc_gids = set()
        if qc_passed_file:
            with open(qc_passed_file) as f:
                f.readline()
                for line in f:
                    line_split = line.strip().split('\t')
                    pass_qc_gids.add(line_split[0].strip())
            self.logger.info(f' - identified {len(pass_qc_gids):,} genomes passing QC.')

        valid_uba_ids = set()
        if uba_genome_file:
            with open(uba_genome_file) as f:
                for line in f:
                    line_split = line.strip().split('\t')
                    valid_uba_ids.add(line_split[0].strip())
            self.logger.info(f' - identified {len(valid_uba_ids):,} UBA genomes to retain.')

        gtdb_type_strains = set()
        if gtdb_type_strains_ledger:
            with open(gtdb_type_strains_ledger) as f:
                f.readline()
                for line in f:
                    tokens = line.strip().split('\t')
                    gid = canonical_gid(tokens[0].strip())
                    gtdb_type_strains.add(gid)
            self.logger.info(f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.')

        excluded_from_refseq_note = {}
        if ncbi_genbank_assembly_file:
            excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

        untrustworthy_as_type = set()
        if untrustworthy_type_ledger:
            untrustworthy_as_type = self.parse_untrustworthy_type_ledger(untrustworthy_type_ledger)
            self.logger.info(f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type.')

        with open(metadata_file, encoding='utf-8') as f:
            headers = f.readline().strip().split('\t')

            genome_index = headers.index('accession')

            gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
            ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
            ncbi_taxonomy_unfiltered_index = headers.index('ncbi_taxonomy_unfiltered')

            gtdb_type_index = headers.index('gtdb_type_designation')
            gtdb_type_sources_index = headers.index('gtdb_type_designation_sources')
            gtdb_type_species_of_genus_index = headers.index('gtdb_type_species_of_genus')
            ncbi_strain_identifiers_index = headers.index('ncbi_strain_identifiers')
            ncbi_type_index = headers.index('ncbi_type_material_designation')
            ncbi_asm_level_index = headers.index('ncbi_assembly_level')
            ncbi_genome_representation_index = headers.index('ncbi_genome_representation')
            ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
            ncbi_genome_cat_index = headers.index('ncbi_genome_category')

            comp_index = headers.index('checkm_completeness')
            cont_index = headers.index('checkm_contamination')
            sh_100_index = None
            if 'checkm_strain_heterogeneity_100' in headers:
                sh_100_index = headers.index('checkm_strain_heterogeneity_100')
            gs_index = headers.index('genome_size')
            contig_count_index = headers.index('contig_count')
            n50_index = headers.index('n50_contigs')
            scaffold_count_index = headers.index('scaffold_count')
            ambiguous_bases_index = headers.index('ambiguous_bases')
            total_gap_len_index = headers.index('total_gap_length')
            ssu_count_index = headers.index('ssu_count')
            ssu_length_index = headers.index('ssu_length')
            ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
            ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
            ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

            gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
            gtdb_rep_index = headers.index('gtdb_representative')

            # the header never changes, so test for priority years only once
            has_priority_years = 'lpsn_priority_year' in headers
            if has_priority_years:
                # this information will be missing from the previous
                # GTDB metadata file as we strip this out due to
                # concerns over republishing this information
                lpsn_priority_index = headers.index('lpsn_priority_year')
                dsmz_priority_index = headers.index('dsmz_priority_year')
                straininfo_priority_index = headers.index('straininfo_priority_year')

            # resolved on first use so that a metadata file without the
            # 'organism_name' column still loads when it has no user genomes
            org_name_index = None

            for line in f:
                line_split = line.strip().split('\t')

                ncbi_accn = line_split[genome_index]
                gid = canonical_gid(ncbi_accn)

                if gid.startswith('U_'):
                    # check if genome has a UBA identifier
                    if org_name_index is None:
                        org_name_index = headers.index('organism_name')
                    org_name = line_split[org_name_index]
                    if '(UBA' in org_name:
                        # UBA ID is the text after the first '(' up to, but
                        # excluding, the final character of the organism name
                        uba_id = org_name[org_name.find('(')+1:-1]
                        if uba_id in valid_uba_ids:
                            self.user_uba_id_map[gid] = uba_id
                            self.uba_user_id_map[uba_id] = gid
                            gid = uba_id
                        else:
                            continue # retain only valid UBA genomes
                    else:
                        continue # skip non-UBA user genomes

                if pass_qc_gids and gid not in pass_qc_gids:
                    continue

                gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])

                ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
                ncbi_taxonomy_unfiltered = Taxa(line_split[ncbi_taxonomy_unfiltered_index])

                gtdb_type = line_split[gtdb_type_index]
                gtdb_type_sources = line_split[gtdb_type_sources_index]
                if gid in gtdb_type_strains:
                    # manual curation overrides the metadata designation
                    gtdb_type = 'type strain of species'
                    gtdb_type_sources = 'GTDB curator'
                gtdb_type_species_of_genus = line_split[gtdb_type_species_of_genus_index] == 't'

                ncbi_type = line_split[ncbi_type_index]
                ncbi_strain_identifiers = line_split[ncbi_strain_identifiers_index]
                ncbi_asm_level = line_split[ncbi_asm_level_index]
                ncbi_genome_representation = line_split[ncbi_genome_representation_index]
                ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
                ncbi_genome_cat = line_split[ncbi_genome_cat_index]

                comp = float(line_split[comp_index])
                cont = float(line_split[cont_index])
                sh_100 = 0
                # compare against None explicitly: a column index of 0 is
                # valid but falsy, and must not be mistaken for "no column"
                if sh_100_index is not None:
                    sh_100 = self._convert_float(line_split[sh_100_index])
                gs = int(line_split[gs_index])
                contig_count = int(line_split[contig_count_index])
                n50 = int(line_split[n50_index])
                scaffold_count = int(line_split[scaffold_count_index])
                ambiguous_bases = int(line_split[ambiguous_bases_index])
                total_gap_len = int(line_split[total_gap_len_index])
                ssu_count = int(line_split[ssu_count_index])
                ssu_length = self._convert_int(line_split[ssu_length_index])
                ncbi_molecule_count = self._convert_int(line_split[ncbi_molecule_count_index])
                ncbi_unspanned_gaps = self._convert_int(line_split[ncbi_unspanned_gaps_index])
                ncbi_spanned_gaps = self._convert_int(line_split[ncbi_spanned_gaps_index])

                gtdb_is_rep = line_split[gtdb_rep_index] == 't'
                gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
                if create_sp_clusters:
                    self.sp_clusters.update_sp_cluster(gtdb_rid, gid, gtdb_taxonomy.species)

                if has_priority_years:
                    lpsn_priority_year = self._convert_int(line_split[lpsn_priority_index], Genome.NO_PRIORITY_YEAR)
                    dsmz_priority_year = self._convert_int(line_split[dsmz_priority_index], Genome.NO_PRIORITY_YEAR)
                    straininfo_priority_year = self._convert_int(line_split[straininfo_priority_index], Genome.NO_PRIORITY_YEAR)
                else:
                    lpsn_priority_year = Genome.NO_PRIORITY_YEAR
                    dsmz_priority_year = Genome.NO_PRIORITY_YEAR
                    straininfo_priority_year = Genome.NO_PRIORITY_YEAR

                self.genomes[gid] = Genome(gid,
                                           ncbi_accn,
                                           gtdb_rid,
                                           gtdb_is_rep,
                                           gtdb_taxonomy,
                                           ncbi_taxonomy,
                                           ncbi_taxonomy_unfiltered,
                                           gtdb_type,
                                           gtdb_type_sources,
                                           gtdb_type_species_of_genus,
                                           gid in untrustworthy_as_type,
                                           ncbi_type,
                                           ncbi_strain_identifiers,
                                           ncbi_asm_level,
                                           ncbi_genome_representation,
                                           ncbi_refseq_cat,
                                           ncbi_genome_cat,
                                           excluded_from_refseq_note.get(gid, ''),
                                           comp,
                                           cont,
                                           sh_100,
                                           gs,
                                           contig_count,
                                           n50,
                                           scaffold_count,
                                           ambiguous_bases,
                                           total_gap_len,
                                           ssu_count,
                                           ssu_length,
                                           ncbi_molecule_count,
                                           ncbi_unspanned_gaps,
                                           ncbi_spanned_gaps,
                                           lpsn_priority_year,
                                           dsmz_priority_year,
                                           straininfo_priority_year)

        self._apply_ncbi_taxonomy_ledgers(species_exception_file,
                                          genus_exception_file)
    def run(self, 
                cur_gtdb_metadata_file,
                cur_genomic_path_file,
                qc_passed_file,
                ncbi_genbank_assembly_file,
                ltp_taxonomy_file,
                gtdb_type_strains_ledger,
                untrustworthy_type_ledger):
        """Resolve cases where a species has multiple genomes assembled from the type strain.

        For every NCBI species with more than one effective type strain genome,
        pairwise ANI/AF values are calculated and, when the genomes are not all
        highly similar (ANI >= 99 and AF >= 0.65), a cascade of resolution
        strategies is tried in a fixed order until one succeeds.

        Parameters
        ----------
        cur_gtdb_metadata_file
            Passed to `Genomes.load_from_metadata_file` to build the genome set.
        cur_genomic_path_file
            Passed to `Genomes.load_genomic_file_paths`.
        qc_passed_file, ncbi_genbank_assembly_file, gtdb_type_strains_ledger
            Forwarded unchanged to `Genomes.load_from_metadata_file`.
        ltp_taxonomy_file
            Passed to `self.ltp_defined_species`.
        untrustworthy_type_ledger
            Forwarded to `Genomes.load_from_metadata_file` and also parsed here
            as a tab-separated file with a header row containing the columns
            'NCBI species' and 'Reason for declaring untrustworthy'; the genome
            ID is read from the first column.

        Side effects
        ------------
        Writes five TSV reports into `self.output_dir`, pickles per-species ANI
        results into `self.ani_pickle_dir`, writes a progress line to stdout,
        and logs summary statistics. Returns nothing.
        """
        
        # get species in LTP reference database
        self.logger.info('Determining species defined in LTP reference database.')
        ltp_defined_species = self.ltp_defined_species(ltp_taxonomy_file)
        self.logger.info(f' ... identified {len(ltp_defined_species):,} species.')
        
        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(cur_gtdb_metadata_file,
                                                gtdb_type_strains_ledger=gtdb_type_strains_ledger,
                                                create_sp_clusters=False,
                                                uba_genome_file=None,
                                                qc_passed_file=qc_passed_file,
                                                ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
                                                untrustworthy_type_ledger=untrustworthy_type_ledger)
        cur_genomes.load_genomic_file_paths(cur_genomic_path_file)
        self.logger.info(f' ... current genome set contains {len(cur_genomes):,} genomes.')
        
        # update current genomes with GTDB-Tk classifications
        # NOTE(review): `gtdbtk_classify_file` and `prev_genomes` are neither
        # parameters nor locals of this method; unless they exist as module-level
        # names this line raises NameError. Where they should come from cannot be
        # determined from this method alone -- confirm against the caller.
        self.logger.info('Updating current genomes with GTDB-Tk classifications.')
        num_updated, num_ncbi_sp = cur_genomes.set_gtdbtk_classification(gtdbtk_classify_file, prev_genomes)
        self.logger.info(f' ... set GTDB taxa for {num_updated:,} genomes with {num_ncbi_sp:,} genomes using NCBI genus and species name.')
        
        # parsing genomes manually established to be untrustworthy as type
        self.logger.info('Determining genomes manually annotated as untrustworthy as type.')
        manual_untrustworthy_types = {}
        with open(untrustworthy_type_ledger) as f:
            header = f.readline().strip().split('\t')
            
            ncbi_sp_index = header.index('NCBI species')
            reason_index = header.index('Reason for declaring untrustworthy')
            
            for line in f:
                tokens = line.strip().split('\t')
                
                gid = canonical_gid(tokens[0])
                manual_untrustworthy_types[gid] = (tokens[ncbi_sp_index], tokens[reason_index])
        self.logger.info(f' ... identified {len(manual_untrustworthy_types):,} genomes manually annotated as untrustworthy as type.')

        # identify NCBI species with multiple genomes assembled from type strain of species
        self.logger.info('Determining number of type strain genomes in each NCBI species.')
        sp_type_strain_genomes = defaultdict(set)
        for gid in cur_genomes:
            if cur_genomes[gid].is_effective_type_strain():
                ncbi_sp = cur_genomes[gid].ncbi_taxa.species
                if ncbi_sp != 's__':
                    # yes, NCBI has genomes marked as assembled from type material
                    # that do not actually have a binomial species name
                    sp_type_strain_genomes[ncbi_sp].add(gid)

        multi_type_strains_sp = [ncbi_sp for ncbi_sp, gids in sp_type_strain_genomes.items() if len(gids) > 1]
        self.logger.info(f' ... identified {len(multi_type_strains_sp):,} NCBI species with multiple assemblies indicated as being type strain genomes.')
        
        # sort by number of genome assemblies
        self.logger.info('Calculating ANI between type strain genomes in each species.')
        
        fout = open(os.path.join(self.output_dir, 'multi_type_strain_species.tsv'), 'w')
        fout.write('NCBI species\tNo. type strain genomes\t>=99% ANI\tMean ANI\tStd ANI\tMean AF\tStd AF\tResolution\tGenome IDs\n')
        
        fout_genomes = open(os.path.join(self.output_dir, 'type_strain_genomes.tsv'), 'w')
        fout_genomes.write('Genome ID\tUntrustworthy\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tConflict with prior GTDB assignment')
        fout_genomes.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_unresolved = open(os.path.join(self.output_dir, 'unresolved_type_strain_genomes.tsv'), 'w')
        fout_unresolved.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species')
        fout_unresolved.write('\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_high_divergence = open(os.path.join(self.output_dir, 'highly_divergent_type_strain_genomes.tsv'), 'w')
        fout_high_divergence.write('Genome ID\tNCBI species\tGTDB genus\tGTDB species\tLTP species\tMean ANI\tStd ANI\tMean AF\tStd AF\tExclude from RefSeq\tNCBI taxonomy\tGTDB taxonomy\n')
        
        fout_untrustworthy = open(os.path.join(self.output_dir, 'untrustworthy_type_material.tsv'), 'w')
        fout_untrustworthy.write('Genome ID\tNCBI species\tGTDB species\tLTP species\tReason for declaring untrustworthy\n')
        for gid in manual_untrustworthy_types:
            ncbi_sp, reason = manual_untrustworthy_types[gid]
            # Exactly five values for the five header columns. A stray sixth
            # argument previously pushed the curated reason past the last
            # placeholder, where str.format silently discarded it.
            fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(
                                        gid, 
                                        ncbi_sp, 
                                        cur_genomes[gid].gtdb_taxa.species,
                                        '<not tested>',
                                        'Manual curation: ' + reason))
        
        processed = 0
        num_divergent = 0
        unresolved_sp_count = 0
        
        # per-strategy tallies reported in the summary at the end
        ncbi_ltp_resolved = 0
        intra_ani_resolved = 0
        ncbi_type_resolved = 0
        gtdb_family_resolved = 0
        gtdb_genus_resolved = 0
        gtdb_sp_resolved = 0
        ltp_resolved = 0
        
        # debugging switch: reuse ANI results pickled by a previous run
        use_pickled_results = False #***
        if use_pickled_results:
            self.logger.warning('Using previously calculated ANI results in: {}'.format(self.ani_pickle_dir))
        
        prev_gtdb_sp_conflicts = 0
        for ncbi_sp, type_gids in sorted(sp_type_strain_genomes.items(), key=lambda kv: len(kv[1])):
            if len(type_gids) == 1:
                continue
                
            status_str = '-> Processing {} with {:,} type strain genomes [{:,} of {:,} ({:.2f}%)].'.format(
                                ncbi_sp, 
                                len(type_gids),
                                processed+1, 
                                len(multi_type_strains_sp),
                                (processed+1)*100.0/len(multi_type_strains_sp)).ljust(128)
            sys.stdout.write('{}\r'.format(status_str))
            sys.stdout.flush()
            processed += 1

            # calculate ANI between type strain genomes
            ncbi_sp_str = ncbi_sp[3:].lower().replace(' ', '_')
            ani_pickle_file = os.path.join(self.ani_pickle_dir, f'{ncbi_sp_str}.pkl')
            if not use_pickled_results: #***
                ani_af = self.fastani.pairwise(type_gids, cur_genomes.genomic_files)
                with open(ani_pickle_file, 'wb') as f_pickle:
                    pickle.dump(ani_af, f_pickle)
            else:
                # NOTE: pickle.load executes arbitrary code from the file; this
                # is only appropriate for cache files written by this tool.
                with open(ani_pickle_file, 'rb') as f_pickle:
                    ani_af = pickle.load(f_pickle)
            
            anis = []
            afs = []
            gid_anis = defaultdict(dict)
            gid_afs = defaultdict(dict)
            all_similar = True
            for gid1, gid2 in combinations(type_gids, 2):
                ani, af = symmetric_ani(ani_af, gid1, gid2)
                if ani < 99 or af < 0.65:
                    all_similar = False
                    
                anis.append(ani)
                afs.append(af)
                
                gid_anis[gid1][gid2] = ani
                gid_anis[gid2][gid1] = ani
                
                gid_afs[gid1][gid2] = af
                gid_afs[gid2][gid1] = af
                
            note = 'All type strain genomes have ANI >99% and AF >65%.'
            unresolved_species = False
            
            # read LTP metadata for genomes
            ltp_metadata = self.parse_ltp_metadata(type_gids, cur_genomes)

            untrustworthy_gids = {}
            gtdb_resolved_sp_conflict = False
            if not all_similar:
                # need to establish which genomes are untrustworthy as type
                num_divergent += 1
                unresolved_species = True
                
                # write out highly divergent cases for manual inspection; 
                # these should be compared to the automated selection
                if np_mean(anis) < 95:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_high_divergence.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                                        gid,
                                                        ncbi_sp,
                                                        cur_genomes[gid].gtdb_taxa.genus,
                                                        cur_genomes[gid].gtdb_taxa.species,
                                                        ' / '.join(ltp_species),
                                                        np_mean(list(gid_anis[gid].values())),
                                                        np_std(list(gid_anis[gid].values())),
                                                        np_mean(list(gid_afs[gid].values())),
                                                        np_std(list(gid_afs[gid].values())),
                                                        cur_genomes[gid].excluded_from_refseq_note,
                                                        cur_genomes[gid].ncbi_taxa,
                                                        cur_genomes[gid].gtdb_taxa))
                
                # The strategies below are tried strictly in this order; the
                # first one that reports success determines `untrustworthy_gids`.
                
                # filter genomes marked as `untrustworthy as type` at NCBI and where the LTP
                # assignment also suggest the asserted type material is incorrect
                resolved, untrustworthy_gids = self.resolve_validated_untrustworthy_ncbi_genomes(gid_anis, 
                                                                                                    ncbi_sp, 
                                                                                                    type_gids, 
                                                                                                    ltp_metadata, 
                                                                                                    ltp_defined_species,
                                                                                                    cur_genomes)
                if resolved:
                    note = "Species resolved by removing genomes considered `untrustworthy as type` and with a LTP BLAST hit confirming the assembly is likely untrustworthy"
                    ncbi_ltp_resolved += 1

                # try to resolve by LTP 16S BLAST results
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_ltp_conflict(gid_anis, ncbi_sp, type_gids, ltp_metadata, 0)
                    if resolved:
                        note = 'Species resolved by identifying conflicting or lack of LTP BLAST results'
                        ltp_resolved += 1

                # try to resolve species using intra-specific ANI test
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_intra_specific_ani(gid_anis)
                    if resolved:
                        note = 'Species resolved by intra-specific ANI test'
                        intra_ani_resolved += 1

                # try to resolve by GTDB family assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_family(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB family classifications'
                        gtdb_family_resolved += 1
                
                # try to resolve by GTDB genus assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_genus(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB genus classifications'
                        gtdb_genus_resolved += 1
                           
                # try to resolve by GTDB species assignment
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_gtdb_species(gid_anis, ncbi_sp, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting GTDB species classifications'
                        gtdb_sp_resolved += 1
                        
                # try to resolve by considering genomes annotated as type material at NCBI,
                # which includes considering if genomes are marked as untrustworthy as type
                if not resolved:
                    resolved, untrustworthy_gids = self.resolve_by_ncbi_types(gid_anis, type_gids, cur_genomes)
                    if resolved:
                        note = 'Species resolved by consulting NCBI assembled from type metadata'
                        ncbi_type_resolved += 1

                if resolved:
                    unresolved_species = False
                    
                    # check if type strain genomes marked as trusted or untrusted conflict
                    # with current GTDB species assignment
                    untrustworthy_gtdb_sp_match = False
                    trusted_gtdb_sp_match = False
                    for gid in type_gids:
                        gtdb_canonical_epithet = canonical_taxon(specific_epithet(cur_genomes[gid].gtdb_taxa.species))
                        if gtdb_canonical_epithet == specific_epithet(ncbi_sp):
                            if gid in untrustworthy_gids:
                                untrustworthy_gtdb_sp_match = True
                            else:
                                trusted_gtdb_sp_match = True

                    if untrustworthy_gtdb_sp_match and not trusted_gtdb_sp_match:
                        prev_gtdb_sp_conflicts += 1
                        gtdb_resolved_sp_conflict = True

                    # write results to file
                    for gid, reason in untrustworthy_gids.items():
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                        
                        if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note:
                            reason += "; considered `untrustworthy as type` at NCBI"
                        fout_untrustworthy.write('{}\t{}\t{}\t{}\t{}\n'.format(gid,
                                                                                ncbi_sp,
                                                                                cur_genomes[gid].gtdb_taxa.species,
                                                                                ' / '.join(ltp_species),
                                                                                reason))
                                                                                
                        # Sanity check that if the untrustworthy genome has an LTP to only the
                        # expected species, that all other genomes also have a hit to the 
                        # expected species (or potentially no hit). Otherwise, more consideration
                        # should be given to the genome with the conflicting LTP hit.
                        if len(ltp_species) == 1 and ncbi_sp in ltp_species:
                            other_sp = set()
                            for test_gid in type_gids:
                                test_ltp_species = self.ltp_species(test_gid, ltp_metadata)
                                if test_ltp_species and ncbi_sp not in test_ltp_species:
                                    other_sp.update(test_ltp_species)
                                
                            if other_sp:
                                self.logger.warning(f'Genome {gid} marked as untrustworthy, but this conflicts with high confidence LTP 16S rRNA assignment.')
                                
                    # warn about genomes NCBI flags as untrustworthy that were nevertheless retained
                    num_ncbi_untrustworthy = sum([1 for gid in type_gids if 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note])
                    if num_ncbi_untrustworthy != len(type_gids):
                        for gid in type_gids:
                            if (gid not in untrustworthy_gids 
                                and 'untrustworthy as type' in cur_genomes[gid].excluded_from_refseq_note):
                                self.logger.warning("Retaining genome {} from {} despite being marked as `untrustworthy as type` at NCBI [{:,} of {:,} considered untrustworthy].".format(
                                                        gid, 
                                                        ncbi_sp,
                                                        num_ncbi_untrustworthy,
                                                        len(type_gids)))
                else:
                    note = 'Species is unresolved; manual curation is required!'
                    unresolved_sp_count += 1
                    
                if unresolved_species:
                    for gid in type_gids:
                        ltp_species = self.ltp_species(gid, ltp_metadata)
                            
                        fout_unresolved.write('{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                                    gid,
                                    ncbi_sp,
                                    cur_genomes[gid].gtdb_taxa.genus,
                                    cur_genomes[gid].gtdb_taxa.species,
                                    ' / '.join(ltp_species),
                                    np_mean(list(gid_anis[gid].values())),
                                    np_std(list(gid_anis[gid].values())),
                                    np_mean(list(gid_afs[gid].values())),
                                    np_std(list(gid_afs[gid].values())),
                                    cur_genomes[gid].excluded_from_refseq_note,
                                    cur_genomes[gid].ncbi_taxa,
                                    cur_genomes[gid].gtdb_taxa))

            # per-genome report for every species with multiple type strain genomes
            for gid in type_gids:
                ltp_species = self.ltp_species(gid, ltp_metadata)
                    
                fout_genomes.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\t{}\n'.format(
                            gid,
                            gid in untrustworthy_gids,
                            ncbi_sp,
                            cur_genomes[gid].gtdb_taxa.genus,
                            cur_genomes[gid].gtdb_taxa.species,
                            ' / '.join(ltp_species),
                            gtdb_resolved_sp_conflict,
                            np_mean(list(gid_anis[gid].values())),
                            np_std(list(gid_anis[gid].values())),
                            np_mean(list(gid_afs[gid].values())),
                            np_std(list(gid_afs[gid].values())),
                            cur_genomes[gid].excluded_from_refseq_note,
                            cur_genomes[gid].ncbi_taxa,
                            cur_genomes[gid].gtdb_taxa))

            # per-species summary row
            fout.write('{}\t{}\t{}\t{:.2f}\t{:.3f}\t{:.3f}\t{:.4f}\t{}\t{}\n'.format(
                        ncbi_sp,
                        len(type_gids),
                        all_similar,
                        np_mean(anis),
                        np_std(anis),
                        np_mean(afs),
                        np_std(afs),
                        note,
                        ', '.join(type_gids)))

        sys.stdout.write('\n')
        fout.close()
        fout_unresolved.close()
        fout_high_divergence.close()
        fout_genomes.close()
        fout_untrustworthy.close()
        
        self.logger.info(f'Identified {num_divergent:,} species with 1 or more divergent type strain genomes.')
        self.logger.info(f' ... resolved {ncbi_ltp_resolved:,} species by removing NCBI `untrustworthy as type` genomes with a conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {ltp_resolved:,} species by considering conflicting LTP 16S rRNA classifications.')
        self.logger.info(f' ... resolved {intra_ani_resolved:,} species by considering intra-specific ANI values.')
        self.logger.info(f' ... resolved {gtdb_family_resolved:,} species by considering conflicting GTDB family classifications.')
        self.logger.info(f' ... resolved {gtdb_genus_resolved:,} species by considering conflicting GTDB genus classifications.')
        self.logger.info(f' ... resolved {gtdb_sp_resolved:,} species by considering conflicting GTDB species classifications.')
        self.logger.info(f' ... resolved {ncbi_type_resolved:,} species by considering type material designations at NCBI.')

        if unresolved_sp_count > 0:
            self.logger.warning(f'There are {unresolved_sp_count:,} unresolved species with multiple type strain genomes.')
            self.logger.warning('These should be handled before proceeding with the next step of GTDB species updating.')
            self.logger.warning("This can be done by manual curation and adding genomes to 'untrustworthy_type_ledger'.")
        
        self.logger.info(f'Identified {prev_gtdb_sp_conflicts:,} cases where resolved type strain conflicts with prior GTDB assignment.')
    def identify_misclassified_genomes_cluster(self, cur_genomes, cur_clusters,
                                               ncbi_untrustworthy_sp_ledger):
        """Identify genomes with erroneous NCBI species assignments, based on GTDB clustering of type strain genomes.

        A genome is reported as misclassified when its NCBI species is anchored
        by an effective type strain genome, but the genome sits in a different
        cluster than that type strain. Genomes listed in the manual ledger are
        reported as well.

        Parameters
        ----------
        cur_genomes
            Mapping-like genome set; iterated for genome IDs and indexed by ID.
        cur_clusters
            Mapping from representative genome ID to the IDs in its cluster.
        ncbi_untrustworthy_sp_ledger
            Path to a tab-separated file with a header row and the genome ID
            in the first column.

        Returns
        -------
        set
            Genome IDs considered to have a misclassified NCBI species.

        Side effects
        ------------
        Writes 'ncbi_misclassified_sp.gtdb_clustering.tsv' into
        `self.output_dir`. Terminates the process via `sys.exit(-1)` if an NCBI
        species has effective type strain genomes in more than one cluster.
        """

        forbidden_names = {'cyanobacterium'}

        # get mapping from genomes to their representatives
        gid_to_rid = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                gid_to_rid[cid] = rid

        # get genomes with NCBI species assignment
        ncbi_sp_gids = defaultdict(list)
        for gid in cur_genomes:
            ncbi_species = cur_genomes[gid].ncbi_taxa.species
            ncbi_specific = specific_epithet(ncbi_species)

            if ncbi_species != 's__' and ncbi_specific not in forbidden_names:
                ncbi_sp_gids[ncbi_species].append(gid)

        # get NCBI species anchored by a type strain genome
        ncbi_type_anchored_species = {}
        for rid, cids in cur_clusters.items():
            for cid in cids:
                if not cur_genomes[cid].is_effective_type_strain():
                    continue

                ncbi_type_species = cur_genomes[cid].ncbi_taxa.species
                # The epithet must come from the type strain's own species;
                # using the loop variable left over from the block above would
                # test an unrelated genome against the forbidden names.
                ncbi_type_specific = specific_epithet(ncbi_type_species)
                if (ncbi_type_species == 's__'
                        or ncbi_type_specific in forbidden_names):
                    continue

                if (ncbi_type_species in ncbi_type_anchored_species
                        and rid !=
                        ncbi_type_anchored_species[ncbi_type_species]):
                    self.logger.error(
                        'NCBI species {} has multiple effective type strain genomes in different clusters.'
                        .format(ncbi_type_species))
                    sys.exit(-1)

                ncbi_type_anchored_species[ncbi_type_species] = rid
        self.logger.info(
            ' - identified {:,} NCBI species anchored by a type strain genome.'
            .format(len(ncbi_type_anchored_species)))

        # identify genomes with erroneous NCBI species assignments
        misclassified_gids = set()
        out_file = os.path.join(self.output_dir,
                                'ncbi_misclassified_sp.gtdb_clustering.tsv')
        # `with` guarantees the report is closed even if a lookup below raises
        with open(out_file, 'w') as fout:
            fout.write(
                'Genome ID\tNCBI species\tGenome cluster\tType species cluster\n')

            for ncbi_species, species_gids in ncbi_sp_gids.items():
                if ncbi_species not in ncbi_type_anchored_species:
                    continue

                # find genomes with NCBI species assignments that are in a
                # different cluster than the type strain genome
                type_rid = ncbi_type_anchored_species[ncbi_species]
                for gid in species_gids:
                    cur_rid = gid_to_rid[gid]
                    if type_rid != cur_rid:
                        misclassified_gids.add(gid)
                        fout.write('{}\t{}\t{}\t{}\t\n'.format(
                            gid, ncbi_species, cur_rid, type_rid))

            # add in genomes manually indicated as having erroneous NCBI species assignments
            with open(ncbi_untrustworthy_sp_ledger) as f:
                f.readline()
                for line in f:
                    tokens = line.strip().split('\t')
                    gid = canonical_gid(tokens[0])
                    # include ledger genomes in the returned set, not only in the report
                    misclassified_gids.add(gid)
                    fout.write('{}\t{}\t{}\t{}\t\n'.format(
                        gid, cur_genomes[gid].ncbi_taxa.species, gid_to_rid[gid],
                        "Manually marked as being erroneous"))

        misclassified_species = {
            cur_genomes[gid].ncbi_taxa.species
            for gid in misclassified_gids
        }
        self.logger.info(
            ' - identified {:,} genomes from {:,} species as having misclassified NCBI species assignments.'
            .format(len(misclassified_gids), len(misclassified_species)))

        return misclassified_gids
Example #24
0
    def new_ncbi_genera(self, prev_genomes, cur_genomes, cur_clusters,
                        gtdbtk_classify_file):
        """Determine new NCBI genera that likely need to be considered by curators.

        A genus is "new" when it is the NCBI genus of at least one current
        cluster representative but of no genome in the previous genome set.

        Parameters
        ----------
        prev_genomes
            Previous genome set; iterated for genome IDs, indexed by ID, and
            used for membership tests.
        cur_genomes
            Current genome set; indexed by representative ID.
        cur_clusters
            Iterable of current representative genome IDs.
        gtdbtk_classify_file
            Path to a tab-separated file with a header row containing the
            columns 'classification', 'red_value' and 'note'; the genome ID is
            read from the first column.

        Side effects
        ------------
        Writes 'new_ncbi_genera.tsv' into `self.output_dir`, one row per new
        genus in sorted order. Returns nothing.
        """

        self.logger.info(
            'Determining new NCBI genera for consideration by curators.')

        # read GTDB-Tk classification information
        gtdbtk = {}
        with open(gtdbtk_classify_file) as f:
            header = f.readline().strip().split('\t')

            classification_index = header.index('classification')
            red_index = header.index('red_value')
            note_index = header.index('note')

            for line in f:
                tokens = line.strip().split('\t')
                gid = canonical_gid(tokens[0])

                classification = [
                    t.strip() for t in tokens[classification_index].split(';')
                ]
                # RED stays as the string 'N/A' when no value is reported
                red = tokens[red_index]
                if red != 'N/A':
                    red = float(red)

                gtdbtk[gid] = (classification, red, tokens[note_index])

        # get NCBI genera in previous GTDB release
        prev_ncbi_genera = {
            prev_genomes[gid].ncbi_taxa.genus for gid in prev_genomes
        }
        self.logger.info(
            ' ... identified {:,} NCBI genera in previous GTDB release.'.
            format(len(prev_ncbi_genera)))

        # get NCBI genera in current GTDB release
        cur_ncbi_genera = {
            cur_genomes[rid].ncbi_taxa.genus for rid in cur_clusters
        }
        self.logger.info(
            ' ... identified {:,} NCBI genera in current GTDB release.'.format(
                len(cur_ncbi_genera)))

        # determine new NCBI genera
        new_ncbi_genera = cur_ncbi_genera - prev_ncbi_genera
        self.logger.info(
            ' ... identified {:,} NCBI genera that are new to the current GTDB release.'
            .format(len(new_ncbi_genera)))

        # determine genomes from new NCBI genera
        new_genera_rids = defaultdict(list)
        for rid in cur_clusters:
            ncbi_genera = cur_genomes[rid].ncbi_taxa.genus
            if ncbi_genera in new_ncbi_genera:
                new_genera_rids[ncbi_genera].append(rid)

        # report each new NCBI genera; the context manager closes the report
        # even if a malformed classification raises part-way through
        out_file = os.path.join(self.output_dir, 'new_ncbi_genera.tsv')
        with open(out_file, 'w') as fout:
            fout.write(
                'NCBI genus\tRepresentative ID(s)\tNew genomes\tGTDB-Tk classification\tNCBI genus conflict\tRED\tGTDB-Tk note\n'
            )
            # sorted so the report is reproducible between runs (set order is not)
            for new_genus in sorted(new_ncbi_genera):
                new_genomes = []
                red_values = []
                gtdbtk_classification = []
                gtdbtk_notes = []
                ncbi_genus_conflict = False
                for rid in new_genera_rids[new_genus]:
                    new_genomes.append(str(rid not in prev_genomes))
                    if rid not in gtdbtk:
                        red_values.append('n/a')
                        gtdbtk_classification.append('n/a')
                        gtdbtk_notes.append('n/a')
                        continue

                    classification, red, note = gtdbtk[rid]
                    gtdbtk_notes.append(note)

                    if red != 'N/A':
                        red_values.append('{:.2f}'.format(red))
                    else:
                        red_values.append('n/a')

                    # index 5 is the genus rank and 6 the species rank; report
                    # the species when assigned, otherwise fall back to genus
                    gtdbtk_genus = classification[5]
                    if classification[6] != 's__':
                        gtdbtk_classification.append(classification[6])
                    else:
                        gtdbtk_classification.append(gtdbtk_genus)

                    if gtdbtk_genus != 'g__' and gtdbtk_genus != new_genus:
                        ncbi_genus_conflict = True

                fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    new_genus, ', '.join(new_genera_rids[new_genus]),
                    ', '.join(new_genomes), ', '.join(gtdbtk_classification),
                    str(ncbi_genus_conflict), ', '.join(red_values),
                    ', '.join(gtdbtk_notes)))
    def run(self,
            metadata_file,
            cur_uba_gid_file,
            ncbi_genbank_assembly_file,
            gtdb_domain_report,
            qc_exception_file,
            min_comp,
            max_cont,
            min_quality,
            sh_exception,
            min_perc_markers,
            max_contigs,
            min_N50,
            max_ambiguous,
            output_dir):
        """Quality check all potential GTDB genomes.

        Every genome in the current genome set is tested with
        Genome.pass_qc(). Genomes listed in the QC exception file are
        retained even when they fail. The NCBI species are then examined
        to report species whose type genomes, or all genomes, fail QC.

        The following files are written to `output_dir`:
          qc_passed.tsv            - genomes passing QC or flagged as exceptions
          qc_failed.tsv            - genomes failing QC, with per-test flags
          type_genomes_fail_qc.tsv - type genomes of species where no type
                                     genome was retained
          species_fail_qc.tsv      - failing genomes of species with no
                                     retained non-type genome
          species_lost.tsv         - per-species summary for those species

        Parameters
        ----------
        metadata_file : str
            Metadata file passed to Genomes.load_from_metadata_file().
        cur_uba_gid_file : str
            File passed to load_from_metadata_file() as `uba_genome_file`.
        ncbi_genbank_assembly_file : str
            File passed to exclude_from_refseq().
        gtdb_domain_report : str
            File passed to read_marker_percentages() and
            check_domain_assignments().
        qc_exception_file : str
            Tab-separated file with a header line; the first column gives
            the genomes to retain regardless of QC results.
        min_comp, max_cont, min_quality, sh_exception, min_perc_markers,
        max_contigs, min_N50, max_ambiguous
            QC thresholds forwarded unchanged to Genome.pass_qc().
        output_dir : str
            Directory in which the report files are created.
        """

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(metadata_file,
                                            create_sp_clusters=False,
                                            uba_genome_file=cur_uba_gid_file)
        self.logger.info(f' ...current genome set contains {len(cur_genomes):,} genomes.')

        # parse genomes flagged as exceptions from QC
        qc_exceptions = set()
        with open(qc_exception_file, encoding='utf-8') as f:
            f.readline()
            for line in f:
                gid = canonical_gid(line.split('\t')[0].strip())
                qc_exceptions.add(gid)
        self.logger.info(f'Identified {len(qc_exceptions):,} genomes flagged as exceptions from QC.')

        # get percentage of bac120 or ar122 marker genes
        marker_perc = self.read_marker_percentages(gtdb_domain_report,
                                                   cur_genomes)

        # parse NCBI assembly files
        self.logger.info('Parsing NCBI assembly files.')
        excluded_from_refseq_note = exclude_from_refseq(ncbi_genbank_assembly_file)

        def run_qc(gid):
            """Apply the QC thresholds to a genome; return (passed, failed test tally)."""
            failed_tests = defaultdict(int)
            passed_qc = cur_genomes[gid].pass_qc(marker_perc[gid],
                                                 min_comp,
                                                 max_cont,
                                                 min_quality,
                                                 sh_exception,
                                                 min_perc_markers,
                                                 max_contigs,
                                                 min_N50,
                                                 max_ambiguous,
                                                 failed_tests)
            return passed_qc, failed_tests

        def format_sh(value):
            """Format strain heterogeneity, tolerating a missing (None) value."""
            # NOTE(review): the qc_passed/qc_failed tables already treat this
            # field as possibly absent, so None is assumed possible here too;
            # a bare '%.2f' would raise TypeError in that case.
            return '%.2f' % value if value is not None else '-'

        # QC all genomes
        self.logger.info('Validating genomes.')

        header = 'Accession\tNCBI species\tGTDB taxonomy'
        header += '\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%'
        header += '\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases'

        pass_qc_gids = set()
        failed_qc_gids = set()

        # context managers guarantee the reports are closed even if QC raises
        with open(os.path.join(output_dir, 'qc_passed.tsv'), 'w') as fout_retained, \
                open(os.path.join(output_dir, 'qc_failed.tsv'), 'w') as fout_failed:
            fout_retained.write(header + '\tNote\n')
            fout_failed.write(header)
            fout_failed.write('\tFailed completeness\tFailed contamination\tFailed quality')
            fout_failed.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases\n')

            for gid in cur_genomes:
                passed_qc, failed_tests = run_qc(gid)

                # columns shared by the passed and failed tables
                genome = cur_genomes[gid]
                sh_100 = genome.strain_heterogeneity_100
                row = '%s\t%s\t%s' % (gid, genome.ncbi_taxa.species, genome.gtdb_taxa)
                row += '\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                    genome.comp,
                    genome.cont,
                    genome.comp - 5*genome.cont,
                    ('%.2f' % sh_100) if sh_100 else '-',
                    marker_perc[gid],
                    genome.contig_count,
                    genome.contig_n50,
                    genome.ambiguous_bases)

                if passed_qc or gid in qc_exceptions:
                    pass_qc_gids.add(gid)
                    fout_retained.write(row)
                    fout_retained.write('\t%s\n' % (
                        'Passed QC' if passed_qc else 'Flagged as exception'))
                else:
                    failed_qc_gids.add(gid)
                    fout_failed.write(row)
                    fout_failed.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                        failed_tests['comp'],
                        failed_tests['cont'],
                        failed_tests['qual'],
                        failed_tests['marker_perc'],
                        failed_tests['contig_count'],
                        failed_tests['N50'],
                        failed_tests['ambig']))

        # guard against an empty genome set, which would otherwise divide by zero
        num_genomes = len(cur_genomes)
        self.logger.info('Retained {:,} ({:.2f}%) genomes and filtered {:,} ({:.2f}%) genomes.'.format(
            len(pass_qc_gids),
            len(pass_qc_gids)*100.0/num_genomes if num_genomes else 0.0,
            len(failed_qc_gids),
            len(failed_qc_gids)*100.0/num_genomes if num_genomes else 0.0))

        # check domain assignment of genomes passing QC
        # report potential issues
        self.check_domain_assignments(gtdb_domain_report,
                                      cur_genomes,
                                      pass_qc_gids)

        # QC genomes in each named species
        named_ncbi_species = cur_genomes.named_ncbi_species()
        self.logger.info(f'Performing QC of type genome for each of the {len(named_ncbi_species):,} NCBI species.')

        lost_type = 0
        lost_sp = 0
        filtered_genomes = 0
        failed_tests_cumulative = defaultdict(int)

        with open(os.path.join(output_dir, 'type_genomes_fail_qc.tsv'), 'w') as fout_type_fail, \
                open(os.path.join(output_dir, 'species_fail_qc.tsv'), 'w') as fout_fail_sp, \
                open(os.path.join(output_dir, 'species_lost.tsv'), 'w') as fout_sp_lost:
            # NOTE: the 'Genome size (bp)' columns below hold length/1e6;
            # the header text is kept as-is so existing consumers of these
            # files are not broken.
            fout_type_fail.write('NCBI species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tType sources\tNCBI assembly type\tGenome size (bp)')
            fout_type_fail.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
            fout_type_fail.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases\tNCBI exclude from RefSeq\tLost species\n')

            fout_fail_sp.write('NCBI species\tAccession\tGTDB taxonomy\tNCBI taxonomy\tAssembled from type material\tGenome size (bp)')
            fout_fail_sp.write('\tCompleteness (%)\tContamination (%)\tQuality\tStrain heterogeneity at 100%')
            fout_fail_sp.write('\tMarkers (%)\tNo. contigs\tN50 contigs\tAmbiguous bases')
            fout_fail_sp.write('\tFailed completeness\tFailed contamination\tFailed quality')
            fout_fail_sp.write('\tFailed marker percentage\tFailed no. contigs\tFailed N50 contigs\tFailed ambiguous bases')
            fout_fail_sp.write('\tNCBI exclude from RefSeq\n')

            fout_sp_lost.write('NCBI species\tNo. genomes\tNo. type genomes')
            fout_sp_lost.write('\tFail completeness\tFail contamination\tFail quality\tFailed percent markers')
            fout_sp_lost.write('\tFail no. contigs\tFail N50 contigs\tFail ambiguous bases\n')

            for sp, gids in named_ncbi_species.items():
                type_pass = set()
                type_fail = set()
                other_pass = set()
                other_fail = set()

                failed_tests_gids = {}
                for gid in gids:
                    passed_qc, failed_tests = run_qc(gid)
                    failed_tests_gids[gid] = failed_tests

                    retained = passed_qc or gid in qc_exceptions
                    if cur_genomes[gid].is_gtdb_type_strain() or cur_genomes[gid].is_ncbi_type_strain():
                        if retained:
                            type_pass.add(gid)
                        else:
                            type_fail.add(gid)
                            filtered_genomes += 1
                    else:
                        if retained:
                            other_pass.add(gid)
                        else:
                            other_fail.add(gid)
                            filtered_genomes += 1

                    # tally failed tests across all genomes in named species
                    for test, count in failed_tests.items():
                        failed_tests_cumulative[test] += count

                if type_pass:
                    # great: one or more type genomes pass QC and will be selected as the type genome
                    continue

                if type_fail:
                    # all potential type genomes for species failed QC so report these for manual inspection
                    lost_type += 1
                    for gid in type_fail:
                        genome = cur_genomes[gid]
                        fout_type_fail.write('%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d\t%s\t%s\n' % (
                            sp,
                            gid,
                            genome.gtdb_taxa,
                            genome.ncbi_taxa,
                            genome.gtdb_type_designation_sources,
                            genome.ncbi_type_material,
                            float(genome.length)/1e6,
                            genome.comp,
                            genome.cont,
                            genome.comp - 5*genome.cont,
                            format_sh(genome.strain_heterogeneity_100),
                            marker_perc[gid],
                            genome.contig_count,
                            genome.contig_n50,
                            genome.ambiguous_bases,
                            excluded_from_refseq_note[gid],
                            len(other_pass) == 0))

                if len(other_pass) == 0:
                    # no genomes for species pass QC so report loss of species
                    lost_sp += 1
                    fout_sp_lost.write('%s\t%d\t%d' % (sp, len(gids), len(type_fail)))
                    fout_sp_lost.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' % (
                        sum(failed_tests_gids[gid]['comp'] for gid in gids),
                        sum(failed_tests_gids[gid]['cont'] for gid in gids),
                        sum(failed_tests_gids[gid]['qual'] for gid in gids),
                        sum(failed_tests_gids[gid]['marker_perc'] for gid in gids),
                        sum(failed_tests_gids[gid]['contig_count'] for gid in gids),
                        sum(failed_tests_gids[gid]['N50'] for gid in gids),
                        sum(failed_tests_gids[gid]['ambig'] for gid in gids)))

                    for gid in type_fail.union(other_fail):
                        genome = cur_genomes[gid]
                        fout_fail_sp.write('%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\t%.2f\t%d\t%d\t%d' % (
                            sp,
                            gid,
                            genome.gtdb_taxa,
                            genome.ncbi_taxa,
                            gid in type_fail,
                            float(genome.length)/1e6,
                            genome.comp,
                            genome.cont,
                            genome.comp - 5*genome.cont,
                            format_sh(genome.strain_heterogeneity_100),
                            marker_perc[gid],
                            genome.contig_count,
                            genome.contig_n50,
                            genome.ambiguous_bases))
                        fout_fail_sp.write('\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (
                            failed_tests_gids[gid]['comp'],
                            failed_tests_gids[gid]['cont'],
                            failed_tests_gids[gid]['qual'],
                            failed_tests_gids[gid]['marker_perc'],
                            failed_tests_gids[gid]['contig_count'],
                            failed_tests_gids[gid]['N50'],
                            failed_tests_gids[gid]['ambig']))
                        fout_fail_sp.write('\t%s\n' % excluded_from_refseq_note[gid])

        self.logger.info(f'Filtered {filtered_genomes:,} genomes assigned to NCBI species.')
        self.logger.info(f'Identified {lost_type:,} species with type genomes failing QC and {lost_sp:,} total species failing QC.')
        self.logger.info('Genomes from NCBI species filtered by each criterion:')
        for test in sorted(failed_tests_cumulative):
            self.logger.info(f'{test}: {failed_tests_cumulative[test]:,}')
Example #26
0
def read_gtdb_ncbi_taxonomy(metadata_file,
                            species_exception_file,
                            genus_exception_file):
    """Parse NCBI taxonomy from GTDB metadata and apply exception ledgers.

    Parameters
    ----------
    metadata_file : str
        Tab-separated metadata for all genomes; must contain the
        'accession' and 'ncbi_taxonomy' columns.
    species_exception_file : str or None
        Optional tab-separated ledger with a header line. The first column
        is a genome ID and the second the species name to assign.
    genus_exception_file : str or None
        Optional tab-separated ledger with a header line. The first column
        is a genome ID and the second the genus name to assign.

    Return
    ------
    tuple : (taxonomy, ncbi_update_count)
        taxonomy is d[genome_id] -> taxonomy list, and ncbi_update_count
        is the number of ledger entries applied.

    The process is terminated with sys.exit(-1) if a ledger refers to a
    genome missing from the metadata file, or if the species and genus
    ledgers disagree about a genome.
    """

    taxonomy = {}
    with open(metadata_file, encoding='utf-8') as f:
        headers = f.readline().strip().split('\t')
        genome_index = headers.index('accession')
        taxonomy_index = headers.index('ncbi_taxonomy')

        for line in f:
            line_split = [token.strip() for token in line.strip().split('\t')]
            gid = canonical_gid(line_split[genome_index])
            taxa_str = line_split[taxonomy_index].strip()
            taxa_str = taxa_str.replace('Candidatus ', '')

            if taxa_str and taxa_str != 'none':
                taxonomy[gid] = [t.strip() for t in taxa_str.split(';')]
            else:
                # no NCBI taxonomy: fall back to a copy of the rank prefixes
                taxonomy[gid] = list(Taxonomy.rank_prefixes)

    ncbi_update_count = 0
    species_updates = {}
    if species_exception_file:
        with open(species_exception_file, encoding='utf-8') as f:
            f.readline()
            for line in f:
                line_split = [token.strip() for token in line.strip().split('\t')]
                gid = canonical_gid(line_split[0])

                sp = line_split[1].replace('Candidatus ', '')
                if gid not in taxonomy:
                    print('Genome in species exception list not defined at NCBI: %s' % gid)
                    sys.exit(-1)

                if not sp.startswith('s__'):
                    sp = 's__' + sp

                taxonomy[gid][6] = sp
                ncbi_update_count += 1

                species_updates[gid] = sp

    if genus_exception_file:
        with open(genus_exception_file, encoding='utf-8') as f:
            f.readline()
            for line in f:
                line_split = [token.strip() for token in line.strip().split('\t')]
                gid = canonical_gid(line_split[0])
                genus = line_split[1]
                if gid not in taxonomy:
                    print('Genome in genus exception list not defined at NCBI: %s' % gid)
                    sys.exit(-1)

                if genus.startswith('g__'):
                    genus = genus[3:]

                taxonomy[gid][5] = f'g__{genus}'

                # keep the species name in step with the new genus
                species = taxonomy[gid][6]
                if species != 's__':
                    _, specific = generic_specific_names(species)
                    taxonomy[gid][6] = f's__{genus} {specific}'
                ncbi_update_count += 1

                # sanity check ledgers
                if gid in species_updates and genus not in species_updates[gid]:
                    # this is a module-level function, so there is no
                    # `self.logger`; report the problem the same way as the
                    # other fatal errors in this function
                    print(f'Species and genus ledgers have conflicting assignments for {gid}.')
                    sys.exit(-1)

    return taxonomy, ncbi_update_count
Example #27
0
    def load_from_metadata_file(self,
                                metadata_file,
                                species_exception_file=None,
                                genus_exception_file=None,
                                gtdb_type_strains_ledger=None,
                                create_sp_clusters=True,
                                qc_passed_file=None,
                                ncbi_genbank_assembly_file=None,
                                untrustworthy_type_ledger=None,
                                ncbi_untrustworthy_sp_ledger=None,
                                ncbi_env_bioproject_ledger=None):
        """Create genome set from file(s).

        Parameters
        ----------
        metadata_file : str
            Tab-separated GTDB metadata file with a header line.
        species_exception_file, genus_exception_file : str or None
            Ledgers forwarded to _apply_ncbi_taxonomy_ledgers() once all
            genomes have been read.
        gtdb_type_strains_ledger : str or None
            Ledger (header line skipped) whose first column lists genomes
            to mark as 'type strain of species' with source 'GTDB curator'.
        create_sp_clusters : bool
            If True, each genome is added to self.sp_clusters under its
            GTDB representative.
        qc_passed_file : str or None
            File (header line skipped) whose first column lists genomes
            passing QC. When it yields at least one ID, all other genomes
            are skipped; an empty list disables the filter.
        ncbi_genbank_assembly_file : str or None
            File passed to parse_ncbi_bioproject() and exclude_from_refseq().
        untrustworthy_type_ledger : str or None
            File passed to parse_untrustworthy_type_ledger().
        ncbi_untrustworthy_sp_ledger : str or None
            File passed to parse_ncbi_untrustworthy_sp_ledger().
        ncbi_env_bioproject_ledger : str or None
            Ledger (header line skipped) whose first column lists
            BioProjects whose genomes are forced to the genome category
            'derived from environmental source'.
        """

        pass_qc_gids = set()
        if qc_passed_file:
            with open(qc_passed_file) as f:
                f.readline()
                for line in f:
                    line_split = line.strip().split('\t')
                    # NOTE(review): IDs are taken as written (no
                    # canonical_gid call) yet compared with canonical IDs
                    # below - presumably the file already holds canonical
                    # IDs; confirm against the writer of this file
                    pass_qc_gids.add(line_split[0].strip())
            self.logger.info(
                f' - identified {len(pass_qc_gids):,} genomes passing QC.')

        gtdb_type_strains = set()
        if gtdb_type_strains_ledger:
            with open(gtdb_type_strains_ledger) as f:
                f.readline()
                for line in f:
                    tokens = line.strip().split('\t')
                    gid = canonical_gid(tokens[0].strip())
                    gtdb_type_strains.add(gid)
            self.logger.info(
                f' - identified {len(gtdb_type_strains):,} manually annotated as type strain genomes.'
            )

        excluded_from_refseq_note = {}
        ncbi_bioproject = {}
        if ncbi_genbank_assembly_file:
            ncbi_bioproject = parse_ncbi_bioproject(ncbi_genbank_assembly_file)
            excluded_from_refseq_note = exclude_from_refseq(
                ncbi_genbank_assembly_file)

        ncbi_env_bioproject = set()
        if ncbi_env_bioproject_ledger:
            with open(ncbi_env_bioproject_ledger) as f:
                f.readline()
                for line in f:
                    tokens = line.strip().split('\t')
                    ncbi_env_bioproject.add(tokens[0].strip())

        untrustworthy_as_type = set()
        if untrustworthy_type_ledger:
            untrustworthy_as_type = self.parse_untrustworthy_type_ledger(
                untrustworthy_type_ledger)
            self.logger.info(
                f' - identified {len(untrustworthy_as_type):,} genomes annotated as untrustworthy as type by GTDB.'
            )

        untrustworthy_ncbi_sp = set()
        if ncbi_untrustworthy_sp_ledger:
            untrustworthy_ncbi_sp = self.parse_ncbi_untrustworthy_sp_ledger(
                ncbi_untrustworthy_sp_ledger)
            self.logger.info(
                f' - identified {len(untrustworthy_ncbi_sp):,} genomes annotated as having untrustworthy NCBI species assignments.'
            )

        with open(metadata_file, encoding='utf-8') as f:
            headers = f.readline().strip().split('\t')

            genome_index = headers.index('accession')

            gtdb_taxonomy_index = headers.index('gtdb_taxonomy')
            ncbi_taxonomy_index = headers.index('ncbi_taxonomy')
            ncbi_taxonomy_unfiltered_index = headers.index(
                'ncbi_taxonomy_unfiltered')

            gtdb_type_index = headers.index('gtdb_type_designation')
            gtdb_type_sources_index = headers.index(
                'gtdb_type_designation_sources')
            gtdb_type_species_of_genus_index = headers.index(
                'gtdb_type_species_of_genus')
            ncbi_strain_identifiers_index = headers.index(
                'ncbi_strain_identifiers')
            ncbi_type_index = headers.index('ncbi_type_material_designation')
            ncbi_asm_level_index = headers.index('ncbi_assembly_level')
            ncbi_genome_representation_index = headers.index(
                'ncbi_genome_representation')
            ncbi_refseq_cat_index = headers.index('ncbi_refseq_category')
            ncbi_genome_cat_index = headers.index('ncbi_genome_category')

            comp_index = headers.index('checkm_completeness')
            cont_index = headers.index('checkm_contamination')
            # optional column: strain heterogeneity defaults to 0 if absent
            sh_100_index = None
            if 'checkm_strain_heterogeneity_100' in headers:
                sh_100_index = headers.index('checkm_strain_heterogeneity_100')
            gs_index = headers.index('genome_size')
            contig_count_index = headers.index('contig_count')
            n50_index = headers.index('n50_contigs')
            scaffold_count_index = headers.index('scaffold_count')
            ambiguous_bases_index = headers.index('ambiguous_bases')
            total_gap_len_index = headers.index('total_gap_length')
            ssu_count_index = headers.index('ssu_count')
            ssu_length_index = headers.index('ssu_length')
            ncbi_molecule_count_index = headers.index('ncbi_molecule_count')
            ncbi_unspanned_gaps_index = headers.index('ncbi_unspanned_gaps')
            ncbi_spanned_gaps_index = headers.index('ncbi_spanned_gaps')

            gtdb_genome_rep_index = headers.index('gtdb_genome_representative')
            gtdb_rep_index = headers.index('gtdb_representative')

            # resolve the optional column once rather than scanning the
            # header list again for every genome
            lpsn_priority_index = None
            if 'lpsn_priority_year' in headers:
                # this information will be missing from the previous
                # GTDB metadata file as we strip this out due to
                # concerns over republishing this information
                lpsn_priority_index = headers.index('lpsn_priority_year')

            for line in f:
                line_split = line.strip().split('\t')

                ncbi_accn = line_split[genome_index]
                gid = canonical_gid(ncbi_accn)
                # recorded for every row, including genomes skipped below
                self.full_gid[gid] = ncbi_accn

                if gid.startswith('U_'):
                    continue

                if pass_qc_gids and gid not in pass_qc_gids:
                    continue

                gtdb_taxonomy = Taxa(line_split[gtdb_taxonomy_index])

                ncbi_taxonomy = Taxa(line_split[ncbi_taxonomy_index])
                ncbi_taxonomy_unfiltered = Taxa(
                    line_split[ncbi_taxonomy_unfiltered_index], filtered=False)

                gtdb_type = line_split[gtdb_type_index]
                gtdb_type_sources = line_split[gtdb_type_sources_index]
                if gid in gtdb_type_strains:
                    # curator ledger overrides the metadata designation
                    gtdb_type = 'type strain of species'
                    gtdb_type_sources = 'GTDB curator'
                gtdb_type_species_of_genus = line_split[
                    gtdb_type_species_of_genus_index] == 't'

                ncbi_type = line_split[ncbi_type_index]
                ncbi_strain_identifiers = line_split[
                    ncbi_strain_identifiers_index]
                ncbi_asm_level = line_split[ncbi_asm_level_index]
                ncbi_genome_representation = line_split[
                    ncbi_genome_representation_index]
                ncbi_refseq_cat = line_split[ncbi_refseq_cat_index]
                ncbi_genome_cat = line_split[ncbi_genome_cat_index]

                if ncbi_bioproject.get(gid,
                                       None) in ncbi_env_bioproject:  # ***
                    # HACK to force genomes from MAG mining projects
                    # to be indicated as MAGs which are currently
                    # not correctly annotated at NCBI
                    ncbi_genome_cat = 'derived from environmental source'

                comp = float(line_split[comp_index])
                cont = float(line_split[cont_index])
                sh_100 = 0
                # explicit None test: a column index of 0 is valid but falsy
                if sh_100_index is not None:
                    sh_100 = self._convert_float(line_split[sh_100_index])
                gs = int(line_split[gs_index])
                contig_count = int(line_split[contig_count_index])
                n50 = int(line_split[n50_index])
                scaffold_count = int(line_split[scaffold_count_index])
                ambiguous_bases = int(line_split[ambiguous_bases_index])
                total_gap_len = int(line_split[total_gap_len_index])
                ssu_count = int(line_split[ssu_count_index])
                ssu_length = self._convert_int(line_split[ssu_length_index])
                ncbi_molecule_count = self._convert_int(
                    line_split[ncbi_molecule_count_index])
                ncbi_unspanned_gaps = self._convert_int(
                    line_split[ncbi_unspanned_gaps_index])
                ncbi_spanned_gaps = self._convert_int(
                    line_split[ncbi_spanned_gaps_index])

                gtdb_is_rep = line_split[gtdb_rep_index] == 't'
                gtdb_rid = canonical_gid(line_split[gtdb_genome_rep_index])
                if create_sp_clusters:
                    self.sp_clusters.update_sp_cluster(gtdb_rid, gid,
                                                       gtdb_taxonomy.species)

                lpsn_priority_year = Genome.NO_PRIORITY_YEAR
                if lpsn_priority_index is not None:
                    lpsn_priority_year = self._convert_int(
                        line_split[lpsn_priority_index],
                        Genome.NO_PRIORITY_YEAR)

                self.genomes[gid] = Genome(
                    gid, ncbi_accn, gtdb_rid, gtdb_is_rep, gtdb_taxonomy,
                    ncbi_taxonomy, ncbi_taxonomy_unfiltered, gtdb_type,
                    gtdb_type_sources, gtdb_type_species_of_genus,
                    gid in untrustworthy_as_type, gid in untrustworthy_ncbi_sp,
                    ncbi_type, ncbi_strain_identifiers, ncbi_asm_level,
                    ncbi_genome_representation,
                    ncbi_refseq_cat, ncbi_genome_cat,
                    excluded_from_refseq_note.get(gid, ''), comp, cont, sh_100,
                    gs, contig_count, n50, scaffold_count, ambiguous_bases,
                    total_gap_len, ssu_count, ssu_length, ncbi_molecule_count,
                    ncbi_unspanned_gaps, ncbi_spanned_gaps, lpsn_priority_year)

        self._apply_ncbi_taxonomy_ledgers(species_exception_file,
                                          genus_exception_file)
    def run(self, prev_gtdb_metadata_file, cur_gtdb_metadata_file,
            cur_genome_paths, ncbi_assembly_summary_genbank):
        """Identify new or modified genomes."""

        self.logger.info('Reading previous GTDB genomes.')
        prev_accns = {}
        gtdb_taxonomy = {}
        gtdb_rep = {}
        ncbi_genome_category = {}
        with open(prev_gtdb_metadata_file, encoding='utf-8') as f:
            header = f.readline().strip().split('\t')

            gtdb_index = header.index('gtdb_taxonomy')
            gtdb_rep_index = header.index('gtdb_representative')
            ncbi_genome_cat_index = header.index('ncbi_genome_category')

            for line in f:
                tokens = line.strip().split('\t')

                gid = tokens[0]
                if gid.startswith('U'):  # only concerned with genomes at NCBI
                    continue

                cid = canonical_gid(gid)
                prev_accns[cid] = gid
                gtdb_taxonomy[cid] = tokens[gtdb_index]
                gtdb_rep[cid] = tokens[gtdb_rep_index]
                ncbi_genome_category[cid] = tokens[ncbi_genome_cat_index]

        self.logger.info(f' - identified {len(prev_accns):,} genomes.')

        # get genomes in current release
        self.logger.info('Reading current GTDB genomes.')
        cur_accns = {}
        with open(cur_gtdb_metadata_file, encoding='utf-8') as f:
            f.readline()
            for line in f:
                tokens = line.strip().split('\t')
                gid = tokens[0]
                if gid.startswith('U'):  # only concerned with genomes at NCBI
                    continue

                cur_accns[canonical_gid(gid)] = gid
        self.logger.info(f' - identified {len(cur_accns):,} genomes.')

        # get equivalent GenBank and RefSeq genome assemblies
        self.logger.info(
            'Determining identical GenBank and RefSeq accessions.')
        identical_accns = {}
        with open(ncbi_assembly_summary_genbank, encoding='utf-8') as f:
            for line in f:
                if line.startswith('#'):
                    if 'assembly_accession' in line:
                        header = line.strip().split('\t')

                        gb_accn_index = header.index('# assembly_accession')
                        rs_accn_index = header.index('gbrs_paired_asm')
                        paired_asm_index = header.index('paired_asm_comp')
                else:
                    tokens = line.strip().split('\t')

                    paired_asm = tokens[paired_asm_index]
                    if paired_asm.lower() == 'identical':
                        gb_accn = tokens[gb_accn_index]
                        rs_accn = tokens[rs_accn_index]
                        identical_accns[gb_accn] = rs_accn
                        identical_accns[rs_accn] = gb_accn

        # identify new and modified genome IDs
        self.logger.info('Identifying new or modified genome IDs.')
        new_gids = set()
        updated_gids = set()
        for cur_gid in cur_accns:
            if cur_gid in prev_accns:
                if not self.same_genome_accn(cur_accns[cur_gid],
                                             prev_accns[cur_gid],
                                             identical_accns):
                    updated_gids.add(cur_gid)
            else:
                # genome not present in previous GTDB release
                new_gids.add(cur_gid)

        lost_gids = set(prev_accns) - set(cur_accns)
        num_lost_gtdb_reps = sum(
            [1 for gid in lost_gids if gtdb_rep[gid] == 't'])
        self.logger.info(
            f' - identified {len(new_gids):,} new, {len(updated_gids):,} updated, and {len(lost_gids):,} lost genomes.'
        )
        self.logger.info(
            f' - {num_lost_gtdb_reps:,} lost genomes were GTDB representatives.'
        )

        # get path to current GTDB genome directories
        self.logger.info(
            'Identifying path to genomic files for current GTDB genomes.')
        cur_genome_files = {}
        skipped_genomes = 0
        fout = open(os.path.join(self.output_dir, 'skipped_genomes.tsv'), 'w')
        with open(cur_genome_paths) as f:
            for line in f:
                tokens = line.strip().split('\t')
                accn = tokens[0]
                genome_path = tokens[1]
                gid = tokens[2]
                assert canonical_gid(accn) == gid

                if gid not in cur_accns:
                    # a genome may not be part of the GTDB release
                    # (e.g., genome has no NCBI taxonomy information
                    # or Prodigal failed to call genes such as for
                    # GCA_000716285.1)
                    skipped_genomes += 1
                    fout.write(gid + '\n')
                    continue

                assembly_id = os.path.basename(os.path.normpath(genome_path))
                genomic_file = os.path.join(genome_path,
                                            assembly_id + '_genomic.fna')
                cur_genome_files[gid] = genomic_file

        fout.close()

        self.logger.info(
            f' - identified genomic file for {len(cur_genome_files):,} genomes.'
        )
        self.logger.info(
            f' - skipped {skipped_genomes:,} genomes without GTDB metadata.')

        # write out new or modified genome IDs
        self.logger.info(
            'Writing out path to new and updated genomic FASTA files.')
        output_file = os.path.join(self.output_dir, 'genomes_new_updated.tsv')
        fout = open(output_file, 'w')
        fout.write('Genome ID\tNCBI accession\tStatus\tGenomic file\n')
        for type_str, gids in [('NEW', new_gids), ('UPDATED', updated_gids)]:
            for gid in gids:
                genomic_file = cur_genome_files[gid]
                fout.write('{}\t{}\t{}\t{}\n'.format(gid, cur_accns[gid],
                                                     type_str, genomic_file))
        fout.close()

        # write out lost genomes
        output_file = os.path.join(self.output_dir, 'genomes_lost.tsv')
        fout = open(output_file, 'w')
        fout.write(
            'Genome ID\tGTDB taxonomy\tGTDB representative\tNCBI genome category\n'
        )
        for gid in lost_gids:
            fout.write('{}\t{}\t{}\t{}\n'.format(gid, gtdb_taxonomy[gid],
                                                 gtdb_rep[gid],
                                                 ncbi_genome_category[gid]))
        fout.close()