Beispiel #1
0
    def parse_metadata(self, metadata_file, qc_passed):
        """Collect quality and taxonomy for QC-passing GTDB representatives.

        Parameters
        ----------
        metadata_file : str
            Tab-separated metadata file with a header row containing the
            columns 'gtdb_representative' and 'gtdb_taxonomy'. The first
            column of each row is used as the genome ID.
        qc_passed : dict
            d[gid] -> (completeness, contamination) for genomes passing QC.

        Returns
        -------
        tuple of dict
            (d[gid] -> GenomeQuality(comp, cont), d[gid] -> list of taxa),
            restricted to rows flagged as representatives whose canonical
            genome ID is a key of `qc_passed`.
        """

        GenomeQuality = namedtuple('GenomeQuality', 'comp cont')

        quality_by_gid = {}
        taxonomy_by_gid = {}
        with open(metadata_file) as fin:
            columns = fin.readline().strip().split('\t')

            rep_col = columns.index('gtdb_representative')
            taxonomy_col = columns.index('gtdb_taxonomy')

            for row in fin:
                fields = row.strip().split('\t')

                gid = canonical_gid(fields[0])

                # any value starting with 't' or 'T' marks a representative
                is_rep = fields[rep_col].lower().startswith('t')
                if not is_rep or gid not in qc_passed:
                    continue

                completeness, contamination = qc_passed[gid]
                quality_by_gid[gid] = GenomeQuality(completeness,
                                                    contamination)
                taxonomy_by_gid[gid] = [
                    taxon.strip() for taxon in fields[taxonomy_col].split(';')
                ]

        return quality_by_gid, taxonomy_by_gid
Beispiel #2
0
    def run(self, gtdb_init_taxonomy, gtdb_sp_clusters, gtdb_prev_sp_clusters,
            gtdb_decorate_table):
        """Create curation lists and pseudo-trees.

        Parameters
        ----------
        gtdb_init_taxonomy : str
            Tab-separated file with a genome ID in the first column and a
            semicolon-separated taxonomy string in the second column.
        gtdb_sp_clusters : str
            Current species cluster file; passed to new_gtdb_reps().
        gtdb_prev_sp_clusters : str
            Previous species cluster file; passed to new_gtdb_reps().
        gtdb_decorate_table : str
            Decoration table; passed to poly_rogue_gtdb_reps().
        """

        # get genomes
        self.logger.info('Identifying taxonomic assignment of genomes.')
        taxa_gid_map = defaultdict(set)
        domain_gids = set()

        # use a context manager so the taxonomy file is closed even if a
        # malformed line raises part way through
        with open(gtdb_init_taxonomy) as f:
            for line in f:
                tokens = line.strip().split('\t')
                gid = canonical_gid(tokens[0])

                # record the genome under every taxon in its taxonomy string
                for taxon in tokens[1].split(';'):
                    taxa_gid_map[taxon.strip()].add(gid)

                domain_gids.add(gid)
        self.logger.info(' - identified {:,} genomes.'.format(
            len(domain_gids)))

        # new GTDB representatives
        self.new_gtdb_reps(domain_gids, gtdb_sp_clusters,
                           gtdb_prev_sp_clusters)

        # polyphyletic and rogue GTDB representatives
        self.poly_rogue_gtdb_reps(domain_gids, taxa_gid_map,
                                  gtdb_decorate_table)
Beispiel #3
0
    def new_gtdb_reps(self, domain_gids, gtdb_sp_clusters,
                      gtdb_prev_sp_clusters):
        """Write curation list and pseudo-tree of new GTDB representatives.

        A representative is considered new when it is listed in the current
        species cluster file, is a member of `domain_gids`, and is not
        listed in the previous species cluster file.

        Parameters
        ----------
        domain_gids : set
            Canonical genome IDs in the domain being processed.
        gtdb_sp_clusters : str
            Current species cluster file (tab-separated, header row,
            representative ID in the first column).
        gtdb_prev_sp_clusters : str
            Previous species cluster file in the same layout.
        """

        def read_rep_ids(cluster_file):
            """Read canonical representative IDs from first column of file."""

            rids = set()
            with open(cluster_file) as f:
                f.readline()  # skip header
                for line in f:
                    tokens = line.strip().split('\t')
                    rids.add(canonical_gid(tokens[0]))

            return rids

        self.logger.info('Identifying previous GTDB representatives.')
        prev_rids = read_rep_ids(gtdb_prev_sp_clusters)
        self.logger.info(
            ' - identified {:,} previous GTDB representatives.'.format(
                len(prev_rids)))

        self.logger.info('Identifying current GTDB representatives.')
        cur_rids = read_rep_ids(gtdb_sp_clusters)
        self.logger.info(
            ' - identified {:,} current GTDB representatives.'.format(
                len(cur_rids)))

        self.logger.info(
            'Creating curation list and pseudo-tree of new GTDB representatives.'
        )
        out_file = os.path.join(self.output_dir,
                                f'gids_new_reps.{self.domain}.lst')
        new_rids = set()

        # context manager guarantees the list is flushed and closed even
        # if a write fails
        with open(out_file, 'w') as fout:
            for rid in cur_rids:
                if rid in domain_gids and rid not in prev_rids:
                    fout.write('{}\n'.format(rid))
                    new_rids.add(rid)
        self.logger.info(' - identified {:,} new GTDB representatives.'.format(
            len(new_rids)))

        self.pseudo_tree(new_rids, out_file.replace('.lst', '.tree'))
Beispiel #4
0
    def _get_genome_id(self, genome_path):
        """Derive the canonical genome ID from the path to a genomic file.

        The file name is reduced to its first two underscore-separated
        tokens. Names starting with 'GCA_' are prefixed with 'GB_' and names
        starting with 'GCF_' are prefixed with 'RS_' before the result is
        passed through canonical_gid().
        """

        file_name = ntpath.basename(genome_path)
        accession = '_'.join(file_name.split('_')[:2])

        if file_name.startswith('GCA_'):
            accession = 'GB_' + accession
        elif file_name.startswith('GCF_'):
            accession = 'RS_' + accession

        return canonical_gid(accession)
Beispiel #5
0
    def read(self, taxonomy_file, use_canonical_gid=False):
        """Read Greengenes-style taxonomy file.

        Expected format is:
            <id><TAB><taxonomy string>

        where the taxonomy string has the formats:
            d__; c__; o__; f__; g__; s__

        Parameters
        ----------
        taxonomy_file : str
            Greengenes-style taxonomy file.
        use_canonical_gid : bool
            If True, each ID is passed through canonical_gid() before being
            used as a key.

        Returns
        -------
        dict : d[unique_id] -> [d__<taxon>, ..., s__<taxon>]
            Taxa indexed by unique ids.

        Raises
        ------
        Exception
            Any error raised while parsing a line is logged along with
            the line number and then re-raised unchanged.
        """

        check_file_exists(taxonomy_file)

        d = {}

        # context manager ensures the file handle is released on both the
        # success and the failure path
        with open(taxonomy_file) as f:
            # pre-bind so the error handler can always report a line number,
            # even when the failure occurs before the first line is yielded
            row = 0
            try:
                for row, line in enumerate(f, start=1):
                    line_split = line.split('\t')
                    unique_id = line_split[0]

                    if use_canonical_gid:
                        unique_id = canonical_gid(unique_id)

                    tax_str = line_split[1].rstrip()
                    if tax_str[-1] == ';':
                        # remove trailing semicolons which sometimes
                        # appear in Greengenes-style taxonomy files
                        tax_str = tax_str[0:-1]

                    d[unique_id] = [x.strip() for x in tax_str.split(';')]
            except Exception:
                self.logger.error('Failed to parse taxonomy file on line %d' %
                                  row)
                raise

        return d
Beispiel #6
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        Reads the taxonomy of representative genomes and writes a taxonomy
        file in which every genome clustered with a representative is given
        the taxonomy string of that representative.

        Parameters
        ----------
        options : object
            Must provide the attributes `input_taxonomy`, `metadata_file`,
            `uba_mapping_file` (may be falsy to skip ID mapping), and
            `output_taxonomy`.

        Notes
        -----
        Terminates the program via sys.exit(-1) if a representative has no
        entry in the input taxonomy, or if the input taxonomy contains a
        genome that is not flagged as a representative.
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        # optional two-column mapping between genome identifiers
        user_to_uba = {}
        if options.uba_mapping_file:
            self.logger.info('Parsing genome ID mapping file.')
            with open(options.uba_mapping_file) as f:
                for line in f:
                    tokens = line.strip().split('\t')
                    if len(tokens) == 2:
                        user_to_uba[tokens[0]] = tokens[1]
            self.logger.info(' - found mappings for {:,} genomes.'.format(
                len(user_to_uba)))

        # get representative genome information
        rep_metadata = read_gtdb_metadata(
            options.metadata_file,
            ['gtdb_representative', 'gtdb_clustered_genomes'])

        # re-key metadata by canonical ID, then apply the ID mapping
        rep_metadata = {
            canonical_gid(gid): values
            for gid, values in rep_metadata.items()
        }

        rep_metadata = {
            user_to_uba.get(gid, gid): values
            for gid, values in rep_metadata.items()
        }

        explict_tax = Taxonomy().read(options.input_taxonomy)

        self.logger.info(f' - identified {len(rep_metadata):,} genomes')

        # sanity check all representatives have a taxonomy string
        rep_count = 0
        for gid, values in rep_metadata.items():
            is_rep_genome, _clustered_genomes = values
            if is_rep_genome:
                rep_count += 1
                if gid not in explict_tax:
                    self.logger.error(
                        'Expected to find {} in input taxonomy as it is a GTDB representative.'
                        .format(gid))
                    sys.exit(-1)

        self.logger.info(
            'Identified {:,} representatives in metadata file and {:,} genomes in input taxonomy file.'
            .format(rep_count, len(explict_tax)))

        # propagate taxonomy to genomes clustered with each representative;
        # the context manager closes the output file on every exit path,
        # including the sys.exit() below
        with open(options.output_taxonomy, 'w') as fout:
            for rid, taxon_list in explict_tax.items():
                taxonomy_str = ';'.join(taxon_list)
                rid = canonical_gid(rid)
                rid = user_to_uba.get(rid, rid)

                is_rep_genome, clustered_genomes = rep_metadata[rid]
                if not is_rep_genome:
                    self.logger.error(
                        'Did not expected to find {} in input taxonomy as it is not a GTDB representative.'
                        .format(rid))
                    sys.exit(-1)

                # assign taxonomy to representative and all genomes in the cluster
                fout.write('{}\t{}\n'.format(rid, taxonomy_str))
                for cid in clustered_genomes.split(';'):
                    cid = canonical_gid(cid.strip())
                    cid = user_to_uba.get(cid, cid)
                    if cid == rid:
                        # representative has already been written
                        continue

                    if cid in rep_metadata:
                        fout.write('{}\t{}\n'.format(cid, taxonomy_str))
                    else:
                        self.logger.warning(
                            'Skipping {} as it is not in GTDB metadata file.'
                            .format(cid))

        self.logger.info('Taxonomy written to: {}'.format(
            options.output_taxonomy))
Beispiel #7
0
    def run(self, gtdb_bac_hits_file, gtdb_ar_hits_file,
            gtdb_bac_metadata_file, gtdb_ar_metadata_file, checkm_v2_rep_file,
            alt_marker_assign, min_comp, max_cont, min_single_copy,
            min_phyla_rate, output_dir):
        """Get ubiquitous, single-copy genes across GTDB species.

        Parameters
        ----------
        gtdb_bac_hits_file, gtdb_ar_hits_file : str
            Marker gene hit files for the bacterial and archaeal domains;
            passed to determine_marker_genes().
        gtdb_bac_metadata_file, gtdb_ar_metadata_file : str
            GTDB metadata files for each domain; passed to parse_metadata().
        checkm_v2_rep_file : str
            Tab-separated file with a header containing 'Completeness' and
            'Contamination' columns and the genome ID in the first column.
        alt_marker_assign : str
            Path to a tab-separated table (header row, HMM ID in the first
            column, ';'-separated '<marker>:<count>' entries in the second),
            or the string 'none' (any case) to skip this step.
        min_comp : float
            Minimum completeness for a genome to pass QC.
        max_cont : float
            Maximum contamination for a genome to pass QC.
        min_single_copy : float
            Threshold passed to determine_marker_genes() and
            single_copy_phylum_table().
        min_phyla_rate : float
            Threshold passed to filter_mg_across_phyla().
        output_dir : str
            Directory receiving the phylum tables and HMM files.
        """

        # get CheckM v2 quality estimates
        qc_passed = {}
        with open(checkm_v2_rep_file) as f:
            header = f.readline().rstrip().split('\t')

            comp_idx = header.index('Completeness')
            cont_idx = header.index('Contamination')

            for line in f:
                tokens = line.rstrip().split('\t')

                comp = float(tokens[comp_idx])
                cont = float(tokens[cont_idx])

                if comp >= min_comp and cont <= max_cont:
                    gid = canonical_gid(tokens[0])
                    qc_passed[gid] = (comp, cont)

        # get representative genomes meeting quality criteria
        bac_qual = None
        ar_qual = None
        bac_taxonomy = None
        ar_taxonomy = None
        for domain, domain_metadata_file in [
            ('bacterial', gtdb_bac_metadata_file),
            ('archaeal', gtdb_ar_metadata_file)
        ]:

            self.logger.info(
                f'Identifying {domain} genomes meeting quality criteria:')

            cur_qual, cur_taxonomy = self.parse_metadata(
                domain_metadata_file, qc_passed)
            self.logger.info(
                f' - identified {len(cur_qual):,} genomes passing QC')

            comp = [q.comp for q in cur_qual.values()]
            self.logger.info(
                f' - completeness: {np_mean(comp):.1f} +/- {np_std(comp):.1f}')

            cont = [q.cont for q in cur_qual.values()]
            self.logger.info(
                f' - contamination: {np_mean(cont):.1f} +/- {np_std(cont):.1f}'
            )

            if domain == 'bacterial':
                bac_qual = cur_qual
                bac_taxonomy = cur_taxonomy
            else:
                ar_qual = cur_qual
                ar_taxonomy = cur_taxonomy

        # get ubiquitous, single-copy marker genes across
        # all GTDB species representatives
        bac_mg = None
        ar_mg = None
        bac_genome_mgs = None
        ar_genome_mgs = None
        for domain, domain_qual, domain_hit_file in [
            ('bacterial', bac_qual, gtdb_bac_hits_file),
            ('archaeal', ar_qual, gtdb_ar_hits_file)
        ]:

            self.logger.info(
                f'Identifying ubiqutious, single-copy {domain} marker genes:')
            cur_mg, cur_genome_mgs = self.determine_marker_genes(
                domain_hit_file, domain_qual, min_single_copy)
            self.logger.info(f' - identified {len(cur_mg):,} marker genes')

            sc_rates = list(cur_mg.values())
            self.logger.info(
                f' - single-copy rate: {np_mean(sc_rates):.1f} +/- {np_std(sc_rates):.1f}'
            )

            if domain == 'bacterial':
                bac_mg = cur_mg
                bac_genome_mgs = cur_genome_mgs
            else:
                ar_mg = cur_mg
                ar_genome_mgs = cur_genome_mgs

        # remove marker genes that are not predominately single copy across all phyla
        self.logger.info(
            'Identifying marker genes not predominately single copy across the majority of phyla:'
        )

        filtered_bac_mgs = self.filter_mg_across_phyla(min_phyla_rate,
                                                       bac_taxonomy, bac_mg,
                                                       bac_genome_mgs)
        for mg in filtered_bac_mgs:
            del bac_mg[mg]

        self.logger.info(
            f' - removed {len(filtered_bac_mgs):,} bacterial marker genes')
        self.logger.info(f' - retained {len(bac_mg):,} bacterial marker genes')

        filtered_ar_mgs = self.filter_mg_across_phyla(min_phyla_rate,
                                                      ar_taxonomy, ar_mg,
                                                      ar_genome_mgs)
        for mg in filtered_ar_mgs:
            del ar_mg[mg]

        self.logger.info(
            f' - removed {len(filtered_ar_mgs):,} archaeal marker genes')
        self.logger.info(f' - retained {len(ar_mg):,} archaeal marker genes')

        # create table indicating single-copy rate of genes for each phylum
        bac_table = os.path.join(output_dir, 'phylum_mg_table_bac.tsv')
        self.single_copy_phylum_table(bac_mg, bac_taxonomy, bac_genome_mgs,
                                      min_single_copy, bac_table)

        ar_table = os.path.join(output_dir, 'phylum_mg_table_ar.tsv')
        self.single_copy_phylum_table(ar_mg, ar_taxonomy, ar_genome_mgs,
                                      min_single_copy, ar_table)

        # get additional markers required to ensure robust annotation
        # of selected marker genes
        self.logger.info(
            'Adding additional HMMs required to robustly annotate selected marker genes:'
        )
        added_hmms = set()
        if alt_marker_assign.lower() != 'none':
            with open(alt_marker_assign) as f:
                f.readline()  # skip header

                for line in f:
                    tokens = line.strip().split('\t')

                    hmm_id = tokens[0]
                    alt_hits = tokens[1]

                    # invariant for all alternative hits on this line
                    is_selected_hmm = hmm_id in bac_mg or hmm_id in ar_mg

                    for marker_info in alt_hits.split(';'):
                        marker_id, _count = marker_info.split(':')
                        if (is_selected_hmm and marker_id not in bac_mg
                                and marker_id not in ar_mg):
                            added_hmms.add(marker_id)

        # internal invariant: added HMMs never overlap the selected markers
        assert len(added_hmms.intersection(set(bac_mg))) == 0
        assert len(added_hmms.intersection(set(ar_mg))) == 0

        self.logger.info(f' - added {len(added_hmms):,} additional HMMs')

        # pull out bacterial TIGRFAM and Pfam HMMs
        self.logger.info(f'Pulling out TIGRfam and Pfam HMMs to: {output_dir}')
        bac_tigr_markers = {mg for mg in bac_mg if mg.startswith('TIGR')}
        bac_pfam_markers = {mg for mg in bac_mg if mg.startswith('PF')}
        ar_tigr_markers = {mg for mg in ar_mg if mg.startswith('TIGR')}
        ar_pfam_markers = {mg for mg in ar_mg if mg.startswith('PF')}
        self.get_hmms(bac_tigr_markers, bac_pfam_markers, ar_tigr_markers,
                      ar_pfam_markers, added_hmms, output_dir)

        # create binary index for PFAM HMMs; invoke hmmpress with an
        # argument list rather than a shell string so that output paths
        # containing spaces or shell metacharacters are handled correctly
        import subprocess

        pfam_hmm_file = os.path.join(output_dir, 'pfam.hmm')
        try:
            proc = subprocess.run(['hmmpress', pfam_hmm_file], check=False)
        except OSError as e:
            self.logger.error(f'Failed to execute hmmpress: {e}')
        else:
            if proc.returncode != 0:
                self.logger.warning(
                    f'hmmpress exited with return code {proc.returncode}.')
Beispiel #8
0
    def poly_rogue_gtdb_reps(self, domain_gids, taxa_gid_map,
                             gtdb_decorate_table):
        """Write curation lists for polyphyletic and rogue representatives.

        Every taxon in the decoration table with an F-measure below 1.0 is
        treated as polyphyletic. All genomes assigned to such a taxon are
        written to the polyphyletic list, and the genomes named in the
        rogue-in and rogue-out columns of that row are written to the rogue
        list. A pseudo-tree is created for each list.

        Parameters
        ----------
        domain_gids : set
            Genome IDs in the domain being processed (not used here).
        taxa_gid_map : defaultdict(set)
            d[taxon] -> set of genome IDs assigned to the taxon.
        gtdb_decorate_table : str
            Tab-separated table with a header row; the taxon is read from
            column 0, the F-measure from column 2, and comma-separated
            rogue genome IDs from columns 7 and 8.
        """

        self.logger.info(
            'Identifying polyphyletic and rogue GTDB representatives.')
        poly_taxa_count = 0
        poly_gids = set()
        rogue_gids = set()
        with open(gtdb_decorate_table) as f:
            f.readline()  # skip header
            for line in f:
                tokens = line.split('\t')

                taxon = tokens[0]
                fmeasure = float(tokens[2])
                rogue_in = tokens[7].strip()
                rogue_out = tokens[8].strip()
                if fmeasure >= 1.0:
                    continue

                poly_taxa_count += 1
                poly_gids.update(taxa_gid_map[taxon])

                # rogue-in and rogue-out genomes are handled identically
                for rogue_field in (rogue_in, rogue_out):
                    if not rogue_field:
                        continue

                    for gid in rogue_field.split(','):
                        gid = canonical_gid(gid.strip())
                        # IDs starting with 'D-' are excluded from the list
                        if not gid.startswith('D-'):
                            rogue_gids.add(gid)

        self.logger.info(
            ' - identified {:,} polyphyletic taxa spanning {:,} GTDB representatives.'
            .format(poly_taxa_count, len(poly_gids)))
        self.logger.info(
            ' - identified {:,} rogue GTDB representatives.'.format(
                len(rogue_gids)))

        self.logger.info(
            'Creating curation lists and pseudo-trees of polyphyletic GTDB representatives.'
        )
        out_file = os.path.join(self.output_dir,
                                f'gids_poly_taxa.{self.domain}.lst')
        # context manager closes the list even if a write fails
        with open(out_file, 'w') as fout:
            for gid in poly_gids:
                fout.write('{}\n'.format(gid))
        self.pseudo_tree(poly_gids, out_file.replace('.lst', '.tree'))

        self.logger.info(
            'Creating curation lists and pseudo-trees of rogue GTDB representatives.'
        )
        out_file = os.path.join(self.output_dir,
                                f'gids_rogues.{self.domain}.lst')
        with open(out_file, 'w') as fout:
            for gid in rogue_gids:
                fout.write('{}\n'.format(gid))
        self.pseudo_tree(rogue_gids, out_file.replace('.lst', '.tree'))
Beispiel #9
0
    def parse_gtdb_metadata(self, gtdb_metadata):
        """Establish placeholder and Latin name stems for GTDB representatives.

        Parameters
        ----------
        gtdb_metadata : str
            Tab-separated GTDB metadata file with a header row. If falsy,
            nothing is read and two empty mappings are returned.

        Returns
        -------
        tuple
            (d[rid] -> set of candidate placeholder stems,
             d[rid] -> stem of NCBI genus name), both keyed by the canonical
            ID of the GTDB representative of each genome.
        """

        rep_placeholder_stems = defaultdict(set)
        rep_latin_stems = {}
        if not gtdb_metadata:
            return rep_placeholder_stems, rep_latin_stems

        self.logger.info('Reading GTDB metadata.')

        with open(gtdb_metadata) as fin:
            columns = fin.readline().strip().split('\t')

            gid_col = columns.index('formatted_accession')
            rep_col = columns.index('gtdb_genome_representative')
            strain_col = columns.index('ncbi_strain_identifiers')
            wgs_col = columns.index('ncbi_wgs_formatted')
            ncbi_taxonomy_col = columns.index('ncbi_taxonomy')
            org_name_col = columns.index('ncbi_organism_name')
            type_species_col = columns.index('gtdb_type_species_of_genus')

            for row in fin:
                fields = row.strip().split('\t')

                gid = fields[gid_col]

                rid = canonical_gid(fields[rep_col])
                if not rid:
                    # genome failed QC so has no GTDB representative
                    continue

                name_tail = fields[org_name_col].split()[-1]
                if any(ch.isdigit() or ch.isupper() for ch in name_tail):
                    # looks like a strain/genome designation that may have
                    # been used to form a GTDB taxon name
                    name_tail = name_tail.replace('_', '-').replace(':', '-')
                    upper_tail = name_tail.upper()
                    rep_placeholder_stems[rid].update((
                        name_tail,
                        upper_tail,
                        name_tail.capitalize(),
                        upper_tail[:15],
                        upper_tail[-15:],
                    ))

                    # long names with hyphens (underscores) were truncated
                    # from back to start
                    # (e.g. GW2011_GWF2_32_72 -> GWF2-32-72)
                    if len(name_tail) > 15:
                        parts = name_tail.split('-')
                        for start, part in enumerate(parts[1:], start=1):
                            if not part or not part[0].isalpha():
                                continue

                            candidate = '-'.join(parts[start:])
                            if len(candidate) < 15:
                                rep_placeholder_stems[rid].add(candidate)
                                rep_placeholder_stems[rid].add(
                                    candidate.upper())
                                break

                # normalise strain identifier: hyphen separators, only
                # alphanumerics and hyphens, capitalized, at most 12 characters
                strain_stem = fields[strain_col].replace('_', '-')
                strain_stem = strain_stem.replace(' ', '-')
                strain_stem = ''.join(
                    ch for ch in strain_stem if ch.isalnum() or ch == '-')
                strain_stem = strain_stem.capitalize()[:12]

                # There are names like f__GCA-2401445 which were derived
                # from G002401445
                trimmed_gid = 'G' + gid[1:].lstrip('0')

                rep_placeholder_stems[rid].update((
                    strain_stem,
                    strain_stem.upper(),
                    fields[wgs_col],
                    gid.replace('G', 'GCA-'),
                    gid.replace('G', 'GCF-'),
                    trimmed_gid.replace('G', 'GCA-'),
                    trimmed_gid.replace('G', 'GCF-'),
                ))

                is_type_species = fields[type_species_col].lower().startswith(
                    't')
                if gid == rid and is_type_species:
                    # add in stem of genus name. Ideally, we would derive the
                    # family, order, and class names from the genus name.
                    # However, this appears to be complicated as, for
                    # example, the family name of Aquifex is Aquificaceae.
                    # How does one know to insert a "ic" in this case?
                    ncbi_taxa = [
                        t.strip() for t in fields[ncbi_taxonomy_col].split(';')
                    ]
                    ncbi_genus = ncbi_taxa[Taxonomy.GENUS_INDEX]

                    # drop 3-character rank prefix and last 2 characters;
                    # does chopping the last 2 characters work in all cases?
                    rep_latin_stems[rid] = ncbi_genus[3:-2]

        self.logger.info(' - determined placeholder stems for {:,} genomes.'.format(len(rep_placeholder_stems)))
        self.logger.info(' - determined Latin stems for {:,} genomes.'.format(len(rep_latin_stems)))

        return rep_placeholder_stems, rep_latin_stems