def manual_species(self, init_taxonomy, manually_curated_tree):
        """Identify species names manually set by curators.

        The specific epithet of each genome in the initial taxonomy is
        compared with the epithet of the species decorated on the
        manually-curated tree. Every genome whose epithet differs is written
        to 'manual_species_names.tsv' in self.output_dir.

        Parameters
        ----------
        init_taxonomy : str
            Taxonomy file read with Taxonomy().read().
        manually_curated_tree : str
            Path to the Newick tree holding the manually-curated taxonomy.
        """

        # read initial and manually curated taxonomy
        self.logger.info('Reading initial species names.')
        init_taxonomy = Taxonomy().read(init_taxonomy, use_canonical_gid=True)

        # genome IDs prefixed with 'D-' are excluded from all counts and output
        init_num_gids = sum(1 for gid in init_taxonomy
                            if not gid.startswith('D-'))
        self.logger.info(
            ' - read taxonomy for {:,} genomes.'.format(init_num_gids))

        self.logger.info('Reading manually-curated species names from tree.')
        mc_tree = dendropy.Tree.get_from_path(manually_curated_tree,
                                              schema='newick',
                                              rooting='force-rooted',
                                              preserve_underscores=True)
        mc_taxonomy = Taxonomy().read_from_tree(mc_tree)

        # specific epithet assigned to each genome in the curated tree
        mc_specific = {}
        for gid, taxa in mc_taxonomy.items():
            if gid.startswith('D-'):
                continue

            # the last taxon must be a non-empty species assignment
            mc_sp = taxa[-1]
            if not mc_sp.startswith('s__') or mc_sp == 's__':
                self.logger.error(
                    'Most specific classification for {} is {}.'.format(
                        gid, taxa))
                continue

            mc_specific[gid] = specific_epithet(mc_sp)

        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_specific)))

        # report genomes with modified specific name assignment
        self.logger.info(
            'Identifying genomes with manually-curated species names.')
        out_path = os.path.join(self.output_dir, 'manual_species_names.tsv')
        num_mc = 0

        # context manager guarantees the table is closed even if a lookup fails
        with open(out_path, 'w') as fout:
            fout.write(
                'Genome ID\tInitial species\tManually-curated species\n')
            for gid, mc_sp in mc_specific.items():
                init_species = init_taxonomy[gid][Taxonomy.SPECIES_INDEX]
                init_specific = specific_epithet(init_species)

                if init_specific != mc_sp:
                    # rebuild the binomial from the curated genus and epithet
                    mc_generic = mc_taxonomy[gid][
                        Taxonomy.GENUS_INDEX].replace('g__', '')
                    mc_species = 's__{} {}'.format(mc_generic, mc_sp)
                    num_mc += 1
                    fout.write('{}\t{}\t{}\n'.format(gid, init_species,
                                                     mc_species))

        self.logger.info(
            ' - identified {:,} manually-curated species names.'.format(
                num_mc))
Esempio n. 2
0
    def pull(self, options):
        """Pull taxonomy strings out of a tree and write them to file.

        Reads options.input_tree, options.no_rank_fill and
        options.output_file. Unless rank filling is disabled, each taxonomy
        is passed through Taxonomy().fill_missing_ranks() before writing.
        """
        check_file_exists(options.input_tree)

        tree_taxonomy = Taxonomy().read_from_tree(options.input_tree)

        fill_ranks = not options.no_rank_fill
        if fill_ranks:
            # only existing keys are reassigned, so the mapping keeps its size
            # while it is being iterated
            for taxon_id, rank_list in tree_taxonomy.items():
                filled = Taxonomy().fill_missing_ranks(rank_list)
                tree_taxonomy[taxon_id] = filled

        Taxonomy().write(tree_taxonomy, options.output_file)

        msg = 'Taxonomy strings written to: %s' % options.output_file
        self.logger.info(msg)
Esempio n. 3
0
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution. Any iterable
        is accepted; an empty or None value disables this filter.
    min_children : int
        Only consider taxa with at least the specified number of children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when inferring distribution.

    Returns
    -------
    set
        Names of the taxa passing all requested filters.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for taxa in taxonomy.values():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.items():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        valid, error_msg = True, None
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(species_name, require_full=True, require_prefix=True)
        if not valid:
            print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
            continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = {taxon
                      for taxon, children_taxa in taxon_children.items()
                      if len(children_taxa) >= min_children}

        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no children
        # and are thus absent from the taxon_children dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)

            if not taxon_name:
                continue

            # A support of 0 is a real (and failing) value, so test for a
            # missing value explicitly rather than relying on truthiness.
            if support is None or support == '':
                # no support value, so inform user if they were trying to filter on this property
                print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
            elif float(support) < min_support:
                taxa_for_dist_inference.discard(taxon_name)

    # restrict taxa used for inferring distribution to the trusted set
    if trusted_taxa:
        # set() lets any iterable be supplied, as the docstring promises
        taxa_for_dist_inference = set(trusted_taxa).intersection(taxa_for_dist_inference)

    return taxa_for_dist_inference
    def run(self, manual_taxonomy, cur_gtdb_metadata_file, qc_passed_file,
            ncbi_genbank_assembly_file, untrustworthy_type_file,
            gtdb_type_strains_ledger, sp_priority_ledger,
            genus_priority_ledger, ncbi_env_bioproject_ledger, lpsn_gss_file):
        """Report type species genomes with incongruent GTDB genus assignments.

        For every genome in the manually-curated taxonomy that is a GTDB type
        species, the GTDB genus is compared with the NCBI genus. Genomes
        where the two differ and the GTDB genus is not the one with priority
        are written to 'type_species_incongruencies.tsv' in self.output_dir.

        manual_taxonomy is read with Taxonomy().read(); the remaining
        arguments are passed to SpeciesPriorityManager or to
        Genomes.load_from_metadata_file().
        """

        # initialize species priority manager
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  lpsn_gss_file,
                                                  self.output_dir)

        # read manually-curated taxonomy
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_taxonomy)))

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file,
            ncbi_env_bioproject_ledger=ncbi_env_bioproject_ledger)
        self.logger.info(
            f' - current genome set contains {len(cur_genomes):,} genomes.')

        # identify type species genomes whose GTDB genus lacks priority
        self.logger.info(
            'Identifying type species genomes with incongruent GTDB genus assignments.'
        )
        out_path = os.path.join(self.output_dir,
                                'type_species_incongruencies.tsv')
        num_incongruent = 0

        # context manager guarantees the table is closed even on error
        with open(out_path, 'w') as fout:
            fout.write(
                'Genome ID\tGTDB genus\tNCBI genus\tGTDB genus priority date\tNCBI genus priority date\tPriority status\tNCBI RefSeq note\n'
            )
            for rid, taxa in mc_taxonomy.items():
                genome = cur_genomes[rid]
                if not genome.is_gtdb_type_species():
                    continue

                gtdb_genus = taxa[Taxonomy.GENUS_INDEX]
                ncbi_genus = genome.ncbi_taxa.genus
                if gtdb_genus == ncbi_genus:
                    continue

                priority_genus = sp_priority_mngr.genus_priority(
                    gtdb_genus, ncbi_genus)
                if priority_genus == gtdb_genus:
                    # GTDB genus has priority so assignment is acceptable
                    continue

                num_incongruent += 1
                if priority_genus == ncbi_genus:
                    priority_status = 'NCBI genus name has priority'
                else:
                    priority_status = 'Genus with priority must be manually established'

                fout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    rid, gtdb_genus, ncbi_genus,
                    sp_priority_mngr.genus_priority_year(gtdb_genus),
                    sp_priority_mngr.genus_priority_year(ncbi_genus),
                    priority_status,
                    genome.excluded_from_refseq_note))

        self.logger.info(
            ' - identified {:,} genomes with incongruent genus assignments.'.
            format(num_incongruent))
    def run(self, manual_taxonomy, cur_gtdb_metadata_file, uba_genome_paths,
            qc_passed_file, ncbi_genbank_assembly_file,
            untrustworthy_type_file, synonym_file, gtdb_type_strains_ledger,
            sp_priority_ledger, genus_priority_ledger, dsmz_bacnames_file):
        """Report type strain genomes with incongruent GTDB species assignments.

        For every genome in the manually-curated taxonomy that is an
        effective type strain, the specific epithet of the GTDB species is
        compared with that of the NCBI species. Genomes failing
        test_same_epithet() are written to 'type_strains_incongruencies.tsv'
        in self.output_dir.

        manual_taxonomy is read with Taxonomy().read(); the remaining
        arguments are passed to SpeciesPriorityManager or to
        Genomes.load_from_metadata_file(). synonym_file is accepted but not
        used by this method.
        """

        # initialize species priority manager
        # NOTE(review): the manager is constructed but not otherwise used in
        # this method; kept in case construction has required side effects.
        sp_priority_mngr = SpeciesPriorityManager(sp_priority_ledger,
                                                  genus_priority_ledger,
                                                  dsmz_bacnames_file)

        # read manually-curated taxonomy
        self.logger.info('Parsing manually curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)
        self.logger.info(' - read taxonomy for {:,} genomes.'.format(
            len(mc_taxonomy)))

        # create current GTDB genome sets
        self.logger.info('Creating current GTDB genome set.')
        cur_genomes = Genomes()
        cur_genomes.load_from_metadata_file(
            cur_gtdb_metadata_file,
            gtdb_type_strains_ledger=gtdb_type_strains_ledger,
            create_sp_clusters=False,
            uba_genome_file=uba_genome_paths,
            qc_passed_file=qc_passed_file,
            ncbi_genbank_assembly_file=ncbi_genbank_assembly_file,
            untrustworthy_type_ledger=untrustworthy_type_file)
        self.logger.info(
            f' ... current genome set contains {len(cur_genomes):,} genomes.')

        # get all GTDB species represented by a type strain:
        gtdb_type_species = {
            taxa[Taxonomy.SPECIES_INDEX]
            for rid, taxa in mc_taxonomy.items()
            if cur_genomes[rid].is_effective_type_strain()
        }

        # identify type strain genomes whose GTDB and NCBI epithets disagree
        self.logger.info(
            'Identifying type strain genomes with incongruent GTDB species assignments.'
        )
        out_path = os.path.join(self.output_dir,
                                'type_strains_incongruencies.tsv')
        num_incongruent = 0

        # context manager guarantees the table is closed even on error
        with open(out_path, 'w') as fout:
            fout.write(
                'Genome ID\tGTDB species\tNCBI species\tGTDB type strain\tNCBI type strain\tNCBI RefSeq note\n'
            )
            for rid, taxa in mc_taxonomy.items():
                genome = cur_genomes[rid]
                if not genome.is_effective_type_strain():
                    continue

                ncbi_sp = genome.ncbi_taxa.species
                if ncbi_sp == 's__':
                    # NCBI taxonomy is sometimes behind the genome annotation pages,
                    # and does not have a species assignment even for type strain genomes
                    continue

                gtdb_sp = taxa[Taxonomy.SPECIES_INDEX]
                gtdb_generic = generic_name(gtdb_sp)
                ncbi_generic = generic_name(ncbi_sp)

                # check if genome is a valid genus transfer into a genus
                # that already contains a species with the specific
                # name which results in a polyphyletic suffix being required
                # e.g. G002240355 is Prauserella marina at NCBI and is
                # transferred into Saccharomonospora under the GTDB. However,
                # Saccharomonospora marina already exists so this genome
                # needs to be S. marina_A.
                if (is_placeholder_taxon(gtdb_sp)
                        and gtdb_generic != ncbi_generic
                        and canonical_species(gtdb_sp) in gtdb_type_species):
                    continue

                if not test_same_epithet(specific_epithet(gtdb_sp),
                                         specific_epithet(ncbi_sp)):
                    num_incongruent += 1
                    fout.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        rid, gtdb_sp, ncbi_sp,
                        genome.is_gtdb_type_strain(),
                        genome.is_ncbi_type_strain(),
                        genome.excluded_from_refseq_note))

        self.logger.info(
            ' - identified {:,} genomes with incongruent species assignments.'.
            format(num_incongruent))
Esempio n. 6
0
    def propagate(self, options):
        """Propagate labels to all genomes in a cluster.

        Reads options.input_taxonomy, options.metadata_file and the optional
        options.uba_mapping_file, and writes one 'genome ID<TAB>taxonomy'
        line per representative and per clustered genome to
        options.output_taxonomy.

        The process exits with status -1 if a GTDB representative is missing
        from the input taxonomy, or if the input taxonomy contains a genome
        that is not a GTDB representative.
        """

        check_file_exists(options.input_taxonomy)
        check_file_exists(options.metadata_file)

        # optional two-column mapping used to rename genome IDs
        user_to_uba = {}
        if options.uba_mapping_file:
            self.logger.info('Parsing genome ID mapping file.')
            with open(options.uba_mapping_file) as f:
                for line in f:
                    tokens = line.strip().split('\t')
                    if len(tokens) == 2:
                        user_to_uba[tokens[0]] = tokens[1]
            self.logger.info(' - found mappings for {:,} genomes.'.format(
                len(user_to_uba)))

        # get representative genome information
        rep_metadata = read_gtdb_metadata(
            options.metadata_file,
            ['gtdb_representative', 'gtdb_clustered_genomes'])

        # key metadata by canonical genome ID, then apply the ID mapping
        rep_metadata = {
            canonical_gid(gid): values
            for gid, values in rep_metadata.items()
        }

        rep_metadata = {
            user_to_uba.get(gid, gid): values
            for gid, values in rep_metadata.items()
        }

        explict_tax = Taxonomy().read(options.input_taxonomy)

        self.logger.info(f' - identified {len(rep_metadata):,} genomes')

        # sanity check all representatives have a taxonomy string
        rep_count = 0
        for gid, values in rep_metadata.items():
            is_rep_genome, _clustered_genomes = values
            if is_rep_genome:
                rep_count += 1
                if gid not in explict_tax:
                    self.logger.error(
                        'Expected to find {} in input taxonomy as it is a GTDB representative.'
                        .format(gid))
                    sys.exit(-1)

        self.logger.info(
            'Identified {:,} representatives in metadata file and {:,} genomes in input taxonomy file.'
            .format(rep_count, len(explict_tax)))

        # propagate taxonomy to genomes clustered with each representative;
        # the context manager ensures the output is flushed and closed, also
        # when sys.exit() is raised part way through
        with open(options.output_taxonomy, 'w') as fout:
            for rid, taxon_list in explict_tax.items():
                taxonomy_str = ';'.join(taxon_list)
                rid = canonical_gid(rid)
                rid = user_to_uba.get(rid, rid)

                is_rep_genome, clustered_genomes = rep_metadata[rid]
                if not is_rep_genome:
                    self.logger.error(
                        'Did not expected to find {} in input taxonomy as it is not a GTDB representative.'
                        .format(rid))
                    sys.exit(-1)

                # assign taxonomy to representative and all genomes in the cluster
                fout.write('{}\t{}\n'.format(rid, taxonomy_str))
                for cid in clustered_genomes.split(';'):
                    cid = canonical_gid(cid.strip())
                    cid = user_to_uba.get(cid, cid)
                    if cid == rid:
                        continue

                    if cid in rep_metadata:
                        fout.write('{}\t{}\n'.format(cid, taxonomy_str))
                    else:
                        self.logger.warning(
                            'Skipping {} as it is not in GTDB metadata file.'
                            .format(cid))

        self.logger.info('Taxonomy written to: {}'.format(
            options.output_taxonomy))
Esempio n. 7
0
    def run(self,
            input_taxonomy,
            genome_path_file,
            metadata_file,
            max_genomes,
            min_comp,
            max_cont,
            min_quality,
            max_contigs,
            min_N50,
            max_ambiguous,
            max_gap_length,
            output_dir):
        """Calculate ANI for named species.

        Genomes failing the quality filter are removed from the taxonomy,
        and every species that still has more than one genome is queued for
        processing by self.cpus worker processes. A single writer process
        receives their results.

        Parameters
        ----------
        input_taxonomy : str
            Taxonomy file read with Taxonomy().read().
        genome_path_file : str
            Tab-separated file with a genome ID in the first column and the
            genome directory in the second.
        metadata_file : str
            Metadata file passed to filter_genomes() and to each worker.
        max_genomes : int
            Passed unchanged to each worker.
        min_comp, max_cont, min_quality, max_contigs, min_N50,
        max_ambiguous, max_gap_length
            Filtering criteria passed unchanged to filter_genomes().
        output_dir : str
            Passed unchanged to the writer process.

        Raises
        ------
        BaseException
            Any error (including KeyboardInterrupt) raised while the child
            processes are being run is re-raised after they are terminated.
        """

        # get genomes passing filtering criteria
        filtered_genome_ids = filter_genomes(metadata_file,
                                             min_comp,
                                             max_cont,
                                             min_quality,
                                             max_contigs,
                                             min_N50,
                                             max_ambiguous,
                                             max_gap_length)

        # get genomes in each named species, ignoring filtered genomes
        taxonomy = Taxonomy().read(input_taxonomy)
        genome_ids_to_remove = set(taxonomy.keys()) - filtered_genome_ids
        for genome_id in genome_ids_to_remove:
            del taxonomy[genome_id]

        named_species = Taxonomy().extant_taxa_for_rank('species', taxonomy)

        # get path to nucleotide files
        nt_files = {}
        with open(genome_path_file) as f:
            for line in f:
                if not line.strip():
                    # tolerate blank lines, e.g. a trailing newline
                    continue

                line_split = line.strip().split('\t')

                gtdb_id = line_split[0]
                genome_id = gtdb_id.replace('GB_', '').replace('RS_', '')

                genome_dir = line_split[1]

                nt_file = os.path.join(genome_dir, 'prodigal',
                                       genome_id + '_protein.fna')
                nt_files[gtdb_id] = nt_file

        # populate worker queue with data to process
        worker_queue = mp.Queue()
        writer_queue = mp.Queue()

        num_species = 0
        for species, genome_ids in named_species.items():
            if len(genome_ids) > 1:
                worker_queue.put((species, genome_ids))
                num_species += 1

        # one (None, None) entry per worker; presumably the stop signal
        # consumed by __worker -- confirm against its implementation
        for _ in range(self.cpus):
            worker_queue.put((None, None))

        # defined before the try block so the cleanup handler can always
        # refer to them, even if process creation itself fails
        worker_proc = []
        write_proc = None
        try:
            worker_proc = [mp.Process(target=self.__worker,
                                      args=(metadata_file,
                                            nt_files,
                                            max_genomes,
                                            worker_queue,
                                            writer_queue))
                           for _ in range(self.cpus)]
            write_proc = mp.Process(target=self.__writer,
                                    args=(num_species,
                                          output_dir,
                                          writer_queue))

            write_proc.start()

            for p in worker_proc:
                p.start()

            for p in worker_proc:
                p.join()

            writer_queue.put((None, None, None, None, None))
            write_proc.join()
        except BaseException:
            # stop any child that is still running, then let the error
            # propagate instead of silently reporting success
            for p in worker_proc:
                if p.is_alive():
                    p.terminate()

            if write_proc is not None and write_proc.is_alive():
                write_proc.terminate()

            raise
    def replace_generic(self, manual_species_names, manual_taxonomy):
        """Replace generic names with genus assignment.

        Each genome's species name is rebuilt from its genus assignment and
        the last whitespace-separated token of its current species name. The
        updated taxonomy is written to 'taxonomy_updated_sp.tsv' in
        self.output_dir.

        Parameters
        ----------
        manual_species_names : str
            Tab-separated file with a header line, genome IDs in the first
            column and manually-curated species names in the third.
        manual_taxonomy : str
            Taxonomy file read with Taxonomy().read().

        Raises
        ------
        RuntimeError
            If a genome has no genus assignment.
        """

        # read manually-curated species names
        self.logger.info('Reading manually-curated species names.')
        mc_species = {}
        with open(manual_species_names) as f:
            f.readline()  # skip header

            for line in f:
                tokens = line.strip().split('\t')
                mc_species[tokens[0]] = tokens[2]
        self.logger.info(
            ' - read manually-curated species for {:,} genomes.'.format(
                len(mc_species)))

        # read manual taxonomy file
        self.logger.info('Reading manually-curated taxonomy.')
        mc_taxonomy = Taxonomy().read(manual_taxonomy, use_canonical_gid=True)

        # genome IDs prefixed with 'D-' are excluded from counts and output
        mc_num_gids = sum(1 for gid in mc_taxonomy
                          if not gid.startswith('D-'))
        self.logger.info(
            ' - read taxonomy for {:,} genomes.'.format(mc_num_gids))

        # replace generic names with genus names
        self.logger.info('Creating taxonomy file with updated species names.')
        out_path = os.path.join(self.output_dir, 'taxonomy_updated_sp.tsv')
        num_genomes = 0

        # context manager guarantees the file is closed when an error is raised
        with open(out_path, 'w') as fout:
            for gid, taxa in mc_taxonomy.items():
                if gid.startswith('D-'):
                    continue

                genus = taxa[Taxonomy.GENUS_INDEX]
                generic = genus.replace('g__', '')
                if not generic:
                    msg = 'Genome is missing genus assignment: {}'.format(gid)
                    self.logger.error(msg)
                    # a bare `raise` here has no active exception and only
                    # yields an uninformative RuntimeError; raise the same
                    # exception type explicitly with a useful message
                    raise RuntimeError(msg)

                species = taxa[Taxonomy.SPECIES_INDEX]

                # flag curated genomes whose species name lacks the genus name
                if (gid in mc_species and generic not in species
                        and species != 's__'):
                    self.logger.error(
                        'Genus assignment does not agree with manually-curated species assignment: {} {} {}'
                        .format(gid, mc_species[gid],
                                '; '.join(mc_taxonomy[gid])))

                sp_tokens = species.split()
                if len(sp_tokens) < 2:
                    self.logger.error(
                        'Species name appear to be erroneous: {} {}'.format(
                            gid, species))
                    specific = '<unassigned>'
                else:
                    specific = sp_tokens[-1]

                final_sp = 's__{} {}'.format(generic, specific)
                taxa[Taxonomy.SPECIES_INDEX] = final_sp
                fout.write('{}\t{}\n'.format(gid, ';'.join(taxa)))

                num_genomes += 1

        self.logger.info(' - processed {:,} genomes.'.format(num_genomes))
Esempio n. 9
0
   def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
       """Tabulate differences between two taxonomies.

       For every genome present in both taxonomies, the taxa are compared
       rank by rank and each difference is classified as an active, passive
       or unresolved change. Individual changes are written to
       '<tax1 base name>.tax_diff.tsv' and per-rank totals to
       '<tax1 base name>.tax_diff_summary.tsv' in the output directory; the
       per-rank counts are also printed to stdout.
       
       Parameters
       ----------
       tax1_file : str
           First taxonomy file.
       tax2_file : str
           Second taxonomy file.
       include_user_taxa : boolean
           Flag indicating if User genomes should be considered.
       output_dir : str
           Output directory.
       """
       
       tax1 = Taxonomy().read(tax1_file)
       tax2 = Taxonomy().read(tax2_file)
       
       # drop genomes whose ID starts with 'U_' unless they were requested
       if not include_user_taxa:
           new_tax1 = {}
           for genome_id, taxonomy in tax1.items():
               if not genome_id.startswith('U_'):
                   new_tax1[genome_id] = taxonomy
           tax1 = new_tax1
           
           new_tax2 = {}
           for genome_id, taxonomy in tax2.items():
               if not genome_id.startswith('U_'):
                   new_tax2[genome_id] = taxonomy
           tax2 = new_tax2
       
       # only genomes present in both taxonomies can be compared
       common_taxa = set(tax1.keys()).intersection(list(tax2.keys()))
       
       self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
       self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
       self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))
       
       # identify differences between taxonomies
       # (output files are named after the first taxonomy file)
       tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
       tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]
       output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)
       
       fout = open(output_table, 'w')
       fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))
       
       # per-rank counters, keyed by rank index (T1 = first taxonomy, T2 = second taxonomy)
       unchanged = defaultdict(int)           # taxa identical: T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
       active_change = defaultdict(int)       # both named but different: T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
       passive_change = defaultdict(int)      # only T1 named: T2 = g__ -> T1 = g__Jane
       unresolved_change = defaultdict(int)   # only T2 named: T2 = g__Bob -> T1 = g__
       for taxa in common_taxa:
           t1 = tax1[taxa]
           t2 = tax2[taxa]
           
           # zip() stops at the shorter taxonomy, so extra trailing ranks are ignored
           for rank, (taxon1, taxon2) in enumerate(list(zip(t1, t2))):
               if taxon1 == taxon2:
                   unchanged[rank] += 1
               elif taxon1 != Taxonomy.rank_prefixes[rank] and taxon2 != Taxonomy.rank_prefixes[rank]:
                   # both taxonomies name the taxon, but differently
                   active_change[rank] += 1
                   fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa, 'active', Taxonomy.rank_labels[rank], ';'.join(t1), ';'.join(t2)))
               elif taxon2 == Taxonomy.rank_prefixes[rank]:
                   # second taxonomy has only the bare rank prefix
                   passive_change[rank] += 1
                   fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa, 'passive', Taxonomy.rank_labels[rank], ';'.join(t1), ';'.join(t2)))
               elif taxon1 == Taxonomy.rank_prefixes[rank]:
                   # first taxonomy has only the bare rank prefix
                   unresolved_change[rank] += 1
                   fout.write('%s\t%s\t%s\t%s\t%s\n' % (taxa, 'unresolved', Taxonomy.rank_labels[rank], ';'.join(t1), ';'.join(t2)))
                   
       fout.close()
 
       # report results
       output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)
       
       # NOTE(review): no close() for this second handle is visible in this
       # section of the file -- confirm that it is closed after the loop.
       fout = open(output_table, 'w')
       fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
       print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')
       for rank in range(0, len(Taxonomy.rank_prefixes)):
           total = unchanged[rank] + active_change[rank] + passive_change[rank] + unresolved_change[rank]
           # ranks with no compared taxa are skipped, which also avoids dividing by zero
           if total != 0:
               fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' %
                                   (Taxonomy.rank_labels[rank],
                                   unchanged[rank], unchanged[rank] * 100.0 / total,
                                   active_change[rank], active_change[rank] * 100.0 / total,
                                   passive_change[rank], passive_change[rank] * 100.0 / total,
                                   unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
               print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                                   unchanged[rank],
                                                   active_change[rank],
                                                   passive_change[rank],
                                                   unresolved_change[rank],
                                                   total))