def run(self, taxonomy_file, genome_list_file):
    """Add taxonomy to database.

    Imports the full taxonomy string into the 'gtdb_taxonomy' field and
    each individual rank into a 'gtdb_<rank>' field of the
    'metadata_taxonomy' table by invoking the 'gtdb' command-line tool.

    Parameters
    ----------
    taxonomy_file : str
        Taxonomy file, read with Taxonomy().read().
    genome_list_file : str
        Optional file restricting the genomes to import. The genome ID is
        the first tab-separated field of each line, or the first
        comma-separated field when the line contains no tab. All genomes
        in the taxonomy file are imported when this is empty or None.
    """

    # genomes to import (only consulted when a genome list file is given)
    genome_list = set()
    if genome_list_file:
        with open(genome_list_file) as f:
            for line in f:
                separator = '\t' if '\t' in line else ','
                genome_list.add(line.rstrip().split(separator)[0])

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    def selected_genomes():
        """Yield (genome ID, taxa) for each genome that should be imported."""
        for genome_id, taxa in taxonomy.iteritems():
            if genome_list_file and genome_id not in genome_list:
                continue
            yield genome_id, taxa

    def rank_value(taxa, index, rank):
        """Get the value to store for the rank at the given index."""
        rank_str = taxa[index]
        if rank == 'species':
            # ensure species name includes genus
            # NOTE(review): an empty species ('s__') under a named genus
            # becomes 's__<genus> ' here -- confirm this is intended
            genus = taxa[index - 1][3:]
            if genus not in taxa[index]:
                rank_str = 's__' + genus + ' ' + taxa[index][3:]
        return rank_str

    def import_field(field, rows):
        """Write rows to a temporary file and import it into the given field."""
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            for genome_id, value in rows:
                temp_file.write('%s\t%s\n' % (genome_id, value))
            # file must be flushed and closed before the external tool reads it
            temp_file.close()

            cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                'metadata_taxonomy', field, 'TEXT', temp_file.name)
            print(cmd)
            os.system(cmd)
        finally:
            # always remove the temporary file, even if writing or the import fails
            temp_file.close()
            os.remove(temp_file.name)

    # add full taxonomy string to database
    import_field('gtdb_taxonomy',
                 ((genome_id, ';'.join(taxa))
                  for genome_id, taxa in selected_genomes()))

    # add each taxonomic rank to database
    for i, rank in enumerate(Taxonomy.rank_labels):
        import_field('gtdb_' + rank,
                     ((genome_id, rank_value(taxa, i, rank))
                      for genome_id, taxa in selected_genomes()))
def run(self, taxonomy_file, genome_list):
    """Add taxonomy to database.

    Imports the full taxonomy string of each genome into the
    'ncbi_taxonomy_unfiltered' field of the 'metadata_taxonomy' table by
    invoking the 'gtdb' command-line tool. Genome IDs starting with 'GCA_'
    are prefixed with 'GB_' and those starting with 'GCF_' with 'RS_'
    before being matched against the genome list and written out.

    Parameters
    ----------
    taxonomy_file : str
        Taxonomy file, read with Taxonomy().read().
    genome_list : str
        Optional file restricting the genomes to import. The genome ID is
        the first tab-separated field of each line; lines starting with
        '#' are skipped. All genomes are imported when this is empty or
        None.
    """

    genomes_to_process = set()
    if genome_list:
        with open(genome_list) as f:
            for line in f:
                if line[0] == '#':
                    continue
                genomes_to_process.add(line.rstrip().split('\t')[0])

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    # add full taxonomy string to database
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        for genome_id, taxa in taxonomy.iteritems():
            if genome_id.startswith('GCA_'):
                genome_id = 'GB_' + genome_id
            elif genome_id.startswith('GCF_'):
                genome_id = 'RS_' + genome_id

            if not genome_list or genome_id in genomes_to_process:
                taxa_str = ';'.join(taxa)
                temp_file.write('%s\t%s\n' % (genome_id, taxa_str))

        # file must be flushed and closed before the external tool reads it
        temp_file.close()

        cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
            'metadata_taxonomy', 'ncbi_taxonomy_unfiltered', 'TEXT', temp_file.name)
        print(cmd)
        os.system(cmd)
    finally:
        # always remove the temporary file, even if writing or the import fails
        temp_file.close()
        os.remove(temp_file.name)
def run(self, taxonomy_file, genome_list_file):
    """Add taxonomy to database.

    Imports each taxonomic rank into a 'gtdb_<rank>' field of the
    'metadata_taxonomy' table by invoking the 'gtdb' command-line tool.

    Parameters
    ----------
    taxonomy_file : str
        Taxonomy file, read with Taxonomy().read().
    genome_list_file : str
        Optional file restricting the genomes to import. The genome ID is
        the first tab-separated field of each line, or the first
        comma-separated field when the line contains no tab. All genomes
        in the taxonomy file are imported when this is empty or None.
    """

    # genomes to import (only consulted when a genome list file is given)
    genome_list = set()
    if genome_list_file:
        with open(genome_list_file) as f:
            for line in f:
                separator = '\t' if '\t' in line else ','
                genome_list.add(line.rstrip().split(separator)[0])

    # read taxonomy file
    taxonomy = Taxonomy().read(taxonomy_file)

    # add each taxonomic rank to database
    for i, rank in enumerate(Taxonomy.rank_labels):
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            for genome_id, taxa in taxonomy.iteritems():
                if genome_list_file and genome_id not in genome_list:
                    continue
                temp_file.write('%s\t%s\n' % (genome_id, taxa[i]))

            # file must be flushed and closed before the external tool reads it
            temp_file.close()

            cmd = 'gtdb -r metadata import --table %s --field %s --type %s --metadatafile %s' % (
                'metadata_taxonomy', 'gtdb_' + rank, 'TEXT', temp_file.name)
            print(cmd)
            os.system(cmd)
        finally:
            # always remove the temporary file, even if writing or the import fails
            temp_file.close()
            os.remove(temp_file.name)
def pull(self, options):
    """Pull taxonomy strings from a tree and write them to file.

    Reads the taxonomy from options.input_tree, fills in missing ranks
    unless options.no_rank_fill is set, and writes the result to
    options.output_file.
    """

    tree_file = options.input_tree
    check_file_exists(tree_file)

    taxonomy = Taxonomy().read_from_tree(tree_file)

    fill_ranks = not options.no_rank_fill
    if fill_ranks:
        # only values of existing keys are replaced, so iterating is safe
        for taxon_id, taxa in taxonomy.iteritems():
            taxonomy[taxon_id] = Taxonomy().fill_missing_ranks(taxa)

    output_file = options.output_file
    Taxonomy().write(taxonomy, output_file)

    self.logger.info('Taxonomy strings written to: %s' % output_file)
def root(self, options):
    """Root tree using outgroup.

    The outgroup consists of every genome in the taxonomy read from
    Config.TAXONOMY_FILE whose taxa include options.outgroup_taxon. The
    rooted tree is written to options.output_tree.
    """

    self.logger.warning("Tree rooting is still under development!")

    check_file_exists(options.input_tree)

    gtdb_taxonomy = Taxonomy().read(Config.TAXONOMY_FILE)

    self.logger.info('Identifying genomes from the specified outgroup.')
    outgroup = set(genome_id
                   for genome_id, taxa in gtdb_taxonomy.iteritems()
                   if options.outgroup_taxon in taxa)

    RerootTree().root_with_outgroup(options.input_tree,
                                    options.output_tree,
                                    outgroup)

    self.logger.info('Done.')
def run(self, gene_dirs, min_per_gene, min_per_bps, tree_program, prot_model, split_chars, output_dir):
    """Infer concatenated gene tree.

    Parameters
    ----------
    gene_dirs : list
        GeneTreeTk output directories with information for individual genes.
    min_per_gene : float
        Minimum percentage of genes required to retain taxa.
    min_per_bps : float
        Minimum percentage of base pairs required to retain taxa.
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    split_chars : object
        Passed unchanged to self._split_ids() in order to split a sequence
        identifier into a taxon identifier and a gene identifier.
    output_dir : str
        Directory to store results.
    """

    # reject unsupported programs up front; otherwise the failure only
    # surfaces as a NameError once all intermediate files have been written
    if tree_program not in ('fasttree', 'raxml'):
        self.logger.error('Unsupported tree program: %s' % tree_program)
        sys.exit(-1)

    # read MSA files
    concat = defaultdict(lambda: defaultdict(list))
    msa_length = 0
    gene_lengths = {}
    for gene_dir in gene_dirs:
        homologs = os.path.join(gene_dir, 'homologs.trimmed.aligned.faa')

        # length of the last sequence read for this gene; sequences of a
        # gene are presumably all the same length as the file is an alignment
        gene_length = None
        for seq_id, seq in seq_io.read_seq(homologs):
            taxon_id, _gene_id = self._split_ids(seq_id, split_chars)
            if not taxon_id:
                self.logger.error('Failed to split identifier: %s' % seq_id)
                sys.exit(-1)

            concat[taxon_id][gene_dir].append(seq)
            gene_length = len(seq)

        if gene_length is None:
            # without this check an empty file would silently reuse the
            # length of the previous gene (or fail on an undefined name)
            self.logger.error('No sequences found in: %s' % homologs)
            sys.exit(-1)

        msa_length += gene_length
        gene_lengths[gene_dir] = gene_length

    # filter taxon
    max_missing_genes = len(gene_dirs) * (1.0 - float(min_per_gene) / 100.0)
    min_msa_len = msa_length * float(min_per_bps) / 100.0

    mc_filter = set()
    min_per_gene_filter = set()
    min_per_bps_filter = set()
    for taxon_id in concat:
        # check if multiple copy
        missing = 0
        taxon_msa_len = 0
        for gene_dir in gene_dirs:
            if gene_dir not in concat[taxon_id]:
                missing += 1
                continue

            if len(concat[taxon_id][gene_dir]) > 1:
                mc_filter.add(taxon_id)
                break

            taxon_msa_len += len(concat[taxon_id][gene_dir][0])

        if taxon_id not in mc_filter:
            if missing > max_missing_genes:
                min_per_gene_filter.add(taxon_id)
            elif taxon_msa_len < min_msa_len:
                min_per_bps_filter.add(taxon_id)

    min_req_genes = math.ceil(len(gene_dirs) * float(min_per_gene) / 100.0)

    filtered_taxa = mc_filter.union(min_per_gene_filter).union(
        min_per_bps_filter)
    remaining_taxa = set(concat) - filtered_taxa

    self.logger.info('No. genes: %d' % len(gene_dirs))
    self.logger.info('No. taxa across all genes: %d' % len(concat))
    self.logger.info('Total filtered taxa: %d' % len(filtered_taxa))
    self.logger.info(' Due to multi-copy genes: %d' % len(mc_filter))
    self.logger.info(' Due to having <%d of the genes: %d' %
                     (min_req_genes, len(min_per_gene_filter)))
    self.logger.info(' Due to an insufficient number of base pairs: %d' %
                     len(min_per_bps_filter))
    self.logger.info('Remaining taxa: %d' % len(remaining_taxa))
    self.logger.info('Length of concatenated MSA: %d' % msa_length)

    # create the multiple sequences alignment
    msa_file = os.path.join(output_dir, 'concatenated.faa')
    with open(msa_file, 'w') as fout:
        for taxon_id in remaining_taxa:
            msa_parts = []
            for gene_dir in gene_dirs:
                if gene_dir not in concat[taxon_id]:
                    # pad missing genes with gaps to keep columns aligned
                    msa_parts.append('-' * gene_lengths[gene_dir])
                else:
                    msa_parts.append(concat[taxon_id][gene_dir][0])

            fout.write('>%s\n' % taxon_id)
            fout.write('%s\n' % ''.join(msa_parts))

    # read all taxonomy files
    # (assumes taxonomy is the same for taxa across all genes)
    taxonomy = {}
    for gene_dir in gene_dirs:
        taxonomy_file = os.path.join(gene_dir, 'taxonomy.tsv')
        t = Taxonomy().read(taxonomy_file)
        for label, taxa_str in t.iteritems():
            taxon_id, _gene_id = self._split_ids(label, split_chars)
            taxonomy[taxon_id] = taxa_str

    # create taxonomy file for retained taxa
    self.logger.info('Creating taxonomy file for retained taxa.')
    output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    with open(output_taxonomy_file, 'w') as fout:
        for taxon_id in remaining_taxa:
            if taxon_id in taxonomy:  # query genomes will generally be missing
                fout.write('%s\t%s\n' %
                           (taxon_id, ';'.join(taxonomy[taxon_id])))

    # infer tree
    if tree_program == 'fasttree':
        self.logger.info(
            'Inferring gene tree with FastTree using %s+GAMMA.' % prot_model)
        fasttree = FastTree(multithreaded=(self.cpus > 1))

        tree_unrooted_output = os.path.join(output_dir,
                                            'concatenated.unrooted.tree')
        tree_log = os.path.join(output_dir, 'concatenated.tree.log')
        tree_output_log = os.path.join(output_dir, 'fasttree.log')
        fasttree.run(msa_file, 'prot', prot_model,
                     tree_unrooted_output, tree_log, tree_output_log)
    else:  # tree_program == 'raxml' (validated above)
        self.logger.info(
            'Inferring gene tree with RAxML using PROTGAMMA%s.' % prot_model)

        # create phylip MSA file; only the extension is swapped so a '.faa'
        # occurring in the output directory name is left untouched
        phylip_msa_file = os.path.splitext(msa_file)[0] + '.phyx'
        cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
        os.system(cmd)

        # run RAxML
        raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
        raxml = RAxML(self.cpus)
        tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                         raxml_dir)

    # root tree at midpoint
    self.logger.info('Rooting tree at midpoint.')
    tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)
    if len(remaining_taxa) > 2:
        tree.reroot_at_midpoint(update_bipartitions=False)

    tree_output = os.path.join(output_dir, 'concatenated.rooted.tree')
    tree.write_to_path(tree_output,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # create tax2tree consensus map and decorate tree
    t2t_tree = os.path.join(output_dir, 'concatenated.tax2tree.tree')
    cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                              tree_output, t2t_tree)
    os.system(cmd)

    # setup metadata for ARB file
    src_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(src_dir, 'VERSION')) as version_file:
        version = version_file.read().strip()

    metadata = {}
    metadata['genetreetk_version'] = version
    metadata['genetreetk_tree_program'] = tree_program
    metadata['genetreetk_tree_prot_model'] = prot_model

    # create ARB metadata file
    self.logger.info('Creating ARB metadata file.')
    arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
    self.create_arb_metadata(msa_file, taxonomy, metadata, arb_metadata_file)
def _filter_taxa_for_dist_inference(self, tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution.
    min_children : int
        Only consider taxa with at least the specified number of
        children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.

    Returns
    -------
    set
        Taxa to use for inferring the distribution.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for _taxon_id, taxa in taxonomy.iteritems():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.iteritems():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(
                species_name, require_full=True, require_prefix=True)
            if not valid:
                print('[Warning] Species name %s for %s is invalid: %s' % (
                    species_name, taxon_id, error_msg))
                continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set(taxon
                         for taxon, children_taxa in taxon_children.iteritems()
                         if len(children_taxa) >= min_children)
        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus be absent from the taxon_child dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        reported_missing_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)
            if not taxon_name:
                continue

            # test for absence explicitly: a support value of zero is a
            # real value and must be filtered, not mistaken for "missing"
            if support is None or support == '':
                # no support value, so inform user (once) if they were
                # trying to filter on this property
                if not reported_missing_support:
                    print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                    reported_missing_support = True
            elif float(support) < min_support:
                taxa_for_dist_inference.discard(taxon_name)

    # restrict taxa used for inferring distribution to the trusted set;
    # calling intersection on our own set accepts any iterable of taxa
    if trusted_taxa:
        taxa_for_dist_inference = taxa_for_dist_inference.intersection(
            trusted_taxa)

    return taxa_for_dist_inference
def tax_diff(self, tax1_file, tax2_file, include_user_taxa, output_dir):
    """Tabulate differences between two taxonomies.

    Writes '<tax1 name>.tax_diff.tsv' with one row per genome and rank
    that differs, and '<tax1 name>.tax_diff_summary.tsv' with per-rank
    counts and percentages. The per-rank counts are also printed.

    Parameters
    ----------
    tax1_file : str
        First taxonomy file.
    tax2_file : str
        Second taxonomy file.
    include_user_taxa : boolean
        Flag indicating if User genomes should be considered.
    output_dir : str
        Output directory.
    """

    tax1 = Taxonomy().read(tax1_file)
    tax2 = Taxonomy().read(tax2_file)

    if not include_user_taxa:
        # User genomes are identified by a 'U_' prefix
        tax1 = dict((genome_id, taxonomy)
                    for genome_id, taxonomy in tax1.iteritems()
                    if not genome_id.startswith('U_'))
        tax2 = dict((genome_id, taxonomy)
                    for genome_id, taxonomy in tax2.iteritems()
                    if not genome_id.startswith('U_'))

    common_taxa = set(tax1.keys()).intersection(tax2.keys())

    self.logger.info('First taxonomy contains %d taxa.' % len(tax1))
    self.logger.info('Second taxonomy contains %d taxa.' % len(tax2))
    self.logger.info('Taxonomies have %d taxa in common.' % len(common_taxa))

    # identify differences between taxonomies
    tax_file_name1 = os.path.splitext(os.path.basename(tax1_file))[0]
    tax_file_name2 = os.path.splitext(os.path.basename(tax2_file))[0]

    unchanged = defaultdict(int)          # T2 = g__Bob -> T1 = g__Bob, or T2 = g__ -> T1 = g__
    active_change = defaultdict(int)      # T2 = g__Bob -> T1 = g__Jane, or T2 = g__Bob -> T1 = g__Bob_A
    passive_change = defaultdict(int)     # T2 = g__??? -> T1 = g__Jane
    unresolved_change = defaultdict(int)  # T2 = g__Box -> T1 = g__???

    output_table = os.path.join(output_dir, '%s.tax_diff.tsv' % tax_file_name1)
    with open(output_table, 'w') as fout:
        fout.write('Genome ID\tChange\tRank\t%s\t%s\n' % (tax_file_name1, tax_file_name2))

        for genome_id in common_taxa:
            t1 = tax1[genome_id]
            t2 = tax2[genome_id]

            for rank, (taxon1, taxon2) in enumerate(zip(t1, t2)):
                if taxon1 == taxon2:
                    unchanged[rank] += 1
                    continue

                # a taxon equal to the bare rank prefix is unassigned
                unassigned = Taxonomy.rank_prefixes[rank]
                if taxon1 != unassigned and taxon2 != unassigned:
                    change = 'active'
                    active_change[rank] += 1
                elif taxon2 == unassigned:
                    change = 'passive'
                    passive_change[rank] += 1
                else:
                    # only remaining case: taxon1 is unassigned, taxon2 is not
                    change = 'unresolved'
                    unresolved_change[rank] += 1

                fout.write('%s\t%s\t%s\t%s\t%s\n' % (genome_id,
                                                     change,
                                                     Taxonomy.rank_labels[rank],
                                                     ';'.join(t1),
                                                     ';'.join(t2)))

    # report results
    output_table = os.path.join(output_dir, '%s.tax_diff_summary.tsv' % tax_file_name1)
    with open(output_table, 'w') as fout:
        fout.write('Rank\tUnchanged\tUnchanged (%)\tActive\t Active (%)\tPassive\tPassive (%)\tUnresolved\tUnresolved (%)\n')
        print('Rank\tUnchanged\tActive\tPassive\tUnresolved\tTotal')
        for rank in range(len(Taxonomy.rank_prefixes)):
            total = unchanged[rank] + active_change[rank] + passive_change[rank] + unresolved_change[rank]
            if total != 0:
                # ranks without any compared taxa are omitted to avoid dividing by zero
                fout.write('%s\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n' % (
                    Taxonomy.rank_labels[rank],
                    unchanged[rank], unchanged[rank] * 100.0 / total,
                    active_change[rank], active_change[rank] * 100.0 / total,
                    passive_change[rank], passive_change[rank] * 100.0 / total,
                    unresolved_change[rank], unresolved_change[rank] * 100.0 / total))
                print('%s\t%d\t%d\t%d\t%d\t%d' % (Taxonomy.rank_labels[rank],
                                                  unchanged[rank],
                                                  active_change[rank],
                                                  passive_change[rank],
                                                  unresolved_change[rank],
                                                  total))
def filter_taxa_for_dist_inference(tree, taxonomy, trusted_taxa, min_children, min_support):
    """Determine taxa to use for inferring distribution of relative divergences.

    Parameters
    ----------
    tree : Dendropy Tree
        Phylogenetic tree.
    taxonomy : d[taxon ID] -> [d__x; p__y; ...]
        Taxonomy for each taxon.
    trusted_taxa : iterable
        Trusted taxa to consider when inferring distribution.
    min_children : int
        Only consider taxa with at least the specified number of
        children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.

    Returns
    -------
    set
        Taxa to use for inferring the distribution.
    """

    # determine children taxa for each named group
    taxon_children = Taxonomy().taxon_children(taxonomy)

    # get all named groups
    taxa_for_dist_inference = set()
    for _taxon_id, taxa in taxonomy.iteritems():
        taxa_for_dist_inference.update(taxa)

    # sanity check species names as these are a common problem
    species_index = Taxonomy.rank_index['s__']
    species = set()
    for taxon_id, taxa in taxonomy.iteritems():
        if len(taxa) <= species_index:
            continue

        species_name = taxa[species_index]
        if species_name != 's__':
            valid, error_msg = Taxonomy().validate_species_name(species_name,
                                                                require_full=True,
                                                                require_prefix=True)
            if not valid:
                print('[Warning] Species name %s for %s is invalid: %s' % (species_name, taxon_id, error_msg))
                continue

        species.add(species_name)

    # restrict taxa to those with a sufficient number of named children
    # Note: a taxonomic group with no children will not end up in the
    # taxon_children data structure so care must be taken when applying
    # this filtering criteria.
    if min_children > 0:
        valid_taxa = set(taxon
                         for taxon, children_taxa in taxon_children.iteritems()
                         if len(children_taxa) >= min_children)
        taxa_for_dist_inference.intersection_update(valid_taxa)

        # explicitly add in the species since they have no
        # children and thus be absent from the taxon_child dictionary
        taxa_for_dist_inference.update(species)

    # restrict taxa used for inferring distribution to those with sufficient support
    if min_support > 0:
        reported_missing_support = False
        for node in tree.preorder_node_iter():
            if not node.label or node.is_leaf():
                continue

            # check for support value
            support, taxon_name, _auxiliary_info = parse_label(node.label)
            if not taxon_name:
                continue

            # test for absence explicitly: a support value of zero is a
            # real value and must be filtered, not mistaken for "missing"
            if support is None or support == '':
                # no support value, so inform user (once) if they were
                # trying to filter on this property
                if not reported_missing_support:
                    print('[Error] Tree does not contain support values. As such, --min_support should be set to 0.')
                    reported_missing_support = True
            elif float(support) < min_support:
                taxa_for_dist_inference.discard(taxon_name)

    # restrict taxa used for inferring distribution to the trusted set;
    # calling intersection on our own set accepts any iterable of taxa
    if trusted_taxa:
        taxa_for_dist_inference = taxa_for_dist_inference.intersection(trusted_taxa)

    return taxa_for_dist_inference