def append(self, options):
    """Append command"""

    check_file_exists(options.input_tree)
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy().read(options.input_taxonomy)

    tree = dendropy.Tree.get_from_path(options.input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    for n in tree.leaf_node_iter():
        taxa_str = taxonomy.get(n.taxon.label, None)
        if taxa_str is None:
            self.logger.error('Taxonomy file does not contain an entry for %s.' % n.taxon.label)
            sys.exit(-1)
        n.taxon.label = n.taxon.label + '|' + '; '.join(taxa_str)

    tree.write_to_path(options.output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    self.logger.info('Decorated tree written to: %s' % options.output_tree)
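# Usage sketch (illustrative only): `append` expects a TSV taxonomy file
# mapping each leaf label to a GreenGenes-style taxonomy string and rewrites
# each leaf as '<label>|<rank_1>; <rank_2>; ...'. The Namespace below is a
# hypothetical stand-in for the argparse options object this command
# normally receives:
#
# from argparse import Namespace
# options = Namespace(input_tree='genomes.tree',
#                     input_taxonomy='taxonomy.tsv',
#                     output_tree='genomes.decorated.tree')
# cli.append(options)  # `cli`: an instance of this class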
def clean_ftp(self, new_list_genomes, ftp_genome_dir_file, ftp_genome_dir,
              report_dir, taxonomy_file=None):
    """Remove genomes absent from the new release and report added/deleted genomes."""

    list_of_files = new_list_genomes.split(',')
    genome_in_new_rel = []
    make_sure_path_exists(report_dir)
    for new_genome_file in list_of_files:
        with open(new_genome_file, 'r') as ngf:
            for line in ngf:
                genome_in_new_rel.append(line.strip().split('\t')[0])

    # read taxonomy file
    taxonomy = {}
    if taxonomy_file is not None:
        taxonomy = Taxonomy().read(taxonomy_file)

    current_ftp_genomes = {}
    with open(ftp_genome_dir_file) as fgdf:
        for line in fgdf:
            infos = line.strip().split('\t')
            current_ftp_genomes[infos[0]] = infos[1]

    deleted_genomes = list(set(current_ftp_genomes.keys()) - set(genome_in_new_rel))
    added_genomes = list(set(genome_in_new_rel) - set(current_ftp_genomes.keys()))

    deleted_genome_file = open(os.path.join(report_dir, 'deleted_genomes.tsv'), 'w')
    added_genome_file = open(os.path.join(report_dir, 'added_genomes.tsv'), 'w')

    print('{} genomes have been deleted in the release'.format(len(deleted_genomes)))
    print('{} genomes have been added in the release'.format(len(added_genomes)))

    for idx, deleted_genome in enumerate(deleted_genomes):
        print("{}/{} genomes deleted".format(idx + 1, len(deleted_genomes)), end="\r")
        deleted_genome_file.write('{}\n'.format(deleted_genome))
        shutil.rmtree(current_ftp_genomes.get(deleted_genome))
        self.delete_empty_directory(os.path.dirname(current_ftp_genomes.get(deleted_genome)))

    for added_genome in added_genomes:
        added_genome_file.write('{}\t{}\n'.format(
            added_genome, taxonomy.get(added_genome, ['N/A'] * 7)[6]))

    deleted_genome_file.close()
    added_genome_file.close()
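# Input shapes (inferred from the code above, shown for reference):
#  - new_list_genomes: comma-separated paths to TSV files whose first column
#    is a genome accession present in the new release.
#  - ftp_genome_dir_file: TSV mapping each accession currently on the FTP
#    site to its directory, e.g.
#        GCA_000005825.2\t<path_to_genome_dir>
#  - taxonomy_file (optional): GTDB taxonomy; only the species field
#    (7th rank of the taxonomy string) is reported for added genomes.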
def run(self, input_tree, taxonomy_file, trusted_taxa_file, min_children,
        min_support, skip_rd_refine, output_tree):
    """Decorate internal nodes with taxa labels.

    Parameters
    ----------
    input_tree : str
        Tree to decorate.
    taxonomy_file : str
        File indicating taxonomic information for extant taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of
        children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support
        when inferring distribution.
    skip_rd_refine : boolean
        Skip refinement of taxonomy based on relative divergence information.
    output_tree : str
        Name of output tree.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # remove any previous taxon labels
    self.logger.info('Removing any previous internal node labels.')
    self._strip_taxon_labels(tree)

    # read taxonomy and trim to taxa in tree
    self.logger.info('Reading taxonomy.')
    full_taxonomy = Taxonomy().read(taxonomy_file)
    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxonomy[leaf.taxon.label] = full_taxonomy.get(leaf.taxon.label,
                                                       Taxonomy.rank_prefixes)

    # find best placement for each taxon based on the F-measure statistic
    self.logger.info('Calculating F-measure statistic for each taxon.')
    fmeasure_for_taxa = self._fmeasure(tree, taxonomy)

    # place labels with only one acceptable position and calculate
    # the relative divergence thresholds from these as a guide for
    # placing the remaining labels
    self.logger.info('Placing labels with unambiguous position in tree.')
    placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

    if not skip_rd_refine:
        # calculate relative divergence thresholds
        self.logger.info('Establishing median relative divergence for taxonomic ranks.')
        median_rank_rd = self._median_rank_rd(tree,
                                              placed_taxon,
                                              taxonomy,
                                              trusted_taxa_file,
                                              min_children,
                                              min_support)

        # resolve ambiguous positions in tree
        self.logger.info('Resolving ambiguous taxon label placements using median relative divergences.')
        self._resolve_ambiguous_placements(fmeasure_for_taxa, median_rank_rd)
    else:
        # simply select the most terminal placement in order to be conservative
        ambiguous_placements = set()
        for taxon, fmeasures in list(fmeasure_for_taxa.items()):
            if len(fmeasures) != 1:
                ambiguous_placements.add(taxon)
                fmeasure_for_taxa[taxon] = [fmeasures[-1]]

        if len(ambiguous_placements) > 0:
            self.logger.warning('There are %d taxa with multiple placements of equal quality.'
                                % len(ambiguous_placements))
            self.logger.warning('These were resolved by placing the label at a terminal position.')

    # place all labels on tree
    self.logger.info('Placing labels on tree.')
    self._strip_taxon_labels(tree)
    placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

    # write statistics for placed taxon labels
    self.logger.info('Writing out statistics for taxa.')
    out_table = output_tree + '-table'
    self._write_statistics_table(fmeasure_for_taxa, out_table)

    # output taxonomy of extant taxa on tree
    self.logger.info('Writing out taxonomy for extant taxa.')
    out_taxonomy = output_tree + '-taxonomy'
    self._write_taxonomy(tree, out_taxonomy)

    # output decorated tree
    self.logger.info('Writing out decorated tree.')
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # validate taxonomy (disabled by default)
    if False:
        self.logger.info('Validating taxonomy for extant taxa.')
        tree_taxonomy = Taxonomy().read(out_taxonomy)
        Taxonomy().validate(tree_taxonomy,
                            check_prefixes=True,
                            check_ranks=True,
                            check_hierarchy=True,
                            check_species=True,
                            check_group_names=True,
                            check_duplicate_names=True,
                            report_errors=True)
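# For reference, the score used by _fmeasure above is the harmonic mean of
# precision and recall of a taxon's leaves over a candidate node's leaf set
# (a tax2tree-style placement score). A minimal sketch of that statistic,
# with hypothetical set-valued arguments (not the internal implementation):
#
# def fmeasure(node_leaves, taxon_leaves):
#     """node_leaves, taxon_leaves: sets of leaf labels."""
#     tp = len(node_leaves & taxon_leaves)
#     if tp == 0:
#         return 0.0
#     precision = tp / len(node_leaves)
#     recall = tp / len(taxon_leaves)
#     return 2 * precision * recall / (precision + recall)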
def run(self, input_tree, taxonomy_file, trusted_taxa_file, min_children,
        min_support, output_tree):
    """Decorate internal nodes with taxa labels.

    Parameters
    ----------
    input_tree : str
        Tree to decorate.
    taxonomy_file : str
        File indicating taxonomic information for extant taxa.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring
        distribution. Set to None to consider all taxa.
    min_children : int
        Only consider taxa with at least the specified number of
        children taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support
        when inferring distribution.
    output_tree : str
        Name of output tree.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # remove any previous taxon labels
    self.logger.info('Removing any previous internal node labels.')
    self._strip_taxon_labels(tree)

    # read taxonomy and trim to taxa in tree
    self.logger.info('Reading taxonomy.')
    full_taxonomy = Taxonomy().read(taxonomy_file)
    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxonomy[leaf.taxon.label] = full_taxonomy.get(leaf.taxon.label,
                                                       Taxonomy.rank_prefixes)

    # find best placement for each taxon based on the F-measure statistic
    self.logger.info('Calculating F-measure statistic for each taxon.')
    fmeasure_for_taxa = self._fmeasure(tree, taxonomy)

    # place labels with only one acceptable position and calculate
    # the relative divergence thresholds from these as a guide for
    # placing the remaining labels
    self.logger.info('Placing labels with unambiguous position in tree.')
    placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

    # calculate relative divergence thresholds
    self.logger.info('Establishing median relative divergence for taxonomic ranks.')
    median_rank_rd = self._median_rank_rd(tree,
                                          placed_taxon,
                                          taxonomy,
                                          trusted_taxa_file,
                                          min_children,
                                          min_support)

    # resolve ambiguous positions in tree
    self.logger.info('Resolving ambiguous taxon label placements using median relative divergences.')
    self._resolve_ambiguous_placements(tree, fmeasure_for_taxa, median_rank_rd)

    # write statistics for placed taxon labels
    self.logger.info('Writing out statistics for taxa.')
    out_table = output_tree + '-table'
    self._write_statistics_table(fmeasure_for_taxa, out_table)

    # output taxonomy of extant taxa on tree
    self.logger.info('Writing out taxonomy for extant taxa.')
    out_taxonomy = output_tree + '-taxonomy'
    self._write_taxonomy(tree, out_taxonomy)

    # validate taxonomy
    self.logger.info('Validating taxonomy for extant taxa.')
    tree_taxonomy = Taxonomy().read(out_taxonomy)
    Taxonomy().validate(tree_taxonomy,
                        check_prefixes=True,
                        check_ranks=True,
                        check_hierarchy=True,
                        check_species=True,
                        check_group_names=True,
                        check_duplicate_names=True,
                        report_errors=True)

    # output decorated tree
    self.logger.info('Writing out decorated tree.')
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)
def run(self, query_proteins, db_file, custom_db_file, taxonomy_file,
        custom_taxonomy_file, evalue, per_identity, per_aln_len, max_matches,
        homology_search, min_per_taxa, consensus, min_per_bp, use_trimAl,
        restrict_taxon, msa_program, tree_program, prot_model, skip_rooting,
        output_dir):
    """Infer a gene tree for homologous genes identified by blast.

    Workflow for inferring a gene tree from sequences identified as being
    homologs to a set of query proteins. Homologs are identified using BLASTP
    and a set of user-defined parameters.

    Parameters
    ----------
    query_proteins : str
        Fasta file containing query proteins.
    db_file : str
        BLAST database of reference proteins.
    custom_db_file : str
        Custom database of proteins.
    taxonomy_file : str
        Taxonomic assignment of each reference genome.
    custom_taxonomy_file : str
        Taxonomic assignment of genomes in custom database.
    evalue : float
        E-value threshold used to define a homolog.
    per_identity : float
        Percent identity threshold used to define a homolog.
    per_aln_len : float
        Alignment length threshold used to define a homolog.
    max_matches : int
        Maximum matches per query protein.
    homology_search : str
        Type of homology search to perform.
    min_per_taxa : float
        Minimum percentage of taxa required to retain a column.
    consensus : float
        Minimum percentage of the same amino acid required to retain a column.
    min_per_bp : float
        Minimum percentage of base pairs required to keep a trimmed sequence.
    use_trimAl : boolean
        Filter columns using trimAl.
    restrict_taxon : str
        Restrict alignment to specific taxonomic group (e.g., k__Archaea).
    msa_program : str
        Program to use for multiple sequence alignment ['mafft', 'muscle'].
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    skip_rooting : boolean
        Skip midpoint rooting if True.
    output_dir : str
        Directory to store results.
    """

    # validate query sequence names for use with GeneTreeTk
    validate_seq_ids(query_proteins)

    # read taxonomy file
    self.logger.info('Reading taxonomy file.')
    taxonomy = Taxonomy().read(taxonomy_file)

    if custom_taxonomy_file:
        custom_taxonomy = Taxonomy().read(custom_taxonomy_file)
        taxonomy.update(custom_taxonomy)

    # report distribution of query genes
    mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(query_proteins)
    self.logger.info('Query gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
                     % (min_len, mean_len, max_len, p10, p50, p90))

    # identify homologs using BLASTP
    self.logger.info('Identifying homologs using %s.' % homology_search)
    blast = Blast(self.cpus)
    blast_output = os.path.join(output_dir, 'reference_hits.tsv')
    if homology_search == 'diamond':
        diamond = Diamond(self.cpus)
        diamond.blastp(query_proteins, db_file, evalue, per_identity,
                       per_aln_len, max_matches, blast_output,
                       output_fmt='custom')
    else:
        blast.blastp(query_proteins, db_file, blast_output, evalue,
                     max_matches, output_fmt='custom', task=homology_search)
    homologs = blast.identify_homologs(blast_output, evalue, per_identity, per_aln_len)
    self.logger.info('Identified %d homologs in reference database.' % len(homologs))

    custom_homologs = None
    if custom_db_file:
        custom_blast_output = os.path.join(output_dir, 'custom_hits.tsv')
        if homology_search == 'diamond':
            diamond = Diamond(self.cpus)
            diamond.blastp(query_proteins, custom_db_file, evalue,
                           per_identity, per_aln_len, max_matches,
                           custom_blast_output, output_fmt='custom')
        else:
            blast.blastp(query_proteins, custom_db_file, custom_blast_output,
                         evalue, max_matches, output_fmt='custom',
                         task=homology_search)
        custom_homologs = blast.identify_homologs(custom_blast_output, evalue,
                                                  per_identity, per_aln_len)
        self.logger.info('Identified %d homologs in custom database.' % len(custom_homologs))

    # restrict homologs to specific taxonomic group
    if restrict_taxon:
        self.logger.info('Restricting homologs to %s.' % restrict_taxon)
        restricted_homologs = {}
        for query_id, hit in homologs.items():
            genome_id = hit.subject_id.split('~')[0]
            if restrict_taxon in taxonomy[genome_id]:
                restricted_homologs[query_id] = hit

        self.logger.info('%d of %d homologs in reference database are from the specified group.'
                         % (len(restricted_homologs), len(homologs)))
        homologs = restricted_homologs

    if len(homologs) == 0:
        self.logger.error('Too few homologs were identified. Gene tree cannot be inferred.')
        sys.exit()

    # extract homologs
    self.logger.info('Extracting homologs and determining local gene context.')
    db_homologs_tmp = os.path.join(output_dir, 'homologs_db.tmp')
    gene_precontext, gene_postcontext = self.extract_homologs_and_context(
        list(homologs.keys()), db_file, db_homologs_tmp)

    # report gene length distribution of homologs
    mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(db_homologs_tmp)
    self.logger.info('Homolog gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
                     % (min_len, mean_len, max_len, p10, p50, p90))

    # concatenate homologs with initial query genes
    homolog_output_tmp = os.path.join(output_dir, 'homologs.faa.tmp')
    if custom_homologs:
        custom_db_homologs_tmp = os.path.join(output_dir, 'custom_homologs_db.tmp')
        custom_gene_precontext, custom_gene_postcontext = self.extract_homologs_and_context(
            list(custom_homologs.keys()), custom_db_file, custom_db_homologs_tmp)
        gene_precontext.update(custom_gene_precontext)
        gene_postcontext.update(custom_gene_postcontext)
        homologs.update(custom_homologs)
        concatenate_files([query_proteins, db_homologs_tmp, custom_db_homologs_tmp],
                          homolog_output_tmp)
        os.remove(custom_db_homologs_tmp)
    else:
        concatenate_files([query_proteins, db_homologs_tmp], homolog_output_tmp)

    os.remove(db_homologs_tmp)

    # remove stop codons
    homolog_output = os.path.join(output_dir, 'homologs.faa')
    self._remove_stop_codons(homolog_output_tmp, homolog_output)
    os.remove(homolog_output_tmp)

    # infer multiple sequence alignment
    msa = MsaWorkflow(self.cpus)
    trimmed_msa_output = msa.run(homolog_output, min_per_taxa, consensus,
                                 min_per_bp, use_trimAl, msa_program, output_dir)

    # infer tree
    tw = TreeWorkflow(self.cpus)
    tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                         skip_rooting, output_dir)

    # create tax2tree consensus map and decorate tree
    self.logger.info('Decorating internal tree nodes with tax2tree.')
    output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    fout = open(output_taxonomy_file, 'w')
    for homolog_id in homologs.keys():
        genome_id = homolog_id.split('~')[0]
        t = taxonomy.get(genome_id, None)
        if t:
            fout.write(homolog_id + '\t' + ';'.join(t) + '\n')
    fout.close()

    t2t_tree = os.path.join(output_dir, 'homologs.tax2tree.tree')
    cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                              tree_output,
                                              t2t_tree)
    os.system(cmd)

    # create tree with leaf nodes given as genome accessions
    tree = dendropy.Tree.get_from_path(t2t_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    for leaf in tree.leaf_node_iter():
        leaf.taxon.label = leaf.taxon.label.split('~')[0]

    genome_tree = os.path.join(output_dir, 'homologs.tax2tree.genome_accessions.tree')
    tree.write_to_path(genome_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # setup metadata for ARB file
    src_dir = os.path.dirname(os.path.realpath(__file__))
    version_file = open(os.path.join(src_dir, 'VERSION'))

    metadata = {}
    metadata['genetreetk_version'] = version_file.read().strip()
    version_file.close()
    metadata['genetreetk_query_proteins'] = query_proteins
    metadata['genetreetk_db_file'] = db_file
    metadata['genetreetk_taxonomy_file'] = taxonomy_file
    metadata['genetreetk_blast_evalue'] = str(evalue)
    metadata['genetreetk_blast_per_identity'] = str(per_identity)
    metadata['genetreetk_blast_per_aln_len'] = str(per_aln_len)
    metadata['genetreetk_blast_max_matches'] = str(max_matches)
    metadata['genetreetk_homology_search'] = homology_search
    metadata['genetreetk_msa_min_per_taxa'] = str(min_per_taxa)
    metadata['genetreetk_msa_consensus'] = str(consensus)
    metadata['genetreetk_msa_min_per_bp'] = str(min_per_bp)
    metadata['genetreetk_msa_program'] = msa_program
    metadata['genetreetk_tree_program'] = tree_program
    metadata['genetreetk_tree_prot_model'] = prot_model

    # create ARB metadata file
    self.logger.info('Creating ARB metadata file.')
    arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
    self.create_arb_metadata(homologs, trimmed_msa_output, taxonomy, metadata,
                             gene_precontext, gene_postcontext,
                             arb_metadata_file)
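# Note on identifiers: subject hits are expected to be of the form
# '<genome_id>~<gene_id>' (see the split('~')[0] calls above), so any custom
# database must follow the same convention for taxonomy lookups to work.
# Illustrative example:
#
# subject_id = 'GCF_000005825.2~BPOF4_RS00005'
# genome_id = subject_id.split('~')[0]   # 'GCF_000005825.2'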
def run(self, genomes, align_dir, out_dir, prefix, debugopt=False):
    """Classify genomes based on position in reference tree."""

    try:
        for marker_set_id in ('bac120', 'ar122'):
            user_msa_file = os.path.join(align_dir, prefix + '.%s.user_msa.fasta' % marker_set_id)
            if not os.path.exists(user_msa_file):
                # file will not exist if there are no user genomes from a given domain
                continue

            classify_tree = self.place_genomes(user_msa_file, marker_set_id, out_dir, prefix)

            # get taxonomic classification of each user genome
            tree = dendropy.Tree.get_from_path(classify_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

            fout = open(os.path.join(out_dir, prefix + '.%s.classification.tsv' % marker_set_id), 'w')
            fastaniout = open(os.path.join(out_dir, prefix + '.%s.fastani_results.tsv' % marker_set_id), 'w')
            redfout = open(os.path.join(out_dir, prefix + '.%s.summary.tsv' % marker_set_id), 'w')
            if debugopt:
                parchiinfo = open(os.path.join(out_dir, prefix + '.%s.debug_file.tsv' % marker_set_id), 'w')

            reddictfile = open(os.path.join(out_dir, prefix + '.%s.red_dictionary.tsv' % marker_set_id), 'w')

            marker_dict = {}
            if marker_set_id == 'bac120':
                marker_dict = Config.RED_DIST_BAC_DICT
            elif marker_set_id == 'ar122':
                marker_dict = Config.RED_DIST_ARC_DICT
            reddictfile.write('Phylum\t{0}\n'.format(marker_dict.get('p__')))
            reddictfile.write('Class\t{0}\n'.format(marker_dict.get('c__')))
            reddictfile.write('Order\t{0}\n'.format(marker_dict.get('o__')))
            reddictfile.write('Family\t{0}\n'.format(marker_dict.get('f__')))
            reddictfile.write('Genus\t{0}\n'.format(marker_dict.get('g__')))
            reddictfile.close()

            fastaniout.write("User genome\tReference genome\tANI\n")
            redfout.write("user_genome\tclassification_method\tred_value\n")
            if debugopt:
                parchiinfo.write("User genome\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\n")

            # Genomes can be classified using ANI or RED values. We go through
            # all leaves of the tree; if a leaf is a user genome we take its
            # parent node and look at all the leaves under this node. If the
            # parent node has only one reference genome (GB or RS), we
            # calculate the ANI between the user genome and that reference.
            analysed_nodes = []
            fastani_dict = {}
            all_fastani_dict = {}

            fastani_list = []
            # some genomes of case C are handled here, if the ANI is close enough
            self.logger.info('Calculating Average Nucleotide Identity using FastANI.')
            for nd in tree.preorder_node_iter():
                # store the prefix of each leaf to check if one starts with GB_ or RS_
                list_subnode_initials = [subnd.taxon.label.replace("'", '')[0:3]
                                         for subnd in nd.leaf_iter()]
                list_subnode = [subnd.taxon.label.replace("'", '')
                                for subnd in nd.leaf_iter()]
                # if only one genome is a reference genome
                if (list_subnode_initials.count('RS_') + list_subnode_initials.count('GB_')
                        + list_subnode_initials.count('UBA')) == 1 \
                        and len(list_subnode_initials) > 1 \
                        and list_subnode[0] not in analysed_nodes:
                    fastani_list.append(list_subnode)
                    analysed_nodes.extend(list_subnode)

            manager = multiprocessing.Manager()
            out_q = manager.dict()
            procs = []
            nprocs = self.cpus
            if len(fastani_list) > 0:
                for item in splitchunks_list(fastani_list, nprocs):
                    p = multiprocessing.Process(target=self._fastaniWorker,
                                                args=(item, genomes, out_q))
                    procs.append(p)
                    p.start()

                # wait for all worker processes to finish
                for p in procs:
                    p.join()
                    if p.exitcode == 1:
                        raise ValueError("Stop!!")

            all_fastani_dict = dict(out_q)

            for k, v in all_fastani_dict.items():
                fastaniout.write("{0}\t{1}\t{2}\n".format(k, v.get("ref_genome"), v.get("ani")))
                if Config.FASTANI_SPECIES_THRESHOLD <= v.get("ani"):
                    suffixed_name = add_ncbi_prefix(v.get("ref_genome"))
                    taxa_str = ";".join(gtdb_taxonomy.get(suffixed_name))
                    if taxa_str.endswith("s__"):
                        taxa_str = taxa_str + v.get("ref_genome")
                    fout.write('%s\t%s\n' % (k, taxa_str))
                    fastani_dict[k] = v
                    redfout.write("{0}\tani\tNone\n".format(k))
            fastaniout.close()

            self.logger.info('{0} genomes have been classified with FastANI.'.format(len(fastani_dict)))

            scaled_tree = self._calculate_red_distances(classify_tree, out_dir)

            user_genome_ids = set(read_fasta(user_msa_file).keys())
            user_genome_ids = user_genome_ids.difference(set(fastani_dict.keys()))
            # For all other cases we measure the RED distance between a leaf
            # and a parent node (RED = 1 - edge_length). This RED value
            # indicates the rank level that can be associated with a user
            # genome. For example, if the RED value is close to the order
            # level, the user genome takes the order of the reference genome
            # under the same parent node. If there are multiple orders under
            # the parent node, the user genome is considered a new order.
            for leaf in scaled_tree.leaf_node_iter():
                if leaf.taxon.label in user_genome_ids:
                    taxa = []
                    # In some cases pplacer can place two user genomes under
                    # the same parent node, so we need to go up the tree to
                    # find a node with a reference genome as a leaf.
                    cur_node = leaf.parent_node
                    list_subnode_initials = [subnd.taxon.label.replace("'", '')[0:3]
                                             for subnd in cur_node.leaf_iter()]
                    while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials and 'UBA' not in list_subnode_initials:
                        cur_node = cur_node.parent_node
                        list_subnode_initials = [subnd.taxon.label.replace("'", '')[0:3]
                                                 for subnd in cur_node.leaf_iter()]

                    current_rel_list = cur_node.rel_dist

                    parent_taxon_node = cur_node.parent_node
                    _support, parent_taxon, _aux_info = parse_label(parent_taxon_node.label)

                    while parent_taxon_node is not None and not parent_taxon:
                        parent_taxon_node = parent_taxon_node.parent_node
                        _support, parent_taxon, _aux_info = parse_label(parent_taxon_node.label)

                    parent_rank = parent_taxon.split(";")[-1][0:3]
                    parent_rel_dist = parent_taxon_node.rel_dist

                    genome_parent_child = [leaf.taxon.label, parent_rank, parent_rel_dist,
                                           '', '', '', '']

                    child_taxons = []
                    closest_rank = None
                    detection = "RED"
                    # if the genome is placed between the genus and species
                    # ranks, it is associated with the genus when
                    # _get_closest_red_rank is called
                    if parent_rank != 'g__':
                        child_rk = self.order_rank[self.order_rank.index(parent_rank) + 1]
                        list_subnode = [childnd.taxon.label.replace("'", '')
                                        for childnd in cur_node.leaf_iter()
                                        if (childnd.taxon.label.startswith('RS_')
                                            or childnd.taxon.label.startswith('GB_'))]
                        list_ranks = [gtdb_taxonomy.get(name)[self.order_rank.index(child_rk)]
                                      for name in list_subnode]
                        if len(set(list_ranks)) == 1:
                            for subranknd in cur_node.preorder_iter():
                                _support, subranknd_taxon, _aux_info = parse_label(subranknd.label)
                                if subranknd.is_internal() and subranknd_taxon is not None \
                                        and subranknd_taxon.startswith(child_rk):
                                    child_taxons = subranknd_taxon.split(";")
                                    child_taxon_node = subranknd
                                    child_rel_dist = child_taxon_node.rel_dist
                                    break
                        else:
                            # case 2a and 2b
                            closest_rank = parent_rank
                            detection = "Topology"
                    else:
                        # case 1a
                        closest_rank = parent_rank
                        detection = "Topology"

                    # case 1b
                    if len(child_taxons) == 0 and closest_rank is None:
                        list_leaves = [childnd.taxon.label.replace("'", '')
                                       for childnd in cur_node.leaf_iter()
                                       if (childnd.taxon.label.startswith('RS_')
                                           or childnd.taxon.label.startswith('GB_'))]
                        if len(list_leaves) != 1:
                            self.logger.error('There should be only one leaf.')
                            sys.exit(-1)
                        list_leaf_ranks = gtdb_taxonomy.get(list_leaves[0])[self.order_rank.index(child_rk):-1]
                        for leaf_taxon in reversed(list_leaf_ranks):
                            if leaf_taxon == list_leaf_ranks[0]:
                                if abs(current_rel_list - marker_dict.get(leaf_taxon[:3])) < abs(current_rel_list - marker_dict.get(parent_rank)):
                                    closest_rank = leaf_taxon[:3]
                                    genome_parent_child[3] = leaf_taxon
                                    genome_parent_child[5] = 'case 1b - III'
                                    break
                            else:
                                pchildrank = list_leaf_ranks[list_leaf_ranks.index(leaf_taxon) - 1]
                                if abs(current_rel_list - marker_dict.get(leaf_taxon[:3])) < abs(current_rel_list - marker_dict.get(pchildrank[:3])):
                                    closest_rank = leaf_taxon[:3]
                                    genome_parent_child[1] = pchildrank
                                    genome_parent_child[2] = 1.0
                                    genome_parent_child[3] = leaf_taxon
                                    genome_parent_child[5] = 'case 1b - II'
                                    break
                        if closest_rank is None:
                            closest_rank = parent_rank
                            genome_parent_child[3] = list_leaf_ranks[0]
                            genome_parent_child[5] = 'case 1b - IV'

                    # if there are multiple ranks on the child node
                    # (i.e., a genome between p__Nitrospirae and
                    # c__Nitrospiria;o__Nitrospirales;f__Nitrospiraceae),
                    # we loop through the list of ranks from the f__ to the c__ rank
                    for child_taxon in reversed(child_taxons):
                        # if the lower rank is c__Nitrospiria
                        if child_taxon == child_taxons[0]:
                            if (abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(child_rel_dist - marker_dict.get(child_taxon[:3]))
                                    and abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(current_rel_list - marker_dict.get(parent_rank))):
                                genome_parent_child[3] = ';'.join(child_taxons)
                                genome_parent_child[4] = child_rel_dist
                                genome_parent_child[5] = 'case 3b - II'
                                closest_rank = child_taxon[:3]
                            elif closest_rank is None:
                                closest_rank = parent_rank
                                genome_parent_child[3] = ';'.join(child_taxons)
                                genome_parent_child[4] = child_rel_dist
                                genome_parent_child[5] = 'case 3b - III'
                        else:
                            pchildrank = child_taxons[child_taxons.index(child_taxon) - 1]
                            if (abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(current_rel_list - marker_dict.get(pchildrank[:3]))
                                    and abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(child_rel_dist - marker_dict.get(child_taxon[:3]))):
                                closest_rank = child_taxon
                                genome_parent_child[3] = ';'.join(child_taxons)
                                genome_parent_child[4] = child_rel_dist
                                genome_parent_child[5] = 'case 3b - I'
                                break

                    # case 1b
                    if closest_rank is None:
                        print("IT SHOULDN'T HAPPEN!!!")

                    genome_parent_child[6] = closest_rank

                    list_subnode = [subnd.taxon.label.replace("'", '')
                                    for subnd in cur_node.leaf_iter()]
                    red_taxonomy = self._get_redtax(list_subnode, closest_rank, gtdb_taxonomy)

                    fout.write('{0}\t{1}\n'.format(leaf.taxon.label, red_taxonomy))
                    del genome_parent_child[0]
                    redfout.write("{0}\t{1}\t{2}\n".format(leaf.taxon.label, detection, current_rel_list))
                    if debugopt:
                        parchiinfo.write('{0}\t{1}\t{2}\t{3}\n'.format(
                            leaf.taxon.label, current_rel_list,
                            '\t'.join(str(x) for x in genome_parent_child),
                            detection))

            redfout.close()
            fout.close()
            if debugopt:
                parchiinfo.close()

            pplaceout = open(os.path.join(out_dir, prefix + '.%s.classification_pplacer.tsv' % marker_set_id), 'w')

            # get the pplacer taxonomy for comparison
            user_genome_ids = set(read_fasta(user_msa_file).keys())
            for leaf in tree.leaf_node_iter():
                if leaf.taxon.label in user_genome_ids:
                    taxa = []
                    cur_node = leaf
                    while cur_node.parent_node:
                        _support, taxon, _aux_info = parse_label(cur_node.label)
                        if taxon:
                            for t in taxon.split(';')[::-1]:
                                taxa.append(t.strip())
                        cur_node = cur_node.parent_node
                    taxa_str = ';'.join(taxa[::-1])
                    pplaceout.write('%s\t%s\n' % (leaf.taxon.label, taxa_str))
            pplaceout.close()
    except ValueError:
        print("GTDB-Tk has stopped before finishing")
        sys.exit(-1)
    except Exception:
        print("GTDB-Tk has stopped before finishing")
        sys.exit(-1)
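# The chain of abs(...) comparisons above implements "pick the rank whose
# median RED value is closest to the genome's placement". A simplified,
# hypothetical helper showing just that core idea (the real cases above also
# weigh tree topology, so this is only a sketch; the RED values are
# illustrative):
#
# def closest_red_rank(red_value, marker_dict):
#     """marker_dict: rank prefix (e.g. 'p__') -> median RED value."""
#     return min(marker_dict, key=lambda rank: abs(marker_dict[rank] - red_value))
#
# closest_red_rank(0.60, {'p__': 0.32, 'c__': 0.48, 'o__': 0.62,
#                         'f__': 0.77, 'g__': 0.92})  # -> 'o__'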
def run(self, taxonomy_file, type_strains_file, genome_prot_dir, extension,
        max_taxa, rank, per_identity, per_aln_len, genomes_to_process,
        keep_all_genes, no_reformat_gene_ids, output_dir):
    """Create dereplicated set of genes.

    Taxonomy file should have the following format:
        <genome_id>\t<taxonomy_str>

    where taxonomy_str is in GreenGenes format:
        d__Bacteria;p__Proteobacteria;...;s__Escherichia coli

    Type strain file should have the following format:
        <genome_id>\t<genome name>

    Parameters
    ----------
    taxonomy_file : str
        File indicating taxonomy string for all genomes of interest.
    type_strains_file : str
        File indicating type strains.
    genome_prot_dir : str
        Directory containing amino acid genes for each genome.
    extension : str
        Extension of files with called genes.
    max_taxa : int
        Maximum taxa to retain in a named group.
    rank : int
        Taxonomic rank to perform dereplication (0 = domain, ..., 6 = species).
    per_identity : float
        Percent identity for subsampling similar genes.
    per_aln_len : float
        Percent alignment length for subsampling similar genes.
    genomes_to_process : str
        File with list of genomes to retain instead of performing taxon subsampling.
    keep_all_genes : boolean
        Flag indicating that no gene subsampling should be performed.
    no_reformat_gene_ids : boolean
        Flag indicating if gene ids should be reformatted to include scaffold
        names given by the GFF file.
    output_dir : str
        Desired output directory for storing results.
    """

    make_sure_path_exists(output_dir)

    self.logger.info('Dereplicating at the rank of %s.' % self.rank_labels[rank])

    # get taxonomy string for each genome
    taxonomy = {}
    if taxonomy_file:
        self.logger.info('Reading taxonomy file.')
        taxonomy = Taxonomy().read(taxonomy_file)
        self.logger.info('There are %d genomes with taxonomy strings.' % len(taxonomy))

    # get type strains; genomes which should never be dereplicated
    type_strains = set()
    if type_strains_file:
        self.logger.info('Reading type strain file.')
        type_strains = self.read_type_strain(type_strains_file)
        self.logger.info('There are %d type strains.' % len(type_strains))

    # get specific list of genomes to process
    genomes_to_retain = set()
    if genomes_to_process:
        self.logger.info('Reading genomes to retain.')
        for line in open(genomes_to_process):
            line_split = line.split()
            genomes_to_retain.add(line_split[0])
        self.logger.info('Retaining %d genomes.' % len(genomes_to_retain))

    # make sure extension filter starts with a '.'
    if not extension.startswith('.'):
        extension = '.' + extension

    # identify unique genes in each named group
    fout = open(os.path.join(output_dir, 'genomes_without_called_genes.tsv'), 'w')
    rank_genomes = defaultdict(list)
    genome_files = os.listdir(genome_prot_dir)
    underclassified_genomes = 0
    genomes_with_missing_data = 0
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file, extension)

        if not genome_file.endswith(extension):
            continue

        if genomes_to_process and genome_id not in genomes_to_retain:
            continue

        genome_file = os.path.join(genome_prot_dir, genome_file)
        if not os.path.exists(genome_file):
            genomes_with_missing_data += 1
            fout.write(genome_id + '\t' + ';'.join(taxonomy[genome_id]) + '\n')
            continue

        t = taxonomy.get(genome_id, self.rank_prefixes)
        taxa = t[rank]
        if taxa[3:] == '':
            underclassified_genomes += 1
            rank_genomes[self.underclassified].append(genome_id)
        else:
            rank_genomes[taxa].append(genome_id)

        validate_seq_ids(genome_file)
    fout.close()

    total_genomes_to_process = sum([len(genome_list) for genome_list in rank_genomes.values()])
    if total_genomes_to_process == 0:
        self.logger.error('No genomes found in directory: %s. Check the --extension flag used to identify genomes.' % genome_prot_dir)
        sys.exit(-1)

    self.logger.info('Under-classified genomes automatically placed into the database: %d' % underclassified_genomes)
    self.logger.info('Genomes with missing sequence data: %d' % genomes_with_missing_data)
    self.logger.info('Total named groups: %d' % len(rank_genomes))
    self.logger.info('Total genomes to process: %d' % total_genomes_to_process)

    # process each named group
    gene_file = os.path.join(output_dir, 'custom_db.faa')
    gene_out = open(gene_file, 'w')

    taxonomy_out = open(os.path.join(output_dir, 'custom_taxonomy.tsv'), 'w')

    tmp_dir = tempfile.mkdtemp()

    total_genes_removed = 0
    total_genes_kept = 0
    total_genomes_kept = 0
    processed_genomes = 0
    for taxa, genome_list in rank_genomes.items():
        processed_genomes += len(genome_list)

        print('-------------------------------------------------------------------------------')
        self.logger.info('Processing %s | Finished %d of %d (%.2f%%) genomes.'
                         % (taxa, processed_genomes, total_genomes_to_process,
                            processed_genomes * 100.0 / total_genomes_to_process))

        # create directory with selected genomes
        taxon_dir = os.path.join(tmp_dir, 'taxon')
        os.mkdir(taxon_dir)

        reduced_genome_list = genome_list
        if not genomes_to_process and taxa != self.underclassified:
            # perform taxon subsampling
            reduced_genome_list = self.select_taxa(genome_list, taxonomy,
                                                   type_strains, max_taxa)
        total_genomes_kept += len(reduced_genome_list)

        gene_dir = os.path.join(taxon_dir, 'genes')
        os.mkdir(gene_dir)
        for genome_id in reduced_genome_list:
            taxonomy_out.write(genome_id + '\t' +
                               ';'.join(taxonomy.get(genome_id, self.rank_prefixes)) + '\n')

            genome_gene_file = os.path.join(genome_prot_dir, genome_id + extension)
            gff_file = os.path.join(genome_prot_dir, genome_id + '.gff')
            output_gene_file = os.path.join(gene_dir, genome_id + '.faa')
            if not no_reformat_gene_ids:
                self.reformat_gene_id_to_scaffold_id(genome_gene_file, gff_file,
                                                     taxonomy, output_gene_file)
            else:
                os.system('cp %s %s' % (genome_gene_file, output_gene_file))

        # filter genes based on amino acid identity
        genes_to_remove = []
        amended_gene_dir = os.path.join(taxon_dir, 'amended_genes')
        if keep_all_genes or taxa == self.underclassified:
            # modify gene identifiers to include genome ids
            self.amend_gene_identifies(gene_dir, amended_gene_dir)
        else:
            # filter genes on AAI
            genes_to_remove = self.filter_aai(taxon_dir, gene_dir, amended_gene_dir,
                                              per_identity, per_aln_len, self.cpus)

        self.logger.info('Writing unique genes from genomes in %s.' % taxa)
        genes_kept = self.write_gene_file(gene_out, amended_gene_dir,
                                          reduced_genome_list, taxonomy,
                                          genes_to_remove)

        self.logger.info('Retained %d of %d taxa.' % (len(reduced_genome_list), len(genome_list)))
        self.logger.info('Genes to keep: %d' % genes_kept)
        self.logger.info('Genes removed: %d' % len(genes_to_remove))

        total_genes_kept += genes_kept
        total_genes_removed += len(genes_to_remove)

        shutil.rmtree(taxon_dir)

    taxonomy_out.close()
    gene_out.close()

    self.logger.info('Retained %d of %d (%.1f%%) genomes.'
                     % (total_genomes_kept, total_genomes_to_process,
                        total_genomes_kept * 100.0 / total_genomes_to_process))
    self.logger.info('Total genes kept: %d' % total_genes_kept)
    self.logger.info('Total genes removed: %d (%.1f%%)'
                     % (total_genes_removed,
                        total_genes_removed * 100.0 / (total_genes_kept + total_genes_removed)))

    self.logger.info('Creating BLAST database.')
    os.system('makeblastdb -dbtype prot -in %s' % gene_file)

    shutil.rmtree(tmp_dir)
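# Note: select_taxa (called above) is expected to keep every type strain and
# then top up the selection to max_taxa genomes. A rough sketch of that
# contract with hypothetical names; the real method may subsample more
# cleverly:
#
# import random
#
# def select_taxa_sketch(genome_list, type_strains, max_taxa):
#     keep = [gid for gid in genome_list if gid in type_strains]
#     others = [gid for gid in genome_list if gid not in type_strains]
#     random.shuffle(others)
#     keep.extend(others[:max(0, max_taxa - len(keep))])
#     return keep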
def combine(self, ssu_msa, ssu_tree, lsu_msa, lsu_tree, output_dir):
    """Infer 16S + 23S tree spanning GTDB genomes."""

    # identify common 16S and 23S sequences
    ssu_seqs = {}
    for seq_id, seq, annotation in seq_io.read_seq(ssu_msa, keep_annotation=True):
        genome_id = seq_id.split('~')[0]
        ssu_seqs[genome_id] = [seq, annotation]
    self.logger.info('Read %d SSU rRNA sequences.' % len(ssu_seqs))

    lsu_seqs = {}
    for seq_id, seq, annotation in seq_io.read_seq(lsu_msa, keep_annotation=True):
        genome_id = seq_id.split('~')[0]
        lsu_seqs[genome_id] = [seq, annotation]
    self.logger.info('Read %d LSU rRNA sequences.' % len(lsu_seqs))

    common_seqs = set(ssu_seqs.keys()).intersection(set(lsu_seqs.keys()))
    self.logger.info('Identified %d sequences in common.' % len(common_seqs))

    # identify incongruent taxonomic order classifications between trees
    self.logger.info('Identifying incongruent order-level taxonomic classifications between trees.')
    ssu_taxonomy = Taxonomy().read_from_tree(ssu_tree)
    lsu_taxonomy = Taxonomy().read_from_tree(lsu_tree)

    order_index = Taxonomy.rank_labels.index('order')
    seqs_to_filter = set()
    for seq_id in common_seqs:
        ssu_order = ssu_taxonomy.get(seq_id)[order_index][3:]
        lsu_order = lsu_taxonomy.get(seq_id)[order_index][3:]

        # remove designator of paraphyletic orders
        # (since in the concatenated tree this may be resolved)
        ssu_order = ssu_order.split('_')[0]
        lsu_order = lsu_order.split('_')[0]

        if ssu_order != lsu_order:
            seqs_to_filter.add(seq_id)

    self.logger.info('Identified %d sequences with incongruent classifications.' % len(seqs_to_filter))
    common_seqs.difference_update(seqs_to_filter)

    # write out MSA
    concatenated_msa = os.path.join(output_dir, 'ssu_lsu_concatenated.fna')
    fout = open(concatenated_msa, 'w')
    for seq_id in common_seqs:
        fout.write('>%s %s %s\n' % (seq_id, ssu_seqs[seq_id][1], lsu_seqs[seq_id][1]))
        fout.write('%s%s\n' % (ssu_seqs[seq_id][0], lsu_seqs[seq_id][0]))
    fout.close()

    # infer tree
    output_tree = os.path.join(output_dir, 'ssu_lsu_concatenated.tree')
    os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s'
              % (concatenated_msa, output_tree))
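# Caveat: both inputs must be fixed-width alignments, since sequences are
# concatenated per genome; ragged rows would silently produce an invalid MSA.
# A cheap sanity check along these lines could be added (hypothetical helper,
# not currently called anywhere):
#
# def assert_fixed_width(seqs):
#     """seqs: dict of genome_id -> [aligned_seq, annotation]."""
#     lengths = {len(s[0]) for s in seqs.values()}
#     assert len(lengths) <= 1, 'alignment has ragged rows: %s' % sorted(lengths)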
def run(self, input_tree, taxonomy_file, output_dir, plot_taxa_file,
        plot_dist_taxa_only, plot_domain, highlight_polyphyly,
        highlight_taxa_file, trusted_taxa_file, fixed_root, min_children,
        min_support, mblet, fmeasure_table, min_fmeasure, fmeasure_mono,
        verbose_table):
    """Determine distribution of taxa at each taxonomic rank.

    Parameters
    ----------
    input_tree : str
        Name of input tree.
    taxonomy_file : str
        File with taxonomy strings for each taxon.
    output_dir : str
        Desired output directory.
    plot_taxa_file : str
        File specifying taxa to plot. Set to None to consider all taxa.
    plot_dist_taxa_only : boolean
        Only plot the taxa used to infer distribution.
    plot_domain : boolean
        Plot domain rank.
    trusted_taxa_file : str
        File specifying trusted taxa to consider when inferring distribution.
        Set to None to consider all taxa.
    fixed_root : boolean
        Use a single fixed root to infer outliers.
    min_children : int
        Only consider taxa with at least the specified number of children
        taxa when inferring distribution.
    min_support : float
        Only consider taxa with at least this level of support when
        inferring distribution.
    verbose_table : boolean
        Print additional columns in output table.
    """

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

    # pull taxonomy from tree and file
    self.logger.info('Reading taxonomy.')
    taxonomy = Taxonomy().read(taxonomy_file)
    tree_taxonomy = Taxonomy().read_from_tree(input_tree, warnings=False)

    gtdb_parent_ranks = Taxonomy().parents(tree_taxonomy)

    # read trusted taxa
    trusted_taxa = None
    if trusted_taxa_file:
        trusted_taxa = read_taxa_file(trusted_taxa_file)

    # read F-measure for taxa
    fmeasure = None
    if fmeasure_table:
        fmeasure = self.read_fmeasure(fmeasure_table)

    # determine taxa to be used for inferring distribution
    taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy,
                                                             trusted_taxa,
                                                             min_children,
                                                             min_support,
                                                             fmeasure,
                                                             min_fmeasure)

    # limit plotted taxa
    taxa_to_plot = None
    if plot_dist_taxa_only:
        taxa_to_plot = taxa_for_dist_inference
    elif plot_taxa_file:
        taxa_to_plot = read_taxa_file(plot_taxa_file)
    else:
        # plot every taxon defined in tree
        taxa_to_plot = set()
        for node in tree.preorder_node_iter():
            support, taxon, _auxiliary_info = parse_label(node.label)
            if taxon:
                # get most specific taxon from compound names
                # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                taxon = taxon.split(';')[-1].strip()
                taxa_to_plot.add(taxon)

        if False:
            # HACK FOR NCBI: only plot taxa with >= min_children subordinate taxa
            taxa_to_plot = set()
            for node in tree.preorder_node_iter():
                if not node.label or node.is_leaf():
                    continue

                support, taxon, _auxiliary_info = parse_label(node.label)
                if not taxon:
                    continue
                # get most specific taxon from compound names
                # (e.g. p__Armatimonadetes; c__Chthonomonadetes)
                taxon = taxon.split(';')[-1].strip()

                # count number of subordinate children
                rank_prefix = taxon[0:3]
                if min_children > 0 and rank_prefix != 's__':
                    child_rank_index = Taxonomy().rank_index[rank_prefix] + 1
                    child_rank_prefix = Taxonomy.rank_prefixes[child_rank_index]
                    subordinate_taxa = set()
                    for leaf in node.leaf_iter():
                        taxa = taxonomy.get(leaf.taxon.label, Taxonomy.rank_prefixes)
                        if len(taxa) > child_rank_index:
                            sub_taxon = taxa[child_rank_index]
                            if sub_taxon != Taxonomy.rank_prefixes[child_rank_index] \
                                    and sub_taxon.startswith(child_rank_prefix):
                                subordinate_taxa.add(sub_taxon)

                    if len(subordinate_taxa) < min_children:
                        continue

                taxa_to_plot.add(taxon)

    # highlight taxa
    highlight_taxa = set()
    if highlight_taxa_file:
        for line in open(highlight_taxa_file):
            highlight_taxa.add(line.strip().split('\t')[0])

    # check if a single fixed root should be used
    if fixed_root or mblet:
        self.logger.info('Using single fixed rooting for inferring distributions.')
        if not mblet:
            rel_dists = self.rd_fixed_root(tree, taxa_for_dist_inference)
        else:
            rel_dists = self.mblet(tree, taxa_for_dist_inference)

        # create fixed rooting style tables and plots
        distribution_table = os.path.join(output_dir, '%s.rank_distribution.tsv' % input_tree_name)
        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_plot(rel_dists, taxa_for_dist_inference,
                                highlight_polyphyly, highlight_taxa,
                                distribution_table, fmeasure, fmeasure_mono,
                                plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        self._median_outlier_file(rel_dists, taxa_for_dist_inference,
                                  gtdb_parent_ranks, median_outlier_table)
    else:
        # calculate relative distance to taxa
        rd = RelativeDistance()
        rel_dists = rd.rel_dist_to_named_clades(tree)

        # restrict to taxa of interest
        if taxa_to_plot:
            for r in rel_dists:
                for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                    del rel_dists[r][k]

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa to Plot\tTaxa for Inference')
        for rank, taxa in rel_dists.items():
            taxa_for_inference = [x for x in taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa), len(taxa_for_inference)))
        print('')

        # determine phyla for inferring distribution
        if True:
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, taxa_for_dist_inference)
        else:
            phyla_for_inference = filter_taxa_for_dist_inference(tree, taxonomy,
                                                                 trusted_taxa,
                                                                 2, min_support,
                                                                 fmeasure,
                                                                 min_fmeasure)
            phylum_rel_dists, rel_node_dists = self.median_rd_over_phyla(tree, phyla_for_inference)

        print('')
        print('Phyla for RED Inference:')
        print(','.join(phylum_rel_dists))
        phyla_file = os.path.join(output_dir, '%s.phyla.tsv' % input_tree_name)
        fout = open(phyla_file, 'w')
        for p in phylum_rel_dists:
            fout.write(p + '\n')
        fout.close()

        # set edge lengths to median value over all rootings
        tree.seed_node.rel_dist = 0.0
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            n.rel_dist = np_median(rel_node_dists[n.id])
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            if rd_to_parent < 0:
                self.logger.warning('Not all branches are positive after scaling.')
            n.edge_length = rd_to_parent

        for phylum, rel_dists in phylum_rel_dists.items():
            phylum_dir = os.path.join(output_dir, phylum)
            if not os.path.exists(phylum_dir):
                os.makedirs(phylum_dir)

            # restrict to taxa of interest
            if taxa_to_plot:
                for r in rel_dists:
                    for k in set(rel_dists[r].keys()) - set(taxa_to_plot):
                        del rel_dists[r][k]

            # create distribution plot
            distribution_table = os.path.join(phylum_dir, '%s.rank_distribution.tsv' % phylum)
            plot_file = os.path.join(phylum_dir, '%s.rank_distribution.png' % phylum)
            self._distribution_plot(rel_dists, taxa_for_dist_inference,
                                    highlight_polyphyly, highlight_taxa,
                                    distribution_table, fmeasure,
                                    fmeasure_mono, plot_file)

            median_outlier_table = os.path.join(phylum_dir, '%s.median_outlier.tsv' % phylum)
            self._median_outlier_file(rel_dists, taxa_for_dist_inference,
                                      gtdb_parent_ranks, median_outlier_table)

        plot_file = os.path.join(output_dir, '%s.png' % input_tree_name)
        self._distribution_summary_plot(phylum_rel_dists, taxa_for_dist_inference,
                                        highlight_polyphyly, highlight_taxa,
                                        fmeasure, fmeasure_mono, plot_file)

        median_outlier_table = os.path.join(output_dir, '%s.tsv' % input_tree_name)
        median_rank_file = os.path.join(output_dir, '%s.dict' % input_tree_name)
        self._median_summary_outlier_file(phylum_rel_dists, taxa_for_dist_inference,
                                          gtdb_parent_ranks, median_outlier_table,
                                          median_rank_file, verbose_table)

        output_rd_file = os.path.join(output_dir, '%s.node_rd.tsv' % input_tree_name)
        self._write_rd(tree, output_rd_file)

        output_tree = os.path.join(output_dir, '%s.scaled.tree' % input_tree_name)
        tree.write_to_path(output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)
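# For context, the rel_dist values manipulated above follow the relative
# evolutionary divergence (RED) definition of Parks et al. (2018):
# RED(node) = p + (d / u) * (1 - p), where p is the parent's RED, d is the
# branch length to the parent, and u is the mean branch length from the
# parent to all of its descendant leaves; the root is 0 and leaves are 1.
# A compact sketch over a dendropy tree (hypothetical helper, not the
# RelativeDistance implementation used above):
#
# def mean_dist_to_leaves(node):
#     total = count = 0
#     for leaf in node.leaf_iter():
#         dist, nd = 0.0, leaf
#         while nd is not node:
#             dist += nd.edge_length
#             nd = nd.parent_node
#         total += dist
#         count += 1
#     return total / count
#
# def decorate_red(tree):
#     tree.seed_node.red = 0.0
#     for node in tree.preorder_node_iter():
#         if node is tree.seed_node:
#             continue
#         if node.is_leaf():
#             node.red = 1.0
#         else:
#             parent = node.parent_node
#             u = mean_dist_to_leaves(parent)
#             node.red = parent.red + (node.edge_length / u) * (1.0 - parent.red)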
def run(self, input_tree, taxonomy_file, viral, skip_species, gtdb_metadata,
        trusted_taxa_file, min_children, min_support, skip_rd_refine,
        output_tree):
    """Decorate internal nodes with taxa labels based on F-measure."""

    # read GTDB metadata
    rep_placeholder_stems, rep_latin_stems = self.parse_gtdb_metadata(gtdb_metadata)

    # read tree
    self.logger.info('Reading tree.')
    tree = dendropy.Tree.get_from_path(input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    # remove any previous taxon labels
    self.logger.info('Removing any previous internal node labels.')
    self._strip_taxon_labels(tree)

    # read taxonomy and trim to taxa in tree
    self.logger.info('Reading taxonomy.')
    full_taxonomy = Taxonomy().read(taxonomy_file)
    if viral:
        self.logger.info('Translating viral prefixes.')
        full_taxonomy = translate_viral_taxonomy(full_taxonomy)

    taxonomy = {}
    for leaf in tree.leaf_node_iter():
        taxonomy[leaf.taxon.label] = full_taxonomy.get(leaf.taxon.label,
                                                       Taxonomy.rank_prefixes)

    # find best placement for each taxon based on the F-measure statistic
    self.logger.info('Calculating F-measure statistic for each taxon.')
    fmeasure_for_taxa = self._fmeasure(tree, taxonomy, skip_species)

    if not skip_rd_refine:
        # place labels with only one acceptable position and calculate
        # the relative divergence thresholds from these as a guide for
        # placing the remaining labels
        self.logger.info('Placing labels with unambiguous position in tree.')
        placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

        self.logger.info('Establishing median relative divergence for taxonomic ranks.')
        median_rank_rd = self._median_rank_rd(tree,
                                              placed_taxon,
                                              taxonomy,
                                              trusted_taxa_file,
                                              min_children,
                                              min_support)

        # resolve ambiguous positions in tree
        self.logger.info('Resolving ambiguous taxon label placements using median relative divergences.')
        self._resolve_ambiguous_placements(fmeasure_for_taxa, median_rank_rd)
    else:
        # resolve cases where two or more nodes have the same F-measure
        self.resolve_equal_fmeasure(fmeasure_for_taxa,
                                    rep_placeholder_stems,
                                    rep_latin_stems,
                                    output_tree)

    # place all labels on tree
    self.logger.info('Placing labels on tree.')
    placed_taxon = self._assign_taxon_labels(fmeasure_for_taxa)

    # write statistics for placed taxon labels
    self.logger.info('Writing out statistics for taxa.')
    out_table = output_tree + '-table'
    self._write_statistics_table(fmeasure_for_taxa, taxonomy, out_table)

    summary_table = output_tree + '-summary'
    self._write_summary_table(fmeasure_for_taxa, taxonomy, summary_table)

    # output taxonomy of extant taxa on tree
    self.logger.info('Writing out taxonomy for extant taxa.')
    out_taxonomy = output_tree + '-taxonomy'
    self._write_taxonomy(tree, out_taxonomy)

    # output decorated tree
    self.logger.info('Writing out decorated tree.')
    tree.write_to_path(output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    if viral:
        self.logger.info('Translating output files to viral prefixes.')
        rev_translate_output_file(out_table)
        rev_translate_output_file(out_taxonomy)
        rev_translate_output_file(output_tree)