def _producer(self, replicated_num):
    """Infer a tree from one marker-jackknifed alignment.

    A jackknifed copy of ``self.msa`` is written into ``self.replicate_dir``
    by ``self.jackknife_alignment`` and a tree is then inferred from it with
    a FastTree wrapper created with ``multithreaded=False``.

    Parameters
    ----------
    replicated_num : int
        Unique replicate number; embedded in every output file name.

    Returns
    -------
    bool
        Always True.
    """

    rep_id = str(replicated_num)

    def rep_file(stem, ext):
        # every artefact of a replicate lives in the shared replicate directory
        return os.path.join(self.replicate_dir, stem + rep_id + ext)

    output_msa = rep_file('jk_markers.msa.', '.faa')
    output_tree = rep_file('jk_markers.tree.', '.tre')
    fast_tree_output = rep_file('jk_markers.fasttree.', '.out')

    self.jackknife_alignment(self.msa,
                             self.perc_markers_to_keep,
                             self.marker_lengths,
                             output_msa)

    FastTree(multithreaded=False).run(output_msa,
                                      'prot',
                                      self.model,
                                      output_tree,
                                      fast_tree_output)

    return True
def _producer(self, replicated_num):
    """Infer a tree from one bootstrapped alignment, skipping finished work.

    The replicate is skipped (with a warning) when its bootstrapped MSA or
    its FastTree output file already exists and is non-empty.

    Parameters
    ----------
    replicated_num : int
        Unique replicate number; embedded in every output file name.

    Returns
    -------
    bool
        Always True, whether the replicate was computed or skipped.
    """

    rep_id = str(replicated_num)

    def already_present(path):
        # a non-empty file left by an earlier run marks the replicate as done
        if os.path.exists(path) and os.path.getsize(path) > 0:
            self.logger.warning(
                'Skipping {} as it already exists.'.format(path))
            return True
        return False

    # NOTE(review): the resume test keys off the MSA and the FastTree output
    # file rather than the finished tree, so a replicate interrupted after its
    # MSA was written looks complete on the next run -- confirm this is
    # intended.
    output_msa = os.path.join(self.replicate_dir,
                              'bootstrap_msa.r_' + rep_id + '.fna')
    if already_present(output_msa):
        return True

    output_tree = os.path.join(self.replicate_dir,
                               'bootstrap_tree.r_' + rep_id + '.tree')
    fast_tree_output = os.path.join(self.replicate_dir,
                                    'bootstrap_fasttree.r_' + rep_id + '.out')
    if already_present(fast_tree_output):
        return True

    bootstrap_alignment(self.msa, output_msa, frac=self.frac)

    FastTree(multithreaded=False).run(output_msa,
                                      self.base_type,
                                      self.model,
                                      self.gamma,
                                      output_tree,
                                      fast_tree_output)

    return True
def infer(self, options):
    """Infer an unrooted tree from a multiple sequence alignment with FastTree.

    Parameters
    ----------
    options : object
        Parsed options; the attributes read here are ``msa_file``,
        ``out_dir``, ``cpus``, ``prot_model``, ``prefix`` and ``suffix``.
    """

    self.logger.warning("Tree inference is still under development!")

    check_file_exists(options.msa_file)
    make_sure_path_exists(options.out_dir)

    # more than one CPU selects the multithreaded FastTree binary
    use_multithreaded = options.cpus > 1
    if use_multithreaded:
        required_program = 'FastTreeMP'
    else:
        required_program = 'FastTree'
    check_dependencies([required_program])

    self.logger.info('Inferring tree with FastTree using %s+GAMMA.'
                     % options.prot_model)
    fasttree = FastTree(multithreaded=use_multithreaded)

    out_dir = options.out_dir
    tree_unrooted_output = os.path.join(
        out_dir, options.prefix + options.suffix + '.unrooted.tree')
    tree_log = os.path.join(out_dir, options.prefix + '.tree.log')
    tree_output_log = os.path.join(out_dir, 'fasttree.log')

    fasttree.run(options.msa_file,
                 'prot',
                 options.prot_model,
                 tree_unrooted_output,
                 tree_log,
                 tree_output_log)

    self.logger.info('Done.')
def _producer(self, replicated_num):
    """Infer a tree from one bootstrapped multiple sequence alignment.

    Parameters
    ----------
    replicated_num : int
        Unique replicate number; embedded in every output file name.

    Returns
    -------
    bool
        Always True.
    """

    rep_id = str(replicated_num)
    rep_dir = self.replicate_dir

    # resample the alignment for this replicate
    output_msa = os.path.join(rep_dir, 'bootstrap_msa.r_' + rep_id + '.fna')
    bootstrap_alignment(self.msa, output_msa, frac=self.frac)

    # infer the replicate tree with a non-multithreaded FastTree wrapper
    output_tree = os.path.join(rep_dir, 'bootstrap_tree.r_' + rep_id + '.tree')
    fast_tree_output = os.path.join(
        rep_dir, 'bootstrap_fasttree.r_' + rep_id + '.out')
    FastTree(multithreaded=False).run(output_msa,
                                      self.base_type,
                                      self.model,
                                      output_tree,
                                      fast_tree_output)

    return True
def infer_gene_trees(self, msa_dir, output_dir, extension):
    """Infer gene trees.

    Each alignment in `msa_dir` ending with `extension` is first rewritten
    in place without a terminal '*' on its sequence lines, a tree is then
    inferred for every alignment with FastTree ('prot', 'wag'), and finally a
    copy of each tree whose leaves are labelled by genome id only is written
    as ``<prefix>.genome_ids.tree``.

    Parameters
    ----------
    msa_dir : str
        Directory containing multiple sequence alignment of marker genes.
    output_dir : str
        Directory to store gene trees.
    extension : str
        Extension of multiple sequence alignment files.
    """

    msa_files = []
    for f in os.listdir(msa_dir):
        if not f.endswith(extension):
            continue

        msa_file = os.path.join(msa_dir, f)
        msa_files.append(msa_file)

        with open(msa_file) as fin:
            data = fin.readlines()

        with open(msa_file, 'w') as fout:
            for line in data:
                # The star has to be looked for *before* the line terminator;
                # testing the last character of the raw line only ever
                # matched a final line that lacked a newline.
                content = line.rstrip('\r\n')
                if not content.startswith('>') and content.endswith('*'):
                    # drop the star, keep the original line terminator
                    line = content[:-1] + line[len(content):]
                fout.write(line)

    fasttree = FastTree(multithreaded=False)
    fasttree.parallel_run(msa_files, 'prot', 'wag', output_dir, self.cpus)

    # create gene tree without gene ids for visualization in ARB
    for msa_file in msa_files:
        tree_filename = ntpath.basename(msa_file)
        tree_prefix = tree_filename[0:tree_filename.find('.')]
        if tree_prefix.startswith('PF'):
            # patch up output file for Pfam trees: the tree is renamed so
            # that its prefix keeps the first two dot-separated fields of the
            # alignment file name
            old_tree_prefix = tree_prefix
            tree_prefix = '.'.join(tree_filename.split('.')[0:2])
            shutil.move(os.path.join(output_dir, old_tree_prefix + '.tree'),
                        os.path.join(output_dir, tree_prefix + '.tree'))

        gene_tree_file = os.path.join(output_dir, tree_prefix + '.tree')
        gene_tree = dendropy.Tree.get_from_path(gene_tree_file,
                                                schema='newick',
                                                rooting='force-unrooted',
                                                preserve_underscores=True)

        # rename nodes to contain only genome id
        for node in gene_tree.leaf_nodes():
            genome_id = node.taxon.label.split(
                DefaultValues.SEQ_CONCAT_CHAR)[0]
            node.taxon.label = genome_id

        output_tree_file = os.path.join(output_dir,
                                        tree_prefix + '.genome_ids.tree')
        gene_tree.write_to_path(output_tree_file,
                                schema='newick',
                                suppress_rooting=True,
                                unquoted_underscores=True)
def _producer(self, replicated_num):
    """Infer a tree from one bootstrapped multiple sequence alignment.

    Same flow as a plain bootstrap replicate, with ``self.gamma`` also passed
    through to the FastTree wrapper.

    Parameters
    ----------
    replicated_num : int
        Unique replicate number; embedded in every output file name.

    Returns
    -------
    bool
        Always True.
    """

    rep_id = str(replicated_num)

    def in_replicate_dir(file_name):
        return os.path.join(self.replicate_dir, file_name)

    output_msa = in_replicate_dir('bootstrap_msa.r_' + rep_id + '.fna')
    bootstrap_alignment(self.msa, output_msa, frac=self.frac)

    output_tree = in_replicate_dir('bootstrap_tree.r_' + rep_id + '.tree')
    fast_tree_output = in_replicate_dir(
        'bootstrap_fasttree.r_' + rep_id + '.out')

    tree_builder = FastTree(multithreaded=False)
    tree_builder.run(output_msa,
                     self.base_type,
                     self.model,
                     self.gamma,
                     output_tree,
                     fast_tree_output)

    return True
def _producer(self, replicated_num):
    """Infer a tree from one taxa-jackknifed alignment.

    ``self.jackknife_taxa`` writes the subsampled alignment for this
    replicate; a tree is then inferred from it with a FastTree wrapper
    created with ``multithreaded=False``.

    Parameters
    ----------
    replicated_num : int
        Unique replicate number; embedded in every output file name.

    Returns
    -------
    bool
        Always True.
    """

    rep_id = str(replicated_num)
    rep_dir = self.replicate_dir

    output_msa = os.path.join(rep_dir, 'jk_taxa.msa.' + rep_id + '.fna')
    output_tree = os.path.join(rep_dir, 'jk_taxa.tree.' + rep_id + '.tre')
    fast_tree_output = os.path.join(
        rep_dir, 'jk_taxa.fasttree.' + rep_id + '.out')

    self.jackknife_taxa(self.msa,
                        self.perc_taxa_to_keep,
                        self.outgroup_ids,
                        output_msa)

    tree_builder = FastTree(multithreaded=False)
    tree_builder.run(output_msa,
                     'prot',
                     self.model,
                     output_tree,
                     fast_tree_output)

    return True
def run(self, input_tree, msa_file, tree_program, prot_model, num_replicates, output_dir):
    """Calculate non-parametric bootstrap support for a tree.

    The work is delegated to the ``bootstrap`` method of the wrapper for the
    selected tree program. Any other value of `tree_program` falls through
    without doing anything.

    Parameters
    ----------
    input_tree : str
        Tree requiring bootstrap support values.
    msa_file : str
        Multiple sequence alignment used to infer input tree (fasta format).
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG'].
    num_replicates : int
        Number of bootstrap replicates to perform (passed through unchanged;
        presumably an integer count -- TODO confirm).
    output_dir : str
        Directory handed to the tree program wrapper for its results.
    """

    if tree_program == 'fasttree':
        self.logger.info(
            'Calculating bootstraps with FastTree under %s+GAMMA.' % prot_model)
        FastTree(multithreaded=False).bootstrap(input_tree,
                                                msa_file,
                                                'prot',
                                                prot_model,
                                                num_replicates,
                                                output_dir,
                                                self.cpus)
        return

    if tree_program == 'raxml':
        self.logger.info(
            'Calculating bootstraps with RAxML under PROTGAMMA%s.' % prot_model)
        RAxML(cpus=1).bootstrap(input_tree,
                                msa_file,
                                prot_model,
                                num_replicates,
                                output_dir,
                                self.cpus)
def run(self, genome_id_file, marker_id_file, model, output_dir):
    """Infer a concatenated genome tree from marker genes.

    Marker HMMs are gathered, marker genes are aligned for every requested
    genome, the alignments are concatenated, a tree is inferred with
    multithreaded FastTree and a short summary report is written to
    ``infer_workflow.log`` in `output_dir`.

    Parameters
    ----------
    genome_id_file : str
        File specifying unique ids of genomes to include in tree.
    marker_id_file : str
        File specifying unique ids of marker genes to use for inference.
    model : str ['wag' or 'jtt']
        Model of evolution to use.
    output_dir : str
        Directory to store results.
    """

    time_keeper = TimeKeeper()

    output_alignment_dir = os.path.join(output_dir, 'alignments')
    make_sure_path_exists(output_alignment_dir)

    output_model_dir = os.path.join(output_dir, 'hmm_models')
    make_sure_path_exists(output_model_dir)

    # read directory for each genome
    genome_dirs = read_genome_dir_file(self.genome_dir_file)

    # read genomes within the ingroup
    ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
    genome_ids = ncbi_genome_ids.union(user_genome_ids)
    self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
    self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
    self.logger.info('User genomes: %d' % len(user_genome_ids))

    # get marker genes
    self.logger.info('Reading marker genes.')
    marker_genes = read_marker_id_file(marker_id_file)
    self.logger.info('Read %d marker genes.' % len(marker_genes))

    # gather all single-copy HMMs into a single model file
    hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
    hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
    self.logger.info('Generating marker gene HMM model files.')
    self._fetch_marker_models(marker_genes,
                              hmm_model_out,
                              hmm_info_out,
                              output_model_dir)

    # align gene sequences
    align_markers = AlignMarkers(self.cpus)
    align_markers.run(genome_ids,
                      genome_dirs,
                      marker_genes,
                      True,
                      output_alignment_dir,
                      output_model_dir)

    # create concatenated alignment file
    self.logger.info('Concatenating alignments.')
    concatenated_alignment_file = os.path.join(
        output_dir, 'concatenated_alignment.faa')
    marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
    create_concatenated_alignment(genome_ids,
                                  marker_genes,
                                  output_alignment_dir,
                                  concatenated_alignment_file,
                                  marker_file)

    # create concatenated genome tree
    self.logger.info('Inferring concatenated genome tree.')
    concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
    concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
    log_file = os.path.join(output_dir, 'fasttree.log')
    fast_tree = FastTree(multithreaded=True)
    fast_tree.run(concatenated_alignment_file,
                  'prot',
                  model,
                  concatenated_tree,
                  concatenated_tree_log,
                  log_file)

    # generate summary report; the context manager closes the file even if
    # one of the writes fails
    report_out = os.path.join(output_dir, 'infer_workflow.log')
    with open(report_out, 'w') as fout:
        fout.write('[infer]\n')
        fout.write('Genome Id file: %s\n' % genome_id_file)
        fout.write('Marker Id file: %s\n' % marker_id_file)
        fout.write('Model of evolution: %s\n' % model)
        fout.write(time_keeper.get_time_stamp())
def run(self, msa_file, tree_program, prot_model, skip_rooting, output_dir):
    """Infer tree.

    Parameters
    ----------
    msa_file : str
        Multiple sequence alignment in fasta format.
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    skip_rooting : bool
        If true, the unrooted tree is returned without midpoint rooting.
    output_dir : str
        Directory to store results.

    Returns
    -------
    str
        Path to the midpoint-rooted tree, or to the unrooted tree when
        rooting is skipped or not possible.

    Raises
    ------
    SystemExit
        If the MSA holds two or fewer sequences, or `tree_program` is not
        one of the supported programs.
    """

    num_seqs = sum(1 for _ in seq_io.read_seq(msa_file))
    if num_seqs <= 2:
        self.logger.error(
            'Insufficient number of sequences in MSA to infer tree.')
        raise SystemExit('Tree inference failed.')

    output_file = ntpath.basename(msa_file)
    prefix = output_file[0:output_file.rfind('.')]

    if tree_program == 'fasttree':
        self.logger.info(
            'Inferring gene tree with FastTree using %s+GAMMA.' % prot_model)
        fasttree = FastTree(multithreaded=(self.cpus > 1))

        tree_unrooted_output = os.path.join(output_dir,
                                            prefix + '.unrooted.tree')
        tree_log = os.path.join(output_dir, prefix + '.tree.log')
        tree_output_log = os.path.join(output_dir, 'fasttree.log')
        fasttree.run(msa_file,
                     'prot',
                     prot_model,
                     tree_unrooted_output,
                     tree_log,
                     tree_output_log)
    elif tree_program == 'raxml':
        self.logger.info(
            'Inferring gene tree with RAxML using PROTGAMMA%s.' % prot_model)

        # create phylip MSA file
        # NOTE(review): if msa_file does not contain '.faa' the phylip path
        # equals the input path; the command is also built as a shell string,
        # so paths with spaces or shell metacharacters will break it.
        phylip_msa_file = msa_file.replace('.faa', '.phyx')
        cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
        os.system(cmd)

        # run RAxML
        raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
        raxml = RAxML(self.cpus)
        tree_unrooted_output = raxml.run(phylip_msa_file,
                                         prot_model,
                                         raxml_dir)
    else:
        # fail with a clear message instead of an unbound-variable error
        self.logger.error('Unrecognized tree program: %s' % tree_program)
        raise SystemExit('Tree inference failed.')

    # Default to the unrooted tree so a path is always returned, including
    # when the MSA turns out to hold too few entries for midpoint rooting.
    tree_output = tree_unrooted_output

    # root tree at midpoint
    if not skip_rooting:
        seqs = seq_io.read(msa_file)
        if len(seqs) > 2:
            self.logger.info('Rooting tree at midpoint.')
            tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                               schema='newick',
                                               rooting="force-rooted",
                                               preserve_underscores=True)
            tree.reroot_at_midpoint(update_bipartitions=False)

            tree_output = os.path.join(output_dir, prefix + '.rooted.tree')
            tree.write_to_path(tree_output,
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

    return tree_output
def run(self, gene_dirs, min_per_gene, min_per_bps, tree_program, prot_model, split_chars, output_dir):
    """Infer concatenated gene tree.

    Reads the trimmed alignment of every gene, drops taxa that are
    multi-copy for any gene or that fall below the gene / base pair
    thresholds, writes the concatenated alignment and a taxonomy file for
    the retained taxa, infers and midpoint-roots a tree, decorates it with
    tax2tree and writes an ARB metadata file.

    Parameters
    ----------
    gene_dirs : list
        GeneTreeTk output directories with information for individual genes.
    min_per_gene : float
        Minimum percentage of genes required to retain taxa.
    min_per_bps : float
        Minimum percentage of base pairs required to retain taxa.
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    split_chars : object
        Passed unchanged to ``self._split_ids`` to split a sequence
        identifier into taxon and gene identifiers.
    output_dir : str
        Directory to store results.
    """

    # read MSA files
    concat = defaultdict(lambda: defaultdict(list))
    msa_length = 0
    gene_lengths = {}
    for gene_dir in gene_dirs:
        homologs = os.path.join(gene_dir, 'homologs.trimmed.aligned.faa')

        # Alignment length of this gene, taken from its last sequence; stays
        # 0 for an empty file instead of reusing a stale or undefined value.
        gene_len = 0
        for seq_id, seq in seq_io.read_seq(homologs):
            taxon_id, _ = self._split_ids(seq_id, split_chars)
            if not taxon_id:
                self.logger.error('Failed to split identifier: %s' % seq_id)
                sys.exit(-1)

            concat[taxon_id][gene_dir].append(seq)
            gene_len = len(seq)

        msa_length += gene_len
        gene_lengths[gene_dir] = gene_len

    # filter taxon
    mc_filter = set()
    min_per_gene_filter = set()
    min_per_bps_filter = set()
    for taxon_id in concat:
        taxon_genes = concat[taxon_id]

        # check if multiple copy
        missing = 0
        taxon_msa_len = 0
        for gene_dir in gene_dirs:
            if gene_dir not in taxon_genes:
                missing += 1
                continue

            if len(taxon_genes[gene_dir]) > 1:
                mc_filter.add(taxon_id)
                break

            taxon_msa_len += len(taxon_genes[gene_dir][0])

        if taxon_id not in mc_filter:
            if missing > len(gene_dirs) * (1.0 - float(min_per_gene) / 100.0):
                min_per_gene_filter.add(taxon_id)
            elif taxon_msa_len < msa_length * float(min_per_bps) / 100.0:
                min_per_bps_filter.add(taxon_id)

    min_req_genes = math.ceil(len(gene_dirs) * float(min_per_gene) / 100.0)

    filtered_taxa = mc_filter.union(min_per_gene_filter).union(
        min_per_bps_filter)
    remaining_taxa = set(concat) - filtered_taxa
    self.logger.info('No. genes: %d' % len(gene_dirs))
    self.logger.info('No. taxa across all genes: %d' % len(concat))
    self.logger.info('Total filtered taxa: %d' % len(filtered_taxa))
    self.logger.info(' Due to multi-copy genes: %d' % len(mc_filter))
    self.logger.info(' Due to having <%d of the genes: %d'
                     % (min_req_genes, len(min_per_gene_filter)))
    self.logger.info(' Due to an insufficient number of base pairs: %d'
                     % len(min_per_bps_filter))
    self.logger.info('Remaining taxa: %d' % len(remaining_taxa))
    self.logger.info('Length of concatenated MSA: %d' % msa_length)

    # create the multiple sequences alignment
    msa_file = os.path.join(output_dir, 'concatenated.faa')
    with open(msa_file, 'w') as fout:
        for taxon_id in remaining_taxa:
            taxon_genes = concat[taxon_id]

            # genes missing from a taxon are padded with gaps
            segments = []
            for gene_dir in gene_dirs:
                if gene_dir not in taxon_genes:
                    segments.append('-' * gene_lengths[gene_dir])
                else:
                    segments.append(taxon_genes[gene_dir][0])

            fout.write('>%s\n' % taxon_id)
            fout.write('%s\n' % ''.join(segments))

    # read all taxonomy files
    # (assumes taxonomy is the same for taxa across all genes)
    taxonomy = {}
    for gene_dir in gene_dirs:
        taxonomy_file = os.path.join(gene_dir, 'taxonomy.tsv')
        t = Taxonomy().read(taxonomy_file)
        # items() works on both Python 2 and 3, unlike iteritems()
        for label, taxa_str in t.items():
            taxon_id, _ = self._split_ids(label, split_chars)
            taxonomy[taxon_id] = taxa_str

    # create taxonomy file for retained taxa
    self.logger.info('Creating taxonomy file for retained taxa.')
    output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    with open(output_taxonomy_file, 'w') as fout:
        for taxon_id in remaining_taxa:
            if taxon_id in taxonomy:  # query genomes will generally be missing
                fout.write('%s\t%s\n'
                           % (taxon_id, ';'.join(taxonomy[taxon_id])))

    # infer tree
    if tree_program == 'fasttree':
        self.logger.info(
            'Inferring gene tree with FastTree using %s+GAMMA.' % prot_model)
        fasttree = FastTree(multithreaded=(self.cpus > 1))

        tree_unrooted_output = os.path.join(output_dir,
                                            'concatenated.unrooted.tree')
        tree_log = os.path.join(output_dir, 'concatenated.tree.log')
        tree_output_log = os.path.join(output_dir, 'fasttree.log')
        fasttree.run(msa_file,
                     'prot',
                     prot_model,
                     tree_unrooted_output,
                     tree_log,
                     tree_output_log)
    elif tree_program == 'raxml':
        self.logger.info(
            'Inferring gene tree with RAxML using PROTGAMMA%s.' % prot_model)

        # create phylip MSA file
        # NOTE(review): commands here are built as shell strings; paths with
        # spaces or shell metacharacters will break them.
        phylip_msa_file = msa_file.replace('.faa', '.phyx')
        cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
        os.system(cmd)

        # run RAxML
        raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
        raxml = RAxML(self.cpus)
        tree_unrooted_output = raxml.run(phylip_msa_file,
                                         prot_model,
                                         raxml_dir)
    else:
        # fail with a clear message instead of an unbound-variable error
        self.logger.error('Unrecognized tree program: %s' % tree_program)
        sys.exit(-1)

    # root tree at midpoint
    self.logger.info('Rooting tree at midpoint.')
    tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)
    if len(remaining_taxa) > 2:
        tree.reroot_at_midpoint(update_bipartitions=False)
    tree_output = os.path.join(output_dir, 'concatenated.rooted.tree')
    tree.write_to_path(tree_output,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # create tax2tree consensus map and decorate tree
    t2t_tree = os.path.join(output_dir, 'concatenated.tax2tree.tree')
    cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                              tree_output,
                                              t2t_tree)
    os.system(cmd)

    # setup metadata for ARB file
    src_dir = os.path.dirname(os.path.realpath(__file__))
    metadata = {}
    with open(os.path.join(src_dir, 'VERSION')) as version_file:
        metadata['genetreetk_version'] = version_file.read().strip()
    metadata['genetreetk_tree_program'] = tree_program
    metadata['genetreetk_tree_prot_model'] = prot_model

    # create ARB metadata file
    self.logger.info('Creating ARB metadata file.')
    arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
    self.create_arb_metadata(msa_file, taxonomy, metadata, arb_metadata_file)
def run(self, genome_id_file, marker_id_file, model, output_dir):
    """Infer a concatenated genome tree from marker genes.

    Marker HMMs are gathered, marker genes are aligned for every requested
    genome, the alignments are concatenated, a tree is inferred with
    multithreaded FastTree and a short summary report is written to
    ``infer_workflow.log`` in `output_dir`.

    Parameters
    ----------
    genome_id_file : str
        File specifying unique ids of genomes to include in tree.
    marker_id_file : str
        File specifying unique ids of marker genes to use for inference.
    model : str ['wag' or 'jtt']
        Model of evolution to use.
    output_dir : str
        Directory to store results.
    """

    time_keeper = TimeKeeper()

    output_alignment_dir = os.path.join(output_dir, 'alignments')
    make_sure_path_exists(output_alignment_dir)

    output_model_dir = os.path.join(output_dir, 'hmm_models')
    make_sure_path_exists(output_model_dir)

    # read directory for each genome
    genome_dirs = read_genome_dir_file(self.genome_dir_file)

    # read genomes within the ingroup
    ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
    genome_ids = ncbi_genome_ids.union(user_genome_ids)
    self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
    self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
    self.logger.info('User genomes: %d' % len(user_genome_ids))

    # get marker genes
    self.logger.info('Reading marker genes.')
    marker_genes = read_marker_id_file(marker_id_file)
    self.logger.info('Read %d marker genes.' % len(marker_genes))

    # gather all single-copy HMMs into a single model file
    hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
    hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
    self.logger.info('Generating marker gene HMM model files.')
    self._fetch_marker_models(marker_genes,
                              hmm_model_out,
                              hmm_info_out,
                              output_model_dir)

    # align gene sequences
    align_markers = AlignMarkers(self.cpus)
    align_markers.run(genome_ids,
                      genome_dirs,
                      marker_genes,
                      True,
                      output_alignment_dir,
                      output_model_dir)

    # create concatenated alignment file
    self.logger.info('Concatenating alignments.')
    concatenated_alignment_file = os.path.join(
        output_dir, 'concatenated_alignment.faa')
    marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
    create_concatenated_alignment(genome_ids,
                                  marker_genes,
                                  output_alignment_dir,
                                  concatenated_alignment_file,
                                  marker_file)

    # create concatenated genome tree
    self.logger.info('Inferring concatenated genome tree.')
    concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
    concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
    log_file = os.path.join(output_dir, 'fasttree.log')
    fast_tree = FastTree(multithreaded=True)
    fast_tree.run(concatenated_alignment_file,
                  'prot',
                  model,
                  concatenated_tree,
                  concatenated_tree_log,
                  log_file)

    # generate summary report; the context manager closes the file even if
    # one of the writes fails
    report_out = os.path.join(output_dir, 'infer_workflow.log')
    with open(report_out, 'w') as fout:
        fout.write('[infer]\n')
        fout.write('Genome Id file: %s\n' % genome_id_file)
        fout.write('Marker Id file: %s\n' % marker_id_file)
        fout.write('Model of evolution: %s\n' % model)
        fout.write(time_keeper.get_time_stamp())
def run(self, genome_ids, marker_genes, hmm_model_file, min_support, min_per_taxa, perc_markers_to_jackknife, gene_tree_dir, alignment_dir, output_dir):
    """Score gene trees against well-supported splits of a jackknifed genome tree.

    A concatenated genome tree is inferred, marker-jackknife support is
    calculated for it, and every gene tree is then compared against the
    well-supported major splits of that tree.

    Parameters
    ----------
    genome_ids : iterable
        Genomes of interest.
    marker_genes : iterable
        Unique ids of marker genes.
    hmm_model_file : str
        File containing HMMs for each marker gene (not referenced in this
        method).
    min_support : float
        Minimum jackknife support of splits to use during LGT filtering [0, 1].
    min_per_taxa : float
        Minimum percentage of taxa required to consider a split during LGT
        filtering [0, 1].
    perc_markers_to_jackknife : float
        Value handed to the marker jackknife as the percentage to keep [0, 1].
    gene_tree_dir : str
        Directory containing gene trees.
    alignment_dir : str
        Directory containing multiple sequence alignments.
    output_dir : str
        Output directory; results go into its 'jackknife_markers' child.

    Returns
    -------
    tuple
        ``(distances, num_internal_nodes, num_major_splits,
        well_supported_major_splits)`` where ``distances`` maps each marker
        gene id to ``(perc_recovered_splits, perc_comp_splits,
        norm_comp_edge_length, manhattan, euclidean)``.
    """

    def genome_id_of(leaf):
        # leaf labels carry the genome id as their first concatenated field
        return leaf.taxon.label.split(DefaultValues.SEQ_CONCAT_CHAR)[0]

    output_dir = os.path.join(output_dir, 'jackknife_markers')
    make_sure_path_exists(output_dir)

    # create concatenated alignment file
    self.logger.info('Concatenating alignments.')
    concatenated_alignment_file = os.path.join(output_dir,
                                               'concatenated_alignment.faa')
    marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
    create_concatenated_alignment(genome_ids,
                                  marker_genes,
                                  alignment_dir,
                                  concatenated_alignment_file,
                                  marker_file)

    # create concatenated genome tree
    self.logger.info('Inferring concatenated genome tree.')
    concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
    concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
    log_file = os.path.join(output_dir, 'concatenated.fasttree.log')
    FastTree(multithreaded=True).run(concatenated_alignment_file,
                                     'prot',
                                     'wag',
                                     concatenated_tree,
                                     concatenated_tree_log,
                                     log_file)

    # calculate jackknife support values
    self.logger.info('Calculating jackknife marker support values.')
    jackknife_tree = JackknifeMarkers(self.cpus).run(
        concatenated_tree,
        concatenated_alignment_file,
        marker_file,
        perc_markers_to_jackknife,
        100,
        'wag',
        output_dir)

    # identify well-support, internal splits
    self.logger.info('Identifying well-support, internal splits.')
    tree = dendropy.Tree.get_from_path(jackknife_tree,
                                       schema='newick',
                                       rooting='force-unrooted',
                                       preserve_underscores=True)
    num_leaves = len(tree.leaf_nodes())

    num_internal_nodes = 0
    num_major_splits = 0
    well_supported_major_splits = 0
    splits = []
    for node in tree.internal_nodes():
        num_internal_nodes += 1

        # a major split has enough taxa on both of its sides
        clade_size = len(node.leaf_nodes())
        smaller_side = min(clade_size, num_leaves - clade_size)
        if not (smaller_side >= max(min_per_taxa * num_leaves, 2)):
            continue
        num_major_splits += 1

        # node labels are read as integer support values
        if not (int(node.label) > (min_support * 100.0)):
            continue
        well_supported_major_splits += 1

        clade_labels = {x.taxon.label for x in node.leaf_nodes()}
        splits.append((clade_labels, node.edge_length))

    self.logger.info('# internal nodes: %d' % num_internal_nodes)
    self.logger.info('# major splits: %d' % num_major_splits)
    self.logger.info('# well-supported, major splits: %d'
                     % well_supported_major_splits)

    # filter gene trees that do not recover well-support, internal splits
    self.logger.info('Filtering gene trees.')
    distances = {}
    for num_done, mg in enumerate(sorted(marker_genes), 1):
        num_markers = len(marker_genes)
        sys.stdout.write('==> Processed %d of %d (%.2f) gene trees.\r'
                         % (num_done,
                            num_markers,
                            num_done * 100.0 / num_markers))
        sys.stdout.flush()

        # read gene tree
        gene_tree_file = os.path.join(gene_tree_dir, mg + '.tree')
        gene_tree = dendropy.Tree.get_from_path(gene_tree_file,
                                                schema='newick',
                                                rooting='force-unrooted',
                                                preserve_underscores=True)

        # prune gene tree so each genome is present exactly once
        processed_genome_ids = set()
        taxa_to_prune = []
        for leaf in gene_tree.leaf_nodes():
            genome_id = genome_id_of(leaf)
            if genome_id in processed_genome_ids or genome_id not in genome_ids:
                taxa_to_prune.append(leaf.taxon)
            processed_genome_ids.add(genome_id)

        gene_tree.prune_taxa(taxa_to_prune)

        # rename nodes to contain only genome id
        gene_tree_taxa_set = set()
        for leaf in gene_tree.leaf_nodes():
            genome_id = genome_id_of(leaf)
            leaf.taxon.label = genome_id
            gene_tree_taxa_set.add(genome_id)

        # re-encode the split system over the new taxon namespace
        gene_tree.migrate_taxon_namespace(
            dendropy.TaxonNamespace(gene_tree_taxa_set))
        gene_tree.encode_bipartitions()
        split_bitmasks = {b.split_bitmask
                          for b in gene_tree.bipartition_encoding}

        # determine number of splits recovered by or compatible with this
        # gene tree
        recovered_splits = 0
        compatible_splits = 0
        compatible_edge_length = 0
        for split, edge_length in splits:
            common_taxa_labels = split.intersection(gene_tree_taxa_set)
            namespace = gene_tree.taxon_namespace
            common_split = namespace.taxa_bitmask(labels=common_taxa_labels)
            normalized_split = dendropy.Bipartition.normalize_bitmask(
                bitmask=common_split,
                fill_bitmask=namespace.all_taxa_bitmask(),
                lowest_relevant_bit=1)

            if normalized_split in split_bitmasks:
                recovered_splits += 1

            bipartition = dendropy.Bipartition(bitmask=normalized_split,
                                               is_rooted=False)
            if gene_tree.is_compatible_with_bipartition(bipartition):
                compatible_splits += 1
                compatible_edge_length += edge_length

        # NOTE(review): these divisions fail when no well-supported split was
        # found (empty `splits`) -- confirm callers guarantee at least one.
        num_splits = len(splits)
        perc_recovered_splits = recovered_splits * 100.0 / num_splits
        perc_comp_splits = compatible_splits * 100.0 / num_splits
        total_split_edge_length = sum(s[1] for s in splits)
        norm_comp_edge_length = (float(compatible_edge_length)
                                 / total_split_edge_length)

        # calculate weighted Robinson-Foulds (Manhattan) and Felsenstein's
        # Euclidean distances to the concatenated genome tree
        pruned_tree = tree.clone(depth=2)
        pruned_tree.retain_taxa_with_labels(gene_tree.taxon_namespace.labels())
        pruned_tree.migrate_taxon_namespace(gene_tree.taxon_namespace)
        pruned_tree.encode_bipartitions()

        # both trees are scaled to unit total length before comparison
        pruned_tree_edge_len = sum(e.length
                                   for e in pruned_tree.edges() if e.length)
        gene_tree_edge_len = sum(e.length
                                 for e in gene_tree.edges() if e.length)
        pruned_tree.scale_edges(1.0 / pruned_tree_edge_len)
        gene_tree.scale_edges(1.0 / gene_tree_edge_len)

        treecompare = dendropy.calculate.treecompare
        manhattan = treecompare.weighted_robinson_foulds_distance(pruned_tree,
                                                                  gene_tree)
        euclidean = treecompare.euclidean_distance(pruned_tree, gene_tree)

        distances[mg] = (perc_recovered_splits,
                         perc_comp_splits,
                         norm_comp_edge_length,
                         manhattan,
                         euclidean)

    return distances, num_internal_nodes, num_major_splits, well_supported_major_splits