def _producer(self, replicated_num):
    """Infer a tree from one bootstrapped multiple sequence alignment.

    The replicate is skipped when a non-empty bootstrapped alignment, or a
    non-empty FastTree output file, for this replicate number is already
    present in ``self.replicate_dir``.

    Parameters
    ----------
    replicated_num : int
        Unique replicate number.

    Returns
    -------
    bool
        Always True, whether the replicate was computed or skipped.
    """

    def has_content(path):
        # a zero-byte file does not count as an existing result
        return os.path.exists(path) and os.path.getsize(path) > 0

    rep_id = str(replicated_num)
    output_msa = os.path.join(self.replicate_dir,
                              'bootstrap_msa.r_' + rep_id + '.fna')
    output_tree = os.path.join(self.replicate_dir,
                               'bootstrap_tree.r_' + rep_id + '.tree')
    fast_tree_output = os.path.join(self.replicate_dir,
                                    'bootstrap_fasttree.r_' + rep_id + '.out')

    # the alignment is checked first, then the FastTree output
    for previous_result in (output_msa, fast_tree_output):
        if has_content(previous_result):
            self.logger.warning(
                'Skipping {} as it already exists.'.format(previous_result))
            return True

    bootstrap_alignment(self.msa, output_msa, frac=self.frac)

    FastTree(multithreaded=False).run(output_msa,
                                      self.base_type,
                                      self.model,
                                      self.gamma,
                                      output_tree,
                                      fast_tree_output)
    return True
def _producer(self, replicated_num):
    """Infer a tree from one bootstrapped multiple sequence alignment.

    A bootstrapped copy of ``self.msa`` is written to ``self.replicate_dir``
    and handed to the single-threaded FastTree wrapper.

    Parameters
    ----------
    replicated_num : int
        Unique replicate number; used to name the replicate's files.

    Returns
    -------
    bool
        Always True.
    """
    rep_id = str(replicated_num)
    rep_dir = self.replicate_dir

    output_msa = os.path.join(rep_dir, 'bootstrap_msa.r_' + rep_id + '.fna')
    output_tree = os.path.join(rep_dir, 'bootstrap_tree.r_' + rep_id + '.tree')
    fast_tree_output = os.path.join(
        rep_dir, 'bootstrap_fasttree.r_' + rep_id + '.out')

    bootstrap_alignment(self.msa, output_msa, frac=self.frac)

    FastTree(multithreaded=False).run(output_msa,
                                      self.base_type,
                                      self.model,
                                      output_tree,
                                      fast_tree_output)
    return True
def _producer(self, replicated_num):
    """Infer a tree from one jackknifed marker alignment.

    A jackknifed alignment is produced by ``self.jackknife_alignment`` from
    ``self.msa``, ``self.perc_markers_to_keep`` and ``self.marker_lengths``,
    then passed to the single-threaded FastTree wrapper as protein data.

    Parameters
    ----------
    replicated_num : int
        Unique replicate number; used to name the replicate's files.

    Returns
    -------
    bool
        Always True.
    """
    rep_id = str(replicated_num)
    rep_dir = self.replicate_dir

    output_msa = os.path.join(rep_dir, 'jk_markers.msa.' + rep_id + '.faa')
    output_tree = os.path.join(rep_dir, 'jk_markers.tree.' + rep_id + '.tre')
    fast_tree_output = os.path.join(
        rep_dir, 'jk_markers.fasttree.' + rep_id + '.out')

    self.jackknife_alignment(self.msa,
                             self.perc_markers_to_keep,
                             self.marker_lengths,
                             output_msa)

    FastTree(multithreaded=False).run(output_msa,
                                      'prot',
                                      self.model,
                                      output_tree,
                                      fast_tree_output)
    return True
def infer(self, options):
    """Infer tree from MSA.

    Parameters
    ----------
    options : object
        Parsed options. The attributes read here are ``msa_file``,
        ``out_dir``, ``cpus``, ``prot_model``, ``prefix`` and ``suffix``.
    """
    self.logger.warning("Tree inference is still under development!")

    check_file_exists(options.msa_file)
    make_sure_path_exists(options.out_dir)

    # more than one CPU selects the FastTreeMP executable
    use_mp = options.cpus > 1
    check_dependencies(['FastTreeMP'] if use_mp else ['FastTree'])

    self.logger.info('Inferring tree with FastTree using %s+GAMMA.'
                     % options.prot_model)
    fasttree = FastTree(multithreaded=use_mp)

    out_dir = options.out_dir
    # only the tree file name carries the suffix; the log name does not
    tree_unrooted_output = os.path.join(
        out_dir, options.prefix + options.suffix + '.unrooted.tree')
    tree_log = os.path.join(out_dir, options.prefix + '.tree.log')
    tree_output_log = os.path.join(out_dir, 'fasttree.log')

    fasttree.run(options.msa_file,
                 'prot',
                 options.prot_model,
                 tree_unrooted_output,
                 tree_log,
                 tree_output_log)

    self.logger.info('Done.')
def infer_gene_trees(self, msa_dir, output_dir, extension):
    """Infer gene trees.

    Each alignment in `msa_dir` whose name ends with `extension` is first
    rewritten in place with a trailing '*' removed from every sequence
    line. Trees are then inferred for all alignments, and for each tree a
    second copy is written whose leaf labels contain only the genome id.

    Parameters
    ----------
    msa_dir : str
        Directory containing multiple sequence alignment of marker genes.
    output_dir : str
        Directory to store gene trees.
    extension : str
        Extension of multiple sequence alignment files.
    """
    msa_files = []
    for f in os.listdir(msa_dir):
        if not f.endswith(extension):
            continue

        msa_file = os.path.join(msa_dir, f)
        msa_files.append(msa_file)

        with open(msa_file) as fin:
            data = fin.readlines()

        with open(msa_file, 'w') as fout:
            for line in data:
                if not line.startswith('>'):
                    # remove trailing star; lines from readlines() keep
                    # their line terminator, so the star has to be looked
                    # for in front of it
                    seq = line.rstrip('\r\n')
                    if seq.endswith('*'):
                        line = seq[:-1] + line[len(seq):]
                fout.write(line)

    fasttree = FastTree(multithreaded=False)
    fasttree.parallel_run(msa_files, 'prot', 'wag', output_dir, self.cpus)

    # create gene tree without gene ids for visualization in ARB
    for msa_file in msa_files:
        tree_filename = ntpath.basename(msa_file)
        tree_prefix = tree_filename[0:tree_filename.find('.')]

        if tree_prefix.startswith('PF'):
            # patch up output file for Pfam trees: the tree is moved so its
            # name keeps the first two dot-separated fields of the file name
            old_tree_prefix = tree_prefix
            tree_prefix = '.'.join(tree_filename.split('.')[0:2])
            shutil.move(os.path.join(output_dir, old_tree_prefix + '.tree'),
                        os.path.join(output_dir, tree_prefix + '.tree'))

        # the tree for this alignment is expected at <tree_prefix>.tree
        gene_tree_file = os.path.join(output_dir, tree_prefix + '.tree')
        gene_tree = dendropy.Tree.get_from_path(gene_tree_file,
                                                schema='newick',
                                                rooting='force-unrooted',
                                                preserve_underscores=True)

        # rename nodes to contain only genome id
        for node in gene_tree.leaf_nodes():
            genome_id = node.taxon.label.split(
                DefaultValues.SEQ_CONCAT_CHAR)[0]
            node.taxon.label = genome_id

        output_tree_file = os.path.join(output_dir,
                                        tree_prefix + '.genome_ids.tree')
        gene_tree.write_to_path(output_tree_file,
                                schema='newick',
                                suppress_rooting=True,
                                unquoted_underscores=True)
def run(self, input_tree, msa_file, tree_program, prot_model, num_replicates, output_dir):
    """Calculate bootstraps.

    Calculate support for tree using the non-parametric
    bootstrap method of the selected tree program. Nothing is
    done when `tree_program` is neither 'fasttree' nor 'raxml'.

    Parameters
    ----------
    input_tree : str
        Tree requiring bootstrap support values.
    msa_file : str
        Multiple sequence alignment used to infer input tree (fasta format).
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG'].
    num_replicates
        Number of bootstrap replicates to perform; forwarded unchanged.
    output_dir : str
        Directory forwarded to the tree program wrapper for its results.
    """
    if tree_program == 'fasttree':
        self.logger.info(
            'Calculating bootstraps with FastTree under %s+GAMMA.'
            % prot_model)
        FastTree(multithreaded=False).bootstrap(input_tree,
                                                msa_file,
                                                'prot',
                                                prot_model,
                                                num_replicates,
                                                output_dir,
                                                self.cpus)
        return

    if tree_program == 'raxml':
        self.logger.info(
            'Calculating bootstraps with RAxML under PROTGAMMA%s.'
            % prot_model)
        RAxML(cpus=1).bootstrap(input_tree,
                                msa_file,
                                prot_model,
                                num_replicates,
                                output_dir,
                                self.cpus)
def run(self, genome_id_file, marker_id_file, model, output_dir):
    """Infer concatenated genome tree from marker genes.

    Marker gene HMMs are gathered, genes are aligned, the alignments are
    concatenated, and a tree is inferred from the concatenated alignment
    with FastTree. A short report is written to
    ``<output_dir>/infer_workflow.log``.

    Parameters
    ----------
    genome_id_file : str
        File specifying unique ids of genomes to include in tree.
    marker_id_file : str
        File specifying unique ids of marker genes to use for inference.
    model : str ['wag' or 'jtt']
        Model of evolution to use.
    output_dir : str
        Directory to store results.
    """
    time_keeper = TimeKeeper()

    output_alignment_dir = os.path.join(output_dir, 'alignments')
    make_sure_path_exists(output_alignment_dir)

    output_model_dir = os.path.join(output_dir, 'hmm_models')
    make_sure_path_exists(output_model_dir)

    # read directory for each genome
    genome_dirs = read_genome_dir_file(self.genome_dir_file)

    # read genomes within the ingroup
    ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
    genome_ids = ncbi_genome_ids.union(user_genome_ids)
    self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
    self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
    self.logger.info('User genomes: %d' % len(user_genome_ids))

    # get marker genes
    self.logger.info('Reading marker genes.')
    marker_genes = read_marker_id_file(marker_id_file)
    self.logger.info('Read %d marker genes.' % len(marker_genes))

    # gather all single-copy HMMs into a single model file
    hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
    hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
    self.logger.info('Generating marker gene HMM model files.')
    self._fetch_marker_models(marker_genes,
                              hmm_model_out,
                              hmm_info_out,
                              output_model_dir)

    # align gene sequences
    align_markers = AlignMarkers(self.cpus)
    align_markers.run(genome_ids,
                      genome_dirs,
                      marker_genes,
                      True,
                      output_alignment_dir,
                      output_model_dir)

    # create concatenated alignment file
    self.logger.info('Concatenating alignments.')
    concatenated_alignment_file = os.path.join(
        output_dir, 'concatenated_alignment.faa')
    marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
    create_concatenated_alignment(genome_ids,
                                  marker_genes,
                                  output_alignment_dir,
                                  concatenated_alignment_file,
                                  marker_file)

    # create concatenated genome tree
    self.logger.info('Inferring concatenated genome tree.')
    concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
    concatenated_tree_log = os.path.join(output_dir,
                                         'concatenated.tree.log')
    log_file = os.path.join(output_dir, 'fasttree.log')
    fast_tree = FastTree(multithreaded=True)
    fast_tree.run(concatenated_alignment_file,
                  'prot',
                  model,
                  concatenated_tree,
                  concatenated_tree_log,
                  log_file)

    # generate summary report; the context manager closes the file even
    # if one of the writes fails
    report_out = os.path.join(output_dir, 'infer_workflow.log')
    with open(report_out, 'w') as fout:
        fout.write('[infer]\n')
        fout.write('Genome Id file: %s\n' % genome_id_file)
        fout.write('Marker Id file: %s\n' % marker_id_file)
        fout.write('Model of evolution: %s\n' % model)
        fout.write(time_keeper.get_time_stamp())
def run(self, msa_file, tree_program, prot_model, skip_rooting, output_dir):
    """Infer tree.

    Parameters
    ----------
    msa_file : str
        Multiple sequence alignment in fasta format.
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    skip_rooting : bool
        If true, the unrooted tree is returned without midpoint rooting.
    output_dir : str
        Directory to store results.

    Returns
    -------
    str
        Path to the midpoint-rooted tree, or to the unrooted tree when
        rooting is skipped or cannot be performed.

    Raises
    ------
    SystemExit
        If the MSA contains two or fewer sequences.
    """
    num_seqs = sum(1 for _seq_id, _seq in seq_io.read_seq(msa_file))
    if num_seqs <= 2:
        self.logger.error(
            'Insufficient number of sequences in MSA to infer tree.')
        raise SystemExit('Tree inference failed.')

    # strip the final extension; a name without any '.' is used as is
    output_file = ntpath.basename(msa_file)
    dot_index = output_file.rfind('.')
    prefix = output_file[0:dot_index] if dot_index != -1 else output_file

    if tree_program == 'fasttree':
        self.logger.info(
            'Inferring gene tree with FastTree using %s+GAMMA.' % prot_model)
        fasttree = FastTree(multithreaded=(self.cpus > 1))

        tree_unrooted_output = os.path.join(output_dir,
                                            prefix + '.unrooted.tree')
        tree_log = os.path.join(output_dir, prefix + '.tree.log')
        tree_output_log = os.path.join(output_dir, 'fasttree.log')
        fasttree.run(msa_file,
                     'prot',
                     prot_model,
                     tree_unrooted_output,
                     tree_log,
                     tree_output_log)
    elif tree_program == 'raxml':
        self.logger.info(
            'Inferring gene tree with RAxML using PROTGAMMA%s.' % prot_model)

        # create phylip MSA file
        # NOTE(review): the command is run through the shell with the paths
        # unquoted, and its exit status is not checked
        phylip_msa_file = msa_file.replace('.faa', '.phyx')
        cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
        os.system(cmd)

        # run RAxML
        raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
        raxml = RAxML(self.cpus)
        tree_unrooted_output = raxml.run(phylip_msa_file,
                                         prot_model,
                                         raxml_dir)

    if skip_rooting:
        return tree_unrooted_output

    # root tree at midpoint
    seqs = seq_io.read(msa_file)
    if len(seqs) <= 2:
        # nothing to root: hand back the unrooted tree instead of failing
        # on an unbound result
        self.logger.warning(
            'Too few sequences to root tree at midpoint; '
            'returning unrooted tree.')
        return tree_unrooted_output

    self.logger.info('Rooting tree at midpoint.')
    tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)
    tree.reroot_at_midpoint(update_bipartitions=False)

    tree_output = os.path.join(output_dir, prefix + '.rooted.tree')
    tree.write_to_path(tree_output,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    return tree_output
def run(self, gene_dirs, min_per_gene, min_per_bps, tree_program, prot_model, split_chars, output_dir):
    """Infer concatenated gene tree.

    The trimmed alignments of all genes are concatenated per taxon, taxa
    are filtered, a tree is inferred from the concatenated alignment and
    rooted at its midpoint, and the tree is decorated with `t2t`. An ARB
    metadata file is written last.

    Parameters
    ----------
    gene_dirs : list
        GeneTreeTk output directories with information for individual genes.
    min_per_gene : float
        Minimum percentage of genes required to retain taxa.
    min_per_bps : float
        Minimum percentage of base pairs required to retain taxa.
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    split_chars
        Passed to ``self._split_ids`` to split a sequence identifier into
        its taxon id and gene id.
    output_dir : str
        Directory to store results.
    """
    # read MSA files
    concat = defaultdict(lambda: defaultdict(list))
    msa_length = 0
    gene_lengths = {}
    for gene_dir in gene_dirs:
        homologs = os.path.join(gene_dir, 'homologs.trimmed.aligned.faa')

        # length of the last sequence read for this gene; stays 0 for an
        # empty alignment instead of reusing the previous gene's length
        gene_length = 0
        for seq_id, seq in seq_io.read_seq(homologs):
            taxon_id, _gene_id = self._split_ids(seq_id, split_chars)
            if not taxon_id:
                self.logger.error('Failed to split identifier: %s' % seq_id)
                sys.exit(-1)

            concat[taxon_id][gene_dir].append(seq)
            gene_length = len(seq)

        msa_length += gene_length
        gene_lengths[gene_dir] = gene_length

    # filter taxon
    mc_filter = set()
    min_per_gene_filter = set()
    min_per_bps_filter = set()
    for taxon_id in concat:
        # check if multiple copy
        missing = 0
        taxon_msa_len = 0
        for gene_id in gene_dirs:
            if gene_id not in concat[taxon_id]:
                missing += 1
                continue

            if len(concat[taxon_id][gene_id]) > 1:
                mc_filter.add(taxon_id)
                break

            taxon_msa_len += len(concat[taxon_id][gene_id][0])

        # a taxon is counted under one filter only
        if taxon_id not in mc_filter:
            if missing > len(gene_dirs) * (1.0 - float(min_per_gene) / 100.0):
                min_per_gene_filter.add(taxon_id)
            elif taxon_msa_len < msa_length * float(min_per_bps) / 100.0:
                min_per_bps_filter.add(taxon_id)

    min_req_genes = math.ceil(len(gene_dirs) * float(min_per_gene) / 100.0)

    filtered_taxa = mc_filter.union(min_per_gene_filter).union(
        min_per_bps_filter)
    remaining_taxa = set(concat) - filtered_taxa
    self.logger.info('No. genes: %d' % len(gene_dirs))
    self.logger.info('No. taxa across all genes: %d' % len(concat))
    self.logger.info('Total filtered taxa: %d' % len(filtered_taxa))
    self.logger.info(' Due to multi-copy genes: %d' % len(mc_filter))
    self.logger.info(' Due to having <%d of the genes: %d'
                     % (min_req_genes, len(min_per_gene_filter)))
    self.logger.info(' Due to an insufficient number of base pairs: %d'
                     % len(min_per_bps_filter))
    self.logger.info('Remaining taxa: %d' % len(remaining_taxa))
    self.logger.info('Length of concatenated MSA: %d' % msa_length)

    # create the multiple sequences alignment
    msa_file = os.path.join(output_dir, 'concatenated.faa')
    with open(msa_file, 'w') as fout:
        for taxon_id in remaining_taxa:
            taxon_genes = concat[taxon_id]
            # a gene missing for this taxon is padded with gaps
            msa = ''.join(
                taxon_genes[gene_id][0] if gene_id in taxon_genes
                else '-' * gene_lengths[gene_id]
                for gene_id in gene_dirs)

            fout.write('>%s\n' % taxon_id)
            fout.write('%s\n' % msa)

    # read all taxonomy files
    # (assumes taxonomy is the same for taxa across all genes)
    taxonomy = {}
    for gene_dir in gene_dirs:
        taxonomy_file = os.path.join(gene_dir, 'taxonomy.tsv')
        t = Taxonomy().read(taxonomy_file)
        for label, taxa_str in t.items():
            taxon_id, _gene_id = self._split_ids(label, split_chars)
            taxonomy[taxon_id] = taxa_str

    # create taxonomy file for retained taxa
    self.logger.info('Creating taxonomy file for retained taxa.')
    output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    with open(output_taxonomy_file, 'w') as fout:
        for taxon_id in remaining_taxa:
            if taxon_id in taxonomy:  # query genomes will generally be missing
                fout.write('%s\t%s\n'
                           % (taxon_id, ';'.join(taxonomy[taxon_id])))

    # infer tree
    if tree_program == 'fasttree':
        self.logger.info(
            'Inferring gene tree with FastTree using %s+GAMMA.' % prot_model)
        fasttree = FastTree(multithreaded=(self.cpus > 1))

        tree_unrooted_output = os.path.join(output_dir,
                                            'concatenated.unrooted.tree')
        tree_log = os.path.join(output_dir, 'concatenated.tree.log')
        tree_output_log = os.path.join(output_dir, 'fasttree.log')
        fasttree.run(msa_file,
                     'prot',
                     prot_model,
                     tree_unrooted_output,
                     tree_log,
                     tree_output_log)
    elif tree_program == 'raxml':
        self.logger.info(
            'Inferring gene tree with RAxML using PROTGAMMA%s.' % prot_model)

        # create phylip MSA file
        # NOTE(review): the command is run through the shell with the paths
        # unquoted, and its exit status is not checked
        phylip_msa_file = msa_file.replace('.faa', '.phyx')
        cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
        os.system(cmd)

        # run RAxML
        raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
        raxml = RAxML(self.cpus)
        tree_unrooted_output = raxml.run(phylip_msa_file,
                                         prot_model,
                                         raxml_dir)

    # root tree at midpoint
    self.logger.info('Rooting tree at midpoint.')
    tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)
    if len(remaining_taxa) > 2:
        tree.reroot_at_midpoint(update_bipartitions=False)

    # the tree is written under the 'rooted' name even when it was not
    # rerooted above
    tree_output = os.path.join(output_dir, 'concatenated.rooted.tree')
    tree.write_to_path(tree_output,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # create tax2tree consensus map and decorate tree
    t2t_tree = os.path.join(output_dir, 'concatenated.tax2tree.tree')
    cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                              tree_output,
                                              t2t_tree)
    os.system(cmd)

    # setup metadata for ARB file
    src_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(src_dir, 'VERSION')) as version_file:
        version = version_file.read().strip()

    metadata = {
        'genetreetk_version': version,
        'genetreetk_tree_program': tree_program,
        'genetreetk_tree_prot_model': prot_model,
    }

    # create ARB metadata file
    self.logger.info('Creating ARB metadata file.')
    arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
    self.create_arb_metadata(msa_file,
                             taxonomy,
                             metadata,
                             arb_metadata_file)