def identify_marker_genes(self, ingroup_file,
                          ubiquity_threshold,
                          single_copy_threshold,
                          redundancy,
                          valid_marker_genes,
                          output_msa_dir,
                          output_model_dir):
    """Select ubiquitous, single-copy marker genes for an ingroup.

    The candidate markers returned by `_marker_genes` are optionally
    restricted to `valid_marker_genes`, then stripped of the HMMs
    reported as redundant by `_identify_redundant_hmms`. Models for the
    surviving markers are fetched and their gene sequences aligned.

    Parameters
    ----------
    ingroup_file : str
        File specifying unique ids of ingroup genomes.
    ubiquity_threshold : float
        Threshold for defining ubiquity marker genes.
    single_copy_threshold : float
        Threshold for defining a single-copy marker gene.
    redundancy : float
        Threshold for declaring HMMs redundant; it is multiplied by the
        number of ingroup genomes before being used.
    valid_marker_genes : iterable
        Restrict marker set to genes within this set. A falsy value
        (e.g. None or an empty collection) disables the restriction.
    output_msa_dir : str
        Directory to store multiple sequence alignment of marker genes.
    output_model_dir : str
        Directory to store HMMs of marker genes.

    Returns
    -------
    tuple
        (number of ingroup genomes, number of NCBI genomes, number of
        user genomes, ingroup genome ids, marker gene statistics as
        returned by `_marker_genes`, final set of marker genes).
    """

    # summary tables are placed one level above the model directory
    stats_path = os.path.join(output_model_dir, '..', 'gene_stats.all.tsv')
    redundant_path = os.path.join(output_model_dir,
                                  '..',
                                  'redundant_markers.tsv')

    # directory of each genome
    genome_dirs = read_genome_dir_file(self.genome_dir_file)

    # genomes making up the ingroup
    ncbi_genome_ids, user_genome_ids = read_genome_id_file(ingroup_file)
    genome_ids = ncbi_genome_ids.union(user_genome_ids)

    log = self.logger
    log.info('Ingroup genomes: %d' % len(genome_ids))
    log.info('NCBI genomes: %d' % len(ncbi_genome_ids))
    log.info('User genomes: %d' % len(user_genome_ids))

    # candidate markers based on ubiquity and copy number
    log.info('Identifying marker genes.')
    gene_count_table = self._gene_count_table(genome_ids, genome_dirs)
    marker_gene_stats = self._marker_genes(genome_ids,
                                           gene_count_table,
                                           ubiquity_threshold,
                                           single_copy_threshold,
                                           stats_path)

    candidates = set(marker_gene_stats.keys())
    if valid_marker_genes:
        log.info(
            'Restricting %d identified markers to specified set of valid markers.'
            % len(candidates))
        candidates = candidates.intersection(valid_marker_genes)
    log.info('Identified ubiquitous, single-copy marker genes: %d'
             % len(candidates))

    # discard redundant HMMs; the threshold is scaled by the ingroup size
    scaled_redundancy = redundancy * len(genome_ids)
    redundant_hmms = self._identify_redundant_hmms(candidates,
                                                   gene_count_table,
                                                   scaled_redundancy,
                                                   redundant_path)
    marker_genes = candidates - redundant_hmms
    log.info(
        'Marker genes identified as redundant between TIGRFAM and Pfam: %d'
        % len(redundant_hmms))
    log.info('Remaining ubiquitous, single-copy marker genes: %d'
             % len(marker_genes))

    # HMM for each remaining marker gene
    log.info('Fetching HMM for each marker genes.')
    self._fetch_marker_models(marker_genes, output_model_dir)

    # align gene sequences
    aligner = AlignMarkers(self.cpus)
    aligner.run(genome_ids,
                genome_dirs,
                marker_genes,
                False,
                output_msa_dir,
                output_model_dir)

    return (len(genome_ids),
            len(ncbi_genome_ids),
            len(user_genome_ids),
            genome_ids,
            marker_gene_stats,
            marker_genes)
def run(self, genome_id_file, marker_id_file, model, output_dir):
    """Infer a genome tree from concatenated marker gene alignments.

    Marker gene models are gathered, marker genes are aligned across the
    requested genomes, the alignments are concatenated, and FastTree is
    run on the concatenated alignment. A short summary of the inputs is
    written to 'infer_workflow.log' in the output directory.

    Parameters
    ----------
    genome_id_file : str
        File specifying unique ids of genomes to include in tree.
    marker_id_file : str
        File specifying unique ids of marker genes to use for inference.
    model : str ['wag' or 'jtt']
        Model of evolution to use.
    output_dir : str
        Directory to store results.
    """

    time_keeper = TimeKeeper()

    output_alignment_dir = os.path.join(output_dir, 'alignments')
    make_sure_path_exists(output_alignment_dir)

    output_model_dir = os.path.join(output_dir, 'hmm_models')
    make_sure_path_exists(output_model_dir)

    # read directory for each genome
    genome_dirs = read_genome_dir_file(self.genome_dir_file)

    # read genomes to include in the tree
    ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
    genome_ids = ncbi_genome_ids.union(user_genome_ids)

    self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
    self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
    self.logger.info('User genomes: %d' % len(user_genome_ids))

    # get marker genes
    self.logger.info('Reading marker genes.')
    marker_genes = read_marker_id_file(marker_id_file)
    self.logger.info('Read %d marker genes.' % len(marker_genes))

    # gather all single-copy HMMs into a single model file
    hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
    hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
    self.logger.info('Generating marker gene HMM model files.')
    self._fetch_marker_models(marker_genes,
                              hmm_model_out,
                              hmm_info_out,
                              output_model_dir)

    # align gene sequences
    # NOTE(review): the meaning of the positional True flag is defined by
    # AlignMarkers.run, which is not visible here - confirm.
    align_markers = AlignMarkers(self.cpus)
    align_markers.run(genome_ids,
                      genome_dirs,
                      marker_genes,
                      True,
                      output_alignment_dir,
                      output_model_dir)

    # create concatenated alignment file
    self.logger.info('Concatenating alignments.')
    concatenated_alignment_file = os.path.join(
        output_dir, 'concatenated_alignment.faa')
    marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
    create_concatenated_alignment(genome_ids,
                                  marker_genes,
                                  output_alignment_dir,
                                  concatenated_alignment_file,
                                  marker_file)

    # create concatenated genome tree
    self.logger.info('Inferring concatenated genome tree.')
    concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
    concatenated_tree_log = os.path.join(output_dir,
                                         'concatenated.tree.log')
    log_file = os.path.join(output_dir, 'fasttree.log')
    fast_tree = FastTree(multithreaded=True)
    fast_tree.run(concatenated_alignment_file,
                  'prot',
                  model,
                  concatenated_tree,
                  concatenated_tree_log,
                  log_file)

    # generate summary report; the context manager closes the file even
    # if one of the writes (or building the time stamp) raises
    report_out = os.path.join(output_dir, 'infer_workflow.log')
    with open(report_out, 'w') as fout:
        fout.write('[infer]\n')
        fout.write('Genome Id file: %s\n' % genome_id_file)
        fout.write('Marker Id file: %s\n' % marker_id_file)
        fout.write('Model of evolution: %s\n' % model)
        fout.write(time_keeper.get_time_stamp())
def run(self, genome_id_file, marker_id_file, model, output_dir):
    """Infer a genome tree from concatenated marker gene alignments.

    Marker gene models are gathered, marker genes are aligned across the
    requested genomes, the alignments are concatenated, and FastTree is
    run on the concatenated alignment. A short summary of the inputs is
    written to 'infer_workflow.log' in the output directory.

    Parameters
    ----------
    genome_id_file : str
        File specifying unique ids of genomes to include in tree.
    marker_id_file : str
        File specifying unique ids of marker genes to use for inference.
    model : str ['wag' or 'jtt']
        Model of evolution to use.
    output_dir : str
        Directory to store results.
    """

    time_keeper = TimeKeeper()

    output_alignment_dir = os.path.join(output_dir, 'alignments')
    make_sure_path_exists(output_alignment_dir)

    output_model_dir = os.path.join(output_dir, 'hmm_models')
    make_sure_path_exists(output_model_dir)

    # read directory for each genome
    genome_dirs = read_genome_dir_file(self.genome_dir_file)

    # read genomes to include in the tree
    ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
    genome_ids = ncbi_genome_ids.union(user_genome_ids)

    self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
    self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
    self.logger.info('User genomes: %d' % len(user_genome_ids))

    # get marker genes
    self.logger.info('Reading marker genes.')
    marker_genes = read_marker_id_file(marker_id_file)
    self.logger.info('Read %d marker genes.' % len(marker_genes))

    # gather all single-copy HMMs into a single model file
    hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
    hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
    self.logger.info('Generating marker gene HMM model files.')
    self._fetch_marker_models(marker_genes,
                              hmm_model_out,
                              hmm_info_out,
                              output_model_dir)

    # align gene sequences
    # NOTE(review): the meaning of the positional True flag is defined by
    # AlignMarkers.run, which is not visible here - confirm.
    align_markers = AlignMarkers(self.cpus)
    align_markers.run(genome_ids,
                      genome_dirs,
                      marker_genes,
                      True,
                      output_alignment_dir,
                      output_model_dir)

    # create concatenated alignment file
    self.logger.info('Concatenating alignments.')
    concatenated_alignment_file = os.path.join(
        output_dir, 'concatenated_alignment.faa')
    marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
    create_concatenated_alignment(genome_ids,
                                  marker_genes,
                                  output_alignment_dir,
                                  concatenated_alignment_file,
                                  marker_file)

    # create concatenated genome tree
    self.logger.info('Inferring concatenated genome tree.')
    concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
    concatenated_tree_log = os.path.join(output_dir,
                                         'concatenated.tree.log')
    log_file = os.path.join(output_dir, 'fasttree.log')
    fast_tree = FastTree(multithreaded=True)
    fast_tree.run(concatenated_alignment_file,
                  'prot',
                  model,
                  concatenated_tree,
                  concatenated_tree_log,
                  log_file)

    # generate summary report; the context manager closes the file even
    # if one of the writes (or building the time stamp) raises
    report_out = os.path.join(output_dir, 'infer_workflow.log')
    with open(report_out, 'w') as fout:
        fout.write('[infer]\n')
        fout.write('Genome Id file: %s\n' % genome_id_file)
        fout.write('Marker Id file: %s\n' % marker_id_file)
        fout.write('Model of evolution: %s\n' % model)
        fout.write(time_keeper.get_time_stamp())