def __init__(self):
    """Verify external dependencies and cache Taxonomy rank metadata.

    The dependency check runs first so that nothing else is set up when
    a required program ('comparem', 'diamond', 'makeblastdb') is missing.
    """
    required_programs = ['comparem', 'diamond', 'makeblastdb']
    check_dependencies(required_programs)

    # label stored on the instance for later use
    self.underclassified = 'underclassified'

    # mirror the class-level rank tables of Taxonomy on this instance
    for attr_name in ('rank_prefixes', 'rank_index', 'rank_labels'):
        setattr(self, attr_name, getattr(Taxonomy, attr_name))

    self.time_keeper = TimeKeeper()
def run(self, genome_id_file, marker_id_file, model, output_dir):
    """Infer a genome tree from a concatenated marker gene alignment.

    The workflow gathers the HMMs of the requested marker genes, aligns
    the marker genes of every genome, concatenates the alignments, runs
    FastTree on the concatenated alignment, and writes a short summary
    report ('infer_workflow.log') to the output directory.

    Reads self.logger, self.genome_dir_file and self.cpus, which must be
    set elsewhere on the instance.

    Parameters
    ----------
    genome_id_file : str
        File specifying unique ids of genomes to include in tree.
    marker_id_file : str
        File specifying unique ids of marker genes to use for inference.
    model : str ['wag' or 'jtt']
        Model of evolution to use.
    output_dir : str
        Directory to store results.
    """
    time_keeper = TimeKeeper()

    output_alignment_dir = os.path.join(output_dir, 'alignments')
    make_sure_path_exists(output_alignment_dir)

    output_model_dir = os.path.join(output_dir, 'hmm_models')
    make_sure_path_exists(output_model_dir)

    # read directory for each genome
    genome_dirs = read_genome_dir_file(self.genome_dir_file)

    # read genomes within the ingroup
    ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
    genome_ids = ncbi_genome_ids.union(user_genome_ids)
    self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
    self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
    self.logger.info('User genomes: %d' % len(user_genome_ids))

    # get marker genes
    self.logger.info('Reading marker genes.')
    marker_genes = read_marker_id_file(marker_id_file)
    self.logger.info('Read %d marker genes.' % len(marker_genes))

    # gather all single-copy HMMs into a single model file
    hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
    hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
    self.logger.info('Generating marker gene HMM model files.')
    self._fetch_marker_models(marker_genes,
                              hmm_model_out,
                              hmm_info_out,
                              output_model_dir)

    # align gene sequences
    align_markers = AlignMarkers(self.cpus)
    align_markers.run(genome_ids,
                      genome_dirs,
                      marker_genes,
                      True,
                      output_alignment_dir,
                      output_model_dir)

    # create concatenated alignment file
    self.logger.info('Concatenating alignments.')
    concatenated_alignment_file = os.path.join(output_dir,
                                               'concatenated_alignment.faa')
    marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
    create_concatenated_alignment(genome_ids,
                                  marker_genes,
                                  output_alignment_dir,
                                  concatenated_alignment_file,
                                  marker_file)

    # create concatenated genome tree
    self.logger.info('Inferring concatenated genome tree.')
    concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
    concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
    log_file = os.path.join(output_dir, 'fasttree.log')
    fast_tree = FastTree(multithreaded=True)
    fast_tree.run(concatenated_alignment_file,
                  'prot',
                  model,
                  concatenated_tree,
                  concatenated_tree_log,
                  log_file)

    # generate summary report; the context manager guarantees the file
    # is closed even if one of the writes raises
    report_out = os.path.join(output_dir, 'infer_workflow.log')
    with open(report_out, 'w') as fout:
        fout.write('[infer]\n')
        fout.write('Genome Id file: %s\n' % genome_id_file)
        fout.write('Marker Id file: %s\n' % marker_id_file)
        fout.write('Model of evolution: %s\n' % model)
        fout.write(time_keeper.get_time_stamp())
def __init__(self):
    """Attach the root logger and a fresh TimeKeeper to the instance."""
    root_logger = logging.getLogger()
    self.logger = root_logger

    timer = TimeKeeper()
    self.time_keeper = timer
class OptionsParser(object):
    """Validate parsed command-line options and run PhyloRank commands.

    Every public method other than parse_options() implements one
    sub-command and takes the parsed options object as its only argument.
    """

    # sub-commands handled by parse_options(); each maps to the method of
    # the same name
    _COMMANDS = ('outliers', 'scale_tree', 'compare_red', 'mark_tree',
                 'rogue_test', 'decorate', 'taxon_stats', 'robustness_plot',
                 'rd_ranks', 'bl_dist', 'bl_optimal', 'bl_decorate',
                 'bl_table', 'rank_res')

    _BANNER_RULE = '*******************************************************************************'

    def __init__(self):
        """Initialization"""
        self.logger = logging.getLogger()
        self.time_keeper = TimeKeeper()

    def outliers(self, options):
        """Create information for identifying taxonomic outliers."""
        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        if options.plot_taxa_file:
            check_file_exists(options.plot_taxa_file)

        if options.trusted_taxa_file:
            check_file_exists(options.trusted_taxa_file)

        if not os.path.exists(options.output_dir):
            os.makedirs(options.output_dir)

        if options.highlight_polyphyly and not options.fmeasure_table:
            self.logger.error(
                "The '--highlight_polyphyly' flag must be used with the '--fmeasure_table' flag."
            )
            return

        o = Outliers(options.skip_mpld3, options.dpi, options.output_dir)
        o.run(options.input_tree,
              options.taxonomy_file,
              options.viral,
              options.plot_taxa_file,
              options.plot_dist_taxa_only,
              options.plot_domain,
              options.highlight_polyphyly,
              options.highlight_taxa_file,
              options.trusted_taxa_file,
              options.fixed_root,
              options.min_children,
              options.min_support,
              options.mblet,
              options.fmeasure_table,
              options.min_fmeasure,
              options.fmeasure_mono,
              options.verbose_table)

        self.logger.info('Done.')

    def scale_tree(self, options):
        """Scale a rooted tree based on RED."""
        check_file_exists(options.input_tree)

        self.logger.info('Reading tree.')
        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        self.logger.info('Scaling tree based on RED.')
        rd = RelativeDistance()
        rd.decorate_rel_dist(tree)

        # each branch becomes the difference between the RED of a node and
        # the RED of its parent; the seed node has no parent and is skipped
        for n in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            rd_to_parent = n.rel_dist - n.parent_node.rel_dist
            n.edge_length = rd_to_parent

        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        self.logger.info('Done.')

    @staticmethod
    def _changed_rank_field(rank_label, red, median_reds, tolerance=0.1):
        """Return the tab-prefixed 'Changed rank' column for a taxon.

        Parameters
        ----------
        rank_label : str or None
            Label of the rank the taxon is currently assigned to; None
            when no rank comparison applies (domain-level taxa).
        red : float
            RED value of the taxon.
        median_reds : d[rank label] -> median RED
            Median RED value of each rank.
        tolerance : float
            Half-width of the interval around the median of the current
            rank within which the rank is considered unchanged.

        Returns
        -------
        str
            '\\tNA', '\\tFalse' or '\\tTrue (<closest rank>: <distance>)'.
        """
        if rank_label is None:
            return '\tNA'

        rank_median = median_reds[rank_label]
        closest_rank = rank_label
        closest_dist = 1e6

        # only look for a better fitting rank when the RED value falls
        # outside the tolerated interval around the current rank
        if red < rank_median - tolerance or red > rank_median + tolerance:
            for rank, median_red in median_reds.items():
                dist = abs(red - median_red)
                if dist < closest_dist:
                    closest_dist = dist
                    closest_rank = rank

        if rank_label != closest_rank:
            return '\tTrue (%s: %.3f)' % (closest_rank, closest_dist)
        return '\tFalse'

    def compare_red(self, options):
        """Compare RED values of taxa calculated over different trees.

        Both RED tables are tab-separated with a header line; column 1 is
        the taxon, column 2 its lineage and column 3 its median RED. The
        first line of options.red_dict2 must be a dictionary literal
        mapping rank labels to median RED values.
        """
        check_file_exists(options.red_table1)
        check_file_exists(options.red_table2)
        check_file_exists(options.red_dict2)

        # NOTE(security): eval() executes arbitrary code found in the
        # file. Only use with trusted input; ast.literal_eval would be a
        # safer choice if the file always holds a plain dictionary literal.
        with open(options.red_dict2) as f:
            median_reds = eval(f.readline())

        red1 = {}
        red2 = {}
        lineage = {}
        for d, red_file in [(red1, options.red_table1),
                            (red2, options.red_table2)]:
            with open(red_file) as f:
                f.readline()  # skip header

                for line in f:
                    line_split = line.strip().split('\t')

                    taxon = line_split[0]
                    median_red = float(line_split[2])
                    d[taxon] = median_red

                    # lineages are taken from the first table only; this
                    # must be an identity test since the two dictionaries
                    # can compare equal while the second one is filled
                    if d is red1:
                        lineage[taxon] = line_split[1]

        red1_label = os.path.splitext(os.path.basename(options.red_table1))[0]
        red2_label = os.path.splitext(os.path.basename(options.red_table2))[0]

        all_taxa = set(red1.keys()).union(red2.keys())
        if options.viral:
            sorted_taxa = sort_viral_taxa(all_taxa)
        else:
            sorted_taxa = Taxonomy().sort_taxa(all_taxa)

        with open(options.output_table, 'w') as fout:
            fout.write(
                'Taxon\tLineage\t%s\t%s\tDifference\tAbs. Difference\tChanged rank\n'
                % (red1_label, red2_label))

            for taxon in sorted_taxa:
                r1 = red1.get(taxon, 'NA')
                r2 = red2.get(taxon, 'NA')

                if r1 == 'NA':
                    fout.write('%s\t%s\t%s\t%.3f\t%s\t%s' %
                               (taxon, 'NA', 'NA', r2, 'NA', 'NA'))
                elif r2 == 'NA':
                    # nothing to compare against; the row is complete
                    fout.write('%s\t%s\t%.3f\t%s\t%s\t%s\t%s\n' %
                               (taxon, lineage[taxon], r1,
                                'NA', 'NA', 'NA', 'NA'))
                    continue
                else:
                    fout.write('%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f' %
                               (taxon, lineage[taxon], r1, r2,
                                r1 - r2, abs(r1 - r2)))

                rank_prefix = taxon[0:3]
                if rank_prefix == 'd__':
                    # domains are not compared against rank medians, but
                    # the row must still get its last column and newline
                    rank_label = None
                elif options.viral:
                    rank_label = VIRAL_RANK_LABELS[
                        VIRAL_RANK_PREFIXES.index(rank_prefix)]
                else:
                    rank_label = Taxonomy.rank_labels[
                        Taxonomy.rank_prefixes.index(rank_prefix)]

                fout.write(self._changed_rank_field(rank_label,
                                                    r2,
                                                    median_reds))
                fout.write('\n')

    def mark_tree(self, options):
        """Mark tree command."""
        check_file_exists(options.input_tree)

        mt = MarkTree()
        mt.run(options.input_tree,
               options.output_tree,
               options.min_support,
               options.only_named_clades,
               options.min_length,
               not options.no_percentile,
               not options.no_relative_divergence,
               not options.no_prediction,
               options.thresholds)

        self.logger.info('Marked tree written to: %s' % options.output_tree)

    def rogue_test(self, options):
        """Rogue taxa command."""
        check_dir_exists(options.input_tree_dir)
        check_file_exists(options.taxonomy_file)
        make_sure_path_exists(options.output_dir)

        if options.decorate:
            check_dependencies(['genometreetk'])

        rt = RogueTest()
        rt.run(options.input_tree_dir,
               options.taxonomy_file,
               options.outgroup_taxon,
               options.decorate,
               options.output_dir)

        self.logger.info('Finished rogue taxa test.')

    def decorate(self, options):
        """Place internal taxonomic labels on tree."""
        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        decorate = Decorate()
        decorate.run(options.input_tree,
                     options.taxonomy_file,
                     options.viral,
                     options.trusted_taxa_file,
                     options.min_children,
                     options.min_support,
                     options.skip_rd_refine,
                     options.output_tree)

        self.logger.info('Finished decorating tree.')

    def taxon_stats(self, options):
        """Write the number of descendant taxa at each rank for every taxon."""
        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy().read(options.taxonomy_file)
        taxon_children = Taxonomy().taxon_children(taxonomy)

        with open(options.output_file, 'w') as fout:
            fout.write('Taxa')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t# named %s' % rank)
            fout.write('\t# extant taxon with complete taxonomy')
            fout.write('\n')

            for rank_prefix in Taxonomy.rank_prefixes:
                # find taxon at the specified rank
                cur_taxa = sorted(taxon for taxon in taxon_children
                                  if taxon.startswith(rank_prefix))

                for taxon in cur_taxa:
                    fout.write(taxon)
                    # ranks at or above the taxon have no count
                    fout.write('\t-' * Taxonomy.rank_index[rank_prefix])

                    # walk down one rank at a time, counting the distinct
                    # children found at each level
                    next_taxa = [taxon]
                    for _ in range(Taxonomy.rank_index[rank_prefix],
                                   Taxonomy.rank_index['s__'] + 1):
                        children_taxa = set()
                        for t in next_taxa:
                            children_taxa.update(taxon_children[t])

                        fout.write('\t%d' % len(children_taxa))
                        next_taxa = children_taxa

                    fout.write('\n')

        self.logger.info('Summary statistics written to: %s' %
                         options.output_file)

    def robustness_plot(self, options):
        """Robustness plot command"""
        self.logger.info('')
        self.logger.info(self._BANNER_RULE)
        self.logger.info(
            ' [PhyloRank - robustness_plot] Plotting distances across a set of tree.'
        )
        self.logger.info(self._BANNER_RULE)

        robustness_plot = RobustnessPlot()
        robustness_plot.run(options.rank,
                            options.input_tree_dir,
                            options.full_tree_file,
                            options.derep_tree_file,
                            options.taxonomy_file,
                            options.output_prefix,
                            options.min_children,
                            options.title)

        self.time_keeper.print_time_stamp()

    def rd_ranks(self, options):
        """Calculate number of taxa for specified rd thresholds."""
        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        r = RdRanks()
        r.run(options.input_tree, options.thresholds, options.output_dir)

        self.logger.info('Done.')

    def bl_dist(self, options):
        """Calculate distribution of branch lengths at each taxonomic rank."""
        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        b = BranchLengthDistribution()
        b.run(options.input_tree,
              options.trusted_taxa_file,
              options.min_children,
              options.taxonomy_file,
              options.output_dir)

        self.logger.info('Done.')

    def bl_optimal(self, options):
        """Determine branch length for best congruency with existing taxonomy."""
        b = BranchLengthDistribution()
        optimal_bl, correct_taxa, incorrect_taxa = b.optimal(
            options.input_tree,
            options.rank,
            options.min_dist,
            options.max_dist,
            options.step_size,
            options.output_table)

        # report a precision of zero rather than fail with a division by
        # zero when no taxa were evaluated
        total_taxa = correct_taxa + incorrect_taxa
        prec = float(correct_taxa) / total_taxa if total_taxa else 0.0

        self.logger.info('Optimal branch length is %f.' % optimal_bl)
        self.logger.info(
            'This results in %d correct and %d incorrect taxa (precision = %.2f).'
            % (correct_taxa, incorrect_taxa, prec))

    def bl_decorate(self, options):
        """Decorate tree based using a mean branch length criterion."""
        check_file_exists(options.input_tree)

        b = BranchLengthDistribution()
        b.decorate(options.input_tree,
                   options.taxonomy_file,
                   options.threshold,
                   options.rank,
                   options.retain_named_lineages,
                   options.keep_labels,
                   options.prune,
                   options.output_tree)

        self.logger.info('Done.')

    def bl_table(self, options):
        """Produce table with number of lineage for increasing mean branch lengths."""
        check_file_exists(options.input_tree)
        check_file_exists(options.taxon_category)

        b = BranchLengthDistribution()
        b.table(options.input_tree,
                options.taxon_category,
                options.step_size,
                options.output_table)

        self.logger.info('Done.')

    def rank_res(self, options):
        """Calculate taxonomic resolution at each rank.

        Named internal nodes of the tree are tallied by the ranks in their
        label and the lowest rank of that label. Genomes whose taxonomy
        string leaves a rank undefined (e.g. a bare 'p__') are counted as
        having species-level resolution at that rank. When
        options.taxa_file is set, every counted taxon is also listed there.
        """
        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        taxa_out = None
        if options.taxa_file:
            taxa_out = open(options.taxa_file, 'w')

        try:
            if taxa_out is not None:
                taxa_out.write('Rank\tLowest Rank\tTaxon\n')

            # determine taxonomic resolution of named groups
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            rank_res = defaultdict(lambda: defaultdict(int))
            for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                if not node.label or node.is_leaf():
                    continue

                _support, taxon_name, _auxiliary_info = parse_label(node.label)
                if not taxon_name:
                    continue

                lowest_rank = [x.strip()
                               for x in taxon_name.split(';')][-1][0:3]
                for rank_prefix in Taxonomy.rank_prefixes:
                    if rank_prefix not in taxon_name:
                        continue

                    rank_res[rank_prefix][lowest_rank] += 1
                    if taxa_out is not None:
                        rank_prefix_name = Taxonomy.rank_labels[
                            Taxonomy.rank_index[rank_prefix]]
                        lowest_rank_name = Taxonomy.rank_labels[
                            Taxonomy.rank_index[lowest_rank]]
                        taxa_out.write('%s\t%s\t%s\n' % (rank_prefix_name,
                                                         lowest_rank_name,
                                                         taxon_name))

            # identify any singleton taxa which are treated as having
            # species level resolution
            with open(options.taxonomy_file) as taxonomy_in:
                for line in taxonomy_in:
                    if not line.strip():
                        continue

                    # strip the line and every rank so that the trailing
                    # newline does not hide an undefined last rank ('s__')
                    line_split = line.strip().split('\t')
                    genome_id = line_split[0]
                    taxonomy = [t.strip() for t in line_split[1].split(';')]

                    # zip() tolerates lineages with fewer ranks than expected
                    for rank_prefix, taxon in zip(Taxonomy.rank_prefixes,
                                                  taxonomy):
                        if taxon != rank_prefix:
                            continue

                        # this taxa is undefined at the specified rank so
                        # must be the sole representative; e.g., a p__
                        # indicates a taxon that represents a novel phyla
                        rank_res[rank_prefix]['s__'] += 1
                        if taxa_out is not None:
                            rank_prefix_name = Taxonomy.rank_labels[
                                Taxonomy.rank_index[rank_prefix]]
                            taxa_out.write('%s\t%s\t%s (%s)\n' %
                                           (rank_prefix_name,
                                            'species',
                                            taxon,
                                            genome_id))
        finally:
            if taxa_out is not None:
                taxa_out.close()

        # write out results
        with open(options.output_file, 'w') as fout:
            fout.write('Category')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t' + rank)
            fout.write('\n')

            for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
                fout.write(Taxonomy.rank_labels[i + 1])

                for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                    if i >= j:
                        fout.write('\t' + str(rank_res[r].get(rank_prefix, 0)))
                    else:
                        fout.write('\t-')
                fout.write('\n')

        self.logger.info('Done.')

    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        Dispatches on options.subparser_name. An unknown command is logged
        as an error and terminates the program through sys.exit().

        Returns
        -------
        int
            Always 0 when the command ran to completion.
        """
        logging.basicConfig(format='', level=logging.INFO)

        if options.subparser_name in self._COMMANDS:
            getattr(self, options.subparser_name)(options)
        else:
            # %-formatting also copes with a missing (None) command name
            self.logger.error(' [Error] Unknown PhyloRank command: %s\n' %
                              options.subparser_name)
            sys.exit()

        return 0
def __init__(self):
    """Store the root logger and a new TimeKeeper on the instance."""
    log = logging.getLogger()
    self.logger = log
    self.time_keeper = TimeKeeper()
class OptionsParser():
    """Validate parsed command-line options and run CompareM commands.

    Every public method other than parse_options() implements one
    sub-command and takes the parsed options object as its only argument.
    """

    # sub-commands dispatched directly to the method of the same name;
    # 'aai_wf' is a composite workflow handled separately
    _COMMANDS = ('call_genes', 'rblast', 'aai', 'aa_usage', 'codon_usage',
                 'kmer_usage', 'stop_usage', 'lgt_di', 'lgt_codon', 'unique',
                 'pcoa_plot', 'heatmap')

    _BANNER_RULE = '*******************************************************************************'

    def __init__(self):
        """Store the root logger and a new TimeKeeper on the instance."""
        self.logger = logging.getLogger()
        self.time_keeper = TimeKeeper()

    def _log_banner(self, title):
        """Log a blank line followed by the title framed by two rules."""
        self.logger.info('')
        self.logger.info(self._BANNER_RULE)
        self.logger.info(title)
        self.logger.info(self._BANNER_RULE)

    @staticmethod
    def _files_with_ext(directory, ext):
        """Return paths of files in a directory whose name ends with ext.

        Files are returned in the order reported by os.listdir().
        """
        return [os.path.join(directory, f)
                for f in os.listdir(directory)
                if f.endswith(ext)]

    def _genome_files(self, genome_dir, genome_ext):
        """Identify genomes files.

        Terminates the program through sys.exit() when no file in the
        directory has the requested extension.

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes of interest.
        genome_ext : str
            Extension of genome files.

        Returns
        -------
        list
            Paths of genome files in directory.
        """
        check_dir_exists(genome_dir)

        genome_files = self._files_with_ext(genome_dir, genome_ext)
        if not genome_files:
            self.logger.warning(' [Warning] No genomes found. Check the --genome_ext flag used to identify genomes.')
            sys.exit()

        return genome_files

    def _write_usage_profile(self, genome_usage, feature_set, output_file):
        """Write out occurrence of specified features for each genome.

        Each feature is reported as a percentage of the total count of all
        listed features within the genome. A genome without any counted
        feature is reported as 0.00% for every feature.

        Parameters
        ----------
        genome_usage : d[genome_id][feature] -> count
            Occurrence of genomic feature in genome
        feature_set : iterable
            All genomic features.
        output_file : str
            File to produce.
        """
        sorted_feature_set = sorted(feature_set)

        with open(output_file, 'w') as fout:
            fout.write('Genome ID')
            for feature in sorted_feature_set:
                fout.write('\t' + feature)
            fout.write('\n')

            # items() rather than iteritems() so the code runs unchanged
            # on Python 2 and Python 3
            for genome_id, features in genome_usage.items():
                total = sum(features.get(feature, 0)
                            for feature in sorted_feature_set)

                fout.write(genome_id)
                for feature in sorted_feature_set:
                    count = features.get(feature, 0)
                    # guard against genomes with no counted features
                    percent = count * 100.0 / total if total else 0.0
                    fout.write('\t%.2f%%' % percent)
                fout.write('\n')

    def ani(self, options):
        """ANI command"""
        self._log_banner(' [CompareM - ani] Calculating the ANI between genome pairs.')
        self.logger.info('')

        make_sure_path_exists(options.output_dir)

        # NOTE(review): no ANI calculation is performed here; the call
        # only validates that genomes exist (it exits when none are found)
        self._genome_files(options.genome_dir, options.genome_ext)

        self.logger.info('')
        self.logger.info(' Average nucleotide identity information written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def call_genes(self, options):
        """Call genes command"""
        self._log_banner(' [CompareM - call_genes] Identifying genes within genomes.')

        make_sure_path_exists(options.output_dir)

        # _genome_files() already warns and exits when nothing is found,
        # so no further check for an empty list is needed
        genome_files = self._genome_files(options.genome_dir,
                                          options.genome_ext)

        prodigal = Prodigal(options.cpus)
        summary_stats = prodigal.run(genome_files,
                                     False,
                                     options.force_table,
                                     False,
                                     options.output_dir)

        # write gene calling summary
        summary_file = os.path.join(options.output_dir,
                                    'call_genes.summary.tsv')
        with open(summary_file, 'w') as fout:
            fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
            for genome_id, stats in summary_stats.items():
                fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                         stats.best_translation_table,
                                                         stats.coding_density_4,
                                                         stats.coding_density_11))

        self.logger.info('')
        self.logger.info(' Identified genes written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def rblast(self, options):
        """Reciprocal blast command"""
        self._log_banner(' [CompareM - rblast] Performing reciprocal blast between genomes.')

        check_dir_exists(options.protein_dir)
        make_sure_path_exists(options.output_dir)

        aa_gene_files = self._files_with_ext(options.protein_dir,
                                             options.protein_ext)
        if not aa_gene_files:
            self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        # modify gene ids to include genome ids in order to ensure
        # all gene identifiers are unique across the set of genomes,
        # also removes the trailing asterisk used to identify the stop
        # codon
        self.logger.info('')
        self.logger.info(' Appending genome identifiers to all gene identifiers.')
        gene_out_dir = os.path.join(options.output_dir, 'genes')
        make_sure_path_exists(gene_out_dir)

        modified_aa_gene_files = []
        for gf in aa_gene_files:
            genome_id = remove_extension(gf)

            aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
            with open(aa_file, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                    fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                    # endswith() is safe for empty sequences
                    if seq.endswith('*'):
                        seq = seq[0:-1]
                    fout.write(seq + '\n')

            modified_aa_gene_files.append(aa_file)

        # perform the reciprocal blast with blastp or diamond
        self.logger.info('')
        if options.blastp:
            rblast = ReciprocalBlast(options.cpus)
            rblast.run(modified_aa_gene_files,
                       options.evalue,
                       options.output_dir)

            # concatenate all blast tables to mimic output of diamond, all hits
            # for a given genome MUST be in consecutive order to fully mimic
            # the expected results from diamond
            self.logger.info('')
            self.logger.info(' Creating single file with all blast hits (be patient!).')
            blast_files = sorted([f for f in os.listdir(options.output_dir)
                                  if f.endswith('.blastp.tsv')])
            hit_tables = [os.path.join(options.output_dir, f)
                          for f in blast_files]
            concatenate_files(hit_tables,
                              os.path.join(options.output_dir, 'all_hits.tsv'))
        else:
            rdiamond = ReciprocalDiamond(options.cpus)
            rdiamond.run(modified_aa_gene_files,
                         options.evalue,
                         options.per_identity,
                         options.output_dir)

        self.logger.info('')
        self.logger.info(' Reciprocal blast hits written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def aai(self, options):
        """AAI command"""
        self._log_banner(' [CompareM - aai] Calculating the AAI between homologs in genome pairs.')
        self.logger.info('')

        check_dir_exists(options.rblast_dir)
        make_sure_path_exists(options.output_dir)

        # genome ids are taken from the gene files written by rblast
        protein_dir = os.path.join(options.rblast_dir, 'genes')
        genome_ids = [remove_extension(f, '.faa')
                      for f in os.listdir(protein_dir)
                      if f.endswith('.faa')]

        if not genome_ids:
            self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        aai_calculator = AAICalculator(options.cpus)
        aai_calculator.run(genome_ids,
                           protein_dir,
                           options.rblast_dir,
                           options.per_identity,
                           options.per_aln_len,
                           options.write_shared_genes,
                           options.output_dir)

        shared_genes_dir = os.path.join(options.output_dir,
                                        aai_calculator.shared_genes)
        self.logger.info('')
        self.logger.info(' Identified homologs between genome pairs written to: %s' % shared_genes_dir)

        self.time_keeper.print_time_stamp()

    def aa_usage(self, options):
        """Amino acid usage command"""
        self._log_banner(' [CompareM - aa_usage] Calculating amino acid usage within each genome.')
        self.logger.info('')

        check_dir_exists(options.protein_dir)

        # get list of files with called genes
        gene_files = self._files_with_ext(options.protein_dir,
                                          options.protein_ext)

        # warn user if no files were found
        if not gene_files:
            self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            return

        # calculate amino acid usage
        amino_acid_usage = AminoAcidUsage(options.cpus)
        genome_aa_usage, aa_set = amino_acid_usage.run(gene_files)

        # write out results
        self._write_usage_profile(genome_aa_usage, aa_set, options.output_file)

        self.logger.info('')
        self.logger.info(' Amino acid usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def codon_usage(self, options):
        """Codon usage command"""
        self._log_banner(' [CompareM - codon_usage] Calculating codon usage within each genome.')
        self.logger.info('')

        check_dir_exists(options.gene_dir)

        # get list of files with called genes
        gene_files = self._files_with_ext(options.gene_dir, options.gene_ext)

        # warn user if no files were found
        if not gene_files:
            self.logger.warning(' [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        # calculate codon usage
        codon_usage = CodonUsage(options.cpus, options.keep_ambiguous)
        genome_codon_usage, codon_set, _mean_length = codon_usage.run(gene_files)

        # write out results
        self._write_usage_profile(genome_codon_usage,
                                  codon_set,
                                  options.output_file)

        self.logger.info('')
        self.logger.info(' Codon usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def stop_usage(self, options):
        """Stop codon usage command"""
        self._log_banner(' [CompareM - stop_usage] Calculating stop codon usage within each genome.')
        self.logger.info('')

        check_dir_exists(options.gene_dir)

        # get list of files with called genes
        gene_files = self._files_with_ext(options.gene_dir, options.gene_ext)

        # warn user if no files were found
        if not gene_files:
            self.logger.warning(' [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        # calculate stop codon usage
        codon_usage = CodonUsage(options.cpus,
                                 keep_ambiguous=False,
                                 stop_codon_only=True)
        genome_codon_usage, codon_set, mean_gene_length = codon_usage.run(gene_files)

        # write out results
        with open(options.output_file, 'w') as fout:
            # header: one count column per codon, optionally followed by
            # a mean sequence length column
            for codon in codon_set:
                fout.write('\t' + codon)
                if mean_gene_length:
                    fout.write('\t' + codon + ': avg. seq. length')
            fout.write('\n')

            for genome_id, codons in genome_codon_usage.items():
                fout.write(genome_id)

                for codon in codon_set:
                    fout.write('\t%d' % codons.get(codon, 0))

                    if mean_gene_length:
                        mean_len = mean_gene_length[genome_id].get(codon, None)
                        if mean_len:
                            fout.write('\t%.1f' % mean_len)
                        else:
                            fout.write('\tna')
                fout.write('\n')

        self.logger.info('')
        self.logger.info(' Stop codon usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def kmer_usage(self, options):
        """Kmer usage command"""
        self._log_banner(' [CompareM - kmer_usage] Calculating kmer usage within each genome.')
        self.logger.info('')

        if options.k > 10 or options.k <= 0:
            self.logger.warning('[Warning] CompareM only support kmers with k <= 10.')
            sys.exit(0)

        genome_files = self._genome_files(options.genome_dir,
                                          options.genome_ext)

        # calculate kmer usage
        kmer_usage = KmerUsage(options.k, options.cpus)
        genome_kmer_usage, kmer_set = kmer_usage.run(genome_files)

        # write out results
        self.logger.info('')
        self.logger.info(' Writing kmer profile to file (be patient!).')
        self._write_usage_profile(genome_kmer_usage,
                                  kmer_set,
                                  options.output_file)

        self.logger.info('')
        self.logger.info(' Kmer usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def lgt_di(self, options):
        """LGT dinucleotide usage command"""
        self._log_banner(' [CompareM - lgt_di] Calculating dinuceotide (3rd,1st) usage of genes.')
        self.logger.info('')

        check_dir_exists(options.gene_dir)

        # get list of files with called genes
        gene_files = self._files_with_ext(options.gene_dir, options.gene_ext)

        # warn user if no files were found
        if not gene_files:
            self.logger.warning(' [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        lgt_dinucleotide = LgtDinucleotide(options.cpus)
        lgt_dinucleotide.run(gene_files,
                             options.crit_value,
                             options.output_dir)

        self.logger.info('')
        self.logger.info(' Dinucleotide usage written to directory: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def lgt_codon(self, options):
        """LGT codon usage command"""
        self._log_banner(' [CompareM - lgt_codon] Calculating codon usage of genes.')
        self.logger.info('')

        check_dir_exists(options.gene_dir)

        # get list of files with called genes
        gene_files = self._files_with_ext(options.gene_dir, options.gene_ext)

        # warn user if no files were found
        if not gene_files:
            self.logger.warning(' [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        lgt_codon = LgtCodon(options.cpus)
        lgt_codon.run(gene_files, options.output_dir)

        self.logger.info('')
        self.logger.info(' Codon usage written to directory: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def unique(self, options):
        """Unique command"""
        self._log_banner(' [CompareM - unique] Identifying genes present in a single genome.')

        # NOTE(review): only the banner and time stamp are produced; no
        # analysis is run by this command
        self.time_keeper.print_time_stamp()

    def pcoa_plot(self, options):
        """PCoA plot command"""
        self._log_banner(' [CompareM - pcoa_plot] Generating PCoA plot showing relative similarity of genomes.')
        self.logger.info('')

        self.logger.info(' Performing PCoA.')
        pcoa = PCoA()
        pcoa.plot(options.aai_summary_file)

        self.time_keeper.print_time_stamp()

    def heatmap(self, options):
        """Heatmap command"""
        self._log_banner(' [CompareM - heatmap] Generating heatmap showing relative similarity of genomes.')
        self.logger.info('')

        self.logger.info(' Making heatmap.')
        heatmapper = Heatmap(options.aai_summary_file, options.output_file)
        heatmapper.plot(options.cluster, options.method, options.metric)

        self.time_keeper.print_time_stamp()

    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        Configures logging from the optional bVerbose/bQuiet flags and
        dispatches on options.subparser_name. An unknown command is logged
        as an error and terminates the program through sys.exit().

        Returns
        -------
        int
            Always 0 when the command ran to completion.
        """
        # the verbosity flags are optional; getattr() replaces the bare
        # 'except:' clauses that used to hide every kind of error
        if getattr(options, 'bVerbose', False):
            log_level = logging.DEBUG
        elif getattr(options, 'bQuiet', False):
            log_level = logging.ERROR
        else:
            log_level = logging.INFO
        logging.basicConfig(format='', level=log_level)

        if getattr(options, 'file', None) == "stdout":
            options.file = ''

        command = options.subparser_name
        if command == 'aai_wf':
            # full workflow: call genes, reciprocal blast, then AAI; each
            # step reads the directory written by the previous one
            root_dir = options.output_dir
            make_sure_path_exists(root_dir)

            options.output_dir = os.path.join(root_dir, 'genes')
            self.call_genes(options)

            options.protein_ext = 'faa'
            options.protein_dir = os.path.join(root_dir, 'genes')
            options.output_dir = os.path.join(root_dir, 'rblast')
            self.rblast(options)

            options.output_dir = root_dir
            options.rblast_dir = os.path.join(root_dir, 'rblast')
            self.aai(options)
        elif command in self._COMMANDS:
            getattr(self, command)(options)
        else:
            # %-formatting also copes with a missing (None) command name
            self.logger.error(' [Error] Unknown CompareM command: "%s"\n' % command)
            sys.exit()

        return 0
def __init__(self):
    """Attach the root logger and a fresh TimeKeeper to the instance."""
    # getLogger() without a name returns the root logger.
    root_logger = logging.getLogger()
    self.logger = root_logger
    self.time_keeper = TimeKeeper()
class OptionsParser():
    """Dispatch parsed RefineM command-line options to the matching command.

    Each public method implements one sub-command and receives the parsed
    options object; `parse_options` selects the method from
    `options.subparser_name`.
    """

    def __init__(self):
        """Initialization"""
        self.logger = logging.getLogger()
        self.time_keeper = TimeKeeper()

    def _log_banner(self, title):
        """Log a blank line followed by the command title framed by two rules.

        Parameters
        ----------
        title : str
            Title line to log between the two rules.
        """
        rule = '*******************************************************************************'
        self.logger.info('')
        self.logger.info(rule)
        self.logger.info(title)
        self.logger.info(rule)

    def _genome_files(self, genome_dir, genome_ext):
        """Identify genomes files.

        Exits the program (after logging a warning) when no file in the
        directory ends with the requested extension.

        Parameters
        ----------
        genome_dir : str
            Directory containing genomes of interest.
        genome_ext : str
            Extension of genome files.

        Returns
        -------
        list
            Path to genome files.
        """
        check_dir_exists(genome_dir)

        genome_files = [os.path.join(genome_dir, f)
                        for f in os.listdir(genome_dir)
                        if f.endswith(genome_ext)]

        if not genome_files:
            self.logger.warning(' [Warning] No genomes found. Check the --genome_ext or --protein_ext flag used to identify genomes.')
            sys.exit()

        return genome_files

    def _check_nuclotide_seqs(self, seq_files):
        """Check if files contain sequences in nucleotide space.

        For every file that does not look like nucleotide data the user is
        asked interactively whether to proceed anyway.

        Parameters
        ----------
        seq_files : iterable
            Sequence files to check.

        Returns
        -------
        boolean
            True if files can be treated as containing nucleotide sequences.
        """
        for seq_file in seq_files:
            if not seq_io.is_nucleotide(seq_file):
                print('Expected all files to contain sequences in nucleotide space.')
                print('File %s appears like it may contain amino acids sequences.' % seq_file)

                yes_response = query_yes_no('Do all files contain only nucleotide sequences?', default='no')
                if not yes_response:
                    return False

        return True

    def _check_protein_seqs(self, seq_files):
        """Check if files contain sequences in amino acid space.

        For every file that does not look like protein data the user is
        asked interactively whether to proceed anyway.

        Parameters
        ----------
        seq_files : iterable
            Sequence files to check.

        Returns
        -------
        boolean
            True if files can be treated as containing amino acid sequences.
        """
        for seq_file in seq_files:
            if not seq_io.is_protein(seq_file):
                print('Expected all files to contain sequences in amino acid space.')
                print('File %s appears like it may contain nucleotide sequences.' % seq_file)

                yes_response = query_yes_no('Do all files contain only amino acid sequences?', default='no')
                if not yes_response:
                    return False

        return True

    def scaffold_stats(self, options):
        """Scaffold statistics command.

        Computes (or reuses) coverage and tetranucleotide profiles and writes
        'scaffold_stats.tsv' into `options.output_dir`.

        Parameters
        ----------
        options : object
            Parsed options; reads scaffold_file, genome_nt_dir, genome_ext,
            output_dir, coverage_file, bam_files, cov_all_reads,
            cov_min_align, cov_max_edit_dist, tetra_file and cpus.
        """
        # Debug dump of the options; hidden unless logging is set to DEBUG.
        self.logger.debug('Options: %s', options)

        self._log_banner(' [RefineM - scaffold_stats] Calculating statistics for scaffolds.')

        check_file_exists(options.scaffold_file)

        if not self._check_nuclotide_seqs([options.scaffold_file]):
            self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
            sys.exit()

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        make_sure_path_exists(options.output_dir)

        # get coverage information
        if options.coverage_file:
            coverage_file = options.coverage_file
        elif not options.bam_files:
            # Without BAM files no coverage profile can be produced; the
            # statistics step is run with no coverage file in that case.
            self.logger.warning('\n [Warning] One or more BAM files must be specified in order to calculate coverage profiles.')
            coverage_file = None
        else:
            coverage = Coverage(options.cpus)
            coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
            coverage.run(options.bam_files, coverage_file, options.cov_all_reads, options.cov_min_align, options.cov_max_edit_dist)
            self.logger.info('')
            self.logger.info(' Coverage profiles written to: %s' % coverage_file)

        # get tetranucleotide signatures
        if options.tetra_file:
            tetra_file = options.tetra_file
        else:
            self.logger.info('')
            tetra = Tetranucleotide(options.cpus)
            tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
            signatures = tetra.run(options.scaffold_file)
            tetra.write(signatures, tetra_file)
            self.logger.info(' Tetranucleotide signatures written to: %s' % tetra_file)

        # write out scaffold statistics
        stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
        stats = ScaffoldStats(options.cpus)
        stats.run(options.scaffold_file, genome_files, tetra_file, coverage_file, stats_output)

        self.logger.info(' Scaffold statistic written to: %s' % stats_output)

        self.time_keeper.print_time_stamp()

    def genome_stats(self, options):
        """Genomes statistics command.

        Parameters
        ----------
        options : object
            Parsed options; reads scaffold_stats_file, output_file and cpus.
        """
        self._log_banner(' [RefineM - genome_stats] Calculating statistics for genomes.')

        check_file_exists(options.scaffold_stats_file)

        self.logger.info('')
        self.logger.info(' Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats(options.cpus)
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats.run(scaffold_stats)
        genome_stats.write(options.output_file)

        self.logger.info(' Genome statistic written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()

    def gene_profile(self, options):
        """Gene profile command.

        Parameters
        ----------
        options : object
            Parsed options; reads output_dir, scaffold_stats_file,
            taxonomy_file, db_file, genome_prot_dir, protein_ext,
            per_to_classify, evalue, per_identity and cpus.
        """
        self._log_banner(' [RefineM - gene_profile] Generating taxonomic profiles from genes.')

        make_sure_path_exists(options.output_dir)
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.taxonomy_file)
        check_file_exists(options.db_file)

        gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(gene_files):
            self.logger.warning('[Warning] All files must contain amino acid sequences.')
            sys.exit()

        # build gene profile
        gene_profile = GeneProfile(options.cpus, options.output_dir)
        gene_profile.run(gene_files,
                         options.scaffold_stats_file,
                         options.db_file,
                         options.taxonomy_file,
                         options.per_to_classify,
                         options.evalue,
                         options.per_identity)

        self.logger.info('')
        self.logger.info(' Results written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()

    def outliers(self, options):
        """Outlier command.

        Writes 'outliers.tsv' and a 'plots' directory (with an HTML index)
        into `options.output_dir`.

        Parameters
        ----------
        options : object
            Parsed options; reads scaffold_stats_file, output_dir, gc_perc,
            td_perc, cov_corr, cov_perc, report_type, highlight_file,
            links_file, individual_plots, image_type and dpi.
        """
        self._log_banner(' [RefineM - outliers] Identifying scaffolds with divergent characteristics.')

        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('')
        self.logger.info(' Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify outliers
        outliers = Outliers()
        outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
        outliers.identify(scaffold_stats, genome_stats,
                          options.gc_perc, options.td_perc,
                          options.cov_corr, options.cov_perc,
                          options.report_type, outlier_file)
        self.logger.info(' Outlier information written to: ' + outlier_file)

        # create outlier plots
        self.logger.info('')

        # Highlight file: tab-separated "<scaffold id>[\t<r>,<g>,<b>]" with
        # 0-255 colour channels; scaffolds without a colour default to red.
        highlight_scaffolds_ids = {}
        if options.highlight_file:
            with open(options.highlight_file) as highlight_file:
                for line in highlight_file:
                    line_split = line.strip().split('\t')
                    if len(line_split) > 1:
                        highlight_scaffolds_ids[line_split[0]] = [float(x.strip()) / 255.0 for x in line_split[1].split(',')]
                    else:
                        highlight_scaffolds_ids[line_split[0]] = [1.0, 0, 0]

        # Links file: tab-separated rows; columns 0 and 2 are kept as raw
        # strings, every other column is parsed as a Python literal.
        link_scaffold_ids = []
        if options.links_file:
            with open(options.links_file) as links_file:
                for line in links_file:
                    fields = line.strip().split('\t')
                    link_scaffold_ids.append([item if i in (0, 2) else ast.literal_eval(item)
                                              for i, item in enumerate(fields)])

        # create plots
        plot_dir = os.path.join(options.output_dir, 'plots')
        make_sure_path_exists(plot_dir)
        genome_plots = defaultdict(list)
        num_genomes = len(genome_stats)
        for genomes_processed, (genome_id, gs) in enumerate(genome_stats.items(), 1):
            sys.stdout.write(' Plotting scaffold distribution for %d of %d (%.1f%%) genomes.\r' % (genomes_processed,
                                                                                                  num_genomes,
                                                                                                  genomes_processed * 100.0 / num_genomes))
            sys.stdout.flush()

            genome_scaffold_stats = {}
            for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
                genome_scaffold_stats[scaffold_id] = scaffold_stats.stats[scaffold_id]

            # NOTE: the GC, coverage, combined, distribution and
            # GC-vs-coverage plots are currently disabled; only the TD and
            # tetranucleotide PCA plots are produced.
            if options.individual_plots:
                # TD plot
                td_plots = TdPlots(options)
                td_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids, gs.mean_signature, outliers.td_dist, [options.td_perc])

                output_plot = os.path.join(plot_dir, genome_id + '.td_plots.' + options.image_type)
                td_plots.save_plot(output_plot, dpi=options.dpi)
                td_plots.save_html(os.path.join(plot_dir, genome_id + '.td_plots.html'))

            # tetranucleotide signature PCA plot
            # NOTE(review): assumed to run for every genome, independent of
            # individual_plots - confirm against the original layout.
            tetra = TetraPcaPlot(options)
            tetra.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids)

            output_plot = os.path.join(plot_dir, genome_id + '.tetra_pca.' + options.image_type)
            tetra.save_plot(output_plot, dpi=options.dpi)
            tetra.save_html(os.path.join(plot_dir, genome_id + '.tetra_pca.html'))

            genome_plots[genome_id].append(('Tetra PCA', genome_id + '.tetra_pca.html'))

        sys.stdout.write('\n')

        outliers.create_html_index(plot_dir, genome_plots)

        self.logger.info(' Outlier plots written to: ' + plot_dir)

        self.time_keeper.print_time_stamp()

    def cluster(self, options):
        """Cluster command.

        Parameters
        ----------
        options : object
            Parsed options; reads scaffold_stats_file, genome_file,
            output_dir, num_clusters, num_components, K, no_coverage, no_pca,
            iterations and cpus.
        """
        self._log_banner(' [RefineM - cluster] Partitioning bin into clusters.')

        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('')
        self.logger.info(' Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        cluster = Cluster(options.cpus)
        cluster.run(scaffold_stats,
                    options.num_clusters,
                    options.num_components,
                    options.K,
                    options.no_coverage,
                    options.no_pca,
                    options.iterations,
                    options.genome_file,
                    options.output_dir)

        self.logger.info('')
        self.logger.info(' Partitioned sequences written to: ' + options.output_dir)

        self.time_keeper.print_time_stamp()

    def reference(self, options):
        """Reference command.

        Parameters
        ----------
        options : object
            Parsed options; reads scaffold_prot_file, scaffold_stats_file,
            output_dir, ref_genome_prot_dir, protein_ext, db_file, evalue,
            per_identity and cpus.
        """
        self._log_banner('[RefineM - reference] Identifying scaffolds similar to specific genome(s).')

        check_file_exists(options.scaffold_prot_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        ref_gene_files = self._genome_files(options.ref_genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(ref_gene_files):
            self.logger.warning('[Warning] All files must contain amino acid sequences.')
            sys.exit()

        reference = Reference(options.cpus, options.output_dir)
        reference_out = reference.run(options.scaffold_prot_file,
                                      options.scaffold_stats_file,
                                      ref_gene_files,
                                      options.db_file,
                                      options.evalue,
                                      options.per_identity)

        self.logger.info('')
        self.logger.info(' Results written to: ' + reference_out)

        self.time_keeper.print_time_stamp()

    def compatible(self, options):
        """Compatible command.

        Writes 'compatible.tsv' into `options.output_dir`.

        Parameters
        ----------
        options : object
            Parsed options; reads reference_file, scaffold_stats_file,
            output_dir, min_genes, perc_genes, gc_perc, td_perc, cov_corr,
            cov_perc and report_type.
        """
        self._log_banner('[RefineM - compatible] Identify scaffolds with compatible genomic statistics.')

        check_file_exists(options.reference_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        # read scaffold statistics and calculate genome stats
        self.logger.info('')
        self.logger.info(' Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify putative homologs to reference genomes
        reference = Reference(1, None)
        putative_homologs = reference.homology_check(options.reference_file,
                                                     options.min_genes,
                                                     float(options.perc_genes))

        # identify scaffolds compatible with bins
        outliers = Outliers()
        output_file = os.path.join(options.output_dir, 'compatible.tsv')
        outliers.compatible(putative_homologs, scaffold_stats, genome_stats,
                            options.gc_perc, options.td_perc,
                            options.cov_corr, options.cov_perc,
                            options.report_type, output_file)

        self.logger.info('')
        self.logger.info(' Results written to: ' + output_file)

        self.time_keeper.print_time_stamp()

    def modify(self, options):
        """Modify command.

        Exactly one kind of modification may be requested: explicit
        add/remove lists, an outlier file, or a compatible file.

        Parameters
        ----------
        options : object
            Parsed options; reads output_genome, add, remove, outlier_file,
            compatible_file, genome_file, scaffold_file and unique_only.
        """
        self._log_banner(' [RefineM - modify] Modifying scaffolds in genome.')

        make_sure_path_exists(os.path.dirname(options.output_genome))

        if not (options.add or options.remove or options.outlier_file or options.compatible_file):
            self.logger.warning(' [Warning] No modification to bin requested.\n')
            sys.exit()

        if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
            self.logger.warning(" [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
            sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning(" [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
            sys.exit()

        failed_to_add = []
        failed_to_remove = []
        if options.add or options.remove:
            failed_to_add, failed_to_remove = genome_tk.modify(options.genome_file,
                                                               options.scaffold_file,
                                                               options.add,
                                                               options.remove,
                                                               options.output_genome)
        elif options.outlier_file:
            outliers = Outliers()
            outliers.remove_outliers(options.genome_file, options.outlier_file, options.output_genome)
        elif options.compatible_file:
            outliers = Outliers()
            if options.unique_only:
                outliers.add_compatible_unique(options.scaffold_file, options.genome_file, options.compatible_file, options.output_genome)
            else:
                outliers.add_compatible_closest(options.scaffold_file, options.genome_file, options.compatible_file, options.output_genome)

        if failed_to_add:
            self.logger.warning(' [Warning] Failed to add the following sequence(s):')
            for seq_id in failed_to_add:
                self.logger.warning(' %s' % seq_id)

        if failed_to_remove:
            self.logger.warning(' [Warning] Failed to remove the following sequence(s):')
            for seq_id in failed_to_remove:
                self.logger.warning(' %s' % seq_id)

        self.logger.info('')
        self.logger.info(' Modified genome written to: ' + options.output_genome)

        self.time_keeper.print_time_stamp()

    def call_genes(self, options):
        """Call genes command.

        Parameters
        ----------
        options : object
            Parsed options; reads genome_nt_dir, genome_ext, output_dir,
            unbinned_file and cpus.
        """
        self._log_banner(' [RefineM - call_genes] Identifying genes within genomes.')

        check_dir_exists(options.genome_nt_dir)
        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        # call genes in genomes
        prodigal = Prodigal(options.cpus)
        prodigal.run(genome_files, options.output_dir)
        self.logger.info(' Genes in genomes written to: %s' % options.output_dir)

        # call genes in unbinned scaffolds
        if options.unbinned_file:
            unbinned_output_dir = os.path.join(options.output_dir, 'unbinned')
            prodigal.run([options.unbinned_file], unbinned_output_dir, meta=True)
            self.logger.info(' Genes in unbinned scaffolds written to: %s' % unbinned_output_dir)

        self.time_keeper.print_time_stamp()

    def unique(self, options):
        """Unique command.

        Reports sequences that occur more than once, either within one
        genome or shared between two genomes.

        Parameters
        ----------
        options : object
            Parsed options; reads genome_nt_dir and genome_ext.
        """
        self._log_banner('[RefineM - unique] Ensuring sequences are assigned to a single genome.')

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        duplicates = genome_tk.unique(genome_files)

        self.logger.info('')
        if len(duplicates) == 0:
            self.logger.info(' Pass: All sequences were identified exactly once.')
        else:
            self.logger.info(' Fail: One or more sequences were observed multiple times.')

            # Visit every unordered pair of genomes once, including each
            # genome paired with itself (within-genome duplicates).
            genome_ids = sorted(duplicates.keys())
            for i, genome_idA in enumerate(genome_ids):
                for genome_idB in genome_ids[i:]:
                    dup_seq_ids = duplicates[genome_idA][genome_idB]
                    if len(dup_seq_ids) == 0:
                        continue

                    self.logger.info('')
                    if genome_idA == genome_idB:
                        self.logger.info(' There are %d sequences present more than once in %s:' % (len(dup_seq_ids), genome_idA))
                    else:
                        self.logger.info(' There are %d sequences shared between %s and %s:' % (len(dup_seq_ids), genome_idA, genome_idB))

                    for seq_id in dup_seq_ids:
                        self.logger.info(' %s' % seq_id)

        self.time_keeper.print_time_stamp()

    def bin_compare(self, options):
        """Bin compare command.

        Parameters
        ----------
        options : object
            Parsed options; reads genome_nt_dir1, genome_nt_dir2,
            genome_ext1, genome_ext2, scaffold_file and output_file.
        """
        self._log_banner('[RefineM - bin_compare] Comparing two sets of genomes.')

        check_dir_exists(options.genome_nt_dir1)
        check_dir_exists(options.genome_nt_dir2)

        genomes_files1 = self._genome_files(options.genome_nt_dir1, options.genome_ext1)
        if not self._check_nuclotide_seqs(genomes_files1):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        genomes_files2 = self._genome_files(options.genome_nt_dir2, options.genome_ext2)
        if not self._check_nuclotide_seqs(genomes_files2):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        bin_comparer = BinComparer()
        bin_comparer.run(genomes_files1, genomes_files2, options.scaffold_file, options.output_file)

        self.logger.info('')
        self.logger.info(' Detailed bin comparison written to: ' + options.output_file)

        self.time_keeper.print_time_stamp()

    def unbinned(self, options):
        """Unbinned Command.

        Parameters
        ----------
        options : object
            Parsed options; reads genome_nt_dir, genome_ext, scaffold_file,
            min_seq_len and output_file.
        """
        self._log_banner(' [RefineM - unbinned] Identify unbinned scaffolds.')

        check_dir_exists(options.genome_nt_dir)

        genomes_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genomes_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        unbinned = Unbinned()
        unbinned_seqs = unbinned.run(genomes_files, options.scaffold_file, options.min_seq_len)

        seq_io.write_fasta(unbinned_seqs, options.output_file)

        self.logger.info('')
        self.logger.info(' Unbinned scaffolds written to: ' + options.output_file)

        self.time_keeper.print_time_stamp()

    def tetra_compare(self, options):
        """Tetranucleotide comparison command.

        Splits the scaffold file into windows, then runs the scaffold_stats
        and outliers commands on those windows. The passed options object is
        modified in place (scaffold_file, genome_nt_dir, links_file and
        scaffold_stats_file are overwritten).

        Parameters
        ----------
        options : object
            Parsed options; reads scaffold_file, genome_nt_dir, genome_ext,
            output_dir, window_size, gap_size and cpus, plus everything
            scaffold_stats and outliers read.
        """
        self._log_banner(' [RefineM - tetra_compare] compare tetranucleotide frequencies')

        check_file_exists(options.scaffold_file)

        if not self._check_nuclotide_seqs([options.scaffold_file]):
            self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
            sys.exit()

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        make_sure_path_exists(options.output_dir)

        windows = WindowGen(options.cpus)
        windows_file, links_file = windows.write_windows(options.scaffold_file,
                                                         options.output_dir,
                                                         options.window_size,
                                                         options.gap_size)

        # Redirect the downstream commands to the generated window files.
        options.scaffold_file = windows_file
        # Expects one genome - the scaffolds file
        options.genome_nt_dir = os.path.split(windows_file)[0]
        options.links_file = links_file
        self.logger.debug('scaffold_file: %s', options.scaffold_file)
        self.logger.debug('genome_nt_dir: %s', options.genome_nt_dir)
        self.logger.debug('links_file: %s', options.links_file)

        self.scaffold_stats(options)

        options.scaffold_stats_file = os.path.join(options.output_dir, 'scaffold_stats.tsv')
        self.logger.debug('scaffold_stats_file: %s', options.scaffold_stats_file)

        self.outliers(options)

    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s).

        Parameters
        ----------
        options : object
            Parsed options; `subparser_name` selects the command to run.

        Returns
        -------
        int
            Always 0; an unknown command exits the program instead.
        """
        logging.basicConfig(format='', level=logging.INFO)

        check_dependencies(('diamond', 'ktImportText'))

        # Every command is implemented by the method of the same name.
        commands = ('scaffold_stats', 'genome_stats', 'gene_profile',
                    'outliers', 'cluster', 'reference', 'compatible',
                    'unique', 'bin_compare', 'modify', 'call_genes',
                    'unbinned', 'tetra_compare')

        if options.subparser_name in commands:
            getattr(self, options.subparser_name)(options)
        else:
            self.logger.error(' [Error] Unknown RefineM command: ' + options.subparser_name + '\n')
            sys.exit()

        return 0
def run(self, genome_id_file, marker_id_file, model, output_dir):
    """Infer a genome tree from a concatenated marker gene alignment.

    Builds HMM model files for the requested marker genes, aligns the
    marker genes of every requested genome, concatenates the alignments,
    infers a tree with FastTree and writes a short summary report.

    Files written under `output_dir`: 'alignments/', 'hmm_models/',
    'phylo.hmm', 'phylo.tsv', 'concatenated_alignment.faa',
    'concatenated_markers.tsv', 'concatenated.tree',
    'concatenated.tree.log', 'fasttree.log' and 'infer_workflow.log'.

    Parameters
    ----------
    genome_id_file : str
        File specifying unique ids of genomes to include in tree.
    marker_id_file : str
        File specifying unique ids of marker genes to use for inference.
    model : str ['wag' or 'jtt']
        Model of evolution to use.
    output_dir : str
        Directory to store results.
    """
    time_keeper = TimeKeeper()

    output_alignment_dir = os.path.join(output_dir, 'alignments')
    make_sure_path_exists(output_alignment_dir)

    output_model_dir = os.path.join(output_dir, 'hmm_models')
    make_sure_path_exists(output_model_dir)

    # read directory for each genome
    genome_dirs = read_genome_dir_file(self.genome_dir_file)

    # read genomes within the ingroup
    ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
    genome_ids = ncbi_genome_ids.union(user_genome_ids)
    self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
    self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
    self.logger.info('User genomes: %d' % len(user_genome_ids))

    # get marker genes
    self.logger.info('Reading marker genes.')
    marker_genes = read_marker_id_file(marker_id_file)
    self.logger.info('Read %d marker genes.' % len(marker_genes))

    # gather all single-copy HMMs into a single model file
    hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
    hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
    self.logger.info('Generating marker gene HMM model files.')
    self._fetch_marker_models(marker_genes, hmm_model_out, hmm_info_out, output_model_dir)

    # align gene sequences
    align_markers = AlignMarkers(self.cpus)
    align_markers.run(genome_ids, genome_dirs, marker_genes, True, output_alignment_dir, output_model_dir)

    # create concatenated alignment file
    self.logger.info('Concatenating alignments.')
    concatenated_alignment_file = os.path.join(output_dir, 'concatenated_alignment.faa')
    marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
    create_concatenated_alignment(genome_ids, marker_genes, output_alignment_dir, concatenated_alignment_file, marker_file)

    # create concatenated genome tree
    self.logger.info('Inferring concatenated genome tree.')
    concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
    concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
    log_file = os.path.join(output_dir, 'fasttree.log')
    fast_tree = FastTree(multithreaded=True)
    fast_tree.run(concatenated_alignment_file, 'prot', model, concatenated_tree, concatenated_tree_log, log_file)

    # generate summary report; the context manager guarantees the report
    # is flushed and closed even if one of the writes raises
    report_out = os.path.join(output_dir, 'infer_workflow.log')
    with open(report_out, 'w') as fout:
        fout.write('[infer]\n')
        fout.write('Genome Id file: %s\n' % genome_id_file)
        fout.write('Marker Id file: %s\n' % marker_id_file)
        fout.write('Model of evolution: %s\n' % model)
        fout.write(time_keeper.get_time_stamp())
class OptionsParser():
    """Validate parsed command-line options and run the requested command.

    Each public method implements one PhyloRank sub-command and takes the
    parsed options object; parse_options() selects the method to call
    from options.subparser_name.
    """

    # Sub-commands accepted by parse_options(). Every entry is also the
    # name of the method implementing that command, which keeps dispatch
    # to an explicit whitelist rather than an arbitrary attribute lookup.
    _COMMANDS = ('outliers',
                 'mark_tree',
                 'tree_diff',
                 'tree_tax_diff',
                 'tax_diff',
                 'decorate',
                 'pull',
                 'validate',
                 'append',
                 'taxon_stats',
                 'robustness_plot',
                 'dist_plot',
                 'rd_ranks',
                 'bl_dist',
                 'bl_optimal',
                 'bl_decorate',
                 'bl_table',
                 'rank_res')

    def __init__(self):
        """Initialization"""
        self.logger = logging.getLogger()
        self.time_keeper = TimeKeeper()

    @staticmethod
    def _make_dir(path):
        """Create directory (and any missing parents) if it does not exist."""
        if not os.path.exists(path):
            os.makedirs(path)

    def outliers(self, options):
        """Create information for identifying taxonomic outliers"""

        check_file_exists(options.input_tree)
        if options.plot_taxa_file:
            check_file_exists(options.plot_taxa_file)
        if options.trusted_taxa_file:
            check_file_exists(options.trusted_taxa_file)

        self._make_dir(options.output_dir)

        o = Outliers(options.dpi)
        o.run(options.input_tree,
              options.taxonomy_file,
              options.output_dir,
              options.plot_taxa_file,
              options.plot_dist_taxa_only,
              options.plot_domain,
              options.trusted_taxa_file,
              options.fixed_root,
              options.min_children,
              options.min_support,
              options.verbose_table)

        self.logger.info('Done.')

    def tree_diff(self, options):
        """Tree diff command."""

        check_file_exists(options.input_tree1)
        check_file_exists(options.input_tree2)

        self._make_dir(options.output_dir)

        td = TreeDiff()
        td.run(options.input_tree1,
               options.input_tree2,
               options.output_dir,
               options.min_support,
               options.min_taxa,
               options.named_only)

        self.logger.info('Done.')

    def tree_tax_diff(self, options):
        """Taxonomy difference command (between two trees)."""

        check_file_exists(options.input_tree1)
        check_file_exists(options.input_tree2)

        self._make_dir(options.output_dir)

        td = TaxDiff()
        td.tree_tax_diff(options.input_tree1,
                         options.input_tree2,
                         options.output_dir)

        self.logger.info('Done.')

    def tax_diff(self, options):
        """Taxonomy difference command (between two taxonomy files)."""

        check_file_exists(options.tax1_file)
        check_file_exists(options.tax2_file)

        self._make_dir(options.output_dir)

        td = TaxDiff()
        td.tax_diff(options.tax1_file,
                    options.tax2_file,
                    options.include_user_taxa,
                    options.output_dir)

        self.logger.info('Done.')

    def dist_plot(self, options):
        """Distribution plot command"""

        check_file_exists(options.input_tree)
        if options.plot_taxa_file:
            check_file_exists(options.plot_taxa_file)
        if options.trusted_taxa_file:
            check_file_exists(options.trusted_taxa_file)

        dist_plot = DistributionPlot()
        dist_plot.run(options.input_tree,
                      options.output_prefix,
                      options.plot_taxa_file,
                      options.trusted_taxa_file,
                      options.min_children,
                      options.min_support)

        self.logger.info('Done.')

    def mark_tree(self, options):
        """Mark tree command"""

        check_file_exists(options.input_tree)

        mt = MarkTree()
        mt.run(options.input_tree,
               options.output_tree,
               options.min_support,
               options.only_named_clades,
               options.min_length,
               not options.no_percentile,
               not options.no_relative_divergence,
               not options.no_prediction,
               options.thresholds)

        self.logger.info('Marked tree written to: %s' % options.output_tree)

    def decorate(self, options):
        """Place internal taxonomic labels on tree."""

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        decorate = Decorate()
        decorate.run(options.input_tree,
                     options.taxonomy_file,
                     options.trusted_taxa_file,
                     options.min_children,
                     options.min_support,
                     options.output_tree)

        self.logger.info('Finished decorating tree.')

    def pull(self, options):
        """Pull command"""

        check_file_exists(options.input_tree)

        t = Taxonomy().read_from_tree(options.input_tree)
        if not options.no_rank_fill:
            # iterate over a snapshot of the keys so the mapping can be
            # updated in place on both Python 2 and Python 3
            for taxon_id in list(t):
                t[taxon_id] = Taxonomy().fill_missing_ranks(t[taxon_id])

        Taxonomy().write(t, options.output_file)

        self.logger.info('Taxonomy strings written to: %s' % options.output_file)

    def validate(self, options):
        """Validate command"""

        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy()
        t = taxonomy.read(options.taxonomy_file)

        # NOTE(review): 'no_hierarhcy' is spelled as the attribute is read
        # here; presumably it matches the argument parser -- confirm before
        # renaming.
        errors = taxonomy.validate(t,
                                   not options.no_prefix,
                                   not options.no_all_ranks,
                                   not options.no_hierarhcy,
                                   not options.no_species,
                                   True)

        invalid_ranks, invalid_prefixes, invalid_species_name, invalid_hierarchies = errors

        if sum(len(e) for e in errors) == 0:
            self.logger.info('No errors identified in taxonomy file.')
        else:
            self.logger.info('Identified %d incomplete taxonomy strings.' % len(invalid_ranks))
            self.logger.info('Identified %d rank prefix errors.' % len(invalid_prefixes))
            self.logger.info('Identified %d invalid species names.' % len(invalid_species_name))
            self.logger.info('Identified %d taxa with multiple parents.' % len(invalid_hierarchies))

    def append(self, options):
        """Append command"""

        banner = '*******************************************************************************'
        self.logger.info('')
        self.logger.info(banner)
        self.logger.info(' [PhyloRank - append] Appending taxonomy to extant tree labels.')
        self.logger.info(banner)

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy().read(options.taxonomy_file)

        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for n in tree.leaf_node_iter():
            taxa_str = taxonomy.get(n.label, None)
            if taxa_str is None:
                self.logger.error('Taxonomy file does not contain an entry for %s.' % n.label)
                sys.exit(-1)
            n.label = n.label + '|' + ';'.join(taxa_str)

        tree.write_to_path(options.output_tree,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        self.logger.info('')
        self.logger.info(' Decorated tree written to: %s' % options.output_tree)

        self.time_keeper.print_time_stamp()

    def taxon_stats(self, options):
        """Taxon stats command"""

        check_file_exists(options.taxonomy_file)

        taxonomy = Taxonomy().read(options.taxonomy_file)
        taxon_children = Taxonomy().taxon_children(taxonomy)

        species_index = Taxonomy.rank_index['s__']

        with open(options.output_file, 'w') as fout:
            fout.write('Taxa')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t# named %s' % rank)
            fout.write('\t# extant taxon with complete taxonomy')
            fout.write('\n')

            for rank_prefix in Taxonomy.rank_prefixes:
                rank_idx = Taxonomy.rank_index[rank_prefix]

                # find taxon at the specified rank
                cur_taxa = sorted(taxon for taxon in taxon_children
                                  if taxon.startswith(rank_prefix))

                for taxon in cur_taxa:
                    fout.write(taxon)
                    # ranks above the taxon's own rank have no count
                    fout.write('\t-' * rank_idx)

                    # walk down one rank at a time, counting the union of
                    # children of all taxa found at the previous level
                    next_taxa = [taxon]
                    for _ in range(rank_idx, species_index + 1):
                        children_taxa = set()
                        for t in next_taxa:
                            children_taxa.update(taxon_children[t])

                        fout.write('\t%d' % len(children_taxa))
                        next_taxa = children_taxa
                    fout.write('\n')

        self.logger.info('Summary statistics written to: %s' % options.output_file)

    def robustness_plot(self, options):
        """Robustness plot command"""

        banner = '*******************************************************************************'
        self.logger.info('')
        self.logger.info(banner)
        self.logger.info(' [PhyloRank - robustness_plot] Plotting distances across a set of tree.')
        self.logger.info(banner)

        robustness_plot = RobustnessPlot()
        robustness_plot.run(options.rank,
                            options.input_tree_dir,
                            options.full_tree_file,
                            options.derep_tree_file,
                            options.taxonomy_file,
                            options.output_prefix,
                            options.min_children,
                            options.title)

        self.time_keeper.print_time_stamp()

    def rd_ranks(self, options):
        """Calculate number of taxa for specified rd thresholds."""

        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        r = RdRanks()
        r.run(options.input_tree,
              options.thresholds,
              options.output_dir)

        self.logger.info('Done.')

    def bl_dist(self, options):
        """Calculate distribution of branch lengths at each taxonomic rank."""

        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        b = BranchLengthDistribution()
        b.run(options.input_tree,
              options.trusted_taxa_file,
              options.min_children,
              options.taxonomy_file,
              options.output_dir)

        self.logger.info('Done.')

    def bl_optimal(self, options):
        """Determine branch length for best congruency with existing taxonomy."""

        b = BranchLengthDistribution()
        optimal_bl, correct_taxa, incorrect_taxa = b.optimal(options.input_tree,
                                                             options.rank,
                                                             options.min_dist,
                                                             options.max_dist,
                                                             options.step_size,
                                                             options.output_table)

        # report a precision of zero rather than failing when no taxa
        # were evaluated at all
        total_taxa = correct_taxa + incorrect_taxa
        prec = float(correct_taxa) / total_taxa if total_taxa else 0.0

        self.logger.info('Optimal branch length is %f.' % optimal_bl)
        self.logger.info('This results in %d correct and %d incorrect taxa (precision = %.2f).' % (correct_taxa, incorrect_taxa, prec))

    def bl_decorate(self, options):
        """Decorate tree based using a mean branch length criterion."""

        check_file_exists(options.input_tree)

        b = BranchLengthDistribution()
        b.decorate(options.input_tree,
                   options.taxonomy_file,
                   options.threshold,
                   options.rank,
                   options.retain_named_lineages,
                   options.keep_labels,
                   options.prune,
                   options.output_tree)

        self.logger.info('Done.')

    def bl_table(self, options):
        """Produce table with number of lineage for increasing mean branch lengths."""

        check_file_exists(options.input_tree)
        check_file_exists(options.taxon_category)

        b = BranchLengthDistribution()
        b.table(options.input_tree,
                options.taxon_category,
                options.step_size,
                options.output_table)

        self.logger.info('Done.')

    def rank_res(self, options):
        """Calculate taxonomic resolution at each rank.

        Counts, for every rank, the named internal nodes of the input tree
        carrying that rank, keyed by the lowest rank in the node label.
        Genomes whose taxonomy string is undefined at a rank are counted
        as having species-level resolution at that rank. The table is
        written to options.output_file and, when options.taxa_file is
        given, the individual taxa are listed there.
        """

        check_file_exists(options.input_tree)
        check_file_exists(options.taxonomy_file)

        taxa_out = None
        if options.taxa_file:
            taxa_out = open(options.taxa_file, 'w')

        rank_res = defaultdict(lambda: defaultdict(int))
        try:
            if taxa_out:
                taxa_out.write('Rank\tLowest Rank\tTaxon\n')

            # determine taxonomic resolution of named groups
            tree = dendropy.Tree.get_from_path(options.input_tree,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
                if not node.label or node.is_leaf():
                    continue

                _support, taxon_name, _auxiliary_info = parse_label(node.label)
                if not taxon_name:
                    continue

                # rank prefix (e.g. 'g__') of the last taxon in the label
                lowest_rank = [x.strip() for x in taxon_name.split(';')][-1][0:3]
                for rank_prefix in Taxonomy.rank_prefixes:
                    if rank_prefix in taxon_name:
                        rank_res[rank_prefix][lowest_rank] += 1
                        if taxa_out:
                            rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                            lowest_rank_name = Taxonomy.rank_labels[Taxonomy.rank_index[lowest_rank]]
                            taxa_out.write('%s\t%s\t%s\n' % (rank_prefix_name, lowest_rank_name, taxon_name))

            # identify any singleton taxa which are treated as having species level resolution
            with open(options.taxonomy_file) as tax_in:
                for line in tax_in:
                    line = line.rstrip('\r\n')
                    if not line:
                        continue

                    line_split = line.split('\t')
                    genome_id = line_split[0]

                    # strip every field: without this the last rank keeps
                    # its trailing whitespace and a bare 's__' never matches
                    taxonomy = [x.strip() for x in line_split[1].split(';')]

                    for i, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                        if taxonomy[i] == rank_prefix:
                            # this taxa is undefined at the specified rank so
                            # must be the sole representative; e.g., a p__
                            # indicates a taxon that represents a novel phyla
                            rank_res[rank_prefix]['s__'] += 1
                            if taxa_out:
                                rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                                taxa_out.write('%s\t%s\t%s (%s)\n' % (rank_prefix_name, 'species', taxonomy[i], genome_id))
        finally:
            if taxa_out:
                taxa_out.close()

        # write out results
        with open(options.output_file, 'w') as fout:
            fout.write('Category')
            for rank in Taxonomy.rank_labels[1:]:
                fout.write('\t' + rank)
            fout.write('\n')

            for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
                fout.write(Taxonomy.rank_labels[i + 1])

                for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                    if i >= j:
                        fout.write('\t' + str(rank_res[r].get(rank_prefix, 0)))
                    else:
                        fout.write('\t-')
                fout.write('\n')

        self.logger.info('Done.')

    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s)

        Parameters
        ----------
        options : object
            Parsed options; options.subparser_name selects the command.

        Returns
        -------
        int
            Zero once the selected command has finished. An unknown
            command is logged and terminates the process with exit
            status 1.
        """

        logging.basicConfig(format='', level=logging.INFO)

        command = options.subparser_name
        if command not in self._COMMANDS:
            self.logger.error(' [Error] Unknown PhyloRank command: ' + str(command) + '\n')
            # a failed run must not report success to the calling shell
            sys.exit(1)

        getattr(self, command)(options)

        return 0
class MakeDatabase(object):
    """Make a dereplicated database of genes.

    Dereplication is done between genes within a named taxonomic
    group (e.g., genomes in the same genus) and is based on the
    average amino acid identity (AAI) between genes. Groups with large
    numbers of taxa can take an excessive amount of time to
    process so are subsampled to a specific number of taxa.
    Subsampling is done in a manner which aims to retain
    phylogenetic diversity and thus helps ensure a good
    distribution of genes within the group. Care is taken
    to ensure type strains are retained during dereplication.

    Note: this script is tailored to IMG in that it assumes
    a certain directory structure and file extensions. It also
    corrects a number of common issues with IMG genomes:
      - non-ascii characters in fasta header lines
      - hyphens at the start of some protein sequences
    """

    def __init__(self):
        """Initialize."""

        check_dependencies(['comparem', 'diamond', 'makeblastdb'])

        self.underclassified = 'underclassified'

        self.rank_prefixes = Taxonomy.rank_prefixes
        self.rank_index = Taxonomy.rank_index
        self.rank_labels = Taxonomy.rank_labels

        self.time_keeper = TimeKeeper()

    def read_taxonomy(self, input_taxonomy):
        """Read taxonomy file.

        Taxonomy file should have the following format:
            <genome_id>\t<taxonomy_str>

            where taxonomy_str is in GreenGenes format:
                d__Bacteria;p__Firmicutes;...

        Blank lines are skipped.

        Parameters
        ----------
        input_taxonomy : str
            Taxonomy file.

        Returns
        -------
        dict
            Taxonomy for each genome id.
        """

        taxonomy = {}
        with open(input_taxonomy) as fin:
            for line in fin:
                if not line.strip():
                    continue

                line_split = line.split('\t')
                taxonomy[line_split[0]] = [x.strip() for x in line_split[1].split(';')]

        return taxonomy

    def read_type_strain(self, type_strain_file):
        """Read type strain file.

        The type strain file should have the following format:
            <genome_id>\t<genome_name>

        Blank lines are skipped and a line holding only a genome id is
        accepted.

        Parameters
        ----------
        type_strain_file : str
            File specifying type strains.

        Returns
        -------
        set
            Set of all genome ids specified as type strains.
        """

        type_strains = set()
        with open(type_strain_file) as fin:
            for line in fin:
                # drop the line ending first so an id-only line does not
                # end up in the set with a trailing newline
                line = line.rstrip('\r\n')
                if not line:
                    continue
                type_strains.add(line.split('\t')[0])

        return type_strains

    def select_taxa(self, genome_list, taxonomy, type_strains, max_taxa):
        """Select subset of genomes with a good distribution across named groups.

        Groups genomes into named groups and subsamples evenly across
        these groups. Ideally, genomes would be grouped into species, but
        some genomes may not have a species identifier. Such genomes are
        assigned to the most specific named group possible. Any genome
        marked as a type strain will be retained, so the result may hold
        more than max_taxa genomes when there are many type strains.

        Parameters
        ----------
        genome_list : iterable of genome ids
            Genomes to subsample.
        taxonomy : d[genome_id] -> [domain, ..., species]
            Taxonomy of each genome.
        type_strains : iterable
            Genome identifiers of type strains.
        max_taxa : int
            Number of genomes to retain.

        Returns
        -------
        iterable
            Subsampled list of genomes; genome_list itself when it already
            holds no more than max_taxa genomes.
        """

        if len(genome_list) <= max_taxa:
            return genome_list

        reduced_genome_list = []

        # group genomes into the most specific named groups possible
        groups = defaultdict(set)
        for genome_id in genome_list:
            # add in type strains regardless of taxonomy
            if genome_id in type_strains:
                reduced_genome_list.append(genome_id)
                continue

            # get first classified rank, scanning from species to domain;
            # falls back to the domain entry when nothing is classified
            for rank_index in range(self.rank_index['s__'], -1, -1):
                taxa = taxonomy[genome_id][rank_index]
                if taxa != self.rank_prefixes[rank_index]:
                    break

            groups[taxa].add(genome_id)

        # sample genomes from each named group
        while len(reduced_genome_list) < max_taxa:
            if not any(groups.values()):
                # every group is exhausted (e.g. duplicate ids in the
                # input), so there is nothing left to sample
                break

            genomes_to_select = max_taxa - len(reduced_genome_list)
            genomes_per_group = max(genomes_to_select // len(groups), 1)
            for genome_ids in groups.values():
                # sample from a sorted list: random.sample() needs a
                # sequence on current Python and this keeps results
                # reproducible for a given seed
                selected_genomes = random.sample(sorted(genome_ids),
                                                 min(len(genome_ids), genomes_per_group))
                genome_ids.difference_update(selected_genomes)

                reduced_genome_list.extend(selected_genomes)

                if len(reduced_genome_list) == max_taxa:
                    # special case where we are adding single genomes from each group
                    break

        return reduced_genome_list

    def write_gene_file(self, gene_out, gene_dir, genome_list, taxonomy, genes_to_ignore):
        """Write genes to output stream.

        Parameters
        ----------
        gene_out : stream
            Output stream.
        gene_dir : str
            Directory containing called genes in amino acid space.
        genome_list : iterable
            Genomes to process.
        taxonomy : dict
            Taxonomy of each genome; not used by this method.
        genes_to_ignore : set
            Genes which should not be written to file.

        Returns
        -------
        int
            Number of genes written to the stream.
        """

        printable = set(string.printable)

        genes_kept = 0
        for genome_id in genome_list:
            genome_gene_file = os.path.join(gene_dir, genome_id + '.faa')
            if not os.path.exists(genome_gene_file):
                print('[WARNING] Missing gene file for genome %s.' % genome_gene_file)
                continue

            if os.stat(genome_gene_file).st_size == 0:
                print('[WARNING] Gene file is empty for genome %s.' % genome_gene_file)
                continue

            for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
                if gene_id in genes_to_ignore:
                    continue

                # IMG headers sometimes contain non-ascii characters which cause
                # problems with BLAST and DIAMOND so these are explicitly filtered out
                annotation = ''.join(ch for ch in annotation if ch in printable)

                # a few IMG genomes contain protein sequences which start with a hyphen
                if seq.startswith('-'):
                    seq = seq[1:]

                gene_out.write('>' + gene_id + ' ' + annotation + '\n')
                gene_out.write(seq + '\n')
                genes_kept += 1

        return genes_kept

    def img_gene_id_to_scaffold_id(self, genome_dir, genome_id, output_dir):
        """Modify IMG gene ids to format which explicitly gives scaffold names.

        For downstream processing it is often necessary to know which scaffold
        a gene is contained on. IMG uses unique identifiers for genes. As such,
        these are changed to the following format:
            <scaffold_id>_<gene #> <annotation> [IMG gene id]

        Reads <genome_id>.gff and <genome_id>.genes.faa from genome_dir and
        writes <genome_id>.faa to output_dir.

        Parameters
        ----------
        genome_dir : str
            Directory with files for genome.
        genome_id : str
            Unique identifier of genome.
        output_dir : str
            Directory to contain modified fasta files.
        """

        # determine source scaffold for each gene
        gene_id_to_scaffold_id = {}
        gene_number = defaultdict(int)
        with open(os.path.join(genome_dir, genome_id + '.gff')) as gff:
            for line in gff:
                if line.startswith('#') or not line.strip():
                    continue

                line_split = line.split('\t')
                scaffold_id = line_split[0]
                info = line_split[8]
                if info != '':  # this will be empty for non-protein coding genes
                    gene_id = info.split(';')[0].replace('ID=', '')

                    gene_number[scaffold_id] += 1
                    gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

        # write out gene file with modified identifiers
        genome_gene_file = os.path.abspath(os.path.join(genome_dir, genome_id + '.genes.faa'))

        with open(os.path.join(output_dir, genome_id + '.faa'), 'w') as fout:
            for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
                annotation = annotation[annotation.find(' ') + 1:]  # remove additional gene id from annotation
                annotation += ' [IMG Gene ID: ' + gene_id + ']'  # append IMG gene id for future reference
                fout.write('>' + gene_id_to_scaffold_id[gene_id] + ' ' + annotation + '\n')
                fout.write(seq + '\n')

    def amend_gene_identifies(self, gene_dir, output_dir):
        """Modify gene ids to include source genome id.

        The following format is used:
          <gene_id>~<genome_id>

        A trailing '*' on a sequence is removed.

        Parameters
        ----------
        gene_dir : str
            Directory with fasta files containing protein sequences.
        output_dir : str
            Directory to contain modified fasta files.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for f in os.listdir(gene_dir):
            gf = os.path.join(gene_dir, f)
            genome_id = remove_extension(gf)

            aa_file = os.path.join(output_dir, genome_id + '.faa')
            with open(aa_file, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                    fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                    if seq.endswith('*'):
                        seq = seq[0:-1]
                    fout.write(seq + '\n')

    def filter_aai(self, tmp_dir, gene_dir, ammended_gene_dir, per_identity, per_aln_len, cpus):
        """Filter genes with similar amino acid identity.

        Runs 'comparem rblast' and 'comparem aai' and then greedily picks
        genes to keep, starting with the genes having the most homologs;
        every homolog of a kept gene is marked for removal.

        Parameters
        ----------
        tmp_dir : str
            Temporary directory for storing results.
        gene_dir : str
            Directory with fasta files containing protein sequences.
        ammended_gene_dir : str
            Directory to store protein sequences with amended gene ids.
        per_identity : float
            Percent identity for subsampling similar genes.
        per_aln_len : float
            Percent alignment length for subsampling similar genes.
        cpus : int
            Number of cpus to use.

        Returns
        -------
        genes_to_remove : set
            Unique identifiers of genes to filter.
        """

        rblast_dir = os.path.join(tmp_dir, 'rblast')
        os.system('comparem rblast -e 1e-10 -p %d -c %d %s %s' % (per_identity, cpus, gene_dir, rblast_dir))

        aai_dir = os.path.join(tmp_dir, 'aai')
        os.system('comparem aai -p %d -a %d -c %d %s %s' % (per_identity, per_aln_len, cpus, rblast_dir, aai_dir))

        # identify homologs to be filtered
        print('')
        print(' Identifying homologs to be filtered.')
        shared_genes_dir = os.path.join(aai_dir, 'shared_genes')
        files = os.listdir(shared_genes_dir)

        homologs = defaultdict(set)
        for f in files:
            with open(os.path.join(shared_genes_dir, f)) as fin:
                fin.readline()  # skip first line (presumably a header -- confirm)

                for line in fin:
                    line_split = line.split('\t')

                    gene_idA = line_split[0]
                    gene_idB = line_split[1]

                    homologs[gene_idA].add(gene_idB)
                    homologs[gene_idB].add(gene_idA)

        genes_to_remove = set()
        genes_to_keep = set()
        sorted_keys = sorted(homologs, key=lambda k: len(homologs[k]), reverse=True)
        for gene_id in sorted_keys:
            gene_set = homologs[gene_id]

            if gene_set.intersection(genes_to_keep):
                # a homolog is already being kept, so this gene is redundant
                genes_to_remove.update(gene_set - genes_to_keep)
                genes_to_remove.add(gene_id)
            else:
                genes_to_keep.add(gene_id)
                genes_to_remove.update(gene_set - genes_to_keep)

        # The CompareM call to rblast creates fasta files where gene ids are modified to
        # also contain genome ids. This is just a hack so to point to the directory with
        # these amended fasta files.
        os.system('ln -s %s %s' % (os.path.join(rblast_dir, 'genes'), ammended_gene_dir))

        return genes_to_remove

    def run(self,
            taxonomy_file,
            type_strains_file,
            genome_dir,
            max_taxa,
            rank,
            per_identity,
            per_aln_len,
            genomes_to_process,
            keep_all_genes,
            create_diamond_db,
            create_blast_db,
            cpus,
            output_dir):
        """Create dereplicated set of genes.

        Taxonomy file should have the following format:
            <genome_id>\t<taxonomy_str>

            where taxonomy_str is in GreenGenes format:
                d__Bacteria;p__Proteobacteria;...;s__Escherichia coli

        Type strain file should have the following format:
            <genome_id>\t<genome name>

        Parameters
        ----------
        taxonomy_file : str
            File indicating taxonomy string for all genomes of interest
        type_strains_file : str
            File indicating type strains.
        genome_dir : str
            Directory with genomes in individual directories.
        max_taxa : int
            Maximum taxa to retain in a named group.
        rank : int
            Taxonomic rank to perform dereplication (0 = domain, ..., 6 = species).
        per_identity : float
            Percent identity for subsampling similar genes.
        per_aln_len : float
            Percent alignment length for subsampling similar genes.
        genomes_to_process : str
            File with list of genomes to retain instead of performing taxon subsampling.
        keep_all_genes : boolean
            Flag indicating that no gene subsampling should be performed.
        create_diamond_db : boolean
            Flag indicating if DIAMOND database should be created.
        create_blast_db : boolean
            Flag indicating if BLAST database should be created.
        cpus : int
            Number of cpus to use.
        output_dir : str
            Desired output directory for storing results.
        """

        make_sure_path_exists(output_dir)

        print('Dereplicating at the rank of %s.' % self.rank_labels[rank])

        print('')
        print('Reading taxonomy file.')
        taxonomy = self.read_taxonomy(taxonomy_file)
        print(' There are %d genomes with taxonomy strings.' % len(taxonomy))

        print('')
        print('Reading type strain file.')
        type_strains = self.read_type_strain(type_strains_file)
        print(' There are %d type strains.' % len(type_strains))

        # get specific list of genomes to process
        genomes_to_retain = set()
        if genomes_to_process:
            print('')
            print('Reading genomes to retain.')
            with open(genomes_to_process) as fin:
                for line in fin:
                    line_split = line.split()
                    if line_split:
                        genomes_to_retain.add(line_split[0])
            print(' Retaining %d genomes.' % len(genomes_to_retain))

        # identify unique genes in each named group
        rank_genomes = defaultdict(list)
        genomes_with_missing_data = set()
        underclassified_genomes = 0
        with open(os.path.join(output_dir, 'genomes_without_called_genes.tsv'), 'w') as fout:
            for genome_id, t in taxonomy.items():
                if genomes_to_process and genome_id not in genomes_to_retain:
                    continue

                genome_file = os.path.join(genome_dir, genome_id, genome_id + '.genes.faa')
                if not os.path.exists(genome_file):
                    genomes_with_missing_data.add(genome_id)
                    fout.write(genome_id + '\t' + ';'.join(t) + '\n')
                    continue

                taxa = t[rank]
                if taxa[3:] == '':
                    # only the rank prefix is present, i.e. no name at this rank
                    underclassified_genomes += 1
                    rank_genomes[self.underclassified].append(genome_id)
                else:
                    rank_genomes[taxa].append(genome_id)

        total_genomes_to_process = sum(len(genome_list) for genome_list in rank_genomes.values())

        print('')
        print('Under-classified genomes automatically placed into the database: %d' % underclassified_genomes)
        print('Genomes with missing sequence data: %d' % len(genomes_with_missing_data))
        print('')
        print('Total named groups: %d' % len(rank_genomes))
        print('Total genomes to process: %d' % total_genomes_to_process)

        # process each named group
        print('')
        today = str(datetime.date.today())
        gene_file = os.path.join(output_dir, 'genome_db.%s.genes.faa' % today)
        taxonomy_file_out = os.path.join(output_dir, 'taxonomy.%s.tsv' % today)

        total_genes_removed = 0
        total_genes_kept = 0
        total_genomes_kept = 0
        processed_genomes = 0

        tmp_dir = tempfile.mkdtemp()
        try:
            with open(gene_file, 'w') as gene_out, open(taxonomy_file_out, 'w') as taxonomy_out:
                for taxa, genome_list in rank_genomes.items():
                    processed_genomes += len(genome_list)

                    print('')
                    print('-------------------------------------------------------------------------------')
                    print(' Processing %s | Finished %d of %d (%.2f%%) genomes.' % (taxa, processed_genomes, total_genomes_to_process, processed_genomes * 100.0 / total_genomes_to_process))
                    print(self.time_keeper.get_time_stamp())
                    print('-------------------------------------------------------------------------------')

                    # create directory with selected genomes
                    taxon_dir = os.path.join(tmp_dir, 'taxon')
                    os.mkdir(taxon_dir)

                    reduced_genome_list = genome_list
                    if not genomes_to_process and taxa != self.underclassified:  # perform taxon subsampling
                        reduced_genome_list = self.select_taxa(genome_list, taxonomy, type_strains, max_taxa)
                    total_genomes_kept += len(reduced_genome_list)

                    gene_dir = os.path.join(taxon_dir, 'genes')
                    os.mkdir(gene_dir)
                    for genome_id in reduced_genome_list:
                        taxonomy_out.write(genome_id + '\t' + ';'.join(taxonomy[genome_id]) + '\n')

                        cur_genome_dir = os.path.join(genome_dir, genome_id)
                        self.img_gene_id_to_scaffold_id(cur_genome_dir, genome_id, gene_dir)

                    # filter genes based on amino acid identity
                    genes_to_remove = []
                    amended_gene_dir = os.path.join(taxon_dir, 'ammended_genes')
                    if keep_all_genes or taxa == self.underclassified:
                        # modify gene identifiers to include genome ids
                        self.amend_gene_identifies(gene_dir, amended_gene_dir)
                    else:
                        # filter genes on AAI
                        genes_to_remove = self.filter_aai(taxon_dir, gene_dir, amended_gene_dir, per_identity, per_aln_len, cpus)

                    print('')
                    print(' Writing unique genes from genomes in %s.' % taxa)
                    genes_kept = self.write_gene_file(gene_out, amended_gene_dir, reduced_genome_list, taxonomy, genes_to_remove)

                    print(' Retain %d of %d taxa.' % (len(reduced_genome_list), len(genome_list)))
                    print(' Genes to keep: %d' % genes_kept)
                    print(' Genes removed: %d' % len(genes_to_remove))

                    total_genes_kept += genes_kept
                    total_genes_removed += len(genes_to_remove)

                    shutil.rmtree(taxon_dir)

            # percentages fall back to zero when nothing was processed
            per_genomes_kept = 0.0
            if total_genomes_to_process:
                per_genomes_kept = total_genomes_kept * 100.0 / total_genomes_to_process

            total_genes = total_genes_kept + total_genes_removed
            per_genes_removed = 0.0
            if total_genes:
                per_genes_removed = total_genes_removed * 100.0 / total_genes

            print('')
            print('Retain %d of %d (%.1f%%) genomes' % (total_genomes_kept, total_genomes_to_process, per_genomes_kept))
            print(' Total genes kept: %d' % total_genes_kept)
            print(' Total genes removed: %d (%.1f%%)' % (total_genes_removed, per_genes_removed))

            if create_diamond_db:
                print('')
                print('Creating DIAMOND database.')
                # honour the requested number of cpus instead of a fixed thread count
                os.system('diamond makedb -b 10 -p %d -d %s --in %s' % (cpus, gene_file, gene_file))
                print('')

            if create_blast_db:
                print('')
                print('Creating BLAST database.')
                os.system('makeblastdb -dbtype prot -in %s' % gene_file)
                print('')
        finally:
            # always remove the scratch directory, even after a failure
            shutil.rmtree(tmp_dir)