def call_genes(self, options):
    """Call genes command.

    Runs Prodigal on the genome files found in ``options.genome_nt_dir`` and,
    when ``options.unbinned_file`` is set, on that file as well (with
    ``meta=True``). Exits if any genome file fails the nucleotide check.
    """
    banner = (
        '',
        '*******************************************************************************',
        ' [RefineM - call_genes] Identifying genes within genomes.',
        '*******************************************************************************',
    )
    for banner_line in banner:
        self.logger.info(banner_line)

    check_dir_exists(options.genome_nt_dir)
    make_sure_path_exists(options.output_dir)

    nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    all_nucleotide = self._check_nuclotide_seqs(nt_files)
    if not all_nucleotide:
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        sys.exit()

    # gene calling for the genomes
    gene_caller = Prodigal(options.cpus)
    gene_caller.run(nt_files, options.output_dir)
    self.logger.info(' Genes in genomes written to: %s' % options.output_dir)

    # gene calling for the unbinned scaffolds, if a file was supplied
    if options.unbinned_file:
        unbinned_dir = os.path.join(options.output_dir, 'unbinned')
        gene_caller.run([options.unbinned_file], unbinned_dir, meta=True)
        self.logger.info(' Genes in unbinned scaffolds written to: %s' % unbinned_dir)

    self.time_keeper.print_time_stamp()
def _genome_files(self, genome_dir, genome_ext):
    """Collect the genome files in a directory.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes of interest.
    genome_ext : str
        Extension of genome files.

    Returns
    -------
    list
        Paths (``genome_dir`` joined with the file name) of every
        directory entry whose name ends with ``genome_ext``.

    Logs a warning and exits the program when no file matches.
    """
    check_dir_exists(genome_dir)

    matches = [os.path.join(genome_dir, entry)
               for entry in os.listdir(genome_dir)
               if entry.endswith(genome_ext)]

    if len(matches) == 0:
        self.logger.warning(' [Warning] No genomes found. Check the --genome_ext flag used to identify genomes.')
        sys.exit()

    return matches
def lgt_codon(self, options):
    """LGT codon usage command.

    Gathers the gene files in ``options.gene_dir`` that end with
    ``options.gene_ext`` and passes them to ``LgtCodon.run`` together with
    ``options.output_dir``. Returns early (after a warning) when no gene
    file is found.
    """
    banner = (
        '',
        '*******************************************************************************',
        ' [CompareM - lgt_codon] Calculating codon usage of genes.',
        '*******************************************************************************',
        '',
    )
    for banner_line in banner:
        self.logger.info(banner_line)

    check_dir_exists(options.gene_dir)

    # files holding called genes
    called_gene_files = [os.path.join(options.gene_dir, entry)
                         for entry in os.listdir(options.gene_dir)
                         if entry.endswith(options.gene_ext)]

    # nothing to do without gene files; tell the user why
    if not called_gene_files:
        self.logger.warning(' [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
        return

    codon_calculator = LgtCodon(options.cpus)
    codon_calculator.run(called_gene_files, options.output_dir)

    self.logger.info('')
    self.logger.info(' Codon usage written to directory: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
def codon_usage(self, options):
    """Codon usage command.

    Gathers the gene files in ``options.gene_dir`` that end with
    ``options.gene_ext``, runs ``CodonUsage`` on them and writes the
    resulting profile to ``options.output_file`` through
    ``self._write_usage_profile``. Returns early (after a warning) when no
    gene file is found.
    """
    banner = (
        '',
        '*******************************************************************************',
        ' [CompareM - codon_usage] Calculating codon usage within each genome.',
        '*******************************************************************************',
        '',
    )
    for banner_line in banner:
        self.logger.info(banner_line)

    check_dir_exists(options.gene_dir)

    # files holding called genes
    called_gene_files = [os.path.join(options.gene_dir, entry)
                         for entry in os.listdir(options.gene_dir)
                         if entry.endswith(options.gene_ext)]

    # nothing to do without gene files; tell the user why
    if not called_gene_files:
        self.logger.warning(' [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
        return

    # calculate codon usage; the third returned value is not needed here
    usage_calculator = CodonUsage(options.cpus, options.keep_ambiguous)
    usage_by_genome, observed_codons, _unused = usage_calculator.run(called_gene_files)

    # write out results
    self._write_usage_profile(usage_by_genome, observed_codons, options.output_file)

    self.logger.info('')
    self.logger.info(' Codon usage written to: %s' % options.output_file)

    self.time_keeper.print_time_stamp()
def _genome_files(self, genome_dir, genome_ext):
    """Collect the genome files in a directory.

    Parameters
    ----------
    genome_dir : str
        Directory containing genomes of interest.
    genome_ext : str
        Extension of genome files.

    Returns
    -------
    list
        Path to each directory entry whose name ends with ``genome_ext``.

    Logs a warning and exits the program when no file matches.
    """
    check_dir_exists(genome_dir)

    matches = [os.path.join(genome_dir, entry)
               for entry in os.listdir(genome_dir)
               if entry.endswith(genome_ext)]

    if len(matches) == 0:
        self.logger.warning(
            'No genomes found. Check the --genome_ext or --protein_ext flag used to identify genomes.'
        )
        sys.exit()

    return matches
def bin_compare(self, options):
    """Bin compare command.

    Loads the genome files of two directories, verifies both sets pass the
    nucleotide check (exiting otherwise) and hands them to
    ``BinComparer.run`` with ``options.scaffold_file`` and
    ``options.output_file``.
    """
    banner = (
        '',
        '*******************************************************************************',
        '[RefineM - bin_compare] Comparing two sets of genomes.',
        '*******************************************************************************',
    )
    for banner_line in banner:
        self.logger.info(banner_line)

    check_dir_exists(options.genome_nt_dir1)
    check_dir_exists(options.genome_nt_dir2)

    def _nucleotide_genomes(genome_dir, genome_ext):
        # load one genome set and stop the program if it is not nucleotide data
        files = self._genome_files(genome_dir, genome_ext)
        if not self._check_nuclotide_seqs(files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()
        return files

    first_set = _nucleotide_genomes(options.genome_nt_dir1, options.genome_ext1)
    second_set = _nucleotide_genomes(options.genome_nt_dir2, options.genome_ext2)

    comparer = BinComparer()
    comparer.run(first_set, second_set, options.scaffold_file, options.output_file)

    self.logger.info('')
    self.logger.info(' Detailed bin comparison written to: ' + options.output_file)

    self.time_keeper.print_time_stamp()
def call_genes(self, options):
    """Call genes command.

    Runs Prodigal on the genome files found in ``options.genome_nt_dir`` and,
    when ``options.unbinned_file`` is set, on that file as well (with
    ``meta=True``). Exits if any genome file fails the nucleotide check.
    """
    check_dir_exists(options.genome_nt_dir)
    make_sure_path_exists(options.output_dir)

    nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    all_nucleotide = self._check_nuclotide_seqs(nt_files)
    if not all_nucleotide:
        self.logger.warning('All files must contain nucleotide sequences.')
        sys.exit()

    # gene calling for the genomes
    gene_caller = Prodigal(options.cpus)
    gene_caller.run(nt_files, options.output_dir)
    self.logger.info('Genes in genomes written to: %s' % options.output_dir)

    # gene calling for the unbinned scaffolds, if a file was supplied
    if options.unbinned_file:
        unbinned_dir = os.path.join(options.output_dir, 'unbinned')
        gene_caller.run([options.unbinned_file], unbinned_dir, meta=True)
        self.logger.info('Genes in unbinned scaffolds written to: %s' % unbinned_dir)
def unanimous(self, options):
    """Unanimous command.

    Resolves the per-method bin directories via ``self._bin_dirs`` and runs
    ``Ensemble`` on them, writing results to ``options.output_dir``.
    """
    check_dir_exists(options.profile_dir)
    make_sure_path_exists(options.output_dir)

    method_bin_dirs = self._bin_dirs(options)

    # NOTE(review): the meaning of each positional flag below follows the
    # original inline comments; confirm against Ensemble.run's signature.
    ensemble = Ensemble(options.bin_prefix)
    ensemble.run(
        options.profile_dir,
        method_bin_dirs,
        options.weight,
        options.sel_min_quality,
        options.sel_min_comp,
        options.sel_max_cont,
        None,
        None,
        None,
        False,  # greedy bin selection
        True,  # unanimous bin selection
        False,  # do not merge
        None,  # no coverage file
        options.report_min_quality,
        options.simple_headers,
        options.output_dir,
    )

    self.logger.info("UniteM 'unanimous' results written to: %s" % options.output_dir)
def unique(self, options):
    """Unique command.

    Looks up the bin files in ``options.bin_dir`` with extension
    ``options.extension`` and passes them to ``BinTools.unique``.
    """
    check_dir_exists(options.bin_dir)

    bin_tools = BinTools()
    bin_tools.unique(bin_tools.bin_files(options.bin_dir, options.extension))
def stop_usage(self, options):
    """Stop codon usage command.

    Gathers the gene files in ``options.gene_dir`` that end with
    ``options.gene_ext``, runs ``CodonUsage`` restricted to stop codons and
    writes a tab-separated table to ``options.output_file``: one column per
    codon (plus an average-sequence-length column per codon when mean gene
    lengths are available) and one row per genome. Returns early (after a
    warning) when no gene file is found.
    """
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - stop_usage] Calculating stop codon usage within each genome.')
    self.logger.info('*******************************************************************************')
    self.logger.info('')

    check_dir_exists(options.gene_dir)

    # get list of files with called genes
    gene_files = [os.path.join(options.gene_dir, f)
                  for f in os.listdir(options.gene_dir)
                  if f.endswith(options.gene_ext)]

    # warn user if no files were found
    if not gene_files:
        self.logger.warning(' [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
        return

    # calculate stop codon usage
    codon_usage = CodonUsage(options.cpus, keep_ambiguous=False, stop_codon_only=True)
    genome_codon_usage, codon_set, mean_gene_length = codon_usage.run(gene_files)

    # write out results; the context manager guarantees the file is flushed
    # and closed even if a write fails part-way through
    with open(options.output_file, 'w') as fout:
        for codon in codon_set:
            fout.write('\t' + codon)
            if mean_gene_length:
                fout.write('\t' + codon + ': avg. seq. length')
        fout.write('\n')

        # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        for genome_id, codons in genome_codon_usage.items():
            fout.write(genome_id)

            for codon in codon_set:
                fout.write('\t%d' % codons.get(codon, 0))

                if mean_gene_length:
                    mean_len = mean_gene_length[genome_id].get(codon, None)
                    if mean_len:
                        fout.write('\t%.1f' % mean_len)
                    else:
                        fout.write('\tna')
            fout.write('\n')

    self.logger.info('')
    self.logger.info(' Stop codon usage written to: %s' % options.output_file)

    self.time_keeper.print_time_stamp()
def marker_files(self, options):
    """Generate marker gene file.

    Validates the input directories and table, makes sure the output
    directory exists, then delegates to ``WebsiteData.marker_files``.
    """
    for gene_dir in (options.bac120_gene_dir, options.ar122_gene_dir):
        check_dir_exists(gene_dir)
    check_file_exists(options.user_gid_table)
    make_sure_path_exists(options.output_dir)

    website_data = WebsiteData(options.release_number, options.output_dir)
    website_data.marker_files(options.bac120_gene_dir,
                              options.ar122_gene_dir,
                              options.user_gid_table)

    self.logger.info('Done.')
def rogue_test(self, options):
    """Rogue taxa command.

    Validates inputs, checks for the ``genometreetk`` dependency when
    ``options.decorate`` is set, and delegates to ``RogueTest.run``.
    """
    check_dir_exists(options.input_tree_dir)
    check_file_exists(options.taxonomy_file)
    make_sure_path_exists(options.output_dir)

    # the dependency is only checked when decoration was requested
    if options.decorate:
        check_dependencies(['genometreetk'])

    rogue_tester = RogueTest()
    rogue_tester.run(
        options.input_tree_dir,
        options.taxonomy_file,
        options.outgroup_taxon,
        options.decorate,
        options.output_dir,
    )

    self.logger.info('Finished rogue taxa test.')
def compare(self, options):
    """Compare command.

    Looks up the bin files of two directories and passes both sets to
    ``BinTools.compare`` together with ``options.assembly_file`` and
    ``options.output_file``.
    """
    check_dir_exists(options.bin_dir1)
    check_dir_exists(options.bin_dir2)

    bin_tools = BinTools()
    first_bins = bin_tools.bin_files(options.bin_dir1, options.extension1)
    second_bins = bin_tools.bin_files(options.bin_dir2, options.extension2)

    bin_tools.compare(first_bins,
                      second_bins,
                      options.assembly_file,
                      options.output_file)

    self.logger.info("UniteM 'compare' results written to: %s" % options.output_file)
def classify(self, options):
    """Determine taxonomic classification of genomes.

    Resolves the genomes to process via ``self._genomes_to_process`` and
    delegates to ``Classify.run``.
    """
    check_dir_exists(options.align_dir)
    make_sure_path_exists(options.out_dir)

    genomes_to_classify = self._genomes_to_process(
        options.genome_dir, options.batchfile, options.extension)

    classifier = Classify(options.cpus)
    classifier.run(
        genomes_to_classify,
        options.align_dir,
        options.out_dir,
        options.prefix,
        options.debug,
    )

    self.logger.info('Done.')
def align(self, options):
    """Create MSA from marker genes.

    Validates the identify directory, makes sure the output directory
    exists, and delegates to ``Markers.align``.
    """
    check_dir_exists(options.identify_dir)
    make_sure_path_exists(options.out_dir)

    # options may not define an outgroup taxon; set it to None on the
    # options object so it can be passed on uniformly
    has_outgroup = hasattr(options, 'outgroup_taxon')
    if not has_outgroup:
        options.outgroup_taxon = None

    marker_tool = Markers(options.cpus)
    marker_tool.align(
        options.identify_dir,
        options.taxa_filter,
        options.min_perc_aa,
        options.custom_msa_filters,
        options.consensus,
        options.min_perc_taxa,
        options.out_dir,
        options.prefix,
        options.outgroup_taxon,
    )

    self.logger.info('Done.')
def unbinned(self, options):
    """Unbinned command.

    Runs ``Unbinned`` on the genome files of ``options.genome_nt_dir`` with
    ``options.scaffold_file`` and ``options.min_seq_len``, and writes the
    returned sequences as FASTA to ``options.output_file``. Exits if any
    genome file fails the nucleotide check.
    """
    check_dir_exists(options.genome_nt_dir)

    nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    all_nucleotide = self._check_nuclotide_seqs(nt_files)
    if not all_nucleotide:
        self.logger.warning('All files must contain nucleotide sequences.')
        sys.exit()

    finder = Unbinned()
    leftover_seqs = finder.run(nt_files, options.scaffold_file, options.min_seq_len)

    seq_io.write_fasta(leftover_seqs, options.output_file)

    self.logger.info('Unbinned scaffolds written to: ' + options.output_file)
def _bin_dirs(self, options): """Get directories with bins from different binners.""" bin_dirs = {} if hasattr(options, 'bin_dirs') and options.bin_dirs: for d in options.bin_dirs: check_dir_exists(d) method_id = os.path.basename(os.path.normpath(d)) bin_ext, count = self._bin_extension(d) if not bin_ext: self.logger.warning('No bins identified for %s in %s.' % (method_id, d)) else: bin_dirs[method_id] = (d, bin_ext) self.logger.info( "Processing %d genomes from %s with extension '%s'." % (count, method_id, bin_ext)) if hasattr(options, 'bin_file') and options.bin_file: check_file_exists(options.bin_file) for line in open(options.bin_file): if line.strip(): line_split = map(str.strip, line.split('\t')) if len(line_split) != 2: self.logger.warning("Skipping invalid line: %s" % line.strip()) continue method_id = line_split[0] d = line_split[1] check_dir_exists(d) bin_ext, count = self._bin_extension(d) if not bin_ext: self.logger.warning( 'No bins identified for %s in %s.' % (method_id, d)) else: bin_dirs[method_id] = (d, bin_ext) self.logger.info( "Processing %d genomes from %s with extension '%s'." % (count, method_id, bin_ext)) return bin_dirs
def identify(self, options):
    """Identify marker genes in genomes.

    Validates whichever of ``options.genome_dir`` / ``options.batchfile``
    is set, resolves the genomes via ``self._genomes_to_process`` and
    delegates to ``Markers.identify``.
    """
    genome_dir = options.genome_dir
    if genome_dir:
        check_dir_exists(genome_dir)

    batchfile = options.batchfile
    if batchfile:
        check_file_exists(batchfile)

    make_sure_path_exists(options.out_dir)

    genomes_to_scan = self._genomes_to_process(
        options.genome_dir, options.batchfile, options.extension)

    marker_tool = Markers(options.cpus)
    marker_tool.identify(genomes_to_scan, options.out_dir, options.prefix)

    self.logger.info('Done.')
def unbinned(self, options):
    """Unbinned command.

    Runs ``Unbinned`` on the genome files of ``options.genome_nt_dir`` with
    ``options.scaffold_file`` and ``options.min_seq_len``, and writes the
    returned sequences as FASTA to ``options.output_file``. Exits if any
    genome file fails the nucleotide check.
    """
    banner = (
        '',
        '*******************************************************************************',
        ' [RefineM - unbinned] Identify unbinned scaffolds.',
        '*******************************************************************************',
    )
    for banner_line in banner:
        self.logger.info(banner_line)

    check_dir_exists(options.genome_nt_dir)

    nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    all_nucleotide = self._check_nuclotide_seqs(nt_files)
    if not all_nucleotide:
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        sys.exit()

    finder = Unbinned()
    leftover_seqs = finder.run(nt_files, options.scaffold_file, options.min_seq_len)

    seq_io.write_fasta(leftover_seqs, options.output_file)

    self.logger.info('')
    self.logger.info(' Unbinned scaffolds written to: ' + options.output_file)

    self.time_keeper.print_time_stamp()
def ssu_erroneous(self, options):
    """Erroneous SSU command.

    Requires ``nhmmer`` and ``blastn``. Uses ``SSU`` to identify, extract
    and classify SSU hits in the genome files of ``options.genome_nt_dir``,
    then calls ``SSU.erroneous`` with the taxon profiles and thresholds from
    ``options``. Exits if any genome file fails the nucleotide check.
    """
    check_dependencies(('nhmmer', 'blastn'))

    check_dir_exists(options.genome_nt_dir)
    check_dir_exists(options.taxon_profile_dir)
    make_sure_path_exists(options.output_dir)

    nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    all_nucleotide = self._check_nuclotide_seqs(nt_files)
    if not all_nucleotide:
        self.logger.warning('All files must contain nucleotide sequences.')
        sys.exit()

    # identify scaffolds with 16S sequences
    ssu_tool = SSU(options.cpus)
    hits = ssu_tool.identify(nt_files,
                             options.evalue,
                             options.concatenate,
                             options.output_dir)
    hit_seq_files = ssu_tool.extract(nt_files, hits, options.output_dir)
    classifications = ssu_tool.classify(hit_seq_files,
                                        options.ssu_db,
                                        options.ssu_taxonomy_file,
                                        options.evalue,
                                        options.output_dir)

    # report statistics for SSU scaffolds
    self.logger.info(
        'Identifying scaffolds with 16S rRNA genes with divergent taxonomic classification.'
    )
    ssu_tool.erroneous(hits,
                       classifications,
                       options.taxon_profile_dir,
                       options.common_taxon,
                       options.ssu_min_len,
                       options.ssu_domain,
                       options.ssu_phylum,
                       options.ssu_class,
                       options.ssu_order,
                       options.ssu_family,
                       options.ssu_genus,
                       options.output_dir)

    self.logger.info('SSU information written to: ' + options.output_dir)
def aai(self, options):
    """AAI command.

    Derives genome identifiers from the ``.faa`` files in the ``genes``
    sub-directory of ``options.rblast_dir`` and delegates to
    ``AAICalculator.run``. Logs a warning and exits when no ``.faa`` file is
    found.
    """
    banner = (
        '',
        '*******************************************************************************',
        ' [CompareM - aai] Calculating the AAI between homologs in genome pairs.',
        '*******************************************************************************',
        '',
    )
    for banner_line in banner:
        self.logger.info(banner_line)

    check_dir_exists(options.rblast_dir)
    make_sure_path_exists(options.output_dir)

    genes_dir = os.path.join(options.rblast_dir, 'genes')
    ids = [remove_extension(entry, '.faa')
           for entry in os.listdir(genes_dir)
           if entry.endswith('.faa')]

    if len(ids) == 0:
        self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
        sys.exit()

    calculator = AAICalculator(options.cpus)
    calculator.run(ids,
                   genes_dir,
                   options.rblast_dir,
                   options.per_identity,
                   options.per_aln_len,
                   options.write_shared_genes,
                   options.output_dir)

    homolog_dir = os.path.join(options.output_dir, calculator.shared_genes)
    self.logger.info('')
    self.logger.info(' Identified homologs between genome pairs written to: %s' % homolog_dir)

    self.time_keeper.print_time_stamp()
def bin_compare(self, options):
    """Bin compare command.

    Loads the genome files of two directories, verifies both sets pass the
    nucleotide check (exiting otherwise) and hands them to
    ``BinComparer.run`` with ``options.scaffold_file`` and
    ``options.output_file``.
    """
    check_dir_exists(options.genome_nt_dir1)
    check_dir_exists(options.genome_nt_dir2)

    def _nucleotide_genomes(genome_dir, genome_ext):
        # load one genome set and stop the program if it is not nucleotide data
        files = self._genome_files(genome_dir, genome_ext)
        if not self._check_nuclotide_seqs(files):
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()
        return files

    first_set = _nucleotide_genomes(options.genome_nt_dir1, options.genome_ext1)
    second_set = _nucleotide_genomes(options.genome_nt_dir2, options.genome_ext2)

    comparer = BinComparer()
    comparer.run(first_set, second_set, options.scaffold_file, options.output_file)

    self.logger.info('Detailed bin comparison written to: ' + options.output_file)
def rblast(self, options):
    """Reciprocal blast command.

    Copies every protein file in ``options.protein_dir`` (matched by
    ``options.protein_ext``) into ``<output_dir>/genes`` with the genome
    identifier appended to each gene identifier, then runs either
    ``ReciprocalBlast`` (when ``options.blastp`` is set) or
    ``ReciprocalDiamond`` on the rewritten files. Logs a warning and exits
    when no protein file is found.
    """
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
    self.logger.info('*******************************************************************************')

    check_dir_exists(options.protein_dir)
    make_sure_path_exists(options.output_dir)

    aa_gene_files = [os.path.join(options.protein_dir, f)
                     for f in os.listdir(options.protein_dir)
                     if f.endswith(options.protein_ext)]

    if not aa_gene_files:
        self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
        sys.exit()

    # modify gene ids to include genome ids in order to ensure
    # all gene identifiers are unique across the set of genomes,
    # also removes the trailing asterisk used to identify the stop
    # codon
    self.logger.info('')
    self.logger.info(' Appending genome identifiers to all gene identifiers.')
    gene_out_dir = os.path.join(options.output_dir, 'genes')
    make_sure_path_exists(gene_out_dir)

    modified_aa_gene_files = []
    for gf in aa_gene_files:
        genome_id = remove_extension(gf)

        aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
        # context manager closes the file even if reading/writing fails
        with open(aa_file, 'w') as fout:
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                # endswith() is safe for an empty sequence, unlike seq[-1]
                if seq.endswith('*'):
                    seq = seq[0:-1]
                fout.write(seq + '\n')

        modified_aa_gene_files.append(aa_file)

    # perform the reciprocal blast with blastp or diamond
    self.logger.info('')
    if options.blastp:
        rblast = ReciprocalBlast(options.cpus)
        rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

        # concatenate all blast tables to mimic output of diamond, all hits
        # for a given genome MUST be in consecutive order to fully mimic
        # the expected results from diamond
        self.logger.info('')
        self.logger.info(' Creating single file with all blast hits (be patient!).')
        blast_files = sorted(f for f in os.listdir(options.output_dir)
                             if f.endswith('.blastp.tsv'))
        hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
        concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
    else:
        rdiamond = ReciprocalDiamond(options.cpus)
        rdiamond.run(modified_aa_gene_files,
                     options.evalue,
                     options.per_identity,
                     options.output_dir)

    self.logger.info('')
    self.logger.info(' Reciprocal blast hits written to: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
def annoted_features(self, options):
    """Build annotation-level abundance matrices from feature matrices.

    Reads a feature-to-annotation mapping (``options.features_annotation``)
    and the list of known annotations (``options.annotation_description``),
    both tab-separated with two columns. For every file named in
    ``DefaultValues.FEATURES_ABUNDANCE_FILES`` under ``options.features_dir``,
    the per-sample values (columns from index 3 onwards) of all features
    sharing an annotation are summed, and the result is written to the
    corresponding file in ``DefaultValues.ANNOTATE_ABUNDANCE_FILES``.
    Annotations whose total is zero are omitted when ``options.removed``
    is set. Annotations absent from the description file are reported once
    and skipped.
    """
    missing = set()

    features2annotation = {}
    with open(options.features_annotation) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            features_id, annotation = line.split('\t')
            features2annotation[features_id] = annotation

    counts = {}
    annotation_id_list = []
    with open(options.annotation_description) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            # the description column is validated by the unpack but unused
            annotation_id, _description = line.split('\t')
            annotation_id_list.append(annotation_id)
            counts[annotation_id] = {}
    annotation_id_list.append('hypothetical protein')
    counts['hypothetical protein'] = {}

    check_dir_exists(options.features_dir)
    input_matrices = DefaultValues.FEATURES_ABUNDANCE_FILES
    output_matrices = DefaultValues.ANNOTATE_ABUNDANCE_FILES
    for index, matrix_name in enumerate(input_matrices):
        # parse the types from the file name only, so underscores in
        # options.features_dir cannot shift the fields
        count_type, abundance_type = matrix_name.split('_')[1:3]
        input_matrix = os.path.join(options.features_dir, matrix_name)
        check_file_exists(input_matrix)

        header = []
        with open(input_matrix) as f:
            for line in f:
                line_list = line.rstrip().split('\t')
                if len(header) == 0:
                    # first line: reset the per-sample totals for this matrix
                    header = line_list
                    for sample in header[3:]:
                        for annotation_id in annotation_id_list:
                            counts[annotation_id][sample] = 0
                    continue

                annotation_id = features2annotation[line_list[0]]
                if annotation_id not in counts:
                    if annotation_id not in missing:
                        self.logger.warning(
                            "'%s' not present in %s" %
                            (annotation_id, options.annotation_description))
                        missing.add(annotation_id)
                    continue

                for i in range(3, len(header)):
                    counts[annotation_id][header[i]] += float(line_list[i])

        samples = header[3:]
        output_matrix = os.path.join(options.features_dir, output_matrices[index])
        self.logger.info('Print %s %s abundance matrix in "%s"' %
                         (count_type, abundance_type, output_matrix))

        # context manager so each output file is flushed and closed
        with open(output_matrix, "w") as output_handle:
            output_handle.write('\t'.join(['Features'] + samples) + '\n')
            for annotation in annotation_id_list:
                # iterate in header order so values line up with the columns
                values = [counts[annotation][s] for s in samples]
                if sum(values) == 0 and options.removed:
                    continue
                output_handle.write(
                    '\t'.join([annotation] + [str(v) for v in values]) + '\n')

    self.logger.info('Printing matrices done')