# Module-level imports assumed by the driver methods below, as referenced in
# the code (anvi'o internals):
import os
import glob

import anvio.utils as utils
import anvio.terminal as terminal
import anvio.ccollections as ccollections

from anvio.errors import ConfigError

run = terminal.Run()
progress = terminal.Progress()


# MetaBAT2 driver: with `-l`, MetaBAT writes each bin as a plain text file of
# contig names, which is what the parsing loop below expects.
def cluster(self, input_files, args, work_dir, threads=1):
    J = lambda p: os.path.join(work_dir, p)

    bin_prefix = J('METABAT_')
    log_path = J('logs.txt')

    cmd_line = [self.program_name,
                '-i', input_files.contigs_fasta,
                '-a', input_files.contig_coverages,
                '-o', bin_prefix,
                '--cvExt',
                '-l',
                *utils.serialize_args(args)]

    self.progress.new(self.program_name)
    self.progress.update('Running using %d threads...' % threads)
    utils.run_command(cmd_line, log_path)
    self.progress.end()

    # `bin_prefix` already includes `work_dir`, so glob on it directly
    output_file_paths = glob.glob(bin_prefix + '*')
    if not len(output_file_paths):
        raise ConfigError("Some critical output files are missing. Please take a look at the "
                          "log file: %s" % (log_path))

    clusters = {}
    for bin_file in output_file_paths:
        with open(bin_file, 'r') as f:
            # e.g. 'METABAT_.1' -> 'METABAT__1'
            pretty_bin_name = os.path.basename(bin_file).replace('.', '_')
            clusters[pretty_bin_name] = list(map(str.strip, f.readlines()))

    return clusters
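# `utils.serialize_args` is used by every driver here to turn an argparse-style
# namespace into command line tokens. Below is a minimal sketch of what it is
# assumed to do, inferred only from the call sites in this file (`single_dash`,
# `use_underscore`, `translate`, `skip_keys`); the real anvi'o implementation
# may differ:
def serialize_args_sketch(args, single_dash=False, use_underscore=False,
                          translate=None, skip_keys=None):
    cli_args = []
    dash = '-' if single_dash else '--'

    for key, value in vars(args).items():
        if skip_keys and key in skip_keys:
            continue
        if value is None or value is False:
            continue  # unset options and disabled flags are omitted

        if translate and key in translate:
            key = translate[key]
        elif not use_underscore:
            key = key.replace('_', '-')

        cli_args.append(dash + key)
        if value is not True:  # boolean flags take no value
            cli_args.append(str(value))

    return cli_args

# e.g. serialize_args_sketch(argparse.Namespace(preference=-10, damp=0.9),
#                            single_dash=True,
#                            translate={'preference': 'p', 'damp': 'd'})
# -> ['-p', '-10', '-d', '0.9']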
# Affinity-propagation binner driver (BinSanity-style flags): long argument
# names are translated to the tool's short flags via `translation`.
def cluster(self, input_files, args, work_dir, threads=1, log_file_path=None):
    J = lambda p: os.path.join(work_dir, p)

    if not log_file_path:
        log_file_path = J('logs.txt')

    translation = {'preference': 'p',
                   'maxiter': 'm',
                   'conviter': 'v',
                   'damp': 'd',
                   'contigsize': 'x'}

    cmd_line = [self.program_name,
                '-c', input_files.contig_coverages_log_norm,
                '-f', os.path.dirname(input_files.contigs_fasta),
                '-l', os.path.basename(input_files.contigs_fasta),
                '-o', work_dir,
                *utils.serialize_args(args, single_dash=True, translate=translation)]

    self.progress.new(self.program_name)
    self.progress.update('Running using %d threads...' % threads)
    utils.run_command(cmd_line, log_file_path)
    self.progress.end()

    output_file_paths = glob.glob(J('*.fna'))
    if not len(output_file_paths):
        raise ConfigError("Some critical output files are missing. Please take a look at the "
                          "log file: %s" % (log_file_path))

    clusters = {}
    for bin_file in output_file_paths:
        with open(bin_file, 'r') as f:
            # e.g. 'sequence_bin-1.fna' -> 'bin_1'
            pretty_bin_name = os.path.basename(bin_file)
            pretty_bin_name = pretty_bin_name.replace('sequence_', '')
            pretty_bin_name = pretty_bin_name.replace('.fna', '')
            pretty_bin_name = pretty_bin_name.replace('-', '_')

            # keep only the deflines; bare contig names are all we need
            clusters[pretty_bin_name] = [line.strip().replace('>', '')
                                         for line in f if line.startswith('>')]

    return clusters
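# A self-contained illustration of the defline parsing above: only '>' lines of
# a bin FASTA are kept, with the '>' prefix dropped, so each cluster holds bare
# contig names. (The file content here is made up for the demonstration.)
import io

fake_fna = io.StringIO(">contig_001\nATGCATGC\n>contig_002\nGGCATTAC\n")
contig_names = [line.strip().replace('>', '') for line in fake_fna if line.startswith('>')]
assert contig_names == ['contig_001', 'contig_002']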
# MaxBin2 driver.
def cluster(self, input_files, args, work_dir, threads=1, log_file_path=None):
    J = lambda p: os.path.join(work_dir, p)

    output_file_prefix = J('MAXBIN_')
    if not log_file_path:
        log_file_path = J('logs.txt')

    cmd_line = [self.program_name,
                '-contig', input_files.contigs_fasta,
                '-abund', input_files.contig_coverages,
                '-out', output_file_prefix,
                '-thread', str(threads),
                *utils.serialize_args(args, single_dash=True, use_underscore=True)]

    self.progress.new(self.program_name)
    self.progress.update('Running using %d threads...' % threads)
    utils.run_command(cmd_line, log_file_path)
    self.progress.end()

    # `output_file_prefix` already includes `work_dir`, so glob on it directly
    output_file_paths = glob.glob(output_file_prefix + '*.fasta')
    if not len(output_file_paths):
        raise ConfigError("Some critical output files are missing. Please take a look at the "
                          "log file: %s" % (log_file_path))

    clusters = {}
    for bin_file in output_file_paths:
        with open(bin_file, 'r') as f:
            bin_name = os.path.basename(bin_file).replace('.fasta', '')
            bin_name = bin_name.replace('.', '_')

            clusters[bin_name] = []
            for line in f:
                if line.startswith('>'):
                    clusters[bin_name].append(line[1:].strip())

    return clusters
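# MaxBin2 numbers its output bins as '<prefix>.001.fasta', '<prefix>.002.fasta',
# and so on; the renaming above turns those into identifier-safe names. A
# hypothetical round trip (the path is made up):
import os

bin_file = '/tmp/maxbin_work/MAXBIN_.001.fasta'
bin_name = os.path.basename(bin_file).replace('.fasta', '').replace('.', '_')
assert bin_name == 'MAXBIN__001'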
# CONCOCT driver: cluster assignments are read back from the
# 'clustering_gt<length threshold>.csv' file CONCOCT writes.
def cluster(self, input_files, args, work_dir, threads=1, log_file_path=None):
    J = lambda p: os.path.join(work_dir, p)

    if not log_file_path:
        log_file_path = J('logs.txt')

    cmd_line = [self.program_name,
                '--coverage_file', input_files.contig_coverages,
                '--composition_file', input_files.contigs_fasta,
                '--basename', work_dir,
                '--threads', str(threads),
                *utils.serialize_args(args, use_underscore=True)]

    self.progress.new(self.program_name)
    self.progress.update('Running using %d threads...' % threads)
    utils.run_command(cmd_line, log_file_path)
    self.progress.end()

    clusters = {}

    threshold = args.length_threshold or '1000'
    output_file_name = 'clustering_gt%s.csv' % threshold
    output_file_path = J(output_file_name)
    if not os.path.exists(output_file_path):
        raise ConfigError("One of the critical output files is missing ('%s'). Please take a look at the "
                          "log file: %s" % (output_file_name, log_file_path))

    with open(output_file_path, 'r') as f:
        # skip the header row, then read 'contig,cluster_id' pairs
        for entry in f.readlines()[1:]:
            contig, bin_name = map(str.strip, entry.split(','))

            pretty_bin_name = 'Bin_' + bin_name
            if pretty_bin_name not in clusters:
                clusters[pretty_bin_name] = []

            clusters[pretty_bin_name].append(contig)

    return clusters
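# A mock of the CSV parsing above, assuming CONCOCT's two-column output of
# contig name and numeric cluster id with one header row (skipped by the slice):
import io

mock_csv = io.StringIO("contig_id,cluster_id\ncontig_1,0\ncontig_2,0\ncontig_3,1\n")
clusters = {}
for entry in mock_csv.readlines()[1:]:
    contig, bin_name = map(str.strip, entry.split(','))
    clusters.setdefault('Bin_' + bin_name, []).append(contig)

assert clusters == {'Bin_0': ['contig_1', 'contig_2'], 'Bin_1': ['contig_3']}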
# DAS Tool driver: refines bins by combining collections produced by other
# binners, so it first exports each source collection to a text file.
def cluster(self, input_files, args, work_dir, threads=1):
    J = lambda p: os.path.join(work_dir, p)

    cwd_backup = os.getcwd()
    os.chdir(work_dir)
    log_path = J('logs.txt')

    c = ccollections.Collections(r=run, p=progress)
    c.populate_collections_dict(input_files.profile_db)

    source_collections = set(map(str.strip, args.source_collections.split(',')))

    missing_collections = source_collections - set(c.collections_dict.keys())
    if len(missing_collections):
        raise ConfigError("Some of the collections you wanted are missing in the database. "
                          "Here is the list of missing collections: %s" % (", ".join(missing_collections)))

    c_names = []
    c_files = []
    for collection_name in source_collections:
        prefix = J(collection_name)

        c_names.append(collection_name)
        c_files.append(prefix + '.txt')

        c.export_collection(collection_name, output_file_prefix=prefix, include_unbinned=False)

    cmd_line = [self.program_name,
                '-c', input_files.splits_fasta,
                '-i', ','.join(c_files),
                '-l', ','.join(c_names),
                '-o', J('OUTPUT'),
                '--threads', str(threads),
                *utils.serialize_args(args, use_underscore=True, skip_keys=['source_collections'])]

    self.progress.new(self.program_name)
    self.progress.update('Running using %d threads...' % threads)
    utils.run_command(cmd_line, log_path)
    self.progress.end()

    output_file_name = 'OUTPUT_DASTool_scaffolds2bin.txt'
    output_file_path = J(output_file_name)
    if not os.path.exists(output_file_path):
        raise ConfigError("One of the critical output files is missing ('%s'). Please take a look at the "
                          "log file: %s" % (output_file_name, log_path))

    clusters = {}
    with open(output_file_path, 'r') as f:
        # tab-delimited, no header: contig name, then source bin name
        for entry in f.readlines():
            contig, bin_name = map(str.strip, entry.split())

            pretty_bin_name = 'Bin_' + bin_name.replace('.', '_')
            if pretty_bin_name not in clusters:
                clusters[pretty_bin_name] = []

            clusters[pretty_bin_name].append(contig)

    # restore the working directory before returning
    os.chdir(cwd_backup)
    return clusters
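# A mock of the scaffolds2bin parsing above: DAS_Tool writes a headerless,
# tab-delimited file of contig name and source bin name; dots in bin names
# become underscores so downstream tools get identifier-safe names. (The file
# content here is made up for the demonstration.)
import io

mock_s2b = io.StringIO("contig_1\tmaxbin.001\ncontig_2\tmetabat_2\n")
clusters = {}
for entry in mock_s2b:
    contig, bin_name = map(str.strip, entry.split())
    clusters.setdefault('Bin_' + bin_name.replace('.', '_'), []).append(contig)

assert clusters == {'Bin_maxbin_001': ['contig_1'], 'Bin_metabat_2': ['contig_2']}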