def generate_cfg(self, cfg, output_files):
    self.cfg = merge_configs(cfg["assembly"], cfg["common"])
    # map result attributes onto the corresponding output_files entries
    for attr, key in [
            ("result_contigs", "result_contigs_filename"),
            ("result_scaffolds", "result_scaffolds_filename"),
            ("result_graph", "result_assembly_graph_filename"),
            ("result_graph_gfa", "result_assembly_graph_filename_gfa"),
            ("result_contigs_paths", "result_contigs_paths_filename"),
            ("result_scaffolds_paths", "result_scaffolds_paths_filename"),
            ("result_transcripts", "result_transcripts_filename"),
            ("result_transcripts_paths", "result_transcripts_paths_filename"),
            ("result_gene_clusters", "result_gene_clusters_filename"),
            ("result_bgc_statistics", "result_bgc_stats_filename"),
            ("result_domain_graph", "result_domain_graph_filename")]:
        self.cfg.__dict__[attr] = output_files[key]

    self.cfg.__dict__["rr_enable"] = not self.cfg.disable_rr

    dataset_filename = os.path.join(self.cfg.output_dir, "dataset.info")
    self.cfg.__dict__["dataset"] = dataset_filename
    self.cfg.tmp_dir = support.get_tmp_dir(prefix="spades_")
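
# Illustrative sketch (an assumption, not the SPAdes implementation): the
# merge_configs helper used above is assumed to overlay the stage-specific
# section onto the shared "common" section, with the stage section winning on
# key conflicts, and to return an object with a writable __dict__.
def _merge_configs_sketch(primary, secondary):
    # hypothetical stand-in; `primary` and `secondary` are plain dicts here
    import argparse
    merged = argparse.Namespace()
    merged.__dict__.update(secondary)   # "common" defaults first
    merged.__dict__.update(primary)     # stage-specific values override
    return merged

# usage: _merge_configs_sketch({"k": 55}, {"k": 21, "output_dir": "/tmp/out"})
# yields k == 55 and output_dir == "/tmp/out"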
def run_corrector(configs_dir, execution_home, cfg, ext_python_modules_home, log, to_correct, result):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.exists(dst_configs):
        shutil.rmtree(dst_configs)
    dir_util.copy_tree(os.path.join(configs_dir, "corrector"), dst_configs, preserve_times=False)
    cfg_file_name = os.path.join(dst_configs, "corrector.info")

    cfg.tmp_dir = support.get_tmp_dir(prefix="corrector_")

    prepare_config_corr(cfg_file_name, cfg, ext_python_modules_home)
    binary_name = "corrector"

    command = [os.path.join(execution_home, binary_name),
               os.path.abspath(cfg_file_name), os.path.abspath(to_correct)]

    log.info("\n== Running contig polishing tool: " + ' '.join(command) + "\n")
    log.info("\n== Dataset description file was created: " + cfg_file_name + "\n")

    support.sys_call(command, log)
    if not os.path.isfile(result):
        support.error("Mismatch correction finished abnormally: " + result + " not found!")
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
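
# Hedged sketch of what support.sys_call is assumed to do for the calls above:
# run the external binary, stream its output into the pipeline log, and fail
# loudly on a non-zero exit code.  Hypothetical stand-in, not SPAdes code.
import subprocess

def _sys_call_sketch(command, log):
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, universal_newlines=True)
    for line in proc.stdout:        # forward tool output line by line
        log.info(line.rstrip())
    if proc.wait() != 0:
        raise RuntimeError("command failed: " + " ".join(command))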
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg,
               not_used_dataset_data, ext_python_modules_home, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.exists(dst_configs):
        shutil.rmtree(dst_configs)
    if cfg.iontorrent:
        dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
        cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
    else:
        dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
        cfg_file_name = os.path.join(dst_configs, "config.info")
    # removing template configs
    for root, dirs, files in os.walk(dst_configs):
        for cfg_file in files:
            cfg_file = os.path.join(root, cfg_file)
            if cfg_file.endswith('.template'):
                if os.path.isfile(cfg_file.split('.template')[0]):
                    os.remove(cfg_file)
                else:
                    os.rename(cfg_file, cfg_file.split('.template')[0])

    cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
    if cfg.iontorrent:
        prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
        binary_name = "ionhammer"
    else:
        prepare_config_bh(cfg_file_name, cfg, log)
        binary_name = "hammer"

    command = [os.path.join(execution_home, binary_name),
               os.path.abspath(cfg_file_name)]

    log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
    support.sys_call(command, log)
    if not os.path.isfile(corrected_dataset_yaml_filename):
        support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
    corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))

    remove_not_corrected_reads(cfg.output_dir)
    is_changed = False
    if cfg.gzip_output:
        is_changed = True
        compress_dataset_files(corrected_dataset_data, ext_python_modules_home, cfg.max_threads, log)
    if not_used_dataset_data:
        is_changed = True
        corrected_dataset_data += not_used_dataset_data
    if is_changed:
        pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'))
    log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n")

    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
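
# The ".template" walk above implements a simple precedence rule: a template
# config is only promoted to a real config when no concrete file with the
# target name exists.  Self-contained demo of that rule on a scratch dir
# (illustrative only; the file names are made up):
import os
import tempfile

def _demo_template_rule():
    d = tempfile.mkdtemp()
    open(os.path.join(d, "a.info"), "w").close()            # concrete config
    open(os.path.join(d, "a.info.template"), "w").close()   # gets removed
    open(os.path.join(d, "b.info.template"), "w").close()   # gets promoted
    for name in os.listdir(d):
        path = os.path.join(d, name)
        if path.endswith(".template"):
            target = path.split(".template")[0]
            if os.path.isfile(target):
                os.remove(path)
            else:
                os.rename(path, target)
    return sorted(os.listdir(d))   # -> ['a.info', 'b.info']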
def generate_config(self, cfg):
    dst_configs = os.path.join(self.cfg.output_dir, "configs")
    if os.path.isdir(dst_configs):
        shutil.rmtree(dst_configs)
    dir_util.copy_tree(os.path.join(self.tmp_configs_dir, "corrector"), dst_configs, preserve_times=False)
    cfg_file_name = os.path.join(dst_configs, "corrector.info")

    self.cfg.tmp_dir = support.get_tmp_dir(prefix="corrector_")
    prepare_config_corr(cfg_file_name, self.cfg, self.ext_python_modules_home)
def generate_config(self, cfg):
    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.isdir(dst_configs):
        shutil.rmtree(dst_configs)
    if cfg.iontorrent:
        dir_util.copy_tree(os.path.join(self.tmp_configs_dir, "ionhammer"), dst_configs, preserve_times=False)
        cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
    else:
        dir_util.copy_tree(os.path.join(self.tmp_configs_dir, "hammer"), dst_configs, preserve_times=False)
        cfg_file_name = os.path.join(dst_configs, "config.info")

    cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
    if cfg.iontorrent:
        self.prepare_config_ih(cfg_file_name, cfg, self.ext_python_modules_home)
    else:
        self.prepare_config_bh(cfg_file_name, cfg, self.log)
def main(ds_args_list, general_args_list, spades_home, bin_home):
    log = logging.getLogger('dipspades')
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)
    ds_args = parse_arguments(ds_args_list, log)

    if not os.path.exists(ds_args.output_dir):
        os.makedirs(ds_args.output_dir)
    log_filename = os.path.join(ds_args.output_dir, "dipspades.log")
    if os.path.exists(log_filename):
        os.remove(log_filename)
    log_handler = logging.FileHandler(log_filename, mode='a')
    log.addHandler(log_handler)

    params_filename = os.path.join(ds_args.output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='a')
    log.addHandler(params_handler)

    log.info("\n")
    log.info("General command line: " + " ".join(general_args_list) + "\n")
    log.info("dipSPAdes command line: " + " ".join(ds_args_list) + "\n")
    print_ds_args(ds_args, log)
    log.removeHandler(params_handler)

    log.info("\n======= dipSPAdes started. Log can be found here: " + log_filename + "\n")
    write_haplocontigs_in_file(ds_args.haplocontigs, ds_args.haplocontigs_fnames)

    config_fname = prepare_configs(os.path.join(spades_home, "configs", "dipspades"), ds_args, log)
    ds_args.tmp_dir = support.get_tmp_dir(prefix="dipspades_", base_dir=ds_args.tmp_dir)
    prepare_config(config_fname, ds_args, log)

    try:
        log.info("===== Assembling started.\n")
        binary_path = os.path.join(bin_home, "spades-dipspades-core")
        command = [binary_path, config_fname]
        support.sys_call(command, log)
        log.info("\n===== Assembling finished.\n")
        print_ds_output(ds_args.output_dir, log)
        if os.path.isdir(ds_args.tmp_dir):
            shutil.rmtree(ds_args.tmp_dir)

        log.info("\n======= dipSPAdes finished.\n")
        log.info("dipSPAdes log can be found here: " + log_filename + "\n")
        log.info("Thank you for using dipSPAdes!")
        log.removeHandler(log_handler)
    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message(), dipspades=True)
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log, dipspades=True)
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg,
               dataset_data, ext_python_modules_home, only_compressing_is_needed, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    # not all reads need processing
    if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
        not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
        to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
        to_correct_dataset_yaml_filename = os.path.join(cfg.output_dir, "to_correct.yaml")
        pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'),
                    default_flow_style=False, default_style='"', width=100500)
        cfg.dataset_yaml_filename = to_correct_dataset_yaml_filename
    else:
        not_used_dataset_data = None

    if not only_compressing_is_needed:
        dst_configs = os.path.join(cfg.output_dir, "configs")
        if os.path.exists(dst_configs):
            shutil.rmtree(dst_configs)
        if cfg.iontorrent:
            dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
            cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
        else:
            dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
            cfg_file_name = os.path.join(dst_configs, "config.info")

        cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
        if cfg.iontorrent:
            prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
            binary_name = "ionhammer"
        else:
            prepare_config_bh(cfg_file_name, cfg, log)
            binary_name = "hammer"

        command = [os.path.join(execution_home, binary_name),
                   os.path.abspath(cfg_file_name)]

        log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
        support.sys_call(command, log)
        if not os.path.isfile(corrected_dataset_yaml_filename):
            support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
    else:
        log.info("\n===== Skipping %s (already processed). \n" % "read error correction tool")
        support.continue_from_here(log)

    corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
    remove_not_corrected_reads(cfg.output_dir)
    is_changed = False
    if cfg.gzip_output:
        is_changed = True
        compress_dataset_files(corrected_dataset_data, ext_python_modules_home, cfg.max_threads, log)
    if not_used_dataset_data:
        is_changed = True
        corrected_dataset_data += not_used_dataset_data
    if is_changed:
        pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'),
                    default_flow_style=False, default_style='"', width=100500)
    log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n")

    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_modules_home, log):
    if not isinstance(cfg.iterative_K, list):
        cfg.iterative_K = [cfg.iterative_K]
    cfg.iterative_K = sorted(cfg.iterative_K)

    # checking and removing conflicting K-mer directories
    if options_storage.restart_from:
        processed_K = []
        for k in range(options_storage.MIN_K, options_storage.MAX_K, 2):
            cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k)
            if os.path.isdir(cur_K_dir) and os.path.isfile(os.path.join(cur_K_dir, "final_contigs.fasta")):
                processed_K.append(k)
        if processed_K:
            RL = get_read_length(cfg.output_dir, processed_K[0], ext_python_modules_home, log)
            needed_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log, silent=True)
            needed_K = [k for k in needed_K if k < RL]
            original_K = reveal_original_k_mers(RL)

            k_to_delete = []
            for id, k in enumerate(needed_K):
                if len(processed_K) == id:
                    if processed_K[-1] == original_K[-1]:  # the last K in the original run was processed in "last_one" mode
                        k_to_delete = [original_K[-1]]
                    break
                if processed_K[id] != k:
                    k_to_delete = processed_K[id:]
                    break
            if not k_to_delete and (len(processed_K) > len(needed_K)):
                k_to_delete = processed_K[len(needed_K) - 1:]
            if k_to_delete:
                log.info("Restart mode: removing previously processed directories for K=%s "
                         "to avoid conflicts with K specified with --restart-from" % (str(k_to_delete)))
                for k in k_to_delete:
                    shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k))

    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
    if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode:
        shutil.rmtree(bin_reads_dir)
    cfg.tmp_dir = support.get_tmp_dir(prefix="spades_")

    if len(cfg.iterative_K) == 1:
        run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
        K = cfg.iterative_K[0]
    else:
        run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, False)
        prev_K = cfg.iterative_K[0]
        RL = get_read_length(cfg.output_dir, cfg.iterative_K[0], ext_python_modules_home, log)
        cfg.iterative_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log)
        if cfg.iterative_K[1] + 1 > RL:
            if cfg.rr_enable:
                support.warning("Second value of iterative K (%d) exceeded estimated read length (%d). "
                                "Rerunning for the first value of K (%d) with Repeat Resolving" %
                                (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
                run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
                K = cfg.iterative_K[0]
        else:
            rest_of_iterative_K = cfg.iterative_K
            rest_of_iterative_K.pop(0)
            count = 0
            for K in rest_of_iterative_K:
                count += 1
                last_one = count == len(cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL)
                run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one)
                prev_K = K
                if last_one:
                    break
            if count < len(cfg.iterative_K):
                support.warning("Iterations stopped. Value of K (%d) exceeded estimated read length (%d)" %
                                (cfg.iterative_K[count], RL), log)

    latest = os.path.join(cfg.output_dir, "K%d" % K)

    for format in [".fasta", ".fastg"]:
        if os.path.isfile(os.path.join(latest, "before_rr" + format)):
            result_before_rr_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "before_rr" + format)
            if not os.path.isfile(result_before_rr_contigs) or not options_storage.continue_mode:
                shutil.copyfile(os.path.join(latest, "before_rr" + format), result_before_rr_contigs)
        if os.path.isfile(os.path.join(latest, "final_contigs" + format)):
            if not os.path.isfile(cfg.result_contigs[:-6] + format) or not options_storage.continue_mode:
                shutil.copyfile(os.path.join(latest, "final_contigs" + format), cfg.result_contigs[:-6] + format)
        if cfg.rr_enable:
            if os.path.isfile(os.path.join(latest, "scaffolds" + format)):
                if not os.path.isfile(cfg.result_scaffolds[:-6] + format) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "scaffolds" + format), cfg.result_scaffolds[:-6] + format)

    if cfg.developer_mode:
        # saves
        saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
        if os.path.lexists(saves_link):  # os.path.exists returns False for broken links; lexists returns True
            os.remove(saves_link)
        os.symlink(os.path.join(latest, "saves"), saves_link)

    if os.path.isdir(bin_reads_dir):
        shutil.rmtree(bin_reads_dir)
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)

    return latest
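
# Worked example of the restart bookkeeping in run_spades above (pure Python,
# hypothetical K values): with K=21,33,55 already on disk but K=21,33,77 now
# requested, the stale K55 directory is the one scheduled for removal.
def _conflicting_k_sketch(processed_K, needed_K, original_K):
    k_to_delete = []
    for idx, k in enumerate(needed_K):
        if len(processed_K) == idx:
            if processed_K[-1] == original_K[-1]:   # last K ran in "last_one" mode
                k_to_delete = [original_K[-1]]
            break
        if processed_K[idx] != k:
            k_to_delete = processed_K[idx:]
            break
    if not k_to_delete and len(processed_K) > len(needed_K):
        k_to_delete = processed_K[len(needed_K) - 1:]
    return k_to_delete

assert _conflicting_k_sketch([21, 33, 55], [21, 33, 77], [21, 33, 55]) == [55]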
def main(args):
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    options = args
    cfg, dataset_data = fill_cfg(options, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
        cfg, dataset_data = fill_cfg(options, log)
        if options_storage.restart_from:
            check_cfg_for_restart_from(cfg)
        options_storage.continue_mode = True

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            updated_params = ""
            flag = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    flag = True
                    continue
                if flag:
                    flag = False
                    continue
                updated_params += " " + v
            updated_params = updated_params.strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        command = "Command line:"
        for v in args:
            command += " " + v
        log.info(command)

    # special case
    if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
        support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
        del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    support.check_single_reads_in_options(options, log)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
        if support.dataset_has_interlaced_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent their changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                    and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
                    pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'))
                    bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)
                hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg,
                                        not_used_dataset_data, ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg
                                                      and os.path.isfile(assembled_contigs_filename))) \
                    and not options_storage.restart_from == 'as' \
                    and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
            else:
                old_result_files = [result_contigs_filename, result_scaffolds_filename,
                                    assembled_contigs_filename, assembled_scaffolds_filename]
                for format in [".fasta", ".fastg"]:
                    for old_result_file in old_result_files:
                        if os.path.isfile(old_result_file[:-6] + format):
                            os.remove(old_result_file[:-6] + format)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename))
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                    if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data,
                                                     ext_python_modules_home, log)

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

            if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith('k'):
                k_str = options_storage.restart_from[1:]
                if k_str.find(":") != -1:
                    k_str = k_str[:k_str.find(":")]
                support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
            log.info("\n===== %s finished. \n" % STAGE_NAME)

        # corrector
        if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename)
                                            or (options_storage.continue_mode
                                                and os.path.isfile(assembled_contigs_filename))):
            STAGE_NAME = "Mismatch correction"
            to_correct = dict()
            to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
            if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode
                                                             and os.path.isfile(assembled_scaffolds_filename)):
                to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

            # moving assembled contigs (scaffolds) to misc dir
            for assembly_type, (old, new) in to_correct.items():
                if options_storage.continue_mode and os.path.isfile(new):
                    continue
                for format in [".fasta", ".fastg"]:
                    if os.path.isfile(old[:-6] + format):
                        shutil.move(old[:-6] + format, new[:-6] + format)

            if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                    (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                    and not options_storage.restart_from == 'mc':
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                if options_storage.restart_from == 'mc':
                    support.continue_from_here(log)

                log.info("\n===== %s started." % STAGE_NAME)
                # detecting paired-end library with the largest insert size
                est_params_data = pyyaml.load(open(os.path.join(latest_dir, "final.lib_data"), 'r'))
                max_IS_library = None
                for reads_library in est_params_data:
                    if reads_library['type'] == 'paired-end':
                        if not max_IS_library or float(reads_library["insert size mean"]) > float(max_IS_library["insert size mean"]):
                            max_IS_library = reads_library
                if not max_IS_library:
                    support.error('Mismatch correction cannot be performed without at least one paired-end library!', log)
                if not max_IS_library["insert size mean"]:
                    support.warning('Failed to estimate insert size for all paired-end libraries. Starting Mismatch correction'
                                    ' based on the first paired-end library and with default insert size.', log)
                else:
                    cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_IS_library["insert size mean"])
                yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename)
                cfg["mismatch_corrector"].__dict__["1"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                                                                   max_IS_library['left reads']))
                cfg["mismatch_corrector"].__dict__["2"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                                                                   max_IS_library['right reads']))
                # TODO: add reads orientation

                import corrector
                corrector_cfg = cfg["mismatch_corrector"]
                args = []
                for key, values in corrector_cfg.__dict__.items():
                    if key == "output-dir":
                        continue
                    # for processing list of reads
                    if not isinstance(values, list):
                        values = [values]
                    for value in values:
                        if len(key) == 1:
                            args.append('-' + key)
                        else:
                            args.append('--' + key)
                        if value is not None:
                            args.append(value)

                # processing contigs and scaffolds (or only contigs)
                for assembly_type, (corrected, assembled) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(corrected):
                        log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                        continue

                    support.continue_from_here(log)
                    log.info("\n== Processing of " + assembly_type + "\n")

                    cur_args = args[:]
                    cur_args += ['-c', assembled]
                    tmp_dir_for_corrector = support.get_tmp_dir(prefix="mis_cor_%s_" % assembly_type)
                    cur_args += ['--output-dir', tmp_dir_for_corrector]

                    # correcting
                    corrector.main(cur_args, ext_python_modules_home, log)

                    result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
                    # moving corrected contigs (scaffolds) to SPAdes output dir
                    if os.path.isfile(result_corrected_filename):
                        shutil.move(result_corrected_filename, corrected)

                    if os.path.isdir(tmp_dir_for_corrector):
                        shutil.rmtree(tmp_dir_for_corrector)

                    assembled_fastg = assembled[:-6] + ".fastg"
                    if os.path.isfile(assembled_fastg):
                        support.create_fastg_from_fasta(corrected, assembled_fastg, log)
                log.info("\n===== %s finished.\n" % STAGE_NAME)

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        # log.info("")
        if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
        if "assembly" in cfg and os.path.isfile(result_contigs_filename):
            message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
            if os.path.isfile(result_contigs_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_contigs_filename[:-6] + ".fastg") + ")"
            log.info(message)
        if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
            message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
            if os.path.isfile(result_scaffolds_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_scaffolds_filename[:-6] + ".fastg") + ")"
            log.info(message)
        # log.info("")

        # breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename,
                                                                     options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds, broken_scaffolds)
                    # log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                    #          " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                if os.path.isfile(result_filename):
                    result_fasta = list(support.read_fasta(result_filename))
                    # correctness check: should be one contig of length 1000 bp
                    correct_number = 1
                    correct_length = 1000
                    if not len(result_fasta):
                        support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                    elif len(result_fasta) > correct_number:
                        support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
                                      (result_filename, correct_number, len(result_fasta)))
                    elif len(result_fasta[0][1]) != correct_length:
                        if len(result_fasta[0][1]) > correct_length:
                            relation = "more"
                        else:
                            relation = "less"
                        support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
                                      (result_filename, relation, correct_length, len(result_fasta[0][1])))
                else:
                    support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)
    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg,
               dataset_data, ext_python_modules_home, only_compressing_is_needed, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    # not all reads need processing
    if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
        not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
        to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
        to_correct_dataset_yaml_filename = os.path.join(cfg.output_dir, "to_correct.yaml")
        pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'),
                    default_flow_style=False, default_style='"', width=float("inf"))
        cfg.dataset_yaml_filename = to_correct_dataset_yaml_filename
    else:
        not_used_dataset_data = None

    if not only_compressing_is_needed:
        dst_configs = os.path.join(cfg.output_dir, "configs")
        if os.path.exists(dst_configs):
            shutil.rmtree(dst_configs)
        if cfg.iontorrent:
            dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
            cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
        else:
            dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
            cfg_file_name = os.path.join(dst_configs, "config.info")

        cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
        if cfg.iontorrent:
            prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
            binary_name = "spades-ionhammer"
        else:
            prepare_config_bh(cfg_file_name, cfg, log)
            binary_name = "spades-hammer"

        command = [os.path.join(execution_home, binary_name),
                   os.path.abspath(cfg_file_name)]

        log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
        support.sys_call(command, log)
        if not os.path.isfile(corrected_dataset_yaml_filename):
            support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
    else:
        log.info("\n===== Skipping %s (already processed). \n" % "read error correction tool")
        support.continue_from_here(log)

    corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
    remove_not_corrected_reads(cfg.output_dir)
    is_changed = False
    if cfg.gzip_output:
        is_changed = True
        compress_dataset_files(corrected_dataset_data, ext_python_modules_home, cfg.max_threads, log)
    if not_used_dataset_data:
        is_changed = True
        corrected_dataset_data += not_used_dataset_data
    if is_changed:
        pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'),
                    default_flow_style=False, default_style='"', width=float("inf"))
    log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n")

    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
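
# Hedged sketch of the compression step run_hammer delegates to
# compress_dataset_files when cfg.gzip_output is set: gzip each corrected read
# file and drop the uncompressed original.  Hypothetical stand-in for a single
# file, not the SPAdes implementation (which also updates the dataset YAML
# entries and parallelizes over max_threads).
import gzip
import os
import shutil

def _gzip_one_file_sketch(path):
    gz_path = path + ".gz"
    with open(path, "rb") as src, gzip.open(gz_path, "wb") as dst:
        shutil.copyfileobj(src, dst)   # stream, so large read files fit in memory
    os.remove(path)
    return gz_path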
def main(ds_args_list, general_args_list, spades_home, bin_home):
    log = logging.getLogger('dipspades')
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)
    ds_args = parse_arguments(ds_args_list, log)

    if not os.path.exists(ds_args.output_dir):
        os.makedirs(ds_args.output_dir)
    log_filename = os.path.join(ds_args.output_dir, "dipspades.log")
    if os.path.exists(log_filename):
        os.remove(log_filename)
    log_handler = logging.FileHandler(log_filename, mode='a')
    log.addHandler(log_handler)

    params_filename = os.path.join(ds_args.output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='a')
    log.addHandler(params_handler)

    log.info("\n")
    log.info("General command line: " + " ".join(general_args_list) + "\n")
    log.info("dipSPAdes command line: " + " ".join(ds_args_list) + "\n")
    print_ds_args(ds_args, log)
    log.removeHandler(params_handler)

    log.info("\n======= dipSPAdes started. Log can be found here: " + log_filename + "\n")
    write_haplocontigs_in_file(ds_args.haplocontigs, ds_args.haplocontigs_fnames)

    config_fname = prepare_configs(os.path.join(spades_home, "configs", "dipspades"), ds_args, log)
    ds_args.tmp_dir = support.get_tmp_dir(prefix="dipspades_", base_dir=ds_args.tmp_dir)
    prepare_config(config_fname, ds_args, log)

    try:
        log.info("===== Assembling started.\n")
        binary_path = os.path.join(bin_home, "dipspades")
        command = [binary_path, config_fname]
        support.sys_call(command, log)
        log.info("\n===== Assembling finished.\n")
        print_ds_output(ds_args.output_dir, log)
        if os.path.isdir(ds_args.tmp_dir):
            shutil.rmtree(ds_args.tmp_dir)

        log.info("\n======= dipSPAdes finished.\n")
        log.info("dipSPAdes log can be found here: " + log_filename + "\n")
        log.info("Thank you for using dipSPAdes!")
        log.removeHandler(log_handler)
    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message(), dipspades=True)
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log, dipspades=True)
def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_modules_home, log):
    if not isinstance(cfg.iterative_K, list):
        cfg.iterative_K = [cfg.iterative_K]
    cfg.iterative_K = sorted(cfg.iterative_K)
    used_K = []

    # checking and removing conflicting K-mer directories
    if options_storage.restart_from and (options_storage.restart_k_mers != options_storage.original_k_mers):
        processed_K = []
        for k in range(options_storage.MIN_K, options_storage.MAX_K, 2):
            cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k)
            if os.path.isdir(cur_K_dir) and os.path.isfile(os.path.join(cur_K_dir, "final_contigs.fasta")):
                processed_K.append(k)
        if processed_K:
            RL = get_read_length(cfg.output_dir, processed_K[0], ext_python_modules_home, log)
            needed_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log, silent=True)
            needed_K = [k for k in needed_K if k < RL]
            original_K = reveal_original_k_mers(RL)

            k_to_delete = []
            for id, k in enumerate(needed_K):
                if len(processed_K) == id:
                    if processed_K[-1] == original_K[-1]:  # the last K in the original run was processed in "last_one" mode
                        k_to_delete = [original_K[-1]]
                    break
                if processed_K[id] != k:
                    k_to_delete = processed_K[id:]
                    break
            if not k_to_delete and (len(processed_K) > len(needed_K)):
                k_to_delete = processed_K[len(needed_K) - 1:]
            if k_to_delete:
                log.info("Restart mode: removing previously processed directories for K=%s "
                         "to avoid conflicts with K specified with --restart-from" % (str(k_to_delete)))
                for k in k_to_delete:
                    shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k))

    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
    if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode:
        shutil.rmtree(bin_reads_dir)
    cfg.tmp_dir = support.get_tmp_dir(prefix="spades_")

    finished_on_stop_after = False
    K = cfg.iterative_K[0]
    if len(cfg.iterative_K) == 1:
        run_iteration(configs_dir, execution_home, cfg, log, K, None, True)
        used_K.append(K)
    else:
        run_iteration(configs_dir, execution_home, cfg, log, K, None, False)
        used_K.append(K)
        if options_storage.stop_after == "k%d" % K:
            finished_on_stop_after = True
        else:
            prev_K = K
            RL = get_read_length(cfg.output_dir, K, ext_python_modules_home, log)
            cfg.iterative_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log)
            if len(cfg.iterative_K) < 2 or cfg.iterative_K[1] + 1 > RL:
                if cfg.rr_enable:
                    if len(cfg.iterative_K) < 2:
                        log.info("== Rerunning for the first value of K (%d) with Repeat Resolving" %
                                 cfg.iterative_K[0])
                    else:
                        support.warning("Second value of iterative K (%d) exceeded estimated read length (%d). "
                                        "Rerunning for the first value of K (%d) with Repeat Resolving" %
                                        (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
                    run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
                    used_K.append(cfg.iterative_K[0])
                    K = cfg.iterative_K[0]
            else:
                rest_of_iterative_K = cfg.iterative_K
                rest_of_iterative_K.pop(0)
                count = 0
                for K in rest_of_iterative_K:
                    count += 1
                    last_one = count == len(cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL)
                    run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one)
                    used_K.append(K)
                    prev_K = K
                    if last_one:
                        break
                    if options_storage.stop_after == "k%d" % K:
                        finished_on_stop_after = True
                        break
                if count < len(cfg.iterative_K) and not finished_on_stop_after:
                    support.warning("Iterations stopped. Value of K (%d) exceeded estimated read length (%d)" %
                                    (cfg.iterative_K[count], RL), log)

    if options_storage.stop_after and options_storage.stop_after.startswith('k'):
        support.finish_here(log)

    latest = os.path.join(cfg.output_dir, "K%d" % K)

    if cfg.correct_scaffolds and not options_storage.run_completed:
        if options_storage.continue_mode and os.path.isfile(os.path.join(cfg.output_dir, "SCC", "corrected_scaffolds.fasta")) \
                and not options_storage.restart_from == "scc":
            log.info("\n===== Skipping %s (already processed). \n" % "scaffold correction")
        else:
            if options_storage.continue_mode:
                support.continue_from_here(log)
            run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, 21)
        latest = os.path.join(os.path.join(cfg.output_dir, "SCC"), "K21")
        if options_storage.stop_after == 'scc':
            support.finish_here(log)

    if cfg.correct_scaffolds:
        correct_scaffolds_fpath = os.path.join(latest, "corrected_scaffolds.fasta")
        if os.path.isfile(correct_scaffolds_fpath):
            shutil.copyfile(correct_scaffolds_fpath, cfg.result_scaffolds)
    elif not finished_on_stop_after:  # if interrupted by --stop-after, the final K was not processed!
        if os.path.isfile(os.path.join(latest, "before_rr.fasta")):
            result_before_rr_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "before_rr.fasta")
            if not os.path.isfile(result_before_rr_contigs) or not options_storage.continue_mode:
                shutil.copyfile(os.path.join(latest, "before_rr.fasta"), result_before_rr_contigs)
        if options_storage.rna:
            if os.path.isfile(os.path.join(latest, "transcripts.fasta")):
                if not os.path.isfile(cfg.result_transcripts) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "transcripts.fasta"), cfg.result_transcripts)
            if os.path.isfile(os.path.join(latest, "transcripts.paths")):
                if not os.path.isfile(cfg.result_transcripts_paths) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "transcripts.paths"), cfg.result_transcripts_paths)
            for filtering_type in options_storage.filtering_types:
                prefix = filtering_type + "_filtered_"
                result_filtered_transcripts = os.path.join(cfg.output_dir, prefix + options_storage.transcripts_name)
                latest_filtered_transcripts = os.path.join(latest, prefix + "final_paths.fasta")
                if os.path.isfile(latest_filtered_transcripts):
                    if not os.path.isfile(result_filtered_transcripts) or not options_storage.continue_mode:
                        shutil.copyfile(latest_filtered_transcripts, result_filtered_transcripts)
        else:
            if os.path.isfile(os.path.join(latest, "final_contigs.fasta")):
                if not os.path.isfile(cfg.result_contigs) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "final_contigs.fasta"), cfg.result_contigs)
            if os.path.isfile(os.path.join(latest, "first_pe_contigs.fasta")):
                result_first_pe_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "first_pe_contigs.fasta")
                if not os.path.isfile(result_first_pe_contigs) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "first_pe_contigs.fasta"), result_first_pe_contigs)
            if cfg.rr_enable:
                if os.path.isfile(os.path.join(latest, "scaffolds.fasta")):
                    if not os.path.isfile(cfg.result_scaffolds) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "scaffolds.fasta"), cfg.result_scaffolds)
                if os.path.isfile(os.path.join(latest, "scaffolds.paths")):
                    if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths)
            if os.path.isfile(os.path.join(latest, "assembly_graph_with_scaffolds.gfa")):
                if not os.path.isfile(cfg.result_graph_gfa) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "assembly_graph_with_scaffolds.gfa"), cfg.result_graph_gfa)
            if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")):
                if not os.path.isfile(cfg.result_graph) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "assembly_graph.fastg"), cfg.result_graph)
            if os.path.isfile(os.path.join(latest, "final_contigs.paths")):
                if not os.path.isfile(cfg.result_contigs_paths) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "final_contigs.paths"), cfg.result_contigs_paths)

    if cfg.developer_mode:
        # saves
        saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
        if os.path.lexists(saves_link):  # os.path.exists returns False for broken links; lexists returns True
            os.remove(saves_link)
        os.symlink(os.path.join(latest, "saves"), saves_link)

    if os.path.isdir(bin_reads_dir):
        shutil.rmtree(bin_reads_dir)
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)

    return used_K
def run_mulksg(configs_dir, execution_home, cfg, dataset_data, ext_python_modules_home, log, comm, rank, size):
    # Running this on multiple nodes, need to differentiate the actual work here
    if not isinstance(cfg.iterative_K, list):
        cfg.iterative_K = [cfg.iterative_K]
    cfg.iterative_K = sorted(cfg.iterative_K)

    # pad cfg.iterative_K so the non-final K values divide evenly among the
    # MPI ranks: repeatedly add a number between the biggest gap in the
    # current cfg.iterative_K
    while (len(cfg.iterative_K) - 1) % size != 0:
        bgap_idx = 0
        bgap = 0
        for i in range(0, len(cfg.iterative_K) - 1):
            gap = cfg.iterative_K[i + 1] - cfg.iterative_K[i]
            if gap > bgap:
                bgap = gap
                bgap_idx = i
        if bgap == 0:
            break
        ul = cfg.iterative_K[bgap_idx + 1]
        ll = cfg.iterative_K[bgap_idx]
        new_k = ll + (ul - ll) // 2   # midpoint of the widest gap
        new_k = new_k // 2 * 2 + 1    # force an odd K value
        cfg.iterative_K.append(new_k)
        cfg.iterative_K = sorted(cfg.iterative_K)

    if rank == 0:
        # Setup all the configs, we will put them in a work_list
        # checking and removing conflicting K-mer directories
        if options_storage.restart_from and (options_storage.restart_k_mers != options_storage.original_k_mers):
            processed_K = []
            for k in range(options_storage.MIN_K, options_storage.MAX_K, 2):
                cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k)
                if os.path.isdir(cur_K_dir) and os.path.isfile(os.path.join(cur_K_dir, "final_contigs.fasta")):
                    processed_K.append(k)
            if processed_K:
                RL = get_read_length(cfg.output_dir, processed_K[0], ext_python_modules_home, log)
                needed_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log, silent=True)
                needed_K = [k for k in needed_K if k < RL]
                original_K = reveal_original_k_mers(RL)

                k_to_delete = []
                for id, k in enumerate(needed_K):
                    if len(processed_K) == id:
                        if processed_K[-1] == original_K[-1]:  # the last K in the original run was processed in "last_one" mode
                            k_to_delete = [original_K[-1]]
                        break
                    if processed_K[id] != k:
                        k_to_delete = processed_K[id:]
                        break
                if not k_to_delete and (len(processed_K) > len(needed_K)):
                    k_to_delete = processed_K[len(needed_K) - 1:]
                if k_to_delete:
                    log.info("Restart mode: removing previously processed directories for K=%s "
                             "to avoid conflicts with K specified with --restart-from" % (str(k_to_delete)))
                    for k in k_to_delete:
                        shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k))

        bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
        if os.path.isdir(bin_reads_dir):
            shutil.rmtree(bin_reads_dir)
        cfg.tmp_dir = support.get_tmp_dir(prefix="mulksg_")

        first_kValues = cfg.iterative_K
        last_kValue = []
        last_kValue.append(first_kValues[-1])
        del first_kValues[-1]
        # process all of the files as last_one
        first_commands, contig_files = create_mulksg_configs(configs_dir, execution_home, cfg, log,
                                                             first_kValues, None, True, ext_python_modules_home)
        work_list = first_commands
        if size > 1 and len(cfg.iterative_K) > 1:
            num_running = 0
            for command in work_list:
                if num_running == (size - 1):
                    support.sys_call(command, log)
                    num_running = 0
                else:
                    node_rank = comm.recv()  # source=ANY_SOURCE
                    comm.send(command, node_rank)
                    num_running += 1
            for i in range(1, size):
                node_rank = comm.recv()  # source=ANY_SOURCE
                comm.send("Done", node_rank)
                print("Rank %s Finished" % node_rank)
        else:
            if len(cfg.iterative_K) > 1:
                pool = Pool(len(first_commands))
                results = pool.map(support.sys_call, first_commands)

        # Now we have generated all the contig files, concatenate into one file
        extra_contig_filename = None
        try:
            extra_contig_filename = cfg.output_dir + "/extracontigs.fa"  # "/K" + last_kValue[0]
            with open(extra_contig_filename, 'w') as contig_file:
                for fname in contig_files:
                    try:
                        with open(fname) as infile:
                            contig_file.write(infile.read())
                    except:
                        continue
        except:
            log.info("Could not create extra contig file!!!!")
            extra_contig_filename = None

        last_command, contig_files = create_mulksg_configs(configs_dir, execution_home, cfg, log,
                                                           last_kValue, extra_contig_filename, True,
                                                           ext_python_modules_home)
        last_command = last_command[0]
        support.sys_call(last_command, log)

        K = last_kValue[0]
        latest = os.path.join(cfg.output_dir, "K%d" % K)
        finished_on_stop_after = False

        if cfg.correct_scaffolds and not options_storage.run_completed:
            if options_storage.continue_mode and os.path.isfile(os.path.join(cfg.output_dir, "SCC", "corrected_scaffolds.fasta")) \
                    and not options_storage.restart_from == "scc":
                log.info("\n===== Skipping %s (already processed). \n" % "scaffold correction")
            else:
                if options_storage.continue_mode:
                    support.continue_from_here(log)
                run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, 21)
            latest = os.path.join(os.path.join(cfg.output_dir, "SCC"), "K21")
            if options_storage.stop_after == 'scc':
                support.finish_here(log)

        if cfg.correct_scaffolds:
            correct_scaffolds_fpath = os.path.join(latest, "corrected_scaffolds.fasta")
            if os.path.isfile(correct_scaffolds_fpath):
                shutil.copyfile(correct_scaffolds_fpath, cfg.result_scaffolds)
        elif not finished_on_stop_after:  # if interrupted by --stop-after, the final K was not processed!
            if os.path.isfile(os.path.join(latest, "before_rr.fasta")):
                result_before_rr_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "before_rr.fasta")
                if not os.path.isfile(result_before_rr_contigs) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "before_rr.fasta"), result_before_rr_contigs)
            if options_storage.rna:
                if os.path.isfile(os.path.join(latest, "transcripts.fasta")):
                    if not os.path.isfile(cfg.result_transcripts) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "transcripts.fasta"), cfg.result_transcripts)
                if os.path.isfile(os.path.join(latest, "transcripts.paths")):
                    if not os.path.isfile(cfg.result_transcripts_paths) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "transcripts.paths"), cfg.result_transcripts_paths)
                for filtering_type in options_storage.filtering_types:
                    prefix = filtering_type + "_filtered_"
                    result_filtered_transcripts = os.path.join(cfg.output_dir, prefix + options_storage.transcripts_name)
                    latest_filtered_transcripts = os.path.join(latest, prefix + "final_paths.fasta")
                    if os.path.isfile(latest_filtered_transcripts):
                        if not os.path.isfile(result_filtered_transcripts) or not options_storage.continue_mode:
                            shutil.copyfile(latest_filtered_transcripts, result_filtered_transcripts)
            else:
                if os.path.isfile(os.path.join(latest, "final_contigs.fasta")):
                    if not os.path.isfile(cfg.result_contigs) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "final_contigs.fasta"), cfg.result_contigs)
                if os.path.isfile(os.path.join(latest, "first_pe_contigs.fasta")):
                    result_first_pe_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "first_pe_contigs.fasta")
                    if not os.path.isfile(result_first_pe_contigs) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "first_pe_contigs.fasta"), result_first_pe_contigs)
                if cfg.rr_enable:
                    if os.path.isfile(os.path.join(latest, "scaffolds.fasta")):
                        if not os.path.isfile(cfg.result_scaffolds) or not options_storage.continue_mode:
                            shutil.copyfile(os.path.join(latest, "scaffolds.fasta"), cfg.result_scaffolds)
                    if os.path.isfile(os.path.join(latest, "scaffolds.paths")):
                        if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode:
                            shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths)
                if os.path.isfile(os.path.join(latest, "assembly_graph_with_scaffolds.gfa")):
                    if not os.path.isfile(cfg.result_graph_gfa) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "assembly_graph_with_scaffolds.gfa"), cfg.result_graph_gfa)
                if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")):
                    if not os.path.isfile(cfg.result_graph) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "assembly_graph.fastg"), cfg.result_graph)
                if os.path.isfile(os.path.join(latest, "final_contigs.paths")):
                    if not os.path.isfile(cfg.result_contigs_paths) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "final_contigs.paths"), cfg.result_contigs_paths)

        if os.path.isdir(bin_reads_dir):
            shutil.rmtree(bin_reads_dir)
        if os.path.isdir(cfg.tmp_dir):
            shutil.rmtree(cfg.tmp_dir)
        return first_kValues + last_kValue
    elif rank != 0 and len(cfg.iterative_K) > 1:
        comm.send(rank, 0)
        command = comm.recv(source=0)
        while command != "Done":
            # print rank, command
            support.sys_call(command, log)
            comm.send(rank, 0)
            command = comm.recv(source=0)
        # comm.send(rank, 0)  # tell the master I am done and exiting
    return None
def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_modules_home, log):
    if not isinstance(cfg.iterative_K, list):
        cfg.iterative_K = [cfg.iterative_K]
    cfg.iterative_K = sorted(cfg.iterative_K)

    # checking and removing conflicting K-mer directories
    if options_storage.restart_from:
        processed_K = []
        for k in range(options_storage.MIN_K, options_storage.MAX_K, 2):
            cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k)
            if os.path.isdir(cur_K_dir) and os.path.isfile(os.path.join(cur_K_dir, "final_contigs.fasta")):
                processed_K.append(k)
        if processed_K:
            RL = get_read_length(cfg.output_dir, processed_K[0], ext_python_modules_home, log)
            needed_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log, silent=True)
            needed_K = [k for k in needed_K if k < RL]
            original_K = reveal_original_k_mers(RL)

            k_to_delete = []
            for id, k in enumerate(needed_K):
                if len(processed_K) == id:
                    if processed_K[-1] == original_K[-1]:  # the last K in the original run was processed in "last_one" mode
                        k_to_delete = [original_K[-1]]
                    break
                if processed_K[id] != k:
                    k_to_delete = processed_K[id:]
                    break
            if not k_to_delete and (len(processed_K) > len(needed_K)):
                k_to_delete = processed_K[len(needed_K) - 1:]
            if k_to_delete:
                log.info("Restart mode: removing previously processed directories for K=%s "
                         "to avoid conflicts with K specified with --restart-from" % (str(k_to_delete)))
                for k in k_to_delete:
                    shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k))

    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
    if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode:
        shutil.rmtree(bin_reads_dir)
    cfg.tmp_dir = support.get_tmp_dir(prefix="spades_")

    if len(cfg.iterative_K) == 1:
        run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
        K = cfg.iterative_K[0]
    else:
        run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, False)
        prev_K = cfg.iterative_K[0]
        RL = get_read_length(cfg.output_dir, cfg.iterative_K[0], ext_python_modules_home, log)
        cfg.iterative_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log)
        if cfg.iterative_K[1] + 1 > RL:
            if cfg.rr_enable:
                support.warning("Second value of iterative K (%d) exceeded estimated read length (%d). "
                                "Rerunning for the first value of K (%d) with Repeat Resolving" %
                                (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
                run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
                K = cfg.iterative_K[0]
        else:
            # note: rest_of_iterative_K aliases cfg.iterative_K, so pop(0) shortens both
            rest_of_iterative_K = cfg.iterative_K
            rest_of_iterative_K.pop(0)
            count = 0
            for K in rest_of_iterative_K:
                count += 1
                last_one = count == len(cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL)
                run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one)
                prev_K = K
                if last_one:
                    break
            if count < len(cfg.iterative_K):
                support.warning("Iterations stopped. Value of K (%d) exceeded estimated read length (%d)" %
                                (cfg.iterative_K[count], RL), log)

    latest = os.path.join(cfg.output_dir, "K%d" % K)

    for format in [".fasta", ".fastg"]:
        if os.path.isfile(os.path.join(latest, "before_rr" + format)):
            result_before_rr_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "before_rr" + format)
            if not os.path.isfile(result_before_rr_contigs) or not options_storage.continue_mode:
                shutil.copyfile(os.path.join(latest, "before_rr" + format), result_before_rr_contigs)
        if os.path.isfile(os.path.join(latest, "final_contigs" + format)):
            if not os.path.isfile(cfg.result_contigs[:-6] + format) or not options_storage.continue_mode:
                shutil.copyfile(os.path.join(latest, "final_contigs" + format), cfg.result_contigs[:-6] + format)
        if cfg.rr_enable:
            if os.path.isfile(os.path.join(latest, "scaffolds" + format)):
                if not os.path.isfile(cfg.result_scaffolds[:-6] + format) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "scaffolds" + format), cfg.result_scaffolds[:-6] + format)

    if cfg.developer_mode:
        # saves
        saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
        if os.path.lexists(saves_link):  # os.path.exists returns False for broken links; os.path.lexists returns True
            os.remove(saves_link)
        os.symlink(os.path.join(latest, "saves"), saves_link)

    if os.path.isdir(bin_reads_dir):
        shutil.rmtree(bin_reads_dir)
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)

    return latest
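
# ---------------------------------------------------------------------------
# Note on the `[:-6]` slicing used above: cfg.result_contigs and
# cfg.result_scaffolds always end in ".fasta" (six characters), so slicing
# with [:-6] strips the extension before ".fasta" or ".fastg" is appended.
# A hypothetical, extension-agnostic equivalent of that idiom:
def _example_with_format(result_filename, format):
    """Return `result_filename` with its extension replaced by `format`."""
    base = os.path.splitext(result_filename)[0]  # "out/contigs.fasta" -> "out/contigs"
    return base + format                         # e.g. "out/contigs" + ".fastg"
# ---------------------------------------------------------------------------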
def main(args):
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    cfg, dataset_data = fill_cfg(args, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"))
        if not options:
            support.error("failed to parse command line of the previous run! "
                          "Please restart from the beginning or specify another output directory.")
        cfg, dataset_data = fill_cfg(options, log)
        if options_storage.restart_from:
            check_cfg_for_restart_from(cfg)
        options_storage.continue_mode = True

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            updated_params = ""
            flag = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    flag = True
                    continue
                if flag:
                    flag = False
                    continue
                updated_params += " " + v
            updated_params = updated_params.strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        command = "Command line:"
        for v in args:
            command += " " + v
        log.info(command)

    # special case
    if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
        support.warning('cannot perform mismatch correction without at least one paired-end library! '
                        'Skipping this step.', log)
        del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
        if support.dataset_has_interlaced_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent their changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                    and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
                    pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'))
                    bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)
                hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg,
                                        not_used_dataset_data, ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg
                                                      and os.path.isfile(assembled_contigs_filename))) \
                    and not options_storage.restart_from == 'as' \
                    and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! "
                                  "Please restart from previous stages or from the beginning.", log)
            else:
                old_result_files = [result_contigs_filename, result_scaffolds_filename,
                                    assembled_contigs_filename, assembled_scaffolds_filename]
                for format in [".fasta", ".fastg"]:
                    for old_result_file in old_result_files:
                        if os.path.isfile(old_result_file[:-6] + format):
                            os.remove(old_result_file[:-6] + format)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(dataset_data,
                                                              os.path.dirname(corrected_dataset_yaml_filename))
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                    if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg,
                                                     dataset_data, ext_python_modules_home, log)

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

                if options_storage.continue_mode and options_storage.restart_from \
                        and options_storage.restart_from.startswith('k'):
                    k_str = options_storage.restart_from[1:]
                    if k_str.find(":") != -1:
                        k_str = k_str[:k_str.find(":")]
                    support.error("failed to continue from K=%s because this K "
                                  "was not processed in the original run!" % k_str, log)
            log.info("\n===== %s finished. \n" % STAGE_NAME)

        # corrector
        if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename) or
                                            (options_storage.continue_mode and
                                             os.path.isfile(assembled_contigs_filename))):
            STAGE_NAME = "Mismatch correction"
            to_correct = dict()
            to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
            if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                             os.path.isfile(assembled_scaffolds_filename)):
                to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

            # moving assembled contigs (scaffolds) to misc dir
            for assembly_type, (old, new) in to_correct.items():
                if options_storage.continue_mode and os.path.isfile(new):
                    continue
                for format in [".fasta", ".fastg"]:
                    if os.path.isfile(old[:-6] + format):
                        shutil.move(old[:-6] + format, new[:-6] + format)

            if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                    (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                    and not options_storage.restart_from == 'mc':
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                if options_storage.restart_from == 'mc':
                    support.continue_from_here(log)

                log.info("\n===== %s started." % STAGE_NAME)
                # detecting paired-end library with the largest insert size
                est_params_data = pyyaml.load(open(os.path.join(latest_dir, "final.lib_data"), 'r'))
                max_IS_library = None
                for reads_library in est_params_data:
                    if reads_library['type'] == 'paired-end':
                        if not max_IS_library or \
                                float(reads_library["insert size mean"]) > float(max_IS_library["insert size mean"]):
                            max_IS_library = reads_library
                if not max_IS_library:
                    support.error('Mismatch correction cannot be performed without at least one paired-end library!', log)
                if not max_IS_library["insert size mean"]:
                    support.warning('Failed to estimate insert size for all paired-end libraries. '
                                    'Starting Mismatch correction based on the first paired-end library '
                                    'and with default insert size.', log)
                else:
                    cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_IS_library["insert size mean"])
                yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename)
                cfg["mismatch_corrector"].__dict__["1"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                                                                   max_IS_library['left reads']))
                cfg["mismatch_corrector"].__dict__["2"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                                                                   max_IS_library['right reads']))
                # TODO: add reads orientation

                import corrector
                corrector_cfg = cfg["mismatch_corrector"]
                # build a CLI-style argument list for the corrector (reuses the name `args`)
                args = []
                for key, values in corrector_cfg.__dict__.items():
                    if key == "output-dir":
                        continue

                    # for processing list of reads
                    if not isinstance(values, list):
                        values = [values]
                    for value in values:
                        if len(key) == 1:
                            args.append('-' + key)
                        else:
                            args.append('--' + key)
                        if value is not None:
                            args.append(value)

                # processing contigs and scaffolds (or only contigs)
                for assembly_type, (corrected, assembled) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(corrected):
                        log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                        continue

                    support.continue_from_here(log)
                    log.info("\n== Processing of " + assembly_type + "\n")

                    cur_args = args[:]
                    cur_args += ['-c', assembled]
                    tmp_dir_for_corrector = support.get_tmp_dir(prefix="mis_cor_%s_" % assembly_type)
                    cur_args += ['--output-dir', tmp_dir_for_corrector]

                    # correcting
                    corrector.main(cur_args, ext_python_modules_home, log)

                    result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
                    # moving corrected contigs (scaffolds) to SPAdes output dir
                    if os.path.isfile(result_corrected_filename):
                        shutil.move(result_corrected_filename, corrected)

                    if os.path.isdir(tmp_dir_for_corrector):
                        shutil.rmtree(tmp_dir_for_corrector)

                    assembled_fastg = assembled[:-6] + ".fastg"
                    if os.path.isfile(assembled_fastg):
                        support.create_fastg_from_fasta(corrected, assembled_fastg, log)
                log.info("\n===== %s finished.\n" % STAGE_NAME)

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        # log.info("")
        if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " +
                     support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
        if "assembly" in cfg and os.path.isfile(result_contigs_filename):
            message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
            if os.path.isfile(result_contigs_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_contigs_filename[:-6] + ".fastg") + ")"
            log.info(message)
        if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
            message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
            if os.path.isfile(result_scaffolds_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_scaffolds_filename[:-6] + ".fastg") + ")"
            log.info(message)
        # log.info("")

        # breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename,
                                                                     options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds, broken_scaffolds)
                    # log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                    #          " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                if os.path.isfile(result_filename):
                    result_fasta = list(support.read_fasta(result_filename))
                    # correctness check: should be one contig of length 1000 bp
                    correct_number = 1
                    correct_length = 1000
                    if not len(result_fasta):
                        support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                    elif len(result_fasta) > correct_number:
                        support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
                                      (result_filename, correct_number, len(result_fasta)))
                    elif len(result_fasta[0][1]) != correct_length:
                        if len(result_fasta[0][1]) > correct_length:
                            relation = "more"
                        else:
                            relation = "less"
                        support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
                                      (result_filename, relation, correct_length, len(result_fasta[0][1])))
                else:
                    support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_modules_home, log):
    if not isinstance(cfg.iterative_K, list):
        cfg.iterative_K = [cfg.iterative_K]
    cfg.iterative_K = sorted(cfg.iterative_K)
    used_K = []

    # checking and removing conflicting K-mer directories
    if options_storage.restart_from and (options_storage.restart_k_mers != options_storage.original_k_mers):
        processed_K = []
        for k in range(options_storage.MIN_K, options_storage.MAX_K, 2):
            cur_K_dir = os.path.join(cfg.output_dir, "K%d" % k)
            if os.path.isdir(cur_K_dir) and os.path.isfile(os.path.join(cur_K_dir, "final_contigs.fasta")):
                processed_K.append(k)
        if processed_K:
            RL = get_read_length(cfg.output_dir, processed_K[0], ext_python_modules_home, log)
            needed_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log, silent=True)
            needed_K = [k for k in needed_K if k < RL]
            original_K = reveal_original_k_mers(RL)

            k_to_delete = []
            for id, k in enumerate(needed_K):
                if len(processed_K) == id:
                    if processed_K[-1] == original_K[-1]:  # the last K in the original run was processed in "last_one" mode
                        k_to_delete = [original_K[-1]]
                    break
                if processed_K[id] != k:
                    k_to_delete = processed_K[id:]
                    break
            if not k_to_delete and (len(processed_K) > len(needed_K)):
                k_to_delete = processed_K[len(needed_K) - 1:]
            if k_to_delete:
                log.info("Restart mode: removing previously processed directories for K=%s "
                         "to avoid conflicts with K specified with --restart-from" % (str(k_to_delete)))
                for k in k_to_delete:
                    shutil.rmtree(os.path.join(cfg.output_dir, "K%d" % k))

    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
    if os.path.isdir(bin_reads_dir) and not options_storage.continue_mode:
        shutil.rmtree(bin_reads_dir)
    cfg.tmp_dir = support.get_tmp_dir(prefix="spades_")

    finished_on_stop_after = False
    K = cfg.iterative_K[0]
    if len(cfg.iterative_K) == 1:
        run_iteration(configs_dir, execution_home, cfg, log, K, None, True)
        used_K.append(K)
    else:
        run_iteration(configs_dir, execution_home, cfg, log, K, None, False)
        used_K.append(K)
        if options_storage.stop_after == "k%d" % K:
            finished_on_stop_after = True
        else:
            prev_K = K
            RL = get_read_length(cfg.output_dir, K, ext_python_modules_home, log)
            cfg.iterative_K = update_k_mers_in_special_cases(cfg.iterative_K, RL, log)
            if len(cfg.iterative_K) < 2 or cfg.iterative_K[1] + 1 > RL:
                if cfg.rr_enable:
                    if len(cfg.iterative_K) < 2:
                        log.info("== Rerunning for the first value of K (%d) with Repeat Resolving" %
                                 cfg.iterative_K[0])
                    else:
                        support.warning("Second value of iterative K (%d) exceeded estimated read length (%d). "
                                        "Rerunning for the first value of K (%d) with Repeat Resolving" %
                                        (cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
                    run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
                    used_K.append(cfg.iterative_K[0])
                    K = cfg.iterative_K[0]
            else:
                # note: rest_of_iterative_K aliases cfg.iterative_K, so pop(0) shortens both
                rest_of_iterative_K = cfg.iterative_K
                rest_of_iterative_K.pop(0)
                count = 0
                for K in rest_of_iterative_K:
                    count += 1
                    last_one = count == len(cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL)
                    run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one)
                    used_K.append(K)
                    prev_K = K
                    if last_one:
                        break
                    if options_storage.stop_after == "k%d" % K:
                        finished_on_stop_after = True
                        break
                if count < len(cfg.iterative_K) and not finished_on_stop_after:
                    support.warning("Iterations stopped. Value of K (%d) exceeded estimated read length (%d)" %
                                    (cfg.iterative_K[count], RL), log)

    if options_storage.stop_after and options_storage.stop_after.startswith('k'):
        support.finish_here(log)
    latest = os.path.join(cfg.output_dir, "K%d" % K)

    if cfg.correct_scaffolds and not options_storage.run_completed:
        if options_storage.continue_mode and \
                os.path.isfile(os.path.join(cfg.output_dir, "SCC", "corrected_scaffolds.fasta")) and \
                not options_storage.restart_from == "scc":
            log.info("\n===== Skipping %s (already processed). \n" % "scaffold correction")
        else:
            if options_storage.continue_mode:
                support.continue_from_here(log)
            run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, 21)
        latest = os.path.join(os.path.join(cfg.output_dir, "SCC"), "K21")
        if options_storage.stop_after == 'scc':
            support.finish_here(log)

    if cfg.correct_scaffolds:
        correct_scaffolds_fpath = os.path.join(latest, "corrected_scaffolds.fasta")
        if os.path.isfile(correct_scaffolds_fpath):
            shutil.copyfile(correct_scaffolds_fpath, cfg.result_scaffolds)
    elif not finished_on_stop_after:  # if interrupted by --stop-after, the final K was not processed!
        if os.path.isfile(os.path.join(latest, "before_rr.fasta")):
            result_before_rr_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "before_rr.fasta")
            if not os.path.isfile(result_before_rr_contigs) or not options_storage.continue_mode:
                shutil.copyfile(os.path.join(latest, "before_rr.fasta"), result_before_rr_contigs)
        if options_storage.rna:
            if os.path.isfile(os.path.join(latest, "transcripts.fasta")):
                if not os.path.isfile(cfg.result_transcripts) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "transcripts.fasta"), cfg.result_transcripts)
            if os.path.isfile(os.path.join(latest, "transcripts.paths")):
                if not os.path.isfile(cfg.result_transcripts_paths) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "transcripts.paths"), cfg.result_transcripts_paths)
            for filtering_type in options_storage.filtering_types:
                prefix = filtering_type + "_filtered_"
                result_filtered_transcripts = os.path.join(cfg.output_dir, prefix + options_storage.transcripts_name)
                latest_filtered_transcripts = os.path.join(latest, prefix + "final_paths.fasta")
                if os.path.isfile(latest_filtered_transcripts):
                    if not os.path.isfile(result_filtered_transcripts) or not options_storage.continue_mode:
                        shutil.copyfile(latest_filtered_transcripts, result_filtered_transcripts)
        else:
            if os.path.isfile(os.path.join(latest, "final_contigs.fasta")):
                if not os.path.isfile(cfg.result_contigs) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "final_contigs.fasta"), cfg.result_contigs)
            if os.path.isfile(os.path.join(latest, "first_pe_contigs.fasta")):
                result_first_pe_contigs = os.path.join(os.path.dirname(cfg.result_contigs), "first_pe_contigs.fasta")
                if not os.path.isfile(result_first_pe_contigs) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "first_pe_contigs.fasta"), result_first_pe_contigs)
            if cfg.rr_enable:
                if os.path.isfile(os.path.join(latest, "scaffolds.fasta")):
                    if not os.path.isfile(cfg.result_scaffolds) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "scaffolds.fasta"), cfg.result_scaffolds)
                if os.path.isfile(os.path.join(latest, "scaffolds.paths")):
                    if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode:
                        shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths)
            if os.path.isfile(os.path.join(latest, "assembly_graph_with_scaffolds.gfa")):
                if not os.path.isfile(cfg.result_graph_gfa) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "assembly_graph_with_scaffolds.gfa"), cfg.result_graph_gfa)
            if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")):
                if not os.path.isfile(cfg.result_graph) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "assembly_graph.fastg"), cfg.result_graph)
            if os.path.isfile(os.path.join(latest, "final_contigs.paths")):
                if not os.path.isfile(cfg.result_contigs_paths) or not options_storage.continue_mode:
                    shutil.copyfile(os.path.join(latest, "final_contigs.paths"), cfg.result_contigs_paths)

    if cfg.developer_mode:
        # saves
        saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
        if os.path.lexists(saves_link):  # os.path.exists returns False for broken links; os.path.lexists returns True
            os.remove(saves_link)
        os.symlink(os.path.join(latest, "saves"), saves_link)

    if os.path.isdir(bin_reads_dir):
        shutil.rmtree(bin_reads_dir)
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)

    return used_K