def prepare_config_spades(filename, cfg, log, use_additional_contigs, K, last_one):
    """Substitute per-iteration assembler parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; dataset, output_dir, additional_contigs,
           developer_mode, paired_mode, max_threads and max_memory are read,
           plus resolving_mode / careful when present in cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    use_additional_contigs -- flag written under "use_additional_contigs"
    K -- value written (as a string) under "K"
    last_one -- flag that switches on gap closing, topology simplification,
                mismatch correction and (together with cfg.paired_mode)
                paired mode
    """
    process_spaces = process_cfg.process_spaces
    cfg_fields = cfg.__dict__

    params = {
        "K": str(K),
        "run_mode": "false",
        "dataset": process_spaces(cfg.dataset),
        "output_base": process_spaces(cfg.output_dir),
        "additional_contigs": process_spaces(cfg.additional_contigs),
        "entry_point": "construction",
        "developer_mode": bool_to_str(cfg.developer_mode),
        "gap_closer_enable": bool_to_str(last_one),
        "paired_mode": bool_to_str(last_one and cfg.paired_mode),
        "topology_simplif_enabled": bool_to_str(last_one),
        "use_additional_contigs": bool_to_str(use_additional_contigs),
        "max_threads": cfg.max_threads,
        "max_memory": cfg.max_memory,
        "correct_mismatches": bool_to_str(last_one),
    }

    # Optional settings are forwarded only when they were set on cfg.
    if "resolving_mode" in cfg_fields:
        params["resolving_mode"] = cfg.resolving_mode
    if "careful" in cfg_fields:
        params["mismatch_careful"] = bool_to_str(cfg.careful)

    process_cfg.substitute_params(filename, params, log)
def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one):
    """Substitute per-iteration assembler parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; optional fields (diploid_mode, resolving_mode,
           careful, pacbio_mode) are used only when present in cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    additional_contigs_fname -- path of extra contigs; a falsy value disables
                                their use
    K -- value written (as a string) under "K"
    stage -- value written under "entry_point"
    saves_dir -- value written under "load_from"
    last_one -- flag that switches on the main iteration, gap closing,
                mismatch correction and (together with cfg.rr_enable)
                "rr_enable"
    """
    process_spaces = process_cfg.process_spaces
    cfg_fields = cfg.__dict__

    params = {"K": str(K), "run_mode": "false"}
    if "diploid_mode" in cfg_fields:
        params["diploid_mode"] = bool_to_str(cfg.diploid_mode)

    params["dataset"] = process_spaces(cfg.dataset)
    params["output_base"] = process_spaces(cfg.output_dir)
    params["tmp_dir"] = process_spaces(cfg.tmp_dir)

    # The contigs path is only written when one was supplied.
    has_extra_contigs = bool(additional_contigs_fname)
    if has_extra_contigs:
        params["additional_contigs"] = process_spaces(additional_contigs_fname)
    params["use_additional_contigs"] = bool_to_str(has_extra_contigs)

    params.update({
        "main_iteration": bool_to_str(last_one),
        "entry_point": stage,
        "load_from": saves_dir,
        "developer_mode": bool_to_str(cfg.developer_mode),
        "gap_closer_enable": bool_to_str(last_one),
        "rr_enable": bool_to_str(last_one and cfg.rr_enable),
        "max_threads": cfg.max_threads,
        "max_memory": cfg.max_memory,
        "correct_mismatches": bool_to_str(last_one),
    })

    if "resolving_mode" in cfg_fields:
        params["resolving_mode"] = cfg.resolving_mode
    if "careful" in cfg_fields:
        params["mismatch_careful"] = bool_to_str(cfg.careful)
    if "pacbio_mode" in cfg_fields:
        params["pacbio_test_on"] = bool_to_str(cfg.pacbio_mode)
        params["pacbio_reads"] = process_spaces(cfg.pacbio_reads)

    process_cfg.substitute_params(filename, params, log)
def prepare_config_mulksg(filename, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home):
    """Substitute per-iteration parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; iterative_K, max_threads, max_memory, cov_cutoff,
           lcer_cutoff, save_gp, rr_enable, developer_mode, dataset,
           output_dir and tmp_dir are read, plus resolving_mode, pacbio_mode
           and series_analysis when present in cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    additional_contigs_fname -- path of extra contigs; a falsy value disables
                                their use
    K -- value of the current iteration, written (as a string) under "K"
    stage -- value written under "entry_point"
    saves_dir -- value written under "load_from"
    last_one -- True for the final iteration
    execution_home -- unused here; kept for signature compatibility
    """
    subst_dict = dict()

    subst_dict["K"] = str(K)
    subst_dict["dataset"] = process_cfg.process_spaces(cfg.dataset)
    subst_dict["output_base"] = process_cfg.process_spaces(cfg.output_dir)
    subst_dict["tmp_dir"] = process_cfg.process_spaces(cfg.tmp_dir)
    if additional_contigs_fname:
        subst_dict["additional_contigs"] = process_cfg.process_spaces(additional_contigs_fname)
        subst_dict["use_additional_contigs"] = bool_to_str(True)
    else:
        subst_dict["use_additional_contigs"] = bool_to_str(False)
    subst_dict["main_iteration"] = bool_to_str(last_one)
    subst_dict["entry_point"] = stage
    subst_dict["load_from"] = saves_dir
    subst_dict["developer_mode"] = bool_to_str(cfg.developer_mode)
    subst_dict["gap_closer_enable"] = bool_to_str(last_one or K >= 55)
    subst_dict["rr_enable"] = bool_to_str(last_one and cfg.rr_enable)

    # TODO: If on multi node, max_threads and max_memory need to change!!!
    # The thread cap is the same for every iteration: never ask for more
    # threads than this machine reports.
    subst_dict["max_threads"] = min(multiprocessing.cpu_count(), cfg.max_threads)
    if K == cfg.iterative_K[-1]:
        # The iteration for the last listed K gets the whole memory budget.
        subst_dict["max_memory"] = cfg.max_memory
    else:
        # Every other K gets an equal share of the budget.  Floor division
        # keeps the value integral on Python 3 as well as Python 2 (plain "/"
        # would write a fractional limit into the config), and both clamps
        # avoid a zero divisor and a zero memory limit.
        other_k_count = max(1, len(cfg.iterative_K) - 1)
        subst_dict["max_memory"] = max(1, cfg.max_memory // other_k_count)

    subst_dict["save_gp"] = bool_to_str(cfg.save_gp)
    if not last_one:
        subst_dict["correct_mismatches"] = bool_to_str(False)
    if "resolving_mode" in cfg.__dict__:
        subst_dict["resolving_mode"] = cfg.resolving_mode
    if "pacbio_mode" in cfg.__dict__:
        subst_dict["pacbio_test_on"] = bool_to_str(cfg.pacbio_mode)
        subst_dict["pacbio_reads"] = process_cfg.process_spaces(cfg.pacbio_reads)

    # cov_cutoff is "off", "auto" or an explicit threshold value.
    if cfg.cov_cutoff == "off":
        subst_dict["use_coverage_threshold"] = bool_to_str(False)
    else:
        subst_dict["use_coverage_threshold"] = bool_to_str(True)
        if cfg.cov_cutoff == "auto":
            subst_dict["coverage_threshold"] = 0.0
        else:
            subst_dict["coverage_threshold"] = cfg.cov_cutoff

    if cfg.lcer_cutoff is not None:
        subst_dict["lcer_enabled"] = bool_to_str(True)
        subst_dict["lcer_coverage_threshold"] = cfg.lcer_cutoff

    if "series_analysis" in cfg.__dict__:
        subst_dict["series_analysis"] = cfg.series_analysis

    process_cfg.substitute_params(filename, subst_dict, log)
def prepare_config(config_fname, ds_args, log):
    """Substitute the values taken from `ds_args` into the config `config_fname`.

    config_fname -- path of the config file passed to
                    process_cfg.substitute_params
    ds_args -- options object; allow_gaps, weak_align, haplocontigs,
               output_dir, tmp_dir, max_threads and max_memory are read
    log -- logger forwarded to process_cfg.substitute_params
    """
    process_spaces = process_cfg.process_spaces
    to_flag = process_cfg.bool_to_str

    settings = {
        "tails_lie_on_bulges": to_flag(ds_args.allow_gaps),
        "align_bulge_sides": to_flag(ds_args.weak_align),
        "haplocontigs": process_spaces(ds_args.haplocontigs),
        "output_dir": process_spaces(ds_args.output_dir),
        # developer mode is hard-wired to "false" in this variant
        "developer_mode": "false",
        "tmp_dir": process_spaces(ds_args.tmp_dir),
        "max_threads": ds_args.max_threads,
        "max_memory": ds_args.max_memory,
    }
    process_cfg.substitute_params(config_fname, settings, log)
def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home):
    """Substitute per-iteration assembler parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; optional fields (checkpoints, resolving_mode,
           pacbio_mode, series_analysis) are used only when present in
           cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    additional_contigs_fname -- path of extra contigs; a falsy value disables
                                their use
    K -- value written (as a string) under "K"
    stage -- value written under "entry_point"
    saves_dir -- value written under "load_from"
    last_one -- True for the final iteration
    execution_home -- not used by this function
    """
    process_spaces = process_cfg.process_spaces
    cfg_fields = cfg.__dict__

    params = {
        "K": str(K),
        "dataset": process_spaces(cfg.dataset),
        "output_base": process_spaces(cfg.output_dir),
        "tmp_dir": process_spaces(cfg.tmp_dir),
    }

    # The contigs path is only written when one was supplied.
    has_extra_contigs = bool(additional_contigs_fname)
    if has_extra_contigs:
        params["additional_contigs"] = process_spaces(additional_contigs_fname)
    params["use_additional_contigs"] = bool_to_str(has_extra_contigs)

    params["main_iteration"] = bool_to_str(last_one)
    params["entry_point"] = stage
    params["load_from"] = saves_dir
    if "checkpoints" in cfg_fields:
        params["checkpoints"] = cfg.checkpoints

    params.update({
        "developer_mode": bool_to_str(cfg.developer_mode),
        "time_tracer_enabled": bool_to_str(cfg.time_tracer),
        # gap closing runs on the final iteration or once K reaches the
        # configured minimum
        "gap_closer_enable": bool_to_str(
            last_one or K >= options_storage.GAP_CLOSER_ENABLE_MIN_K),
        "rr_enable": bool_to_str(last_one and cfg.rr_enable),
        "max_threads": cfg.max_threads,
        "max_memory": cfg.max_memory,
        "save_gp": bool_to_str(cfg.save_gp),
    })

    # Mismatch correction is only explicitly switched off for non-final runs.
    if not last_one:
        params["correct_mismatches"] = bool_to_str(False)
    if "resolving_mode" in cfg_fields:
        params["resolving_mode"] = cfg.resolving_mode
    if "pacbio_mode" in cfg_fields:
        params["pacbio_test_on"] = bool_to_str(cfg.pacbio_mode)
        params["pacbio_reads"] = process_spaces(cfg.pacbio_reads)

    # cov_cutoff is "off", "auto" or an explicit threshold value.
    threshold_on = cfg.cov_cutoff != "off"
    params["use_coverage_threshold"] = bool_to_str(threshold_on)
    if threshold_on:
        params["coverage_threshold"] = 0.0 if cfg.cov_cutoff == "auto" else cfg.cov_cutoff

    if cfg.lcer_cutoff is not None:
        params["lcer_enabled"] = bool_to_str(True)
        params["lcer_coverage_threshold"] = cfg.lcer_cutoff

    if "series_analysis" in cfg_fields:
        params["series_analysis"] = cfg.series_analysis

    process_cfg.substitute_params(filename, params, log)
def prepare_config_ih(filename, cfg, ext_python_modules_home):
    """Rewrite the YAML config `filename` in place with values from `cfg`.

    filename -- path of the YAML config; it is loaded, updated and dumped back
    cfg -- settings object; dataset_yaml_filename, tmp_dir, output_dir,
           max_memory and max_threads are read
    ext_python_modules_home -- directory added to the module search path so
                               the bundled pyyaml2 / pyyaml3 can be imported
    """
    addsitedir(ext_python_modules_home)
    # Pick the bundled YAML package that matches the running interpreter.
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    # NOTE(review): pyyaml.load is called without an explicit safe loader;
    # that is only acceptable if this config file is trusted -- confirm.
    # Both handles are closed deterministically so the dumped file is fully
    # flushed before anything else reads it.
    with open(filename, 'r') as config_in:
        data = pyyaml.load(config_in)

    data["dataset"] = process_cfg.process_spaces(cfg.dataset_yaml_filename)
    data["working_dir"] = process_cfg.process_spaces(cfg.tmp_dir)
    data["output_dir"] = process_cfg.process_spaces(cfg.output_dir)
    data["hard_memory_limit"] = cfg.max_memory
    data["max_nthreads"] = cfg.max_threads

    with open(filename, 'w') as config_out:
        pyyaml.dump(data, config_out)
def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home):
    """Substitute per-iteration assembler parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; optional fields (resolving_mode, pacbio_mode,
           bwa_paired, series_analysis) are used only when present in
           cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    additional_contigs_fname -- path of extra contigs; a falsy value disables
                                their use
    K -- value written (as a string) under "K"
    stage -- value written under "entry_point"
    saves_dir -- value written under "load_from"
    last_one -- True for the final iteration
    execution_home -- directory joined with "bwa-spades" for "path_to_bwa"
    """
    process_spaces = process_cfg.process_spaces
    cfg_fields = cfg.__dict__

    params = {
        "K": str(K),
        "dataset": process_spaces(cfg.dataset),
        "output_base": process_spaces(cfg.output_dir),
        "tmp_dir": process_spaces(cfg.tmp_dir),
    }

    # The contigs path is only written when one was supplied.
    has_extra_contigs = bool(additional_contigs_fname)
    if has_extra_contigs:
        params["additional_contigs"] = process_spaces(additional_contigs_fname)
    params["use_additional_contigs"] = bool_to_str(has_extra_contigs)

    params.update({
        "main_iteration": bool_to_str(last_one),
        "entry_point": stage,
        "load_from": saves_dir,
        "developer_mode": bool_to_str(cfg.developer_mode),
        "gap_closer_enable": bool_to_str(last_one or K >= 55),
        "rr_enable": bool_to_str(last_one and cfg.rr_enable),
        "max_threads": cfg.max_threads,
        "max_memory": cfg.max_memory,
        "save_gp": bool_to_str(cfg.save_gp),
    })

    # Mismatch correction is only explicitly switched off for non-final runs.
    if not last_one:
        params["correct_mismatches"] = bool_to_str(False)
    if "resolving_mode" in cfg_fields:
        params["resolving_mode"] = cfg.resolving_mode
    if "pacbio_mode" in cfg_fields:
        params["pacbio_test_on"] = bool_to_str(cfg.pacbio_mode)
        params["pacbio_reads"] = process_spaces(cfg.pacbio_reads)

    # cov_cutoff is "off", "auto" or an explicit threshold value.
    threshold_on = cfg.cov_cutoff != "off"
    params["use_coverage_threshold"] = bool_to_str(threshold_on)
    if threshold_on:
        params["coverage_threshold"] = 0.0 if cfg.cov_cutoff == "auto" else cfg.cov_cutoff

    if cfg.lcer_cutoff is not None:
        params["lcer_enabled"] = bool_to_str(True)
        params["lcer_coverage_threshold"] = cfg.lcer_cutoff

    # TODO: make something about spades.py and config param substitution
    if "bwa_paired" in cfg_fields:
        params["bwa_enable"] = bool_to_str(True)
    # NOTE(review): path_to_bwa is written unconditionally here; confirm it
    # does not belong inside the bwa_paired branch above.
    params["path_to_bwa"] = os.path.join(execution_home, "bwa-spades")

    if "series_analysis" in cfg_fields:
        params["series_analysis"] = cfg.series_analysis

    process_cfg.substitute_params(filename, params, log)
def prepare_config_bh(filename, cfg, log):
    """Substitute the read error correction parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; dataset_yaml_filename, tmp_dir, output_dir,
           max_iterations, max_threads and max_memory are read, plus qvoffset
           when present in cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    """
    process_spaces = process_cfg.process_spaces

    params = {
        "dataset": process_spaces(cfg.dataset_yaml_filename),
        "input_working_dir": process_spaces(cfg.tmp_dir),
        "output_dir": process_spaces(cfg.output_dir),
        "general_max_iterations": cfg.max_iterations,
    }

    # Every thread-count key receives the same limit.
    for thread_key in ("general_max_nthreads", "count_merge_nthreads",
                       "bayes_nthreads", "expand_nthreads", "correct_nthreads"):
        params[thread_key] = cfg.max_threads

    params["general_hard_memory_limit"] = cfg.max_memory
    if "qvoffset" in cfg.__dict__:
        params["input_qvoffset"] = cfg.qvoffset

    process_cfg.substitute_params(filename, params, log)
def prepare_config(config_fname, ds_args, log):
    """Substitute the values taken from `ds_args` into the config `config_fname`.

    config_fname -- path of the config file passed to
                    process_cfg.substitute_params
    ds_args -- options object; allow_gaps, weak_align, haplocontigs,
               output_dir, dev_mode, tmp_dir, max_threads, max_memory,
               haplotype_assembly, k, saves and start_from are read
    log -- logger forwarded to process_cfg.substitute_params
    """
    process_spaces = process_cfg.process_spaces
    to_flag = process_cfg.bool_to_str

    settings = {
        # both flags are written negated relative to the option they come from
        "tails_lie_on_bulges": to_flag(not ds_args.allow_gaps),
        "align_bulge_sides": to_flag(not ds_args.weak_align),
        "haplocontigs": process_spaces(ds_args.haplocontigs),
        "output_dir": process_spaces(ds_args.output_dir),
        "developer_mode": to_flag(ds_args.dev_mode),
        "tmp_dir": process_spaces(ds_args.tmp_dir),
        "max_threads": ds_args.max_threads,
        "max_memory": ds_args.max_memory,
        "output_base": "",
        "ha_enabled": to_flag(ds_args.haplotype_assembly),
        "K": str(ds_args.k),
        "saves": ds_args.saves,
        "entry_point": ds_args.start_from,
    }
    process_cfg.substitute_params(config_fname, settings, log)
def prepare_config_scaffold_correction(filename, cfg, log, saves_dir, K):
    """Substitute the scaffold correction parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; dataset, output_dir, tmp_dir, developer_mode,
           max_threads and max_memory are read
    log -- logger forwarded to process_cfg.substitute_params
    saves_dir -- value written under "load_from"
    K -- value written (as a string) under "K"
    """
    process_spaces = process_cfg.process_spaces
    scc_output_dir = os.path.join(cfg.output_dir, "SCC")

    params = {
        "K": str(K),
        "dataset": process_spaces(cfg.dataset),
        # results go to an "SCC" subdirectory of the output directory
        "output_base": process_spaces(scc_output_dir),
        "tmp_dir": process_spaces(cfg.tmp_dir),
        "use_additional_contigs": bool_to_str(False),
        "main_iteration": bool_to_str(False),
        "entry_point": BASE_STAGE,
        "load_from": saves_dir,
        "developer_mode": bool_to_str(cfg.developer_mode),
        "max_threads": cfg.max_threads,
        "max_memory": cfg.max_memory,
    }

    # todo
    process_cfg.substitute_params(filename, params, log)
def prepare_config_bh(filename, cfg, log):
    """Substitute the read error correction parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; dataset_yaml_filename, tmp_dir, output_dir,
           max_iterations, max_threads and max_memory are read, plus qvoffset
           when present in cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    """
    process_spaces = process_cfg.process_spaces
    thread_keys = (
        "general_max_nthreads",
        "count_merge_nthreads",
        "bayes_nthreads",
        "expand_nthreads",
        "correct_nthreads",
    )

    params = {
        "dataset": process_spaces(cfg.dataset_yaml_filename),
        "input_working_dir": process_spaces(cfg.tmp_dir),
        "output_dir": process_spaces(cfg.output_dir),
        "general_max_iterations": cfg.max_iterations,
    }
    # Every thread-count key receives the same limit.
    for key in thread_keys:
        params[key] = cfg.max_threads
    params["general_hard_memory_limit"] = cfg.max_memory

    if "qvoffset" in cfg.__dict__:
        params["input_qvoffset"] = cfg.qvoffset

    process_cfg.substitute_params(filename, params, log)
def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one):
    """Substitute per-iteration assembler parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; optional fields (diploid_mode, resolving_mode,
           careful, pacbio_mode) are used only when present in cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    additional_contigs_fname -- path of extra contigs; a falsy value disables
                                their use
    K -- value written (as a string) under "K"
    stage -- value written under "entry_point"
    saves_dir -- value written under "load_from"
    last_one -- True for the final iteration
    """
    process_spaces = process_cfg.process_spaces
    cfg_fields = cfg.__dict__

    params = {"K": str(K), "run_mode": "false"}
    if "diploid_mode" in cfg_fields:
        params["diploid_mode"] = bool_to_str(cfg.diploid_mode)

    params["dataset"] = process_spaces(cfg.dataset)
    params["output_base"] = process_spaces(cfg.output_dir)
    params["tmp_dir"] = process_spaces(cfg.tmp_dir)

    # The contigs path is only written when one was supplied.
    has_extra_contigs = bool(additional_contigs_fname)
    if has_extra_contigs:
        params["additional_contigs"] = process_spaces(additional_contigs_fname)
    params["use_additional_contigs"] = bool_to_str(has_extra_contigs)

    params.update({
        "main_iteration": bool_to_str(last_one),
        "entry_point": stage,
        "load_from": saves_dir,
        "developer_mode": bool_to_str(cfg.developer_mode),
        # gap closing runs on the final iteration or once K reaches 55
        "gap_closer_enable": bool_to_str(last_one or K >= 55),
        "rr_enable": bool_to_str(last_one and cfg.rr_enable),
        "max_threads": cfg.max_threads,
        "max_memory": cfg.max_memory,
        "correct_mismatches": bool_to_str(last_one),
    })

    if "resolving_mode" in cfg_fields:
        params["resolving_mode"] = cfg.resolving_mode
    if "careful" in cfg_fields:
        params["mismatch_careful"] = bool_to_str(cfg.careful)
    if "pacbio_mode" in cfg_fields:
        params["pacbio_test_on"] = bool_to_str(cfg.pacbio_mode)
        params["pacbio_reads"] = process_spaces(cfg.pacbio_reads)

    # cov_cutoff is "off", "auto" or an explicit threshold value.
    threshold_on = cfg.cov_cutoff != "off"
    params["use_coverage_threshold"] = bool_to_str(threshold_on)
    if threshold_on:
        params["coverage_threshold"] = 0.0 if cfg.cov_cutoff == "auto" else cfg.cov_cutoff

    process_cfg.substitute_params(filename, params, log)
def prepare_config_bh(self, filename, cfg, log):
    """Substitute the read error correction parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; dataset_yaml_filename, tmp_dir, output_dir,
           max_threads and max_memory are read, plus qvoffset,
           count_filter_singletons and read_buffer_size when present in
           cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params

    The iteration count comes from options_storage.ITERATIONS, not from cfg.
    """
    process_spaces = process_cfg.process_spaces
    cfg_fields = cfg.__dict__

    params = {
        "dataset": process_spaces(cfg.dataset_yaml_filename),
        "input_working_dir": process_spaces(cfg.tmp_dir),
        "output_dir": process_spaces(cfg.output_dir),
        "general_max_iterations": options_storage.ITERATIONS,
    }

    # Every thread-count key receives the same limit.
    for thread_key in ("general_max_nthreads", "count_merge_nthreads",
                       "bayes_nthreads", "expand_nthreads", "correct_nthreads"):
        params[thread_key] = cfg.max_threads

    params["general_hard_memory_limit"] = cfg.max_memory

    # Optional settings are forwarded only when they were set on cfg.
    if "qvoffset" in cfg_fields:
        params["input_qvoffset"] = cfg.qvoffset
    if "count_filter_singletons" in cfg_fields:
        params["count_filter_singletons"] = cfg.count_filter_singletons
    if "read_buffer_size" in cfg_fields:
        params["count_split_buffer"] = cfg.read_buffer_size

    process_cfg.substitute_params(filename, params, log)
def prepare_config_spades(filename, cfg, log, use_additional_contigs, K, last_one):
    """Substitute per-iteration assembler parameters into the config `filename`.

    filename -- path of the config file passed to process_cfg.substitute_params
    cfg -- settings object; dataset, output_dir, additional_contigs,
           developer_mode, paired_mode, max_threads and max_memory are read,
           plus resolving_mode / careful when present in cfg.__dict__
    log -- logger forwarded to process_cfg.substitute_params
    use_additional_contigs -- flag written under "use_additional_contigs"
    K -- value written (as a string) under "K"
    last_one -- True for the final iteration
    """
    process_spaces = process_cfg.process_spaces

    # Values that do not depend on which iteration this is.
    params = {
        "K": str(K),
        "run_mode": "false",
        "dataset": process_spaces(cfg.dataset),
        "output_base": process_spaces(cfg.output_dir),
        "additional_contigs": process_spaces(cfg.additional_contigs),
        "entry_point": "construction",
        "developer_mode": bool_to_str(cfg.developer_mode),
    }

    # Stages that are enabled only on the final iteration.
    params["gap_closer_enable"] = bool_to_str(last_one)
    params["paired_mode"] = bool_to_str(last_one and cfg.paired_mode)
    params["topology_simplif_enabled"] = bool_to_str(last_one)

    params["use_additional_contigs"] = bool_to_str(use_additional_contigs)
    params["max_threads"] = cfg.max_threads
    params["max_memory"] = cfg.max_memory
    params["correct_mismatches"] = bool_to_str(last_one)

    optional_fields = cfg.__dict__
    if "resolving_mode" in optional_fields:
        params["resolving_mode"] = cfg.resolving_mode
    if "careful" in optional_fields:
        params["mismatch_careful"] = bool_to_str(cfg.careful)

    process_cfg.substitute_params(filename, params, log)
def prepare_config_corr(filename, cfg, ext_python_modules_home):
    """Rewrite the YAML config `filename` in place with values from `cfg`.

    filename -- path of the YAML config; it is loaded, updated and dumped back
    cfg -- settings object; dataset, output_dir, tmp_dir, max_threads and bwa
           are read
    ext_python_modules_home -- directory added to the module search path so
                               the bundled pyyaml2 / pyyaml3 can be imported
    """
    addsitedir(ext_python_modules_home)
    # Pick the bundled YAML package that matches the running interpreter.
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    # NOTE(review): pyyaml.load is called without an explicit safe loader;
    # that is only acceptable if this config file is trusted -- confirm.
    # The context manager closes the read handle, which was previously leaked.
    with open(filename, 'r') as config_in:
        data = pyyaml.load(config_in)

    data["dataset"] = cfg.dataset
    data["output_dir"] = cfg.output_dir
    data["work_dir"] = process_cfg.process_spaces(cfg.tmp_dir)
    #data["hard_memory_limit"] = cfg.max_memory
    data["max_nthreads"] = cfg.max_threads
    data["bwa"] = cfg.bwa

    # Closed even if dump() raises, so no half-written handle stays open.
    with open(filename, 'w') as file_c:
        pyyaml.dump(data, file_c,
                    default_flow_style=False, default_style='"', width=100500)
def prepare_config_corr(filename, cfg, ext_python_modules_home):
    """Rewrite the YAML config `filename` in place with values from `cfg`.

    filename -- path of the YAML config; it is loaded, updated and dumped back
    cfg -- settings object; dataset, output_dir, tmp_dir, max_threads and bwa
           are read
    ext_python_modules_home -- directory added to the module search path so
                               the bundled pyyaml2 / pyyaml3 can be imported
    """
    addsitedir(ext_python_modules_home)
    # Pick the bundled YAML package that matches the running interpreter.
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    # NOTE(review): pyyaml.load is called without an explicit safe loader;
    # that is only acceptable if this config file is trusted -- confirm.
    # The context manager closes the read handle, which was previously leaked.
    with open(filename, 'r') as config_in:
        data = pyyaml.load(config_in)

    data["dataset"] = cfg.dataset
    data["output_dir"] = cfg.output_dir
    data["work_dir"] = process_cfg.process_spaces(cfg.tmp_dir)
    #data["hard_memory_limit"] = cfg.max_memory
    data["max_nthreads"] = cfg.max_threads
    data["bwa"] = cfg.bwa

    # Closed even if dump() raises, so no half-written handle stays open.
    with open(filename, 'w') as file_c:
        pyyaml.dump(data, file_c,
                    default_flow_style=False, default_style='"', width=100500)
def main():
    """Run the pipeline from the command line in sys.argv.

    Steps, in order: parse options into cfg, attach console/file log handlers,
    split interlaced reads if present, copy the config templates into the
    output directory, then run read error correction, assembly and mismatch
    correction for whichever of the "error_correction", "assembly" and
    "mismatch_corrector" sections exist in cfg.  In continue mode, stages whose
    result files already exist are skipped.  Any exception raised by the
    stages is logged and reported through support.error.
    """
    # NOTE(review): statement nesting below was reconstructed from a
    # whitespace-flattened copy of this function -- confirm against the
    # canonical file before relying on the exact block structure.
    os.environ["LC_ALL"] = "C"

    # No arguments at all: print usage and exit successfully.
    if len(sys.argv) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    cfg, dataset_data = fill_cfg(sys.argv, log)

    # In continue mode the options are re-read from the params.txt written by
    # the previous run and cfg is rebuilt from them.
    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"))
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning.")
        cfg, dataset_data = fill_cfg(options, log)
        options_storage.continue_mode = True

    # Append to the existing log when continuing, otherwise start a new one.
    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
    else:
        # params.txt is produced by temporarily attaching a second file
        # handler, so the command line and used values land in both files.
        params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
        params_handler = logging.FileHandler(params_filename, mode='w')
        log.addHandler(params_handler)

        command = "Command line:"
        for v in sys.argv:
            command += " " + v
        log.info(command)

        print_used_values(cfg, log)
        log.removeHandler(params_handler)

        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads if needed
    if support.dataset_has_interlaced_reads(dataset_data):
        dir_for_split_reads = os.path.join(os.path.abspath(options_storage.output_dir), 'split_reads')
        if not os.path.isdir(dir_for_split_reads):
            os.makedirs(dir_for_split_reads)
        dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        # NOTE(review): the handle opened here is never closed explicitly.
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = os.path.abspath(options_storage.dataset_yaml_filename)

    try:
        # copying configs before all computations (to prevent its changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            shutil.copytree(os.path.join(spades_home, "configs"), tmp_configs_dir)

        # ---- stage 1: read error correction ----
        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode:
                log.info("\n===== Skipping read error correction (already processed). \n")
            else:
                # From the first stage that actually runs, every later stage
                # is recomputed as well.
                options_storage.continue_mode = False # continue from here

                # Reset HEAPCHECK so only this stage's setting is in effect.
                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                # Start from an empty output directory.
                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)
                if not os.path.exists(bh_cfg.tmp_dir):
                    os.makedirs(bh_cfg.tmp_dir)

                log.info("\n===== Read error correction started. \n")
                bh_logic.run_bh(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg,
                                ext_python_modules_home, log)
                log.info("\n===== Read error correction finished. \n")

        # ---- stage 2: assembly ----
        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
            spades_cfg.__dict__["additional_contigs"] = os.path.join(spades_cfg.output_dir, "simplified_contigs.fasta")

            # Assembly counts as done when its contigs exist either in the
            # output directory or (with mismatch correction enabled) in misc.
            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg and
                                                      os.path.isfile(assembled_contigs_filename))):
                log.info("\n===== Skipping assembling (already processed). \n")
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from the beginning.")
            else:
                # Prefer the error-corrected dataset when it exists.
                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename))
                if support.dataset_has_paired_reads(dataset_data):
                    spades_cfg.__dict__["paired_mode"] = True
                else:
                    spades_cfg.__dict__["paired_mode"] = False

                if options_storage.rectangles:
                    spades_cfg.__dict__["resolving_mode"] = "rectangles"

                # Reset HEAPCHECK so only this stage's setting is in effect.
                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== Assembling started.\n")

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                    if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(process_cfg.process_spaces(os.path.abspath(cfg["dataset"].reference)) + '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, log)

                #rectangles
                if spades_cfg.paired_mode and options_storage.rectangles:
                    if options_storage.continue_mode: # TODO: continue mode
                        support.warning("sorry, --continue doesn't work with --rectangles yet. Skipping repeat resolving.")
                    else:
                        sys.path.append(os.path.join(python_modules_home, "rectangles"))
                        import rrr

                        rrr_input_dir = os.path.join(latest_dir, "saves")
                        rrr_outpath = os.path.join(spades_cfg.output_dir, "rectangles")
                        if not os.path.exists(rrr_outpath):
                            os.mkdir(rrr_outpath)

                        rrr_reference_information_file = os.path.join(rrr_input_dir,
                                                                      "late_pair_info_counted_etalon_distance.txt")
                        rrr_test_util = rrr.TestUtils(rrr_reference_information_file,
                                                      os.path.join(rrr_outpath, "rectangles.log"))
                        rrr.resolve(rrr_input_dir, rrr_outpath, rrr_test_util, "", cfg["dataset"].single_cell, spades_cfg.careful)

                        # The rectangles output replaces the assembler's
                        # contigs and scaffolds.
                        shutil.copyfile(os.path.join(rrr_outpath, "rectangles_extend_before_scaffold.fasta"), spades_cfg.result_contigs)
                        shutil.copyfile(os.path.join(rrr_outpath, "rectangles_extend.fasta"), spades_cfg.result_scaffolds)

                        if not spades_cfg.developer_mode:
                            if os.path.exists(rrr_input_dir):
                                shutil.rmtree(rrr_input_dir)
                            if os.path.exists(rrr_outpath):
                                shutil.rmtree(rrr_outpath, True)
                            # Falls back to a shell "rm -r" if the directory
                            # still exists after rmtree (errors ignored above).
                            if os.path.exists(rrr_outpath):
                                os.system('rm -r ' + rrr_outpath)
                                #EOR

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)
                    if os.path.isfile(spades_cfg.additional_contigs):
                        shutil.move(spades_cfg.additional_contigs, misc_dir)

                log.info("\n===== Assembling finished. \n")

        # ---- stage 3: mismatch correction ----
        #corrector
        if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename) or
                                            (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
            # Maps a label to (final result path, pre-correction copy in misc).
            to_correct = dict()
            to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
            if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                             os.path.isfile(assembled_scaffolds_filename)):
                to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

            # moving assembled contigs (scaffolds) to misc dir
            for k, (old, new) in to_correct.items():
                if options_storage.continue_mode and os.path.isfile(new):
                    continue
                shutil.move(old, new)

            if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)):
                log.info("\n===== Skipping mismatch correction (already processed). \n")
            else:
                log.info("\n===== Mismatch correction started.")

                # detecting paired-end library with the largest insert size
                dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r')) ### initial dataset, i.e. before error correction
                dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename))
                paired_end_libraries_ids = []
                for id, reads_library in enumerate(dataset_data):
                    if reads_library['type'] == 'paired-end':
                        paired_end_libraries_ids.append(id)
                if not len(paired_end_libraries_ids):
                    support.error('Mismatch correction cannot be performed without at least one paired-end library!')
                # Insert sizes are read from the "_est_params.info" file in the
                # latest K directory.
                estimated_params = load_config_from_file(os.path.join(latest_dir, "_est_params.info"))
                max_insert_size = -1
                target_paired_end_library_id = -1
                for id in paired_end_libraries_ids:
                    if float(estimated_params.__dict__["insert_size_" + str(id)]) > max_insert_size:
                        max_insert_size = float(estimated_params.__dict__["insert_size_" + str(id)])
                        target_paired_end_library_id = id
                yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename)
                # Keys "1", "2" and "insert-size" become corrector options
                # (see the argument-building loop below).
                cfg["mismatch_corrector"].__dict__["1"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                                                                  dataset_data[target_paired_end_library_id]['left reads']))
                cfg["mismatch_corrector"].__dict__["2"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                                                                  dataset_data[target_paired_end_library_id]['right reads']))
                cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_insert_size)
                #TODO: add reads orientation

                import corrector
                corrector_cfg = cfg["mismatch_corrector"]
                # Turn every config field into command-line style arguments:
                # one-character keys become "-k", longer keys "--key"; a falsy
                # value yields the bare option with no argument.
                args = []
                for key, values in corrector_cfg.__dict__.items():
                    if key == "output-dir":
                        continue

                    # for processing list of reads
                    if not isinstance(values, list):
                        values = [values]
                    for value in values:
                        if len(key) == 1:
                            args.append('-' + key)
                        else:
                            args.append('--' + key)
                        if value:
                            args.append(value)

                # processing contigs and scaffolds (or only contigs)
                for k, (corrected, assembled) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(corrected):
                        log.info("\n== Skipping processing of " + k + " (already processed)\n")
                        continue

                    options_storage.continue_mode = False
                    log.info("\n== Processing of " + k + "\n")

                    cur_args = args[:]
                    cur_args += ['-c', assembled]
                    tmp_dir_for_corrector = os.path.join(corrector_cfg.__dict__["output-dir"], "mismatch_corrector_" + k)
                    cur_args += ['--output-dir', tmp_dir_for_corrector]

                    # correcting
                    corrector.main(cur_args, ext_python_modules_home, log)

                    result_corrected_filename = os.path.abspath(os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta"))
                    # moving corrected contigs (scaffolds) to SPAdes output dir
                    if os.path.isfile(result_corrected_filename):
                        shutil.move(result_corrected_filename, corrected)

                    if os.path.isdir(tmp_dir_for_corrector):
                        shutil.rmtree(tmp_dir_for_corrector)

                log.info("\n===== Mismatch correction finished.\n")

        # The copied configs are kept only in developer mode.
        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        #log.info("")
        if os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + os.path.dirname(corrected_dataset_yaml_filename) + "/")
        if os.path.isfile(result_contigs_filename):
            log.info(" * Assembled contigs are in " + result_contigs_filename)
        if os.path.isfile(result_scaffolds_filename):
            log.info(" * Assembled scaffolds are in " + result_scaffolds_filename)
        #log.info("")

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            threshold = 3
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                support.break_scaffolds(result_scaffolds_filename, threshold, result_broken_scaffolds)
            #log.info(" * Scaffolds broken by " + str(threshold) + " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        # sys.exc_info() is used rather than "except ... as" syntax.
        _, exc, _ = sys.exc_info()
        log.exception(exc)
        support.error("exception caught", log)
def main(args):
    """Run the SPAdes pipeline for the given command line.

    The function parses the options into ``cfg``, attaches console/file log
    handlers, rewrites ``params.txt``, pre-processes the input dataset and
    then runs, in order, every stage present in ``cfg``: read error
    correction, assembling, TruSeq postprocessing and mismatch correction.
    Stages whose results already exist are skipped in continue mode, and
    ``options_storage.stop_after`` / ``options_storage.restart_from`` cut the
    run short or resume it.

    :param args: full command line; ``args[0]`` is forwarded to
        ``get_options_from_params`` and a one-element list only shows usage.
    """
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        show_usage(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    options = args
    cfg, dataset_data = fill_cfg(options, log)

    if options_storage.continue_mode:
        # the options of the previous run are restored from its params.txt
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
        cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
        if options_storage.restart_from:
            check_cfg_for_partial_run(cfg, type='restart-from')
        # set again after the second fill_cfg call
        options_storage.continue_mode = True
    if options_storage.stop_after:
        check_cfg_for_partial_run(cfg, type='stop-after')

    # spades.log is appended to when continuing and truncated otherwise
    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            # everything except '-o <dir>' and '--restart-from <cp>' is
            # appended to the restored command line
            updated_params = ""
            skip_next = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    skip_next = True
                    continue
                # you can specify '--restart-from=k33' but not '-o=out_dir'
                if skip_next or v.startswith('--restart-from='):
                    skip_next = False
                    continue
                updated_params += " " + v
            updated_params = updated_params.strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    # params.txt is produced by temporarily attaching a second file handler,
    # so it receives exactly what is logged until the handler is removed
    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        command = "Command line: "
        for v in args:
            # substituting relative paths with absolute ones (read paths, output dir path, etc)
            v, prefix = support.get_option_prefix(v)
            if v in options_storage.dict_of_rel2abs.keys():
                v = options_storage.dict_of_rel2abs[v]
            if prefix:
                command += prefix + ":"
            command += v + " "
        log.info(command)

    # special case
    # if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
    #     support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
    #     del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    support.check_single_reads_in_options(options, log)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data)\
            or support.dataset_has_nxmate_reads(dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
        if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_nxmate_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            if support.dataset_has_interlaced_reads(dataset_data):
                dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
            if support.dataset_has_nxmate_reads(dataset_data):
                dataset_data = support.process_nxmate_reads(dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        # the handle is closed explicitly so the YAML is flushed before later stages read it
        with open(options_storage.dataset_yaml_filename, 'w') as input_yaml:
            pyyaml.dump(dataset_data, input_yaml)
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent its changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            if options_storage.configs_dir:
                dir_util.copy_tree(options_storage.configs_dir, tmp_configs_dir, preserve_times=False)
            else:
                dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)

        # ---- stage: read error correction ----
        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                    and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                # the stage always starts from an empty output directory
                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                # libraries of LONG_READS_TYPES are removed from the dataset
                # given to the corrector and handed over separately
                if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
                    with open(to_correct_dataset_yaml_filename, 'w') as to_correct_yaml:
                        pyyaml.dump(to_correct_dataset_data, to_correct_yaml)
                    bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)

                hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg,
                                        not_used_dataset_data, ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)
            if options_storage.stop_after == 'ec':
                support.finish_here(log)

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, "assembly_graph.fastg")
        truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
        truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")

        # ---- stage: assembling ----
        if "assembly" in cfg and not options_storage.run_completed:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
            spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg
                                                      and os.path.isfile(assembled_contigs_filename)))\
                    and not options_storage.restart_from == 'as' \
                    and not options_storage.restart_from == 'scc' \
                    and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):

                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
            else:
                old_result_files = [result_contigs_filename, result_scaffolds_filename,
                                    assembled_contigs_filename, assembled_scaffolds_filename]
                for old_result_file in old_result_files:
                    if os.path.isfile(old_result_file):
                        os.remove(old_result_file)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

                if os.path.isfile(corrected_dataset_yaml_filename):
                    # NOTE(review): pyyaml.load on a file that the visible code only writes via the
                    # error correction stage; if this path can ever be user-supplied, switch to a safe loader.
                    with open(corrected_dataset_yaml_filename, 'r') as corrected_yaml:
                        dataset_data = pyyaml.load(corrected_yaml)
                    dataset_data = support.relative2abs_paths(dataset_data,
                                                              os.path.dirname(corrected_dataset_yaml_filename))
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    # 'with' guarantees the file is closed even if a write fails
                    with open(dataset_filename, 'w') as dataset_file:
                        import process_cfg
                        dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                        dataset_file.write("meta" + '\t' + process_cfg.bool_to_str(cfg["dataset"].meta) + '\n')
                        dataset_file.write("moleculo" + '\t' + process_cfg.bool_to_str(cfg["dataset"].truseq) + '\n')
                        if os.path.isfile(corrected_dataset_yaml_filename):
                            dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                        else:
                            dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                        if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                            dataset_file.write("reference_genome" + '\t')
                            dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data,
                                                     ext_python_modules_home, log)

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

                # reaching this point with a pending '--restart-from k<N>' is reported as an error
                if options_storage.continue_mode and options_storage.restart_from \
                        and options_storage.restart_from.startswith('k'):
                    k_str = options_storage.restart_from[1:]
                    if k_str.find(":") != -1:
                        k_str = k_str[:k_str.find(":")]
                    support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)
            if not options_storage.run_completed:
                if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' \
                        or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
                    support.finish_here(log)

            # ---- stage: TruSeq postprocessing ----
            if cfg["run_truseq_postprocessing"] and not options_storage.run_completed:
                if options_storage.continue_mode and os.path.isfile(truseq_long_reads_file_base + ".fastq") \
                        and not options_storage.restart_from == 'tpp':
                    log.info("\n===== Skipping %s (already processed). \n" % "TruSeq postprocessing")
                else:
                    support.continue_from_here(log)
                    if os.path.isfile(result_scaffolds_filename):
                        shutil.move(result_scaffolds_filename, assembled_scaffolds_filename)
                    reads_library = dataset_data[0]
                    alignment_bin = os.path.join(bin_home, "bwa-spades")
                    alignment_dir = os.path.join(cfg["common"].output_dir, "alignment")
                    sam_files = alignment.align_bwa(alignment_bin, assembled_scaffolds_filename, dataset_data,
                                                    alignment_dir, log, options_storage.threads)
                    moleculo_postprocessing.moleculo_postprocessing(assembled_scaffolds_filename,
                                                                    truseq_long_reads_file_base, sam_files, log)
                if options_storage.stop_after == 'tpp':
                    support.finish_here(log)

            # ---- stage: mismatch correction ----
            if "mismatch_corrector" in cfg and not options_storage.run_completed and \
                    (os.path.isfile(result_contigs_filename) or
                     (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
                STAGE_NAME = "Mismatch correction"
                # assembly type -> (final result file, uncorrected copy in misc dir)
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                                 os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for assembly_type, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    if os.path.isfile(old):
                        shutil.move(old, new)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                        (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                        and not options_storage.restart_from == 'mc':
                    log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                else:
                    if options_storage.restart_from == 'mc':
                        support.continue_from_here(log)

                    log.info("\n===== %s started." % STAGE_NAME)
                    # detecting paired-end library with the largest insert size
                    cfg["mismatch_corrector"].__dict__["dataset"] = cfg["dataset"].yaml_filename
                    #TODO: add reads orientation

                    import corrector_logic
                    # processing contigs and scaffolds (or only contigs)
                    for assembly_type, (corrected, assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(corrected):
                            log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                            continue

                        support.continue_from_here(log)
                        log.info("\n== Processing of " + assembly_type + "\n")

                        tmp_dir_for_corrector = os.path.join(cfg["common"].output_dir, "mismatch_corrector", assembly_type)

                        cfg["mismatch_corrector"].__dict__["output_dir"] = tmp_dir_for_corrector
                        # correcting
                        corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])

                        result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")

                        corrector_logic.run_corrector(tmp_configs_dir, bin_home, corr_cfg,
                                                      ext_python_modules_home, log, assembled, result_corrected_filename)

                        if os.path.isfile(result_corrected_filename):
                            shutil.copyfile(result_corrected_filename, corrected)
                        tmp_d = os.path.join(tmp_dir_for_corrector, "tmp")
                        if os.path.isdir(tmp_d) and not cfg["common"].developer_mode:
                            shutil.rmtree(tmp_d)

                    log.info("\n===== %s finished.\n" % STAGE_NAME)
                if options_storage.stop_after == 'mc':
                    support.finish_here(log)

        # the private copy of the configs is kept only in developer mode
        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        if not options_storage.run_completed:
            if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
                log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
            if "assembly" in cfg and os.path.isfile(result_contigs_filename):
                message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
                log.info(message)
            if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
                message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
                log.info(message)
            if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
                message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
                log.info(message)

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename,
                                                                     options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds, broken_scaffolds)
                    #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                    # " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            if options_storage.truseq_mode:
                if not os.path.isfile(truseq_long_reads_file):
                    support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
            else:
                for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                    if os.path.isfile(result_filename):
                        result_fasta = list(support.read_fasta(result_filename))
                        # correctness check: should be one contig of length 1000 bp
                        correct_number = 1
                        correct_length = 1000
                        if not len(result_fasta):
                            support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                        elif len(result_fasta) > correct_number:
                            support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
                                          (result_filename, correct_number, len(result_fasta)))
                        elif len(result_fasta[0][1]) != correct_length:
                            if len(result_fasta[0][1]) > correct_length:
                                relation = "more"
                            else:
                                relation = "less"
                            support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
                                          (result_filename, relation, correct_length, len(result_fasta[0][1])))
                    else:
                        support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        # sys.exc_info() is used instead of binding the exception in the except clause
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
def __init__(self, cfg, get_stage, *args):
    """Build the assembly stage and the list of its sub-stages.

    After the base-class initialisation the stage config is generated,
    ``dataset.info`` is written into the stage output directory, the misc
    directory is created and one iteration stage per value of
    ``self.cfg.iterative_K`` is appended to ``self.stages``, optionally
    followed by scaffold correction and the file-copying stages.

    :param cfg: pipeline configuration mapping; its ``"dataset"`` entry is
        read here.
    :param get_stage: stored as ``self.get_stage`` and forwarded to every
        iteration stage.
    :param args: forwarded unchanged to the base-class constructor.
    """
    super(SpadesStage, self).__init__(*args)
    self.get_stage = get_stage
    self.generate_cfg(cfg, self.output_files)

    # creating dataset
    dataset_info_path = os.path.join(self.cfg.output_dir, "dataset.info")
    with open(dataset_info_path, 'w') as info_out:
        import process_cfg
        # TODO don't exists at that moment, what better to write????
        corrected_yaml = self.output_files["corrected_dataset_yaml_filename"]
        if corrected_yaml != "":
            reads_source = corrected_yaml
        else:
            reads_source = cfg["dataset"].yaml_filename
        info_out.write("reads\t%s\n" % process_cfg.process_spaces(reads_source))

        if self.cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
            info_out.write("reference_genome\t")
            info_out.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')

    misc_dir = self.output_files["misc_dir"]
    if not os.path.isdir(misc_dir):
        os.makedirs(misc_dir)

    generateK(self.cfg, self.log, self.dataset_data)

    # trailing arguments shared by every sub-stage constructor below
    shared_stage_args = (self.output_files, self.tmp_configs_dir, self.dataset_data,
                         self.log, self.bin_home, self.ext_python_modules_home,
                         self.python_modules_home)

    self.used_K = []
    previous_k = None
    for position, K in enumerate(self.cfg.iterative_K, 1):
        is_final = position == len(self.cfg.iterative_K)

        self.stages.append(spades_iteration_stage.IterationStage(
            K, previous_k, is_final, self.get_stage, self.latest,
            "k%d" % K, *shared_stage_args))

        # each iteration's output directory is passed to the next stage as 'latest'
        self.latest = os.path.join(self.cfg.output_dir, "K%d" % K)
        self.used_K.append(K)
        previous_k = K

    if self.cfg.correct_scaffolds:
        self.stages.append(scaffold_correction_stage.ScaffoldCorrectionStage(
            self.latest, "scc", *shared_stage_args))
        self.latest = os.path.join(self.cfg.output_dir, "SCC", "K21")

    if options_storage.args.plasmid and options_storage.args.meta:
        self.stages.append(PlasmidGlueFileStage(
            self.latest, "plasmid_copy_files", *shared_stage_args))

    self.stages.append(SpadesCopyFileStage(
        self.latest, "copy_files", *shared_stage_args))
def main(args):
    """Run the SPAdes pipeline for the given command line.

    The function parses the options into ``cfg``, attaches console/file log
    handlers, rewrites ``params.txt``, pre-processes the input dataset and
    then runs, in order, every stage present in ``cfg``: read error
    correction, assembling and mismatch correction. Stages whose results
    already exist are skipped in continue mode, and
    ``options_storage.restart_from`` forces a stage to be redone.

    :param args: full command line; ``args[0]`` is forwarded to
        ``get_options_from_params`` and a one-element list only prints usage
        and exits with status 0.
    """
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    options = args
    cfg, dataset_data = fill_cfg(options, log)

    if options_storage.continue_mode:
        # the options of the previous run are restored from its params.txt
        cmd_line, options = get_options_from_params(
            os.path.join(options_storage.output_dir, "params.txt"), args[0])
        if not options:
            support.error(
                "failed to parse command line of the previous run! Please restart from the beginning or specify another output directory."
            )
        cfg, dataset_data = fill_cfg(options, log)
        if options_storage.restart_from:
            check_cfg_for_restart_from(cfg)
        # set again after the second fill_cfg call
        options_storage.continue_mode = True

    # spades.log is appended to when continuing and truncated otherwise
    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            # everything except '-o <dir>' and '--restart-from <cp>' is
            # appended to the restored command line
            updated_params = ""
            flag = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    flag = True
                    continue
                if flag:
                    flag = False
                    continue
                updated_params += " " + v
            updated_params = updated_params.strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    # params.txt is produced by temporarily attaching a second file handler,
    # so it receives exactly what is logged until the handler is removed
    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        command = " ".join(["Command line:"] + list(args))
        log.info(command)

    # special case
    # if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
    #     support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
    #     del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    support.check_single_reads_in_options(options, log)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data)\
            or support.dataset_has_nxmate_reads(dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
        if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_nxmate_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            if support.dataset_has_interlaced_reads(dataset_data):
                dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
            if support.dataset_has_nxmate_reads(dataset_data):
                dataset_data = support.process_nxmate_reads(dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        # the handle is closed explicitly so the YAML is flushed before later stages read it
        with open(options_storage.dataset_yaml_filename, 'w') as input_yaml:
            pyyaml.dump(dataset_data, input_yaml)
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent its changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            if options_storage.configs_dir:
                dir_util.copy_tree(options_storage.configs_dir, tmp_configs_dir, preserve_times=False)
            else:
                dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)

        # ---- stage: read error correction ----
        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                    and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                # the stage always starts from an empty output directory
                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                # libraries of LONG_READS_TYPES are removed from the dataset
                # given to the corrector and handed over separately
                if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
                    with open(to_correct_dataset_yaml_filename, 'w') as to_correct_yaml:
                        pyyaml.dump(to_correct_dataset_data, to_correct_yaml)
                    bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)

                hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg,
                                        not_used_dataset_data, ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")

        # ---- stage: assembling ----
        if "assembly" in cfg:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg
                                                      and os.path.isfile(assembled_contigs_filename)))\
                    and not options_storage.restart_from == 'as' \
                    and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):

                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
            else:
                old_result_files = [result_contigs_filename, result_scaffolds_filename,
                                    assembled_contigs_filename, assembled_scaffolds_filename]
                # [:-6] strips the ".fasta" suffix so the ".fastg" sibling is removed as well
                for ext in [".fasta", ".fastg"]:
                    for old_result_file in old_result_files:
                        if os.path.isfile(old_result_file[:-6] + ext):
                            os.remove(old_result_file[:-6] + ext)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

                if os.path.isfile(corrected_dataset_yaml_filename):
                    # NOTE(review): pyyaml.load on a file that the visible code only writes via the
                    # error correction stage; if this path can ever be user-supplied, switch to a safe loader.
                    with open(corrected_dataset_yaml_filename, 'r') as corrected_yaml:
                        dataset_data = pyyaml.load(corrected_yaml)
                    dataset_data = support.relative2abs_paths(dataset_data,
                                                              os.path.dirname(corrected_dataset_yaml_filename))
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    # 'with' guarantees the file is closed even if a write fails
                    with open(dataset_filename, 'w') as dataset_file:
                        import process_cfg
                        dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                        if os.path.isfile(corrected_dataset_yaml_filename):
                            dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                        else:
                            dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                        if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                            dataset_file.write("reference_genome" + '\t')
                            dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data,
                                                     ext_python_modules_home, log)

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

                # reaching this point with a pending '--restart-from k<N>' is reported as an error
                if options_storage.continue_mode and options_storage.restart_from \
                        and options_storage.restart_from.startswith('k'):
                    k_str = options_storage.restart_from[1:]
                    if k_str.find(":") != -1:
                        k_str = k_str[:k_str.find(":")]
                    support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

            # ---- stage: mismatch correction ----
            if "mismatch_corrector" in cfg and (
                    os.path.isfile(result_contigs_filename) or
                    (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
                STAGE_NAME = "Mismatch correction"
                # assembly type -> (final result file, uncorrected copy in misc dir)
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                                 os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for assembly_type, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    for ext in [".fasta", ".fastg"]:
                        if os.path.isfile(old[:-6] + ext):
                            shutil.move(old[:-6] + ext, new[:-6] + ext)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                        (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                        and not options_storage.restart_from == 'mc':
                    log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                else:
                    if options_storage.restart_from == 'mc':
                        support.continue_from_here(log)

                    log.info("\n===== %s started." % STAGE_NAME)
                    # detecting paired-end library with the largest insert size
                    cfg["mismatch_corrector"].__dict__["dataset"] = cfg["dataset"].yaml_filename
                    #TODO: add reads orientation

                    import corrector_logic
                    # processing contigs and scaffolds (or only contigs)
                    for assembly_type, (corrected, assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(corrected):
                            log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                            continue

                        support.continue_from_here(log)
                        log.info("\n== Processing of " + assembly_type + "\n")

                        tmp_dir_for_corrector = os.path.join(cfg["common"].output_dir, "mismatch_corrector", assembly_type)

                        cfg["mismatch_corrector"].__dict__["output_dir"] = tmp_dir_for_corrector
                        # correcting
                        corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])

                        result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")

                        corrector_logic.run_corrector(tmp_configs_dir, bin_home, corr_cfg,
                                                      ext_python_modules_home, log, assembled, result_corrected_filename)

                        if os.path.isfile(result_corrected_filename):
                            shutil.copyfile(result_corrected_filename, corrected)
                        tmp_d = os.path.join(tmp_dir_for_corrector, "tmp")
                        if os.path.isdir(tmp_d) and not cfg["common"].developer_mode:
                            shutil.rmtree(tmp_d)

                        # runs only when a FASTG file exists next to the uncorrected assembly
                        assembled_fastg = assembled[:-6] + ".fastg"
                        if os.path.isfile(assembled_fastg):
                            support.create_fastg_from_fasta(corrected, assembled_fastg, log)
                    log.info("\n===== %s finished.\n" % STAGE_NAME)

        # the private copy of the configs is kept only in developer mode
        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + support.process_spaces(
                os.path.dirname(corrected_dataset_yaml_filename) + "/"))
        if "assembly" in cfg and os.path.isfile(result_contigs_filename):
            message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
            if os.path.isfile(result_contigs_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_contigs_filename[:-6] + ".fastg") + ")"
            log.info(message)
        if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
            message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
            if os.path.isfile(result_scaffolds_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_scaffolds_filename[:-6] + ".fastg") + ")"
            log.info(message)

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(
                    result_scaffolds_filename, options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds, broken_scaffolds)
                    #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                    # " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                if os.path.isfile(result_filename):
                    result_fasta = list(support.read_fasta(result_filename))
                    # correctness check: should be one contig of length 1000 bp
                    correct_number = 1
                    correct_length = 1000
                    if not len(result_fasta):
                        support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                    elif len(result_fasta) > correct_number:
                        support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
                                      (result_filename, correct_number, len(result_fasta)))
                    elif len(result_fasta[0][1]) != correct_length:
                        if len(result_fasta[0][1]) > correct_length:
                            relation = "more"
                        else:
                            relation = "less"
                        support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
                                      (result_filename, relation, correct_length, len(result_fasta[0][1])))
                else:
                    support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        # sys.exc_info() is used instead of binding the exception in the except clause
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
def main(args):
    """Run the SPAdes pipeline for the given command line.

    ``args`` is the full argument vector with the program name at index 0;
    a single-element vector prints the usage message and exits with status 0.

    Visible stages, in order: configuration parsing (optionally restored
    from ``params.txt`` in continue mode), logging setup, input
    preprocessing (interlaced reads / additional contigs), read error
    correction, assembling, mismatch correction, scaffold breaking and the
    final summary.  Any exception raised inside the stages is logged and
    reported through ``support.error``.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source -- confirm against the upstream file before merging.
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    cfg, dataset_data = fill_cfg(args, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"))
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
        cfg, dataset_data = fill_cfg(options, log)
        if options_storage.restart_from:
            check_cfg_for_restart_from(cfg)
        # the flag is re-asserted after the second fill_cfg() call
        options_storage.continue_mode = True

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            # collect every argument except '-o' / '--restart-from' and the
            # value that follows each of them
            kept_params = []
            skip_next = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    skip_next = True
                    continue
                if skip_next:
                    skip_next = False
                    continue
                kept_params.append(v)
            updated_params = " ".join(kept_params).strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    # everything logged while params_handler is attached also goes to params.txt
    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        log.info(" ".join(["Command line:"] + list(args)))

    # special case
    if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
        support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
        del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
        if support.dataset_has_interlaced_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        # try/finally (not "with") keeps the handle closed without requiring newer syntax
        yaml_file = open(options_storage.dataset_yaml_filename, 'w')
        try:
            pyyaml.dump(dataset_data, yaml_file)
        finally:
            yaml_file.close()
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent its changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                    and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                # long-read libraries are held back from correction and handed
                # to run_hammer() separately
                if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
                    yaml_file = open(to_correct_dataset_yaml_filename, 'w')
                    try:
                        pyyaml.dump(to_correct_dataset_data, yaml_file)
                    finally:
                        yaml_file.close()
                    bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)

                hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg,
                                        not_used_dataset_data, ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg and
                                                      os.path.isfile(assembled_contigs_filename))) \
                    and not options_storage.restart_from == 'as' \
                    and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):

                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
            else:
                # remove results of a previous run; [:-6] strips the ".fasta" suffix
                old_result_files = [result_contigs_filename, result_scaffolds_filename,
                                    assembled_contigs_filename, assembled_scaffolds_filename]
                for ext in [".fasta", ".fastg"]:
                    for old_result_file in old_result_files:
                        if os.path.isfile(old_result_file[:-6] + ext):
                            os.remove(old_result_file[:-6] + ext)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

                if os.path.isfile(corrected_dataset_yaml_filename):
                    yaml_file = open(corrected_dataset_yaml_filename, 'r')
                    try:
                        dataset_data = pyyaml.load(yaml_file)
                    finally:
                        yaml_file.close()
                    dataset_data = support.relative2abs_paths(dataset_data,
                                                              os.path.dirname(corrected_dataset_yaml_filename))
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    import process_cfg
                    dataset_file = open(dataset_filename, 'w')
                    try:
                        dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                        if os.path.isfile(corrected_dataset_yaml_filename):
                            dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                        else:
                            dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                        if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                            dataset_file.write("reference_genome" + '\t')
                            dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
                    finally:
                        dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data,
                                                     ext_python_modules_home, log)

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

                # still in continue mode after run_spades(): the requested K was never reached
                if options_storage.continue_mode and options_storage.restart_from \
                        and options_storage.restart_from.startswith('k'):
                    k_str = options_storage.restart_from[1:]
                    if k_str.find(":") != -1:
                        k_str = k_str[:k_str.find(":")]
                    support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

            #corrector
            if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename) or
                                                (options_storage.continue_mode and
                                                 os.path.isfile(assembled_contigs_filename))):
                STAGE_NAME = "Mismatch correction"
                # assembly type -> (final result file, copy of the uncorrected assembly in misc dir)
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                                 os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for assembly_type, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    for ext in [".fasta", ".fastg"]:
                        if os.path.isfile(old[:-6] + ext):
                            shutil.move(old[:-6] + ext, new[:-6] + ext)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                        (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                        and not options_storage.restart_from == 'mc':
                    log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                else:
                    if options_storage.restart_from == 'mc':
                        support.continue_from_here(log)

                    log.info("\n===== %s started." % STAGE_NAME)
                    # detecting paired-end library with the largest insert size
                    lib_data_file = open(os.path.join(latest_dir, "final.lib_data"), 'r')
                    try:
                        est_params_data = pyyaml.load(lib_data_file)
                    finally:
                        lib_data_file.close()
                    max_IS_library = None
                    for reads_library in est_params_data:
                        if reads_library['type'] == 'paired-end':
                            if not max_IS_library or float(reads_library["insert size mean"]) > float(max_IS_library["insert size mean"]):
                                max_IS_library = reads_library
                    if not max_IS_library:
                        support.error('Mismatch correction cannot be performed without at least one paired-end library!', log)
                    if not max_IS_library["insert size mean"]:
                        support.warning('Failed to estimate insert size for all paired-end libraries. Starting Mismatch correction'
                                        ' based on the first paired-end library and with default insert size.', log)
                    else:
                        cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_IS_library["insert size mean"])
                    yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename)
                    cfg["mismatch_corrector"].__dict__["1"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                                                                       max_IS_library['left reads']))
                    cfg["mismatch_corrector"].__dict__["2"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                                                                       max_IS_library['right reads']))
                    #TODO: add reads orientation

                    import corrector
                    corrector_cfg = cfg["mismatch_corrector"]
                    # turn every config attribute into a command-line option; a
                    # dedicated name is used so the ``args`` parameter is not clobbered
                    corrector_args = []
                    for key, values in corrector_cfg.__dict__.items():
                        if key == "output-dir":
                            continue

                        # for processing list of reads
                        if not isinstance(values, list):
                            values = [values]
                        for value in values:
                            if len(key) == 1:
                                corrector_args.append('-' + key)
                            else:
                                corrector_args.append('--' + key)
                            if value is not None:
                                corrector_args.append(value)

                    # processing contigs and scaffolds (or only contigs)
                    for assembly_type, (corrected, assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(corrected):
                            log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                            continue

                        support.continue_from_here(log)
                        log.info("\n== Processing of " + assembly_type + "\n")

                        cur_args = corrector_args[:]
                        cur_args += ['-c', assembled]
                        tmp_dir_for_corrector = support.get_tmp_dir(prefix="mis_cor_%s_" % assembly_type)
                        cur_args += ['--output-dir', tmp_dir_for_corrector]

                        # correcting
                        corrector.main(cur_args, ext_python_modules_home, log)

                        result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
                        # moving corrected contigs (scaffolds) to SPAdes output dir
                        if os.path.isfile(result_corrected_filename):
                            shutil.move(result_corrected_filename, corrected)

                        if os.path.isdir(tmp_dir_for_corrector):
                            shutil.rmtree(tmp_dir_for_corrector)

                        assembled_fastg = assembled[:-6] + ".fastg"
                        if os.path.isfile(assembled_fastg):
                            support.create_fastg_from_fasta(corrected, assembled_fastg, log)
                    log.info("\n===== %s finished.\n" % STAGE_NAME)

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        #log.info("")
        if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
        if "assembly" in cfg and os.path.isfile(result_contigs_filename):
            message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
            if os.path.isfile(result_contigs_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_contigs_filename[:-6] + ".fastg") + ")"
            log.info(message)
        if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
            message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
            if os.path.isfile(result_scaffolds_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_scaffolds_filename[:-6] + ".fastg") + ")"
            log.info(message)
        #log.info("")

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename,
                                                                     options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds, broken_scaffolds)
                    #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                    # " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                if os.path.isfile(result_filename):
                    result_fasta = list(support.read_fasta(result_filename))
                    # correctness check: should be one contig of length 1000 bp
                    correct_number = 1
                    correct_length = 1000
                    if not len(result_fasta):
                        support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                    elif len(result_fasta) > correct_number:
                        support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
                                      (result_filename, correct_number, len(result_fasta)))
                    elif len(result_fasta[0][1]) != correct_length:
                        if len(result_fasta[0][1]) > correct_length:
                            relation = "more"
                        else:
                            relation = "less"
                        support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
                                      (result_filename, relation, correct_length, len(result_fasta[0][1])))
                else:
                    support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
def main():
    """Run the SPAdes pipeline using the command line in ``sys.argv``.

    With no arguments besides the program name, prints the usage message and
    exits with status 0.  Otherwise the visible stages are, in order:
    configuration parsing (optionally restored from ``params.txt`` in
    continue mode), logging setup, splitting of interlaced reads, read error
    correction, assembling (with optional "rectangles" repeat resolving),
    mismatch correction, scaffold breaking and the final summary.  Any
    exception raised inside the stages is logged and reported through
    ``support.error``.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source -- confirm against the upstream file before merging.
    os.environ["LC_ALL"] = "C"

    if len(sys.argv) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    cfg, dataset_data = fill_cfg(sys.argv, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"))
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning.")
        cfg, dataset_data = fill_cfg(options, log)
        # the flag is re-asserted after the second fill_cfg() call
        options_storage.continue_mode = True

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
    else:
        # everything logged while params_handler is attached also goes to params.txt
        params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
        params_handler = logging.FileHandler(params_filename, mode='w')
        log.addHandler(params_handler)

        log.info(" ".join(["Command line:"] + list(sys.argv)))

        print_used_values(cfg, log)
        log.removeHandler(params_handler)

        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads if needed
    if support.dataset_has_interlaced_reads(dataset_data):
        dir_for_split_reads = os.path.join(os.path.abspath(options_storage.output_dir), 'split_reads')
        if not os.path.isdir(dir_for_split_reads):
            os.makedirs(dir_for_split_reads)
        dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        # try/finally (not "with") keeps the handle closed without requiring newer syntax
        yaml_file = open(options_storage.dataset_yaml_filename, 'w')
        try:
            pyyaml.dump(dataset_data, yaml_file)
        finally:
            yaml_file.close()
        cfg["dataset"].yaml_filename = os.path.abspath(options_storage.dataset_yaml_filename)

    try:
        # copying configs before all computations (to prevent its changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            shutil.copytree(os.path.join(spades_home, "configs"), tmp_configs_dir)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode:
                log.info("\n===== Skipping read error correction (already processed). \n")
            else:
                options_storage.continue_mode = False  # continue from here

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)
                if not os.path.exists(bh_cfg.tmp_dir):
                    os.makedirs(bh_cfg.tmp_dir)

                log.info("\n===== Read error correction started. \n")
                bh_logic.run_bh(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg,
                                ext_python_modules_home, log)
                log.info("\n===== Read error correction finished. \n")

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
            spades_cfg.__dict__["additional_contigs"] = os.path.join(spades_cfg.output_dir, "simplified_contigs.fasta")

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg and
                                                      os.path.isfile(assembled_contigs_filename))):
                log.info("\n===== Skipping assembling (already processed). \n")
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from the beginning.")
            else:
                if os.path.isfile(corrected_dataset_yaml_filename):
                    yaml_file = open(corrected_dataset_yaml_filename, 'r')
                    try:
                        dataset_data = pyyaml.load(yaml_file)
                    finally:
                        yaml_file.close()
                    dataset_data = support.relative2abs_paths(dataset_data,
                                                              os.path.dirname(corrected_dataset_yaml_filename))
                if support.dataset_has_paired_reads(dataset_data):
                    spades_cfg.__dict__["paired_mode"] = True
                else:
                    spades_cfg.__dict__["paired_mode"] = False

                if options_storage.rectangles:
                    spades_cfg.__dict__["resolving_mode"] = "rectangles"

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== Assembling started.\n")

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    import process_cfg
                    dataset_file = open(dataset_filename, 'w')
                    try:
                        dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                        if os.path.isfile(corrected_dataset_yaml_filename):
                            dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                        else:
                            dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                        if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                            dataset_file.write("reference_genome" + '\t')
                            dataset_file.write(process_cfg.process_spaces(os.path.abspath(cfg["dataset"].reference)) + '\n')
                    finally:
                        dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, log)

                #rectangles
                if spades_cfg.paired_mode and options_storage.rectangles:
                    if options_storage.continue_mode:  # TODO: continue mode
                        support.warning("sorry, --continue doesn't work with --rectangles yet. Skipping repeat resolving.")
                    else:
                        sys.path.append(os.path.join(python_modules_home, "rectangles"))
                        import rrr

                        rrr_input_dir = os.path.join(latest_dir, "saves")
                        rrr_outpath = os.path.join(spades_cfg.output_dir, "rectangles")
                        if not os.path.exists(rrr_outpath):
                            os.mkdir(rrr_outpath)

                        rrr_reference_information_file = os.path.join(rrr_input_dir,
                                                                      "late_pair_info_counted_etalon_distance.txt")
                        rrr_test_util = rrr.TestUtils(rrr_reference_information_file,
                                                      os.path.join(rrr_outpath, "rectangles.log"))
                        rrr.resolve(rrr_input_dir, rrr_outpath, rrr_test_util, "", cfg["dataset"].single_cell,
                                    spades_cfg.careful)

                        shutil.copyfile(os.path.join(rrr_outpath, "rectangles_extend_before_scaffold.fasta"),
                                        spades_cfg.result_contigs)
                        shutil.copyfile(os.path.join(rrr_outpath, "rectangles_extend.fasta"),
                                        spades_cfg.result_scaffolds)

                        if not spades_cfg.developer_mode:
                            if os.path.exists(rrr_input_dir):
                                shutil.rmtree(rrr_input_dir)
                            if os.path.exists(rrr_outpath):
                                shutil.rmtree(rrr_outpath, True)
                            # fallback if rmtree (errors ignored) left the directory behind
                            # NOTE(review): the path is interpolated into a shell
                            # command unquoted; it breaks on paths with spaces
                            if os.path.exists(rrr_outpath):
                                os.system('rm -r ' + rrr_outpath)
                #EOR

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)
                    if os.path.isfile(spades_cfg.additional_contigs):
                        shutil.move(spades_cfg.additional_contigs, misc_dir)

                log.info("\n===== Assembling finished. \n")

            #corrector
            if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename) or
                                                (options_storage.continue_mode and
                                                 os.path.isfile(assembled_contigs_filename))):
                # assembly type -> (final result file, copy of the uncorrected assembly in misc dir)
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                                 os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for k, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    shutil.move(old, new)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                        (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)):
                    log.info("\n===== Skipping mismatch correction (already processed). \n")
                else:
                    log.info("\n===== Mismatch correction started.")

                    # detecting paired-end library with the largest insert size
                    ### initial dataset, i.e. before error correction
                    yaml_file = open(options_storage.dataset_yaml_filename, 'r')
                    try:
                        dataset_data = pyyaml.load(yaml_file)
                    finally:
                        yaml_file.close()
                    dataset_data = support.relative2abs_paths(dataset_data,
                                                              os.path.dirname(options_storage.dataset_yaml_filename))
                    paired_end_libraries_ids = []
                    for lib_id, reads_library in enumerate(dataset_data):
                        if reads_library['type'] == 'paired-end':
                            paired_end_libraries_ids.append(lib_id)
                    if not len(paired_end_libraries_ids):
                        support.error('Mismatch correction cannot be performed without at least one paired-end library!')
                    estimated_params = load_config_from_file(os.path.join(latest_dir, "_est_params.info"))
                    max_insert_size = -1
                    target_paired_end_library_id = -1
                    for lib_id in paired_end_libraries_ids:
                        insert_size = float(estimated_params.__dict__["insert_size_" + str(lib_id)])
                        if insert_size > max_insert_size:
                            max_insert_size = insert_size
                            target_paired_end_library_id = lib_id
                    yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename)
                    cfg["mismatch_corrector"].__dict__["1"] = list(
                        map(lambda x: os.path.join(yaml_dirname, x),
                            dataset_data[target_paired_end_library_id]['left reads']))
                    cfg["mismatch_corrector"].__dict__["2"] = list(
                        map(lambda x: os.path.join(yaml_dirname, x),
                            dataset_data[target_paired_end_library_id]['right reads']))
                    cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_insert_size)
                    #TODO: add reads orientation

                    import corrector
                    corrector_cfg = cfg["mismatch_corrector"]
                    # turn every config attribute into a command-line option
                    args = []
                    for key, values in corrector_cfg.__dict__.items():
                        if key == "output-dir":
                            continue

                        # for processing list of reads
                        if not isinstance(values, list):
                            values = [values]
                        for value in values:
                            if len(key) == 1:
                                args.append('-' + key)
                            else:
                                args.append('--' + key)
                            # NOTE(review): falsy values (0, "") are emitted as bare flags here
                            if value:
                                args.append(value)

                    # processing contigs and scaffolds (or only contigs)
                    for k, (corrected, assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(corrected):
                            log.info("\n== Skipping processing of " + k + " (already processed)\n")
                            continue

                        options_storage.continue_mode = False
                        log.info("\n== Processing of " + k + "\n")

                        cur_args = args[:]
                        cur_args += ['-c', assembled]
                        tmp_dir_for_corrector = os.path.join(corrector_cfg.__dict__["output-dir"],
                                                             "mismatch_corrector_" + k)
                        cur_args += ['--output-dir', tmp_dir_for_corrector]

                        # correcting
                        corrector.main(cur_args, ext_python_modules_home, log)

                        result_corrected_filename = os.path.abspath(
                            os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta"))
                        # moving corrected contigs (scaffolds) to SPAdes output dir
                        if os.path.isfile(result_corrected_filename):
                            shutil.move(result_corrected_filename, corrected)

                        if os.path.isdir(tmp_dir_for_corrector):
                            shutil.rmtree(tmp_dir_for_corrector)

                    log.info("\n===== Mismatch correction finished.\n")

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        #log.info("")
        if os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + os.path.dirname(corrected_dataset_yaml_filename) + "/")
        if os.path.isfile(result_contigs_filename):
            log.info(" * Assembled contigs are in " + result_contigs_filename)
        if os.path.isfile(result_scaffolds_filename):
            log.info(" * Assembled scaffolds are in " + result_scaffolds_filename)
        #log.info("")

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            threshold = 3
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                support.break_scaffolds(result_scaffolds_filename, threshold, result_broken_scaffolds)
                #log.info(" * Scaffolds broken by " + str(threshold) + " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        _, exc, _ = sys.exc_info()
        log.exception(exc)
        support.error("exception caught", log)