def run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, K): data_dir = os.path.join(cfg.output_dir, "SCC", "K%d" % K) saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") cfg_file_name = os.path.join(dst_configs, "config.info") if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False) log.info("\n== Running scaffold correction \n") scaffolds_file = os.path.join(latest, "scaffolds.fasta") if not os.path.isfile(scaffolds_file): support.error("Scaffodls were not found in " + scaffolds_file, log) if "read_buffer_size" in cfg.__dict__: construction_cfg_file_name = os.path.join(dst_configs, "construction.info") process_cfg.substitute_params( construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log) process_cfg.substitute_params( os.path.join(dst_configs, "moleculo_mode.info"), {"scaffolds_file": scaffolds_file}, log) prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, K) command = [ os.path.join(execution_home, "spades-truseq-scfcorrection"), cfg_file_name ] add_configs(command, dst_configs) log.info(str(command)) support.sys_call(command, log)
def run_scaffold_correction(configs_dir, execution_home, cfg, log, K): data_dir = os.path.join(cfg.output_dir, "SCC") saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") cfg_file_name = os.path.join(dst_configs, "config.info") if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False) # removing template configs for root, dirs, files in os.walk(dst_configs): for cfg_file in files: cfg_file = os.path.join(root, cfg_file) if cfg_file.endswith('.info.template'): if os.path.isfile(cfg_file.split('.template')[0]): os.remove(cfg_file) else: os.rename(cfg_file, cfg_file.split('.template')[0]) log.info("\n== Running scaffold correction \n") latest = os.path.join(cfg.output_dir, "K%d" % K) scaffolds_file = os.path.join(latest, "scaffolds.fasta") if not os.path.isfile(scaffolds_file): support.error("Scaffodls were not found in " + scaffolds_file, log) if "read_buffer_size" in cfg.__dict__: construction_cfg_file_name = os.path.join(dst_configs, "construction.info") process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log) prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, scaffolds_file) command = [os.path.join(execution_home, "scaffold_correction"), cfg_file_name] log.info(str(command)) support.sys_call(command, log)
def get_options_from_params(params_filename, spades_py_name=None): if not os.path.isfile(params_filename): return None, None params = open(params_filename, 'r') cmd_line = params.readline().strip() spades_prev_version = None for line in params: if line.find('rnaSPAdes version:') != -1: spades_prev_version = line.split('rnaSPAdes version:')[1] break params.close() if spades_prev_version is None: support.error("failed to parse rnaSPAdes version of the previous run! " "Please restart from the beginning or specify another output directory.") if spades_prev_version.strip() != spades_version.strip(): support.error("rnaSPAdes version of the previous run (%s) is not equal to the current version of rnaSPAdes (%s)! " "Please restart from the beginning or specify another output directory." % (spades_prev_version.strip(), spades_version.strip())) if spades_py_name is None or cmd_line.find(os.path.basename(spades_py_name)) == -1: spades_py_name = 'spades.py' # try default name else: spades_py_name = os.path.basename(spades_py_name) spades_py_pos = cmd_line.find(spades_py_name) if spades_py_pos == -1: return None, None return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split()
def run_corrector(configs_dir, execution_home, cfg, ext_python_modules_home, log, to_correct, result): addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): import pyyaml2 as pyyaml elif sys.version.startswith('3.'): import pyyaml3 as pyyaml dst_configs = os.path.join(cfg.output_dir, "configs") if os.path.exists(dst_configs): shutil.rmtree(dst_configs) dir_util.copy_tree(os.path.join(configs_dir, "corrector"), dst_configs, preserve_times=False) cfg_file_name = os.path.join(dst_configs, "corrector.info") cfg.tmp_dir = support.get_tmp_dir(prefix="corrector_") prepare_config_corr(cfg_file_name, cfg, ext_python_modules_home) binary_name = "corrector" command = [os.path.join(execution_home, binary_name), os.path.abspath(cfg_file_name), os.path.abspath(to_correct)] log.info("\n== Running contig polishing tool: " + ' '.join(command) + "\n") log.info("\n== Dataset description file was created: " + cfg_file_name + "\n") support.sys_call(command, log) if not os.path.isfile(result): support.error("Mismatch correction finished abnormally: " + result + " not found!") if os.path.isdir(cfg.tmp_dir): shutil.rmtree(cfg.tmp_dir)
def parse_args(args, log): options, cfg, dataset_data = options_parser.parse_args(log, bin_home, spades_home, secondary_filling=False, restart_from=False) command_line = "" if options_storage.args.continue_mode: restart_from = options_storage.args.restart_from command_line, options, script_name, err_msg = get_options_from_params( os.path.join(options_storage.args.output_dir, "params.txt"), args[0]) if err_msg: support.error(err_msg + " Please restart from the beginning or specify another output directory.") options, cfg, dataset_data = options_parser.parse_args(log, bin_home, spades_home, secondary_filling=True, restart_from=(options_storage.args.restart_from is not None), options=options) options_storage.args.continue_mode = True options_storage.args.restart_from = restart_from if options_storage.args.restart_from: check_cfg_for_partial_run(cfg, partial_run_type="restart-from") if options_storage.args.stop_after: check_cfg_for_partial_run(cfg, partial_run_type="stop-after") support.check_single_reads_in_options(log) return cfg, dataset_data, command_line
def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, log): log.info("\n== Compressing corrected reads (with gzip)") to_compress = [] for reads_library in dataset_data: for key, value in reads_library.items(): if key.endswith('reads'): compressed_reads_filenames = [] for reads_file in value: if not os.path.isfile(reads_file): support.error('something went wrong and file with corrected reads (' + reads_file + ') is missing!', log) to_compress.append(reads_file) compressed_reads_filenames.append(reads_file + ".gz") reads_library[key] = compressed_reads_filenames if len(to_compress): pigz_path = support.which('pigz') if pigz_path: for reads_file in to_compress: support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log) else: addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): from joblib2 import Parallel, delayed elif sys.version.startswith('3.'): from joblib3 import Parallel, delayed n_jobs = min(len(to_compress), max_threads) outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress) for output in outputs: if output: log.info(output)
def get_options_from_params(params_filename, spades_py_name=None): if not os.path.isfile(params_filename): return None, None params = open(params_filename, 'r') cmd_line = params.readline().strip() spades_prev_version = None for line in params: if line.find('SPAdes version:') != -1: spades_prev_version = line.split('SPAdes version:')[1] break params.close() if spades_prev_version is None: support.error( "failed to parse SPAdes version of the previous run! " "Please restart from the beginning or specify another output directory." ) if spades_prev_version.strip() != spades_version.strip(): support.error( "SPAdes version of the previous run (%s) is not equal to the current version of SPAdes (%s)! " "Please restart from the beginning or specify another output directory." % (spades_prev_version.strip(), spades_version.strip())) if spades_py_name is None or cmd_line.find( os.path.basename(spades_py_name)) == -1: spades_py_name = 'spades.py' # try default name else: spades_py_name = os.path.basename(spades_py_name) spades_py_pos = cmd_line.find(spades_py_name) if spades_py_pos == -1: return None, None return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split()
def parse_arguments(argv, log): try: options, not_options = getopt.gnu_getopt(argv, DS_Args_List.short_options, DS_Args_List.long_options) except getopt.GetoptError: _, exc, _ = sys.exc_info() sys.stderr.write(str(exc) + "\n") sys.stderr.flush() options_storage.usage("", dipspades=True) sys.exit(1) ds_args = DS_Args() for opt, arg in options: if opt == '-o': ds_args.output_dir = os.path.abspath(arg) elif opt == '--expect-gaps': ds_args.allow_gaps = True elif opt == '--expect-rearrangements': ds_args.weak_align = True elif opt == '--hap': ds_args.haplocontigs_fnames.append(support.check_file_existence(arg, 'haplocontigs', log, dipspades=True)) elif opt == '-t' or opt == "--threads": ds_args.max_threads = int(arg) elif opt == '-m' or opt == "--memory": ds_args.max_memory = int(arg) elif opt == '--tmp-dir': ds_args.tmp_dir = os.path.abspath(arg) ds_args.haplocontigs = os.path.join(ds_args.output_dir, "haplocontigs") if not ds_args.output_dir: support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log, dipspades=True) if not ds_args.haplocontigs_fnames: support.error("cannot start dipSPAdes without at least one haplocontigs file!", log, dipspades=True) if not ds_args.tmp_dir: ds_args.tmp_dir = os.path.join(ds_args.output_dir, options_storage.TMP_DIR) return ds_args
def get_command_and_stage_id_before_restart_from(draft_commands, cfg, log): restart_from_stage_name = options_storage.args.restart_from.split(":")[0] if options_storage.args.restart_from == options_storage.LAST_STAGE: last_command = get_first_incomplete_command(os.path.join(get_stage.cfg["common"].output_dir, "run_spades.yaml")) if last_command is None: restart_from_stage_name = draft_commands[-1].short_name else: restart_from_stage_name = last_command["short_name"] restart_from_stage_id = None for num in range(len(draft_commands)): stage = draft_commands[num] if stage.short_name.startswith(restart_from_stage_name): restart_from_stage_id = num break if restart_from_stage_id is None: support.error( "failed to restart from %s because this stage was not specified!" % options_storage.args.restart_from, log) if ":" in options_storage.args.restart_from or options_storage.args.restart_from == options_storage.LAST_STAGE: return draft_commands[restart_from_stage_id], restart_from_stage_id if restart_from_stage_id > 0: stage_filename = options_storage.get_stage_filename(restart_from_stage_id - 1, draft_commands[restart_from_stage_id - 1].short_name) if not os.path.isfile(stage_filename): support.error( "cannot restart from stage %s: previous stage was not complete." % options_storage.args.restart_from, log) return draft_commands[restart_from_stage_id - 1], restart_from_stage_id - 1 return None, None
def run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, K): data_dir = os.path.join(cfg.output_dir, "SCC", "K%d" % K) saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") cfg_file_name = os.path.join(dst_configs, "config.info") if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False) log.info("\n== Running scaffold correction \n") scaffolds_file = os.path.join(latest, "scaffolds.fasta") if not os.path.isfile(scaffolds_file): support.error("Scaffodls were not found in " + scaffolds_file, log) if "read_buffer_size" in cfg.__dict__: construction_cfg_file_name = os.path.join(dst_configs, "construction.info") process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log) process_cfg.substitute_params(os.path.join(dst_configs, "moleculo_mode.info"), {"scaffolds_file": scaffolds_file}, log) prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, K) command = [os.path.join(execution_home, "scaffold_correction"), cfg_file_name] add_configs(command, dst_configs) log.info(str(command)) support.sys_call(command, log)
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one): data_dir = os.path.join(cfg.output_dir, "K%d" % K) stage = BASE_STAGE saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") cfg_file_name = os.path.join(dst_configs, "config.info") if options_storage.continue_mode: if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))): log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)") return if options_storage.restart_from and options_storage.restart_from.find(":") != -1: stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:] support.continue_from_here(log) if stage != BASE_STAGE: if not os.path.isdir(saves_dir): support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir)) else: if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) shutil.copytree(os.path.join(configs_dir, "debruijn"), dst_configs) # removing template configs for root, dirs, files in os.walk(dst_configs): for cfg_file in files: cfg_file = os.path.join(root, cfg_file) if cfg_file.endswith('.info.template'): if os.path.isfile(cfg_file.split('.template')[0]): os.remove(cfg_file) else: os.rename(cfg_file, cfg_file.split('.template')[0]) log.info("\n== Running assembler: " + ("K%d" % K) + "\n") if prev_K: additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta") if not os.path.isfile(additional_contigs_fname): support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log) additional_contigs_fname = None else: additional_contigs_fname = None if "read_buffer_size" in cfg.__dict__: construction_cfg_file_name = os.path.join(dst_configs, "construction.info") process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log) prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one) command = [os.path.join(execution_home, "spades"), cfg_file_name] ## this code makes sense for src/debruijn/simplification.cpp: corrected_and_save_reads() function which is not used now # bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads") # if os.path.isdir(bin_reads_dir): # if glob.glob(os.path.join(bin_reads_dir, "*_cor*")): # for cor_filename in glob.glob(os.path.join(bin_reads_dir, "*_cor*")): # cor_index = cor_filename.rfind("_cor") # new_bin_filename = cor_filename[:cor_index] + cor_filename[cor_index + 4:] # shutil.move(cor_filename, new_bin_filename) support.sys_call(command, log)
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg, not_used_dataset_data, ext_python_modules_home, log): addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): import pyyaml2 as pyyaml elif sys.version.startswith('3.'): import pyyaml3 as pyyaml dst_configs = os.path.join(cfg.output_dir, "configs") if os.path.exists(dst_configs): shutil.rmtree(dst_configs) if cfg.iontorrent: dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False) cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg") else: dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False) cfg_file_name = os.path.join(dst_configs, "config.info") # removing template configs for root, dirs, files in os.walk(dst_configs): for cfg_file in files: cfg_file = os.path.join(root, cfg_file) if cfg_file.endswith('.template'): if os.path.isfile(cfg_file.split('.template')[0]): os.remove(cfg_file) else: os.rename(cfg_file, cfg_file.split('.template')[0]) cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_") if cfg.iontorrent: prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home) binary_name = "ionhammer" else: prepare_config_bh(cfg_file_name, cfg, log) binary_name = "hammer" command = [os.path.join(execution_home, binary_name), os.path.abspath(cfg_file_name)] log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n") support.sys_call(command, log) if not os.path.isfile(corrected_dataset_yaml_filename): support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!") corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r')) remove_not_corrected_reads(cfg.output_dir) is_changed = False if cfg.gzip_output: is_changed = True compress_dataset_files(corrected_dataset_data, ext_python_modules_home, cfg.max_threads, log) if not_used_dataset_data: is_changed = True corrected_dataset_data += not_used_dataset_data if is_changed: pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w')) log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n") if os.path.isdir(cfg.tmp_dir): shutil.rmtree(cfg.tmp_dir)
def check_binaries(binary_dir, log): for binary in ["hammer", "spades", "bwa-spades"]: binary_path = os.path.join(binary_dir, binary) if not os.path.isfile(binary_path): support.error("SPAdes binaries not found: " + binary_path + "\nYou can obtain SPAdes binaries in one of two ways:" + "\n1. Download them from http://spades.bioinf.spbau.ru/release" + str(spades_version).strip() + "/SPAdes-" + str(spades_version).strip() + "-Linux.tar.gz" + "\n2. Build source code with ./spades_compile.sh script", log)
def move_dataset_files(dataset_data, dst, ext_python_modules_home, max_threads, log, gzip=False): to_compress = [] for reads_library in dataset_data: for key, value in reads_library.items(): if key.endswith('reads'): moved_reads_files = [] for reads_file in value: dst_filename = os.path.join(dst, os.path.basename(reads_file)) # TODO: fix problem with files with the same basenames in Hammer binary! if not os.path.isfile(reads_file): if (not gzip and os.path.isfile(dst_filename)) or ( gzip and os.path.isfile(dst_filename + '.gz')): support.warning( 'file with corrected reads (' + reads_file + ') is the same in several libraries', log) if gzip: dst_filename += '.gz' else: support.error( 'something went wrong and file with corrected reads (' + reads_file + ') is missing!', log) else: shutil.move(reads_file, dst_filename) if gzip: to_compress.append(dst_filename) dst_filename += '.gz' moved_reads_files.append(dst_filename) reads_library[key] = moved_reads_files if len(to_compress): pigz_path = support.which('pigz') if pigz_path: for reads_file in to_compress: support.sys_call([ pigz_path, '-f', '-7', '-p', str(max_threads), reads_file ], log) else: addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): from joblib2 import Parallel, delayed elif sys.version.startswith('3.'): from joblib3 import Parallel, delayed n_jobs = min(len(to_compress), max_threads) outputs = Parallel(n_jobs=n_jobs)( delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress) for output in outputs: if output: log.info(output)
def check_binaries(binary_dir, log): for binary in ["hammer", "spades", "bwa-spades"]: binary_path = os.path.join(binary_dir, binary) if not os.path.isfile(binary_path): support.error( "SPAdes binaries not found: " + binary_path + "\nYou can obtain SPAdes binaries in one of two ways:" + "\n1. Download them from http://spades.bioinf.spbau.ru/release" + str(spades_version).strip() + "/SPAdes-" + str(spades_version).strip() + "-Linux.tar.gz" + "\n2. Build source code with ./spades_compile.sh script", log)
def parse_arguments(argv, log): try: options, not_options = getopt.gnu_getopt(argv, DS_Args_List.short_options, DS_Args_List.long_options) except getopt.GetoptError: _, exc, _ = sys.exc_info() sys.stderr.write(str(exc) + "\n") sys.stderr.flush() options_storage.usage("", dipspades=True) sys.exit(1) ds_args = DS_Args() for opt, arg in options: if opt == '-o': ds_args.output_dir = os.path.abspath(arg) elif opt == '--expect-gaps': ds_args.allow_gaps = True elif opt == '--expect-rearrangements': ds_args.weak_align = True elif opt == '--hap': ds_args.haplocontigs_fnames.append( support.check_file_existence(arg, 'haplocontigs', log, dipspades=True)) elif opt == '-t' or opt == "--threads": ds_args.max_threads = int(arg) elif opt == '-m' or opt == "--memory": ds_args.max_memory = int(arg) elif opt == '--tmp-dir': ds_args.tmp_dir = os.path.abspath(arg) elif opt == '--dsdebug': ds_args.dev_mode = True elif opt == '--hap-assembly': ds_args.haplotype_assembly = True elif opt == '--dsK': ds_args.k = int(arg) ds_args.haplocontigs = os.path.join(ds_args.output_dir, "haplocontigs") if not ds_args.output_dir: support.error( "the output_dir is not set! It is a mandatory parameter (-o output_dir).", log, dipspades=True) if not ds_args.haplocontigs_fnames: support.error( "cannot start dipSPAdes without at least one haplocontigs file!", log, dipspades=True) if not ds_args.tmp_dir: ds_args.tmp_dir = os.path.join(ds_args.output_dir, options_storage.TMP_DIR) return ds_args
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one): data_dir = os.path.join(cfg.output_dir, "K%d" % K) stage = BASE_STAGE saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") if options_storage.continue_mode: if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))): log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)") return if options_storage.restart_from and options_storage.restart_from.find(":") != -1 \ and options_storage.restart_from.startswith("k%d:" % K): stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:] support.continue_from_here(log) if stage != BASE_STAGE: if not os.path.isdir(saves_dir): support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir)) else: if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) dir_util._path_created = {} # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False) log.info("\n== Running assembler: " + ("K%d" % K) + "\n") if prev_K: additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta") if not os.path.isfile(additional_contigs_fname): support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log) additional_contigs_fname = None else: additional_contigs_fname = None if "read_buffer_size" in cfg.__dict__: #FIXME why here??? process_cfg.substitute_params(os.path.join(dst_configs, "construction.info"), {"read_buffer_size": cfg.read_buffer_size}, log) if "scaffolding_mode" in cfg.__dict__: #FIXME why here??? process_cfg.substitute_params(os.path.join(dst_configs, "pe_params.info"), {"scaffolding_mode": cfg.scaffolding_mode}, log) prepare_config_rnaspades(os.path.join(dst_configs, "rna_mode.info"), log) prepare_config_construction(os.path.join(dst_configs, "construction.info"), log) cfg_fn = os.path.join(dst_configs, "config.info") prepare_config_spades(cfg_fn, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home) command = [os.path.join(execution_home, "spades-core"), cfg_fn] add_configs(command, dst_configs) #print("Calling: " + " ".join(command)) support.sys_call(command, log)
def run_bh(result_filename, configs_dir, execution_home, cfg, ext_python_modules_home, log): addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): import pyyaml2 as pyyaml elif sys.version.startswith('3.'): import pyyaml3 as pyyaml dst_configs = os.path.join(cfg.output_dir, "configs") if os.path.exists(dst_configs): shutil.rmtree(dst_configs) shutil.copytree(os.path.join(configs_dir, "hammer"), dst_configs) cfg_file_name = os.path.join(dst_configs, "config.info") # removing template configs for root, dirs, files in os.walk(dst_configs): for cfg_file in files: cfg_file = os.path.join(root, cfg_file) if cfg_file.endswith('.info.template'): if os.path.isfile(cfg_file.split('.template')[0]): os.remove(cfg_file) else: os.rename(cfg_file, cfg_file.split('.template')[0]) prepare_config_bh(cfg_file_name, cfg, log) command = [ os.path.join(execution_home, "hammer"), os.path.abspath(cfg_file_name) ] log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n") support.sys_call(command, log) corrected_dataset_yaml_filename = os.path.join(cfg.tmp_dir, "corrected.yaml") if not os.path.isfile(corrected_dataset_yaml_filename): support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!") corrected_dataset_data = pyyaml.load( open(corrected_dataset_yaml_filename, 'r')) if cfg.gzip_output: log.info("\n== Compressing corrected reads (with gzip)") move_dataset_files(corrected_dataset_data, cfg.output_dir, ext_python_modules_home, cfg.max_threads, log, cfg.gzip_output) corrected_dataset_yaml_filename = result_filename pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w')) log.info("\n== Dataset description file created: " + corrected_dataset_yaml_filename + "\n") shutil.rmtree(cfg.tmp_dir)
def substitute_params(filename, var_dict, log): lines = file_lines(filename) vars_in_file = vars_from_lines(lines) for var, value in var_dict.items(): if var not in vars_in_file: support.error("Couldn't find " + var + " in " + filename, log) meta = vars_in_file[var] lines[meta.line_num] = meta.indent + str(var) + " " + str(value) + "\n" file = open(filename, "w") file.writelines(lines)
def init_parser(args): if options_parser.is_first_run(): options_storage.first_command_line = args check_dir_is_empty(options_parser.get_output_dir_from_args()) else: command_line, options, script, err_msg = get_options_from_params( os.path.join(options_parser.get_output_dir_from_args(), "params.txt"), args[0]) if err_msg != "": support.error(err_msg) options_storage.first_command_line = [script] + options
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one): data_dir = os.path.join(cfg.output_dir, "K%d" % K) stage = BASE_STAGE saves_dir = os.path.join(data_dir, 'saves') dst_configs = os.path.join(data_dir, "configs") cfg_file_name = os.path.join(dst_configs, "config.info") if options_storage.continue_mode: if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))): log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)") return if options_storage.restart_from and options_storage.restart_from.find(":") != -1: stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:] support.continue_from_here(log) if stage != BASE_STAGE: if not os.path.isdir(saves_dir): support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir)) else: if os.path.exists(data_dir): shutil.rmtree(data_dir) os.makedirs(data_dir) dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False) # removing template configs for root, dirs, files in os.walk(dst_configs): for cfg_file in files: cfg_file = os.path.join(root, cfg_file) if cfg_file.endswith('.info.template'): if os.path.isfile(cfg_file.split('.template')[0]): os.remove(cfg_file) else: os.rename(cfg_file, cfg_file.split('.template')[0]) log.info("\n== Running assembler: " + ("K%d" % K) + "\n") if prev_K: additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta") if not os.path.isfile(additional_contigs_fname): support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log) additional_contigs_fname = None else: additional_contigs_fname = None if "read_buffer_size" in cfg.__dict__: construction_cfg_file_name = os.path.join(dst_configs, "construction.info") process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log) prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home) command = [os.path.join(execution_home, "spades"), cfg_file_name] support.sys_call(command, log)
def check_cfg_for_restart_from(cfg): if options_storage.restart_from == 'ec' and ("error_correction" not in cfg): support.error("failed to restart from read error correction because this stage was not specified!") if options_storage.restart_from == 'mc' and ("mismatch_corrector" not in cfg): support.error("failed to restart from mismatch correction because this stage was not specified!") if options_storage.restart_from == 'as' or options_storage.restart_from.startswith('k'): if "assembly" not in cfg: support.error("failed to restart from assembling because this stage was not specified!") if options_storage.restart_from.startswith('k'): correct_k = False k_to_check = options_storage.k_mers if not k_to_check: if options_storage.auto_K_allowed(): k_to_check = list(set(options_storage.K_MERS_SHORT + options_storage.K_MERS_150 + options_storage.K_MERS_250)) else: k_to_check = options_storage.K_MERS_SHORT for k in k_to_check: if options_storage.restart_from == ("k%d" % k) or options_storage.restart_from.startswith("k%d:" % k): correct_k = True break if not correct_k: k_str = options_storage.restart_from[1:] if k_str.find(":") != -1: k_str = k_str[:k_str.find(":")] support.error("failed to restart from K=%s because this K was not specified!" % k_str)
def get_read_length(output_dir, K, ext_python_modules_home, log): est_params_filename = os.path.join(output_dir, "K%d" % K, "final.lib_data") max_read_length = 0 if os.path.isfile(est_params_filename): addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): import pyyaml2 as pyyaml elif sys.version.startswith('3.'): import pyyaml3 as pyyaml est_params_data = pyyaml.load(open(est_params_filename, 'r')) max_read_length = int(est_params_data['nomerge max read length']) log.info("Max read length detected as %d" % max_read_length) if max_read_length == 0: support.error("Failed to estimate maximum read length! File with estimated params: " + est_params_filename, log) return max_read_length
def main(): args = parse_args() # create logger log = logging.getLogger("Preprocess interlaced reads") log.setLevel(logging.DEBUG) console = logging.StreamHandler(sys.stdout) console.setFormatter(logging.Formatter("%(message)s")) console.setLevel(logging.DEBUG) log.addHandler(console) with open(args.args_filename) as f: lines = f.readlines() lines = [x.rstrip() for x in lines] for input_filename, out_left_filename, out_right_filename, was_compressed, is_fastq in \ zip(lines[0::5], lines[1::5], lines[2::5], lines[3::5], lines[4::5]): was_compressed = (was_compressed == "True") is_fastq = (is_fastq == "True") if was_compressed: input_file = gzip.open(input_filename, 'r') else: input_file = open(input_filename) log.info( "== Splitting %s into left and right reads (in %s directory)" % (input_filename, args.dst)) out_files = [ open(out_left_filename, 'w'), open(out_right_filename, 'w') ] i = 0 next_read_name = write_single_read( input_file, out_files[i], None, is_fastq, sys.version.startswith("3.") and was_compressed) while next_read_name: i = (i + 1) % 2 next_read_name = write_single_read( input_file, out_files[i], next_read_name, is_fastq, sys.version.startswith("3.") and was_compressed) if i == 0: support.error( "the number of reads in file with interlaced reads (%s) should be EVEN!" % (input_filename), log) out_files[0].close() out_files[1].close() input_file.close()
def save_restart_options(log): if dataset_yaml_filename: support.error( "you cannot specify --dataset with --restart-from option!", log) if single_cell: support.error("you cannot specify --sc with --restart-from option!", log) if iontorrent: support.error( "you cannot specify --iontorrent with --restart-from option!", log) if only_assembler: support.error( "you cannot specify --only-assembler with --restart-from option!", log) if only_error_correction: support.error( "you cannot specify --only-error-correction with --restart-from option!", log) global restart_k_mers global restart_careful global restart_mismatch_corrector global restart_disable_gzip_output global restart_disable_rr global restart_threads global restart_memory global restart_tmp_dir global restart_qvoffset global restart_cov_cutoff global restart_developer_mode global restart_reference global restart_configs_dir global restart_read_buffer_size restart_k_mers = k_mers restart_careful = careful restart_mismatch_corrector = mismatch_corrector restart_disable_gzip_output = disable_gzip_output restart_disable_rr = disable_rr restart_threads = threads restart_memory = memory restart_tmp_dir = tmp_dir restart_qvoffset = qvoffset restart_cov_cutoff = cov_cutoff restart_developer_mode = developer_mode restart_reference = reference restart_configs_dir = configs_dir restart_read_buffer_size = read_buffer_size
def get_read_length(output_dir, K, ext_python_modules_home, log): est_params_filename = os.path.join(output_dir, "K%d" % K, "final.lib_data") max_read_length = 0 if os.path.isfile(est_params_filename): addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): import pyyaml2 as pyyaml elif sys.version.startswith('3.'): import pyyaml3 as pyyaml est_params_data = pyyaml.load(open(est_params_filename, 'r')) for reads_library in est_params_data: if reads_library['type'] in READS_TYPES_USED_IN_CONSTRUCTION: if int(reads_library["read length"]) > max_read_length: max_read_length = int(reads_library["read length"]) if max_read_length == 0: support.error("Failed to estimate maximum read length! File with estimated params: " + est_params_filename, log) return max_read_length
def clear_configs(cfg, log, command_before_restart_from, stage_id_before_restart_from): def matches_with_restart_from_arg(stage, restart_from_arg): return stage["short_name"].startswith(restart_from_arg.split(":")[0]) spades_commands_fpath = os.path.join(cfg["common"].output_dir, "run_spades.yaml") with open(spades_commands_fpath) as stream: old_pipeline = pyyaml.load(stream) restart_from_stage_id = None for num in range(len(old_pipeline)): stage = old_pipeline[num] if matches_with_restart_from_arg(stage, options_storage.args.restart_from): restart_from_stage_id = num break if command_before_restart_from is not None and \ old_pipeline[stage_id_before_restart_from]["short_name"] != command_before_restart_from.short_name: support.error( "new and old pipelines have difference before %s" % options_storage.args.restart_from, log) if command_before_restart_from is None: first_del = 0 else: first_del = stage_id_before_restart_from + 1 if restart_from_stage_id is not None: stage_filename = options_storage.get_stage_filename( restart_from_stage_id, old_pipeline[restart_from_stage_id]["short_name"]) if os.path.isfile(stage_filename): os.remove(stage_filename) for delete_id in range(first_del, len(old_pipeline)): stage_filename = options_storage.get_stage_filename( delete_id, old_pipeline[delete_id]["short_name"]) if os.path.isfile(stage_filename): os.remove(stage_filename) cfg_dir = old_pipeline[delete_id]["config_dir"] if cfg_dir != "" and os.path.isdir( os.path.join(cfg["common"].output_dir, cfg_dir)): shutil.rmtree(os.path.join(cfg["common"].output_dir, cfg_dir))
def init_parser(args): if options_parser.is_first_run(): options_storage.first_command_line = args check_dir_is_empty(options_parser.get_output_dir_from_args()) else: output_dir = options_parser.get_output_dir_from_args() if output_dir is None: support.error( "the output_dir is not set! It is a mandatory parameter (-o output_dir)." ) command_line, options, script, err_msg = get_options_from_params( os.path.join(output_dir, "params.txt"), args[0]) if err_msg != "": support.error(err_msg) options_storage.first_command_line = [script] + options
def write_single_read(in_file, out_file, read_name=None, is_fastq=False, is_python3=False): if read_name is None: read_name = support.process_readline(in_file.readline(), is_python3) if not read_name: return "" # no next read read_value = support.process_readline(in_file.readline(), is_python3) line = support.process_readline(in_file.readline(), is_python3) fpos = in_file.tell() while (is_fastq and not line.startswith('+')) or (not is_fastq and not line.startswith('>')): read_value += line line = support.process_readline(in_file.readline(), is_python3) if not line: if fpos == in_file.tell(): break fpos = in_file.tell() out_file.write(read_name + '\n') out_file.write(read_value + '\n') if is_fastq: read_quality = support.process_readline(in_file.readline(), is_python3) line = support.process_readline(in_file.readline(), is_python3) while not line.startswith('@'): read_quality += line line = support.process_readline(in_file.readline(), is_python3) if not line: if fpos == in_file.tell(): break fpos = in_file.tell() if len(read_value) != len(read_quality): support.error( "The length of sequence and quality lines should be the same! " "Check read %s (SEQ length is %d, QUAL length is %d)" % (read_name, len(read_value), len(read_quality))) out_file.write("+\n") out_file.write(read_quality + '\n') return line # next read name or empty string
def save_restart_options(log): if dataset_yaml_filename: support.error("you cannot specify --dataset with --restart-from option!", log) if single_cell: support.error("you cannot specify --sc with --restart-from option!", log) if iontorrent: support.error("you cannot specify --iontorrent with --restart-from option!", log) if only_assembler: support.error("you cannot specify --only-assembler with --restart-from option!", log) if only_error_correction: support.error("you cannot specify --only-error-correction with --restart-from option!", log) global restart_k_mers global restart_careful global restart_mismatch_corrector global restart_disable_gzip_output global restart_disable_rr global restart_threads global restart_memory global restart_tmp_dir global restart_qvoffset global restart_cov_cutoff global restart_developer_mode global restart_reference global restart_configs_dir global restart_read_buffer_size restart_k_mers = k_mers restart_careful = careful restart_mismatch_corrector = mismatch_corrector restart_disable_gzip_output = disable_gzip_output restart_disable_rr = disable_rr restart_threads = threads restart_memory = memory restart_tmp_dir = tmp_dir restart_qvoffset = qvoffset restart_cov_cutoff = cov_cutoff restart_developer_mode = developer_mode restart_reference = reference restart_configs_dir = configs_dir restart_read_buffer_size = read_buffer_size
def run_bh(result_filename, configs_dir, execution_home, cfg, ext_python_modules_home, log): addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): import pyyaml2 as pyyaml elif sys.version.startswith('3.'): import pyyaml3 as pyyaml dst_configs = os.path.join(cfg.output_dir, "configs") if os.path.exists(dst_configs): shutil.rmtree(dst_configs) shutil.copytree(os.path.join(configs_dir, "hammer"), dst_configs) cfg_file_name = os.path.join(dst_configs, "config.info") # removing template configs for root, dirs, files in os.walk(dst_configs): for cfg_file in files: cfg_file = os.path.join(root, cfg_file) if cfg_file.endswith('.info.template'): if os.path.isfile(cfg_file.split('.template')[0]): os.remove(cfg_file) else: os.rename(cfg_file, cfg_file.split('.template')[0]) prepare_config_bh(cfg_file_name, cfg, log) command = [os.path.join(execution_home, "hammer"), os.path.abspath(cfg_file_name)] log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n") support.sys_call(command, log) corrected_dataset_yaml_filename = os.path.join(cfg.tmp_dir, "corrected.yaml") if not os.path.isfile(corrected_dataset_yaml_filename): support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!") corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r')) if cfg.gzip_output: log.info("\n== Compressing corrected reads (with gzip)") move_dataset_files(corrected_dataset_data, cfg.output_dir, ext_python_modules_home, cfg.max_threads, log, cfg.gzip_output) corrected_dataset_yaml_filename = result_filename pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w')) log.info("\n== Dataset description file created: " + corrected_dataset_yaml_filename + "\n") shutil.rmtree(cfg.tmp_dir)
def main(): args = parse_args() # create logger log = logging.getLogger("Preprocess Lucigen NxMate reads") log.setLevel(logging.DEBUG) console = logging.StreamHandler(sys.stdout) console.setFormatter(logging.Formatter("%(message)s")) console.setLevel(logging.DEBUG) log.addHandler(console) try: with open(args.args_filename) as f: lines = f.readlines() for infile1, infile2 in zip(lines[0::2], lines[1::2]): lucigen_nxmate.process_reads(infile1, infile2, args.dst, log, args.threads) except ImportError: support.error( "can't process Lucigen NxMate reads! lucigen_nxmate.py is missing!", log)
def check_cfg_for_partial_run(cfg, partial_run_type="restart-from" ): # restart-from ot stop-after if partial_run_type == "restart-from": check_point = options_storage.args.restart_from action = "restart from" verb = "was" elif partial_run_type == "stop-after": check_point = options_storage.args.stop_after action = "stop after" verb = "is" else: return if check_point == "ec" and ("error_correction" not in cfg): support.error( "failed to %s 'read error correction' ('%s') because this stage %s not specified!" % (action, check_point, verb)) if check_point == "mc" and ("mismatch_corrector" not in cfg): support.error( "failed to %s 'mismatch correction' ('%s') because this stage %s not specified!" % (action, check_point, verb)) if check_point == "as" or check_point.startswith('k'): if "assembly" not in cfg: support.error( "failed to %s 'assembling' ('%s') because this stage %s not specified!" % (action, check_point, verb))
def move_dataset_files(dataset_data, dst, ext_python_modules_home, max_threads, log, gzip=False): to_compress = [] for reads_library in dataset_data: for key, value in reads_library.items(): if key.endswith('reads'): moved_reads_files = [] for reads_file in value: dst_filename = os.path.join(dst, os.path.basename(reads_file)) # TODO: fix problem with files with the same basenames in Hammer binary! if not os.path.isfile(reads_file): if (not gzip and os.path.isfile(dst_filename)) or (gzip and os.path.isfile(dst_filename + '.gz')): support.warning('file with corrected reads (' + reads_file + ') is the same in several libraries', log) if gzip: dst_filename += '.gz' else: support.error('something went wrong and file with corrected reads (' + reads_file + ') is missing!', log) else: shutil.move(reads_file, dst_filename) if gzip: to_compress.append(dst_filename) dst_filename += '.gz' moved_reads_files.append(dst_filename) reads_library[key] = moved_reads_files if len(to_compress): pigz_path = support.which('pigz') if pigz_path: for reads_file in to_compress: support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log) else: addsitedir(ext_python_modules_home) if sys.version.startswith('2.'): from joblib2 import Parallel, delayed elif sys.version.startswith('3.'): from joblib3 import Parallel, delayed n_jobs = min(len(to_compress), max_threads) outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress) for output in outputs: if output: log.info(output)
def check_cfg_for_restart_from(cfg): if options_storage.restart_from == 'ec' and ("error_correction" not in cfg): support.error( "failed to restart from read error correction because this stage was not specified!" ) if options_storage.restart_from == 'mc' and ("mismatch_corrector" not in cfg): support.error( "failed to restart from mismatch correction because this stage was not specified!" ) if options_storage.restart_from == 'as' or options_storage.restart_from.startswith( 'k'): if "assembly" not in cfg: support.error( "failed to restart from assembling because this stage was not specified!" ) if options_storage.restart_from.startswith('k'): correct_k = False k_to_check = options_storage.k_mers if not k_to_check: if options_storage.auto_K_allowed(): k_to_check = list( set(options_storage.K_MERS_SHORT + options_storage.K_MERS_150 + options_storage.K_MERS_250)) else: k_to_check = options_storage.K_MERS_SHORT for k in k_to_check: if options_storage.restart_from == ( "k%d" % k) or options_storage.restart_from.startswith( "k%d:" % k): correct_k = True break if not correct_k: k_str = options_storage.restart_from[1:] if k_str.find(":") != -1: k_str = k_str[:k_str.find(":")] support.error( "failed to restart from K=%s because this K was not specified!" % k_str)
def check_cfg_for_partial_run(cfg, type='restart-from'): # restart-from ot stop-after if type == 'restart-from': check_point = options_storage.restart_from action = 'restart from' verb = 'was' elif type == 'stop-after': check_point = options_storage.stop_after action = 'stop after' verb = 'is' else: return if check_point == 'ec' and ("error_correction" not in cfg): support.error("failed to " + action + " 'read error correction' ('" + check_point + "') because this stage " + verb + " not specified!") if check_point == 'mc' and ("mismatch_corrector" not in cfg): support.error("failed to " + action + " 'mismatch correction' ('" + check_point + "') because this stage " + verb + " not specified!") if check_point == 'as' or check_point.startswith('k'): if "assembly" not in cfg: support.error("failed to " + action + " 'assembling' ('" + check_point + "') because this stage " + verb + " not specified!") if check_point.startswith('k'): correct_k = False k_to_check = options_storage.k_mers if not k_to_check: if options_storage.auto_K_allowed(): k_to_check = list(set(options_storage.K_MERS_SHORT + options_storage.K_MERS_150 + options_storage.K_MERS_250)) else: k_to_check = options_storage.K_MERS_SHORT for k in k_to_check: if check_point == ("k%d" % k) or check_point.startswith("k%d:" % k): correct_k = True break if not correct_k: k_str = check_point[1:] if k_str.find(":") != -1: k_str = k_str[:k_str.find(":")] support.error("failed to " + action + " K=%s because this K " % k_str + verb + " not specified!")
def main(ds_args_list, general_args_list, spades_home, bin_home): log = logging.getLogger('dipspades') log.setLevel(logging.DEBUG) console = logging.StreamHandler(sys.stdout) console.setFormatter(logging.Formatter('%(message)s')) console.setLevel(logging.DEBUG) log.addHandler(console) support.check_binaries(bin_home, log) ds_args = parse_arguments(ds_args_list, log) if not os.path.exists(ds_args.output_dir): os.makedirs(ds_args.output_dir) log_filename = os.path.join(ds_args.output_dir, "dipspades.log") if os.path.exists(log_filename): os.remove(log_filename) log_handler = logging.FileHandler(log_filename, mode='a') log.addHandler(log_handler) params_filename = os.path.join(ds_args.output_dir, "params.txt") params_handler = logging.FileHandler(params_filename, mode='a') log.addHandler(params_handler) log.info("\n") log.info("General command line: " + " ".join(general_args_list) + "\n") log.info("dipSPAdes command line: " + " ".join(ds_args_list) + "\n") print_ds_args(ds_args, log) log.removeHandler(params_handler) log.info("\n======= dipSPAdes started. Log can be found here: " + log_filename + "\n") write_haplocontigs_in_file(ds_args.haplocontigs, ds_args.haplocontigs_fnames) config_fname = prepare_configs(os.path.join(spades_home, "configs", "dipspades"), ds_args, log) ds_args.tmp_dir = support.get_tmp_dir(prefix="dipspades_", base_dir=ds_args.tmp_dir) prepare_config(config_fname, ds_args, log) try: log.info("===== Assembling started.\n") binary_path = os.path.join(bin_home, "spades-dipspades-core") command = [binary_path, config_fname] support.sys_call(command, log) log.info("\n===== Assembling finished.\n") print_ds_output(ds_args.output_dir, log) if os.path.isdir(ds_args.tmp_dir): shutil.rmtree(ds_args.tmp_dir) log.info("\n======= dipSPAdes finished.\n") log.info("dipSPAdes log can be found here: " + log_filename + "\n") log.info("Thank you for using dipSPAdes!") log.removeHandler(log_handler) except Exception: exc_type, exc_value, _ = sys.exc_info() if exc_type == SystemExit: sys.exit(exc_value) else: if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error support.error("It looks like you are using SPAdes binaries for another platform.\n" + support.get_spades_binaries_info_message(), dipspades=True) else: log.exception(exc_value) support.error("exception caught: %s" % exc_type, log) except BaseException: # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException exc_type, exc_value, _ = sys.exc_info() if exc_type == SystemExit: sys.exit(exc_value) else: log.exception(exc_value) support.error("exception caught: %s" % exc_type, log, dipspades=True)
def compress_dataset_files(input_file, ext_python_modules_home, max_threads, log, not_used_yaml_file, output_dir, gzip_output): addsitedir(ext_python_modules_home) if sys.version.startswith("2."): import pyyaml2 as pyyaml from joblib2 import Parallel, delayed elif sys.version.startswith("3."): import pyyaml3 as pyyaml from joblib3 import Parallel, delayed dataset_data = pyyaml.load(open(input_file)) remove_not_corrected_reads(output_dir) is_changed = False if gzip_output: is_changed = True pigz_path = support.which("pigz") if pigz_path: compressor = "pigz" else: compressor = "gzip" log.info("\n== Compressing corrected reads (with %s)" % compressor) to_compress = [] for reads_library in dataset_data: for key, value in reads_library.items(): if key.endswith("reads"): compressed_reads_filenames = [] for reads_file in value: compressed_reads_filenames.append(reads_file + ".gz") to_compress.append(reads_file) reads_library[key] = compressed_reads_filenames if len(to_compress): for reads_file in to_compress: if not isfile(reads_file): support.error( "something went wrong and file with corrected reads (%s) is missing!" % reads_file, log) if pigz_path: for reads_file in to_compress: support.sys_call([ pigz_path, "-f", "-7", "-p", str(max_threads), reads_file ], log) else: n_jobs = min(len(to_compress), max_threads) outputs = Parallel(n_jobs=n_jobs)( delayed(support.sys_call)(["gzip", "-f", "-7", reads_file]) for reads_file in to_compress) for output in outputs: if output: log.info(output) if not_used_yaml_file != "": is_changed = True not_used_dataset_data = pyyaml.load(open(not_used_yaml_file)) dataset_data += not_used_dataset_data if is_changed: with open(input_file, 'w') as f: pyyaml.dump(dataset_data, f, default_flow_style=False, default_style='"', width=float("inf"))
def nx_seq_junction(infilename1, infilename2, dst, log, silent=True): starttime = time.time() basename1 = os.path.basename(infilename1) if os.path.splitext(basename1)[1] == '.gz': basename1 = os.path.splitext(basename1)[0] basename2 = os.path.basename(infilename2) if os.path.splitext(basename2)[1] == '.gz': basename2 = os.path.splitext(basename2)[0] #open three outfiles splitfilenameleft = os.path.join(dst, 'R1_IJS7_' + basename1) splitfile1 = open(splitfilenameleft, 'w') splitfilenameright = os.path.join(dst, 'R2_IJS7_' + basename2) splitfile2 = open(splitfilenameright, 'w') unsplitfilename = os.path.join( dst, 'unsplit_IJS7_' + basename1.replace('_R1_', '_R1R2_')) unsplitfile = open(unsplitfilename, 'w') #jctstr = '(GGTTCATCGTCAGGCCTGACGATGAACC){e<=4}' # JS7 24/28 required results in ~92% detected in ion torrent # from NextClip: --adaptor_sequence GTTCATCGTCAGG -e --strict_match 22,11 --relaxed_match 20,10 eg strict 22/26 = 4 errors, relaxed 20/26 = 6 errors jctstr = '(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}' # try 22/26 to match NextClip strict (e<=6 for relaxed) #PARSE both files in tuples of 4 lines parserR1 = ParseFastQ(infilename1) parserR2 = ParseFastQ(infilename2) all_stats = JunctionStats() n_jobs = options_storage.threads while True: # prepare input reads1 = list(itertools.islice(parserR1, READS_PER_BATCH)) reads2 = list(itertools.islice(parserR2, READS_PER_BATCH)) if len(reads1) != len(reads2): support.error( "lucigen_nxmate.py, nx_seq_junction: " "number of left reads (%d) is not equal to number of right reads (%d)!" % (len(reads1), len(reads2)), log) if not reads1: break chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs) # processing outputs = Parallel(n_jobs=n_jobs)( delayed(nx_seq_junction_process_batch)(reads, jctstr) for reads in chunks) results, stats = [x[0] for x in outputs], [x[1] for x in outputs] # writing results for result, stat in zip(results, stats): write_to_files([splitfile1, splitfile2, unsplitfile], result) all_stats += stat if not silent: log.info( "==== nx_seq_junction progress: reads processed: %d, time elapsed: %s" % (all_stats.readcounter, time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime)))) parserR1.close() parserR2.close() splitfile1.close() splitfile2.close() unsplitfile.close() if all_stats.readcounter == 0: support.error( "lucigen_nxmate.py, nx_seq_junction: error in input data! Number of processed reads is 0!", log) if all_stats.splitcounter == 0: support.error( "lucigen_nxmate.py, nx_seq_junction: error in input data! 
Number of split pairs is 0!", log) if not silent: #print some stats percentsplit = 100 * all_stats.splitcounter / all_stats.readcounter percentR1R2 = 100 * all_stats.R1R2jctcounter / all_stats.splitcounter percentR1 = 100 * all_stats.R1jctcounter / all_stats.splitcounter percentR2 = 100 * all_stats.R2jctcounter / all_stats.splitcounter log.info("==== nx_seq_junction info: processing finished!") log.info("==== nx_seq_junction info: %d reads processed" % (all_stats.readcounter)) log.info( "==== nx_seq_junction info: %d total split pairs (%.2f %% of processed reads))" % (all_stats.splitcounter, percentsplit)) log.info( "==== nx_seq_junction info: %d junctions in both R1 and R2 (%.2f %% of split junctions))" % (all_stats.R1R2jctcounter, percentR1R2)) log.info( "==== nx_seq_junction info: %d split junctions are in Read1 (%.2f %% of split junctions))" % (all_stats.R1jctcounter, percentR1)) log.info( "==== nx_seq_junction info: %d split junctions are in Read2 (%.2f %% of split junctions))" % (all_stats.R2jctcounter, percentR2)) elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime)) log.info("==== nx_seq_junction info: time elapsed: %s" % (elapsedtime)) parserR1.close() parserR2.close() return splitfilenameleft, splitfilenameright, unsplitfilename
def nx_seq_junction(infilename1, infilename2, dst, log, silent=True): starttime = time.time() basename1 = os.path.basename(infilename1) if os.path.splitext(basename1)[1] == '.gz': basename1 = os.path.splitext(basename1)[0] basename2 = os.path.basename(infilename2) if os.path.splitext(basename2)[1] == '.gz': basename2 = os.path.splitext(basename2)[0] #open three outfiles splitfilenameleft = os.path.join(dst, 'R1_IJS7_' + basename1) splitfile1 = open(splitfilenameleft, 'w') splitfilenameright = os.path.join(dst, 'R2_IJS7_' + basename2) splitfile2 = open(splitfilenameright, 'w') unsplitfilename = os.path.join(dst, 'unsplit_IJS7_' + basename1.replace('_R1_', '_R1R2_')) unsplitfile = open(unsplitfilename, 'w') #jctstr = '(GGTTCATCGTCAGGCCTGACGATGAACC){e<=4}' # JS7 24/28 required results in ~92% detected in ion torrent # from NextClip: --adaptor_sequence GTTCATCGTCAGG -e --strict_match 22,11 --relaxed_match 20,10 eg strict 22/26 = 4 errors, relaxed 20/26 = 6 errors jctstr = '(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}' # try 22/26 to match NextClip strict (e<=6 for relaxed) #PARSE both files in tuples of 4 lines parserR1 = ParseFastQ(infilename1) parserR2 = ParseFastQ(infilename2) all_stats = JunctionStats() n_jobs = options_storage.threads while True: # prepare input reads1 = list(itertools.islice(parserR1, READS_PER_BATCH)) reads2 = list(itertools.islice(parserR2, READS_PER_BATCH)) if len(reads1) != len(reads2): support.error("lucigen_nxmate.py, nx_seq_junction: " "number of left reads (%d) is not equal to number of right reads (%d)!" % (len(reads1), len(reads2)), log) if not reads1: break chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs) # processing outputs = Parallel(n_jobs=n_jobs)(delayed(nx_seq_junction_process_batch)(reads, jctstr) for reads in chunks) results, stats = [x[0] for x in outputs], [x[1] for x in outputs] # writing results for result, stat in zip(results, stats): write_to_files([splitfile1, splitfile2, unsplitfile], result) all_stats += stat if not silent: log.info("==== nx_seq_junction progress: reads processed: %d, time elapsed: %s" % (all_stats.readcounter, time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime)))) parserR1.close() parserR2.close() splitfile1.close() splitfile2.close() unsplitfile.close() if all_stats.readcounter == 0: support.error("lucigen_nxmate.py, nx_seq_junction: error in input data! Number of processed reads is 0!", log) if all_stats.splitcounter == 0: support.error("lucigen_nxmate.py, nx_seq_junction: error in input data! 
Number of split pairs is 0!", log) if not silent: #print some stats percentsplit = 100. * all_stats.splitcounter / all_stats.readcounter percentR1R2 = 100. * all_stats.R1R2jctcounter / all_stats.splitcounter percentR1 = 100. * all_stats.R1jctcounter / all_stats.splitcounter percentR2 = 100. * all_stats.R2jctcounter / all_stats.splitcounter log.info("==== nx_seq_junction info: processing finished!") log.info("==== nx_seq_junction info: %d reads processed" % (all_stats.readcounter)) log.info("==== nx_seq_junction info: %d total split pairs (%.2f %% of processed reads)" % (all_stats.splitcounter, percentsplit)) log.info("==== nx_seq_junction info: %d junctions in both R1 and R2 (%.2f %% of split junctions)" % (all_stats.R1R2jctcounter, percentR1R2)) log.info("==== nx_seq_junction info: %d split junctions are in Read1 (%.2f %% of split junctions)" % (all_stats.R1jctcounter, percentR1)) log.info("==== nx_seq_junction info: %d split junctions are in Read2 (%.2f %% of split junctions)" % (all_stats.R2jctcounter, percentR2)) elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime)) log.info("==== nx_seq_junction info: time elapsed: %s" % (elapsedtime)) parserR1.close() parserR2.close() return splitfilenameleft, splitfilenameright, unsplitfilename
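# NOTE: split_into_chunks() and write_to_files() are called above but are not part of
# this excerpt. The sketches below are assumptions inferred from the call sites only
# (split a batch of read pairs into per-worker slices; write each worker's output to
# the matching open file handle); they are illustrations, not the actual SPAdes code.

def split_into_chunks(items, n_chunks):
    # distribute items into at most n_chunks consecutive, nearly equal slices
    n_chunks = max(1, min(n_chunks, len(items)))
    size, remainder = divmod(len(items), n_chunks)
    chunks, start = [], 0
    for i in range(n_chunks):
        end = start + size + (1 if i < remainder else 0)
        chunks.append(items[start:end])
        start = end
    return chunks


def write_to_files(file_handlers, texts):
    # texts[i] holds the FASTQ records destined for file_handlers[i]
    for handler, text in zip(file_handlers, texts):
        handler.write(text)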
def chimera_clean(infilename1, infilename2, dst, log, silent=True): starttime = time.time() basename1 = os.path.basename(infilename1) if os.path.splitext(basename1)[1] == '.gz': basename1 = os.path.splitext(basename1)[0] basename2 = os.path.basename(infilename2) if os.path.splitext(basename2)[1] == '.gz': basename2 = os.path.splitext(basename2)[0] #open four outfiles outfilename1 = os.path.join(dst, 'mates_ICC4_' + basename1) outfile1 = open(outfilename1, 'w') slagfilename1 = os.path.join(dst, 'non-mates_ICC4_' + basename1) slagfile1 = open(slagfilename1, 'w') outfilename2 = os.path.join(dst, 'mates_ICC4_' + basename2) outfile2 = open(outfilename2, 'w') slagfilename2 = os.path.join(dst, 'non-mates_ICC4_' + basename2) slagfile2 = open(slagfilename2, 'w') #set up regular expression patterns for chimera codes- for illumin use the reverse complements of right codes csslist1 = ['(TGGACTCCACTGTG){e<=1}', '(ACTTCGCCACTGTG){e<=1}', '(TGAGTCCCACTGTG){e<=1}', '(TGACTGCCACTGTG){e<=1}', '(TCAGGTCCACTGTG){e<=1}', '(ATGTCACCACTGTG){e<=1}', '(GTATGACCACTGTG){e<=1}', '(GTCTACCCACTGTG){e<=1}', '(GTTGGACCACTGTG){e<=1}', '(CGATTCCCACTGTG){e<=1}', '(GGTTACCCACTGTG){e<=1}', '(TCACCTCCACTGTG){e<=1}'] csslist2 = ['(TCCAGACCAATGTG){e<=1}', '(ACATCACCAATGTG){e<=1}', '(TCACGACCAATGTG){e<=1}', '(TAGCACCCAATGTG){e<=1}', '(AACCTCCCAATGTG){e<=1}', '(ACAACTCCAATGTG){e<=1}', '(GTCTAACCAATGTG){e<=1}', '(TACACGCCAATGTG){e<=1}', '(GAGAACCCAATGTG){e<=1}', '(GAGATTCCAATGTG){e<=1}', '(GACCTACCAATGTG){e<=1}', '(AGACTCCCAATGTG){e<=1}'] #PARSE both files in tuples of 4 lines parserR1 = ParseFastQ(infilename1) parserR2 = ParseFastQ(infilename2) all_stats = CleanStats() n_jobs = options_storage.threads while True: # prepare input reads1 = list(itertools.islice(parserR1, READS_PER_BATCH)) reads2 = list(itertools.islice(parserR2, READS_PER_BATCH)) if len(reads1) != len(reads2): support.error("lucigen_nxmate.py, chimera_clean: " "number of left reads (%d) is not equal to number of right reads (%d)!" % (len(reads1), len(reads2)), log) if not reads1: break chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs) # processing outputs = Parallel(n_jobs=n_jobs)(delayed(chimera_clean_process_batch)(reads, csslist1, csslist2) for reads in chunks) results, stats = [x[0] for x in outputs], [x[1] for x in outputs] # writing results for result, stat in zip(results, stats): write_to_files([outfile1, outfile2, slagfile1, slagfile2], result) all_stats += stat if not silent: log.info("==== chimera_clean progress: reads processed: %d, time elapsed: %s" % (all_stats.readcounter, time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime)))) parserR1.close() parserR2.close() outfile1.close() slagfile1.close() outfile2.close() slagfile2.close() if all_stats.TOTALmatecounter + all_stats.slagcounter != all_stats.readcounter: support.error("lucigen_nxmate.py, chimera_clean: error in the script somewhere! Unequal read counts!", log) if all_stats.readcounter == 0: support.error("lucigen_nxmate.py, chimera_clean: error in input data! Number of processed reads is 0!", log) if not silent: #print some stats percentmates = 100. * all_stats.matecounter / all_stats.readcounter percentslag = 100. * all_stats.slagcounter / all_stats.readcounter log.info("==== chimera_clean info: processing finished!") log.info("==== chimera_clean info: %d reads processed, %d true mate reads (%.2f %%) " "and %d non-mates/chimeras (%.2f %%)." 
% (all_stats.readcounter, all_stats.matecounter, percentmates, all_stats.slagcounter, percentslag)) shortmates = all_stats.TOTALmatecounter - all_stats.matecounter log.info("==== chimera_clean info: %d mates too short to keep after trimming" % shortmates) elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime)) log.info("==== chimera_clean info: time elapsed: %s" % (elapsedtime)) log.info("==== chimera_clean info: " + str(all_stats.csscounter)) return outfilename1, outfilename2
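# A hedged sketch of how the two cleaning stages above are typically chained for one
# Lucigen NxMate library: chimera_clean() first removes chimeric pairs, then
# nx_seq_junction() splits the surviving mates at the junction adaptor. The driver
# name is hypothetical; only chimera_clean() and nx_seq_junction() come from the
# code above.

def process_nxmate_library(left_reads, right_reads, dst, log):
    cleaned_left, cleaned_right = chimera_clean(left_reads, right_reads, dst, log, silent=False)
    split_left, split_right, unsplit = nx_seq_junction(cleaned_left, cleaned_right, dst, log, silent=False)
    return split_left, split_right, unsplit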
import os import time import support import gzip import itertools import sys from site import addsitedir import spades_init import options_storage try: import regex except ImportError: support.error("Can't process Lucigen NxMate reads! Python module regex is not installed!") addsitedir(spades_init.ext_python_modules_home) if sys.version.startswith('2.'): from joblib2 import Parallel, delayed elif sys.version.startswith('3.'): from joblib3 import Parallel, delayed # CONSTANTS READS_PER_THREAD = 25000 READS_PER_BATCH = READS_PER_THREAD * options_storage.threads # e.g. 100000 for 4 threads minseq = 25 # minimum length sequence to keep after trimming class ParseFastQ(object):
def fill_cfg(options_to_parse, log): try: options, not_options = getopt.gnu_getopt(options_to_parse, options_storage.short_options, options_storage.long_options) except getopt.GetoptError: _, exc, _ = sys.exc_info() sys.stderr.write(str(exc) + "\n") sys.stderr.flush() options_storage.usage(spades_version) sys.exit(1) if not options: options_storage.usage(spades_version) sys.exit(1) # all parameters are stored here cfg = dict() # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER paired-end libs and MAX_LIBS_NUMBER mate-pair libs dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER * 2)] options_storage.continue_mode = False for opt, arg in options: if opt == '-o': options_storage.output_dir = arg elif opt == "--tmp-dir": options_storage.tmp_dir = arg elif opt == "--reference": options_storage.reference = support.check_file_existence(arg, 'reference', log) elif opt == "--dataset": options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log) elif opt in options_storage.reads_options: support.add_to_dataset(opt, arg, dataset_data) elif opt == '-k': options_storage.k_mers = list(map(int, arg.split(","))) for k in options_storage.k_mers: if k > 127: support.error('wrong k value ' + str(k) + ': all k values should be less than 128', log) if k % 2 == 0: support.error('wrong k value ' + str(k) + ': all k values should be odd', log) elif opt == "--sc": options_storage.single_cell = True elif opt == "--disable-gzip-output": options_storage.disable_gzip_output = True elif opt == "--only-error-correction": if options_storage.only_assembler: support.error('you cannot specify --only-error-correction and --only-assembler simultaneously') options_storage.only_error_correction = True elif opt == "--only-assembler": if options_storage.only_error_correction: support.error('you cannot specify --only-error-correction and --only-assembler simultaneously') options_storage.only_assembler = True elif opt == "--bh-heap-check": options_storage.bh_heap_check = arg elif opt == "--spades-heap-check": options_storage.spades_heap_check = arg elif opt == "--continue": options_storage.continue_mode = True elif opt == '-t' or opt == "--threads": options_storage.threads = int(arg) elif opt == '-m' or opt == "--memory": options_storage.memory = int(arg) elif opt == "--phred-offset": if int(arg) in [33, 64]: options_storage.qvoffset = int(arg) else: support.error('wrong PHRED quality offset value ' + str(arg) + ': should be either 33 or 64', log) elif opt == '-i' or opt == "--iterations": options_storage.iterations = int(arg) elif opt == "--debug": options_storage.developer_mode = True elif opt == "--rectangles": options_storage.rectangles = True #corrector elif opt == "--mismatch-correction": options_storage.mismatch_corrector = True elif opt == "--careful": options_storage.mismatch_corrector = True options_storage.careful = True elif opt == '-h' or opt == "--help": options_storage.usage(spades_version) sys.exit(0) elif opt == "--help-hidden": options_storage.usage(spades_version, True) sys.exit(0) elif opt == "--test": options_storage.set_test_options() support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data) support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data) #break else: raise ValueError if not options_storage.output_dir: support.error("the output_dir is not set! 
It is a mandatory parameter (-o output_dir).", log) if not os.path.isdir(options_storage.output_dir): if options_storage.continue_mode: support.error("the output_dir should exist for --continue!", log) os.makedirs(options_storage.output_dir) if options_storage.continue_mode: return None, None if options_storage.dataset_yaml_filename: try: dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r')) except pyyaml.YAMLError: _, exc, _ = sys.exc_info() support.error('exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc)) dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename)) else: dataset_data = support.correct_dataset(dataset_data) dataset_data = support.relative2abs_paths(dataset_data, os.getcwd()) options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml") pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) support.check_dataset_reads(dataset_data, options_storage.only_assembler, log) if support.dataset_has_only_mate_pairs_libraries(dataset_data): support.error('you should specify at least one paired-end or unpaired library (only mate-pairs libraries were found)!') if options_storage.rectangles and (len(dataset_data) > 1): support.error('rectangle graph algorithm for repeat resolution cannot work with multiple libraries!') ### FILLING cfg cfg["common"] = empty_config() cfg["dataset"] = empty_config() if not options_storage.only_assembler: cfg["error_correction"] = empty_config() if not options_storage.only_error_correction: cfg["assembly"] = empty_config() # common cfg["common"].__dict__["output_dir"] = os.path.abspath(options_storage.output_dir) cfg["common"].__dict__["max_threads"] = options_storage.threads cfg["common"].__dict__["max_memory"] = options_storage.memory cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode # dataset section cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell cfg["dataset"].__dict__["yaml_filename"] = os.path.abspath(options_storage.dataset_yaml_filename) if options_storage.developer_mode and options_storage.reference: cfg["dataset"].__dict__["reference"] = options_storage.reference # error correction if (not options_storage.only_assembler) and (options_storage.iterations > 0): cfg["error_correction"].__dict__["output_dir"] = os.path.join(cfg["common"].output_dir, "corrected") cfg["error_correction"].__dict__["max_iterations"] = options_storage.iterations cfg["error_correction"].__dict__["gzip_output"] = not options_storage.disable_gzip_output if options_storage.qvoffset: cfg["error_correction"].__dict__["qvoffset"] = options_storage.qvoffset if options_storage.bh_heap_check: cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check if options_storage.tmp_dir: cfg["error_correction"].__dict__["tmp_dir"] = options_storage.tmp_dir else: cfg["error_correction"].__dict__["tmp_dir"] = cfg["error_correction"].output_dir cfg["error_correction"].tmp_dir = os.path.join(os.path.abspath(cfg["error_correction"].tmp_dir), 'tmp') # assembly if not options_storage.only_error_correction: if options_storage.k_mers: cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers else: cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers_short cfg["assembly"].__dict__["careful"] = options_storage.careful if options_storage.spades_heap_check: cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check 
#corrector can work only if contigs exist (not only error correction) if (not options_storage.only_error_correction) and options_storage.mismatch_corrector: cfg["mismatch_corrector"] = empty_config() cfg["mismatch_corrector"].__dict__["skip-masked"] = "" cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(bin_home, "bwa-spades") cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads cfg["mismatch_corrector"].__dict__["output-dir"] = options_storage.output_dir return cfg, dataset_data
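# empty_config() and merge_configs() are used by fill_cfg()/main() but are not defined
# in this excerpt. Minimal sketches of the contract the call sites rely on (an empty
# attribute bag filled via __dict__, and a merge in which the first argument's
# attributes take precedence). Names and details here are assumptions for
# illustration only.

class _AttrBag(object):
    pass


def empty_config():
    # an object whose fields are assigned via cfg.__dict__["key"] = value
    return _AttrBag()


def merge_configs(primary, secondary):
    # attributes of 'primary' override those of 'secondary'
    merged = empty_config()
    merged.__dict__.update(secondary.__dict__)
    merged.__dict__.update(primary.__dict__)
    return merged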
def main(): os.environ["LC_ALL"] = "C" if len(sys.argv) == 1: options_storage.usage(spades_version) sys.exit(0) log = logging.getLogger('spades') log.setLevel(logging.DEBUG) console = logging.StreamHandler(sys.stdout) console.setFormatter(logging.Formatter('%(message)s')) console.setLevel(logging.DEBUG) log.addHandler(console) check_binaries(bin_home, log) # parse options and safe all parameters to cfg cfg, dataset_data = fill_cfg(sys.argv, log) if options_storage.continue_mode: cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt")) if not options: support.error("failed to parse command line of the previous run! Please restart from the beginning.") cfg, dataset_data = fill_cfg(options, log) options_storage.continue_mode = True log_filename = os.path.join(cfg["common"].output_dir, "spades.log") if options_storage.continue_mode: log_handler = logging.FileHandler(log_filename, mode='a') else: log_handler = logging.FileHandler(log_filename, mode='w') log.addHandler(log_handler) if options_storage.continue_mode: log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n") log.info("Restored from " + cmd_line) else: params_filename = os.path.join(cfg["common"].output_dir, "params.txt") params_handler = logging.FileHandler(params_filename, mode='w') log.addHandler(params_handler) command = "Command line:" for v in sys.argv: command += " " + v log.info(command) print_used_values(cfg, log) log.removeHandler(params_handler) log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n") # splitting interlaced reads if needed if support.dataset_has_interlaced_reads(dataset_data): dir_for_split_reads = os.path.join(os.path.abspath(options_storage.output_dir), 'split_reads') if not os.path.isdir(dir_for_split_reads): os.makedirs(dir_for_split_reads) dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log) options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml") pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) cfg["dataset"].yaml_filename = os.path.abspath(options_storage.dataset_yaml_filename) try: # copying configs before all computations (to prevent its changing at run time) tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs") if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode: shutil.rmtree(tmp_configs_dir) if not os.path.isdir(tmp_configs_dir): shutil.copytree(os.path.join(spades_home, "configs"), tmp_configs_dir) corrected_dataset_yaml_filename = '' if "error_correction" in cfg: bh_cfg = merge_configs(cfg["error_correction"], cfg["common"]) bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml") if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode: log.info("\n===== Skipping read error correction (already processed). \n") else: options_storage.continue_mode = False # continue from here if "HEAPCHECK" in os.environ: del os.environ["HEAPCHECK"] if "heap_check" in bh_cfg.__dict__: os.environ["HEAPCHECK"] = bh_cfg.heap_check if os.path.exists(bh_cfg.output_dir): shutil.rmtree(bh_cfg.output_dir) os.makedirs(bh_cfg.output_dir) if not os.path.exists(bh_cfg.tmp_dir): os.makedirs(bh_cfg.tmp_dir) log.info("\n===== Read error correction started. 
\n") bh_logic.run_bh(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, ext_python_modules_home, log) log.info("\n===== Read error correction finished. \n") result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta") result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta") misc_dir = os.path.join(cfg["common"].output_dir, "misc") ### if mismatch correction is enabled then result contigs are copied to misc directory assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta") assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta") if "assembly" in cfg: spades_cfg = merge_configs(cfg["assembly"], cfg["common"]) spades_cfg.__dict__["result_contigs"] = result_contigs_filename spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename spades_cfg.__dict__["additional_contigs"] = os.path.join(spades_cfg.output_dir, "simplified_contigs.fasta") if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs) or ("mismatch_corrector" in cfg and os.path.isfile(assembled_contigs_filename))): log.info("\n===== Skipping assembling (already processed). \n") # calculating latest_dir for the next stages latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*")) if not latest_dir: support.error("failed to continue the previous run! Please restart from the beginning.") else: if os.path.isfile(corrected_dataset_yaml_filename): dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r')) dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename)) if support.dataset_has_paired_reads(dataset_data): spades_cfg.__dict__["paired_mode"] = True else: spades_cfg.__dict__["paired_mode"] = False if options_storage.rectangles: spades_cfg.__dict__["resolving_mode"] = "rectangles" if "HEAPCHECK" in os.environ: del os.environ["HEAPCHECK"] if "heap_check" in spades_cfg.__dict__: os.environ["HEAPCHECK"] = spades_cfg.heap_check log.info("\n===== Assembling started.\n") # creating dataset dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info") if not os.path.isfile(dataset_filename) or not options_storage.continue_mode: dataset_file = open(dataset_filename, 'w') import process_cfg dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n') if os.path.isfile(corrected_dataset_yaml_filename): dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n') else: dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n') if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__: dataset_file.write("reference_genome" + '\t') dataset_file.write(process_cfg.process_spaces(os.path.abspath(cfg["dataset"].reference)) + '\n') dataset_file.close() spades_cfg.__dict__["dataset"] = dataset_filename latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, log) #rectangles if spades_cfg.paired_mode and options_storage.rectangles: if options_storage.continue_mode: # TODO: continue mode support.warning("sorry, --continue doesn't work with --rectangles yet. 
Skipping repeat resolving.") else: sys.path.append(os.path.join(python_modules_home, "rectangles")) import rrr rrr_input_dir = os.path.join(latest_dir, "saves") rrr_outpath = os.path.join(spades_cfg.output_dir, "rectangles") if not os.path.exists(rrr_outpath): os.mkdir(rrr_outpath) rrr_reference_information_file = os.path.join(rrr_input_dir, "late_pair_info_counted_etalon_distance.txt") rrr_test_util = rrr.TestUtils(rrr_reference_information_file, os.path.join(rrr_outpath, "rectangles.log")) rrr.resolve(rrr_input_dir, rrr_outpath, rrr_test_util, "", cfg["dataset"].single_cell, spades_cfg.careful) shutil.copyfile(os.path.join(rrr_outpath, "rectangles_extend_before_scaffold.fasta"), spades_cfg.result_contigs) shutil.copyfile(os.path.join(rrr_outpath, "rectangles_extend.fasta"), spades_cfg.result_scaffolds) if not spades_cfg.developer_mode: if os.path.exists(rrr_input_dir): shutil.rmtree(rrr_input_dir) if os.path.exists(rrr_outpath): shutil.rmtree(rrr_outpath, True) if os.path.exists(rrr_outpath): os.system('rm -r ' + rrr_outpath) #EOR if os.path.isdir(misc_dir) and not options_storage.continue_mode: shutil.rmtree(misc_dir) if not os.path.isdir(misc_dir): os.makedirs(misc_dir) if os.path.isfile(spades_cfg.additional_contigs): shutil.move(spades_cfg.additional_contigs, misc_dir) log.info("\n===== Assembling finished. \n") #corrector if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename) or (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))): to_correct = dict() to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename) if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and os.path.isfile(assembled_scaffolds_filename)): to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename) # moving assembled contigs (scaffolds) to misc dir for k, (old, new) in to_correct.items(): if options_storage.continue_mode and os.path.isfile(new): continue shutil.move(old, new) if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \ (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)): log.info("\n===== Skipping mismatch correction (already processed). \n") else: log.info("\n===== Mismatch correction started.") # detecting paired-end library with the largest insert size dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r')) ### initial dataset, i.e. 
before error correction dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename)) paired_end_libraries_ids = [] for id, reads_library in enumerate(dataset_data): if reads_library['type'] == 'paired-end': paired_end_libraries_ids.append(id) if not len(paired_end_libraries_ids): support.error('Mismatch correction cannot be performed without at least one paired-end library!') estimated_params = load_config_from_file(os.path.join(latest_dir, "_est_params.info")) max_insert_size = -1 target_paired_end_library_id = -1 for id in paired_end_libraries_ids: if float(estimated_params.__dict__["insert_size_" + str(id)]) > max_insert_size: max_insert_size = float(estimated_params.__dict__["insert_size_" + str(id)]) target_paired_end_library_id = id yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename) cfg["mismatch_corrector"].__dict__["1"] = list(map(lambda x: os.path.join(yaml_dirname, x), dataset_data[target_paired_end_library_id]['left reads'])) cfg["mismatch_corrector"].__dict__["2"] = list(map(lambda x: os.path.join(yaml_dirname, x), dataset_data[target_paired_end_library_id]['right reads'])) cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_insert_size) #TODO: add reads orientation import corrector corrector_cfg = cfg["mismatch_corrector"] args = [] for key, values in corrector_cfg.__dict__.items(): if key == "output-dir": continue # for processing list of reads if not isinstance(values, list): values = [values] for value in values: if len(key) == 1: args.append('-' + key) else: args.append('--' + key) if value: args.append(value) # processing contigs and scaffolds (or only contigs) for k, (corrected, assembled) in to_correct.items(): if options_storage.continue_mode and os.path.isfile(corrected): log.info("\n== Skipping processing of " + k + " (already processed)\n") continue options_storage.continue_mode = False log.info("\n== Processing of " + k + "\n") cur_args = args[:] cur_args += ['-c', assembled] tmp_dir_for_corrector = os.path.join(corrector_cfg.__dict__["output-dir"], "mismatch_corrector_" + k) cur_args += ['--output-dir', tmp_dir_for_corrector] # correcting corrector.main(cur_args, ext_python_modules_home, log) result_corrected_filename = os.path.abspath(os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")) # moving corrected contigs (scaffolds) to SPAdes output dir if os.path.isfile(result_corrected_filename): shutil.move(result_corrected_filename, corrected) if os.path.isdir(tmp_dir_for_corrector): shutil.rmtree(tmp_dir_for_corrector) log.info("\n===== Mismatch correction finished.\n") if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir): shutil.rmtree(tmp_configs_dir) #log.info("") if os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)): log.info(" * Corrected reads are in " + os.path.dirname(corrected_dataset_yaml_filename) + "/") if os.path.isfile(result_contigs_filename): log.info(" * Assembled contigs are in " + result_contigs_filename) if os.path.isfile(result_scaffolds_filename): log.info(" * Assembled scaffolds are in " + result_scaffolds_filename) #log.info("") #breaking scaffolds if os.path.isfile(result_scaffolds_filename): if not os.path.isdir(misc_dir): os.makedirs(misc_dir) result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta") threshold = 3 if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode: support.break_scaffolds(result_scaffolds_filename, threshold, result_broken_scaffolds) #log.info(" * 
Scaffolds broken by " + str(threshold) + " Ns are in " + result_broken_scaffolds) ### printing WARNINGS SUMMARY if not support.log_warnings(log): log.info("\n======= SPAdes pipeline finished.") # otherwise it finished WITH WARNINGS log.info("\nSPAdes log can be found here: " + log_filename) log.info("") log.info("Thank you for using SPAdes!") log.removeHandler(log_handler) except Exception: _, exc, _ = sys.exc_info() log.exception(exc) support.error("exception caught", log)
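# support.break_scaffolds() (used above to produce misc/broken_scaffolds.fasta) is not
# shown in this excerpt. A hedged sketch of the behaviour implied by the call site and
# the log message: split each scaffold at runs of at least `threshold` consecutive Ns
# and write the fragments to a new FASTA file. The use of support.read_fasta() /
# support.write_fasta() here is an assumption for illustration.
import re

def break_scaffolds(input_filename, threshold, output_filename):
    broken = []
    for name, seq in support.read_fasta(input_filename):
        for i, fragment in enumerate(re.split('N{%d,}' % threshold, seq)):
            if fragment:
                broken.append((name + "_%d" % (i + 1), fragment))
    support.write_fasta(output_filename, broken)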
def main(ds_args_list, general_args_list, spades_home, bin_home): log = logging.getLogger('dipspades') log.setLevel(logging.DEBUG) console = logging.StreamHandler(sys.stdout) console.setFormatter(logging.Formatter('%(message)s')) console.setLevel(logging.DEBUG) log.addHandler(console) support.check_binaries(bin_home, log) ds_args = parse_arguments(ds_args_list, log) if not os.path.exists(ds_args.output_dir): os.makedirs(ds_args.output_dir) log_filename = os.path.join(ds_args.output_dir, "dipspades.log") if os.path.exists(log_filename): os.remove(log_filename) log_handler = logging.FileHandler(log_filename, mode='a') log.addHandler(log_handler) params_filename = os.path.join(ds_args.output_dir, "params.txt") params_handler = logging.FileHandler(params_filename, mode='a') log.addHandler(params_handler) log.info("\n") log.info("General command line: " + " ".join(general_args_list) + "\n") log.info("dipSPAdes command line: " + " ".join(ds_args_list) + "\n") print_ds_args(ds_args, log) log.removeHandler(params_handler) log.info("\n======= dipSPAdes started. Log can be found here: " + log_filename + "\n") write_haplocontigs_in_file(ds_args.haplocontigs, ds_args.haplocontigs_fnames) config_fname = prepare_configs(os.path.join(spades_home, "configs", "dipspades"), ds_args, log) ds_args.tmp_dir = support.get_tmp_dir(prefix="dipspades_", base_dir=ds_args.tmp_dir) prepare_config(config_fname, ds_args, log) try: log.info("===== Assembling started.\n") binary_path = os.path.join(bin_home, "dipspades") command = [binary_path, config_fname] support.sys_call(command, log) log.info("\n===== Assembling finished.\n") print_ds_output(ds_args.output_dir, log) if os.path.isdir(ds_args.tmp_dir): shutil.rmtree(ds_args.tmp_dir) log.info("\n======= dipSPAdes finished.\n") log.info("dipSPAdes log can be found here: " + log_filename + "\n") log.info("Thank you for using dipSPAdes!") log.removeHandler(log_handler) except Exception: exc_type, exc_value, _ = sys.exc_info() if exc_type == SystemExit: sys.exit(exc_value) else: if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error support.error("It looks like you are using SPAdes binaries for another platform.\n" + support.get_spades_binaries_info_message(), dipspades=True) else: log.exception(exc_value) support.error("exception caught: %s" % exc_type, log) except BaseException: # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException exc_type, exc_value, _ = sys.exc_info() if exc_type == SystemExit: sys.exit(exc_value) else: log.exception(exc_value) support.error("exception caught: %s" % exc_type, log, dipspades=True)
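# support.get_tmp_dir() is called above but not defined in this excerpt; a hedged
# sketch of the behaviour implied by the call site (create and return a unique scratch
# directory, optionally under a caller-supplied base directory). Illustrative only.
import os
import tempfile

def get_tmp_dir(prefix="", base_dir=None):
    if base_dir:
        if not os.path.isdir(base_dir):
            os.makedirs(base_dir)
        return tempfile.mkdtemp(prefix=prefix, dir=base_dir)
    return tempfile.mkdtemp(prefix=prefix)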
def main(): all_long_options = list(set(options_storage.long_options + dipspades_logic.DS_Args_List.long_options)) all_short_options = options_storage.short_options + dipspades_logic.DS_Args_List.short_options dipspades_logic_args = [] spades_py_args = ["--diploid"] try: options, not_options = getopt.gnu_getopt(sys.argv, all_short_options, all_long_options) except getopt.GetoptError: _, exc, _ = sys.exc_info() sys.stderr.write(str(exc) + "\n") options_storage.usage(spades_version, mode="dip") sys.stderr.flush() sys.exit(1) if not options: options_storage.usage(spades_version, mode="dip") sys.stderr.flush() sys.exit(1) output_dir = None spades_py_run_needed = False for opt, arg in options: # processing some special options if opt == '--test': output_dir = abspath("test_dipspades") spades_py_args = ["--diploid", "-1", os.path.join(spades_init.spades_home, "test_dataset/ecoli_1K_1.fq.gz"), "-2", os.path.join(spades_init.spades_home, "test_dataset/ecoli_1K_2.fq.gz"), "--only-assembler"] dipspades_logic_args = [] spades_py_run_needed = True break if opt == '-o': output_dir = abspath(expanduser(arg)) elif opt == '--careful' or opt == '--mismatch-correction': continue if opt == '-v' or opt == '--version': options_storage.version(spades_version, mode="dip") sys.exit(0) if opt == '-h' or opt == '--help': options_storage.usage(spades_version, mode="dip") sys.exit(0) elif opt == "--help-hidden": options_storage.usage(spades_version, show_hidden=True, mode="dip") sys.exit(0) # for all other options cur_opt_arg = [opt] if arg: cur_opt_arg.append(arg) if opt.startswith("--"): # long option if opt[2:] in options_storage.long_options or (opt[2:] + "=") in options_storage.long_options: spades_py_args += cur_opt_arg if opt[2:] in dipspades_logic.DS_Args_List.long_options or (opt[2:] + "=") in dipspades_logic.DS_Args_List.long_options: dipspades_logic_args += cur_opt_arg else: spades_py_run_needed = True else: dipspades_logic_args += cur_opt_arg else: # short option if opt != '-o': if opt[1:] in options_storage.short_options: spades_py_args += cur_opt_arg if opt[1:] in dipspades_logic.DS_Args_List.short_options: dipspades_logic_args += cur_opt_arg else: spades_py_run_needed = True else: dipspades_logic_args += cur_opt_arg if not output_dir: support.error("The output_dir is not set! It is a mandatory parameter (-o output_dir).", dipspades=True) spades_output_dir = os.path.join(output_dir, "spades") dipspades_output_dir = os.path.join(output_dir, "dipspades") if not os.path.isdir(output_dir): os.makedirs(output_dir) if not os.path.isdir(spades_output_dir): os.makedirs(spades_output_dir) if not os.path.isdir(dipspades_output_dir): os.makedirs(dipspades_output_dir) spades_result = "" if spades_py_run_needed: spades_py_args += ["-o", spades_output_dir] spades.main(spades_py_args) spades_result = os.path.join(spades_output_dir, "contigs.fasta") if not os.path.isfile(spades_result): support.error("Something went wrong and SPAdes did not generate haplocontigs. " "DipSPAdes cannot proceed without them, aborting.", dipspades=True) dipspades_logic_args += ["-o", dipspades_output_dir] if spades_result != "": dipspades_logic_args += ["--hap", spades_result] dipspades_logic.main(dipspades_logic_args, sys.argv, spades.spades_home, spades.bin_home)
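# Typical invocations of the dipSPAdes wrapper above (options taken from the parsing
# logic in main(); file names are placeholders):
#
#   dipspades.py --test
#   dipspades.py -1 left.fastq -2 right.fastq -o output_dir
#
# Options known to spades.py are forwarded to it, dipSPAdes-specific options go to
# dipspades_logic, and -o is expanded into separate spades/ and dipspades/
# subdirectories of output_dir. A minimal entry point consistent with main() above:
if __name__ == '__main__':
    main()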
def fill_cfg(options_to_parse, log): try: options, not_options = getopt.gnu_getopt(options_to_parse, options_storage.short_options, options_storage.long_options) except getopt.GetoptError: _, exc, _ = sys.exc_info() sys.stderr.write(str(exc) + "\n") sys.stderr.flush() options_storage.usage(spades_version) sys.exit(1) if not options: options_storage.usage(spades_version) sys.exit(1) if len(not_options) > 1: for opt, arg in options: if opt == "-k" and arg.strip().endswith(','): support.error("Do not put spaces after commas in the list of k-mers sizes! Correct example: -k 21,33,55", log) support.error("Please specify option (e.g. -1, -2, -s, etc) for the following paths: " + ", ".join(not_options[1:]) + "\n", log) # all parameters are stored here cfg = dict() # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER for each type of short-reads libs dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER * len(options_storage.SHORT_READS_TYPES.keys()))] # "[{}] * num" doesn't work here! # for parsing options from "previous run command" options_storage.continue_mode = False options_storage.k_mers = None for opt, arg in options: if opt == '-o': options_storage.output_dir = os.path.abspath(arg) elif opt == "--tmp-dir": options_storage.tmp_dir = os.path.abspath(arg) elif opt == "--reference": options_storage.reference = support.check_file_existence(arg, 'reference', log) elif opt == "--dataset": options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log) elif opt in options_storage.reads_options: support.add_to_dataset(opt, arg, dataset_data) elif opt == '-k': if arg == 'auto': options_storage.k_mers = arg else: options_storage.k_mers = list(map(int, arg.split(","))) for k in options_storage.k_mers: if k < options_storage.MIN_K or k > options_storage.MAX_K: support.error('wrong k value ' + str(k) + ': all k values should be between %d and %d' % (options_storage.MIN_K, options_storage.MAX_K), log) if k % 2 == 0: support.error('wrong k value ' + str(k) + ': all k values should be odd', log) elif opt == "--sc": options_storage.single_cell = True elif opt == "--iontorrent": options_storage.iontorrent = True elif opt == "--disable-gzip-output": options_storage.disable_gzip_output = True elif opt == "--disable-gzip-output:false": options_storage.disable_gzip_output = False elif opt == "--disable-rr": options_storage.disable_rr = True elif opt == "--disable-rr:false": options_storage.disable_rr = False elif opt == "--only-error-correction": if options_storage.only_assembler: support.error('you cannot specify --only-error-correction and --only-assembler simultaneously') options_storage.only_error_correction = True elif opt == "--only-assembler": if options_storage.only_error_correction: support.error('you cannot specify --only-error-correction and --only-assembler simultaneously') options_storage.only_assembler = True elif opt == "--read-buffer-size": options_storage.read_buffer_size = int(arg) elif opt == "--bh-heap-check": options_storage.bh_heap_check = arg elif opt == "--spades-heap-check": options_storage.spades_heap_check = arg elif opt == "--continue": options_storage.continue_mode = True elif opt == "--restart-from": if arg not in ['ec', 'as', 'mc'] and not arg.startswith('k'): support.error("wrong value for --restart-from option: " + arg + " (only 'ec', 'as', 'k<int>', 'mc' are available)", log) options_storage.continue_mode = True options_storage.restart_from = arg elif opt == '-t' or opt == "--threads": options_storage.threads = int(arg) elif 
opt == '-m' or opt == "--memory": options_storage.memory = int(arg) elif opt == "--phred-offset": if arg == 'auto': options_storage.qvoffset = arg elif arg in ['33', '64']: options_storage.qvoffset = int(arg) else: support.error('wrong PHRED quality offset value ' + str(arg) + ': should be either 33 or 64', log) elif opt == '-i' or opt == "--iterations": options_storage.iterations = int(arg) elif opt == "--debug": options_storage.developer_mode = True elif opt == "--debug:false": options_storage.developer_mode = False #corrector elif opt == "--mismatch-correction": options_storage.mismatch_corrector = True elif opt == "--mismatch-correction:false": options_storage.mismatch_corrector = False elif opt == "--careful": options_storage.mismatch_corrector = True options_storage.careful = True elif opt == "--careful:false": options_storage.mismatch_corrector = False options_storage.careful = False elif opt == '-h' or opt == "--help": options_storage.usage(spades_version) sys.exit(0) elif opt == "--help-hidden": options_storage.usage(spades_version, True) sys.exit(0) elif opt == "--test": options_storage.set_test_options() support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data) support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data) #break elif opt == "--diploid": options_storage.diploid_mode = True else: raise ValueError if not options_storage.output_dir: support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log) if not os.path.isdir(options_storage.output_dir): if options_storage.continue_mode: support.error("the output_dir should exist for --continue and for --restart-from!", log) os.makedirs(options_storage.output_dir) if options_storage.restart_from: if options_storage.continue_mode: # saving parameters specified with --restart-from if not support.dataset_is_empty(dataset_data): support.error("you cannot specify reads with --restart-from option!", log) options_storage.save_restart_options(log) else: # overriding previous run parameters options_storage.load_restart_options() if options_storage.continue_mode: return None, None if options_storage.dataset_yaml_filename: try: dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r')) except pyyaml.YAMLError: _, exc, _ = sys.exc_info() support.error('exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc)) dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename)) else: dataset_data = support.correct_dataset(dataset_data) dataset_data = support.relative2abs_paths(dataset_data, os.getcwd()) options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml") pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) support.check_dataset_reads(dataset_data, options_storage.only_assembler, log) if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION): support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!') options_storage.set_default_values() ### FILLING cfg cfg["common"] = empty_config() cfg["dataset"] = empty_config() if not options_storage.only_assembler: cfg["error_correction"] = empty_config() if not options_storage.only_error_correction: cfg["assembly"] = empty_config() # common cfg["common"].__dict__["output_dir"] = options_storage.output_dir 
cfg["common"].__dict__["tmp_dir"] = options_storage.tmp_dir cfg["common"].__dict__["max_threads"] = options_storage.threads cfg["common"].__dict__["max_memory"] = options_storage.memory cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode # dataset section cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell cfg["dataset"].__dict__["iontorrent"] = options_storage.iontorrent cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename if options_storage.developer_mode and options_storage.reference: cfg["dataset"].__dict__["reference"] = options_storage.reference # error correction if (not options_storage.only_assembler) and (options_storage.iterations > 0): cfg["error_correction"].__dict__["output_dir"] = os.path.join(cfg["common"].output_dir, "corrected") cfg["error_correction"].__dict__["max_iterations"] = options_storage.iterations cfg["error_correction"].__dict__["gzip_output"] = not options_storage.disable_gzip_output if options_storage.qvoffset: cfg["error_correction"].__dict__["qvoffset"] = options_storage.qvoffset if options_storage.bh_heap_check: cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent # assembly if not options_storage.only_error_correction: if options_storage.k_mers: cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers else: cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT cfg["assembly"].__dict__["careful"] = options_storage.careful cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode if options_storage.spades_heap_check: cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check if options_storage.read_buffer_size: cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size #corrector can work only if contigs exist (not only error correction) if (not options_storage.only_error_correction) and options_storage.mismatch_corrector: cfg["mismatch_corrector"] = empty_config() cfg["mismatch_corrector"].__dict__["skip-masked"] = None cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(bin_home, "bwa-spades") cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads cfg["mismatch_corrector"].__dict__["output-dir"] = options_storage.output_dir return cfg, dataset_data
def main(args): os.environ["LC_ALL"] = "C" if len(args) == 1: options_storage.usage(spades_version) sys.exit(0) log = logging.getLogger('spades') log.setLevel(logging.DEBUG) console = logging.StreamHandler(sys.stdout) console.setFormatter(logging.Formatter('%(message)s')) console.setLevel(logging.DEBUG) log.addHandler(console) support.check_binaries(bin_home, log) # parse options and safe all parameters to cfg cfg, dataset_data = fill_cfg(args, log) if options_storage.continue_mode: cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt")) if not options: support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.") cfg, dataset_data = fill_cfg(options, log) if options_storage.restart_from: check_cfg_for_restart_from(cfg) options_storage.continue_mode = True log_filename = os.path.join(cfg["common"].output_dir, "spades.log") if options_storage.continue_mode: log_handler = logging.FileHandler(log_filename, mode='a') else: log_handler = logging.FileHandler(log_filename, mode='w') log.addHandler(log_handler) if options_storage.continue_mode: log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n") log.info("Restored from " + cmd_line) if options_storage.restart_from: updated_params = "" flag = False for v in args[1:]: if v == '-o' or v == '--restart-from': flag = True continue if flag: flag = False continue updated_params += " " + v updated_params = updated_params.strip() log.info("with updated parameters: " + updated_params) cmd_line += " " + updated_params log.info("") params_filename = os.path.join(cfg["common"].output_dir, "params.txt") params_handler = logging.FileHandler(params_filename, mode='w') log.addHandler(params_handler) if options_storage.continue_mode: log.info(cmd_line) else: command = "Command line:" for v in args: command += " " + v log.info(command) # special case if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'): support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log) del cfg["mismatch_corrector"] print_used_values(cfg, log) log.removeHandler(params_handler) if not options_storage.continue_mode: log.info("\n======= SPAdes pipeline started. 
Log can be found here: " + log_filename + "\n") # splitting interlaced reads and processing Ns in additional contigs if needed if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data): dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input') if support.dataset_has_interlaced_reads(dataset_data): if not os.path.isdir(dir_for_split_reads): os.makedirs(dir_for_split_reads) dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log) if support.dataset_has_additional_contigs(dataset_data): dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log) options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml") pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w')) cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename try: # copying configs before all computations (to prevent its changing at run time) tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs") if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode: shutil.rmtree(tmp_configs_dir) if not os.path.isdir(tmp_configs_dir): dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False) corrected_dataset_yaml_filename = '' if "error_correction" in cfg: STAGE_NAME = "Read error correction" bh_cfg = merge_configs(cfg["error_correction"], cfg["common"]) corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml") if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \ and not options_storage.restart_from == "ec": log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME) else: support.continue_from_here(log) if "HEAPCHECK" in os.environ: del os.environ["HEAPCHECK"] if "heap_check" in bh_cfg.__dict__: os.environ["HEAPCHECK"] = bh_cfg.heap_check if os.path.exists(bh_cfg.output_dir): shutil.rmtree(bh_cfg.output_dir) os.makedirs(bh_cfg.output_dir) if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES): not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES) to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES) to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml") pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w')) bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename else: not_used_dataset_data = None bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename log.info("\n===== %s started. \n" % STAGE_NAME) hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, not_used_dataset_data, ext_python_modules_home, log) log.info("\n===== %s finished. 
\n" % STAGE_NAME) result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta") result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta") misc_dir = os.path.join(cfg["common"].output_dir, "misc") ### if mismatch correction is enabled then result contigs are copied to misc directory assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta") assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta") if "assembly" in cfg: STAGE_NAME = "Assembling" spades_cfg = merge_configs(cfg["assembly"], cfg["common"]) spades_cfg.__dict__["result_contigs"] = result_contigs_filename spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs) or ("mismatch_corrector" in cfg and os.path.isfile(assembled_contigs_filename)))\ and not options_storage.restart_from == 'as' \ and not (options_storage.restart_from and options_storage.restart_from.startswith('k')): log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME) # calculating latest_dir for the next stages latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*")) if not latest_dir: support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log) else: old_result_files = [result_contigs_filename, result_scaffolds_filename, assembled_contigs_filename, assembled_scaffolds_filename] for format in [".fasta", ".fastg"]: for old_result_file in old_result_files: if os.path.isfile(old_result_file[:-6] + format): os.remove(old_result_file[:-6] + format) if options_storage.restart_from == 'as': support.continue_from_here(log) if os.path.isfile(corrected_dataset_yaml_filename): dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r')) dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename)) if spades_cfg.disable_rr: spades_cfg.__dict__["rr_enable"] = False else: spades_cfg.__dict__["rr_enable"] = True if "HEAPCHECK" in os.environ: del os.environ["HEAPCHECK"] if "heap_check" in spades_cfg.__dict__: os.environ["HEAPCHECK"] = spades_cfg.heap_check log.info("\n===== %s started.\n" % STAGE_NAME) # creating dataset dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info") if not os.path.isfile(dataset_filename) or not options_storage.continue_mode: dataset_file = open(dataset_filename, 'w') import process_cfg dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n') if os.path.isfile(corrected_dataset_yaml_filename): dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n') else: dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n') if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__: dataset_file.write("reference_genome" + '\t') dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n') dataset_file.close() spades_cfg.__dict__["dataset"] = dataset_filename latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log) if os.path.isdir(misc_dir) and not options_storage.continue_mode: shutil.rmtree(misc_dir) if not os.path.isdir(misc_dir): os.makedirs(misc_dir) if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith('k'): k_str = 
options_storage.restart_from[1:] if k_str.find(":") != -1: k_str = k_str[:k_str.find(":")] support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log) log.info("\n===== %s finished. \n" % STAGE_NAME) #corrector if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename) or (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))): STAGE_NAME = "Mismatch correction" to_correct = dict() to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename) if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and os.path.isfile(assembled_scaffolds_filename)): to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename) # moving assembled contigs (scaffolds) to misc dir for assembly_type, (old, new) in to_correct.items(): if options_storage.continue_mode and os.path.isfile(new): continue for format in [".fasta", ".fastg"]: if os.path.isfile(old[:-6] + format): shutil.move(old[:-6] + format, new[:-6] + format) if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \ (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \ and not options_storage.restart_from == 'mc': log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME) else: if options_storage.restart_from == 'mc': support.continue_from_here(log) log.info("\n===== %s started." % STAGE_NAME) # detecting paired-end library with the largest insert size est_params_data = pyyaml.load(open(os.path.join(latest_dir, "final.lib_data"), 'r')) max_IS_library = None for reads_library in est_params_data: if reads_library['type'] == 'paired-end': if not max_IS_library or float(reads_library["insert size mean"]) > float(max_IS_library["insert size mean"]): max_IS_library = reads_library if not max_IS_library: support.error('Mismatch correction cannot be performed without at least one paired-end library!', log) if not max_IS_library["insert size mean"]: support.warning('Failed to estimate insert size for all paired-end libraries. 
Starting Mismatch correction'
                                    ' based on the first paired-end library and with default insert size.', log)
                else:
                    cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_IS_library["insert size mean"])
                yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename)
                cfg["mismatch_corrector"].__dict__["1"] = list(map(lambda x: os.path.join(yaml_dirname, x), max_IS_library['left reads']))
                cfg["mismatch_corrector"].__dict__["2"] = list(map(lambda x: os.path.join(yaml_dirname, x), max_IS_library['right reads']))
                #TODO: add reads orientation

                import corrector
                corrector_cfg = cfg["mismatch_corrector"]
                args = []
                for key, values in corrector_cfg.__dict__.items():
                    if key == "output-dir":
                        continue

                    # for processing list of reads
                    if not isinstance(values, list):
                        values = [values]
                    for value in values:
                        if len(key) == 1:
                            args.append('-' + key)
                        else:
                            args.append('--' + key)
                        if value is not None:
                            args.append(value)

                # processing contigs and scaffolds (or only contigs)
                for assembly_type, (corrected, assembled) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(corrected):
                        log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                        continue

                    support.continue_from_here(log)
                    log.info("\n== Processing of " + assembly_type + "\n")

                    cur_args = args[:]
                    cur_args += ['-c', assembled]
                    tmp_dir_for_corrector = support.get_tmp_dir(prefix="mis_cor_%s_" % assembly_type)
                    cur_args += ['--output-dir', tmp_dir_for_corrector]

                    # correcting
                    corrector.main(cur_args, ext_python_modules_home, log)

                    result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
                    # moving corrected contigs (scaffolds) to SPAdes output dir
                    if os.path.isfile(result_corrected_filename):
                        shutil.move(result_corrected_filename, corrected)
                    if os.path.isdir(tmp_dir_for_corrector):
                        shutil.rmtree(tmp_dir_for_corrector)

                    assembled_fastg = assembled[:-6] + ".fastg"
                    if os.path.isfile(assembled_fastg):
                        support.create_fastg_from_fasta(corrected, assembled_fastg, log)
                log.info("\n===== %s finished.\n" % STAGE_NAME)

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        #log.info("")
        if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
        if "assembly" in cfg and os.path.isfile(result_contigs_filename):
            message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
            if os.path.isfile(result_contigs_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_contigs_filename[:-6] + ".fastg") + ")"
            log.info(message)
        if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
            message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
            if os.path.isfile(result_scaffolds_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_scaffolds_filename[:-6] + ".fastg") + ")"
            log.info(message)
        #log.info("")

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename, options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds, broken_scaffolds)
                    #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                    #         " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                if os.path.isfile(result_filename):
                    result_fasta = list(support.read_fasta(result_filename))
                    # correctness check: should be one contig of length 1000 bp
                    correct_number = 1
                    correct_length = 1000
                    if not len(result_fasta):
                        support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                    elif len(result_fasta) > correct_number:
                        support.error("TEST FAILED: %s contains more than %d contig (%d)!" % (result_filename, correct_number, len(result_fasta)))
                    elif len(result_fasta[0][1]) != correct_length:
                        if len(result_fasta[0][1]) > correct_length:
                            relation = "more"
                        else:
                            relation = "less"
                        support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" % (result_filename, relation, correct_length, len(result_fasta[0][1])))
                else:
                    support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
def main(args):
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        show_usage(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    options = args
    cfg, dataset_data = fill_cfg(options, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
        cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
        if options_storage.restart_from:
            check_cfg_for_partial_run(cfg, type='restart-from')
        options_storage.continue_mode = True
    if options_storage.stop_after:
        check_cfg_for_partial_run(cfg, type='stop-after')

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            updated_params = ""
            skip_next = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    skip_next = True
                    continue
                if skip_next or v.startswith('--restart-from='):  # you can specify '--restart-from=k33' but not '-o=out_dir'
                    skip_next = False
                    continue
                updated_params += " " + v
            updated_params = updated_params.strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        command = "Command line: "
        for v in args:
            # substituting relative paths with absolute ones (read paths, output dir path, etc)
            v, prefix = support.get_option_prefix(v)
            if v in options_storage.dict_of_rel2abs.keys():
                v = options_storage.dict_of_rel2abs[v]
            if prefix:
                command += prefix + ":"
            command += v + " "
        log.info(command)

    # special case
    # if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
    #     support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
    #     del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    support.check_single_reads_in_options(options, log)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data)\
            or support.dataset_has_nxmate_reads(dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
        if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_nxmate_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            if support.dataset_has_interlaced_reads(dataset_data):
                dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
            if support.dataset_has_nxmate_reads(dataset_data):
                dataset_data = support.process_nxmate_reads(dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent their changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            if options_storage.configs_dir:
                dir_util.copy_tree(options_storage.configs_dir, tmp_configs_dir, preserve_times=False)
            else:
                dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                    and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
                    pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'))
                    bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)
                hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, not_used_dataset_data, ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)
            if options_storage.stop_after == 'ec':
                support.finish_here(log)

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, "assembly_graph.fastg")
        truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
        truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")

        if "assembly" in cfg and not options_storage.run_completed:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
            spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs) or ("mismatch_corrector" in cfg and os.path.isfile(assembled_contigs_filename)))\
                    and not options_storage.restart_from == 'as' \
                    and not options_storage.restart_from == 'scc' \
                    and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
            else:
                old_result_files = [result_contigs_filename, result_scaffolds_filename, assembled_contigs_filename, assembled_scaffolds_filename]
                for old_result_file in old_result_files:
                    if os.path.isfile(old_result_file):
                        os.remove(old_result_file)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename))
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                    dataset_file.write("meta" + '\t' + process_cfg.bool_to_str(cfg["dataset"].meta) + '\n')
                    dataset_file.write("moleculo" + '\t' + process_cfg.bool_to_str(cfg["dataset"].truseq) + '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                    if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

            if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith('k'):
                k_str = options_storage.restart_from[1:]
                if k_str.find(":") != -1:
                    k_str = k_str[:k_str.find(":")]
                support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
            log.info("\n===== %s finished. \n" % STAGE_NAME)

        if not options_storage.run_completed:
            if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
                support.finish_here(log)

        #postprocessing
        if cfg["run_truseq_postprocessing"] and not options_storage.run_completed:
            if options_storage.continue_mode and os.path.isfile(truseq_long_reads_file_base + ".fastq") and not options_storage.restart_from == 'tpp':
                log.info("\n===== Skipping %s (already processed). \n" % "TruSeq postprocessing")
            else:
                support.continue_from_here(log)
                if os.path.isfile(result_scaffolds_filename):
                    shutil.move(result_scaffolds_filename, assembled_scaffolds_filename)
                reads_library = dataset_data[0]
                alignment_bin = os.path.join(bin_home, "bwa-spades")
                alignment_dir = os.path.join(cfg["common"].output_dir, "alignment")
                sam_files = alignment.align_bwa(alignment_bin, assembled_scaffolds_filename, dataset_data, alignment_dir, log, options_storage.threads)
                moleculo_postprocessing.moleculo_postprocessing(assembled_scaffolds_filename, truseq_long_reads_file_base, sam_files, log)
            if options_storage.stop_after == 'tpp':
                support.finish_here(log)

        #corrector
        if "mismatch_corrector" in cfg and not options_storage.run_completed and \
                (os.path.isfile(result_contigs_filename) or (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
            STAGE_NAME = "Mismatch correction"
            to_correct = dict()
            to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
            if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and os.path.isfile(assembled_scaffolds_filename)):
                to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

            # moving assembled contigs (scaffolds) to misc dir
            for assembly_type, (old, new) in to_correct.items():
                if options_storage.continue_mode and os.path.isfile(new):
                    continue
                if os.path.isfile(old):
                    shutil.move(old, new)

            if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                    (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                    and not options_storage.restart_from == 'mc':
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                if options_storage.restart_from == 'mc':
                    support.continue_from_here(log)

                log.info("\n===== %s started." % STAGE_NAME)
                # detecting paired-end library with the largest insert size
                cfg["mismatch_corrector"].__dict__["dataset"] = cfg["dataset"].yaml_filename
                #TODO: add reads orientation

                import corrector_logic
                corrector_cfg = cfg["mismatch_corrector"]

                # processing contigs and scaffolds (or only contigs)
                for assembly_type, (corrected, assembled) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(corrected):
                        log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                        continue

                    support.continue_from_here(log)
                    log.info("\n== Processing of " + assembly_type + "\n")

                    tmp_dir_for_corrector = os.path.join(cfg["common"].output_dir, "mismatch_corrector", assembly_type)
                    cfg["mismatch_corrector"].__dict__["output_dir"] = tmp_dir_for_corrector
                    # correcting
                    corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])

                    result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
                    corrector_logic.run_corrector(tmp_configs_dir, bin_home, corr_cfg, ext_python_modules_home, log, assembled, result_corrected_filename)

                    if os.path.isfile(result_corrected_filename):
                        shutil.copyfile(result_corrected_filename, corrected)
                    tmp_d = os.path.join(tmp_dir_for_corrector, "tmp")
                    if os.path.isdir(tmp_d) and not cfg["common"].developer_mode:
                        shutil.rmtree(tmp_d)
                log.info("\n===== %s finished.\n" % STAGE_NAME)
            if options_storage.stop_after == 'mc':
                support.finish_here(log)

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        if not options_storage.run_completed:
            #log.info("")
            if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
                log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
            if "assembly" in cfg and os.path.isfile(result_contigs_filename):
                message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
                log.info(message)
            if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
                message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
                log.info(message)
            if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
                message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
                log.info(message)
            #log.info("")

            #breaking scaffolds
            if os.path.isfile(result_scaffolds_filename):
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)
                result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
                if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                    modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename, options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                    if modified:
                        support.write_fasta(result_broken_scaffolds, broken_scaffolds)
                        #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                        #         " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            if options_storage.truseq_mode:
                if not os.path.isfile(truseq_long_reads_file):
                    support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
            else:
                for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                    if os.path.isfile(result_filename):
                        result_fasta = list(support.read_fasta(result_filename))
                        # correctness check: should be one contig of length 1000 bp
                        correct_number = 1
                        correct_length = 1000
                        if not len(result_fasta):
                            support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                        elif len(result_fasta) > correct_number:
                            support.error("TEST FAILED: %s contains more than %d contig (%d)!" % (result_filename, correct_number, len(result_fasta)))
                        elif len(result_fasta[0][1]) != correct_length:
                            if len(result_fasta[0][1]) > correct_length:
                                relation = "more"
                            else:
                                relation = "less"
                            support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" % (result_filename, relation, correct_length, len(result_fasta[0][1])))
                    else:
                        support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC:  # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
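
# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not called by the pipeline): the
# restart branch of main() above rebuilds the "updated parameters" string by
# dropping '-o <dir>' and '--restart-from <stage>' (or '--restart-from=<stage>')
# from the new command line before appending the remainder to the restored one.
# The standalone function below shows the same filtering on a plain argument list.
def _demo_filter_restart_args(argv):
    kept = []
    skip_next = False
    for v in argv:
        if v in ('-o', '--restart-from'):
            skip_next = True  # drop the flag and its value
            continue
        if skip_next or v.startswith('--restart-from='):
            skip_next = False
            continue
        kept.append(v)
    return " ".join(kept)

# _demo_filter_restart_args(['-o', 'out_dir', '--restart-from', 'k55', '-t', '16'])
# -> '-t 16'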
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg,
               dataset_data, ext_python_modules_home, only_compressing_is_needed, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    # not all reads need processing
    if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
        not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
        to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
        to_correct_dataset_yaml_filename = os.path.join(cfg.output_dir, "to_correct.yaml")
        pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'),
                    default_flow_style=False, default_style='"', width=100500)
        cfg.dataset_yaml_filename = to_correct_dataset_yaml_filename
    else:
        not_used_dataset_data = None

    if not only_compressing_is_needed:
        dst_configs = os.path.join(cfg.output_dir, "configs")
        if os.path.exists(dst_configs):
            shutil.rmtree(dst_configs)
        if cfg.iontorrent:
            dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
            cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
        else:
            dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
            cfg_file_name = os.path.join(dst_configs, "config.info")

        cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
        if cfg.iontorrent:
            prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
            binary_name = "ionhammer"
        else:
            prepare_config_bh(cfg_file_name, cfg, log)
            binary_name = "hammer"

        command = [os.path.join(execution_home, binary_name), os.path.abspath(cfg_file_name)]

        log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
        support.sys_call(command, log)
        if not os.path.isfile(corrected_dataset_yaml_filename):
            support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
    else:
        log.info("\n===== Skipping %s (already processed). \n" % "read error correction tool")
        support.continue_from_here(log)

    corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
    remove_not_corrected_reads(cfg.output_dir)
    is_changed = False
    if cfg.gzip_output:
        is_changed = True
        compress_dataset_files(corrected_dataset_data, ext_python_modules_home, cfg.max_threads, log)
    if not_used_dataset_data:
        is_changed = True
        corrected_dataset_data += not_used_dataset_data
    if is_changed:
        pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'),
                    default_flow_style=False, default_style='"', width=100500)
    log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n")

    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
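
# ---------------------------------------------------------------------------
# Illustrative sketch (standalone, not part of hammer_logic): run_hammer()
# writes dataset description files as block-style YAML with every scalar
# double-quoted (default_flow_style=False, default_style='"') and a very large
# width so entries are not wrapped. The snippet below reproduces that dump
# convention with the standard PyYAML package ('yaml'); the bundled
# pyyaml2/pyyaml3 modules selected above appear to expose the same dump()
# interface. The library paths here are hypothetical.
import yaml

demo_libs = [{"type": "paired-end",
              "left reads": ["/data/demo_left.fastq"],
              "right reads": ["/data/demo_right.fastq"]}]
with open("demo_dataset.yaml", "w") as demo_out:
    yaml.dump(demo_libs, demo_out, default_flow_style=False, default_style='"', width=100500)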