Example #1
def run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, K):
    data_dir = os.path.join(cfg.output_dir, "SCC", "K%d" % K)
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")
    cfg_file_name = os.path.join(dst_configs, "config.info")

    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    dir_util.copy_tree(os.path.join(configs_dir, "debruijn"),
                       dst_configs,
                       preserve_times=False)

    log.info("\n== Running scaffold correction \n")
    scaffolds_file = os.path.join(latest, "scaffolds.fasta")
    if not os.path.isfile(scaffolds_file):
        support.error("Scaffodls were not found in " + scaffolds_file, log)
    if "read_buffer_size" in cfg.__dict__:
        construction_cfg_file_name = os.path.join(dst_configs,
                                                  "construction.info")
        process_cfg.substitute_params(
            construction_cfg_file_name,
            {"read_buffer_size": cfg.read_buffer_size}, log)
    process_cfg.substitute_params(
        os.path.join(dst_configs, "moleculo_mode.info"),
        {"scaffolds_file": scaffolds_file}, log)
    prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, K)
    command = [
        os.path.join(execution_home, "spades-truseq-scfcorrection"),
        cfg_file_name
    ]
    add_configs(command, dst_configs)
    log.info(str(command))
    support.sys_call(command, log)
Example #2
def run_scaffold_correction(configs_dir, execution_home, cfg, log, K):
    data_dir = os.path.join(cfg.output_dir, "SCC")
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")
    cfg_file_name = os.path.join(dst_configs, "config.info")

    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)
    # removing template configs
    for root, dirs, files in os.walk(dst_configs):
        for cfg_file in files:
            cfg_file = os.path.join(root, cfg_file)
            if cfg_file.endswith('.info.template'):
                if os.path.isfile(cfg_file.split('.template')[0]):
                    os.remove(cfg_file)
                else:
                    os.rename(cfg_file, cfg_file.split('.template')[0])

    log.info("\n== Running scaffold correction \n")
    latest = os.path.join(cfg.output_dir, "K%d" % K)
    scaffolds_file = os.path.join(latest, "scaffolds.fasta")
    if not os.path.isfile(scaffolds_file):
        support.error("Scaffodls were not found in " + scaffolds_file, log)
    if "read_buffer_size" in cfg.__dict__:
        construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
        process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
    prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, scaffolds_file)
    command = [os.path.join(execution_home, "scaffold_correction"), cfg_file_name]
    log.info(str(command))
    support.sys_call(command, log)
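
The template-handling loop in Example #2 encodes a simple convention: a "<name>.info.template" file is promoted to "<name>.info" only when no concrete config with that name already exists. A minimal standalone sketch of the same pattern (the directory path is hypothetical):

import os

def promote_templates(config_dir):
    # Walk the copied config tree; for each "*.info.template" file, drop it
    # if the concrete "*.info" already exists, otherwise rename it in place.
    for root, _dirs, files in os.walk(config_dir):
        for fname in files:
            path = os.path.join(root, fname)
            if not path.endswith('.info.template'):
                continue
            concrete = path.split('.template')[0]
            if os.path.isfile(concrete):
                os.remove(path)
            else:
                os.rename(path, concrete)

promote_templates('/tmp/scc_configs')  # hypothetical path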
Example #3
def get_options_from_params(params_filename, spades_py_name=None):
    if not os.path.isfile(params_filename):
        return None, None
    params = open(params_filename, 'r')
    cmd_line = params.readline().strip()
    spades_prev_version = None
    for line in params:
        if line.find('rnaSPAdes version:') != -1:
            spades_prev_version = line.split('rnaSPAdes version:')[1]
            break
    params.close()
    if spades_prev_version is None:
        support.error("failed to parse rnaSPAdes version of the previous run! "
                      "Please restart from the beginning or specify another output directory.")
    if spades_prev_version.strip() != spades_version.strip():
        support.error("rnaSPAdes version of the previous run (%s) is not equal to the current version of rnaSPAdes (%s)! "
                      "Please restart from the beginning or specify another output directory."
                      % (spades_prev_version.strip(), spades_version.strip()))
    if spades_py_name is None or cmd_line.find(os.path.basename(spades_py_name)) == -1:
        spades_py_name = 'spades.py'  # try default name
    else:
        spades_py_name = os.path.basename(spades_py_name)
    spades_py_pos = cmd_line.find(spades_py_name)
    if spades_py_pos == -1:
        return None, None
    return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split()
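
For illustration, the second return value is simply everything after the script name in the stored command line, split on whitespace. Assuming (hypothetically) that the first line of params.txt reads as below:

cmd_line = "/usr/bin/python rnaspades.py -o out_dir -t 8"
name = "rnaspades.py"
pos = cmd_line.find(name)
options = cmd_line[pos + len(name):].split()
# options == ['-o', 'out_dir', '-t', '8']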
Example #4
def run_corrector(configs_dir, execution_home, cfg,
                ext_python_modules_home, log, to_correct, result):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.exists(dst_configs):
        shutil.rmtree(dst_configs)
    dir_util.copy_tree(os.path.join(configs_dir, "corrector"), dst_configs, preserve_times=False)
    cfg_file_name = os.path.join(dst_configs, "corrector.info")

    cfg.tmp_dir = support.get_tmp_dir(prefix="corrector_")

    prepare_config_corr(cfg_file_name, cfg, ext_python_modules_home)
    binary_name = "corrector"

    command = [os.path.join(execution_home, binary_name),
               os.path.abspath(cfg_file_name), os.path.abspath(to_correct)]

    log.info("\n== Running contig polishing tool: " + ' '.join(command) + "\n")


    log.info("\n== Dataset description file was created: " + cfg_file_name + "\n")

    support.sys_call(command, log)
    if not os.path.isfile(result):
        support.error("Mismatch correction finished abnormally: " + result + " not found!")
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
Example #5
def parse_args(args, log):
    options, cfg, dataset_data = options_parser.parse_args(log, bin_home, spades_home,
                                                           secondary_filling=False, restart_from=False)

    command_line = ""

    if options_storage.args.continue_mode:
        restart_from = options_storage.args.restart_from
        command_line, options, script_name, err_msg = get_options_from_params(
            os.path.join(options_storage.args.output_dir, "params.txt"),
            args[0])
        if err_msg:
            support.error(err_msg + " Please restart from the beginning or specify another output directory.")
        options, cfg, dataset_data = options_parser.parse_args(log, bin_home, spades_home, secondary_filling=True,
                                                               restart_from=(options_storage.args.restart_from is not None),
                                                               options=options)

        options_storage.args.continue_mode = True
        options_storage.args.restart_from = restart_from

        if options_storage.args.restart_from:
            check_cfg_for_partial_run(cfg, partial_run_type="restart-from")

    if options_storage.args.stop_after:
        check_cfg_for_partial_run(cfg, partial_run_type="stop-after")

    support.check_single_reads_in_options(log)
    return cfg, dataset_data, command_line
Example #6
def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, log):
    log.info("\n== Compressing corrected reads (with gzip)")
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                compressed_reads_filenames = []
                for reads_file in value:
                    if not os.path.isfile(reads_file):
                        support.error('something went wrong and file with corrected reads (' + reads_file + ') is missing!', log)
                    to_compress.append(reads_file)
                    compressed_reads_filenames.append(reads_file + ".gz")
                reads_library[key] = compressed_reads_filenames
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
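
The compression step in Example #6 prefers pigz (parallel gzip) when it is on PATH and falls back to plain gzip otherwise. A simplified sketch of that fallback using only the standard library (the original parallelizes the gzip branch with joblib; this version runs it serially for brevity):

import shutil
import subprocess

def compress_files(paths, threads):
    pigz = shutil.which('pigz')  # None if pigz is not installed
    for path in paths:
        if pigz:
            # pigz compresses a single file using multiple threads
            subprocess.check_call([pigz, '-f', '-7', '-p', str(threads), path])
        else:
            subprocess.check_call(['gzip', '-f', '-7', path])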
Example #7
def get_options_from_params(params_filename, spades_py_name=None):
    if not os.path.isfile(params_filename):
        return None, None
    params = open(params_filename, 'r')
    cmd_line = params.readline().strip()
    spades_prev_version = None
    for line in params:
        if line.find('SPAdes version:') != -1:
            spades_prev_version = line.split('SPAdes version:')[1]
            break
    params.close()
    if spades_prev_version is None:
        support.error(
            "failed to parse SPAdes version of the previous run! "
            "Please restart from the beginning or specify another output directory."
        )
    if spades_prev_version.strip() != spades_version.strip():
        support.error(
            "SPAdes version of the previous run (%s) is not equal to the current version of SPAdes (%s)! "
            "Please restart from the beginning or specify another output directory."
            % (spades_prev_version.strip(), spades_version.strip()))
    if spades_py_name is None or cmd_line.find(
            os.path.basename(spades_py_name)) == -1:
        spades_py_name = 'spades.py'  # try default name
    else:
        spades_py_name = os.path.basename(spades_py_name)
    spades_py_pos = cmd_line.find(spades_py_name)
    if spades_py_pos == -1:
        return None, None
    return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split()
Example #8
def parse_arguments(argv, log):
    try:
        options, not_options = getopt.gnu_getopt(argv, DS_Args_List.short_options, DS_Args_List.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage("", dipspades=True)
        sys.exit(1)

    ds_args = DS_Args()
    for opt, arg in options:
        if opt == '-o':
            ds_args.output_dir = os.path.abspath(arg)
        elif opt == '--expect-gaps':
            ds_args.allow_gaps = True
        elif opt == '--expect-rearrangements':
            ds_args.weak_align = True
        elif opt == '--hap':
            ds_args.haplocontigs_fnames.append(support.check_file_existence(arg, 'haplocontigs', log, dipspades=True))
        elif opt == '-t' or opt == "--threads":
            ds_args.max_threads = int(arg)
        elif opt == '-m' or opt == "--memory":
            ds_args.max_memory = int(arg)
        elif opt == '--tmp-dir':
            ds_args.tmp_dir = os.path.abspath(arg)
    ds_args.haplocontigs = os.path.join(ds_args.output_dir, "haplocontigs")

    if not ds_args.output_dir:
        support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log, dipspades=True)
    if not ds_args.haplocontigs_fnames:
        support.error("cannot start dipSPAdes without at least one haplocontigs file!", log, dipspades=True)
    if not ds_args.tmp_dir:
        ds_args.tmp_dir = os.path.join(ds_args.output_dir, options_storage.TMP_DIR)
    return ds_args
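
getopt.gnu_getopt, unlike plain getopt.getopt, allows option and non-option arguments to be intermixed. A small illustration with made-up arguments and option strings consistent with the handlers above:

import getopt

opts, rest = getopt.gnu_getopt(
    ['-o', 'out', 'stray.fa', '--hap', 'h1.fa'],
    'o:t:m:',
    ['hap=', 'tmp-dir=', 'expect-gaps'])
# opts == [('-o', 'out'), ('--hap', 'h1.fa')], rest == ['stray.fa']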
Example #9
def get_command_and_stage_id_before_restart_from(draft_commands, cfg, log):
    restart_from_stage_name = options_storage.args.restart_from.split(":")[0]

    if options_storage.args.restart_from == options_storage.LAST_STAGE:
        last_command = get_first_incomplete_command(os.path.join(get_stage.cfg["common"].output_dir, "run_spades.yaml"))
        if last_command is None:
            restart_from_stage_name = draft_commands[-1].short_name
        else:
            restart_from_stage_name = last_command["short_name"]

    restart_from_stage_id = None
    for num in range(len(draft_commands)):
        stage = draft_commands[num]
        if stage.short_name.startswith(restart_from_stage_name):
            restart_from_stage_id = num
            break

    if restart_from_stage_id is None:
        support.error(
            "failed to restart from %s because this stage was not specified!" % options_storage.args.restart_from,
            log)

    if ":" in options_storage.args.restart_from or options_storage.args.restart_from == options_storage.LAST_STAGE:
        return draft_commands[restart_from_stage_id], restart_from_stage_id

    if restart_from_stage_id > 0:
        stage_filename = options_storage.get_stage_filename(restart_from_stage_id - 1, draft_commands[restart_from_stage_id - 1].short_name)
        if not os.path.isfile(stage_filename):
            support.error(
                "cannot restart from stage %s: previous stage was not complete." % options_storage.args.restart_from,
                log)
        return draft_commands[restart_from_stage_id - 1], restart_from_stage_id - 1
    return None, None
Example #10
def run_corrector(configs_dir, execution_home, cfg,
                ext_python_modules_home, log, to_correct, result):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.exists(dst_configs):
        shutil.rmtree(dst_configs)
    dir_util.copy_tree(os.path.join(configs_dir, "corrector"), dst_configs, preserve_times=False)
    cfg_file_name = os.path.join(dst_configs, "corrector.info")

    cfg.tmp_dir = support.get_tmp_dir(prefix="corrector_")

    prepare_config_corr(cfg_file_name, cfg, ext_python_modules_home)
    binary_name = "corrector"

    command = [os.path.join(execution_home, binary_name),
               os.path.abspath(cfg_file_name), os.path.abspath(to_correct)]

    log.info("\n== Running contig polishing tool: " + ' '.join(command) + "\n")


    log.info("\n== Dataset description file was created: " + cfg_file_name + "\n")

    support.sys_call(command, log)
    if not os.path.isfile(result):
        support.error("Mismatch correction finished abnormally: " + result + " not found!")
    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
Example #11
def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, log):
    log.info("\n== Compressing corrected reads (with gzip)")
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                compressed_reads_filenames = []
                for reads_file in value:
                    if not os.path.isfile(reads_file):
                        support.error('something went wrong and file with corrected reads (' + reads_file + ') is missing!', log)
                    to_compress.append(reads_file)
                    compressed_reads_filenames.append(reads_file + ".gz")
                reads_library[key] = compressed_reads_filenames
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
Example #12
def run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, K):
    data_dir = os.path.join(cfg.output_dir, "SCC", "K%d" % K)
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")
    cfg_file_name = os.path.join(dst_configs, "config.info")

    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)

    log.info("\n== Running scaffold correction \n")
    scaffolds_file = os.path.join(latest, "scaffolds.fasta")
    if not os.path.isfile(scaffolds_file):
        support.error("Scaffodls were not found in " + scaffolds_file, log)
    if "read_buffer_size" in cfg.__dict__:
        construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
        process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
    process_cfg.substitute_params(os.path.join(dst_configs, "moleculo_mode.info"), {"scaffolds_file": scaffolds_file}, log)
    prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, K)
    command = [os.path.join(execution_home, "scaffold_correction"), cfg_file_name]
    add_configs(command, dst_configs)
    log.info(str(command))
    support.sys_call(command, log)
Example #13
def run_scaffold_correction(configs_dir, execution_home, cfg, log, K):
    data_dir = os.path.join(cfg.output_dir, "SCC")
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")
    cfg_file_name = os.path.join(dst_configs, "config.info")

    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)
    # removing template configs
    for root, dirs, files in os.walk(dst_configs):
        for cfg_file in files:
            cfg_file = os.path.join(root, cfg_file)
            if cfg_file.endswith('.info.template'):
                if os.path.isfile(cfg_file.split('.template')[0]):
                    os.remove(cfg_file)
                else:
                    os.rename(cfg_file, cfg_file.split('.template')[0])

    log.info("\n== Running scaffold correction \n")
    latest = os.path.join(cfg.output_dir, "K%d" % K)
    scaffolds_file = os.path.join(latest, "scaffolds.fasta")
    if not os.path.isfile(scaffolds_file):
        support.error("Scaffodls were not found in " + scaffolds_file, log)
    if "read_buffer_size" in cfg.__dict__:
        construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
        process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
    prepare_config_scaffold_correction(cfg_file_name, cfg, log, saves_dir, scaffolds_file)
    command = [os.path.join(execution_home, "scaffold_correction"), cfg_file_name]
    log.info(str(command))
    support.sys_call(command, log)
Example #14
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
    data_dir = os.path.join(cfg.output_dir, "K%d" % K)
    stage = BASE_STAGE
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")
    cfg_file_name = os.path.join(dst_configs, "config.info")

    if options_storage.continue_mode:
        if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and
            (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))):
            log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)")
            return
        if options_storage.restart_from and options_storage.restart_from.find(":") != -1:
            stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:]
        support.continue_from_here(log)

    if stage != BASE_STAGE:
        if not os.path.isdir(saves_dir):
            support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir))
    else:
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir)

        shutil.copytree(os.path.join(configs_dir, "debruijn"), dst_configs)
        # removing template configs
        for root, dirs, files in os.walk(dst_configs):
            for cfg_file in files:
                cfg_file = os.path.join(root, cfg_file)
                if cfg_file.endswith('.info.template'):
                    if os.path.isfile(cfg_file.split('.template')[0]):
                        os.remove(cfg_file)
                    else:
                        os.rename(cfg_file, cfg_file.split('.template')[0])

    log.info("\n== Running assembler: " + ("K%d" % K) + "\n")
    if prev_K:
        additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta")
        if not os.path.isfile(additional_contigs_fname):
            support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log)
            additional_contigs_fname = None
    else:
        additional_contigs_fname = None
    if "read_buffer_size" in cfg.__dict__:
        construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
        process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
    prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one)

    command = [os.path.join(execution_home, "spades"), cfg_file_name]

## this code makes sense for src/debruijn/simplification.cpp: corrected_and_save_reads() function which is not used now
#    bin_reads_dir = os.path.join(cfg.output_dir, ".bin_reads")
#    if os.path.isdir(bin_reads_dir):
#        if glob.glob(os.path.join(bin_reads_dir, "*_cor*")):
#            for cor_filename in glob.glob(os.path.join(bin_reads_dir, "*_cor*")):
#                cor_index = cor_filename.rfind("_cor")
#                new_bin_filename = cor_filename[:cor_index] + cor_filename[cor_index + 4:]
#                shutil.move(cor_filename, new_bin_filename)
    support.sys_call(command, log)
Example #15
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg,
               not_used_dataset_data, ext_python_modules_home, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.exists(dst_configs):
        shutil.rmtree(dst_configs)
    if cfg.iontorrent:
        dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
        cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
    else:
        dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
        cfg_file_name = os.path.join(dst_configs, "config.info")
    # removing template configs
    for root, dirs, files in os.walk(dst_configs):
        for cfg_file in files:
            cfg_file = os.path.join(root, cfg_file)
            if cfg_file.endswith('.template'):
                if os.path.isfile(cfg_file.split('.template')[0]):
                    os.remove(cfg_file)
                else:
                    os.rename(cfg_file, cfg_file.split('.template')[0])

    cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
    if cfg.iontorrent:
        prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
        binary_name = "ionhammer"
    else:
        prepare_config_bh(cfg_file_name, cfg, log)
        binary_name = "hammer"

    command = [os.path.join(execution_home, binary_name),
               os.path.abspath(cfg_file_name)]

    log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
    support.sys_call(command, log)
    if not os.path.isfile(corrected_dataset_yaml_filename):
        support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
    corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
    remove_not_corrected_reads(cfg.output_dir)
    is_changed = False
    if cfg.gzip_output:
        is_changed = True
        compress_dataset_files(corrected_dataset_data, ext_python_modules_home, cfg.max_threads, log)
    if not_used_dataset_data:
        is_changed = True
        corrected_dataset_data += not_used_dataset_data
    if is_changed:
        pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'))
    log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n")

    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
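
Several examples above gate the YAML import on the interpreter's major version; the pyyaml2/pyyaml3 names suggest SPAdes ships separate copies of PyYAML for Python 2 and Python 3 under ext_python_modules_home. The pattern in isolation (the site directory path is hypothetical):

import sys
from site import addsitedir

addsitedir('/opt/spades/ext/python_libs')  # hypothetical bundle location
if sys.version.startswith('2.'):
    import pyyaml2 as pyyaml
elif sys.version.startswith('3.'):
    import pyyaml3 as pyyaml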
Example #16
def check_binaries(binary_dir, log):
    for binary in ["hammer", "spades", "bwa-spades"]:
        binary_path = os.path.join(binary_dir, binary)
        if not os.path.isfile(binary_path):
            support.error("SPAdes binaries not found: " + binary_path +
                          "\nYou can obtain SPAdes binaries in one of two ways:" +
                          "\n1. Download them from http://spades.bioinf.spbau.ru/release" +
                          str(spades_version).strip() + "/SPAdes-" + str(spades_version).strip() + "-Linux.tar.gz" +
                          "\n2. Build source code with ./spades_compile.sh script", log)
Example #17
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg,
               not_used_dataset_data, ext_python_modules_home, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml
    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.exists(dst_configs):
        shutil.rmtree(dst_configs)
    if cfg.iontorrent:
        dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
        cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
    else:
        dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
        cfg_file_name = os.path.join(dst_configs, "config.info")
    # removing template configs
    for root, dirs, files in os.walk(dst_configs):
        for cfg_file in files:
            cfg_file = os.path.join(root, cfg_file)
            if cfg_file.endswith('.template'):
                if os.path.isfile(cfg_file.split('.template')[0]):
                    os.remove(cfg_file)
                else:
                    os.rename(cfg_file, cfg_file.split('.template')[0])

    cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
    if cfg.iontorrent:
        prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
        binary_name = "ionhammer"
    else:
        prepare_config_bh(cfg_file_name, cfg, log)
        binary_name = "hammer"

    command = [os.path.join(execution_home, binary_name),
               os.path.abspath(cfg_file_name)]

    log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
    support.sys_call(command, log)
    if not os.path.isfile(corrected_dataset_yaml_filename):
        support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
    corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
    remove_not_corrected_reads(cfg.output_dir)
    is_changed = False
    if cfg.gzip_output:
        is_changed = True
        compress_dataset_files(corrected_dataset_data, ext_python_modules_home, cfg.max_threads, log)
    if not_used_dataset_data:
        is_changed = True
        corrected_dataset_data += not_used_dataset_data
    if is_changed:
        pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'))
    log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n")

    if os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
Example #18
def move_dataset_files(dataset_data,
                       dst,
                       ext_python_modules_home,
                       max_threads,
                       log,
                       gzip=False):
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                moved_reads_files = []
                for reads_file in value:
                    dst_filename = os.path.join(dst,
                                                os.path.basename(reads_file))
                    # TODO: fix problem with files with the same basenames in Hammer binary!
                    if not os.path.isfile(reads_file):
                        if (not gzip and os.path.isfile(dst_filename)) or (
                                gzip and os.path.isfile(dst_filename + '.gz')):
                            support.warning(
                                'file with corrected reads (' + reads_file +
                                ') is the same in several libraries', log)
                            if gzip:
                                dst_filename += '.gz'
                        else:
                            support.error(
                                'something went wrong and file with corrected reads ('
                                + reads_file + ') is missing!', log)
                    else:
                        shutil.move(reads_file, dst_filename)
                        if gzip:
                            to_compress.append(dst_filename)
                            dst_filename += '.gz'
                    moved_reads_files.append(dst_filename)
                reads_library[key] = moved_reads_files
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([
                    pigz_path, '-f', '-7', '-p',
                    str(max_threads), reads_file
                ], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(
                delayed(support.sys_call)(['gzip', '-f', '-7', reads_file])
                for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
Example #19
def check_binaries(binary_dir, log):
    for binary in ["hammer", "spades", "bwa-spades"]:
        binary_path = os.path.join(binary_dir, binary)
        if not os.path.isfile(binary_path):
            support.error(
                "SPAdes binaries not found: " + binary_path +
                "\nYou can obtain SPAdes binaries in one of two ways:" +
                "\n1. Download them from http://spades.bioinf.spbau.ru/release"
                + str(spades_version).strip() + "/SPAdes-" +
                str(spades_version).strip() + "-Linux.tar.gz" +
                "\n2. Build source code with ./spades_compile.sh script", log)
Example #20
def parse_arguments(argv, log):
    try:
        options, not_options = getopt.gnu_getopt(argv,
                                                 DS_Args_List.short_options,
                                                 DS_Args_List.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage("", dipspades=True)
        sys.exit(1)

    ds_args = DS_Args()
    for opt, arg in options:
        if opt == '-o':
            ds_args.output_dir = os.path.abspath(arg)
        elif opt == '--expect-gaps':
            ds_args.allow_gaps = True
        elif opt == '--expect-rearrangements':
            ds_args.weak_align = True
        elif opt == '--hap':
            ds_args.haplocontigs_fnames.append(
                support.check_file_existence(arg,
                                             'haplocontigs',
                                             log,
                                             dipspades=True))
        elif opt == '-t' or opt == "--threads":
            ds_args.max_threads = int(arg)
        elif opt == '-m' or opt == "--memory":
            ds_args.max_memory = int(arg)
        elif opt == '--tmp-dir':
            ds_args.tmp_dir = os.path.abspath(arg)
        elif opt == '--dsdebug':
            ds_args.dev_mode = True
        elif opt == '--hap-assembly':
            ds_args.haplotype_assembly = True
        elif opt == '--dsK':
            ds_args.k = int(arg)
    ds_args.haplocontigs = os.path.join(ds_args.output_dir, "haplocontigs")

    if not ds_args.output_dir:
        support.error(
            "the output_dir is not set! It is a mandatory parameter (-o output_dir).",
            log,
            dipspades=True)
    if not ds_args.haplocontigs_fnames:
        support.error(
            "cannot start dipSPAdes without at least one haplocontigs file!",
            log,
            dipspades=True)
    if not ds_args.tmp_dir:
        ds_args.tmp_dir = os.path.join(ds_args.output_dir,
                                       options_storage.TMP_DIR)
    return ds_args
Example #21
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
    data_dir = os.path.join(cfg.output_dir, "K%d" % K)
    stage = BASE_STAGE
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")

    if options_storage.continue_mode:
        if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and
            (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))):
            log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)")
            return
        if options_storage.restart_from and options_storage.restart_from.find(":") != -1 \
                and options_storage.restart_from.startswith("k%d:" % K):
            stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:]
        support.continue_from_here(log)

    if stage != BASE_STAGE:
        if not os.path.isdir(saves_dir):
            support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir))
    else:
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir)

        dir_util._path_created = {}  # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree
        dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)

    log.info("\n== Running assembler: " + ("K%d" % K) + "\n")
    if prev_K:
        additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta")
        if not os.path.isfile(additional_contigs_fname):
            support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log)
            additional_contigs_fname = None
    else:
        additional_contigs_fname = None
    if "read_buffer_size" in cfg.__dict__:
        #FIXME why here???
        process_cfg.substitute_params(os.path.join(dst_configs, "construction.info"), {"read_buffer_size": cfg.read_buffer_size}, log)
    if "scaffolding_mode" in cfg.__dict__:
        #FIXME why here???
        process_cfg.substitute_params(os.path.join(dst_configs, "pe_params.info"), {"scaffolding_mode": cfg.scaffolding_mode}, log)

    prepare_config_rnaspades(os.path.join(dst_configs, "rna_mode.info"), log)
    prepare_config_construction(os.path.join(dst_configs, "construction.info"), log)
    cfg_fn = os.path.join(dst_configs, "config.info")
    prepare_config_spades(cfg_fn, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home)

    command = [os.path.join(execution_home, "spades-core"), cfg_fn]

    add_configs(command, dst_configs)

    #print("Calling: " + " ".join(command))
    support.sys_call(command, log)
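
The _path_created reset in Example #21 works around a distutils quirk: dir_util.copy_tree records every directory it creates in a module-level cache, so after shutil.rmtree removes the tree, a second copy_tree may believe the directories still exist and fail to recreate them (see the Stack Overflow question linked in the comment). A minimal sketch of the workaround (paths hypothetical):

from distutils import dir_util
import shutil

dir_util.copy_tree('configs_src', 'configs_dst')
shutil.rmtree('configs_dst')
dir_util._path_created = {}  # clear the stale cache before copying again
dir_util.copy_tree('configs_src', 'configs_dst')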
Example #22
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
    data_dir = os.path.join(cfg.output_dir, "K%d" % K)
    stage = BASE_STAGE
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")

    if options_storage.continue_mode:
        if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and
            (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))):
            log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)")
            return
        if options_storage.restart_from and options_storage.restart_from.find(":") != -1 \
                and options_storage.restart_from.startswith("k%d:" % K):
            stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:]
        support.continue_from_here(log)

    if stage != BASE_STAGE:
        if not os.path.isdir(saves_dir):
            support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir))
    else:
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir)

        dir_util._path_created = {}  # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree
        dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)

    log.info("\n== Running assembler: " + ("K%d" % K) + "\n")
    if prev_K:
        additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta")
        if not os.path.isfile(additional_contigs_fname):
            support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log)
            additional_contigs_fname = None
    else:
        additional_contigs_fname = None
    if "read_buffer_size" in cfg.__dict__:
        #FIXME why here???
        process_cfg.substitute_params(os.path.join(dst_configs, "construction.info"), {"read_buffer_size": cfg.read_buffer_size}, log)
    if "scaffolding_mode" in cfg.__dict__:
        #FIXME why here???
        process_cfg.substitute_params(os.path.join(dst_configs, "pe_params.info"), {"scaffolding_mode": cfg.scaffolding_mode}, log)

    prepare_config_rnaspades(os.path.join(dst_configs, "rna_mode.info"), log)
    prepare_config_construction(os.path.join(dst_configs, "construction.info"), log)
    cfg_fn = os.path.join(dst_configs, "config.info")
    prepare_config_spades(cfg_fn, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home)

    command = [os.path.join(execution_home, "spades-core"), cfg_fn]

    add_configs(command, dst_configs)

    #print("Calling: " + " ".join(command))
    support.sys_call(command, log)
Example #23
def run_bh(result_filename, configs_dir, execution_home, cfg,
           ext_python_modules_home, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.exists(dst_configs):
        shutil.rmtree(dst_configs)
    shutil.copytree(os.path.join(configs_dir, "hammer"), dst_configs)
    cfg_file_name = os.path.join(dst_configs, "config.info")
    # removing template configs
    for root, dirs, files in os.walk(dst_configs):
        for cfg_file in files:
            cfg_file = os.path.join(root, cfg_file)
            if cfg_file.endswith('.info.template'):
                if os.path.isfile(cfg_file.split('.template')[0]):
                    os.remove(cfg_file)
                else:
                    os.rename(cfg_file, cfg_file.split('.template')[0])

    prepare_config_bh(cfg_file_name, cfg, log)

    command = [
        os.path.join(execution_home, "hammer"),
        os.path.abspath(cfg_file_name)
    ]

    log.info("\n== Running read error correction tool: " + ' '.join(command) +
             "\n")
    support.sys_call(command, log)
    corrected_dataset_yaml_filename = os.path.join(cfg.tmp_dir,
                                                   "corrected.yaml")
    if not os.path.isfile(corrected_dataset_yaml_filename):
        support.error("read error correction finished abnormally: " +
                      corrected_dataset_yaml_filename + " not found!")
    corrected_dataset_data = pyyaml.load(
        open(corrected_dataset_yaml_filename, 'r'))
    if cfg.gzip_output:
        log.info("\n== Compressing corrected reads (with gzip)")
    move_dataset_files(corrected_dataset_data, cfg.output_dir,
                       ext_python_modules_home, cfg.max_threads, log,
                       cfg.gzip_output)
    corrected_dataset_yaml_filename = result_filename
    pyyaml.dump(corrected_dataset_data,
                open(corrected_dataset_yaml_filename, 'w'))
    log.info("\n== Dataset description file created: " +
             corrected_dataset_yaml_filename + "\n")

    shutil.rmtree(cfg.tmp_dir)
Example #24
def substitute_params(filename, var_dict, log):
    lines = file_lines(filename)
    vars_in_file = vars_from_lines(lines)

    for var, value in var_dict.items():
        if var not in vars_in_file:
            support.error("Couldn't find " + var + " in " + filename, log)

        meta = vars_in_file[var]
        lines[meta.line_num] = meta.indent + str(var) + " " + str(value) + "\n"

    with open(filename, "w") as output:
        output.writelines(lines)
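
substitute_params rewrites each matched key's line in place, preserving its leading indentation and joining key and value with a single space. For illustration (contents hypothetical), a config line such as

    read_buffer_size    536870912

becomes, after substitute_params(path, {"read_buffer_size": 1048576}, log):

    read_buffer_size 1048576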
Example #25
def init_parser(args):
    if options_parser.is_first_run():
        options_storage.first_command_line = args
        check_dir_is_empty(options_parser.get_output_dir_from_args())
    else:
        command_line, options, script, err_msg = get_options_from_params(
            os.path.join(options_parser.get_output_dir_from_args(), "params.txt"),
            args[0])

        if err_msg != "":
            support.error(err_msg)

        options_storage.first_command_line = [script] + options
Example #26
def substitute_params(filename, var_dict, log):
    lines = file_lines(filename)
    vars_in_file = vars_from_lines(lines)

    for var, value in var_dict.items():
        if var not in vars_in_file:
            support.error("Couldn't find " + var + " in " + filename, log)

        meta = vars_in_file[var]
        lines[meta.line_num] = meta.indent + str(var) + " " + str(value) + "\n"

    with open(filename, "w") as output:
        output.writelines(lines)
Example #27
def run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one):
    data_dir = os.path.join(cfg.output_dir, "K%d" % K)
    stage = BASE_STAGE
    saves_dir = os.path.join(data_dir, 'saves')
    dst_configs = os.path.join(data_dir, "configs")
    cfg_file_name = os.path.join(dst_configs, "config.info")

    if options_storage.continue_mode:
        if os.path.isfile(os.path.join(data_dir, "final_contigs.fasta")) and not (options_storage.restart_from and
            (options_storage.restart_from == ("k%d" % K) or options_storage.restart_from.startswith("k%d:" % K))):
            log.info("\n== Skipping assembler: " + ("K%d" % K) + " (already processed)")
            return
        if options_storage.restart_from and options_storage.restart_from.find(":") != -1:
            stage = options_storage.restart_from[options_storage.restart_from.find(":") + 1:]
        support.continue_from_here(log)

    if stage != BASE_STAGE:
        if not os.path.isdir(saves_dir):
            support.error("Cannot restart from stage %s: saves were not found (%s)!" % (stage, saves_dir))
    else:
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)
        os.makedirs(data_dir)

        dir_util.copy_tree(os.path.join(configs_dir, "debruijn"), dst_configs, preserve_times=False)
        # removing template configs
        for root, dirs, files in os.walk(dst_configs):
            for cfg_file in files:
                cfg_file = os.path.join(root, cfg_file)
                if cfg_file.endswith('.info.template'):
                    if os.path.isfile(cfg_file.split('.template')[0]):
                        os.remove(cfg_file)
                    else:
                        os.rename(cfg_file, cfg_file.split('.template')[0])

    log.info("\n== Running assembler: " + ("K%d" % K) + "\n")
    if prev_K:
        additional_contigs_fname = os.path.join(cfg.output_dir, "K%d" % prev_K, "simplified_contigs.fasta")
        if not os.path.isfile(additional_contigs_fname):
            support.warning("additional contigs for K=%d were not found (%s)!" % (K, additional_contigs_fname), log)
            additional_contigs_fname = None
    else:
        additional_contigs_fname = None
    if "read_buffer_size" in cfg.__dict__:
        construction_cfg_file_name = os.path.join(dst_configs, "construction.info")
        process_cfg.substitute_params(construction_cfg_file_name, {"read_buffer_size": cfg.read_buffer_size}, log)
    prepare_config_spades(cfg_file_name, cfg, log, additional_contigs_fname, K, stage, saves_dir, last_one, execution_home)

    command = [os.path.join(execution_home, "spades"), cfg_file_name]
    support.sys_call(command, log)
Example #28
def check_cfg_for_restart_from(cfg):
    if options_storage.restart_from == 'ec' and ("error_correction" not in cfg):
        support.error("failed to restart from read error correction because this stage was not specified!")
    if options_storage.restart_from == 'mc' and ("mismatch_corrector" not in cfg):
        support.error("failed to restart from mismatch correction because this stage was not specified!")
    if options_storage.restart_from == 'as' or options_storage.restart_from.startswith('k'):
        if "assembly" not in cfg:
            support.error("failed to restart from assembling because this stage was not specified!")
        if options_storage.restart_from.startswith('k'):
            correct_k = False
            k_to_check = options_storage.k_mers
            if not k_to_check:
                if options_storage.auto_K_allowed():
                    k_to_check = list(set(options_storage.K_MERS_SHORT + options_storage.K_MERS_150 + options_storage.K_MERS_250))
                else:
                    k_to_check = options_storage.K_MERS_SHORT
            for k in k_to_check:
                if options_storage.restart_from == ("k%d" % k) or options_storage.restart_from.startswith("k%d:" % k):
                    correct_k = True
                    break
            if not correct_k:
                k_str = options_storage.restart_from[1:]
                if k_str.find(":") != -1:
                    k_str = k_str[:k_str.find(":")]
                support.error("failed to restart from K=%s because this K was not specified!" % k_str)
Example #29
def get_read_length(output_dir, K, ext_python_modules_home, log):
    est_params_filename = os.path.join(output_dir, "K%d" % K, "final.lib_data")
    max_read_length = 0
    if os.path.isfile(est_params_filename):
        addsitedir(ext_python_modules_home)
        if sys.version.startswith('2.'):
            import pyyaml2 as pyyaml
        elif sys.version.startswith('3.'):
            import pyyaml3 as pyyaml
        est_params_data = pyyaml.load(open(est_params_filename, 'r'))
        max_read_length = int(est_params_data['nomerge max read length'])
        log.info("Max read length detected as %d" % max_read_length)
    if max_read_length == 0:
        support.error("Failed to estimate maximum read length! File with estimated params: " + est_params_filename, log)
    return max_read_length
Example #30
def get_read_length(output_dir, K, ext_python_modules_home, log):
    est_params_filename = os.path.join(output_dir, "K%d" % K, "final.lib_data")
    max_read_length = 0
    if os.path.isfile(est_params_filename):
        addsitedir(ext_python_modules_home)
        if sys.version.startswith('2.'):
            import pyyaml2 as pyyaml
        elif sys.version.startswith('3.'):
            import pyyaml3 as pyyaml
        est_params_data = pyyaml.load(open(est_params_filename, 'r'))
        max_read_length = int(est_params_data['nomerge max read length'])
        log.info("Max read length detected as %d" % max_read_length)
    if max_read_length == 0:
        support.error("Failed to estimate maximum read length! File with estimated params: " + est_params_filename, log)
    return max_read_length
Example #31
def main():
    args = parse_args()

    # create logger
    log = logging.getLogger("Preprocess interlaced reads")
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter("%(message)s"))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    with open(args.args_filename) as f:
        lines = f.readlines()
        lines = [x.rstrip() for x in lines]
        for input_filename, out_left_filename, out_right_filename, was_compressed, is_fastq in \
                zip(lines[0::5], lines[1::5], lines[2::5], lines[3::5], lines[4::5]):
            was_compressed = (was_compressed == "True")
            is_fastq = (is_fastq == "True")

            if was_compressed:
                input_file = gzip.open(input_filename, 'r')
            else:
                input_file = open(input_filename)

            log.info(
                "== Splitting %s into left and right reads (in %s directory)" %
                (input_filename, args.dst))
            out_files = [
                open(out_left_filename, 'w'),
                open(out_right_filename, 'w')
            ]
            i = 0
            next_read_name = write_single_read(
                input_file, out_files[i], None, is_fastq,
                sys.version.startswith("3.") and was_compressed)
            while next_read_name:
                i = (i + 1) % 2
                next_read_name = write_single_read(
                    input_file, out_files[i], next_read_name, is_fastq,
                    sys.version.startswith("3.") and was_compressed)
            if i == 0:
                support.error(
                    "the number of reads in file with interlaced reads (%s) should be EVEN!"
                    % (input_filename), log)
            out_files[0].close()
            out_files[1].close()

            input_file.close()
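
The loop over the argument file above reads it in fixed records of five lines using the zip-of-slices idiom. A minimal illustration with one made-up record:

lines = ['in.fq.gz', 'left.fq', 'right.fq', 'True', 'True']
for record in zip(lines[0::5], lines[1::5], lines[2::5], lines[3::5], lines[4::5]):
    print(record)  # ('in.fq.gz', 'left.fq', 'right.fq', 'True', 'True')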
Example #32
def save_restart_options(log):
    if dataset_yaml_filename:
        support.error(
            "you cannot specify --dataset with --restart-from option!", log)
    if single_cell:
        support.error("you cannot specify --sc with --restart-from option!",
                      log)
    if iontorrent:
        support.error(
            "you cannot specify --iontorrent with --restart-from option!", log)
    if only_assembler:
        support.error(
            "you cannot specify --only-assembler with --restart-from option!",
            log)
    if only_error_correction:
        support.error(
            "you cannot specify --only-error-correction with --restart-from option!",
            log)

    global restart_k_mers
    global restart_careful
    global restart_mismatch_corrector
    global restart_disable_gzip_output
    global restart_disable_rr
    global restart_threads
    global restart_memory
    global restart_tmp_dir
    global restart_qvoffset
    global restart_cov_cutoff
    global restart_developer_mode
    global restart_reference
    global restart_configs_dir
    global restart_read_buffer_size

    restart_k_mers = k_mers
    restart_careful = careful
    restart_mismatch_corrector = mismatch_corrector
    restart_disable_gzip_output = disable_gzip_output
    restart_disable_rr = disable_rr
    restart_threads = threads
    restart_memory = memory
    restart_tmp_dir = tmp_dir
    restart_qvoffset = qvoffset
    restart_cov_cutoff = cov_cutoff
    restart_developer_mode = developer_mode
    restart_reference = reference
    restart_configs_dir = configs_dir
    restart_read_buffer_size = read_buffer_size
Example #33
def get_read_length(output_dir, K, ext_python_modules_home, log):
    est_params_filename = os.path.join(output_dir, "K%d" % K, "final.lib_data")
    max_read_length = 0
    if os.path.isfile(est_params_filename):
        addsitedir(ext_python_modules_home)
        if sys.version.startswith('2.'):
            import pyyaml2 as pyyaml
        elif sys.version.startswith('3.'):
            import pyyaml3 as pyyaml
        est_params_data = pyyaml.load(open(est_params_filename, 'r'))
        for reads_library in est_params_data:
            if reads_library['type'] in READS_TYPES_USED_IN_CONSTRUCTION:
                if int(reads_library["read length"]) > max_read_length:
                    max_read_length = int(reads_library["read length"])
    if max_read_length == 0:
        support.error("Failed to estimate maximum read length! File with estimated params: " + est_params_filename, log)
    return max_read_length
Example #34
def clear_configs(cfg, log, command_before_restart_from,
                  stage_id_before_restart_from):
    def matches_with_restart_from_arg(stage, restart_from_arg):
        return stage["short_name"].startswith(restart_from_arg.split(":")[0])

    spades_commands_fpath = os.path.join(cfg["common"].output_dir,
                                         "run_spades.yaml")
    with open(spades_commands_fpath) as stream:
        old_pipeline = pyyaml.load(stream)

    restart_from_stage_id = None
    for num in range(len(old_pipeline)):
        stage = old_pipeline[num]
        if matches_with_restart_from_arg(stage,
                                         options_storage.args.restart_from):
            restart_from_stage_id = num
            break

    if command_before_restart_from is not None and \
                    old_pipeline[stage_id_before_restart_from]["short_name"] != command_before_restart_from.short_name:
        support.error(
            "new and old pipelines have difference before %s" %
            options_storage.args.restart_from, log)

    if command_before_restart_from is None:
        first_del = 0
    else:
        first_del = stage_id_before_restart_from + 1

    if restart_from_stage_id is not None:
        stage_filename = options_storage.get_stage_filename(
            restart_from_stage_id,
            old_pipeline[restart_from_stage_id]["short_name"])
        if os.path.isfile(stage_filename):
            os.remove(stage_filename)

    for delete_id in range(first_del, len(old_pipeline)):
        stage_filename = options_storage.get_stage_filename(
            delete_id, old_pipeline[delete_id]["short_name"])
        if os.path.isfile(stage_filename):
            os.remove(stage_filename)

        cfg_dir = old_pipeline[delete_id]["config_dir"]
        if cfg_dir != "" and os.path.isdir(
                os.path.join(cfg["common"].output_dir, cfg_dir)):
            shutil.rmtree(os.path.join(cfg["common"].output_dir, cfg_dir))
Example #35
def init_parser(args):
    if options_parser.is_first_run():
        options_storage.first_command_line = args
        check_dir_is_empty(options_parser.get_output_dir_from_args())
    else:
        output_dir = options_parser.get_output_dir_from_args()
        if output_dir is None:
            support.error(
                "the output_dir is not set! It is a mandatory parameter (-o output_dir)."
            )

        command_line, options, script, err_msg = get_options_from_params(
            os.path.join(output_dir, "params.txt"), args[0])

        if err_msg != "":
            support.error(err_msg)

        options_storage.first_command_line = [script] + options
Example #36
def write_single_read(in_file,
                      out_file,
                      read_name=None,
                      is_fastq=False,
                      is_python3=False):
    if read_name is None:
        read_name = support.process_readline(in_file.readline(), is_python3)
    if not read_name:
        return ""  # no next read
    read_value = support.process_readline(in_file.readline(), is_python3)
    line = support.process_readline(in_file.readline(), is_python3)
    fpos = in_file.tell()
    while (is_fastq
           and not line.startswith('+')) or (not is_fastq
                                             and not line.startswith('>')):
        read_value += line
        line = support.process_readline(in_file.readline(), is_python3)
        if not line:
            if fpos == in_file.tell():
                break
            fpos = in_file.tell()
    out_file.write(read_name + '\n')
    out_file.write(read_value + '\n')

    if is_fastq:
        read_quality = support.process_readline(in_file.readline(), is_python3)
        line = support.process_readline(in_file.readline(), is_python3)
        while not line.startswith('@'):
            read_quality += line
            line = support.process_readline(in_file.readline(), is_python3)
            if not line:
                if fpos == in_file.tell():
                    break
                fpos = in_file.tell()
        if len(read_value) != len(read_quality):
            support.error(
                "The length of sequence and quality lines should be the same! "
                "Check read %s (SEQ length is %d, QUAL length is %d)" %
                (read_name, len(read_value), len(read_quality)))
        out_file.write("+\n")
        out_file.write(read_quality + '\n')
    return line  # next read name or empty string
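Because process_readline strips trailing newlines, an empty string can mean either a blank line or EOF; the fpos == in_file.tell() check disambiguates, since the file position stops advancing only at EOF. A standalone sketch of that guard:

import io

fh = io.StringIO("ACGT\n\nTTTT\n")
fpos = fh.tell()
while True:
    line = fh.readline().rstrip("\n")
    if not line and fpos == fh.tell():
        break  # position did not advance -> real EOF, not just a blank line
    fpos = fh.tell()
    print(repr(line))  # 'ACGT', '', 'TTTT'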
Example No. 37
0
def save_restart_options(log):
    if dataset_yaml_filename:
        support.error("you cannot specify --dataset with --restart-from option!", log)
    if single_cell:
        support.error("you cannot specify --sc with --restart-from option!", log)
    if iontorrent:
        support.error("you cannot specify --iontorrent with --restart-from option!", log)
    if only_assembler:
        support.error("you cannot specify --only-assembler with --restart-from option!", log)
    if only_error_correction:
        support.error("you cannot specify --only-error-correction with --restart-from option!", log)

    global restart_k_mers
    global restart_careful
    global restart_mismatch_corrector
    global restart_disable_gzip_output
    global restart_disable_rr
    global restart_threads
    global restart_memory
    global restart_tmp_dir
    global restart_qvoffset
    global restart_cov_cutoff
    global restart_developer_mode
    global restart_reference
    global restart_configs_dir
    global restart_read_buffer_size

    restart_k_mers = k_mers
    restart_careful = careful
    restart_mismatch_corrector = mismatch_corrector
    restart_disable_gzip_output = disable_gzip_output
    restart_disable_rr = disable_rr
    restart_threads = threads
    restart_memory = memory
    restart_tmp_dir = tmp_dir
    restart_qvoffset = qvoffset
    restart_cov_cutoff = cov_cutoff
    restart_developer_mode = developer_mode
    restart_reference = reference
    restart_configs_dir = configs_dir
    restart_read_buffer_size = read_buffer_size
Example No. 38
0
def run_bh(result_filename, configs_dir, execution_home, cfg, ext_python_modules_home, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    dst_configs = os.path.join(cfg.output_dir, "configs")
    if os.path.exists(dst_configs):
        shutil.rmtree(dst_configs)
    shutil.copytree(os.path.join(configs_dir, "hammer"), dst_configs)
    cfg_file_name = os.path.join(dst_configs, "config.info")
    # removing template configs
    for root, dirs, files in os.walk(dst_configs):
        for cfg_file in files:
            cfg_file = os.path.join(root, cfg_file)
            if cfg_file.endswith('.info.template'):
                if os.path.isfile(cfg_file.split('.template')[0]):
                    os.remove(cfg_file)
                else:
                    os.rename(cfg_file, cfg_file.split('.template')[0])

    prepare_config_bh(cfg_file_name, cfg, log)

    command = [os.path.join(execution_home, "hammer"),
               os.path.abspath(cfg_file_name)]

    log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
    support.sys_call(command, log)
    corrected_dataset_yaml_filename = os.path.join(cfg.tmp_dir, "corrected.yaml")
    if not os.path.isfile(corrected_dataset_yaml_filename):
        support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
    corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
    if cfg.gzip_output:
        log.info("\n== Compressing corrected reads (with gzip)")
    move_dataset_files(corrected_dataset_data, cfg.output_dir, ext_python_modules_home, cfg.max_threads, log, cfg.gzip_output)
    corrected_dataset_yaml_filename = result_filename
    pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'))
    log.info("\n== Dataset description file created: " + corrected_dataset_yaml_filename + "\n")

    shutil.rmtree(cfg.tmp_dir)
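The dataset description written above is a plain YAML list of library dicts; a minimal sketch with stock PyYAML standing in for the bundled pyyaml2/pyyaml3 (file names are illustrative):

import yaml  # stand-in for the bundled pyyaml2/pyyaml3 modules

libraries = [{"type": "paired-end",
              "left reads": ["corrected_1.fastq.gz"],
              "right reads": ["corrected_2.fastq.gz"]}]
with open("corrected.yaml", "w") as f:
    yaml.dump(libraries, f, default_flow_style=False)
print(yaml.safe_load(open("corrected.yaml"))[0]["type"])  # paired-end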
Example No. 39
0
def main():
    args = parse_args()

    # create logger
    log = logging.getLogger("Preprocess Lucigen NxMate reads")
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter("%(message)s"))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    try:
        with open(args.args_filename) as f:
            lines = f.readlines()
            for infile1, infile2 in zip(lines[0::2], lines[1::2]):
                lucigen_nxmate.process_reads(infile1, infile2, args.dst, log,
                                             args.threads)
    except ImportError:
        support.error(
            "can't process Lucigen NxMate reads! lucigen_nxmate.py is missing!",
            log)
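The args file is expected to list left and right read files on alternating lines, which the lines[0::2] / lines[1::2] slicing turns into pairs, e.g.:

lines = ["sample_R1.fastq\n", "sample_R2.fastq\n",   # hypothetical contents
         "other_R1.fastq\n", "other_R2.fastq\n"]
pairs = list(zip(lines[0::2], lines[1::2]))
# [('sample_R1.fastq\n', 'sample_R2.fastq\n'), ('other_R1.fastq\n', 'other_R2.fastq\n')]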
Example No. 40
0
def check_cfg_for_partial_run(cfg,
                              partial_run_type="restart-from"
                              ):  # restart-from or stop-after
    if partial_run_type == "restart-from":
        check_point = options_storage.args.restart_from
        action = "restart from"
        verb = "was"
    elif partial_run_type == "stop-after":
        check_point = options_storage.args.stop_after
        action = "stop after"
        verb = "is"
    else:
        return

    if check_point == "ec" and ("error_correction" not in cfg):
        support.error(
            "failed to %s 'read error correction' ('%s') because this stage %s not specified!"
            % (action, check_point, verb))
    if check_point == "mc" and ("mismatch_corrector" not in cfg):
        support.error(
            "failed to %s 'mismatch correction' ('%s') because this stage %s not specified!"
            % (action, check_point, verb))
    if check_point == "as" or check_point.startswith('k'):
        if "assembly" not in cfg:
            support.error(
                "failed to %s 'assembling' ('%s') because this stage %s not specified!"
                % (action, check_point, verb))
Example No. 41
0
def move_dataset_files(dataset_data, dst, ext_python_modules_home, max_threads, log, gzip=False):
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                moved_reads_files = []
                for reads_file in value:
                    dst_filename = os.path.join(dst, os.path.basename(reads_file))
                    # TODO: fix problem with files with the same basenames in Hammer binary!
                    if not os.path.isfile(reads_file):
                        if (not gzip and os.path.isfile(dst_filename)) or (gzip and os.path.isfile(dst_filename + '.gz')):
                            support.warning('file with corrected reads (' + reads_file + ') is the same in several libraries', log)
                            if gzip:
                                dst_filename += '.gz'
                        else:
                            support.error('something went wrong and file with corrected reads (' + reads_file + ') is missing!', log)
                    else:
                        shutil.move(reads_file, dst_filename)
                        if gzip:
                            to_compress.append(dst_filename)
                            dst_filename += '.gz'
                    moved_reads_files.append(dst_filename)
                reads_library[key] = moved_reads_files
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file]) for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
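The compression fallback above (pigz if installed, otherwise gzip runs parallelized with the bundled joblib) can be approximated with the standard library alone; a hedged sketch, not the pipeline's actual code path:

import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor

def compress_files(files, max_threads):
    if not files:
        return
    pigz_path = shutil.which("pigz")  # pigz is multi-threaded by itself
    if pigz_path:
        for f in files:
            subprocess.check_call([pigz_path, "-f", "-7", "-p", str(max_threads), f])
    else:
        with ThreadPoolExecutor(max_workers=min(len(files), max_threads)) as pool:
            for _ in pool.map(lambda f: subprocess.check_call(["gzip", "-f", "-7", f]),
                              files):
                pass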
Example No. 42
0
def check_cfg_for_restart_from(cfg):
    if options_storage.restart_from == 'ec' and ("error_correction"
                                                 not in cfg):
        support.error(
            "failed to restart from read error correction because this stage was not specified!"
        )
    if options_storage.restart_from == 'mc' and ("mismatch_corrector"
                                                 not in cfg):
        support.error(
            "failed to restart from mismatch correction because this stage was not specified!"
        )
    if options_storage.restart_from == 'as' or options_storage.restart_from.startswith(
            'k'):
        if "assembly" not in cfg:
            support.error(
                "failed to restart from assembling because this stage was not specified!"
            )
        if options_storage.restart_from.startswith('k'):
            correct_k = False
            k_to_check = options_storage.k_mers
            if not k_to_check:
                if options_storage.auto_K_allowed():
                    k_to_check = list(
                        set(options_storage.K_MERS_SHORT +
                            options_storage.K_MERS_150 +
                            options_storage.K_MERS_250))
                else:
                    k_to_check = options_storage.K_MERS_SHORT
            for k in k_to_check:
                if options_storage.restart_from == (
                        "k%d" % k) or options_storage.restart_from.startswith(
                            "k%d:" % k):
                    correct_k = True
                    break
            if not correct_k:
                k_str = options_storage.restart_from[1:]
                if k_str.find(":") != -1:
                    k_str = k_str[:k_str.find(":")]
                support.error(
                    "failed to restart from K=%s because this K was not specified!"
                    % k_str)
Example No. 44
0
def check_cfg_for_partial_run(cfg, type='restart-from'):  # restart-from or stop-after
    if type == 'restart-from':
        check_point = options_storage.restart_from
        action = 'restart from'
        verb = 'was'
    elif type == 'stop-after':
        check_point = options_storage.stop_after
        action = 'stop after'
        verb = 'is'
    else:
        return

    if check_point == 'ec' and ("error_correction" not in cfg):
        support.error("failed to " + action + " 'read error correction' ('" + check_point + "') because this stage " + verb + " not specified!")
    if check_point == 'mc' and ("mismatch_corrector" not in cfg):
        support.error("failed to " + action + " 'mismatch correction' ('" + check_point + "') because this stage " + verb + " not specified!")
    if check_point == 'as' or check_point.startswith('k'):
        if "assembly" not in cfg:
            support.error("failed to " + action + " 'assembling' ('" + check_point + "') because this stage " + verb + " not specified!")
        if check_point.startswith('k'):
            correct_k = False
            k_to_check = options_storage.k_mers
            if not k_to_check:
                if options_storage.auto_K_allowed():
                    k_to_check = list(set(options_storage.K_MERS_SHORT + options_storage.K_MERS_150 + options_storage.K_MERS_250))
                else:
                    k_to_check = options_storage.K_MERS_SHORT
            for k in k_to_check:
                if check_point == ("k%d" % k) or check_point.startswith("k%d:" % k):
                    correct_k = True
                    break
            if not correct_k:
                k_str = check_point[1:]
                if k_str.find(":") != -1:
                    k_str = k_str[:k_str.find(":")]
                support.error("failed to " + action + " K=%s because this K " % k_str + verb + " not specified!")
Example No. 45
0
def main(ds_args_list, general_args_list, spades_home, bin_home):
    log = logging.getLogger('dipspades')
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)
    ds_args = parse_arguments(ds_args_list, log)

    if not os.path.exists(ds_args.output_dir):
        os.makedirs(ds_args.output_dir)
    log_filename = os.path.join(ds_args.output_dir, "dipspades.log")
    if os.path.exists(log_filename):
        os.remove(log_filename)
    log_handler = logging.FileHandler(log_filename, mode='a')
    log.addHandler(log_handler)

    params_filename = os.path.join(ds_args.output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='a')
    log.addHandler(params_handler)

    log.info("\n")
    log.info("General command line: " + " ".join(general_args_list) + "\n")
    log.info("dipSPAdes command line: " + " ".join(ds_args_list) + "\n")
    print_ds_args(ds_args, log)
    log.removeHandler(params_handler)

    log.info("\n======= dipSPAdes started. Log can be found here: " + log_filename + "\n")
    write_haplocontigs_in_file(ds_args.haplocontigs, ds_args.haplocontigs_fnames)

    config_fname = prepare_configs(os.path.join(spades_home, "configs", "dipspades"), ds_args, log)
    ds_args.tmp_dir = support.get_tmp_dir(prefix="dipspades_", base_dir=ds_args.tmp_dir)
    prepare_config(config_fname, ds_args, log)

    try:
        log.info("===== Assembling started.\n")
        binary_path = os.path.join(bin_home, "spades-dipspades-core")
        command = [binary_path, config_fname]
        support.sys_call(command, log)
        log.info("\n===== Assembling finished.\n")
        print_ds_output(ds_args.output_dir, log)
        if os.path.isdir(ds_args.tmp_dir):
            shutil.rmtree(ds_args.tmp_dir)
        log.info("\n======= dipSPAdes finished.\n")
        log.info("dipSPAdes log can be found here: " + log_filename + "\n")
        log.info("Thank you for using dipSPAdes!")
        log.removeHandler(log_handler)
    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message(), dipspades=True)
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException: # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log, dipspades=True)
Example No. 46
0
def compress_dataset_files(input_file, ext_python_modules_home, max_threads,
                           log, not_used_yaml_file, output_dir, gzip_output):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith("2."):
        import pyyaml2 as pyyaml
        from joblib2 import Parallel, delayed
    elif sys.version.startswith("3."):
        import pyyaml3 as pyyaml
        from joblib3 import Parallel, delayed

    dataset_data = pyyaml.load(open(input_file))
    remove_not_corrected_reads(output_dir)
    is_changed = False
    if gzip_output:
        is_changed = True
        pigz_path = support.which("pigz")
        if pigz_path:
            compressor = "pigz"
        else:
            compressor = "gzip"
        log.info("\n== Compressing corrected reads (with %s)" % compressor)
        to_compress = []
        for reads_library in dataset_data:
            for key, value in reads_library.items():
                if key.endswith("reads"):
                    compressed_reads_filenames = []
                    for reads_file in value:
                        compressed_reads_filenames.append(reads_file + ".gz")
                        to_compress.append(reads_file)
                    reads_library[key] = compressed_reads_filenames

        if len(to_compress):
            for reads_file in to_compress:
                if not isfile(reads_file):
                    support.error(
                        "something went wrong and file with corrected reads (%s) is missing!"
                        % reads_file, log)

            if pigz_path:
                for reads_file in to_compress:
                    support.sys_call([
                        pigz_path, "-f", "-7", "-p",
                        str(max_threads), reads_file
                    ], log)
            else:
                n_jobs = min(len(to_compress), max_threads)
                outputs = Parallel(n_jobs=n_jobs)(
                    delayed(support.sys_call)(["gzip", "-f", "-7", reads_file])
                    for reads_file in to_compress)
                for output in outputs:
                    if output:
                        log.info(output)

    if not_used_yaml_file != "":
        is_changed = True
        not_used_dataset_data = pyyaml.load(open(not_used_yaml_file))
        dataset_data += not_used_dataset_data
    if is_changed:
        with open(input_file, 'w') as f:
            pyyaml.dump(dataset_data,
                        f,
                        default_flow_style=False,
                        default_style='"',
                        width=float("inf"))
def nx_seq_junction(infilename1, infilename2, dst, log, silent=True):
    starttime = time.time()

    basename1 = os.path.basename(infilename1)
    if os.path.splitext(basename1)[1] == '.gz':
        basename1 = os.path.splitext(basename1)[0]
    basename2 = os.path.basename(infilename2)
    if os.path.splitext(basename2)[1] == '.gz':
        basename2 = os.path.splitext(basename2)[0]
    #open three outfiles
    splitfilenameleft = os.path.join(dst, 'R1_IJS7_' + basename1)
    splitfile1 = open(splitfilenameleft, 'w')

    splitfilenameright = os.path.join(dst, 'R2_IJS7_' + basename2)
    splitfile2 = open(splitfilenameright, 'w')

    unsplitfilename = os.path.join(dst, 'unsplit_IJS7_' + basename1.replace('_R1_', '_R1R2_'))
    unsplitfile = open(unsplitfilename, 'w')

    # jctstr = '(GGTTCATCGTCAGGCCTGACGATGAACC){e<=4}'  # JS7 24/28 required; results in ~92% detected in Ion Torrent
    # from NextClip: --adaptor_sequence GTTCATCGTCAGG -e --strict_match 22,11 --relaxed_match 20,10, e.g. strict 22/26 = 4 errors, relaxed 20/26 = 6 errors
    jctstr = '(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}'  # try 22/26 to match NextClip strict (e<=6 for relaxed)

    #PARSE both files in tuples of 4 lines
    parserR1 = ParseFastQ(infilename1)
    parserR2 = ParseFastQ(infilename2)

    all_stats = JunctionStats()
    n_jobs = options_storage.threads
    while True:
        # prepare input
        reads1 = list(itertools.islice(parserR1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parserR2, READS_PER_BATCH))
        if len(reads1) != len(reads2):
            support.error("lucigen_nxmate.py, nx_seq_junction: "
                          "number of left reads (%d) is not equal to number of right reads (%d)!"
                          % (len(reads1), len(reads2)), log)
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        # processing
        outputs = Parallel(n_jobs=n_jobs)(delayed(nx_seq_junction_process_batch)(reads, jctstr)
                                          for reads in chunks)
        results, stats = [x[0] for x in outputs], [x[1] for x in outputs]
        # writing results
        for result, stat in zip(results, stats):
            write_to_files([splitfile1, splitfile2, unsplitfile], result)
            all_stats += stat
        if not silent:
            log.info("==== nx_seq_junction progress: reads processed: %d, time elapsed: %s"
                     % (all_stats.readcounter, time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))))
    parserR1.close()
    parserR2.close()

    splitfile1.close()
    splitfile2.close()
    unsplitfile.close()

    if all_stats.readcounter == 0:
        support.error("lucigen_nxmate.py, nx_seq_junction: error in input data! Number of processed reads is 0!", log)
    if all_stats.splitcounter == 0:
        support.error("lucigen_nxmate.py, nx_seq_junction: error in input data! Number of split pairs is 0!", log)
    if not silent:
        #print some stats
        percentsplit = 100 * all_stats.splitcounter / all_stats.readcounter
        percentR1R2 = 100 * all_stats.R1R2jctcounter / all_stats.splitcounter
        percentR1 = 100 * all_stats.R1jctcounter / all_stats.splitcounter
        percentR2 = 100 * all_stats.R2jctcounter / all_stats.splitcounter
        log.info("==== nx_seq_junction info: processing finished!")
        log.info("==== nx_seq_junction info: %d reads processed" % (all_stats.readcounter))
        log.info("==== nx_seq_junction info: %d total split pairs (%.2f %% of processed reads))"
                 % (all_stats.splitcounter, percentsplit))
        log.info("==== nx_seq_junction info: %d junctions in both R1 and R2 (%.2f %% of split junctions))"
                 % (all_stats.R1R2jctcounter, percentR1R2))
        log.info("==== nx_seq_junction info: %d split junctions are in Read1 (%.2f %% of split junctions))"
                 % (all_stats.R1jctcounter, percentR1))
        log.info("==== nx_seq_junction info: %d split junctions are in Read2 (%.2f %% of split junctions))"
                 % (all_stats.R2jctcounter, percentR2))
        elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))
        log.info("==== nx_seq_junction info: time elapsed: %s" % (elapsedtime))
    return splitfilenameleft, splitfilenameright, unsplitfilename
def chimera_clean(infilename1, infilename2, dst, log, silent=True):
    starttime = time.time()

    basename1 = os.path.basename(infilename1)
    if os.path.splitext(basename1)[1] == '.gz':
        basename1 = os.path.splitext(basename1)[0]
    basename2 = os.path.basename(infilename2)
    if os.path.splitext(basename2)[1] == '.gz':
        basename2 = os.path.splitext(basename2)[0]
    #open four outfiles
    outfilename1 = os.path.join(dst, 'mates_ICC4_' + basename1)
    outfile1 = open(outfilename1, 'w')

    slagfilename1 = os.path.join(dst, 'non-mates_ICC4_' + basename1)
    slagfile1 = open(slagfilename1, 'w')

    outfilename2 = os.path.join(dst, 'mates_ICC4_' + basename2)
    outfile2 = open(outfilename2, 'w')

    slagfilename2 = os.path.join(dst, 'non-mates_ICC4_' + basename2)
    slagfile2 = open(slagfilename2, 'w')

    # set up regular expression patterns for chimera codes; for Illumina, use the reverse complements of the right codes
    csslist1 = ['(TGGACTCCACTGTG){e<=1}', '(ACTTCGCCACTGTG){e<=1}', '(TGAGTCCCACTGTG){e<=1}', '(TGACTGCCACTGTG){e<=1}',
                '(TCAGGTCCACTGTG){e<=1}', '(ATGTCACCACTGTG){e<=1}', '(GTATGACCACTGTG){e<=1}', '(GTCTACCCACTGTG){e<=1}',
                '(GTTGGACCACTGTG){e<=1}', '(CGATTCCCACTGTG){e<=1}', '(GGTTACCCACTGTG){e<=1}', '(TCACCTCCACTGTG){e<=1}']

    csslist2 = ['(TCCAGACCAATGTG){e<=1}', '(ACATCACCAATGTG){e<=1}', '(TCACGACCAATGTG){e<=1}', '(TAGCACCCAATGTG){e<=1}',
                '(AACCTCCCAATGTG){e<=1}', '(ACAACTCCAATGTG){e<=1}', '(GTCTAACCAATGTG){e<=1}', '(TACACGCCAATGTG){e<=1}',
                '(GAGAACCCAATGTG){e<=1}', '(GAGATTCCAATGTG){e<=1}', '(GACCTACCAATGTG){e<=1}', '(AGACTCCCAATGTG){e<=1}']

    #PARSE both files in tuples of 4 lines
    parserR1 = ParseFastQ(infilename1)
    parserR2 = ParseFastQ(infilename2)

    all_stats = CleanStats()
    n_jobs = options_storage.threads
    while True:
        # prepare input
        reads1 = list(itertools.islice(parserR1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parserR2, READS_PER_BATCH))
        if len(reads1) != len(reads2):
            support.error("lucigen_nxmate.py, chimera_clean: "
                          "number of left reads (%d) is not equal to number of right reads (%d)!"
                          % (len(reads1), len(reads2)), log)
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        # processing
        outputs = Parallel(n_jobs=n_jobs)(delayed(chimera_clean_process_batch)(reads, csslist1, csslist2)
                                          for reads in chunks)
        results, stats = [x[0] for x in outputs], [x[1] for x in outputs]
        # writing results
        for result, stat in zip(results, stats):
            write_to_files([outfile1, outfile2, slagfile1, slagfile2], result)
            all_stats += stat
        if not silent:
            log.info("==== chimera_clean progress: reads processed: %d, time elapsed: %s"
                     % (all_stats.readcounter, time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))))
    parserR1.close()
    parserR2.close()

    outfile1.close()
    slagfile1.close()
    outfile2.close()
    slagfile2.close()

    if all_stats.TOTALmatecounter + all_stats.slagcounter != all_stats.readcounter:
        support.error("lucigen_nxmate.py, chimera_clean: error in the script somewhere! Unequal read counts!", log)
    if all_stats.readcounter == 0:
        support.error("lucigen_nxmate.py, chimera_clean: error in input data! Number of processed reads is 0!", log)
    if not silent:
        #print some stats
        percentmates = 100. * all_stats.matecounter / all_stats.readcounter
        percentslag = 100. * all_stats.slagcounter / all_stats.readcounter
        log.info("==== chimera_clean info: processing finished!")
        log.info("==== chimera_clean info: %d reads processed, %d true mate reads (%.2f %%) "
                 "and %d non-mates/chimeras (%.2f %%)."
                 % (all_stats.readcounter, all_stats.matecounter, percentmates, all_stats.slagcounter, percentslag))
        shortmates = all_stats.TOTALmatecounter - all_stats.matecounter
        log.info("==== chimera_clean info: %d mates too short to keep after trimming" % shortmates)
        elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))
        log.info("==== chimera_clean info: time elapsed: %s" % (elapsedtime))
        log.info("==== chimera_clean info: " + str(all_stats.csscounter))
    return outfilename1, outfilename2
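Both the junction and chimera patterns rely on the fuzzy-matching syntax of the third-party regex module, where {e<=N} allows up to N errors; a minimal demonstration with a hypothetical read:

import regex  # the same third-party module the script checks for below

jct = regex.compile('(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}')
m = jct.search("AAAGTTCATCGTCAGGCCTGACGATGAATAAA")  # one substitution at the end
print(m.span(), m.fuzzy_counts)  # fuzzy_counts = (substitutions, insertions, deletions)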

import os
import time
import support
import gzip
import itertools
import sys
from site import addsitedir
import spades_init
import options_storage

try:
    import regex
except ImportError:
    support.error("Can't process Lucigen NxMate reads! Python module regex is not installed!")

addsitedir(spades_init.ext_python_modules_home)
if sys.version.startswith('2.'):
    from joblib2 import Parallel, delayed
elif sys.version.startswith('3.'):
    from joblib3 import Parallel, delayed

    
# CONSTANTS
READS_PER_THREAD = 25000
READS_PER_BATCH = READS_PER_THREAD * options_storage.threads  # e.g. 100000 for 4 threads
minseq = 25  # minimum length sequence to keep after trimming


class ParseFastQ(object):
Example No. 51
0
def fill_cfg(options_to_parse, log):
    try:
        options, not_options = getopt.gnu_getopt(options_to_parse, options_storage.short_options, options_storage.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage(spades_version)
        sys.exit(1)

    if not options:
        options_storage.usage(spades_version)
        sys.exit(1)

    # all parameters are stored here
    cfg = dict()
    # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER paired-end libs and MAX_LIBS_NUMBER mate-pair libs
    dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER * 2)]

    options_storage.continue_mode = False
    for opt, arg in options:
        if opt == '-o':
            options_storage.output_dir = arg
        elif opt == "--tmp-dir":
            options_storage.tmp_dir = arg
        elif opt == "--reference":
            options_storage.reference = support.check_file_existence(arg, 'reference', log)
        elif opt == "--dataset":
            options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)

        elif opt in options_storage.reads_options:
            support.add_to_dataset(opt, arg, dataset_data)

        elif opt == '-k':
            options_storage.k_mers = list(map(int, arg.split(",")))
            for k in options_storage.k_mers:
                if k > 127:
                    support.error('wrong k value ' + str(k) + ': all k values should be less than 128', log)
                if k % 2 == 0:
                    support.error('wrong k value ' + str(k) + ': all k values should be odd', log)

        elif opt == "--sc":
            options_storage.single_cell = True
        elif opt == "--disable-gzip-output":
            options_storage.disable_gzip_output = True

        elif opt == "--only-error-correction":
            if options_storage.only_assembler:
                support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
            options_storage.only_error_correction = True
        elif opt == "--only-assembler":
            if options_storage.only_error_correction:
                support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
            options_storage.only_assembler = True

        elif opt == "--bh-heap-check":
            options_storage.bh_heap_check = arg
        elif opt == "--spades-heap-check":
            options_storage.spades_heap_check = arg

        elif opt == "--continue":
            options_storage.continue_mode = True

        elif opt == '-t' or opt == "--threads":
            options_storage.threads = int(arg)
        elif opt == '-m' or opt == "--memory":
            options_storage.memory = int(arg)
        elif opt == "--phred-offset":
            if int(arg) in [33, 64]:
                options_storage.qvoffset = int(arg)
            else:
                support.error('wrong PHRED quality offset value ' + str(arg) + ': should be either 33 or 64', log)
        elif opt == '-i' or opt == "--iterations":
            options_storage.iterations = int(arg)

        elif opt == "--debug":
            options_storage.developer_mode = True

        elif opt == "--rectangles":
            options_storage.rectangles = True

        #corrector
        elif opt == "--mismatch-correction":
            options_storage.mismatch_corrector = True

        elif opt == "--careful":
            options_storage.mismatch_corrector = True
            options_storage.careful = True

        elif opt == '-h' or opt == "--help":
            options_storage.usage(spades_version)
            sys.exit(0)
        elif opt == "--help-hidden":
            options_storage.usage(spades_version, True)
            sys.exit(0)

        elif opt == "--test":
            options_storage.set_test_options()
            support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
            support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
            #break
        else:
            raise ValueError


    if not options_storage.output_dir:
        support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
    if not os.path.isdir(options_storage.output_dir):
        if options_storage.continue_mode:
            support.error("the output_dir should exist for --continue!", log)
        os.makedirs(options_storage.output_dir)
    if options_storage.continue_mode:
        return None, None

    if options_storage.dataset_yaml_filename:
        try:
            dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r'))
        except pyyaml.YAMLError:
            _, exc, _ = sys.exc_info()
            support.error('exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc))
        dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename))
    else:
        dataset_data = support.correct_dataset(dataset_data)
        dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))

    support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
    if support.dataset_has_only_mate_pairs_libraries(dataset_data):
        support.error('you should specify at least one paired-end or unpaired library (only mate-pairs libraries were found)!')
    if options_storage.rectangles and (len(dataset_data) > 1):
        support.error('rectangle graph algorithm for repeat resolution cannot work with multiple libraries!')

    ### FILLING cfg
    cfg["common"] = empty_config()
    cfg["dataset"] = empty_config()
    if not options_storage.only_assembler:
        cfg["error_correction"] = empty_config()
    if not options_storage.only_error_correction:
        cfg["assembly"] = empty_config()

    # common
    cfg["common"].__dict__["output_dir"] = os.path.abspath(options_storage.output_dir)
    cfg["common"].__dict__["max_threads"] = options_storage.threads
    cfg["common"].__dict__["max_memory"] = options_storage.memory
    cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode

    # dataset section
    cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell
    cfg["dataset"].__dict__["yaml_filename"] = os.path.abspath(options_storage.dataset_yaml_filename)
    if options_storage.developer_mode and options_storage.reference:
        cfg["dataset"].__dict__["reference"] = options_storage.reference

    # error correction
    if (not options_storage.only_assembler) and (options_storage.iterations > 0):
        cfg["error_correction"].__dict__["output_dir"] = os.path.join(cfg["common"].output_dir, "corrected")
        cfg["error_correction"].__dict__["max_iterations"] = options_storage.iterations
        cfg["error_correction"].__dict__["gzip_output"] = not options_storage.disable_gzip_output
        if options_storage.qvoffset:
            cfg["error_correction"].__dict__["qvoffset"] = options_storage.qvoffset
        if options_storage.bh_heap_check:
            cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check
        if options_storage.tmp_dir:
            cfg["error_correction"].__dict__["tmp_dir"] = options_storage.tmp_dir
        else:
            cfg["error_correction"].__dict__["tmp_dir"] = cfg["error_correction"].output_dir
        cfg["error_correction"].tmp_dir = os.path.join(os.path.abspath(cfg["error_correction"].tmp_dir), 'tmp')

    # assembly
    if not options_storage.only_error_correction:
        if options_storage.k_mers:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
        else:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers_short
        cfg["assembly"].__dict__["careful"] = options_storage.careful
        if options_storage.spades_heap_check:
            cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check

    #corrector can work only if contigs exist (not only error correction)
    if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
        cfg["mismatch_corrector"] = empty_config()
        cfg["mismatch_corrector"].__dict__["skip-masked"] = ""
        cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(bin_home, "bwa-spades")
        cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads
        cfg["mismatch_corrector"].__dict__["output-dir"] = options_storage.output_dir

    return cfg, dataset_data
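The cfg sections filled above are bare attribute namespaces populated through __dict__; the class below is an assumed stand-in for SPAdes' empty_config():

class _Section(object):
    pass  # attribute bag, standing in for empty_config()

cfg = {"common": _Section()}
cfg["common"].__dict__["output_dir"] = "/tmp/spades_out"  # hypothetical path
cfg["common"].__dict__["max_threads"] = 4
print(cfg["common"].output_dir, cfg["common"].max_threads)  # /tmp/spades_out 4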
Example No. 52
0
def main():
    os.environ["LC_ALL"] = "C"

    if len(sys.argv) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    cfg, dataset_data = fill_cfg(sys.argv, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"))
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning.")
        cfg, dataset_data = fill_cfg(options, log)
        options_storage.continue_mode = True

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
    else:
        params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
        params_handler = logging.FileHandler(params_filename, mode='w')
        log.addHandler(params_handler)

        command = "Command line:"
        for v in sys.argv:
            command += " " + v
        log.info(command)

        print_used_values(cfg, log)
        log.removeHandler(params_handler)

        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads if needed
    if support.dataset_has_interlaced_reads(dataset_data):
        dir_for_split_reads = os.path.join(os.path.abspath(options_storage.output_dir), 'split_reads')
        if not os.path.isdir(dir_for_split_reads):
            os.makedirs(dir_for_split_reads)
        dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = os.path.abspath(options_storage.dataset_yaml_filename)

    try:
        # copying configs before all computations (so they cannot change at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            shutil.copytree(os.path.join(spades_home, "configs"), tmp_configs_dir)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode:
                log.info("\n===== Skipping read error correction (already processed). \n")
            else:
                options_storage.continue_mode = False # continue from here

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)

                os.makedirs(bh_cfg.output_dir)
                if not os.path.exists(bh_cfg.tmp_dir):
                    os.makedirs(bh_cfg.tmp_dir)

                log.info("\n===== Read error correction started. \n")
                bh_logic.run_bh(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg,
                    ext_python_modules_home, log)
                log.info("\n===== Read error correction finished. \n")

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
            spades_cfg.__dict__["additional_contigs"] = os.path.join(spades_cfg.output_dir, "simplified_contigs.fasta")

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg and
                                                      os.path.isfile(assembled_contigs_filename))):
                log.info("\n===== Skipping assembling (already processed). \n")
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from the beginning.")
            else:
                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename))
                if support.dataset_has_paired_reads(dataset_data):
                    spades_cfg.__dict__["paired_mode"] = True
                else:
                    spades_cfg.__dict__["paired_mode"] = False

                if options_storage.rectangles:
                    spades_cfg.__dict__["resolving_mode"] = "rectangles"

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== Assembling started.\n")

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                    if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(process_cfg.process_spaces(os.path.abspath(cfg["dataset"].reference)) + '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, log)

                #rectangles
                if spades_cfg.paired_mode and options_storage.rectangles:
                    if options_storage.continue_mode: # TODO: continue mode
                        support.warning("sorry, --continue doesn't work with --rectangles yet. Skipping repeat resolving.")
                    else:
                        sys.path.append(os.path.join(python_modules_home, "rectangles"))
                        import rrr

                        rrr_input_dir = os.path.join(latest_dir, "saves")
                        rrr_outpath = os.path.join(spades_cfg.output_dir, "rectangles")
                        if not os.path.exists(rrr_outpath):
                            os.mkdir(rrr_outpath)

                        rrr_reference_information_file = os.path.join(rrr_input_dir,
                            "late_pair_info_counted_etalon_distance.txt")
                        rrr_test_util = rrr.TestUtils(rrr_reference_information_file,
                            os.path.join(rrr_outpath, "rectangles.log"))
                        rrr.resolve(rrr_input_dir, rrr_outpath, rrr_test_util, "", cfg["dataset"].single_cell, spades_cfg.careful)

                        shutil.copyfile(os.path.join(rrr_outpath, "rectangles_extend_before_scaffold.fasta"), spades_cfg.result_contigs)
                        shutil.copyfile(os.path.join(rrr_outpath, "rectangles_extend.fasta"), spades_cfg.result_scaffolds)

                        if not spades_cfg.developer_mode:
                            if os.path.exists(rrr_input_dir):
                                shutil.rmtree(rrr_input_dir)
                            if os.path.exists(rrr_outpath):
                                shutil.rmtree(rrr_outpath, True)
                            if os.path.exists(rrr_outpath):
                                os.system('rm -r ' + rrr_outpath)
                                #EOR

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)
                    if os.path.isfile(spades_cfg.additional_contigs):
                        shutil.move(spades_cfg.additional_contigs, misc_dir)

                log.info("\n===== Assembling finished. \n")

            #corrector
            if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename) or
                                                (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                                 os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for k, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    shutil.move(old, new)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                    (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)):
                    log.info("\n===== Skipping mismatch correction (already processed). \n")
                else:
                    log.info("\n===== Mismatch correction started.")

                    # detecting paired-end library with the largest insert size
                    dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r')) ### initial dataset, i.e. before error correction
                    dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename))
                    paired_end_libraries_ids = []
                    for lib_id, reads_library in enumerate(dataset_data):
                        if reads_library['type'] == 'paired-end':
                            paired_end_libraries_ids.append(lib_id)
                    if not len(paired_end_libraries_ids):
                        support.error('Mismatch correction cannot be performed without at least one paired-end library!')
                    estimated_params = load_config_from_file(os.path.join(latest_dir, "_est_params.info"))
                    max_insert_size = -1
                    target_paired_end_library_id = -1
                    for lib_id in paired_end_libraries_ids:
                        if float(estimated_params.__dict__["insert_size_" + str(lib_id)]) > max_insert_size:
                            max_insert_size = float(estimated_params.__dict__["insert_size_" + str(lib_id)])
                            target_paired_end_library_id = lib_id
                    yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename)
                    cfg["mismatch_corrector"].__dict__["1"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                        dataset_data[target_paired_end_library_id]['left reads']))
                    cfg["mismatch_corrector"].__dict__["2"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                        dataset_data[target_paired_end_library_id]['right reads']))
                    cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_insert_size)
                    #TODO: add reads orientation

                    import corrector
                    corrector_cfg = cfg["mismatch_corrector"]
                    args = []
                    for key, values in corrector_cfg.__dict__.items():
                        if key == "output-dir":
                            continue

                        # for processing list of reads
                        if not isinstance(values, list):
                            values = [values]
                        for value in values:
                            if len(key) == 1:
                                args.append('-' + key)
                            else:
                                args.append('--' + key)
                            if value is not None:
                                args.append(value)

                    # processing contigs and scaffolds (or only contigs)
                    for k, (corrected, assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(corrected):
                            log.info("\n== Skipping processing of " + k + " (already processed)\n")
                            continue

                        options_storage.continue_mode = False
                        log.info("\n== Processing of " + k + "\n")

                        cur_args = args[:]
                        cur_args += ['-c', assembled]
                        tmp_dir_for_corrector = os.path.join(corrector_cfg.__dict__["output-dir"], "mismatch_corrector_" + k)
                        cur_args += ['--output-dir', tmp_dir_for_corrector]

                        # correcting
                        corrector.main(cur_args, ext_python_modules_home, log)

                        result_corrected_filename = os.path.abspath(os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta"))
                        # moving corrected contigs (scaffolds) to SPAdes output dir
                        if os.path.isfile(result_corrected_filename):
                            shutil.move(result_corrected_filename, corrected)

                        if os.path.isdir(tmp_dir_for_corrector):
                            shutil.rmtree(tmp_dir_for_corrector)

                    log.info("\n===== Mismatch correction finished.\n")

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        #log.info("")
        if os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + os.path.dirname(corrected_dataset_yaml_filename) + "/")
        if os.path.isfile(result_contigs_filename):
            log.info(" * Assembled contigs are in " + result_contigs_filename)
        if os.path.isfile(result_scaffolds_filename):
            log.info(" * Assembled scaffolds are in " + result_scaffolds_filename)
        #log.info("")

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            threshold = 3
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                support.break_scaffolds(result_scaffolds_filename, threshold, result_broken_scaffolds)
                #log.info(" * Scaffolds broken by " + str(threshold) + " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        _, exc, _ = sys.exc_info()
        log.exception(exc)
        support.error("exception caught", log)
Example no. 53
0
            options_storage.output_dir = arg
        elif opt == "--tmp-dir":
            options_storage.tmp_dir = arg
        elif opt == "--reference":
            options_storage.reference = support.check_file_existence(arg, "reference", log)
        elif opt == "--dataset":
            options_storage.dataset_yaml_filename = support.check_file_existence(arg, "dataset", log)

        elif opt in options_storage.reads_options:
            support.add_to_dataset(opt, arg, dataset_data)

        elif opt == "-k":
            options_storage.k_mers = list(map(int, arg.split(",")))
            for k in options_storage.k_mers:
                if k > 127:
                    support.error("wrong k value " + str(k) + ": all k values should be less than 128", log)
                if k % 2 == 0:
                    support.error("wrong k value " + str(k) + ": all k values should be odd", log)

        elif opt == "--sc":
            options_storage.single_cell = True
        elif opt == "--disable-gzip-output":
            options_storage.disable_gzip_output = True

        elif opt == "--only-error-correction":
            if options_storage.only_assembler:
                support.error("you cannot specify --only-error-correction and --only-assembler simultaneously")
            options_storage.only_error_correction = True
        elif opt == "--only-assembler":
            if options_storage.only_error_correction:
                support.error("you cannot specify --only-error-correction and --only-assembler simultaneously")
Example no. 54
0
def main(ds_args_list, general_args_list, spades_home, bin_home):
    log = logging.getLogger('dipspades')
    log.setLevel(logging.DEBUG)
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)
    ds_args = parse_arguments(ds_args_list, log)

    if not os.path.exists(ds_args.output_dir):
        os.makedirs(ds_args.output_dir)
    log_filename = os.path.join(ds_args.output_dir, "dipspades.log")
    if os.path.exists(log_filename):
        os.remove(log_filename)
    log_handler = logging.FileHandler(log_filename, mode='a')
    log.addHandler(log_handler)

    params_filename = os.path.join(ds_args.output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='a')
    log.addHandler(params_handler)

    log.info("\n")
    log.info("General command line: " + " ".join(general_args_list) + "\n")
    log.info("dipSPAdes command line: " + " ".join(ds_args_list) + "\n")
    print_ds_args(ds_args, log)
    log.removeHandler(params_handler)

    log.info("\n======= dipSPAdes started. Log can be found here: " + log_filename + "\n")
    write_haplocontigs_in_file(ds_args.haplocontigs, ds_args.haplocontigs_fnames)

    config_fname = prepare_configs(os.path.join(spades_home, "configs", "dipspades"), ds_args, log)
    ds_args.tmp_dir = support.get_tmp_dir(prefix="dipspades_", base_dir=ds_args.tmp_dir)
    prepare_config(config_fname, ds_args, log)

    try:
        log.info("===== Assembling started.\n")
        binary_path = os.path.join(bin_home, "dipspades")
        command = [binary_path, config_fname]
        support.sys_call(command, log)
        log.info("\n===== Assembling finished.\n")
        print_ds_output(ds_args.output_dir, log)
        if os.path.isdir(ds_args.tmp_dir):
            shutil.rmtree(ds_args.tmp_dir)
        log.info("\n======= dipSPAdes finished.\n")
        log.info("dipSPAdes log can be found here: " + log_filename + "\n")
        log.info("Thank you for using dipSPAdes!")
        log.removeHandler(log_handler)
    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message(), dipspades=True)
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException: # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log, dipspades=True)
Example no. 55
0
def main():
    all_long_options = list(set(options_storage.long_options + dipspades_logic.DS_Args_List.long_options))
    all_short_options = options_storage.short_options + dipspades_logic.DS_Args_List.short_options

    dipspades_logic_args = []
    spades_py_args = ["--diploid"]

    try:
        options, not_options = getopt.gnu_getopt(sys.argv, all_short_options, all_long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        options_storage.usage(spades_version, mode="dip")
        sys.stderr.flush()
        sys.exit(1)
    if not options:
        options_storage.usage(spades_version, mode="dip")
        sys.stderr.flush()
        sys.exit(1)

    output_dir = None
    spades_py_run_needed = False
    for opt, arg in options:
        # processing some special options
        if opt == '--test':
            output_dir = abspath("test_dipspades")
            spades_py_args = ["--diploid", "-1", os.path.join(spades_init.spades_home, "test_dataset/ecoli_1K_1.fq.gz"),
                              "-2", os.path.join(spades_init.spades_home, "test_dataset/ecoli_1K_2.fq.gz"), "--only-assembler"]
            dipspades_logic_args = []
            spades_py_run_needed = True
            break
        if opt == '-o':
            output_dir = abspath(expanduser(arg))
        elif opt == '--careful' or opt == '--mismatch-correction':
            continue
        if opt == '-v' or opt == '--version':
            options_storage.version(spades_version, mode="dip")
            sys.exit(0)
        if opt == '-h' or opt == '--help':
            options_storage.usage(spades_version, mode="dip")
            sys.exit(0)
        elif opt == "--help-hidden":
            options_storage.usage(spades_version, show_hidden=True, mode="dip")
            sys.exit(0)
        # for all other options
        cur_opt_arg = [opt]
        if arg:
            cur_opt_arg.append(arg)
        if opt.startswith("--"):  # long option
            if opt[2:] in options_storage.long_options or (opt[2:] + "=") in options_storage.long_options:
                spades_py_args += cur_opt_arg
                if opt[2:] in dipspades_logic.DS_Args_List.long_options or (opt[2:] + "=") in dipspades_logic.DS_Args_List.long_options:
                    dipspades_logic_args += cur_opt_arg
                else:
                    spades_py_run_needed = True
            else:
                dipspades_logic_args += cur_opt_arg
        else: # short option
            if opt != '-o':
                if opt[1:] in options_storage.short_options:
                    spades_py_args += cur_opt_arg
                    if opt[1:] in dipspades_logic.DS_Args_List.short_options:
                        dipspades_logic_args += cur_opt_arg
                    else:
                        spades_py_run_needed = True
                else:
                    dipspades_logic_args += cur_opt_arg

    if not output_dir:
        support.error("The output_dir is not set! It is a mandatory parameter (-o output_dir).", dipspades=True)

    spades_output_dir = os.path.join(output_dir, "spades")
    dipspades_output_dir = os.path.join(output_dir, "dipspades")

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if not os.path.isdir(spades_output_dir):
        os.makedirs(spades_output_dir)
    if not os.path.isdir(dipspades_output_dir):
        os.makedirs(dipspades_output_dir)

    spades_result = ""
    if spades_py_run_needed:
        spades_py_args += ["-o", spades_output_dir]
        spades.main(spades_py_args)
        spades_result = os.path.join(spades_output_dir, "contigs.fasta")
        if not os.path.isfile(spades_result):
            support.error("Something went wrong and SPAdes did not generate haplocontigs. "
                      "DipSPAdes cannot proceed without them, aborting.", dipspades=True)

    dipspades_logic_args += ["-o", dipspades_output_dir]
    if spades_result != "":
        dipspades_logic_args += ["--hap", spades_result]
    dipspades_logic.main(dipspades_logic_args, sys.argv, spades.spades_home, spades.bin_home)
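Both this wrapper and fill_cfg in the next example rely on getopt.gnu_getopt, which returns the parsed (option, value) pairs plus the leftover positional arguments. Because the scripts pass the full sys.argv, the program name always lands in the leftovers, which is why fill_cfg only complains when len(not_options) > 1. A small sketch with made-up option tables:

import getopt

short_options = "o:k:t:"
long_options = ["careful", "only-assembler"]
argv = ["spades.py", "-o", "out", "--careful", "-k", "21,33", "stray.fq"]

options, not_options = getopt.gnu_getopt(argv, short_options, long_options)
print(options)      # [('-o', 'out'), ('--careful', ''), ('-k', '21,33')]
print(not_options)  # ['spades.py', 'stray.fq']: argv[0] plus unclaimed paths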
Example no. 56
0
def fill_cfg(options_to_parse, log):
    try:
        options, not_options = getopt.gnu_getopt(options_to_parse, options_storage.short_options, options_storage.long_options)
    except getopt.GetoptError:
        _, exc, _ = sys.exc_info()
        sys.stderr.write(str(exc) + "\n")
        sys.stderr.flush()
        options_storage.usage(spades_version)
        sys.exit(1)

    if not options:
        options_storage.usage(spades_version)
        sys.exit(1)

    if len(not_options) > 1:
        for opt, arg in options:
            if opt == "-k" and arg.strip().endswith(','):
                support.error("Do not put spaces after commas in the list of k-mers sizes! Correct example: -k 21,33,55", log)
        support.error("Please specify option (e.g. -1, -2, -s, etc) for the following paths: " + ", ".join(not_options[1:]) + "\n", log)

    # all parameters are stored here
    cfg = dict()
    # dataset is stored here. We are prepared for up to MAX_LIBS_NUMBER libraries of each short-read type
    dataset_data = [{} for i in range(options_storage.MAX_LIBS_NUMBER * len(options_storage.SHORT_READS_TYPES.keys()))]  # "[{}] * num" doesn't work here: all entries would share one dict!

    # for parsing options from "previous run command"
    options_storage.continue_mode = False
    options_storage.k_mers = None

    for opt, arg in options:
        if opt == '-o':
            options_storage.output_dir = os.path.abspath(arg)
        elif opt == "--tmp-dir":
            options_storage.tmp_dir = os.path.abspath(arg)
        elif opt == "--reference":
            options_storage.reference = support.check_file_existence(arg, 'reference', log)
        elif opt == "--dataset":
            options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)

        elif opt in options_storage.reads_options:
            support.add_to_dataset(opt, arg, dataset_data)

        elif opt == '-k':
            if arg == 'auto':
                options_storage.k_mers = arg
            else:
                options_storage.k_mers = list(map(int, arg.split(",")))
                for k in options_storage.k_mers:
                    if k < options_storage.MIN_K or k > options_storage.MAX_K:
                        support.error('wrong k value ' + str(k) + ': all k values should be between %d and %d' %
                                                                  (options_storage.MIN_K, options_storage.MAX_K), log)
                    if k % 2 == 0:
                        support.error('wrong k value ' + str(k) + ': all k values should be odd', log)

        elif opt == "--sc":
            options_storage.single_cell = True
        elif opt == "--iontorrent":
            options_storage.iontorrent = True
        elif opt == "--disable-gzip-output":
            options_storage.disable_gzip_output = True
        elif opt == "--disable-gzip-output:false":
            options_storage.disable_gzip_output = False
        elif opt == "--disable-rr":
            options_storage.disable_rr = True
        elif opt == "--disable-rr:false":
            options_storage.disable_rr = False

        elif opt == "--only-error-correction":
            if options_storage.only_assembler:
                support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
            options_storage.only_error_correction = True
        elif opt == "--only-assembler":
            if options_storage.only_error_correction:
                support.error('you cannot specify --only-error-correction and --only-assembler simultaneously')
            options_storage.only_assembler = True

        elif opt == "--read-buffer-size":
            options_storage.read_buffer_size = int(arg)
        elif opt == "--bh-heap-check":
            options_storage.bh_heap_check = arg
        elif opt == "--spades-heap-check":
            options_storage.spades_heap_check = arg

        elif opt == "--continue":
            options_storage.continue_mode = True
        elif opt == "--restart-from":
            if arg not in ['ec', 'as', 'mc'] and not arg.startswith('k'):
                support.error("wrong value for --restart-from option: " + arg + " (only 'ec', 'as', 'k<int>', 'mc' are available)", log)
            options_storage.continue_mode = True
            options_storage.restart_from = arg

        elif opt == '-t' or opt == "--threads":
            options_storage.threads = int(arg)
        elif opt == '-m' or opt == "--memory":
            options_storage.memory = int(arg)
        elif opt == "--phred-offset":
            if arg == 'auto':
                options_storage.qvoffset = arg
            elif arg in ['33', '64']:
                options_storage.qvoffset = int(arg)
            else:
                support.error('wrong PHRED quality offset value ' + str(arg) + ': should be either 33 or 64', log)
        elif opt == '-i' or opt == "--iterations":
            options_storage.iterations = int(arg)

        elif opt == "--debug":
            options_storage.developer_mode = True
        elif opt == "--debug:false":
            options_storage.developer_mode = False

        #corrector
        elif opt == "--mismatch-correction":
            options_storage.mismatch_corrector = True
        elif opt == "--mismatch-correction:false":
            options_storage.mismatch_corrector = False

        elif opt == "--careful":
            options_storage.mismatch_corrector = True
            options_storage.careful = True
        elif opt == "--careful:false":
            options_storage.mismatch_corrector = False
            options_storage.careful = False

        elif opt == '-h' or opt == "--help":
            options_storage.usage(spades_version)
            sys.exit(0)
        elif opt == "--help-hidden":
            options_storage.usage(spades_version, True)
            sys.exit(0)

        elif opt == "--test":
            options_storage.set_test_options()
            support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
            support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
            #break
        elif opt == "--diploid":
            options_storage.diploid_mode = True
        else:
            raise ValueError("unknown option: " + opt)


    if not options_storage.output_dir:
        support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
    if not os.path.isdir(options_storage.output_dir):
        if options_storage.continue_mode:
            support.error("the output_dir should exist for --continue and for --restart-from!", log)
        os.makedirs(options_storage.output_dir)
    if options_storage.restart_from:
        if options_storage.continue_mode: # saving parameters specified with --restart-from
            if not support.dataset_is_empty(dataset_data):
                support.error("you cannot specify reads with --restart-from option!", log)
            options_storage.save_restart_options(log)
        else:  # overriding previous run parameters
            options_storage.load_restart_options()
    if options_storage.continue_mode:
        return None, None

    if options_storage.dataset_yaml_filename:
        try:
            dataset_data = pyyaml.load(open(options_storage.dataset_yaml_filename, 'r'))
        except pyyaml.YAMLError:
            _, exc, _ = sys.exc_info()
            support.error('exception caught while parsing YAML file (' + options_storage.dataset_yaml_filename + '):\n' + str(exc))
        dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(options_storage.dataset_yaml_filename))
    else:
        dataset_data = support.correct_dataset(dataset_data)
        dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))

    support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
    if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
        support.error('you should specify at least one unpaired, paired-end, or high-quality mate-pairs library!')

    options_storage.set_default_values()
    ### FILLING cfg
    cfg["common"] = empty_config()
    cfg["dataset"] = empty_config()
    if not options_storage.only_assembler:
        cfg["error_correction"] = empty_config()
    if not options_storage.only_error_correction:
        cfg["assembly"] = empty_config()

    # common
    cfg["common"].__dict__["output_dir"] = options_storage.output_dir
    cfg["common"].__dict__["tmp_dir"] = options_storage.tmp_dir
    cfg["common"].__dict__["max_threads"] = options_storage.threads
    cfg["common"].__dict__["max_memory"] = options_storage.memory
    cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode

    # dataset section
    cfg["dataset"].__dict__["single_cell"] = options_storage.single_cell
    cfg["dataset"].__dict__["iontorrent"] = options_storage.iontorrent
    cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
    if options_storage.developer_mode and options_storage.reference:
        cfg["dataset"].__dict__["reference"] = options_storage.reference

    # error correction
    if (not options_storage.only_assembler) and (options_storage.iterations > 0):
        cfg["error_correction"].__dict__["output_dir"] = os.path.join(cfg["common"].output_dir, "corrected")
        cfg["error_correction"].__dict__["max_iterations"] = options_storage.iterations
        cfg["error_correction"].__dict__["gzip_output"] = not options_storage.disable_gzip_output
        if options_storage.qvoffset:
            cfg["error_correction"].__dict__["qvoffset"] = options_storage.qvoffset
        if options_storage.bh_heap_check:
            cfg["error_correction"].__dict__["heap_check"] = options_storage.bh_heap_check
        cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent

    # assembly
    if not options_storage.only_error_correction:
        if options_storage.k_mers:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.k_mers
        else:
            cfg["assembly"].__dict__["iterative_K"] = options_storage.K_MERS_SHORT
        cfg["assembly"].__dict__["careful"] = options_storage.careful
        cfg["assembly"].__dict__["disable_rr"] = options_storage.disable_rr
        cfg["assembly"].__dict__["diploid_mode"] = options_storage.diploid_mode
        if options_storage.spades_heap_check:
            cfg["assembly"].__dict__["heap_check"] = options_storage.spades_heap_check
        if options_storage.read_buffer_size:
            cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size

    #corrector can work only if contigs exist (not only error correction)
    if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
        cfg["mismatch_corrector"] = empty_config()
        cfg["mismatch_corrector"].__dict__["skip-masked"] = None
        cfg["mismatch_corrector"].__dict__["bwa"] = os.path.join(bin_home, "bwa-spades")
        cfg["mismatch_corrector"].__dict__["threads"] = options_storage.threads
        cfg["mismatch_corrector"].__dict__["output-dir"] = options_storage.output_dir

    return cfg, dataset_data
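fill_cfg returns cfg as a dict of attribute-bag sections, populated through __dict__ so that keys which are not valid Python identifiers (e.g. "skip-masked") still work. A standalone sketch of the pattern; empty_config here is a local stand-in for the helper the real script imports:

class empty_config(object):
    # stand-in for the imported empty_config(): just an attribute bag
    pass

cfg = dict()
cfg["common"] = empty_config()
cfg["common"].__dict__["output_dir"] = "/tmp/spades_out"
cfg["common"].__dict__["max_threads"] = 16

cfg["mismatch_corrector"] = empty_config()
cfg["mismatch_corrector"].__dict__["skip-masked"] = None  # not a valid identifier

print(cfg["common"].output_dir, cfg["common"].max_threads)  # /tmp/spades_out 16
print(cfg["mismatch_corrector"].__dict__["skip-masked"])    # None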
Example no. 57
0
def main(args):
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        options_storage.usage(spades_version)
        sys.exit(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    cfg, dataset_data = fill_cfg(args, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"))
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
        cfg, dataset_data = fill_cfg(options, log)
        if options_storage.restart_from:
            check_cfg_for_restart_from(cfg)
        options_storage.continue_mode = True

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            updated_params = ""
            flag = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    flag = True
                    continue
                if flag:
                    flag = False
                    continue
                updated_params += " " + v
            updated_params = updated_params.strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        command = "Command line:"
        for v in args:
            command += " " + v
        log.info(command)

    # special case
    if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
        support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
        del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
        if support.dataset_has_interlaced_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent them from changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
                    pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'))
                    bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)
                hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, not_used_dataset_data,
                    ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg and
                                                      os.path.isfile(assembled_contigs_filename)))\
                and not options_storage.restart_from == 'as' \
                and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):

                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
            else:
                old_result_files = [result_contigs_filename, result_scaffolds_filename,
                                    assembled_contigs_filename, assembled_scaffolds_filename]
                for format in [".fasta", ".fastg"]:
                    for old_result_file in old_result_files:
                        if os.path.isfile(old_result_file[:-6] + format):
                            os.remove(old_result_file[:-6] + format)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename))
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                    if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

                if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith('k'):
                    k_str = options_storage.restart_from[1:]
                    if k_str.find(":") != -1:
                        k_str = k_str[:k_str.find(":")]
                    support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)

            #corrector
            if "mismatch_corrector" in cfg and (os.path.isfile(result_contigs_filename) or
                                                (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
                STAGE_NAME = "Mismatch correction"
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                                 os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for assembly_type, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    for format in [".fasta", ".fastg"]:
                        if os.path.isfile(old[:-6] + format):
                            shutil.move(old[:-6] + format, new[:-6] + format)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                    (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                    and not options_storage.restart_from == 'mc':
                    log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                else:
                    if options_storage.restart_from == 'mc':
                        support.continue_from_here(log)

                    log.info("\n===== %s started." % STAGE_NAME)
                    # detecting paired-end library with the largest insert size
                    est_params_data = pyyaml.load(open(os.path.join(latest_dir, "final.lib_data"), 'r'))
                    max_IS_library = None
                    for reads_library in est_params_data:
                        if reads_library['type'] == 'paired-end':
                            if not max_IS_library or float(reads_library["insert size mean"]) > float(max_IS_library["insert size mean"]):
                                max_IS_library = reads_library
                    if not max_IS_library:
                        support.error('Mismatch correction cannot be performed without at least one paired-end library!', log)
                    if not max_IS_library["insert size mean"]:
                        support.warning('Failed to estimate insert size for all paired-end libraries. Starting Mismatch correction'
                                        ' based on the first paired-end library and with default insert size.', log)
                    else:
                        cfg["mismatch_corrector"].__dict__["insert-size"] = round(max_IS_library["insert size mean"])
                    yaml_dirname = os.path.dirname(options_storage.dataset_yaml_filename)
                    cfg["mismatch_corrector"].__dict__["1"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                        max_IS_library['left reads']))
                    cfg["mismatch_corrector"].__dict__["2"] = list(map(lambda x: os.path.join(yaml_dirname, x),
                        max_IS_library['right reads']))
                    #TODO: add reads orientation

                    import corrector
                    corrector_cfg = cfg["mismatch_corrector"]
                    args = []
                    for key, values in corrector_cfg.__dict__.items():
                        if key == "output-dir":
                            continue

                        # for processing list of reads
                        if not isinstance(values, list):
                            values = [values]
                        for value in values:
                            if len(key) == 1:
                                args.append('-' + key)
                            else:
                                args.append('--' + key)
                            if value is not None:
                                args.append(value)

                    # processing contigs and scaffolds (or only contigs)
                    for assembly_type, (corrected, assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(corrected):
                            log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                            continue

                        support.continue_from_here(log)
                        log.info("\n== Processing of " + assembly_type + "\n")

                        cur_args = args[:]
                        cur_args += ['-c', assembled]
                        tmp_dir_for_corrector = support.get_tmp_dir(prefix="mis_cor_%s_" % assembly_type)
                        cur_args += ['--output-dir', tmp_dir_for_corrector]

                        # correcting
                        corrector.main(cur_args, ext_python_modules_home, log)

                        result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
                        # moving corrected contigs (scaffolds) to SPAdes output dir
                        if os.path.isfile(result_corrected_filename):
                            shutil.move(result_corrected_filename, corrected)

                        if os.path.isdir(tmp_dir_for_corrector):
                            shutil.rmtree(tmp_dir_for_corrector)

                        assembled_fastg = assembled[:-6] + ".fastg"
                        if os.path.isfile(assembled_fastg):
                            support.create_fastg_from_fasta(corrected, assembled_fastg, log)
                    log.info("\n===== %s finished.\n" % STAGE_NAME)

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        #log.info("")
        if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
            log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
        if "assembly" in cfg and os.path.isfile(result_contigs_filename):
            message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
            if os.path.isfile(result_contigs_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_contigs_filename[:-6] + ".fastg") + ")"
            log.info(message)
        if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
            message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
            if os.path.isfile(result_scaffolds_filename[:-6] + ".fastg"):
                message += " (" + os.path.basename(result_scaffolds_filename[:-6] + ".fastg") + ")"
            log.info(message)
        #log.info("")

        #breaking scaffolds
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename,
                    options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds, broken_scaffolds)
                    #log.info(" * Scaffolds broken by " + str(options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS) +
                    # " Ns are in " + result_broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                if os.path.isfile(result_filename):
                    result_fasta = list(support.read_fasta(result_filename))
                    # correctness check: should be one contig of length 1000 bp
                    correct_number = 1
                    correct_length = 1000
                    if not len(result_fasta):
                        support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                    elif len(result_fasta) > correct_number:
                        support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
                                      (result_filename, correct_number, len(result_fasta)))
                    elif len(result_fasta[0][1]) != correct_length:
                        if len(result_fasta[0][1]) > correct_length:
                            relation = "more"
                        else:
                            relation = "less"
                        support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
                                      (result_filename, relation, correct_length, len(result_fasta[0][1])))
                else:
                    support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")


        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException: # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
def main(args):
    os.environ["LC_ALL"] = "C"

    if len(args) == 1:
        show_usage(0)

    log = logging.getLogger('spades')
    log.setLevel(logging.DEBUG)

    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(message)s'))
    console.setLevel(logging.DEBUG)
    log.addHandler(console)

    support.check_binaries(bin_home, log)

    # parse options and save all parameters to cfg
    options = args
    cfg, dataset_data = fill_cfg(options, log)

    if options_storage.continue_mode:
        cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
        if not options:
            support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
        cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
        if options_storage.restart_from:
            check_cfg_for_partial_run(cfg, type='restart-from')
        options_storage.continue_mode = True
    if options_storage.stop_after:
        check_cfg_for_partial_run(cfg, type='stop-after')

    log_filename = os.path.join(cfg["common"].output_dir, "spades.log")
    if options_storage.continue_mode:
        log_handler = logging.FileHandler(log_filename, mode='a')
    else:
        log_handler = logging.FileHandler(log_filename, mode='w')
    log.addHandler(log_handler)

    if options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline continued. Log can be found here: " + log_filename + "\n")
        log.info("Restored from " + cmd_line)
        if options_storage.restart_from:
            updated_params = ""
            skip_next = False
            for v in args[1:]:
                if v == '-o' or v == '--restart-from':
                    skip_next = True
                    continue
                if skip_next or v.startswith('--restart-from='):  # you can specify '--restart-from=k33' but not '-o=out_dir'
                    skip_next = False
                    continue
                updated_params += " " + v
            updated_params = updated_params.strip()
            log.info("with updated parameters: " + updated_params)
            cmd_line += " " + updated_params
        log.info("")

    params_filename = os.path.join(cfg["common"].output_dir, "params.txt")
    params_handler = logging.FileHandler(params_filename, mode='w')
    log.addHandler(params_handler)

    if options_storage.continue_mode:
        log.info(cmd_line)
    else:
        command = "Command line: "
        for v in args:
            # substituting relative paths with absolute ones (read paths, output dir path, etc)
            v, prefix = support.get_option_prefix(v)
            if v in options_storage.dict_of_rel2abs.keys():
                v = options_storage.dict_of_rel2abs[v]
            if prefix:
                command += prefix + ":"
            command += v + " "
        log.info(command)

    # special case
#    if "mismatch_corrector" in cfg and not support.get_lib_ids_by_type(dataset_data, 'paired-end'):
#        support.warning('cannot perform mismatch correction without at least one paired-end library! Skipping this step.', log)
#        del cfg["mismatch_corrector"]

    print_used_values(cfg, log)
    log.removeHandler(params_handler)

    support.check_single_reads_in_options(options, log)

    if not options_storage.continue_mode:
        log.info("\n======= SPAdes pipeline started. Log can be found here: " + log_filename + "\n")

    # splitting interlaced reads and processing Ns in additional contigs if needed
    if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_additional_contigs(dataset_data)\
            or support.dataset_has_nxmate_reads(dataset_data):
        dir_for_split_reads = os.path.join(options_storage.output_dir, 'split_input')
        if support.dataset_has_interlaced_reads(dataset_data) or support.dataset_has_nxmate_reads(dataset_data):
            if not os.path.isdir(dir_for_split_reads):
                os.makedirs(dir_for_split_reads)
            if support.dataset_has_interlaced_reads(dataset_data):
                dataset_data = support.split_interlaced_reads(dataset_data, dir_for_split_reads, log)
            if support.dataset_has_nxmate_reads(dataset_data):
                dataset_data = support.process_nxmate_reads(dataset_data, dir_for_split_reads, log)
        if support.dataset_has_additional_contigs(dataset_data):
            dataset_data = support.process_Ns_in_additional_contigs(dataset_data, dir_for_split_reads, log)
        options_storage.dataset_yaml_filename = os.path.join(options_storage.output_dir, "input_dataset.yaml")
        pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
        cfg["dataset"].yaml_filename = options_storage.dataset_yaml_filename

    try:
        # copying configs before all computations (to prevent them from changing at run time)
        tmp_configs_dir = os.path.join(cfg["common"].output_dir, "configs")
        if os.path.isdir(tmp_configs_dir) and not options_storage.continue_mode:
            shutil.rmtree(tmp_configs_dir)
        if not os.path.isdir(tmp_configs_dir):
            if options_storage.configs_dir:
                dir_util.copy_tree(options_storage.configs_dir, tmp_configs_dir, preserve_times=False)
            else:
                dir_util.copy_tree(os.path.join(spades_home, "configs"), tmp_configs_dir, preserve_times=False)

        corrected_dataset_yaml_filename = ''
        if "error_correction" in cfg:
            STAGE_NAME = "Read error correction"
            bh_cfg = merge_configs(cfg["error_correction"], cfg["common"])
            corrected_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "corrected.yaml")
            if os.path.isfile(corrected_dataset_yaml_filename) and options_storage.continue_mode \
                and not options_storage.restart_from == "ec":
                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
            else:
                support.continue_from_here(log)

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in bh_cfg.__dict__:
                    os.environ["HEAPCHECK"] = bh_cfg.heap_check

                if os.path.exists(bh_cfg.output_dir):
                    shutil.rmtree(bh_cfg.output_dir)
                os.makedirs(bh_cfg.output_dir)

                if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
                    not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
                    to_correct_dataset_yaml_filename = os.path.join(bh_cfg.output_dir, "to_correct.yaml")
                    pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'))
                    bh_cfg.__dict__["dataset_yaml_filename"] = to_correct_dataset_yaml_filename
                else:
                    not_used_dataset_data = None
                    bh_cfg.__dict__["dataset_yaml_filename"] = cfg["dataset"].yaml_filename

                log.info("\n===== %s started. \n" % STAGE_NAME)

                hammer_logic.run_hammer(corrected_dataset_yaml_filename, tmp_configs_dir, bin_home, bh_cfg, not_used_dataset_data,
                    ext_python_modules_home, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)
            if options_storage.stop_after == 'ec':
                support.finish_here(log)

        result_contigs_filename = os.path.join(cfg["common"].output_dir, "contigs.fasta")
        result_scaffolds_filename = os.path.join(cfg["common"].output_dir, "scaffolds.fasta")
        result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, "assembly_graph.fastg")
        truseq_long_reads_file_base = os.path.join(cfg["common"].output_dir, "truseq_long_reads")
        truseq_long_reads_file = truseq_long_reads_file_base + ".fasta"
        misc_dir = os.path.join(cfg["common"].output_dir, "misc")
        ### if mismatch correction is enabled then result contigs are copied to misc directory
        assembled_contigs_filename = os.path.join(misc_dir, "assembled_contigs.fasta")
        assembled_scaffolds_filename = os.path.join(misc_dir, "assembled_scaffolds.fasta")
        if "assembly" in cfg and not options_storage.run_completed:
            STAGE_NAME = "Assembling"
            spades_cfg = merge_configs(cfg["assembly"], cfg["common"])
            spades_cfg.__dict__["result_contigs"] = result_contigs_filename
            spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
            spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename

            if options_storage.continue_mode and (os.path.isfile(spades_cfg.result_contigs)
                                                  or ("mismatch_corrector" in cfg and
                                                      os.path.isfile(assembled_contigs_filename)))\
                and not options_storage.restart_from == 'as' \
                and not options_storage.restart_from == 'scc' \
                and not (options_storage.restart_from and options_storage.restart_from.startswith('k')):

                log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                # calculating latest_dir for the next stages
                latest_dir = support.get_latest_dir(os.path.join(spades_cfg.output_dir, "K*"))
                if not latest_dir:
                    support.error("failed to continue the previous run! Please restart from previous stages or from the beginning.", log)
            else:
                old_result_files = [result_contigs_filename, result_scaffolds_filename,
                                    assembled_contigs_filename, assembled_scaffolds_filename]
                for old_result_file in old_result_files:
                    if os.path.isfile(old_result_file):
                        os.remove(old_result_file)

                if options_storage.restart_from == 'as':
                    support.continue_from_here(log)

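                # if error correction ran earlier, assemble the corrected reads;
                # paths inside the corrected YAML are stored relative to its directory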
                if os.path.isfile(corrected_dataset_yaml_filename):
                    dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
                    dataset_data = support.relative2abs_paths(dataset_data, os.path.dirname(corrected_dataset_yaml_filename))
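                # rr_enable toggles the repeat-resolution stage (disabled via --disable-rr)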
                if spades_cfg.disable_rr:
                    spades_cfg.__dict__["rr_enable"] = False
                else:
                    spades_cfg.__dict__["rr_enable"] = True

                if "HEAPCHECK" in os.environ:
                    del os.environ["HEAPCHECK"]
                if "heap_check" in spades_cfg.__dict__:
                    os.environ["HEAPCHECK"] = spades_cfg.heap_check

                log.info("\n===== %s started.\n" % STAGE_NAME)

                # creating dataset
                dataset_filename = os.path.join(spades_cfg.output_dir, "dataset.info")
                if not os.path.isfile(dataset_filename) or not options_storage.continue_mode:
                    dataset_file = open(dataset_filename, 'w')
                    import process_cfg
                    dataset_file.write("single_cell" + '\t' + process_cfg.bool_to_str(cfg["dataset"].single_cell) + '\n')
                    dataset_file.write("meta" + '\t' + process_cfg.bool_to_str(cfg["dataset"].meta) + '\n')
                    dataset_file.write("moleculo" + '\t' + process_cfg.bool_to_str(cfg["dataset"].truseq) + '\n')
                    if os.path.isfile(corrected_dataset_yaml_filename):
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(corrected_dataset_yaml_filename) + '\n')
                    else:
                        dataset_file.write("reads" + '\t' + process_cfg.process_spaces(cfg["dataset"].yaml_filename) + '\n')
                    if spades_cfg.developer_mode and "reference" in cfg["dataset"].__dict__:
                        dataset_file.write("reference_genome" + '\t')
                        dataset_file.write(process_cfg.process_spaces(cfg["dataset"].reference) + '\n')
                    dataset_file.close()
                spades_cfg.__dict__["dataset"] = dataset_filename
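                # for reference, dataset.info is a plain tab-separated key/value file;
                # an illustrative (made-up) instance:
                #   single_cell    false
                #   meta           false
                #   moleculo       false
                #   reads          /path/to/output/corrected/corrected.yaml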

                latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)

                if os.path.isdir(misc_dir) and not options_storage.continue_mode:
                    shutil.rmtree(misc_dir)
                if not os.path.isdir(misc_dir):
                    os.makedirs(misc_dir)

                if options_storage.continue_mode and options_storage.restart_from and options_storage.restart_from.startswith('k'):
                    k_str = options_storage.restart_from[1:]
                    if k_str.find(":") != -1:
                        k_str = k_str[:k_str.find(":")]
                    support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
                log.info("\n===== %s finished. \n" % STAGE_NAME)
            if not options_storage.run_completed:
                if options_storage.stop_after in ('as', 'scc') \
                        or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
                    support.finish_here(log)

            # TruSeq postprocessing
            if cfg["run_truseq_postprocessing"] and not options_storage.run_completed:
                if options_storage.continue_mode and os.path.isfile(truseq_long_reads_file_base + ".fastq") and not options_storage.restart_from == 'tpp':
                    log.info("\n===== Skipping %s (already processed). \n" % "TruSeq postprocessing")
                else:
                    support.continue_from_here(log)
                    if os.path.isfile(result_scaffolds_filename):
                        shutil.move(result_scaffolds_filename, assembled_scaffolds_filename)
                    reads_library = dataset_data[0]
                    alignment_bin = os.path.join(bin_home, "bwa-spades")
                    alignment_dir = os.path.join(cfg["common"].output_dir, "alignment")
                    sam_files = alignment.align_bwa(alignment_bin, assembled_scaffolds_filename, dataset_data, alignment_dir, log, options_storage.threads)
                    moleculo_postprocessing.moleculo_postprocessing(assembled_scaffolds_filename, truseq_long_reads_file_base, sam_files, log)
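                    # moleculo_postprocessing is expected to write the long-read results
                    # as <base>.fasta and <base>.fastq (cf. the continue-mode check above)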
                if options_storage.stop_after == 'tpp':
                    support.finish_here(log)

            # mismatch corrector
            if "mismatch_corrector" in cfg and not options_storage.run_completed and \
                    (os.path.isfile(result_contigs_filename) or
                    (options_storage.continue_mode and os.path.isfile(assembled_contigs_filename))):
                STAGE_NAME = "Mismatch correction"
                to_correct = dict()
                to_correct["contigs"] = (result_contigs_filename, assembled_contigs_filename)
                if os.path.isfile(result_scaffolds_filename) or (options_storage.continue_mode and
                                                                 os.path.isfile(assembled_scaffolds_filename)):
                    to_correct["scaffolds"] = (result_scaffolds_filename, assembled_scaffolds_filename)

                # moving assembled contigs (scaffolds) to misc dir
                for assembly_type, (old, new) in to_correct.items():
                    if options_storage.continue_mode and os.path.isfile(new):
                        continue
                    if os.path.isfile(old):
                        shutil.move(old, new)

                if options_storage.continue_mode and os.path.isfile(result_contigs_filename) and \
                    (os.path.isfile(result_scaffolds_filename) or not os.path.isfile(assembled_scaffolds_filename)) \
                    and not options_storage.restart_from == 'mc':
                    log.info("\n===== Skipping %s (already processed). \n" % STAGE_NAME)
                else:
                    if options_storage.restart_from == 'mc':
                        support.continue_from_here(log)

                    log.info("\n===== %s started." % STAGE_NAME)
                    # pass the full dataset description to the mismatch corrector
                    cfg["mismatch_corrector"].__dict__["dataset"] = cfg["dataset"].yaml_filename
                    # TODO: add reads orientation

                    import corrector_logic
                    corrector_cfg = cfg["mismatch_corrector"]
                    # processing contigs and scaffolds (or only contigs)
                    for assembly_type, (corrected, assembled) in to_correct.items():
                        if options_storage.continue_mode and os.path.isfile(corrected):
                            log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
                            continue

                        support.continue_from_here(log)
                        log.info("\n== Processing of " + assembly_type + "\n")

                        tmp_dir_for_corrector = os.path.join(cfg["common"].output_dir, "mismatch_corrector", assembly_type)

                        cfg["mismatch_corrector"].__dict__["output_dir"] = tmp_dir_for_corrector
                        # correcting
                        corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])
                        
                        result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")

                        corrector_logic.run_corrector(tmp_configs_dir, bin_home, corr_cfg,
                                                      ext_python_modules_home, log,
                                                      assembled, result_corrected_filename)

                        if os.path.isfile(result_corrected_filename):
                            shutil.copyfile(result_corrected_filename, corrected)
                        tmp_d = os.path.join(tmp_dir_for_corrector, "tmp")
                        if os.path.isdir(tmp_d) and not cfg["common"].developer_mode:
                            shutil.rmtree(tmp_d)
                    log.info("\n===== %s finished.\n" % STAGE_NAME)
                if options_storage.stop_after == 'mc':
                    support.finish_here(log)

        if not cfg["common"].developer_mode and os.path.isdir(tmp_configs_dir):
            shutil.rmtree(tmp_configs_dir)

        if not options_storage.run_completed:
            if "error_correction" in cfg and os.path.isdir(os.path.dirname(corrected_dataset_yaml_filename)):
                log.info(" * Corrected reads are in " + support.process_spaces(os.path.dirname(corrected_dataset_yaml_filename) + "/"))
            if "assembly" in cfg and os.path.isfile(result_contigs_filename):
                message = " * Assembled contigs are in " + support.process_spaces(result_contigs_filename)
                log.info(message)
            if "assembly" in cfg and os.path.isfile(result_scaffolds_filename):
                message = " * Assembled scaffolds are in " + support.process_spaces(result_scaffolds_filename)
                log.info(message)
            if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
                message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
                log.info(message)

        # breaking scaffolds
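        # (scaffolds containing runs of at least THRESHOLD_FOR_BREAKING_SCAFFOLDS Ns
        # are split into contig-like pieces and written to misc/)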
        if os.path.isfile(result_scaffolds_filename):
            if not os.path.isdir(misc_dir):
                os.makedirs(misc_dir)
            result_broken_scaffolds = os.path.join(misc_dir, "broken_scaffolds.fasta")
            if not os.path.isfile(result_broken_scaffolds) or not options_storage.continue_mode:
                modified, broken_scaffolds = support.break_scaffolds(result_scaffolds_filename,
                    options_storage.THRESHOLD_FOR_BREAKING_SCAFFOLDS)
                if modified:
                    support.write_fasta(result_broken_scaffolds, broken_scaffolds)

        ### printing WARNINGS SUMMARY
        if not support.log_warnings(log):
            log.info("\n======= SPAdes pipeline finished.")  # otherwise it finished WITH WARNINGS

        if options_storage.test_mode:
            if options_storage.truseq_mode:
                if not os.path.isfile(truseq_long_reads_file):
                    support.error("TEST FAILED: %s does not exist!" % truseq_long_reads_file)
            else:
                for result_filename in [result_contigs_filename, result_scaffolds_filename]:
                    if os.path.isfile(result_filename):
                        result_fasta = list(support.read_fasta(result_filename))
                        # correctness check: should be one contig of length 1000 bp
                        correct_number = 1
                        correct_length = 1000
                        if not len(result_fasta):
                            support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
                        elif len(result_fasta) > correct_number:
                            support.error("TEST FAILED: %s contains more than %d contig (%d)!" %
                                          (result_filename, correct_number, len(result_fasta)))
                        elif len(result_fasta[0][1]) != correct_length:
                            if len(result_fasta[0][1]) > correct_length:
                                relation = "more"
                            else:
                                relation = "less"
                            support.error("TEST FAILED: %s contains %s than %d bp (%d bp)!" %
                                          (result_filename, relation, correct_length, len(result_fasta[0][1])))
                    else:
                        support.error("TEST FAILED: " + result_filename + " does not exist!")
            log.info("\n========= TEST PASSED CORRECTLY.")


        log.info("\nSPAdes log can be found here: " + log_filename)
        log.info("")
        log.info("Thank you for using SPAdes!")
        log.removeHandler(log_handler)

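    # note: since Python 2.5, SystemExit and KeyboardInterrupt derive from
    # BaseException, so they bypass the handler below and reach the second one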
    except Exception:
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            if exc_type == OSError and exc_value.errno == errno.ENOEXEC: # Exec format error
                support.error("It looks like you are using SPAdes binaries for another platform.\n" +
                              support.get_spades_binaries_info_message())
            else:
                log.exception(exc_value)
                support.error("exception caught: %s" % exc_type, log)
    except BaseException:  # since python 2.5 system-exiting exceptions (e.g. KeyboardInterrupt) are derived from BaseException
        exc_type, exc_value, _ = sys.exc_info()
        if exc_type == SystemExit:
            sys.exit(exc_value)
        else:
            log.exception(exc_value)
            support.error("exception caught: %s" % exc_type, log)
Example #59
0
def run_hammer(corrected_dataset_yaml_filename, configs_dir, execution_home, cfg,
               dataset_data, ext_python_modules_home, only_compressing_is_needed, log):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith('2.'):
        import pyyaml2 as pyyaml
    elif sys.version.startswith('3.'):
        import pyyaml3 as pyyaml

    # not all reads need processing
    if support.get_lib_ids_by_type(dataset_data, options_storage.LONG_READS_TYPES):
        not_used_dataset_data = support.get_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
        to_correct_dataset_data = support.rm_libs_by_type(dataset_data, options_storage.LONG_READS_TYPES)
        to_correct_dataset_yaml_filename = os.path.join(cfg.output_dir, "to_correct.yaml")
        pyyaml.dump(to_correct_dataset_data, open(to_correct_dataset_yaml_filename, 'w'),
                    default_flow_style=False, default_style='"', width=100500)
        cfg.dataset_yaml_filename = to_correct_dataset_yaml_filename
    else:
        not_used_dataset_data = None

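    # on a continued run the correction binary may have already finished;
    # in that case only the compression/merging post-processing below is needed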
    if not only_compressing_is_needed:
        dst_configs = os.path.join(cfg.output_dir, "configs")
        if os.path.exists(dst_configs):
            shutil.rmtree(dst_configs)
        if cfg.iontorrent:
            dir_util.copy_tree(os.path.join(configs_dir, "ionhammer"), dst_configs, preserve_times=False)
            cfg_file_name = os.path.join(dst_configs, "ionhammer.cfg")
        else:
            dir_util.copy_tree(os.path.join(configs_dir, "hammer"), dst_configs, preserve_times=False)
            cfg_file_name = os.path.join(dst_configs, "config.info")

        cfg.tmp_dir = support.get_tmp_dir(prefix="hammer_")
        if cfg.iontorrent:
            prepare_config_ih(cfg_file_name, cfg, ext_python_modules_home)
            binary_name = "ionhammer"
        else:
            prepare_config_bh(cfg_file_name, cfg, log)
            binary_name = "hammer"

        command = [os.path.join(execution_home, binary_name),
                   os.path.abspath(cfg_file_name)]

        log.info("\n== Running read error correction tool: " + ' '.join(command) + "\n")
        support.sys_call(command, log)
        if not os.path.isfile(corrected_dataset_yaml_filename):
            support.error("read error correction finished abnormally: " + corrected_dataset_yaml_filename + " not found!")
    else:
        log.info("\n===== Skipping %s (already processed). \n" % "read error correction tool")
        support.continue_from_here(log)

    corrected_dataset_data = pyyaml.load(open(corrected_dataset_yaml_filename, 'r'))
    remove_not_corrected_reads(cfg.output_dir)
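    # rewrite the corrected-dataset YAML only if it actually changed
    # (compressed file names and/or re-attached long-read libraries)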
    is_changed = False
    if cfg.gzip_output:
        is_changed = True
        compress_dataset_files(corrected_dataset_data, ext_python_modules_home, cfg.max_threads, log)
    if not_used_dataset_data:
        is_changed = True
        corrected_dataset_data += not_used_dataset_data
    if is_changed:
        pyyaml.dump(corrected_dataset_data, open(corrected_dataset_yaml_filename, 'w'),
                    default_flow_style=False, default_style='"', width=100500)
    log.info("\n== Dataset description file was created: " + corrected_dataset_yaml_filename + "\n")

    # tmp_dir is set only when the correction binary was actually launched above
    if "tmp_dir" in cfg.__dict__ and os.path.isdir(cfg.tmp_dir):
        shutil.rmtree(cfg.tmp_dir)
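
A note on the checkpointing idiom seen throughout these examples: options_storage.stop_after ends the run once a named stage completes (via support.finish_here), while options_storage.restart_from skips work until the named stage is reached (via support.continue_from_here). The minimal, self-contained sketch below reproduces that control flow in isolation; the Pipeline class and the stage names are illustrative stand-ins, not part of SPAdes itself.

import logging

class Pipeline:
    def __init__(self, stop_after=None, restart_from=None):
        self.stop_after = stop_after
        self.restart_from = restart_from
        self.skipping = restart_from is not None
        self.completed = False
        self.log = logging.getLogger("pipeline")

    def run_stage(self, name, action):
        if self.completed:
            return
        if self.skipping and name != self.restart_from:
            self.log.info("Skipping %s (already processed)", name)
            return
        self.skipping = False  # reached the restart point; run from here on
        self.log.info("===== %s started", name)
        action()
        self.log.info("===== %s finished", name)
        if self.stop_after == name:
            self.completed = True  # analogous to support.finish_here()

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    pipeline = Pipeline(restart_from="as", stop_after="mc")
    pipeline.run_stage("ec", lambda: None)   # skipped (before the restart point)
    pipeline.run_stage("as", lambda: None)   # runs
    pipeline.run_stage("mc", lambda: None)   # runs, then the run is marked complete
    pipeline.run_stage("tpp", lambda: None)  # not executed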