Example #1
0
def Task(script="", inputs=None, outputs=None, parameters=None, dist=None):
    """Build a workflow task from a shell script and its I/O dicts.

    Input/output paths are converted to be relative to CWD before the
    task is generated.

    Args:
        script: Shell-script template for the task.
        inputs: Dict of {name: path} task inputs (paths may be absolute).
        outputs: Dict of {name: path} task outputs (paths may be absolute).
        parameters: Dict of template parameters (copied, not mutated).
        dist: Dist() describing how to distribute the task.

    Returns:
        The task object produced by gen_task().
    """
    # Avoid mutable default arguments; use None-sentinels like
    # parameters/dist already do.
    if inputs is None:
        inputs = dict()
    if outputs is None:
        outputs = dict()
    if parameters is None:
        parameters = dict()
    if dist is None:
        dist = Dist()

    # Make paths relative to CWD. (But ok if caller does this.)
    def get_rel(maybe_abs):
        rel = dict()
        for (k, v) in maybe_abs.items():
            try:
                if os.path.isabs(v):
                    v = os.path.relpath(v)
                rel[k] = v
            except Exception:
                LOG.exception('Error for {!r}->{!r}'.format(k, v))
                raise
        return rel

    inputs = get_rel(inputs)
    outputs = get_rel(outputs)

    # All outputs must be in same directory.

    params = dict(parameters)

    pt = gen_task(script, inputs, outputs, params, dist)

    return pt
Example #2
0
def run_ovlp_to_ctg(wf, args, read_db_abs_prefix, read_db, ovlps):
    """Assemble contigs from the overlap files as one local workflow task.

    Returns the outputs dict ({'p_ctg': <abs path to p_ctg.fa>}).
    """
    asm_script = """\
cat {params.ovlps} | shmr_dedup > preads.ovl; echo "-" >> preads.ovl
/usr/bin/time ovlp_to_graph.py >& asm.log
/usr/bin/time graph_to_path.py >& to_path.log
/usr/bin/time path_to_contig.py {params.read_db_prefix} \
    p_ctg_tiling_path > {output.p_ctg} 2> to_contig.log
"""
    asm_dir = os.path.join(os.path.abspath(args["--output"]), "3-asm")
    # Sort the overlap-chunk paths so the command line is deterministic.
    ovlps_list = " ".join(sorted(ovlps.values()))

    task_inputs = dict(ovlps)
    task_inputs.update(read_db)
    task_outputs = {"p_ctg": os.path.join(asm_dir, "p_ctg.fa")}

    wf.addTask(
        Task(script=asm_script,
             inputs=task_inputs,
             outputs=task_outputs,
             parameters={
                 'read_db_prefix': read_db_abs_prefix,
                 'ovlps': ovlps_list
             },
             dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = 1
    wf.refreshTargets()
    return task_outputs
Example #3
0
def run_build_db(wf, args, seq_dataset_lst_fn):
    """Build the read sequence database under <--output>/0-seqdb.

    Returns (read_db_abs_prefix, outputs) where outputs maps
    'read_db'/'seqidx' to the generated .seqdb/.idx files.
    """
    build_db = """
/usr/bin/time shmr_mkseqdb \
    -p {params.read_db_prefix} \
    -d {input.seq_dataset}
"""
    read_db_dir = os.path.join(os.path.abspath(args["--output"]), "0-seqdb")
    prefix = "seq_dataset"
    read_db_abs_prefix = os.path.join(read_db_dir, prefix)
    outputs = {
        'read_db': os.path.join(read_db_dir, f"{prefix}.seqdb"),
        'seqidx': os.path.join(read_db_dir, f"{prefix}.idx"),
    }

    wf.addTask(
        Task(script=build_db,
             inputs={'seq_dataset': seq_dataset_lst_fn},
             outputs=outputs,
             parameters={'read_db_prefix': read_db_abs_prefix},
             dist=Dist(NPROC=1, local=True)))

    wf.max_jobs = 1
    wf.refreshTargets()

    return read_db_abs_prefix, outputs
Example #4
0
def run_overlapper(wf, args,
                   read_db_abs_prefix, index_abs_prefix, ovlp_in):
    """Run shmr_overlap in n_chunk parallel chunks under <--output>/2-ovlp.

    Returns a dict mapping 'ovlp_NN' to each chunk's overlap data file.
    """
    n_chunk = int(args["<ovlp_nchunk>"])
    n_proc = int(args["<ovlp_nproc>"])
    shmr_ovlp_script = """
/usr/bin/time shmr_overlap\
    -b {params.best_n_ovlp}\
    -m {params.mc_lower}\
    -M {params.mc_upper}\
    -w {params.align_bandwidth}\
    -n {params.ovlp_upper}\
    -p {params.read_db_prefix}\
    -l {params.index_prefix}\
    -t {params.n_chunk}\
    -c {params.my_chunk}\
    -o {output.ovlp_out}
"""
    ovlp_dir = os.path.join(os.path.abspath(args["--output"]), "2-ovlp")

    # CLI tuning options are loop-invariant; build them once.
    cli_params = {
        'best_n_ovlp': int(args["--best_n_ovlp"]),
        'mc_lower': int(args["--mc_lower"]),
        'mc_upper': int(args["--mc_upper"]),
        'align_bandwidth': int(args["--aln_bw"]),
        'ovlp_upper': int(args["--ovlp_upper"]),
    }

    outputs = {}
    for chunk_id in range(1, n_chunk + 1):
        chunk_dir = os.path.join(ovlp_dir, f"chunk-{chunk_id:02d}")
        chunk_abs_prefix = os.path.join(chunk_dir, "ovlp")
        ovlp_fn = f"{chunk_abs_prefix}-{chunk_id:02d}.dat"
        outputs[f'ovlp_{chunk_id:02d}'] = ovlp_fn

        task_params = dict(cli_params)
        task_params.update({
            'read_db_prefix': read_db_abs_prefix,
            'index_prefix': index_abs_prefix,
            'n_chunk': n_chunk,
            'my_chunk': chunk_id,
        })
        wf.addTask(Task(
            script=shmr_ovlp_script,
            inputs=ovlp_in,
            outputs={'ovlp_out': ovlp_fn},
            parameters=task_params,
            dist=Dist(NPROC=1, local=True)
        ))

    wf.max_jobs = n_proc
    wf.refreshTargets()

    return outputs
Example #5
0
def wrap_gen_task(rule_writer,
                  script,
                  inputs,
                  outputs,
                  parameters=None,
                  dist=None):
    """Create a pype task and also record a rule for it via rule_writer.

    Input/output paths are made relative to CWD, and params gains a
    'topdir' entry: the relative path from the first output's directory
    back up to CWD.

    Returns:
        The task object produced by pype_gen_task().
    """
    if parameters is None:
        parameters = dict()
    if dist is None:
        dist = Dist()

    # Make relative to CWD. (But better if caller does this.)
    def get_rel(maybe_abs):
        rel = dict()
        # Plain .items() instead of future.viewitems — consistent with
        # Task.get_rel() and drops the py2-compat dependency.
        for (k, v) in maybe_abs.items():
            if os.path.isabs(v):
                v = os.path.relpath(v)
            rel[k] = v
        return rel

    inputs = get_rel(inputs)
    outputs = get_rel(outputs)

    # dict.values() is not subscriptable in Python 3; take the first
    # value via an iterator. (Relies on dict insertion order, matching
    # the original intent.)
    first_output = next(iter(outputs.values()))
    first_output_dir = os.path.normpath(os.path.dirname(first_output))
    rel_topdir = os.path.relpath(
        '.', first_output_dir)  # redundant for rel-inputs, but fine
    params = dict(parameters)
    params['topdir'] = rel_topdir

    pt = pype_gen_task(script, inputs, outputs, params, dist)

    # Run pype_gen_task first because it can validate some stuff.
    rule_writer(inputs, outputs, params, script)
    return pt
Example #6
0
def run_cns(wf, args, read_db_abs_prefix, read_db, index_abs_prefix,
            read_index, p_ctg):
    """Polish the primary contigs (consensus) under <--output>/4-cns.

    Stages:
      1. Build a seqdb + shimmer index for the p_ctg contigs.
      2. Map reads to the contigs in mapping_nchunk chunks.
      3. Merge/sort all mapping chunks into one file.
      4. Call consensus in cns_nchunk chunks.
      5. Gather the consensus chunks into p_ctg_cns.fa.

    Returns:
        The outputs dict of the final gather task:
        {'cns_file': <abs path to p_ctg_cns.fa>}.
    """
    mapping_nchunk = int(args["<mapping_nchunk>"])
    mapping_nproc = int(args["<mapping_nproc>"])
    cns_nchunk = int(args["<cns_nchunk>"])
    cns_nproc = int(args["<cns_nproc>"])
    sort_nproc = int(args["<sort_nproc>"])
    shimmer_k = int(args["--shimmer-k"])
    shimmer_w = int(args["--shimmer-w"])
    shimmer_r = int(args["--shimmer-r"])
    shimmer_l = int(args["--shimmer-l"])
    build_index_script = """\
echo {input.p_ctg} > p_ctg.lst

/usr/bin/time shmr_mkseqdb -p p_ctg \
    -d p_ctg.lst 2> build_p_ctg_db.log
"""

    build_index_script += f"""
/usr/bin/time shmr_index \
    -p p_ctg -t 1 -c 1 \
    -k {shimmer_k}\
    -w {shimmer_w}\
    -r {shimmer_r}\
    -l {shimmer_l}\
    -o p_ctg 2> build_p_ctg_index.log
"""
    cns_dir = os.path.join(os.path.abspath(args["--output"]), "4-cns")
    inputs = {}
    inputs.update(p_ctg)
    output_dir = os.path.join(cns_dir, "p_ctg_index")
    outputs = {}
    outputs["p_ctg_db"] = os.path.join(output_dir, "p_ctg.seqdb")
    outputs["p_ctg_idx"] = os.path.join(output_dir, "p_ctg.idx")
    if shimmer_l == 2:
        outputs["p_ctg_shmr_idx"] = os.path.join(output_dir,
                                                 "p_ctg-L2-01-of-01.dat")
        p_ctg_idx_abs_prefix = os.path.join(output_dir, "p_ctg-L2")
    elif shimmer_l == 1:
        outputs["p_ctg_shmr_idx"] = os.path.join(output_dir,
                                                 "p_ctg-L1-01-of-01.dat")
        p_ctg_idx_abs_prefix = os.path.join(output_dir, "p_ctg-L1")
    else:
        # Consistent with run_build_idx(): only shimmer levels 1 and 2 are
        # supported. Without this guard, p_ctg_idx_abs_prefix and the
        # 'p_ctg_shmr_idx' output would be undefined and crash later.
        sys.exit(1)

    p_ctg_db_abs_prefix = os.path.join(output_dir, "p_ctg")

    # Stage 1: contig seqdb + shimmer index (single local task).
    wf.addTask(
        Task(script=build_index_script,
             inputs=inputs,
             outputs=outputs,
             parameters={
                 'read_db_prefix': read_db_abs_prefix,
                 'index_prefix': index_abs_prefix
             },
             dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = 1
    wf.refreshTargets()

    mapping_script = """\
/usr/bin/time shmr_map \
    -r {params.p_ctg_db_prefix} \
    -m {params.p_ctg_idx_prefix} \
    -p {params.read_db_prefix} \
    -l {params.index_prefix} \
    -t {params.n_chunk} -c {params.my_chunk}  > {output.readmap}
"""
    # Stage 2: chunked read-to-contig mapping.
    inputs = {}
    inputs["p_ctg_db"] = outputs["p_ctg_db"]
    inputs["p_ctg_idx"] = outputs["p_ctg_idx"]
    inputs["p_ctg_shmr_idx"] = outputs["p_ctg_shmr_idx"]
    inputs.update(read_db)
    inputs.update(read_index)
    outputs = {}
    for my_chunk in range(1, mapping_nchunk + 1):
        mapping_chunk_dir = os.path.join(cns_dir, f"map-{my_chunk:02d}")
        mapping_chunk_abs_prefix = os.path.join(mapping_chunk_dir, "reads2ref")
        map_fn = f"{mapping_chunk_abs_prefix}-{my_chunk:02d}.dat"
        outputs[f'readmap_{my_chunk:02d}'] = map_fn

        wf.addTask(
            Task(script=mapping_script,
                 inputs=inputs,
                 outputs={'readmap': map_fn},
                 parameters={
                     'read_db_prefix': read_db_abs_prefix,
                     'index_prefix': index_abs_prefix,
                     'p_ctg_db_prefix': p_ctg_db_abs_prefix,
                     'p_ctg_idx_prefix': p_ctg_idx_abs_prefix,
                     'n_chunk': mapping_nchunk,
                     'my_chunk': my_chunk,
                 },
                 dist=Dist(NPROC=1, local=True)))

    wf.max_jobs = mapping_nproc
    wf.refreshTargets()
    mapping_chunk_outputs = outputs
    # Sorted for a deterministic command line.
    map_files = " ".join(sorted(mapping_chunk_outputs.values()))

    # Stage 3: merge and sort all mapping chunks.
    mapping_merge_script = """
mkdir -p {params.tmp_dir}
cat {params.map_files} | \
    sort -T {params.tmp_dir} -S 8g --parallel {params.sort_nproc}\
        -k 1 -g -k 2 -g > {output.merged_mapping_file}
"""
    mapping_merge_dir = os.path.join(cns_dir, "map-merge")
    merged_mapping_fn = os.path.join(mapping_merge_dir, "reads2ref_all.out")

    wf.addTask(
        Task(script=mapping_merge_script,
             inputs=mapping_chunk_outputs,
             outputs={"merged_mapping_file": merged_mapping_fn},
             parameters={
                 'tmp_dir': os.path.join(cns_dir, "tmp"),
                 'sort_nproc': sort_nproc,
                 'map_files': map_files
             },
             dist=Dist(NPROC=1, local=True)))

    # Stage 4: chunked consensus calling.
    cns_script = """\
/usr/bin/time pg_asm_cns.py {params.read_db_prefix} \
    {params.p_ctg_db_prefix} {input.merged_mapping_file} \
    {params.n_chunk} {params.my_chunk} > {output.cns_file} 2> cns.log
"""
    inputs.update({"merged_mapping_file": merged_mapping_fn})
    outputs = {}
    for my_chunk in range(1, cns_nchunk + 1):
        cns_chunk_dir = os.path.join(cns_dir, f"cns-{my_chunk:02d}")
        cns_chunk_abs_prefix = os.path.join(cns_chunk_dir, "p_ctg_cns")
        cns_fn = f"{cns_chunk_abs_prefix}-{my_chunk:02d}.fa"
        outputs[f'cns_{my_chunk:02d}'] = cns_fn
        wf.addTask(
            Task(script=cns_script,
                 inputs=inputs,
                 outputs={"cns_file": cns_fn},
                 parameters={
                     'read_db_prefix': read_db_abs_prefix,
                     'p_ctg_db_prefix': p_ctg_db_abs_prefix,
                     'n_chunk': cns_nchunk,
                     'my_chunk': my_chunk
                 },
                 dist=Dist(NPROC=1, local=True)))

    wf.max_jobs = cns_nproc
    wf.refreshTargets()

    # Stage 5: gather consensus chunks into one FASTA.
    gather_cns_script = """\
cat {params.cns_chunk_files} > {output.cns_file}
ln -sf {params.cns_merge_dir}/{output.cns_file} {params.workdir}
"""
    cns_chunk_files = " ".join(sorted(outputs.values()))
    inputs = outputs
    cns_merge_dir = os.path.join(cns_dir, "cns-merge")
    cns_fn = os.path.join(cns_merge_dir, "p_ctg_cns.fa")
    outputs = {"cns_file": cns_fn}
    wf.addTask(
        Task(script=gather_cns_script,
             inputs=inputs,
             outputs=outputs,
             parameters={
                 'cns_chunk_files': cns_chunk_files,
                 'workdir': os.path.abspath(args["--output"]),
                 'cns_merge_dir': "./4-cns/cns-merge"
             },
             dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = 1
    wf.refreshTargets()

    return outputs
Example #7
0
def run_build_idx(wf, args, read_db_abs_prefix):
    """Build shimmer indexes in n_chunk parallel chunks under <--output>/1-index.

    Returns (index_abs_prefix, outputs) where outputs maps
    'index_shmr_NN' / 'index_shmr_MC_NN' to each chunk's .dat files.
    Exits with status 1 for unsupported shimmer levels.
    """
    n_chunk = int(args["<index_nchunk>"])
    n_proc = int(args["<index_nproc>"])
    shimmer_l = int(args["--shimmer-l"])

    build_idx = """
/usr/bin/time shmr_index\
    -m {params.output_L0_index}\
    -p {params.read_db_prefix}\
    -k {params.shimmer_k}\
    -w {params.shimmer_w}\
    -r {params.shimmer_r}\
    -l {params.shimmer_l}\
    -t {params.n_chunk}\
    -c {params.my_chunk}\
    -o {params.index_prefix}
ln -s {params.index_prefix}* {params.index_dir}
"""
    index_dir = os.path.join(os.path.abspath(args["--output"]), "1-index")
    # Only shimmer levels 1 and 2 are supported.
    if shimmer_l not in (1, 2):
        sys.exit(1)
    index_abs_prefix = os.path.join(index_dir, f"shmr-L{shimmer_l}")

    outputs = {}
    for chunk_id in range(1, n_chunk + 1):
        chunk_dir = os.path.join(index_dir, f"chunk-{chunk_id:02d}")
        chunk_abs_prefix = os.path.join(chunk_dir, "shmr")
        chunk_tag = f"{chunk_id:02d}-of-{n_chunk:02d}"
        index_shmr_fn = f"{chunk_abs_prefix}-L{shimmer_l}-{chunk_tag}.dat"
        index_shmr_MC_fn = f"{chunk_abs_prefix}-L{shimmer_l}-MC-{chunk_tag}.dat"

        outputs[f'index_shmr_{chunk_id:02d}'] = index_shmr_fn
        outputs[f'index_shmr_MC_{chunk_id:02d}'] = index_shmr_MC_fn
        wf.addTask(
            Task(script=build_idx,
                 inputs={
                     'read_db': f"{read_db_abs_prefix}.seqdb",
                     'seqidx': f"{read_db_abs_prefix}.idx"
                 },
                 outputs={
                     'index_shmr': index_shmr_fn,
                     'index_shmr_MC': index_shmr_MC_fn
                 },
                 parameters={
                     'read_db_prefix': read_db_abs_prefix,
                     'index_prefix': chunk_abs_prefix,
                     'index_dir': index_dir,
                     'output_L0_index': 1 if args["--with-L0-index"] else 0,
                     'shimmer_k': int(args["--shimmer-k"]),
                     'shimmer_w': int(args["--shimmer-w"]),
                     'shimmer_r': int(args["--shimmer-r"]),
                     'shimmer_l': int(args["--shimmer-l"]),
                     'n_chunk': n_chunk,
                     'my_chunk': chunk_id
                 },
                 dist=Dist(NPROC=1, local=True)))

    wf.max_jobs = n_proc
    wf.refreshTargets()

    return index_abs_prefix, outputs
Example #8
0
def gen_parallel_tasks(
    wf,
    rule_writer,
    split_fn,
    gathered_fn,
    run_dict,
    dist=None,
    run_script=TASK_GENERIC_RUN_UNITS_SCRIPT,
):
    """
    By convention, the first (wildcard) output in run_dict['outputs'] must be the gatherable list,
    in the same format as the gathered_fn to be generated from them.

    For now, we require a single such output, since we do not yet test for wildcards.
    """
    assert 'dist' not in run_dict, 'dist should be a parameter of gen_parallel_tasks(), not of its run_dict'
    if dist is None:
        dist = Dist()
    # run_dict['inputs'] should be patterns to match the inputs in split_fn, by convention.

    # Write 3 wildcard rules for snakemake, 2 with dynamic.
    rule_writer.write_dynamic_rules(
        rule_name="foo",
        input_json=split_fn,
        inputs=dict_rel_paths(run_dict['inputs']),
        shell_template=run_dict['script'],
        parameters=run_dict['parameters'],
        wildcard_outputs=dict_rel_paths(run_dict['outputs']),
        output_json=gathered_fn,
    )

    # The split file must exist before we can enumerate the units of work.
    wf.refreshTargets()
    wait_for(split_fn)
    split = io.deserialize(split_fn)
    bash_template_fn = run_dict['bash_template_fn']

    def find_wildcard_input(inputs):
        # The wildcard input is the one whose value contains a '{...}' pattern.
        for v in inputs.values():
            if '{' in v:
                return v
        raise Exception('No wildcard inputs among {!r}'.format(inputs))

    LOG.debug('PARALLEL OUTPUTS:{}'.format(run_dict['outputs']))
    task_results = dict()
    for split_idx, job in enumerate(split):
        wildcards = job['wildcards']

        def resolved_dict(d):
            # Expand '{wildcard}' patterns in every value of d.
            result = dict(d)
            LOG.debug(' wildcards={!r}'.format(wildcards))
            for k, v in d.items():
                LOG.debug('  k={}, v={!r}'.format(k, v))
                result[k] = v.format(**wildcards)
            return result

        task_outputs = resolved_dict(run_dict['outputs'])
        task_parameters = resolved_dict(run_dict['parameters'])

        wild_input = find_wildcard_input(run_dict['inputs'])
        one_uow_fn = os.path.abspath(wild_input.format(**wildcards))

        # Scatter: extract this job's unit-of-work from the split file.
        wf.addTask(
            pype_gen_task(
                script=TASK_GENERIC_SCATTER_ONE_UOW_SCRIPT,
                inputs={
                    'all': split_fn,
                },
                outputs={
                    'one': one_uow_fn,
                },
                parameters={
                    'split_idx': split_idx,
                },
                dist=Dist(local=True),
            ))

        # Run: execute the unit-of-work with the caller's bash template.
        wf.addTask(
            pype_gen_task(
                script=
                run_script,  # usually TASK_GENERIC_RUN_UNITS_SCRIPT, unless individual load-time is slow
                inputs={
                    'units_of_work': one_uow_fn,
                    'bash_template': bash_template_fn,
                },
                outputs=task_outputs,
                parameters=task_parameters,
                dist=dist,
            ))
        wildcards_str = '_'.join(w for w in job['wildcards'].values())
        job_name = 'job{}'.format(wildcards_str)
        # dict.values() is not subscriptable in Python 3; take the first
        # (conventionally gatherable) output via an iterator.
        first_output = next(iter(task_outputs.values()))
        task_results[job_name] = os.path.abspath(first_output)

    gather_inputs = dict(task_results)
    ## An implicit "gatherer" simply takes the output filenames and combines their contents.
    result_fn_list_fn = os.path.join(os.path.dirname(gathered_fn),
                                     'result-fn-list.json')
    io.serialize(result_fn_list_fn, list(task_results.values())
                 )  # dump into next task-dir before next task starts
    LOG.debug('gather_inputs:{!r}'.format(gather_inputs))
    wf.addTask(
        pype_gen_task(
            script=TASK_GENERIC_UNSPLIT_SCRIPT,
            inputs=gather_inputs,
            outputs={
                'gathered': gathered_fn,
                'result_fn_list': result_fn_list_fn,
            },
            parameters={},
            dist=Dist(local=True),
        ))