def Task(script="", inputs=None, outputs=None, parameters=None, dist=None):
    """Build a workflow task from a script template plus its I/O maps.

    Args:
        script: shell-script template (with {input./output./params.} slots).
        inputs: dict of logical-name -> input path (may be absolute).
        outputs: dict of logical-name -> output path (may be absolute).
        parameters: extra template parameters.
        dist: Dist() resource spec; defaults to a fresh Dist().

    Returns:
        The task object produced by gen_task().

    Note: the defaults for inputs/outputs/parameters were mutable dicts;
    they are now None-sentinels to avoid the shared-mutable-default hazard.
    Behavior for all existing callers is unchanged.
    """
    if inputs is None:
        inputs = dict()
    if outputs is None:
        outputs = dict()
    if parameters is None:
        parameters = dict()
    if dist is None:
        dist = Dist()

    # Make paths relative to CWD. (But ok if caller does this.)
    def get_rel(maybe_abs):
        rel = dict()
        for (k, v) in maybe_abs.items():
            try:
                if os.path.isabs(v):
                    v = os.path.relpath(v)
                rel[k] = v
            except Exception:
                LOG.exception('Error for {!r}->{!r}'.format(k, v))
                raise
        return rel

    inputs = get_rel(inputs)
    outputs = get_rel(outputs)
    # All outputs must be in same directory.
    params = dict(parameters)
    pt = gen_task(script, inputs, outputs, params, dist)
    return pt
def run_ovlp_to_ctg(wf, args, read_db_abs_prefix, read_db, ovlps):
    """Run the single assembly task: dedup overlaps, build the graph,
    and emit primary contigs under 3-asm/p_ctg.fa.

    Returns the outputs dict ({'p_ctg': <path>}).
    """
    asm_script = """\
cat {params.ovlps} | shmr_dedup > preads.ovl;
echo "-" >> preads.ovl
/usr/bin/time ovlp_to_graph.py >& asm.log
/usr/bin/time graph_to_path.py >& to_path.log
/usr/bin/time path_to_contig.py {params.read_db_prefix} \
    p_ctg_tiling_path > {output.p_ctg} 2> to_contig.log
"""
    asm_dir = os.path.join(os.path.abspath(args["--output"]), "3-asm")
    # Sorted, space-separated list of overlap chunk files for `cat`.
    ovlps_list = " ".join(sorted(ovlps.values()))
    inputs = dict(ovlps)
    inputs.update(read_db)
    outputs = {"p_ctg": os.path.join(asm_dir, "p_ctg.fa")}
    wf.addTask(
        Task(script=asm_script,
             inputs=inputs,
             outputs=outputs,
             parameters={
                 'read_db_prefix': read_db_abs_prefix,
                 'ovlps': ovlps_list,
             },
             dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = 1
    wf.refreshTargets()
    return outputs
def run_build_db(wf, args, seq_dataset_lst_fn):
    """Build the read sequence database (0-seqdb) with shmr_mkseqdb.

    Returns (read_db_abs_prefix, outputs) where outputs maps
    'read_db' -> .seqdb path and 'seqidx' -> .idx path.
    """
    build_db = """
/usr/bin/time shmr_mkseqdb \
    -p {params.read_db_prefix} \
    -d {input.seq_dataset}
"""
    db_dir = os.path.join(os.path.abspath(args["--output"]), "0-seqdb")
    prefix = "seq_dataset"
    read_db_abs_prefix = os.path.join(db_dir, prefix)
    outputs = {
        'read_db': os.path.join(db_dir, f"{prefix}.seqdb"),
        'seqidx': os.path.join(db_dir, f"{prefix}.idx"),
    }
    wf.addTask(
        Task(script=build_db,
             inputs={'seq_dataset': seq_dataset_lst_fn},
             outputs=outputs,
             parameters={'read_db_prefix': read_db_abs_prefix},
             dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = 1
    wf.refreshTargets()
    return read_db_abs_prefix, outputs
def run_overlapper(wf, args, read_db_abs_prefix, index_abs_prefix, ovlp_in):
    """Scatter shmr_overlap across <ovlp_nchunk> chunks under 2-ovlp/.

    Returns a dict mapping 'ovlp_NN' -> overlap data file for each chunk.
    """
    n_chunk = int(args["<ovlp_nchunk>"])
    n_proc = int(args["<ovlp_nproc>"])
    shmr_ovlp_script = """
/usr/bin/time shmr_overlap\
    -b {params.best_n_ovlp}\
    -m {params.mc_lower}\
    -M {params.mc_upper}\
    -w {params.align_bandwidth}\
    -n {params.ovlp_upper}\
    -p {params.read_db_prefix}\
    -l {params.index_prefix}\
    -t {params.n_chunk}\
    -c {params.my_chunk}\
    -o {output.ovlp_out}
"""
    ovlp_dir = os.path.join(os.path.abspath(args["--output"]), "2-ovlp")
    outputs = {}
    for chunk in range(1, n_chunk + 1):
        chunk_prefix = os.path.join(ovlp_dir, f"chunk-{chunk:02d}", "ovlp")
        ovlp_fn = f"{chunk_prefix}-{chunk:02d}.dat"
        outputs[f'ovlp_{chunk:02d}'] = ovlp_fn
        wf.addTask(Task(
            script=shmr_ovlp_script,
            inputs=ovlp_in,
            outputs={'ovlp_out': ovlp_fn},
            parameters={
                'read_db_prefix': read_db_abs_prefix,
                'index_prefix': index_abs_prefix,
                'n_chunk': n_chunk,
                'my_chunk': chunk,
                'best_n_ovlp': int(args["--best_n_ovlp"]),
                'mc_lower': int(args["--mc_lower"]),
                'mc_upper': int(args["--mc_upper"]),
                'align_bandwidth': int(args["--aln_bw"]),
                'ovlp_upper': int(args["--ovlp_upper"]),
            },
            dist=Dist(NPROC=1, local=True),
        ))
    wf.max_jobs = n_proc
    wf.refreshTargets()
    return outputs
def wrap_gen_task(rule_writer, script, inputs, outputs, parameters=None, dist=None):
    """Create a pype task (via pype_gen_task) and also record a snakemake-style
    rule through rule_writer.

    Args:
        rule_writer: callable(inputs, outputs, params, script) to record the rule.
        script: shell-script template.
        inputs/outputs: dicts of logical-name -> path (made CWD-relative here).
        parameters: extra template parameters; 'topdir' is injected.
        dist: Dist() resource spec; defaults to a fresh Dist().

    Returns:
        The task object from pype_gen_task().
    """
    if parameters is None:
        parameters = dict()
    if dist is None:
        dist = Dist()

    # Make relative to CWD. (But better if caller does this.)
    def get_rel(maybe_abs):
        rel = dict()
        for (k, v) in maybe_abs.items():
            if os.path.isabs(v):
                v = os.path.relpath(v)
            rel[k] = v
        return rel

    inputs = get_rel(inputs)
    outputs = get_rel(outputs)
    # BUG FIX: dict.values() is a non-subscriptable view in Python 3, so the
    # original `outputs.values()[0]` raised TypeError. Take the first value
    # (insertion order) via next(iter(...)) instead.
    first_output_dir = os.path.normpath(
        os.path.dirname(next(iter(outputs.values()))))
    rel_topdir = os.path.relpath(
        '.', first_output_dir)  # redundant for rel-inputs, but fine
    params = dict(parameters)
    params['topdir'] = rel_topdir
    pt = pype_gen_task(script, inputs, outputs, params, dist)
    # Run pype_gen_task first because it can valid some stuff.
    rule_writer(inputs, outputs, params, script)
    return pt
def run_cns(wf, args, read_db_abs_prefix, read_db, index_abs_prefix, read_index,
            p_ctg):
    """Consensus stage (4-cns): index the draft contigs, map reads back to
    them (scattered), merge+sort the mappings, run consensus (scattered),
    then gather the consensus chunks into one FASTA.

    Returns the final outputs dict: {'cns_file': <4-cns/cns-merge/p_ctg_cns.fa>}.
    """
    # Chunk/process counts come from docopt-style args.
    mapping_nchunk = int(args["<mapping_nchunk>"])
    mapping_nproc = int(args["<mapping_nproc>"])
    cns_nchunk = int(args["<cns_nchunk>"])
    cns_nproc = int(args["<cns_nproc>"])
    sort_nproc = int(args["<sort_nproc>"])
    shimmer_k = int(args["--shimmer-k"])
    shimmer_w = int(args["--shimmer-w"])
    shimmer_r = int(args["--shimmer-r"])
    shimmer_l = int(args["--shimmer-l"])
    # Stage 1: build a seqdb + shimmer index for the draft contigs.
    # Note the second half is an f-string: shimmer_* values are baked in
    # at task-creation time, not via {params.*}.
    build_index_script = """\
echo {input.p_ctg} > p_ctg.lst
/usr/bin/time shmr_mkseqdb -p p_ctg \
    -d p_ctg.lst 2> build_p_ctg_db.log
"""
    build_index_script += f"""
/usr/bin/time shmr_index \
    -p p_ctg -t 1 -c 1 \
    -k {shimmer_k}\
    -w {shimmer_w}\
    -r {shimmer_r}\
    -l {shimmer_l}\
    -o p_ctg 2> build_p_ctg_index.log
"""
    cns_dir = os.path.join(os.path.abspath(args["--output"]), "4-cns")
    inputs = {}
    inputs.update(p_ctg)
    output_dir = os.path.join(cns_dir, "p_ctg_index")
    outputs = {}
    outputs["p_ctg_db"] = os.path.join(output_dir, "p_ctg.seqdb")
    outputs["p_ctg_idx"] = os.path.join(output_dir, "p_ctg.idx")
    # Index filename depends on the shimmer level.
    # NOTE(review): if shimmer_l is neither 1 nor 2, p_ctg_idx_abs_prefix and
    # outputs["p_ctg_shmr_idx"] are never assigned and a NameError/KeyError
    # follows below — presumably shimmer_l is validated upstream; confirm.
    if shimmer_l == 2:
        outputs["p_ctg_shmr_idx"] = os.path.join(output_dir,
                                                 "p_ctg-L2-01-of-01.dat")
        p_ctg_idx_abs_prefix = os.path.join(output_dir, "p_ctg-L2")
    elif shimmer_l == 1:
        outputs["p_ctg_shmr_idx"] = os.path.join(output_dir,
                                                 "p_ctg-L1-01-of-01.dat")
        p_ctg_idx_abs_prefix = os.path.join(output_dir, "p_ctg-L1")
    p_ctg_db_abs_prefix = os.path.join(output_dir, "p_ctg")
    wf.addTask(
        Task(script=build_index_script,
             inputs=inputs,
             outputs=outputs,
             parameters={
                 'read_db_prefix': read_db_abs_prefix,
                 'index_prefix': index_abs_prefix
             },
             dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = 1
    wf.refreshTargets()
    # Stage 2: map reads to the contigs, one task per chunk.
    mapping_script = """\
/usr/bin/time shmr_map \
    -r {params.p_ctg_db_prefix} \
    -m {params.p_ctg_idx_prefix} \
    -p {params.read_db_prefix} \
    -l {params.index_prefix} \
    -t {params.n_chunk} -c {params.my_chunk} > {output.readmap}
"""
    # `inputs` is rebound: mapping consumes stage-1 outputs plus the read db/index.
    inputs = {}
    inputs["p_ctg_db"] = outputs["p_ctg_db"]
    inputs["p_ctg_idx"] = outputs["p_ctg_idx"]
    inputs["p_ctg_shmr_idx"] = outputs["p_ctg_shmr_idx"]
    inputs.update(read_db)
    inputs.update(read_index)
    outputs = {}
    for my_chunk in range(1, mapping_nchunk + 1):
        mapping_chunk_dir = os.path.join(cns_dir, f"map-{my_chunk:02d}")
        mapping_chunk_abs_prefix = os.path.join(mapping_chunk_dir, "reads2ref")
        map_fn = f"{mapping_chunk_abs_prefix}-{my_chunk:02d}.dat"
        outputs[f'readmap_{my_chunk:02d}'] = map_fn
        wf.addTask(
            Task(script=mapping_script,
                 inputs=inputs,
                 outputs={'readmap': map_fn},
                 parameters={
                     'read_db_prefix': read_db_abs_prefix,
                     'index_prefix': index_abs_prefix,
                     'p_ctg_db_prefix': p_ctg_db_abs_prefix,
                     'p_ctg_idx_prefix': p_ctg_idx_abs_prefix,
                     'n_chunk': mapping_nchunk,
                     'my_chunk': my_chunk,
                 },
                 dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = mapping_nproc
    wf.refreshTargets()
    # Stage 3: merge all mapping chunks into one numerically-sorted file.
    mapping_chunk_outputs = outputs
    map_files = " ".join(sorted([v for v in mapping_chunk_outputs.values()]))
    mapping_merge_script = """
mkdir -p {params.tmp_dir}
cat {params.map_files} | \
    sort -T {params.tmp_dir} -S 8g --parallel {params.sort_nproc}\
    -k 1 -g -k 2 -g > {output.merged_mapping_file}
"""
    mapping_merge_dir = os.path.join(cns_dir, "map-merge")
    merged_mapping_fn = os.path.join(mapping_merge_dir, "reads2ref_all.out")
    wf.addTask(
        Task(script=mapping_merge_script,
             inputs=mapping_chunk_outputs,
             outputs={"merged_mapping_file": merged_mapping_fn},
             parameters={
                 'tmp_dir': os.path.join(cns_dir, "tmp"),
                 'sort_nproc': sort_nproc,
                 'map_files': map_files
             },
             dist=Dist(NPROC=1, local=True)))
    # Stage 4: consensus over the merged mapping, one task per chunk.
    cns_script = """\
/usr/bin/time pg_asm_cns.py {params.read_db_prefix} \
    {params.p_ctg_db_prefix} {input.merged_mapping_file} \
    {params.n_chunk} {params.my_chunk} > {output.cns_file} 2> cns.log
"""
    # `inputs` still holds the mapping-stage inputs; the merged file is added.
    inputs.update({"merged_mapping_file": merged_mapping_fn})
    outputs = {}
    for my_chunk in range(1, cns_nchunk + 1):
        cns_chunk_dir = os.path.join(cns_dir, f"cns-{my_chunk:02d}")
        cnd_chunk_abs_prefix = os.path.join(cns_chunk_dir, "p_ctg_cns")
        cns_fn = f"{cnd_chunk_abs_prefix}-{my_chunk:02d}.fa"
        outputs[f'cns_{my_chunk:02d}'] = cns_fn
        wf.addTask(
            Task(script=cns_script,
                 inputs=inputs,
                 outputs={"cns_file": cns_fn},
                 parameters={
                     'read_db_prefix': read_db_abs_prefix,
                     'p_ctg_db_prefix': p_ctg_db_abs_prefix,
                     'n_chunk': cns_nchunk,
                     'my_chunk': my_chunk
                 },
                 dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = cns_nproc
    wf.refreshTargets()
    # Stage 5: concatenate consensus chunks and symlink the result into the
    # top-level work directory.
    gather_cns_script = """\
cat {params.cns_chunk_files} > {output.cns_file}
ln -sf {params.cns_merge_dir}/{output.cns_file} {params.workdir}
"""
    cns_chunk_files = " ".join(sorted([v for v in outputs.values()]))
    inputs = outputs
    cns_merge_dir = os.path.join(cns_dir, f"cns-merge")
    cns_fn = os.path.join(cns_merge_dir, "p_ctg_cns.fa")
    outputs = {"cns_file": cns_fn}
    wf.addTask(
        Task(script=gather_cns_script,
             inputs=inputs,
             outputs=outputs,
             parameters={
                 'cns_chunk_files': cns_chunk_files,
                 'workdir': os.path.abspath(args["--output"]),
                 'cns_merge_dir': "./4-cns/cns-merge"
             },
             dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = 1
    wf.refreshTargets()
    return outputs
def run_build_idx(wf, args, read_db_abs_prefix):
    """Build the shimmer index (1-index) over the read database, scattered
    into <index_nchunk> chunk tasks.

    Returns (index_abs_prefix, outputs) where index_abs_prefix is the
    level-dependent shared prefix ("shmr-L1" or "shmr-L2") and outputs maps
    'index_shmr_NN' / 'index_shmr_MC_NN' -> per-chunk index files.
    """
    n_chunk = int(args["<index_nchunk>"])
    n_proc = int(args["<index_nproc>"])
    shimmer_l = int(args["--shimmer-l"])
    # Each chunk writes into its own directory, then symlinks the files up
    # into the common 1-index directory.
    build_idx = """
/usr/bin/time shmr_index\
    -m {params.output_L0_index}\
    -p {params.read_db_prefix}\
    -k {params.shimmer_k}\
    -w {params.shimmer_w}\
    -r {params.shimmer_r}\
    -l {params.shimmer_l}\
    -t {params.n_chunk}\
    -c {params.my_chunk}\
    -o {params.index_prefix}
ln -s {params.index_prefix}* {params.index_dir}
"""
    index_dir = os.path.join(os.path.abspath(args["--output"]), "1-index")
    # Only shimmer levels 1 and 2 are supported; anything else aborts.
    if shimmer_l == 2:
        index_abs_prefix = os.path.join(index_dir, "shmr-L2")
    elif shimmer_l == 1:
        index_abs_prefix = os.path.join(index_dir, "shmr-L1")
    else:
        sys.exit(1)
    outputs = {}
    for my_chunk in range(1, n_chunk + 1):
        index_chunk_dir = os.path.join(index_dir, f"chunk-{my_chunk:02d}")
        index_chunk_abs_prefix = os.path.join(index_chunk_dir, "shmr")
        # index_L0_fn = f"{index_chunk_abs_prefix}-L0-{my_chunk:02d}-of-{n_chunk:02d}.dat"
        # index_L0_MC_fn = f"{index_chunk_abs_prefix}-L0-MC-{my_chunk:02d}-of-{n_chunk:02d}.dat"
        # Expected output filenames mirror shmr_index's naming:
        # <prefix>-L<level>[-MC]-<chunk>-of-<nchunk>.dat
        if shimmer_l == 2:
            index_shmr_fn = f"{index_chunk_abs_prefix}-L2-{my_chunk:02d}-of-{n_chunk:02d}.dat"
            index_shmr_MC_fn = f"{index_chunk_abs_prefix}-L2-MC-{my_chunk:02d}-of-{n_chunk:02d}.dat"
        elif shimmer_l == 1:
            index_shmr_fn = f"{index_chunk_abs_prefix}-L1-{my_chunk:02d}-of-{n_chunk:02d}.dat"
            index_shmr_MC_fn = f"{index_chunk_abs_prefix}-L1-MC-{my_chunk:02d}-of-{n_chunk:02d}.dat"
        else:
            sys.exit(1)
        outputs[f'index_shmr_{my_chunk:02d}'] = index_shmr_fn
        outputs[f'index_shmr_MC_{my_chunk:02d}'] = index_shmr_MC_fn
        wf.addTask(
            Task(script=build_idx,
                 inputs={
                     'read_db': f"{read_db_abs_prefix}.seqdb",
                     'seqidx': f"{read_db_abs_prefix}.idx"
                 },
                 outputs={
                     'index_shmr': index_shmr_fn,
                     'index_shmr_MC': index_shmr_MC_fn
                 },
                 parameters={
                     'read_db_prefix': read_db_abs_prefix,
                     'index_prefix': index_chunk_abs_prefix,
                     'index_dir': index_dir,
                     'output_L0_index': 1 if args["--with-L0-index"] else 0,
                     'shimmer_k': int(args["--shimmer-k"]),
                     'shimmer_w': int(args["--shimmer-w"]),
                     'shimmer_r': int(args["--shimmer-r"]),
                     'shimmer_l': int(args["--shimmer-l"]),
                     'n_chunk': n_chunk,
                     'my_chunk': my_chunk
                 },
                 dist=Dist(NPROC=1, local=True)))
    wf.max_jobs = n_proc
    wf.refreshTargets()
    return index_abs_prefix, outputs
def gen_parallel_tasks(
        wf, rule_writer,
        split_fn,
        gathered_fn,
        run_dict,
        dist=None,
        run_script=TASK_GENERIC_RUN_UNITS_SCRIPT,
):
    """
    By convention, the first (wildcard) output in run_dict['outputs'] must be the gatherable list,
    in the same format as the gathered_fn to be generated from them.
    For now, we require a single such output, since we do not yet test for wildcards.
    """
    assert 'dist' not in run_dict, 'dist should be a parameter of gen_parallel_tasks(), not of its run_dict'
    if dist is None:
        dist = Dist()
    # run_dict['inputs'] should be patterns to match the inputs in split_fn, by convention.
    # Write 3 wildcard rules for snakemake, 2 with dynamic.
    rule_writer.write_dynamic_rules(
        rule_name="foo",
        input_json=split_fn,
        inputs=dict_rel_paths(run_dict['inputs']),
        shell_template=run_dict['script'],
        parameters=run_dict['parameters'],
        wildcard_outputs=dict_rel_paths(run_dict['outputs']),
        output_json=gathered_fn,
    )
    # The split file must exist before we can enumerate the per-chunk jobs.
    wf.refreshTargets()
    wait_for(split_fn)
    split = io.deserialize(split_fn)
    bash_template_fn = run_dict['bash_template_fn']

    def find_wildcard_input(inputs):
        # Return the first input pattern that contains a '{wildcard}'.
        for k, v in inputs.items():
            if '{' in v:
                return v
        else:
            raise Exception('No wildcard inputs among {!r}'.format(inputs))

    LOG.debug('PARALLEL OUTPUTS:{}'.format(run_dict['outputs']))
    task_results = dict()
    for split_idx, job in enumerate(split):
        wildcards = job['wildcards']

        def resolved_dict(d):
            # Expand '{wildcard}' placeholders in every value of d.
            result = dict(d)
            LOG.debug(' wildcards={!r}'.format(wildcards))
            for k, v in d.items():
                LOG.debug(' k={}, v={!r}'.format(k, v))
                result[k] = v.format(**wildcards)
            return result

        task_outputs = resolved_dict(run_dict['outputs'])
        task_parameters = resolved_dict(run_dict['parameters'])
        wild_input = find_wildcard_input(run_dict['inputs'])
        one_uow_fn = os.path.abspath(wild_input.format(**wildcards))
        # Scatter: extract this job's unit-of-work from the split file.
        wf.addTask(
            pype_gen_task(
                script=TASK_GENERIC_SCATTER_ONE_UOW_SCRIPT,
                inputs={
                    'all': split_fn,
                },
                outputs={
                    'one': one_uow_fn,
                },
                parameters={
                    'split_idx': split_idx,
                },
                dist=Dist(local=True),
            ))
        # Run: execute the unit-of-work with the provided bash template.
        wf.addTask(
            pype_gen_task(
                script=run_script,  # usually TASK_GENERIC_RUN_UNITS_SCRIPT, unless individual load-time is slow
                inputs={
                    'units_of_work': one_uow_fn,
                    'bash_template': bash_template_fn,
                },
                outputs=task_outputs,
                parameters=task_parameters,
                dist=dist,
            ))
        wildcards_str = '_'.join(w for w in job['wildcards'].values())
        job_name = 'job{}'.format(wildcards_str)
        # BUG FIX: dict.values() is a non-subscriptable view in Python 3;
        # the original `task_outputs.values()[0]` raised TypeError. The first
        # (by insertion order) output is the gatherable one, per the docstring.
        task_results[job_name] = os.path.abspath(next(iter(task_outputs.values())))
    gather_inputs = dict(task_results)
    ## An implicit "gatherer" simply takes the output filenames and combines their contents.
    result_fn_list_fn = os.path.join(os.path.dirname(gathered_fn), 'result-fn-list.json')
    # Dump into next task-dir before next task starts.
    io.serialize(result_fn_list_fn, list(task_results.values()))
    LOG.debug('gather_inputs:{!r}'.format(gather_inputs))
    wf.addTask(
        pype_gen_task(
            script=TASK_GENERIC_UNSPLIT_SCRIPT,
            inputs=gather_inputs,
            outputs={
                'gathered': gathered_fn,
                'result_fn_list': result_fn_list_fn,
            },
            parameters={},
            dist=Dist(local=True),
        ))