def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Yields a runner used to process items in parallel with a given function.
    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.

    name -- Optional checkpoint name. When given, ``dirs["work"]`` is used to
      hold a ``checkpoints_parallel/<name>.done`` marker file.
    multiplier -- Number of expected jobs per initial input item. Used to
      avoid underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can
      be used to limit multicore usage when jobs run faster on more single
      cores. Falls back to the "cores" entry of the system info, or 1.
    """
    checkpoint_file = None
    if name:
        checkpoint_dir = utils.safe_makedir(
            os.path.join(dirs["work"], "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)

    sysinfo = system.get_info(dirs, parallel)
    # Drop missing inputs; a falsy ``items`` becomes an empty list.
    items = [item for item in (items or []) if item is not None]
    core_limit = int(max_multicore or sysinfo.get("cores", 1))
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier,
                                   max_multicore=core_limit)

    if checkpoint_file and os.path.exists(checkpoint_file):
        # Section already finished earlier: run locally with minimal resources.
        logger.info("run local -- checkpoint passed: %s" % name)
        parallel["cores_per_job"] = 1
        parallel["num_jobs"] = 1
        yield multi.runner(parallel, config)
    elif parallel["type"] == "ipython":
        with ipython.create(parallel, dirs, config) as view:
            yield ipython.runner(view, parallel, dirs, config)
    elif parallel["type"] == "clusterk":
        with clusterk.create(parallel) as queue:
            yield clusterk.runner(queue, parallel)
    else:
        yield multi.runner(parallel, config)

    # Only reached when the generator is resumed normally after the yield; an
    # exception thrown in at the yield propagates and skips this clean-up, so
    # no checkpoint is written for a failed section.
    for key in ("cores_per_job", "num_jobs", "mem"):
        parallel.pop(key, None)
    if checkpoint_file:
        with open(checkpoint_file, "w") as out_handle:
            out_handle.write("done\n")
def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Yields a runner used to process items in parallel with a given function.
    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed. When the
    checkpoint already exists, ``parallel["checkpointed"]`` is set to True.

    name -- Optional checkpoint name. When given, ``dirs["work"]`` is used to
      hold a ``checkpoints_parallel/<name>.done`` marker file.
    multiplier -- Number of expected jobs per initial input item. Used to
      avoid underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can
      be used to limit multicore usage when jobs run faster on more single
      cores. Falls back to the "cores" entry of the system info, or 1.
    """
    checkpoint_file = None
    if name:
        checkpoint_dir = utils.safe_makedir(
            os.path.join(dirs["work"], "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)

    sysinfo = system.get_info(dirs, parallel)
    # Drop missing inputs; a falsy ``items`` becomes an empty list.
    items = [item for item in (items or []) if item is not None]
    max_multicore = int(max_multicore or sysinfo.get("cores", 1))
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier,
                                   max_multicore=max_multicore)

    # Stays None unless the ipython branch binds a view below.
    view = None
    try:
        if checkpoint_file and os.path.exists(checkpoint_file):
            # Section already finished earlier: run locally, minimal resources.
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            parallel["checkpointed"] = True
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        # Catch everything (interrupts included) so that a view which was
        # created gets stopped before the error is passed on unchanged.
        if view is not None:
            ipython.stop(view)
        raise
    else:
        # Success path only: remove the per-run resource keys and record
        # the checkpoint so this section is not scheduled again.
        for key in ("cores_per_job", "num_jobs", "mem"):
            parallel.pop(key, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.

    One job is prepared per entry in ``regions``; every job receives the BAM
    files of all ``items`` and a per-region output path under the work
    directory of the first item. Returns ``final_file``.
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    # Use the work BAM when set, otherwise fall back to the alignment BAM.
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]

    def split_fn(data):
        # ``data`` is unused: the split is driven by the enclosing ``regions``.
        region_jobs = []
        for raw_region in regions:
            coords = _region_to_coords(raw_region)
            # Only chrom is needed; unpacking still requires exactly 3 parts.
            chrom, _start, _end = coords
            region_tag = "_".join(str(part) for part in coords)
            vcf_name = "%s-%s.vcf.gz" % (batch_name, region_tag)
            out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller,
                                    chrom, vcf_name)
            region_jobs.append((coords, work_bams, out_file))
        return final_file, region_jobs

    # Local runner: one single-core job per core configured on the first item.
    parallel = {
        "type": "local",
        "num_jobs": dd.get_num_cores(items[0]),
        "cores_per_job": 1,
    }
    run_parallel = dmulti.runner(parallel, items[0]["config"])

    # Work on a copy of the first item so the caller's data is not modified.
    to_run = copy.deepcopy(items[0])
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items

    # NOTE(review): the two names are presumably the per-region and the
    # combine steps -- confirm against parallel_split_combine.
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file
def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.

    Builds one (region, BAM files, output file) job per entry in ``regions``
    and passes them to ``parallel_split_combine`` with a local runner.
    Returns ``final_file``.
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    # Use the work BAM when set, otherwise fall back to the alignment BAM.
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]

    def _region_job(region):
        # Build the job tuple for one region: coordinates, inputs, output.
        coords = _region_to_coords(region)
        # Only chrom is needed; unpacking still requires exactly 3 parts.
        chrom, _, _ = coords
        coords_label = "_".join(str(x) for x in coords)
        out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                                "%s-%s.vcf.gz" % (batch_name, coords_label))
        return (coords, work_bams, out_file)

    def split_fn(data):
        # ``data`` is unused: the split is driven by the enclosing ``regions``.
        return final_file, [_region_job(region) for region in regions]

    first_item = items[0]
    # Local runner: one single-core job per core configured on the first item.
    parallel = {"type": "local",
                "num_jobs": dd.get_num_cores(first_item),
                "cores_per_job": 1}
    run_parallel = dmulti.runner(parallel, first_item["config"])

    # Work on a copy of the first item so the caller's data is not modified.
    to_run = copy.deepcopy(first_item)
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items

    # NOTE(review): the two names are presumably the per-region and the
    # combine steps -- confirm against parallel_split_combine.
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file