def run_parallel(fn_name, items, metadata=None):
    # Drop empty inputs and record the parallel call for diagnostics.
    items = [x for x in items if x is not None]
    items = diagnostics.track_parallel(items, fn_name)
    imodule = parallel.get("module", "bcbio.distributed")
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=imodule)
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], config)
    else:
        # Local multiprocessing: resolve the task function from the
        # multitasks module and fan out with joblib.
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        num_jobs, cores_per_job = ipython.find_cores_per_job([fn], parallel,
                                                             items, config)
        items = [ipython.add_cores_to_config(x, cores_per_job) for x in items]
        num_jobs = cores_including_resources(num_jobs, metadata, config)
        if joblib is None:
            raise ImportError("Need joblib for multiprocessing parallelization")
        out = []
        for data in joblib.Parallel(num_jobs)(joblib.delayed(fn)(x) for x in items):
            if data:
                out.extend(data)
        return out
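# A minimal, self-contained sketch of the joblib fan-out/flatten idiom used
# above: each worker returns a list of result items and the lists are merged.
# square_batch and flatten_parallel are illustrative stand-ins, not part of
# the module above.
import joblib

def square_batch(item):
    # Mirrors the multitasks contract: return a list of results (or None).
    return [item * item]

def flatten_parallel(fn, items, num_jobs):
    out = []
    for data in joblib.Parallel(n_jobs=num_jobs)(joblib.delayed(fn)(x)
                                                 for x in items):
        if data:
            out.extend(data)
    return out

# flatten_parallel(square_batch, [1, 2, 3], 2) -> [1, 4, 9]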
def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Returns a function used to process, in parallel, items with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs. A checkpoint directory keeps track of
    finished tasks, avoiding spinning up clusters for sections that have been
    previously processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
      underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
      used to limit multicore usage when jobs run faster on more single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(os.path.join(dirs["work"],
                                                         "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier,
                                   max_multicore=int(max_multicore or
                                                     sysinfo.get("cores", 1)))
    try:
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
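# A hedged usage sketch for start: the yield-based structure suggests it is
# intended to run under contextlib.contextmanager, yielding a runner that
# takes a task name plus items. The "alignment" checkpoint name and the
# "process_alignment" task are hypothetical placeholders.
import contextlib

start_cm = contextlib.contextmanager(start)

def example_pipeline(parallel, samples, config, dirs):
    with start_cm(parallel, samples, config, dirs=dirs,
                  name="alignment", multiplier=2) as run_parallel:
        # run_parallel dispatches the named task across whatever cluster or
        # local cores start selected.
        return run_parallel("process_alignment", samples)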
def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    imodule = parallel.get("module", "bcbio.distributed")
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=imodule)
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"],
                              sysinfo, config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        jobr = ipython.find_job_resources([fn], parallel, items, sysinfo, config)
        items = [ipython.add_cores_to_config(x, jobr.cores_per_job) for x in items]
        if joblib is None:
            raise ImportError("Need joblib for multiprocessing parallelization")
        out = []
        for data in joblib.Parallel(jobr.num_jobs)(joblib.delayed(fn)(x)
                                                   for x in items):
            if data:
                out.extend(data)
        return out
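# find_job_resources is not shown here; the attribute access above implies it
# returns a record exposing num_jobs and cores_per_job. A hypothetical
# stand-in (the scheduling arithmetic is illustrative, not the real logic):
import collections

JobResources = collections.namedtuple("JobResources",
                                      ["num_jobs", "cores_per_job"])

def find_job_resources_stub(fns, parallel, items, sysinfo, config):
    total = int(parallel.get("cores", 1))
    cores_per_job = max(1, min(total, int(sysinfo.get("cores", 1))))
    return JobResources(num_jobs=max(1, total // cores_per_job),
                        cores_per_job=cores_per_job)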
def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Returns a function used to process, in parallel, items with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs. A checkpoint directory keeps track of
    finished tasks, avoiding spinning up clusters for sections that have been
    previously processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
      underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
      used to limit multicore usage when jobs run faster on more single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(os.path.join(dirs["work"],
                                                         "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    max_multicore = int(max_multicore or sysinfo.get("cores", 1))
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier,
                                   max_multicore=max_multicore)
    try:
        view = None
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            parallel["checkpointed"] = True
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        if view is not None:
            ipython.stop(view)
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
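# A minimal sketch of the checkpoint convention used by start: a "<name>.done"
# marker file under checkpoints_parallel records that a section already ran,
# so re-runs fall back to cheap single-core local execution instead of
# spinning up a cluster. Helper names here are illustrative only.
import os

def is_checkpointed(work_dir, name):
    return os.path.exists(os.path.join(work_dir, "checkpoints_parallel",
                                       "%s.done" % name))

def write_checkpoint(work_dir, name):
    checkpoint_dir = os.path.join(work_dir, "checkpoints_parallel")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    with open(os.path.join(checkpoint_dir, "%s.done" % name), "w") as out_handle:
        out_handle.write("done\n")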
def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"],
                              sysinfo, config)
    else:
        imodule = parallel.get("module", "bcbio.distributed")
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        return run_multicore(fn, items, config, parallel["cores"])
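# run_multicore is not shown in this listing; a plausible minimal sketch
# under the assumption that it fans a function across local cores and
# flattens list results. This is an illustration, not the actual
# implementation, hence the _sketch suffix.
import multiprocessing

def run_multicore_sketch(fn, items, config, cores):
    # config is accepted for signature parity but unused in this sketch.
    pool = multiprocessing.Pool(int(cores))
    try:
        out = []
        for data in pool.map(fn, items):
            if data:
                out.extend(data)
        return out
    finally:
        pool.close()
        pool.join()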
def run_parallel(fn_name, items, metadata=None):
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=parallel["module"])
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], config)
    else:
        out = []
        fn = getattr(__import__("{base}.multitasks".format(base=parallel["module"]),
                                fromlist=["multitasks"]),
                     fn_name)
        cores = cores_including_resources(int(parallel["cores"]), metadata, config)
        with utils.cpmap(cores) as cpmap:
            for data in cpmap(fn, filter(lambda x: x is not None, items)):
                if data:
                    out.extend(data)
        return out
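# utils.cpmap is not shown here; a plausible sketch of a context-manager pool
# map with the same shape, named cpmap_sketch to flag that it is an
# illustration rather than the library's implementation.
import contextlib
import multiprocessing

@contextlib.contextmanager
def cpmap_sketch(cores):
    if int(cores) == 1:
        yield map  # single core: plain map avoids pool overhead
    else:
        pool = multiprocessing.Pool(int(cores))
        try:
            yield pool.imap_unordered
        finally:
            pool.close()
            pool.join()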
def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    imodule = parallel.get("module", "bcbio.distributed")
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"].startswith("messaging"):
        raise NotImplementedError("Messaging parallelization no longer supported")
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"],
                              sysinfo, config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        return run_multicore(fn, items, config, parallel["cores"])
def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    items = diagnostics.track_parallel(items, fn_name)
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=parallel["module"])
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        out = []
        fn = getattr(__import__("{base}.multitasks".format(base=parallel["module"]),
                                fromlist=["multitasks"]),
                     fn_name)
        num_jobs, cores_per_job = ipython.find_cores_per_job(fn, parallel,
                                                             items, config)
        items = [ipython.add_cores_to_config(x, cores_per_job) for x in items]
        num_jobs = cores_including_resources(num_jobs, metadata, config)
        with utils.cpmap(num_jobs) as cpmap:
            for data in cpmap(fn, items):
                if data:
                    out.extend(data)
        return out
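# A small sketch of the dynamic task lookup used throughout: fn_name is
# resolved as an attribute of the <module>.multitasks module via __import__,
# which returns the submodule when given a fromlist. The example module and
# function names are placeholders.
def lookup_task(imodule, fn_name):
    multitasks = __import__("{base}.multitasks".format(base=imodule),
                            fromlist=["multitasks"])
    return getattr(multitasks, fn_name)

# Example: lookup_task("bcbio.distributed", "process_alignment") returns
# bcbio.distributed.multitasks.process_alignment, ready to pass to a pool.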