# Imports follow the bcbio package layout these snippets rely on; joblib is
# optional, hence the guarded import checked before parallel execution.
try:
    import joblib
except ImportError:
    joblib = None

from bcbio.distributed import resources
from bcbio.pipeline import config_utils
from bcbio.provenance import system


def run_multicore(fn, items, config, parallel=None):
    """Run the function using multiple cores on the given items to process.
    """
    if len(items) == 0:
        return []
    if parallel is None or "num_jobs" not in parallel:
        if parallel is None:
            parallel = {"type": "local",
                        "cores": config["algorithm"].get("num_cores", 1)}
        sysinfo = system.get_info({}, parallel)
        parallel = resources.calculate(parallel, items, sysinfo, config,
                                       parallel.get("multiplier", 1),
                                       max_multicore=int(parallel.get("max_multicore", sysinfo["cores"])))
    items = [config_utils.add_cores_to_config(x, parallel["cores_per_job"]) for x in items]
    if joblib is None:
        raise ImportError("Need joblib for multiprocessing parallelization")
    out = []
    for data in joblib.Parallel(parallel["num_jobs"], batch_size=1, backend="multiprocessing")(joblib.delayed(fn)(x) for x in items):
        if data:
            out.extend(data)
    return out
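# Usage sketch (illustrative, not from the original source): run_multicore
# expects fn to take a single item and return a list of result items, which
# the loop above flattens with out.extend. The worker and config shape below
# are assumptions; "algorithm" -> "num_cores" matches the key run_multicore
# reads when building its local parallel block.
def _annotate(data):
    # Hypothetical worker: mark the item and return it as a one-element list.
    data["processed"] = True
    return [data]

def example_run_multicore():
    config = {"algorithm": {"num_cores": 4}, "resources": {}}
    items = [{"name": "sample1"}, {"name": "sample2"}]
    return run_multicore(_annotate, items, config)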
import contextlib
import os

from bcbio import utils
from bcbio.distributed import clusterk, ipython, multi, resources
from bcbio.log import logger
from bcbio.provenance import system


@contextlib.contextmanager
def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Returns a function used to process, in parallel, items with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
      underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
      used to limit multicore usage when jobs run faster on more single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(os.path.join(dirs["work"], "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier,
                                   max_multicore=int(max_multicore or sysinfo.get("cores", 1)))
    try:
        view = None
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            parallel["checkpointed"] = True
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        # Shut down any IPython cluster view on failure, then re-raise so the
        # checkpoint below is only written after a clean run.
        if view is not None:
            ipython.stop(view)
        raise
    else:
        # Clear the calculated values so later callers re-derive them, then
        # record completion for the checkpoint test above.
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
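# Usage sketch: because start() yields its runner, callers consume it as a
# context manager. The runner takes the name of a remote function plus the
# items to process, mirroring the run_parallel("prepare_bcbio_samples", ...)
# call in the bcbio_prepare_samples fragment further below; the name="prepare"
# checkpoint label here is an illustrative assumption.
def example_start(parallel, samples, config, dirs):
    with start(parallel, samples, config, dirs, name="prepare") as run_parallel:
        return run_parallel("prepare_bcbio_samples", samples)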
def _calculate_resources(data, args, resources):
    parallel = clargs.to_parallel(args)
    config = data[0][0]["config"]
    # Register the program's memory and core requirements so that
    # res.calculate can size jobs for it.
    config["resources"].update({resources["name"]:
                                {"memory": "%sg" % resources["mem"],
                                 "cores": resources["cores"]}})
    parallel.update({"progs": [resources["name"]]})
    log.setup_log(config, parallel)
    dirs = {"work": os.path.abspath(os.getcwd())}
    system.write_info(dirs, parallel, config)
    sysinfo = system.machine_info()[0]
    log.logger.info("Number of items %s" % len(data))
    parallel = res.calculate(parallel, data, sysinfo, config)
    log.logger.info(parallel)
    return parallel
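# Usage sketch: the resources argument above is a plain dict naming a program
# plus its memory (formatted as "%sg", i.e. gigabytes) and core requirements.
# The program name and values here are illustrative assumptions.
def example_calculate_resources(data, args):
    return _calculate_resources(data, args,
                                {"name": "gatk", "mem": 4, "cores": 16})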
"bcbio_system.yaml") except ValueError as err: print(err) print( "WARNING: Attempting to read bcbio_system.yaml in the current directory." ) system_config = "bcbio_system.yaml" with open(system_config) as in_handle: config = yaml.load(in_handle) res = {'cores': args.cores_per_job} config["algorithm"] = {"num_cores": args.cores_per_job} config["resources"].update({'sambamba': res, 'samtools': res}) config["log_dir"] = os.path.join(os.path.abspath(os.getcwd()), "log") parallel = clargs.to_parallel(args) parallel.update({'progs': ['samtools', 'sambamba']}) parallel = log.create_base_logger(config, parallel) log.setup_local_logging(config, parallel) dirs = {'work': os.path.abspath(os.getcwd())} system.write_info(dirs, parallel, config) sysinfo = system.machine_info()[0] samples = _get_samples_to_process(args.csv, out_dir, config, args.force_single, args.separators) parallel = resources.calculate(parallel, [samples], sysinfo, config) with prun.start(parallel, samples, config, dirs) as run_parallel: with profile.report("prepare bcbio samples", dirs): samples = run_parallel("prepare_bcbio_samples", samples) create_new_csv(samples, args)
system_config = args.galaxy
with open(system_config) as in_handle:
    config = yaml.safe_load(in_handle)
parallel = clargs.to_parallel(args)
parallel.update({"progs": args.progs})
dirs = {"work": os.path.abspath(os.getcwd())}
# sys_info is either an inline "cores;memory" pair or a path to a YAML file.
if args.sys_info.find(";") > -1:
    info = args.sys_info.split(";")
    sysinfo = {"cores": int(info[0]), "memory": float(info[1])}
elif utils.file_exists(args.sys_info):
    sysinfo = yaml.safe_load(open(args.sys_info))[0]
else:
    raise ValueError("Unrecognized sys_info value: %s" % args.sys_info)
print("system info %s" % sysinfo)
samples = []
pipelines, config = _pair_samples_with_pipelines(args.yaml_file, config)
# Collect the items for each pipeline; with more than one pipeline, later
# entries overwrite samples, so a single pipeline is assumed.
for s in pipelines:
    samples = [item for item in pipelines[s]]
print("number of samples %s" % len(samples))
parallel = resources.calculate(parallel, samples, sysinfo, config)
print(parallel)
if args.fixed:
    parallel = ipython_fn(parallel, config)
else:
    parallel = ipython_current(parallel, config)
print(parallel)
ipc_fn(parallel)
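# Usage sketch: the sys_info parsing above accepts an inline "cores;memory"
# string (e.g. "16;64.0") or a YAML file whose first document carries the same
# keys. Writing the YAML variant (values illustrative):
def example_write_sys_info(path):
    import yaml
    with open(path, "w") as out_handle:
        yaml.safe_dump([{"cores": 16, "memory": 64.0}], out_handle)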