Example 1
def global_parallel(parallel, name, fn_names, items, dirs, config,
                    multiplier=1):
    """Add an IPython cluster to be used for multiple remote functions.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Falls back to local execution for
    non-distributed clusters or completed jobs.
    """
    checkpoint_dir = utils.safe_makedir(os.path.join(dirs["work"], "checkpoints_ipython"))
    checkpoint_file = os.path.join(checkpoint_dir, "global-%s.done" % name)
    sysinfo = system.get_info(dirs, parallel)
    try:
        if parallel["type"] != "ipython" or os.path.exists(checkpoint_file):
            yield parallel
        else:
            items = [x for x in items if x is not None]
            jobr = find_job_resources([_get_ipython_fn(x, parallel) for x in fn_names],
                                      parallel, items, sysinfo, config, multiplier=multiplier)
            parallel = dictadd(parallel, "cores_per_job", jobr.cores_per_job)
            parallel = dictadd(parallel, "num_jobs", jobr.num_jobs)
            parallel = dictadd(parallel, "mem", jobr.memory_per_job)
            with _view_from_parallel(parallel, dirs["work"], config) as view:
                parallel["view"] = view
                yield parallel
    except:  # re-raise so the checkpoint below is only written on success
        raise
    else:
        parallel["view"] = None
        with open(checkpoint_file, "w") as out_handle:
            out_handle.write("done\n")
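
global_parallel is a generator, so in practice it is driven as a context
manager. A minimal usage sketch, assuming an explicit
contextlib.contextmanager wrapper (the snippet omits any decorator); the
checkpoint name "align", the function list, samples and run_fn are
placeholders:

import contextlib

managed_parallel = contextlib.contextmanager(global_parallel)

with managed_parallel(parallel, "align", ["process_alignment"],
                      samples, dirs, config) as parallel:
    if parallel.get("view"):
        # Distributed case: map work across the IPython cluster view.
        results = parallel["view"].map(run_fn, samples)
    else:
        # Local fallback, or the checkpoint already exists.
        results = [run_fn(x) for x in samples]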
Example 2
def run_multicore(fn, items, config, parallel=None):
    """Run the function using multiple cores on the given items to process.
    """
    if len(items) == 0:
        return []
    if parallel is None or "num_jobs" not in parallel:
        if parallel is None:
            parallel = {
                "type": "local",
                "cores": config["algorithm"].get("num_cores", 1)
            }
        sysinfo = system.get_info({}, parallel)
        parallel = resources.calculate(
            parallel,
            items,
            sysinfo,
            config,
            parallel.get("multiplier", 1),
            max_multicore=int(parallel.get("max_multicore", sysinfo["cores"])))
    items = [
        config_utils.add_cores_to_config(x, parallel["cores_per_job"])
        for x in items
    ]
    if joblib is None:
        raise ImportError("Need joblib for multiprocessing parallelization")
    out = []
    for data in joblib.Parallel(parallel["num_jobs"],
                                batch_size=1,
                                backend="multiprocessing")(
                                    joblib.delayed(fn)(x) for x in items):
        if data:
            out.extend(data)
    return out
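
A minimal usage sketch for run_multicore, assuming a bcbio-style config
dictionary; trim_sample is a hypothetical worker that takes one item and
returns a list (each list is extended into the combined output). With the
multiprocessing backend the worker must be picklable, so it needs to be
defined at module level:

def trim_sample(data):
    # Hypothetical worker: one item in, a list of result items out.
    data["trimmed"] = True
    return [data]

config = {"algorithm": {"num_cores": 2}}
items = [{"name": "s1"}, {"name": "s2"}]
results = run_multicore(trim_sample, items, config)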
Example 3
def start(parallel,
          items,
          config,
          dirs=None,
          name=None,
          multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Returns a function used to process items in parallel with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up clusters
    for sections that have already been processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
      underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
      used to limit multicore usage when jobs run faster on single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(
            os.path.join(dirs["work"], "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    parallel = resources.calculate(
        parallel,
        items,
        sysinfo,
        config,
        multiplier=multiplier,
        max_multicore=int(max_multicore or sysinfo.get("cores", 1)))
    try:
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:  # re-raise so the checkpoint below is only written on success
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
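
start is likewise a generator meant to be used as a context manager: it
yields a runner that is called with a remote function name and the items.
A sketch under that assumption, with contextlib supplying the wrapper and
"variantcall_sample" standing in for a real remote function:

import contextlib

managed_start = contextlib.contextmanager(start)

with managed_start({"type": "local", "cores": 8}, samples, config,
                   dirs=dirs, name="variantcall") as run_parallel:
    samples = run_parallel("variantcall_sample", samples)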
Example 4
def run_parallel(fn_name, items, metadata=None):
    # ``parallel``, ``dirs``, ``config`` and ``config_file`` are defined in
    # the enclosing scope; this snippet is an excerpt.
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    imodule = parallel.get("module", "bcbio.distributed")
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=imodule)
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], sysinfo, config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        jobr = ipython.find_job_resources([fn], parallel, items, sysinfo, config)
        items = [ipython.add_cores_to_config(x, jobr.cores_per_job) for x in items]
        if joblib is None:
            raise ImportError("Need joblib for multiprocessing parallelization")
        out = []
        for data in joblib.Parallel(jobr.num_jobs)(joblib.delayed(fn)(x) for x in items):
            if data:
                out.extend(data)
        return out
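
The __import__(..., fromlist=[...]) call above resolves a task function by
name at runtime. A standalone sketch of the same lookup pattern, with
os.path and join standing in for the multitasks module and fn_name:

# With a non-empty fromlist, __import__ returns the submodule itself
# (os.path) rather than the top-level package (os).
mod = __import__("os.path", fromlist=["path"])
fn = getattr(mod, "join")
print(fn("work", "checkpoints"))  # work/checkpoints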
Example 5
def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    imodule = parallel.get("module", "bcbio.distributed")
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=imodule)
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"],
                              sysinfo, config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(
            __import__("{base}.multitasks".format(base=imodule),
                       fromlist=["multitasks"]), fn_name)
        jobr = ipython.find_job_resources([fn], parallel, items, sysinfo,
                                          config)
        items = [
            ipython.add_cores_to_config(x, jobr.cores_per_job)
            for x in items
        ]
        if joblib is None:
            raise ImportError(
                "Need joblib for multiprocessing parallelization")
        out = []
        for data in joblib.Parallel(jobr.num_jobs)(joblib.delayed(fn)(x)
                                                   for x in items):
            if data:
                out.extend(data)
        return out
Example 6
def run_multicore(fn, items, config, parallel=None):
    """Run the function using multiple cores on the given items to process.
    """
    if len(items) == 0:
        return []
    if parallel is None or "num_jobs" not in parallel:
        if parallel is None:
            parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1)}
        sysinfo = system.get_info({}, parallel)
        parallel = resources.calculate(
            parallel,
            items,
            sysinfo,
            config,
            parallel.get("multiplier", 1),
            max_multicore=int(parallel.get("max_multicore", sysinfo["cores"])),
        )
    items = [config_utils.add_cores_to_config(x, parallel["cores_per_job"]) for x in items]
    if joblib is None:
        raise ImportError("Need joblib for multiprocessing parallelization")
    out = []
    for data in joblib.Parallel(parallel["num_jobs"])(joblib.delayed(fn)(x) for x in items):
        if data:
            out.extend(data)
    return out
Example 7
def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote
    functions.

    Returns a function used to process items in parallel with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have already been processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
    underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
    used to limit multicore usage when jobs run faster on single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(os.path.join(dirs["work"],
                                                         "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    max_multicore = int(max_multicore or sysinfo.get("cores", 1))
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier,
                                   max_multicore=max_multicore)
    try:
        view = None
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            parallel["checkpointed"] = True
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:  # stop any started cluster, then re-raise; no checkpoint is written
        if view is not None:
            ipython.stop(view)
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
Example 8
def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], sysinfo, config)
    else:
        imodule = parallel.get("module", "bcbio.distributed")
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        return run_multicore(fn, items, config, parallel["cores"])
Example 9
def global_parallel(parallel,
                    name,
                    fn_names,
                    items,
                    dirs,
                    config,
                    multiplier=1,
                    max_multicore=None):
    """Add an IPython cluster to be used for multiple remote functions.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Falls back to local execution for
    non-distributed clusters or completed jobs.
    """
    checkpoint_dir = utils.safe_makedir(
        os.path.join(dirs["work"], "checkpoints_ipython"))
    checkpoint_file = os.path.join(checkpoint_dir, "global-%s.done" % name)
    sysinfo = system.get_info(dirs, parallel)
    try:
        if parallel["type"] != "ipython":
            yield parallel
        elif os.path.exists(checkpoint_file):
            parallel["checkpoint"] = True
            yield parallel
        else:
            items = [x for x in items if x is not None]
            jobr = find_job_resources(
                [_get_ipython_fn(x, parallel) for x in fn_names],
                parallel,
                items,
                sysinfo,
                config,
                multiplier=multiplier,
                max_multicore=max_multicore)
            parallel = dictadd(parallel, "cores_per_job", jobr.cores_per_job)
            parallel = dictadd(parallel, "num_jobs", jobr.num_jobs)
            parallel = dictadd(parallel, "mem", jobr.memory_per_job)
            with _view_from_parallel(parallel, dirs["work"], config) as view:
                parallel["checkpoint"] = False
                parallel["view"] = view
                yield parallel
    except:  # re-raise so the checkpoint below is only written on success
        raise
    else:
        parallel["view"] = None
        parallel["checkpoint"] = False
        with open(checkpoint_file, "w") as out_handle:
            out_handle.write("done\n")
Example 10
def run_multicore(fn, items, config, cores=None):
    """Run the function using multiple cores on the given items to process.
    """
    if cores is None:
        cores = config["algorithm"].get("num_cores", 1)
    parallel = {"type": "local", "cores": cores}
    sysinfo = system.get_info({}, parallel)
    jobr = ipython.find_job_resources([fn], parallel, items, sysinfo, config,
                                      parallel.get("multiplier", 1),
                                      max_multicore=int(sysinfo["cores"]))
    items = [ipython.add_cores_to_config(x, jobr.cores_per_job) for x in items]
    if joblib is None:
        raise ImportError("Need joblib for multiprocessing parallelization")
    out = []
    for data in joblib.Parallel(jobr.num_jobs)(joblib.delayed(fn)(x) for x in items):
        if data:
            out.extend(data)
    return out
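
The joblib pattern shared by the run_multicore variants, shown standalone
with a trivial worker so it runs outside bcbio:

import joblib

def square(x):
    # Trivial stand-in for a multitask function: returns a list, matching
    # the "if data: out.extend(data)" convention used above.
    return [x * x]

out = []
for data in joblib.Parallel(2)(joblib.delayed(square)(x) for x in range(4)):
    if data:
        out.extend(data)
print(out)  # [0, 1, 4, 9]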
Example 11
def run_parallel(fn_name, items, metadata=None):
    items = [x for x in items if x is not None]
    if len(items) == 0:
        return []
    items = diagnostics.track_parallel(items, fn_name)
    imodule = parallel.get("module", "bcbio.distributed")
    sysinfo = system.get_info(dirs, parallel)
    if parallel["type"].startswith("messaging"):
        # The messaging backend was removed; fail fast instead of dispatching.
        raise NotImplementedError("Messaging parallelization no longer supported")
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], sysinfo, config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        fn = getattr(__import__("{base}.multitasks".format(base=imodule),
                                fromlist=["multitasks"]),
                     fn_name)
        return run_multicore(fn, items, config, parallel["cores"])