Example #1
def start(parallel,
          items,
          config,
          dirs=None,
          name=None,
          multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote functions.

    Returns a function used to process items, in parallel, with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up clusters
    for sections that have been previously processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
      underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
      used to limit multicore usage when jobs run faster as multiple single-core
      processes.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(
            os.path.join(dirs["work"], "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    parallel = resources.calculate(
        parallel,
        items,
        sysinfo,
        config,
        multiplier=multiplier,
        max_multicore=int(max_multicore or sysinfo.get("cores", 1)))
    try:
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        # Re-raise so failures propagate; the checkpoint file below is
        # only written when all jobs complete successfully.
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
Example #2
def start(parallel, items, config, dirs=None, name=None, multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote
    functions.

    Returns a function used to process items, in parallel, with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
    underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
    used to limit multicore usage when jobs run faster as multiple single-core
    processes.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(os.path.join(dirs["work"],
                                                         "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    max_multicore = int(max_multicore or sysinfo.get("cores", 1))
    parallel = resources.calculate(parallel, items, sysinfo, config,
                                   multiplier=multiplier,
                                   max_multicore=max_multicore)
    try:
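        # Track the ipython view outside the with-block so the except
        # clause below can stop it on failure.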
        view = None
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            parallel["checkpointed"] = True
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        if view is not None:
            ipython.stop(view)
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
Example #3
def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]

    def split_fn(data):
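        # `data` is unused here; split_fn appears to need this signature for
        # parallel_split_combine, and the closure captures regions/work_bams.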
        out = []
        for region in regions:
            region = _region_to_coords(region)
            chrom, start, end = region
            region_str = "_".join(str(x) for x in region)
            out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller,
                                    chrom,
                                    "%s-%s.vcf.gz" % (batch_name, region_str))
            out.append((region, work_bams, out_file))
        return final_file, out

    parallel = {
        "type": "local",
        "num_jobs": dd.get_num_cores(items[0]),
        "cores_per_job": 1
    }
    run_parallel = dmulti.runner(parallel, items[0]["config"])
    to_run = copy.deepcopy(items[0])
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file
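The split function maps one batch to per-region work units plus the final
combined output file. A toy, self-contained sketch of that shape;
_region_to_coords here is a hypothetical stand-in for the helper used above,
assuming "chrom:start-end" region strings:

import os

def _region_to_coords(region):
    # Hypothetical helper: normalize a "chrom:start-end" string into a
    # (chrom, start, end) tuple, matching how the code above unpacks it.
    chrom, coords = region.split(":")
    start, end = (int(x) for x in coords.split("-"))
    return chrom, start, end

def split_by_region(regions, batch_name, work_dir):
    # One (region, out_file) work unit per region; per-chromosome
    # subdirectories separate the per-region VCF outputs.
    out = []
    for region in regions:
        chrom, start, end = _region_to_coords(region)
        region_str = "_".join(str(x) for x in (chrom, start, end))
        out_file = os.path.join(work_dir, chrom,
                                "%s-%s.vcf.gz" % (batch_name, region_str))
        out.append(((chrom, start, end), out_file))
    return out

print(split_by_region(["chr1:1-1000000", "chr1:1000001-2000000"],
                      "batch1", "work"))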