Example #1
def create(parallel, dirs, config):
    """Create a cluster based on the provided parallel arguments.

    Returns an IPython view on the cluster, enabling processing on jobs.

    Adds a mincores specification if we have machines with a larger
    number of cores to allow jobs to be batched together for shared
    memory usage.
    """
    profile_dir = utils.safe_makedir(os.path.join(dirs["work"], get_log_dir(config), "ipython"))
    has_mincores = any(x.startswith("mincores=") for x in parallel["resources"])
    cores = min(_get_common_cores(config["resources"]), parallel["system_cores"])
    if cores > 1 and not has_mincores:
        adj_cores = max(1, int(math.floor(cores * float(parallel.get("mem_pct", 1.0)))))
        # if we have less scheduled cores than per machine, use the scheduled count
        if cores > parallel["cores"]:
            cores = parallel["cores"]
        # if we have less total cores required for the entire process, use that
        elif adj_cores > parallel["num_jobs"] * parallel["cores_per_job"]:
            cores = parallel["num_jobs"] * parallel["cores_per_job"]
        else:
            cores = adj_cores
            cores = per_machine_target_cores(cores, parallel["num_jobs"] // cores)
        parallel["resources"].append("mincores=%s" % cores)
    return ipython_cluster.cluster_view(parallel["scheduler"].lower(), parallel["queue"],
                                        parallel["num_jobs"], parallel["cores_per_job"],
                                        profile=profile_dir, start_wait=parallel["timeout"],
                                        extra_params={"resources": parallel["resources"],
                                                      "mem": parallel["mem"],
                                                      "tag": parallel.get("tag"),
                                                      "run_local": parallel.get("run_local")},
                                        retries=parallel.get("retries"))
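
For orientation, the keys that these create() variants read from the parallel dictionary can be collected into a minimal sketch. The values below are illustrative placeholders inferred from the lookups in the example above, not a real bcbio-nextgen configuration.

# Hypothetical shape of the "parallel" argument; every value is a placeholder.
parallel = {
    "scheduler": "slurm",     # lowercased and passed to cluster_view
    "queue": "general",       # scheduler queue name
    "num_jobs": 8,            # number of engines to start
    "cores_per_job": 4,       # cores requested per engine
    "system_cores": 16,       # cores available on a single machine
    "cores": 32,              # total cores scheduled for the run
    "mem": "3.3",             # memory setting forwarded via extra_params
    "mem_pct": 1.0,           # scaling factor applied to cores in the example above
    "timeout": 15,            # start_wait for cluster_view, in minutes
    "resources": [],          # native scheduler flags; "mincores=N" may be appended
    "tag": None,              # optional job tag
    "run_local": False,       # run on the local machine instead of a scheduler
    "retries": None,          # optional retry count
}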
Example #2
def create(parallel, dirs, config):
    """Create a cluster based on the provided parallel arguments.

    Returns an IPython view on the cluster, enabling processing on jobs.

    Adds a mincores specification if we have machines with a larger
    number of cores to allow jobs to be batched together for shared
    memory usage.
    """
    profile_dir = utils.safe_makedir(
        os.path.join(dirs["work"], get_log_dir(config), "ipython"))
    has_mincores = any(
        x.startswith("mincores=") for x in parallel["resources"])
    if parallel["system_cores"] > 3 and not has_mincores:
        cores = parallel["system_cores"]
        # if we have larger number of cores, leave room for standard batch script and controller
        if parallel["system_cores"] > 30:
            cores = cores - 2
        elif parallel["system_cores"] > 15:
            cores = cores - 1
        parallel["resources"].append("mincores=%s" % cores)
    return ipython_cluster.cluster_view(parallel["scheduler"].lower(),
                                        parallel["queue"],
                                        parallel["num_jobs"],
                                        parallel["cores_per_job"],
                                        profile=profile_dir,
                                        start_wait=parallel["timeout"],
                                        extra_params={
                                            "resources": parallel["resources"],
                                            "mem": parallel["mem"],
                                            "tag": parallel.get("tag"),
                                            "run_local":
                                            parallel.get("run_local")
                                        },
                                        retries=parallel.get("retries"))
Example #3
def main():
    #files = (sorted(glob.glob("/n/hsphS10/hsphfs2/tanzi_recalled/*alz-*/final/*/*-ready.bam")) +
    #         sorted(glob.glob("/n/hsphS10/hsphfs2/tanzi_recalled/*alz-*/final/*/*-ready.cram")))
    files = (sorted(glob.glob("/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/calls/*alz-*/final/*/*-ready.bam")))
    with cluster_view(scheduler="lsf", queue="hsph_bioinfo", num_jobs=64,
                      cores_per_job=1, extra_params={"mem": "3.3"}) as view:
        view.map(cram_compress, files)
Example #4
def create(parallel, dirs, config):
    """Create a cluster based on the provided parallel arguments.

    Returns an IPython view on the cluster, enabling processing on jobs.

    Adds a mincores specification if we have machines with a larger
    number of cores to allow jobs to be batched together for shared
    memory usage.
    """
    profile_dir = utils.safe_makedir(os.path.join(dirs["work"], get_log_dir(config), "ipython"))
    has_mincores = any(x.startswith("mincores=") for x in parallel["resources"])
    if parallel["system_cores"] > 3 and not has_mincores:
        cores = parallel["system_cores"]
        # if we have larger number of cores, leave room for standard batch script and controller
        if parallel["system_cores"] > 30:
            cores = cores - 2
        elif parallel["system_cores"] > 15:
            cores = cores - 1
        parallel["resources"].append("mincores=%s" % cores)
    return ipython_cluster.cluster_view(parallel["scheduler"].lower(), parallel["queue"],
                                        parallel["num_jobs"], parallel["cores_per_job"],
                                        profile=profile_dir, start_wait=parallel["timeout"],
                                        extra_params={"resources": parallel["resources"],
                                                      "mem": parallel["mem"],
                                                      "tag": parallel.get("tag"),
                                                      "run_local": parallel.get("run_local")},
                                        retries=parallel.get("retries"))
Example #5
def get_cluster_view(args):
    return ipc.cluster_view(args.scheduler, args.queue,
                          args.num_jobs, args.cores_per_job,
                          start_wait=args.timeout,
                          extra_params={"resources": args.resources,
                                        "mem": args.memory_per_job,
                                        "tag": "singlecell",
                                        "run_local": args.local})
Example #6
def _view_from_parallel(parallel, work_dir, config):
    """Translate parallel map into options for a cluster view.
    """
    profile_dir = utils.safe_makedir(os.path.join(work_dir, get_log_dir(config), "ipython"))
    return ipython_cluster.cluster_view(parallel["scheduler"].lower(), parallel["queue"],
                                        parallel["num_jobs"], parallel["cores_per_job"],
                                        profile=profile_dir, start_wait=parallel["timeout"],
                                        extra_params={"resources": parallel["resources"]},
                                        retries=parallel.get("retries"))
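
As in the other examples on this page, the object returned by cluster_view is used as a context manager; a minimal hedged sketch of how a caller might consume _view_from_parallel (process_one and items are placeholder names):

# Hypothetical caller of the helper above; process_one and items are placeholders.
with _view_from_parallel(parallel, work_dir, config) as view:
    results = view.map(process_one, items)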
Example #7
File: idr_test.py  Project: sirusb/spp-idr
def test_run_analysis():
    tmp_control = idr.tagalign_split(_copy_file_to_testdir(control_file))
    tmp_exp = idr.tagalign_split(_copy_file_to_testdir(exp_file))
    with cluster_view(scheduler, queue, jobs) as view:
        plots, filtered_files = idr.run_analysis(tmp_control, tmp_exp,
                                                 spp_path, idr_runner_path,
                                                 idr_plotter_path, view.map,
                                                 caller)

    all(map(idr.file_exists, filtered_files))
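    # Note: the all() result above is not asserted, so a missing file would not fail the test.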
Example #8
def get_cluster_view(args):
    if not os.path.exists("ipython"):
        os.mkdir("ipython")
    return ipc.cluster_view(args.scheduler, args.queue,
                          args.num_jobs, args.cores_per_job,
                          start_wait=args.timeout,
                          profile="ipython",
                          extra_params={"resources": args.resources,
                                        "mem": args.memory_per_job,
                                        "tag": "ts",
                                        "run_local": args.local})
Example #9
File: idr_test.py  Project: sirusb/spp-idr
def test_pooled_pseudo_replicate_caller(self):
    peak_caller = idr.spp_peak_caller(spp_path)
    control_replicates = idr.tagalign_split(self.control, nfiles=2)
    experimental_replicates = idr.tagalign_split(self.experimental,
                                                 nfiles=2)
    with cluster_view(scheduler, queue, jobs) as view:
        peaks = idr.call_peaks_on_pooled_pseudoreplicates(control_replicates,
                                                          experimental_replicates,
                                                          peak_caller, view.map)
    self.assertTrue(all(map(idr.file_exists, peaks)))
    self.assertTrue(all(map(idr.is_peak_file, peaks)))
Example #10
def get_cluster_view(args):
    if not os.path.exists("ipython"):
        os.mkdir("ipython")
    return ipc.cluster_view(
        args.scheduler,
        args.queue,
        args.num_jobs,
        args.cores_per_job,
        start_wait=args.timeout,
        profile="ipython",
        extra_params={"resources": args.resources, "mem": args.memory_per_job, "tag": "ts", "run_local": args.local},
    )
Example #11
def get_cluster_view(args):
    if not os.path.exists("ipython"):
        utils.safe_makedir("ipython")
        utils.safe_makedir("checkpoint")
    return ipc.cluster_view(args['scheduler'], args['queue'],
                            args['num_jobs'], args['cores_per_job'],
                            start_wait=args['timeout'],
                            profile="ipython",
                            extra_params={"resources": args['resources'],
                                          "mem": args['mem'],
                                          "tag": "ichwrapper",
                                          "run_local": args['run_local']})
Example #12
def get_cluster_view(p):
    """get ipython running"""
    return ipc.cluster_view(p['scheduler'],
                            p['queue'],
                            p['num_jobs'],
                            p['cores_per_job'],
                            start_wait=p['timeout'],
                            extra_params={
                                "resources": p['resources'],
                                "mem": p['mem'],
                                "tag": p['tag'],
                                "run_local": False
                            })
Example #13
def main():
    #files = (sorted(glob.glob("/n/hsphS10/hsphfs2/tanzi_recalled/*alz-*/final/*/*-ready.bam")) +
    #         sorted(glob.glob("/n/hsphS10/hsphfs2/tanzi_recalled/*alz-*/final/*/*-ready.cram")))
    files = (sorted(
        glob.glob(
            "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/calls/*alz-*/final/*/*-ready.bam"
        )))
    with cluster_view(scheduler="lsf",
                      queue="hsph_bioinfo",
                      num_jobs=64,
                      cores_per_job=1,
                      extra_params={"mem": "3.3"}) as view:
        view.map(cram_compress, files)
Example #14
def get_cluster_view(args):
    """get ipython running"""
    return ipc.cluster_view(args.scheduler,
                            args.queue,
                            args.num_jobs,
                            args.cores_per_job,
                            start_wait=args.timeout,
                            extra_params={
                                "resources": args.resources,
                                "mem": args.memory_per_job,
                                "tag": "bcbio_prepare",
                                "run_local": False
                            })
Example #15
def create(parallel, dirs, config):
    """Create a cluster based on the provided parallel arguments.

    Returns an IPython view on the cluster, enabling processing on jobs.
    """
    profile_dir = utils.safe_makedir(os.path.join(dirs["work"], get_log_dir(config), "ipython"))
    return ipython_cluster.cluster_view(parallel["scheduler"].lower(), parallel["queue"],
                                        parallel["num_jobs"], parallel["cores_per_job"],
                                        profile=profile_dir, start_wait=parallel["timeout"],
                                        extra_params={"resources": parallel["resources"],
                                                      "mem": parallel["mem"],
                                                      "tag": parallel.get("tag"),
                                                      "run_local": parallel.get("run_local")},
                                        retries=parallel.get("retries"))
Example #16
def get_cluster_view(args, num_jobs=None):
    if not os.path.exists("ipython"):
        os.mkdir("ipython")
        os.mkdir("checkpoint")
    if not num_jobs:
        num_jobs = args.num_jobs
    return ipc.cluster_view(args.scheduler, args.queue,
                          num_jobs, args.cores_per_job,
                          start_wait=args.timeout,
                          profile="ipython",
                          extra_params={"resources": args.resources,
                                        "mem": args.memory_per_job,
                                        "tag": "seqcluster",
                                        "run_local": args.local})
Example #17
def _view_from_parallel(parallel, work_dir, config):
    """Translate parallel map into options for a cluster view.
    """
    profile_dir = utils.safe_makedir(
        os.path.join(work_dir, get_log_dir(config), "ipython"))
    return ipython_cluster.cluster_view(
        parallel["scheduler"].lower(),
        parallel["queue"],
        parallel["num_jobs"],
        parallel["cores_per_job"],
        profile=profile_dir,
        start_wait=parallel["timeout"],
        extra_params={"resources": parallel["resources"]},
        retries=parallel.get("retries"))
Example #18
def create(parallel, dirs, config):
    """Create a cluster based on the provided parallel arguments.

    Returns an IPython view on the cluster, enabling processing on jobs.

    Adds a mincores specification if we have machines with a larger
    number of cores to allow jobs to be batched together for shared
    memory usage.
    """
    profile_dir = utils.safe_makedir(
        os.path.join(dirs["work"], get_log_dir(config), "ipython"))
    has_mincores = any(
        x.startswith("mincores=") for x in parallel["resources"])
    cores = min(_get_common_cores(config["resources"]),
                parallel["system_cores"])
    if cores > 1 and not has_mincores:
        adj_cores = max(
            1, int(math.floor(cores * float(parallel.get("mem_pct", 1.0)))))
        # if we have less scheduled cores than per machine, use the scheduled count
        if cores > parallel["cores"]:
            cores = parallel["cores"]
        # if we have less total cores required for the entire process, use that
        elif adj_cores > parallel["num_jobs"] * parallel["cores_per_job"]:
            cores = parallel["num_jobs"] * parallel["cores_per_job"]
        else:
            cores = adj_cores
            cores = per_machine_target_cores(cores,
                                             parallel["num_jobs"] // cores)
        parallel["resources"].append("mincores=%s" % cores)
    return ipython_cluster.cluster_view(parallel["scheduler"].lower(),
                                        parallel["queue"],
                                        parallel["num_jobs"],
                                        parallel["cores_per_job"],
                                        profile=profile_dir,
                                        start_wait=parallel["timeout"],
                                        extra_params={
                                            "resources": parallel["resources"],
                                            "mem": parallel["mem"],
                                            "tag": parallel.get("tag"),
                                            "run_local": parallel.get("run_local"),
                                            "local_controller": parallel.get("local_controller")
                                        },
                                        retries=parallel.get("retries"))
Example #19
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up clusters
    for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(
        os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = _get_checkpoint_file(checkpoint_dir, fn_name)
    fn = getattr(
        __import__("{base}.ipythontasks".format(base=parallel["module"]),
                   fromlist=["ipythontasks"]), fn_name)
    items = [x for x in items if x is not None]
    num_jobs, cores_per_job = find_cores_per_job(fn, parallel, items, config)
    parallel = dictadd(parallel, "cores_per_job", cores_per_job)
    parallel = dictadd(parallel, "num_jobs", num_jobs)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s" % fn_name)
        if len(items) > 0:
            items = [add_cores_to_config(x, cores_per_job) for x in items]
            with ipython_cluster.cluster_view(
                    parallel["scheduler"].lower(),
                    parallel["queue"],
                    parallel["num_jobs"],
                    parallel["cores_per_job"],
                    profile=parallel["profile"]) as view:
                for data in view.map_sync(fn, items, track=False):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
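
The dictadd helper used above is not shown in the snippet; a plausible sketch consistent with how it is called (returning a new dictionary rather than mutating the input) is given below, though the project's actual implementation may differ.

# Hedged sketch of dictadd as used above: copy the dict and add one key,
# leaving the original untouched. The real helper may differ.
def dictadd(orig, key, value):
    new = dict(orig)
    new[key] = value
    return new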
Example #20
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up clusters
    for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = _get_checkpoint_file(checkpoint_dir, fn_name)
    fn = getattr(__import__("{base}.ipythontasks".format(base=parallel["module"]),
                            fromlist=["ipythontasks"]),
                 fn_name)
    items = [x for x in items if x is not None]
    num_jobs, cores_per_job = find_cores_per_job(fn, parallel, items, config)
    parallel = dictadd(parallel, "cores_per_job", cores_per_job)
    parallel = dictadd(parallel, "num_jobs", num_jobs)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s" % fn_name)
        if len(items) > 0:
            items = [add_cores_to_config(x, cores_per_job) for x in items]
            with ipython_cluster.cluster_view(parallel["scheduler"].lower(), parallel["queue"],
                                              parallel["num_jobs"], parallel["cores_per_job"],
                                              profile=parallel["profile"]) as view:
                for data in view.map_sync(fn, items, track=False):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
Example #21
    def map(self, fn, args):
        from cluster_helper.cluster import cluster_view

        cluster_args = {
            "scheduler": None,
            "queue": None,
            "num_jobs": self.processes,
            "extra_params": {
                "run_local": True
            }
        }

        cluster_args.update(self.cluster_settings.cluster_options)

        print cluster_args

        with cluster_view(**cluster_args) as view:
            async_results = view.map(fn, args, block=False)
            async_results.wait_interactive()
            return async_results.get()
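
The run_local mode shown above can also be exercised directly; below is a minimal hedged sketch using the same scheduler=None / queue=None pattern, with square as a placeholder function.

# Hypothetical standalone use of the local mode from the example above.
from cluster_helper.cluster import cluster_view

def square(x):
    return x * x

with cluster_view(scheduler=None, queue=None, num_jobs=2,
                  extra_params={"run_local": True}) as view:
    results = list(view.map(square, range(10)))
    print(results)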
Example #22
def get_cluster_view(p):
    """get ipython running"""
    return ipc.cluster_view(p['scheduler'], p['queue'], p['num_jobs'],
                            p['cores_per_job'], start_wait=p['timeout'],
                            extra_params={"resources": p['resources'],
                                          "mem": p['mem'],
                                          "tag": p['tag'],
                                          "run_local": False})
Example #23
def load_ipython(args):
    grabix_file = bgzip(args.vcf)
    with cluster_view(*get_ipython_args(args)) as view:
        chunks = load_chunks_ipython(grabix_file, args, view)
        merge_chunks_ipython(chunks, args.db, view)
    gemini_annotate.add_extras(args.db, chunks)
Example #24
        "--local", dest="local", default=False, action="store_true")

    args = parser.parse_args()
    args.resources = {'resources': args.resources,
                      'mem': args.mem,
                      'local_controller': args.local_controller}
    if args.local:
        args.resources["run_local"] = True

    if not (args.local or (args.scheduler and args.queue)):
        print("Please specify --local to run locally or a scheduler and queue"
              "to run on with --scheduler and --queue")
        sys.exit(1)

    with cluster_view(args.scheduler, args.queue, args.num_jobs,
                      cores_per_job=args.cores_per_job,
                      start_wait=args.timeout, profile=args.profile,
                      extra_params=args.resources) as view:
        print("First check to see if we can talk to the engines.")
        results = view.map(lambda x: "hello world!", range(5))
        print("This long computation that waits for 5 seconds before "
              "returning takes a while to run serially..")
        start_time = time.time()
        results = list(map(long_computation, range(20), range(20, 40), range(40, 60)))
        print(results)
        print("That took {} seconds.".format(time.time() - start_time))
        print("Running it in parallel goes much faster...")
        start_time = time.time()
        results = list(view.map(long_computation, range(20), range(20, 40), range(40, 60)))
        print(results)
        print("That took {} seconds.".format(time.time() - start_time))
Example #25
            bam_files = view.map(sam.sam2bam, out_files)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)

if __name__ == "__main__":
    # read in the config file and perform initial setup
    main_config_file = sys.argv[1]
    with open(main_config_file) as config_in_handle:
        startup_config = yaml.load(config_in_handle)
    parallel = create_base_logger(startup_config, {"type": "ipython"})
    setup_local_logging(startup_config, parallel)
    startup_config["parallel"] = parallel
    # setup_logging(startup_config)

    cluster_config = startup_config["cluster"]
    cores_per_job = cluster_config.get("cores_per_job", 1)
    if startup_config["cluster"].get("local", False):
        main(startup_config, DummyView())
    else:
        with cluster_view(cluster_config["scheduler"],
                          cluster_config["queue"],
                          cluster_config["cores"],
                          cores_per_job) as view:
            main(startup_config, view)


class DummyView(object):

    def __init__(self):
        self.map = map
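
DummyView exposes the built-in map under the same .map attribute so main() can run without a cluster. Because the built-in map is lazy in Python 3, a variant that returns a list can be more convenient; a hedged sketch of such a variant:

# Hypothetical eager variant of DummyView for Python 3, where map() is lazy.
class EagerDummyView(object):
    def map(self, fn, *iterables):
        return list(map(fn, *iterables))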
Example #26
File: gemini_load.py  Project: arq5x/gemini
def load_ipython(args):
    grabix_file = bgzip(args.vcf)
    with cluster_view(*get_ipython_args(args)) as view:
        chunks = load_chunks_ipython(grabix_file, args, view)
        merge_chunks_ipython(chunks, args, view)
Example #27
                        type=int, help="number of cores for each job.")
    parser.add_argument("--profile", dest="profile", default=None,
                        help="Optional profile to test.")
    parser.add_argument("--resources", dest="resources", default=None,
                        help="Native specification flags to the scheduler")
    parser.add_argument("--timeout", dest="timeout", default=15,
                        help="Time (in minutes) to wait before timing out.")
    parser.add_argument("--memory", dest="mem", default=1,
                        help="Memory in GB to reserve.")

    args = parser.parse_args()
    args.resources = {'resources': args.resources,
                      'mem': args.mem}

    with cluster_view(args.scheduler, args.queue, args.num_jobs,
                      start_wait=args.timeout,
                      profile=args.profile, extra_params=args.resources) as view:
        print "First check to see if we can talk to the engines."
        results = view.map(lambda x: "hello world!", range(5))
        print ("This long computation that waits for 5 seconds before returning "
               "takes a while to run serially..")
        start_time = time.time()
        results = map(long_computation, range(20), range(20, 40), range(40, 60))
        print results
        print "That took {0} seconds.".format(time.time() - start_time)

        print "Running it in parallel goes much faster..."
        start_time = time.time()
        results = view.map(long_computation, range(20), range(20, 40), range(40, 60))
        print results
        print "That took {0} seconds.".format(time.time() - start_time)
Example #28
            down_bam = view.map(sam.downsample_bam, *down_args)
            view.map(rseqc.genebody_coverage, down_bam,
                     [config] * len(down_bam))
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
                            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
                     """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs


if __name__ == "__main__":
    main_config_file = sys.argv[1]
    with open(main_config_file) as config_in_handle:
        startup_config = yaml.load(config_in_handle)
    setup_logging(startup_config)
    cluster_config = startup_config["cluster"]
    with cluster_view(cluster_config["scheduler"], cluster_config["queue"],
                      cluster_config["cores"]) as view:
        main(main_config_file, view)
Example #29
File: run_idr.py  Project: sirusb/spp-idr
                        default="spp",
                        help="Peak caller to run "
                        "(spp or clipper)")
    parser.add_argument('--cores-per-job',
                        default=1,
                        type=int,
                        help="Number of cores to run for each job.")
    parser.add_argument('tool_path', help="Path to spp and idr installation.")
    args = parser.parse_args()

    args.control = map(os.path.abspath, args.control)
    args.experimental = map(os.path.abspath, args.experimental)

    if args.lsf_queue:
        with cluster_view("LSF",
                          args.lsf_queue,
                          args.num_jobs,
                          cores_per_job=args.cores_per_job) as view:
            main(args.control, args.experimental, args.tool_path, view.map,
                 args.caller, args.cores_per_job)
    elif args.sge_queue:
        with cluster_view("sge",
                          args.sge_queue,
                          args.num_jobs,
                          cores_per_job=args.cores_per_job) as view:
            main(args.control, args.experimental, args.tool_path, view.map,
                 args.caller, args.cores_per_job)
    elif args.torque_queue:
        with cluster_view("torque",
                          args.torque_queue,
                          args.num_jobs,
                          cores_per_job=args.cores_per_job) as view:
            main(args.control, args.experimental, args.tool_path, view.map,
                 args.caller, args.cores_per_job)
Example #30
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
                     """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs


def find_cores_per_job(config):
    max_threads = config["program"].get("max_threads", 1)
    return max_threads


if __name__ == "__main__":
    # read in the config file and perform initial setup
    main_config_file = sys.argv[1]
    with open(main_config_file) as config_in_handle:
        startup_config = yaml.load(config_in_handle)
    setup_logging(startup_config)
    scheduler = startup_config["cluster"]["scheduler"]
    queue = startup_config["cluster"]["queue"]
    profile = startup_config["cluster"].get("profile", None)
    engines = startup_config["cluster"]["cores"]
    threads = startup_config["program"].get("max_threads", 1)
    cores_per_job = find_cores_per_job(startup_config)

    with cluster.cluster_view(scheduler, queue, engines, cores_per_job, profile) as view:
        main(main_config_file, view)
Example #31
                    used.append(comp_file)
                    pairs.append([in_file, comp_file])
                    break
        if in_file not in used:
            pairs.append([in_file])
            used.append(in_file)
    return pairs


def main(data_dir, view):
    fastq_files = list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq")))
    fastq_files = combine_pairs(fastq_files)
    print "Aligning %s." % (fastq_files)
    aligned_files = view.map(align, fastq_files)
    print "Marking duplicates in %s." % (aligned_files)
    marked = view.map(mark_duplicates, aligned_files)
    print "Filtering duplicates and unmapped reads in %s." % (marked)
    deduped = view.map(filter_duplicates, marked)
    #compute_coverage(deduped)
    print "Computing start sites of %s." % (deduped)
    starts = view.map(count_starts, deduped)


if __name__ == "__main__":
    data_dir = sys.argv[1]
    print combine_pairs(list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq"))))
    fastq_files = combine_pairs(list(glob.glob(os.path.join(data_dir,
                                                            "*_trimmed.fixed.fastq"))))
    with cluster_view("lsf", "hsph_bioinfo", len(fastq_files), cores_per_job=1) as view:
        main(data_dir, view)
Example #32
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="example script for doing parallel " "work with IPython.")
    parser.add_argument(
        "--scheduler", dest="scheduler", required=True, help="scheduler to use (lsf, sge, torque or pbs)"
    )
    parser.add_argument("--queue", dest="queue", required=True, help="queue to use on scheduler.")
    parser.add_argument(
        "--num_jobs", dest="num_jobs", required=True, type=int, help="number of jobs to run in parallel."
    )
    parser.add_argument(
        "--cores_per_job", dest="cores_per_job", default=1, type=int, help="number of cores for each job."
    )
    parser.add_argument("--profile", dest="profile", default=None, help="Optional profile to test.")

    args = parser.parse_args()

    with cluster_view(args.scheduler, args.queue, args.num_jobs, profile=args.profile) as view:
        print "First check to see if we can talk to the engines."
        results = view.map(lambda x: "hello world!", range(5))
        print ("This long computation that waits for 5 seconds before returning " "takes a while to run serially..")
        start_time = time.time()
        results = map(long_computation, [1, 2, 3], [4, 5, 6], [7, 8, 9])
        print results
        print "That took {0} seconds.".format(time.time() - start_time)

        print "Running it in parallel goes much faster..."
        start_time = time.time()
        results = view.map(long_computation, [1, 2, 3], [4, 5, 6], [7, 8, 9])
        print results
        print "That took {0} seconds.".format(time.time() - start_time)
Example #33
                        help="scheduler to use (lsf, sge, torque, slurm, or pbs)")
    parser.add_argument("--queue", dest='queue', required=True,
                        help="queue to use on scheduler.")
    parser.add_argument("--num_jobs", dest='num_jobs', required=True,
                        type=int, help="number of jobs to run in parallel.")
    parser.add_argument("--cores_per_job", dest="cores_per_job", default=1,
                        type=int, help="number of cores for each job.")
    parser.add_argument("--profile", dest="profile", default=None,
                        help="Optional profile to test.")
    parser.add_argument("--resources", dest="resources", default=None,
                        help="Native specification flags to the scheduler")

    args = parser.parse_args()
    args.resources = {'resources': args.resources}

    with cluster_view(args.scheduler, args.queue, args.num_jobs,
                      profile=args.profile, extra_params=args.resources) as view:
        print "First check to see if we can talk to the engines."
        results = view.map(lambda x: "hello world!", range(5))
        print ("This long computation that waits for 5 seconds before returning "
               "takes a while to run serially..")
        start_time = time.time()
        results = map(long_computation, range(20), range(20, 40), range(40, 60))
        print results
        print "That took {0} seconds.".format(time.time() - start_time)

        print "Running it in parallel goes much faster..."
        start_time = time.time()
        results = view.map(long_computation, range(20), range(20, 40), range(40, 60))
        print results
        print "That took {0} seconds.".format(time.time() - start_time)
Example #34
File: run_idr.py  Project: daler/spp-idr
    parser.add_argument('--sge-queue', help="SGE queue name")
    parser.add_argument('--torque-queue', help="Torque queue name")
    parser.add_argument('--num-jobs', default=1, help="number of parallel jobs to run",
                        type=int)
    parser.add_argument('--caller', default="spp", help="Peak caller to run "
                        "(spp or clipper)")
    parser.add_argument('--cores-per-job', default=1, type=int,
                        help="Number of cores to run for each job.")
    parser.add_argument('tool_path', help="Path to spp and idr installation.")
    args = parser.parse_args()

    args.control = map(os.path.abspath, args.control)
    args.experimental = map(os.path.abspath, args.experimental)

    if args.lsf_queue:
        with cluster_view("LSF", args.lsf_queue, args.num_jobs,
                          cores_per_job=args.cores_per_job) as view:
            main(args.control, args.experimental, args.tool_path, view.map,
                 args.caller, args.cores_per_job)
    elif args.sge_queue:
        with cluster_view("sge", args.sge_queue, args.num_jobs,
                          cores_per_job=args.cores_per_job) as view:
            main(args.control, args.experimental, args.tool_path, view.map,
                 args.caller, args.cores_per_job)
    elif args.torque_queue:
        with cluster_view("torque", args.torque_queue, args.num_jobs,
                          cores_per_job=args.cores_per_job) as view:
            main(args.control, args.experimental, args.tool_path, view.map,
                 args.caller, args.cores_per_job)
    else:
        main(args.control, args.experimental, args.tool_path, map,
             args.caller, args.cores_per_job)
Example #35
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
                     """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs


def find_cores_per_job(config):
    max_threads = config["program"].get("max_threads", 1)
    return max_threads


if __name__ == "__main__":
    # read in the config file and perform initial setup
    main_config_file = sys.argv[1]
    with open(main_config_file) as config_in_handle:
        startup_config = yaml.load(config_in_handle)
    setup_logging(startup_config)
    scheduler = startup_config["cluster"]["scheduler"]
    queue = startup_config["cluster"]["queue"]
    profile = startup_config["cluster"].get("profile", None)
    engines = startup_config["cluster"]["cores"]
    threads = startup_config["program"].get("max_threads", 1)
    cores_per_job = find_cores_per_job(startup_config)

    with cluster.cluster_view(scheduler, queue, engines, cores_per_job,
                              profile) as view:
        main(main_config_file, view)
Example #36
def load_ipython(args):
    grabix_file = bgzip(args.vcf)
    with cluster_view(*get_ipython_args(args)) as view:
        chunks = load_chunks_ipython(grabix_file, args, view)
        merge_chunks_ipython(chunks, args.db, view)
    gemini_annotate.add_extras(args.db, chunks)
Example #37
def get_cluster_view(args):
    """get ipython running"""
    return ipc.cluster_view(args.scheduler, args.queue, args.num_jobs,
                            args.cores_per_job, start_wait=args.timeout,
                            extra_params={"resources": args.resources,
                                          "mem": args.memory_per_job,
                                          "tag": "bcbio_prepare",
                                          "run_local": False})
Example #38
def load_ipython(args):
    grabix_file = bgzip(args.vcf)
    with cluster_view(*get_ipython_args(args)) as view:
        chunks = load_chunks_ipython(grabix_file, args, view)
        merge_chunks_ipython(chunks, args, view)
Example #39
        if in_file not in used:
            pairs.append([in_file])
            used.append(in_file)
    return pairs


def main(data_dir, view):
    fastq_files = list(
        glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq")))
    fastq_files = combine_pairs(fastq_files)
    print "Aligning %s." % (fastq_files)
    aligned_files = view.map(align, fastq_files)
    print "Marking duplicates in %s." % (aligned_files)
    marked = view.map(mark_duplicates, aligned_files)
    print "Filtering duplicates and unmapped reads in %s." % (marked)
    deduped = view.map(filter_duplicates, marked)
    #compute_coverage(deduped)
    print "Computing start sites of %s." % (deduped)
    starts = view.map(count_starts, deduped)


if __name__ == "__main__":
    data_dir = sys.argv[1]
    print combine_pairs(
        list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq"))))
    fastq_files = combine_pairs(
        list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq"))))
    with cluster_view("lsf", "hsph_bioinfo", len(fastq_files),
                      cores_per_job=1) as view:
        main(data_dir, view)