def create(parallel, dirs, config):
    """Start a cluster from the ``parallel`` settings and return a view on it.

    Returns an IPython view on the cluster, enabling processing on jobs.

    Unless the caller already supplied a ``mincores=`` entry, one is computed
    and appended to ``parallel["resources"]`` (the list is modified in place)
    when machines have more than one core, so jobs can be batched together
    for shared memory usage.
    """
    log_root = os.path.join(dirs["work"], get_log_dir(config), "ipython")
    profile_dir = utils.safe_makedir(log_root)
    mincores_given = any(spec.startswith("mincores=")
                         for spec in parallel["resources"])
    cores = min(_get_common_cores(config["resources"]),
                parallel["system_cores"])
    if cores > 1 and not mincores_given:
        # Scale the per-machine core count by the memory percentage, never
        # dropping below a single core.
        mem_scaled = max(
            1, int(math.floor(cores * float(parallel.get("mem_pct", 1.0)))))
        if cores > parallel["cores"]:
            # Fewer cores were scheduled than a machine offers.
            cores = parallel["cores"]
        else:
            # Cap at the total cores the whole process will ever need.
            total_needed = parallel["num_jobs"] * parallel["cores_per_job"]
            cores = total_needed if mem_scaled > total_needed else mem_scaled
        cores = per_machine_target_cores(cores, parallel["num_jobs"] // cores)
        parallel["resources"].append("mincores=%s" % cores)
    return ipython_cluster.cluster_view(
        parallel["scheduler"].lower(),
        parallel["queue"],
        parallel["num_jobs"],
        parallel["cores_per_job"],
        profile=profile_dir,
        start_wait=parallel["timeout"],
        extra_params={
            "resources": parallel["resources"],
            "mem": parallel["mem"],
            "tag": parallel.get("tag"),
            "run_local": parallel.get("run_local"),
        },
        retries=parallel.get("retries"),
    )
def create(parallel, dirs, config):
    """Start a cluster from the ``parallel`` settings and return a view on it.

    Returns an IPython view on the cluster, enabling processing on jobs.

    When machines have more than three cores and no ``mincores=`` entry was
    supplied, one is appended to ``parallel["resources"]`` (modified in
    place) so jobs can be batched together for shared memory usage.
    """
    log_root = os.path.join(dirs["work"], get_log_dir(config), "ipython")
    profile_dir = utils.safe_makedir(log_root)
    mincores_given = any(spec.startswith("mincores=")
                         for spec in parallel["resources"])
    system_cores = parallel["system_cores"]
    if system_cores > 3 and not mincores_given:
        # On larger machines leave room for the standard batch script and
        # the controller.
        if system_cores > 30:
            reserved = 2
        elif system_cores > 15:
            reserved = 1
        else:
            reserved = 0
        parallel["resources"].append(
            "mincores=%s" % (system_cores - reserved))
    return ipython_cluster.cluster_view(
        parallel["scheduler"].lower(),
        parallel["queue"],
        parallel["num_jobs"],
        parallel["cores_per_job"],
        profile=profile_dir,
        start_wait=parallel["timeout"],
        extra_params={
            "resources": parallel["resources"],
            "mem": parallel["mem"],
            "tag": parallel.get("tag"),
            "run_local": parallel.get("run_local"),
        },
        retries=parallel.get("retries"),
    )
def main():
    """Map ``cram_compress`` over every matching ``*-ready.bam`` file.

    The input files come from a hard-coded glob pattern; the work is spread
    over a 64-job, single-core-per-job view on the ``hsph_bioinfo`` LSF queue.
    """
    # Alternative input location, kept for reference:
    #files = (sorted(glob.glob("/n/hsphS10/hsphfs2/tanzi_recalled/*alz-*/final/*/*-ready.bam")) +
    #         sorted(glob.glob("/n/hsphS10/hsphfs2/tanzi_recalled/*alz-*/final/*/*-ready.cram")))
    bam_pattern = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/calls/*alz-*/final/*/*-ready.bam"
    files = sorted(glob.glob(bam_pattern))
    with cluster_view(scheduler="lsf", queue="hsph_bioinfo", num_jobs=64,
                      cores_per_job=1, extra_params={"mem": "3.3"}) as view:
        view.map(cram_compress, files)
def create(parallel, dirs, config):
    """Start a cluster from the ``parallel`` settings and return a view on it.

    Returns an IPython view on the cluster, enabling processing on jobs.

    When machines have more than three cores and no ``mincores=`` entry was
    supplied, one is appended to ``parallel["resources"]`` (modified in
    place) so jobs can be batched together for shared memory usage.
    """
    log_root = os.path.join(dirs["work"], get_log_dir(config), "ipython")
    profile_dir = utils.safe_makedir(log_root)
    mincores_given = any(spec.startswith("mincores=")
                         for spec in parallel["resources"])
    system_cores = parallel["system_cores"]
    if system_cores > 3 and not mincores_given:
        # On larger machines leave room for the standard batch script and
        # the controller.
        if system_cores > 30:
            reserved = 2
        elif system_cores > 15:
            reserved = 1
        else:
            reserved = 0
        parallel["resources"].append(
            "mincores=%s" % (system_cores - reserved))
    return ipython_cluster.cluster_view(
        parallel["scheduler"].lower(),
        parallel["queue"],
        parallel["num_jobs"],
        parallel["cores_per_job"],
        profile=profile_dir,
        start_wait=parallel["timeout"],
        extra_params={
            "resources": parallel["resources"],
            "mem": parallel["mem"],
            "tag": parallel.get("tag"),
            "run_local": parallel.get("run_local"),
        },
        retries=parallel.get("retries"),
    )
def get_cluster_view(args):
    """Return ``ipc.cluster_view(...)`` configured from parsed options.

    ``args`` must expose ``scheduler``, ``queue``, ``num_jobs``,
    ``cores_per_job``, ``timeout``, ``resources``, ``memory_per_job`` and
    ``local``. Jobs are tagged ``"singlecell"``.
    """
    return ipc.cluster_view(
        args.scheduler,
        args.queue,
        args.num_jobs,
        args.cores_per_job,
        start_wait=args.timeout,
        extra_params={
            "resources": args.resources,
            "mem": args.memory_per_job,
            "tag": "singlecell",
            "run_local": args.local,
        },
    )
def _view_from_parallel(parallel, work_dir, config):
    """Turn the ``parallel`` mapping into a cluster view.

    The IPython profile directory is created under the log directory inside
    ``work_dir``; only ``parallel["resources"]`` is forwarded as an extra
    parameter.
    """
    log_root = os.path.join(work_dir, get_log_dir(config), "ipython")
    profile_dir = utils.safe_makedir(log_root)
    return ipython_cluster.cluster_view(
        parallel["scheduler"].lower(),
        parallel["queue"],
        parallel["num_jobs"],
        parallel["cores_per_job"],
        profile=profile_dir,
        start_wait=parallel["timeout"],
        extra_params={"resources": parallel["resources"]},
        retries=parallel.get("retries"),
    )
def test_run_analysis():
    """Run the IDR analysis through a cluster view and check its outputs.

    Copies the control and experimental inputs into the test directory,
    splits them with ``idr.tagalign_split``, runs ``idr.run_analysis`` using
    the view's ``map`` and asserts that every filtered file exists.
    """
    tmp_control = idr.tagalign_split(_copy_file_to_testdir(control_file))
    tmp_exp = idr.tagalign_split(_copy_file_to_testdir(exp_file))
    with cluster_view(scheduler, queue, jobs) as view:
        # The plots are returned as well but are not checked by this test.
        _plots, filtered_files = idr.run_analysis(tmp_control, tmp_exp,
                                                  spp_path, idr_runner_path,
                                                  idr_plotter_path, view.map,
                                                  caller)
        # Previously this expression was evaluated and thrown away, so the
        # test passed even when output files were missing.
        assert all(map(idr.file_exists, filtered_files))
def get_cluster_view(args):
    """Return ``ipc.cluster_view(...)`` using a local ``ipython`` profile.

    Creates the ``ipython`` directory in the current working directory when
    it does not exist yet, then builds the view from the ``scheduler``,
    ``queue``, ``num_jobs``, ``cores_per_job``, ``timeout``, ``resources``,
    ``memory_per_job`` and ``local`` attributes of ``args``. Jobs are tagged
    ``"ts"``.
    """
    profile_missing = not os.path.exists("ipython")
    if profile_missing:
        os.mkdir("ipython")
    return ipc.cluster_view(
        args.scheduler,
        args.queue,
        args.num_jobs,
        args.cores_per_job,
        start_wait=args.timeout,
        profile="ipython",
        extra_params={
            "resources": args.resources,
            "mem": args.memory_per_job,
            "tag": "ts",
            "run_local": args.local,
        },
    )
def test_pooled_pseudo_replicate_caller(self):
    """Call peaks on pooled pseudoreplicates and validate the peak files.

    Both the control and experimental inputs are split into two files; every
    returned peak path must exist and be recognised as a peak file.
    """
    caller_fn = idr.spp_peak_caller(spp_path)
    control_parts = idr.tagalign_split(self.control, nfiles=2)
    experimental_parts = idr.tagalign_split(self.experimental, nfiles=2)
    with cluster_view(scheduler, queue, jobs) as view:
        peaks = idr.call_peaks_on_pooled_pseudoreplicates(
            control_parts, experimental_parts, caller_fn, view.map)
        every_file_exists = all(map(idr.file_exists, peaks))
        self.assertTrue(every_file_exists)
        every_file_is_peak = all(map(idr.is_peak_file, peaks))
        self.assertTrue(every_file_is_peak)
def get_cluster_view(args):
    """Return ``ipc.cluster_view(...)`` using a local ``ipython`` profile.

    Creates the ``ipython`` directory in the current working directory when
    it does not exist yet, then builds the view from the ``scheduler``,
    ``queue``, ``num_jobs``, ``cores_per_job``, ``timeout``, ``resources``,
    ``memory_per_job`` and ``local`` attributes of ``args``. Jobs are tagged
    ``"ts"``.
    """
    profile_missing = not os.path.exists("ipython")
    if profile_missing:
        os.mkdir("ipython")
    return ipc.cluster_view(
        args.scheduler,
        args.queue,
        args.num_jobs,
        args.cores_per_job,
        start_wait=args.timeout,
        profile="ipython",
        extra_params={
            "resources": args.resources,
            "mem": args.memory_per_job,
            "tag": "ts",
            "run_local": args.local,
        },
    )
def get_cluster_view(args):
    """Return ``ipc.cluster_view(...)`` built from the ``args`` mapping.

    Makes sure the ``ipython`` profile directory and the ``checkpoint``
    directory exist in the current working directory, then builds the view
    from the ``scheduler``, ``queue``, ``num_jobs``, ``cores_per_job``,
    ``timeout``, ``resources``, ``mem`` and ``run_local`` keys of ``args``.
    Jobs are tagged ``"ichwrapper"``.
    """
    # Check each directory on its own: creating "checkpoint" used to depend
    # on "ipython" being absent, so it was never created when only the
    # profile directory was already there.
    for dirname in ("ipython", "checkpoint"):
        if not os.path.exists(dirname):
            utils.safe_makedir(dirname)
    return ipc.cluster_view(
        args['scheduler'],
        args['queue'],
        args['num_jobs'],
        args['cores_per_job'],
        start_wait=args['timeout'],
        profile="ipython",
        extra_params={
            "resources": args['resources'],
            "mem": args['mem'],
            "tag": "ichwrapper",
            "run_local": args['run_local'],
        },
    )
def get_cluster_view(p):
    """Get IPython running: return ``ipc.cluster_view(...)`` built from ``p``.

    ``p`` is a mapping with ``scheduler``, ``queue``, ``num_jobs``,
    ``cores_per_job``, ``timeout``, ``resources``, ``mem`` and ``tag`` keys.
    ``run_local`` is always passed as ``False``.
    """
    return ipc.cluster_view(
        p['scheduler'],
        p['queue'],
        p['num_jobs'],
        p['cores_per_job'],
        start_wait=p['timeout'],
        extra_params={
            "resources": p['resources'],
            "mem": p['mem'],
            "tag": p['tag'],
            "run_local": False,
        },
    )
def main():
    """Map ``cram_compress`` over every matching ``*-ready.bam`` file.

    The input files come from a hard-coded glob pattern; the work is spread
    over a 64-job, single-core-per-job view on the ``hsph_bioinfo`` LSF queue.
    """
    # Alternative input location, kept for reference:
    #files = (sorted(glob.glob("/n/hsphS10/hsphfs2/tanzi_recalled/*alz-*/final/*/*-ready.bam")) +
    #         sorted(glob.glob("/n/hsphS10/hsphfs2/tanzi_recalled/*alz-*/final/*/*-ready.cram")))
    bam_pattern = "/n/hsphS10/hsphfs1/chb/projects/tanzi_ad/calls/*alz-*/final/*/*-ready.bam"
    files = sorted(glob.glob(bam_pattern))
    with cluster_view(scheduler="lsf", queue="hsph_bioinfo", num_jobs=64,
                      cores_per_job=1, extra_params={"mem": "3.3"}) as view:
        view.map(cram_compress, files)
def get_cluster_view(args):
    """Get IPython running: return ``ipc.cluster_view(...)`` built from ``args``.

    Reads ``scheduler``, ``queue``, ``num_jobs``, ``cores_per_job``,
    ``timeout``, ``resources`` and ``memory_per_job`` from ``args``. Jobs are
    tagged ``"bcbio_prepare"`` and ``run_local`` is always ``False``.
    """
    return ipc.cluster_view(
        args.scheduler,
        args.queue,
        args.num_jobs,
        args.cores_per_job,
        start_wait=args.timeout,
        extra_params={
            "resources": args.resources,
            "mem": args.memory_per_job,
            "tag": "bcbio_prepare",
            "run_local": False,
        },
    )
def create(parallel, dirs, config):
    """Start a cluster from the ``parallel`` settings and return a view on it.

    Returns an IPython view on the cluster, enabling processing on jobs. The
    IPython profile directory is created under the log directory inside
    ``dirs["work"]``.
    """
    log_root = os.path.join(dirs["work"], get_log_dir(config), "ipython")
    profile_dir = utils.safe_makedir(log_root)
    return ipython_cluster.cluster_view(
        parallel["scheduler"].lower(),
        parallel["queue"],
        parallel["num_jobs"],
        parallel["cores_per_job"],
        profile=profile_dir,
        start_wait=parallel["timeout"],
        extra_params={
            "resources": parallel["resources"],
            "mem": parallel["mem"],
            "tag": parallel.get("tag"),
            "run_local": parallel.get("run_local"),
        },
        retries=parallel.get("retries"),
    )
def get_cluster_view(args, num_jobs=None):
    """Return ``ipc.cluster_view(...)`` using a local ``ipython`` profile.

    Makes sure the ``ipython`` and ``checkpoint`` directories exist in the
    current working directory before building the view.

    ``num_jobs`` overrides ``args.num_jobs`` when it is truthy; the remaining
    settings come from the ``scheduler``, ``queue``, ``cores_per_job``,
    ``timeout``, ``resources``, ``memory_per_job`` and ``local`` attributes
    of ``args``. Jobs are tagged ``"seqcluster"``.
    """
    # Check each directory on its own. Creating "checkpoint" used to hinge on
    # "ipython" being absent, which skipped it when only "ipython" existed
    # and raised when only "checkpoint" existed.
    for dirname in ("ipython", "checkpoint"):
        if not os.path.exists(dirname):
            os.mkdir(dirname)
    if not num_jobs:
        num_jobs = args.num_jobs
    return ipc.cluster_view(args.scheduler, args.queue, num_jobs,
                            args.cores_per_job, start_wait=args.timeout,
                            profile="ipython",
                            extra_params={"resources": args.resources,
                                          "mem": args.memory_per_job,
                                          "tag": "seqcluster",
                                          "run_local": args.local})
def _view_from_parallel(parallel, work_dir, config):
    """Turn the ``parallel`` mapping into a cluster view.

    The IPython profile directory is created under the log directory inside
    ``work_dir``; only ``parallel["resources"]`` is forwarded as an extra
    parameter.
    """
    log_root = os.path.join(work_dir, get_log_dir(config), "ipython")
    profile_dir = utils.safe_makedir(log_root)
    return ipython_cluster.cluster_view(
        parallel["scheduler"].lower(),
        parallel["queue"],
        parallel["num_jobs"],
        parallel["cores_per_job"],
        profile=profile_dir,
        start_wait=parallel["timeout"],
        extra_params={"resources": parallel["resources"]},
        retries=parallel.get("retries"),
    )
def create(parallel, dirs, config):
    """Start a cluster from the ``parallel`` settings and return a view on it.

    Returns an IPython view on the cluster, enabling processing on jobs.

    Unless the caller already supplied a ``mincores=`` entry, one is computed
    and appended to ``parallel["resources"]`` (the list is modified in place)
    when machines have more than one core, so jobs can be batched together
    for shared memory usage.
    """
    log_root = os.path.join(dirs["work"], get_log_dir(config), "ipython")
    profile_dir = utils.safe_makedir(log_root)
    mincores_given = any(spec.startswith("mincores=")
                         for spec in parallel["resources"])
    cores = min(_get_common_cores(config["resources"]),
                parallel["system_cores"])
    if cores > 1 and not mincores_given:
        # Scale the per-machine core count by the memory percentage, never
        # dropping below a single core.
        mem_scaled = max(
            1, int(math.floor(cores * float(parallel.get("mem_pct", 1.0)))))
        if cores > parallel["cores"]:
            # Fewer cores were scheduled than a machine offers.
            cores = parallel["cores"]
        else:
            # Cap at the total cores the whole process will ever need.
            total_needed = parallel["num_jobs"] * parallel["cores_per_job"]
            cores = total_needed if mem_scaled > total_needed else mem_scaled
        cores = per_machine_target_cores(cores, parallel["num_jobs"] // cores)
        parallel["resources"].append("mincores=%s" % cores)
    return ipython_cluster.cluster_view(
        parallel["scheduler"].lower(),
        parallel["queue"],
        parallel["num_jobs"],
        parallel["cores_per_job"],
        profile=profile_dir,
        start_wait=parallel["timeout"],
        extra_params={
            "resources": parallel["resources"],
            "mem": parallel["mem"],
            "tag": parallel.get("tag"),
            "run_local": parallel.get("run_local"),
            "local_controller": parallel.get("local_controller"),
        },
        retries=parallel.get("retries"),
    )
def runner(parallel, fn_name, items, work_dir, config):
    """Run ``fn_name`` over ``items`` on an IPython parallel cluster.

    Clusters are spawned on demand for parallel and custom queue types such
    as multicore and high I/O tasks. A checkpoint directory records finished
    tasks so that sections processed previously do not start a cluster again:
    when the checkpoint file exists the task function is run locally instead.

    Returns the list built by extending with every truthy result.
    """
    setup_logging(config)
    results = []
    checkpoint_dir = utils.safe_makedir(
        os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = _get_checkpoint_file(checkpoint_dir, fn_name)
    tasks_module = __import__(
        "{base}.ipythontasks".format(base=parallel["module"]),
        fromlist=["ipythontasks"])
    fn = getattr(tasks_module, fn_name)
    items = [item for item in items if item is not None]
    num_jobs, cores_per_job = find_cores_per_job(fn, parallel, items, config)
    parallel = dictadd(parallel, "cores_per_job", cores_per_job)
    parallel = dictadd(parallel, "num_jobs", num_jobs)

    if os.path.exists(checkpoint_file):
        # Already finished: run locally on this machine to collect details.
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if not args:
                continue
            data = fn(args)
            if data:
                results.extend(data)
        return results

    # Not checkpointed yet: run on a standard parallel queue.
    logger.info("ipython: %s" % fn_name)
    if items:
        items = [add_cores_to_config(item, cores_per_job) for item in items]
        with ipython_cluster.cluster_view(
                parallel["scheduler"].lower(), parallel["queue"],
                parallel["num_jobs"], parallel["cores_per_job"],
                profile=parallel["profile"]) as view:
            for data in view.map_sync(fn, items, track=False):
                if data:
                    results.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return results
def runner(parallel, fn_name, items, work_dir, config):
    """Run ``fn_name`` over ``items`` on an IPython parallel cluster.

    Clusters are spawned on demand for parallel and custom queue types such
    as multicore and high I/O tasks. A checkpoint directory records finished
    tasks so that sections processed previously do not start a cluster again:
    when the checkpoint file exists the task function is run locally instead.

    Returns the list built by extending with every truthy result.
    """
    setup_logging(config)
    results = []
    checkpoint_dir = utils.safe_makedir(
        os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = _get_checkpoint_file(checkpoint_dir, fn_name)
    tasks_module = __import__(
        "{base}.ipythontasks".format(base=parallel["module"]),
        fromlist=["ipythontasks"])
    fn = getattr(tasks_module, fn_name)
    items = [item for item in items if item is not None]
    num_jobs, cores_per_job = find_cores_per_job(fn, parallel, items, config)
    parallel = dictadd(parallel, "cores_per_job", cores_per_job)
    parallel = dictadd(parallel, "num_jobs", num_jobs)

    if os.path.exists(checkpoint_file):
        # Already finished: run locally on this machine to collect details.
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if not args:
                continue
            data = fn(args)
            if data:
                results.extend(data)
        return results

    # Not checkpointed yet: run on a standard parallel queue.
    logger.info("ipython: %s" % fn_name)
    if items:
        items = [add_cores_to_config(item, cores_per_job) for item in items]
        with ipython_cluster.cluster_view(
                parallel["scheduler"].lower(), parallel["queue"],
                parallel["num_jobs"], parallel["cores_per_job"],
                profile=parallel["profile"]) as view:
            for data in view.map_sync(fn, items, track=False):
                if data:
                    results.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return results
def map(self, fn, args):
    """Map ``fn`` over ``args`` on a cluster view and return the results.

    The default settings request a local run (``run_local`` is True) with
    ``self.processes`` jobs and no scheduler or queue. Entries in
    ``self.cluster_settings.cluster_options`` replace those defaults key by
    key, so an ``extra_params`` entry there replaces the whole default
    ``extra_params`` dict.

    The call waits interactively for the asynchronous map to finish before
    returning ``async_results.get()``.
    """
    # Imported lazily so the module can load without cluster_helper.
    from cluster_helper.cluster import cluster_view
    cluster_args = {
        "scheduler": None,
        "queue": None,
        "num_jobs": self.processes,
        "extra_params": {
            "run_local": True
        }
    }
    cluster_args.update(self.cluster_settings.cluster_options)
    # Parenthesised form: prints the same on Python 2 and is valid Python 3,
    # unlike the former ``print cluster_args`` statement.
    print(cluster_args)
    with cluster_view(**cluster_args) as view:
        async_results = view.map(fn, args, block=False)
        async_results.wait_interactive()
        return async_results.get()
def get_cluster_view(p):
    """Get IPython running: return ``ipc.cluster_view(...)`` built from ``p``.

    ``p`` is a mapping with ``scheduler``, ``queue``, ``num_jobs``,
    ``cores_per_job``, ``timeout``, ``resources``, ``mem`` and ``tag`` keys.
    ``run_local`` is always passed as ``False``.
    """
    return ipc.cluster_view(
        p['scheduler'],
        p['queue'],
        p['num_jobs'],
        p['cores_per_job'],
        start_wait=p['timeout'],
        extra_params={
            "resources": p['resources'],
            "mem": p['mem'],
            "tag": p['tag'],
            "run_local": False,
        },
    )
def load_ipython(args):
    """Load ``args.vcf`` in chunks through an IPython cluster view.

    The VCF is passed through ``bgzip``, its chunks are loaded and merged
    into ``args.db`` on the view, and extras are added to the database last.
    """
    bgzipped_vcf = bgzip(args.vcf)
    view_args = get_ipython_args(args)
    with cluster_view(*view_args) as view:
        chunks = load_chunks_ipython(bgzipped_vcf, args, view)
        merge_chunks_ipython(chunks, args.db, view)
        gemini_annotate.add_extras(args.db, chunks)
"--local", dest="local", default=False, action="store_true") args = parser.parse_args() args.resources = {'resources': args.resources, 'mem': args.mem, 'local_controller': args.local_controller} if args.local: args.resources["run_local"] = True if not (args.local or (args.scheduler and args.queue)): print("Please specify --local to run locally or a scheduler and queue" "to run on with --scheduler and --queue") sys.exit(1) with cluster_view(args.scheduler, args.queue, args.num_jobs, cores_per_job=args.cores_per_job, start_wait=args.timeout, profile=args.profile, extra_params=args.resources) as view: print("First check to see if we can talk to the engines.") results = view.map(lambda x: "hello world!", range(5)) print("This long computation that waits for 5 seconds before " "returning takes a while to run serially..") start_time = time.time() results = list(map(long_computation, range(20), range(20, 40), range(40, 60))) print(results) print("That took {} seconds.".format(time.time() - start_time)) print("Running it in parallel goes much faster...") start_time = time.time() results = list(view.map(long_computation, range(20), range(20, 40), range(40, 60))) print(results) print("That took {} seconds.".format(time.time() - start_time))
bam_files = view.map(sam.sam2bam, out_files) bam_sorted = view.map(sam.bamsort, bam_files) view.map(sam.bamindex, bam_sorted) if __name__ == "__main__": # read in the config file and perform initial setup main_config_file = sys.argv[1] with open(main_config_file) as config_in_handle: startup_config = yaml.load(config_in_handle) parallel = create_base_logger(startup_config, {"type": "ipython"}) setup_local_logging(startup_config, parallel) startup_config["parallel"] = parallel #setup_logging(startup_config) cluster_config = startup_config["cluster"] cores_per_job = cluster_config.get("cores_per_job", 1) if startup_config["cluster"].get("local", False): main(startup_config, DummyView()) else: with cluster_view(cluster_config["scheduler"], cluster_config["queue"], cluster_config["cores"], cores_per_job) as view: main(startup_config, view) class DummyView(object): def __init__(self): self.map = map
def load_ipython(args):
    """Load ``args.vcf`` in chunks through an IPython cluster view.

    The VCF is passed through ``bgzip``; its chunks are then loaded and
    merged on the view.
    """
    bgzipped_vcf = bgzip(args.vcf)
    view_args = get_ipython_args(args)
    with cluster_view(*view_args) as view:
        chunks = load_chunks_ipython(bgzipped_vcf, args, view)
        merge_chunks_ipython(chunks, args, view)
type=int, help="number of cores for each job.") parser.add_argument("--profile", dest="profile", default=None, help="Optional profile to test.") parser.add_argument("--resources", dest="resources", default=None, help="Native specification flags to the scheduler") parser.add_argument("--timeout", dest="timeout", default=15, help="Time (in minutes) to wait before timing out.") parser.add_argument("--memory", dest="mem", default=1, help="Memory in GB to reserve.") args = parser.parse_args() args.resources = {'resources': args.resources, 'mem': args.mem} with cluster_view(args.scheduler, args.queue, args.num_jobs, start_wait=args.timeout, profile=args.profile, extra_params=args.resources) as view: print "First check to see if we can talk to the engines." results = view.map(lambda x: "hello world!", range(5)) print ("This long computation that waits for 5 seconds before returning " "takes a while to run serially..") start_time = time.time() results = map(long_computation, range(20), range(20, 40), range(40, 60)) print results print "That took {0} seconds.".format(time.time() - start_time) print "Running it in parallel goes much faster..." start_time = time.time() results = view.map(long_computation, range(20), range(20, 40), range(40, 60)) print results print "That took {0} seconds.".format(time.time() - start_time)
down_bam = view.map(sam.downsample_bam, *down_args) view.map(rseqc.genebody_coverage, down_bam, [config] * len(down_bam)) view.map(rseqc.junction_annotation, *rseq_args) view.map(rseqc.junction_saturation, *rseq_args) RPKM_args = zip(*product(final_bamfiles, [config])) RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args) RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file, RPKM_count_out) """ annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"], ["ensembl_gene_id"], ["human"])) view.map(annotate.annotate_table_with_biomart, *annotate_args) """ view.map(rseqc.RPKM_saturation, *rseq_args) curr_files = tophat_outputs if __name__ == "__main__": main_config_file = sys.argv[1] with open(main_config_file) as config_in_handle: startup_config = yaml.load(config_in_handle) setup_logging(startup_config) cluster_config = startup_config["cluster"] with cluster_view(cluster_config["scheduler"], cluster_config["queue"], cluster_config["cores"]) as view: main(main_config_file, view)
default="spp", help="Peak caller to run " "(spp or clipper)") parser.add_argument('--cores-per-job', default=1, type=int, help="Number of cores to run for each job.") parser.add_argument('tool_path', help="Path to spp and idr installation.") args = parser.parse_args() args.control = map(os.path.abspath, args.control) args.experimental = map(os.path.abspath, args.experimental) if args.lsf_queue: with cluster_view("LSF", args.lsf_queue, args.num_jobs, cores_per_job=args.cores_per_job) as view: main(args.control, args.experimental, args.tool_path, view.map, args.caller, args.cores_per_job) elif args.sge_queue: with cluster_view("LSF", args.lsf_queue, args.num_jobs, cores_per_job=args.cores_per_job) as view: main(args.control, args.experimental, args.tool_path, view.map, args.caller, args.cores_per_job) elif args.torque_queue: with cluster_view("LSF", args.lsf_queue, args.num_jobs, cores_per_job=args.cores_per_job) as view:
["gene_id"], ["ensembl_gene_id"], ["human"])) view.map(annotate.annotate_table_with_biomart, *annotate_args) """ view.map(rseqc.RPKM_saturation, *rseq_args) curr_files = tophat_outputs def find_cores_per_job(config): max_threads = config["program"].get("max_threads", 1) return max_threads if __name__ == "__main__": # read in the config file and perform initial setup main_config_file = sys.argv[1] with open(main_config_file) as config_in_handle: startup_config = yaml.load(config_in_handle) setup_logging(startup_config) scheduler = startup_config["cluster"]["scheduler"] queue = startup_config["cluster"]["queue"] profile = startup_config["cluster"].get("profile", None) engines = startup_config["cluster"]["cores"] threads = startup_config["program"].get("max_threads", 1) cores_per_job = find_cores_per_job(startup_config) with cluster.cluster_view(scheduler, queue, engines, cores_per_job, profile) as view: main(main_config_file, view)
used.append(comp_file) pairs.append([in_file, comp_file]) break if in_file not in used: pairs.append([in_file]) used.append(in_file) return pairs def main(data_dir, view): fastq_files = list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq"))) fastq_files = combine_pairs(fastq_files) print "Aligning %s." % (fastq_files) aligned_files = view.map(align, fastq_files) print "Marking duplicates in %s." % (aligned_files) marked = view.map(mark_duplicates, aligned_files) print "Filtering duplicates and unmapped reads in %s." % (marked) deduped = view.map(filter_duplicates, marked) #compute_coverage(deduped) print "Computing start sites of %s." % (deduped) starts = view.map(count_starts, deduped) if __name__ == "__main__": data_dir = sys.argv[1] print combine_pairs(list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq")))) fastq_files = combine_pairs(list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq")))) with cluster_view("lsf", "hsph_bioinfo", len(fastq_files), cores_per_job=1) as view: main(data_dir, view)
if __name__ == "__main__":
    # Demo driver: runs the same computation serially and through a cluster
    # view, printing the results and the elapsed time of each run.
    parser = argparse.ArgumentParser(
        description="example script for doing parallel work with IPython.")
    parser.add_argument("--scheduler", dest="scheduler", required=True,
                        help="scheduler to use (lsf, sge, torque or pbs)")
    parser.add_argument("--queue", dest="queue", required=True,
                        help="queue to use on scheduler.")
    parser.add_argument("--num_jobs", dest="num_jobs", required=True,
                        type=int, help="number of jobs to run in parallel.")
    parser.add_argument("--cores_per_job", dest="cores_per_job", default=1,
                        type=int, help="number of cores for each job.")
    parser.add_argument("--profile", dest="profile", default=None,
                        help="Optional profile to test.")
    args = parser.parse_args()

    # --cores_per_job used to be parsed and then silently ignored; forward it
    # so the option actually takes effect.
    with cluster_view(args.scheduler, args.queue, args.num_jobs,
                      cores_per_job=args.cores_per_job,
                      profile=args.profile) as view:
        print("First check to see if we can talk to the engines.")
        results = view.map(lambda x: "hello world!", range(5))
        print("This long computation that waits for 5 seconds before returning "
              "takes a while to run serially..")
        start_time = time.time()
        # list() forces evaluation inside the timed region on Python 3, where
        # map() is lazy; on Python 2 it is a no-op copy.
        results = list(map(long_computation, [1, 2, 3], [4, 5, 6], [7, 8, 9]))
        print(results)
        print("That took {0} seconds.".format(time.time() - start_time))

        print("Running it in parallel goes much faster...")
        start_time = time.time()
        results = list(view.map(long_computation,
                                [1, 2, 3], [4, 5, 6], [7, 8, 9]))
        print(results)
        print("That took {0} seconds.".format(time.time() - start_time))
help="scheduler to use (lsf, sge, torque, slurm, or pbs)") parser.add_argument("--queue", dest='queue', required=True, help="queue to use on scheduler.") parser.add_argument("--num_jobs", dest='num_jobs', required=True, type=int, help="number of jobs to run in parallel.") parser.add_argument("--cores_per_job", dest="cores_per_job", default=1, type=int, help="number of cores for each job.") parser.add_argument("--profile", dest="profile", default=None, help="Optional profile to test.") parser.add_argument("--resources", dest="resources", default=None, help="Native specification flags to the scheduler") args = parser.parse_args() args.resources = {'resources': args.resources} with cluster_view(args.scheduler, args.queue, args.num_jobs, profile=args.profile, extra_params=args.resources) as view: print "First check to see if we can talk to the engines." results = view.map(lambda x: "hello world!", range(5)) print ("This long computation that waits for 5 seconds before returning " "takes a while to run serially..") start_time = time.time() results = map(long_computation, range(20), range(20, 40), range(40, 60)) print results print "That took {0} seconds.".format(time.time() - start_time) print "Running it in parallel goes much faster..." start_time = time.time() results = view.map(long_computation, range(20), range(20, 40), range(40, 60)) print results print "That took {0} seconds.".format(time.time() - start_time)
parser.add_argument('--sge-queue', help="SGE queue name")
parser.add_argument('--torque-queue', help="Torque queue name")
parser.add_argument('--num-jobs', default=1,
                    help="number of parallel jobs to run", type=int)
parser.add_argument('--caller', default="spp",
                    help="Peak caller to run (spp or clipper)")
parser.add_argument('--cores-per-job', default=1, type=int,
                    help="Number of cores to run for each job.")
parser.add_argument('tool_path', help="Path to spp and idr installation.")
args = parser.parse_args()
args.control = map(os.path.abspath, args.control)
args.experimental = map(os.path.abspath, args.experimental)

# Pick the scheduler matching whichever queue option was given. The SGE and
# Torque branches used to be verbatim copies of the LSF branch, so they
# started an LSF cluster with ``args.lsf_queue`` (unset in those branches).
# NOTE(review): scheduler names follow the upper-case "LSF" spelling already
# used here -- confirm cluster_view accepts "SGE" and "TORQUE" as written.
if args.lsf_queue:
    with cluster_view("LSF", args.lsf_queue, args.num_jobs,
                      cores_per_job=args.cores_per_job) as view:
        main(args.control, args.experimental, args.tool_path, view.map,
             args.caller, args.cores_per_job)
elif args.sge_queue:
    with cluster_view("SGE", args.sge_queue, args.num_jobs,
                      cores_per_job=args.cores_per_job) as view:
        main(args.control, args.experimental, args.tool_path, view.map,
             args.caller, args.cores_per_job)
elif args.torque_queue:
    with cluster_view("TORQUE", args.torque_queue, args.num_jobs,
                      cores_per_job=args.cores_per_job) as view:
        main(args.control, args.experimental, args.tool_path, view.map,
             args.caller, args.cores_per_job)
else:
    # No queue requested: run serially with the builtin map.
    main(args.control, args.experimental, args.tool_path, map,
         args.caller, args.cores_per_job)
["ensembl_gene_id"], ["human"])) view.map(annotate.annotate_table_with_biomart, *annotate_args) """ view.map(rseqc.RPKM_saturation, *rseq_args) curr_files = tophat_outputs def find_cores_per_job(config): max_threads = config["program"].get("max_threads", 1) return max_threads if __name__ == "__main__": # read in the config file and perform initial setup main_config_file = sys.argv[1] with open(main_config_file) as config_in_handle: startup_config = yaml.load(config_in_handle) setup_logging(startup_config) scheduler = startup_config["cluster"]["scheduler"] queue = startup_config["cluster"]["queue"] profile = startup_config["cluster"].get("profile", None) engines = startup_config["cluster"]["cores"] threads = startup_config["program"].get("max_threads", 1) cores_per_job = find_cores_per_job(startup_config) with cluster.cluster_view(scheduler, queue, engines, cores_per_job, profile) as view: main(main_config_file, view)
def load_ipython(args):
    """Load ``args.vcf`` in chunks through an IPython cluster view.

    The VCF is passed through ``bgzip``, its chunks are loaded and merged
    into ``args.db`` on the view, and extras are added to the database last.
    """
    bgzipped_vcf = bgzip(args.vcf)
    view_args = get_ipython_args(args)
    with cluster_view(*view_args) as view:
        chunks = load_chunks_ipython(bgzipped_vcf, args, view)
        merge_chunks_ipython(chunks, args.db, view)
        gemini_annotate.add_extras(args.db, chunks)
def get_cluster_view(args):
    """Get IPython running: return ``ipc.cluster_view(...)`` built from ``args``.

    Reads ``scheduler``, ``queue``, ``num_jobs``, ``cores_per_job``,
    ``timeout``, ``resources`` and ``memory_per_job`` from ``args``. Jobs are
    tagged ``"bcbio_prepare"`` and ``run_local`` is always ``False``.
    """
    return ipc.cluster_view(
        args.scheduler,
        args.queue,
        args.num_jobs,
        args.cores_per_job,
        start_wait=args.timeout,
        extra_params={
            "resources": args.resources,
            "mem": args.memory_per_job,
            "tag": "bcbio_prepare",
            "run_local": False,
        },
    )
def load_ipython(args):
    """Load ``args.vcf`` in chunks through an IPython cluster view.

    The VCF is passed through ``bgzip``; its chunks are then loaded and
    merged on the view.
    """
    bgzipped_vcf = bgzip(args.vcf)
    view_args = get_ipython_args(args)
    with cluster_view(*view_args) as view:
        chunks = load_chunks_ipython(bgzipped_vcf, args, view)
        merge_chunks_ipython(chunks, args, view)
if in_file not in used: pairs.append([in_file]) used.append(in_file) return pairs def main(data_dir, view): fastq_files = list( glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq"))) fastq_files = combine_pairs(fastq_files) print "Aligning %s." % (fastq_files) aligned_files = view.map(align, fastq_files) print "Marking duplicates in %s." % (aligned_files) marked = view.map(mark_duplicates, aligned_files) print "Filtering duplicates and unmapped reads in %s." % (marked) deduped = view.map(filter_duplicates, marked) #compute_coverage(deduped) print "Computing start sites of %s." % (deduped) starts = view.map(count_starts, deduped) if __name__ == "__main__": data_dir = sys.argv[1] print combine_pairs( list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq")))) fastq_files = combine_pairs( list(glob.glob(os.path.join(data_dir, "*_trimmed.fixed.fastq")))) with cluster_view("lsf", "hsph_bioinfo", len(fastq_files), cores_per_job=1) as view: main(data_dir, view)