def do_analysis(args, dockerconf):
    """Run a full analysis on a local machine, utilizing multiple cores.
    """
    work_dir = os.getcwd()
    with open(args.sample_config) as in_handle:
        sample_config, dmounts = mounts.update_config(yaml.load(in_handle), args.fcdir)
    dmounts += mounts.prepare_system(args.datadir, dockerconf["biodata_dir"])
    dmounts.append("%s:%s" % (work_dir, dockerconf["work_dir"]))
    system_config, system_mounts = _read_system_config(dockerconf, args.systemconfig, args.datadir)
    system_cfile = os.path.join(work_dir, "bcbio_system-forvm.yaml")
    sample_cfile = os.path.join(work_dir, "bcbio_sample-forvm.yaml")
    with open(system_cfile, "w") as out_handle:
        yaml.dump(system_config, out_handle, default_flow_style=False, allow_unicode=False)
    with open(sample_cfile, "w") as out_handle:
        yaml.dump(sample_config, out_handle, default_flow_style=False, allow_unicode=False)
    in_files = [os.path.join(dockerconf["work_dir"], os.path.basename(x))
                for x in [system_cfile, sample_cfile]]
    log.setup_local_logging({"include_time": False})
    manage.run_bcbio_cmd(args.image, dmounts + system_mounts,
                         in_files + ["--numcores", str(args.numcores),
                                     "--workdir=%s" % dockerconf["work_dir"]])
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, multisample = _world_from_cwl(fnargs[1:], work_dir)
        argfile = os.path.join(work_dir, "cwl-%s-world.json" % args.name)
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        out = fn(fnargs)
        if argfile:
            with open(argfile, "w") as out_handle:
                if argfile.endswith(".json"):
                    if multisample:
                        json.dump([_remove_work_dir(xs[0], work_dir + "/") for xs in out], out_handle)
                    else:
                        json.dump(_remove_work_dir(out[0][0], work_dir + "/"), out_handle)
                else:
                    yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, run_parallel, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def run_main(workdir, config_file=None, fc_dir=None, run_info_yaml=None,
             parallel=None, workflow=None):
    """Run variant analysis, handling command line options.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    locale_to_use = utils.get_locale()
    os.environ["LC_ALL"] = locale_to_use
    os.environ["LC"] = locale_to_use
    os.environ["LANG"] = locale_to_use
    workdir = utils.safe_makedir(os.path.abspath(workdir))
    os.chdir(workdir)
    config, config_file = config_utils.load_system_config(config_file, workdir)
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    logger.info(f"System YAML configuration: {os.path.abspath(config_file)}.")
    logger.info(f"Locale set to {locale_to_use}.")
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(workdir, DEFAULT_LOG_DIR)
    if parallel["type"] in ["local", "clusterk"]:
        _setup_resources()
        _run_toplevel(config, config_file, workdir, parallel, fc_dir, run_info_yaml)
    elif parallel["type"] == "ipython":
        assert parallel["scheduler"] is not None, "IPython parallel requires a specified scheduler (-s)"
        if parallel["scheduler"] != "sge":
            assert parallel["queue"] is not None, "IPython parallel requires a specified queue (-q)"
        elif not parallel["queue"]:
            parallel["queue"] = ""
        _run_toplevel(config, config_file, workdir, parallel, fc_dir, run_info_yaml)
    else:
        raise ValueError("Unexpected type of parallel run: %s" % parallel["type"])
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    logger.info("System YAML configuration: %s" % os.path.abspath(config_file))
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    pipelines, config = _pair_samples_with_pipelines(run_info_yaml, config)
    system.write_info(dirs, parallel, config)
    with tx_tmpdir(config if parallel.get("type") == "local" else None) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, samples in pipelines.items():
            for xs in pipeline(config, run_info_yaml, parallel, dirs, samples):
                pass
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
            if len(xs) == 1:
                upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs = _world_from_cwl(fnargs[1:], work_dir)
        argfile = os.path.join(work_dir, "cwl-%s-world.json" % args.name)
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        out = fn(fnargs)
        if argfile:
            with open(argfile, "w") as out_handle:
                if argfile.endswith(".json"):
                    json.dump(_remove_work_dir(out[0][0], work_dir + "/"), out_handle)
                else:
                    yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None, samples=None):
    """
    Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    samples -- Pre-processed samples, useful if run inside of docker containers.
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    if samples:
        dockerized = True
    else:
        dockerized = False
        samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            if not dockerized:
                versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def __call__(self, in_files):
    setup_local_logging(self.config, self.config["parallel"])
    self._start_message(in_files)
    # first is human, second is mouse
    out_files = self._disambiguate(in_files[0], in_files[1])
    dis_files = self._disambiguate_out(in_files)
    if all(map(file_exists, dis_files)):
        [shutil.move(x[0], x[1]) for x in zip(dis_files, out_files)]
    self._end_message(in_files)
    return out_files
def check_and_postprocess(args):
    """Check for newly dumped sequencer output, post-processing and transferring.
    """
    with open(args.process_config) as in_handle:
        config = yaml.safe_load(in_handle)
    setup_local_logging()
    for dname in _find_unprocessed(config):
        runinfo = nglims.get_runinfo(config["galaxy_url"], config["galaxy_apikey"], dname)
        lane_details = nglims.flatten_lane_detail(runinfo)
        fcid_ss = samplesheet.from_flowcell(dname, lane_details)
        fastq_dir = demultiplex.run_bcl2fastq(dname, fcid_ss, config)
def check_and_postprocess(args):
    """Check for newly dumped sequencer output, post-processing and transferring.
    """
    with open(args.process_config) as in_handle:
        config = yaml.safe_load(in_handle)
    setup_local_logging(config)
    for dname in _find_unprocessed(config):
        lane_details = nglims.get_runinfo(config["galaxy_url"], config["galaxy_apikey"], dname,
                                          utils.get_in(config, ("process", "storedir")))
        fcid_ss = samplesheet.from_flowcell(dname, lane_details)
        _update_reported(config["msg_db"], dname)
        fastq_dir = demultiplex.run_bcl2fastq(dname, fcid_ss, config)
        bcbio_config, ready_fastq_dir = nglims.prep_samples_and_config(dname, lane_details, fastq_dir, config)
        transfer.copy_flowcell(dname, ready_fastq_dir, bcbio_config, config)
        _start_processing(dname, bcbio_config, config)
def check_and_postprocess(args):
    """Check for newly dumped sequencer output, post-processing and transferring.
    """
    with open(args.process_config) as in_handle:
        config = yaml.safe_load(in_handle)
    setup_local_logging(config)
    for dname in _find_unprocessed(config):
        lane_details = nglims.get_runinfo(config["galaxy_url"], config["galaxy_apikey"], dname,
                                          utils.get_in(config, ("process", "storedir")))
        fcid_ss = samplesheet.from_flowcell(dname, lane_details)
        _update_reported(config["msg_db"], dname)
        fastq_dir = demultiplex.run_bcl2fastq(dname, fcid_ss, config)
        bcbio_config, ready_fastq_dir = nglims.prep_samples_and_config(dname, lane_details, fastq_dir, config)
        transfer.copy_to_remote(dname, ready_fastq_dir, bcbio_config, config)
        _start_processing(dname, bcbio_config, config)
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys = _world_from_cwl(fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys = None, []
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise
        if argfile:
            try:
                _write_out_argfile(argfile, out, fnargs, parallel, out_keys, work_dir)
            except:
                logger.exception()
                raise
            if argfile.endswith(".json"):
                _write_wdl_outputs(argfile, out_keys)
def _setup_logging(args):
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
def wrapper(*args, **kwargs):
    config = None
    for arg in args:
        if config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif config_utils.is_nested_config_arg(arg):
            config = arg["config"]
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    assert config, "Could not find config dictionary in function arguments."
    if config.get("parallel", {}).get("log_queue") and not config.get("parallel", {}).get("wrapper"):
        handler = setup_local_logging(config, config["parallel"])
    else:
        handler = None
    try:
        out = f(*args, **kwargs)
    finally:
        if handler and hasattr(handler, "close"):
            handler.close()
    return out
def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
def _setup_logging(args):
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
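The _setup_logging variants above yield config rather than returning it, so they read as generator-based context managers. A minimal sketch of the assumed usage, where the generator is registered with contextlib.contextmanager and the call site (run_with_logging, task) is purely illustrative:

import contextlib

# Assumption: wrap the generator so it can drive a with-statement.
_setup_logging_cm = contextlib.contextmanager(_setup_logging)

def run_with_logging(task, args):
    # Hypothetical call site: configure logging for the duration of one task,
    # closing the handler in the generator's finally block on exit.
    with _setup_logging_cm(args) as config:
        return task(config)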
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    with open(args.argfile) as in_handle:
        fnargs = yaml.safe_load(in_handle)
    fnargs = config_utils.merge_resources(fnargs)
    work_dir = os.path.dirname(args.argfile)
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        out = fn(fnargs)
        out_file = "%s-out%s" % os.path.splitext(args.argfile)
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    pipelines, config = _pair_samples_with_pipelines(run_info_yaml, config)
    system.write_info(dirs, parallel, config)
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, samples in pipelines.items():
            for xs in pipeline.run(config, run_info_yaml, parallel, dirs, samples):
                pass
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    locale_to_use = utils.get_locale()
    os.environ["LC_ALL"] = locale_to_use
    os.environ["LC"] = locale_to_use
    os.environ["LANG"] = locale_to_use
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys, input_files = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys, input_files = None, {}, []
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(*fnargs)
            except:
                logger.exception()
                raise
            finally:
                # Clean up any copied and unpacked workflow inputs, avoiding extra disk usage
                wf_input_dir = os.path.join(work_dir, "wf-inputs")
                if os.path.exists(wf_input_dir) and os.path.isdir(wf_input_dir):
                    shutil.rmtree(wf_input_dir)
            if argfile:
                try:
                    _write_out_argfile(argfile, out, fnargs, parallel, out_keys, input_files, work_dir)
                except:
                    logger.exception()
                    raise
def do_analysis(args, dockerconf):
    """Run a full analysis on a local machine, utilizing multiple cores.
    """
    work_dir = os.getcwd()
    with open(args.sample_config) as in_handle:
        sample_config, dmounts = mounts.update_config(yaml.load(in_handle),
                                                      dockerconf["input_dir"], args.fcdir)
    dmounts += mounts.prepare_system(args.datadir, dockerconf["biodata_dir"])
    dmounts.append("%s:%s" % (work_dir, dockerconf["work_dir"]))
    system_config, system_mounts = _read_system_config(dockerconf, args.systemconfig, args.datadir)
    system_cfile = os.path.join(work_dir, "bcbio_system-forvm.yaml")
    sample_cfile = os.path.join(work_dir, "bcbio_sample-forvm.yaml")
    with open(system_cfile, "w") as out_handle:
        yaml.dump(system_config, out_handle, default_flow_style=False, allow_unicode=False)
    with open(sample_cfile, "w") as out_handle:
        yaml.dump(sample_config, out_handle, default_flow_style=False, allow_unicode=False)
    in_files = [os.path.join(dockerconf["work_dir"], os.path.basename(x))
                for x in [system_cfile, sample_cfile]]
    log.setup_local_logging({"include_time": False})
    manage.run_bcbio_cmd(dockerconf["image"], dmounts + system_mounts,
                         in_files + ["--workdir=%s" % dockerconf["work_dir"]])
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel = _world_from_cwl(fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel = None
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise
        if argfile:
            try:
                _write_out_argfile(argfile, out, fnargs, parallel, work_dir)
            except:
                logger.exception()
                raise
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
            if len(xs) == 1:
                upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel = _world_from_cwl(fnargs[1:], work_dir)
        argfile = os.path.join(work_dir, "cwl-%s-world.json" % args.name)
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        out = fn(fnargs)
        if argfile:
            with open(argfile, "w") as out_handle:
                if argfile.endswith(".json"):
                    if parallel in ["single-split", "multi-combined", "batch-split"]:
                        json.dump([utils.to_single_data(xs) for xs in out],
                                  out_handle, sort_keys=True, separators=(',', ':'))
                    elif parallel in ["multi-batch"]:
                        json.dump([_collapse_to_cwl_record(xs, work_dir) for xs in out],
                                  out_handle, sort_keys=True, separators=(',', ':'))
                    else:
                        json.dump(utils.to_single_data(utils.to_single_data(out)),
                                  out_handle, sort_keys=True, separators=(',', ':'))
                else:
                    yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys, input_files = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys, input_files = None, {}, []
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(*fnargs)
            except:
                logger.exception()
                raise
            finally:
                # Clean up any copied and unpacked workflow inputs, avoiding extra disk usage
                wf_input_dir = os.path.join(work_dir, "wf-inputs")
                if os.path.exists(wf_input_dir) and os.path.isdir(wf_input_dir):
                    shutil.rmtree(wf_input_dir)
            if argfile:
                try:
                    _write_out_argfile(argfile, out, fnargs, parallel, out_keys, input_files, work_dir)
                except:
                    logger.exception()
                    raise
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html
    """
    if data["config"].get("parallel", {}).get("log_queue"):
        handler = setup_local_logging(data["config"], data["config"]["parallel"])
    else:
        handler = None
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            broad_runner.run_fn("picard_index", data["work_bam"])
            params = ["-T", "CallableLoci",
                      "-R", data["sam_ref"],
                      "-I", data["work_bam"],
                      "--out", tx_out_file,
                      "--summary", out_summary]
            ready_region = shared.subset_variant_regions(variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region and os.path.isfile(ready_region))
                    or not variant_regions or not region):
                broad_runner.run_gatk(params)
            else:
                with open(out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"], data["config"]):
                        if tregion.chrom == region:
                            out_handle.write("%s\t%s\t%s\tNO_COVERAGE\n" %
                                             (tregion.chrom, tregion.start, tregion.stop))
    if handler and hasattr(handler, "close"):
        handler.close()
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html
    """
    if data["config"].get("parallel", {}).get("log_queue"):
        handler = setup_local_logging(data["config"], data["config"]["parallel"])
    else:
        handler = None
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    # set a maximum depth to avoid calling in repetitive regions with excessive coverage
    max_depth = int(1e6 if data["config"]["algorithm"].get("coverage_depth", "").lower() == "super-high"
                    else 2.5e4)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            bam.index(data["work_bam"], data["config"])
            params = ["-T", "CallableLoci",
                      "-R", data["sam_ref"],
                      "-I", data["work_bam"],
                      "--minDepth", "0",
                      "--downsample_to_coverage", str(max_depth + 1000),
                      "--minMappingQuality", "0",
                      "--maxFractionOfReadsWithLowMAPQ", "1.1",
                      "--maxDepth", str(max_depth),
                      "--out", tx_out_file,
                      "--summary", out_summary]
            ready_region = shared.subset_variant_regions(variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region and os.path.isfile(ready_region))
                    or not variant_regions or not region):
                broad_runner.run_gatk(params)
            else:
                with open(out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"], data["config"]):
                        if tregion.chrom == region:
                            out_handle.write("%s\t%s\t%s\tNO_COVERAGE\n" %
                                             (tregion.chrom, tregion.start, tregion.stop))
    if handler and hasattr(handler, "close"):
        handler.close()
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
def wrapper(*args, **kwargs):
    config = None
    for arg in args:
        if ipython.is_std_config_arg(arg):
            config = arg
            break
        elif ipython.is_nested_config_arg(arg):
            config = arg["config"]
            break
    assert config, "Could not find config dictionary in function arguments."
    if config.get("parallel", {}).get("log_queue"):
        handler = setup_local_logging(config, config["parallel"])
    else:
        handler = None
    try:
        out = f(*args, **kwargs)
    finally:
        if handler and hasattr(handler, "close"):
            handler.close()
    return out
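Both wrapper functions above close over a free variable f, which implies they are defined inside a decorator that opens a per-call log handler around the wrapped task. A minimal sketch of such an enclosing decorator, under that assumption (with_local_logging and _find_config are illustrative names, not from the original source):

import functools

def with_local_logging(f):
    """Hypothetical decorator: open a local log handler for each call to f."""
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        # Assumed helper that locates the config dictionary among the arguments,
        # analogous to the inline search loops in the wrappers above.
        config = _find_config(args)
        if config.get("parallel", {}).get("log_queue"):
            handler = setup_local_logging(config, config["parallel"])
        else:
            handler = None
        try:
            return f(*args, **kwargs)
        finally:
            if handler and hasattr(handler, "close"):
                handler.close()
    return wrapper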
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html
    """
    if data["config"].get("parallel", {}).get("log_queue"):
        handler = setup_local_logging(data["config"], data["config"]["parallel"])
    else:
        handler = None
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            bam.index(data["work_bam"], data["config"])
            params = ["-T", "CallableLoci",
                      "-R", data["sam_ref"],
                      "-I", data["work_bam"],
                      "--out", tx_out_file,
                      "--summary", out_summary]
            ready_region = shared.subset_variant_regions(variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region and os.path.isfile(ready_region))
                    or not variant_regions or not region):
                broad_runner.run_gatk(params)
            else:
                with open(out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"], data["config"]):
                        if tregion.chrom == region:
                            out_handle.write("%s\t%s\t%s\tNO_COVERAGE\n" %
                                             (tregion.chrom, tregion.start, tregion.stop))
    if handler and hasattr(handler, "close"):
        handler.close()
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
def setup_log(config, parallel):
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    for stage in config["run"]:
        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = Disambiguate(config)
            out_files = list(flatten(view.map(disambiguate, curr_files)))
            bam_files = view.map(sam.sam2bam, out_files)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)

if __name__ == "__main__":
    # read in the config file and perform initial setup
    main_config_file = sys.argv[1]
    with open(main_config_file) as config_in_handle:
        startup_config = yaml.load(config_in_handle)
    parallel = create_base_logger(startup_config, {"type": "ipython"})
    setup_local_logging(startup_config, parallel)
    startup_config["parallel"] = parallel
    #setup_logging(startup_config)
    cluster_config = startup_config["cluster"]
    cores_per_job = cluster_config.get("cores_per_job", 1)
    if startup_config["cluster"].get("local", False):
        main(startup_config, DummyView())
    else:
        with cluster_view(cluster_config["scheduler"], cluster_config["queue"],
                          cluster_config["cores"], cores_per_job) as view:
            main(startup_config, view)
parser.add_argument("-p", "--tag", help="Tag name to label jobs on the cluster", default="bcb-prep")
parser.add_argument("-t", "--paralleltype", choices=["local", "ipython"],
                    default="local", help="Run with ipython")
args = parser.parse_args()
out_dir = os.path.abspath(args.out)
utils.safe_makedir(out_dir)
system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
with open(system_config) as in_handle:
    config = yaml.load(in_handle)
res = {'cores': args.cores_per_job}
config["algorithm"] = {"num_cores": args.cores_per_job}
config["resources"].update({'sambamba': res, 'samtools': res})
parallel = clargs.to_parallel(args)
parallel.update({'progs': ['samtools', 'sambamba']})
parallel = log.create_base_logger(config, parallel)
log.setup_local_logging(config, parallel)
dirs = {'work': os.path.abspath(os.getcwd())}
system.write_info(dirs, parallel, config)
sysinfo = system.machine_info()[0]
samples = _get_samples_to_process(args.csv, out_dir, config)
parallel = resources.calculate(parallel, [samples], sysinfo, config)
with prun.start(parallel, samples, config, dirs) as run_parallel:
    with profile.report("prepare bcbio samples", dirs):
        samples = run_parallel("prepare_bcbio_samples", samples)
create_new_csv(samples, args)
"bcbio_system.yaml") except ValueError as err: print(err) print( "WARNING: Attempting to read bcbio_system.yaml in the current directory." ) system_config = "bcbio_system.yaml" with open(system_config) as in_handle: config = yaml.load(in_handle) res = {'cores': args.cores_per_job} config["algorithm"] = {"num_cores": args.cores_per_job} config["resources"].update({'sambamba': res, 'samtools': res}) config["log_dir"] = os.path.join(os.path.abspath(os.getcwd()), "log") parallel = clargs.to_parallel(args) parallel.update({'progs': ['samtools', 'sambamba']}) parallel = log.create_base_logger(config, parallel) log.setup_local_logging(config, parallel) dirs = {'work': os.path.abspath(os.getcwd())} system.write_info(dirs, parallel, config) sysinfo = system.machine_info()[0] samples = _get_samples_to_process(args.csv, out_dir, config, args.force_single, args.separators) parallel = resources.calculate(parallel, [samples], sysinfo, config) with prun.start(parallel, samples, config, dirs) as run_parallel: with profile.report("prepare bcbio samples", dirs): samples = run_parallel("prepare_bcbio_samples", samples) create_new_csv(samples, args)