def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    logger.info("System YAML configuration: %s" % os.path.abspath(config_file))
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    pipelines, config = _pair_samples_with_pipelines(run_info_yaml, config)
    system.write_info(dirs, parallel, config)
    with tx_tmpdir(config if parallel.get("type") == "local" else None) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, samples in pipelines.items():
            for xs in pipeline(config, run_info_yaml, parallel, dirs, samples):
                pass
def run_main(workdir, config_file=None, fc_dir=None, run_info_yaml=None,
             parallel=None, workflow=None):
    """Run variant analysis, handling command line options.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    locale_to_use = utils.get_locale()
    os.environ["LC_ALL"] = locale_to_use
    os.environ["LC"] = locale_to_use
    os.environ["LANG"] = locale_to_use
    workdir = utils.safe_makedir(os.path.abspath(workdir))
    os.chdir(workdir)
    config, config_file = config_utils.load_system_config(config_file, workdir)
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    logger.info(f"System YAML configuration: {os.path.abspath(config_file)}.")
    logger.info(f"Locale set to {locale_to_use}.")
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(workdir, DEFAULT_LOG_DIR)
    if parallel["type"] in ["local", "clusterk"]:
        _setup_resources()
        _run_toplevel(config, config_file, workdir, parallel, fc_dir, run_info_yaml)
    elif parallel["type"] == "ipython":
        assert parallel["scheduler"] is not None, "IPython parallel requires a specified scheduler (-s)"
        if parallel["scheduler"] != "sge":
            assert parallel["queue"] is not None, "IPython parallel requires a specified queue (-q)"
        elif not parallel["queue"]:
            parallel["queue"] = ""
        _run_toplevel(config, config_file, workdir, parallel, fc_dir, run_info_yaml)
    else:
        raise ValueError("Unexpected type of parallel run: %s" % parallel["type"])
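# Illustrative sketch, not part of the snippets above: one way run_main might be
# invoked from an entry point. The "type", "cores", "scheduler" and "queue" keys
# mirror those read from the parallel dict inside run_main; the directory and
# file names below are hypothetical placeholders.
if __name__ == "__main__":
    parallel = {"type": "local", "cores": 1, "scheduler": None, "queue": None}
    run_main("/path/to/project/work",
             config_file="bcbio_system.yaml",
             run_info_yaml="project.yaml",
             parallel=parallel)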
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "work": work_dir,
            "flowcell": fc_dir, "config": config_dir}
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "work": work_dir,
            "flowcell": fc_dir, "config": config_dir}
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, run_parallel, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "work": work_dir,
            "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
            if len(xs) == 1:
                upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None, samples=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    samples -- Pre-processed samples, useful if run inside of docker containers.
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    if samples:
        dockerized = True
    else:
        dockerized = False
        samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            if not dockerized:
                versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    pipelines, config = _pair_samples_with_pipelines(run_info_yaml, config)
    system.write_info(dirs, parallel, config)
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, samples in pipelines.items():
            for xs in pipeline.run(config, run_info_yaml, parallel, dirs, samples):
                pass
"bcbio_system.yaml") except ValueError as err: print(err) print( "WARNING: Attempting to read bcbio_system.yaml in the current directory." ) system_config = "bcbio_system.yaml" with open(system_config) as in_handle: config = yaml.load(in_handle) res = {'cores': args.cores_per_job} config["algorithm"] = {"num_cores": args.cores_per_job} config["resources"].update({'sambamba': res, 'samtools': res}) config["log_dir"] = os.path.join(os.path.abspath(os.getcwd()), "log") parallel = clargs.to_parallel(args) parallel.update({'progs': ['samtools', 'sambamba']}) parallel = log.create_base_logger(config, parallel) log.setup_local_logging(config, parallel) dirs = {'work': os.path.abspath(os.getcwd())} system.write_info(dirs, parallel, config) sysinfo = system.machine_info()[0] samples = _get_samples_to_process(args.csv, out_dir, config, args.force_single, args.separators) parallel = resources.calculate(parallel, [samples], sysinfo, config) with prun.start(parallel, samples, config, dirs) as run_parallel: with profile.report("prepare bcbio samples", dirs): samples = run_parallel("prepare_bcbio_samples", samples) create_new_csv(samples, args)
parser.add_argument("-p", "--tag", help="Tag name to label jobs on the cluster", default="bcb-prep") parser.add_argument("-t", "--paralleltype", choices=["local", "ipython"], default="local", help="Run with iptyhon") args = parser.parse_args() out_dir = os.path.abspath(args.out) utils.safe_makedir(out_dir) system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml") with open(system_config) as in_handle: config = yaml.load(in_handle) res = {'cores': args.cores_per_job} config["algorithm"] = {"num_cores": args.cores_per_job} config["resources"].update({'sambamba': res, 'samtools': res}) parallel = clargs.to_parallel(args) parallel.update({'progs': ['samtools', 'sambamba']}) parallel = log.create_base_logger(config, parallel) log.setup_local_logging(config, parallel) dirs = {'work': os.path.abspath(os.getcwd())} system.write_info(dirs, parallel, config) sysinfo = system.machine_info()[0] samples = _get_samples_to_process(args.csv, out_dir, config) parallel = resources.calculate(parallel, [samples], sysinfo, config) with prun.start(parallel, samples, config, dirs) as run_parallel: with profile.report("prepare bcbio samples", dirs): samples = run_parallel("prepare_bcbio_samples", samples) create_new_csv(samples, args)
def setup_log(config, parallel):
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
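# Minimal usage sketch for the setup_log helper above, with assumed placeholder
# values: create_base_logger normalizes the parallel description and
# setup_local_logging attaches handlers based on the logging settings in config
# (for example a "log_dir" entry, as seen in the other snippets).
config = {"log_dir": "log"}
parallel = {"type": "local", "cores": 1}
setup_log(config, parallel)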
for stage in config["run"]: if stage == "disambiguate": logger.info("Disambiguating %s." % (curr_files)) disambiguate = Disambiguate(config) out_files = list(flatten(view.map(disambiguate, curr_files))) bam_files = view.map(sam.sam2bam, out_files) bam_sorted = view.map(sam.bamsort, bam_files) view.map(sam.bamindex, bam_sorted) if __name__ == "__main__": # read in the config file and perform initial setup main_config_file = sys.argv[1] with open(main_config_file) as config_in_handle: startup_config = yaml.load(config_in_handle) parallel = create_base_logger(startup_config, {"type": "ipython"}) setup_local_logging(startup_config, parallel) startup_config["parallel"] = parallel #setup_logging(startup_config) cluster_config = startup_config["cluster"] cores_per_job = cluster_config.get("cores_per_job", 1) if startup_config["cluster"].get("local", False): main(startup_config, DummyView()) else: with cluster_view(cluster_config["scheduler"], cluster_config["queue"], cluster_config["cores"], cores_per_job) as view: main(startup_config, view)