def main(config_file, fc_dir, analysis_dir, run_info_yaml=None):
    config = load_config(config_file)
    galaxy_api = (GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
                  if config.has_key("galaxy_api_key") else None)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    base_folder_name = "%s_%s" % (fc_date, fc_name)
    run_details = lims_run_details(run_info, base_folder_name)
    for (library_name, access_role, dbkey, lane, bc_id, name, desc,
         local_name, fname_out) in run_details:
        library_id = (get_galaxy_library(library_name, galaxy_api)
                      if library_name else None)
        upload_files = list(select_upload_files(local_name, bc_id, fc_dir,
                                                analysis_dir, config, fname_out))
        if len(upload_files) > 0:
            print lane, bc_id, name, desc, library_name
            print "Creating storage directory"
            if library_id:
                folder, cur_galaxy_files = get_galaxy_folder(library_id,
                        base_folder_name, name, desc, galaxy_api)
            else:
                cur_galaxy_files = []
            store_dir = move_to_storage(lane, bc_id, base_folder_name,
                                        upload_files, cur_galaxy_files,
                                        config, config_file, fname_out)
            if store_dir and library_id:
                print "Uploading directory of files to Galaxy"
                print galaxy_api.upload_directory(library_id, folder['id'],
                                                  store_dir, dbkey, access_role)
    if galaxy_api and not run_info_yaml:
        add_run_summary_metrics(analysis_dir, galaxy_api)
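# Hedged invocation sketch for main() above, not part of the original source:
# every path and file name below is a made-up placeholder, and the config
# YAML is assumed to carry galaxy_url plus galaxy_api_key for the Galaxy steps.
if __name__ == "__main__":
    main("post_process.yaml", "/data/110101_FC12345AXX",
         "/data/110101_FC12345AXX/analysis", run_info_yaml="run_info.yaml")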
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    ## process each flowcell lane
    #run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    #lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    #lane_items = run_parallel("process_lane", lanes)
    logger.info(">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)
    #for item in lane_items:
    #    utils.prettyprint_dict(item)
    logger.info(">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)

    ## process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    logger.info(">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info(">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info(">>> Realign samples")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info(">>> Variant call")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info(">>> Postprocess variants")
    samples = run_parallel("postprocess_variants", samples)
    logger.info(">>> Combine multiple callers")
    samples = combine_multiple_callers(samples)
    logger.info(">>> Detect SV")
    samples = run_parallel("detect_sv", samples)
    logger.info(">>> Combine calls")
    samples = run_parallel("combine_calls", samples)
    logger.info(">>> Process sample")
    run_parallel("process_sample", samples)
    logger.info(">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    logger.info(">>> Writing project summary")
    write_project_summary(samples)
    logger.info(">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info(">>> Done")
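# A minimal sketch, assuming the call shape run_main uses above: the object
# returned by parallel_runner is invoked as run_parallel(name, items) with an
# optional extras dict (see the generate_bigwig call). This no-op stand-in is
# inferred from those call sites rather than taken from the bcbio source, and
# can help when dry-wiring the stage sequence without a cluster.
def _noop_run_parallel(fn_name, items, extra_args=None):
    # Materialize the (possibly generator) items and pass them through unchanged.
    items = list(items)
    logger.info(">>> would run %s on %d item(s)" % (fn_name, len(items)))
    return items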
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples,
                           {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    galaxy_api = (GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
                  if config.has_key("galaxy_api_key") else None)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    base_folder_name = "%s_%s" % (fc_date, fc_name)
    run_details = lims_run_details(run_info, base_folder_name)
    for (library_name, access_role, dbkey, lane, bc_id, name, desc,
         local_name) in run_details:
        library_id = (get_galaxy_library(library_name, galaxy_api)
                      if library_name else None)
        upload_files = list(select_upload_files(local_name, bc_id, fc_dir,
                                                analysis_dir, config))
        if len(upload_files) > 0:
            print lane, bc_id, name, desc, library_name
            print "Creating storage directory"
            if library_id:
                folder, cur_galaxy_files = get_galaxy_folder(library_id,
                        base_folder_name, name, desc, galaxy_api)
            else:
                cur_galaxy_files = []
            store_dir = move_to_storage(lane, bc_id, base_folder_name,
                                        upload_files, cur_galaxy_files,
                                        config, config_file)
            if store_dir and library_id:
                print "Uploading directory of files to Galaxy"
                print galaxy_api.upload_directory(library_id, folder['id'],
                                                  store_dir, dbkey, access_role)
    if galaxy_api and not run_info_yaml:
        add_run_summary_metrics(analysis_dir, galaxy_api)
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples,
                           {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, fc_dir, run_info_yaml):
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    # process each flowcell lane
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = _run_parallel("process_lane", lanes, dirs, config, config_file)
    # upload the demultiplex counts to Google Docs
    create_bc_report_on_gdocs(fc_date, fc_name, work_dir, run_info, config)
    align_items = _run_parallel("process_alignment", lane_items, dirs, config,
                                config_file)
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = \
        organize_samples(dirs, fc_name, fc_date, run_items, align_items)
    samples = ((n, sample_fastq[n], sample_info[n], bam_files, dirs, config,
                config_file) for n, bam_files in sample_files)
    sample_items = _run_parallel("process_sample", samples, dirs, config,
                                 config_file)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "work": work_dir,
            "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        for xs in pipeline.run(config, config_file, run_parallel, dirs,
                               pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    write_metrics(run_info, fc_name, fc_date, dirs)
def main(config_file, fc_dir, project_dir, run_info_yaml=None, fc_alias=None,
         project_desc=None, lanes=None):
    if project_desc is None and lanes is None:
        log.error("No project description or lanes provided: "
                  "cannot deliver files without this information")
        sys.exit()
    config = load_config(config_file)
    ## Set log file in project output directory
    config.update(log_dir=os.path.join(project_dir, "log"))
    log_handler = create_log_handler(config, log.name)
    fc_dir = os.path.normpath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    with log_handler.applicationbound():
        run_info = prune_run_info_by_description(run_info['details'],
                                                 project_desc, lanes)
    if len(run_info) == 0:
        log.error("No lanes found with matching description %s: "
                  "please check your flowcell run information" % project_desc)
        sys.exit()
    dirs = dict(fc_dir=fc_dir, project_dir=project_dir)
    fc_name, fc_date = get_flowcell_id(run_info, dirs['fc_dir'])
    config.update(fc_name=fc_name, fc_date=fc_date)
    config.update(fc_alias="%s_%s" % (fc_date, fc_name)
                  if not fc_alias else fc_alias)
    dirs.update(fc_delivery_dir=os.path.join(dirs['project_dir'],
                                             options.data_prefix,
                                             config['fc_alias']))
    dirs.update(data_delivery_dir=os.path.join(dirs['project_dir'],
                                               options.data_prefix,
                                               "%s_%s" % (fc_date, fc_name)))
    with log_handler.applicationbound():
        config = _make_delivery_directory(dirs, config)
        _save_run_info(run_info, dirs['fc_delivery_dir'],
                       run_exit=options.only_run_info)
        run_main(run_info, config, dirs)
def _get_cores_and_type(config, fc_dir, run_info_yaml,
                        numcores=None, paralleltype=None):
    """Return core count and parallelization approach from config and commandline.

    Commandline parameters take precedence over pre-configured values,
    defaulting to a local run on a single core. Passing values explicitly
    on the commandline is the preferred approach and helps maintain
    backwards compatibility.
    """
    config_cores = config["algorithm"].get("num_cores", None)
    if config_cores:
        try:
            config_cores = int(config_cores)
            if numcores is None:
                numcores = config_cores
        except ValueError:
            if paralleltype is None:
                paralleltype = config_cores
    if paralleltype is None:
        paralleltype = "local"
    if numcores is None:
        if config.get("distributed", {}).get("num_workers", "") == "all":
            cp = config["distributed"]["cluster_platform"]
            cluster = __import__("bcbio.distributed.{0}".format(cp),
                                 fromlist=[cp])
            numcores = cluster.available_nodes(
                config["distributed"]["platform_args"]) - 1
    if numcores is None:
        if paralleltype == "local":
            numcores = 1
        else:
            numcores = _needed_workers(
                get_run_info(fc_dir, config, run_info_yaml)[-1])
    return paralleltype, int(numcores)
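# Hedged usage examples for _get_cores_and_type above; the config dicts are
# invented for illustration, not taken from a real post_process.yaml.
# An integer num_cores sets the core count, and the type defaults to "local":
assert _get_cores_and_type({"algorithm": {"num_cores": 8}},
                           None, None) == ("local", 8)
# A non-integer num_cores selects the parallelization type instead, and an
# explicit commandline core count still takes precedence:
assert _get_cores_and_type({"algorithm": {"num_cores": "messaging"}},
                           None, None, numcores=4) == ("messaging", 4)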
def main(config_file, fc_dir, run_info_yaml=None, num_workers=None):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    assert config["algorithm"]["num_cores"] == "messaging", \
        "Use this script only with configured 'messaging' parallelization"
    if num_workers is None:
        num_workers = _needed_workers(get_run_info(fc_dir, config,
                                                   run_info_yaml)[-1])
    task_module = "bcbio.distributed.tasks"
    args = [config_file, fc_dir]
    if run_info_yaml:
        args.append(run_info_yaml)
    run_and_monitor(config, config_file, args, num_workers, task_module)
def test_run_info_combine(self):
    """Combine multiple lanes in a test run into a single combined lane.
    """
    run_info_yaml = os.path.join(self.data_dir, "run_info-alternatives.yaml")
    _, _, run_info = get_run_info("", {}, run_info_yaml)
    assert len(run_info["details"]) == 2
    assert len(run_info["details"][0]) == 3
    x1, x2, x3 = run_info["details"][0]
    assert x1["description"] == "1: BC1"
    assert x2["description"] == "1: BC2"
    assert x3["genome_build"] == "mm9"
    x1 = run_info["details"][1][0]
    assert x1["barcode_id"] is None
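# For reference, a hedged sketch of the parsed structure those assertions
# imply; only the asserted keys are shown, and the real entries produced by
# get_run_info carry additional fields.
_expected_details_shape = [
    [{"description": "1: BC1"},   # first combined lane: three multiplexed entries
     {"description": "1: BC2"},
     {"genome_build": "mm9"}],
    [{"barcode_id": None}],       # second combined lane: first entry is unbarcoded
]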
def main(config_file, fc_dir, run_info_yaml=None):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    assert config["algorithm"]["num_cores"] == "messaging", \
        "This script is used with configured 'messaging' parallelization"
    cluster = globals()[config["distributed"]["cluster_platform"]]
    workers_needed = _needed_workers(get_run_info(fc_dir, config,
                                                  run_info_yaml)[-1])
    print "Starting cluster workers"
    jobids = start_workers(cluster, workers_needed, config, config_file)
    try:
        print "Running analysis"
        run_analysis(config_file, fc_dir, run_info_yaml, cluster, config)
    finally:
        print "Cleaning up cluster workers"
        stop_workers(cluster, jobids)
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "work": work_dir,
            "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def main(config_file, fc_dir, run_info_yaml=None, num_workers=None):
    config = load_config(config_file)
    assert config["algorithm"]["num_cores"] == "messaging", \
        "Use this script only with configured 'messaging' parallelization"
    if num_workers is None:
        if config["distributed"].get("num_workers", "") == "all":
            cp = config["distributed"]["cluster_platform"]
            cluster = __import__("bcbio.distributed.{0}".format(cp),
                                 fromlist=[cp])
            num_workers = cluster.available_nodes(
                config["distributed"]["platform_args"]) - 1
        if num_workers is None:
            num_workers = _needed_workers(get_run_info(fc_dir, config,
                                                       run_info_yaml)[-1])
    task_module = "bcbio.distributed.tasks"
    args = [config_file, fc_dir]
    if run_info_yaml:
        args.append(run_info_yaml)
    run_and_monitor(config, config_file, args, num_workers, task_module)
def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    assert config["algorithm"]["num_cores"] == "messaging", \
        "This script is used with configured 'messaging' parallelization"
    cluster = globals()[config["distributed"]["cluster_platform"]]
    workers_needed = _needed_workers(get_run_info(fc_dir, config,
                                                  run_info_yaml)[-1])
    jobids = []
    try:
        print "Starting manager"
        manager_id = start_analysis_manager(config_file, fc_dir, run_info_yaml,
                                            cluster, config)
        print "Starting cluster workers"
        jobids.extend(start_workers(cluster, workers_needed, config,
                                    config_file))
        jobids.append(manager_id)
        print "Running analysis"
        monitor_analysis(cluster, manager_id)
    finally:
        print "Cleaning up cluster workers"
        stop_workers(cluster, jobids)
def main(config_file, fc_dir, project_dir, run_info_yaml=None, fc_alias=None,
         project_desc=None, lanes=None, barcodes=None):
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(project_dir, "log")
    setup_logging(config)
    if project_desc is None and lanes is None:
        logger.error("No project description or lanes provided: "
                     "cannot deliver files without this information")
        sys.exit()
    if options.customer_delivery and not fc_alias == "":
        logger.info("INFO: Ignoring flowcell_alias when doing customer_delivery")
    fc_dir = os.path.abspath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fp = open(run_info_yaml)
    run_info_structure = yaml.load(fp)
    original_fc = PostProcessedFlowcell(fc_name, fc_date, run_info_structure,
                                        fc_dir=fc_dir, fc_results_dir=fc_dir)
    pruned_fc = original_fc.prune_to_project(project_desc,
                                             exclude_unmatched=True)
    if pruned_fc is None or len(pruned_fc.get_lanes()) == 0:
        if not project_desc is None:
            logger.error("No lanes found with matching description %s: "
                         "please check your flowcell run information"
                         % project_desc)
            print >> sys.stderr, "Available projects: \n\t%s" % \
                ("\n\t".join(original_fc.get_project_names()))
            sys.exit()
        if not lanes is None:
            logger.error("No lanes found with numbers %s: please check your "
                         "flowcell run information" % " ".join(lanes))
            sys.exit()
    # Set up a raw data flowcell that contains the delivery information
    # for raw data (demuxed fastq data)
    rawdata_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(),
                                       pruned_fc.get_fc_date(),
                                       pruned_fc.to_structure()['details'],
                                       fc_alias=fc_alias)
    rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(
        project_dir, "nobackup/data", rawdata_fc.get_fc_id())))
    analysis_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(),
                                        pruned_fc.get_fc_date(),
                                        pruned_fc.to_structure()['details'],
                                        fc_alias=fc_alias)
    analysis_fc.set_fc_dir(os.path.abspath(os.path.join(
        project_dir, "nobackup/intermediate", rawdata_fc.get_fc_id())))
    # If doing customer delivery, set up some special options
    if options.customer_delivery:
        rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(
            project_dir, project_desc, rawdata_fc.get_fc_id())))
        rawdata_fc.set_fc_alias(rawdata_fc.get_fc_id())
        analysis_fc = rawdata_fc
    _make_delivery_directory(rawdata_fc)
    _make_delivery_directory(analysis_fc)
    run_main(pruned_fc, rawdata_fc, analysis_fc)
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "work": work_dir,
            "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, dirs,
                               pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
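# A minimal sketch of the pipeline contract the loop above relies on: each key
# returned by _pair_lanes_with_pipelines needs a run() method with this shape,
# yielding one-element lists of finished samples for upload. This stub is
# inferred from the call site only and is not the real bcbio pipeline class.
class _PassThroughPipeline:
    def run(self, config, config_file, run_parallel, dirs, items):
        # Yield each item wrapped in a one-element list, matching the
        # assert len(xs) == 1 check at the call site.
        for item in items:
            yield [item]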
def run_main(config, config_file, fc_dir, run_info_yaml):
    # Working directory has to be identical to where (demultiplexed)
    # fastq files are located
    fc_dir = os.path.normpath(fc_dir)
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")
    #(_, fastq_dir_label) = os.path.split(work_dir)
    #fastq_dir = os.path.join(project_dir, fastq_dir_label)
    #fc_name, fc_date = get_flowcell_info(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    #run_info = _get_run_info(fc_name, fc_date, config, run_info_yaml)
    #fastq_dir, galaxy_dir, config_dir = _get_full_paths(fastq_dir, config, config_file)
    galaxy_dir, config_dir = _get_full_paths(config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    # fastq = fastq_dir,
    dirs = dict(galaxy=galaxy_dir, align=align_dir, work=work_dir,
                config=config_dir, flowcell=fc_dir, fc_dir=fc_dir)
    # Since demultiplexing is already done, just extract run_items
    run_items = run_info['details']
    lane_items = []
    for info in run_items:
        print info
        lane_items.extend(make_lane_items(info, fc_date, fc_name, dirs, config))
    _run_parallel("process_alignment", lane_items, dirs, config)
    # Process samples
    sample_files, sample_fastq, sample_info = \
        organize_samples(dirs, fc_name, fc_date, run_items)
    samples = ((n, sample_fastq[n], sample_info[n], bam_files, dirs, config,
                config_file) for n, bam_files in sample_files)
    _run_parallel("process_sample", samples, dirs, config)
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    _record_sw_versions(config, os.path.join(work_dir,
                                             "bcbb_software_versions.txt"))
    prog = RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    prog.dummy()
    prog.progress("process_lane")

    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    prog.dummy()
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    _add_to_compress(to_compress, align_items, 'align_items')
    prog.dummy()
    prog.progress("process_alignment")

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.dummy()
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples,
                           {"programs": ["ucsc_bigwig"]})
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.dummy()
    prog.progress("write_metrics")

    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        sizes = run_parallel("compress_files", [[[cf]] for cf in to_compress])
        before = sum([s[0] for s in sizes])
        after = sum([s[1] for s in sizes])
        logger.info("Space used by the files before compressing (in bytes): "
                    + str(before))
        logger.info("Space used by the files after compressing (in bytes): "
                    + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    _record_sw_versions(config, os.path.join(work_dir,
                                             "bcbb_software_versions.txt"))
    prog = utils.RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    [to_compress.add(f) for f in lane_items[0][0:2]]
    prog.progress("process_lane")

    # upload the sequencing report to Google Docs
    # will skip this for now and rely on external mechanism for uploading this data
    #gdocs_indicator = os.path.join(work_dir, "gdocs_report_complete.txt")
    #if not os.path.exists(gdocs_indicator) \
    #and queue_report(fc_date, fc_name, os.path.abspath(run_info_yaml), dirs, config, config_file):
    #    utils.touch_file(gdocs_indicator)

    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    [to_compress.add(f) for f in lane_items[0][0:2]]
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    [to_compress.add(f) for f in align_items[0]['fastq']]
    prog.progress("process_alignment")

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples,
                           {"programs": ["ucsc_bigwig"]})
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.progress("write_metrics")

    # Write statusdb metrics
    # will skip this for now and rely on external mechanism for uploading this data
    #report_to_statusdb(fc_name, fc_date, run_info_yaml, dirs, config)

    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        (before, after) = utils.compress_files(to_compress)
        logger.info("Space used by the files before compressing (in bytes): "
                    + str(before))
        logger.info("Space used by the files after compressing (in bytes): "
                    + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))