def get_runinfo(galaxy_url, galaxy_apikey, run_folder, storedir):
    """Retrieve flattened run information for a processed directory from Galaxy nglims API.

    Returns the API error dictionary unchanged when the query fails, otherwise a
    list of per-lane sample dictionaries, each augmented with an ``upload``
    section directing results back to the originating Galaxy instance.
    """
    galaxy_api = GalaxyApiAccess(galaxy_url, galaxy_apikey)
    fc_name, fc_date = flowcell.parse_dirname(run_folder)
    galaxy_info = galaxy_api.run_details(fc_name, fc_date)
    if "error" in galaxy_info:
        # Propagate API-reported errors directly to the caller.
        return galaxy_info
    # Sanity check: the run name recorded in Galaxy must correspond to the
    # flowcell directory being processed (date prefix or flowcell-name suffix).
    if not galaxy_info["run_name"].startswith(fc_date) and not galaxy_info["run_name"].endswith(fc_name):
        raise ValueError("Galaxy NGLIMS information %s does not match flowcell %s %s" %
                         (galaxy_info["run_name"], fc_date, fc_name))
    ldetails = _flatten_lane_details(galaxy_info)
    out = []
    for item in ldetails:
        # Do uploads for all non-controls
        # NOTE(review): with `or` this skips an item only when BOTH description
        # and project_name equal "control" -- confirm that is the intent.
        if item["description"] != "control" or item["project_name"] != "control":
            item["upload"] = {"method": "galaxy", "run_id": galaxy_info["run_id"],
                              "fc_name": fc_name, "fc_date": fc_date,
                              "dir": storedir,
                              "galaxy_url": galaxy_url, "galaxy_api_key": galaxy_apikey}
            # Move LIMS bookkeeping keys from the item into its upload section.
            for k in ["lab_association", "private_libs", "researcher", "researcher_id",
                      "sample_id", "galaxy_library", "galaxy_role"]:
                item["upload"][k] = item.pop(k, "")
            out.append(item)
    return out
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = flowcell.parse_dirname(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        details = []
        for cur in galaxy_info["details"]:
            # Direct results back to the Galaxy instance they came from.
            cur["upload"] = {"method": "galaxy",
                             "run_id": galaxy_info["run_id"],
                             "fc_name": fc_name,
                             "fc_date": fc_date}
            details.append(cur)
    organized = []
    for cur in details:
        # add algorithm details to configuration, avoid double specification
        cur["config"] = config_utils.update_w_custom(config, cur)
        cur.pop("algorithm", None)
        cur["dirs"] = dirs
        if "name" not in cur:
            cur["name"] = ["", cur["description"]]
        organized.append(add_reference_resources(cur))
    return organized
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    Normalizes each sample entry (lane, description, upload target, algorithm
    settings, read-group names) and returns the list of prepared items.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Directory name is not a parseable flowcell; fall back to YAML values.
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            # Explicit YAML values override directory-derived flowcell info.
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if "lane" not in item:
            # Default lane numbers to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Derive the sample name from the BAM header when not given.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        # Convert relative paths in algorithm settings to absolute ones,
        # skipping keys known not to be file paths.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    Normalizes each sample entry (lane, description, upload target, algorithm
    settings, read-group names) and returns the list of prepared items.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Directory name is not a parseable flowcell; fall back to YAML values.
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            # Explicit YAML values override directory-derived flowcell info.
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if "lane" not in item:
            # Default lane numbers to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Derive the sample name from the BAM header when not given.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        # Convert relative paths in algorithm settings to absolute ones,
        # skipping keys known not to be file paths.
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _write_sample_config(run_folder, ldetails):
    """Generate a bcbio-nextgen YAML configuration file for processing a sample.

    Writes ``<run_folder>/<basename>.yaml`` and returns its path.
    """
    cfg_file = os.path.join(run_folder, "%s.yaml" % os.path.basename(run_folder))
    with open(cfg_file, "w") as out_handle:
        fc_name, fc_date = flowcell.parse_dirname(run_folder)
        samples = [_prepare_sample(x, run_folder) for x in ldetails]
        samples.sort(key=operator.itemgetter("name", "description"))
        cfg = {"details": samples,
               "fc_name": fc_name,
               "fc_date": fc_date}
        yaml.safe_dump(cfg, out_handle, default_flow_style=False, allow_unicode=False)
    return cfg_file
def main(config_file, fc_dir):
    """Process each lane of a flowcell, in parallel when configured.

    Loads processing configuration, fetches run details from the Galaxy API
    and dispatches ``_process_wrapper`` over every lane; ``algorithm.num_cores``
    greater than one enables a multiprocessing pool.
    """
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])
    fc_name, fc_date = flowcell.parse_dirname(fc_dir)
    # NOTE(review): other callers pass (fc_name, fc_date) to run_details;
    # confirm the single-argument form is intended here.
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = flowcell.get_fastq_dir(fc_dir)
    # Build the per-lane argument tuples once, shared by both branches.
    lane_args = ((i, fastq_dir, fc_name, fc_date, config, config_file)
                 for i in run_info["details"])
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper, lane_args)
        except:
            # Bare except is deliberate: terminate workers on ANY failure
            # (including KeyboardInterrupt), then re-raise.
            pool.terminate()
            raise
    else:
        map(_process_wrapper, lane_args)
def main(config_file, fc_dir):
    """Process each lane of a flowcell, in parallel when configured.

    Loads processing configuration, fetches run details from the Galaxy API
    and dispatches ``_process_wrapper`` over every lane; ``algorithm.num_cores``
    greater than one enables a multiprocessing pool.
    """
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = flowcell.parse_dirname(fc_dir)
    # NOTE(review): other callers pass (fc_name, fc_date) to run_details;
    # confirm the single-argument form is intended here.
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = flowcell.get_fastq_dir(fc_dir)
    # Build the per-lane argument tuples once, shared by both branches.
    lane_args = ((i, fastq_dir, fc_name, fc_date, config, config_file)
                 for i in run_info["details"])
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper, lane_args)
        except:
            # Bare except is deliberate: terminate workers on ANY failure
            # (including KeyboardInterrupt), then re-raise.
            pool.terminate()
            raise
    else:
        map(_process_wrapper, lane_args)
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Checks if there's a suitable SampleSheet.csv present for the run.

    Scans every configured samplesheet directory for CSV files, maps each
    flowcell id found inside them to its sheet path, then fuzzy-matches the
    run's flowcell name against those ids. Returns the matching sheet path,
    or None when no close-enough candidate exists.
    """
    fc_name, _ = flowcell.parse_dirname(fc_dir)
    sheet_dirs = config.get("samplesheet_directories", [])
    fcid_sheet = {}
    for ss_dir in (s for s in sheet_dirs if os.path.exists(s)):
        with utils.chdir(ss_dir):
            for ss in glob.glob("*.csv"):
                fc_ids = _get_flowcell_id(ss, require_single)
                for fcid in fc_ids:
                    if fcid:
                        fcid_sheet[fcid] = os.path.join(ss_dir, ss)
    # difflib handles human errors while entering data on the SampleSheet.
    # Only one best candidate is returned (if any). 0.85 cutoff allows for
    # maximum of 2 mismatches in fcid
    potential_fcids = difflib.get_close_matches(fc_name, fcid_sheet.keys(), 1, 0.85)
    # Matches are drawn from fcid_sheet's own keys, so any returned candidate
    # is guaranteed to be present; no separate membership check needed
    # (the old dict.has_key call was also removed in Python 3).
    if potential_fcids:
        return fcid_sheet[potential_fcids[0]]
    return None
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None, integrations=None):
    """Read run information from a passed YAML file.

    Validates and loads the sample YAML, normalizes each entry (lane,
    description, upload target, algorithm defaults, read-group names,
    variant inputs) and merges global resource and integration settings.
    """
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Directory name is not a parseable flowcell; fall back to YAML values.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        # Restrict processing to an explicit subset of samples.
        loaded = [x for x in loaded if x["description"] in sample_names]
    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                loaded = retriever.add_remotes(loaded, config[iname])
    # NOTE(review): the default integrations=None would crash below at
    # integrations.values(); callers presumably always pass a dict -- confirm.
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            # Default lane numbers to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Derive the sample name from the BAM header when not given.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        # Convert relative paths to absolute; only download remote inputs when
        # no integration retriever will handle them lazily.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=all(not x for x in integrations.values()))
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(
                    f, do_download=all(not x for x in integrations.values()))
                for f in item["files"]
            ]
        elif "files" in item:
            # Drop empty/None file entries entirely.
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            # Pre-supplied variant file: bgzip/tabix it into the inputs directory.
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs", item["description"]))
            item["vrn_file"] = vcfutils.bgzip_and_index(genome.abs_file_paths(
                item["vrn_file"], do_download=all(not x for x in integrations.values())),
                config, remove_orig=False, out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Copy per-integration global settings onto each sample item.
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None, is_cwl=False,
                        integrations=None):
    """Read run information from a passed YAML file.

    Validates and loads the sample YAML, normalizes each entry (lane,
    description with R-safe naming, upload target, algorithm defaults,
    read-group names, variant inputs) and merges global resource and
    integration settings.
    """
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Directory name is not a parseable flowcell; fall back to YAML values.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        # Restrict processing to an explicit subset of samples.
        loaded = [x for x in loaded if x["description"] in sample_names]
    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                config[iname] = retriever.set_cache(config[iname])
                loaded = retriever.add_remotes(loaded, config[iname])
    # NOTE(review): the default integrations=None would crash below at
    # integrations.values(); callers presumably always pass a dict -- confirm.
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            # Default lane numbers to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Derive the sample name from the BAM header when not given.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        description = _clean_characters(str(item["description"]))
        item["description"] = description
        # make names R safe if we are likely to use R downstream
        if item["analysis"].lower() in R_DOWNSTREAM_ANALYSIS:
            if description[0].isdigit():
                # R identifiers cannot start with a digit; prefix with "X".
                valid = "X" + description
                logger.info("%s is not a valid R name, converting to %s." % (description, valid))
                item["description"] = valid
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if not upload:
                upload["dir"] = "../final"
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        # Convert relative paths to absolute; only download remote inputs when
        # no integration retriever will handle them lazily.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=all(not x for x in integrations.values()))
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"], item.get("analysis", ""), is_cwl)
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(
                    f, do_download=all(not x for x in integrations.values()))
                for f in item["files"]
            ]
        elif "files" in item:
            # Drop empty/None file entries entirely.
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            # Pre-supplied variant file: ensure it is bgzipped and indexed.
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs", item["description"]))
            item["vrn_file"] = genome.abs_file_paths(
                item["vrn_file"], do_download=all(not x for x in integrations.values()))
            if os.path.isfile(item["vrn_file"]):
                # Try to prepare in place (or use ready to go inputs)
                try:
                    item["vrn_file"] = vcfutils.bgzip_and_index(
                        item["vrn_file"], config, remove_orig=False)
                # In case of permission errors, fix in inputs directory
                except IOError:
                    item["vrn_file"] = vcfutils.bgzip_and_index(
                        item["vrn_file"], config, remove_orig=False, out_dir=inputs_dir)
            if not tz.get_in(("metadata", "batch"), item) and tz.get_in(
                    ["algorithm", "validate"], item):
                raise ValueError(
                    "%s: Please specify a metadata batch for variant file (vrn_file) input.\n" % (item["description"]) +
                    "Batching with a standard sample provides callable regions for validation.")
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        item = _clean_background(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Copy per-integration global settings onto each sample item.
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    Normalizes each sample entry (lane, description, upload target, algorithm
    defaults, read-group names, variant inputs) and merges global resource
    settings before returning the prepared list.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Directory name is not a parseable flowcell; fall back to YAML values.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            # Explicit YAML values override directory-derived flowcell info.
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        loaded = loaded["details"]
    if sample_names:
        # Restrict processing to an explicit subset of samples.
        loaded = [x for x in loaded if x["description"] in sample_names]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            # Default lane numbers to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Derive the sample name from the BAM header when not given.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        # Convert relative paths in algorithm settings to absolute ones,
        # skipping keys known not to be file paths.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            # Drop empty/None file entries entirely.
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            # Pre-supplied variant file: bgzip/tabix it for downstream use.
            item["vrn_file"] = vcfutils.bgzip_and_index(
                genome.abs_file_paths(item["vrn_file"]), config)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        # NOTE(review): unlike sibling versions there is no `pkvs is not None`
        # guard; a resources entry with an empty value would raise
        # AttributeError on iteritems -- confirm inputs never contain that.
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.iteritems():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            for key, val in pkvs.iteritems():
                item["resources"][prog][key] = val
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details