def _prep_bam_input(f, i, base):
    """Build a run item dictionary for one BAM input file.

    f -- path to the BAM file; i -- zero-based input index (unused here,
    kept for interface compatibility); base -- template dictionary that is
    deep-copied for the new item. Raises ValueError when f does not exist.
    """
    if not os.path.exists(f):
        raise ValueError("Could not find input file: %s" % f)
    item = copy.deepcopy(base)
    item["files"] = [os.path.abspath(f)]
    # Prefer the sample name extracted from the BAM; fall back to the
    # file name without its extension.
    fallback_name = os.path.splitext(os.path.basename(f))[0]
    item["description"] = get_sample_name(f) or fallback_name
    return item
def _prep_bam_input(f, i, base):
    """Build a run item dictionary for one BAM input file.

    f -- path to the BAM file; i -- zero-based input index used for the
    fallback sample name; base -- template dictionary deep-copied for the
    new item. Raises ValueError when f does not exist.
    """
    if not os.path.exists(f):
        raise ValueError("Could not find input file: %s" % f)
    cur = copy.deepcopy(base)
    # Store files as a list: downstream consumers index item["files"][0]
    # (see _run_info_from_yaml); a bare string would be indexed by character.
    cur["files"] = [os.path.abspath(f)]
    cur["description"] = get_sample_name(f) or "Sample%s" % (i + 1)
    return cur
def _prep_bam_input(f, base):
    """Build a run item dictionary for one BAM input file.

    f -- path to the BAM file; base -- template dictionary deep-copied for
    the new item. Raises ValueError when f does not exist.
    """
    if not os.path.exists(f):
        raise ValueError("Could not find input file: %s" % f)
    cur = copy.deepcopy(base)
    # Store files as a list: downstream consumers index item["files"][0]
    # (see _run_info_from_yaml); a bare string would be indexed by character.
    cur["files"] = [os.path.abspath(f)]
    # NOTE(review): no fallback when get_sample_name returns None/empty, so
    # description may be None here — sibling versions fall back to the file
    # basename; confirm callers tolerate a missing name.
    cur["description"] = get_sample_name(f)
    return cur
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    fc_dir -- flowcell directory used to derive run name/date; run_info_yaml
    -- sample configuration YAML; config -- global configuration passed to
    read-group preparation. Returns a list of per-sample run detail
    dictionaries. Raises ValueError when an input lacks a `description` and
    is not a BAM file a name can be derived from.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; consider yaml.safe_load.
        loaded = yaml.load(in_handle)
    # Flowcell name/date from the directory name, when it parses.
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Directory does not follow flowcell naming; values stay None
            # unless the YAML provides them below.
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit fc_name/fc_date in the YAML override directory-derived values.
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Default lane to the 1-based input position.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Presumably derives the sample name from the BAM header —
                # see get_sample_name.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        # Resolve global variable references, then absolutize algorithm file
        # paths, skipping non-path keys.
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    fc_dir -- flowcell directory used to derive run name/date; run_info_yaml
    -- sample configuration YAML; config -- global configuration passed to
    read-group preparation. Returns a list of per-sample run detail
    dictionaries. Raises ValueError when an input lacks a `description` and
    is not a BAM file a name can be derived from.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; consider yaml.safe_load.
        loaded = yaml.load(in_handle)
    # Flowcell name/date from the directory name, when it parses.
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit fc_name/fc_date in the YAML override directory-derived values.
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Default lane to the 1-based input position.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Presumably derives the sample name from the BAM header.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        # Resolve global variable references, then absolutize algorithm file
        # paths, skipping non-path keys.
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    fc_dir -- flowcell directory used to derive run name/date; run_info_yaml
    -- sample configuration YAML; config -- global configuration passed to
    read-group preparation. Returns a list of per-sample run detail
    dictionaries. Raises ValueError when an input lacks a `description` and
    is not a single-BAM input a name can be derived from.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; consider yaml.safe_load.
        loaded = yaml.load(in_handle)
    # Flowcell name/date from the directory name, when it parses.
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Use the `in` operator instead of the deprecated dict.has_key
        # (removed in Python 3; sibling versions already use `in`).
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Default lane to the 1-based input position.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if len(item.get("files", [])) == 1 and item["files"][0].endswith(".bam"):
                # Single BAM input: derive the sample name from the file.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        # Absolutize algorithm file paths, skipping non-path keys.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=["variantcaller", "realign", "recalibrate",
                         "phasing", "svcaller"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    fc_dir -- flowcell directory used to derive run name/date; run_info_yaml
    -- sample configuration YAML; config -- global configuration passed to
    read-group preparation. Returns a list of per-sample run detail
    dictionaries. Raises ValueError when an input lacks a `description` and
    is not a single-BAM input a name can be derived from.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; consider yaml.safe_load.
        loaded = yaml.load(in_handle)
    # Flowcell name/date from the directory name, when it parses.
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass
    global_config = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit fc_name/fc_date in the YAML override directory-derived values.
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Default lane to the 1-based input position.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if len(item.get("files", [])) == 1 and item["files"][0].endswith(".bam"):
                # Single BAM input: derive the sample name from the file.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        # Handle specifying a local directory directly in upload
        if isinstance(upload, basestring):
            upload = {"dir": upload}
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        # Absolutize algorithm file paths, skipping non-path keys.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=["variantcaller", "realign", "recalibrate", "phasing", "svcaller"]
        )
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    dirs -- dictionary of directories (reads "flowcell" and "work");
    run_info_yaml -- sample configuration YAML; config -- global
    configuration; sample_names -- optional list restricting processing to
    matching descriptions. Returns a list of per-sample run detail
    dictionaries. Raises ValueError when an input lacks a `description` and
    is not a BAM file a name can be derived from.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; consider yaml.safe_load.
        loaded = yaml.load(in_handle)
    # Flowcell name/date from the directory name, when it parses.
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit fc_name/fc_date in the YAML override directory-derived values.
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        loaded = loaded["details"]
    # Restrict to requested samples. NOTE(review): this filters on the raw
    # description, before _clean_characters normalization below.
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Default lane to the 1-based input position.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Presumably derives the sample name from the BAM header.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        # Resolve global variable references, then absolutize algorithm file
        # paths, skipping non-path keys.
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        # Absolutize input file paths; drop an empty files entry entirely.
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            del item["files"]
        # Pre-supplied variant file: bgzip/tabix it into the inputs directory.
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(os.path.join(dirs.get("work", os.getcwd()), "inputs"))
            item["vrn_file"] = vcfutils.bgzip_and_index(genome.abs_file_paths(item["vrn_file"]),
                                                        config, remove_orig=False,
                                                        out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        # NOTE(review): no None guard on pkvs — a null resource entry in the
        # YAML would raise AttributeError here; later versions guard this.
        for prog, pkvs in resources.iteritems():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            for key, val in pkvs.iteritems():
                item["resources"][prog][key] = val
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None, integrations=None):
    """Read run information from a passed YAML file.

    dirs -- dictionary of directories (reads "flowcell" and "work");
    run_info_yaml -- sample configuration YAML; config -- global
    configuration; sample_names -- optional list restricting processing to
    matching descriptions; integrations -- optional mapping of integration
    name to a remote retriever object. Returns a list of per-sample run
    detail dictionaries. Raises ValueError when an input lacks a
    `description` and is not a BAM file a name can be derived from.
    """
    # Bug fix: the body unconditionally calls integrations.values(), which
    # raised AttributeError when called with the default integrations=None.
    if integrations is None:
        integrations = {}
    # Only download remote paths when no remote integrations are active;
    # hoisted out of the loop since integrations is not mutated below.
    do_download = all(not x for x in integrations.values())
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; consider yaml.safe_load.
        loaded = yaml.load(in_handle)
    # Flowcell name/date from the directory name, when it parses.
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit fc_name/fc_date in the YAML override directory-derived values.
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    # Restrict to requested samples (filters on the raw description).
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]
    # Let configured integrations add remote inputs to the sample list.
    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                loaded = retriever.add_remotes(loaded, config[iname])
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Default lane to the 1-based input position.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        # Resolve global variable references, then absolutize algorithm file
        # paths, skipping non-path keys.
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=do_download)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        # Absolutize input file paths; drop an empty files entry entirely.
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f, do_download=do_download)
                             for f in item["files"]]
        elif "files" in item:
            del item["files"]
        # Pre-supplied variant file: bgzip/tabix it into a per-sample inputs
        # directory.
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(os.path.join(dirs.get("work", os.getcwd()), "inputs",
                                                         item["description"]))
            item["vrn_file"] = vcfutils.bgzip_and_index(
                genome.abs_file_paths(item["vrn_file"], do_download=do_download),
                config, remove_orig=False, out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Merge per-integration configuration into each sample.
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None, is_cwl=False,
                        integrations=None):
    """Read run information from a passed YAML file.

    dirs -- dictionary of directories (reads "flowcell" and "work");
    run_info_yaml -- sample configuration YAML; config -- global
    configuration; sample_names -- optional list restricting processing to
    matching descriptions; is_cwl -- whether running under CWL (passed to
    algorithm defaulting); integrations -- optional mapping of integration
    name to a remote retriever object. Returns a list of per-sample run
    detail dictionaries. Raises ValueError on a missing `description` or a
    vrn_file input without a metadata batch when validation is requested.
    """
    # Bug fix: the body unconditionally calls integrations.values(), which
    # raised AttributeError when called with the default integrations=None.
    if integrations is None:
        integrations = {}
    # Only download remote paths when no remote integrations are active;
    # hoisted out of the loop since integrations is not mutated below.
    do_download = all(not x for x in integrations.values())
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; consider yaml.safe_load.
        loaded = yaml.load(in_handle)
    # Flowcell name/date from the directory name, when it parses.
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit fc_name/fc_date in the YAML override directory-derived values.
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    # Restrict to requested samples (filters on the raw description).
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]
    # Let configured integrations cache their configuration and add remote
    # inputs to the sample list.
    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                config[iname] = retriever.set_cache(config[iname])
                loaded = retriever.add_remotes(loaded, config[iname])
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Default lane to the 1-based input position.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        description = _clean_characters(str(item["description"]))
        item["description"] = description
        # make names R safe if we are likely to use R downstream
        if item["analysis"].lower() in R_DOWNSTREAM_ANALYSIS:
            if description[0].isdigit():
                valid = "X" + description
                logger.info("%s is not a valid R name, converting to %s." % (description, valid))
                item["description"] = valid
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if not upload:
                upload["dir"] = "../final"
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        # Resolve global variable references, then absolutize algorithm file
        # paths, skipping non-path keys.
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=do_download)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"], item.get("analysis", ""), is_cwl)
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        # Absolutize input file paths; drop an empty files entry entirely.
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f, do_download=do_download)
                             for f in item["files"]]
        elif "files" in item:
            del item["files"]
        # Pre-supplied variant file: bgzip/tabix it, preferring in-place
        # preparation with a fallback to the per-sample inputs directory.
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(os.path.join(dirs.get("work", os.getcwd()), "inputs",
                                                         item["description"]))
            item["vrn_file"] = genome.abs_file_paths(item["vrn_file"], do_download=do_download)
            if os.path.isfile(item["vrn_file"]):
                # Try to prepare in place (or use ready to go inputs)
                try:
                    item["vrn_file"] = vcfutils.bgzip_and_index(item["vrn_file"], config,
                                                                remove_orig=False)
                # In case of permission errors, fix in inputs directory
                except IOError:
                    item["vrn_file"] = vcfutils.bgzip_and_index(item["vrn_file"], config,
                                                                remove_orig=False,
                                                                out_dir=inputs_dir)
            if not tz.get_in(("metadata", "batch"), item) and tz.get_in(["algorithm", "validate"], item):
                raise ValueError(
                    "%s: Please specify a metadata batch for variant file (vrn_file) input.\n" % (item["description"]) +
                    "Batching with a standard sample provides callable regions for validation.")
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        item = _clean_background(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Merge per-integration configuration into each sample.
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    dirs -- dictionary of directories (reads "flowcell" and "work");
    run_info_yaml -- sample configuration YAML; config -- global
    configuration; sample_names -- optional list restricting processing to
    matching descriptions. Returns a list of per-sample run detail
    dictionaries. Raises ValueError when an input lacks a `description` and
    is not a BAM file a name can be derived from.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; consider yaml.safe_load.
        loaded = yaml.load(in_handle)
    # Flowcell name/date from the directory name, when it parses.
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        # Top-level dict form: global settings plus a "details" sample list.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit fc_name/fc_date in the YAML override directory-derived values.
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        loaded = loaded["details"]
    # Restrict to requested samples (filters on the raw description).
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Default lane to the 1-based input position.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                # Presumably derives the sample name from the BAM header.
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        # Resolve global variable references, then absolutize algorithm file
        # paths, skipping non-path keys.
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        # Absolutize input file paths; drop an empty files entry entirely.
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            del item["files"]
        # Pre-supplied variant file: bgzip/tabix it for downstream use.
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            item["vrn_file"] = vcfutils.bgzip_and_index(
                genome.abs_file_paths(item["vrn_file"]), config)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        # NOTE(review): no None guard on pkvs — a null resource entry in the
        # YAML would raise AttributeError here; later versions guard this.
        for prog, pkvs in resources.iteritems():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            for key, val in pkvs.iteritems():
                item["resources"][prog][key] = val
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details