Ejemplo n.º 1
def organize(dirs, config, run_info_yaml, sample_names):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    sample_names is a list of samples to include from the overall file, for cases
    where we are running multiple pipelines from the same configuration file.

    Returns the list of prepared sample dictionaries after passing it through
    ``_add_provenance``. Raises AssertionError when ``run_info_yaml`` is empty
    or does not exist on disk.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names)
    out = []
    for item in run_details:
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            # Expand a plain string name into [name, "name-description"].
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        # add algorithm details to configuration, avoid double specification
        item["resources"] = _add_remote_resources(item["resources"])
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item = add_reference_resources(item)
        # Create temporary directories and make absolute, expanding environmental variables
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
        if tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
            item["config"]["resources"]["tmp"]["dir"] = genome.abs_file_paths(tmp_dir)
        # Collect the prepared sample; without this the result is always empty.
        out.append(item)
    out = _add_provenance(out, dirs, config)
    return out
Ejemplo n.º 2
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    Returns the list of prepared sample dictionaries. Raises AssertionError
    when ``run_info_yaml`` is empty or does not exist on disk.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            # Expand a plain string name into [name, "name-description"].
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        item = add_reference_resources(item)
        # Create temporary directories and make absolute
        tmp_dir = utils.get_in(item, ("config", "resources", "tmp", "dir"))
        if tmp_dir:
            utils.safe_makedir(tmp_dir)
            item["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(item, ("config", "resources", "tmp")))
        # Collect the prepared sample; without this the result is always empty.
        out.append(item)
    return out
Ejemplo n.º 3
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    Returns the list of prepared sample dictionaries. Raises AssertionError
    when ``run_info_yaml`` is empty or does not exist on disk.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            # Expand a plain string name into [name, "name-description"].
            description = "%s-%s" % (item["name"],
                                     clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        item = add_reference_resources(item)
        # Create temporary directories and make absolute
        tmp_dir = utils.get_in(item, ("config", "resources", "tmp", "dir"))
        if tmp_dir:
            utils.safe_makedir(tmp_dir)
            item["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(item, ("config", "resources", "tmp")))
        # Collect the prepared sample; without this the result is always empty.
        out.append(item)
    return out
Ejemplo n.º 4
def organize(dirs, config, run_info_yaml, sample_names=None, is_cwl=False,
             integrations=None):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    sample_names is a list of samples to include from the overall file, for cases
    where we are running multiple pipelines from the same configuration file.

    integrations maps an integration name to a retriever object; retrievers
    whose name appears in ``config`` are asked to add remote inputs.
    Provenance is added only when ``is_cwl`` is false.

    Returns the list of prepared sample dictionaries. Raises AssertionError
    when ``run_info_yaml`` is empty or does not exist on disk.
    """
    # NOTE(review): the parameter list after `dirs` was truncated in the
    # original; names are taken from the body, order and defaults need
    # confirming against callers.
    from bcbio.pipeline import qcsummary
    if integrations is None: integrations = {}
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    # NOTE(review): arguments after `dirs` were truncated; confirm against the
    # _run_info_from_yaml signature in use.
    run_details = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names,
                                      is_cwl=is_cwl, integrations=integrations)
    remote_retriever = None
    for iname, retriever in integrations.items():
        if iname in config:
            run_details = retriever.add_remotes(run_details, config[iname])
            remote_retriever = retriever
    out = []
    for item in run_details:
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            # Expand a plain string name into [name, "name-description"].
            description = "%s-%s" % (item["name"],
                                     clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        # add algorithm details to configuration, avoid double specification
        item["resources"] = _add_remote_resources(item["resources"])
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item = add_reference_resources(item, remote_retriever)
        item["config"]["algorithm"]["qc"] = qcsummary.get_qc_tools(item)
        item["config"]["algorithm"]["vcfanno"] = vcfanno.find_annotations(item)
        # Create temporary directories and make absolute, expanding environmental variables
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
        if tmp_dir:
            # if no environmental variables, make and normalize the directory
            # otherwise we normalize later in distributed.transaction:
            if os.path.expandvars(tmp_dir) == tmp_dir:
                tmp_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
                tmp_dir = genome.abs_file_paths(tmp_dir,
                                                do_download=not integrations)
            item["config"]["resources"]["tmp"]["dir"] = tmp_dir
        # Collect the prepared sample; without this the result is always empty.
        out.append(item)
    out = _add_provenance(out, dirs, config, not is_cwl)
    return out
Ejemplo n.º 5
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    The YAML is either a list of sample items or a dictionary holding the
    samples under ``details`` plus global settings (``fc_name``, ``fc_date``,
    ``globals``, ``upload``, ``test_run``).

    Returns the list of normalized sample dictionaries. Raises ValueError when
    a sample has no ``description`` and is not a BAM input.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load if this file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Directory name could not be parsed; keep both values as None.
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if "lane" not in item:
            # Default the lane to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        # Collect the sample; without this nothing is checked or returned.
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Ejemplo n.º 6
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    The YAML is either a list of sample items or a dictionary holding the
    samples under ``details`` plus global settings (``fc_name``, ``fc_date``,
    ``globals``, ``upload``, ``test_run``).

    Returns the list of normalized sample dictionaries. Raises ValueError when
    a sample has no ``description`` and is not a BAM input.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load if this file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Directory name could not be parsed; keep both values as None.
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if "lane" not in item:
            # Default the lane to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Collect the sample; without this nothing is checked or returned.
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Ejemplo n.º 7
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    The YAML is either a list of sample items or a dictionary holding the
    samples under ``details`` plus global settings (``fc_name``, ``fc_date``,
    ``upload``).

    Returns the list of normalized sample dictionaries. Raises ValueError when
    a sample has no ``description`` and is not a single BAM file input.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load if this file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # Directory name could not be parsed; keep both values as None.
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if "lane" not in item:
            # Default the lane to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            files = item.get("files", [])
            if len(files) == 1 and files[0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        # These algorithm keys hold option names rather than file paths.
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=["variantcaller",
                                                               "realign", "recalibrate",
                                                               "phasing", "svcaller"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        # Collect the sample; without this nothing is checked or returned.
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Ejemplo n.º 8
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    The YAML is either a list of sample items or a dictionary holding the
    samples under ``details`` plus global settings (``fc_name``, ``fc_date``,
    ``upload``, ``test_run``).

    Returns the list of normalized sample dictionaries. Raises ValueError when
    a sample has no ``description`` and is not a single BAM file input.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load if this file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # Directory name could not be parsed; keep both values as None.
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        if "lane" not in item:
            # Default the lane to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if len(item.get("files", [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        # Handle specifying a local directory directly in upload
        if isinstance(upload, basestring):
            upload = {"dir": upload}
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        # These algorithm keys hold option names rather than file paths.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=["variantcaller", "realign", "recalibrate", "phasing", "svcaller"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        # Collect the sample; without this nothing is checked or returned.
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Ejemplo n.º 9
def organize(dirs, config, run_info_yaml, sample_names=None, add_provenance=True,
             integrations=None):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    sample_names is a list of samples to include from the overall file, for cases
    where we are running multiple pipelines from the same configuration file.

    integrations maps an integration name to a retriever object; retrievers
    whose name appears in ``config`` are asked to add remote inputs.
    add_provenance is forwarded to ``_add_provenance``.

    Returns the list of prepared sample dictionaries. Raises AssertionError
    when ``run_info_yaml`` is empty or does not exist on disk.
    """
    # NOTE(review): the end of the parameter list was truncated in the
    # original; `integrations=None` is inferred from the body.
    from bcbio.pipeline import qcsummary
    if integrations is None: integrations = {}
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    # NOTE(review): the trailing argument(s) of this call were truncated;
    # confirm against the _run_info_from_yaml signature in use.
    run_details = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names,
                                      integrations=integrations)
    remote_retriever = None
    for iname, retriever in integrations.items():
        if iname in config:
            run_details = retriever.add_remotes(run_details, config[iname])
            remote_retriever = retriever
    out = []
    for item in run_details:
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            # Expand a plain string name into [name, "name-description"].
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        # add algorithm details to configuration, avoid double specification
        item["resources"] = _add_remote_resources(item["resources"])
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item = add_reference_resources(item, remote_retriever)
        item["config"]["algorithm"]["qc"] = qcsummary.get_qc_tools(item)
        # Create temporary directories and make absolute, expanding environmental variables
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
        if tmp_dir:
            # if no environmental variables, make and normalize the directory
            # otherwise we normalize later in distributed.transaction:
            if os.path.expandvars(tmp_dir) == tmp_dir:
                tmp_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
                tmp_dir = genome.abs_file_paths(tmp_dir, do_download=not integrations)
            item["config"]["resources"]["tmp"]["dir"] = tmp_dir
        # Collect the prepared sample; without this the result is always empty.
        out.append(item)
    out = _add_provenance(out, dirs, config, add_provenance)
    return out
Ejemplo n.º 10
def organize(dirs, config, run_info_yaml, sample_names):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    sample_names is a list of samples to include from the overall file, for cases
    where we are running multiple pipelines from the same configuration file.

    Returns the list of prepared sample dictionaries after passing it through
    ``_add_provenance``. Raises AssertionError when ``run_info_yaml`` is empty
    or does not exist on disk.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config,
                                      sample_names)
    out = []
    for item in run_details:
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            # Expand a plain string name into [name, "name-description"].
            description = "%s-%s" % (item["name"],
                                     clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        # add algorithm details to configuration, avoid double specification
        item["resources"] = _add_remote_resources(item["resources"])
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item = add_reference_resources(item)
        # Create temporary directories and make absolute, expanding environmental variables
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
        if tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
            item["config"]["resources"]["tmp"]["dir"] = genome.abs_file_paths(
                tmp_dir)
        # Collect the prepared sample; without this the result is always empty.
        out.append(item)
    out = _add_provenance(out, dirs, config)
    return out
Ejemplo n.º 11
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    The YAML is either a list of sample items or a dictionary holding the
    samples under ``details`` plus global settings (``fc_name``, ``fc_date``,
    ``globals``, ``resources``, ``upload``). When ``sample_names`` is given,
    only samples whose ``description`` is in it are kept.

    Returns the list of normalized sample dictionaries. Raises ValueError when
    a sample has no ``description`` and is not a BAM input.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load if this file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Directory name could not be parsed; keep both values as None.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            # Default the lane to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            # Drop an empty files entry rather than carrying it forward.
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(os.path.join(dirs.get("work", os.getcwd()), "inputs"))
            item["vrn_file"] = vcfutils.bgzip_and_index(genome.abs_file_paths(item["vrn_file"]), config,
                                                        remove_orig=False, out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            for key, val in pkvs.items():
                item["resources"][prog][key] = val
        # Collect the sample; without this nothing is checked or returned.
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
Ejemplo n.º 12
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None,
                        integrations=None):
    """Read run information from a passed YAML file.

    The YAML is either a list of sample items or a dictionary holding the
    samples under ``details`` plus global settings (``fc_name``, ``fc_date``,
    ``globals``, ``resources``, ``upload``, ``arvados``). When ``sample_names``
    is given, only samples whose ``description`` is in it are kept.
    ``integrations`` maps an integration name to a retriever object.

    Returns the list of normalized sample dictionaries. Raises ValueError when
    a sample has no ``description`` and is not a BAM input.
    """
    # NOTE(review): the parameter list after `dirs` was truncated in the
    # original; names are taken from the body, order and defaults need
    # confirming against callers.
    if integrations is None:
        # `.values()` is called on this below, so normalize a missing mapping.
        integrations = {}
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load if this file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Directory name could not be parsed; keep both values as None.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                loaded = retriever.add_remotes(loaded, config[iname])

    # Download only when no integration retriever is set; the value does not
    # change per sample, so compute it once.
    do_download = all(not x for x in integrations.values())

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            # Default the lane to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        # NOTE(review): the middle arguments of this call were truncated in the
        # original; restored from sibling versions, confirm no keyword was lost.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=ALGORITHM_NOPATH_KEYS,
            do_download=do_download)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(f, do_download=do_download)
                for f in item["files"]
            ]
        elif "files" in item:
            # Drop an empty files entry rather than carrying it forward.
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            # NOTE(review): the last path component was truncated in the
            # original; a per-sample directory is assumed, confirm.
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs",
                             item["description"]))
            item["vrn_file"] = vcfutils.bgzip_and_index(
                genome.abs_file_paths(item["vrn_file"], do_download=do_download),
                config, remove_orig=False, out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Copy global integration settings onto each sample.
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        # Collect the sample; without this nothing is checked or returned.
        run_details.append(item)

    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
Ejemplo n.º 13
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None,
                        is_cwl=False, integrations=None):
    """Read run information from a passed YAML file.

    The YAML is either a list of sample items or a dictionary holding the
    samples under ``details`` plus global settings (``fc_name``, ``fc_date``,
    ``globals``, ``resources``, ``upload``, ``arvados``). When ``sample_names``
    is given, only samples whose ``description`` is in it are kept.
    ``integrations`` maps an integration name to a retriever object.

    Returns the list of normalized sample dictionaries. Raises ValueError when
    a sample has no ``description`` and is not a BAM input, or when a
    ``vrn_file`` sample requests validation without a metadata batch.
    """
    # NOTE(review): the parameter list after `dirs` was truncated in the
    # original; `is_cwl` in particular is an assumption (see the
    # _add_algorithm_defaults call below). Confirm against callers.
    if integrations is None:
        # `.values()` is called on this below, so normalize a missing mapping.
        integrations = {}
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load if this file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Directory name could not be parsed; keep both values as None.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                config[iname] = retriever.set_cache(config[iname])
                loaded = retriever.add_remotes(loaded, config[iname])

    # Download only when no integration retriever is set; the value does not
    # change per sample, so compute it once.
    do_download = all(not x for x in integrations.values())

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            # Default the lane to the 1-based position in the file.
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        description = _clean_characters(str(item["description"]))
        item["description"] = description
        # make names R safe if we are likely to use R downstream
        # A missing analysis or empty description is tolerated here, matching
        # the item.get("analysis", "") lookup used further down.
        if item.get("analysis", "").lower() in R_DOWNSTREAM_ANALYSIS:
            if description[:1].isdigit():
                valid = "X" + description
                logger.info("%s is not a valid R name, converting to %s." %
                            (description, valid))
                item["description"] = valid
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if not upload:
                upload["dir"] = "../final"
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        # NOTE(review): the middle arguments of this call were truncated in the
        # original; restored from sibling versions, confirm no keyword was lost.
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=ALGORITHM_NOPATH_KEYS,
            do_download=do_download)
        item["genome_build"] = str(item.get("genome_build", ""))
        # NOTE(review): the third argument was truncated in the original;
        # `is_cwl` is assumed, confirm against _add_algorithm_defaults.
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"],
                                                    item.get("analysis", ""),
                                                    is_cwl)
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(f, do_download=do_download)
                for f in item["files"]
            ]
        elif "files" in item:
            # Drop an empty files entry rather than carrying it forward.
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            # NOTE(review): the last path component was truncated in the
            # original; a per-sample directory is assumed, confirm.
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs",
                             item["description"]))
            item["vrn_file"] = genome.abs_file_paths(
                item["vrn_file"],
                do_download=do_download)
            if os.path.isfile(item["vrn_file"]):
                # Try to prepare in place (or use ready to go inputs)
                try:
                    item["vrn_file"] = vcfutils.bgzip_and_index(
                        item["vrn_file"], config, remove_orig=False)
                # In case of permission errors, fix in inputs directory
                except IOError:
                    item["vrn_file"] = vcfutils.bgzip_and_index(
                        item["vrn_file"], config, remove_orig=False,
                        out_dir=inputs_dir)
            if not tz.get_in(("metadata", "batch"), item) and tz.get_in(
                    ["algorithm", "validate"], item):
                raise ValueError(
                    "%s: Please specify a metadata batch for variant file (vrn_file) input.\n"
                    % (item["description"]) +
                    "Batching with a standard sample provides callable regions for validation."
                )
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        item = _clean_background(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Copy global integration settings onto each sample.
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        # Collect the sample; without this nothing is checked or returned.
        run_details.append(item)

    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
Ejemplo n.º 14
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    The loaded YAML document is either a bare list of sample items or a
    mapping holding the samples under ``details`` together with optional
    global settings (``fc_name``/``fc_date``, ``globals``, ``resources``,
    ``upload`` and ``test_run``).

    Args:
        dirs: Mapping of run directories; the ``flowcell`` and ``work``
            entries are read here.
        run_info_yaml: Path to the sample YAML file.
        config: Global configuration, handed on to the read group, variant
            file and sample validation helpers.
        sample_names: Optional collection of sample descriptions. When given,
            only items whose ``description`` is in it are kept.

    Returns:
        A list with one normalized dictionary per retained sample item.

    Raises:
        ValueError: If an item has no ``description`` and ``_item_is_bam``
            does not accept it, so no sample name can be derived.
    """
    # NOTE(review): yaml.load can construct arbitrary Python objects; consider
    # yaml.safe_load if this file can ever come from an untrusted source.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Best effort: keep going without flowcell name/date when the
            # directory name cannot be parsed.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        # Everything except the per-sample details is global configuration.
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Lanes default to the 1-based position of the item in the file.
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                # NOTE(review): the final argument of this call was truncated
                # in the reviewed text; makedir=True is restored on the
                # assumption that the upload directory may not exist yet --
                # confirm against the signature of _file_to_abs.
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        # Substitute the values from the top-level `globals` section.
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            # Drop an empty `files` entry instead of carrying it along.
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            item["vrn_file"] = vcfutils.bgzip_and_index(
                genome.abs_file_paths(item["vrn_file"]), config)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            # A program listed without any settings loads as None from YAML.
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details