Example #1
def get_source(path, basedir=".", wildcards=None, params=None):
    source = None
    if not path.startswith("http") and not path.startswith("git+file"):
        if path.startswith("file://"):
            path = path[7:]
        elif path.startswith("file:"):
            path = path[5:]
        if not os.path.isabs(path):
            path = smart_join(basedir, path, abspath=True)
        if is_local_file(path):
            path = "file://" + path
    if wildcards is not None and params is not None:
        # format the path if both wildcards and params are given
        path = format(path, wildcards=wildcards, params=params)
    if path.startswith("file://"):
        sourceurl = "file:" + pathname2url(path[7:])
    elif path.startswith("git+file"):
        source = git_content(path).encode()
        (root_path, file_path, version) = split_git_path(path)
        # strip the trailing "@<version>" suffix; str.rstrip would remove a
        # character set rather than the suffix, which can eat filename chars
        path = path[: -len("@" + version)]
    else:
        sourceurl = path

    if source is None:
        with urlopen(sourceurl) as source:
            source = source.read()

    language = get_language(path, source)

    return path, source, language
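Every example on this page leans on the helper is_local_file from snakemake.common. The real implementation parses URIs via smart_open; as a rough stand-in (an assumption, not the actual code) it amounts to a URI-scheme check:

from urllib.parse import urlparse

def is_local_file(path_or_uri):
    # Sketch only: treat an empty scheme or an explicit "file" scheme as
    # local. One-letter schemes are kept local too, so Windows drive
    # letters like "C:/data" are not mistaken for remote URIs.
    scheme = urlparse(path_or_uri).scheme
    return scheme in ("", "file") or len(scheme) == 1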
Example #2
def infer_source_file(path_or_uri, basedir: SourceFile = None):
    if isinstance(path_or_uri, SourceFile):
        if basedir is None or isinstance(path_or_uri, HostingProviderFile):
            return path_or_uri
        else:
            path_or_uri = path_or_uri.get_path_or_uri()
    if isinstance(path_or_uri, Path):
        path_or_uri = str(path_or_uri)
    if not isinstance(path_or_uri, str):
        raise SourceFileError(
            "must be given as Python string or one of the predefined source file marker types (see docs)"
        )
    if is_local_file(path_or_uri):
        # either local file or relative to some remote basedir
        for scheme in ("file://", "file:"):
            if path_or_uri.startswith(scheme):
                path_or_uri = path_or_uri[len(scheme):]
                break
        if not os.path.isabs(path_or_uri) and basedir is not None:
            return basedir.join(path_or_uri)
        return LocalSourceFile(path_or_uri)
    if path_or_uri.startswith("git+file:"):
        try:
            root_path, file_path, ref = split_git_path(path_or_uri)
        except Exception as e:
            raise WorkflowError(
                f"Failed to read source {path_or_uri} from git repo.", e)
        return LocalGitFile(root_path, file_path, ref=ref)
    # something else
    return GenericSourceFile(path_or_uri)
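A hypothetical round-trip through infer_source_file, assuming the marker classes above behave as their names suggest (all paths below are invented for illustration):

src = infer_source_file("rules/common.smk")                  # -> LocalSourceFile
url = infer_source_file("https://example.com/wf.smk")        # -> GenericSourceFile
git = infer_source_file("git+file:///repo/Snakefile@v1.0")   # -> LocalGitFile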
Example #3
def conda_env_file(self):
    if self._conda_env_file is None:
        expanded_env = self.rule.expand_conda_env(self.wildcards_dict)
        if expanded_env is not None:
            # Normalize 'file:///my/path.yml' to '/my/path.yml'
            if is_local_file(expanded_env):
                self._conda_env_file = parse_uri(expanded_env).uri_path
            else:
                self._conda_env_file = expanded_env
    return self._conda_env_file
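The 'file://' normalization here comes from parse_uri (provided by smart_open). A standalone sketch of the same effect, assuming only file URIs need rewriting:

from urllib.parse import urlparse, unquote

def uri_path(path_or_uri):
    # Sketch: "file:///my/path.yml" -> "/my/path.yml"; anything without
    # a file scheme is returned unchanged.
    parsed = urlparse(path_or_uri)
    return unquote(parsed.path) if parsed.scheme == "file" else path_or_uri

assert uri_path("file:///my/path.yml") == "/my/path.yml"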
Example #4
def get_resource_as_string(path_or_uri):
    import requests

    if is_local_file(path_or_uri):
        # read_text() closes the file handle, unlike a bare open(...).read()
        return (
            Path(__file__).parent.parent / "template" / path_or_uri
        ).read_text()
    else:
        r = requests.get(path_or_uri)
        if r.status_code == requests.codes.ok:
            return r.text
        raise WorkflowError("Failed to download resource needed for "
                            "report: {}".format(path_or_uri))
Example #5
def outputs_older_than_script_or_notebook(self):
    """Yield outputs that are older than the script or notebook, i.e. the
    script has changed since they were produced."""
    path = self.rule.script or self.rule.notebook
    if not path:
        return
    if self.rule.basedir:
        # needed if rule is included from another subdirectory
        path = self.rule.basedir.join(path).get_path_or_uri()
    if is_local_file(path):
        assert os.path.exists(path), "cannot find {0}".format(path)
        script_mtime = os.lstat(path).st_mtime
        for f in self.expanded_output:
            if f.exists:
                if not f.is_newer(script_mtime):
                    yield f
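f.is_newer is a method on the output file objects that this excerpt assumes; a plausible sketch of the comparison it performs against the script's mtime:

import os

def is_newer(output_path, script_mtime):
    # Sketch only: an output counts as "newer" when its modification time
    # is at least the script's modification time.
    return os.lstat(output_path).st_mtime >= script_mtime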
Example #6
    def expand_conda_env(self, wildcards, params=None, input=None):
        from snakemake.common import is_local_file
        from snakemake.sourcecache import SourceFile, infer_source_file
        from snakemake.deployment.conda import (
            is_conda_env_file,
            CondaEnvFileSpec,
            CondaEnvNameSpec,
        )

        conda_env = self._conda_env
        if callable(conda_env):
            conda_env, _ = self.apply_input_function(conda_env,
                                                     wildcards=wildcards,
                                                     params=params,
                                                     input=input)

        if conda_env is None:
            return None

        if is_conda_env_file(conda_env):
            if not isinstance(conda_env, SourceFile):
                if is_local_file(conda_env) and not os.path.isabs(conda_env):
                    # Conda env file paths are considered to be relative to the directory of the Snakefile
                    # hence we adjust the path accordingly.
                    # This is not necessary in case of receiving a SourceFile.
                    conda_env = self.basedir.join(conda_env)
                else:
                    # infer source file from unmodified uri or path
                    conda_env = infer_source_file(conda_env)

            conda_env = CondaEnvFileSpec(conda_env, rule=self)
        else:
            conda_env = CondaEnvNameSpec(conda_env)

        conda_env = conda_env.apply_wildcards(wildcards, self)
        conda_env.check()

        return conda_env
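is_conda_env_file decides between the file and name branches above. Its real definition is imported from snakemake.deployment.conda; a minimal stand-in (an assumption) would key off the YAML suffix:

def is_conda_env_file(spec):
    # Sketch: env *files* end with a YAML suffix; everything else is
    # treated as a named, pre-existing environment.
    return str(spec).endswith((".yaml", ".yml"))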
Example #7
def is_persistently_cacheable(self, path_or_uri):
    # TODO remove special git url handling once included in smart_open
    if path_or_uri.startswith("git+file:"):
        return False
    return is_local_file(path_or_uri) and self.cacheable_prefixes.match(
        path_or_uri
    )
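cacheable_prefixes is assumed here to be a compiled regex of path prefixes that are safe to cache across runs; a sketch of how such a pattern might be built (the prefix list itself is invented):

import re

prefixes = ("/usr/share/", "/opt/workflows/")
cacheable_prefixes = re.compile("|".join(re.escape(p) for p in prefixes))

assert cacheable_prefixes.match("/usr/share/snakemake-wrappers/bio/x.py")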
Example #8
def is_local(self):
    return is_local_file(self.url)
Example #9
def apply_wildcards(self, wildcards, rule):
    filepath = self.file.apply_wildcards(wildcards)
    if is_local_file(filepath):
        # Normalize 'file:///my/path.yml' to '/my/path.yml'
        filepath = parse_uri(filepath).uri_path
    return CondaEnvFileSpec(filepath, rule)
Example #10
def notebook(
    path,
    basedir,
    input,
    output,
    params,
    wildcards,
    threads,
    resources,
    log,
    config,
    rulename,
    conda_env,
    conda_base_path,
    container_img,
    singularity_args,
    env_modules,
    bench_record,
    jobid,
    bench_iteration,
    cleanup_scripts,
    shadow_dir,
    edit,
    runtime_sourcecache_path,
):
    """
    Load a script from the given basedir + path and execute it.
    """
    draft = False
    if edit is not None:
        if is_local_file(path):
            if not os.path.isabs(path):
                local_path = os.path.join(basedir, path)
            else:
                local_path = path
            if not os.path.exists(local_path):
                # draft the notebook, it does not exist yet
                language = None
                draft = True
                path = "file://{}".format(os.path.abspath(local_path))
                if path.endswith(".py.ipynb"):
                    language = "jupyter_python"
                elif path.endswith(".r.ipynb"):
                    language = "jupyter_r"
                else:
                    raise WorkflowError(
                        "The notebook to edit must end with .py.ipynb or "
                        ".r.ipynb so that the programming language can be "
                        "determined.")
        else:
            raise WorkflowError(
                "Notebook {} is not local, but edit mode is only allowed for "
                "local notebooks.".format(path))

    if not draft:
        path, source, language, is_local = get_source(
            path, SourceCache(runtime_sourcecache_path), basedir, wildcards,
            params)
    else:
        source = None
        is_local = True

    exec_class = get_exec_class(language)

    executor = exec_class(
        path,
        source,
        basedir,
        input,
        output,
        params,
        wildcards,
        threads,
        resources,
        log,
        config,
        rulename,
        conda_env,
        conda_base_path,
        container_img,
        singularity_args,
        env_modules,
        bench_record,
        jobid,
        bench_iteration,
        cleanup_scripts,
        shadow_dir,
        is_local,
    )

    if draft:
        executor.draft(listen=edit)
    else:
        executor.evaluate(edit=edit)
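Examples #10 and #11 are two revisions of the same function; both key the notebook kernel off a double file suffix when drafting. Isolated, that convention looks like this:

def notebook_language(path):
    # ".py.ipynb" -> Python kernel, ".r.ipynb" -> R kernel, else unknown
    if path.endswith(".py.ipynb"):
        return "jupyter_python"
    if path.endswith(".r.ipynb"):
        return "jupyter_r"
    return None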
Example #11
def notebook(
    path,
    basedir,
    input,
    output,
    params,
    wildcards,
    threads,
    resources,
    log,
    config,
    rulename,
    conda_env,
    conda_base_path,
    container_img,
    singularity_args,
    env_modules,
    bench_record,
    jobid,
    bench_iteration,
    cleanup_scripts,
    shadow_dir,
    edit,
    runtime_sourcecache_path,
):
    """
    Load a script from the given basedir + path and execute it.
    """
    draft = False
    if edit is not None:
        if is_local_file(path):
            if not os.path.isabs(path):
                local_path = os.path.join(basedir, path)
            else:
                local_path = path
            if not os.path.exists(local_path):
                # draft the notebook, it does not exist yet
                language = None
                draft = True
                path = "file://{}".format(os.path.abspath(local_path))
                if path.endswith(".py.ipynb"):
                    language = "jupyter_python"
                elif path.endswith(".r.ipynb"):
                    language = "jupyter_r"
                else:
                    raise WorkflowError(
                        "The notebook to edit must end with .py.ipynb or "
                        ".r.ipynb so that the programming language can be "
                        "determined.")
        else:
            raise WorkflowError(
                "Notebook {} is not local, but edit mode is only allowed for "
                "local notebooks.".format(path))

    if not draft:
        path, source, language, is_local = get_source(
            path, SourceCache(runtime_sourcecache_path), basedir, wildcards,
            params)
    else:
        source = None
        is_local = True
        path = infer_source_file(path)

    exec_class = get_exec_class(language)

    executor = exec_class(
        path,
        source,
        basedir,
        input,
        output,
        params,
        wildcards,
        threads,
        resources,
        log,
        config,
        rulename,
        conda_env,
        conda_base_path,
        container_img,
        singularity_args,
        env_modules,
        bench_record,
        jobid,
        bench_iteration,
        cleanup_scripts,
        shadow_dir,
        is_local,
    )

    if edit is None:
        executor.evaluate(edit=edit)
    elif edit.draft_only:
        executor.draft()
        msg = "Generated skeleton notebook:\n{} ".format(path)
        if conda_env and not container_img:
            msg += (
                "\n\nEditing with VSCode:\nOpen notebook, run command 'Select notebook kernel' (Ctrl+Shift+P or Cmd+Shift+P), and choose:"
                "\n{}\n".format(
                    str(
                        Path(conda_env) / "bin" /
                        executor.get_interpreter_exec())))
            msg += ("\nEditing with Jupyter CLI:"
                    "\nconda activate {}\njupyter notebook {}\n".format(
                        conda_env, path))
        logger.info(msg)
    elif draft:
        executor.draft_and_edit(listen=edit)
    else:
        executor.evaluate(edit=edit)
Example #12
    def create(self, dryrun=False):
        """Create the conda enviroment."""
        from snakemake.shell import shell

        # Read env file and create hash.
        env_file = self.file
        tmp_file = None

        if not is_local_file(env_file) or env_file.startswith("git+file:/"):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".yaml") as tmp:
                tmp.write(self.content)
                env_file = tmp.name
                tmp_file = tmp.name

        env_hash = self.hash
        env_path = self.path

        if self.is_containerized:
            if not dryrun:
                try:
                    shell.check_output(
                        singularity.shellcmd(
                            self._container_img.path,
                            "[ -d '{}' ]".format(env_path),
                            args=self._singularity_args,
                            envvars=self.get_singularity_envvars(),
                            quiet=True,
                        ),
                        stderr=subprocess.PIPE,
                    )
                except subprocess.CalledProcessError as e:
                    raise WorkflowError(
                        "Unable to find environment in container image. "
                        "Maybe a conda environment was modified without containerizing again "
                        "(see snakemake --containerize)?\nDetails:\n{}\n{}".format(
                            e, e.stderr.decode()
                        )
                    )
                return env_path
            else:
                # env should be present in the container
                return env_path

        # Check for broken environment
        if os.path.exists(
            os.path.join(env_path, "env_setup_start")
        ) and not os.path.exists(os.path.join(env_path, "env_setup_done")):
            if dryrun:
                logger.info(
                    "Incomplete Conda environment {} will be recreated.".format(
                        utils.simplify_path(self.file)
                    )
                )
            else:
                logger.info(
                    "Removing incomplete Conda environment {}...".format(
                        utils.simplify_path(self.file)
                    )
                )
                shutil.rmtree(env_path, ignore_errors=True)

        # Create environment if not already present.
        if not os.path.exists(env_path):
            if dryrun:
                logger.info(
                    "Conda environment {} will be created.".format(
                        utils.simplify_path(self.file)
                    )
                )
                return env_path
            conda = Conda(self._container_img)
            logger.info(
                "Creating conda environment {}...".format(
                    utils.simplify_path(self.file)
                )
            )
            # Check if env archive exists. Use that if present.
            env_archive = self.archive_file
            try:
                # Touch "start" flag file
                os.makedirs(env_path, exist_ok=True)
                with open(os.path.join(env_path, "env_setup_start"), "a") as f:
                    pass

                if os.path.exists(env_archive):
                    logger.info("Installing archived conda packages.")
                    pkg_list = os.path.join(env_archive, "packages.txt")
                    if os.path.exists(pkg_list):
                        # read packages in the correct order; newer env
                        # archives store an explicit package list
                        packages = [
                            os.path.join(env_archive, pkg.rstrip())
                            for pkg in open(pkg_list)
                        ]
                    else:
                        # guess order
                        packages = glob(os.path.join(env_archive, "*.tar.bz2"))

                    # install packages manually from env archive
                    cmd = " ".join(
                        [
                            "conda",
                            "create",
                            "--quiet",
                            "--yes",
                            "--prefix '{}'".format(env_path),
                        ]
                        + packages
                    )
                    if self._container_img:
                        cmd = singularity.shellcmd(
                            self._container_img.path,
                            cmd,
                            args=self._singularity_args,
                            envvars=self.get_singularity_envvars(),
                        )
                    out = shell.check_output(
                        cmd, stderr=subprocess.STDOUT, universal_newlines=True
                    )

                else:
                    # Copy env file to env_path (because they can be on
                    # different volumes and singularity should only mount one).
                    # In addition, this makes it immediately visible what an
                    # environment in .snakemake/conda contains.
                    target_env_file = env_path + ".yaml"
                    shutil.copy(env_file, target_env_file)

                    logger.info("Downloading and installing remote packages.")
                    cmd = " ".join(
                        [
                            self.frontend,
                            "env",
                            "create",
                            "--quiet",
                            '--file "{}"'.format(target_env_file),
                            '--prefix "{}"'.format(env_path),
                        ]
                    )
                    if self._container_img:
                        cmd = singularity.shellcmd(
                            self._container_img.path,
                            cmd,
                            args=self._singularity_args,
                            envvars=self.get_singularity_envvars(),
                        )
                    out = shell.check_output(
                        cmd, stderr=subprocess.STDOUT, universal_newlines=True
                    )

                    # cleanup if requested
                    if self._cleanup is CondaCleanupMode.tarballs:
                        logger.info("Cleaning up conda package tarballs.")
                        shell.check_output("conda clean -y --tarballs")
                    elif self._cleanup is CondaCleanupMode.cache:
                        logger.info(
                            "Cleaning up conda package tarballs and package cache."
                        )
                        shell.check_output("conda clean -y --tarballs --packages")
                # Touch "done" flag file
                with open(os.path.join(env_path, "env_setup_done"), "a") as f:
                    pass

                logger.debug(out)
                logger.info(
                    "Environment for {} created (location: {})".format(
                        os.path.relpath(env_file), os.path.relpath(env_path)
                    )
                )
            except subprocess.CalledProcessError as e:
                # remove potential partially installed environment
                shutil.rmtree(env_path, ignore_errors=True)
                raise CreateCondaEnvironmentException(
                    "Could not create conda environment from {}:\n".format(env_file)
                    + e.output
                )

        if tmp_file:
            # temporary file was created
            os.remove(tmp_file)

        return env_path
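create() brackets environment creation with the "env_setup_start" and "env_setup_done" flag files; the broken-environment check it performs reduces to:

import os

def env_is_broken(env_path):
    # Started but never finished: creation was interrupted, so the
    # environment must be removed and rebuilt.
    return os.path.exists(
        os.path.join(env_path, "env_setup_start")
    ) and not os.path.exists(os.path.join(env_path, "env_setup_done"))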
Example #13
def validate(data, schema, set_default=True):
    """Validate data with JSON schema at given path.

    Args:
        data (object): data to validate. Can be a config dict or a pandas data frame.
        schema (str): Path to JSON schema used for validation. The schema can also be
            in YAML format. If validating a pandas data frame, the schema has to
            describe a row record (i.e., a dict with column names as keys pointing
            to row values). See https://json-schema.org. The path is interpreted
            relative to the Snakefile when this function is called.
        set_default (bool): set default values defined in schema. See
            https://python-jsonschema.readthedocs.io/en/latest/faq/ for more
            information
    """
    frame = inspect.currentframe().f_back
    workflow = frame.f_globals.get("workflow")

    if workflow and workflow.modifier.skip_validation:
        # skip if a corresponding modifier has been defined
        return

    try:
        import jsonschema
        from jsonschema import validators, RefResolver
    except ImportError:
        raise WorkflowError(
            "The Python 3 package jsonschema must be installed "
            "in order to use the validate directive.")

    schemafile = schema

    if not os.path.isabs(schemafile):
        # if the workflow object is not available, validate() was not called
        # from within a workflow, so the path is left as given
        if workflow:
            schemafile = smart_join(workflow.current_basedir, schemafile)

    source = workflow.sourcecache.open(schemafile) if workflow else schemafile
    schema = _load_configfile(source, filetype="Schema")
    if is_local_file(schemafile):
        resolver = RefResolver(
            urljoin("file:", schemafile),
            schema,
            handlers={
                "file":
                lambda uri: _load_configfile(re.sub("^file://", "", uri))
            },
        )
    else:
        resolver = RefResolver(
            schemafile,
            schema,
        )

    # Taken from https://python-jsonschema.readthedocs.io/en/latest/faq/
    def extend_with_default(validator_class):
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for property, subschema in properties.items():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

            for error in validate_properties(validator, properties, instance,
                                             schema):
                yield error

        return validators.extend(validator_class, {"properties": set_defaults})

    Validator = validators.validator_for(schema)
    if Validator.META_SCHEMA["$schema"] != schema["$schema"]:
        logger.warning(
            "No validator found for JSON Schema version identifier '{}'".
            format(schema["$schema"]))
        logger.warning(
            "Defaulting to validator for JSON Schema version '{}'".format(
                Validator.META_SCHEMA["$schema"]))
        logger.warning("Note that schema file may not be validated correctly.")
    DefaultValidator = extend_with_default(Validator)

    if not isinstance(data, dict):
        try:
            import pandas as pd

            recordlist = []
            if isinstance(data, pd.DataFrame):
                for i, record in enumerate(data.to_dict("records")):
                    record = {
                        k: v
                        for k, v in record.items() if not pd.isnull(v)
                    }
                    try:
                        if set_default:
                            DefaultValidator(
                                schema, resolver=resolver).validate(record)
                            recordlist.append(record)
                        else:
                            jsonschema.validate(record,
                                                schema,
                                                resolver=resolver)
                    except jsonschema.exceptions.ValidationError as e:
                        raise WorkflowError(
                            "Error validating row {} of data frame.".format(i),
                            e)
                if set_default:
                    newdata = pd.DataFrame(recordlist, data.index)
                    newcol = ~newdata.columns.isin(data.columns)
                    n = len(data.columns)
                    for col in newdata.loc[:, newcol].columns:
                        data.insert(n, col, newdata.loc[:, col])
                        n = n + 1
                return
        except ImportError:
            pass
        raise WorkflowError("Unsupported data type for validation.")
    else:
        try:
            if set_default:
                DefaultValidator(schema, resolver=resolver).validate(data)
            else:
                jsonschema.validate(data, schema, resolver=resolver)
        except jsonschema.exceptions.ValidationError as e:
            raise WorkflowError("Error validating config file.", e)