def lint_not_used_params(
    self,
    rule,
    valid_names={"input", "output", "log", "params", "wildcards", "threads"},
    regex=re.compile("{{(?P<name>{}).*?}}".format(NAME_PATTERN)),
):
    """Flag shell command placeholders that refer to names outside the rule.

    Every ``{name...}`` placeholder found by ``regex`` in ``rule.shellcmd`` is
    inspected. A lint is yielded when ``name`` is not one of ``valid_names``
    and the placeholder is not considered brace-escaped.

    Args:
        rule: Object providing a ``shellcmd`` attribute (string or falsy).
        valid_names: Placeholder names that are allowed in shell commands.
        regex: Compiled pattern exposing the placeholder name as group
            ``name``.

    Yields:
        One ``Lint`` per offending placeholder.
    """
    shellcmd = rule.shellcmd
    if not shellcmd:
        return

    for hit in regex.finditer(shellcmd):
        name = hit.group("name")
        if name in valid_names:
            continue

        # Positions of the characters directly around the placeholder.
        left = hit.start() - 1
        right = hit.end()

        # A placeholder touching either end of the command is always
        # reported. Otherwise it is skipped as soon as one neighbouring
        # character is a brace.
        surrounded = left >= 0 and right < len(shellcmd)
        if surrounded and (shellcmd[left] == "{" or shellcmd[right] == "}"):
            continue

        yield Lint(
            title="Shell command directly uses variable {} from outside of the rule".format(
                name
            ),
            body="It is recommended to pass all files as input and output, and non-file parameters "
            "via the params directive. Otherwise, provenance tracking is less accurate.",
            links=[links.params],
        )
def lint_dynamic(self, rule):
    """Report every output or input file of ``rule`` flagged as ``dynamic``.

    Output files are inspected before input files.

    Args:
        rule: Object providing iterable ``output`` and ``input`` attributes.

    Yields:
        One ``Lint`` per file for which ``is_flagged(file, "dynamic")`` holds.
    """
    flagged_files = (
        path for path in chain(rule.output, rule.input) if is_flagged(path, "dynamic")
    )
    for _ in flagged_files:
        yield Lint(
            title="The dynamic flag is deprecated",
            body="Use checkpoints instead, which are more powerful and less error-prone.",
            links=[links.checkpoints],
        )
def lint_version(self, rule):
    """Report use of the deprecated ``version`` directive.

    Args:
        rule: Object providing a ``version`` attribute.

    Yields:
        A single ``Lint`` when ``rule.version`` is truthy, nothing otherwise.
    """
    if not rule.version:
        return

    yield Lint(
        title="The version directive is deprecated",
        body="It was meant for documenting tool version, but this has been replaced "
        "by using the conda or container directive.",
        links=[links.package_management, links.containers],
    )
def lint_iofile_by_index(self, rule, regex=re.compile(r"(input|output)\[[0-9]+\]")):
    """Report index based access to input or output files in a shell command.

    The default pattern is written as a raw string so that ``\\[`` and ``\\]``
    are regular expression escapes rather than invalid string escape
    sequences (which trigger a ``SyntaxWarning`` on recent Python versions).

    Args:
        rule: Object providing a ``shellcmd`` attribute (string or falsy).
        regex: Compiled pattern detecting accesses such as ``input[0]``.

    Yields:
        A single ``Lint`` when the shell command contains at least one match.
    """
    shellcmd = rule.shellcmd
    if not shellcmd or not regex.search(shellcmd):
        return

    yield Lint(
        title="Do not access input and output files individually by index in shell commands",
        body="When individual access to input or output files is needed (i.e., just writing '{input}' "
        "is impossible), use names ('{input.somename}') instead of index based access.",
        links=[links.rules],
    )
def lint_tab_usage(self, snakefile, regex=re.compile(r"^ *\t", re.MULTILINE)):
    """Report lines whose leading whitespace contains a tab.

    The default pattern is compiled with ``re.MULTILINE`` so that ``^``
    anchors at the beginning of every line. Without that flag only a tab at
    the very start of the file content could ever be detected.

    Args:
        snakefile: Content of the snakefile as a single string.
        regex: Compiled pattern matching optional spaces followed by a tab at
            the start of a line.

    Yields:
        One ``Lint`` per matching line.
    """
    for match in regex.finditer(snakefile):
        line = get_line(match, snakefile)
        yield Lint(
            title="Tab usage in line {}.".format(line),
            body="Both Python and Snakemake can get confused when mixing tabs and spaces for indentation. "
            "It is recommended to only use spaces for indentation.",
        )
def lint_log_directive(self, rule):
    """Report rules that define no log directive.

    Args:
        rule: Object providing ``log`` and ``norun`` attributes.

    Yields:
        A single ``Lint`` when both ``rule.log`` and ``rule.norun`` are falsy.
    """
    if rule.log or rule.norun:
        return

    yield Lint(
        title="No log directive defined",
        body="Without a log directive, all output will be printed "
        "to the terminal. In distributed environments, this means "
        "that errors are harder to discover. In local environments, "
        "output of concurrent jobs will be mixed and become unreadable.",
        links=[links.log],
    )
def lint_singularity(self, snakefile, regex=re.compile("singularity:")):
    """Report every occurrence of the deprecated ``singularity:`` directive.

    Args:
        snakefile: Content of the snakefile as a single string.
        regex: Compiled pattern locating the directive.

    Yields:
        One ``Lint`` per match; its title includes the value returned by
        ``get_line`` for that match.
    """
    for hit in regex.finditer(snakefile):
        lineno = get_line(hit, snakefile)
        title = "Deprecated singularity directive used for container definition in line {}.".format(
            lineno
        )
        yield Lint(
            title=title,
            body="Use the container directive instead (it is agnostic of the underlying container runtime).",
            links=[links.containers],
        )
def lint_absolute_paths(self, snakefile, regex=re.compile(ABS_PATH_PATTERN)):
    """Report absolute paths found in the snakefile content.

    Args:
        snakefile: Content of the snakefile as a single string.
        regex: Compiled pattern exposing the matched path as group ``path``.

    Yields:
        One ``Lint`` per match; its title contains the matched path and the
        value returned by ``get_line`` for that match.
    """
    for hit in regex.finditer(snakefile):
        lineno = get_line(hit, snakefile)
        found_path = hit.group("path")
        yield Lint(
            title='Absolute path "{}" in line {}'.format(found_path, lineno),
            body="Do not define absolute paths inside of the workflow, since this "
            "renders your workflow irreproducible on other machines. "
            "Use path relative to the working directory instead, or make the path "
            "configurable via a config file.",
            links=[links.config],
        )
def lint_long_run(self, rule):
    """Report run directives whose compiled code is long.

    The length of the bytecode of ``rule.run_func`` is used as a proxy for
    the size of the run block.

    Args:
        rule: Object providing ``is_run`` and ``run_func`` attributes.

    Yields:
        A single ``Lint`` when the rule is a run rule and the bytecode of its
        run function is longer than 70 bytes.
    """
    # Check is_run before touching run_func: a rule that is not a run rule
    # may have no run function at all (presumably run_func is None then), and
    # dereferencing it unconditionally would raise AttributeError.
    if not rule.is_run:
        return

    func_code = rule.run_func.__code__.co_code
    if len(func_code) > 70:
        yield Lint(
            title="Migrate long run directives into scripts or notebooks",
            body="Long run directives hamper workflow readability. Use the script or notebook direcive instead. "
            "Note that the script or notebook directive does not involve boilerplate. Similar to run, you "
            "will have direct access to params, input, output, and wildcards."
            "Only use the run direcive for a handful of lines.",
            links=[links.external_scripts, links.notebooks],
        )
def lint_missing_software_definition(self, rule):
    """Report rules lacking a conda environment or container definition.

    Nothing is reported when ``rule.norun`` or ``rule.is_run`` is truthy, or
    when either ``rule.conda_env`` or ``rule.container_img`` is set.

    Args:
        rule: Object providing ``norun``, ``is_run``, ``conda_env``,
            ``container_img`` and ``env_modules`` attributes.

    Yields:
        A single ``Lint``. Its wording depends on whether ``rule.env_modules``
        is truthy.
    """
    if rule.norun or rule.is_run or rule.conda_env or rule.container_img:
        return

    if not rule.env_modules:
        yield Lint(
            title="Specify a conda environment or container for each rule.",
            body="This way, the used software for each specific step is documented, and "
            "the workflow can be executed on any machine without prerequisites.",
            links=[links.package_management, links.containers],
        )
        return

    # Environment modules alone are present: ask for a portable complement.
    yield Lint(
        title="Additionally specify a conda environment or container for each rule, environment modules are not enough",
        body="While environment modules allow to document and deploy the required software on a certain "
        "platform, they lock your workflow in there, disabling easy reproducibility on other machines "
        "that don't have exactly the same environment modules. Hence env modules (which might be beneficial "
        "in certain cluster environments), should allways be complemented with equivalent conda "
        "environments.",
        links=[links.package_management, links.containers],
    )
def lint_mixed_func_and_rules(
    self,
    snakefile,
    rule_regex=re.compile("rule .+?:"),
    func_regex=re.compile("def .+?:"),
):
    """Report snakefiles containing both rule and function definitions.

    Args:
        snakefile: Content of the snakefile as a single string.
        rule_regex: Compiled pattern detecting a rule definition.
        func_regex: Compiled pattern detecting a function definition.

    Yields:
        A single ``Lint`` when both patterns are found in ``snakefile``.
    """
    if not rule_regex.search(snakefile):
        return
    if not func_regex.search(snakefile):
        return

    yield Lint(
        title="Mixed rules and functions in same snakefile.",
        body="Small one-liner functions used only once should be "
        "defined as lambda expressions. Other functions should be collected "
        "in a common module, e.g. 'rules/common.smk'. This makes the workflow "
        "steps more readable.",
        links=[links.includes],
    )
def lint_params_prefix(self, rule):
    """Report hardcoded string params that are a prefix of an input or output file.

    Only non-empty string values are considered, and they are compared only
    against input and output entries that are strings themselves.

    Args:
        rule: Object providing ``params`` (with an ``items()`` method) and
            iterable ``input`` and ``output`` attributes.

    Yields:
        One ``Lint`` per param whose value is a prefix of at least one input
        or output file.
    """
    for param, value in rule.params.items():
        if not isinstance(value, str) or not value:
            continue

        io_paths = (
            entry for entry in chain(rule.input, rule.output) if isinstance(entry, str)
        )
        if not any(path.startswith(value) for path in io_paths):
            continue

        yield Lint(
            title="Param {} is a prefix of input or output file but hardcoded".format(
                param
            ),
            body="If this is meant to represent a file path prefix, it will fail when running "
            "workflow in environments without a shared filesystem. "
            "Instead, provide a function that infers the appropriate prefix from the input or "
            "output file, e.g.: lambda w, input: os.path.splitext(input[0])[0]",
            links=[links.params, links.input_functions],
        )
def lint_path_add(
    self,
    snakefile,
    regex1=re.compile("{name} *\\+ *{path}".format(name=NAME_PATTERN, path=PATH_PATTERN)),
    regex2=re.compile("{path} *\\+ *{name}".format(path=PATH_PATTERN, name=NAME_PATTERN)),
):
    """Report path composition via the ``+`` operator.

    All matches of ``regex1`` are reported first, followed by all matches of
    ``regex2``.

    Args:
        snakefile: Content of the snakefile as a single string.
        regex1: Compiled pattern for ``<name> + <path>``.
        regex2: Compiled pattern for ``<path> + <name>``.

    Yields:
        One ``Lint`` per match; its title includes the value returned by
        ``get_line`` for that match.
    """
    hits = chain(regex1.finditer(snakefile), regex2.finditer(snakefile))
    for hit in hits:
        lineno = get_line(hit, snakefile)
        yield Lint(
            title="Path composition with '+' in line {}".format(lineno),
            body="This becomes quickly unreadable. Usually, it is better to endure some "
            "redundancy against having a more readable workflow. Hence, just repeat common "
            'prefixes. If path composition is unavoidable, use pathlib or (python >= 3.6) string formatting with f"...". ',
        )
def lint_envvars(
    self,
    snakefile,
    regex=re.compile(r"os\.environ\[(?P<quote>['\"])(?P<name>.+?)(?P=quote)\]"),
):
    """Report ``os.environ[...]`` accesses not declared via the envvars directive.

    The default pattern captures the variable name lazily, so several
    accesses on one line are reported as separate variables instead of being
    merged into one bogus name. The name group is mandatory, hence the
    reported name is never ``None``.

    Args:
        snakefile: Content of the snakefile as a single string.
        regex: Compiled pattern exposing the variable name as group ``name``.

    Yields:
        One ``Lint`` per access whose variable name is not contained in
        ``self.workflow.envvars``.
    """
    declared = self.workflow.envvars
    for match in regex.finditer(snakefile):
        name = match.group("name")
        if name in declared:
            continue

        # The line number is only needed for the message of an actual lint.
        line = get_line(match, snakefile)
        yield Lint(
            title="Environment variable {} used but not asserted with envvars directive in line {}.".format(
                name, line
            ),
            body="Asserting existence of environment variables with the envvars directive ensures proper error "
            "messages if the user fails to invoke a workflow with all required environment variables defined. "
            "Further, it allows snakemake to pass them on in case of distributed execution.",
            links=[links.envvars],
        )