Example #1
class Pipeline(object):
    '''
    The Pipeline class represents the entire processing pipeline which is defined
    and configured via the configuration file config.yaml.
    
    Individual steps may be defined in a tree, and their combination with the
    samples generated by one or more sources leads to an array of tasks.
    '''
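    # A hypothetical, minimal config.yaml sketch covering only keys that this
    # class actually reads below (step, module, and tool names are made up):
    #
    #   destination_path: out           # required, see read_config()
    #   steps:
    #       my_source:                  # step name == module name
    #           ...
    #       my_step (my_module):        # renamed step of module 'my_module'
    #           _depends: my_source
    #   tools:
    #       my_tool:
    #           path: my_tool
    #           get_version: --version
    #           exit_code: 0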

    states = misc.Enum(['WAITING', 'READY', 'QUEUED', 'EXECUTING', 'FINISHED'])
    '''
    Possible states a task can be in.
    '''
    def __init__(self, **kwargs):
        self.caught_signal = None

        self.git_dirty_diff = None

        self.cluster_type = None
        '''
        The cluster type to be used (must be one of the keys specified in
        cluster_config).
        '''

        # Check the availability of git
        command = ['git', '--version']
        try:
            with open(os.devnull, 'w') as devnull:
                subprocess.check_call(command, stdout=devnull)

        except subprocess.CalledProcessError as e:
            logger.error("Execution of '%s' failed. Git seems to be "
                         "unavailable." % " ".join(command))
            sys.exit(1)

        # now determine the Git hash of the repository
        command = ['git', 'describe', '--all', '--dirty', '--long']
        try:
            self.git_hash_tag = subprocess.check_output(command).strip()
        except (OSError, subprocess.CalledProcessError):
            logger.error("Execution of '%s' failed." % " ".join(command))
            raise

        # check if we got passed an 'arguments' parameter
        # this parameter should contain an argparse.Namespace object
        args = None
        if 'arguments' in kwargs:
            args = kwargs['arguments']

        self._uap_path = args.uap_path
        '''
        Absolute path to the directory of the uap executable.
        It is used to circumvent path issues.
        '''

        self._cluster_config_path = os.path.join(
            self._uap_path, 'cluster/cluster-specific-commands.yaml')
        with open(self._cluster_config_path, 'r') as cluster_config_file:
            self._cluster_config = yaml.load(cluster_config_file)
        '''
        Cluster-related configuration for every cluster system supported.
        '''
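
        # The expected shape of cluster-specific-commands.yaml, inferred only
        # from the keys this class uses (identity_test/identity_answer in
        # autodetect_cluster_type(), 'stat' in check_ping_files(), and '%s'
        # templates in get_cluster_command_cli_option()); the values shown
        # here are illustrative assumptions, not the shipped file:
        #
        #   slurm:
        #       identity_test: ['sbatch', '--version']  # command to run
        #       identity_answer: 'slurm'                # expected output prefix
        #       stat: 'squeue'                          # lists queued job IDs
        #       ...                                     # further command templates,
        #                                               # some containing '%s'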

        if self.git_hash_tag.endswith('-dirty'):
            if not args.even_if_dirty:
                print("The repository has uncommitted changes, which is why " +
                      "we will exit right now.")
                print(
                    "If this is not a production environment, you can skip " +
                    "this test by specifying --even-if-dirty on the command " +
                    "line.")
                print(self.git_hash_tag)
                exit(1)
            command = ['git', 'diff']
            try:
                self.git_dirty_diff = subprocess.check_output(command)
            except (OSError, subprocess.CalledProcessError):
                logger.error("Execution of '%s' failed." % " ".join(command))
                sys.exit(1)
        try:
            # set cluster type
            if args.cluster == 'auto':
                self.set_cluster_type(self.autodetect_cluster_type())
            else:
                self.set_cluster_type(args.cluster)
        except AttributeError:
            # cluster type is not an applicable parameter here, and that's fine
            # (we're probably in run-locally.py)
            pass

        self._config_filepath = args.config.name
        '''
        Name of the YAML configuration file
        '''

        self.config = dict()
        '''
        Dictionary representation of configuration YAML file.
        '''

        self.steps = dict()
        '''
        This dict stores step objects by their name. Each step knows its
        dependencies.
        '''

        self.topological_step_order = list()
        '''
        List of step names in topological order.
        '''

        self.file_dependencies = dict()
        '''
        This dict stores file dependencies within this pipeline, regardless of
        step, output file tag, or run ID. It maps every output file generated
        by the pipeline to the set of input files that output file depends on.
        '''

        self.file_dependencies_reverse = dict()
        '''
        This dict stores the reverse mapping: regardless of step, output file
        tag, or run ID, it maps every input file required by the pipeline to
        the set of output files which are generated using this input file.
        '''

        self.task_id_for_output_file = dict()
        '''
        This dict stores a task ID for every output file created by the pipeline.
        '''

        self.task_ids_for_input_file = dict()
        '''
        This dict stores a set of task IDs for every input file used in the
        pipeline.
        '''

        self.input_files_for_task_id = dict()
        '''
        This dict stores a set of input files for every task id in the pipeline.
        '''

        self.output_files_for_task_id = dict()
        '''
        This dict stores a set of output files for every task id in the pipeline.
        '''

        self.task_for_task_id = dict()
        '''
        This dict stores task objects by task IDs.
        '''

        self.all_tasks_topologically_sorted = list()
        '''
        List of all tasks in topological order. 
        '''

        self.read_config(args.config)

        # collect all tasks
        for step_name in self.topological_step_order:
            step = self.get_step(step_name)
            logger.debug("Collect now all tasks for step: %s" % step)
            for run_index, run_id in enumerate(
                    misc.natsorted(step.get_run_ids())):
                task = task_module.Task(self, step, run_id, run_index)
                # if the run contains at least one exec_group, the task
                # (step/run) is added to the task list
                run = step.get_run(run_id)
                logger.debug("Step: %s, Run: %s" % (step, run_id))
                if len(run.get_exec_groups()) > 0:
                    logger.debug("Task: %s" % task)
                    self.all_tasks_topologically_sorted.append(task)
                # Fail if multiple tasks with the same name exist
                if str(task) in self.task_for_task_id:
                    logger.error("%s: Duplicate task ID %s." %
                                 (self.get_config_filepath(), str(task)))
                    sys.exit(1)
                self.task_for_task_id[str(task)] = task

        self.tool_versions = {}
        self.check_tools()

    def get_uap_path(self):
        return self._uap_path

    def get_config_filepath(self):
        return self._config_filepath

    def get_cluster_config(self):
        return self._cluster_config

    def get_steps(self):
        return self.steps

    def get_step(self, step_name):
        return self.steps[step_name]

    # read configuration and make sure it's good
    def read_config(self, config_file):
        # yaml.load works fine, even for duplicate dictionary keys (WTF)
        self.config = yaml.load(config_file)

        if 'destination_path' not in self.config:
            logger.error("%s: Missing key: destination_path" %
                         self.get_config_filepath())
            sys.exit(1)

        # Make self.config['destination_path'] an absolute path if necessary
        if not os.path.isabs(self.config['destination_path']):
            config_abspath = os.path.dirname(config_file.name)
            self.config['destination_path'] = os.path.join(
                config_abspath, self.config['destination_path'])

        if 'id' not in self.config:
            self.config['id'] = config_file.name
        if not os.path.exists(self.config['destination_path']):
            logger.error(
                "%s: Destination path does not exist: %s" %
                (self.get_config_filepath(), self.config['destination_path']))
            sys.exit(1)
        if not os.path.exists("%s-out" % self.config['id']):
            os.symlink(self.config['destination_path'],
                       '%s-out' % self.config['id'])

        if not 'cluster' in self.config:
            self.config['cluster'] = dict()

        for i in [
                'default_submit_options', 'default_pre_job_command',
                'default_post_job_command'
        ]:
            if i not in self.config['cluster']:
                self.config['cluster'][i] = ''

        self.build_steps()

    def get_config(self):
        return self.config

    def build_steps(self):
        self.steps = {}
        if not 'steps' in self.config:
            logger.error("%s: Missing key: steps" % self.get_config_filepath())
            sys.exit(1)
        re_simple_key = re.compile(r'^[a-zA-Z0-9_]+$')
        re_complex_key = re.compile(r'^([a-zA-Z0-9_]+)\s+\(([a-zA-Z0-9_]+)\)$')

        # step one: instantiate all steps
        for step_key, step_description in self.config['steps'].items():

            # the step keys in the configuration may be either:
            # - MODULE_NAME
            # - DIFFERENT_STEP_NAME\s+\(MODULE_NAME\)
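            # e.g. (hypothetical names):
            #   'cutadapt'              -> step 'cutadapt', module 'cutadapt'
            #   'trim_again (cutadapt)' -> step 'trim_again', module 'cutadapt'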
            step_name = None
            module_name = None
            if re_simple_key.match(step_key):
                step_name = step_key
                module_name = step_key
            else:
                match = re_complex_key.match(step_key)
                if match:
                    step_name = match.group(1)
                    module_name = match.group(2)
                else:
                    logger.error("%s: Invalid step key: %s" %
                                 (self.get_config_filepath(), step_key))
                    sys.exit(1)

            if step_name == 'temp':
                # A step cannot be named 'temp' because we need the out/temp
                # directory to store temporary files.
                logger.error("%s: A step name cannot be 'temp'." %
                             self.get_config_filepath())
                sys.exit(1)
            step_class = abstract_step.AbstractStep.get_step_class_for_key(
                module_name)
            step = step_class(self)

            step.set_step_name(step_name)
            step.set_options(step_description)

            self.steps[step_name] = step

        # step two: set dependencies
        for step_name, step in self.steps.items():
            if not step.needs_parents:
                if '_depends' in step._options:
                    logger.error("%s: %s must not have dependencies because "
                                 "it declares no in/* connections (remove the "
                                 "_depends key)." %
                                 (self.get_config_filepath(), step_name))
                    sys.exit(1)
            else:
                if not '_depends' in step._options:
                    logger.error("%s: Missing key in step '%s': _depends (set "
                                 "to null if the step has no dependencies)." %
                                 (self.get_config_filepath(), step_name))
                    sys.exit(1)
                depends = step._options['_depends']
                if depends is not None:
                    temp_list = depends
                    if isinstance(depends, str):
                        temp_list = [depends]
                    for d in temp_list:
                        if not d in self.steps:
                            logger.error(
                                "%s: Step %s specifies an undefined "
                                "dependency: %s." %
                                (self.get_config_filepath(), step_name, d))
                            sys.exit(1)
                        step.add_dependency(self.steps[d])

        # step three: perform topological sort, and fail if there's a cycle
        # (yeah, the algorithm is O(n^2), tsk, tsk...)

        unassigned_steps = set(self.steps.keys())
        assigned_steps = set()
        self.topological_step_order = []
        while len(unassigned_steps) > 0:
            # choose all steps whose dependencies are all resolved, either
            # because they have no dependencies or their dependencies are
            # already assigned
            next_steps = []
            for step_name in unassigned_steps:
                is_ready = True
                for dep in self.steps[step_name].dependencies:
                    dep_name = dep.get_step_name()
                    if not dep_name in assigned_steps:
                        is_ready = False
                        break
                if is_ready:
                    next_steps.append(step_name)
            if len(next_steps) == 0:
                logger.error("%s: There is a cycle in the step dependencies." %
                             self.get_config_filepath())
                sys.exit(1)
            for step_name in misc.natsorted(next_steps):
                self.topological_step_order.append(step_name)
                assigned_steps.add(step_name)
                unassigned_steps.remove(step_name)

        # step four: finalize step
        for step in self.steps.values():
            step.finalize()

    def print_source_runs(self):
        for step_name in self.topological_step_order:
            step = self.steps[step_name]
            if isinstance(step, abstract_step.AbstractSourceStep):
                for run_id in misc.natsorted(step.get_run_ids()):
                    print("%s/%s" % (step, run_id))

    def add_file_dependencies(self, output_path, input_paths):
        if output_path in self.file_dependencies:
            logger.error("Different steps/runs/tags want to create "
                         "the same output file: %s." % output_path)
            sys.exit(1)
        self.file_dependencies[output_path] = set(input_paths)

        for inpath in input_paths:
            if not inpath in self.file_dependencies_reverse:
                self.file_dependencies_reverse[inpath] = set()
            self.file_dependencies_reverse[inpath].add(output_path)

    def add_task_for_output_file(self, output_path, task_id):
        if output_path in self.task_id_for_output_file:
            logger.error("More than one step is trying to create the "
                         "same output file: %s." % output_path)
            sys.exit(1)
        self.task_id_for_output_file[output_path] = task_id

        if not task_id in self.output_files_for_task_id:
            self.output_files_for_task_id[task_id] = set()
        self.output_files_for_task_id[task_id].add(output_path)

    def add_task_for_input_file(self, input_path, task_id):
        if not input_path in self.task_ids_for_input_file:
            self.task_ids_for_input_file[input_path] = set()
        self.task_ids_for_input_file[input_path].add(task_id)

        if not task_id in self.input_files_for_task_id:
            self.input_files_for_task_id[task_id] = set()
        self.input_files_for_task_id[task_id].add(input_path)

    def check_command(self, command):
        for argument in command:
            if not isinstance(argument, str):
                logger.error(
                    "The command to be launched '%s' contains non-string "
                    "argument '%s'. Therefore the command will fail. Please "
                    "fix this type issue." % (command, argument))
                sys.exit(1)
        return

    def exec_pre_post_calls(self, tool_id, info_key, info_command,
                            tool_check_info):
        if isinstance(info_command, str):
            info_command = [info_command]
        for command in info_command:
            if isinstance(command, str):
                command = command.split()
            self.check_command(command)
            logger.info("Executing command: %s" % " ".join(command))
            try:
                proc = subprocess.Popen(command,
                                        stdin=None,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        close_fds=True)

            except OSError as e:
                logger.error("%s: Error while executing '%s' for %s: %s "
                             "Error no.: %s Error message: %s" %
                             (self.get_config_filepath(), info_key, tool_id,
                              " ".join(command), e.errno, e.strerror))
                sys.exit(1)

            command_call = info_key
            command_exit_code = '%s-exit-code' % info_key
            command_response = '%s-response' % info_key
            (output, error) = proc.communicate()
            if info_key in ['module_load', 'module_unload']:
                logger.info("Try '%s' for '%s': %s" %
                            (info_key, tool_id, " ".join(command)))
                exec(output)
                tool_check_info.update({
                    command_call: (' '.join(command)).strip(),
                    command_exit_code:
                    proc.returncode
                })
                sys.stderr.write(error)
                sys.stderr.flush()
            else:
                tool_check_info.update({
                    command_call: (' '.join(command)).strip(),
                    command_exit_code:
                    proc.returncode,
                    command_response: (output + error)
                })

        return tool_check_info

    def check_tools(self):
        '''
        checks whether all tools referenced by the configuration are available
        and records their versions as determined by ``[tool] --version`` etc.
        '''
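        # A sketch of a single 'tools' entry, based only on the keys this
        # method and exec_pre_post_calls() read; the tool name and values are
        # made up:
        #
        #   tools:
        #       my_tool:
        #           path: my_tool             # string or list
        #           get_version: '--version'  # appended to the command
        #           exit_code: 0              # expected exit code (default 0)
        #           module_load: ...          # optional, executed beforehand
        #           module_unload: ...        # optional, executed afterwards
        #           pre_command: ...          # optional
        #           post_command: ...         # optional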
        if not 'tools' in self.config:
            return
        for tool_id, info in self.config['tools'].items():
            tool_check_info = dict()

            # Load module(s) and execute command if configured
            for pre_cmd in (x for x in ('module_load', 'pre_command')
                            if x in info):
                tool_check_info = self.exec_pre_post_calls(
                    tool_id, pre_cmd, info[pre_cmd], tool_check_info)

            # Execute command to check if tool is available
            if isinstance(info['path'], list):
                command = copy.deepcopy(info['path'])
            else:
                command = [copy.deepcopy(info['path'])]
            self.check_command(command)
            if 'get_version' in info:
                command.append(info['get_version'])
            try:
                proc = subprocess.Popen(command,
                                        stdin=subprocess.PIPE,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        close_fds=True)
                proc.stdin.close()
            except OSError as e:
                logger.error("%s: Error while checking Tool %s "
                             "Error no.: %s Error message: %s\ncommand: %s "
                             "\nSTDOUT-ERR: %s\n" %
                             (self.get_config_filepath(), info['path'],
                              e.errno, e.strerror, command, subprocess.PIPE))
                sys.exit(1)
            proc.wait()
            exit_code = proc.returncode
            tool_check_info.update({
                'command': (' '.join(command)).strip(),
                'exit_code':
                exit_code,
                'response': (proc.stdout.read() + proc.stderr.read()).strip()
            })
            # print("Command: %s" % tool_check_info['command'])
            # print("Exit Code: %s" % tool_check_info['exit_code'])
            # print("Response: %s" % tool_check_info['response'])
            expected_exit_code = 0
            if 'exit_code' in info:
                expected_exit_code = info['exit_code']
            if exit_code != expected_exit_code:
                logger.error(
                    "%s: Tool check failed for %s: %s - exit code is: %d "
                    "(expected %d) (response %s)" %
                    (self.get_config_filepath(), tool_id, ' '.join(command),
                     exit_code, expected_exit_code,
                     tool_check_info['response']))
                sys.exit(1)
            # Execute clean-up command (if configured)
            for info_key in (x for x in ('module_unload', 'post_command')
                             if x in info):
                tool_check_info = self.exec_pre_post_calls(
                    tool_id, info_key, info[info_key], tool_check_info)
            # Store captured information
            self.tool_versions[tool_id] = tool_check_info

    def check_ping_files(self,
                         print_more_warnings=False,
                         print_details=False,
                         fix_problems=False):
        run_problems = list()
        queue_problems = list()
        check_queue = True

        try:
            stat_output = subprocess.check_output(
                [self.get_cluster_command('stat')], stderr=subprocess.STDOUT)
        except (KeyError, OSError, subprocess.CalledProcessError):
            # this host does not have a usable cluster stat tool (or calling
            # it failed), so we cannot check the queue
            check_queue = False

        if print_more_warnings and not check_queue:
            try:
                ce = self.get_cluster_command('stat')
            except KeyError:
                ce = "a cluster engine"
            print(
                "Attention, we cannot check for stale queued ping files "
                "because this host does not have %s." % ce)

        running_jids = set()

        if check_queue:
            for line in stat_output.split("\n"):
                try:
                    jid = int(line.strip().split(' ')[0])
                    running_jids.add(str(jid))
                except ValueError:
                    # this is not a JID
                    pass

        now = datetime.datetime.now()
        for task in self.all_tasks_topologically_sorted:
            exec_ping_file = task.get_run().get_executing_ping_file()
            queued_ping_file = task.get_run().get_queued_ping_file()
            if os.path.exists(exec_ping_file):
                info = yaml.load(open(exec_ping_file, 'r'))
                start_time = info['start_time']
                last_activity = datetime.datetime.fromtimestamp(
                    abstract_step.AbstractStep.fsc.getmtime(exec_ping_file))
                last_activity_difference = now - last_activity
                if last_activity_difference.total_seconds() > \
                   abstract_step.AbstractStep.PING_TIMEOUT:
                    run_problems.append(
                        (task, exec_ping_file, last_activity_difference,
                         last_activity - start_time))

            if os.path.exists(queued_ping_file) and check_queue:
                info = yaml.load(open(queued_ping_file, 'r'))
                if not str(info['job_id']) in running_jids:
                    queue_problems.append(
                        (task, queued_ping_file, info['submit_time']))

        show_hint = False

        if len(run_problems) > 0:
            show_hint = True
            label = "Warning: There are %d stale run ping files." % len(
                run_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                run_problems = sorted(run_problems,
                                      key=itemgetter(2, 3),
                                      reverse=True)
                for problem in run_problems:
                    task = problem[0]
                    path = problem[1]
                    last_activity_difference = problem[2]
                    ran_for = problem[3]
                    print("dead since %13s, ran for %13s: %s" %
                          (misc.duration_to_str(last_activity_difference),
                           misc.duration_to_str(ran_for), task))
                print("")

        if len(queue_problems) > 0:
            show_hint = True
            label = "Warning: There are %d tasks marked as queued, but they do not seem to be queued." % len(
                queue_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                queue_problems = sorted(queue_problems,
                                        key=itemgetter(2),
                                        reverse=True)
                for problem in queue_problems:
                    task = problem[0]
                    path = problem[1]
                    start_time = problem[2]
                    print("submitted at %13s: %s" % (start_time, task))
                print("")

        if fix_problems:
            all_problems = run_problems
            all_problems.extend(queue_problems)
            for problem in all_problems:
                path = problem[1]
                print("Now deleting %s..." % path)
                os.unlink(path)

        if show_hint:
            if (print_more_warnings and not print_details) or not fix_problems:
                print(
                    "Hint: Run 'uap %s fix-problems --details' to see the "
                    "details." % self.get_config_filepath())
            if not fix_problems:
                print(
                    "Hint: Run 'uap %s fix-problems --srsly' to fix these "
                    "problems (that is, delete all problematic ping files)." %
                    self.get_config_filepath())

    def check_volatile_files(self, details=False, srsly=False):
        collected_files = set()
        for task in self.all_tasks_topologically_sorted:
            collected_files |= task.volatilize_if_possible(srsly)
        if not srsly and len(collected_files) > 0:
            if details:
                for path in sorted(collected_files):
                    print(path)
            total_size = 0
            for path in collected_files:
                total_size += os.path.getsize(path)
            print(
                "Hint: You could save %s of disk space by volatilizing %d "
                "output files." %
                (misc.bytes_to_str(total_size), len(collected_files)))
            print("Call 'uap %s volatilize --srsly' to purge the files." %
                  self.get_config_filepath())

    def autodetect_cluster_type(self):
        cluster_config = self.get_cluster_config()
        # Let's see if we can successfully run a cluster identity test
        # Test all configured cluster types
        for cluster_type in cluster_config.keys():
            # Do we have an identity test command
            identity = dict()
            for key in ['test', 'answer']:
                try:
                    identity[key] = cluster_config[cluster_type]\
                                        ['identity_%s' % key]
                except KeyError:
                    logger.error(
                        "%s: Missing 'identity_%s' for %s "
                        "cluster type." %
                        (self._cluster_config_path, key, cluster_type))
                    sys.exit(1)
            # Now that we know let's test for that cluster
            try:
                if (subprocess.check_output(identity['test']).startswith(
                        identity['answer'])):
                    return cluster_type
            except OSError:
                pass
        return None

    def get_cluster_type(self):
        return self.cluster_type

    def set_cluster_type(self, cluster_type):
        if cluster_type not in self.get_cluster_config():
            logger.info("No cluster type detected.")
            self.cluster_type = None
        else:
            self.cluster_type = cluster_type

    def get_cluster_command(self, key):
        '''
        Shorthand to retrieve a cluster-type-dependent command or filename
        (cc == cluster command).
        '''
        return self.get_cluster_config()[self.get_cluster_type()][key]

    def get_cluster_command_cli_option(self, key, value):
        '''
        Shorthand to retrieve a cluster-type-dependent command line part (this
        is a list).
        '''
        result = self.get_cluster_config()[self.get_cluster_type()][key]
        if '%s' in result:
            return [result % value]
        else:
            return [result, value]
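
    # Usage sketch for get_cluster_command_cli_option() (the key and values
    # are hypothetical cluster config entries):
    #
    #   * value contains a '%s' placeholder, e.g. '--job-name=%s':
    #         get_cluster_command_cli_option('set_job_name', 'run1')
    #         -> ['--job-name=run1']
    #   * plain value, e.g. '-N':
    #         get_cluster_command_cli_option('set_job_name', 'run1')
    #         -> ['-N', 'run1']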
Example #2
class AbstractStep(object):

    PING_TIMEOUT = 300
    PING_RENEW = 30
    VOLATILE_SUFFIX = '.volatile.placeholder.yaml'
    UNDERSCORE_OPTIONS = [
        '_depends', '_volatile', '_BREAK', '_connect',
        '_cluster_submit_options', '_cluster_pre_job_command',
        '_cluster_post_job_command', '_cluster_job_quota'
    ]

    states = misc.Enum(['DEFAULT', 'EXECUTING'])

    def __init__(self, pipeline):

        self._pipeline = pipeline

        self.dependencies = list()
        '''
        All steps this step depends on.
        '''

        self._options = dict()
        '''
        Options as specified in the configuration.
        '''

        self._step_name = self.__module__
        '''
        By default, this is the name of the module. Can be overridden
        to allow for multiple steps of the same kind.
        '''

        self._runs = None
        '''
        Cached run information. ``declare_runs`` is only called once, the
        post-processed run objects are stored in here.
        '''

        self._pipeline_log = dict()

        self._cores = 1
        self._connections = set()
        self._optional_connections = set()
        self._connection_formats = dict()
        self._connection_descriptions = dict()
        self._pre_command = dict()
        self._post_command = dict()
        self._module_load = dict()
        self._module_unload = dict()
        self._tools = dict()

        self._defined_options = dict()

        self.needs_parents = False

        self.children_step_names = set()

        self.finalized = False

        self._state = AbstractStep.states.DEFAULT

        self._submit_script = None

    def finalize(self):
        '''Finalizes the step.

        The intention is to make further changes to the step
        impossible, but apparently, it's checked nowhere at the moment.
        '''
        if self.finalized:
            return

        for parent_step in self.dependencies:
            parent_step.finalize()

        self.finalized = True

    def _reset(self):
        self._pipeline_log = dict()

    def get_pipeline(self):
        return self._pipeline

    def declare_run(self, run_id):
        '''
        Declare a run. Use it like this::

            with self.declare_run(run_id) as run:
                # add output files and information to the run here
        '''
        # Replace whitespaces by underscores
        run_id = re.sub(r'\s', '_', run_id)
        if run_id in self._runs:
            raise UAPError("Cannot declare the same run ID twice: %s." %
                           run_id)
        run = Run(self, run_id)
        self.add_run(run)
        return run

    def add_run(self, run):
        self._runs[run.get_run_id()] = run

    def get_run(self, run_id):
        '''
        Returns a single run object for run_id or None.
        '''
        if run_id in self._runs:
            return self._runs[run_id]
        else:
            return None

    def set_step_name(self, step_name):
        '''
        Change the step name.

        The step name is initially set to the module name. This method
        is used in case we need multiple steps of the same kind.
        '''
        self._step_name = step_name

    def set_options(self, options):
        '''
        Checks and stores step options.

        The options are either set to values given in YAML config or
        the default values set in self.add_option().
        '''
        self._options = dict()

        # set options
        for key, value in options.items():
            if key[0] == '_':
                if key not in AbstractStep.UNDERSCORE_OPTIONS:
                    raise UAPError("Invalid option in %s: %s" % (key, self))
                self._options[key] = value
            else:
                if key not in self._defined_options:
                    message = "Unknown option in %s (%s): %s." % \
                        (self.get_step_name(), self.get_step_type(), key)
                    logger.error(
                        message + "\nAvailable options are:\n%s" %
                        yaml.dump(self._defined_options, Dumper=misc.UAPDumper)
                    )
                    raise UAPError(message)
                if value is not None and type(
                        value) not in self._defined_options[key]['types']:
                    raise UAPError(
                        "Invalid type for option %s - it's %s and should be "
                        "one of %s." % (key, type(value),
                                        self._defined_options[key]['types']))
                if self._defined_options[key]['choices'] is not None and \
                   value not in self._defined_options[key]['choices']:
                    raise UAPError(
                        "Invalid value '%s' specified for option %s - "
                        "possible values are %s." %
                        (value, key, self._defined_options[key]['choices']))
                self._options[key] = value

        # set default values for unset options and make sure all required
        # options have been set
        for key, info in self._defined_options.items():
            if key not in self._options:
                if info['optional'] is not True:
                    raise UAPError("Required option not set in %s: %s." %
                                   (self, key))
                self._options[key] = info['default']

        self._options.setdefault('_volatile', False)

        for i in [
                '_cluster_submit_options', '_cluster_pre_job_command',
                '_cluster_post_job_command'
        ]:
            self._options.setdefault(i, '')
        self._options.setdefault('_cluster_job_quota', 0)

        self._options.setdefault('_connect', dict())
        self._options.setdefault('_depends', list())
        if not isinstance(self._options['_depends'], list):
            self._options['_depends'] = [self._options['_depends']]
        # add implied dependencies (see the illustration after this loop)
        for in_cons in self._options['_connect'].values():
            in_cons = in_cons if isinstance(in_cons, list) else [in_cons]
            for parent_cons in in_cons:
                parent = parent_cons.split("/")[0]
                if parent not in self._options['_depends'] \
                        and parent != 'empty':
                    # We cannot use sets here since the order of
                    # dependencies matters in rare cases, e.g., collect_scs.
                    self._options['_depends'].append(parent)
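
        # Illustration (hypothetical step/connection names): a block like
        #
        #   _connect:
        #       in/alignments: my_mapper/alignments
        #
        # implicitly appends 'my_mapper' to _depends, so the parent does not
        # have to be listed twice in the configuration.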

    def get_options(self):
        '''
        Returns a dictionary of all given options
        '''
        return self._options

    def get_option(self, key):
        """
        Query an option.
        """
        if key not in self._defined_options:
            raise UAPError("Cannot query undefined option %s in step %s." %
                           (key, self.__module__))
        return self._options[key]

    def is_option_set_in_config(self, key):
        """
        Determine whether an optional option (that is, a non-required option)
        has been set in the configuration.
        """
        if key not in self._defined_options:
            raise UAPError("Cannot query undefined option %s in step %s." %
                           (key, self.get_step_name()))
        is_set = key in self._options
        if is_set:
            if isinstance(self._options[key], list):
                is_set = any([v is not None for v in self._options[key]])
            else:
                is_set = self._options[key] is not None
        return is_set

    def is_volatile(self):
        return self._options['_volatile']

    def add_dependency(self, parent):
        '''
        Add a parent step to this step's dependencies.

        parent -- parent step this step depends on
        '''
        if not isinstance(parent, AbstractStep):
            raise UAPError("Error: parent argument must be an AbstractStep.")
        if parent == self:
            raise UAPError("Cannot add a node as its own dependency.")
        self.dependencies.append(parent)
        parent.children_step_names.add(str(self))

    def get_dependencies(self):
        return self.dependencies

    def get_input_runs(self):
        '''
        Return a dict which contains all runs per parent step.
        '''
        input_runs = dict()
        for parent in self.get_dependencies():
            input_runs[parent.get_step_name()] = parent.get_runs()
        return input_runs

    def declare_runs(self):
        # fetch all incoming run IDs which produce reads...
        self.runs(self.get_run_ids_in_connections_input_files())
        self.check_required_out_connections()

    def check_required_out_connections(self):
        '''
        This functions tests if all required out connections
        were set by all runs.
        '''
        required_out = self.get_out_connections(with_optional=False)
        bad_runs = 0
        for run_id, run in self._runs.items():
            used_conns = set()
            for connection, content in run._output_files.items():
                used = any([fl is not None for fl in content.keys()])
                if used:
                    used_conns.add(connection)
            missings = required_out - used_conns
            if missings:
                bad_runs += 1
                logger.warning(
                    'Run "%s" of step "%s" misses the required '
                    'connections %s. To remove this warning pass '
                    'optional=True to the add_connection method in the '
                    'step constructor __init__ of "%s".' %
                    (run_id, self.get_step_name(), list(missings),
                     self.get_step_type()))
            if bad_runs == 5:
                logger.warning('... Omitting connection test for further '
                               'runs of "%s".' % self.get_step_name())
                break
        if bad_runs:
            logger.warning(
                '[Deprecation] Unmet required connections '
                'may trigger an error in a future version of the UAP.')

    def get_output_directory(self):
        '''
        Returns the step output directory.
        '''
        return os.path.join(self.get_pipeline().config['destination_path'],
                            self.get_step_name())

    def get_submit_script_file(self):
        if self._submit_script is None:
            self._submit_script = os.path.join(
                self.get_output_directory(),
                ".submit-%s.sh" % self.get_step_name())
        return self._submit_script

    def runs(self, run_ids_connections_files):
        '''
        Abstract method; it must be implemented by each concrete step.

        Raises NotImplementedError if the subclass does not override this
        method.
        '''
        raise NotImplementedError()
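
    # A rough sketch of how a concrete step might override runs(); the loop
    # body and any step-specific options are assumptions, only declare_run()
    # is part of this base class:
    #
    #   def runs(self, run_ids_connections_files):
    #       for run_id in run_ids_connections_files.keys():
    #           with self.declare_run(run_id) as run:
    #               ...  # add output files and commands to the run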

    def execute(self, run_id, run):
        # get run_info objects
        with self.get_run(run_id) as run:
            logger.info("Run ID: %s" % run_id)
            # for each exec_group in that run ...
            for exec_group in run.get_exec_groups():
                # ... create a process pool
                with process_pool.ProcessPool(run) as pool:
                    # Clean up (use last ProcessPool for that)
                    if exec_group == run.get_exec_groups()[-1]:
                        logger.info("Telling pipeline to clean up!")
                        pool.clean_up_temp_paths()

                    for poc in exec_group.get_pipes_and_commands():
                        # for each pipe or command (poc)
                        # check if it is a pipeline ...
                        if isinstance(poc, pipeline_info.PipelineInfo):
                            # ... create a pipeline ...
                            with pool.Pipeline(pool) as pipeline:
                                for command in poc.get_commands():
                                    pipeline.append(
                                        command.get_command(),
                                        stdout_path=command.get_stdout_path(),
                                        stderr_path=command.get_stderr_path())
                        elif isinstance(poc, command_info.CommandInfo):
                            pool.launch(poc.get_command(),
                                        stdout_path=poc.get_stdout_path(),
                                        stderr_path=poc.get_stderr_path())

    def get_runs(self):
        '''
        Getter method for runs of this step.

        If no runs exist when this method is called, they are created here.
        '''
        # create runs if they don't exist yet
        if not self._runs:
            # if _BREAK: true is specified in the configuration,
            # return no runs and thus cut off further processing
            if '_BREAK' in self._options and self._options['_BREAK']:
                return dict()

            self._runs = dict()
            self.declare_runs()

            # define file dependencies
            for run_id in self._runs.keys():
                pipeline = self.get_pipeline()
                run = self.get_run(run_id)
                for connection in run.get_output_files_abspath().keys():
                    for output_path, input_paths in \
                            run.get_output_files_abspath()[connection].items():
                        # proceed if we have normal output_path/input_paths
                        if output_path is not None and input_paths is not None:
                            # store file dependencies
                            pipeline.add_file_dependencies(
                                output_path, input_paths)
                            # create task ID
                            task_id = '%s/%s' % (str(self), run_id)
                            pipeline.add_task_for_output_file(
                                output_path, task_id)
                            # No input paths? Add empty string NOT None
                            # as file name
                            if len(input_paths) == 0:
                                pipeline.add_task_for_input_file("", task_id)
                            for input_path in input_paths:
                                pipeline.add_task_for_input_file(
                                    input_path, task_id)

        # now that _runs exists, it remains constant, just return it
        return self._runs

    def reset_run_caches(self):
        for run in self.get_runs().values():
            run.fsc.clear()

    def get_run_ids(self):
        '''
        Returns sorted list of runs generated by step.
        '''
        return sorted(self.get_runs().keys())

    def get_step_name(self):
        '''
        Returns this step's name.

        The step name is initially equal to the step type (== module name)
        but can be changed via set_step_name() or via the YAML configuration.
        '''
        return self._step_name

    def get_step_type(self):
        '''
        Returns the original step name (== module name).
        '''
        return self.__module__

    def remove_ping_file(self, ping_path, bad_copy=False):
        # depending on the flags, the ping file is renamed or backed up
        # (rather than just removed) so we can inspect it later
        try:
            backup = self.get_pipeline().args.debugging
        except AttributeError:
            backup = False
        suffix = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        if os.path.exists(ping_path):
            try:
                out_w_suffix = ping_path + '.' + suffix
                if bad_copy:
                    out_w_bad = ping_path + '.bad'
                    os.rename(ping_path, out_w_bad)
                    if backup:
                        copyfile(out_w_bad, out_w_suffix)
                    logger.debug('The run ping file "%s" was moved to "%s" '
                                 'and copied to "%s" by host %s.' %
                                 (ping_path, out_w_bad, out_w_suffix,
                                  socket.gethostname()))
                elif backup:
                    os.rename(ping_path, out_w_suffix)
                    logger.debug(
                        'The run ping file "%s" was moved to "%s" '
                        'by host %s.' %
                        (ping_path, out_w_suffix, socket.gethostname()))
                else:
                    os.unlink(ping_path)
                    logger.debug('The run ping file "%s" was removed by %s.' %
                                 (ping_path, socket.gethostname()))
            except OSError as e:
                logger.debug('The run ping file "%s" could not be moved: %s' %
                             (ping_path, str(e)))
                pass
        else:
            logger.debug('This run ping file was not found: %s' % ping_path)

    def run(self, run_id):
        '''
        Create a temporary output directory and execute a run. After the run
        has finished, it is checked that all output files are in place and
        the output files are moved to the final output location. Finally,
        YAML annotations are written.
        '''

        # this is the run we'll execute now
        run = self.get_run(run_id)

        # create the output directory if it doesn't exist yet
        if not os.path.isdir(run.get_output_directory()):
            os.makedirs(run.get_output_directory())

        # determine the path of the run ping file
        executing_ping_path = run.get_executing_ping_file()

        if os.path.exists(executing_ping_path):
            raise UAPError("%s/%s seems to be already running, exiting..." %
                           (self, run_id))
        queued_ping_path = run.get_queued_ping_file()
        try:
            with open(queued_ping_path, 'r') as buff:
                info = yaml.load(buff, Loader=yaml.FullLoader)
            job_id = info['cluster job id']
        except (IOError, KeyError):
            job_id = None

        # create a temporary directory for the output files
        temp_directory = run.get_temp_output_directory()
        os.makedirs(temp_directory)

        # prepare known_paths
        known_paths = dict()
        for tag, tag_info in run.get_output_files_abspath().items():
            for output_path, input_paths in tag_info.items():
                # add the real output path
                if output_path is not None and input_paths is not None:
                    known_paths[output_path] = {
                        'designation': 'output',
                        'label': os.path.basename(output_path),
                        'type': 'step_file'
                    }
                    # ...and also add the temporary output path
                    known_paths[os.path.join(
                        temp_directory, os.path.basename(output_path))] = {
                            'designation': 'output',
                            'label':
                            "%s\\n(%s)" % (os.path.basename(output_path), tag),
                            'type': 'step_file',
                            'real_path': output_path
                        }
                    for input_path in input_paths:
                        if input_path is not None:
                            known_paths[input_path] = {
                                'designation': 'input',
                                'label': os.path.basename(input_path),
                                'type': 'step_file'
                            }

        # now write the run ping file
        executing_ping_info = dict()
        executing_ping_info['start_time'] = datetime.now()
        executing_ping_info['host'] = socket.gethostname()
        executing_ping_info['pid'] = os.getpid()
        executing_ping_info['user'] = pwd.getpwuid(os.getuid())[0]
        executing_ping_info['temp_directory'] = run.get_temp_output_directory()
        if job_id:
            executing_ping_info['cluster job id'] = job_id

        with open(executing_ping_path, 'w') as f:
            f.write(yaml.dump(executing_ping_info, default_flow_style=False))

        executing_ping_pid = os.fork()
        if executing_ping_pid == 0:
            # this is the child process
            try:
                signal.signal(signal.SIGTERM, signal.SIG_DFL)
                signal.signal(signal.SIGINT, signal.SIG_IGN)
                while True:
                    time.sleep(AbstractStep.PING_RENEW)
                    # if the executing ping file is gone and the touching
                    # operation fails, then SO BE IT!
                    os.utime(executing_ping_path, None)
            finally:
                os._exit(0)

        def kill_exec_ping():
            try:
                os.kill(executing_ping_pid, signal.SIGTERM)
                os.waitpid(executing_ping_pid, 0)
            except OSError:
                # if the ping process was already killed, it's gone anyway
                pass
            self.remove_ping_file(executing_ping_path)

        p = self.get_pipeline()

        def ping_on_term(signum, frame):
            logger.warning('Received SIGTERM and moving execution ping file...')
            kill_exec_ping()
            self.remove_ping_file(queued_ping_path, bad_copy=True)
            p.caught_signal = signum
            process_pool.ProcessPool.kill()
            raise UAPError('Received TERM signal (canceled job).')

        def ping_on_int(signum, frame):
            logger.warning('Received SIGINT and moving execution ping file...')
            kill_exec_ping()
            self.remove_ping_file(queued_ping_path, bad_copy=True)
            p.caught_signal = signum
            process_pool.ProcessPool.kill()
            raise UAPError('Received INT signal (keyboard interrupt).')

        original_term_handler = signal.signal(signal.SIGTERM, ping_on_term)
        original_int_handler = signal.signal(signal.SIGINT, ping_on_int)

        self.start_time = datetime.now()
        message = "[START] starting %s/%s on %s" % \
            (self, run_id, socket.gethostname())
        if job_id:
            message += " with job id %s" % job_id
        p.notify(message)
        caught_exception = None
        self._state = AbstractStep.states.EXECUTING
        base_working_dir = os.getcwd()
        os.chdir(run.get_temp_output_directory())
        try:
            self.execute(run_id, run)
        except BaseException:
            # Oh my. We have a situation. This is awkward. Tell the process
            # pool to wrap up. This way, we can try to get process stats before
            # shutting everything down.
            process_pool.ProcessPool.kill()
            # Store the exception, re-raise it later
            caught_exception = sys.exc_info()
            error = ''.join(
                traceback.format_exception(*caught_exception)[-2:]).strip()
            logger.debug(error)
        finally:
            signal.signal(signal.SIGTERM, original_term_handler)
            signal.signal(signal.SIGINT, original_int_handler)
            self._state = AbstractStep.states.DEFAULT  # changes relative paths
            os.chdir(base_working_dir)

        self.end_time = datetime.now()
        # the step has completed; invalidate the FS cache because things have
        # changed by now...
        run.reset_fsc()

        to_be_moved = dict()
        if not p.caught_signal and not caught_exception:
            # if we're here, we can assume the step has finished successfully
            # now log file stats

            try:
                for tag in run.get_output_files().keys():
                    for out_file in run.get_output_files()[tag].keys():
                        # don't try to rename files if they were not meant to exist
                        # in our temporary directory
                        # 1. out_file should not be None (empty output connection)
                        # 2. out_file should not contain a '/' (file belongs to a
                        #    source step)
                        if out_file is None or '/' in out_file:
                            continue
                        source_path = os.path.join(
                            run.get_temp_output_directory(),
                            os.path.basename(out_file))
                        new_path = os.path.join(run.get_output_directory(),
                                                os.path.basename(out_file))
                        # first, delete a possibly existing volatile placeholder
                        # file
                        path_volatile = new_path + AbstractStep.VOLATILE_SUFFIX
                        if os.path.exists(path_volatile):
                            logger.info("Now deleting: %s" % path_volatile)
                            os.unlink(path_volatile)
                        if os.path.exists(source_path):
                            known_paths.pop(source_path, None)
                            known_paths.setdefault(new_path, dict())
                            if known_paths[new_path].get(
                                    'designation') == 'output':
                                to_be_moved[source_path] = new_path
                                size = run.fsc.getsize(source_path)
                                mtime = datetime.fromtimestamp(
                                    run.fsc.getmtime(source_path))
                                known_paths[new_path]['size'] = size
                                known_paths[new_path][
                                    'modification time'] = mtime
                            if known_paths[new_path].get('type') != 'step_file':
                                logger.debug(
                                    "Set %s 'type' info to 'step_file'" %
                                    new_path)
                                known_paths[new_path]['type'] = 'step_file'
                        else:
                            raise UAPError(
                                'The step failed to produce an '
                                'announced output file: "%s".\n'
                                'Source file doesn\'t exists: "%s"' %
                                (out_file, source_path))
            except BaseException:
                caught_exception = sys.exc_info()

        pool = None

        class SignalError(Exception):
            def __init__(self, signum):
                self.signum = signum
                m = "Recived signal %s during hashing!" % \
                    process_pool.ProcessPool.SIGNAL_NAMES[signum]
                super(SignalError, self).__init__(m)

        if caught_exception is None and to_be_moved:
            p.notify("[INFO] %s/%s hashing %d output file(s)." %
                     (str(self), run_id, len(to_be_moved)))
            if p.has_interactive_shell() \
                    and logger.getEffectiveLevel() > 20:
                show_progress = True
            else:
                show_progress = False
            try:

                def stop(signum, frame):
                    raise SignalError(signum)

                original_term_handler = signal.signal(signal.SIGTERM, stop)
                original_int_handler = signal.signal(signal.SIGINT, stop)
                pool = multiprocessing.Pool(self.get_cores())
                total = len(to_be_moved)
                file_iter = pool.imap(misc.sha_and_file, to_be_moved.keys())
                file_iter = tqdm(
                    file_iter,
                    total=total,
                    leave=False,
                    bar_format='{desc}:{percentage:3.0f}%|{bar:10}{r_bar}',
                    disable=not show_progress,
                    desc='files')
                for i, (hashsum, path) in enumerate(file_iter):
                    run.fsc.sha256sum_of(to_be_moved[path], value=hashsum)
                    known_paths[to_be_moved[path]]['sha256'] = hashsum
                    if not show_progress:
                        logger.info("sha256 [%d/%d] %s %s" %
                                    (i + 1, total, hashsum, path))
            except BaseException:
                caught_exception = sys.exc_info()
                try:
                    # removing the progress bar
                    file_iter.close()
                except BaseException:
                    pass
                error = caught_exception[1]
                if caught_exception[0] is SignalError:
                    p.caught_signal = error.signum
                logger.error(error)
                if pool:
                    pool.terminate()
            else:
                pool.close()
            signal.signal(signal.SIGTERM, original_term_handler)
            signal.signal(signal.SIGINT, original_int_handler)

        run.add_known_paths(known_paths)
        if not p.caught_signal and not caught_exception:
            try:
                for source_path, new_path in to_be_moved.items():
                    logger.debug("Moving %s to %s." % (source_path, new_path))
                    os.rename(source_path, new_path)
            except BaseException:
                caught_exception = sys.exc_info()

        error = None
        if p.caught_signal is not None:
            signum = p.caught_signal
            signame = process_pool.ProcessPool.SIGNAL_NAMES[signum]
            error = 'Pipeline stopped because it caught signal %d - %s' % \
                    (signum, signame)
        elif caught_exception is not None:
            error = ''.join(
                traceback.format_exception(*caught_exception)[-2:]).strip()
        annotation_path = run.write_annotation_file(run.get_output_directory(),
                                                    error=error,
                                                    job_id=job_id)

        kill_exec_ping()
        self._state = AbstractStep.states.DEFAULT

        if error:
            message = "[BAD] %s/%s failed on %s after %s\n" % \
                      (str(self), run_id, socket.gethostname(),
                       misc.duration_to_str(self.end_time - self.start_time))
            message += "Here are the details: " + annotation_path + '\n'
            attachment = None
            if os.path.exists(annotation_path + '.png'):
                attachment = dict()
                attachment['name'] = 'details.png'
                attachment['data'] = open(annotation_path + '.png', 'rb').read()
            p.notify(message, attachment)
            self.remove_ping_file(queued_ping_path, bad_copy=True)
            if caught_exception is not None:
                raise caught_exception[1].with_traceback(caught_exception[2])

        else:
            # finally, remove the temporary directory if it's empty
            try:
                os.rmdir(temp_directory)
            except OSError as e:
                logger.info('Could not remove temp dir "%s": %s' %
                            (temp_directory, e))
            temp = os.path.normpath(os.path.join(temp_directory, '..'))
            try:
                os.rmdir(temp)
            except OSError:
                # there may still be tasks in process
                pass

            remaining_task_info = self.get_run_info_str()

            message = "[OK] %s/%s successfully finished on %s after %s\n" % \
                      (str(self), run_id, socket.gethostname(),
                       misc.duration_to_str(self.end_time - self.start_time))
            message += str(self) + ': ' + remaining_task_info + "\n"
            attachment = None
            if os.path.exists(annotation_path + '.png'):
                attachment = dict()
                attachment['name'] = 'details.png'
                attachment['data'] = open(annotation_path + '.png', 'rb').read()
            p.notify(message, attachment)
            self.remove_ping_file(queued_ping_path)

            self._reset()

        if pool is not None:
            pool.join()

    def get_pre_commands(self):
        """
        Return dictionary with commands to execute before starting any other
        command of this step
        """
        return self._pre_command

    def get_module_loads(self):
        """
        Return dictionary with module load commands to execute before starting
        any other command of this step
        """
        return self._module_load

    def get_tool(self, key):
        """
        Return full path to a configured tool.
        """
        if key not in self._tools:
            raise UAPError("Tool %s unknown. Maybe you forgot to use "
                           "self.require_tool('%s')" % (key, key))
        return self._tools[key]

    def get_path_tool(self):
        '''
        Returns a dict mapping each tool's configured path to its tool name.
        '''
        return {' '.join(path): tool for tool, path in self._tools.items()}

    @property
    def used_tools(self):
        return set(self._tools.keys())

    def get_module_unloads(self):
        """
        Return dictionary with module unload commands to execute before
        starting any other command of this step
        """
        return self._module_unload

    def get_post_commands(self):
        """
        Return dictionary with commands to execute after finishing any other
        command of this step
        """
        return self._post_command

    def get_run_info_str(self, progress=False, do_hash=False):
        count = {}
        runs = self.get_runs()
        run_iter = tqdm(runs,
                        total=len(runs),
                        desc='runs',
                        bar_format='{desc}:{percentage:3.0f}%|{bar:10}{r_bar}',
                        disable=not progress,
                        leave=False)
        try:
            for run in run_iter:
                if isinstance(run, str):
                    run = self.get_run(run)
                state = run.get_state(do_hash=do_hash)
                if state not in count:
                    count[state] = 0
                count[state] += 1
        except BaseException:
            run_iter.close()
            raise
        return ', '.join([
            "%d %s" % (count[_], _.lower())
            for _ in self.get_pipeline().states.order if _ in count
        ])
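
    # The returned summary is a comma-separated count per task state in the
    # pipeline's state order, e.g. (hypothetical numbers):
    # "5 finished, 2 ready, 1 waiting".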

    def append_pipeline_log(self, log):
        if len(self._pipeline_log) == 0:
            self._pipeline_log = log
        else:
            for k in log.keys():
                if k == 'process_watcher':
                    for k2 in log[k].keys():
                        if k2 == 'max':
                            for _ in log[k][k2].keys():
                                if _ == 'sum':
                                    for k3 in self._pipeline_log[k][k2][
                                            _].keys():
                                        self._pipeline_log[k][k2][_][k3] = \
                                            max(self._pipeline_log[k][k2][_][k3],
                                                log[k][k2][_][k3])
                                else:
                                    self._pipeline_log[k][k2][_] = log[k][k2][
                                        _]
                        else:
                            self._pipeline_log[k][k2].update(log[k][k2])

                else:
                    if log[k].__class__ == list:
                        self._pipeline_log[k].extend(log[k])
                    else:
                        self._pipeline_log[k].update(log[k])

    def __str__(self):
        return self._step_name

    @classmethod
    def get_step_class_for_key(cls, key):
        """
        Returns a step (or source step) class for a given key which corresponds
        to the name of the module the class is defined in. Pass 'cutadapt' and
        you will get the cutadapt.Cutadapt class which you may then instantiate.
        """

        check_classes = [AbstractSourceStep, AbstractStep]
        for index, c in enumerate(check_classes):

            classes = [
                _ for _ in inspect.getmembers(__import__(key), inspect.isclass)
                if c in _[1].__bases__
            ]

            for k in range(index):
                classes = [_ for _ in classes if _[1] != check_classes[k]]
            if len(classes) > 0:
                if len(classes) != 1:
                    raise UAPError("need exactly one subclass of %s in %s" %
                                   (c, key))
                return classes[0][1]

        raise UAPError("No suitable class found for module %s." % key)

    def set_cores(self, cores):
        """
        Specify the number of CPU cores this step will use.
        """
        if not isinstance(cores, int) or cores < 1:
            raise UAPError(
                '[%s] Cores need to be a positive integer, not %s.' %
                (self.get_step_name(), cores))
        self._cores = cores

    def get_cores(self):
        """
        Returns the number of cores used in this step.
        """
        return self._cores

    def add_input_connection(self, connection):
        '''
        Add an input connection to this step
        '''
        self.add_connection('in/%s' % connection)

    def add_output_connection(self, connection):
        '''
        Add an output connection to this step
        '''
        self.add_connection('out/%s' % connection)

    def add_connection(self,
                       connection,
                       optional=False,
                       format=None,
                       description=None):
        """
        Add a connection, which must start with 'in/' or 'out/'.
        :param format: (str) Data format passed in the connection.
        :param description: (str) Explanation of the connection.
        """
        if not (connection[0:3] == 'in/' or connection[0:4] == 'out/'):
            raise UAPError("A connection must start with 'in/' or 'out/'.")
        if connection[0:3] == 'in/':
            self.needs_parents = True
        if optional is True:
            self._optional_connections.add(connection)
        else:
            self._connections.add(connection)
        if format is not None:
            self._connection_formats[connection] = format
        if description is not None:
            self._connection_descriptions[connection] = \
                re.sub(r'\s+', ' ', description)
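
    # Usage sketch (hedged): declaring connections in a hypothetical step
    # constructor. The class name 'MyMapper' and all connection names below
    # are illustrative assumptions, not shipped steps.
    #
    #     class MyMapper(AbstractStep):
    #         def __init__(self, pipeline):
    #             super(MyMapper, self).__init__(pipeline)
    #             self.set_cores(4)
    #             self.add_connection('in/fastq')
    #             self.add_connection('out/alignments',
    #                                 description='Aligned reads.')
    #             self.add_connection('out/log', optional=True)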

    def get_connections(self, with_optional=True):
        """
        Return all connections for this step
        """
        connections = self._connections.copy()
        if with_optional is True:
            connections = connections.union(self._optional_connections)
        return connections

    def get_in_connections(self, with_optional=True, strip_prefix=False):
        """
        Return all in-connections for this step
        """
        connections = self._connections.copy()
        if with_optional is True:
            connections = connections.union(self._optional_connections)
        in_connections = set()
        for connection in connections:
            if connection[0:3] == "in/":
                if strip_prefix is True:
                    con = connection[3:]
                else:
                    con = connection
                in_connections.add(con)
        return in_connections

    def get_out_connections(self, with_optional=True, strip_prefix=False):
        """
        Return all out-connections for this step
        """
        connections = self._connections.copy()
        if with_optional is True:
            connections = connections.union(self._optional_connections)
        out_connections = set()
        for connection in connections:
            if connection[0:4] == "out/":
                if strip_prefix is True:
                    con = connection[4:]
                else:
                    con = connection
                out_connections.add(con)
        return out_connections

    def require_tool(self, tool):
        """
        Declare that this step requires an external tool. Query it later with
        *get_tool()*.
        """
        if self.get_pipeline() is not None:
            if tool not in self.get_pipeline().config['tools']:
                raise UAPError(
                    "%s requires the tool %s but it's not declared in "
                    "the configuration." % (self, tool))
            self._tools[tool] = self.get_pipeline(
            ).config['tools'][tool]['path']
            if 'pre_command' in self.get_pipeline().config['tools'][tool]:
                self._pre_command[tool] = self.get_pipeline(
                ).config['tools'][tool]['pre_command']
            if 'module_load' in self.get_pipeline().config['tools'][tool]:
                self._module_load[tool] = self.get_pipeline(
                ).config['tools'][tool]['module_load']
            if 'module_unload' in self.get_pipeline().config['tools'][tool]:
                self._module_unload[tool] = self.get_pipeline(
                ).config['tools'][tool]['module_unload']
            if 'post_command' in self.get_pipeline().config['tools'][tool]:
                self._post_command[tool] = self.get_pipeline(
                ).config['tools'][tool]['post_command']
        else:
            self._tools[tool] = True
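
    # Usage sketch (hedged): a step declares its external tools in __init__
    # and later asks for the configured path when assembling commands.
    # 'samtools' is an assumed tool name; it has to be listed under 'tools'
    # in the project configuration.
    #
    #     self.require_tool('samtools')
    #     ...
    #     samtools = self.get_tool('samtools')   # configured path (or list)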

    def add_option(self, key, *option_types, **kwargs):
        """
        Add an option. Multiple types may be specified.
        """
        if 'optional' not in kwargs:
            kwargs['optional'] = False
        for _ in ['default', 'description', 'choices']:
            if _ not in kwargs:
                kwargs[_] = None

        if key[0] == '_':
            raise UAPError(
                "Option key must not start with an underscore: %s." % key)
        if key in self._defined_options:
            raise UAPError("Option %s is already defined." % key)
        if len(option_types) == 0:
            raise UAPError("No option type specified for option %s." % key)
        if len(option_types) > 1 and kwargs['choices'] is not None:
            raise UAPError(
                "You cannot define choices if multiple options types "
                "are defined (%s)." % key)
        for option_type in option_types:
            if option_type not in [int, float, str, bool, list, dict]:
                raise UAPError("Invalid type for option %s: %s." %
                               (key, option_type))
        if kwargs['optional'] and (kwargs['default'] is not None):
            if type(kwargs['default']) not in option_types:
                raise UAPError(
                    "In step: (%s) option: (%s) Type of default value (%s) "
                    "does not match any of the declared possible types (%s)." %
                    (self, key, type(kwargs['default']), option_types))

        info = dict()
        info['types'] = misc.type_tuple(option_types)
        for _ in ['optional', 'default', 'description', 'choices']:
            info[_] = kwargs[_]

        if info['description'] is not None:
            if not isinstance(info['description'], str):
                raise UAPError(
                    'The description of option %s in step %s is not a string.'
                    % (key, self))
            # collapse whitespace
            info['description'] = re.sub(r'\s+', ' ', info['description'])

        self._defined_options[key] = info
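
    # Usage sketch (hedged): defining options in a hypothetical step
    # constructor. The option names, types and defaults are assumptions for
    # illustration only.
    #
    #     self.add_option('threads', int, optional=True, default=1,
    #                     description='Number of worker threads.')
    #     self.add_option('mode', str, choices=['fast', 'sensitive'],
    #                     description='Trade speed against sensitivity.')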

    def find_upstream_info_for_input_paths_as_set(self,
                                                  input_paths,
                                                  key,
                                                  expected=1):
        task_ids = set()
        for path in input_paths:
            task_ids.add(self.get_pipeline().task_id_for_output_file[path])
        results = set()
        for task_id in task_ids:
            task = self.get_pipeline().task_for_task_id[task_id]
            step = task.step
            run_id = task.run_id
            run = step._runs[run_id]
            if run.has_public_info(key):
                results.add(run.get_public_info(key))
            results |= self.find_upstream_info_for_input_paths_as_set(
                task.input_files(), key, None)

        if expected is not None:
            if len(results) != expected:
                raise UAPError(
                    "Unable to determine upstream %s info from %s." %
                    (key, self))
        return results

    def find_upstream_info_for_input_paths(self, input_paths, key):
        """
        Find a piece of public information in all upstream steps. If the
        information is not found, or it is defined in more than one upstream
        step, a UAPError is raised.
        """

        result = self.find_upstream_info_for_input_paths_as_set(input_paths,
                                                                key,
                                                                expected=1)
        return list(result)[0]
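
    # Usage sketch (hedged): inside a run declaration a step could look up a
    # piece of public information that exactly one upstream step published.
    # The key 'paired_end' and the variable 'input_paths' are assumptions.
    #
    #     paired_end = self.find_upstream_info_for_input_paths(
    #         input_paths, 'paired_end')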

    def get_run_ids_in_connections_input_files(self):
        '''
        Return a dictionary with all run IDs from parent steps, the
        in connections they provide data for, and the names of the files::

           run_id_1:
               in_connection_1: [input_path_1, input_path_2, ...]
               in_connection_2: ...
           run_id_2: ...

        Format of ``in_connection``: ``in/<connection>``. Input paths are
        absolute.
        '''

        cc = ConnectionsCollector(self.get_step_name())
        self._options.setdefault('_connect', dict())

        # Check that the configured in-connections are defined in the step
        # class and collect the out-connections for a later check.
        set_out_connections = set()
        used_out_connections = set()
        for in_conn, out_conn in self._options['_connect'].items():
            if in_conn not in self.get_in_connections():
                raise UAPError('_connect: unknown input connection "%s" '
                               'found. Available connections are %s' %
                               (in_conn, list(self.get_in_connections())))
            out_conn = out_conn if isinstance(out_conn, list) else [out_conn]
            set_out_connections = set_out_connections.union(set(out_conn))

        if 'empty' in set_out_connections:
            logger.warning(
                '[%s] "empty" in _connect is deprecated and will be '
                'ignored.' % self.get_step_name())
            set_out_connections.discard('empty')

        # For each parent step ...
        for parent in self.get_dependencies():
            if not parent.get_runs():
                raise UAPError('The step "%s" produces no output.' %
                               parent.get_step_name())
            logger.debug('Connecting "%s" to "%s".' %
                         (parent.get_step_name(), self.get_step_name()))
            # ... look for connection to add
            used_conns = cc.connect(parent, self, self._options['_connect'])
            if not used_conns:
                # ... or add connections with the same name.
                logger.debug('Parent "%s" not connected to child "%s". '
                             'Hence connecting equally named connections.' %
                             (parent.get_step_name(), self.get_step_name()))
                used_conns = cc.connect(parent, self)
            if not used_conns:
                raise UAPError('No connections could be made between '
                               '"%s" and its dependency "%s".' %
                               (self.get_step_name(), parent.get_step_name()))
            used_out_connections = used_out_connections.union(used_conns)

        # Check if all required connections are satisfied.
        required_connections = self.get_in_connections(with_optional=False)
        missing = required_connections - cc.existing_connections
        if missing:
            logger.warning(
                '_connect: The required connection(s) %s of step '
                '"%s" are not satisfied. To remove this warning, pass '
                'optional=True to the add_connection method in the step '
                'constructor __init__ of "%s".' %
                (missing, self.get_step_type(), self.get_step_type()))
            logger.warning(
                '[Deprecation] Unmet required connections may trigger '
                'an error in a future version of uap.')

        # Check if all set out connections were recognized.
        unrecognized = set_out_connections - used_out_connections
        if len(unrecognized) > 0:
            raise UAPError('For the following connections into step "%s" '
                           'no parent run could be found: %s.' %
                           (self.get_step_name(), list(unrecognized)))

        return cc
Beispiel #3
0
class Pipeline(object):
    '''
    The Pipeline class represents the entire processing pipeline which is defined
    and configured via the configuration file config.yaml.

    Individual steps may be defined in a tree, and their combination with samples
    as generated by one or more source leads to an array of tasks.
    '''

    states = misc.Enum([
        'WAITING', 'READY', 'QUEUED', 'EXECUTING', 'FINISHED', 'BAD',
        'CHANGED', 'VOLATILIZED'
    ])
    '''
    Possible states a task can be in.
    '''
    def __init__(self, **kwargs):
        self.caught_signal = None
        self._cluster_type = None
        self.git_version = None
        self.git_status = None
        self.git_diff = None
        self.git_untracked = None
        self.git_tag = None
        '''
        Use git diff to determine any changes in the git directory,
        if git is available.
        '''
        command = ['git', '--version']
        try:

            self.git_version = subprocess.check_output(command).strip()

        except (subprocess.CalledProcessError, OSError):
            logger.warning("Execution of %s failed. Git seems to be "
                           "unavailable. Continuing anyway." %
                           " ".join(command))

        if self.git_version:
            command = ['git', 'status', '--porcelain']
            try:
                self.git_status = subprocess.check_output(command)
            except subprocess.CalledProcessError:
                logger.error("Execution of %s failed." % " ".join(command))

            command = ['git', 'diff', 'HEAD']
            try:
                self.git_diff = subprocess.check_output(command)
            except subprocess.CalledProcessError:
                logger.error("Execution of %s failed." % " ".join(command))

            command = ['git', 'ls-files', '--others', '--exclude-standard']
            try:
                self.git_untracked = subprocess.check_output(command)
            except subprocess.CalledProcessError:
                logger.error("Execution of %s failed." % " ".join(command))

            command = ['git', 'describe', '--all', '--long']
            try:
                self.git_tag = subprocess.check_output(command).strip()
            except subprocess.CalledProcessError:
                logger.error("Execution of %s failed." % " ".join(command))

            if self.git_diff:
                logger.warning('THE GIT REPOSITORY HAS UNCOMMITTED CHANGES:\n'
                               '%s' % self.git_diff.decode('utf-8'))
            if self.git_untracked:
                logger.warning('THE GIT REPOSITORY HAS UNTRACKED FILES:\n'
                               '%s' % self.git_untracked.decode('utf-8'))
        """
        Check if we got passed an 'arguments' parameter;
        this parameter should contain an argparse.Namespace object.
        """
        self.args = None
        if 'arguments' in kwargs:
            self.args = kwargs['arguments']
        '''
        Absolute path to the directory of the uap executable.
        It is used to circumvent path issues.
        '''
        self._uap_path = self.args.uap_path
        '''
        The cluster type to be used (must be one of the keys specified in
        cluster_config).
        '''
        self._cluster_config_path = os.path.join(
            self._uap_path, 'cluster/cluster-specific-commands.yaml')
        with open(self._cluster_config_path, 'r') as cluster_config_file:
            self._cluster_config = yaml.load(cluster_config_file,
                                             Loader=yaml.FullLoader)

        try:
            # set cluster type
            if self.args.cluster == 'auto':
                self.set_cluster_type(self.autodetect_cluster_type())
            else:
                self.set_cluster_type(self.args.cluster)
        except AttributeError:
            # cluster type is not an applicable parameter here, and that's fine
            # (we're probably in run-locally.py)
            pass

        self._start_working_dir = os.getcwd()
        '''
        User working directory.
        '''

        if not self.args.config:
            raise UAPError('No <project-config>.yaml specified.')
        self._config_path, self.config_name = os.path.split(
            self.args.config.name)
        '''
        Name of the YAML configuration file
        '''

        self._config_path = os.path.abspath(self._config_path)
        '''
        Path of the YAML configuration file
        '''

        self.config = dict()
        '''
        Dictionary representation of configuration YAML file.
        '''

        self.steps = dict()
        '''
        This dict stores step objects by their name. Each step knows its
        dependencies.
        '''

        self.topological_step_order = list()
        '''
        List with topologically ordered steps.
        '''

        self.file_dependencies = dict()
        '''
        This dict stores file dependencies within this pipeline, but regardless
        of step, output file tag or run ID. This dict has, for all output
        files generated by the pipeline, a set of input files that output
        file depends on.
        '''

        self.file_dependencies_reverse = dict()
        '''
        This dict stores file dependencies within this pipeline, but regardless
        of step, output file tag or run ID. This dict has, for all input
        files required by the pipeline, a set of output files which are generated
        using this input file.
        '''

        self.task_id_for_output_file = dict()
        '''
        This dict stores a task ID for every output file created by the pipeline.
        '''

        self.task_for_output_file = dict()
        '''
        This dict stores the task object for every output file created by the
        pipeline.
        '''

        self.task_ids_for_input_file = dict()
        '''
        This dict stores a set of task IDs for every input file used in the
        pipeline.
        '''

        self.input_files_for_task_id = dict()
        '''
        This dict stores a set of input files for every task id in the pipeline.
        '''

        self.output_files_for_task_id = dict()
        '''
        This dict stores a set of output files for every task id in the pipeline.
        '''

        self.task_for_task_id = dict()
        '''
        This dict stores task objects by task IDs.
        '''

        self.all_tasks_topologically_sorted = list()
        '''
        List of all tasks in topological order.
        '''

        self.tasks_in_step = dict()
        '''
        This dict stores tasks per step name.
        '''

        self.used_tools = set()
        '''
        A set that stores all tools used by some step.
        '''

        self.known_config_keys = {
            'destination_path', 'constants', 'cluster', 'steps', 'lmod',
            'tools', 'base_working_directory', 'id'
        }
        '''
        A set of accepted keys in the config.
        '''

        self.read_config(self.args.config)
        self.setup_lmod()
        self.build_steps()

        configured_tools = set(tool
                               for tool, conf in self.config['tools'].items()
                               if not conf.get('atomatically_configured'))
        unused_tools = configured_tools - self.used_tools
        if unused_tools:
            logger.warning('Unused tool(s): %s' % list(unused_tools))

        # collect all tasks
        for step_name in self.topological_step_order:
            step = self.get_step(step_name)
            self.tasks_in_step[step_name] = list()
            logger.debug("Collect now all tasks for step: %s" % step)
            for run_index, run_id in enumerate(
                    misc.natsorted(step.get_run_ids())):
                task = task_module.Task(self, step, run_id, run_index)
                # if a run of a step contains exec groups,
                # the task (step/run) is added to the task list
                run = step.get_run(run_id)
                logger.debug("Step: %s, Run: %s" % (step, run_id))
                run_has_exec_groups = False
                if len(run.get_exec_groups()) > 0:
                    run_has_exec_groups = True
                if run_has_exec_groups:
                    logger.debug("Task: %s" % task)
                    self.all_tasks_topologically_sorted.append(task)
                    self.tasks_in_step[step_name].append(task)
                # Fail if multiple tasks with the same name exist
                if str(task) in self.task_for_task_id:
                    raise UAPError("Duplicate task ID %s." % task)
                self.task_for_task_id[str(task)] = task

        self.tool_versions = {}
        if not self.args.no_tool_checks:
            self.check_tools()

    def get_uap_path(self):
        return self._uap_path

    def get_cluster_config(self):
        return self._cluster_config

    def get_steps(self):
        return self.steps

    def get_step(self, step_name):
        return self.steps[step_name]

    # read configuration and make sure it's good
    def read_config(self, config_file):

        # read yaml
        self.config = yaml.load(config_file, Loader=yaml.FullLoader)
        config_file.close()

        # was yaml an annotation file?
        if 'config' in self.config.keys():
            self.config = self.config['config']
            dest = os.path.join(self._config_path, '..', '..')
            self.config['destination_path'] = os.path.abspath(dest)
            print('[uap] Reading config from annotation file with destination '
                  '%s' % self.config['destination_path'])

        # is the config valid
        for key in self.config.keys():
            if key not in self.known_config_keys:
                raise UAPError('The key "%s" set in config is unknown.' % key)

        # default id
        if 'id' not in self.config:
            self.config['id'] = self.config_name

        # new working directory to work with relative paths
        self.config.setdefault('base_working_directory', self._config_path)
        os.chdir(self.config['base_working_directory'])

        # configure lmod
        if 'lmod' not in self.config or self.config['lmod'] is None:
            self.config['lmod'] = dict()
        if 'LMOD_CMD' in os.environ:
            self.config['lmod'].setdefault('path', os.environ['LMOD_CMD'])
        if 'MODULEPATH' in os.environ:
            self.config['lmod'].setdefault('module_path',
                                           os.environ['MODULEPATH'])
        lmod_configured = all(key in self.config['lmod']
                              for key in ['path', 'module_path'])

        # configure GNU Core Utilities
        if 'tools' not in self.config or not isinstance(
                self.config['tools'], dict):
            self.config['tools'] = dict()
        for tool in coreutils:
            auto_add = False
            if tool not in self.config['tools'] or \
                    not self.config['tools'][tool]:
                self.config['tools'][tool] = dict()
                auto_add = True
            self.config['tools'][tool].setdefault('ignore_version', True)
            self.config['tools'][tool].setdefault('atomatically_configured',
                                                  auto_add)

        # configure regular tools
        for tool, args in self.config['tools'].items():
            if not args:
                self.config['tools'][tool] = dict()
            self.config['tools'][tool].setdefault('path', tool)
            self.config['tools'][tool].setdefault('get_version', '--version')
            self.config['tools'][tool].setdefault('exit_code', 0)
            self.config['tools'][tool].setdefault('ignore_version', False)
            if any(key in self.config['tools'][tool]
                   for key in ['module_name', 'module_load', 'module_unload'
                               ]) and not lmod_configured:
                raise UAPError("The tool %s requires lmod, but lmod is not "
                               "loaded nor configured in %s." %
                               (tool, self.args.config.name))
            if 'module_name' in self.config['tools'][tool]:
                mn = self.config['tools'][tool]['module_name']
                cmd = '%s python load %s' % (self.config['lmod']['path'], mn)
                self.config['tools'][tool].setdefault('module_load', cmd)
                cmd = '%s python unload %s' % (self.config['lmod']['path'], mn)
                self.config['tools'][tool].setdefault('module_unload', cmd)

        # configure tools that come with the uap
        uap_tools_path = os.path.join(self._uap_path, 'tools')
        uap_python = os.path.join(self._uap_path, "python_env", "bin",
                                  "python")
        for tool_file in os.listdir(uap_tools_path):
            tool_path = os.path.join(uap_tools_path, tool_file)
            if not tool_file.endswith('.py') or not os.path.isfile(tool_path):
                continue
            tool = tool_file[:-3]
            auto_add = False
            if tool not in self.config['tools'] or \
                    not isinstance(self.config['tools'][tool], dict):
                auto_add = True
                self.config['tools'][tool] = dict()
            elif self.config['tools'][tool].get('atomatically_configured'):
                auto_add = True
            if auto_add:
                self.config['tools'][tool]['path'] = [uap_python, tool_path]
            else:
                self.config['tools'][tool].setdefault('path',
                                                      [uap_python, tool_path])
            self.config['tools'][tool].setdefault('get_version', '--help')
            self.config['tools'][tool].setdefault('exit_code', 0)
            self.config['tools'][tool].setdefault('ignore_version', False)
            self.config['tools'][tool].setdefault('atomatically_configured',
                                                  auto_add)

        # destination path
        if 'destination_path' not in self.config:
            raise UAPError("Missing key: destination_path")
        if not os.path.exists(self.config['destination_path']):
            raise UAPError("Destination path does not exist: %s" %
                           self.config['destination_path'])
        self.config['destination_path'] = \
            os.path.abspath(self.config['destination_path'])

        # cluster
        if 'cluster' not in self.config or self.config['cluster'] is None:
            self.config['cluster'] = dict()
        if self.get_cluster_type() is not None:
            self.config['cluster'].setdefault(
                'default_submit_options',
                self.get_cluster_command('default_options'))
        for i in [
                'default_submit_options', 'default_pre_job_command',
                'default_post_job_command'
        ]:
            self.config['cluster'].setdefault(i, '')
        self.config['cluster'].setdefault('default_job_quota', 0)  # no quota
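
    # Hedged sketch of a minimal <project-config>.yaml accepted by
    # read_config(); the step and tool names below are assumptions:
    #
    #     destination_path: /data/results
    #     tools:
    #         samtools:
    #             path: samtools
    #             get_version: --version
    #     steps:
    #         my_source (fastq_source):
    #             ...                      # source-specific options
    #         cutadapt:
    #             _depends: my_source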

    def build_steps(self):
        self.steps = {}
        if 'steps' not in self.config:
            raise UAPError("Missing key: steps")
        re_simple_key = re.compile('^[a-zA-Z0-9_]+$')
        re_complex_key = re.compile(r'^([a-zA-Z0-9_]+)\s+\(([a-zA-Z0-9_]+)\)$')

        # step one: instantiate all steps
        for step_key, step_description in self.config['steps'].items():

            # the step keys in the configuration may be either:
            # - MODULE_NAME
            # - DIFFERENT_STEP_NAME\s+\(MODULE_NAME\)
            step_name = None
            module_name = None
            if re_simple_key.match(step_key):
                step_name = step_key
                module_name = step_key
            else:
                match = re_complex_key.match(step_key)
                if match:
                    step_name = match.group(1)
                    module_name = match.group(2)

            if step_name == 'temp':
                # A step cannot be named 'temp' because we need the out/temp
                # directory to store temporary files.
                raise UAPError("A step name cannot be 'temp'.")
            step_class = abstract_step.AbstractStep.get_step_class_for_key(
                module_name)
            step = step_class(self)

            step.set_step_name(step_name)
            step.set_options(step_description)

            self.steps[step_name] = step
            self.used_tools.update(step.used_tools)

        # step two: set dependencies
        for step_name, step in self.steps.items():
            for parent_step in step._options['_depends']:
                if parent_step not in self.steps.keys():
                    raise UAPError("Step %s specifies an undefined "
                                   "dependency: %s." %
                                   (step_name, parent_step))
                step.add_dependency(self.steps[parent_step])

        # step three: perform topological sort and fail if there's a cycle
        # (yeah, the algorithm is O(n^2), tsk, tsk...)

        unassigned_steps = set(self.steps.keys())
        assigned_steps = set()
        self.topological_step_order = []
        while len(unassigned_steps) > 0:
            # choose all steps whose dependencies are resolved, either because
            # they have no dependencies or because all of them are already
            # assigned
            next_steps = []
            for step_name in unassigned_steps:
                is_ready = True
                step = self.steps[step_name]
                for dep in step.dependencies:
                    dep_name = dep.get_step_name()
                    if dep_name not in assigned_steps:
                        is_ready = False
                        break
                if is_ready and step.get_step_type() == 'source_controller':
                    # make sure source_controller attempts to run first
                    next_steps = [step_name]
                    break
                elif is_ready:
                    next_steps.append(step_name)
            if len(next_steps) == 0:
                raise UAPError("There is a cycle in the step dependencies.")
            for step_name in misc.natsorted(next_steps):
                self.topological_step_order.append(step_name)
                assigned_steps.add(step_name)
                unassigned_steps.remove(step_name)

        # step four: finalize step
        for step in self.steps.values():
            step.finalize()

    def print_source_runs(self):
        for step_name in self.topological_step_order:
            step = self.steps[step_name]
            if isinstance(step, abstract_step.AbstractSourceStep):
                for run_id in misc.natsorted(step.get_run_ids()):
                    print("%s/%s" % (step, run_id))

    def add_file_dependencies(self, output_path, input_paths):
        if output_path in self.file_dependencies:
            raise UAPError("Different steps/runs/tags want to create "
                           "the same output file: %s." % output_path)
        self.file_dependencies[output_path] = set(input_paths)

        for inpath in input_paths:
            if inpath not in self.file_dependencies_reverse:
                self.file_dependencies_reverse[inpath] = set()
            self.file_dependencies_reverse[inpath].add(output_path)

    def add_task_for_output_file(self, output_path, task_id):
        if output_path in self.task_id_for_output_file:
            raise UAPError("More than one step is trying to create the "
                           "same output file: %s." % output_path)
        self.task_id_for_output_file[output_path] = task_id

        if task_id not in self.output_files_for_task_id:
            self.output_files_for_task_id[task_id] = set()
        self.output_files_for_task_id[task_id].add(output_path)

    def add_task_for_input_file(self, input_path, task_id):
        if input_path not in self.task_ids_for_input_file:
            self.task_ids_for_input_file[input_path] = set()
        self.task_ids_for_input_file[input_path].add(task_id)

        if task_id not in self.input_files_for_task_id:
            self.input_files_for_task_id[task_id] = set()
        self.input_files_for_task_id[task_id].add(input_path)

    def get_task_for_file(self, path):
        '''
        Returns the task for a given output file path.
        '''
        task_id = self.task_id_for_output_file.get(path)
        if task_id:
            return self.task_for_task_id[task_id]
        else:
            return None

    def setup_lmod(self):
        '''
        If lmod is configured, this function sets the required environment
        variables.
        '''
        module_path = self.config.get('lmod', dict()).get('module_path')
        if module_path:
            os.environ['MODULEPATH'] = module_path

    def check_tools(self):
        '''
        Checks whether all tools referenced by the configuration are available
        and records their versions as determined by ``[tool] --version`` etc.
        '''
        if 'tools' not in self.config:
            return
        pool = multiprocessing.Pool(4)
        if logger.getEffectiveLevel() <= 20:
            show_status = False
        elif self.has_interactive_shell():
            show_status = True
        elif not (hasattr(self.args, 'run') and self.args.run):
            show_status = True
        else:
            show_status = False
        if not show_status:
            sys.stderr.write('[uap] Running tool check...\n')
            sys.stderr.flush()
        iter_tools = tqdm(
            pool.imap_unordered(check_tool, self.config['tools'].items()),
            total=len(self.config['tools']),
            desc='tool check',
            bar_format='{desc}:{percentage:3.0f}%|{bar:10}{r_bar}',
            disable=not show_status)
        try:
            for tool_id, tool_check_info in iter_tools:
                self.tool_versions[tool_id] = tool_check_info
        except BaseException:
            pool.terminate()
            iter_tools.close()
            raise
        pool.close()
        pool.join()

    def has_interactive_shell(self):
        return os.isatty(sys.stdout.fileno())

    def notify(self, message, attachment=None):
        '''
        prints a notification to the screen and optionally delivers the
        message on additional channels (as defined by the configuration)
        '''
        print(message)
        if 'notify' in self.config:
            try:
                notify = self.config['notify']
                match = re.search(r'^(https?://[^/]+)/([a-z0-9]+)$', notify)
                if match:
                    host = match.group(1)
                    token = match.group(2)
                    args = ['curl', host, '-X', 'POST', '-d', '@-']
                    proc = subprocess.Popen(args, stdin=subprocess.PIPE)
                    data = {'token': token, 'message': message}
                    if attachment:
                        data['attachment_name'] = attachment['name']
                        data['attachment_data'] = base64.b64encode(
                            attachment['data']).decode('ascii')
                    proc.stdin.write(json.dumps(data).encode('utf-8'))
                    proc.stdin.close()
                    proc.wait()
                else:
                    logger.warning(
                        'Could not split pattern into http(s)://host/token to notify: %s'
                        % self.config['notify'])
            except BaseException:
                # swallow all exception that happen here, failing notifications
                # are no reason to crash the entire thing
                logger.warning('Notification of "%s" failed with: %s' %
                               (self.config['notify'], sys.exc_info()[0]))
                pass
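
    # Hedged sketch: the 'notify' value parsed by this method is a single URL
    # of the form http(s)://host/token; the message is POSTed to it via curl.
    # Example with made-up values:
    #
    #     notify: https://example.org/0123456789ab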

    def get_cluster_job_ids(self):
        '''
        This argument-less method returns a set of the cluster job IDs of all
        submitted jobs.
        '''
        ids = set()
        for task in self.all_tasks_topologically_sorted:
            queued_ping_file = task.get_run().get_queued_ping_file()
            failed_qpf = queued_ping_file + '.bad'  # alternative location
            try:
                with open(queued_ping_file, 'r') as fl:
                    info = yaml.load(fl, Loader=yaml.FullLoader)
                ids.add(info['cluster job id'])
            except (IOError, TypeError) as e:
                if os.path.exists(queued_ping_file):
                    raise UAPError('Could not read ping file %s: %s' %
                                   (queued_ping_file, e))
                else:
                    try:
                        with open(failed_qpf, 'r') as fl:
                            info = yaml.load(fl, Loader=yaml.FullLoader)
                        ids.add(info['cluster job id'])
                    except (IOError, TypeError) as e:
                        if os.path.exists(failed_qpf):
                            raise UAPError('Could not read ping file %s: %s' %
                                           (failed_qpf, e))
        return ids

    def get_task_with_list(self, as_string=False, exclusive=False):
        '''
        Returns a list of tasks, specified with the run argument.
        '''
        task_wish_list = list()
        specified_tasks = list()
        if hasattr(self.args, 'run'):
            specified_tasks = self.args.run
        for task_id in specified_tasks:
            if task_id in self.task_for_task_id:
                task = self.task_for_task_id[task_id]
                if as_string:
                    task = str(task)
                task_wish_list.append(task)
            else:
                for task in self.all_tasks_topologically_sorted:
                    if str(task).startswith(task_id):
                        if as_string:
                            task = str(task)
                        task_wish_list.append(task)
        if specified_tasks and not task_wish_list:
            raise UAPError("No task matches the requested pattern(s) '%s'." %
                           ' '.join(specified_tasks))
        if not specified_tasks and exclusive is False:
            if not as_string:
                return self.all_tasks_topologically_sorted
            return [str(t) for t in self.all_tasks_topologically_sorted]
        return task_wish_list

    def check_ping_files(self,
                         print_more_warnings=False,
                         print_details=False,
                         fix_problems=False):
        run_problems = list()
        queue_problems = list()
        bad_problems = list()
        check_queue = True

        try:
            stat_output = subprocess.check_output(
                [self.get_cluster_command('stat')],
                stderr=subprocess.STDOUT).decode('utf-8')
        except (KeyError, OSError, subprocess.CalledProcessError):
            # the cluster stat tool is not available on this host, or its
            # invocation failed
            check_queue = False

        if print_more_warnings and not check_queue:
            try:
                ce = self.get_cluster_command('stat')
            except KeyError:
                ce = "a cluster engine"
            print("Attention, we cannot check stale queued ping files because "
                  "this host does not have %s." % ce)

        running_jids = set()

        if check_queue:
            for line in stat_output.split("\n"):
                if 'COMPLETING' in line:
                    # this is slurm specific; skip jobs stuck in the
                    # COMPLETING state
                    continue
                try:
                    jid = int(line.strip().split(' ')[0].split('_')[0])
                    running_jids.add(str(jid))
                except ValueError:
                    # this is not a JID
                    pass

        for task in self.all_tasks_topologically_sorted:
            queued_ping_file = task.get_run().get_queued_ping_file()
            bad_queued_ping_file = queued_ping_file + '.bad'
            exec_ping_file = task.get_run().get_executing_ping_file()
            stale = task.get_run().is_stale()
            if stale:
                try:
                    info = yaml.load(open(exec_ping_file, 'r'),
                                     Loader=yaml.FullLoader)
                except IOError as e:
                    if os.path.exists(exec_ping_file):
                        raise e
                else:
                    start_time = info['start_time']
                    last_activity = datetime.datetime.fromtimestamp(
                        task.get_run().fsc.getmtime(exec_ping_file))
                    run_problems.append((task, exec_ping_file, stale,
                                         last_activity - start_time))
            if check_queue:
                try:
                    info = yaml.load(open(queued_ping_file, 'r'),
                                     Loader=yaml.FullLoader)
                except IOError as e:
                    if os.path.exists(queued_ping_file):
                        raise e
                else:
                    if not str(info['cluster job id']) in running_jids:
                        queue_problems.append(
                            (task, queued_ping_file, info['submit_time'],
                             info['cluster job id']))
            try:
                info = yaml.load(open(bad_queued_ping_file, 'r'),
                                 Loader=yaml.FullLoader)
            except IOError as e:
                if os.path.exists(bad_queued_ping_file):
                    raise e
            else:
                bad_problems.append(
                    (task, bad_queued_ping_file, info['submit_time'],
                     info['cluster job id']))

        show_hint = False

        if len(run_problems) > 0:
            show_hint = True
            label = "Warning: There are %d stale run ping files." % len(
                run_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                run_problems = sorted(run_problems,
                                      key=itemgetter(2, 3),
                                      reverse=True)
                for problem in run_problems:
                    task = problem[0]
                    path = problem[1]
                    last_activity_difference = problem[2]
                    ran_for = problem[3]
                    print("dead since %13s, ran for %13s: %s" %
                          (misc.duration_to_str(last_activity_difference),
                           misc.duration_to_str(ran_for), task))
                print("")

        if len(queue_problems) > 0:
            show_hint = True
            label = "Warning: There are %d tasks marked as queued, but they "\
                    "do not seem to be queued." % len(queue_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                queue_problems = sorted(queue_problems,
                                        key=itemgetter(2),
                                        reverse=True)
                for problem in queue_problems:
                    task = problem[0]
                    path = problem[1]
                    start_time = problem[2]
                    job_id = problem[3]
                    print("submitted job %s at %13s: %s" %
                          (job_id, start_time, task))
                print("")

        if len(bad_problems) > 0:
            label = "Info: Found %d queue files of failed tasks." % len(
                bad_problems)
            print(label)
            if print_details:
                print('-' * len(label))
                bad_problems = sorted(bad_problems,
                                      key=itemgetter(2),
                                      reverse=True)
                for problem in bad_problems:
                    task = problem[0]
                    path = problem[1]
                    start_time = problem[2]
                    job_id = problem[3]
                    print("submitted job %s at %13s: %s" %
                          (job_id, start_time, task))
                print("")

        if fix_problems:
            all_problems = run_problems
            all_problems.extend(queue_problems)
            all_problems.extend(bad_problems)
            for problem in all_problems:
                path = problem[1]
                print("Now deleting %s..." % path)
                os.unlink(path)

        if show_hint:
            if print_more_warnings and not print_details or not fix_problems:
                print("Hint: Run 'uap %s fix-problems --details' to see the "
                      "details." % self.args.config.name)
            if print_more_warnings and not fix_problems:
                print("Hint: Run 'uap %s fix-problems --first-error' to "
                      "investigate what happended." % self.args.config.name)
            if not fix_problems:
                print(
                    "Hint: Run 'uap %s fix-problems --srsly' to fix these "
                    "problems (that is, delete all problematic ping files)." %
                    self.args.config.name)
        else:
            print('No problematic ping files were found.')

    def check_volatile_files(self, details=False, srsly=False):
        collected_files = set()
        for task in self.all_tasks_topologically_sorted:
            collected_files |= task.volatilize_if_possible(srsly)
        if not srsly and len(collected_files) > 0:
            if details:
                for path in sorted(collected_files):
                    print(path)
            total_size = 0
            for path in collected_files:
                total_size += os.path.getsize(path)
            print("Hint: You could save %s of disk space by volatilizing %d "
                  "output files." %
                  (misc.bytes_to_str(total_size), len(collected_files)))
            print("Call 'uap %s volatilize --srsly' to purge the files." %
                  self.args.config.name)

    def autodetect_cluster_type(self):
        cluster_config = self.get_cluster_config()
        # Let's see if we can successfully run a cluster identity test
        # Test all configured cluster types
        for cluster_type in cluster_config.keys():
            # Do we have an identity test command
            identity = dict()
            for key in ['test', 'answer']:
                try:
                    identity[key] = cluster_config[cluster_type]['identity_%s'
                                                                 % key]
                except KeyError:
                    raise UAPError(
                        "%s: Missing 'identity_%s' for %s"
                        "cluster type." %
                        (self._cluster_config_path, key, cluster_type))
            # Now that we know let's test for that cluster
            if not isinstance(identity['answer'], list):
                identity['answer'] = [identity['answer']]
            for answer in identity['answer']:
                try:
                    if (subprocess.check_output(identity['test']).decode(
                            'utf-8').startswith(answer)):
                        return cluster_type
                except OSError:
                    pass
        logger.warning('Cluster type could not be detected.')
        return None
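
    # Hedged sketch of the identity entries autodetect_cluster_type() expects
    # for each cluster type in cluster-specific-commands.yaml (values are
    # assumptions):
    #
    #     slurm:
    #         identity_test: ['sbatch', '--version']
    #         identity_answer: 'slurm'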

    def get_cluster_type(self):
        return self._cluster_type

    def set_cluster_type(self, cluster_type):
        if cluster_type is not None and cluster_type not in self.get_cluster_config(
        ):
            raise UAPError('Cluster type "%s" not configured.' % cluster_type)
        self._cluster_type = cluster_type

    '''
    Shorthand to retrieve a cluster-type-dependent command or filename
    (cc == cluster command).
    '''

    def get_cluster_command(self, key):
        ct = self.get_cluster_type()
        if key not in self.get_cluster_config()[ct].keys():
            raise UAPError(
                'The option "%s" is not available for the cluster "%s".' %
                (key, ct))
        return self.get_cluster_config()[ct][key]

    '''
    Shorthand to retrieve a cluster-type-dependent command line part (this is a
    list)
    '''

    def get_cluster_command_cli_option(self, key, value):
        result = self.get_cluster_config()[self.get_cluster_type()][key]
        if isinstance(result, list):
            nval = sum(part.count('%s') for part in result)
            value = tuple([value]) if not isinstance(value, tuple) else value
            if len(value) != nval:
                raise UAPError('The option %s requires a tuple '
                               'of %d values to be placed into %s but the '
                               'values are %s.' % (key, nval, result, value))
            options = list()
            i = 0
            for part in result:
                if '%s' in part:
                    options.append(part % value[i:i + part.count('%s')])
                    i += part.count('%s')
                else:
                    options.append(part)
            return options
        if '%s' in result:
            return [result % value]
        else:
            return [result, value]
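
    # Hedged sketch of how the '%s' placeholders are filled. Assuming the
    # cluster config mapped a hypothetical key 'set_stderr' to ['-e', '%s']:
    #
    #     self.get_cluster_command_cli_option('set_stderr', '/tmp/err.txt')
    #     # -> ['-e', '/tmp/err.txt']
    #
    # A plain string value such as '--mem=%s' yields ['--mem=4G'] for the
    # value '4G', and a string without '%s' is returned as [result, value].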