def fetch_equipment_name(self):
    """
    Fetch equipment names and filter them by regex if one is set.
    """
    if self.ms.get_equipment_name_list() == 0:
        print("No element fetched: fetch_equipment_name() == 0")
        return False
    # Sort every equipment list in natural order.
    self.ms.set_list_node(misc.natsorted(self.ms.get_list_node()))
    self.ms.set_list_hwmanager(misc.natsorted(self.ms.get_list_hwmanager()))
    self.ms.set_list_switch(misc.natsorted(self.ms.get_list_switch()))
    self.ms.set_list_diskarray(misc.natsorted(self.ms.get_list_diskarray()))
    self.ms.set_list_metaservice(misc.natsorted(self.ms.get_list_metaservice()))
    if self.regex_name is None:
        return True
    # Apply the regex filter to every non-empty equipment list.
    if len(self.ms.get_list_node()) > 0:
        self.ms.set_list_node(
            self.filter_by_regex(self.ms.get_list_node()))
    if len(self.ms.get_list_hwmanager()) > 0:
        self.ms.set_list_hwmanager(
            self.filter_by_regex(self.ms.get_list_hwmanager()))
    if len(self.ms.get_list_switch()) > 0:
        self.ms.set_list_switch(
            self.filter_by_regex(self.ms.get_list_switch()))
    if len(self.ms.get_list_diskarray()) > 0:
        self.ms.set_list_diskarray(
            self.filter_by_regex(self.ms.get_list_diskarray()))
    if len(self.ms.get_list_metaservice()) > 0:
        self.ms.set_list_metaservice(
            self.filter_by_regex(self.ms.get_list_metaservice()))
    return True
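# --- Illustrative sketch (not part of the original module) ---------------
# The method above assumes a filter_by_regex() helper that keeps only the
# names matching self.regex_name. It is not shown in this section; this is
# a minimal sketch of what it might look like, not the original code.
import re

def filter_by_regex(self, name_list):
    # Hypothetical: keep only names that match the user-supplied pattern.
    pattern = re.compile(self.regex_name)
    return [name for name in name_list if pattern.search(name)]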
def __init__(self, **kwargs):
    self.caught_signal = None
    self.git_dirty_diff = None

    self.cluster_type = None
    '''
    The cluster type to be used (must be one of the keys specified in
    cluster_config).
    '''

    # Check the availability of git.
    command = ['git', '--version']
    try:
        with open(os.devnull, 'w') as devnull:
            subprocess.check_call(command, stdout=devnull)
    except subprocess.CalledProcessError:
        raise StandardError("Execution of %s failed. Git seems to be "
                            "unavailable." % " ".join(command))

    # Now determine the Git hash of the repository.
    command = ['git', 'describe', '--all', '--dirty', '--long']
    try:
        self.git_hash_tag = subprocess.check_output(command).strip()
    except subprocess.CalledProcessError:
        raise StandardError("Execution of %s failed." % " ".join(command))

    # Check if we got passed an 'arguments' parameter; this parameter
    # should contain an argparse.Namespace object.
    args = None
    if 'arguments' in kwargs:
        args = kwargs['arguments']

    if self.git_hash_tag.endswith('-dirty'):
        if not args.even_if_dirty:
            print("The repository has uncommitted changes, which is why "
                  "we will exit right now.")
            print("If this is not a production environment, you can skip "
                  "this test by specifying --even-if-dirty on the command "
                  "line.")
            print(self.git_hash_tag)
            exit(1)
        command = ['git', 'diff']
        try:
            self.git_dirty_diff = subprocess.check_output(command)
        except subprocess.CalledProcessError:
            raise StandardError("Execution of %s failed." % " ".join(command))

    try:
        # Set the cluster type.
        if args.cluster == 'auto':
            self.set_cluster_type(self.autodetect_cluster_type())
        else:
            self.set_cluster_type(args.cluster)
    except AttributeError:
        # Cluster type is not an applicable parameter here, and that's
        # fine (we're probably in run-locally.py).
        pass

    self._config_filepath = args.config.name
    '''
    Name of the YAML configuration file.
    '''

    self.config = dict()
    '''
    Dictionary representation of the configuration YAML file.
    '''

    self.steps = dict()
    '''
    This dict stores step objects by their name. Each step knows its
    dependencies.
    '''

    self.topological_step_order = list()
    '''
    List of topologically ordered steps.
    '''

    self.file_dependencies = dict()
    '''
    This dict stores file dependencies within this pipeline, regardless
    of step, output file tag or run ID. For every output file generated
    by the pipeline, it holds the set of input files that output file
    depends on.
    '''

    self.file_dependencies_reverse = dict()
    '''
    This dict stores file dependencies within this pipeline, regardless
    of step, output file tag or run ID. For every input file required by
    the pipeline, it holds the set of output files which are generated
    using this input file.
    '''

    self.task_id_for_output_file = dict()
    '''
    This dict stores a task ID for every output file created by the
    pipeline.
    '''

    self.task_ids_for_input_file = dict()
    '''
    This dict stores a set of task IDs for every input file used in the
    pipeline.
    '''

    self.input_files_for_task_id = dict()
    '''
    This dict stores a set of input files for every task ID in the
    pipeline.
    '''

    self.output_files_for_task_id = dict()
    '''
    This dict stores a set of output files for every task ID in the
    pipeline.
    '''

    self.task_for_task_id = dict()
    '''
    This dict stores task objects by task IDs.
    '''

    self.all_tasks_topologically_sorted = list()
    '''
    List of all tasks in topological order.
    '''

    self.config_file_name = args.config.name
    '''
    Name of the configuration file of the current analysis.
    '''

    self.read_config(args.config)

    # Collect all tasks.
    for step_name in self.topological_step_order:
        step = self.get_step(step_name)
        logger.debug("Collect now all tasks for step: %s" % step)
        for run_index, run_id in enumerate(
                misc.natsorted(step.get_run_ids())):
            task = task_module.Task(self, step, run_id, run_index)
            # If any run of a step contains exec_groups, the task
            # (step/run) is added to the task list.
            run = step.get_run(run_id)
            logger.debug("Step: %s, Run: %s" % (step, run_id))
            run_has_exec_groups = len(run.get_exec_groups()) > 0
            if run_has_exec_groups:
                logger.debug("Task: %s" % task)
                self.all_tasks_topologically_sorted.append(task)
            # Fail if multiple tasks with the same name exist.
            if str(task) in self.task_for_task_id:
                raise ConfigurationException(
                    "Duplicate task ID %s." % str(task))
            self.task_for_task_id[str(task)] = task

    self.tool_versions = {}
    self.check_tools()
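# --- Illustrative sketch (not part of the original module) ---------------
# The '-dirty' test above relies on the output format of
# 'git describe --all --dirty --long'; the example hash below is made up.
import subprocess

tag = subprocess.check_output(
    ['git', 'describe', '--all', '--dirty', '--long']).strip()
# e.g. b'heads/master-0-g1a2b3c4' or, with uncommitted changes,
# b'heads/master-0-g1a2b3c4-dirty'
if tag.endswith(b'-dirty'):
    print("uncommitted changes present:", tag.decode())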
def print_source_runs(self):
    for step_name in self.topological_step_order:
        step = self.steps[step_name]
        if isinstance(step, abstract_step.AbstractSourceStep):
            for run_id in misc.natsorted(step.get_run_ids()):
                print("%s/%s" % (step, run_id))
def build_steps(self):
    self.steps = {}
    if 'steps' not in self.config:
        raise ConfigurationException("Missing key: steps")
    re_simple_key = re.compile(r'^[a-zA-Z0-9_]+$')
    re_complex_key = re.compile(r'^([a-zA-Z0-9_]+)\s+\(([a-zA-Z0-9_]+)\)$')

    # step one: instantiate all steps
    for step_key, step_description in self.config['steps'].items():
        # the step keys in the configuration may be either:
        # - MODULE_NAME
        # - DIFFERENT_STEP_NAME\s+\(MODULE_NAME\)
        step_name = None
        module_name = None
        if re_simple_key.match(step_key):
            step_name = step_key
            module_name = step_key
        else:
            match = re_complex_key.match(step_key)
            if match:
                step_name = match.group(1)
                module_name = match.group(2)

        if step_name == 'temp':
            # A step cannot be named 'temp' because we need the out/temp
            # directory to store temporary files.
            raise ConfigurationException("A step name cannot be 'temp'.")

        step_class = abstract_step.AbstractStep.get_step_class_for_key(
            module_name)
        step = step_class(self)
        step.set_step_name(step_name)
        step.set_options(step_description)
        self.steps[step_name] = step

    # step two: set dependencies
    for step_name, step in self.steps.items():
        if not step.needs_parents:
            if '_depends' in step._options:
                raise ConfigurationException(
                    "%s must not have dependencies because it declares no "
                    "in/* connections (remove the _depends key)."
                    % step_name)
        else:
            if '_depends' not in step._options:
                raise ConfigurationException(
                    "Missing key in step '%s': _depends (set to null if "
                    "the step has no dependencies)." % step_name)
            depends = step._options['_depends']
            if depends is not None:
                # Accept a single step name as well as a list of names.
                temp_list = [depends] if isinstance(depends, str) else depends
                for d in temp_list:
                    if d not in self.steps:
                        raise ConfigurationException(
                            "Step %s specifies an undefined dependency: %s."
                            % (step_name, d))
                    step.add_dependency(self.steps[d])

    # step three: perform topological sort, raise a ConfigurationException
    # if there's a cycle (yeah, the algorithm is O(n^2), tsk, tsk...)
    unassigned_steps = set(self.steps.keys())
    assigned_steps = set()
    self.topological_step_order = []
    while len(unassigned_steps) > 0:
        # choose all steps which have all dependencies resolved, either
        # because they have no dependencies or are already assigned
        next_steps = []
        for step_name in unassigned_steps:
            is_ready = True
            for dep in self.steps[step_name].dependencies:
                dep_name = dep.get_step_name()
                if dep_name not in assigned_steps:
                    is_ready = False
                    break
            if is_ready:
                next_steps.append(step_name)
        if len(next_steps) == 0:
            raise ConfigurationException(
                "There is a cycle in the step dependencies.")
        for step_name in misc.natsorted(next_steps):
            self.topological_step_order.append(step_name)
            assigned_steps.add(step_name)
            unassigned_steps.remove(step_name)

    # step four: finalize steps
    for step in self.steps.values():
        step.finalize()
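# --- Illustrative sketch (not part of the original module) ---------------
# Demo of the two accepted step-key forms parsed above; the step and
# module names here are invented for the example.
import re

re_simple_key = re.compile(r'^[a-zA-Z0-9_]+$')
re_complex_key = re.compile(r'^([a-zA-Z0-9_]+)\s+\(([a-zA-Z0-9_]+)\)$')

for key in ('cutadapt', 'trimmed_reads (cutadapt)'):
    if re_simple_key.match(key):
        # simple form: the step name doubles as the module name
        print(key, '->', (key, key))
    else:
        match = re_complex_key.match(key)
        # complex form: custom step name with the module name in parentheses
        print(key, '->', (match.group(1), match.group(2)))
# prints:
#   cutadapt -> ('cutadapt', 'cutadapt')
#   trimmed_reads (cutadapt) -> ('trimmed_reads', 'cutadapt')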
def __init__(self, **kwargs):
    self.caught_signal = None
    self.git_dirty_diff = None

    self.cluster_type = None
    '''
    The cluster type to be used (must be one of the keys specified in
    cluster_config).
    '''

    # Check the availability of git.
    command = ['git', '--version']
    try:
        with open(os.devnull, 'w') as devnull:
            subprocess.check_call(command, stdout=devnull)
    except subprocess.CalledProcessError:
        logger.error("Execution of '%s' failed. Git seems to be "
                     "unavailable." % " ".join(command))
        sys.exit(1)

    # Now determine the Git hash of the repository.
    command = ['git', 'describe', '--all', '--dirty', '--long']
    try:
        self.git_hash_tag = subprocess.check_output(command).strip()
    except subprocess.CalledProcessError:
        logger.error("Execution of %s failed." % " ".join(command))
        sys.exit(1)

    # Check if we got passed an 'arguments' parameter; this parameter
    # should contain an argparse.Namespace object.
    args = None
    if 'arguments' in kwargs:
        args = kwargs['arguments']

    self._uap_path = args.uap_path
    '''
    Absolute path to the directory of the uap executable. It is used to
    circumvent path issues.
    '''

    self._cluster_config_path = os.path.join(
        self._uap_path, 'cluster/cluster-specific-commands.yaml')
    with open(self._cluster_config_path, 'r') as cluster_config_file:
        self._cluster_config = yaml.load(cluster_config_file,
                                         Loader=yaml.FullLoader)
    '''
    Cluster-related configuration for every cluster system supported.
    '''

    if self.git_hash_tag.endswith('-dirty'):
        if not args.even_if_dirty:
            print("The repository has uncommitted changes, which is why "
                  "we will exit right now.")
            print("If this is not a production environment, you can skip "
                  "this test by specifying --even-if-dirty on the command "
                  "line.")
            print(self.git_hash_tag)
            exit(1)
        command = ['git', 'diff']
        try:
            self.git_dirty_diff = subprocess.check_output(command)
        except subprocess.CalledProcessError:
            logger.error("Execution of %s failed." % " ".join(command))
            sys.exit(1)

    try:
        # Set the cluster type.
        if args.cluster == 'auto':
            self.set_cluster_type(self.autodetect_cluster_type())
        else:
            self.set_cluster_type(args.cluster)
    except AttributeError:
        # Cluster type is not an applicable parameter here, and that's
        # fine (we're probably in run-locally.py).
        pass

    self._config_filepath = args.config.name
    '''
    Name of the YAML configuration file.
    '''

    self.config = dict()
    '''
    Dictionary representation of the configuration YAML file.
    '''

    self.steps = dict()
    '''
    This dict stores step objects by their name. Each step knows its
    dependencies.
    '''

    self.topological_step_order = list()
    '''
    List of topologically ordered steps.
    '''

    self.file_dependencies = dict()
    '''
    This dict stores file dependencies within this pipeline, regardless
    of step, output file tag or run ID. For every output file generated
    by the pipeline, it holds the set of input files that output file
    depends on.
    '''

    self.file_dependencies_reverse = dict()
    '''
    This dict stores file dependencies within this pipeline, regardless
    of step, output file tag or run ID. For every input file required by
    the pipeline, it holds the set of output files which are generated
    using this input file.
    '''

    self.task_id_for_output_file = dict()
    '''
    This dict stores a task ID for every output file created by the
    pipeline.
    '''

    self.task_ids_for_input_file = dict()
    '''
    This dict stores a set of task IDs for every input file used in the
    pipeline.
    '''

    self.input_files_for_task_id = dict()
    '''
    This dict stores a set of input files for every task ID in the
    pipeline.
    '''

    self.output_files_for_task_id = dict()
    '''
    This dict stores a set of output files for every task ID in the
    pipeline.
    '''

    self.task_for_task_id = dict()
    '''
    This dict stores task objects by task IDs.
    '''

    self.all_tasks_topologically_sorted = list()
    '''
    List of all tasks in topological order.
    '''

    self.read_config(args.config)

    # Collect all tasks.
    for step_name in self.topological_step_order:
        step = self.get_step(step_name)
        logger.debug("Collect now all tasks for step: %s" % step)
        for run_index, run_id in enumerate(
                misc.natsorted(step.get_run_ids())):
            task = task_module.Task(self, step, run_id, run_index)
            # If any run of a step contains exec_groups, the task
            # (step/run) is added to the task list.
            run = step.get_run(run_id)
            logger.debug("Step: %s, Run: %s" % (step, run_id))
            run_has_exec_groups = len(run.get_exec_groups()) > 0
            if run_has_exec_groups:
                logger.debug("Task: %s" % task)
                self.all_tasks_topologically_sorted.append(task)
            # Fail if multiple tasks with the same name exist.
            if str(task) in self.task_for_task_id:
                logger.error("%s: Duplicate task ID %s."
                             % (self.get_config_filepath(), str(task)))
                sys.exit(1)
            self.task_for_task_id[str(task)] = task

    self.tool_versions = {}
    self.check_tools()
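# --- Illustrative sketch (not part of the original module) ---------------
# autodetect_cluster_type() is called above but not defined in this
# section. A plausible sketch, assuming it probes the PATH for known
# submit commands; the probe table is an assumption, and the real method
# presumably consults self._cluster_config instead.
import shutil

def autodetect_cluster_type(self):
    probes = {'sbatch': 'slurm', 'qsub': 'sge'}  # assumed mapping
    for tool, cluster_type in probes.items():
        if shutil.which(tool):
            return cluster_type
    return None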
def build_steps(self):
    self.steps = {}
    if 'steps' not in self.config:
        logger.error("%s: Missing key: steps" % self.get_config_filepath())
        sys.exit(1)
    re_simple_key = re.compile(r'^[a-zA-Z0-9_]+$')
    re_complex_key = re.compile(r'^([a-zA-Z0-9_]+)\s+\(([a-zA-Z0-9_]+)\)$')

    # step one: instantiate all steps
    for step_key, step_description in self.config['steps'].items():
        # the step keys in the configuration may be either:
        # - MODULE_NAME
        # - DIFFERENT_STEP_NAME\s+\(MODULE_NAME\)
        step_name = None
        module_name = None
        if re_simple_key.match(step_key):
            step_name = step_key
            module_name = step_key
        else:
            match = re_complex_key.match(step_key)
            if match:
                step_name = match.group(1)
                module_name = match.group(2)

        if step_name == 'temp':
            # A step cannot be named 'temp' because we need the out/temp
            # directory to store temporary files.
            logger.error("%s: A step name cannot be 'temp'."
                         % self.get_config_filepath())
            sys.exit(1)

        step_class = abstract_step.AbstractStep.get_step_class_for_key(
            module_name)
        step = step_class(self)
        step.set_step_name(step_name)
        step.set_options(step_description)
        self.steps[step_name] = step

    # step two: set dependencies
    for step_name, step in self.steps.items():
        if not step.needs_parents:
            if '_depends' in step._options:
                logger.error("%s: %s must not have dependencies because "
                             "it declares no in/* connections (remove the "
                             "_depends key)."
                             % (self.get_config_filepath(), step_name))
                sys.exit(1)
        else:
            if '_depends' not in step._options:
                logger.error("%s: Missing key in step '%s': _depends (set "
                             "to null if the step has no dependencies)."
                             % (self.get_config_filepath(), step_name))
                sys.exit(1)
            depends = step._options['_depends']
            if depends is not None:
                # Accept a single step name as well as a list of names.
                temp_list = [depends] if isinstance(depends, str) else depends
                for d in temp_list:
                    if d not in self.steps:
                        logger.error("%s: Step %s specifies an undefined "
                                     "dependency: %s."
                                     % (self.get_config_filepath(),
                                        step_name, d))
                        sys.exit(1)
                    step.add_dependency(self.steps[d])

    # step three: perform topological sort; fail if there's a cycle
    # (yeah, the algorithm is O(n^2), tsk, tsk...)
    unassigned_steps = set(self.steps.keys())
    assigned_steps = set()
    self.topological_step_order = []
    while len(unassigned_steps) > 0:
        # choose all steps which have all dependencies resolved, either
        # because they have no dependencies or are already assigned
        next_steps = []
        for step_name in unassigned_steps:
            is_ready = True
            for dep in self.steps[step_name].dependencies:
                dep_name = dep.get_step_name()
                if dep_name not in assigned_steps:
                    is_ready = False
                    break
            if is_ready:
                next_steps.append(step_name)
        if len(next_steps) == 0:
            logger.error("%s: There is a cycle in the step dependencies."
                         % self.get_config_filepath())
            sys.exit(1)
        for step_name in misc.natsorted(next_steps):
            self.topological_step_order.append(step_name)
            assigned_steps.add(step_name)
            unassigned_steps.remove(step_name)

    # step four: finalize steps
    for step in self.steps.values():
        step.finalize()
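# --- Illustrative sketch (not part of the original module) ---------------
# '_depends' may be null, a single step name, or a list of names; the
# normalization above reduces all three cases to a list. The step names
# here are invented for the example.
def normalize_depends(depends):
    if depends is None:
        return []
    return [depends] if isinstance(depends, str) else list(depends)

for value in (None, 'cutadapt', ['cutadapt', 'fastqc']):
    print(value, '->', normalize_depends(value))
# prints:
#   None -> []
#   cutadapt -> ['cutadapt']
#   ['cutadapt', 'fastqc'] -> ['cutadapt', 'fastqc']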
def build_steps(self):
    self.steps = {}
    if 'steps' not in self.config:
        raise UAPError("Missing key: steps")
    re_simple_key = re.compile(r'^[a-zA-Z0-9_]+$')
    re_complex_key = re.compile(r'^([a-zA-Z0-9_]+)\s+\(([a-zA-Z0-9_]+)\)$')

    # step one: instantiate all steps
    for step_key, step_description in self.config['steps'].items():
        # the step keys in the configuration may be either:
        # - MODULE_NAME
        # - DIFFERENT_STEP_NAME\s+\(MODULE_NAME\)
        step_name = None
        module_name = None
        if re_simple_key.match(step_key):
            step_name = step_key
            module_name = step_key
        else:
            match = re_complex_key.match(step_key)
            if match:
                step_name = match.group(1)
                module_name = match.group(2)

        if step_name == 'temp':
            # A step cannot be named 'temp' because we need the out/temp
            # directory to store temporary files.
            raise UAPError("A step name cannot be 'temp'.")

        step_class = abstract_step.AbstractStep.get_step_class_for_key(
            module_name)
        step = step_class(self)
        step.set_step_name(step_name)
        step.set_options(step_description)
        self.steps[step_name] = step
        self.used_tools.update(step.used_tools)

    # step two: set dependencies
    for step_name, step in self.steps.items():
        for parent_step in step._options['_depends']:
            if parent_step not in self.steps.keys():
                raise UAPError("Step %s specifies an undefined dependency: "
                               "%s." % (step_name, parent_step))
            step.add_dependency(self.steps[parent_step])

    # step three: perform topological sort; fail if there's a cycle
    # (yeah, the algorithm is O(n^2), tsk, tsk...)
    unassigned_steps = set(self.steps.keys())
    assigned_steps = set()
    self.topological_step_order = []
    while len(unassigned_steps) > 0:
        # choose all steps which have all dependencies resolved, either
        # because they have no dependencies or are already assigned
        next_steps = []
        for step_name in unassigned_steps:
            is_ready = True
            step = self.steps[step_name]
            for dep in step.dependencies:
                dep_name = dep.get_step_name()
                if dep_name not in assigned_steps:
                    is_ready = False
                    break
            if is_ready and step.get_step_type() == 'source_controller':
                # make sure source_controller attempts to run first
                next_steps = [step_name]
                break
            elif is_ready:
                next_steps.append(step_name)
        if len(next_steps) == 0:
            raise UAPError("There is a cycle in the step dependencies.")
        for step_name in misc.natsorted(next_steps):
            self.topological_step_order.append(step_name)
            assigned_steps.add(step_name)
            unassigned_steps.remove(step_name)

    # step four: finalize steps
    for step in self.steps.values():
        step.finalize()
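# --- Illustrative sketch (not part of the original module) ---------------
# All three versions of build_steps() share the same O(n^2) topological
# sort. This is a minimal standalone version on a toy dependency mapping;
# plain sorted() stands in for misc.natsorted, and the step names are
# invented.
deps = {'fastq_source': set(),          # step -> set of parent steps
        'cutadapt': {'fastq_source'},
        'fastqc': {'cutadapt'}}

unassigned, assigned, order = set(deps), set(), []
while unassigned:
    # every step whose parents have all been assigned is ready
    ready = [s for s in unassigned if deps[s] <= assigned]
    if not ready:
        raise RuntimeError("There is a cycle in the step dependencies.")
    for s in sorted(ready):
        order.append(s)
        assigned.add(s)
        unassigned.remove(s)

print(order)  # ['fastq_source', 'cutadapt', 'fastqc']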
def __init__(self, **kwargs):
    self.caught_signal = None
    self._cluster_type = None
    self.git_version = None
    self.git_status = None
    self.git_diff = None
    self.git_untracked = None
    self.git_tag = None

    '''
    Use git diff to determine any changes in the git directory, if git
    is available.
    '''
    command = ['git', '--version']
    try:
        self.git_version = subprocess.check_output(command).strip()
    except subprocess.CalledProcessError:
        logger.warning("Execution of %s failed. Git seems to be "
                       "unavailable. Continuing anyway."
                       % " ".join(command))

    if self.git_version:
        command = ['git', 'status', '--porcelain']
        try:
            self.git_status = subprocess.check_output(command)
        except subprocess.CalledProcessError:
            logger.error("Execution of %s failed." % " ".join(command))

        command = ['git', 'diff', 'HEAD']
        try:
            self.git_diff = subprocess.check_output(command)
        except subprocess.CalledProcessError:
            logger.error("Execution of %s failed." % " ".join(command))

        command = ['git', 'ls-files', '--others', '--exclude-standard']
        try:
            self.git_untracked = subprocess.check_output(command)
        except subprocess.CalledProcessError:
            logger.error("Execution of %s failed." % " ".join(command))

        command = ['git', 'describe', '--all', '--long']
        try:
            self.git_tag = subprocess.check_output(command).strip()
        except subprocess.CalledProcessError:
            logger.error("Execution of %s failed." % " ".join(command))

    if self.git_diff:
        logger.warning('THE GIT REPOSITORY HAS UNCOMMITTED CHANGES:\n%s'
                       % self.git_diff.decode('utf-8'))
    if self.git_untracked:
        logger.warning('THE GIT REPOSITORY HAS UNTRACKED FILES:\n%s'
                       % self.git_untracked.decode('utf-8'))

    # Check if we got passed an 'arguments' parameter; this parameter
    # should contain an argparse.Namespace object.
    self.args = None
    if 'arguments' in kwargs:
        self.args = kwargs['arguments']

    '''
    Absolute path to the directory of the uap executable. It is used to
    circumvent path issues.
    '''
    self._uap_path = self.args.uap_path

    '''
    The cluster type to be used (must be one of the keys specified in
    cluster_config).
    '''
    self._cluster_config_path = os.path.join(
        self._uap_path, 'cluster/cluster-specific-commands.yaml')
    with open(self._cluster_config_path, 'r') as cluster_config_file:
        self._cluster_config = yaml.load(cluster_config_file,
                                         Loader=yaml.FullLoader)

    try:
        # Set the cluster type.
        if self.args.cluster == 'auto':
            self.set_cluster_type(self.autodetect_cluster_type())
        else:
            self.set_cluster_type(self.args.cluster)
    except AttributeError:
        # Cluster type is not an applicable parameter here, and that's
        # fine (we're probably in run-locally.py).
        pass

    self._start_working_dir = os.getcwd()
    '''
    User working directory.
    '''

    if not self.args.config:
        raise UAPError('No <project-config>.yaml specified.')

    self._config_path, self.config_name = os.path.split(
        self.args.config.name)
    '''
    Name of the YAML configuration file.
    '''

    self._config_path = os.path.abspath(self._config_path)
    '''
    Path of the YAML configuration file.
    '''

    self.config = dict()
    '''
    Dictionary representation of the configuration YAML file.
    '''

    self.steps = dict()
    '''
    This dict stores step objects by their name. Each step knows its
    dependencies.
    '''

    self.topological_step_order = list()
    '''
    List of topologically ordered steps.
    '''

    self.file_dependencies = dict()
    '''
    This dict stores file dependencies within this pipeline, regardless
    of step, output file tag or run ID. For every output file generated
    by the pipeline, it holds the set of input files that output file
    depends on.
    '''

    self.file_dependencies_reverse = dict()
    '''
    This dict stores file dependencies within this pipeline, regardless
    of step, output file tag or run ID. For every input file required by
    the pipeline, it holds the set of output files which are generated
    using this input file.
    '''

    self.task_id_for_output_file = dict()
    '''
    This dict stores a task ID for every output file created by the
    pipeline.
    '''

    self.task_for_output_file = dict()
    '''
    This dict stores the task object for every output file created by
    the pipeline.
    '''

    self.task_ids_for_input_file = dict()
    '''
    This dict stores a set of task IDs for every input file used in the
    pipeline.
    '''

    self.input_files_for_task_id = dict()
    '''
    This dict stores a set of input files for every task ID in the
    pipeline.
    '''

    self.output_files_for_task_id = dict()
    '''
    This dict stores a set of output files for every task ID in the
    pipeline.
    '''

    self.task_for_task_id = dict()
    '''
    This dict stores task objects by task IDs.
    '''

    self.all_tasks_topologically_sorted = list()
    '''
    List of all tasks in topological order.
    '''

    self.tasks_in_step = dict()
    '''
    This dict stores tasks per step name.
    '''

    self.used_tools = set()
    '''
    A set that stores all tools used by some step.
    '''

    self.known_config_keys = {'destination_path', 'constants', 'cluster',
                              'steps', 'lmod', 'tools',
                              'base_working_directory', 'id'}
    '''
    A set of accepted keys in the config.
    '''

    self.read_config(self.args.config)
    self.setup_lmod()
    self.build_steps()

    # Warn about tools that are configured but never used by any step.
    configured_tools = set(
        tool for tool, conf in self.config['tools'].items()
        if not conf.get('atomatically_configured'))
    unused_tools = configured_tools - self.used_tools
    if unused_tools:
        logger.warning('Unused tool(s): %s' % list(unused_tools))

    # Collect all tasks.
    for step_name in self.topological_step_order:
        step = self.get_step(step_name)
        self.tasks_in_step[step_name] = list()
        logger.debug("Collect now all tasks for step: %s" % step)
        for run_index, run_id in enumerate(
                misc.natsorted(step.get_run_ids())):
            task = task_module.Task(self, step, run_id, run_index)
            # If any run of a step contains exec_groups, the task
            # (step/run) is added to the task list.
            run = step.get_run(run_id)
            logger.debug("Step: %s, Run: %s" % (step, run_id))
            run_has_exec_groups = len(run.get_exec_groups()) > 0
            if run_has_exec_groups:
                logger.debug("Task: %s" % task)
                self.all_tasks_topologically_sorted.append(task)
                self.tasks_in_step[step_name].append(task)
            # Fail if multiple tasks with the same name exist.
            if str(task) in self.task_for_task_id:
                raise UAPError("Duplicate task ID %s." % task)
            self.task_for_task_id[str(task)] = task

    self.tool_versions = {}
    if not self.args.no_tool_checks:
        self.check_tools()
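# --- Illustrative sketch (not part of the original module) ---------------
# Hypothetical usage of the indexes built above, assuming a constructed
# pipeline (e.g. pipeline = Pipeline(arguments=args)). The task ID format
# and the step name are assumptions, not taken from the source.
task = pipeline.task_for_task_id['cutadapt/Sample_01']
for t in pipeline.tasks_in_step['cutadapt']:
    print(t, t in pipeline.all_tasks_topologically_sorted)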