def ev_close(self, worker, timedout):
    """Display the gathered command output once the worker closes.

    Buffers are displayed per return code (in sorted order) and, within
    a return code, ordered with bufnodeset_cmpkey.  Nodes of a return
    code group that were not covered by any displayed buffer are handed
    to print_gather_finalize().
    """
    # Worker is closing -- it's time to gather results...
    self._runtimer_finalize(worker)

    runtimer_cleaned = False
    # Display command output, try to order buffers by rc
    for _rc, nodelist in sorted(worker.iter_retcodes()):
        remaining = NodeSet._fromlist1(nodelist)
        gathered = [(buf, NodeSet._fromlist1(keys))
                    for buf, keys in worker.iter_buffers(nodelist)]
        # Then order by node/nodeset (see nodeset_cmpkey)
        gathered.sort(key=bufnodeset_cmpkey)
        for buf, nodeset in gathered:
            if not runtimer_cleaned:
                # clean runtimer line before printing first result
                self._runtimer_clean()
                runtimer_cleaned = True
            self._display.print_gather(nodeset, buf)
            remaining.difference_update(nodeset)
        if remaining:
            self._display.print_gather_finalize(remaining)

    self._display.flush()
    self._close_common(worker)
    # Notify main thread to update its prompt
    self.update_prompt(worker)
def update(self):
    """Refresh the runtime progress line written to stderr.

    The line shows completed/total clients, the number of gateways when
    running in tree mode and the write bandwidth when bytes have been
    written.  Nothing is written when no byte was written and the client
    count did not change since the previous call.
    """
    bandwidth_info = ''
    if self.bytes_written > 0:
        elapsed = time.time() - self.start_time
        bandwidth_info = " write: %s/s" % human_bi_bytes_unit(
            self.bytes_written / elapsed)

    gwcnt = len(self.task.gateways)
    if not gwcnt:
        cnt = len(self.task._engine.clients())
        gateway_info = ''
    else:
        # tree mode: count targets behind gateways, not the gateways
        active_targets = NodeSet()
        for gw, (_chan, metaworkers) in self.task.gateways.items():
            active_targets.updaten(mw.gwtargets[gw] for mw in metaworkers)
        cnt = len(active_targets) + len(self.task._engine.clients()) - gwcnt
        gateway_info = ' gw %d' % gwcnt

    if self.bytes_written <= 0 and cnt == self.cnt_last:
        return

    self.cnt_last = cnt
    # display completed/total clients
    line = 'clush: %*d/%*d%s%s\r' % (self.tslen, self.total - cnt,
                                     self.tslen, self.total,
                                     gateway_info, bandwidth_info)
    self.wholelen = len(line)
    sys.stderr.write(line)
    self.started = True
def display(tree, disp, gather, trace_mode, enable_nodeset_key):
    """Display the content of MsgTree instance `tree' using the `disp'
    Display object.

    `trace_mode' takes precedence and delegates to display_tree().
    Otherwise `gather' selects gathered output (one entry per distinct
    message) versus one entry per key, and `enable_nodeset_key' selects
    whether keys are handled as NodeSet objects or as raw keys.
    The output stream is always flushed before returning.
    """
    out = sys_stdout()
    try:
        if trace_mode:
            display_tree(tree, disp, out)
        elif gather and enable_nodeset_key:
            # build a NodeSet from the keys returned by walk()
            nodesets = [NodeSet.fromlist(keys) for _msg, keys in tree.walk()]
            nodesets.sort(key=nodeset_cmpkey)
            for nodeset in nodesets:
                disp.print_gather(nodeset, tree[nodeset[0]])
        elif gather:
            for msg, key in tree.walk():
                disp.print_gather_keys(key, msg)
        elif enable_nodeset_key:
            # nodes are automagically sorted by NodeSet
            for node in NodeSet.fromlist(tree.keys()).nsiter():
                disp.print_gather(node, tree[str(node)])
        else:
            for key in tree.keys():
                disp.print_gather_keys([key], tree[key])
    finally:
        out.flush()
def get_available_nodes(self, slices_size=1):
    """
    Return the currently idle nodes grouped by slices of slices_size.

    Nodes whose state is 'IDLE' are visited in sorted name order and
    folded into nodeset strings holding exactly slices_size nodes each,
    ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]'].
    Trailing idle nodes that do not fill a complete slice are not
    returned.

    :param slices_size: number of nodes per slice
    :type slices_size: int
    :returns: folded nodeset strings, one per complete slice
    :rtype: list of str
    """
    node_list = []
    node_dict = pyslurm.node().get()
    nodeset = NodeSet()
    node_count = 0
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only);
    # node names are unique so the sort never compares the value dicts.
    for key, value in sorted(node_dict.items()):
        if value['state'] != 'IDLE':
            continue
        nodeset.update(key)
        node_count += 1
        if node_count == slices_size:
            # slice is complete: store it and start a new one
            node_list.append(str(nodeset))
            nodeset = NodeSet()
            node_count = 0
    return node_list
def testConfigurationLongSyntax(self):
    """test detailed topology description syntax"""
    # The context manager closes (and thus removes) the temporary file
    # even when one of the assertions below fails.
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(b'# this is a comment\n')
        tmpfile.write(b'[routes]\n')
        tmpfile.write(b'admin: proxy\n')
        tmpfile.write(b'proxy: STA[0-1]\n')
        tmpfile.write(b'STA0: STB[0-1]\n')
        tmpfile.write(b'STB0: nodes[0-2]\n')
        tmpfile.write(b'STB1: nodes[3-5]\n')
        tmpfile.write(b'STA1: STB[2-3]\n')
        tmpfile.write(b'STB2: nodes[6-7]\n')
        tmpfile.write(b'STB3: nodes[8-10]\n')
        # make the content visible to readers opening the file by name
        tmpfile.flush()

        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet('admin,proxy,STA[0-1],STB[0-3],nodes[0-10]')
        ns_tree = NodeSet()
        tree = parser.tree('admin')
        self.assertEqual(tree.inner_node_count(), 8)
        self.assertEqual(tree.leaf_node_count(), 11)
        for nodegroup in tree:
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
def testMultipleAdminGroups(self):
    """test topology with several admin groups"""
    ## -------------------
    # TODO : uncommenting following lines should not produce an error. This
    # is a valid topology!!
    # ----------
    # The context manager closes (and thus removes) the temporary file
    # even when one of the assertions below fails.
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(b'[routes]\n')
        tmpfile.write(b'admin0: nodes[0-1]\n')
        #tmpfile.write(b'admin1: nodes[0-1]\n')
        tmpfile.write(b'admin2: nodes[2-3]\n')
        #tmpfile.write(b'admin3: nodes[2-3]\n')
        tmpfile.write(b'nodes[0-1]: nodes[10-19]\n')
        tmpfile.write(b'nodes[2-3]: nodes[20-29]\n')
        # make the content visible to readers opening the file by name
        tmpfile.flush()

        parser = TopologyParser(tmpfile.name)

        ns_all = NodeSet('admin2,nodes[2-3,20-29]')
        ns_tree = NodeSet()
        tree = parser.tree('admin2')
        self.assertEqual(tree.inner_node_count(), 3)
        self.assertEqual(tree.leaf_node_count(), 10)
        for nodegroup in tree:
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
def ev_close(self, worker):
    """
    Check process termination status and generate appropriate events.

    A timeout or any non-zero return code is reported through
    fs._handle_shine_proxy_error() and sets the ACT_ERROR status;
    otherwise the status is set to ACT_OK.
    """
    Action.ev_close(self, worker)

    # Action timed out
    if worker.did_timeout():
        timed_out = NodeSet.fromlist(worker.iter_keys_timeout())
        self.fs._handle_shine_proxy_error(timed_out, "Nodes timed out")
        self.set_status(ACT_ERROR)
        return

    # Action succeeded
    if max(rc for rc, _ in worker.iter_retcodes()) == 0:
        self.set_status(ACT_OK)
        return

    # Action failed
    for rc, rc_nodes in worker.iter_retcodes():
        if rc == 0:
            continue

        # Avoid warnings, flag this component in error state
        for comp in self._comps or []:
            comp.sanitize_state(nodes=worker.nodes)

        for output, keys in worker.iter_buffers(match_keys=rc_nodes):
            failed = NodeSet.fromlist(keys)
            self.fs._handle_shine_proxy_error(failed,
                                              "Copy failed: %s" % output)
    self.set_status(ACT_ERROR)
def test_internal_mismatch(self):
    """Check str() of a NodeSet whose internal RangeSetND has a number of
    dimensions that may not match its two-slot pattern."""
    pattern = "cluster%sc%s"
    nodeset = NodeSet("cluster[1-30]c[1-2]")
    self.assertTrue(pattern in nodeset._patterns)

    # one dimension for a two-slot pattern
    nodeset._patterns[pattern] = RangeSetND([[1]])
    self.assertRaises(NodeSetParseError, str, nodeset)

    # two dimensions: matches the pattern
    nodeset._patterns[pattern] = RangeSetND([[1, 1]])
    self.assertEqual(str(nodeset), "cluster1c1")

    # three dimensions for a two-slot pattern
    nodeset._patterns[pattern] = RangeSetND([[1, 1, 1]])
    self.assertRaises(NodeSetParseError, str, nodeset)
class LiveGatherOutputHandler(GatherOutputHandler):
    """Live line-gathered output event handler class (-bL)."""

    def __init__(self, display, nodes):
        assert nodes is not None, "cannot gather local command"
        GatherOutputHandler.__init__(self, display)
        # nodes still expected to contribute to each gathered line
        self._nodes = NodeSet(nodes)
        # number of stdout lines read so far, per node
        self._nodecnt = dict.fromkeys(self._nodes, 0)
        # queue of MsgTree objects holding lines not displayed yet
        self._mtreeq = []
        # number of MsgTree objects already removed from the queue
        self._offload = 0

    def _print_mtree(self, mtree):
        """Display all buffers of `mtree', ordered by bufnodeset_cmpkey."""
        gathered = [(buf, NodeSet.fromlist(keys))
                    for buf, keys in mtree.walk()]
        gathered.sort(key=bufnodeset_cmpkey)
        for buf, nodeset in gathered:
            self._display.print_gather(nodeset, buf)

    def ev_read(self, worker, node, sname, msg):
        """Queue a stdout line of `node'; other streams are handled by
        the parent class."""
        if sname != worker.SNAME_STDOUT:
            GatherOutputHandler.ev_read(self, worker, node, sname, msg)
            return
        # Read new line from node
        cnt = self._nodecnt[node] + 1
        self._nodecnt[node] = cnt
        if len(self._mtreeq) < cnt:
            self._mtreeq.append(MsgTree())
        # cnt is absolute; the queue index is relative to offloaded lines
        self._mtreeq[cnt - self._offload - 1].add(node, msg)
        self._live_line(worker)

    def ev_hup(self, worker, node, rc):
        """Stop waiting for `node' if it has no message in the oldest
        queued line."""
        if not self._mtreeq or node in self._mtreeq[0]:
            return
        # forget a node that doesn't answer to continue live line
        # gathering anyway
        self._nodes.remove(node)
        self._live_line(worker)

    def _live_line(self, worker):
        """Display queued lines for which all nodes have replied."""
        while self._mtreeq and len(self._mtreeq[0]) == len(self._nodes):
            mtree = self._mtreeq.pop(0)
            self._offload += 1
            self._runtimer_clean()
            self._print_mtree(mtree)
            self._runtimer_set_dirty()

    def ev_close(self, worker, timedout):
        """Display the lines still queued and finalize the worker."""
        # Worker is closing -- it's time to gather results...
        self._runtimer_finalize(worker)

        for mtree in self._mtreeq:
            self._print_mtree(mtree)

        self._close_common(worker)

        # Notify main thread to update its prompt
        self.update_prompt(worker)
def nodes_error(self):
    """Get nodeset of error nodes for this action.

    For a WorkerPopen worker the result is "localhost" when the return
    code is neither None nor 0; for other workers it is the union of the
    nodes with a non-zero return code.  The nodeset is empty when there
    is no worker.
    """
    failed = NodeSet()
    if not self.worker:
        return failed

    if isinstance(self.worker, WorkerPopen):
        retcode = self.worker.retcode()
        # We don't count timeout (retcode=None)
        if retcode not in (None, 0):
            failed = NodeSet("localhost")
        return failed

    for retcode, nds in self.worker.iter_retcodes():
        if retcode != 0:
            failed.add(nds)
    return failed
def _natural_sort_key(text):
    """Return a sort key comparing the digit runs of `text` numerically."""
    import re  # local import keeps this block self-contained
    # With a capture group, re.split() alternates text and digit chunks:
    # even indexes are text (possibly empty), odd indexes are digits.
    parts = re.split(r'(\d+)', text)
    parts[1::2] = [int(digits) for digits in parts[1::2]]
    return parts


def _report_unexec(a_model, execution):
    """
    Display the 'unexec' type of report

    Logs a summary line (number and percentage of unexecuted actions)
    followed by a table giving, for each unexecuted action, its number
    of dependencies and the dependencies that are missing (in error or
    unexecuted).
    """
    all_actions_set = set(a_model.actions.keys())
    all_actions_set_nb = len(all_actions_set)
    executed_actions_set = set(execution.executed_actions.keys())
    unexecuted_actions_set = all_actions_set.difference(executed_actions_set)
    unexecuted_actions_nb = len(unexecuted_actions_set)
    try:
        percentage = (float(unexecuted_actions_nb) / all_actions_set_nb) * 100
    except ZeroDivisionError:
        percentage = 0.0
    _LOGGER.output("\nUnexecuted Actions: %d (%2.1f %%)\t" + \
                   "Legend: mDeps=missings (error or unexecuted)" + \
                   " dependencies",
                   unexecuted_actions_nb, percentage)
    tab_values = []
    # Natural sort so that:
    # b1, b2, b20, c1, c2, c10, c100 appears in that order.
    # (A len() sort followed by a plain sorted() only yields the
    # lexicographic order, where c10 and c100 come before c2.)
    for id_ in sorted(unexecuted_actions_set, key=_natural_sort_key):
        action = a_model.actions[id_]
        all_deps = action.all_deps()
        all_deps_nb = len(all_deps)
        unexec = set(all_deps) - set(execution.executed_actions.keys())
        error = set(all_deps) & set(execution.error_actions.keys())
        missings = unexec.union(error)
        nodeset = NodeSet()
        missing_nb = len(missings)
        for missing in missings:
            # empty identifiers cannot be added to the displayed set
            if len(missing) != 0:
                nodeset.add(missing)
        try:
            percentage = ((float(missing_nb) / all_deps_nb) * 100)
        except ZeroDivisionError:
            percentage = 0.0
        tab_values.append([id_, str(all_deps_nb), str(missing_nb),
                           u"%2.1f" % percentage, str(nodeset)])
    output = smart_display([u"Id", u"#Deps", u"#mDeps", u"%mDeps", u"mDeps"],
                           tab_values, vsep=u" | ",
                           justify=[str.center, str.center, str.center,
                                    str.center, str.ljust])
    _LOGGER.output(output)
def __init__(self, display, nodes):
    """Initialize live line gathering for `nodes' (must not be None)."""
    assert nodes is not None, "cannot gather local command"
    GatherOutputHandler.__init__(self, display)
    # nodes still expected to contribute to each gathered line
    self._nodes = NodeSet(nodes)
    # per-node counters, all starting at 0
    self._nodecnt = dict.fromkeys(self._nodes, 0)
    # queue of pending message trees, initially empty
    self._mtreeq = []
    # offload counter, initially 0
    self._offload = 0
def __gen_action_output(self, iterbuf, iterrc, timeouts, error_only):
    '''Build the list of lines describing a command result.

    Output buffers come first (one line per buffer line), then the
    return codes, then the timeouts.  When `error_only` is set, nodes
    that exited with 0 are left out of the buffer and return code lines.
    '''
    line_fmt = ' > %s: %s'
    exit_fmt = ' > %s exited with %s'

    # Materialize the retcodes: they are iterated twice below
    retcodes = list(iterrc)
    ok_nodes = NodeSet.fromlist((nds for rc, nds in retcodes if rc == 0))

    output = []
    for out, nodes in iterbuf:
        if error_only:
            nodes = NodeSet(nodes) - ok_nodes
        if not (nodes and out):
            continue
        output.extend(line_fmt % (self.string_color(nodes, 'CYAN'), lbuf)
                      for lbuf in out.splitlines())

    for retcode, nodes in retcodes:
        if retcode == 0:
            if error_only:
                continue
            color = 'GREEN'
        else:
            color = 'RED'
        output.append(exit_fmt % (self.string_color(nodes, 'CYAN'),
                                  self.string_color(retcode, color)))

    if len(timeouts):
        output.append(' > %s has %s' % (self.string_color(timeouts, 'CYAN'),
                                        self.string_color('timeout', 'RED')))
    return output
def submitNodeList(self): # info msg print '\n# Step 1 of 3 : Please enter nodes name below (using the clustershell syntax <azur1>, <azur[1-2]>) :' # retrieve keyboard input try: self.ns = NodeSet(self.input_request('')) repeat = True # ask if the user wants to add another node/node group while repeat : # print added nodes for node in self.ns: print 'node : %s' % node # user want to add nodes ? print '\n### Add nodes ? (yes | no)' # retrieve answer ans = self.input_request('') # check the ans if ans == 'Yes' or ans == 'Y' or ans == 'y' or ans == 'yes': print '### Please enter the node/group list below : ' # retrieve and append nodes self.ns.add(self.input_request('')) # the user don't want to add additionnal nodes else: # unset flag repeat = False # check submitted nodes self.ns = self.checkSubmittedNodes(self.ns) # invalid submitted node list / syntax error except NodeSetException : print >> sys.stderr, '\n(!) Error : the submitted node list is not valid\n' % self.ns
def reset(self):
    '''Restore the attributes to their initial values so that the
    entity can be executed again.'''
    # no longer tagged
    self._tagged = False
    # restore the target saved in _target_backup
    self.target = self._target_backup
    self.status = NO_STATUS
    # forget previously failed nodes
    self.failed_nodes = NodeSet()
    self.algo_reversed = False
def torque_job_nodelist(self, nodelist):
    """Summarize a '+'-separated exec host string.

    Whatever matches self._exechostpat is first removed from `nodelist`.
    Returns a tuple (nbprocs, nbnodes, nodelist): the number of
    '+'-separated entries, the number of nodes in the NodeSet built from
    these entries, and that NodeSet as a string.
    """
    stripped = self._exechostpat.sub('', nodelist)
    entries = stripped.split('+')
    nodeset = NodeSet.fromlist(entries)
    return len(entries), len(nodeset), str(nodeset)
def testConfigurationParserBigTree(self):
    """test configuration parser against big propagation tree"""
    # NamedTemporaryFile is opened in binary mode by default, so bytes
    # are written (a no-op change on Python 2, required on Python 3).
    # The context manager closes (and thus removes) the file even when
    # the assertion fails.
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(b'# this is a comment\n')
        tmpfile.write(b'[Main]\n')
        tmpfile.write(b'admin: ST[0-4]\n')
        tmpfile.write(b'ST[0-4]: STA[0-49]\n')
        tmpfile.write(b'STA[0-49]: nodes[0-10000]\n')
        # make the content visible to readers opening the file by name
        tmpfile.flush()

        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet('admin,ST[0-4],STA[0-49],nodes[0-10000]')
        ns_tree = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
def testConfigurationParser(self):
    """test configuration parsing"""
    # NamedTemporaryFile is opened in binary mode by default, so bytes
    # are written (a no-op change on Python 2, required on Python 3).
    # The context manager closes (and thus removes) the file even when
    # the assertion fails.
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(b'# this is a comment\n')
        tmpfile.write(b'[Main]\n')
        tmpfile.write(b'admin: nodes[0-1]\n')
        tmpfile.write(b'nodes[0-1]: nodes[2-5]\n')
        tmpfile.write(b'nodes[4-5]: nodes[6-9]\n')
        # make the content visible to readers opening the file by name
        tmpfile.flush()

        parser = TopologyParser(tmpfile.name)
        # the tree is requested twice: this first result is discarded
        parser.tree('admin')

        ns_all = NodeSet('admin,nodes[0-9]')
        ns_tree = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
def _close_common(self, worker):
    """Report on stderr the nodes with a non-zero exit code and the
    nodes that timed out.

    Messages are emitted at VERB_STD level when the display has maxrc
    set, at VERB_QUIET level otherwise.
    """
    verbexit = VERB_STD if self._display.maxrc else VERB_QUIET

    # Display return code if not ok ( != 0)
    for rc, nodelist in worker.iter_retcodes():
        if rc == 0:
            continue
        ns = NodeSet._fromlist1(nodelist)
        if self._display.verbosity > VERB_QUIET and len(ns) > 1:
            # append the node count when more than one node is involved
            nsdisp = "%s (%d)" % (ns, len(ns))
        else:
            nsdisp = ns
        self._display.vprint_err(
            verbexit, "clush: %s: exited with exit code %d" % (nsdisp, rc))

    # Display nodes that didn't answer within command timeout delay
    if worker.num_timeout() > 0:
        timed_out = NodeSet._fromlist1(worker.iter_keys_timeout())
        self._display.vprint_err(verbexit,
                                 "clush: %s: command timeout" % timed_out)
def testConfigurationShortSyntax(self):
    """test short topology specification syntax"""
    # NamedTemporaryFile is opened in binary mode by default, so bytes
    # are written (a no-op change on Python 2, required on Python 3).
    # The context manager closes (and thus removes) the file even when
    # the assertion fails.
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(b'# this is a comment\n')
        tmpfile.write(b'[Main]\n')
        tmpfile.write(b'admin: nodes[0-9]\n')
        tmpfile.write(b'nodes[0-3,5]: nodes[10-19]\n')
        tmpfile.write(b'nodes[4,6-9]: nodes[30-39]\n')
        # make the content visible to readers opening the file by name
        tmpfile.flush()

        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet('admin,nodes[0-19,30-39]')
        ns_tree = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
def parse(self, filename=None):
    """
    Parse the content of the tuning configuration file and store the
    configuration in the object.

    `filename` defaults to self.filename.  Comments (everything after
    '#') and blank lines are skipped.  Every other line must be either
    an alias declaration or a parameter declaration.

    Raises TuningError on an unrecognized line or when the file cannot
    be read.
    """
    # Build the patterns to retrieve alias and parameter declaration
    alias_re = re.compile(r"alias\s+(\S+)\s*=\s*(\S+)$")
    parameter_re = re.compile(r'("[^"]+"|\S+)\s+(\S+)\s+(\S+)$')

    supported = NodeSet.fromlist(list(NODE_TYPES) + list(TYPE_ALIASES))

    try:
        # The with-statement closes the file even when a TuningError is
        # raised while parsing.
        with open(filename or self.filename) as tuning_file:
            for line in tuning_file:
                # Skip comments and blanks
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue

                m_alias = alias_re.match(line)
                m_param = parameter_re.match(line)

                if m_alias:
                    # This line is an alias creation
                    self.create_parameter_alias(m_alias.group(1),
                                                m_alias.group(2))
                elif m_param:
                    # This line is a parameter instantiation; the third
                    # field is a ';'-separated list split between
                    # supported names and the remaining ones.
                    nodes = NodeSet.fromlist(
                        m_param.group(3).lower().split(';'))
                    self.create_parameter(m_param.group(2),
                                          m_param.group(1),
                                          nodes & supported,
                                          nodes - supported)
                else:
                    # This line is not recognized
                    raise TuningError("Wrong tuning syntax '%s'" % line)
    except IOError as error:
        msg = "Error while reading tuning configuration file: %s" % error
        raise TuningError(msg)
def update_target(self, nodeset, mode=None):
    '''Update the attribute target of an entity.

    Without `mode`, the target is replaced by a NodeSet built from
    `nodeset`.  With mode 'DIF' (resp. 'INT') and a non-empty current
    target, `nodeset` is subtracted from (resp. intersected with) the
    target in place.  Any other case leaves the target untouched.
    '''
    assert nodeset is not None
    if not mode:
        self.target = NodeSet(nodeset)
    # Modes are compared by value: an identity test ('is') only matches
    # when both strings happen to be the same interned object.
    elif mode == 'DIF' and self.target:
        self.target.difference_update(nodeset)
    elif mode == 'INT' and self.target:
        self.target.intersection_update(nodeset)
def iter_retcodes(self, match_keys=None):
    """
    Iterate over (return code, NodeSet) pairs for this worker.

    When the optional parameter match_keys is given, it is forwarded to
    the task so that only keys found in match_keys are returned.
    """
    self._task_bound_check()
    by_worker = self.task._rc_iter_by_worker(self, match_keys)
    for retcode, rc_keys in by_worker:
        yield retcode, NodeSet.fromlist(rc_keys)
def __init__(self, name, value, node_types=None, node_list=None):
    """Store the name and value along with the node types and node
    list given; both default to empty collections."""
    self.name = name
    self.value = value

    # _node_types is set before node_types is assigned
    self._node_types = set()
    self.node_types = node_types or set()

    self.node_list = NodeSet()
    if node_list is not None:
        self.node_list = NodeSet.fromlist(node_list)
def nodes_timeout(self):
    """Get nodeset of timeout nodes for this action.

    For a WorkerPopen worker the result is "localhost" when the worker
    timed out; for other workers it is built from the worker timeout
    keys.  The nodeset is empty in every other case.
    """
    if not self.worker:
        return NodeSet()

    if not isinstance(self.worker, WorkerPopen):
        return NodeSet.fromlist(list(self.worker.iter_keys_timeout()))

    if self.worker.did_timeout():
        return NodeSet("localhost")
    return NodeSet()
def __init__(self, root, topology, fanout=0):
    """Store the given root, topology and fanout, then generate the
    table from them."""
    self.root = root
    self.topology = topology
    self.fanout = fanout
    self.nodes_fanin = {}
    # table is reset first, then built by table_generate()
    self.table = None
    self.table_generate(root, topology)
    # no host is marked unreachable initially
    self._unreachable_hosts = NodeSet()
def connected(self, src_ns):
    """Return the aggregation of the destinations given by each route
    for src_ns.

    Argument src_ns is expected to be a NodeSet instance. The result is
    a NodeSet instance, or None when no route provides a destination.
    """
    destinations = []
    for route in self._routes:
        dst = route.dest(src_ns)
        if dst is not None:
            destinations.append(dst)

    next_hop = NodeSet.fromlist(destinations)
    return next_hop if len(next_hop) != 0 else None
def checkNodes(self): try: # print command info print '\n== Checking active nodes ==' # launch ping on the specified nodes task_self().run('echo OK', nodes=self.ns) # retrieve and check return code for retcode, nodes in task_self().iter_retcodes(): if retcode in (0, 1, 2): # add nodes to OK set self.ns_ok |= NodeSet.fromlist(nodes) print '%s : OK' % nodes else: # add nodes to KO set self.ns_ko |= NodeSet.fromlist(nodes) print '%s : KO' % nodes # syntax error except NodeSetException: print >> sys.stderr, '(!) Error : the submitted nodeset [%s] is not valid' % self.ns
def iter_errors(self, match_keys=None):
    """
    Iterate over (error buffer, NodeSet) pairs for this worker.

    The buffers come from the task message tree of the SNAME_STDERR
    stream.  When the optional parameter match_keys is given, only keys
    found in match_keys are returned.
    """
    self._task_bound_check()
    stderr_walk = self.task._msgtree(self.SNAME_STDERR).walk
    matches = self.task._call_tree_matcher(stderr_walk, match_keys, self)
    for errmsg, errkeys in matches:
        yield errmsg, NodeSet.fromlist(errkeys)
def testConfigurationParserDeepTree(self):
    """test a configuration that generates a deep tree"""
    # NamedTemporaryFile is opened in binary mode by default, so bytes
    # are written (a no-op change on Python 2, required on Python 3).
    # The context manager closes (and thus removes) the file even when
    # the assertion fails.
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(b'# this is a comment\n')
        tmpfile.write(b'[Main]\n')
        tmpfile.write(b'admin: nodes[0-9]\n')

        levels = 15  # how deep do you want the tree to be?
        # each level routes one block of ten nodes to the next block
        for i in range(0, levels * 10, 10):
            line = 'nodes[%d-%d]: nodes[%d-%d]\n' % (i, i + 9, i + 10, i + 19)
            tmpfile.write(line.encode('ascii'))
        # make the content visible to readers opening the file by name
        tmpfile.flush()

        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet('admin,nodes[0-159]')
        ns_tree = NodeSet()
        for nodegroup in parser.tree('admin'):
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
def remove_mount_point(self):
    """Remove dfuse directory.

    The directory is only removed when check_file_exists() reports it
    on the hosts.

    Raises:
        CommandFailure: if the mount point is not specified or if
            removing the directory fails on at least one host

    """
    # raise exception if mount point not specified
    if self.mount_dir.value is None:
        raise CommandFailure("Mount point not specified, "
                             "check test yaml file")

    dir_exists, _ = general_utils.check_file_exists(
        self.hosts, self.mount_dir.value, directory=True)
    if not dir_exists:
        return

    cmd = "rm -rf {}".format(self.mount_dir.value)
    ret_code = general_utils.pcmd(self.hosts, cmd, timeout=30)
    # ret_code maps each return code to the hosts that yielded it: more
    # than one key, or no 0 key, means at least one host failed.  Testing
    # only "0 not in ret_code" would miss partial failures.
    if len(ret_code) > 1 or 0 not in ret_code:
        error_hosts = NodeSet(
            ",".join(
                [str(node_set) for code, node_set in ret_code.items()
                 if code != 0]))
        raise CommandFailure(
            "Error removing the {} dfuse mount point on the following "
            "hosts: {}".format(self.mount_dir.value, error_hosts))
def fromdict(self, svcdict):
    """Populate service attributes from dict.

    Each key of the optional 'actions' entry is expanded as a NodeSet,
    one Action being created per resulting name.  The names listed in
    an action 'check' property are added as dependencies between the
    created actions.
    """
    BaseEntity.fromdict(self, svcdict)

    if 'actions' in svcdict:
        created = {}
        checks = {}
        # First pass: create every action and remember its dependencies
        for names, props in svcdict['actions'].items():
            for name in NodeSet(names):
                new_action = Action(name)
                new_action.fromdict(props)
                created[name] = new_action
                checks[name] = props.get('check', [])

        # Second pass: all actions exist, dependencies can be resolved
        for new_action in created.values():
            for dep_name in checks[new_action.name]:
                new_action.add_dep(created[dep_name])
            self.add_action(new_action)

    # Inherits properties between service and actions
    for action in self.iter_actions():
        action.inherits_from(self)
def remote_copy(self, hostlist, remote_dir, local_dir):
    """Copy files from remote dir to local dir.

    This is a temporary method and will be replaced by clush in
    general_utils

    Args:
        hostlist (list): list of remote nodes
        remote_dir (str): remote directory of files
        local_dir (str): local directory

    Returns:
        status: the value returned by process.run() for the clush command

    """
    this_host = socket.gethostname()
    targets = NodeSet.fromlist(hostlist)
    # Copy logfiles from non-empty client directories
    remote_script = (
        "if [ ! -z \\\"\\$(ls -A {0})\\\" ]; then "
        "scp -p -r {0}/ \\\"{1}:'{2}/'\\\" && rm -rf {0}/*; fi"
    ).format(remote_dir, this_host, local_dir)
    command = "clush -w {} -B -S \"{}\"".format(targets, remote_script)
    return process.run(command, timeout=300)
def _distant_action_by_server(self, action_class, servers, **kwargs):
    """Launch `action_class` on the distant servers among `servers`.

    Nothing is done when there is no distant server.  FSRemoteError is
    raised when the action ends with the ACT_ERROR status.
    """
    # filter local server
    distant_servers = Server.distant_servers(servers)
    if len(distant_servers) == 0:
        return

    # perform action on distant servers
    action = action_class(nodes=distant_servers, fs=self, **kwargs)
    action.launch()
    self._run_actions()

    if action.status() != ACT_ERROR:
        return

    # -1 on timeout, else the max return code when it is non-zero
    err_code = None
    if task_self().num_timeout():
        err_code = -1
    elif task_self().max_retcode():
        err_code = task_self().max_retcode()

    # FSRemoteError is limited and cannot handle more than 1 error
    first_msg, first_keys = list(self.proxy_errors.walk())[0]
    nodes = NodeSet.fromlist(first_keys)
    msg = str(first_msg).replace('THIS_SHINE_HOST', str(nodes))
    raise FSRemoteError(nodes, err_code, msg)
def pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the return codes.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): expected return code. Defaults to 0.

    Returns:
        dict: a dictionary of return codes keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    # Run the command on each host in parallel
    results = run_pcmd(hosts, command, verbose, timeout, expect_rc)

    # Group the hosts of every result by exit status
    hosts_by_status = {}
    for result in results:
        status = result["exit_status"]
        if status not in hosts_by_status:
            hosts_by_status[status] = NodeSet()
        hosts_by_status[status].add(result["hosts"])
    return hosts_by_status
def run(self):
    """Run the dfuse command.

    Hosts on which the command returned 0 are added to
    self.running_hosts.

    Raises:
        CommandFailure: In case dfuse run command fails

    """
    self.log.info('Starting dfuse at %s', self.mount_dir.value)

    # A log file must be defined to ensure logs are captured
    if "D_LOG_FILE" not in self._pre_command:
        raise CommandFailure(
            "Dfuse missing environment varaibles for D_LOG_FILE")

    # create dfuse dir if does not exist
    self.create_mount_point()

    # run dfuse command
    ret_code = pcmd(self.hosts, self.__str__(), timeout=30)

    # Move the successful hosts out so that only failures remain
    if 0 in ret_code:
        self.running_hosts.add(ret_code[0])
        del ret_code[0]

    if len(ret_code):
        failed = [str(node_set)
                  for code, node_set in ret_code.items() if code != 0]
        error_hosts = NodeSet(",".join(failed))
        raise CommandFailure(
            "Error starting dfuse on the following hosts: {}".format(
                error_hosts))

    # Give dfuse a second chance before failing on the running check
    if not self.check_running(fail_on_error=False):
        self.log.info('Waiting five seconds for dfuse to start')
        time.sleep(5)
        self.check_running()
def testDisplayRegroup(self):
    """test CLI.Display (regroup)"""
    group_conf = makeTestFile("""
# A comment
[Main]
default: local
[local]
map: echo hostfoo
#all:
list: echo all
#reverse:
    """)
    set_std_group_resolver(GroupResolverConfig(group_conf.name))
    try:
        parser = OptionParser("dummy")
        parser.install_display_options(verbose_options=True)
        options, _ = parser.parse_args(["-r"])

        disp = Display(options, color=False)
        self.assertEqual(disp.regroup, True)
        # capture what the display writes
        disp.out = StringIO()
        disp.err = StringIO()
        self.assertEqual(disp.line_mode, False)

        ns = NodeSet("hostfoo")

        # nodeset.regroup() is performed by print_gather()
        disp.print_gather(ns, "message0\nmessage1\n")
        expected = \
            "---------------\n@all\n---------------\nmessage0\nmessage1\n\n"
        self.assertEqual(disp.out.getvalue(), expected)
    finally:
        # always restore the standard group resolver
        set_std_group_resolver(None)
def test_017_retcodes(self):
    """test clush (retcodes)"""
    host_err = ("clush: %s: exited with exit code 1\n" % HOSTNAME).encode()

    # single node
    self._clush_t(["-w", HOSTNAME, "/bin/false"], None, b"", 0, host_err)
    self._clush_t(["-w", HOSTNAME, "-b", "/bin/false"], None, b"", 0,
                  host_err)
    self._clush_t(["-S", "-w", HOSTNAME, "/bin/false"], None, b"", 1,
                  host_err)
    for code in (1, 2, 127, 128, 255):
        code_err = "clush: %s: exited with exit code %d\n" % (HOSTNAME, code)
        self._clush_t(["-S", "-w", HOSTNAME, "exit %d" % code], None, b"",
                      code, code_err.encode())
    self._clush_t(["-v", "-w", HOSTNAME, "/bin/false"], None, b"", 0,
                  host_err)

    # two nodes
    duo = str(NodeSet("%s,localhost" % HOSTNAME))
    counted_err = "clush: %s (%d): exited with exit code 1\n" % (duo, 2)
    self._clush_t(["-w", duo, "-b", "/bin/false"], None, b"", 0,
                  counted_err.encode())
    quiet_err = "clush: %s: exited with exit code 1\n" % duo
    self._clush_t(["-w", duo, "-b", "-q", "/bin/false"], None, b"", 0,
                  quiet_err.encode())
    counted_err = "clush: %s (%d): exited with exit code 1\n" % (duo, 2)
    self._clush_t(["-w", duo, "-S", "-b", "/bin/false"], None, b"", 1,
                  counted_err.encode())
    self._clush_t(["-w", duo, "-S", "-b", "-q", "/bin/false"], None, b"", 1)
def remote_copy(self, hostlist, remote_dir, local_dir):
    """Copy files from remote dir to local dir.

    On each host the content of remote_dir is copied with scp to
    local_dir on this host and then removed, provided remote_dir is not
    empty.

    Args:
        hostlist (list): list of remote nodes
        remote_dir (str): remote directory of files
        local_dir (str): local directory

    Raises:
        SoakTestError: if there is an error with the remote copy

    """
    this_host = socket.gethostname()
    # The command substitution must be double-quoted: inside single
    # quotes the shell keeps "$(ls -A ...)" as a literal, non-empty
    # string and the emptiness test is always true.
    result = pcmd(
        NodeSet.fromlist(hostlist),
        "if [ ! -z \"$(ls -A {0})\" ]; then "
        "scp -p -r {0}/ \"{1}:'{2}/'\" && rm -rf {0}/*; fi".format(
            remote_dir, this_host, local_dir),
        verbose=False)
    # result maps return codes to hosts: anything but a single 0 key
    # means the copy failed on at least one host
    if len(result) > 1 or 0 not in result:
        raise SoakTestError(
            "Error executing remote copy: {}".format(
                ", ".join(
                    [str(result[key]) for key in result if key != 0])))
def _copy_remote(self, source, dest, targets, gateway, timeout, reverse):
    """run a remote copy in tree mode (using gateway)

    The targets are recorded for the gateway, then a tar (reverse copy)
    or untar command is launched on them through the gateway channel.
    """
    self.logger.debug("_copy_remote gateway=%s source=%s dest=%s "
                      "reverse=%s", gateway, source, dest, reverse)

    self._target_count += len(targets)
    self.gwtargets.setdefault(str(gateway), NodeSet()).add(targets)

    # this weird replacement aims to escape single quotes ' within ''
    quote_escape = '\'\"\'\"\''

    # tar commands are built here and launched on targets
    if not reverse:
        cmd = self.UNTAR_CMD_FMT % dest.replace("'", quote_escape)
    else:
        srcdir = dirname(source).replace("'", quote_escape)
        srcbase = basename(normpath(self.source)).replace("'", quote_escape)
        cmd = self.TAR_CMD_FMT % (srcdir, srcbase)

    self.logger.debug('_copy_remote: tar cmd: %s', cmd)

    pchan = self.task._pchannel(gateway, self)
    pchan.shell(nodes=targets, command=cmd, worker=self, timeout=timeout,
                stderr=self.stderr, gw_invoke_cmd=self.invoke_gateway,
                remote=self.remote)
def print_source_groups(source, level, xset, opts):
    """
    Print groups from a source, a level of verbosity and an optional
    nodeset acting as a filter.

    Level 1 prints group names only, level 2 adds the nodes and any
    other level also adds node counts.
    """
    # list groups of some specified nodes?
    filtered = (opts.all or xset or opts.and_nodes or opts.sub_nodes or
                opts.xor_nodes)
    if filtered:
        # When some node sets are provided as argument, the list command
        # retrieves node groups these nodes belong to, thanks to the
        # groups() method.
        # Note: stdin support is enabled when '-' is found.
        groups = xset.groups(source, opts.groupbase)
        # sort by group name
        for group, (gnodes, inodes) in sorted(groups.items()):
            if level == 1:
                line = group
            elif level == 2:
                line = "%s %s" % (group, inodes)
            else:
                line = "%s %s %d/%d" % (group, inodes, len(inodes),
                                        len(gnodes))
            print(line)
        return

    # "raw" group list when no argument at all
    for group in grouplist(source):
        if source and not opts.groupbase:
            nsgroup = "@%s:%s" % (source, group)
        else:
            nsgroup = "@%s" % group
        if level == 1:
            print(nsgroup)
            continue
        nodes = NodeSet(nsgroup)
        if level == 2:
            print("%s %s" % (nsgroup, nodes))
        else:
            print("%s %s %d" % (nsgroup, nodes, len(nodes)))
def distant_event(self, evtype, node, **params):
    """Forward an event to the handler callback.

    For 'comp' events, the component found in params['info'] is first
    used to update the matching local component, which is then passed
    to the callback as the 'comp' parameter.
    """
    # Update the local component instance with the provided instance
    # if one is available in params.
    if evtype == 'comp':
        remote = params['info'].elem
        remote.fs = self
        try:
            if remote.TYPE == Journal.TYPE:
                # Special hack for Journal object as they are not put in
                # components list.
                remote.target.fs = self
                target = self.components[remote.target.uniqueid()]
                target.journal.update(remote)
                local = target.journal
            else:
                local = self.components[remote.uniqueid()]
                # comp.update() updates the component state
                # and disk information if the component is a target.
                # These information don't need to be updated unless
                # we are on a completion event.
                if params['status'] not in ('start', 'progress'):
                    # ensure remote.server is the actual distant server
                    servers = local.allservers()
                    remote.server = servers.select(NodeSet(node))[0]
                    # update target from remote one
                    local.update(remote)

            # substitute target parameter by local one
            params['comp'] = local
        except KeyError as error:
            print("ERROR: Component update failed (%s)" % str(error),
                  file=sys.stderr)

    self.hdlr.event_callback(evtype, node=node, **params)
def _parse_token(self, token):
    """Concrete implementation of parent abstract method.

    A token with hosts becomes a new child element of the current stack
    pointer; a token with both subgroup delimiters opens a subgroup,
    parses its non-string subtokens recursively and closes it.

    :Parameters:
        according to parent :py:meth:`cumin.backends.BaseQueryAggregator._parse_token`.

    """
    if not isinstance(token, pp.ParseResults):  # pragma: no cover - this should never happen
        raise InvalidQueryError(
            'Expecting ParseResults object, got {type}: {token}'.format(
                type=type(token), token=token))

    token_dict = token.asDict()
    self.logger.trace('Token is: %s | %s', token_dict, token)
    has_bool = 'bool' in token_dict

    if 'hosts' in token_dict:
        element = self._get_stack_element()
        element['hosts'] = NodeSet.fromlist(token_dict['hosts'],
                                            resolver=self.resolver)
        if has_bool:
            element['bool'] = token_dict['bool']
        self.stack_pointer['children'].append(element)
        return

    if 'open_subgroup' in token_dict and 'close_subgroup' in token_dict:
        self._open_subgroup()
        if has_bool:
            self.stack_pointer['bool'] = token_dict['bool']
        for subtoken in token:
            # Grammar literals, boolean operators and parentheses
            if not isinstance(subtoken, str):
                self._parse_token(subtoken)
        self._close_subgroup()
        return

    # pragma: no cover - this should never happen
    raise InvalidQueryError(
        'Got unexpected token: {token}'.format(token=token))
def _execute_command(self, command, fail_on_err=True, display_output=True,
                     hosts=None):
    """Execute the command on all client hosts.

    Optionally verify if the command returns a non zero return code.

    Args:
        command (str): the command to execute on the client hosts
        fail_on_err (bool, optional): whether or not to fail the test if
            command returns a non zero return code. Defaults to True.
        display_output (bool, optional): whether or not to display output.
            Defaults to True.
        hosts (list, optional): hosts on which to run the command.
            Defaults to None, which uses self.hostlist_clients.

    Raises:
        CommandFailure: if 'fail_on_err' is set and the command fails on at
            least one of the client hosts

    Returns:
        dict: a dictionary of return codes keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    if hosts is None:
        hosts = self.hostlist_clients
    result = pcmd(hosts, command, verbose=display_output, timeout=300)
    # Any key besides 0 means at least one host failed; testing only
    # "0 not in result" would miss failures when some hosts succeeded.
    if fail_on_err and (len(result) > 1 or 0 not in result):
        failed_hosts = [
            str(nodes) for code, nodes in result.items() if code != 0
        ]
        raise CommandFailure(
            "Error running '{}' on the following hosts: {}".format(
                command, NodeSet(",".join(failed_hosts))))
    return result
def get_partition_hosts(self, partition_key, host_list):
    """Get the hosts of the slurm partition named in the test params.

    The partition name is read from the "/run/hosts/*" params with
    partition_key and its hosts are taken from the "Nodes=" entry of the
    "scontrol show partition" output.

    Args:
        partition_key (str): params key holding the partition name
        host_list (list): hosts to return when no host could be obtained
            from the partition

    Returns:
        tuple: (hosts of the partition, partition name) when partition
            hosts were found, else (host_list, None)

    """
    partiton_name = self.params.get(partition_key, "/run/hosts/*")
    if partiton_name is None:
        return host_list, None

    cmd = "scontrol show partition {}".format(partiton_name)
    try:
        result = process.run(cmd, shell=True, timeout=10)
    except process.CmdError as error:
        self.log.warning("Unable to obtain hosts from the {} slurm "
                         "partition: {}".format(partiton_name, error))
        result = None

    hosts = []
    if result:
        output = result.stdout
        try:
            nodes_entry = re.findall(r"\s+Nodes=(.*)", output)[0]
            hosts = list(NodeSet(nodes_entry))
        except (NodeSetParseError, IndexError):
            self.log.warning(
                "Unable to obtain hosts from the {} slurm partition "
                "output: {}".format(partiton_name, output))

    if not hosts:
        return host_list, None
    return hosts, partiton_name
class PdshClient(ExecClient): """EngineClient which run 'pdsh'""" MODE = 'pdsh' def __init__(self, node, command, worker, stderr, timeout, autoclose=False, rank=None): ExecClient.__init__(self, node, command, worker, stderr, timeout, autoclose, rank) self._closed_nodes = NodeSet() def _build_cmd(self): """ Build the shell command line to start the commmand. Return an array of command and arguments. """ task = self.worker.task pdsh_env = {} # Build pdsh command executable = task.info("pdsh_path") or "pdsh" cmd_l = [executable, "-b"] fanout = task.info("fanout", 0) if fanout > 0: cmd_l.append("-f %d" % fanout) # Pdsh flag '-t' do not really works well. Better to use # PDSH_SSH_ARGS_APPEND variable to transmit ssh ConnectTimeout # flag. connect_timeout = task.info("connect_timeout", 0) if connect_timeout > 0: pdsh_env['PDSH_SSH_ARGS_APPEND'] = "-o ConnectTimeout=%d" % \ connect_timeout command_timeout = task.info("command_timeout", 0) if command_timeout > 0: cmd_l.append("-u %d" % command_timeout) cmd_l.append("-w %s" % self.key) cmd_l.append("%s" % self.command) return (cmd_l, pdsh_env) def _close(self, abort, timeout): """Close client. See EngineClient._close().""" if abort: # it's safer to call poll() first for long time completed processes prc = self.popen.poll() # if prc is None, process is still running if prc is None: try: # try to kill it self.popen.kill() except OSError: pass prc = self.popen.wait() if prc > 0: raise WorkerError("Cannot run pdsh (error %d)" % prc) self.streams.clear() if timeout: assert abort, "abort flag not set on timeout" for node in (self.key - self._closed_nodes): self.worker._on_node_timeout(node) else: for node in (self.key - self._closed_nodes): self.worker._on_node_rc(node, 0) self.worker._check_fini() def _parse_line(self, line, fname): """ Parse Pdsh line syntax. 
""" if line.startswith("pdsh@") or \ line.startswith("pdcp@") or \ line.startswith("sending "): try: # pdsh@cors113: cors115: ssh exited with exit code 1 # 0 1 2 3 4 5 6 7 # corsUNKN: ssh: corsUNKN: Name or service not known # 0 1 2 3 4 5 6 7 # pdsh@fortoy0: fortoy101: command timeout # 0 1 2 3 # sending SIGTERM to ssh fortoy112 pid 32014 # 0 1 2 3 4 5 6 # pdcp@cors113: corsUNKN: ssh exited with exit code 255 # 0 1 2 3 4 5 6 7 # pdcp@cors113: cors115: fatal: /var/cache/shine/... # 0 1 2 3... words = line.split() # Set return code for nodename of worker if self.MODE == 'pdsh': if len(words) == 4 and words[2] == "command" and \ words[3] == "timeout": pass elif len(words) == 8 and words[3] == "exited" and \ words[7].isdigit(): self._closed_nodes.add(words[1][:-1]) self.worker._on_node_rc(words[1][:-1], int(words[7])) elif self.MODE == 'pdcp': self._closed_nodes.add(words[1][:-1]) self.worker._on_node_rc(words[1][:-1], errno.ENOENT) except Exception, exc: print >> sys.stderr, exc raise EngineClientError() else:
class TopologyNodeGroup(object):
    """Node of the in-memory propagation tree.

    Holds a nodeset plus the parent/children links to other
    TopologyNodeGroup instances.
    """

    def __init__(self, nodeset=None):
        """Create a group with no parent and no children."""
        self.nodeset = nodeset
        # Parent TopologyNodeGroup, None for the root
        self.parent = None
        # Children TopologyNodeGroup instances, in insertion order
        self._children = []
        self._children_len = 0
        # Union of the children's nodesets (None while there is no child)
        self._children_ns = None

    def printable_subtree(self, prefix=''):
        """Return a printable rendering of the subtree rooted at this group.

        Recursion is acceptable here as the tree depth is expected to stay
        small.
        """
        label = str(self.nodeset)
        if self.parent is None:
            # root
            text = '%s\n' % label
        elif self.parent.parent is None:
            # first level
            if self._is_last():
                text = '`- %s\n' % label
            else:
                text = '|- %s\n' % label
        else:
            # deeper levels: extend the prefix depending on whether the
            # parent still has siblings below it
            if self.parent._is_last():
                prefix += ' '
            else:
                prefix += '| '
            if self._is_last():
                text = '%s`- %s\n' % (prefix, label)
            else:
                text = '%s|- %s\n' % (prefix, label)

        # render every child below this group
        for sub in self._children:
            text += sub.printable_subtree(prefix)
        return text

    def add_child(self, child):
        """Register child under this group and make this group its parent.

        Adding a child that is already registered is a no-op.
        """
        assert isinstance(child, TopologyNodeGroup)
        if child not in self._children:
            child.parent = self
            self._children.append(child)
            if self._children_ns is None:
                self._children_ns = NodeSet()
            self._children_ns.add(child.nodeset)

    def clear_child(self, child, strict=False):
        """Remove a child; an unknown child raises ValueError only if strict."""
        try:
            self._children.remove(child)
            self._children_ns.difference_update(child.nodeset)
            if not len(self._children_ns):
                self._children_ns = None
        except ValueError:
            if strict:
                raise

    def clear_children(self):
        """Drop every child."""
        self._children = []
        self._children_ns = None

    def children(self):
        """Return the list of children groups."""
        return self._children

    def children_ns(self):
        """Return the children as a single nodeset (None without children)."""
        return self._children_ns

    def children_len(self):
        """Return the size of the children's aggregated nodeset."""
        return 0 if self._children_ns is None else len(self._children_ns)

    def _is_last(self):
        """Tell whether this group is the last child of its parent.

        Used by printable_subtree() to pick the line prefix.
        """
        return self.parent._children[-1] == self

    def __str__(self):
        """Printable representation of the nodegroup."""
        return '<TopologyNodeGroup (%s)>' % str(self.nodeset)
def get_partition_hosts(partition, reservation=None):
    """Get a list of hosts in the specified slurm partition and reservation.

    When a reservation is given, the reservation hosts are returned only if
    they are a non-empty subset of the partition hosts.

    Args:
        partition (str): name of the partition
        reservation (str): name of reservation

    Returns:
        list: list of hosts in the specified partition

    """
    log = getLogger()
    if partition is None:
        return []

    # Query the partition information
    try:
        result = process.run(
            "scontrol show partition {}".format(partition), timeout=10)
    except process.CmdError as error:
        log.warning(
            "Unable to obtain hosts from the %s slurm "
            "partition: %s", partition, error)
        return []
    if not result:
        return []

    # Extract the hosts from the partition information
    output = result.stdout
    try:
        hosts = list(NodeSet(re.findall(r"\s+Nodes=(.*)", output)[0]))
    except (NodeSetParseError, IndexError):
        log.warning(
            "Unable to obtain hosts from the %s slurm partition "
            "output: %s", partition, output)
        return []

    if not hosts or reservation is None:
        return hosts

    # Query the reservation information
    try:
        result = process.run(
            "scontrol show reservation {}".format(reservation), timeout=10)
    except process.CmdError as error:
        log.warning(
            "Unable to obtain hosts from the %s slurm "
            "reservation: %s", reservation, error)
        return []
    if not result:
        return hosts

    # Extract the hosts from the reservation information
    output = result.stdout
    try:
        reservation_hosts = list(
            NodeSet(re.findall(r"\sNodes=(\S+)", output)[0]))
    except (NodeSetParseError, IndexError):
        log.warning(
            "Unable to obtain hosts from the %s slurm "
            "reservation output: %s", reservation, output)
        return []

    # Only accept reservation hosts that all belong to the partition
    if reservation_hosts and set(reservation_hosts).issubset(set(hosts)):
        return reservation_hosts
    return []
def pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the return codes.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): expected return code. Defaults to 0.
            A value of None disables the unexpected return code detection.

    Returns:
        dict: a dictionary of return codes keys and accompanying NodeSet
            values indicating which hosts yielded the return code. Hosts on
            which a timeout was detected are reported under return code 255.

    """
    # Run the command on each host in parallel
    task = run_task(hosts, command, timeout)

    # Group the hosts by return code and detect unexpected return codes
    retcode_dict = {}
    errors = False
    for retcode, rc_nodes in task.iter_retcodes():
        retcode_dict.setdefault(retcode, NodeSet()).add(
            NodeSet.fromlist(rc_nodes))
        if expect_rc is not None and expect_rc != retcode:
            errors = True

    # Report command output if requested or errors are detected
    if verbose or errors:
        print("Command:\n {}".format(command))
        print("Command return codes:")
        for retcode in sorted(retcode_dict):
            print(" {}: rc={}".format(retcode_dict[retcode], retcode))

        print("Command output:")
        for output, bf_nodes in task.iter_buffers():
            # Display the output per node set
            print(" {}:\n {}".format(
                NodeSet.fromlist(bf_nodes),
                "\n ".join(str(output).splitlines())))

    # Report any timeouts
    if timeout and task.num_timeout() > 0:
        # Build the NodeSet exactly once: iter_keys_timeout() may hand back a
        # one-shot iterator, in which case consuming it a second time would
        # leave the rc=255 entry empty.
        timeout_nodes = NodeSet.fromlist(task.iter_keys_timeout())
        print(
            "{}: timeout detected running '{}' on {}/{} hosts after {}s".format(
                timeout_nodes, command, task.num_timeout(), len(hosts),
                timeout))
        retcode_dict.setdefault(255, NodeSet()).add(timeout_nodes)

    return retcode_dict
def start(self, args):
    """Start a service on the requested nodes once its dependencies check out.

    As read by the code below, ``args[1]`` through ``args[len(args) - 2]``
    are node names and the last element of ``args`` is the service name;
    ``args[0]`` is not used here.

    When the file ``cfg/<service>`` exists it is handed to the dependency
    manager, and every (node, service) pair the manager reports is checked
    through ``self.status``.  A status of 0 for any pair clears the
    ``dependance`` flag.  The service is started on all requested nodes only
    if the flag is still set; otherwise 'dependance KO' is printed.
    """
    dependanceManager = dep.dep()
    nodes = NodeSet()
    depNode = NodeSet()
    # number of node arguments (everything but the first and last element)
    nbNoeud = len(args) - 2
    #print'nbNoeud: %d'%nbNoeud
    for i in range(1, nbNoeud + 1):
        nodes.add(args[i])
    # 1 while all dependencies are satisfied, 0 as soon as one is not
    dependance = 1
    if os.path.isfile("cfg/" + args[nbNoeud + 1]):
        # check the dependency
        # NOTE(review): presumably toInstall()/toStart() parse the cfg file
        # and populate the lists fetched below - confirm against dep.dep
        dependanceManager.toInstall("cfg/" + args[nbNoeud + 1])
        dependanceManager.toStart("cfg/" + args[nbNoeud + 1])
        # retrieve the dependencies
        startNode = dependanceManager.getNodeStarted()
        startServices = dependanceManager.getStarted()
        installNode = dependanceManager.getNodeIs_install()
        installService = dependanceManager.getIs_install()
        # for each node hosting a service that must be installed
        for node in installNode:
            depNode.add(node)
            for service in installService:
                task_self().run('sudo service ' + service + ' status', nodes=depNode)
                ret = self.status([node, service], 2)
                # start over with an empty target set for the next check
                depNode = NodeSet()
                if ret == 0:
                    print 'Service : ' + service + ' sur : ' + node + ' status : non-installe'
                    dependance = 0
                elif ret == 1:
                    print 'Service : ' + service + ' sur : ' + node + ' status : installe'
        # for each node hosting a service that must be started
        for node in startNode:
            depNode.add(node)
            for service in startServices:
                task_self().run('sudo service ' + service + ' status', nodes=depNode)
                ret = self.status([node, service], 1)
                # start over with an empty target set for the next check
                depNode = NodeSet()
                if ret == 0:
                    print 'Service : ' + service + ' sur : ' + node + ' status : non-demarre'
                    dependance = 0
                elif ret == 1:
                    print 'Service : ' + service + ' sur : ' + node + ' status : demarre'
    # print dependance
    if dependance == 1:
        print 'dependance OK'
        # echo the command per node, then run it once on the whole node set
        for i in range(1, nbNoeud + 1):
            print args[i] + ' : sudo service ' + args[nbNoeud + 1] + ' start'
        task_self().run('sudo service ' + args[nbNoeud + 1] + ' start', nodes=nodes)
    else:
        print 'dependance KO'
def stop(self):
    """Stop dfuse.

    Try to stop dfuse.  Try once nicely by using fusermount, then if that
    fails try to pkill it to see if that works.  Abort based on the result
    of the fusermount, as if pkill is necessary then dfuse itself has
    not worked correctly.

    Finally, try and remove the mount point, and that itself should work.

    Raises:
        CommandFailure: In case dfuse stop fails

    """
    # Include all hosts when stopping to ensure all mount points in any
    # state are properly removed
    self.running_hosts.add(NodeSet.fromlist(self.hosts))

    self.log.info(
        "Stopping dfuse at %s on %s",
        self.mount_dir.value, self.running_hosts)

    if self.mount_dir.value and self.running_hosts:
        error_list = []

        # Loop until all fuseblk mounted devices are unmounted
        counter = 0
        while self.running_hosts and counter < 3:
            # Attempt to kill dfuse on after first unmount fails
            if self.running_hosts and counter > 1:
                kill_command = "pkill dfuse --signal KILL"
                pcmd(self.running_hosts, kill_command, timeout=30)

            # Attempt to unmount any fuseblk mounted devices after detection
            if self.running_hosts and counter > 0:
                pcmd(
                    self.running_hosts,
                    self.get_umount_command(counter > 1),
                    expect_rc=None)
                time.sleep(2)

            # Detect which hosts have fuseblk mounted devices and remove any
            # hosts which no longer have the dfuse mount point mounted
            state = self.check_mount_state(self.running_hosts)
            for host in state["unmounted"].union(state["nodirectory"]):
                self.running_hosts.remove(host)

            # Increment the loop counter
            counter += 1

        if self.running_hosts:
            error_list.append(
                "Error stopping dfuse on {}".format(self.running_hosts))

        # Remove mount points
        try:
            self.remove_mount_point()
        except CommandFailure as error:
            # Store the message text: str.join() below only accepts strings
            # and would raise TypeError on an exception object
            error_list.append(str(error))

        # Report any errors
        if error_list:
            raise CommandFailure("\n".join(error_list))

    elif self.mount_dir.value is None:
        self.log.info("No dfuse mount directory defined - nothing to stop")

    else:
        self.log.info("No hosts running dfuse - nothing to stop")
def set_test_environment(args):
    """Set up the test environment.

    Updates PATH, CRT_CTX_SHARE_ADDR, OFI_INTERFACE, DAOS_TEST_LOG_DIR,
    D_LOG_FILE and PYTHONPATH in os.environ, and creates the log directory
    on the launch node, the test clients and the test servers.

    Args:
        args (argparse.Namespace): command line arguments for this program

    Returns:
        None

    """
    prefix = get_build_environment()["PREFIX"]
    # /usr/sbin is not setup on non-root user for CI nodes.
    # SCM formatting tool mkfs.ext4 is located under /usr/sbin directory.
    path_entries = [
        os.path.join(prefix, "bin"),
        os.path.join(prefix, "sbin"),
        os.path.sep + os.path.join("usr", "sbin"),
        os.environ.get("PATH"),
    ]

    # Get the default interface to use if OFI_INTERFACE is not set
    interface = os.environ.get("OFI_INTERFACE")
    if interface is None:
        # Find all the /sys/class/net interfaces on the launch node
        # (excluding lo)
        print("Detecting network devices - OFI_INTERFACE not set")
        by_speed = {}
        net_path = os.path.join(os.path.sep, "sys", "class", "net")
        devices = [name for name in os.listdir(net_path) if name != "lo"]
        for device in sorted(devices):
            # Only active (up) interfaces are candidates
            state_file = os.path.join(net_path, device, "operstate")
            with open(state_file, "r") as handle:
                state = handle.read().strip()
            if state.lower() != "up":
                continue

            # The interface speed is used to select the fastest available
            speed_file = os.path.join(net_path, device, "speed")
            with open(speed_file, "r") as handle:
                try:
                    speed = int(handle.read().strip())
                except IOError as ioerror:
                    # KVM/Qemu/libvirt returns an EINVAL
                    if ioerror.errno != errno.EINVAL:
                        raise
                    speed = 1000
            print(" - {0:<5} (speed: {1:>6} state: {2})".format(
                device, speed, state))

            # Keep only the first active interface for each speed - first is
            # determined by the alphabetic sort: ib0 is checked before ib1
            by_speed.setdefault(speed, device)

        print("Available interfaces: {}".format(by_speed))
        try:
            # The highest speed is the last entry of the sorted speeds
            fastest = sorted(by_speed)[-1]
            interface = by_speed[fastest]
        except IndexError:
            print("Error obtaining a default interface from: {}".format(
                os.listdir(net_path)))
            exit(1)
    print("Using {} as the default interface".format(interface))

    # Update env definitions
    os.environ["PATH"] = ":".join(path_entries)
    os.environ["CRT_CTX_SHARE_ADDR"] = "1"
    os.environ["OFI_INTERFACE"] = os.environ.get("OFI_INTERFACE", interface)

    # Set the default location for daos log files written during testing if
    # not already defined.
    if "DAOS_TEST_LOG_DIR" not in os.environ:
        os.environ["DAOS_TEST_LOG_DIR"] = DEFAULT_DAOS_TEST_LOG_DIR
    log_dir = os.environ["DAOS_TEST_LOG_DIR"]
    os.environ["D_LOG_FILE"] = os.path.join(log_dir, "daos.log")

    # Ensure the daos log files directory exists on each possible test node
    test_hosts = NodeSet(socket.gethostname().split(".")[0])
    test_hosts.update(args.test_clients)
    test_hosts.update(args.test_servers)
    spawn_commands(
        test_hosts, "mkdir -p {}".format(os.environ["DAOS_TEST_LOG_DIR"]))

    # Python paths required for functional testing
    if version_info.major > 2:
        minor_suffix = ""
    else:
        minor_suffix = ".{}".format(version_info.minor)
    python_version = "python{}{}".format(version_info.major, minor_suffix)
    required_python_paths = [
        os.path.abspath("util/apricot"),
        os.path.abspath("util"),
        os.path.join(prefix, "lib64", python_version, "site-packages"),
    ]

    # Check the PYTHONPATH env definition
    python_path = os.environ.get("PYTHONPATH")
    if not python_path:
        # Use the required paths to define the PYTHONPATH env if it is not set
        os.environ["PYTHONPATH"] = ":".join(required_python_paths)
        return

    # Append any missing required paths to the existing PYTHONPATH env
    defined_python_paths = [
        os.path.abspath(os.path.expanduser(entry))
        for entry in python_path.split(":")]
    missing_paths = [
        required for required in required_python_paths
        if required not in defined_python_paths]
    os.environ["PYTHONPATH"] = ":".join([python_path] + missing_paths)
def run_pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the results.

    Hosts are grouped so that every returned entry covers hosts sharing the
    same output, the same exit status and the same interrupted state.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): display output if the command return code
            does not match this value. Defaults to 0. A value of None will
            bypass this feature.

    Returns:
        list: a list of dictionaries, one per group of hosts, each with the
            keys "command", "hosts" (NodeSet), "exit_status", "interrupted"
            (bool) and "stdout" (list of output lines), e.g.:
                [
                    {
                        "command": "ls my_dir",
                        "hosts": NodeSet(wolf-[1-3]),
                        "exit_status": 0,
                        "interrupted": False,
                        "stdout": ["file1.txt", "file2.json"],
                    },
                    {
                        "command": "ls my_dir",
                        "hosts": NodeSet(wolf-[4]),
                        "exit_status": 1,
                        "interrupted": False,
                        "stdout": ["No such file or directory"],
                    },
                ]

    """
    log = getLogger()
    results = []

    # Run the command on each host in parallel
    task = run_task(hosts, command, timeout)

    # Map each host to its exit status (None until one is reported)
    status_by_host = dict.fromkeys(hosts)
    for status, status_hosts in task.iter_retcodes():
        for host in status_hosts:
            status_by_host[host] = status

    # Collect the hosts that timed out
    timed_out = []
    if timeout and task.num_timeout() > 0:
        timed_out = list(task.iter_keys_timeout())

    # Walk every group of common output; without any output, treat all the
    # hosts as one group with empty output
    buffers = list(task.iter_buffers()) or [["", hosts]]
    for output, output_hosts in buffers:
        # Split the hosts sharing this output by exit status
        hosts_by_status = {}
        for host in output_hosts:
            status = status_by_host[host]
            if status not in hosts_by_status:
                hosts_by_status[status] = NodeSet()
            hosts_by_status[status].add(host)

        for status, same_status_hosts in hosts_by_status.items():
            # Split further by interrupted state
            hosts_by_interrupt = {}
            for host in list(same_status_hosts):
                was_interrupted = host in timed_out
                if was_interrupted not in hosts_by_interrupt:
                    hosts_by_interrupt[was_interrupted] = NodeSet()
                hosts_by_interrupt[was_interrupted].add(host)

            # One result entry per (output, exit status, interrupted) group
            for was_interrupted, group in hosts_by_interrupt.items():
                results.append({
                    "command": command,
                    "hosts": group,
                    "exit_status": status,
                    "interrupted": was_interrupted,
                    "stdout": [
                        line.decode("utf-8").rstrip(os.linesep)
                        for line in output
                    ],
                })

    # Display results if requested or there is an unexpected exit status
    unexpected = any(
        expect_rc is not None and entry["exit_status"] != expect_rc
        for entry in results)
    if verbose or unexpected:
        log.info(colate_results(command, results))

    return results
def testBadTopologies(self):
    """test detecting invalid topologies

    Builds two TopologyGraph instances route by route and checks that
    add_route() raises TopologyError for the invalid routes while the valid
    ones are accepted, then verifies the resulting destinations and that the
    tree built from 'root' covers every node.
    """
    g = TopologyGraph()
    admin = NodeSet('admin')
    # Base nodesets used by the routes below
    ns0 = NodeSet('nodes[0-9]')
    ns1 = NodeSet('nodes[10-19]')
    ns2 = NodeSet('nodes[20-29]')
    g.add_route(admin, ns0)
    g.add_route(ns0, ns1)
    g.add_route(ns0, ns2)

    # add a superset of a known destination as source
    ns2_sup = NodeSet('somenode[0-10]')
    ns2_sup.add(ns2)
    self.assertRaises(TopologyError, g.add_route, ns2_sup, NodeSet('foo1'))

    # Add a known dst nodeset as a src nodeset
    ns3 = NodeSet('nodes[30-39]')
    g.add_route(ns1, ns3)

    # Add a subset of a known src nodeset as src
    ns0_sub = NodeSet(','.join(ns0[:3:]))
    ns4 = NodeSet('nodes[40-49]')
    g.add_route(ns0_sub, ns4)

    # Add a subset of a known dst nodeset as src
    ns1_sub = NodeSet(','.join(ns1[:3:]))
    self.assertRaises(TopologyError, g.add_route, ns4, ns1_sub)
    # Add a subset of a known src nodeset as dst
    self.assertRaises(TopologyError, g.add_route, ns4, ns0_sub)
    # Add a subset of a known dst nodeset as dst
    self.assertRaises(TopologyError, g.add_route, ns4, ns1_sub)
    # src <- subset of -> dst
    ns5 = NodeSet('nodes[50-59]')
    ns5_sub = NodeSet(','.join(ns5[:3:]))
    self.assertRaises(TopologyError, g.add_route, ns5, ns5_sub)
    self.assertRaises(TopologyError, g.add_route, ns5_sub, ns5)

    # Only the accepted routes must be visible through dest()
    self.assertEqual(g.dest(ns0), (ns1 | ns2))
    self.assertEqual(g.dest(ns1), ns3)
    self.assertEqual(g.dest(ns2), None)
    self.assertEqual(g.dest(ns3), None)
    self.assertEqual(g.dest(ns4), None)
    self.assertEqual(g.dest(ns5), None)
    self.assertEqual(g.dest(ns0_sub), (ns1 | ns2 | ns4))

    # Second graph: routes that point back to a node already used upstream
    g = TopologyGraph()
    root = NodeSet('root')
    ns01 = NodeSet('nodes[0-1]')
    ns23 = NodeSet('nodes[2-3]')
    ns45 = NodeSet('nodes[4-5]')
    ns67 = NodeSet('nodes[6-7]')
    ns89 = NodeSet('nodes[8-9]')

    g.add_route(root, ns01)
    g.add_route(root, ns23 | ns45)
    self.assertRaises(TopologyError, g.add_route, ns23, ns23)
    self.assertRaises(TopologyError, g.add_route, ns45, root)
    g.add_route(ns23, ns67)
    g.add_route(ns67, ns89)
    self.assertRaises(TopologyError, g.add_route, ns89, ns67)
    self.assertRaises(TopologyError, g.add_route, ns89, ns89)
    self.assertRaises(TopologyError, g.add_route, ns89, ns23)

    # Every node must show up in the tree built from 'root'
    ns_all = NodeSet('root,nodes[0-9]')
    for nodegroup in g.to_tree('root'):
        ns_all.difference_update(nodegroup.nodeset)
    self.assertEqual(len(ns_all), 0)
def testNodeSet(self):
    """test ServerGroup.nodeset() on a group of two servers"""
    servers = [
        Server('foo1', ['foo1@tcp']),
        Server('foo2', ['foo2@tcp']),
    ]
    group = ServerGroup(servers)
    expected = NodeSet('foo[1-2]')
    self.assertEqual(group.nodeset(), expected)
class Dfuse(DfuseCommand):
    """Class defining an object of type DfuseCommand."""

    def __init__(self, hosts, tmp):
        """Create a dfuse object.

        Args:
            hosts: hosts on which dfuse is run and mounted
            tmp: stored as self.tmp; not used by the methods of this class
        """
        super().__init__("/run/dfuse/*", "dfuse")

        # set params
        self.hosts = hosts
        self.tmp = tmp
        # Hosts on which dfuse was started and not yet confirmed stopped
        self.running_hosts = NodeSet()

    def __del__(self):
        """Destruct the object."""
        if self.running_hosts:
            self.log.error('Dfuse object deleted without shutting down')

    def check_mount_state(self, nodes=None):
        """Check the dfuse mount point mounted state on the hosts.

        Args:
            nodes (NodeSet, optional): hosts on which to check if dfuse is
                mounted. Defaults to None, which will use all of the hosts.

        Returns:
            dict: a dictionary of NodeSets of hosts with the dfuse mount point
                either "mounted", "unmounted" or without any mount point
                directory ("nodirectory")

        """
        state = {
            "mounted": NodeSet(),
            "unmounted": NodeSet(),
            "nodirectory": NodeSet()
        }
        if not nodes:
            nodes = NodeSet.fromlist(self.hosts)
        check_mounted = NodeSet()

        # Detect which hosts have mount point directories defined
        command = "test -d {0} -a ! -L {0}".format(self.mount_dir.value)
        retcodes = pcmd(nodes, command, expect_rc=None)
        for retcode, hosts in list(retcodes.items()):
            for host in hosts:
                if retcode == 0:
                    check_mounted.add(host)
                else:
                    # The directory test failed: fall back to looking for
                    # the dfuse mount in /proc/mounts on that host
                    command = "grep 'dfuse {}' /proc/mounts".format(
                        self.mount_dir.value)
                    mount_retcodes = pcmd([host], command, expect_rc=None)
                    for ret_code, host_names in list(mount_retcodes.items()):
                        for node in host_names:
                            if ret_code == 0:
                                check_mounted.add(node)
                            else:
                                state["nodirectory"].add(node)

        if check_mounted:
            # Detect which hosts with mount point directories have it mounted
            # as a fuseblk device.  With "grep -v", rc=1 means the only file
            # system type reported was fuseblk.
            command = "stat -c %T -f {0} | grep -v fuseblk".format(
                self.mount_dir.value)
            retcodes = pcmd(check_mounted, command, expect_rc=None)
            for retcode, hosts in list(retcodes.items()):
                for host in hosts:
                    if retcode == 1:
                        state["mounted"].add(host)
                    else:
                        state["unmounted"].add(host)

        return state

    def get_umount_command(self, force=False):
        """Get the command to umount the dfuse mount point.

        Args:
            force (bool, optional): whether to force the umount with a lazy
                unmount. Defaults to False.

        Returns:
            str: the dfuse umount command

        """
        umount = "-uz" if force else "-u"
        # NOTE(review): the single quotes around $(command -v fusermount)
        # look like they prevent the shell from expanding the substitution,
        # which would always select the fusermount3 branch - confirm before
        # changing the quoting.
        command = [
            "if [ -x '$(command -v fusermount)' ]",
            "then fusermount {0} {1}".format(umount, self.mount_dir.value),
            "else fusermount3 {0} {1}".format(umount, self.mount_dir.value),
            "fi"
        ]
        return ";".join(command)

    def create_mount_point(self):
        """Create dfuse directory.

        Raises:
            CommandFailure: In case of error creating directory

        """
        # Raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        # Create the mount point on any host without dfuse already mounted
        state = self.check_mount_state()
        if state["nodirectory"]:
            command = "mkdir -p {}".format(self.mount_dir.value)
            ret_code = pcmd(state["nodirectory"], command, timeout=30)
            if len(ret_code) > 1 or 0 not in ret_code:
                failed_nodes = [
                    str(node_set) for code, node_set in list(ret_code.items())
                    if code != 0
                ]
                error_hosts = NodeSet(",".join(failed_nodes))
                raise CommandFailure(
                    "Error creating the {} dfuse mount point on the "
                    "following hosts: {}".format(
                        self.mount_dir.value, error_hosts))

    def remove_mount_point(self, fail=True):
        """Remove dfuse directory.

        Try once with a simple rmdir which should succeed, if this does not
        then try again with rm -rf, but still raise an error.

        Args:
            fail (bool, optional): whether to raise an exception when the
                rmdir did not succeed on every host. Defaults to True.

        Raises:
            CommandFailure: In case of error deleting directory

        """
        # raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        dir_exists, clean_nodes = check_file_exists(
            self.hosts, self.mount_dir.value, directory=True)
        if dir_exists:

            target_nodes = list(self.hosts)
            if clean_nodes:
                # NOTE(review): list.remove() drops one element equal to
                # clean_nodes; if clean_nodes is a collection of hosts this
                # raises ValueError - verify against check_file_exists()
                target_nodes.remove(clean_nodes)

            self.log.info(
                "Removing the %s dfuse mount point on %s",
                self.mount_dir.value, target_nodes)

            cmd = "rmdir {}".format(self.mount_dir.value)
            ret_code = pcmd(target_nodes, cmd, timeout=30)

            # rmdir worked everywhere: nothing left to do
            if len(ret_code) == 1 and 0 in ret_code:
                return

            failed_nodes = NodeSet(",".join([
                str(node_set) for code, node_set in list(ret_code.items())
                if code != 0
            ]))

            # Retry with a forced removal on the hosts where rmdir failed
            cmd = "rm -rf {}".format(self.mount_dir.value)
            ret_code = pcmd(failed_nodes, cmd, timeout=30)
            if len(ret_code) > 1 or 0 not in ret_code:
                error_hosts = NodeSet(",".join([
                    str(node_set) for code, node_set in list(ret_code.items())
                    if code != 0
                ]))
                if fail:
                    raise CommandFailure(
                        "Error removing the {} dfuse mount point with rm on "
                        "the following hosts: {}".format(
                            self.mount_dir.value, error_hosts))
            # The initial rmdir failure is still reported even when the
            # forced removal succeeded
            if fail:
                raise CommandFailure(
                    "Error removing the {} dfuse mount point with rmdir on the "
                    "following hosts: {}".format(
                        self.mount_dir.value, failed_nodes))
        else:
            self.log.info(
                "No %s dfuse mount point directory found on %s",
                self.mount_dir.value, self.hosts)

    def run(self, check=True):
        # pylint: disable=arguments-differ
        """Run the dfuse command.

        Args:
            check (bool): Check if dfuse mounted properly after
                mount is executed.

        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # A log file must be defined to ensure logs are captured
        if "D_LOG_FILE" not in self.env:
            raise CommandFailure(
                "Dfuse missing environment variables for D_LOG_FILE")

        # create dfuse dir if does not exist
        self.create_mount_point()

        # run dfuse command
        cmd = "".join([self.env.get_export_str(), self.__str__()])
        ret_code = pcmd(self.hosts, cmd, timeout=30)

        # Hosts reporting rc=0 are running; whatever is left is a failure
        if 0 in ret_code:
            self.running_hosts.add(ret_code[0])
            del ret_code[0]

        if ret_code:
            error_hosts = NodeSet(",".join([
                str(node_set) for code, node_set in list(ret_code.items())
                if code != 0
            ]))
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    error_hosts))

        if check:
            # Dfuse will block in the command for the mount to complete, even
            # if run in background mode so it should be possible to start using
            # it immediately after the command returns.
            if not self.check_running(fail_on_error=False):
                self.log.info('Waiting two seconds for dfuse to start')
                time.sleep(2)
                if not self.check_running(fail_on_error=False):
                    self.log.info('Waiting five seconds for dfuse to start')
                    time.sleep(5)
                    self.check_running()

    def check_running(self, fail_on_error=True):
        """Check dfuse is running.

        Run a command to verify dfuse is running on hosts where it is supposed
        to be.  Use grep -v and rc=1 here so that if it isn't, then we can
        see what is being used instead.

        Args:
            fail_on_error (bool, optional): should an exception be raised if an
                error is detected. Defaults to True.

        Raises:
            CommandFailure: raised if dfuse is found not running on any expected
                nodes and fail_on_error is set.

        Returns:
            bool: whether or not dfuse is running

        """
        status = True
        state = self.check_mount_state(self.running_hosts)
        if state["unmounted"] or state["nodirectory"]:
            self.log.error(
                "Error: dfuse not running on %s",
                str(state["unmounted"].union(state["nodirectory"])))
            status = False
            if fail_on_error:
                raise CommandFailure("dfuse not running")
        return status

    def stop(self):
        """Stop dfuse.

        Try to stop dfuse.  Try once nicely by using fusermount, then if that
        fails try to pkill it to see if that works.  Abort based on the result
        of the fusermount, as if pkill is necessary then dfuse itself has
        not worked correctly.

        Finally, try and remove the mount point, and that itself should work.

        Raises:
            CommandFailure: In case dfuse stop fails

        """
        # Include all hosts when stopping to ensure all mount points in any
        # state are properly removed
        self.running_hosts.add(NodeSet.fromlist(self.hosts))

        self.log.info(
            "Stopping dfuse at %s on %s",
            self.mount_dir.value, self.running_hosts)

        if self.mount_dir.value and self.running_hosts:
            error_list = []

            # Loop until all fuseblk mounted devices are unmounted
            counter = 0
            while self.running_hosts and counter < 3:
                # Attempt to kill dfuse on after first unmount fails
                if self.running_hosts and counter > 1:
                    kill_command = "pkill dfuse --signal KILL"
                    pcmd(self.running_hosts, kill_command, timeout=30)

                # Attempt to unmount any fuseblk mounted devices after detection
                if self.running_hosts and counter > 0:
                    pcmd(
                        self.running_hosts,
                        self.get_umount_command(counter > 1),
                        expect_rc=None)
                    time.sleep(2)

                # Detect which hosts have fuseblk mounted devices and remove any
                # hosts which no longer have the dfuse mount point mounted
                state = self.check_mount_state(self.running_hosts)
                for host in state["unmounted"].union(state["nodirectory"]):
                    self.running_hosts.remove(host)

                # Increment the loop counter
                counter += 1

            if self.running_hosts:
                error_list.append(
                    "Error stopping dfuse on {}".format(self.running_hosts))

            # Remove mount points
            try:
                self.remove_mount_point()
            except CommandFailure as error:
                # Store the message text: str.join() below only accepts
                # strings and would raise TypeError on an exception object
                error_list.append(str(error))

            # Report any errors
            if error_list:
                raise CommandFailure("\n".join(error_list))

        elif self.mount_dir.value is None:
            self.log.info("No dfuse mount directory defined - nothing to stop")

        else:
            self.log.info("No hosts running dfuse - nothing to stop")
def _check_channel_ctl_shell(self, command, target, stderr, remote,
                             reply_msg_class, reply_pattern,
                             write_buf=None, timeout=-1, replycnt=1,
                             reply_rc=0):
    """helper to check channel shell action

    Opens the channel, sends a 'shell' control message for `command' on
    `target' (optionally followed by 'write' and 'eof' messages when
    `write_buf' is set), then checks the replies: `replycnt' is decreased by
    the number of nodes of each received `reply_msg_class' message, whose
    payload is matched against `reply_pattern' (a compiled regexp, or any
    value compared for equality).  When `timeout' is <= 0, a RetcodeMessage
    carrying `reply_rc' is expected before the channel is stopped.
    """
    # channel setup handshake: start -> StartMessage, cfg -> ACK
    self.channel_send_start()
    msg = self.recvxml(StartMessage)
    self.channel_send_cfg('n1')
    msg = self.recvxml(ACKMessage)

    # prepare a remote shell command request...
    workertree = TreeWorker(nodes=target, handler=None, timeout=timeout,
                            command=command)
    # code snippet from PropagationChannel.shell()
    ctl = ControlMessage(id(workertree))
    ctl.action = 'shell'
    ctl.target = NodeSet(target)

    # copy of the task info with debug forced off
    info = task_self()._info.copy()
    info['debug'] = False

    ctl_data = {
        'cmd': command,
        'invoke_gateway': workertree.invoke_gateway,
        'taskinfo': info,
        'stderr': stderr,
        'timeout': timeout,
        'remote': remote
    }
    ctl.data_encode(ctl_data)
    self.gateway.send(ctl.xml())

    # the shell request must be acknowledged
    self.recvxml(ACKMessage)

    if write_buf:
        ctl = ControlMessage(id(workertree))
        ctl.action = 'write'
        ctl.target = NodeSet(target)
        ctl_data = {
            'buf': write_buf,
        }
        # Send write message
        ctl.data_encode(ctl_data)
        self.gateway.send(ctl.xml())
        self.recvxml(ACKMessage)

        # Send EOF message
        ctl = ControlMessage(id(workertree))
        ctl.action = 'eof'
        ctl.target = NodeSet(target)
        self.gateway.send(ctl.xml())
        self.recvxml(ACKMessage)

    # consume replies until every expected node has answered
    while replycnt > 0:
        msg = self.recvxml(reply_msg_class)
        replycnt -= len(NodeSet(msg.nodes))
        self.assertTrue(msg.nodes in ctl.target)
        if msg.has_payload or reply_pattern:
            msg_data = msg.data_decode()
            try:
                if not reply_pattern.search(msg_data):
                    # pattern not found: fail with an explicit message
                    self.assertEqual(
                        msg.data, reply_pattern,
                        'Pattern "%s" not found in data="%s"'
                        % (reply_pattern.pattern, msg_data))
            except AttributeError:
                # not a regexp
                self.assertEqual(msg_data, reply_pattern)

    # without a positive timeout, a return code message is expected
    if timeout <= 0:
        msg = self.recvxml(RetcodeMessage)
        self.assertEqual(msg.retcode, reply_rc)

    # orderly channel shutdown
    self.channel_send_stop()
    self.gateway.wait()
    self.gateway.close()
class PropagationTreeRouter(object):
    """Resolve routes within a propagation tree.

    This object provides a next_hop method, that will look for the best
    directly connected node to use to forward a message to a remote node.

    Upon instantiation, the router will parse the topology tree to
    generate its routing table.
    """

    def __init__(self, root, topology, fanout=0):
        """Build the routing table for `root` from `topology`.

        `fanout` is stored but not currently enforced (see the FIXME in
        _best_next_hop).
        """
        self.root = root
        self.topology = topology
        self.fanout = fanout
        # gateway -> number of targets currently routed through it
        self.nodes_fanin = {}
        self.table = None

        self.table_generate(root, topology)
        self._unreachable_hosts = NodeSet()

    def table_generate(self, root, topology):
        """Generate the routing table the router relies on.

        The keys are the destination nodes and the values are the next hop
        gateways to use to reach these nodes.

        Raises RouteResolvingError if `root` belongs to no topology group.
        """
        self.table = {}
        root_group = None
        for entry in topology.groups:
            if root in entry.nodeset:
                root_group = entry
                break

        if root_group is None:
            raise RouteResolvingError('Invalid admin node: %s' % root)

        for group in root_group.children():
            self.table[group.nodeset] = NodeSet()
            # depth-first walk collecting every node below this group
            stack = [group]
            while stack:
                curr = stack.pop()
                self.table[group.nodeset].add(curr.children_ns())
                stack += curr.children()

        # reverse table (it was crafted backward)
        # NOTE: items() instead of iteritems() so this also runs on Python 3
        self.table = dict((v, k) for k, v in self.table.items())

    def dispatch(self, dst):
        """Dispatch nodes from a target nodeset to the directly connected
        gateways.

        The method acts as an iterator, returning a gateway and the
        associated hosts. It should provide a rather good load balancing
        between the gateways.

        Note that `dst` is modified in place: remote targets are removed
        from it as they are matched against the routing table.
        """
        # Check for directly connected targets
        nexthop = NodeSet()
        for gateways in self.table.values():
            nexthop.add(gateways & dst)
        if len(nexthop) > 0:
            yield nexthop, nexthop

        # Check for remote targets, that require a gateway to be reached
        for network in self.table:
            dst_inter = network & dst
            dst.difference_update(dst_inter)
            for host in dst_inter.nsiter():
                yield self.next_hop(host), host

    def next_hop(self, dst):
        """Perform the next hop resolution.

        If several hops are available, then, the one with the least number
        of current jobs will be returned.

        Raises RouteResolvingError when `dst` is marked unreachable, equals
        the root node, or cannot be reached through any known gateway.
        """
        if dst in self._unreachable_hosts:
            raise RouteResolvingError(
                'Invalid destination: %s, host is unreachable' % dst)

        # can't resolve if source == destination
        if self.root == dst:
            raise RouteResolvingError(
                'Invalid resolution request: %s -> %s' % (self.root, dst))

        ## ------------------
        # the routing table is organized this way:
        #
        #  NETWORK    | NEXT HOP
        # ------------+-----------
        # node[0-9]   | gateway0
        # node[10-19] | gateway[1-2]
        #            ...
        # ---------
        for network, nexthops in self.table.items():
            # destination contained in current network
            if dst in network:
                res = self._best_next_hop(nexthops)
                if res is None:
                    raise RouteResolvingError(
                        'No route available to %s' % str(dst))
                # _best_next_hop() has registered `res` in nodes_fanin
                self.nodes_fanin[res] += len(dst)
                return res
            # destination contained in current next hops (ie. directly
            # connected)
            if dst in nexthops:
                return dst

        raise RouteResolvingError(
            'No route from %s to host %s' % (self.root, dst))

    def mark_unreachable(self, dst):
        """Mark node dst as unreachable and don't advertise routes through
        it anymore.

        The cache will be updated only when necessary to avoid performing
        expensive traversals.
        """
        # Simply mark dst as unreachable in a dedicated NodeSet. This
        # list will be consulted by the resolution method
        self._unreachable_hosts.add(dst)

    def _best_next_hop(self, candidates):
        """Find out a good next hop gateway.

        Return the reachable candidate with the fewest tracked connections,
        or None when every candidate is unreachable.
        """
        backup = None
        backup_connections = float('inf')

        candidates = candidates.difference(self._unreachable_hosts)

        for host in candidates:
            # the router tracks established connections in the
            # nodes_fanin table to avoid overloading a gateway
            connections = self.nodes_fanin.setdefault(host, 0)
            # FIXME: self.fanout is not honored yet; the intent was to
            # return the first host with connections < self.fanout.
            if backup_connections > connections:
                backup = host
                backup_connections = connections
        return backup
def load(self):
    """Load Cluster, Nodes and partitions from Architecture files.

    Sets the attributes `cluster`, `nodes` (one Node per node name of every
    node group) and `partitions` (string form of each partition's nodeset
    mapped to its list of job partitions).

    Raises HPCStatsRuntimeError or HPCStatsSourceError if an error is
    encountered while loading data from sources; in particular
    HPCStatsSourceError is raised here when a frequency or memory value
    cannot be converted.
    """
    self.cluster = Cluster(self.cluster_name)
    self.nodes = []
    self.partitions = {}

    self.read_arch()
    config_get = self.config_get
    cluster_name = self.cluster.name

    for partition in config_get(cluster_name, "partitions").split(','):
        part_sect = "/".join((cluster_name, partition))
        nodegroups = config_get(part_sect, "nodegroups").split(',')
        job_partitions = config_get(part_sect, "job_partitions").split(',')

        # accumulates every node of the partition, used as the key of the
        # partitions attribute
        nodeset_part = NodeSet()

        for nodegroup in nodegroups:
            nodegroup_sect = "/".join((cluster_name, partition, nodegroup))
            nodenames = config_get(nodegroup_sect, "names")
            nodeset_part.add(nodenames)

            # context reused by both format-error messages below
            where = (cluster_name, partition, nodegroup, nodenames)

            sockets = config_get(nodegroup_sect, "sockets", isint=True)
            cores_per_socket = config_get(nodegroup_sect,
                                          "corespersocket",
                                          isint=True)
            cpu = sockets * cores_per_socket

            float_instructions = config_get(nodegroup_sect,
                                            "floatinstructions",
                                            isint=True)

            freq_str = config_get(nodegroup_sect, "frequency")
            freq = ArchitectureImporterArchfile.convert_freq(freq_str)
            if freq is None:
                raise HPCStatsSourceError(
                    "format of frequency for nodeset %s/%s/%s (%s) "
                    "'%s' is not valid"
                    % (where + (freq_str,)))

            flops = cpu * float_instructions * freq

            mem_str = config_get(nodegroup_sect, "memory")
            mem = ArchitectureImporterArchfile.convert_mem(mem_str)
            if mem is None:
                raise HPCStatsSourceError(
                    "format of memory for nodeset %s/%s/%s (%s) "
                    "'%s' is not valid"
                    % (where + (mem_str,)))

            model = config_get(nodegroup_sect, "model")

            # one Node object per expanded node name of the group
            for nodename in NodeSet(nodenames):
                self.nodes.append(Node(name=nodename,
                                       cluster=self.cluster,
                                       model=model,
                                       partition=partition,
                                       cpu=cpu,
                                       memory=mem,
                                       flops=flops))

        self.partitions[str(nodeset_part)] = job_partitions
def test_bashcmd(self):
    """Jira ID: DAOS-3508.

    Test Description:
        Purpose of this test is to mount different mount points of dfuse
        for different container and pool sizes and perform basic bash
        commands.

    Use cases:
        Following list of bash commands have been incorporated
        as part of this test: mkdir, touch, ls, chmod, rm, dd, stat,
        cp, cmp, mv, rmdir.
          Create a directory.
          Create a file under that directory.
          List the created file.
          Remove the file.
          Write a file to the dfuse mounted location using dd.
          List the written file to verify if it's create.
          Verify the file created is of right size as desired.
          Copy the file
          Compare the copied file with original to verify the
          content is same.
          Remove copied file.
          Rename file
          Verify renamed file exist using list.
          Remove a directory

    :avocado: tags=all,hw,daosio,medium,ib2,full_regression,bashcmd
    """
    dir_name = self.params.get("dirname", '/run/bashcmd/*')
    file_name1 = self.params.get("filename1", '/run/bashcmd/*')
    file_name2 = self.params.get("filename2", '/run/bashcmd/*')
    dd_count = self.params.get("dd_count", '/run/bashcmd/*')
    dd_blocksize = self.params.get("dd_blocksize", '/run/bashcmd/*')
    pool_count = self.params.get("pool_count", '/run/pool/*')
    cont_count = self.params.get("cont_count", '/run/container/*')

    # Create a pool if one does not already exist.
    for _ in range(pool_count):
        self.add_pool(connect=False)
        # perform test for multiple containers.
        for count in range(cont_count):
            self.add_container(self.pool)
            mount_dir = "/tmp/{}_daos_dfuse{}".format(
                self.pool.uuid, count)
            self.start_dfuse(
                self.hostlist_clients, self.pool, self.container,
                mount_dir)
            abs_dir_path = os.path.join(
                self.dfuse.mount_dir.value, dir_name)
            abs_file_path1 = os.path.join(abs_dir_path, file_name1)
            abs_file_path2 = os.path.join(abs_dir_path, file_name2)
            # list of commands to be executed.
            commands = [
                "mkdir -p {}".format(abs_dir_path),
                "touch {}".format(abs_file_path1),
                "ls -a {}".format(abs_file_path1),
                "rm {}".format(abs_file_path1),
                "dd if=/dev/zero of={} count={} bs={}".format(
                    abs_file_path1, dd_count, dd_blocksize),
                "ls -al {}".format(abs_file_path1),
                "filesize=$(stat -c%s '{}');\
                if (( filesize != {}*{} )); then exit 1;\
                fi".format(abs_file_path1, dd_count, dd_blocksize),
                "cp -r {} {}".format(abs_file_path1, abs_file_path2),
                "cmp --silent {} {}".format(
                    abs_file_path1, abs_file_path2),
                "rm {}".format(abs_file_path2),
                "mv {} {}".format(abs_file_path1, abs_file_path2),
                "ls -al {}".format(abs_file_path2),
                "rm {}".format(abs_file_path2),
                "rmdir {}".format(abs_dir_path)
            ]
            for cmd in commands:
                try:
                    # execute bash cmds
                    ret_code = general_utils.pcmd(
                        self.hostlist_clients, cmd, timeout=30)
                    # ret_code maps each return code to the hosts that
                    # produced it: more than one key means at least one
                    # host did not return 0, so a partial failure must be
                    # reported too, not only the all-hosts-failed case.
                    if len(ret_code) > 1 or 0 not in ret_code:
                        error_hosts = NodeSet(
                            ",".join(
                                [str(node_set)
                                 for code, node_set in ret_code.items()
                                 if code != 0]))
                        raise CommandFailure(
                            "Error running '{}' on the following "
                            "hosts: {}".format(cmd, error_hosts))
                # report error if any command fails
                except CommandFailure as error:
                    self.log.error("BashCmd Test Failed: %s", str(error))
                    self.fail("Test was expected to pass but "
                              "it failed.\n")

            # stop dfuse
            self.stop_dfuse()
            # destroy container
            self.container.destroy()
        # destroy pool
        self.pool.destroy()