Code Example #1
File: Clush.py  Project: cea-hpc/clustershell
    def ev_close(self, worker, timedout):
        # Worker is closing -- it's time to gather results...
        self._runtimer_finalize(worker)
        # Display command output, try to order buffers by rc
        nodesetify = lambda v: (v[0], NodeSet._fromlist1(v[1]))
        cleaned = False
        for _rc, nodelist in sorted(worker.iter_retcodes()):
            ns_remain = NodeSet._fromlist1(nodelist)
            # Then order by node/nodeset (see nodeset_cmpkey)
            for buf, nodeset in sorted(map(nodesetify,
                                           worker.iter_buffers(nodelist)),
                                       key=bufnodeset_cmpkey):
                if not cleaned:
                    # clean runtimer line before printing first result
                    self._runtimer_clean()
                    cleaned = True
                self._display.print_gather(nodeset, buf)
                ns_remain.difference_update(nodeset)
            if ns_remain:
                self._display.print_gather_finalize(ns_remain)
        self._display.flush()

        self._close_common(worker)

        # Notify main thread to update its prompt
        self.update_prompt(worker)
Code Example #2
File: Clush.py  Project: cea-hpc/clustershell
    def update(self):
        """Update runtime progress info"""
        wrbwinfo = ''
        if self.bytes_written > 0:
            bandwidth = self.bytes_written/(time.time() - self.start_time)
            wrbwinfo = " write: %s/s" % human_bi_bytes_unit(bandwidth)

        gwcnt = len(self.task.gateways)
        if gwcnt:
            # tree mode
            act_targets = NodeSet()
            for gw, (chan, metaworkers) in self.task.gateways.items():
                act_targets.updaten(mw.gwtargets[gw] for mw in metaworkers)
            cnt = len(act_targets) + len(self.task._engine.clients()) - gwcnt
            gwinfo = ' gw %d' % gwcnt
        else:
            cnt = len(self.task._engine.clients())
            gwinfo = ''
        if self.bytes_written > 0 or cnt != self.cnt_last:
            self.cnt_last = cnt
            # display completed/total clients
            towrite = 'clush: %*d/%*d%s%s\r' % (self.tslen, self.total - cnt,
                                                self.tslen, self.total, gwinfo,
                                                wrbwinfo)
            self.wholelen = len(towrite)
            sys.stderr.write(towrite)
            self.started = True
Code Example #3
File: Clubak.py  Project: cea-hpc/clustershell
def display(tree, disp, gather, trace_mode, enable_nodeset_key):
    """nicely display MsgTree instance `tree' content according to
    `disp' Display object and `gather' boolean flag"""
    out = sys_stdout()
    try:
        if trace_mode:
            display_tree(tree, disp, out)
        else:
            if gather:
                if enable_nodeset_key:
                    # lambda to create a NodeSet from keys returned by walk()
                    ns_getter = lambda x: NodeSet.fromlist(x[1])
                    for nodeset in sorted((ns_getter(item) for item in tree.walk()),
                                          key=nodeset_cmpkey):
                        disp.print_gather(nodeset, tree[nodeset[0]])
                else:
                    for msg, key in tree.walk():
                        disp.print_gather_keys(key, msg)
            else:
                if enable_nodeset_key:
                    # nodes are automagically sorted by NodeSet
                    for node in NodeSet.fromlist(tree.keys()).nsiter():
                        disp.print_gather(node, tree[str(node)])
                else:
                    for key in tree.keys():
                        disp.print_gather_keys([ key ], tree[key])
    finally:
        out.flush()
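
As a side note on the nsiter() call above: if I recall the API correctly, iterating a NodeSet directly yields plain node name strings, while nsiter() yields one-node NodeSet objects, which is what print_gather() expects. A minimal sketch (node names are illustrative):

from ClusterShell.NodeSet import NodeSet

ns = NodeSet("node[1-3]")
print(list(ns))                           # ['node1', 'node2', 'node3'] -- plain strings
for single in ns.nsiter():                # one-node NodeSet objects, same order
    print(type(single).__name__, single)  # NodeSet node1 ... NodeSet node3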
Code Example #4
File: slurm_interface.py  Project: edf-hpc/unclebench
     def get_available_nodes(self,slices_size=1):
          """ Returns a list of currently available nodes by slice of slices_size
          ex: for slices of size 4 ['cn[100-103]','cn[109,150-152]']
          :param slices_size: slices size
          :param type: int
          :returns: list of nodes_id
          :rtype: str """

          node_list=[]
          a = pyslurm.node()
          node_dict = a.get()
          node_count=0
          nodeset = NodeSet()
          if len(node_dict) > 0:
               for key, value in sorted(node_dict.iteritems()):
                    if value['state']=='IDLE':
                         nodetype=value
                         nodeset.update(key)
                         node_count+=1
                    if node_count==slices_size:
                         node_list.append(str(nodeset))
                         nodeset=NodeSet()
                         slice_str=None
                         node_count=0


          return node_list
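
Without pyslurm available, the slicing logic above can be illustrated with a plain NodeSet; the node names below are taken from the docstring example and the IDLE filtering is omitted:

from ClusterShell.NodeSet import NodeSet

idle = NodeSet("cn[100-103,109,150-152]")   # pretend these are the IDLE nodes
slices, current = [], NodeSet()
for node in idle:                           # iterates node names in sorted order
    current.add(node)
    if len(current) == 4:                   # slices_size == 4
        slices.append(str(current))
        current = NodeSet()
print(slices)                               # ['cn[100-103]', 'cn[109,150-152]']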
Code Example #5
    def testConfigurationLongSyntax(self):
        """test detailed topology description syntax"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(b'# this is a comment\n')
        tmpfile.write(b'[routes]\n')
        tmpfile.write(b'admin: proxy\n')
        tmpfile.write(b'proxy: STA[0-1]\n')
        tmpfile.write(b'STA0: STB[0-1]\n')
        tmpfile.write(b'STB0: nodes[0-2]\n')
        tmpfile.write(b'STB1: nodes[3-5]\n')
        tmpfile.write(b'STA1: STB[2-3]\n')
        tmpfile.write(b'STB2: nodes[6-7]\n')
        tmpfile.write(b'STB3: nodes[8-10]\n')

        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet('admin,proxy,STA[0-1],STB[0-3],nodes[0-10]')
        ns_tree = NodeSet()
        tree = parser.tree('admin')
        self.assertEqual(tree.inner_node_count(), 8)
        self.assertEqual(tree.leaf_node_count(), 11)
        for nodegroup in tree:
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
Code Example #6
    def testMultipleAdminGroups(self):
        """test topology with several admin groups"""
        ## -------------------
        # TODO : uncommenting following lines should not produce an error. This
        # is a valid topology!!
        # ----------
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write(b'[routes]\n')
        tmpfile.write(b'admin0: nodes[0-1]\n')
        #tmpfile.write(b'admin1: nodes[0-1]\n')
        tmpfile.write(b'admin2: nodes[2-3]\n')
        #tmpfile.write(b'admin3: nodes[2-3]\n')
        tmpfile.write(b'nodes[0-1]: nodes[10-19]\n')
        tmpfile.write(b'nodes[2-3]: nodes[20-29]\n')
        tmpfile.flush()
        parser = TopologyParser(tmpfile.name)

        ns_all = NodeSet('admin2,nodes[2-3,20-29]')
        ns_tree = NodeSet()
        tree = parser.tree('admin2')
        self.assertEqual(tree.inner_node_count(), 3)
        self.assertEqual(tree.leaf_node_count(), 10)
        for nodegroup in tree:
            ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
Code Example #7
File: Install.py  Project: bullxpfs/lustre-shine
    def ev_close(self, worker):
        """
        Check process termination status and generate appropriate events.
        """
        Action.ev_close(self, worker)

        # Action timed out
        if worker.did_timeout():
            nodes = NodeSet.fromlist(worker.iter_keys_timeout())
            self.fs._handle_shine_proxy_error(nodes, "Nodes timed out")
            self.set_status(ACT_ERROR)

        # Action succeeded
        elif max(rc for rc, _ in worker.iter_retcodes()) == 0:
            self.set_status(ACT_OK)

        # Action failed
        else:
            for rc, nodes in worker.iter_retcodes():
                if rc == 0:
                    continue

                # Avoid warnings, flag this component in error state
                for comp in self._comps or []:
                    comp.sanitize_state(nodes=worker.nodes)

                for output, nodes in worker.iter_buffers(match_keys=nodes):
                    nodes = NodeSet.fromlist(nodes)
                    msg = "Copy failed: %s" % output
                    self.fs._handle_shine_proxy_error(nodes, msg)
            self.set_status(ACT_ERROR)
Code Example #8
File: NodeSetErrorTest.py  Project: wt/clustershell
 def test_internal_mismatch(self):
     nodeset = NodeSet("cluster[1-30]c[1-2]")
     self.assertTrue("cluster%sc%s" in nodeset._patterns)
     nodeset._patterns["cluster%sc%s"] = RangeSetND([[1]])
     self.assertRaises(NodeSetParseError, str, nodeset)
     nodeset._patterns["cluster%sc%s"] = RangeSetND([[1, 1]])
     self.assertEqual(str(nodeset), "cluster1c1")
     nodeset._patterns["cluster%sc%s"] = RangeSetND([[1, 1, 1]])
     self.assertRaises(NodeSetParseError, str, nodeset)
Code Example #9
File: Clush.py  Project: cea-hpc/clustershell
class LiveGatherOutputHandler(GatherOutputHandler):
    """Live line-gathered output event handler class (-bL)."""

    def __init__(self, display, nodes):
        assert nodes is not None, "cannot gather local command"
        GatherOutputHandler.__init__(self, display)
        self._nodes = NodeSet(nodes)
        self._nodecnt = dict.fromkeys(self._nodes, 0)
        self._mtreeq = []
        self._offload = 0

    def ev_read(self, worker, node, sname, msg):
        if sname != worker.SNAME_STDOUT:
            GatherOutputHandler.ev_read(self, worker, node, sname, msg)
            return
        # Read new line from node
        self._nodecnt[node] += 1
        cnt = self._nodecnt[node]
        if len(self._mtreeq) < cnt:
            self._mtreeq.append(MsgTree())
        self._mtreeq[cnt - self._offload - 1].add(node, msg)
        self._live_line(worker)

    def ev_hup(self, worker, node, rc):
        if self._mtreeq and node not in self._mtreeq[0]:
            # forget a node that doesn't answer to continue live line
            # gathering anyway
            self._nodes.remove(node)
            self._live_line(worker)

    def _live_line(self, worker):
        # if all nodes have replied, display gathered line
        while self._mtreeq and len(self._mtreeq[0]) == len(self._nodes):
            mtree = self._mtreeq.pop(0)
            self._offload += 1
            self._runtimer_clean()
            nodesetify = lambda v: (v[0], NodeSet.fromlist(v[1]))
            for buf, nodeset in sorted(map(nodesetify, mtree.walk()),
                                       key=bufnodeset_cmpkey):
                self._display.print_gather(nodeset, buf)
            self._runtimer_set_dirty()

    def ev_close(self, worker, timedout):
        # Worker is closing -- it's time to gather results...
        self._runtimer_finalize(worker)

        for mtree in self._mtreeq:
            nodesetify = lambda v: (v[0], NodeSet.fromlist(v[1]))
            for buf, nodeset in sorted(map(nodesetify, mtree.walk()),
                                       key=bufnodeset_cmpkey):
                self._display.print_gather(nodeset, buf)

        self._close_common(worker)

        # Notify main thread to update its prompt
        self.update_prompt(worker)
Code Example #10
File: Action.py  Project: cea-hpc/milkcheck
 def nodes_error(self):
     """Get nodeset of error nodes for this action."""
     error_nodes = NodeSet()
     if self.worker:
         if isinstance(self.worker, WorkerPopen):
             retcode = self.worker.retcode()
             # We don't count timeout (retcode=None)
             if retcode not in (None, 0):
                 error_nodes = NodeSet("localhost")
         else:
             for retcode, nds in self.worker.iter_retcodes():
                 if retcode != 0:
                     error_nodes.add(nds)
     return error_nodes
Code Example #11
File: cli.py  Project: AdrienDebrie/sequencer
def _report_unexec(a_model, execution):
    """
    Display the 'unexec' type of report
    """
    all_actions_set = set(a_model.actions.keys())
    all_actions_set_nb = len(all_actions_set)
    executed_actions_set = set(execution.executed_actions.keys())
    unexecuted_actions_set = all_actions_set.difference(executed_actions_set)
    unexecuted_actions_nb = len(unexecuted_actions_set)
    try:
        percentage = (float(unexecuted_actions_nb) / all_actions_set_nb) * 100
    except ZeroDivisionError:
        percentage = 0.0
    _LOGGER.output("\nUnexecuted Actions: %d (%2.1f %%)\t" + \
                       "Legend: mDeps=missings (error or unexecuted)" + \
                       " dependencies",
                   unexecuted_actions_nb, percentage)
    tab_values = []
    # Sort by len() first then alphabetically so:
    # b1, b2, b20, c1, c2, c10, c100 appears in that order
    sorted_list = sorted(unexecuted_actions_set, key = len)
    for id_ in sorted(sorted_list):
        action = a_model.actions[id_]
        all_deps = action.all_deps()
        all_deps_nb = len(all_deps)
        unexec = set(all_deps) - set(execution.executed_actions.keys())
        error = set(all_deps) & set(execution.error_actions.keys())
        missings = unexec.union(error)
        nodeset = NodeSet()
        missing_nb = len(missings)
        for missing in missings:
            if len(missing) != 0:
                nodeset.add(missing)
        try:
            percentage = ((float(missing_nb) / all_deps_nb) * 100)
        except ZeroDivisionError:
            percentage = 0.0
        tab_values.append([id_, str(len(all_deps)),
                           str(missing_nb),
                           u"%2.1f" % percentage,
                           str(nodeset)])
    output = smart_display([u"Id", u"#Deps",
                            u"#mDeps", u"%mDeps",
                            u"mDeps"],
                           tab_values, vsep=u" | ",
                           justify=[str.center, str.center,
                                    str.center, str.center,
                                    str.ljust])
    _LOGGER.output(output)
Code Example #13
File: Cli.py  Project: pombredanne/milkcheck
    def __gen_action_output(self, iterbuf, iterrc, timeouts, error_only):
        '''Display command result from output and retcodes.'''

        # Build the list of non-zero rc nodes
        retcodes = list(iterrc)
        ok_nodes = NodeSet.fromlist((nds for rc, nds in retcodes if rc == 0))

        output = []
        for out, nodes in iterbuf:
            if error_only:
                nodes = NodeSet(nodes) - ok_nodes
            if nodes and out:
                for lbuf in out.splitlines():
                    output.append(' > %s: %s' %
                                  (self.string_color(nodes, 'CYAN'), lbuf))

        for retcode, nodes in retcodes:
            if retcode == 0 and not error_only:
                output.append(' > %s exited with %s' %
                              (self.string_color(nodes, 'CYAN'),
                               self.string_color(retcode, 'GREEN')))
            elif retcode != 0:
                output.append(' > %s exited with %s' %
                              (self.string_color(nodes, 'CYAN'),
                               self.string_color(retcode, 'RED')))
        if len(timeouts):
            output.append(' > %s has %s' %
                          (self.string_color(timeouts, 'CYAN'),
                           self.string_color('timeout', 'RED')))
        return output
Code Example #14
    def submitNodeList(self):
        # info msg
        print '\n# Step 1 of 3 : Please enter node names below (using the clustershell syntax <azur1>, <azur[1-2]>) :'
        # retrieve keyboard input
        try:
            self.ns = NodeSet(self.input_request(''))
            repeat = True
                
            # ask if the user wants to add another node/node group
            while repeat :
                # print added nodes
                for node in self.ns:
                    print 'node : %s' % node
                # user want to add nodes ?
                print '\n### Add nodes ? (yes | no)'
                # retrieve answer
                ans = self.input_request('')
                # check the ans
                if ans == 'Yes' or ans == 'Y' or ans == 'y' or ans == 'yes':
                   print '### Please enter the node/group list below : '
                   # retrieve and append nodes
                   self.ns.add(self.input_request(''))
                # the user doesn't want to add additional nodes
                else:
                   # unset flag
                   repeat = False
                   # check submitted nodes
                   self.ns = self.checkSubmittedNodes(self.ns)

        # invalid submitted node list / syntax error
        except NodeSetException:
            print >> sys.stderr, '\n(!) Error : the submitted node list [%s] is not valid\n' % self.ns
Code Example #15
File: BaseEntity.py  Project: cea-hpc/milkcheck
 def reset(self):
     '''Reset values of attributes in order to perform multiple exec.'''
     self._tagged = False
     self.target = self._target_backup
     self.status = NO_STATUS
     self.failed_nodes = NodeSet()
     self.algo_reversed = False
Code Example #16
File: JobImporterTorque.py  Project: jbaptl/hpcstats
 def torque_job_nodelist(self,nodelist):
     nodelist = self._exechostpat.sub('',nodelist)
     nodelist = nodelist.split('+')
     nbprocs = len(nodelist)
     nodelist = NodeSet.fromlist(nodelist)
     nbnodes = len(nodelist)
     nodelist = str(nodelist)
     return nbprocs, nbnodes, nodelist
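
The fromlist() call above is what folds the per-processor Torque host list into a node set; a standalone sketch with a made-up exec_host list:

from ClusterShell.NodeSet import NodeSet

# Hypothetical exec_host entries after stripping '/cpu' suffixes: one per processor
procs = ['cn1', 'cn1', 'cn2', 'cn3']
nodes = NodeSet.fromlist(procs)              # duplicates are folded away
print(len(procs), len(nodes), str(nodes))    # 4 3 cn[1-3]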
Code Example #17
    def testConfigurationParserBigTree(self):
        """test configuration parser against big propagation tree"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write('# this is a comment\n')
        tmpfile.write('[Main]\n')
        tmpfile.write('admin: ST[0-4]\n')
        tmpfile.write('ST[0-4]: STA[0-49]\n')
        tmpfile.write('STA[0-49]: nodes[0-10000]\n')
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet('admin,ST[0-4],STA[0-49],nodes[0-10000]')
        ns_tree = NodeSet()
        for nodegroup in parser.tree('admin'):
           ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
Code Example #18
    def testConfigurationParser(self):
        """test configuration parsing"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write('# this is a comment\n')
        tmpfile.write('[Main]\n')
        tmpfile.write('admin: nodes[0-1]\n')
        tmpfile.write('nodes[0-1]: nodes[2-5]\n')
        tmpfile.write('nodes[4-5]: nodes[6-9]\n')
        tmpfile.flush()
        parser = TopologyParser(tmpfile.name)

        parser.tree('admin')
        ns_all = NodeSet('admin,nodes[0-9]')
        ns_tree = NodeSet()
        for nodegroup in parser.tree('admin'):
           ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
Code Example #19
File: Clush.py  Project: cea-hpc/clustershell
    def _close_common(self, worker):
        verbexit = VERB_QUIET
        if self._display.maxrc:
            verbexit = VERB_STD
        # Display return code if not ok ( != 0)
        for rc, nodelist in worker.iter_retcodes():
            if rc != 0:
                nsdisp = ns = NodeSet._fromlist1(nodelist)
                if self._display.verbosity > VERB_QUIET and len(ns) > 1:
                    nsdisp = "%s (%d)" % (ns, len(ns))
                msgrc = "clush: %s: exited with exit code %d" % (nsdisp, rc)
                self._display.vprint_err(verbexit, msgrc)

        # Display nodes that didn't answer within command timeout delay
        if worker.num_timeout() > 0:
            self._display.vprint_err(verbexit, "clush: %s: command timeout" % \
                NodeSet._fromlist1(worker.iter_keys_timeout()))
Code Example #20
    def testConfigurationShortSyntax(self):
        """test short topology specification syntax"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write('# this is a comment\n')
        tmpfile.write('[Main]\n')
        tmpfile.write('admin: nodes[0-9]\n')
        tmpfile.write('nodes[0-3,5]: nodes[10-19]\n')
        tmpfile.write('nodes[4,6-9]: nodes[30-39]\n')
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet('admin,nodes[0-19,30-39]')
        ns_tree = NodeSet()
        for nodegroup in parser.tree('admin'):
           ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
Code Example #21
File: TuningModel.py  Project: bullxpfs/lustre-shine
    def parse(self, filename=None):
        """
        Function called to parse the content of the tuning configuration file
        and store the configuration in the object.
        """
        # Build the patterns to retrieve alias and parameter declaration
        alias_re = re.compile("alias\s+(\S+)\s*=\s*(\S+)$")
        parameter_re = re.compile('("[^"]+"|\S+)\s+(\S+)\s+(\S+)$')
        supported = NodeSet.fromlist(list(NODE_TYPES) + TYPE_ALIASES.keys())

        # Open the file and read each line
        try:
            tuning_file = open(filename or self.filename)

            for line in tuning_file.readlines():

                # Skip comments and blanks
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue

                m_alias = alias_re.match(line)
                m_param = parameter_re.match(line)

                if m_alias:
                    # This line is an alias creation
                    self.create_parameter_alias(m_alias.group(1),
                                                m_alias.group(2))

                elif m_param:
                    # This line is a parameter instantiation
                    nodes = NodeSet.fromlist(
                                           m_param.group(3).lower().split(';'))
                    self.create_parameter(m_param.group(2), m_param.group(1),
                                          nodes & supported, nodes - supported)

                else:
                    # This line is not recognized
                    raise TuningError("Wrong tuning syntax '%s'" % line)

            tuning_file.close()

        except IOError, error:
            msg = "Error while reading tuning configuration file: %s" % error
            raise TuningError(msg)
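
The `nodes & supported, nodes - supported` split above relies on NodeSet set operators; a small sketch with illustrative node-type names:

from ClusterShell.NodeSet import NodeSet

supported = NodeSet.fromlist(['mds', 'oss', 'client'])     # illustrative supported types
nodes = NodeSet.fromlist('oss;router'.lower().split(';'))
print(nodes & supported)    # oss     -> handled as a supported node type
print(nodes - supported)    # router  -> reported as unsupported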
Code Example #22
File: BaseEntity.py  Project: mdlx/milkcheck
 def update_target(self, nodeset, mode=None):
     '''Update the attribute target of an entity'''
     assert nodeset is not None
     if not mode:
         self.target = NodeSet(nodeset)
     elif mode == 'DIF' and self.target:
         self.target.difference_update(nodeset)
     elif mode == 'INT' and self.target:
         self.target.intersection_update(nodeset)
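
A minimal sketch of the two in-place set operations that the 'DIF' and 'INT' modes map to (node names are illustrative):

from ClusterShell.NodeSet import NodeSet

target = NodeSet("node[1-10]")
target.difference_update(NodeSet("node[3-5]"))     # 'DIF' mode
print(target)                                      # node[1-2,6-10]
target.intersection_update(NodeSet("node[6-8]"))   # 'INT' mode
print(target)                                      # node[6-8]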
Code Example #23
File: Worker.py  Project: cea-hpc/clustershell
 def iter_retcodes(self, match_keys=None):
     """
     Returns an iterator over return codes and associated NodeSet.
     If the optional parameter match_keys is defined, only keys
     found in match_keys are returned.
     """
     self._task_bound_check()
     for rc, keys in self.task._rc_iter_by_worker(self, match_keys):
         yield rc, NodeSet.fromlist(keys)
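
A hedged usage sketch of how a caller typically consumes this iterator after running a remote command (the command and node set are made up, and working ssh access to the nodes is assumed):

from ClusterShell.Task import task_self
from ClusterShell.NodeSet import NodeSet

task = task_self()
worker = task.shell("uname -r", nodes=NodeSet("node[1-4]"))
task.resume()

failed = NodeSet()
for rc, nodes in worker.iter_retcodes():
    if rc != 0:
        failed.update(nodes)      # nodes is already a NodeSet here
print("non-zero exit on:", failed if failed else "none")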
Code Example #24
File: TuningModel.py  Project: bullxpfs/lustre-shine
    def __init__(self, name, value, node_types=None, node_list=None):
        self.name = name
        self.value = value
        self._node_types = set()

        self.node_types = node_types or set()
        self.node_list = NodeSet()
        if node_list is not None:
            self.node_list = NodeSet.fromlist(node_list)
Code Example #25
File: Action.py  Project: cea-hpc/milkcheck
 def nodes_timeout(self):
     """Get nodeset of timeout nodes for this action."""
     if self.worker:
         if isinstance(self.worker, WorkerPopen):
             if self.worker.did_timeout():
                 return NodeSet("localhost")
         else:
             return NodeSet.fromlist(list(self.worker.iter_keys_timeout()))
     return NodeSet()
Code Example #26
    def __init__(self, root, topology, fanout=0):
        self.root = root
        self.topology = topology
        self.fanout = fanout
        self.nodes_fanin = {}
        self.table = None

        self.table_generate(root, topology)
        self._unreachable_hosts = NodeSet()
Code Example #27
File: Topology.py  Project: abitduck/clustershell
 def connected(self, src_ns):
     """find out and return the aggregation of directly connected children
     from src_ns.
     Argument src_ns is expected to be a NodeSet instance. Result is returned
     as a NodeSet instance
     """
     next_hop = NodeSet.fromlist([dst for dst in [route.dest(src_ns) for route in self._routes] if dst is not None])
     if len(next_hop) == 0:
         return None
     return next_hop
Code Example #28
File: Nodes.py  Project: tchaly-bethmaure/ProjetGrid
 def checkNodes(self):
     try:
        # print command info
        print '\n== Checking active nodes =='
        # run a simple remote echo on the specified nodes to check that they respond
        task_self().run('echo OK', nodes=self.ns)
        # retrieve and check return code
        for retcode, nodes in task_self().iter_retcodes():
            if retcode in (0, 1, 2):
                # add nodes to OK set
                self.ns_ok |= NodeSet.fromlist(nodes)
                print '%s : OK' % nodes
            else:
                # add nodes to KO set
                self.ns_ko |= NodeSet.fromlist(nodes)
                print '%s : KO' % nodes
     # syntax error
     except NodeSetException:
         print >> sys.stderr, '(!) Error : the submitted nodeset [%s] is not valid' % self.ns
Code Example #29
File: Worker.py  Project: cea-hpc/clustershell
 def iter_errors(self, match_keys=None):
     """
     Returns an iterator over available error buffers and associated
     NodeSet. If the optional parameter match_keys is defined, only
     keys found in match_keys are returned.
     """
     self._task_bound_check()
     for msg, keys in self.task._call_tree_matcher(
             self.task._msgtree(self.SNAME_STDERR).walk, match_keys, self):
         yield msg, NodeSet.fromlist(keys)
Code Example #30
    def testConfigurationParserDeepTree(self):
        """test a configuration that generates a deep tree"""
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.write('# this is a comment\n')
        tmpfile.write('[Main]\n')
        tmpfile.write('admin: nodes[0-9]\n')

        levels = 15 # how deep do you want the tree to be?
        for i in xrange(0, levels*10, 10):
            line = 'nodes[%d-%d]: nodes[%d-%d]\n' % (i, i+9, i+10, i+19)
            tmpfile.write(line)
        tmpfile.flush()
        parser = TopologyParser()
        parser.load(tmpfile.name)

        ns_all = NodeSet('admin,nodes[0-159]')
        ns_tree = NodeSet()
        for nodegroup in parser.tree('admin'):
           ns_tree.add(nodegroup.nodeset)
        self.assertEqual(str(ns_all), str(ns_tree))
Code Example #31
    def remove_mount_point(self):
        """Remove dfuse directory
        Raises:
            CommandFailure: In case of error deleting directory
        """
        # raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        dir_exists, _ = general_utils.check_file_exists(
            self.hosts, self.mount_dir.value, directory=True)
        if dir_exists:
            cmd = "rm -rf {}".format(self.mount_dir.value)
            ret_code = general_utils.pcmd(self.hosts, cmd, timeout=30)
            if 0 not in ret_code:
                error_hosts = NodeSet(
                    ",".join(
                        [str(node_set) for code, node_set in ret_code.items()
                         if code != 0]))
                raise CommandFailure(
                    "Error removing the {} dfuse mount point on the following "
                    "hosts: {}".format(self.mount_dir.value, error_hosts))
Code Example #32
File: Service.py  Project: mdlx/milkcheck
    def fromdict(self, svcdict):
        """Populate service attributes from dict."""
        BaseEntity.fromdict(self, svcdict)

        if 'actions' in svcdict:
            dependencies = {}
            actions = {}
            for names, props in svcdict['actions'].items():
                for name in NodeSet(names):
                    action = Action(name)
                    action.fromdict(props)

                    actions[name] = action
                    dependencies[name] = props.get('check', [])

            for action in actions.values():
                for dep in dependencies[action.name]:
                    action.add_dep(actions[dep])
                self.add_action(action)

        # Inherit properties between the service and its actions
        for action in self.iter_actions():
            action.inherits_from(self)
Code Example #33
File: soak.py  Project: yulujia/daos
    def remote_copy(self, hostlist, remote_dir, local_dir):
        """Copy files from remote dir to local dir.

        This is a temporary method and will be replaced by
        clush in general_utils
        Args:
                hostlist (list): list of remote nodes
                remote_dir (str): remote directory of files
                local_dir (str): local directory

        Returns:
            status: bool

        """
        this_host = socket.gethostname()
        # Copy logfiles from non-empty client directories
        command = "clush -w {} -B -S \"{}\"".format(
            NodeSet.fromlist(hostlist),
            "if [ ! -z \\\"\\$(ls -A {0})\\\" ]; then "
            "scp -p -r {0}/ \\\"{1}:'{2}/'\\\" && rm -rf {0}/*; fi".format(
                remote_dir, this_host, local_dir))
        status = process.run(command, timeout=300)
        return status
Code Example #34
File: FileSystem.py  Project: gauthierdelerce/shine
    def _distant_action_by_server(self, action_class, servers, **kwargs):

        # filter local server
        distant_servers = Server.distant_servers(servers)

        # perform action on distant servers
        if len(distant_servers) > 0:
            action = action_class(nodes=distant_servers, fs=self, **kwargs)
            action.launch()
            self._run_actions()

            if action.status() == ACT_ERROR:
                err_code = None
                if task_self().num_timeout():
                    err_code = -1
                elif task_self().max_retcode():
                    err_code = task_self().max_retcode()

                # FSRemoteError is limited and cannot handle more than 1 error
                msg, nodes = list(self.proxy_errors.walk())[0]
                nodes = NodeSet.fromlist(nodes)
                msg = str(msg).replace('THIS_SHINE_HOST', str(nodes))
                raise FSRemoteError(nodes, err_code, msg)
Code Example #35
File: general_utils.py  Project: kjacque/daos
def pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the return codes.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): expected return code. Defaults to 0.

    Returns:
        dict: a dictionary of return codes keys and accompanying NodeSet
            values indicating which hosts yielded the return code.

    """
    # Run the command on each host in parallel
    results = run_pcmd(hosts, command, verbose, timeout, expect_rc)
    exit_status = {}
    for result in results:
        if result["exit_status"] not in exit_status:
            exit_status[result["exit_status"]] = NodeSet()
        exit_status[result["exit_status"]].add(result["hosts"])
    return exit_status
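
For illustration, the mapping returned by pcmd() can then be checked like this (the hosts and exit codes below are hypothetical):

from ClusterShell.NodeSet import NodeSet

# Hypothetical pcmd() result: exit status -> hosts that returned it
result = {0: NodeSet("node[1-2]"), 3: NodeSet("node3")}
if set(result) != {0}:
    bad = NodeSet.fromlist(str(ns) for code, ns in result.items() if code != 0)
    print("non-zero exit status on:", bad)   # node3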
Code Example #36
    def run(self):
        """Run the dfuse command.

        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # A log file must be defined to ensure logs are captured
        if "D_LOG_FILE" not in self._pre_command:
            raise CommandFailure(
                "Dfuse missing environment varaibles for D_LOG_FILE")

        # create dfuse dir if it does not exist
        self.create_mount_point()

        # run dfuse command
        ret_code = pcmd(self.hosts, self.__str__(), timeout=30)

        if 0 in ret_code:
            self.running_hosts.add(ret_code[0])
            del ret_code[0]

        if len(ret_code):
            error_hosts = NodeSet(
                ",".join(
                    [str(node_set) for code, node_set in ret_code.items()
                     if code != 0]))
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    error_hosts))

        if not self.check_running(fail_on_error=False):
            self.log.info('Waiting five seconds for dfuse to start')
            time.sleep(5)
            self.check_running()
Code Example #37
    def testDisplayRegroup(self):
        """test CLI.Display (regroup)"""
        f = makeTestFile("""
# A comment

[Main]
default: local

[local]
map: echo hostfoo
#all:
list: echo all
#reverse:
        """)
        res = GroupResolverConfig(f.name)
        set_std_group_resolver(res)
        try:
            parser = OptionParser("dummy")
            parser.install_display_options(verbose_options=True)
            options, _ = parser.parse_args(["-r"])

            disp = Display(options, color=False)
            self.assertEqual(disp.regroup, True)
            disp.out = StringIO()
            disp.err = StringIO()
            self.assertEqual(disp.line_mode, False)

            ns = NodeSet("hostfoo")

            # nodeset.regroup() is performed by print_gather()
            disp.print_gather(ns, "message0\nmessage1\n")
            self.assertEqual(
                disp.out.getvalue(),
                "---------------\n@all\n---------------\nmessage0\nmessage1\n\n"
            )
        finally:
            set_std_group_resolver(None)
Code Example #38
File: CLIClushTest.py  Project: samkos/clustershell
    def test_017_retcodes(self):
        """test clush (retcodes)"""
        s = "clush: %s: exited with exit code 1\n" % HOSTNAME
        exp_err = s.encode()
        self._clush_t(["-w", HOSTNAME, "/bin/false"], None, b"", 0, exp_err)
        self._clush_t(["-w", HOSTNAME, "-b", "/bin/false"], None, b"", 0, exp_err)
        self._clush_t(["-S", "-w", HOSTNAME, "/bin/false"], None, b"", 1, exp_err)
        for i in (1, 2, 127, 128, 255):
            s = "clush: %s: exited with exit code %d\n" % (HOSTNAME, i)
            self._clush_t(["-S", "-w", HOSTNAME, "exit %d" % i], None, b"", i,
                          s.encode())
        self._clush_t(["-v", "-w", HOSTNAME, "/bin/false"], None, b"", 0,
                      exp_err)

        duo = str(NodeSet("%s,localhost" % HOSTNAME))
        s = "clush: %s (%d): exited with exit code 1\n" % (duo, 2)
        self._clush_t(["-w", duo, "-b", "/bin/false"], None, b"", 0, s.encode())
        s = "clush: %s: exited with exit code 1\n" % duo
        self._clush_t(["-w", duo, "-b", "-q", "/bin/false"], None, b"", 0,
                      s.encode())
        s = "clush: %s (%d): exited with exit code 1\n" % (duo, 2)
        self._clush_t(["-w", duo, "-S", "-b", "/bin/false"], None, b"", 1,
                      s.encode())
        self._clush_t(["-w", duo, "-S", "-b", "-q", "/bin/false"], None, b"", 1)
Code Example #39
File: soak.py  Project: zhenshuitieniu/daos
    def remote_copy(self, hostlist, remote_dir, local_dir):
        """Copy files from remote dir to local dir.

        Args:
                hostlist (list): list of remote nodes
                remote_dir (str): remote directory of files
                local_dir (str): local directory

        Raises:
            SoakTestError: if there is an error with the remote copy

        """
        this_host = socket.gethostname()
        result = pcmd(
            NodeSet.fromlist(hostlist),
            "if [ ! -z '$(ls -A {0})' ]; then "
            "scp -p -r {0}/ \"{1}:'{2}/'\" && rm -rf {0}/*; fi".format(
                remote_dir, this_host, local_dir),
            verbose=False)
        if len(result) > 1 or 0 not in result:
            raise SoakTestError(
                "Error executing remote copy: {}".format(
                    ", ".join(
                        [str(result[key]) for key in result if key != 0])))
Code Example #40
File: Tree.py  Project: xdelaruelle/clustershell
    def _copy_remote(self, source, dest, targets, gateway, timeout, reverse):
        """run a remote copy in tree mode (using gateway)"""
        self.logger.debug("_copy_remote gateway=%s source=%s dest=%s "
                          "reverse=%s", gateway, source, dest, reverse)

        self._target_count += len(targets)

        self.gwtargets.setdefault(str(gateway), NodeSet()).add(targets)

        # tar commands are built here and launched on targets
        if reverse:
            # these weird replace calls aim to escape single quotes ' within ''
            srcdir = dirname(source).replace("'", '\'\"\'\"\'')
            srcbase = basename(normpath(self.source)).replace("'", '\'\"\'\"\'')
            cmd = self.TAR_CMD_FMT % (srcdir, srcbase)
        else:
            cmd = self.UNTAR_CMD_FMT % dest.replace("'", '\'\"\'\"\'')

        self.logger.debug('_copy_remote: tar cmd: %s', cmd)

        pchan = self.task._pchannel(gateway, self)
        pchan.shell(nodes=targets, command=cmd, worker=self, timeout=timeout,
                    stderr=self.stderr, gw_invoke_cmd=self.invoke_gateway,
                    remote=self.remote)
Code Example #41
def print_source_groups(source, level, xset, opts):
    """
    Print groups from a source, a level of verbosity and an optional
    nodeset acting as a filter.
    """
    # list groups of some specified nodes?
    if opts.all or xset or opts.and_nodes or opts.sub_nodes or opts.xor_nodes:
        # When some node sets are provided as argument, the list command
        # retrieves node groups these nodes belong to, thanks to the
        # groups() method.
        # Note: stdin support is enabled when '-' is found.
        groups = xset.groups(source, opts.groupbase)
        # sort by group name
        for group, (gnodes, inodes) in sorted(groups.items()):
            if level == 1:
                print(group)
            elif level == 2:
                print("%s %s" % (group, inodes))
            else:
                print("%s %s %d/%d" %
                      (group, inodes, len(inodes), len(gnodes)))
    else:
        # "raw" group list when no argument at all
        for group in grouplist(source):
            if source and not opts.groupbase:
                nsgroup = "@%s:%s" % (source, group)
            else:
                nsgroup = "@%s" % group
            if level == 1:
                print(nsgroup)
            else:
                nodes = NodeSet(nsgroup)
                if level == 2:
                    print("%s %s" % (nsgroup, nodes))
                else:
                    print("%s %s %d" % (nsgroup, nodes, len(nodes)))
Code Example #42
File: FileSystem.py  Project: gauthierdelerce/shine
    def distant_event(self, evtype, node, **params):

        # Update the local component instance with the provided instance
        # if one is available in params.
        if evtype == 'comp':
            other = params['info'].elem
            other.fs = self
            try:
                # Special hack for Journal object as they are not put in
                # components list.
                if other.TYPE == Journal.TYPE:
                    other.target.fs = self
                    target = self.components[other.target.uniqueid()]
                    target.journal.update(other)
                    comp = target.journal
                else:
                    comp = self.components[other.uniqueid()]
                    # comp.update() updates the component state
                    # and disk information if the component is a target.
                    # This information doesn't need to be updated unless
                    # we are on a completion event.
                    if params['status'] not in ('start', 'progress'):
                        # ensure other.server is the actual distant server
                        other.server = comp.allservers().select(
                            NodeSet(node))[0]

                        # update target from remote one
                        comp.update(other)

                # substitute target parameter by local one
                params['comp'] = comp
            except KeyError as error:
                print("ERROR: Component update failed (%s)" % str(error),
                      file=sys.stderr)

        self.hdlr.event_callback(evtype, node=node, **params)
Code Example #43
    def _parse_token(self, token):
        """Concrete implementation of parent abstract method.

        :Parameters:
            according to parent :py:meth:`cumin.backends.BaseQueryAggregator._parse_token`.
        """
        if not isinstance(token, pp.ParseResults
                          ):  # pragma: no cover - this should never happen
            raise InvalidQueryError(
                'Expecting ParseResults object, got {type}: {token}'.format(
                    type=type(token), token=token))

        token_dict = token.asDict()
        self.logger.trace('Token is: %s | %s', token_dict, token)

        if 'hosts' in token_dict:
            element = self._get_stack_element()
            element['hosts'] = NodeSet.fromlist(token_dict['hosts'],
                                                resolver=self.resolver)
            if 'bool' in token_dict:
                element['bool'] = token_dict['bool']
            self.stack_pointer['children'].append(element)
        elif 'open_subgroup' in token_dict and 'close_subgroup' in token_dict:
            self._open_subgroup()
            if 'bool' in token_dict:
                self.stack_pointer['bool'] = token_dict['bool']
            for subtoken in token:
                if isinstance(
                        subtoken, str
                ):  # Grammar literals, boolean operators and parentheses
                    continue
                self._parse_token(subtoken)
            self._close_subgroup()
        else:  # pragma: no cover - this should never happen
            raise InvalidQueryError(
                'Got unexpected token: {token}'.format(token=token))
Code Example #44
File: ior_test_base.py  Project: dsikich/daos
    def _execute_command(self,
                         command,
                         fail_on_err=True,
                         display_output=True,
                         hosts=None):
        """Execute the command on all client hosts.

        Optionally verify if the command returns a non zero return code.

        Args:
            command (str): the command to execute on the client hosts
            fail_on_err (bool, optional): whether or not to fail the test if
                command returns a non zero return code. Defaults to True.
            display_output (bool, optional): whether or not to display output.
                Defaults to True.
            hosts (list, optional): hosts on which to run the command.
                Defaults to self.hostlist_clients.

        Raises:
            CommandFailure: if 'fail_on_err' is set and the command fails on at
                least one of the client hosts

        Returns:
            dict: a dictionary of return codes keys and accompanying NodeSet
                values indicating which hosts yielded the return code.

        """
        if hosts is None:
            hosts = self.hostlist_clients
        result = pcmd(hosts, command, verbose=display_output, timeout=300)
        if 0 not in result and fail_on_err:
            hosts = [
                str(nodes) for code, nodes in list(result.items()) if code != 0
            ]
            raise CommandFailure(
                "Error running '{}' on the following hosts: {}".format(
                    command, NodeSet(",".join(hosts))))
        return result
Code Example #45
File: test.py  Project: clicx/daos
    def get_partition_hosts(self, partition_key, host_list):
        """[summary].

        Args:
            partition_key ([type]): [description]
            host_list ([type]): [description]

        Returns:
            tuple: [description]

        """
        hosts = []
        partiton_name = self.params.get(partition_key, "/run/hosts/*")
        if partiton_name is not None:
            cmd = "scontrol show partition {}".format(partiton_name)

            try:
                result = process.run(cmd, shell=True, timeout=10)
            except process.CmdError as error:
                self.log.warning("Unable to obtain hosts from the {} slurm "
                                 "partition: {}".format(partiton_name, error))
                result = None
            if result:
                output = result.stdout
                try:
                    hosts = list(
                        NodeSet(re.findall(r"\s+Nodes=(.*)", output)[0]))
                except (NodeSetParseError, IndexError):
                    self.log.warning(
                        "Unable to obtain hosts from the {} slurm partition "
                        "output: {}".format(partiton_name, output))

        if hosts:
            return hosts, partiton_name
        else:
            return host_list, None
Code Example #46
class PdshClient(ExecClient):
    """EngineClient which run 'pdsh'"""

    MODE = 'pdsh'

    def __init__(self,
                 node,
                 command,
                 worker,
                 stderr,
                 timeout,
                 autoclose=False,
                 rank=None):
        ExecClient.__init__(self, node, command, worker, stderr, timeout,
                            autoclose, rank)
        self._closed_nodes = NodeSet()

    def _build_cmd(self):
        """
        Build the shell command line to start the command.
        Return an array of command and arguments.
        """
        task = self.worker.task
        pdsh_env = {}

        # Build pdsh command
        executable = task.info("pdsh_path") or "pdsh"
        cmd_l = [executable, "-b"]

        fanout = task.info("fanout", 0)
        if fanout > 0:
            cmd_l.append("-f %d" % fanout)

        # Pdsh flag '-t' does not really work well. It is better to use the
        # PDSH_SSH_ARGS_APPEND variable to transmit the ssh ConnectTimeout
        # flag.
        connect_timeout = task.info("connect_timeout", 0)
        if connect_timeout > 0:
            pdsh_env['PDSH_SSH_ARGS_APPEND'] = "-o ConnectTimeout=%d" % \
                    connect_timeout

        command_timeout = task.info("command_timeout", 0)
        if command_timeout > 0:
            cmd_l.append("-u %d" % command_timeout)

        cmd_l.append("-w %s" % self.key)
        cmd_l.append("%s" % self.command)

        return (cmd_l, pdsh_env)

    def _close(self, abort, timeout):
        """Close client. See EngineClient._close()."""
        if abort:
            # it's safer to call poll() first for long time completed processes
            prc = self.popen.poll()
            # if prc is None, process is still running
            if prc is None:
                try:  # try to kill it
                    self.popen.kill()
                except OSError:
                    pass
        prc = self.popen.wait()

        if prc > 0:
            raise WorkerError("Cannot run pdsh (error %d)" % prc)

        self.streams.clear()

        if timeout:
            assert abort, "abort flag not set on timeout"
            for node in (self.key - self._closed_nodes):
                self.worker._on_node_timeout(node)
        else:
            for node in (self.key - self._closed_nodes):
                self.worker._on_node_rc(node, 0)

        self.worker._check_fini()

    def _parse_line(self, line, fname):
        """
        Parse Pdsh line syntax.
        """
        if line.startswith("pdsh@") or \
           line.startswith("pdcp@") or \
           line.startswith("sending "):
            try:
                # pdsh@cors113: cors115: ssh exited with exit code 1
                #       0          1      2     3     4    5    6  7
                # corsUNKN: ssh: corsUNKN: Name or service not known
                #     0      1       2       3  4     5     6    7
                # pdsh@fortoy0: fortoy101: command timeout
                #     0             1         2       3
                # sending SIGTERM to ssh fortoy112 pid 32014
                #     0      1     2  3      4      5    6
                # pdcp@cors113: corsUNKN: ssh exited with exit code 255
                #     0             1      2    3     4    5    6    7
                # pdcp@cors113: cors115: fatal: /var/cache/shine/...
                #     0             1      2                   3...

                words = line.split()
                # Set return code for nodename of worker
                if self.MODE == 'pdsh':
                    if len(words) == 4 and words[2] == "command" and \
                       words[3] == "timeout":
                        pass
                    elif len(words) == 8 and words[3] == "exited" and \
                         words[7].isdigit():
                        self._closed_nodes.add(words[1][:-1])
                        self.worker._on_node_rc(words[1][:-1], int(words[7]))
                elif self.MODE == 'pdcp':
                    self._closed_nodes.add(words[1][:-1])
                    self.worker._on_node_rc(words[1][:-1], errno.ENOENT)

            except Exception, exc:
                print >> sys.stderr, exc
                raise EngineClientError()
        else:
Code Example #47
File: Topology.py  Project: ypsah/clustershell
class TopologyNodeGroup(object):
    """Base element for in-memory representation of the propagation tree.
    Contains a nodeset, with parent-children relationships with other
    instances.
    """
    def __init__(self, nodeset=None):
        """initialize a new TopologyNodeGroup instance."""
        # Base nodeset
        self.nodeset = nodeset
        # Parent TopologyNodeGroup (TNG) instance
        self.parent = None
        # List of children TNG instances
        self._children = []
        self._children_len = 0
        # provided for convenience
        self._children_ns = None

    def printable_subtree(self, prefix=''):
        """recursive method that returns a printable version the subtree from
        the current node with a nice presentation
        """
        res = ''
        # For now, it is ok to use a recursive method here as we consider that
        # tree depth is relatively small.
        if self.parent is None:
            # root
            res = '%s\n' % str(self.nodeset)
        elif self.parent.parent is None:
            # first level
            if not self._is_last():
                res = '|- %s\n' % str(self.nodeset)
            else:
                res = '`- %s\n' % str(self.nodeset)
        else:
            # deepest levels...
            if not self.parent._is_last():
                prefix += '|  '
            else:
                # fix last line
                prefix += '   '
            if not self._is_last():
                res = '%s|- %s\n' % (prefix, str(self.nodeset))
            else:
                res = '%s`- %s\n' % (prefix, str(self.nodeset))
        # perform recursive calls to print out every node
        for child in self._children:
            res += child.printable_subtree(prefix)
        return res

    def add_child(self, child):
        """add a child to the children list and define the current instance as
        its parent
        """
        assert isinstance(child, TopologyNodeGroup)

        if child in self._children:
            return
        child.parent = self
        self._children.append(child)
        if self._children_ns is None:
            self._children_ns = NodeSet()
        self._children_ns.add(child.nodeset)

    def clear_child(self, child, strict=False):
        """remove a child"""
        try:
            self._children.remove(child)
            self._children_ns.difference_update(child.nodeset)
            if len(self._children_ns) == 0:
                self._children_ns = None
        except ValueError:
            if strict:
                raise

    def clear_children(self):
        """delete all children"""
        self._children = []
        self._children_ns = None

    def children(self):
        """get the children list"""
        return self._children

    def children_ns(self):
        """return the children as a nodeset"""
        return self._children_ns

    def children_len(self):
        """returns the number of children as the sum of the size of the
        children's nodeset
        """
        if self._children_ns is None:
            return 0
        else:
            return len(self._children_ns)

    def _is_last(self):
        """used to display the subtree: we won't prefix the line the same way if
        the current instance is the last child of the children list of its
        parent.
        """
        return self.parent._children[-1::][0] == self

    def __str__(self):
        """printable representation of the nodegroup"""
        return '<TopologyNodeGroup (%s)>' % str(self.nodeset)
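
A small usage sketch of the class above (it assumes the TopologyNodeGroup definition shown here plus a NodeSet import; node names are illustrative):

from ClusterShell.NodeSet import NodeSet

root = TopologyNodeGroup(NodeSet("admin"))
gateways = TopologyNodeGroup(NodeSet("gw[0-1]"))
root.add_child(gateways)
print(root.printable_subtree(), end='')
# admin
# `- gw[0-1]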
Code Example #48
def get_partition_hosts(partition, reservation=None):
    """Get a list of hosts in the specified slurm partition and reservation.

    Args:
        partition (str): name of the partition
        reservation (str): name of reservation

    Returns:
        list: list of hosts in the specified partition

    """
    log = getLogger()
    hosts = []
    if partition is not None:
        # Get the partition name information
        cmd = "scontrol show partition {}".format(partition)
        try:
            result = process.run(cmd, timeout=10)
        except process.CmdError as error:
            log.warning(
                "Unable to obtain hosts from the %s slurm "
                "partition: %s", partition, error)
            result = None

        if result:
            # Get the list of hosts from the partition information
            output = result.stdout
            try:
                hosts = list(NodeSet(re.findall(r"\s+Nodes=(.*)", output)[0]))
            except (NodeSetParseError, IndexError):
                log.warning(
                    "Unable to obtain hosts from the %s slurm partition "
                    "output: %s", partition, output)
                hosts = []
            if hosts and reservation is not None:
                # Get the list of hosts from the reservation information
                cmd = "scontrol show reservation {}".format(reservation)
                try:
                    result = process.run(cmd, timeout=10)
                except process.CmdError as error:
                    log.warning(
                        "Unable to obtain hosts from the %s slurm "
                        "reservation: %s", reservation, error)
                    result = None
                    hosts = []
                if result:
                    # Get the list of hosts from the reservation information
                    output = result.stdout
                    try:
                        reservation_hosts = list(
                            NodeSet(re.findall(r"\sNodes=(\S+)", output)[0]))
                    except (NodeSetParseError, IndexError):
                        log.warning(
                            "Unable to obtain hosts from the %s slurm "
                            "reservation output: %s", reservation, output)
                        reservation_hosts = []
                    is_subset = set(reservation_hosts).issubset(set(hosts))
                    if reservation_hosts and is_subset:
                        hosts = reservation_hosts
                    else:
                        hosts = []
    return hosts
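
The NodeSet expansion step used above can be tried in isolation (the scontrol output line is made up):

import re
from ClusterShell.NodeSet import NodeSet

# Hypothetical line captured from `scontrol show partition`
output = "   Nodes=wolf-[31-34,40]\n"
hosts = list(NodeSet(re.findall(r"\s+Nodes=(.*)", output)[0]))
print(hosts)   # ['wolf-31', 'wolf-32', 'wolf-33', 'wolf-34', 'wolf-40']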
Code Example #49
def pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the return codes.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): expected return code. Defaults to 0.

    Returns:
        dict: a dictionary with return codes as keys and NodeSet values
            indicating which hosts yielded each return code.

    """
    # Run the command on each host in parallel
    task = run_task(hosts, command, timeout)

    # Report any errors
    retcode_dict = {}
    errors = False
    for retcode, rc_nodes in task.iter_retcodes():
        # Create a NodeSet for this list of nodes
        nodeset = NodeSet.fromlist(rc_nodes)

        # Include this NodeSet for this return code
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(nodeset)

        # Keep track of any errors
        if expect_rc is not None and expect_rc != retcode:
            errors = True

    # Report command output if requested or errors are detected
    if verbose or errors:
        print("Command:\n  {}".format(command))
        print("Command return codes:")
        for retcode in sorted(retcode_dict):
            print("  {}: rc={}".format(retcode_dict[retcode], retcode))

        print("Command output:")
        for output, bf_nodes in task.iter_buffers():
            # Create a NodeSet for this list of nodes
            nodeset = NodeSet.fromlist(bf_nodes)

            # Display the output per node set
            print("  {}:\n    {}".format(
                nodeset, "\n    ".join(str(output).splitlines())))

    # Report any timeouts
    if timeout and task.num_timeout() > 0:
        nodes = task.iter_keys_timeout()
        print("{}: timeout detected running '{}' on {}/{} hosts after {}s".
              format(NodeSet.fromlist(nodes), command, task.num_timeout(),
                     len(hosts), timeout))
        retcode = 255
        if retcode not in retcode_dict:
            retcode_dict[retcode] = NodeSet()
        retcode_dict[retcode].add(NodeSet.fromlist(nodes))

    return retcode_dict
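A short sketch of how the pcmd() return value might be consumed; the host names and command are invented, and pcmd() is assumed to be in scope as defined above.

# Hypothetical usage of pcmd(); hosts and command are invented.
retcodes = pcmd(["wolf-1", "wolf-2", "wolf-3"], "uname -r", verbose=False)
for retcode in sorted(retcodes):
    # each value is a NodeSet of the hosts that returned this exit code
    print("rc={} on {}".format(retcode, retcodes[retcode]))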
Code example #50
    def start(self, args):
        dependanceManager = dep.dep()
        nodes = NodeSet()
        depNode = NodeSet()

        nbNoeud = len(args) - 2
        # print 'nbNoeud: %d' % nbNoeud
        for i in range(1, nbNoeud + 1):
            nodes.add(args[i])

        dependance = 1
        if os.path.isfile("cfg/" + args[nbNoeud + 1]):
            # check the dependencies
            dependanceManager.toInstall("cfg/" + args[nbNoeud + 1])
            dependanceManager.toStart("cfg/" + args[nbNoeud + 1])

            # retrieve the dependencies
            startNode = dependanceManager.getNodeStarted()
            startServices = dependanceManager.getStarted()
            installNode = dependanceManager.getNodeIs_install()
            installService = dependanceManager.getIs_install()

            # for each dependent node

            for node in installNode:
                depNode.add(node)
                for service in installService:
                    task_self().run('sudo service ' + service + ' status',
                                    nodes=depNode)
                    ret = self.status([node, service], 2)
                    depNode = NodeSet()
                    if ret == 0:
                        print 'Service: ' + service + ' on: ' + node + ' status: not installed'
                        dependance = 0
                    elif ret == 1:
                        print 'Service: ' + service + ' on: ' + node + ' status: installed'

            # for each dependent node
            for node in startNode:
                depNode.add(node)
                for service in startServices:
                    task_self().run('sudo service ' + service + ' status',
                                    nodes=depNode)
                    ret = self.status([node, service], 1)
                    depNode = NodeSet()
                    if ret == 0:
                        print 'Service: ' + service + ' on: ' + node + ' status: not started'
                        dependance = 0
                    elif ret == 1:
                        print 'Service: ' + service + ' on: ' + node + ' status: started'

        # print dependance

        if dependance == 1:
            print 'dependencies OK'
            for i in range(1, nbNoeud + 1):
                print args[i] + ' : sudo service ' + args[nbNoeud + 1] + ' start'
            task_self().run('sudo service ' + args[nbNoeud + 1] + ' start',
                            nodes=nodes)
        else:
            print 'dependencies KO'
Code example #52
File: launch.py Project: marcelarosalesj/daos
def set_test_environment(args):
    """Set up the test environment.

    Args:
        args (argparse.Namespace): command line arguments for this program

    Returns:
        None

    """
    base_dir = get_build_environment()["PREFIX"]
    bin_dir = os.path.join(base_dir, "bin")
    sbin_dir = os.path.join(base_dir, "sbin")
    # /usr/sbin is not in the PATH for non-root users on CI nodes, and the
    # SCM formatting tool mkfs.ext4 is located under /usr/sbin.
    usr_sbin = os.path.sep + os.path.join("usr", "sbin")
    path = os.environ.get("PATH")

    # Get the default interface to use if OFI_INTERFACE is not set
    interface = os.environ.get("OFI_INTERFACE")
    if interface is None:
        # Find all the /sys/class/net interfaces on the launch node
        # (excluding lo)
        print("Detecting network devices - OFI_INTERFACE not set")
        available_interfaces = {}
        net_path = os.path.join(os.path.sep, "sys", "class", "net")
        net_list = [dev for dev in os.listdir(net_path) if dev != "lo"]
        for device in sorted(net_list):
            # Get the interface state - only include active (up) interfaces
            with open(os.path.join(net_path, device, "operstate"), "r") as \
                 fileh:
                state = fileh.read().strip()
            # Only include interfaces that are up
            if state.lower() == "up":
                # Get the interface speed - used to select the fastest available
                with open(os.path.join(net_path, device, "speed"), "r") as \
                    fileh:
                    try:
                        speed = int(fileh.read().strip())
                        # KVM/Qemu/libvirt returns an EINVAL
                    except IOError as ioerror:
                        if ioerror.errno == errno.EINVAL:
                            speed = 1000
                        else:
                            raise
                print("  - {0:<5} (speed: {1:>6} state: {2})".format(
                    device, speed, state))
                # Only include the first active interface for each speed - first
                # is determined by an alphabetic sort: ib0 will be checked
                # before ib1
                if speed not in available_interfaces:
                    available_interfaces[speed] = device
        print("Available interfaces: {}".format(available_interfaces))
        try:
            # Select the fastest active interface available by sorting the speed
            interface = available_interfaces[sorted(available_interfaces)[-1]]
        except IndexError:
            print("Error obtaining a default interface from: {}".format(
                os.listdir(net_path)))
            exit(1)
    print("Using {} as the default interface".format(interface))

    # Update env definitions
    os.environ["PATH"] = ":".join([bin_dir, sbin_dir, usr_sbin, path])
    os.environ["CRT_CTX_SHARE_ADDR"] = "1"
    os.environ["OFI_INTERFACE"] = os.environ.get("OFI_INTERFACE", interface)

    # Set the default location for daos log files written during testing if not
    # already defined.
    if "DAOS_TEST_LOG_DIR" not in os.environ:
        os.environ["DAOS_TEST_LOG_DIR"] = DEFAULT_DAOS_TEST_LOG_DIR
    os.environ["D_LOG_FILE"] = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                            "daos.log")

    # Ensure the daos log files directory exists on each possible test node
    test_hosts = NodeSet(socket.gethostname().split(".")[0])
    test_hosts.update(args.test_clients)
    test_hosts.update(args.test_servers)
    spawn_commands(test_hosts,
                   "mkdir -p {}".format(os.environ["DAOS_TEST_LOG_DIR"]))

    # Python paths required for functional testing
    python_version = "python{}{}".format(
        version_info.major,
        "" if version_info.major > 2 else ".{}".format(version_info.minor))
    required_python_paths = [
        os.path.abspath("util/apricot"),
        os.path.abspath("util"),
        os.path.join(base_dir, "lib64", python_version, "site-packages"),
    ]

    # Check the PYTHONPATH env definition
    python_path = os.environ.get("PYTHONPATH")
    if python_path is None or python_path == "":
        # Use the required paths to define the PYTHONPATH env if it is not set
        os.environ["PYTHONPATH"] = ":".join(required_python_paths)
    else:
        # Append any missing required paths to the existing PYTHONPATH env
        defined_python_paths = [
            os.path.abspath(os.path.expanduser(path))
            for path in python_path.split(":")
        ]
        for required_path in required_python_paths:
            if required_path not in defined_python_paths:
                python_path += ":" + required_path
        os.environ["PYTHONPATH"] = python_path
Code example #53
def run_pcmd(hosts, command, verbose=True, timeout=None, expect_rc=0):
    """Run a command on each host in parallel and get the results.

    Args:
        hosts (list): list of hosts
        command (str): the command to run in parallel
        verbose (bool, optional): display command output. Defaults to True.
        timeout (int, optional): command timeout in seconds. Defaults to None.
        expect_rc (int, optional): display output if the command return code
            does not match this value. Defaults to 0. A value of None will
            bypass this feature.

    Returns:
        list: a list of dictionaries with each entry containing output, exit
            status, and interrupted status common to each group of hosts, e.g.:
                [
                    {
                        "command": "ls my_dir",
                        "hosts": NodeSet(wolf-[1-3]),
                        "exit_status": 0,
                        "interrupted": False,
                        "stdout": ["file1.txt", "file2.json"],
                    },
                    {
                        "command": "ls my_dir",
                        "hosts": NodeSet(wolf-[4]),
                        "exit_status": 1,
                        "interrupted": False,
                        "stdout": ["No such file or directory"],
                    },
                    {
                        "command": "ls my_dir",
                        "hosts": NodeSet(wolf-[5-6]),
                        "exit_status": 255,
                        "interrupted": True,
                        "stdout": [""]
                    },
                ]

    """
    log = getLogger()
    results = []

    # Run the command on each host in parallel
    task = run_task(hosts, command, timeout)

    # Get the exit status of each host
    host_exit_status = {host: None for host in hosts}
    for exit_status, host_list in task.iter_retcodes():
        for host in host_list:
            host_exit_status[host] = exit_status

    # Get a list of any interrupted hosts
    host_interrupted = []
    if timeout and task.num_timeout() > 0:
        host_interrupted.extend(list(task.iter_keys_timeout()))

    # Iterate through all the groups of common output
    output_data = list(task.iter_buffers())
    if not output_data:
        output_data = [["", hosts]]
    for output, host_list in output_data:
        # Determine the unique exit status for each host with the same output
        output_exit_status = {}
        for host in host_list:
            if host_exit_status[host] not in output_exit_status:
                output_exit_status[host_exit_status[host]] = NodeSet()
            output_exit_status[host_exit_status[host]].add(host)

        # Determine the unique interrupted state for each host with the same
        # output and exit status
        for exit_status in output_exit_status:
            output_interrupted = {}
            for host in list(output_exit_status[exit_status]):
                is_interrupted = host in host_interrupted
                if is_interrupted not in output_interrupted:
                    output_interrupted[is_interrupted] = NodeSet()
                output_interrupted[is_interrupted].add(host)

            # Add a result entry for each group of hosts with the same output,
            # exit status, and interrupted status
            for interrupted in output_interrupted:
                results.append({
                    "command": command,
                    "hosts": output_interrupted[interrupted],
                    "exit_status": exit_status,
                    "interrupted": interrupted,
                    "stdout": [
                        line.decode("utf-8").rstrip(os.linesep)
                        for line in output
                    ],
                })

    # Display results if requested or there is an unexpected exit status
    bad_exit_status = [
        item["exit_status"] for item in results
        if expect_rc is not None and item["exit_status"] != expect_rc
    ]
    if verbose or bad_exit_status:
        log.info(colate_results(command, results))

    return results
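A sketch of iterating over run_pcmd() results, which are grouped by common output, exit status, and interrupted state; the hosts and command are invented, and run_pcmd() is assumed to be in scope as defined above.

# Hypothetical usage of run_pcmd(); hosts and command are invented.
results = run_pcmd(["wolf-1", "wolf-2"], "ls /mnt/daos", verbose=False,
                   expect_rc=None)
for result in results:
    print("{}: rc={} interrupted={}".format(
        result["hosts"], result["exit_status"], result["interrupted"]))
    for line in result["stdout"]:
        print("  {}".format(line))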
Code example #54
    def testBadTopologies(self):
        """test detecting invalid topologies"""
        g = TopologyGraph()
        admin = NodeSet('admin')
        # Add the same nodeset twice
        ns0 = NodeSet('nodes[0-9]')
        ns1 = NodeSet('nodes[10-19]')
        ns2 = NodeSet('nodes[20-29]')

        g.add_route(admin, ns0)
        g.add_route(ns0, ns1)
        g.add_route(ns0, ns2)

        # add a superset of a known destination as source
        ns2_sup = NodeSet('somenode[0-10]')
        ns2_sup.add(ns2)
        self.assertRaises(TopologyError, g.add_route, ns2_sup, NodeSet('foo1'))

        # Add a known dst nodeset as a src nodeset
        ns3 = NodeSet('nodes[30-39]')
        g.add_route(ns1, ns3)

        # Add a subset of a known src nodeset as src
        ns0_sub = NodeSet(','.join(ns0[:3:]))
        ns4 = NodeSet('nodes[40-49]')
        g.add_route(ns0_sub, ns4)

        # Add a subset of a known dst nodeset as src
        ns1_sub = NodeSet(','.join(ns1[:3:]))
        self.assertRaises(TopologyError, g.add_route, ns4, ns1_sub)
        # Add a subset of a known src nodeset as dst
        self.assertRaises(TopologyError, g.add_route, ns4, ns0_sub)
        # Add a subset of a known dst nodeset as dst
        self.assertRaises(TopologyError, g.add_route, ns4, ns1_sub)
        # src <- subset of -> dst
        ns5 = NodeSet('nodes[50-59]')
        ns5_sub = NodeSet(','.join(ns5[:3:]))
        self.assertRaises(TopologyError, g.add_route, ns5, ns5_sub)
        self.assertRaises(TopologyError, g.add_route, ns5_sub, ns5)

        self.assertEqual(g.dest(ns0), (ns1 | ns2))
        self.assertEqual(g.dest(ns1), ns3)
        self.assertEqual(g.dest(ns2), None)
        self.assertEqual(g.dest(ns3), None)
        self.assertEqual(g.dest(ns4), None)
        self.assertEqual(g.dest(ns5), None)
        self.assertEqual(g.dest(ns0_sub), (ns1 | ns2 | ns4))

        g = TopologyGraph()
        root = NodeSet('root')
        ns01 = NodeSet('nodes[0-1]')
        ns23 = NodeSet('nodes[2-3]')
        ns45 = NodeSet('nodes[4-5]')
        ns67 = NodeSet('nodes[6-7]')
        ns89 = NodeSet('nodes[8-9]')

        g.add_route(root, ns01)
        g.add_route(root, ns23 | ns45)
        self.assertRaises(TopologyError, g.add_route, ns23, ns23)
        self.assertRaises(TopologyError, g.add_route, ns45, root)
        g.add_route(ns23, ns67)
        g.add_route(ns67, ns89)
        self.assertRaises(TopologyError, g.add_route, ns89, ns67)
        self.assertRaises(TopologyError, g.add_route, ns89, ns89)
        self.assertRaises(TopologyError, g.add_route, ns89, ns23)

        ns_all = NodeSet('root,nodes[0-9]')
        for nodegroup in g.to_tree('root'):
            ns_all.difference_update(nodegroup.nodeset)
        self.assertEqual(len(ns_all), 0)
Code example #55
File: ServerTest.py Project: thiell/shine
    def testNodeSet(self):
        """test ServerGroup.nodeset()"""
        srv1 = Server('foo1', ['foo1@tcp'])
        srv2 = Server('foo2', ['foo2@tcp'])
        grp = ServerGroup([srv1, srv2])
        self.assertEqual(grp.nodeset(), NodeSet('foo[1-2]'))
Code example #56
File: dfuse_utils.py Project: kjacque/daos
class Dfuse(DfuseCommand):
    """Class defining an object of type DfuseCommand."""
    def __init__(self, hosts, tmp):
        """Create a dfuse object."""
        super().__init__("/run/dfuse/*", "dfuse")

        # set params
        self.hosts = hosts
        self.tmp = tmp
        self.running_hosts = NodeSet()

    def __del__(self):
        """Destruct the object."""
        if self.running_hosts:
            self.log.error('Dfuse object deleted without shutting down')

    def check_mount_state(self, nodes=None):
        """Check the dfuse mount point mounted state on the hosts.

        Args:
            nodes (NodeSet, optional): hosts on which to check if dfuse is
                mounted. Defaults to None, which will use all of the hosts.

        Returns:
            dict: a dictionary of NodeSets of hosts with the dfuse mount point
                either "mounted", "unmounted", or "nodirectory"

        """
        state = {
            "mounted": NodeSet(),
            "unmounted": NodeSet(),
            "nodirectory": NodeSet()
        }
        if not nodes:
            nodes = NodeSet.fromlist(self.hosts)
        check_mounted = NodeSet()

        # Detect which hosts have mount point directories defined
        command = "test -d {0} -a ! -L {0}".format(self.mount_dir.value)
        retcodes = pcmd(nodes, command, expect_rc=None)
        for retcode, hosts in list(retcodes.items()):
            for host in hosts:
                if retcode == 0:
                    check_mounted.add(host)
                else:
                    command = "grep 'dfuse {}' /proc/mounts".format(
                        self.mount_dir.value)
                    retcodes = pcmd([host], command, expect_rc=None)
                    for ret_code, host_names in list(retcodes.items()):
                        for node in host_names:
                            if ret_code == 0:
                                check_mounted.add(node)
                            else:
                                state["nodirectory"].add(node)

        if check_mounted:
            # Detect which hosts with mount point directories have it mounted as
            # a fuseblk device
            command = "stat -c %T -f {0} | grep -v fuseblk".format(
                self.mount_dir.value)
            retcodes = pcmd(check_mounted, command, expect_rc=None)
            for retcode, hosts in list(retcodes.items()):
                for host in hosts:
                    if retcode == 1:
                        state["mounted"].add(host)
                    else:
                        state["unmounted"].add(host)

        return state

    def get_umount_command(self, force=False):
        """Get the command to umount the dfuse mount point.

        Args:
            force (bool, optional): whether to force the umount with a lazy
                unmount. Defaults to False.

        Returns:
            str: the dfuse umount command

        """
        umount = "-uz" if force else "-u"
        command = [
            "if [ -x '$(command -v fusermount)' ]",
            "then fusermount {0} {1}".format(umount, self.mount_dir.value),
            "else fusermount3 {0} {1}".format(umount, self.mount_dir.value),
            "fi"
        ]
        return ";".join(command)

    def create_mount_point(self):
        """Create dfuse directory.

        Raises:
            CommandFailure: In case of error creating directory

        """
        # Raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        # Create the mount point on any host without dfuse already mounted
        state = self.check_mount_state()
        if state["nodirectory"]:
            command = "mkdir -p {}".format(self.mount_dir.value)
            ret_code = pcmd(state["nodirectory"], command, timeout=30)
            if len(ret_code) > 1 or 0 not in ret_code:
                failed_nodes = [
                    str(node_set) for code, node_set in list(ret_code.items())
                    if code != 0
                ]
                error_hosts = NodeSet(",".join(failed_nodes))
                raise CommandFailure(
                    "Error creating the {} dfuse mount point on the "
                    "following hosts: {}".format(self.mount_dir.value,
                                                 error_hosts))

    def remove_mount_point(self, fail=True):
        """Remove dfuse directory.

        Try once with a simple rmdir, which should succeed; if it does not,
        try again with rm -rf, but still raise an error.

        Raises:
            CommandFailure: In case of error deleting directory

        """
        # raise exception if mount point not specified
        if self.mount_dir.value is None:
            raise CommandFailure("Mount point not specified, "
                                 "check test yaml file")

        dir_exists, clean_nodes = check_file_exists(self.hosts,
                                                    self.mount_dir.value,
                                                    directory=True)
        if dir_exists:
            target_nodes = list(self.hosts)
            if clean_nodes:
                target_nodes.remove(clean_nodes)

            self.log.info("Removing the %s dfuse mount point on %s",
                          self.mount_dir.value, target_nodes)

            cmd = "rmdir {}".format(self.mount_dir.value)
            ret_code = pcmd(target_nodes, cmd, timeout=30)
            if len(ret_code) == 1 and 0 in ret_code:
                return

            failed_nodes = NodeSet(",".join([
                str(node_set) for code, node_set in list(ret_code.items())
                if code != 0
            ]))

            cmd = "rm -rf {}".format(self.mount_dir.value)
            ret_code = pcmd(failed_nodes, cmd, timeout=30)
            if len(ret_code) > 1 or 0 not in ret_code:
                error_hosts = NodeSet(",".join([
                    str(node_set) for code, node_set in list(ret_code.items())
                    if code != 0
                ]))
                if fail:
                    raise CommandFailure(
                        "Error removing the {} dfuse mount point with rm on "
                        "the following hosts: {}".format(
                            self.mount_dir.value, error_hosts))
            if fail:
                raise CommandFailure(
                    "Error removing the {} dfuse mount point with rmdir on the "
                    "following hosts: {}".format(self.mount_dir.value,
                                                 failed_nodes))
        else:
            self.log.info("No %s dfuse mount point directory found on %s",
                          self.mount_dir.value, self.hosts)

    def run(self, check=True):
        # pylint: disable=arguments-differ
        """Run the dfuse command.

        Args:
            check (bool): Check if dfuse mounted properly after
                mount is executed.
        Raises:
            CommandFailure: In case dfuse run command fails

        """
        self.log.info('Starting dfuse at %s', self.mount_dir.value)

        # A log file must be defined to ensure logs are captured
        if "D_LOG_FILE" not in self.env:
            raise CommandFailure(
                "Dfuse missing environment variables for D_LOG_FILE")

        # create dfuse dir if does not exist
        self.create_mount_point()

        # run dfuse command
        cmd = "".join([self.env.get_export_str(), self.__str__()])
        ret_code = pcmd(self.hosts, cmd, timeout=30)

        if 0 in ret_code:
            self.running_hosts.add(ret_code[0])
            del ret_code[0]

        if ret_code:
            error_hosts = NodeSet(",".join([
                str(node_set) for code, node_set in list(ret_code.items())
                if code != 0
            ]))
            raise CommandFailure(
                "Error starting dfuse on the following hosts: {}".format(
                    error_hosts))

        if check:
            # Dfuse will block in the command until the mount completes, even
            # if run in background mode, so it should be possible to start
            # using it immediately after the command returns.
            if not self.check_running(fail_on_error=False):
                self.log.info('Waiting two seconds for dfuse to start')
                time.sleep(2)
                if not self.check_running(fail_on_error=False):
                    self.log.info('Waiting five seconds for dfuse to start')
                    time.sleep(5)
                    self.check_running()

    def check_running(self, fail_on_error=True):
        """Check dfuse is running.

        Run a command to verify dfuse is running on hosts where it is supposed
        to be.  Use grep -v and rc=1 here so that if it isn't, then we can
        see what is being used instead.

        Args:
            fail_on_error (bool, optional): should an exception be raised if an
                error is detected. Defaults to True.

        Raises:
            CommandFailure: raised if dfuse is found not running on any expected
                nodes and fail_on_error is set.

        Returns:
            bool: whether or not dfuse is running

        """
        status = True
        state = self.check_mount_state(self.running_hosts)
        if state["unmounted"] or state["nodirectory"]:
            self.log.error("Error: dfuse not running on %s",
                           str(state["unmounted"].union(state["nodirectory"])))
            status = False
            if fail_on_error:
                raise CommandFailure("dfuse not running")
        return status

    def stop(self):
        """Stop dfuse.

        Try to stop dfuse.  Try once nicely using fusermount, then if that
        fails try pkill to see if that works.  Abort based on the result of
        the fusermount, as if pkill is necessary then dfuse itself has not
        worked correctly.

        Finally, try to remove the mount point, which itself should succeed.

        Raises:
            CommandFailure: In case dfuse stop fails

        """
        # Include all hosts when stopping to ensure all mount points in any
        # state are properly removed
        self.running_hosts.add(NodeSet.fromlist(self.hosts))

        self.log.info("Stopping dfuse at %s on %s", self.mount_dir.value,
                      self.running_hosts)

        if self.mount_dir.value and self.running_hosts:
            error_list = []

            # Loop until all fuseblk mounted devices are unmounted
            counter = 0
            while self.running_hosts and counter < 3:
                # Attempt to kill dfuse after the first unmount attempt fails
                if self.running_hosts and counter > 1:
                    kill_command = "pkill dfuse --signal KILL"
                    pcmd(self.running_hosts, kill_command, timeout=30)

                # Attempt to unmount any fuseblk mounted devices after detection
                if self.running_hosts and counter > 0:
                    pcmd(self.running_hosts,
                         self.get_umount_command(counter > 1),
                         expect_rc=None)
                    time.sleep(2)

                # Detect which hosts have fuseblk mounted devices and remove any
                # hosts which no longer have the dfuse mount point mounted
                state = self.check_mount_state(self.running_hosts)
                for host in state["unmounted"].union(state["nodirectory"]):
                    self.running_hosts.remove(host)

                # Increment the loop counter
                counter += 1

            if self.running_hosts:
                error_list.append("Error stopping dfuse on {}".format(
                    self.running_hosts))

            # Remove mount points
            try:
                self.remove_mount_point()
            except CommandFailure as error:
                error_list.append(error)

            # Report any errors
            if error_list:
                raise CommandFailure("\n".join(error_list))

        elif self.mount_dir.value is None:
            self.log.info("No dfuse mount directory defined - nothing to stop")

        else:
            self.log.info("No hosts running dfuse - nothing to stop")
Code example #57
    def _check_channel_ctl_shell(self,
                                 command,
                                 target,
                                 stderr,
                                 remote,
                                 reply_msg_class,
                                 reply_pattern,
                                 write_buf=None,
                                 timeout=-1,
                                 replycnt=1,
                                 reply_rc=0):
        """helper to check channel shell action"""
        self.channel_send_start()
        msg = self.recvxml(StartMessage)
        self.channel_send_cfg('n1')
        msg = self.recvxml(ACKMessage)

        # prepare a remote shell command request...
        workertree = TreeWorker(nodes=target,
                                handler=None,
                                timeout=timeout,
                                command=command)
        # code snippet from PropagationChannel.shell()
        ctl = ControlMessage(id(workertree))
        ctl.action = 'shell'
        ctl.target = NodeSet(target)

        info = task_self()._info.copy()
        info['debug'] = False

        ctl_data = {
            'cmd': command,
            'invoke_gateway': workertree.invoke_gateway,
            'taskinfo': info,
            'stderr': stderr,
            'timeout': timeout,
            'remote': remote
        }
        ctl.data_encode(ctl_data)
        self.gateway.send(ctl.xml())

        self.recvxml(ACKMessage)

        if write_buf:
            ctl = ControlMessage(id(workertree))
            ctl.action = 'write'
            ctl.target = NodeSet(target)
            ctl_data = {
                'buf': write_buf,
            }
            # Send write message
            ctl.data_encode(ctl_data)
            self.gateway.send(ctl.xml())
            self.recvxml(ACKMessage)

            # Send EOF message
            ctl = ControlMessage(id(workertree))
            ctl.action = 'eof'
            ctl.target = NodeSet(target)
            self.gateway.send(ctl.xml())
            self.recvxml(ACKMessage)

        while replycnt > 0:
            msg = self.recvxml(reply_msg_class)
            replycnt -= len(NodeSet(msg.nodes))
            self.assertTrue(msg.nodes in ctl.target)
            if msg.has_payload or reply_pattern:
                msg_data = msg.data_decode()
                try:
                    if not reply_pattern.search(msg_data):
                        self.assertEqual(
                            msg.data, reply_pattern,
                            'Pattern "%s" not found in data="%s"' %
                            (reply_pattern.pattern, msg_data))
                except AttributeError:
                    # not a regexp
                    self.assertEqual(msg_data, reply_pattern)

        if timeout <= 0:
            msg = self.recvxml(RetcodeMessage)
            self.assertEqual(msg.retcode, reply_rc)

        self.channel_send_stop()
        self.gateway.wait()
        self.gateway.close()
Code example #58
class PropagationTreeRouter(object):
    """performs routes resolving operations within a propagation tree.
    This object provides a next_hop method, that will look for the best
    directly connected node to use to forward a message to a remote
    node.

    Upon instanciation, the router will parse the topology tree to
    generate its routing table.
    """
    def __init__(self, root, topology, fanout=0):
        self.root = root
        self.topology = topology
        self.fanout = fanout
        self.nodes_fanin = {}
        self.table = None

        self.table_generate(root, topology)
        self._unreachable_hosts = NodeSet()

    def table_generate(self, root, topology):
        """The router relies on a routing table. The keys are the
        destination nodes and the values are the next hop gateways to
        use to reach these nodes.
        """
        self.table = {}
        root_group = None

        for entry in topology.groups:
            if root in entry.nodeset:
                root_group = entry
                break

        if root_group is None:
            raise RouteResolvingError('Invalid admin node: %s' % root)

        for group in root_group.children():
            self.table[group.nodeset] = NodeSet()
            stack = [group]
            while len(stack) > 0:
                curr = stack.pop()
                self.table[group.nodeset].add(curr.children_ns())
                stack += curr.children()

        # reverse table (it was crafted backward)
        self.table = dict((v, k) for k, v in self.table.iteritems())

    def dispatch(self, dst):
        """dispatch nodes from a target nodeset to the directly
        connected gateways.

        The method acts as an iterator, returning a gateway and the
        associated hosts. It should provide a rather good load balancing
        between the gateways.
        """
        # Check for directly connected targets
        res = [tmp & dst for tmp in self.table.values()]
        nexthop = NodeSet()
        for tmp in res:
            nexthop.add(tmp)
        if len(nexthop) > 0:
            yield nexthop, nexthop

        # Check for remote targets, that require a gateway to be reached
        for network in self.table.iterkeys():
            dst_inter = network & dst
            dst.difference_update(dst_inter)
            for host in dst_inter.nsiter():
                yield self.next_hop(host), host

    def next_hop(self, dst):
        """perform the next hop resolution. If several hops are
        available, then, the one with the least number of current jobs
        will be returned
        """
        if dst in self._unreachable_hosts:
            raise RouteResolvingError(
                'Invalid destination: %s, host is unreachable' % dst)

        # can't resolve if source == destination
        if self.root == dst:
            raise RouteResolvingError(
                'Invalid resolution request: %s -> %s' % (self.root, dst))

        ## ------------------
        # the routing table is organized this way:
        # 
        #  NETWORK    | NEXT HOP
        # ------------+-----------
        # node[0-9]   | gateway0
        # node[10-19] | gateway[1-2]
        #            ...
        # ---------
        for network, nexthops in self.table.iteritems():
            # destination contained in current network
            if dst in network:
                res = self._best_next_hop(nexthops)
                if res is None:
                    raise RouteResolvingError('No route available to %s' % \
                        str(dst))
                self.nodes_fanin[res] += len(dst)
                return res
            # destination contained in current next hops (ie. directly
            # connected)
            if dst in nexthops:
                return dst

        raise RouteResolvingError(
            'No route from %s to host %s' % (self.root, dst))

    def mark_unreachable(self, dst):
        """mark node dst as unreachable and don't advertise routes
        through it anymore. The cache will be updated only when
        necessary to avoid performing expensive traversals.
        """
        # Simply mark dst as unreachable in a dedicated NodeSet. This
        # list will be consulted by the resolution method
        self._unreachable_hosts.add(dst)

    def _best_next_hop(self, candidates):
        """find out a good next hop gateway"""
        backup = None
        backup_connections = 1e400 # infinity

        candidates = candidates.difference(self._unreachable_hosts)

        for host in candidates:
            # the router tracks established connections in the
            # nodes_fanin table to avoid overloading a gateway
            connections = self.nodes_fanin.setdefault(host, 0)
            # FIXME
            #if connections < self.fanout:
            #    # currently, the first one is the best
            #    return host
            if backup_connections > connections:
                backup = host
                backup_connections = connections
        return backup
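A simplified illustration of the routing-table lookup performed by next_hop(), using a plain list of (network, next hops) pairs; node and gateway names are invented for the example.

# NodeSet containment drives the route lookup.
from ClusterShell.NodeSet import NodeSet

table = [
    (NodeSet("node[0-9]"), NodeSet("gateway0")),
    (NodeSet("node[10-19]"), NodeSet("gateway[1-2]")),
]
dst = "node12"
for network, nexthops in table:
    if dst in network:
        print("forward {} through one of {}".format(dst, nexthops))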
Code example #59
    def load(self):
        """Load Cluster, Nodes and partitions from Architecture files. Raises
           HPCStatsRuntimeError or HPCStatsSourceError if an error is
           encountered while loading data from sources. It sets the
           attributes cluster, nodes and partitions with loaded data.
        """

        self.cluster = Cluster(self.cluster_name)
        self.nodes = []
        self.partitions = {}

        self.read_arch()
        config_get = self.config_get
        partitions = config_get(self.cluster.name, "partitions").split(',')

        for partition in partitions:

            part_sect = self.cluster.name + "/" + partition

            nodegroups = config_get(part_sect, "nodegroups").split(',')
            job_partitions = config_get(part_sect, "job_partitions") \
                               .split(',')

            nodeset_part = NodeSet() # nodeset for the partitions attribute

            for nodegroup in nodegroups:

                nodegroup_sect = self.cluster.name + "/" + partition \
                                 + "/" + nodegroup
                nodenames = config_get(nodegroup_sect, "names")
                nodeset_part.add(nodenames)

                sockets = config_get(nodegroup_sect, "sockets", isint=True)
                cores_per_socket = config_get(nodegroup_sect,
                                              "corespersocket",
                                              isint=True)
                cpu = sockets * cores_per_socket

                float_instructions = config_get(nodegroup_sect,
                                                "floatinstructions",
                                                isint=True)

                freq_str = config_get(nodegroup_sect, "frequency")
                freq = ArchitectureImporterArchfile.convert_freq(freq_str)
                if freq is None:
                    raise HPCStatsSourceError( \
                            "format of frequency for nodeset %s/%s/%s (%s) " \
                            "'%s' is not valid" \
                              % ( self.cluster.name,
                                  partition,
                                  nodegroup,
                                  nodenames,
                                  freq_str ))

                flops = sockets * cores_per_socket * float_instructions * freq

                mem_str = config_get(nodegroup_sect, "memory")
                mem = ArchitectureImporterArchfile.convert_mem(mem_str)
                if mem is None:
                    raise HPCStatsSourceError( \
                            "format of memory for nodeset %s/%s/%s (%s) " \
                            "'%s' is not valid" \
                              % ( self.cluster.name,
                                  partition,
                                  nodegroup,
                                  nodenames,
                                  mem_str ))

                model = config_get(nodegroup_sect, "model")
            
                nodeset_group = NodeSet(nodenames)
                for nodename in nodeset_group:
                    # create and append node
                    new_node = Node(name=nodename,
                                    cluster=self.cluster,
                                    model=model,
                                    partition=partition,
                                    cpu=cpu,
                                    memory=mem,
                                    flops=flops)
                    self.nodes.append(new_node)

            self.partitions[str(nodeset_part)] = job_partitions
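The partitions attribute built above is keyed by the folded string form of the aggregated NodeSet; a small illustration of that folding behaviour follows (node names are invented).

from ClusterShell.NodeSet import NodeSet

nodeset_part = NodeSet()
nodeset_part.add("cn[1-4]")
nodeset_part.add("cn[5-8]")
print(str(nodeset_part))  # -> cn[1-8]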
Code example #60
    def test_bashcmd(self):
        """Jira ID: DAOS-3508.

        Test Description:
            Purpose of this test is to mount different mount points of dfuse
            for different container and pool sizes and perform basic bash
            commands.

        Use cases:
            The following bash commands are exercised as part of this test:
            mkdir, touch, ls, chmod, rm, dd, stat, cp, cmp, mv, rmdir.
              Create a directory.
              Create a file under that directory.
              List the created file.
              Remove the file.
              Write a file to the dfuse mounted location using dd.
              List the written file to verify it was created.
              Verify the created file has the desired size.
              Copy the file.
              Compare the copied file with the original to verify the
              content is the same.
              Remove the copied file.
              Rename the file.
              Verify the renamed file exists using list.
              Remove the directory.

        :avocado: tags=all,hw,daosio,medium,ib2,full_regression,bashcmd
        """
        dir_name = self.params.get("dirname", '/run/bashcmd/*')
        file_name1 = self.params.get("filename1", '/run/bashcmd/*')
        file_name2 = self.params.get("filename2", '/run/bashcmd/*')
        dd_count = self.params.get("dd_count", '/run/bashcmd/*')
        dd_blocksize = self.params.get("dd_blocksize", '/run/bashcmd/*')
        pool_count = self.params.get("pool_count", '/run/pool/*')
        cont_count = self.params.get("cont_count", '/run/container/*')

        # Create the requested number of pools.
        for _ in range(pool_count):
            self.add_pool(connect=False)
            # perform test for multiple containers.
            for count in range(cont_count):
                self.add_container(self.pool)
                mount_dir = "/tmp/{}_daos_dfuse{}".format(
                    self.pool.uuid, count)
                self.start_dfuse(self.hostlist_clients, self.pool,
                                 self.container, mount_dir)
                abs_dir_path = os.path.join(self.dfuse.mount_dir.value,
                                            dir_name)
                abs_file_path1 = os.path.join(abs_dir_path, file_name1)
                abs_file_path2 = os.path.join(abs_dir_path, file_name2)
                # list of commands to be executed.
                commands = [
                    "mkdir -p {}".format(abs_dir_path),
                    "touch {}".format(abs_file_path1),
                    "ls -a {}".format(abs_file_path1),
                    "rm {}".format(abs_file_path1),
                    "dd if=/dev/zero of={} count={} bs={}".format(
                        abs_file_path1, dd_count, dd_blocksize),
                    "ls -al {}".format(abs_file_path1),
                    "filesize=$(stat -c%s '{}');\
                            if (( filesize != {}*{} )); then exit 1;\
                            fi".format(abs_file_path1, dd_count, dd_blocksize),
                    "cp -r {} {}".format(abs_file_path1, abs_file_path2),
                    "cmp --silent {} {}".format(abs_file_path1, abs_file_path2),
                    "rm {}".format(abs_file_path2),
                    "mv {} {}".format(abs_file_path1, abs_file_path2),
                    "ls -al {}".format(abs_file_path2),
                    "rm {}".format(abs_file_path2),
                    "rmdir {}".format(abs_dir_path)
                ]
                for cmd in commands:
                    try:
                        # execute bash cmds
                        ret_code = general_utils.pcmd(self.hostlist_clients,
                                                      cmd,
                                                      timeout=30)
                        if 0 not in ret_code:
                            error_hosts = NodeSet(",".join([
                                str(node_set)
                                for code, node_set in list(ret_code.items())
                                if code != 0
                            ]))
                            raise CommandFailure(
                                "Error running '{}' on the following "
                                "hosts: {}".format(cmd, error_hosts))
                    # report error if any command fails
                    except CommandFailure as error:
                        self.log.error("BashCmd Test Failed: %s", str(error))
                        self.fail("Test was expected to pass but "
                                  "it failed.\n")

                # stop dfuse
                self.stop_dfuse()
                # destroy container
                self.container.destroy()
            # destroy pool
            self.pool.destroy()