Example #1
0
    def WaitForNodeToComeUp(self, node, Timeout=300):
        '''Return TRUE when given node comes up, or None/FALSE if timeout'''
        timeout = Timeout
        anytimeouts = 0
        while timeout > 0:
            if self.IsNodeBooted(node) and self.IsSshdUp(node):
                if anytimeouts:
                    # Fudge to wait for the system to finish coming up
                    time.sleep(30)
                    LogFactory().debug("Node %s now up" % node)
                return 1

            time.sleep(30)
            if (not anytimeouts):
                LogFactory().debug("Waiting for node %s to come up" % node)

            anytimeouts = 1
            timeout = timeout - 1

        LogFactory().log("%s did not come up within %d tries" %
                         (node, Timeout))
        if self.Env["continue"] == 1:
            answer = "Y"
        else:
            try:
                answer = input_wrapper('Continue? [nY]')
            except EOFError as e:
                answer = "n"
        if answer and answer == "n":
            raise ValueError("%s did not come up within %d tries" %
                             (node, Timeout))
Example #2
0
    def WaitForNodeToComeUp(self, node, Timeout=300):
        '''Return TRUE when given node comes up, or None/FALSE if timeout'''
        timeout = Timeout
        anytimeouts = 0
        while timeout > 0:
            if self.IsNodeBooted(node) and self.IsSshdUp(node):
                if anytimeouts:
                     # Fudge to wait for the system to finish coming up
                     time.sleep(30)
                     LogFactory().debug("Node %s now up" % node)
                return 1

            time.sleep(30)
            if (not anytimeouts):
                LogFactory().debug("Waiting for node %s to come up" % node)

            anytimeouts = 1
            timeout = timeout - 1

        LogFactory().log("%s did not come up within %d tries" % (node, Timeout))
        if self.Env["continue"] == 1:
            answer = "Y"
        else:
            try:
                answer = input_wrapper('Continue? [nY]')
            except EOFError as e:
                answer = "n"
        if answer and answer == "n":
            raise ValueError("%s did not come up within %d tries" % (node, Timeout))
Example #3
0
    def audit(self, LocalIgnore=[]):
        errcount = 0
        ignorelist = []
        ignorelist.append("CTS:")
        ignorelist.extend(LocalIgnore)
        ignorelist.extend(self.ClusterManager.errorstoignore())
        ignorelist.extend(self.ClusterManager.instance_errorstoignore())

        # This makes sure everything is stabilized before starting...
        failed = 0
        for audit in self.Audits:
            if not audit():
                self.ClusterManager.log("Audit " + audit.name() + " FAILED.")
                failed += 1
            else:
                self.ClusterManager.debug("Audit " + audit.name() + " passed.")

        while errcount < 1000:
            match = None
            if self.BadNews:
                match = self.BadNews.look(0)

            if match:
                add_err = 1
                for ignore in ignorelist:
                    if add_err == 1 and re.search(ignore, match):
                        add_err = 0
                if add_err == 1:
                    self.ClusterManager.log("BadNews: " + match)
                    self.incr("BadNews")
                    errcount = errcount + 1
            else:
                break
        else:
            if self.ClusterManager.Env["continue"] == 1:
                answer = "Y"
            else:
                try:
                    answer = input_wrapper('Big problems. Continue? [nY]')
                except EOFError as e:
                    answer = "n"
            if answer and answer == "n":
                self.ClusterManager.log("Shutting down.")
                self.summarize()
                self.TearDown()
                raise ValueError("Looks like we hit a BadNews jackpot!")

        if self.BadNews:
            self.BadNews.end()
        return failed
Example #4
0
    def audit(self, LocalIgnore=[]):
        errcount = 0
        ignorelist = []
        ignorelist.append("CTS:")
        ignorelist.extend(LocalIgnore)
        ignorelist.extend(self.ClusterManager.errorstoignore())
        ignorelist.extend(self.ClusterManager.instance_errorstoignore())

        # This makes sure everything is stabilized before starting...
        failed = 0
        for audit in self.Audits:
            if not audit():
                self.ClusterManager.log("Audit " + audit.name() + " FAILED.")
                failed += 1
            else:
                self.ClusterManager.debug("Audit " + audit.name() + " passed.")

        while errcount < 1000:
            match = None
            if self.BadNews:
                match = self.BadNews.look(0)

            if match:
                add_err = 1
                for ignore in ignorelist:
                    if add_err == 1 and re.search(ignore, match):
                        add_err = 0
                if add_err == 1:
                    self.ClusterManager.log("BadNews: " + match)
                    self.incr("BadNews")
                    errcount = errcount + 1
            else:
                break
        else:
            if self.ClusterManager.Env["continue"] == 1:
                answer = "Y"
            else:
                try:
                    answer = input_wrapper('Big problems. Continue? [nY]')
                except EOFError as e:
                    answer = "n"
            if answer and answer == "n":
                self.ClusterManager.log("Shutting down.")
                self.summarize()
                self.TearDown()
                raise ValueError("Looks like we hit a BadNews jackpot!")

        if self.BadNews:
            self.BadNews.end()
        return failed
Example #5
0
    def __call__(self):
        result = 1
        # @TODO Use directory of PCMK_logfile if set on host
        dfcmd = "df -BM " + CTSvars.CRM_LOG_DIR + " | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%'"

        self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"])
        for node in self.CM.Env["nodes"]:
            dfout = self.CM.rsh(node, dfcmd, 1)
            if not dfout:
                self.CM.log(
                    "ERROR: Cannot execute remote df command [%s] on %s" %
                    (dfcmd, node))
            else:
                try:
                    (used, remain) = dfout.split()
                    used_percent = int(used)
                    remaining_mb = int(remain)
                except (ValueError, TypeError):
                    self.CM.log(
                        "Warning: df output '%s' from %s was invalid [%s, %s]"
                        % (dfout, node, used, remain))
                else:
                    if remaining_mb < 10 or used_percent > 95:
                        self.CM.log(
                            "CRIT: Out of log disk space on %s (%d%% / %dMB)" %
                            (node, used_percent, remaining_mb))
                        result = None
                        if self.CM.Env["continue"] == 1:
                            answer = "Y"
                        else:
                            try:
                                answer = input_wrapper('Continue? [nY]')
                            except EOFError as e:
                                answer = "n"

                        if answer and answer == "n":
                            raise ValueError("Disk full on %s" % (node))
                            ret = 0

                    elif remaining_mb < 100 or used_percent > 90:
                        self.CM.log(
                            "WARN: Low on log disk space (%dMB) on %s" %
                            (remaining_mb, node))
        return result
Example #6
0
    def __call__(self):
        result = 1
        # @TODO Use directory of PCMK_logfile if set on host
        dfcmd = "df -BM " + CTSvars.CRM_LOG_DIR + " | tail -1 | awk '{print $(NF-1)\" \"$(NF-2)}' | tr -d 'M%'"

        self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"])
        for node in self.CM.Env["nodes"]:
            dfout = self.CM.rsh(node, dfcmd, 1)
            if not dfout:
                self.CM.log ("ERROR: Cannot execute remote df command [%s] on %s" % (dfcmd, node))
            else:
                try:
                    (used, remain) = dfout.split()
                    used_percent = int(used)
                    remaining_mb = int(remain)
                except (ValueError, TypeError):
                    self.CM.log("Warning: df output '%s' from %s was invalid [%s, %s]"
                                % (dfout, node, used, remain))
                else:
                    if remaining_mb < 10 or used_percent > 95:
                        self.CM.log("CRIT: Out of log disk space on %s (%d%% / %dMB)"
                                    % (node, used_percent, remaining_mb))
                        result = None
                        if self.CM.Env["continue"] == 1:
                            answer = "Y"
                        else:
                            try:
                                answer = input_wrapper('Continue? [nY]')
                            except EOFError as e:
                                answer = "n"

                        if answer and answer == "n":
                            raise ValueError("Disk full on %s" % (node))
                            ret = 0

                    elif remaining_mb < 100 or used_percent > 90:
                        self.CM.log("WARN: Low on log disk space (%dMB) on %s" % (remaining_mb, node))
        return result
Example #7
0
    def run_test(self, test, testcount):
        nodechoice = self.ClusterManager.Env.RandomNode()

        ret = 1
        where = ""
        did_run = 0

        self.ClusterManager.StatsMark(testcount)
        self.ClusterManager.instance_errorstoignore_clear()
        self.ClusterManager.log(("Running test %s" % test.name).ljust(35) +
                                (" (%s) " % nodechoice).ljust(15) + "[" +
                                ("%d" % testcount).rjust(3) + "]")

        starttime = test.set_timer()
        if not test.setup(nodechoice):
            self.ClusterManager.log("Setup failed")
            ret = 0

        elif not test.canrunnow(nodechoice):
            self.ClusterManager.log("Skipped")
            test.skipped()

        else:
            did_run = 1
            ret = test(nodechoice)

        if not test.teardown(nodechoice):
            self.ClusterManager.log("Teardown failed")
            if self.ClusterManager.Env["continue"] == 1:
                answer = "Y"
            else:
                try:
                    answer = input_wrapper('Continue? [nY]')
                except EOFError as e:
                    answer = "n"
            if answer and answer == "n":
                raise ValueError("Teardown of %s on %s failed" %
                                 (test.name, nodechoice))
            ret = 0

        stoptime = time.time()
        self.ClusterManager.oprofileSave(testcount)

        elapsed_time = stoptime - starttime
        test_time = stoptime - test.get_timer()
        if not test["min_time"]:
            test["elapsed_time"] = elapsed_time
            test["min_time"] = test_time
            test["max_time"] = test_time
        else:
            test["elapsed_time"] = test["elapsed_time"] + elapsed_time
            if test_time < test["min_time"]:
                test["min_time"] = test_time
            if test_time > test["max_time"]:
                test["max_time"] = test_time

        if ret:
            self.incr("success")
            test.log_timer()
        else:
            self.incr("failure")
            self.ClusterManager.statall()
            did_run = 1  # Force the test count to be incremented anyway so test extraction works

        self.audit(test.errorstoignore())
        return did_run
Example #8
0
    def run_test(self, test, testcount):
        nodechoice = self.ClusterManager.Env.RandomNode()

        ret = 1
        where = ""
        did_run = 0

        self.ClusterManager.StatsMark(testcount)
        self.ClusterManager.instance_errorstoignore_clear()
        self.ClusterManager.log(("Running test %s" % test.name).ljust(35) + (" (%s) " % nodechoice).ljust(15) + "[" + ("%d" % testcount).rjust(3) + "]")

        starttime = test.set_timer()
        if not test.setup(nodechoice):
            self.ClusterManager.log("Setup failed")
            ret = 0

        elif not test.canrunnow(nodechoice):
            self.ClusterManager.log("Skipped")
            test.skipped()

        else:
            did_run = 1
            ret = test(nodechoice)

        if not test.teardown(nodechoice):
            self.ClusterManager.log("Teardown failed")
            if self.ClusterManager.Env["continue"] == 1:
                answer = "Y"
            else:
                try:
                    answer = input_wrapper('Continue? [nY]')
                except EOFError as e:
                    answer = "n"
            if answer and answer == "n":
                raise ValueError("Teardown of %s on %s failed" % (test.name, nodechoice))
            ret = 0

        stoptime = time.time()
        self.ClusterManager.oprofileSave(testcount)

        elapsed_time = stoptime - starttime
        test_time = stoptime - test.get_timer()
        if not test["min_time"]:
            test["elapsed_time"] = elapsed_time
            test["min_time"] = test_time
            test["max_time"] = test_time
        else:
            test["elapsed_time"] = test["elapsed_time"] + elapsed_time
            if test_time < test["min_time"]:
                test["min_time"] = test_time
            if test_time > test["max_time"]:
                test["max_time"] = test_time

        if ret:
            self.incr("success")
            test.log_timer()
        else:
            self.incr("failure")
            self.ClusterManager.statall()
            did_run = 1  # Force the test count to be incremented anyway so test extraction works

        self.audit(test.errorstoignore())
        return did_run