class OpTestHMIHandling:
    ## Initialize this object
    #  @param i_bmcIP The IP address of the BMC
    #  @param i_bmcUser The userid to log into the BMC with
    #  @param i_bmcPasswd The password of the userid to log into the BMC with
    #  @param i_bmcUserIpmi The userid to issue the BMC IPMI commands with
    #  @param i_bmcPasswdIpmi The password of BMC IPMI userid
    #  @param i_ffdcDir Optional param to indicate where to write FFDC
    #
    # "Only required for inband tests" else Default = None
    # @param i_lparIP The IP address of the LPAR
    # @param i_lparuser The userid to log into the LPAR
    # @param i_lparPasswd The password of the userid to log into the LPAR with
    #
    def __init__(
        self,
        i_bmcIP,
        i_bmcUser,
        i_bmcPasswd,
        i_bmcUserIpmi,
        i_bmcPasswdIpmi,
        i_ffdcDir=None,
        i_lparip=None,
        i_lparuser=None,
        i_lparPasswd=None,
    ):
        self.cv_BMC = OpTestBMC(i_bmcIP, i_bmcUser, i_bmcPasswd, i_ffdcDir)
        self.cv_IPMI = OpTestIPMI(
            i_bmcIP, i_bmcUserIpmi, i_bmcPasswdIpmi, i_ffdcDir, i_lparip, i_lparuser, i_lparPasswd
        )
        self.cv_LPAR = OpTestLpar(i_lparip, i_lparuser, i_lparPasswd, i_bmcIP)
        self.cv_SYSTEM = OpTestSystem(
            i_bmcIP,
            i_bmcUser,
            i_bmcPasswd,
            i_bmcUserIpmi,
            i_bmcPasswdIpmi,
            i_ffdcDir,
            i_lparip,
            i_lparuser,
            i_lparPasswd,
        )
        self.util = OpTestUtil()

    ##
    # @brief This is a common function for all the hmi test cases. This will be executed before
    #        any test case starts. Basically this provides below requirements.
    #        1. Validates all required lpar commands
    #        2. It will clone skiboot source repository
    #        3. Compile the necessary tools xscom-utils and gard utility to test HMI.
    #        4. Get the list Of Chips and cores in the form of dictionary.
    #           Ex: [['00000000', ['4', '5', '6', 'c', 'd', 'e']], ['00000001', ['4', '5', '6', 'c', 'd', 'e']], ['00000010', ['4', '5', '6', 'c', 'd', 'e']]]
    #        5. In-order to inject HMI errors on cpu's, cpu should be running,
    #           so disabling the sleep states 1 and 2 of all CPU's.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_init(self):
        # Get OS level
        self.cv_LPAR.lpar_get_OS_Level()

        # Check whether git and gcc commands are available on lpar
        self.cv_LPAR.lpar_check_command("git")
        self.cv_LPAR.lpar_check_command("gcc")

        # It will clone skiboot source repository
        l_dir = "/tmp/skiboot"
        self.cv_LPAR.lpar_clone_skiboot_source(l_dir)
        # Compile the necessary tools xscom-utils and gard utility
        self.cv_LPAR.lpar_compile_xscom_utilities(l_dir)
        self.cv_LPAR.lpar_compile_gard_utility(l_dir)

        # Getting list of processor chip Id's(executing getscom -l to get chip id's)
        l_res = self.cv_LPAR.lpar_run_command("cd %s/external/xscom-utils/; ./getscom -l" % l_dir)
        l_res = l_res.splitlines()
        l_chips = []
        for line in l_res:
            matchObj = re.search("(\d{8}).*processor", line)
            if matchObj:
                l_chips.append(matchObj.group(1))
        if not l_chips:
            l_msg = "Getscom failed to list processor chip id's"
            raise OpTestError(l_msg)
        l_chips.sort()
        print l_chips  # ['00000000', '00000001', '00000010']

        # Currently getting the list of active core id's with respect to each chip is by using opal msg log
        # TODO: Need to identify best way to get list of cores(If Opal msg log is empty)
        l_cmd = "cat /sys/firmware/opal/msglog | grep -i CHIP"
        l_res = self.cv_LPAR.lpar_run_command(l_cmd)
        l_cores = {}
        self.l_dic = []
        l_res = l_res.splitlines()
        for line in l_res:
            matchObj = re.search("Chip (\d{1,2}) Core ([a-z0-9])", line)
            if matchObj:
                if l_cores.has_key(int(matchObj.group(1))):
                    (l_cores[int(matchObj.group(1))]).append(matchObj.group(2))
                else:
                    l_cores[int(matchObj.group(1))] = list(matchObj.group(2))
        if not l_cores:
            l_msg = "Failed in getting core id's information from OPAL msg log"
            raise OpTestError(l_msg)

        print l_cores  # {0: ['4', '5', '6', 'c', 'd', 'e'], 1: ['4', '5', '6', 'c', 'd', 'e'], 10: ['4', '5', '6', 'c', 'd', 'e']}
        l_cores = sorted(l_cores.iteritems())
        print l_cores
        i = 0
        for tup in l_cores:
            new_list = [l_chips[i], tup[1]]
            self.l_dic.append(new_list)
            i += 1
        print self.l_dic
        # self.l_dic is a list of chip id's, core id's . and is of below format
        # [['00000000', ['4', '5', '6', 'c', 'd', 'e']], ['00000001', ['4', '5', '6', 'c', 'd', 'e']], ['00000010', ['4', '5', '6', 'c', 'd', 'e']]]

        self.l_dir = l_dir
        # In-order to inject HMI errors on cpu's, cpu should be running, so disabling the sleep states 1 and 2 of all CPU's
        self.cv_LPAR.lpar_run_command(BMC_CONST.GET_CPU_SLEEP_STATE2)
        self.cv_LPAR.lpar_run_command(BMC_CONST.GET_CPU_SLEEP_STATE1)
        self.cv_LPAR.lpar_run_command(BMC_CONST.GET_CPU_SLEEP_STATE0)
        self.cv_LPAR.lpar_run_command(BMC_CONST.DISABLE_CPU_SLEEP_STATE1)
        self.cv_LPAR.lpar_run_command(BMC_CONST.DISABLE_CPU_SLEEP_STATE2)
        self.cv_LPAR.lpar_run_command(BMC_CONST.GET_CPU_SLEEP_STATE2)
        self.cv_LPAR.lpar_run_command(BMC_CONST.GET_CPU_SLEEP_STATE1)
        self.cv_LPAR.lpar_run_command(BMC_CONST.GET_CPU_SLEEP_STATE0)

    ##
    # @brief This function is mainly used to clear hardware gard entries.
    #        It will perform below steps
    #           1. Reboot the system(Power off/on)
    #           2. Clear any Hardware gard entries
    #           3. Again reboot the system, to make use of garded Hardware.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def clearGardEntries(self):
        # Power off and on the system.
        self.cv_IPMI.ipmi_power_off()
        self.cv_IPMI.ipmi_power_on()
        if int(self.cv_SYSTEM.sys_ipl_wait_for_working_state()):
            l_msg = "System failed to boot host OS"
            raise OpTestError(l_msg)
        time.sleep(BMC_CONST.LPAR_BRINGUP_TIME)

        # Clearing gard entries after lpar comes up
        self.cv_LPAR.lpar_get_OS_Level()
        l_con = self.cv_SYSTEM.sys_get_ipmi_console()
        self.cv_IPMI.ipmi_lpar_login(l_con)
        self.cv_IPMI.ipmi_lpar_set_unique_prompt(l_con)
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("uname -a")
        l_dir = "/tmp/skiboot"
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("cd %s/external/gard/;" % l_dir)
        l_cmd = "./gard list; echo $?"
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console(l_cmd)
        l_cmd = "./gard clear all; echo $?"
        l_res = self.cv_IPMI.run_lpar_cmd_on_ipmi_console(l_cmd)
        if int(l_res[-1]):
            l_msg = "Clearing gard entries through gard tool is failed"
            raise OpTestError(l_msg)
        l_cmd = "./gard list; echo $?"
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console(l_cmd)

        # Rebooting the system again to make use of garded hardware
        self.cv_IPMI.ipmi_power_off()
        self.cv_IPMI.ipmi_power_on()
        if int(self.cv_SYSTEM.sys_ipl_wait_for_working_state()):
            l_msg = "System failed to boot host OS"
            raise OpTestError(l_msg)
        time.sleep(BMC_CONST.LPAR_BRINGUP_TIME)
        self.cv_LPAR.lpar_get_OS_Level()
        self.cv_SYSTEM.sys_ipmi_close_console(l_con)

    ##
    # @brief This function executes HMI test case based on the i_test value, Before test starts
    #        disabling kdump service to make sure system reboots, after injecting non-recoverable errors.
    #
    # @param i_test @type int: this is the type of test case want to execute
    #                          BMC_CONST.HMI_PROC_RECV_DONE: Processor recovery done
    #                          BMC_CONST.HMI_PROC_RECV_ERROR_MASKED: proc_recv_error_masked
    #                          BMC_CONST.HMI_MALFUNCTION_ALERT: malfunction_alert
    #                          BMC_CONST.HMI_HYPERVISOR_RESOURCE_ERROR: hypervisor resource error
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def testHMIHandling(self, i_test):
        l_test = i_test
        self.test_init()
        l_con = self.cv_SYSTEM.sys_get_ipmi_console()
        self.cv_IPMI.ipmi_lpar_login(l_con)
        self.cv_IPMI.ipmi_lpar_set_unique_prompt(l_con)
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("uname -a")
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("cat /etc/os-release")
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("service kdump status")
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("service kdump stop")
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("service kdump status")
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("cd %s/external/xscom-utils/;" % self.l_dir)
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("lscpu")
        self.cv_IPMI.run_lpar_cmd_on_ipmi_console("dmesg -D")
        if l_test == BMC_CONST.HMI_PROC_RECV_DONE:
            self.test_proc_recv_done()
        elif l_test == BMC_CONST.HMI_PROC_RECV_ERROR_MASKED:
            self.test_proc_recv_error_masked()
        elif l_test == BMC_CONST.HMI_MALFUNCTION_ALERT:
            self.test_malfunction_allert()
        elif l_test == BMC_CONST.HMI_HYPERVISOR_RESOURCE_ERROR:
            self.test_hyp_resource_err()
        else:
            l_msg = "Please provide valid test case"
            raise OpTestError(l_msg)
        self.cv_SYSTEM.sys_ipmi_close_console(l_con)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function is used to test HMI: processor recovery done
    #        and also this function injecting error on all the cpus one by one and
    #        verify whether cpu is recovered or not.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_proc_recv_done(self):
        for l_pair in self.l_dic:
            l_chip = l_pair[0]
            for l_core in l_pair[1]:
                l_reg = "1%s013100" % l_core
                l_cmd = "./putscom -c %s %s 0000000000100000; echo $?" % (l_chip, l_reg)

                self.cv_IPMI.run_lpar_cmd_on_ipmi_console("dmesg -C")
                time.sleep(10)
                l_res = self.cv_IPMI.run_lpar_cmd_on_ipmi_console(l_cmd)
                time.sleep(10)
                if l_res[-1] == "0":
                    print "Injected thread hang recoverable error"
                else:
                    if any("Kernel panic - not syncing" in line for line in l_res):
                        l_msg = "Processor recovery failed: Kernel got panic"
                    elif any("Petitboot" in line for line in l_res):
                        l_msg = "System reached petitboot:Processor recovery failed"
                    elif any("ISTEP" in line for line in l_res):
                        l_msg = "System started booting: Processor recovery failed"
                    else:
                        l_msg = "Failed to inject thread hang recoverable error"
                    print l_msg
                    raise OpTestError(l_msg)

                l_res = self.cv_IPMI.run_lpar_cmd_on_ipmi_console("dmesg")
                if any("Processor Recovery done" in line for line in l_res) and any(
                    "Harmless Hypervisor Maintenance interrupt [Recovered]" in line for line in l_res
                ):
                    print "Processor recovery done"
                else:
                    l_msg = "HMI handling failed to log message: for proc_recv_done"
                    raise OpTestError(l_msg)
                time.sleep(BMC_CONST.HMI_TEST_CASE_SLEEP_TIME)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function is used to test HMI: proc_recv_error_masked
    #        Processor went through recovery for an error which is actually masked for reporting
    #        this function also injecting the error on all the cpu's one-by-one.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_proc_recv_error_masked(self):
        for l_pair in self.l_dic:
            l_chip = l_pair[0]
            for l_core in l_pair[1]:
                l_reg = "1%s013100" % l_core
                l_cmd = "./putscom -c %s %s 0000000000080000; echo $?" % (l_chip, l_reg)
                self.cv_IPMI.run_lpar_cmd_on_ipmi_console("dmesg -C")
                time.sleep(10)
                l_res = self.cv_IPMI.run_lpar_cmd_on_ipmi_console(l_cmd)
                time.sleep(10)
                if l_res[-1] == "0":
                    print "Injected thread hang recoverable error"
                else:
                    if any("Kernel panic - not syncing" in line for line in l_res):
                        l_msg = "Processor recovery failed: Kernel got panic"
                    elif any("Petitboot" in line for line in l_res):
                        l_msg = "System reached petitboot:Processor recovery failed"
                    elif any("ISTEP" in line for line in l_res):
                        l_msg = "System started booting: Processor recovery failed"
                    else:
                        l_msg = "Failed to inject thread hang recoverable error"
                    print l_msg
                    raise OpTestError(l_msg)

                l_res = self.cv_IPMI.run_lpar_cmd_on_ipmi_console("dmesg")
                if any("Processor Recovery done" in line for line in l_res) and any(
                    "Harmless Hypervisor Maintenance interrupt [Recovered]" in line for line in l_res
                ):
                    print "Processor recovery done"
                else:
                    l_msg = "HMI handling failed to log message"
                    raise OpTestError(l_msg)
                time.sleep(BMC_CONST.HMI_TEST_CASE_SLEEP_TIME)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function is used to test hmi malfunction alert:Core checkstop
    #        A processor core in the system has to be checkstopped (failed recovery).
    #        Injecting core checkstop on random core of random chip
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_malfunction_allert(self):
        # Get random pair of chip vs cores
        l_pair = random.choice(self.l_dic)
        # Get random chip id
        l_chip = l_pair[0]
        # Get random core number
        l_core = random.choice(l_pair[1])

        l_reg = "1%s013100" % l_core
        l_cmd = "./putscom -c %s %s 1000000000000000" % (l_chip, l_reg)

        l_res = self.cv_IPMI.run_lpar_cmd_on_ipmi_console(l_cmd)
        if any("Kernel panic - not syncing" in line for line in l_res):
            print "Malfunction alert: kernel got panic"
        elif any("login:"******"System booted to host OS without any kernel panic message"
        elif any("Petitboot" in line for line in l_res):
            print "System reached petitboot without any kernel panic message"
        elif any("ISTEP" in line for line in l_res):
            print "System started booting without any kernel panic message"
        else:
            l_msg = "HMI: Malfunction alert failed"
            raise OpTestError(l_msg)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function is used to test HMI: Hypervisor resource error
    #        Injecting Hypervisor resource error on random core of random chip
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_hyp_resource_err(self):
        # Get random pair of chip vs cores
        l_pair = random.choice(self.l_dic)
        # Get random chip id
        l_chip = l_pair[0]
        # Get random core number
        l_core = random.choice(l_pair[1])

        l_reg = "1%s013100" % l_core
        l_cmd = "./putscom -c %s %s 0000000000008000" % (l_chip, l_reg)

        l_res = self.cv_IPMI.run_lpar_cmd_on_ipmi_console(l_cmd)
        if any("Kernel panic - not syncing" in line for line in l_res) and any(
            "Hypervisor Resource error - core check stop" in line for line in l_res
        ):
            print "Hypervisor resource error: kernel got panic"
        elif any("login:"******"System booted to host OS without any kernel panic message"
        elif any("Petitboot" in line for line in l_res):
            print "System reached petitboot without any kernel panic message"
        elif any("ISTEP" in line for line in l_res):
            print "System started booting without any kernel panic message"
        else:
            l_msg = "HMI: Hypervisor resource error failed"
            raise OpTestError(l_msg)
        return BMC_CONST.FW_SUCCESS
Esempio n. 2
0
class OpTestHMIHandling():
    ## Initialize this object
    #  @param i_bmcIP The IP address of the BMC
    #  @param i_bmcUser The userid to log into the BMC with
    #  @param i_bmcPasswd The password of the userid to log into the BMC with
    #  @param i_bmcUserIpmi The userid to issue the BMC IPMI commands with
    #  @param i_bmcPasswdIpmi The password of BMC IPMI userid
    #  @param i_ffdcDir Optional param to indicate where to write FFDC
    #
    # "Only required for inband tests" else Default = None
    # @param i_hostIP The IP address of the Host
    # @param i_hostuser The userid to log into the Host
    # @param i_hostPasswd The password of the userid to log into the Host with
    #
    def __init__(self, i_bmcIP, i_bmcUser, i_bmcPasswd,
                 i_bmcUserIpmi, i_bmcPasswdIpmi, i_ffdcDir=None, i_hostip=None,
                 i_hostuser=None, i_hostPasswd=None):
        self.cv_BMC = OpTestBMC(i_bmcIP, i_bmcUser, i_bmcPasswd, i_ffdcDir)
        self.cv_IPMI = OpTestIPMI(i_bmcIP, i_bmcUserIpmi, i_bmcPasswdIpmi,
                                  i_ffdcDir, i_hostip, i_hostuser, i_hostPasswd)
        self.cv_HOST = OpTestHost(i_hostip, i_hostuser, i_hostPasswd, i_bmcIP, i_ffdcDir)
        self.cv_SYSTEM = OpTestSystem(i_bmcIP, i_bmcUser, i_bmcPasswd,
                 i_bmcUserIpmi, i_bmcPasswdIpmi, i_ffdcDir, i_hostip,
                 i_hostuser, i_hostPasswd)
        self.util = OpTestUtil()

    ##
    # @brief This is a common function for all the hmi test cases. This will be executed before
    #        any test case starts. Basically this provides below requirements.
    #        1. Validates all required host commands
    #        2. It will clone skiboot source repository
    #        3. Compile the necessary tools xscom-utils and gard utility to test HMI.
    #        4. Get the list Of Chips and cores in the form of dictionary.
    #           Ex: [['00000000', ['4', '5', '6', 'c', 'd', 'e']], ['00000001', ['4', '5', '6', 'c', 'd', 'e']], ['00000010', ['4', '5', '6', 'c', 'd', 'e']]]
    #        5. In-order to inject HMI errors on cpu's, cpu should be running,
    #           so disabling the sleep states 1 and 2 of all CPU's.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_init(self):
        self.cv_SYSTEM.sys_bmc_power_on_validate_host()

        # Get OS level
        self.l_oslevel = self.cv_HOST.host_get_OS_Level()

        # Check whether git and gcc commands are available on the host
        self.cv_HOST.host_check_command("git")
        self.cv_HOST.host_check_command("gcc")

        # It will clone skiboot source repository
        l_dir = "/tmp/skiboot"
        self.cv_HOST.host_clone_skiboot_source(l_dir)
        # Compile the necessary tools xscom-utils and gard utility
        self.cv_HOST.host_compile_xscom_utilities(l_dir)
        self.cv_HOST.host_compile_gard_utility(l_dir)

        # Getting list of processor chip Id's(executing getscom -l to get chip id's)
        l_res = self.cv_HOST.host_run_command("cd %s/external/xscom-utils/; ./getscom -l" % l_dir)
        l_res = l_res.splitlines()
        l_chips = []
        for line in l_res:
            matchObj = re.search("(\d{8}).*processor", line)
            if matchObj:
                l_chips.append(matchObj.group(1))
        if not l_chips:
            l_msg = "Getscom failed to list processor chip id's"
            raise OpTestError(l_msg)
        l_chips.sort()
        print l_chips # ['00000000', '00000001', '00000010']

        # Currently getting the list of active core id's with respect to each chip is by using opal msg log
        # TODO: Need to identify best way to get list of cores(If Opal msg log is empty)
        l_cmd = "cat /sys/firmware/opal/msglog | grep -i CHIP"
        l_res = self.cv_HOST.host_run_command(l_cmd)
        l_cores = {}
        self.l_dic = []
        l_res = l_res.splitlines()
        for line in l_res:
            matchObj = re.search("Chip (\d{1,2}) Core ([a-z0-9])", line)
            if matchObj:
                if l_cores.has_key(int(matchObj.group(1))):
                    (l_cores[int(matchObj.group(1))]).append(matchObj.group(2))
                else:
                    l_cores[int(matchObj.group(1))] = list(matchObj.group(2))
        if not l_cores:
            l_msg = "Failed in getting core id's information from OPAL msg log"
            raise OpTestError(l_msg)

        print l_cores # {0: ['4', '5', '6', 'c', 'd', 'e'], 1: ['4', '5', '6', 'c', 'd', 'e'], 10: ['4', '5', '6', 'c', 'd', 'e']}
        l_cores = sorted(l_cores.iteritems())
        print l_cores
        i=0
        for tup in l_cores:
            new_list = [l_chips[i], tup[1]]
            self.l_dic.append(new_list)
            i+=1
        print self.l_dic
        # self.l_dic is a list of chip id's, core id's . and is of below format 
        # [['00000000', ['4', '5', '6', 'c', 'd', 'e']], ['00000001', ['4', '5', '6', 'c', 'd', 'e']], ['00000010', ['4', '5', '6', 'c', 'd', 'e']]]

        self.l_dir = l_dir
        # In-order to inject HMI errors on cpu's, cpu should be running, so disabling the sleep states 1 and 2 of all CPU's
        self.cv_HOST.host_run_command(BMC_CONST.GET_CPU_SLEEP_STATE2)
        self.cv_HOST.host_run_command(BMC_CONST.GET_CPU_SLEEP_STATE1)
        self.cv_HOST.host_run_command(BMC_CONST.GET_CPU_SLEEP_STATE0)
        self.cv_HOST.host_run_command(BMC_CONST.DISABLE_CPU_SLEEP_STATE1)
        self.cv_HOST.host_run_command(BMC_CONST.DISABLE_CPU_SLEEP_STATE2)
        self.cv_HOST.host_run_command(BMC_CONST.GET_CPU_SLEEP_STATE2)
        self.cv_HOST.host_run_command(BMC_CONST.GET_CPU_SLEEP_STATE1)
        self.cv_HOST.host_run_command(BMC_CONST.GET_CPU_SLEEP_STATE0)

        if "Ubuntu" in self.l_oslevel:
            self.cv_HOST.host_run_command("service kdump-tools stop")
            self.cv_HOST.host_run_command("service kdump-tools status")
        else:
            self.cv_HOST.host_run_command("service kdump stop")
            self.cv_HOST.host_run_command("service kdump status")

    ##
    # @brief This function is mainly used to clear hardware gard entries.
    #        It will perform below steps
    #           1. Reboot the system(Power off/on)
    #           2. Clear any Hardware gard entries
    #           3. Again reboot the system, to make use of garded Hardware.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def clearGardEntries(self):
        self.cv_SYSTEM.sys_bmc_power_on_validate_host()
        # Power off and on the system.
        self.cv_IPMI.ipmi_power_off()
        self.cv_IPMI.ipmi_power_on()
        if int(self.cv_SYSTEM.sys_ipl_wait_for_working_state()):
            l_msg = "System failed to boot host OS"
            raise OpTestError(l_msg)
        time.sleep(BMC_CONST.HOST_BRINGUP_TIME)

        # Clearing gard entries after host comes up
        self.cv_HOST.host_get_OS_Level()
        # It will clone skiboot source repository
        l_dir = "/tmp/skiboot"
        self.cv_HOST.host_clone_skiboot_source(l_dir)
        # Compile the necessary tools xscom-utils and gard utility
        self.cv_HOST.host_compile_xscom_utilities(l_dir)
        self.cv_HOST.host_compile_gard_utility(l_dir)

        l_con = self.cv_SYSTEM.sys_get_ipmi_console()
        self.cv_IPMI.ipmi_host_login(l_con)
        self.cv_IPMI.ipmi_host_set_unique_prompt(l_con)
        self.cv_IPMI.run_host_cmd_on_ipmi_console("uname -a")
        l_dir = "/tmp/skiboot"
        self.cv_IPMI.run_host_cmd_on_ipmi_console("cd %s/external/gard/;" % l_dir)
        l_cmd = "./gard list; echo $?"
        self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)
        l_cmd = "./gard clear all; echo $?"
        l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)
        if int(l_res[-1]):
            l_msg = "Clearing gard entries through gard tool is failed"
            raise OpTestError(l_msg)
        l_cmd = "./gard list; echo $?"
        self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)

        # Rebooting the system again to make use of garded hardware
        self.cv_IPMI.ipmi_power_off()
        self.cv_IPMI.ipmi_power_on()
        if int(self.cv_SYSTEM.sys_ipl_wait_for_working_state()):
            l_msg = "System failed to boot host OS"
            raise OpTestError(l_msg)
        time.sleep(BMC_CONST.HOST_BRINGUP_TIME)
        self.cv_HOST.host_get_OS_Level()
        self.cv_SYSTEM.sys_ipmi_close_console(l_con)

    ##
    # @brief This function executes HMI test case based on the i_test value, Before test starts
    #        disabling kdump service to make sure system reboots, after injecting non-recoverable errors.
    #
    # @param i_test @type int: this is the type of test case want to execute
    #                          BMC_CONST.HMI_PROC_RECV_DONE: Processor recovery done
    #                          BMC_CONST.HMI_PROC_RECV_ERROR_MASKED: proc_recv_error_masked
    #                          BMC_CONST.HMI_MALFUNCTION_ALERT: malfunction_alert
    #                          BMC_CONST.HMI_HYPERVISOR_RESOURCE_ERROR: hypervisor resource error
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def testHMIHandling(self, i_test):
        l_test = i_test
        self.test_init()
        l_con = self.cv_SYSTEM.sys_get_ipmi_console()
        self.cv_IPMI.ipmi_host_login(l_con)
        self.cv_IPMI.ipmi_host_set_unique_prompt(l_con)
        self.cv_IPMI.run_host_cmd_on_ipmi_console("uname -a")
        self.cv_IPMI.run_host_cmd_on_ipmi_console("cat /etc/os-release")
        self.cv_IPMI.run_host_cmd_on_ipmi_console("cd %s/external/xscom-utils/;" % self.l_dir)
        self.cv_IPMI.run_host_cmd_on_ipmi_console("lscpu")
        self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg -D")
        if l_test == BMC_CONST.HMI_PROC_RECV_DONE:
            self.test_proc_recv_done()
        elif l_test == BMC_CONST.HMI_PROC_RECV_ERROR_MASKED:
            self.test_proc_recv_error_masked()
        elif l_test == BMC_CONST.HMI_MALFUNCTION_ALERT:
            self.test_malfunction_allert()
        elif l_test == BMC_CONST.HMI_HYPERVISOR_RESOURCE_ERROR:
            self.test_hyp_resource_err()
        elif l_test == BMC_CONST.TOD_ERRORS:
            # TOD Error recovery works on systems having more than one chip TOD
            # Skip this test on single chip systems(as recovery fails on 1S systems)
            if len(self.l_dic) == 1:
                l_msg = "This is a single chip system, TOD Error recovery won't work"
                print l_msg
                return BMC_CONST.FW_SUCCESS
            elif len(self.l_dic) > 1:
                self.test_tod_errors(BMC_CONST.PSS_HAMMING_DISTANCE)
                self.test_tod_errors(BMC_CONST.INTERNAL_PATH_OR_PARITY_ERROR)
                self.test_tod_errors(BMC_CONST.TOD_DATA_PARITY_ERROR)
                self.test_tod_errors(BMC_CONST.TOD_SYNC_CHECK_ERROR)
                self.test_tod_errors(BMC_CONST.FSM_STATE_PARITY_ERROR)
                self.test_tod_errors(BMC_CONST.MASTER_PATH_CONTROL_REGISTER)
                self.test_tod_errors(BMC_CONST.PORT_0_PRIMARY_CONFIGURATION_REGISTER)
                self.test_tod_errors(BMC_CONST.PORT_1_PRIMARY_CONFIGURATION_REGISTER)
                self.test_tod_errors(BMC_CONST.PORT_0_SECONDARY_CONFIGURATION_REGISTER)
                self.test_tod_errors(BMC_CONST.PORT_1_SECONDARY_CONFIGURATION_REGISTER)
                self.test_tod_errors(BMC_CONST.SLAVE_PATH_CONTROL_REGISTER)
                self.test_tod_errors(BMC_CONST.INTERNAL_PATH_CONTROL_REGISTER)
                self.test_tod_errors(BMC_CONST.PR_SC_MS_SL_CONTROL_REGISTER)
            else:
                l_msg = "Getting Chip information failed"
                raise OpTestError(l_msg)
        elif l_test == BMC_CONST.TFMR_ERRORS:
            self.testTFMR_Errors(BMC_CONST.TB_PARITY_ERROR)
            self.testTFMR_Errors(BMC_CONST.TFMR_PARITY_ERROR)
            self.testTFMR_Errors(BMC_CONST.TFMR_HDEC_PARITY_ERROR)
            self.testTFMR_Errors(BMC_CONST.TFMR_DEC_PARITY_ERROR)
            self.testTFMR_Errors(BMC_CONST.TFMR_PURR_PARITY_ERROR)
            self.testTFMR_Errors(BMC_CONST.TFMR_SPURR_PARITY_ERROR)
        else:
            l_msg = "Please provide valid test case"
            raise OpTestError(l_msg)
        self.cv_SYSTEM.sys_ipmi_close_console(l_con)
        print "Gathering the OPAL msg logs"
        self.cv_HOST.host_gather_opal_msg_log()
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function is used to test HMI: processor recovery done
    #        and also this function injecting error on all the cpus one by one and 
    #        verify whether cpu is recovered or not.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_proc_recv_done(self):
        for l_pair in self.l_dic:
            l_chip = l_pair[0]
            for l_core in l_pair[1]:
                l_reg = "1%s013100" % l_core
                l_cmd = "./putscom -c %s %s 0000000000100000; echo $?" % (l_chip, l_reg)

                self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg -C")
                time.sleep(10)
                l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)
                time.sleep(10)
                if l_res[-1] == "0":
                    print "Injected thread hang recoverable error"
                elif l_res[-1] == "1":
                    # putscom returns -5 when it is trying to read from write only access register,
                    # In these cases we should not exit and we will contiue with other error injetions
                    continue
                else:
                    if any("Kernel panic - not syncing" in line for line in l_res):
                        l_msg = "Processor recovery failed: Kernel got panic"
                    elif any("Petitboot" in line for line in l_res):
                        l_msg = "System reached petitboot:Processor recovery failed"
                    elif any("ISTEP" in line for line in l_res):
                        l_msg = "System started booting: Processor recovery failed"
                    else:
                        l_msg = "Failed to inject thread hang recoverable error"
                    print l_msg
                    raise OpTestError(l_msg)

                l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg")
                if any("Processor Recovery done" in line for line in l_res) and \
                any("Harmless Hypervisor Maintenance interrupt [Recovered]" in line for line in l_res):
                    print "Processor recovery done"
                else:
                    l_msg = "HMI handling failed to log message: for proc_recv_done"
                    raise OpTestError(l_msg)
                time.sleep(BMC_CONST.HMI_TEST_CASE_SLEEP_TIME)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function is used to test HMI: proc_recv_error_masked
    #        Processor went through recovery for an error which is actually masked for reporting
    #        this function also injecting the error on all the cpu's one-by-one.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_proc_recv_error_masked(self):
        for l_pair in self.l_dic:
            l_chip = l_pair[0]
            for l_core in l_pair[1]:
                l_reg = "1%s013100" % l_core
                l_cmd = "./putscom -c %s %s 0000000000080000; echo $?" % (l_chip, l_reg)
                self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg -C")
                time.sleep(10)
                l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)
                time.sleep(10)
                if l_res[-1] == "0":
                    print "Injected thread hang recoverable error"
                elif l_res[-1] == "1":
                    continue
                else:
                    if any("Kernel panic - not syncing" in line for line in l_res):
                        l_msg = "Processor recovery failed: Kernel got panic"
                    elif any("Petitboot" in line for line in l_res):
                        l_msg = "System reached petitboot:Processor recovery failed"
                    elif any("ISTEP" in line for line in l_res):
                        l_msg = "System started booting: Processor recovery failed"
                    else:
                        l_msg = "Failed to inject thread hang recoverable error"
                    print l_msg
                    raise OpTestError(l_msg)

                l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg")
                if any("Processor Recovery done" in line for line in l_res) and \
                any("Harmless Hypervisor Maintenance interrupt [Recovered]" in line for line in l_res):
                    print "Processor recovery done"
                else:
                    l_msg = "HMI handling failed to log message"
                    raise OpTestError(l_msg)
                time.sleep(BMC_CONST.HMI_TEST_CASE_SLEEP_TIME)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function is used to test hmi malfunction alert:Core checkstop
    #        A processor core in the system has to be checkstopped (failed recovery).
    #        Injecting core checkstop on random core of random chip
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_malfunction_allert(self):
        # Get random pair of chip vs cores
        l_pair = random.choice(self.l_dic)
        # Get random chip id
        l_chip = l_pair[0]
        # Get random core number
        l_core = random.choice(l_pair[1])

        l_reg = "1%s013100" % l_core
        l_cmd = "./putscom -c %s %s 1000000000000000" % (l_chip, l_reg)

        l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)
        if any("Kernel panic - not syncing" in line for line in l_res):
            print "Malfunction alert: kernel got panic"
        elif any("login:"******"System booted to host OS without any kernel panic message"
        elif any("Petitboot" in line for line in l_res):
            print "System reached petitboot without any kernel panic message"
        elif any("ISTEP" in line for line in l_res):
            print "System started booting without any kernel panic message"
        else:
            l_msg = "HMI: Malfunction alert failed"
            raise OpTestError(l_msg)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function is used to test HMI: Hypervisor resource error
    #        Injecting Hypervisor resource error on random core of random chip
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_hyp_resource_err(self):
        # Get random pair of chip vs cores
        l_pair = random.choice(self.l_dic)
        # Get random chip id
        l_chip = l_pair[0]
        # Get random core number
        l_core = random.choice(l_pair[1])

        l_reg = "1%s013100" % l_core
        l_cmd = "./putscom -c %s %s 0000000000008000" % (l_chip, l_reg)

        l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)
        if any("Kernel panic - not syncing" in line for line in l_res) and \
        any("Hypervisor Resource error - core check stop" in line for line in l_res):
            print "Hypervisor resource error: kernel got panic"
        elif any("login:"******"System booted to host OS without any kernel panic message"
        elif any("Petitboot" in line for line in l_res):
            print "System reached petitboot without any kernel panic message"
        elif any("ISTEP" in line for line in l_res):
            print "System started booting without any kernel panic message"
        else:
            l_msg = "HMI: Hypervisor resource error failed"
            raise OpTestError(l_msg)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function tests timer facility related error injections and check
    #        the corresponding error got recovered. And this process is repeated
    #        for all the active cores in all the chips.
    #
    # @param i_error @type string: this is the type of error want to inject
    #                          BMC_CONST.TB_PARITY_ERROR
    #                          BMC_CONST.TFMR_PARITY_ERROR
    #                          BMC_CONST.TFMR_HDEC_PARITY_ERROR
    #                          BMC_CONST.TFMR_DEC_PARITY_ERROR
    #                          BMC_CONST.TFMR_PURR_PARITY_ERROR
    #                          BMC_CONST.TFMR_SPURR_PARITY_ERROR
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def testTFMR_Errors(self, i_error):
        l_error = i_error
        for l_pair in self.l_dic:
            l_chip = l_pair[0]
            for l_core in l_pair[1]:
                l_reg = "1%s013281" % l_core
                l_cmd = "./putscom -c %s %s %s;echo $?" % (l_chip, l_reg, l_error)
                self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg -C")
                l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)
                time.sleep(10)
                if l_res[-1] == "0":
                    print "Injected TFMR error %s" % l_error
                elif l_res[-1] == "1":
                    continue
                else:
                    if any("Kernel panic - not syncing" in line for line in l_res):
                        l_msg = "TFMR error injection: Kernel got panic"
                    elif any("Petitboot" in line for line in l_res):
                        l_msg = "System reached petitboot:TFMR error injection recovery failed"
                    elif any("ISTEP" in line for line in l_res):
                        l_msg = "System started booting: TFMR error injection recovery failed"
                    else:
                        l_msg = "Failed to inject TFMR error %s " % l_error
                        print l_msg
                        raise OpTestError(l_msg)

                l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg")
                if any("Timer facility experienced an error" in line for line in l_res) and \
                    any("Severe Hypervisor Maintenance interrupt [Recovered]" in line for line in l_res):
                    print "Timer facility experienced an error and got recovered"
                else:
                    l_msg = "HMI handling failed to log message"
                    raise OpTestError(l_msg)
                time.sleep(BMC_CONST.HMI_TEST_CASE_SLEEP_TIME)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function tests chip TOD related error injections and check
    #        the corresponding error got recovered. And this error injection
    #        happening on a random chip. This tod errors should test on systems
    #        having more than one processor socket(chip). On single chip system
    #        TOD error recovery won't work.
    #
    # @param i_error @type string: this is the type of error want to inject
    #                       These errors represented in common/OpTestConstants.py file.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def test_tod_errors(self, i_error):
        l_error = i_error
        l_pair = random.choice(self.l_dic)
        # Get random chip id
        l_chip = l_pair[0]
        l_cmd = "./putscom -c %s %s %s;echo $?" % (l_chip, BMC_CONST.TOD_ERROR_REG, l_error)
        self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg -C")
        l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console(l_cmd)
        time.sleep(10)
        # As of now putscom command to TOD register will fail with return code -1.
        # putscom indirectly call getscom to read the value again.
        # But getscom to TOD error reg there is no access
        # TOD Error reg has only WO access and there is no read access
        if l_res[-1] == "1":
            print "Injected TOD error %s" % l_error
        else:
            if any("Kernel panic - not syncing" in line for line in l_res):
                print "TOD ERROR Injection-kernel got panic"
            elif any("login:"******"System booted to host OS without any kernel panic message"
            elif any("Petitboot" in line for line in l_res):
                print "System reached petitboot without any kernel panic message"
            elif any("ISTEP" in line for line in l_res):
                print "System started booting without any kernel panic message"
            else:
                l_msg = "TOD: PSS Hamming distance error injection failed"
                raise OpTestError(l_msg)
        l_res = self.cv_IPMI.run_host_cmd_on_ipmi_console("dmesg")
        if any("Timer facility experienced an error" in line for line in l_res) and \
            any("Severe Hypervisor Maintenance interrupt [Recovered]" in line for line in l_res):
            print "Timer facility experienced an error and got recovered"
        else:
            l_msg = "HMI handling failed to log message"
            raise OpTestError(l_msg)
        time.sleep(BMC_CONST.HMI_TEST_CASE_SLEEP_TIME)
        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function enables a single core
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def host_enable_single_core(self):
        self.cv_HOST.host_enable_single_core()
Esempio n. 3
0
class OpTestIPMIPowerControl():
    ##  Initialize this object
    #  @param i_bmcIP The IP address of the BMC
    #  @param i_bmcUser The userid to log into the BMC with
    #  @param i_bmcPasswd The password of the userid to log into the BMC with
    #  @param i_bmcUserIpmi The userid to issue the BMC IPMI commands with
    #  @param i_bmcPasswdIpmi The password of BMC IPMI userid
    #  @param i_ffdcDir Optional param to indicate where to write FFDC
    #
    # "Only required for inband tests" else Default = None
    # @param i_hostIP The IP address of the HOST
    # @param i_hostuser The userid to log into the HOST
    # @param i_hostPasswd The password of the userid to log into the HOST with
    #
    def __init__(self, i_bmcIP, i_bmcUser, i_bmcPasswd,
                 i_bmcUserIpmi, i_bmcPasswdIpmi, i_ffdcDir=None, i_hostip=None,
                 i_hostuser=None, i_hostPasswd=None):
        self.cv_BMC = OpTestBMC(i_bmcIP, i_bmcUser, i_bmcPasswd, i_ffdcDir)
        self.cv_IPMI = OpTestIPMI(i_bmcIP, i_bmcUserIpmi, i_bmcPasswdIpmi,
                                  i_ffdcDir, i_hostip, i_hostuser, i_hostPasswd)
        self.cv_HOST = OpTestHost(i_hostip, i_hostuser, i_hostPasswd, i_bmcIP)
        self.cv_SYSTEM = OpTestSystem(i_bmcIP, i_bmcUser, i_bmcPasswd,
                         i_bmcUserIpmi, i_bmcPasswdIpmi, i_ffdcDir, i_hostip,
                         i_hostuser, i_hostPasswd)
        self.util = OpTestUtil()

    ##
    # @brief This function will test below system power control operations
    #        IPMI Power ON
    #             Power OFF
    #             Power Soft
    #             Power Cycle
    #             Power Reset
    #        So each operation is executed through ipmi commands. and
    #        check_system_status function will check whether FW and Host OS
    #        Boot completed or not.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def testIPMIPowerControl(self):
        self.cv_SYSTEM.sys_bmc_power_on_validate_host()

        print "Performing a IPMI Power OFF Operation"
        # Perform a IPMI Power OFF Operation(Immediate Shutdown)
        self.cv_IPMI.ipmi_power_off()
        if int(self.cv_SYSTEM.sys_wait_for_standby_state(BMC_CONST.SYSTEM_STANDBY_STATE_DELAY)) == 0:
            print "System is in standby/Soft-off state"
        else:
            l_msg = "System failed to reach standby/Soft-off state"
            raise OpTestError(l_msg)

        print "Performing a IPMI Power ON Operation"
        # Perform a IPMI Power ON Operation
        self.cv_IPMI.ipmi_power_on()
        self.check_system_status()
        self.util.PingFunc(self.cv_HOST.ip, BMC_CONST.PING_RETRY_POWERCYCLE)

        print "Performing a IPMI Soft Power OFF Operation"
        # Perform a IPMI Soft Power OFF Operation(Graceful shutdown)
        self.cv_IPMI.ipmi_power_soft()
        if int(self.cv_SYSTEM.sys_wait_for_standby_state(BMC_CONST.SYSTEM_STANDBY_STATE_DELAY)) == 0:
            print "System is in standby/Soft-off state"
        else:
            l_msg = "System failed to reach standby/Soft-off state"
            raise OpTestError(l_msg)

        print "Perform a IPMI Power ON Operation"
        # Perform a IPMI Power ON Operation
        self.cv_IPMI.ipmi_power_on()
        self.check_system_status()
        self.util.PingFunc(self.cv_HOST.ip, BMC_CONST.PING_RETRY_POWERCYCLE)

        print "Performing a IPMI Power Cycle(Soft reboot) Operation "
        # Perform a IPMI Power Cycle(Soft reboot) Operation only when system is in ON state
        self.cv_IPMI.ipmi_power_cycle()
        self.check_system_status()
        self.util.PingFunc(self.cv_HOST.ip, BMC_CONST.PING_RETRY_POWERCYCLE)

        print "Performing a IPMI Power Hard Reset Operation"
        # Perform a IPMI Power Hard Reset Operation
        self.cv_IPMI.ipmi_power_reset()
        self.check_system_status()
        self.util.PingFunc(self.cv_HOST.ip, BMC_CONST.PING_RETRY_POWERCYCLE)

        return BMC_CONST.FW_SUCCESS

    ##
    # @brief This function will check for system status and wait for
    #        FW and Host OS Boot progress to complete.
    #
    # @return BMC_CONST.FW_SUCCESS or raise OpTestError
    #
    def check_system_status(self):
        if int(self.cv_SYSTEM.sys_ipl_wait_for_working_state()) == 0:
            print "System booted to working state"
        else:
            l_msg = "System failed to boot"
            raise OpTestError(l_msg)
        if int(self.cv_SYSTEM.sys_wait_for_os_boot_complete()) == 0:
            print "System booted to Host OS"
        else:
            l_msg = "System failed to boot Host OS"
            raise OpTestError(l_msg)

        return BMC_CONST.FW_SUCCESS