def write(self, value):
    '''
    Push a value to the sensor by running the write util.

    int(value) is substituted into the write_source command template and
    the resulting command is run through a shell. Output containing
    "Error" is treated as a failure; failures are logged, not propagated.

    Arguments:
        value: value to be set to the sensor
    Return:
        N/A
    '''
    if self.write_source is None:
        return
    cmd = self.write_source % (int(value))
    Logger.debug("Setting value using cmd=%s" % cmd)
    response = ''
    try:
        child = Popen(cmd, shell=True, stdout=PIPE)
        response = child.stdout.read().decode()
        if "Error" in response:
            raise Exception("Write failed with response=%s" % response)
    except SystemExit:
        Logger.debug("SystemExit from sensor write")
        raise
    except Exception:
        Logger.crit("Exception with cmd=%s response=%s" % (cmd, response))
def get_sensor_tuples(fru_name, sensor_num, sensor_sources):
    """
    Walk the sensor sources and build the tuples of the form 'SensorValue'.

    A util-backed source returns every sensor in a single read, so the walk
    stops at the first one; sysfs-backed sources are read one at a time.

    Arguments:
        fru_name: fru where the sensors should be read from
        sensor_num: passed to the util read as its 'num' argument
        sensor_sources: Set of all sensor souces from fsc config
    Returns:
        SensorValue tuples
    """
    collected = {}
    for key in list(sensor_sources):
        source = sensor_sources[key].source
        if isinstance(source, FscSensorSourceUtil):
            blob = source.read(fru=fru_name, num=sensor_num)
            collected = parse_all_sensors_util(blob)
            break  # Hack: util reads all sensors
        if isinstance(source, FscSensorSourceSysfs):
            # Read first: the counters passed below are sampled after it.
            blob = source.read()
            symbolized_key, sensor_tuple = get_sensor_tuple_sysfs(
                key,
                blob,
                source.read_source_fail_counter,
                source.read_source_wrong_counter,
            )
            collected[symbolized_key] = sensor_tuple
        else:
            Logger.crit("Unknown source type")
    return collected
def sensor_valid_check(board, sname, check_name, attribute):
    """
    Report whether a sensor should currently be considered valid.

    When board is "all", the board name is taken from the text before the
    first underscore in sname and every "<board>_" occurrence is removed
    from sname.

    Returns 1 when attribute["type"] is "power_status" and
    bmc_read_power() returns 1, 0 otherwise (including on any exception),
    and -1 when the attribute type is not handled here.
    """
    cmd = ""
    data = ""
    if str(board) == "all":
        board = sname.split("_")[0]
        sname = sname.replace(board + "_", "")
    Logger.debug("board=%s sname=%s" % (board, sname))
    try:
        if attribute["type"] != "power_status":
            Logger.debug(
                "Sensor corresponding valid check funciton not found!")
            return -1
        # check power status first
        return 1 if bmc_read_power() == 1 else 0
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        raise
    except Exception as err:
        Logger.crit(
            "Exception with board=%s, sensor_name=%s, cmd=%s, response=%s, err=%s"
            % (board, sname, cmd, data, err))
    return 0
def read(self, **kwargs):
    '''
    Reads all sensors values from sysfs source and return data read.
    There are two kinds of sensors temperature and fans.

    The read is done with `cat <read_source>`. Anything on stderr counts
    as a failed read and increments read_source_fail_counter; a clean read
    resets the counter to 0.

    Arguments:
        kwargs: set of aruments needed to read from sysfs (unused here)

    Return:
        blob of data read from sysfs, as the undecoded bytes of the
        command's stdout ('' if the command could not be run)
    '''
    cmd = 'cat ' + self.read_source
    Logger.debug("Reading data with cmd=%s" % cmd)
    data = ''
    try:
        proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        # communicate() drains both pipes together and waits for the child;
        # reading stdout to EOF before touching stderr can block forever
        # once the stderr pipe fills up.
        data, err = proc.communicate()
        if err:
            self.read_source_fail_counter += 1
        else:
            self.read_source_fail_counter = 0
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        self.read_source_fail_counter += 1
        raise
    except Exception:
        Logger.crit("Exception with cmd=%s response=%s" % (cmd, data))
        self.read_source_fail_counter += 1
    return data
def sensor_valid_check(board, sname, check_name, attribute):
    """
    Report whether a sensor should currently be considered valid.

    attribute["type"] selects the check:
      "power_status": the result of bmc_read_power() is returned as is.
      "gpio": `gpiocli get-value --shadow <attribute["shadow"]>` is run and
              1 is returned when the value after "=" is 0, else 0.
    Any other type returns -1. Exceptions are logged and yield 0.
    """
    cmd = ""
    data = ""
    try:
        kind = attribute["type"]
        if kind == "power_status":
            return bmc_read_power()
        if kind == "gpio":
            cmd = ["gpiocli", "get-value", "--shadow", attribute["shadow"]]
            data = check_output(cmd).decode().split("=")
            return 1 if int(data[1]) == 0 else 0
        Logger.debug(
            "Sensor corresponding valid check funciton not found!")
        return -1
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        raise
    except Exception as err:
        Logger.crit(
            "Exception with board=%s, sensor_name=%s, cmd=%s, response=%s, err=%s"
            % (board, sname, cmd, data, err))
    return 0
def board_host_actions(action="None", cause="None"):
    """
    Carry out a host action requested by the fan controller.

    When action contains "host_shutdown" the cause is logged as critical
    and the result of yamp_host_shutdown() is returned. Any other action is
    only logged as a warning and None is returned.
    """
    if "host_shutdown" not in action:
        Logger.warn("Host needs action '%s' and cause '%s'" %
                    (str(action), str(cause)))
        return None
    Logger.crit("Host is shutdown due to cause %s" % (str(cause), ))
    return yamp_host_shutdown()
def write(self, value):
    """
    Writes to write_source using echo to sysfs location
    echo #value > sysfs_path

    The value is scaled by max_duty_register / 100 before it is written.
    Failures while running the command are logged, not propagated.

    Arguments:
        value: value to be set to the sensor
    Return:
        N/A
    """
    if self.write_source is None:
        return
    # "/" is true division, so without int() the command would carry a
    # float such as "16.0" or "15.5"; truncate to the integer register value.
    duty = int(value * self.max_duty_register / 100)
    cmd = "echo " + str(duty) + " > " + self.write_source
    Logger.debug("Setting value using cmd=%s" % cmd)
    response = ""
    try:
        response = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
    except SystemExit:
        Logger.debug("SystemExit from sensor write")
        raise
    except Exception:
        Logger.crit("Exception with cmd=%s response=%s" % (cmd, response))
def read(self, **kwargs):
    '''
    Reads all sensors values from the util and return data read.
    There are two kinds of sensors temperature and fans.

    Following are the util usages:
    sensor util: 'util <fru name>' Reads all sensors from a specific fru
                 'util <fru name> <sensor number>' Reads sensor from a
                 specific fru number
    fan util: 'util' Reads all fan speeds

    With both 'fru' and a non-empty 'num', one ';'-terminated util command
    is issued per sensor number.

    Arguments:
        kwargs: set of aruments needed to read from any of the util

    Return:
        blob of data read from util
    '''
    cmd = self.read_source
    if 'fru' in kwargs:
        fru = kwargs['fru']
        if 'num' in kwargs and len(kwargs['num']):
            cmd = ''.join(
                self.read_source + " " + fru + " " + num + ";"
                for num in kwargs['num'])
        else:
            cmd = cmd + " " + fru
    Logger.debug("Reading data with cmd=%s" % cmd)
    data = ''
    try:
        child = Popen(cmd, shell=True, stdout=PIPE)
        data = child.stdout.read().decode()
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        raise
    except Exception:
        Logger.crit("Exception with cmd=%s response=%s" % (cmd, data))
    return data
def sensor_valid_check(board, sname, check_name, attribute):
    '''
    Report whether a sensor should currently be considered valid.

    attribute['type'] selects the check:
      "power_BIC_status": power-util must report ON for attribute['fru']
          and the GPIO attribute['number'] must read 0.
      "gpio_power_nvme": the GPIO must read 0, power-util must report ON,
          and the cache file /tmp/cache_store/M2_<attribute['nvme']>_NVMe
          must exist and contain exactly "1".

    Return:
        1 when every condition of the selected check holds, otherwise 0
        (also for unknown types and on any exception, which is logged).
    '''
    cmd = ''
    data = ''
    try:
        if attribute['type'] == "power_BIC_status":
            cmd = "/usr/local/bin/power-util %s status" % attribute['fru']
            # Reset so a failing Popen is not logged with stale output.
            data = ''
            data = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
            result = data.split(": ")
            if match(r'ON', result[1]) is None:
                return 0
            cmd = "cat /sys/class/gpio/gpio%s/value" % attribute['number']
            data = ''
            data = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
            return 1 if int(data) == 0 else 0
        elif attribute['type'] == "gpio_power_nvme":
            cmd = "cat /sys/class/gpio/gpio%s/value" % attribute['number']
            data = ''
            data = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
            if int(data) != 0:
                return 0
            cmd = "/usr/local/bin/power-util %s status" % attribute['fru']
            data = ''
            data = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
            result = data.split(": ")
            if match(r'ON', result[1]) is None:
                return 0
            cmd = "/tmp/cache_store/M2_%s_NVMe" % attribute['nvme']
            data = ''
            if not os.path.isfile(cmd):
                return 0
            # Context manager closes the cache file on every call.
            with open(cmd, "r") as cache_file:
                data = cache_file.read()
            return 1 if data == "1" else 0
        else:
            Logger.debug(
                "Sensor corresponding valid check funciton not found!")
            return 0
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        raise
    except Exception:
        Logger.crit(
            "Exception with board=%s, sensor_name=%s, cmd=%s, response=%s" %
            (board, sname, cmd, data))
    return 0
def board_host_actions(action="None", cause="None"):
    """
    Override the method to define fan specific actions like:
    - handling host power off
    - alarming/syslogging criticals

    Returns the result of host_shutdown() when action contains
    "host_shutdown"; otherwise only warns and returns None.
    """
    if "host_shutdown" not in action:
        Logger.warn("Host needs action '%s' and cause '%s'" %
                    (str(action), str(cause)))
        return None
    Logger.crit("Host is shutdown due to cause %s" % (str(cause),))
    return host_shutdown()
def sensor_valid_check(board, sname, check_name, attribute):
    """
    Report whether a sensor should currently be considered valid.

    attribute["type"] selects the check:
      "power_BIC_status": power-util must report ON for attribute["fru"]
          and the GPIO attribute["number"] must read 0.
      "gpio_power_nvme": the GPIO must read 0 and power-util must report
          ON; the result is then int(kv_get("M2_<nvme>_NVMe")), or 0 when
          that lookup fails.

    Unknown types and any other exception (which is logged) yield 0.
    """
    cmd = ""
    data = ""
    try:
        kind = attribute["type"]
        if kind == "power_BIC_status":
            cmd = "/usr/local/bin/power-util %s status" % attribute["fru"]
            data = ""
            data = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
            result = data.split(": ")
            if match(r"ON", result[1]) is None:
                return 0
            cmd = "cat /sys/class/gpio/gpio%s/value" % attribute["number"]
            data = ""
            data = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
            return 1 if int(data) == 0 else 0
        if kind == "gpio_power_nvme":
            cmd = "cat /sys/class/gpio/gpio%s/value" % attribute["number"]
            data = ""
            data = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
            if int(data) != 0:
                return 0
            cmd = "/usr/local/bin/power-util %s status" % attribute["fru"]
            data = ""
            data = Popen(cmd, shell=True, stdout=PIPE).stdout.read().decode()
            result = data.split(": ")
            if match(r"ON", result[1]) is None:
                return 0
            key = "M2_%s_NVMe" % attribute["nvme"]
            try:
                return int(kv_get(key))
            except Exception:
                return 0
        Logger.debug(
            "Sensor corresponding valid check funciton not found!")
        return 0
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        raise
    except Exception:
        Logger.crit(
            "Exception with board=%s, sensor_name=%s, cmd=%s, response=%s" %
            (board, sname, cmd, data))
    return 0
def board_host_actions(action="None", cause="None"):
    """
    Override the method to define fan specific actions like:
    - handling host power off
    - alarming/syslogging criticals

    A "host_shutdown" action whose cause contains "All fans are bad" is
    skipped (returning False) when check_if_all_fantrays_ok() is falsy.
    Otherwise the shutdown is logged and host_shutdown() is returned.
    Actions other than "host_shutdown" are only logged as a warning.
    """
    if "host_shutdown" not in action:
        Logger.warn("Host needs action '%s' and cause '%s'" %
                    (str(action), str(cause)))
        return None
    if "All fans are bad" in cause and not check_if_all_fantrays_ok():
        Logger.warn("Host action %s not performed for cause %s" %
                    (str(action), str(cause)))
        return False
    Logger.crit("Host is shutdown due to cause %s" % (str(cause), ))
    return host_shutdown()
def read(self, **kwargs):
    """
    Reads all sensors values from the util and return data read.
    There are two kinds of sensors temperature and fans.

    Following are the util usages:
    sensor util: 'util <fru name>' Reads all sensors from a specific fru
                 'util <fru name> <sensor number>' Reads sensor from a
                 specific fru number
    fan util: 'util' Reads all fan speeds

    With 'fru' and a non-None 'inf', the command becomes
    'util <fru> --filter <names...>' using the names of inf["ext_vars"]
    entries ("board:name") whose board equals the fru. Otherwise, with a
    non-empty 'num', one ';'-terminated command is issued per number.

    Arguments:
        kwargs: set of aruments needed to read from any of the util

    Return:
        blob of data read from util
    """
    cmd = self.read_source
    if "fru" in kwargs:
        fru = kwargs["fru"]
        if "inf" in kwargs and kwargs["inf"] is not None:
            cmd += " " + fru + " --filter"
            for name in kwargs["inf"]["ext_vars"]:
                parts = name.split(":")
                if parts[0] == fru:
                    cmd += " " + parts[1]
        elif "num" in kwargs and len(kwargs["num"]):
            cmd = "".join(
                self.read_source + " " + fru + " " + num + ";"
                for num in kwargs["num"])
        else:
            cmd = cmd + " " + fru
    Logger.debug("Reading data with cmd=%s" % cmd)
    data = ""
    try:
        child = Popen(cmd, shell=True, stdout=PIPE)
        data = child.stdout.read().decode()
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        raise
    except Exception:
        Logger.crit("Exception with cmd=%s response=%s" % (cmd, data))
    return data
def sensor_valid_check(board, sname, check_name, attribute):
    """
    Report whether a sensor should currently be considered valid.

    attribute["type"] selects the check:
      "power_status": 0 unless bmc_read_power() returns 1, then the result
          of lpal_hndl.pal_sensor_is_valid(board, sname) as an int.
      "prsnt": the result of lpal_hndl.pal_sensor_is_valid as an int.
      "gpio": `gpiocli get-value --shadow <attribute["shadow"]>` is run and
          1 is returned when the value after "=" is 0, else 0.
    Any other type returns -1. Exceptions are logged and yield 0.
    """
    cmd = ""
    data = ""
    try:
        kind = attribute["type"]
        if kind in ("power_status", "prsnt"):
            # check power status first
            if kind == "power_status" and bmc_read_power() != 1:
                return 0
            fru_name = c_char_p(board.encode("utf-8"))
            snr_name = c_char_p(sname.encode("utf-8"))
            return int(lpal_hndl.pal_sensor_is_valid(fru_name, snr_name))
        if kind == "gpio":
            cmd = ["gpiocli", "get-value", "--shadow", attribute["shadow"]]
            data = check_output(cmd).decode().split("=")
            return 1 if int(data[1]) == 0 else 0
        Logger.debug(
            "Sensor corresponding valid check funciton not found!")
        return -1
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        raise
    except Exception as err:
        Logger.crit(
            "Exception with board=%s, sensor_name=%s, cmd=%s, response=%s, err=%s"
            % (board, sname, cmd, data, err))
    return 0
def read_fans(self, fans):
    """
    Method to read all fans speeds

    Each fan is read through its own source and parsed with the parser
    matching the source type; fans with an unknown source type are logged
    and left out of the result.

    Arguments:
        fans: Set of all sensor fan souces from fsc config
    Returns:
        Fan speeds set, keyed by the fan objects themselves
    """
    Logger.debug("Read all fan speeds")
    speeds = {}
    for key in list(fans):
        fan = fans[key]
        source = fan.source
        if isinstance(source, FscSensorSourceUtil):
            speeds[fan] = parse_fan_util(source.read())
        elif isinstance(source, FscSensorSourceSysfs):
            speeds[fan] = parse_fan_sysfs(source.read())
        else:
            Logger.crit("Unknown source type")
    return speeds
def write(self, value):
    '''
    Writes to write_source using echo to sysfs location
    echo #value > sysfs_path

    Nothing is done when write_source is None. Failures while running the
    command are logged, not propagated.

    Arguments:
        value: value to be set to the sensor
    Return:
        N/A
    '''
    if self.write_source is None:
        return
    cmd = ' > '.join(('echo ' + str(value), self.write_source))
    Logger.debug("Setting value using cmd=%s" % cmd)
    response = ''
    try:
        child = Popen(cmd, shell=True, stdout=PIPE)
        response = child.stdout.read().decode()
    except SystemExit:
        Logger.debug("SystemExit from sensor write")
        raise
    except Exception:
        Logger.crit("Exception with cmd=%s response=%s" % (cmd, response))
def host_shutdown():
    """
    Power off the host and, when the board type is known, the switch.

    The SCM power-off command always runs first, followed by a 5 second
    sleep. The switch power-off command is chosen by pal_get_board_type()
    ("Wedge400" or "Wedge400C"); for any other board type the switch step
    is skipped and logged as critical.

    Returns the stdout of the last command that was run.
    """
    SCM_POWER_COMMAND = "/usr/local/bin/wdtcli kick &> /dev/null; /usr/local/bin/wedge_power.sh off"
    TH_SWITCH_POWER_COMMAND = "source /usr/local/bin/openbmc-utils.sh; echo 0 > $SMBCPLD_SYSFS_DIR/th3_turn_on"
    GB_SWITCH_POWER_COMMAND = "source /usr/local/bin/openbmc-utils.sh; echo 0 > $SMBCPLD_SYSFS_DIR/gb_turn_on"

    brd_type = pal_get_board_type()
    if brd_type == "Wedge400C":
        switch_poweroff_cmd = GB_SWITCH_POWER_COMMAND
    elif brd_type == "Wedge400":
        switch_poweroff_cmd = TH_SWITCH_POWER_COMMAND
    else:
        switch_poweroff_cmd = ""
        Logger.crit("Cannot identify board type: %s" % brd_type)
        Logger.crit("Switch won't be resetting!")

    Logger.info("host_shutdown() executing {}".format(SCM_POWER_COMMAND))
    scm_proc = Popen(SCM_POWER_COMMAND, shell=True, stdout=PIPE)
    response = scm_proc.stdout.read()
    time.sleep(5)

    if switch_poweroff_cmd != "":
        Logger.info(
            "host_shutdown() executing {}".format(switch_poweroff_cmd))
        switch_proc = Popen(switch_poweroff_cmd, shell=True, stdout=PIPE)
        response = switch_proc.stdout.read()
    return response
def sensor_valid_check(board, sname, check_name, attribute):
    '''
    Report whether a sensor should currently be considered valid.

    attribute['type'] selects the check:
      "power_status": 0 unless bmc_read_power() returns 1, then the result
          of lpal_hndl.pal_sensor_is_valid(board, sname) as an int.
      "gpio": /sys/class/gpio/gpio<attribute['number']>/value is read and
          1 is returned when it is 0, else 0.
    Any other type returns -1. Exceptions are logged and yield 0.
    '''
    cmd = ''
    data = ''
    try:
        kind = attribute['type']
        if kind == "power_status":
            # check power status first
            if bmc_read_power() != 1:
                return 0
            fru_name = c_char_p(board.encode('utf-8'))
            snr_name = c_char_p(sname.encode('utf-8'))
            return int(lpal_hndl.pal_sensor_is_valid(fru_name, snr_name))
        if kind == "gpio":
            cmd = "cat /sys/class/gpio/gpio%s/value" % attribute['number']
            data = ''
            child = Popen(cmd, shell=True, stdout=PIPE)
            data = child.stdout.read().decode()
            return 1 if int(data) == 0 else 0
        Logger.debug(
            "Sensor corresponding valid check funciton not found!")
        return -1
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        raise
    except Exception as err:
        Logger.crit(
            "Exception with board=%s, sensor_name=%s, cmd=%s, response=%s, err=%s"
            % (board, sname, cmd, data, err))
    return 0
def read(self, **kwargs):
    '''
    Reads all sensors values from sysfs source and return data read.
    There are two kinds of sensors temperature and fans.

    The read is done with `cat <path>`. Anything on stderr counts as a
    failed read and increments read_source_fail_counter; a clean read
    resets the counter to 0.

    Arguments:
        kwargs: set of aruments needed to read from sysfs (unused here)

    Return:
        blob of data read from sysfs, decoded to str ('' on exception)
    '''
    # IF read_source has hwmon* then determine what is the hwmon device
    # and use that for reading
    readsysfs = self.read_source
    if "hwmon*" in self.read_source:
        readsysfs = self.get_hwmon_source()

    cmd = 'cat ' + readsysfs
    Logger.debug("Reading data with cmd=%s" % cmd)
    data = ''
    try:
        proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        # communicate() drains both pipes together and waits for the child;
        # reading stdout to EOF before touching stderr can block forever
        # once the stderr pipe fills up.
        out, err = proc.communicate()
        data = out.decode()
        err = err.decode()
        if err:
            self.read_source_fail_counter += 1
        else:
            self.read_source_fail_counter = 0
    except SystemExit:
        Logger.debug("SystemExit from sensor read")
        self.read_source_fail_counter += 1
        raise
    except Exception:
        Logger.crit("Exception with cmd=%s response=%s" % (cmd, data))
        self.read_source_fail_counter += 1
    return data
def update_dead_fans(self, dead_fans):
    '''
    Check for dead and recovered fans

    Fans reading below fsc_config['min_rpm'] are added to dead_fans (which
    is updated in place and returned); the rest are removed from it. Newly
    dead fans trigger a report of every dead fan and the creation of a
    record file per dead fan; recovered fans are reported and their record
    file is removed. Reports are warnings when self.fanpower is set and
    criticals otherwise.
    '''
    previously_dead = dead_fans.copy()
    speeds = self.machine.read_fans(self.fans)
    print("\x1b[2J\x1b[H")
    sys.stdout.flush()

    min_rpm_key = 'min_rpm'
    for fan, rpms in list(speeds.items()):
        Logger.info("%s speed: %d RPM" % (fan.label, rpms))
        if rpms < self.fsc_config[min_rpm_key]:
            dead_fans.add(fan)
            self.fsc_fan_action(fan, action='dead')
        else:
            dead_fans.discard(fan)

    recovered_fans = previously_dead - dead_fans
    newly_dead_fans = dead_fans - previously_dead

    if newly_dead_fans:
        report = Logger.warn if self.fanpower else Logger.crit
        report("%d fans failed" % (len(dead_fans), ))
        for dead_fan in dead_fans:
            report = Logger.warn if self.fanpower else Logger.crit
            report("%s dead, %d RPM" % (dead_fan.label, speeds[dead_fan]))
            Logger.usbdbg("%s fail" % (dead_fan.label))
            fan_fail_record_path = FAN_FAIL_RECORD_DIR + '%s' % (
                dead_fan.label)
            if not os.path.isfile(fan_fail_record_path):
                # Create an empty marker file.
                with open(fan_fail_record_path, 'w'):
                    pass

    for fan in recovered_fans:
        report = Logger.warn if self.fanpower else Logger.crit
        report("%s has recovered" % (fan.label, ))
        Logger.usbdbg("%s recovered" % (fan.label))
        self.fsc_fan_action(fan, action='recover')
        fan_fail_record_path = FAN_FAIL_RECORD_DIR + '%s' % (fan.label)
        if os.path.isfile(fan_fail_record_path):
            os.remove(fan_fail_record_path)
    return dead_fans
def update_dead_fans(self, dead_fans):
    '''
    Check for dead and recovered fans

    Fans reading below fsc_config['min_rpm'] are added to dead_fans (which
    is updated in place and returned); the rest are removed from it. Newly
    dead fans trigger a report of every dead fan; recovered fans are
    reported as well. Reports are warnings when self.fanpower is set and
    criticals otherwise.
    '''
    previously_dead = dead_fans.copy()
    speeds = self.machine.read_fans(self.fans)
    print("\x1b[2J\x1b[H")
    sys.stdout.flush()

    min_rpm_key = 'min_rpm'
    for fan, rpms in speeds.items():
        Logger.info("%s speed: %d RPM" % (fan.label, rpms))
        if rpms < self.fsc_config[min_rpm_key]:
            dead_fans.add(fan)
            self.fsc_fan_action(fan, action='dead')
        else:
            dead_fans.discard(fan)

    recovered_fans = previously_dead - dead_fans
    newly_dead_fans = dead_fans - previously_dead

    if newly_dead_fans:
        report = Logger.warn if self.fanpower else Logger.crit
        report("%d fans failed" % (len(dead_fans), ))
        for dead_fan in dead_fans:
            report = Logger.warn if self.fanpower else Logger.crit
            report("%s dead, %d RPM" % (dead_fan.label, speeds[dead_fan]))
            Logger.usbdbg("%s fail" % (dead_fan.label))

    for fan in recovered_fans:
        report = Logger.warn if self.fanpower else Logger.crit
        report("%s has recovered" % (fan.label, ))
        Logger.usbdbg("%s recovered" % (fan.label))
        self.fsc_fan_action(fan, action='recover')
    return dead_fans
def run(self, sensors, dt):
    """
    Evaluate this zone's fan-speed expression for one cycle.

    For every variable in expr_meta["ext_vars"] (formatted "board:sname")
    the sensor is optionally gated by fsc_board.sensor_valid_check, its
    value is copied into the evaluation context, and its status may raise
    the minimum output (transitional for "ucr", boost for "na"). The
    expression is then evaluated, the SSD offset algorithm is applied,
    record files are created or removed, the fan mode is written through
    get_set_fan_mode, and the result is passed through clamp(..., 0, 100).

    Arguments:
        sensors: indexed as sensors[board][sname]; entries expose value,
                 status and name
        dt: stored in the evaluation context under "dt"
    Return:
        the clamped expression output
    """
    ctx = {"dt": dt}
    outmin = 0
    fail_ssd_count = 0
    sensor_index = 0
    cause_boost_count = 0
    no_sane_flag = 0
    mode = 0

    for v in self.expr_meta["ext_vars"]:
        sensor_valid_flag = 1
        sdata = v.split(":")
        board = sdata[0]
        sname = sdata[1]
        if self.sensor_valid_check != None:
            for check_name in self.sensor_valid_check:
                if re.match(check_name, sname, re.IGNORECASE) != None:
                    self.sensor_valid_cur[
                        sensor_index] = fsc_board.sensor_valid_check(
                            board,
                            sname,
                            check_name,
                            self.sensor_valid_check[check_name]
                            ["attribute"],
                        )
                    # If current or previous sensor valid status is 0, ignore this sensor reading.
                    # Only when both are 1, goes to sensor check process
                    if (self.sensor_valid_cur[sensor_index]
                            == 0) or (self.sensor_valid_pre[sensor_index]
                                      == 0):
                        sensor_valid_flag = 0
                        self.missing_sensor_assert_retry[sensor_index] = 0
                    # Only the first matching check pattern is applied.
                    break

        if sensor_valid_flag == 1:
            if sname in sensors[board]:
                self.missing_sensor_assert_retry[sensor_index] = 0
                if self.missing_sensor_assert_flag[sensor_index]:
                    Logger.crit("DEASSERT: Zone%d Missing sensors: %s" %
                                (self.counter, v))
                    self.missing_sensor_assert_flag[sensor_index] = False

                sensor = sensors[board][sname]
                ctx[v] = sensor.value
                if sensor.status in ["ucr"]:
                    Logger.warn("Sensor %s reporting status %s" %
                                (sensor.name, sensor.status))
                    outmin = max(outmin, self.transitional)
                    if outmin == self.transitional:
                        mode = fan_mode["trans_mode"]
                else:
                    if self.sensor_fail == True:
                        # One record file per failed sensor variable.
                        sensor_fail_record_path = SENSOR_FAIL_RECORD_DIR + v
                        if not os.path.isdir(SENSOR_FAIL_RECORD_DIR):
                            os.mkdir(SENSOR_FAIL_RECORD_DIR)
                        if (sensor.status in [
                                "na"
                        ]) and (self.sensor_valid_cur[sensor_index] != -1):
                            # SSD failures are only counted here; they are
                            # handled by the offset algorithm below.
                            if re.match(r"SSD", sensor.name) != None:
                                fail_ssd_count = fail_ssd_count + 1
                            else:
                                Logger.warn("%s Fail" % v)
                                outmin = max(outmin, self.boost)
                                cause_boost_count += 1
                            if not os.path.isfile(sensor_fail_record_path):
                                sensor_fail_record = open(
                                    sensor_fail_record_path, "w")
                                sensor_fail_record.close()
                            if outmin == self.boost:
                                mode = fan_mode["boost_mode"]
                        else:
                            if os.path.isfile(sensor_fail_record_path):
                                os.remove(sensor_fail_record_path)
            else:
                # Missing sensor: assert only once the retry count reaches 2.
                if (not self.missing_sensor_assert_flag[sensor_index]
                    ) and (self.missing_sensor_assert_retry[sensor_index]
                           >= 2):
                    Logger.crit("ASSERT: Zone%d Missing sensors: %s" %
                                (self.counter, v))
                    self.missing_sensor_assert_flag[sensor_index] = True
                if self.missing_sensor_assert_retry[sensor_index] < 2:
                    self.missing_sensor_assert_retry[sensor_index] += 1
                # evaluation tries to ignore the effects of None values
                # (e.g. acts as 0 in max/+)
                ctx[v] = None
        # Remember this cycle's validity for the next cycle's gating.
        self.sensor_valid_pre[sensor_index] = self.sensor_valid_cur[
            sensor_index]
        sensor_index += 1

    if verbose:
        (exprout, dxstr) = self.expr.dbgeval(ctx)
        Logger.info(dxstr + " = " + str(exprout))
    else:
        exprout = self.expr.eval(ctx)
        Logger.info(self.expr_str + " = " + str(exprout))
    # If *all* sensors in the top level max() report None, the
    # expression will report None
    if (not exprout) and (outmin == 0):
        if not self.transitional_assert_flag:
            Logger.crit("ASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed." % (self.counter))
        exprout = self.transitional
        mode = fan_mode["trans_mode"]
        no_sane_flag = 1
        self.transitional_assert_flag = True
    else:
        if self.transitional_assert_flag:
            Logger.crit("DEASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed."
                        % (self.counter))
        self.transitional_assert_flag = False

    if self.fail_sensor_type != None:
        if "SSD_sensor_fail" in list(self.fail_sensor_type.keys()):
            if self.fail_sensor_type["SSD_sensor_fail"] == True:
                if fail_ssd_count != 0:
                    if self.ssd_progressive_algorithm != None:
                        if "offset_algorithm" in list(
                                self.ssd_progressive_algorithm.keys()):
                            # Entries are (max failed count, offset) pairs;
                            # the first entry covering fail_ssd_count adds
                            # its offset. Past the last entry, boost.
                            list_index = 0
                            for i in self.ssd_progressive_algorithm[
                                    "offset_algorithm"]:
                                list_index = list_index + 1
                                if fail_ssd_count <= i[0]:
                                    exprout = exprout + i[1]
                                    no_sane_flag = 0
                                    break
                                else:
                                    if list_index == len(
                                            self.ssd_progressive_algorithm[
                                                "offset_algorithm"]):
                                        outmin = max(outmin, self.boost)
                                        cause_boost_count += 1
                                        if outmin == self.boost:
                                            mode = fan_mode["boost_mode"]

    # Record file is present exactly while some failure caused a boost.
    boost_record_path = RECORD_DIR + "sensor_fail_boost"
    if cause_boost_count != 0:
        if not os.path.isfile(boost_record_path):
            sensor_fail_boost_record = open(boost_record_path, "w")
            sensor_fail_boost_record.close()
    else:
        if os.path.isfile(boost_record_path):
            os.remove(boost_record_path)

    if not exprout:
        exprout = 0
    if exprout < outmin:
        exprout = outmin
    else:
        if no_sane_flag != 1:
            mode = fan_mode["normal_mode"]
    self.get_set_fan_mode(mode, action="write")
    exprout = clamp(exprout, 0, 100)
    return exprout
global wdfile board_callout(callout='init_fans', boost=DEFAULT_INIT_TRANSITIONAL) Logger.warn("killed by signal %d" % (signum,)) if signum == signal.SIGQUIT and wdfile: Logger.info("Killed with SIGQUIT - stopping watchdog.") wdfile.write(b"X") wdfile.flush() wdfile.close() wdfile = None sys.exit('killed') if __name__ == "__main__": try: signal.signal(signal.SIGTERM, handle_term) signal.signal(signal.SIGINT, handle_term) signal.signal(signal.SIGQUIT, handle_term) if len(sys.argv) > 1: llevel = sys.argv[1] else: llevel = 'warning' fscd = Fscd(log_level=llevel) fscd.run() except Exception: board_callout(callout='init_fans', boost=DEFAULT_INIT_TRANSITIONAL) (etype, e) = sys.exc_info()[:2] Logger.crit("failed, exception: " + str(etype)) traceback.print_exc() for line in traceback.format_exc().split('\n'): Logger.crit(line)
def run(self, sensors, dt):
    """
    Evaluate this zone's fan-speed expression for one cycle.

    For every variable in expr_meta['ext_vars'] (formatted "board:sname")
    the sensor is optionally gated by fsc_board.sensor_valid_check, its
    value is copied into the evaluation context, and its status may raise
    the minimum output (transitional for 'ucr', boost for 'na'). The
    expression is then evaluated, the SSD offset algorithm is applied and
    the result is passed through clamp(..., 0, 100).

    Arguments:
        sensors: indexed as sensors[board][sname]; entries expose value,
                 status and name
        dt: stored in the evaluation context under 'dt'
    Return:
        the clamped expression output
    """
    ctx = {'dt': dt}
    outmin = 0
    fail_ssd_count = 0
    sensor_index = 0

    for v in self.expr_meta['ext_vars']:
        sensor_valid_flag = 1
        board, sname = v.split(":")
        if self.sensor_valid_check != None:
            for check_name in self.sensor_valid_check:
                if re.match(check_name, sname, re.IGNORECASE) != None:
                    self.sensor_valid_cur[sensor_index] = fsc_board.sensor_valid_check(board, sname, check_name, self.sensor_valid_check[check_name]["attribute"])
                    # If current or previous sensor valid status is 0, ignore this sensor reading.
                    # Only when both are 1, goes to sensor check process
                    if (self.sensor_valid_cur[sensor_index] == 0) or (self.sensor_valid_pre[sensor_index] == 0):
                        sensor_valid_flag = 0
                        self.missing_sensor_assert_retry[sensor_index] = 0
                    # Only the first matching check pattern is applied.
                    break

        if sensor_valid_flag == 1:
            if sname in sensors[board]:
                self.missing_sensor_assert_retry[sensor_index] = 0
                if self.missing_sensor_assert_flag[sensor_index]:
                    Logger.crit('DEASSERT: Zone%d Missing sensors: %s' % (self.counter, v))
                    self.missing_sensor_assert_flag[sensor_index] = False

                sensor = sensors[board][sname]
                ctx[v] = sensor.value
                if sensor.status in ['ucr']:
                    Logger.warn('Sensor %s reporting status %s' % (sensor.name, sensor.status))
                    outmin = max(outmin, self.transitional)
                else:
                    if self.sensor_fail == True:
                        if (sensor.status in ['na']) and (self.sensor_valid_cur[sensor_index] != -1):
                            # Names matching the first pattern boost at once;
                            # other SSD/nvme sensors are only counted for the
                            # offset algorithm below.
                            if re.match(r'.+_C[2-4]_[0-3]_NVME_.+', sensor.name) != None:
                                Logger.warn("%s Fail" % v)
                                outmin = max(outmin, self.boost)
                            elif re.match(r'SSD', sensor.name) != None or re.match(r'(.*)nvme(.*)', sname) != None:
                                fail_ssd_count = fail_ssd_count + 1
                            else:
                                Logger.warn("%s Fail" % v)
                                outmin = max(outmin, self.boost)
            else:
                # Missing sensor: assert only once the retry count reaches 2.
                if (not self.missing_sensor_assert_flag[sensor_index]) and (self.missing_sensor_assert_retry[sensor_index] >= 2):
                    Logger.crit('ASSERT: Zone%d Missing sensors: %s' % (self.counter, v))
                    self.missing_sensor_assert_flag[sensor_index] = True
                if (self.missing_sensor_assert_retry[sensor_index] < 2):
                    self.missing_sensor_assert_retry[sensor_index] += 1
                # evaluation tries to ignore the effects of None values
                # (e.g. acts as 0 in max/+)
                ctx[v] = None
        # Remember this cycle's validity for the next cycle's gating.
        self.sensor_valid_pre[sensor_index] = self.sensor_valid_cur[sensor_index]
        sensor_index += 1

    if verbose:
        (exprout, dxstr) = self.expr.dbgeval(ctx)
        Logger.info(dxstr + " = " + str(exprout))
    else:
        exprout = self.expr.eval(ctx)
        Logger.info(self.expr_str + " = " + str(exprout))
    # If *all* sensors in the top level max() report None, the
    # expression will report None
    if (not exprout) and (outmin == 0):
        if not self.transitional_assert_flag:
            Logger.crit('ASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed.' % (self.counter))
        exprout = self.transitional
        self.transitional_assert_flag = True
    else:
        if self.transitional_assert_flag:
            Logger.crit('DEASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed.' % (self.counter))
        self.transitional_assert_flag = False

    if self.fail_sensor_type != None:
        if 'SSD_sensor_fail' in list(self.fail_sensor_type.keys()):
            if self.fail_sensor_type['SSD_sensor_fail'] == True:
                if fail_ssd_count != 0:
                    if self.ssd_progressive_algorithm != None:
                        if 'offset_algorithm' in list(self.ssd_progressive_algorithm.keys()):
                            # Entries are (max failed count, offset) pairs;
                            # the first entry covering fail_ssd_count adds
                            # its offset. Past the last entry, boost.
                            list_index = 0
                            for i in self.ssd_progressive_algorithm['offset_algorithm']:
                                list_index = list_index + 1
                                if fail_ssd_count <= i[0]:
                                    exprout = exprout + i[1]
                                    break
                                else:
                                    if list_index == len(self.ssd_progressive_algorithm['offset_algorithm']):
                                        outmin = max(outmin, self.boost)

    if not exprout:
        exprout = 0
    if exprout < outmin:
        exprout = outmin
    exprout = clamp(exprout, 0, 100)
    return exprout
def run(self, sensors, dt):
    """
    Evaluate this zone's fan-speed expression for one cycle.

    For every variable in expr_meta['ext_vars'] (formatted "board:sname")
    the sensor value is copied into the evaluation context (None when the
    sensor is missing). A 'ucr' status sets the minimum output to the
    transitional speed; an 'na' status is handled according to the
    fail_sensor_type configuration and may set the minimum to boost. The
    expression is then evaluated, the SSD offset algorithm is applied and
    the result is passed through clamp(..., 0, 100).

    Arguments:
        sensors: indexed as sensors[board][sname]; entries expose value,
                 status and name
        dt: stored in the evaluation context under 'dt'
    Return:
        the clamped expression output
    """
    ctx = {'dt': dt}
    outmin = 0
    fail_ssd_count = 0
    missing = set()

    for v in self.expr_meta['ext_vars']:
        board, sname = v.split(":")
        if sname in sensors[board]:
            sensor = sensors[board][sname]
            ctx[v] = sensor.value
            if sensor.status in ['ucr']:
                Logger.warn('Sensor %s reporting status %s' %
                            (sensor.name, sensor.status))
                outmin = self.transitional
            if self.fail_sensor_type != None:
                if 'standby_sensor_fail' in self.fail_sensor_type.keys():
                    if self.fail_sensor_type[
                            'standby_sensor_fail'] == True:
                        if sensor.status in ['na']:
                            if re.match(r'SOC', sensor.name) != None:
                                # SOC sensors only count as failed when
                                # get_power_status(board) is truthy.
                                if 'server_sensor_fail' in self.fail_sensor_type.keys(
                                ):
                                    if self.fail_sensor_type[
                                            'server_sensor_fail'] == True:
                                        ret = fsc_board.get_power_status(
                                            board)
                                        if ret:
                                            Logger.debug(
                                                "Server Sensor Fail")
                                            outmin = self.boost
                                            # Stops scanning the remaining
                                            # variables.
                                            break
                            elif re.match(r'SSD', sensor.name) != None:
                                # SSD failures are only counted here; they
                                # are handled by the offset algorithm below.
                                if 'SSD_sensor_fail' in self.fail_sensor_type.keys(
                                ):
                                    if self.fail_sensor_type[
                                            'SSD_sensor_fail'] == True:
                                        fail_ssd_count = fail_ssd_count + 1
                            else:
                                Logger.debug("Standby Sensor Fail")
                                outmin = self.boost
                                # Stops scanning the remaining variables.
                                break
        else:
            missing.add(v)
            # evaluation tries to ignore the effects of None values
            # (e.g. acts as 0 in max/+)
            ctx[v] = None
    if missing:
        Logger.warn('Missing sensors: %s' % (', '.join(missing), ))
    if verbose:
        (exprout, dxstr) = self.expr.dbgeval(ctx)
        Logger.info(dxstr + " = " + str(exprout))
    else:
        exprout = self.expr.eval(ctx)
        Logger.info(self.expr_str + " = " + str(exprout))
    # If *all* sensors in the top level max() report None, the
    # expression will report None
    if not exprout:
        if not self.transitional_assert_flag:
            Logger.crit('ASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed.' % (self.counter))
        exprout = self.transitional
        self.transitional_assert_flag = True
    else:
        if self.transitional_assert_flag:
            Logger.crit('DEASSERT: Zone%d No sane fan speed could be \
                calculated! Using transitional speed.'
                        % (self.counter))
        self.transitional_assert_flag = False

    if self.fail_sensor_type != None:
        if 'SSD_sensor_fail' in self.fail_sensor_type.keys():
            if self.fail_sensor_type['SSD_sensor_fail'] == True:
                if fail_ssd_count != 0:
                    if self.ssd_progressive_algorithm != None:
                        if 'offset_algorithm' in self.ssd_progressive_algorithm.keys(
                        ):
                            # Entries are (max failed count, offset) pairs;
                            # the first entry covering fail_ssd_count adds
                            # its offset. Past the last entry, boost.
                            list_index = 0
                            for i in self.ssd_progressive_algorithm[
                                    'offset_algorithm']:
                                list_index = list_index + 1
                                if fail_ssd_count <= i[0]:
                                    exprout = exprout + i[1]
                                    break
                                else:
                                    if list_index == len(
                                            self.ssd_progressive_algorithm[
                                                'offset_algorithm']):
                                        outmin = self.boost

    if exprout < outmin:
        exprout = outmin
    exprout = clamp(exprout, 0, 100)
    return exprout
def run(self, sensors, ctx, ignore_mode):
    """
    Evaluate this zone's fan-speed expression for one cycle.

    For every variable in expr_meta["ext_vars"] (formatted "board:sname")
    the sensor is optionally gated by fsc_board.sensor_valid_check, its
    value is copied into ctx, and its status may raise the minimum output
    (transitional for "ucr", boost for "na"). The expression is then
    evaluated, the M.2 population rules and the SSD offset algorithm are
    applied, record files are created or removed, and the result is passed
    through clamp(..., 0, 100).

    Arguments:
        sensors: indexed as sensors[board][sname]; entries expose value,
                 status and name
        ctx: evaluation context; updated in place with one entry per
             variable
        ignore_mode: when truthy, the fan mode is not written through
                     get_set_fan_mode
    Return:
        the clamped expression output
    """
    outmin = 0
    fail_ssd_count = 0
    valid_m2_count = 0
    sensor_index = 0
    cause_boost_count = 0
    no_sane_flag = 0
    display_progressive_flag = 0
    mode = 0

    for v in self.expr_meta["ext_vars"]:
        sensor_valid_flag = 1
        sdata = v.split(":")
        board = sdata[0]
        sname = sdata[1]
        if self.sensor_valid_check != None:
            for check_name in self.sensor_valid_check:
                if re.match(check_name, sname, re.IGNORECASE) != None:
                    self.sensor_valid_cur[
                        sensor_index] = fsc_board.sensor_valid_check(
                            board,
                            sname,
                            check_name,
                            self.sensor_valid_check[check_name]
                            ["attribute"],
                        )
                    # If current or previous sensor valid status is 0, ignore this sensor reading.
                    # Only when both are 1, goes to sensor check process
                    if (self.sensor_valid_cur[sensor_index]
                            == 0) or (self.sensor_valid_pre[sensor_index]
                                      == 0):
                        sensor_valid_flag = 0
                        self.missing_sensor_assert_retry[sensor_index] = 0
                    # Only the first matching check pattern is applied.
                    break

        if sensor_valid_flag == 1:
            if sname in sensors[board]:
                self.missing_sensor_assert_retry[sensor_index] = 0
                if self.missing_sensor_assert_flag[sensor_index]:
                    Logger.crit("DEASSERT: Zone%d Missing sensors: %s" %
                                (self.counter, v))
                    self.missing_sensor_assert_flag[sensor_index] = False

                sensor = sensors[board][sname]
                ctx[v] = sensor.value
                # Count the readable sensors whose name matches
                # ".*temp_dev"; used by the M.2 population rules below.
                if re.match(r".*temp_dev", sname) != None:
                    valid_m2_count = valid_m2_count + 1
                if sensor.status in ["ucr"]:
                    Logger.warn("Sensor %s reporting status %s" %
                                (sensor.name, sensor.status))
                    outmin = max(outmin, self.transitional)
                    if outmin == self.transitional:
                        mode = fan_mode["trans_mode"]
                else:
                    if self.sensor_fail == True:
                        # One record file per failed sensor variable.
                        sensor_fail_record_path = SENSOR_FAIL_RECORD_DIR + v
                        if not os.path.isdir(SENSOR_FAIL_RECORD_DIR):
                            os.mkdir(SENSOR_FAIL_RECORD_DIR)
                        if (sensor.status in [
                                "na"
                        ]) and (self.sensor_valid_cur[sensor_index] != -1):
                            # SSD / M.2 failures are only counted here; the
                            # rules after evaluation decide what to do.
                            if (re.match(r"SSD", sensor.name)
                                    != None) or (re.match(
                                        r".*temp_dev", sname) != None):
                                fail_ssd_count = fail_ssd_count + 1
                                Logger.warn("M.2 Device %s Fail" % v)
                            else:
                                Logger.warn("%s Fail" % v)
                                outmin = max(outmin, self.boost)
                                cause_boost_count += 1
                            if not os.path.isfile(sensor_fail_record_path):
                                sensor_fail_record = open(
                                    sensor_fail_record_path, "w")
                                sensor_fail_record.close()
                            if outmin == self.boost:
                                mode = fan_mode["boost_mode"]
                        else:
                            if os.path.isfile(sensor_fail_record_path):
                                os.remove(sensor_fail_record_path)
            else:
                # Missing sensor: assert only once the retry count reaches 2.
                if (not self.missing_sensor_assert_flag[sensor_index]
                    ) and (self.missing_sensor_assert_retry[sensor_index]
                           >= 2):
                    Logger.crit("ASSERT: Zone%d Missing sensors: %s" %
                                (self.counter, v))
                    self.missing_sensor_assert_flag[sensor_index] = True
                if self.missing_sensor_assert_retry[sensor_index] < 2:
                    self.missing_sensor_assert_retry[sensor_index] += 1
                # evaluation tries to ignore the effects of None values
                # (e.g. acts as 0 in max/+)
                ctx[v] = None
        else:
            # Sensor is gated out this cycle: drop any stale fail record.
            if sname in sensors[board]:
                if self.sensor_fail == True:
                    sensor_fail_record_path = SENSOR_FAIL_RECORD_DIR + v
                    if os.path.isfile(sensor_fail_record_path):
                        os.remove(sensor_fail_record_path)
        # Remember this cycle's validity for the next cycle's gating.
        self.sensor_valid_pre[sensor_index] = self.sensor_valid_cur[
            sensor_index]
        sensor_index += 1

    if verbose:
        (exprout, dxstr) = self.expr.dbgeval(ctx)
        Logger.info(dxstr + " = " + str(exprout))
    else:
        exprout = self.expr.eval_driver(ctx)
        Logger.info(self.expr_str + " = " + str(exprout))
    # If *all* sensors in the top level max() report None, the
    # expression will report None
    if (not exprout) and (outmin == 0):
        if not self.transitional_assert_flag:
            Logger.crit(
                "ASSERT: Zone%d No sane fan speed could be calculated! Using transitional speed."
                % (self.counter))
        exprout = self.transitional
        mode = fan_mode["trans_mode"]
        no_sane_flag = 1
        self.transitional_assert_flag = True
    else:
        if self.transitional_assert_flag:
            Logger.crit(
                "DEASSERT: Zone%d No sane fan speed could be calculated! Using transitional speed."
                % (self.counter))
        self.transitional_assert_flag = False

    if self.fail_sensor_type != None:
        # Stays True only when the SSD offset algorithm below should run.
        progressive_mode = True
        if ("M2_sensor_fail" in list(self.fail_sensor_type.keys())) and (
                "M2_sensor_count" in list(self.fail_sensor_type.keys())):
            if (self.fail_sensor_type["M2_sensor_fail"] == True) and (
                    self.fail_sensor_type["M2_sensor_count"] > 0):
                if valid_m2_count == 0:
                    if fsc_board.all_slots_power_off() == False:
                        # Missing all module (no M.2 device)
                        outmin = max(outmin, self.boost)
                        cause_boost_count += 1
                        mode = fan_mode["boost_mode"]
                        progressive_mode = False
                    else:
                        # All slots power off, do not boost up
                        progressive_mode = False
                elif valid_m2_count != self.fail_sensor_type[
                        "M2_sensor_count"]:
                    # Missing some module (M.2 devices partially populated)
                    progressive_mode = False
                    cause_boost_count += 1
                else:
                    # M.2 devices fully populated
                    if cause_boost_count != 0:
                        # other boost reasons: e.g. other sensors (not M.2 devices' sensors) fail to read sensors
                        progressive_mode = False
                    else:
                        if fail_ssd_count != 0:
                            # M.2 devices progressive_mode
                            # handle M.2 devices/SSD fail to read case
                            cause_boost_count += 1  # show out sensor fail record
                            display_progressive_flag = (
                                1)  # do not override by normal mode
                            mode = fan_mode["progressive_mode"]
                        else:
                            # M.2 devices normal mode
                            progressive_mode = False

        if progressive_mode and ("SSD_sensor_fail" in list(
                self.fail_sensor_type.keys())):
            if self.fail_sensor_type["SSD_sensor_fail"] == True:
                if fail_ssd_count != 0:
                    if self.ssd_progressive_algorithm != None:
                        if "offset_algorithm" in list(
                                self.ssd_progressive_algorithm.keys()):
                            # Entries are (max failed count, offset) pairs;
                            # the first entry covering fail_ssd_count adds
                            # its offset. Past the last entry, boost.
                            list_index = 0
                            for i in self.ssd_progressive_algorithm[
                                    "offset_algorithm"]:
                                list_index = list_index + 1
                                if fail_ssd_count <= i[0]:
                                    exprout = exprout + i[1]
                                    no_sane_flag = 0
                                    break
                                else:
                                    if list_index == len(
                                            self.ssd_progressive_algorithm[
                                                "offset_algorithm"]):
                                        outmin = max(outmin, self.boost)
                                        cause_boost_count += 1
                                        if outmin == self.boost:
                                            mode = fan_mode["boost_mode"]

    # Record file is present exactly while cause_boost_count is non-zero.
    boost_record_path = RECORD_DIR + "sensor_fail_boost"
    if cause_boost_count != 0:
        if not os.path.isfile(boost_record_path):
            sensor_fail_boost_record = open(boost_record_path, "w")
            sensor_fail_boost_record.close()
    else:
        if os.path.isfile(boost_record_path):
            os.remove(boost_record_path)

    if not exprout:
        exprout = 0
    if exprout < outmin:
        exprout = outmin
    else:
        if (no_sane_flag != 1) and (display_progressive_flag != 1):
            mode = fan_mode["normal_mode"]
    if not ignore_mode:
        self.get_set_fan_mode(mode, action="write")
    exprout = clamp(exprout, 0, 100)
    return exprout