def determine_array_status(self):
    """Return the status string for this array.

    Possible results are 'stopped', 'online', 'degraded' and 'rebuilding'.
    """
    # The existence of the block device node is checked first; a missing
    # node means the array is not running.
    try:
        stat('/dev/%s' % self.dev_name)
    except OSError:
        return 'stopped'

    # Linear arrays do not support the sync/failed-disk attributes and do
    # not maintain per-drive raid state, so an existing device is simply
    # treated as online.
    if self.level == 'linear':
        return 'online'

    try:
        # array_state is read although its value is not used: failing to
        # read it reports the array as stopped, exactly like the other two
        # attributes.
        get_sysfs_param('/sys/block/%s/md/array_state' % self.dev_name)
        resync_activity = get_sysfs_param(
            '/sys/block/%s/md/sync_action' % self.dev_name)
        failed_count = int(
            get_sysfs_param('/sys/block/%s/md/failed_disks' % self.dev_name),
            10)
    except IOError:
        return 'stopped'

    # No failed members: healthy.  Otherwise the sync action tells us
    # whether a rebuild is in progress.
    if failed_count == 0:
        return 'online'
    if resync_activity == 'idle':
        return 'degraded'
    return 'rebuilding'
def purge_faulty_drives(self):
    """Remove faulty member devices that this array still references.

    Walks the entries whose name starts with ``dev`` under
    ``/sys/block/<array>/md/`` and writes ``remove`` to the ``state`` file
    of every member whose state reports ``faulty``.

    This is best effort: an unreadable directory or member is skipped, and
    a failed removal is logged, so that one bad entry does not prevent the
    remaining members from being cleaned up.  Nothing is returned.
    """
    array_name = self.get_devname()
    md_rootdir = '/sys/block/%s/md/' % array_name

    try:
        entries = listdir(md_rootdir)
    except (IOError, OSError):
        # No md directory to inspect, so there is nothing to clean up.
        return

    for entry in entries:
        if not entry.startswith('dev'):
            continue

        state_entry = '%s%s/state' % (md_rootdir, entry)
        try:
            state = '%s' % get_sysfs_param(state_entry)
        except (IOError, OSError):
            # ignore and continue with the next member
            continue

        # The md state attribute may hold several comma separated flags, so
        # look for 'faulty' among them instead of comparing the whole
        # string.
        flags = [flag.strip() for flag in state.split(',')]
        if 'faulty' not in flags:
            continue

        # we found a disk that should have been removed but wasnt
        rlog_debug('Cleaning up stale device [%s] reference in array [%s]' % (
            entry, array_name))

        try:
            removed = set_sysfs_param(state_entry, 'remove')
        except (IOError, OSError):
            # Keep going; we still want to try the remaining members.
            removed = False

        if not removed:
            rlog_notice('Unable to remove faulty device [%s] from array [%s]' % (
                entry, array_name))
def collect_rebuild_info(self):
    """Refresh the rebuild progress counters from sysfs.

    Reads ``/sys/block/<dev>/md/sync_completed``, which is expected to look
    like ``<completed>/<total>``, and stores the two numbers in
    ``self.sync_completed_kb`` and ``self.sync_total_kb``.

    When the value cannot be read or parsed (for example the attribute
    holds a non-numeric value), the counters fall back to
    ``sync_completed_kb = 0`` and ``sync_total_kb = -1``.
    """
    # NOTE(review): the kernel md documentation describes sync_completed in
    # sectors; confirm whether get_sysfs_param or a consumer converts to kB
    # as the attribute names suggest.
    try:
        sync_state = get_sysfs_param(
            '/sys/block/%s/md/sync_completed' % self.dev_name)
        values = sync_state.strip().split("/")
        completed_kb = int(values[0])
        total_kb = int(values[1])
    except (rrdm_error, IOError, IndexError, ValueError):
        # ValueError covers non-numeric content; IOError is treated the same
        # way determine_array_status treats an unreadable attribute.
        self.sync_total_kb = -1
        self.sync_completed_kb = 0
        return

    # Only publish the values once both parsed, so a half-parsed line never
    # leaves the two counters inconsistent.
    self.sync_completed_kb = completed_kb
    self.sync_total_kb = total_kb
    rlog_debug("Rebuild info : [%d:%d]" % (self.sync_completed_kb,
                                           self.sync_total_kb))
def fill_from_system_info(self):
    """Populate ``raid_port`` and ``raid_status`` for this partition.

    A drive whose ``hd.status`` is 'missing' is first marked as missing with
    port -1.  The slot recorded by md for this device is then read from
    sysfs regardless; if that read or its integer conversion fails for any
    reason, the partition is marked missing with port -1.
    """
    if self.hd.status == 'missing':
        self.raid_port, self.raid_status = -1, 'missing'

    slot_path = '/sys/block/%s/md/dev-%s/slot' % (self.raid_array.dev_name,
                                                  self.dev_name)
    try:
        slot = int(get_sysfs_param(slot_path), 10)
    except Exception:
        # Any failure to obtain a numeric slot is treated as a missing
        # member.
        slot = -1
        self.raid_status = 'missing'

    self.raid_port = slot
def fill_from_system_info(self, device_list, name, dev_name, fstype, type,
                          layout, level, cfg_size_mb, sysfscfg_list=None):
    """Populate this array object from its configuration and from sysfs.

    Parameters:
        device_list    -- object providing get_expected_drives() and
                          get_devices(); each device yields one member
                          partition of the array.
        name, dev_name, fstype, type, layout, level, cfg_size_mb
                       -- stored unchanged on the instance.
        sysfscfg_list  -- optional list stored on the instance; a fresh
                          empty list is used when omitted, so instances
                          never share one default list object.

    Side effects: sets ``status`` (and the rebuild counters while
    rebuilding), sets ``uuid`` when the array is not stopped, and appends
    one RaidPartition per usable device to ``part_list``, incrementing
    ``found_devices`` for each.
    """
    if sysfscfg_list is None:
        sysfscfg_list = []

    self.dev_name = dev_name
    self.name = name
    self.fstype = fstype
    self.type = type
    self.layout = layout
    self.level = level
    self.cfg_size_mb = cfg_size_mb
    self.__device_list = device_list
    self.__sysfscfg_list = sysfscfg_list

    # currently we expect each raid to go across all drives in the system
    self.num_drives = self.__device_list.get_expected_drives()

    self.status = self.determine_array_status()
    if self.is_rebuilding():
        self.collect_rebuild_info()

    if not self.is_stopped():
        try:
            self.uuid = get_sysfs_param('/sys/block/%s/md/uuid' % self.dev_name)
        except rrdm_error:
            self.uuid = ''

    rlog_debug('raid status for [%s] is [%s]' % (self.dev_name, self.status))

    for diskpart in self.__device_list.get_devices():
        part_num = diskpart.part_id
        disk = diskpart.hd

        rlog_debug('adding disk device for raid array [%s] part [%s]' %
                   (self.dev_name, part_num))

        rpart = diskpart.get_devname()

        # This is an assumption that should hold true even on old boxes:
        # the raid port equals the logical port carried by the device in
        # the drive list.  Originally the raid port was simply encoded in
        # the rvbd superblock as the drive number.  That would be an issue
        # if moving disks around were supported, because each drive could
        # then be a different rdev in a number of arrays and the superblock
        # does not store that well; as moving disks is not supported today,
        # this is ok.  (An earlier implementation read the raid port from
        # the superblock, falling back to mdadm's brief superblock output.)
        rdev = diskpart.get_logical_device()
        if rdev == 'unknown':
            continue

        rlog_debug('disk [%s] is [%s] raid drive [%s]' %
                   (rpart, self.dev_name, rdev))

        base_dev = hwtool_disk_map.find_devname_by_port(disk.portnum)
        base_devname = '%s%s' % (base_dev, part_num)
        path = '/sys/block/%s/md/dev-%s/state' % (self.dev_name, base_devname)
        try:
            disk_state = get_sysfs_param(path)
            disk_status = convert_md_status_to_rrdm(disk_state)
        except IOError:
            # No readable state entry for this member: report it as failed.
            disk_status = 'failed'

        newpart = RaidPartition()
        newpart.make_partition(part_num, disk, self, rdev, disk_status)
        newpart.device_name = '%s' % rpart
        self.part_list.append(newpart)
        self.found_devices += 1
def fail(self):
    """Fail this partition out of its raid array.

    Failing is a two stage process: the member's md ``state`` is set to
    ``faulty`` and then to ``remove``.  Once removed, the member's sysfs
    entry disappears, so a drive can only be failed once and the md device
    name is looked up before anything is written.

    The sequence is attempted at most three times.  If the member's state
    entry still exists afterwards, a debug message is logged.

    Returns None.  Returns early, after logging a notice, when the raid
    port is unknown and the system config is not valid.

    Raises:
        rrdm_error -- when the ``rd<port>/device`` entry of the array
                      cannot be read.
    """
    array_name = self.raid_array.get_devname()

    # XXX currently assumes that the disk in port X is raid X
    if self.raid_port == 'unknown':
        # the drive isnt in the system, fall back to the drive-raid map
        rlog_debug('drive has been removed using drive-raid map')
        sysconfig = SystemConfig()
        if not sysconfig.is_config_valid():
            # if we don't know which raid port to fail, don't just continue
            # on; log a msg and skip out.  The raid port was never
            # determined here, so the disk's own port number is reported.
            rlog_notice('Unable to determie rport when failing disk [%s]' %
                        self.hd.portnum)
            return
        portnum = sysconfig.get_disk_rport(self.hd.portnum)
    else:
        portnum = self.raid_port

    state_cmd = "faulty"
    remove_cmd = "remove"

    md_devname_path = '/sys/block/%s/md/rd%s/device' % (array_name, portnum)
    try:
        md_dev_name = get_sysfs_param(md_devname_path)
    except IOError:
        raise rrdm_error('unable to read raid device : %s' % md_devname_path)

    # use the device name indicated by RAID: if the drive is missing, md
    # might still hold a reference to the device while there is no scsi
    # device left to tell us what the member used to be called.
    md_state_path = '/sys/block/%s/md/dev-%s/state' % (array_name, md_dev_name)
    rlog_notice('Failing array [%s] device [%s:%s]' % (array_name,
                                                       portnum,
                                                       md_dev_name))

    def write_state(command):
        # Write one command to the member's md state file, always closing
        # the file again.
        sys_file = open(md_state_path, "w")
        try:
            sys_file.write(command)
        finally:
            sys_file.close()

    # Every pass counts against the budget, whether it raised or the member
    # simply survived the writes, so the loop always terminates.
    max_attempts = 3
    for _attempt in range(max_attempts):
        try:
            if exists(md_state_path):
                write_state(state_cmd)
                sleep(0.5)
                write_state(remove_cmd)

            # give a grace period to allow the request to complete
            sleep(0.5)

            # bail out when the member is gone: either it had failed
            # already or we succeeded.  Otherwise retry.
            if not exists(md_state_path):
                break
        except IOError:
            # retry with whatever attempts are left
            continue

    if exists(md_state_path):
        rlog_debug('Unable to fail %s on %s with cmd [%s:%s]' % (
            self.raid_port, array_name, md_state_path, remove_cmd))