def hot_add_drive(self, hd):
    """Try to add each partition of drive `hd` that belongs to this array.

    The partitions living on the drive are looked up in this array's
    device list by the drive's port number; nothing prevents a disk from
    having several partitions in the same array, so every match is
    handled.  An ``mdadm --add`` is only issued while the array reports
    itself degraded; when the array reports itself online a notice is
    printed instead.  Results are reported on stdout, nothing is returned.
    """
    for member in self.__device_list.find_devices_on_disk(hd.portnum):
        part_id = member.part_id
        # the raid drive is the logical device associated with the
        # device config entry for this partition
        logical_dev = member.get_logical_device()
        partition_dev = 'disk%sp%s' % (hd.portnum, part_id)

        if self.is_degraded():
            add_cmd = '/sbin/mdadm --add -d %s /dev/%s /dev/%s' % (
                logical_dev, self.dev_name, partition_dev)
            rlog_debug('executing command %s' % add_cmd)
            added = run_shell_cmd(add_cmd, False) == 0
            if added:
                print('%s successfully added to array %s' % (
                    hd.get_devname(), self.dev_name))
            else:
                print('Failed to add drive [disk%s] to raid [%s]' % (
                    hd.portnum, self.dev_name))

        if self.is_online():
            print('%s is already online in array %s' % (
                hd.get_devname(), self.dev_name))
def purge_faulty_drives(self):
    """Remove member devices that md still lists as faulty.

    Walks the ``dev*`` entries under ``/sys/block/<array>/md/`` and, for
    every entry whose ``state`` reads ``faulty``, writes ``remove`` to
    that state file.  The scan is best effort: an entry whose state
    cannot be read is skipped, and an IOError/OSError while walking the
    directory ends the scan quietly so callers can keep fixing whatever
    other inconsistencies they find.
    """
    dev_entry_re = recompile("^dev")
    array_name = self.get_devname()
    md_rootdir = '/sys/block/%s/md/' % array_name
    try:
        for entry in listdir(md_rootdir):
            if not dev_entry_re.match(entry):
                continue
            state_entry = '%s%s/state' % (md_rootdir, entry)
            try:
                state = '%s' % get_sysfs_param(state_entry)
            except (IOError, OSError):
                # entry vanished or is unreadable; move on to the next one
                continue
            if state != "faulty":
                continue
            # we found a disk that should have been removed but wasn't
            rlog_debug('Cleaning up stale device [%s] reference in array [%s]' % (
                entry, array_name))
            if not set_sysfs_param(state_entry, 'remove'):
                # device first, array second, matching the message text
                rlog_notice('Unable to remove faulty device [%s] from array [%s]' % (
                    entry, array_name))
    except (IOError, OSError):
        # make sure we keep on going if we have a problem, we want to try
        # to fix any inconsistencies found
        pass
def write_config(self, system):
    """Write the drive / raid-array layout to the rrdm config file.

    The XML is first written to ``self.config_tmp_file`` and then renamed
    over ``self.config_file``.  ``self.disk_list`` and ``self.array_list``
    are rebuilt as ``(portnum, serialnum)`` and ``(name, uuid)`` tuples
    while writing.  ``self.config_valid`` is cleared on entry and only set
    to True after the rename succeeded.

    Raises:
        rrdm_error: if the temporary file cannot be written (IOError) or
            the rename onto the real config file fails.
    """
    name = self.config_tmp_file
    self.config_valid = False
    try:
        rlog_debug('Writing disk configuration to : ' + name)
        tfile = open(name, "w+")
        try:
            tfile.write('<rrdm version="1">\n')
            tfile.write('<config rebuild_rate="%s" auto_rebuild="%s"/>\n' % (
                system.get_rebuild_rate(), "true"))

            tfile.write('<drives>\n')
            self.disk_list = []
            for disk in system.disk_array.get_drive_list():
                # figure out the raid port corresponding to this drive.
                if not disk.is_missing() and disk.has_valid_superblock():
                    # read the rport from the SB
                    rport = '%s' % disk.superblock.get_raid_port()
                else:
                    # disk is missing, or present without a usable SB:
                    # fall back to the previously recorded raid port.
                    rport = self.get_disk_rport(disk.portnum)
                # NOTE(review): attribute values are written unescaped;
                # confirm serial numbers can never contain XML metacharacters.
                tfile.write('<disk port="%s" rport="%s" serial="%s"/>\n' % (
                    disk.portnum, rport, disk.serialnum))
                self.disk_list.append((disk.portnum, disk.serialnum))
            tfile.write('</drives>\n')

            tfile.write('<raid-arrays>\n')
            # reset the array list
            self.array_list = []
            for array in system.raid_arrays.get_array_list():
                tfile.write('<array name="%s" uid="%s"/>\n' % (
                    array.name, array.uuid))
                self.array_list.append((array.name, array.uuid))
            tfile.write('</raid-arrays>\n')
            tfile.write('</rrdm>\n')
        finally:
            # close the handle even when a write or lookup above raised
            tfile.close()
    except IOError:
        raise rrdm_error('unable to create configuration file %s' % name)

    try:
        rename(name, self.config_file)
    except Exception:
        raise rrdm_error('unable to update configuration file %s' % self.config_file)

    self.config_valid = True
def collect_rebuild_info(self):
    """Refresh the rebuild progress counters from sysfs.

    Reads ``/sys/block/<dev_name>/md/sync_completed``, expected to hold
    ``<completed>/<total>``, and stores the two numbers in
    ``self.sync_completed_kb`` and ``self.sync_total_kb``.  When the entry
    cannot be read or does not parse, the counters are reset to 0 and -1
    respectively.
    """
    try:
        sync_state = get_sysfs_param('/sys/block/%s/md/sync_completed' % self.dev_name)
        values = sync_state.strip().split("/")
        # parse into locals first so a half-parsed value never leaves the
        # two attributes out of step with each other
        completed = int(values[0])
        total = int(values[1])
    except (rrdm_error, IndexError, ValueError):
        # ValueError covers a non-numeric value (md reports "none" when no
        # sync is running)
        self.sync_total_kb = -1
        self.sync_completed_kb = 0
        return

    self.sync_completed_kb = completed
    self.sync_total_kb = total
    rlog_debug("Rebuild info : [%d:%d]" % (self.sync_completed_kb, self.sync_total_kb))
def add(self):
    """Add this partition to its RAID array with ``mdadm --add``.

    Nothing is attempted when ``self.is_ok()`` is already true; a notice
    is printed instead.  The outcome is reported on stdout and nothing is
    returned.
    """
    # the array this partition belongs to; used for the command and for
    # every message so the reported array name is the real one
    array_name = self.raid_array.get_devname()

    if self.is_ok():
        print('%s is already online in array %s' % (self.hd.get_devname(), array_name))
        return

    disk_dev = 'disk%sp%s' % (self.hd.portnum, self.part_id)
    # the raid drive slot is taken to be the disk's port number
    raid_drive = self.hd.portnum
    mdadm_cmd = '/sbin/mdadm --add -d %s /dev/%s /dev/%s' % (raid_drive, array_name, disk_dev)
    rlog_debug('executing command %s' % mdadm_cmd)
    if run_shell_cmd(mdadm_cmd, False) != 0:
        print('Failed to add drive [disk%s] to raid [%s]' % (self.hd.portnum, array_name))
    else:
        print('%s successfully added to array %s' % (self.hd.get_devname(), array_name))
def check_consistency(self):
    """Compare the raid device read from the md superblock with the disk's SB.

    Returns False only when the device node ``/dev/<dev_name>`` exists and
    the rdev parsed from its brief md superblock differs from the raid
    port stored in the drive's own superblock.  A missing device, or a
    device node that does not exist, is reported as consistent (True).
    """
    if self.is_missing():
        return True

    rlog_debug("Checking consistancy on %s" % self.dev_name)
    device_path = "/dev/%s" % self.dev_name
    if not exists(device_path):
        return True

    md_rdev = get_rdev_from_brief_sb(read_brief_md_sb(device_path))
    stored_rdev = self.hd.superblock.get_raid_port()
    rlog_debug("%s consistency [%s:%s]" % (self.get_devname(), md_rdev, stored_rdev))
    # NOTE(review): plain != comparison; confirm both sides use the same
    # type (string vs. int) or this will always report a mismatch.
    if md_rdev != stored_rdev:
        return False
    return True
def add(self):
    """Wipe the partition's md superblock, then add it to its RAID array.

    Nothing is attempted when ``self.is_ok()`` is already true; a notice
    is printed instead.  Otherwise ``mdadm --zero-superblock`` is run on
    the partition, followed by ``mdadm --add``; the add is attempted even
    if zeroing the superblock failed.  Outcomes are reported on stdout and
    nothing is returned.
    """
    # the array this partition belongs to; used for the command and for
    # every message so the reported array name is the real one
    array_name = self.raid_array.get_devname()

    if self.is_ok():
        print("%s is already online in array %s" % (self.hd.get_devname(), array_name))
        return

    disk_dev = "disk%sp%s" % (self.hd.portnum, self.part_id)
    # the raid drive slot is taken to be the disk's port number
    raid_drive = self.hd.portnum

    mdadm_cmd = "/sbin/mdadm --zero-superblock /dev/%s" % (disk_dev)
    rlog_debug("executing command %s" % mdadm_cmd)
    if run_shell_cmd(mdadm_cmd, False) != 0:
        print("Failed to zero superblock on [%s]" % (disk_dev))
    else:
        print("Successfully wiped out the superblock on [%s]" % (disk_dev))

    mdadm_cmd = "/sbin/mdadm --add -d %s /dev/%s /dev/%s" % (raid_drive, array_name, disk_dev)
    rlog_debug("executing command %s" % mdadm_cmd)
    if run_shell_cmd(mdadm_cmd, False) != 0:
        print("Failed to add drive [disk%s] to raid [%s]" % (self.hd.portnum, array_name))
    else:
        print("%s successfully added to array %s" % (self.hd.get_devname(), array_name))
def fail_disk(self, hd_target):
    """Fail drive `hd_target` out of every array that contains it.

    For each RAID array the matching member device (if any) is failed; an
    rrdm_error from that step is taken to mean the member was already
    failed and is skipped.  Matching devices in the ftraid arrays are
    failed next, and finally the disk itself is failed through
    ``self.__fail_disk``; an rrdm_error there is logged as a warning
    rather than propagated.
    """
    for array in self.raid_arrays.get_array_list():
        # kept inside the loop on purpose: the LED is only lit when there
        # is at least one raid array to process
        if self.supports_disk_led_control():
            hd_target.turn_on_led()

        rpart = array.find_dev_by_hd(hd_target)
        rlog_debug('found %s' % rpart)
        if rpart is None:
            continue

        rlog_notice('Failing drive : %s' % rpart.device_name)
        try:
            rpart.fail()
        except rrdm_error:
            # couldn't fail the drive.., already failed.
            continue

    for ftsarr in self.__ftraid_arrays:
        ft_dev = ftsarr.find_dev_by_hd(hd_target)
        if ft_dev is not None:
            # NOTE(review): unlike the raid members above, an rrdm_error
            # here propagates and skips __fail_disk; confirm that is intended.
            ft_dev.fail()

    try:
        self.__fail_disk(hd_target)
    except rrdm_error:
        rlog_warning("Could not fail disk [%s]" % hd_target)
def fill_from_system_info(self, device_list, name, dev_name, fstype, type, layout, level, cfg_size_mb, sysfscfg_list = None):
    """Populate this array object from its configuration and live sysfs state.

    Stores the supplied configuration values, determines the array
    status, collects rebuild progress when the array is rebuilding, reads
    the array uuid from sysfs when the array is not stopped, and then
    appends one RaidPartition to ``self.part_list`` (incrementing
    ``self.found_devices``) for every device in `device_list` whose
    logical device is known.

    Args:
        device_list: device collection for this array; must provide
            ``get_expected_drives()`` and ``get_devices()``.
        name, dev_name, fstype, type, layout, level, cfg_size_mb: stored
            unchanged on the instance.
        sysfscfg_list: optional list stored on the instance; a fresh
            empty list is used when omitted so instances never share one.
    """
    if sysfscfg_list is None:
        sysfscfg_list = []

    self.dev_name = dev_name
    self.name = name
    self.fstype = fstype
    self.type = type
    self.layout = layout
    self.level = level
    self.cfg_size_mb = cfg_size_mb
    self.__device_list = device_list
    self.__sysfscfg_list = sysfscfg_list

    # currently we expect each raid to go across all drives
    # in the system
    self.num_drives = self.__device_list.get_expected_drives()
    self.status = self.determine_array_status()

    if self.is_rebuilding():
        self.collect_rebuild_info()

    if not self.is_stopped():
        try:
            self.uuid = get_sysfs_param('/sys/block/%s/md/uuid' % self.dev_name)
        except rrdm_error:
            self.uuid = ''

    rlog_debug('raid status for [%s] is [%s]' % (self.dev_name, self.status))

    for diskpart in self.__device_list.get_devices():
        part_num = diskpart.part_id
        disk = diskpart.hd
        rlog_debug('adding disk device for raid array [%s] part [%s]' % (
            self.dev_name, part_num))
        rpart = diskpart.get_devname()

        # This is an assumption that should hold true even on old boxes:
        # the raid port equals the logical port carried by the device in
        # the drive list.  Originally the raid port was simply encoded in
        # the rvbd SB as the drive number.  That would be an issue if
        # moving disks around were supported (each drive could then be a
        # different rdev in a number of arrays, which the SB does not
        # store well), but as that is not supported today we should be ok.
        #
        # An earlier, now retired, implementation read the raid port from
        # the riverbed superblock and fell back to mdadm's brief
        # superblock output when the disk had none.
        rdev = diskpart.get_logical_device()
        if rdev == 'unknown':
            continue

        rlog_debug('disk [%s] is [%s] raid drive [%s]' % (
            rpart, self.dev_name, rdev))

        base_dev = hwtool_disk_map.find_devname_by_port(disk.portnum)
        base_devname = '%s%s' % (base_dev, part_num)
        path = '/sys/block/%s/md/dev-%s/state' % (self.dev_name, base_devname)
        try:
            disk_state = get_sysfs_param(path)
            disk_status = convert_md_status_to_rrdm(disk_state)
        except IOError:
            # NOTE(review): the uuid read above expects rrdm_error from
            # get_sysfs_param while this one expects IOError; confirm
            # which exception the helper actually raises.
            disk_status = 'failed'

        newpart = RaidPartition()
        newpart.make_partition(part_num, disk, self, rdev, disk_status)
        newpart.device_name = '%s' % rpart
        self.part_list.append(newpart)
        self.found_devices = self.found_devices + 1
def fail(self):
    """Fail this device out of its RAID array through sysfs.

    Failing is a two stage process: the member's md ``state`` file is
    first written with ``faulty`` and then with ``remove``.  Once a drive
    has been failed it disappears from sysfs, so it can only be failed
    once, and the md device name is read before anything is written.

    When ``self.raid_port`` is ``'unknown'`` the raid port is looked up in
    the saved system configuration; if that configuration is not valid a
    notice is logged and nothing is done.  The write sequence is attempted
    up to three times; if the sysfs entry is still present afterwards a
    debug message is logged.

    Raises:
        rrdm_error: if the ``rd<port>/device`` sysfs entry cannot be read.
    """
    array_name = self.raid_array.get_devname()

    # XXX currently assumes that the disk in port X is raid X
    if self.raid_port == 'unknown':
        # if this drive isnt in the system assume its on the hard drive.
        rlog_debug('drive has been removed using drive-raid map')
        sysconfig = SystemConfig()
        if not sysconfig.is_config_valid():
            # if we don't know which raid port to fail, don't just
            # continue on.  skip out and log a msg.  The disk's own port
            # is reported because no raid port has been resolved yet.
            rlog_notice('Unable to determie rport when failing disk [%s]' % self.hd.portnum)
            return
        portnum = sysconfig.get_disk_rport(self.hd.portnum)
    else:
        portnum = self.raid_port

    state_cmd = "faulty"
    remove_cmd = "remove"
    max_attempts = 3

    md_devname_path = '/sys/block/%s/md/rd%s/device' % (array_name, portnum)
    try:
        md_dev_name = get_sysfs_param(md_devname_path)
    except IOError:
        raise rrdm_error('unable to read raid device : %s' % md_devname_path)

    # use the device name indicated by RAID, since if the drive is
    # missing, md might still have a reference to the device, but we
    # don't have a scsi device to use to figure out what the name of the
    # device that used to be in the array is
    md_state_path = '/sys/block/%s/md/dev-%s/state' % (array_name, md_dev_name)
    rlog_notice('Failing array [%s] device [%s:%s]' % (array_name, portnum, md_dev_name))

    # every pass counts as an attempt, so the loop stays bounded even when
    # the writes succeed but the entry refuses to go away
    for _attempt in range(max_attempts):
        try:
            if exists(md_state_path):
                for cmd in (state_cmd, remove_cmd):
                    sys_file = open(md_state_path, "w")
                    try:
                        sys_file.write(cmd)
                    finally:
                        sys_file.close()
                    # grace period to allow the request to complete
                    sleep(0.5)
            # bail out: it was failed already or we succeeded.  if the
            # drive is not really gone, go around again.
            if not exists(md_state_path):
                break
        except IOError:
            # write failed; retry
            pass

    if exists(md_state_path):
        rlog_debug('Unable to fail %s on %s with cmd [%s:%s]' % (
            self.raid_port, array_name, md_state_path, remove_cmd))