def create_raid_array(self, dry=False):
    # if self.status != 'stopped' and not dry:
    #     raise rrdm_error('RAID device %s is already running' % self.dev_name)
    if self.found_devices == self.num_drives:
        dev_list = self.__form_mdadm_create_device_list()
        if len(dev_list) == 0:
            raise rrdm_error('Insufficient raid disks to create array [%s]' %
                             self.dev_name)
        opt_list = self.__form_mdadm_create_opt_list()
        command = self.__form_mdadm_cmd_line(opt_list, dev_list)
        if dry:
            print 'running:', command
        else:
            try:
                ret = run_shell_cmd(command)
            except rrdm_error:
                try:
                    ret = run_shell_cmd('/mfg/' + command)
                except rrdm_error:
                    raise rrdm_error('failed to start RAID with cmdline : %s' %
                                     command)
    else:
        raise rrdm_error('Unable to create raid array with missing disks [%d/%d]' %
                         (self.found_devices, self.num_drives))

    rlog_notice('Created array [%s:%s]' % (self.name, self.dev_name))
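# For reference, a hypothetical example of the command line the helpers above
# are expected to compose (the real option and device lists come from
# __form_mdadm_create_opt_list() and __form_mdadm_create_device_list(); the
# values shown here are illustrative only):
#
#   mdadm --create /dev/md0 --run --level=1 --raid-devices=2 /dev/sda2 /dev/sdb2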
def write_config(self, system):
    name = self.config_tmp_file
    self.config_valid = False
    try:
        rlog_debug('Writing disk configuration to : ' + name)
        tfile = open(name, "w+")
        tfile.write('<rrdm version="1">\n')
        tfile.write('<config rebuild_rate="%s" auto_rebuild="%s"/>\n' %
                    (system.get_rebuild_rate(), "true"))
        tfile.write('<drives>\n')
        self.disk_list = []
        for disk in system.disk_array.get_drive_list():
            # figure out the raid port corresponding to this drive.
            if not disk.is_missing():
                if disk.has_valid_superblock():
                    # read the rport from the SB
                    rport = '%s' % disk.superblock.get_raid_port()
                else:
                    # disk is in, but no rport, so we don't know what it
                    # is yet; should be the old one.
                    rport = self.get_disk_rport(disk.portnum)
            else:
                # disk is missing, so fill in the previous raid port.
                #
                rport = self.get_disk_rport(disk.portnum)
            tfile.write('<disk port="%s" rport="%s" serial="%s"/>\n' % (
                        disk.portnum, rport, disk.serialnum))
            self.disk_list.append((disk.portnum, disk.serialnum))
        tfile.write('</drives>\n')
        tfile.write('<raid-arrays>\n')
        # reset the array list
        self.array_list = []
        for array in system.raid_arrays.get_array_list():
            tfile.write('<array name="%s" uid="%s"/>\n' % (
                        array.name, array.uuid))
            self.array_list.append((array.name, array.uuid))
        tfile.write('</raid-arrays>\n')
        tfile.write('</rrdm>\n')
        tfile.close()
    except IOError:
        raise rrdm_error('unable to create configuration file %s' % name)

    try:
        rename(name, self.config_file)
    except Exception:
        raise rrdm_error('unable to update configuration file %s' %
                         self.config_file)

    self.config_valid = True
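# Illustrative only: the configuration file written above has roughly this
# shape (ports, serials and uids below are made-up values):
#
#   <rrdm version="1">
#   <config rebuild_rate="100000" auto_rebuild="true"/>
#   <drives>
#   <disk port="0" rport="0" serial="ABC123"/>
#   </drives>
#   <raid-arrays>
#   <array name="md0" uid="11111111:22222222:33333333:44444444"/>
#   </raid-arrays>
#   </rrdm>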
def add_sb_kvp(self, kvp):
    if len(kvp) != 2:
        raise rrdm_error('Invalid key value pair parameter')
    cmdline = '%s -a %s=%s %s' % (self.super_path, kvp[0], kvp[1], self.dev_name)
    err = run_shell_cmd(cmdline)
    if err != 0:
        raise rrdm_error('Unable to update superblock on %s' % self.dev_name)
    self.__sb_kvp[kvp[0]] = kvp[1]
def start_raid_array(self, dry=False):
    if self.status != "stopped" and not dry:
        # if it's already running, just return ok
        return

    plist = filter(lambda dev: not dev.hd.is_missing(), self.part_list)
    uuids = map(lambda dev: dev.raid_super().uuid(), plist)
    uuids = filter(lambda u: u != None and u != "", uuids)

    # keep the full list around so we can count how many members report
    # each uuid, then remove duplicates from the working list.
    all_uuids = uuids[:]
    uuids.sort()
    nuuids = []
    prev = None
    for u in uuids:
        if u != prev:
            nuuids.append(u)
        prev = u
    uuids = nuuids

    # get our "expected uuid"
    uuid = SystemConfig().get_array_uid(self.name)

    array_started = False
    while len(uuids) > 0:
        # first priority, our expected uuid...
        if uuids.count(uuid) > 0:
            u = uuid
        else:
            # next priority, the uuid reported by the most members...
            maxu = max(map(lambda a: all_uuids.count(a), uuids))
            u = filter(lambda a: all_uuids.count(a) == maxu, uuids)[0]
        uuids.remove(u)
        if self.__start_raid_array_with_uuid(u, dry):
            array_started = True
            break

    if not array_started:
        raise rrdm_error("failed to start RAID")
    else:
        # the raid array has started.  If this raid array is a vecache,
        # set the RAID disable_queue_plugging sysfs param for this array.
        if self.__sysfscfg_list != []:
            # set the sysfs params that disable queue plugging on RAID10
            # writes on the VE blockstore
            try:
                for entry in self.__sysfscfg_list:
                    cmd = "echo %s > %s/%s/%s" % (entry.value, entry.type,
                                                  self.dev_name, entry.path)
                    run_shell_cmd(cmd)
            except (IOError, OSError):
                raise rrdm_error('Could not set sysfs param disable_queue_plugging'
                                 ' for vecache device %s' % self.dev_name)
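# A minimal standalone sketch of the uuid selection policy used above,
# assuming plain string lists (hypothetical helper, not part of this module):
def _pick_array_uuid(candidate_uuids, member_uuids, expected_uuid):
    """Prefer the expected uuid; otherwise pick the uuid reported by the
    most array members."""
    if expected_uuid in candidate_uuids:
        return expected_uuid
    return max(candidate_uuids, key=lambda u: member_uuids.count(u))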
def sync_disk_sb(self):
    retries = 0
    while retries < 5 and not exists(self.dev_name):
        sleep(1)
        retries = retries + 1

    cmdline = '%s -u -s %s -p %d -r %d %s' % (self.super_path, self.serial,
                                              self.port_num, self.raid_port_num,
                                              self.dev_name)
    err = run_shell_cmd(cmdline)
    if err != 0:
        if err == errno.EINVAL:
            raise rrdm_error('%s: Invalid argument provided.' % self.super_path)
        raise rrdm_error('Unable to update superblock on %s' % self.dev_name)
def __fail_disk(self, hd_target):
    dconfig_path = '/config/disk'
    if not isdir(dconfig_path):
        try:
            mkdir(dconfig_path)
        except OSError:
            raise rrdm_error('Unable to create disk state directory %s' %
                             dconfig_path)

    dfile = '%s/disk%s_failed' % (dconfig_path, hd_target.portnum)
    try:
        open(dfile, "w").close()
    except IOError:
        raise rrdm_error('Unable to create disk state file: %s' % dfile)
def fail(self):
    if not isdir(self.__cfg_status_dir):
        try:
            mkdir(self.__cfg_status_dir)
        except OSError:
            raise rrdm_error('Unable to create disk state directory %s' %
                             self.__cfg_status_dir)
    try:
        open(self.__cfg_status_name, "w").close()
    except IOError:
        raise rrdm_error('Unable to create disk state file: %s' %
                         self.__cfg_status_name)
def __assemble_raid_array(self, uuid, dry):
    dev_list = self.__form_mdadm_assemble_device_list(uuid)
    opt_list = self.__form_mdadm_assemble_opt_list()
    cmd_line = self.__form_mdadm_cmd_line(opt_list, dev_list)

    started_array = False
    if dry:
        print 'Running:', cmd_line
    else:
        try:
            if len(dev_list) == 0:
                raise rrdm_error('Insufficient raid disks to start array [%s]' %
                                 self.dev_name)
            rlog_notice('Raid Assemble: [%s]' % cmd_line)
            run_shell_cmd(cmd_line)
            started_array = True
        except rrdm_error:
            rlog_notice('Failed to start array with command [%s]' % cmd_line)
            # md often leaves some badness around when it fails an assemble;
            # remove it.
            self.stop_raid_array(True)
            # since we failed assembly, MD sometimes leaves some state around.
            # rescan our raid state.
            self.determine_array_status()

    return started_array
def find_dev_by_raid_id(self, id):
    # if it's a failed raid drive, raid doesn't tell us where it came from.
    id_str = '%d' % id
    for part in self.part_list:
        if part.raid_port == id_str:
            return part
    raise rrdm_error('No raid device with raid id %s' % id)
def stop_raid_array(self, force=False):
    if force or not self.is_stopped():
        cmd_line = 'mdadm --stop /dev/%s' % self.dev_name
        try:
            run_shell_cmd(cmd_line)
        except rrdm_error:
            raise rrdm_error('failed to stop RAID with cmdline : %s' % cmd_line)
    else:
        print 'Array %s is already stopped' % self.dev_name
def __add_disk(self, hd_target):
    dconfig_path = '/config/disk'
    if not isdir(dconfig_path):
        try:
            mkdir(dconfig_path)
        except OSError:
            raise rrdm_error('Unable to create disk state directory %s' %
                             dconfig_path)

    dfile = '%s/disk%s_failed' % (dconfig_path, hd_target.portnum)
    if exists(dfile):
        remove(dfile)
def get_rebuild_rate(self):
    try:
        rate_file = open(self.raid_rebuild_max_proc, 'r')
        try:
            rate = rate_file.read()
        finally:
            rate_file.close()
    except (IOError, OSError):
        raise rrdm_error('Unable to read rebuild rate from proc')

    return rate.strip()
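# The proc file read above is typically md's global rebuild speed cap; on a
# stock Linux install that is (an assumption here -- the real path is whatever
# self.raid_rebuild_max_proc was configured to):
#
#   cat /proc/sys/dev/raid/speed_limit_max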
def check_consistency(self):
    in_sync = True
    for part in self.part_list:
        # only check actively in-sync drives, since MD will use a spare
        # drive number during the rebuild sequence, which can trick this
        # into thinking it's out of sync.
        if part.check_consistency() and not part.is_ok():
            in_sync = False
            rlog_notice('On array [%s] disk [%s] device [%s] is out of sync' %
                        (self.get_devname(), part.hd.get_devname(),
                         part.get_devname()))

    if not in_sync:
        rlog_notice('System may be vulnerable to single disk failure')
        raise rrdm_error('Array %s is not properly spread across system drives' %
                         self.get_devname())

    return in_sync
def fill_from_rvbd_super(self, wait_for_device=False):
    super_path = RVBD_SUPER_PATH
    if not exists(super_path):
        super_path = RVBD_MFG_SUPER_PATH
        if not exists(super_path):
            raise rrdm_error('Unable to locate rvbd_super tool.')

    retries = 0
    if wait_for_device:
        while not exists(self.dev_name) and retries < 3:
            sleep(1)
            retries += 1

    if not exists(self.dev_name):
        raise rrdm_error('Device does not exist %s' % self.dev_name)

    self.super_path = super_path

    cmdline = '%s -v %s' % (super_path, self.dev_name)
    try:
        output = run_shell_cmd(cmdline, True)
    except rrdm_error:
        raise rrdm_error('Unable to execute rvbd super tool.')

    if output == '':
        raise rrdm_error('No output returned from rvbd super tool.')

    ver_kvp = output.split('=')
    if ver_kvp[0] != 'version':
        raise rrdm_error('Invalid output returned from rvbd super tool')

    self.version = int(ver_kvp[1], 10)

    # we only do version 1 SBs today.
    # should probably abstract the routines as a class later if we need to
    # add more.
    #
    if self.version == 1:
        # we have a valid SB version, so just fetch the output.
        cmdline = '%s -g %s' % (super_path, self.dev_name)
        output = run_shell_cmd(cmdline, True)
        try:
            sb_lines = output.split('\n')
            for line in sb_lines:
                sb_kvp = line.split('=')
                self.update_from_kvp(sb_kvp)
        except IndexError:
            raise rrdm_error('invalid SB output returned from rvbd_super')
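# Illustrative only: the parsing above assumes rvbd_super emits key=value
# pairs -- '-v' prints a single 'version=...' line and '-g' prints one pair
# per line.  The field names below are made up for the example:
#
#   version=1
#   serial=ABC123
#   raid_port=0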
def fail(self):
    # once you've failed the disk it disappears from the sysfs entry, so you
    # can only fail a drive once; because of that, read the dev name first.
    #
    # failing is a 2 stage process: setting the drive to faulty and removing
    # it from the array.
    #
    array_name = self.raid_array.get_devname()

    # XXX currently assumes that the disk in port X is raid X
    #
    if self.raid_port == 'unknown':
        # if this drive isn't in the system, fall back to the drive-raid map.
        rlog_debug('drive has been removed, using drive-raid map')
        sysconfig = SystemConfig()
        if sysconfig.is_config_valid():
            portnum = sysconfig.get_disk_rport(self.hd.portnum)
        else:
            # if we don't know which raid port to fail, don't just continue
            # on.  skip out and log a msg.
            #
            rlog_notice('Unable to determine rport when failing disk [%s]' %
                        self.hd.portnum)
            return
    else:
        portnum = self.raid_port

    state_cmd = "faulty"
    remove_cmd = "remove"

    md_devname_path = '/sys/block/%s/md/rd%s/device' % (array_name, portnum)
    try:
        md_dev_name = get_sysfs_param(md_devname_path)
    except IOError:
        raise rrdm_error('unable to read raid device : %s' % md_devname_path)

    # use the device name indicated by RAID, since if the drive is missing,
    # md might still have a reference to the device, but we don't have a
    # scsi device to use to figure out the name of the device that used to
    # be in the array.
    md_state_path = '/sys/block/%s/md/dev-%s/state' % (array_name, md_dev_name)

    rlog_notice('Failing array [%s] device [%s:%s]' % (array_name, portnum,
                                                       md_dev_name))
    retries = 0
    while retries < 3:
        try:
            if exists(md_state_path):
                sys_file = open(md_state_path, "w")
                try:
                    sys_file.write(state_cmd)
                finally:
                    sys_file.close()
                sleep(0.5)
                sys_file = open(md_state_path, "w")
                try:
                    sys_file.write(remove_cmd)
                finally:
                    sys_file.close()
                # if we succeed, give a grace period to allow for the request
                # to complete.
                sleep(0.5)
            # bail out: it's failed already or we succeeded.
            # make sure the drive is really gone, and if it's not, retry.
            if not exists(md_state_path):
                break
        except IOError:
            pass
        retries += 1

    if exists(md_state_path):
        rlog_debug('Unable to fail %s on %s with cmd [%s:%s]' % (
                   self.raid_port, array_name, md_state_path, remove_cmd))
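# For reference, the two-stage removal above is equivalent to this shell
# sequence against md's sysfs interface (array and device names are
# illustrative):
#
#   echo faulty > /sys/block/md0/md/dev-sdb2/state
#   echo remove > /sys/block/md0/md/dev-sdb2/state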
def check_config(self):
    if self.num_arrays != self.expected_num_arrays:
        raise rrdm_error('Raid Configuration Mismatch')
def __init__(self, spec, model, mfg_mode=False, profile=None):
    self.spec = spec
    self.model = model
    self.appliance_serial = ''

    if profile == None or profile == '':
        # None if there is no storage profile or profiles are not supported
        self.__cur_sprofile = get_storage_profile()
    else:
        # if the user has specified a profile on the cmd line we'll use that
        # profile (most notably for the mfg option, where we want to
        # reconfigure a system for a new profile)
        self.__cur_sprofile = profile

    # if the user has not specified the storage profile, and we have not
    # read it from disk, and we're in mfg mode, then assume the storage
    # profile is the default one in the spec definition
    if self.spec.has_storage_cfg() and self.__cur_sprofile in [None, '']:
        self.__cur_sprofile = self.spec.get_default_scfg_name()
        if self.__cur_sprofile in [None, '']:
            raise AssertionError('Unable to determine storage profile')

    # we need to associate our view of the spec with the profile we are
    # currently configured for
    self.spec.set_spec_profile_view(self.__cur_sprofile)

    self.fill_appliance_serial()
    self.get_rebuild_rate()

    # grab the motherboard from hwtool
    self.motherboard = get_hwtool_motherboard()
    self.phy_mobo = get_hwtool_phy_motherboard()

    # gather all associated physical disk information.
    # should we fill this from the spec info, or the available system info,
    # or split it out as a query?
    self.disk_array = DiskArray()

    num_physical_disks = hwtool_disk_map.get_num_hard_disk()
    if num_physical_disks <= 0:
        # how did we boot?
        raise rrdm_error('Appliance detects an invalid number'
                         ' of disks %d.' % num_physical_disks)

    self.disk_array.fill_from_system_info(self.spec)

    self.volumes = []
    self.raid_arrays = RaidDevices()
    self.__ftraid_arrays = []
    self.__zone_map = {}

    # populate a zone map with pointers to physical disks in the zones.
    #
    if self.spec.has_storage_cfg() == False:
        # fall back to the legacy config mode, where the zones describe
        # exported volumes
        for zone in self.spec.get_zone_list():
            self.__legacy_volume_setup(zone)
    else:
        for z in self.spec.get_zone_list():
            dz = DiskZone(z, self.disk_array)
            self.__zone_map[dz.get_name()] = dz

        # the storage config / storage profile describes the set of
        # exported volumes
        lvm_list = self.spec.get_storage_cfg().get_logical_volumes(
                       self.__cur_sprofile)
        for lvm in lvm_list:
            if lvm.get_type() == LogicalVolumeConfig.direct_type:
                self.__lvm_direct_setup(lvm)
            if lvm.get_type() in LogicalVolumeConfig.raid_types:
                self.__lvm_raid_setup(lvm)
            if lvm.get_type() == LogicalVolumeConfig.ftraid_type:
                self.__lvm_ftraid_setup(lvm)

    # update drive status based on raid status.
    self.disk_array.update_status_by_zone(self.raid_arrays.get_array_list(),
                                          self.__ftraid_arrays)
def validate(self):
    (valid, msg) = validate_spec(self.spec, self.disk_array)
    if not valid:
        raise rrdm_error(msg)