Example #1
0
    def hot_add_drive(self, hd):
        """Try to add every partition of a hot-plugged disk to this array.

        Each device config found on the disk's port is handled in turn (one
        disk may carry several partitions of the same array).  While the
        array reports degraded, the partition is added with mdadm and the
        outcome is printed; while the array reports online, a notice is
        printed instead.

        hd -- the disk object; its portnum and get_devname() are used.
        """
        for dev_cfg in self.__device_list.find_devices_on_disk(hd.portnum):
            # The raid drive comes from the logical device tied to the device
            # config; the partition device is named after port + partition id.
            partition = dev_cfg.part_id
            logical_dev = dev_cfg.get_logical_device()
            target = 'disk%sp%s' % (hd.portnum, partition)

            if self.is_degraded():
                add_cmd = '/sbin/mdadm --add -d %s /dev/%s /dev/%s' % (
                    logical_dev, self.dev_name, target)
                rlog_debug('executing command %s' % add_cmd)

                status = run_shell_cmd(add_cmd, False)
                if status != 0:
                    outcome = 'Failed to add drive [disk%s] to raid [%s]' % (
                        hd.portnum, self.dev_name)
                else:
                    outcome = '%s successfully added to array %s' % (
                        hd.get_devname(), self.dev_name)
                print(outcome)

            if self.is_online():
                notice = '%s is already online in array %s' % (
                    hd.get_devname(), self.dev_name)
                print(notice)
Example #2
0
    def purge_faulty_drives(self):
        """Remove md member entries that sysfs still reports as faulty.

        Walks the dev-* entries under /sys/block/<array>/md/ and writes
        'remove' to the state file of every entry whose state reads
        "faulty".  This is best effort: an entry whose state cannot be read
        is skipped, and any other IOError/OSError ends the walk quietly.
        """
        dev_entry_re = recompile("^dev")
        array_name = self.get_devname()

        # walk the dev-sdX entries in the sysfs for a raid device and remove
        # any that are faulty.
        md_rootdir = '/sys/block/%s/md/' % array_name
        try:
            for entry in listdir(md_rootdir):
                if not dev_entry_re.match(entry):
                    continue

                state_entry = '%s%s/state' % (md_rootdir, entry)
                try:
                    state = '%s' % get_sysfs_param(state_entry)
                except (IOError, OSError):
                    # ignore and continue
                    continue

                if state != "faulty":
                    continue

                # we found a disk that should have been removed but wasn't
                rlog_debug('Cleaning up stale device [%s] reference in array [%s]' % (
                           entry, array_name))
                if not set_sysfs_param(state_entry, 'remove'):
                    # device first, array second, matching the message text
                    rlog_notice('Unable to remove faulty device [%s] from array [%s]' % (
                                entry, array_name))
        except (IOError, OSError):
            # make sure we keep on going if we have a problem, we want to try
            # to fix any inconsistencies found
            pass
Example #3
0
    def write_config(self, system):
        """Write the disk/array configuration as XML and install it.

        The document is written to self.config_tmp_file and then renamed to
        self.config_file.  self.disk_list and self.array_list are rebuilt
        from `system` as a side effect.  self.config_valid is cleared first
        and only set back to True once the rename has succeeded.

        system -- object providing get_rebuild_rate(), disk_array and
                  raid_arrays.

        Raises rrdm_error if the temporary file cannot be written or the
        rename fails.
        """
        name = self.config_tmp_file
        self.config_valid = False

        try:
            rlog_debug('Writing disk configuration to : ' + name)
            tfile = open(name, "w+")
            try:
                tfile.write('<rrdm version="1">\n')
                tfile.write('<config rebuild_rate="%s" auto_rebuild="%s"/>\n' % (
                            system.get_rebuild_rate(), "true"))
                tfile.write('<drives>\n')

                self.disk_list = []
                for disk in system.disk_array.get_drive_list():
                    # figure out the raid port corresponding to this drive.
                    if not disk.is_missing() and disk.has_valid_superblock():
                        # read the rport from the SB
                        rport = '%s' % disk.superblock.get_raid_port()
                    else:
                        # disk is missing, or present without a usable SB:
                        # fill in the previously recorded raid port.
                        rport = self.get_disk_rport(disk.portnum)

                    tfile.write('<disk port="%s" rport="%s" serial="%s"/>\n' % (
                                disk.portnum, rport, disk.serialnum))
                    self.disk_list.append((disk.portnum, disk.serialnum))

                tfile.write('</drives>\n')

                tfile.write('<raid-arrays>\n')
                # reset the array list
                self.array_list = []
                for array in system.raid_arrays.get_array_list():
                    tfile.write('<array name="%s" uid="%s"/>\n' % (
                                array.name, array.uuid))
                    self.array_list.append((array.name, array.uuid))

                tfile.write('</raid-arrays>\n')
                tfile.write('</rrdm>\n')
            finally:
                # close on every path so a failed write does not leak the
                # file handle
                tfile.close()
        except IOError:
            raise rrdm_error('unable to create configuration file %s' % name)

        try:
            rename(name, self.config_file)
        except Exception:
            raise rrdm_error('unable to update configuration file %s' %
                             self.config_file)

        self.config_valid = True
Example #4
0
    def collect_rebuild_info(self):
        """Read rebuild progress for this array from sysfs.

        Parses /sys/block/<dev_name>/md/sync_completed, expected to look like
        "<completed>/<total>", into self.sync_completed_kb and
        self.sync_total_kb.  When the value cannot be read or is not two
        integers (for example when md reports no numbers at all), the
        counters fall back to 0 completed and -1 total.
        """
        try:
            sync_state = get_sysfs_param('/sys/block/%s/md/sync_completed' % self.dev_name)
            values = sync_state.strip().split("/")
            completed_kb = int(values[0])
            total_kb = int(values[1])
        except (rrdm_error, IndexError, ValueError):
            # ValueError covers non-numeric content; without it a bad value
            # would propagate out of the status collection.
            self.sync_total_kb = -1
            self.sync_completed_kb = 0
            return

        self.sync_completed_kb = completed_kb
        self.sync_total_kb = total_kb
        rlog_debug("Rebuild info : [%d:%d]" % (self.sync_completed_kb, self.sync_total_kb))
Example #5
0
    def add(self):
        """Add this partition to its raid array with mdadm.

        When is_ok() is already true nothing is executed and a notice is
        printed.  Otherwise '/sbin/mdadm --add' is run for the partition
        device and the outcome is printed; a failing command is reported but
        not raised.
        """
        # The messages name the array the command operates on, so use the
        # array's device name throughout.
        array_name = self.raid_array.get_devname()

        if self.is_ok():
            print('%s is already online in array %s' % (self.hd.get_devname(), array_name))
            return

        disk_dev = 'disk%sp%s' % (self.hd.portnum, self.part_id)
        raid_drive = self.hd.portnum
        mdadm_cmd = '/sbin/mdadm --add -d %s /dev/%s /dev/%s' % (raid_drive, array_name, disk_dev)
        rlog_debug('executing command %s' % mdadm_cmd)

        if run_shell_cmd(mdadm_cmd, False) != 0:
            print('Failed to add drive [disk%s] to raid [%s]' % (self.hd.portnum, array_name))
        else:
            print('%s successfully added to array %s' % (self.hd.get_devname(), array_name))
Example #6
0
    def check_consistency(self):
        """Check that md and the disk superblock agree on the raid port.

        Returns True when the device is missing, when /dev/<dev_name> does
        not exist, or when the rdev parsed from the brief md superblock
        output does not differ from the raid port stored in the disk's
        superblock.  Returns False when the two differ.
        """
        if self.is_missing():
            return True

        rlog_debug("Checking consistancy on %s" % self.dev_name)
        device_path = "/dev/%s" % self.dev_name
        if not exists(device_path):
            return True

        md_rdev = get_rdev_from_brief_sb(read_brief_md_sb(device_path))
        stored_rdev = self.hd.superblock.get_raid_port()

        rlog_debug("%s consistency [%s:%s]" % (self.get_devname(), md_rdev, stored_rdev))
        return not (md_rdev != stored_rdev)
Example #7
0
    def add(self):
        """Wipe this partition's md superblock and add it to its raid array.

        When is_ok() is already true nothing is executed and a notice is
        printed.  Otherwise '/sbin/mdadm --zero-superblock' is run on the
        partition device, followed by '/sbin/mdadm --add'.  Both outcomes are
        printed; a failing command is reported but not raised, and the add is
        attempted even if zeroing the superblock failed.
        """
        # The messages name the array the command operates on, so use the
        # array's device name throughout.
        array_name = self.raid_array.get_devname()

        if self.is_ok():
            print("%s is already online in array %s" % (self.hd.get_devname(), array_name))
            return

        disk_dev = "disk%sp%s" % (self.hd.portnum, self.part_id)
        raid_drive = self.hd.portnum

        mdadm_cmd = "/sbin/mdadm --zero-superblock /dev/%s" % (disk_dev)
        rlog_debug("executing command %s" % mdadm_cmd)

        if run_shell_cmd(mdadm_cmd, False) != 0:
            print("Failed to zero superblock on [%s]" % (disk_dev))
        else:
            print("Successfully wiped out the superblock on [%s]" % (disk_dev))

        mdadm_cmd = "/sbin/mdadm --add -d %s /dev/%s /dev/%s" % (raid_drive, array_name, disk_dev)
        rlog_debug("executing command %s" % mdadm_cmd)

        if run_shell_cmd(mdadm_cmd, False) != 0:
            print("Failed to add drive [disk%s] to raid [%s]" % (self.hd.portnum, array_name))
        else:
            print("%s successfully added to array %s" % (self.hd.get_devname(), array_name))
Example #8
0
    def fail_disk(self, hd_target):
        """Fail hd_target out of every array that uses it, then fail the disk.

        For each raid array the matching device (if any) is failed; an
        rrdm_error from that call is ignored.  Matching devices in the
        ftraid arrays are failed next, and finally the disk itself is failed;
        an rrdm_error from that last step is logged as a warning.

        hd_target -- the disk object to fail.
        """
        for r in self.raid_arrays.get_array_list():
            # NOTE(review): the LED is switched on once per array and never
            # when there are no arrays; kept as-is, confirm whether it should
            # be done once up front instead.
            if self.supports_disk_led_control():
                hd_target.turn_on_led()

            rpart = r.find_dev_by_hd(hd_target)
            rlog_debug('found %s' % rpart)
            if rpart is None:
                continue

            rlog_notice('Failing drive : %s' % rpart.device_name)
            try:
                rpart.fail()
            except rrdm_error:
                # couldn't fail the drive.., already failed.
                continue

        for ftsarr in self.__ftraid_arrays:
            ft_dev = ftsarr.find_dev_by_hd(hd_target)
            if ft_dev is not None:
                ft_dev.fail()

        try:
            self.__fail_disk(hd_target)
        except rrdm_error:
            rlog_warning("Could not fail disk [%s]" % hd_target)
Example #9
0
    def fill_from_system_info(self,
                              device_list,
                              name,
                              dev_name,
                              fstype,
                              type,
                              layout,
                              level,
                              cfg_size_mb,
                              sysfscfg_list=None):
        """Populate this array object from the running system.

        Stores the given configuration values, determines the array status,
        collects rebuild progress when the array is rebuilding, reads the md
        uuid from sysfs unless the array is stopped, and appends one
        RaidPartition to self.part_list for every device in device_list
        whose logical device is known.

        device_list   -- device collection; must provide
                         get_expected_drives() and get_devices()
        name, dev_name, fstype, type, layout, level, cfg_size_mb
                      -- stored verbatim on the instance
        sysfscfg_list -- optional list kept on the instance; a fresh empty
                         list is used when omitted, so instances never share
                         one default list

        self.part_list and self.found_devices are updated in place and must
        already exist on the instance.
        """
        if sysfscfg_list is None:
            sysfscfg_list = []

        self.dev_name = dev_name
        self.name = name
        self.fstype = fstype
        self.type = type
        self.layout = layout
        self.level = level
        self.cfg_size_mb = cfg_size_mb

        self.__device_list = device_list
        self.__sysfscfg_list = sysfscfg_list

        # currently we expect each raid to go across all drives
        # in the system
        self.num_drives = self.__device_list.get_expected_drives()

        self.status = self.determine_array_status()
        if self.is_rebuilding():
            self.collect_rebuild_info()

        if not self.is_stopped():
            try:
                self.uuid = get_sysfs_param('/sys/block/%s/md/uuid' % self.dev_name)
            except rrdm_error:
                self.uuid = ''

        rlog_debug('raid status for [%s] is [%s]' % (self.dev_name, self.status))
        for diskpart in self.__device_list.get_devices():
            part_num = diskpart.part_id
            disk = diskpart.hd
            rlog_debug('adding disk device for raid array [%s] part [%s]' % (
                       self.dev_name, part_num))

            rpart = diskpart.get_devname()

            # This is an assumption that should hold true even on old boxes:
            # the raid port should equal the logical port carried by the
            # device in the drive list.
            # Originally we simply encoded the raid port in the rvbd SB as
            # the drive number.  This would be an issue if we supported
            # moving disks around, but as we don't support that today, we
            # should be ok.  The problem with moving drives around would be
            # that each drive physically could now be a different rdev in a
            # number of arrays, and the SB doesn't store this well today.
            #
            # (An earlier revision read the raid port from the riverbed
            # superblock, falling back to mdadm's brief superblock output;
            # that path is disabled.)
            rdev = diskpart.get_logical_device()

            if rdev == 'unknown':
                continue

            rlog_debug('disk [%s] is [%s] raid drive [%s]' % (
                       rpart, self.dev_name, rdev))

            base_dev = hwtool_disk_map.find_devname_by_port(disk.portnum)
            base_devname = '%s%s' % (base_dev, part_num)

            path = '/sys/block/%s/md/dev-%s/state' % (self.dev_name, base_devname)
            try:
                disk_state = get_sysfs_param(path)
                disk_status = convert_md_status_to_rrdm(disk_state)
            except IOError:
                # NOTE(review): the uuid read above catches rrdm_error from
                # the same helper; confirm which exception it really raises.
                disk_status = 'failed'

            newpart = RaidPartition()
            newpart.make_partition(part_num, disk, self, rdev, disk_status)
            newpart.device_name = '%s' % rpart

            self.part_list.append(newpart)
            self.found_devices += 1
Example #10
0
    def fail(self):
        """Mark this device faulty in its md array and remove it.

        Failing is a two stage process: 'faulty' and then 'remove' are
        written to the device's md state file in sysfs.  Once a drive has
        been failed it disappears from sysfs, so the md device name is read
        first.  Up to three attempts are made; if the state file still exists
        afterwards a debug message is logged.

        When self.raid_port is 'unknown' the raid port is looked up in the
        saved system configuration; if that configuration is not valid a
        notice is logged and nothing is done.

        Raises rrdm_error if the md device name cannot be read from sysfs.
        """
        array_name = self.raid_array.get_devname()

        # XXX currently assumes that the disk in port X is raid X
        #
        if self.raid_port == 'unknown':
            # if this drive isn't in the system, fall back to the drive-raid
            # map kept in the system configuration.
            rlog_debug('drive has been removed using drive-raid map')
            sysconfig = SystemConfig()
            if not sysconfig.is_config_valid():
                # if we don't know which raid port to fail, don't just
                # continue on.  skip out and log a msg.  The raid port is
                # not known here, so report the disk's own port.
                rlog_notice('Unable to determie rport when failing disk [%s]' %
                            self.hd.portnum)
                return
            portnum = sysconfig.get_disk_rport(self.hd.portnum)
        else:
            portnum = self.raid_port

        state_cmd = "faulty"
        remove_cmd = "remove"

        md_devname_path = '/sys/block/%s/md/rd%s/device' % (array_name, portnum)

        try:
            md_dev_name = get_sysfs_param(md_devname_path)
        except IOError:
            raise rrdm_error('unable to read raid device : %s' % md_devname_path)

        # use the device name indicated by RAID, since if the drive is
        # missing, md might still have a reference to the device, but we
        # don't have a scsi device to use to figure out what the name of the
        # device that used to be in the array is
        md_state_path = '/sys/block/%s/md/dev-%s/state' % (array_name, md_dev_name)

        rlog_notice('Failing array [%s] device [%s:%s]' % (array_name,
                    portnum, md_dev_name))

        # Every pass counts as an attempt, so the loop ends even when the
        # writes succeed but the entry never disappears.
        for _attempt in range(3):
            try:
                if exists(md_state_path):
                    for cmd in (state_cmd, remove_cmd):
                        sys_file = open(md_state_path, "w")
                        try:
                            sys_file.write(cmd)
                        finally:
                            sys_file.close()

                        # grace period to allow the request to complete
                        sleep(0.5)

                # bail out if it has failed already or we succeeded; make
                # sure the drive is really gone, and if it's not, retry
                if not exists(md_state_path):
                    break
            except IOError:
                # the write failed; fall through to the next attempt
                continue

        if exists(md_state_path):
            rlog_debug('Unable to fail %s on %s with cmd [%s:%s]' % (
                       self.raid_port, array_name, md_state_path, remove_cmd))