def health_checks(self):
    """ perform checks on elements that affect the reported state of the cluster """

    # The idea here is to perform the most significant checks first
    # so the message list appears in a priority order

    # 1. Check the volumes
    for volume_name in self.volume:
        this_volume = self.volume[volume_name]

        if 'down' in this_volume.volume_state:
            self.messages.append("Volume '%s' is down" % volume_name)
            self.status = 'unhealthy'

        if 'partial' in this_volume.volume_state:
            self.messages.append(
                "Volume '%s' is in a PARTIAL state, some data is inaccessible due to missing bricks" % volume_name)
            self.messages.append("WARNING -> Write requests may fail against volume '%s'" % this_volume.name)
            self.status = 'unhealthy'

    # 2. Check for conditions detected at the node level
    for uuid in self.node:
        this_node = self.node[uuid]

        if this_node.state != '1':
            # https://bugzilla.redhat.com/show_bug.cgi?id=1254514,
            # node_name comes back as empty
            # self.messages.append("Cluster node '%s' is down" %
            #                      (this_node.node_name()))
            self.nodes_down += 1
            self.status = 'unhealthy'

        if this_node.self_heal_enabled != this_node.self_heal_active:
            # https://bugzilla.redhat.com/show_bug.cgi?id=1254514,
            # decided to remove the self-heal status as it is
            # redundant information
            # self.messages.append("Self heal daemon is down on %s" %
            #                      (this_node.node_name()))
            self.status = 'unhealthy'

    # Report the number of nodes that are down
    if self.nodes_down == 1:
        self.messages.append("One of the nodes in the cluster is down")
    elif self.nodes_down > 1:
        self.messages.append("%s nodes in the cluster are down" % self.nodes_down)

    # 3. Check the bricks
    for brick_name in self.brick:
        this_brick = self.brick[brick_name]

        # 3.1 check for state
        if not this_brick.up:
            self.messages.append(
                "Brick %s in volume '%s' is down/unavailable" % (brick_name, this_brick.owning_volume))

        # 3.2 checks for best practice go here (minor error messages - FUTURE)

    if self.bricks_active < Brick.brick_count():
        self.messages.append("INFO -> Not all bricks are online, so capacity provided is NOT accurate")
def health_checks(self):
    """ perform checks on elements that affect the reported state of the cluster """

    # The idea here is to perform the most significant checks first
    # so the message list appears in a priority order

    # 1. Check the volumes
    for volume_name in self.volume:
        this_volume = self.volume[volume_name]

        if 'down' in this_volume.volume_state:
            self.messages.append("Volume '%s' is down" % volume_name)
            self.status = 'unhealthy'

        if 'partial' in this_volume.volume_state:
            self.messages.append(
                "Volume '%s' is in a PARTIAL state, some data is inaccessible due to missing bricks" % volume_name)
            self.messages.append("WARNING -> Write requests may fail against volume '%s'" % this_volume.name)
            self.status = 'unhealthy'

    # 2. Check for conditions detected at the node level
    for uuid in self.node:
        this_node = self.node[uuid]

        if this_node.state != '1':
            self.messages.append("Cluster node '%s' is down" % (this_node.node_name()))
            self.status = 'unhealthy'

        if this_node.self_heal_enabled != this_node.self_heal_active:
            self.messages.append("Self heal daemon is down on %s" % (this_node.node_name()))
            self.status = 'unhealthy'

    # 3. Check the bricks
    for brick_name in self.brick:
        this_brick = self.brick[brick_name]

        # 3.1 check for state
        if not this_brick.up:
            self.messages.append(
                "Brick %s in volume '%s' is down/unavailable" % (brick_name, this_brick.owning_volume))

        # 3.2 checks for best practice go here (minor error messages - FUTURE)

    if self.bricks_active < Brick.brick_count():
        self.messages.append("INFO -> Not all bricks are online, so capacity provided is NOT accurate")
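# A minimal usage sketch (hypothetical caller, not taken from this source),
# assuming a cluster object that exposes the 'status' and 'messages'
# attributes populated by health_checks() above:
#
#     cluster.health_checks()
#     print "Cluster state : %s" % cluster.status
#     for msg in cluster.messages:
#         print " - %s" % msg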
def define_volumes(self):
    """ Create the volume + brick objects """

    if self.output_mode == 'console' and not cfg.no_progress_msgs:
        # print a progress message
        sys.stdout.write("Building volume objects" + " " * 20 + "\n\r\x1b[A")

    cmd = GlusterCommand("gluster vol info --xml")
    cmd.run()
    # (rc, vol_info) = issueCMD("gluster vol info --xml")

    xml_string = ''.join(cmd.stdout)
    xml_root = ETree.fromstring(xml_string)

    vol_elements = xml_root.findall('.//volume')

    for vol_object in vol_elements:

        # set up a dict for the initial definition of the volume
        vol_dict = {}

        # build a dict for the initial volume settings. An attribute error
        # results in a default value being assigned (e.g. on older glusterfs
        # disperse related fields are missing)
        for attr in Volume.volume_attr:
            try:
                vol_dict[attr] = vol_object.find('./' + attr).text
            except AttributeError:
                vol_dict[attr] = '0'

        # create a volume object, for this volume
        new_volume = Volume(vol_dict)
        self.volume[new_volume.name] = new_volume

        if cfg.debug:
            print "defineVolumes. Adding volume %s" % new_volume.name

        # add information about any volume options
        opt_nodes = vol_object.findall('.//option')
        for option in opt_nodes:
            for n in option.getchildren():
                if n.tag == 'name':
                    key = n.text
                elif n.tag == 'value':
                    value = n.text
                    new_volume.options[key] = value

                    # Protocols are enabled by default, so we look
                    # for the volume tuning options that turn them
                    # off
                    if key == 'user.cifs':
                        if value in ['disable', 'off', 'false']:
                            new_volume.protocol['SMB'] = 'off'
                    elif key == 'nfs.disable':
                        if value in ['on', 'true']:
                            new_volume.protocol['NFS'] = 'off'

        # get bricks listed against this volume, and create the Brick object(s)
        brick_nodes = vol_object.findall('.//brick')

        # list holding brick paths
        repl = []
        ctr = 1

        for brick in brick_nodes:

            brick_path = brick.text
            new_volume.brick_order.append(brick_path)
            (hostname, pathname) = brick_path.split(':')

            if cfg.debug:
                print "defineVolumes. Adding brick %s to %s" % (brick_path, new_volume.name)

            node_uuid = self.get_node(hostname)

            # add this brick's owning node to the volume's attributes
            try:
                new_volume.node[node_uuid] = self.node[node_uuid]
            except KeyError:
                print "Unable to associate brick %s with a peer in the cluster, possibly due" % brick_path
                print "to name lookup failures. If the nodes are not registered (fwd & rev)"
                print "to dns, add local entries for your cluster nodes in the /etc/hosts file"
                sys.exit(16)

            new_brick = Brick(brick_path, self.node[node_uuid], new_volume.name)

            # Add the brick to the cluster and volume
            self.brick[brick_path] = new_brick
            new_volume.brick[brick_path] = new_brick

            # add this brick to the owning node
            brick_owner = self.node[node_uuid]
            brick_owner.brick[brick_path] = new_brick

            if (new_volume.replicaCount > 1) or (new_volume.disperseCount > 0):
                repl.append(brick_path)
                bricks_per_subvolume = max(new_volume.replicaCount, new_volume.disperseCount)
                ctr += 1
                if ctr > bricks_per_subvolume:
                    ctr = 1
                    # add this replica set to the volume's info
                    new_volume.subvolumes.append(repl)
                    # drop all elements from the temporary list
                    repl = []

        # By default from gluster 3.3 onwards, self heal is enabled for
        # all replicated/disperse volumes. We look at the volume type, and if
        # it is replicated and hasn't had self-heal explicitly disabled, the
        # self heal state is inferred against the nodes that contain the
        # bricks for the volume. With this state in place, the updateState
        # method can cross-check to see what is actually happening
        if ('replicate' in new_volume.typeStr.lower()) or ('disperse' in new_volume.typeStr.lower()):

            heal_enabled = True  # assume it's on

            if 'cluster.self-heal-daemon' in new_volume.options:
                if new_volume.options['cluster.self-heal-daemon'].lower() in ['off', 'false']:
                    heal_enabled = False

            new_volume.self_heal_enabled = heal_enabled

            if heal_enabled:

                node_set = set()  # use a set to maintain a unique group of nodes

                for brick_path in new_volume.brick:
                    this_brick = self.brick[brick_path]
                    this_node = this_brick.node

                    node_set.add(this_node)
                    this_node.self_heal_enabled = True

                self.sh_enabled = len(node_set)

    self.volume_count = Volume.volume_count()
    self.brick_count = Brick.brick_count()
def define_volumes(self):
    """ Create the volume + brick objects """

    if self.output_mode == 'console' and not cfg.no_progress_msgs:
        # print a progress message
        sys.stdout.write("Building volume objects" + " " * 20 + "\n\r\x1b[A")

    cmd = GlusterCommand("gluster vol info --xml", timeout=cfg.CMD_TIMEOUT)
    cmd.run()
    # (rc, vol_info) = issueCMD("gluster vol info --xml")

    xml_string = ''.join(cmd.stdout)
    xml_root = ETree.fromstring(xml_string)

    vol_elements = xml_root.findall('.//volume')

    for vol_object in vol_elements:

        # set up a dict for the initial definition of the volume
        vol_dict = {}

        # build a dict for the initial volume settings. An attribute error
        # results in a default value being assigned (e.g. on older glusterfs
        # disperse related fields are missing)
        for attr in Volume.volume_attr:
            try:
                vol_dict[attr] = vol_object.find('./' + attr).text
            except AttributeError:
                vol_dict[attr] = '0'

        # create a volume object, for this volume
        new_volume = Volume(vol_dict)
        self.volume[new_volume.name] = new_volume

        if cfg.debug:
            print "defineVolumes. Adding volume %s" % new_volume.name

        # add information about any volume options
        opt_nodes = vol_object.findall('.//option')
        for option in opt_nodes:
            for n in option.getchildren():
                if n.tag == 'name':
                    key = n.text
                elif n.tag == 'value':
                    value = n.text
                    new_volume.options[key] = value

                    # Protocols are enabled by default, so we look
                    # for the volume tuning options that turn them
                    # off
                    if key == 'user.cifs':
                        if value in ['disable', 'off', 'false']:
                            new_volume.protocol['SMB'] = 'off'
                    elif key == 'nfs.disable':
                        if value in ['on', 'true']:
                            new_volume.protocol['NFS'] = 'off'

        # get bricks listed against this volume, and create the Brick object(s)
        brick_nodes = vol_object.findall('.//brick')

        # list holding brick paths
        repl = []
        ctr = 1

        for brick in brick_nodes:

            brick_path = brick.text
            new_volume.brick_order.append(brick_path)
            (hostname, pathname) = brick_path.split(':')

            if cfg.debug:
                print "defineVolumes. Adding brick %s to %s" % (brick_path, new_volume.name)

            node_uuid = self.get_node(hostname)

            # add this brick's owning node to the volume's attributes
            try:
                new_volume.node[node_uuid] = self.node[node_uuid]
            except KeyError:
                print "Unable to associate brick %s with a peer in the cluster, possibly due" % brick_path
                print "to name lookup failures. If the nodes are not registered (fwd & rev)"
                print "to dns, add local entries for your cluster nodes in the /etc/hosts file"
                sys.exit(16)

            new_brick = Brick(brick_path, self.node[node_uuid], new_volume.name)

            # Add the brick to the cluster and volume
            self.brick[brick_path] = new_brick
            new_volume.brick[brick_path] = new_brick

            # add this brick to the owning node
            brick_owner = self.node[node_uuid]
            brick_owner.brick[brick_path] = new_brick

            if (new_volume.replicaCount > 1) or (new_volume.disperseCount > 0):
                repl.append(brick_path)
                bricks_per_subvolume = max(new_volume.replicaCount, new_volume.disperseCount)
                ctr += 1
                if ctr > bricks_per_subvolume:
                    ctr = 1
                    # add this replica set to the volume's info
                    new_volume.subvolumes.append(repl)
                    # drop all elements from the temporary list
                    repl = []

        # By default from gluster 3.3 onwards, self heal is enabled for
        # all replicated/disperse volumes. We look at the volume type, and if
        # it is replicated and hasn't had self-heal explicitly disabled, the
        # self heal state is inferred against the nodes that contain the
        # bricks for the volume. With this state in place, the updateState
        # method can cross-check to see what is actually happening
        if ('replicate' in new_volume.typeStr.lower()) or ('disperse' in new_volume.typeStr.lower()):

            heal_enabled = True  # assume it's on

            if 'cluster.self-heal-daemon' in new_volume.options:
                if new_volume.options['cluster.self-heal-daemon'].lower() in ['off', 'false']:
                    heal_enabled = False

            new_volume.self_heal_enabled = heal_enabled

            if heal_enabled:

                node_set = set()  # use a set to maintain a unique group of nodes

                for brick_path in new_volume.brick:
                    this_brick = self.brick[brick_path]
                    this_node = this_brick.node

                    node_set.add(this_node)
                    this_node.self_heal_enabled = True

                self.sh_enabled = len(node_set)

    self.volume_count = Volume.volume_count()
    self.brick_count = Brick.brick_count()
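# The subvolume grouping in define_volumes walks the bricks in their
# reported order and starts a new replica/disperse set every
# 'bricks_per_subvolume' entries. A minimal standalone sketch of the same
# grouping (hypothetical helper name, not part of this module):
#
#     def group_bricks(brick_order, bricks_per_subvolume):
#         return [brick_order[i:i + bricks_per_subvolume]
#                 for i in range(0, len(brick_order), bricks_per_subvolume)]
#
#     group_bricks(['n1:/b1', 'n2:/b1', 'n1:/b2', 'n2:/b2'], 2)
#     # -> [['n1:/b1', 'n2:/b1'], ['n1:/b2', 'n2:/b2']]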