def define_snapshots(self):
    """ Process each of the discovered volumes to look for any
        associated snapshots
    """

    # process each discovered volume
    for volume_name in self.volume:
        this_volume = self.volume[volume_name]

        cmd = GlusterCommand("gluster snap list %s" % volume_name,
                             timeout=cfg.CMD_TIMEOUT)
        cmd.run()

        if cmd.rc == 0:
            # process the snap information
            if not cmd.stdout[0].lower().startswith('no snapshots'):
                for snap in cmd.stdout:
                    snap_name = snap.strip()
                    if cfg.debug:
                        print "defineSnapshots. Creating a snapshot instance for volume '%s' called '%s'" % (volume_name, snap_name)
                    new_snapshot = Snapshot(snap_name, this_volume, volume_name)
                    this_volume.snapshot_list.append(new_snapshot)

            this_volume.snapshot_count = len(this_volume.snapshot_list)
            if cfg.debug:
                print "defineSnapshots. Volume '%s' has %d snapshots" % (volume_name, this_volume.snapshot_count)
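# Illustrative sketch (not part of gstatus): the two 'gluster snap list'
# output shapes the method above handles - either a single 'No snapshots'
# line, or one snapshot name per line. The sample names are assumptions,
# not captured command output.
def _snap_list_parse_example(stdout):
    """Return the snapshot names found in snap list output."""
    if stdout[0].lower().startswith('no snapshots'):
        return []
    return [line.strip() for line in stdout]

# e.g. _snap_list_parse_example(['snap_daily_1', 'snap_daily_2'])
#      --> ['snap_daily_1', 'snap_daily_2']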
def define_nodes(self):
    """ define the node objects for this cluster based on
        gluster pool list output
    """

    if self.output_mode == 'console' and not cfg.no_progress_msgs:
        # display a progress message
        sys.stdout.write("Processing nodes" + " " * 20 + "\n\r\x1b[A")

    cmd = GlusterCommand('gluster pool list --xml', timeout=cfg.CMD_TIMEOUT)
    cmd.run()
    if cmd.rc != 0:
        print "glusterd did not respond to a peer status request, gstatus"
        print "can not continue.\n"
        exit(12)

    # define a list of elements in the xml that we're interested in
    field_list = ['hostname', 'uuid', 'connected']

    xml_string = ''.join(cmd.stdout)
    xml_root = ETree.fromstring(xml_string)

    peer_list = xml_root.findall('.//peer')

    for peer in peer_list:

        node_info = get_attr(peer, field_list)
        this_hostname = node_info['hostname']
        alias_list = []

        if this_hostname == 'localhost':
            # output may say localhost, but it could be a response from a
            # foreign peer, since the local glusterd could be down
            if GlusterCommand.targetNode == 'localhost':
                local_ip_list = get_ipv4_addr()  # grab all local IPs
                for ip in local_ip_list:
                    alias_list += host_aliases(ip)
                alias_list.append('localhost')
            else:
                this_hostname = GlusterCommand.targetNode
                alias_list = host_aliases(this_hostname)
                alias_list.append('localhost')
        else:
            alias_list = host_aliases(this_hostname)

        # clean up any empty strings returned by the alias lookups
        alias_list = [alias for alias in alias_list if alias != '']

        if cfg.debug:
            print "Creating a node object with uuid %s, with names of %s" % (node_info['uuid'], alias_list)

        new_node = Node(node_info['uuid'],
                        node_info['connected'],
                        alias_list)

        self.ip_list += [ip for ip in alias_list if is_ip(ip)]

        # add this node object to the cluster object's 'dict'
        self.node[node_info['uuid']] = new_node

    self.node_count = Node.node_count()
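# Illustrative sketch of the document define_nodes consumes. The sample xml
# is an assumption modelled on 'gluster pool list --xml' output, trimmed to
# the three fields pulled out via get_attr.
def _peer_parse_example():
    import xml.etree.ElementTree as ETree
    sample = ('<cliOutput><peerStatus>'
              '<peer><uuid>aaaabbbb-0000</uuid>'
              '<hostname>node-1</hostname>'
              '<connected>1</connected></peer>'
              '</peerStatus></cliOutput>')
    root = ETree.fromstring(sample)
    return [dict((field, peer.find('./' + field).text)
                 for field in ['hostname', 'uuid', 'connected'])
            for peer in root.findall('.//peer')]

# e.g. _peer_parse_example()
#      --> [{'hostname': 'node-1', 'uuid': 'aaaabbbb-0000', 'connected': '1'}]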
def calc_connections(self):
    """ Issue a 'vol status all clients --xml' and invoke each volume's
        client_count method to determine the unique clients connected
        to the cluster's volume(s)
    """

    if self.output_mode == 'console' and not cfg.no_progress_msgs:
        # print a progress message
        sys.stdout.write("Processing gluster client connections" + " " * 20 + "\n\r\x1b[A")

    cmd = GlusterCommand("gluster vol status all clients --xml",
                         timeout=cfg.CMD_TIMEOUT)
    cmd.run()

    if cmd.rc > 0:
        # unable to get the client connectivity information
        if self.output_mode == 'console' and not cfg.no_progress_msgs:
            print "\ngstatus has been unable to get the output of a 'vol status all clients --xml' command"
            print "and can not continue.\n"
        return

    # At this point the command worked, so we can process the results
    xml_string = ''.join(cmd.stdout)
    try:
        xml_root = ETree.fromstring(xml_string)
    except ExpatError:
        print "Malformed xml, try again later."
        return

    volumes = xml_root.findall('.//volume')
    for volume_xml in volumes:

        # find the volume name, then process the volume xml
        vol_name = volume_xml.find('./volName').text
        self.volume[vol_name].client_count(volume_xml)

        # add the volume's unique set of clients to the cluster's set
        self.client_set.update(self.volume[vol_name].client_set)
        self.num_connections += self.volume[vol_name].num_connections

    self.num_clients = len(self.client_set)
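# Minimal sketch of the aggregation above: each volume contributes a set of
# client IPs, the cluster-wide client count is the size of the union, and
# connection counts are summed across volumes. The values are hypothetical.
def _client_aggregation_example():
    per_volume = {'vol1': set(['10.0.0.1', '10.0.0.2']),
                  'vol2': set(['10.0.0.2', '10.0.0.3'])}
    client_set = set()
    num_connections = 0
    for vol_name in per_volume:
        client_set.update(per_volume[vol_name])
        num_connections += len(per_volume[vol_name])
    return len(client_set), num_connections  # (3, 4)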
def get_version(self):
    """ Sets the current version and product identifier for this cluster """

    cmd = GlusterCommand("gluster --version")
    cmd.run()
    self.glfs_version = cmd.stdout[0].split()[1]

    if os.path.exists('/etc/redhat-storage-release'):
        with open('/etc/redhat-storage-release', 'r') as RHS_version:
            # example contents - Red Hat Storage Server 3.0
            self.product_name = RHS_version.readline().rstrip()

        lc_name = self.product_name.lower().replace('update', '.')
        self.product_shortname = "RHGS v%s" % (''.join(lc_name.split()[4:]))
    else:
        self.product_name = self.product_shortname = "Community"
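# Worked example of the short name derivation above, using a hypothetical
# release string (the real file contents vary by release):
# 'Red Hat Storage Server 3.0 Update 4'
#   -> lower cased, 'update' replaced by '.' -> 'red hat storage server 3.0 . 4'
#   -> words [4:] joined -> '3.0.4' -> 'RHGS v3.0.4'
def _shortname_example():
    product_name = 'Red Hat Storage Server 3.0 Update 4'  # hypothetical
    lc_name = product_name.lower().replace('update', '.')
    return "RHGS v%s" % (''.join(lc_name.split()[4:]))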
def initialise(self):
    """ call the node and volume 'generators' to create the child
        objects (bricks are created within the volume logic)
    """

    self.has_volumes = bool(glob('/var/lib/glusterd/vols/*/trusted-*-fuse.vol'))

    # set up the GlusterCommand class to have a valid node for commands
    set_active_peer()

    # if has_volumes is set we have vol files, so it's ok to run the
    # queries that define the node and volume objects
    if self.has_volumes:

        self.define_nodes()
        self.define_volumes()

        # if this cluster supports snapshots, take a look to see if
        # there are any
        self.snapshot_capable = version_ok(self.glfs_version,
                                           cfg.snapshot_support)
        if self.snapshot_capable:
            self.define_snapshots()
            self.snapshot_count = Snapshot.snap_count()

    else:
        # no volumes in this cluster, print a message and abort
        print "This cluster doesn't have any volumes/daemons running."
        print "The output below shows the current nodes attached to this host.\n"

        cmd = GlusterCommand('gluster pool list', timeout=cfg.CMD_TIMEOUT)
        cmd.run()
        for line in cmd.stdout:
            print line
        print

        exit(12)
def update_state(self, self_heal_backlog, client_status):
    """ update the state of the cluster by processing the output of
        'vol status' commands

        - vol status all detail --> provides the brick info (up/down,
                                    type), plus volume capacity
        - vol status all        --> self heal states
    """

    if self.output_mode == 'console' and not cfg.no_progress_msgs:
        # print a progress message
        sys.stdout.write("Updating volume information" + " " * 20 + "\n\r\x1b[A")

    # WORKAROUND
    # The code issues n vol status requests because issuing a vol status
    # with the 'all' parameter can give bad xml when nodes are not present
    # in the cluster. By stepping through each volume, the xml, while
    # still buggy, can be worked around

    # Process all volumes known to the cluster
    for volume_name in self.volume:

        # 'status' is set from a vol info command. This shows whether the
        # vol is created (0), started (1) or stopped (2). We're only
        # interested in the started state when issuing the vol status command
        if self.volume[volume_name].status == 1:

            cmd = GlusterCommand("gluster vol status %s detail --xml" % volume_name,
                                 timeout=cfg.CMD_TIMEOUT)
            cmd.run()

            # Need to check the opRet element, since xml based gluster
            # commands do NOT pass a return code back to the shell!
            if cmd.rc == 0:
                xml_string = ''.join(cmd.stdout)
                xml_obj = ETree.fromstring(xml_string)

                # Update the volume, to provide capacity and status information
                self.volume[volume_name].update(xml_obj)
            else:
                # Being unable to get a vol status for a known volume may
                # indicate a peer transitioning to disconnected state, so
                # issue an error message and abort the script
                print "\n--> gstatus has been unable to query volume '" + volume_name + "'"
                print "\nPossible cause: cluster is currently reconverging after a node"
                print "has entered a disconnected state."
                print "\nResponse: Rerun gstatus or issue a peer status command to confirm\n"
                exit(16)

            # -----------------------------------------------------------------
            # Issue a vol status, then use the output to look for active tasks
            # and self heal state information
            # -----------------------------------------------------------------
            cmd = GlusterCommand("gluster vol status %s --xml" % volume_name,
                                 timeout=cfg.CMD_TIMEOUT)
            cmd.run()

            xml_string = ''.join(cmd.stdout)
            xml_root = ETree.fromstring(xml_string)

            task_elements = xml_root.findall('.//task')
            for task in task_elements:
                task_name = task.find('./type').text
                task_status = task.find('./status').text
                task_status_str = task.find('./statusStr').text

                if task_status == '1':
                    self.volume[volume_name].task_list.append(task_name)

            # -----------------------------------------------------------------
            # If the volume has self heal enabled, we look at the state of
            # the daemons
            # -----------------------------------------------------------------
            if self.volume[volume_name].self_heal_enabled:

                if self.output_mode == 'console' and not cfg.no_progress_msgs:
                    sys.stdout.write("Analysing Self Heal daemons on %s %s\n\r\x1b[A"
                                     % (volume_name, " " * 20))

                if cmd.rc == 0:

                    node_elements = xml_root.findall('.//node')

                    # first get a list of self-heal elements from the xml
                    for node in node_elements:

                        # WORKAROUND
                        # There's a bug in some versions of 3.4 where, when a
                        # node is missing, the xml returned is malformed,
                        # returning a node within a node, so we need to check
                        # the subelements to see if they're valid
                        if node.find('./node') is not None:
                            continue

                        if node.find('./hostname').text == 'Self-heal Daemon':
                            node_name = node.find('./path').text
                            node_state = node.find('./status').text

                            # convert the name to a usable uuid
                            if node_name == 'localhost':
                                uuid = self.get_node(GlusterCommand.targetNode)
                            else:
                                uuid = self.get_node(node_name)

                            if not uuid:
                                # tried to resolve the name but couldn't
                                print ("Cluster.updateState : Attempting to use a 'path' (%s) for "
                                       "a self heal daemon that" % node_name)
                                print "does not correspond to a peer node, and can not continue\n"
                                exit(16)

                            if self.node[uuid].self_heal_enabled:
                                if node_state == '1':
                                    self.node[uuid].self_heal_active = True
                                else:
                                    self.node[uuid].self_heal_active = False

                # update the self heal flags, based on the vol status
                self.volume[volume_name].set_self_heal_stats()  # high level info

                if self_heal_backlog:
                    # now get low level info to check for heal backlog
                    self.volume[volume_name].update_self_heal(self.output_mode)

                    if 'UNAVAILABLE' in self.volume[volume_name].self_heal_string:
                        # add message to cluster messages
                        self.messages.append(
                            'WARNING -> self heal query did not complete for %s. Debug with -D or use '
                            '-t to increase cmd timeout' % volume_name)

        this_state = self.volume[volume_name].volume_state
        if this_state == 'up':
            self.volume_summary['up'] += 1
        elif 'degraded' in this_state:
            self.volume_summary['degraded'] += 1
        elif 'partial' in this_state:
            self.volume_summary['partial'] += 1
        else:
            self.volume_summary['down'] += 1

    self.active_nodes()   # update active node counter
    self.active_bricks()  # update active brick counter

    self.check_self_heal()

    if client_status:
        self.calc_connections()
def define_volumes(self):
    """ Create the volume + brick objects """

    if self.output_mode == 'console' and not cfg.no_progress_msgs:
        # print a progress message
        sys.stdout.write("Building volume objects" + " " * 20 + "\n\r\x1b[A")

    cmd = GlusterCommand("gluster vol info --xml", timeout=cfg.CMD_TIMEOUT)
    cmd.run()

    xml_string = ''.join(cmd.stdout)
    xml_root = ETree.fromstring(xml_string)

    vol_elements = xml_root.findall('.//volume')

    for vol_object in vol_elements:

        # build a dict for the initial volume settings. An attribute error
        # results in a default value being assigned (e.g. on older
        # glusterfs, the disperse related fields are missing)
        vol_dict = {}
        for attr in Volume.volume_attr:
            try:
                vol_dict[attr] = vol_object.find('./' + attr).text
            except AttributeError:
                vol_dict[attr] = '0'

        # create a volume object for this volume
        new_volume = Volume(vol_dict)
        self.volume[new_volume.name] = new_volume

        if cfg.debug:
            print "defineVolumes. Adding volume %s" % new_volume.name

        # add information about any volume options
        opt_nodes = vol_object.findall('.//option')
        for option in opt_nodes:
            for n in option.getchildren():
                if n.tag == 'name':
                    key = n.text
                elif n.tag == 'value':
                    value = n.text
                    new_volume.options[key] = value

                    # Protocols are enabled by default, so we look for the
                    # volume tuning options that turn them off
                    if key == 'user.cifs':
                        if value in ['disable', 'off', 'false']:
                            new_volume.protocol['SMB'] = 'off'
                    elif key == 'nfs.disable':
                        if value in ['on', 'true']:
                            new_volume.protocol['NFS'] = 'off'

        # get the bricks listed against this volume, and create the
        # Brick object(s)
        brick_nodes = vol_object.findall('.//brick')

        # list holding brick paths
        repl = []
        ctr = 1

        for brick in brick_nodes:

            brick_path = brick.text
            new_volume.brick_order.append(brick_path)
            (hostname, pathname) = brick_path.split(':')

            if cfg.debug:
                print "defineVolumes. Adding brick %s to %s" % (brick_path, new_volume.name)

            node_uuid = self.get_node(hostname)

            # add this brick's owning node to the volume's attributes
            try:
                new_volume.node[node_uuid] = self.node[node_uuid]
            except KeyError:
                print "Unable to associate brick %s with a peer in the cluster, possibly due" % brick_path
                print "to name lookup failures. If the nodes are not registered (fwd & rev)"
                print "in dns, add local entries for your cluster nodes to the /etc/hosts file"
                sys.exit(16)

            new_brick = Brick(brick_path, self.node[node_uuid], new_volume.name)

            # add the brick to the cluster and volume
            self.brick[brick_path] = new_brick
            new_volume.brick[brick_path] = new_brick

            # add this brick to the owning node
            brick_owner = self.node[node_uuid]
            brick_owner.brick[brick_path] = new_brick

            if (new_volume.replicaCount > 1) or (new_volume.disperseCount > 0):
                repl.append(brick_path)
                bricks_per_subvolume = max(new_volume.replicaCount,
                                           new_volume.disperseCount)
                ctr += 1
                if ctr > bricks_per_subvolume:
                    ctr = 1

                    # add this replica set to the volume's info
                    new_volume.subvolumes.append(repl)

                    # drop all elements from the temporary list
                    repl = []

        # By default from gluster 3.3 onwards, self heal is enabled for all
        # replicated/disperse volumes. We look at the volume type, and if it
        # is replicated and hasn't had self heal explicitly disabled, the
        # self heal state is inferred against the nodes that contain the
        # bricks for the volume. With this state in place, the update_state
        # method can cross-check to see what is actually happening
        if ('replicate' in new_volume.typeStr.lower()) or ('disperse' in new_volume.typeStr.lower()):

            heal_enabled = True  # assume it's on

            if 'cluster.self-heal-daemon' in new_volume.options:
                if new_volume.options['cluster.self-heal-daemon'].lower() in ['off', 'false']:
                    heal_enabled = False

            new_volume.self_heal_enabled = heal_enabled

            if heal_enabled:

                node_set = set()  # use a set to maintain a unique group of nodes

                for brick_path in new_volume.brick:
                    this_brick = self.brick[brick_path]
                    this_node = this_brick.node
                    node_set.add(this_node)
                    this_node.self_heal_enabled = True

                self.sh_enabled = len(node_set)

    self.volume_count = Volume.volume_count()
    self.brick_count = Brick.brick_count()
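# Worked example of the replica set grouping above: with four bricks and a
# replica count of 2, consecutive bricks pair up into two subvolumes. The
# brick paths are hypothetical.
def _subvolume_grouping_example():
    bricks = ['n1:/b1', 'n2:/b1', 'n1:/b2', 'n2:/b2']
    bricks_per_subvolume = 2  # max(replicaCount, disperseCount)
    subvolumes = []
    repl = []
    ctr = 1
    for brick_path in bricks:
        repl.append(brick_path)
        ctr += 1
        if ctr > bricks_per_subvolume:
            ctr = 1
            subvolumes.append(repl)
            repl = []
    return subvolumes  # [['n1:/b1', 'n2:/b1'], ['n1:/b2', 'n2:/b2']]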
def update_self_heal(self, output_mode):
    """ Updates the state of self heal for this volume """

    # first check whether this volume is a replicated or disperse volume;
    # if not, set the state string to "not applicable"
    if ('replicate' not in self.typeStr.lower()) and ('disperse' not in self.typeStr.lower()):
        self.self_heal_string = 'N/A'
        return

    # if self heal is disabled by option...
    if 'cluster.self-heal-daemon' in self.options:
        if self.options['cluster.self-heal-daemon'].lower() in ['off', 'false']:
            self.self_heal_string = 'DISABLED'
            return

    if output_mode == 'console' and not cfg.no_progress_msgs:
        sys.stdout.write("Analysing Self Heal backlog for %s %s \n\r\x1b[A"
                         % (self.name, " " * 20))

    # On gluster 3.4 & 3.5, vol heal with --xml is not supported, so parsing
    # has to be done the old fashioned way :(
    # The command is invoked with a timeout clause too
    cmd = GlusterCommand("gluster vol heal %s info" % self.name,
                         timeout=cfg.CMD_TIMEOUT)
    cmd.run()

    if cmd.rc == 0:

        total_heal_count = 0

        # in gluster 3.4, even though the cluster and bricks are defined
        # with IP addresses, vol heal can return an fqdn for the hostname -
        # so we have to account for that
        for line in cmd.stdout:

            if line.lower().startswith('brick'):
                (node, path_name) = line.replace(':', ' ').split()[1:]
                if cfg.debug:
                    print "updateSelfHeal. self heal cmd gave a node name of %s" % node

                # 3.4.0.59 added a trailing '/' to the brick path, so remove it!
                brick_path = node + ":" + path_name.rstrip('/')

            if line.lower().startswith('number'):
                heal_count = int(line.split(':')[1])

                try:
                    self.brick[brick_path].heal_count = heal_count
                    if cfg.debug:
                        print "updateSelfHeal. brick path from self heal matched brick object successfully"
                except KeyError:
                    if cfg.debug:
                        print ("updateSelfHeal. brick path from self heal != any brick object, "
                               "processing nodes to locate the brick")

                    # cycle through the nodes associated with this volume
                    match_found = False
                    for uuid in self.node:

                        # if this node does NOT match the node in the brick
                        # path, skip it
                        if node not in self.node[uuid].alias_list:
                            continue

                        # now convert the brick path to something usable
                        for alias in self.node[uuid].alias_list:
                            new_path = alias + ":" + path_name.rstrip('/')
                            if new_path in self.brick:
                                brick_path = new_path
                                match_found = True
                                break

                        if match_found:
                            break

                    try:
                        if cfg.debug:
                            print "updateSelfHeal. using brick path match of %s" % brick_path
                        self.brick[brick_path].heal_count = heal_count
                    except KeyError:
                        print "updateSelfHeal. Unable to apply self heal stats due to %s not matching existing" % brick_path
                        print "brick objects, and can not continue."
                        exit(16)

                total_heal_count += heal_count

        self.self_heal_count = total_heal_count

        if total_heal_count > 0:
            self.self_heal_string += " Heal backlog of %d files" % total_heal_count
        else:
            self.self_heal_string += " All files in sync"

    else:
        # vol heal command failed - just flag the problem
        if cfg.debug:
            print ("Volume updateSelfHeal. Query for self heal details timed out - "
                   "maybe run again with a larger -t value?")

        self.self_heal_string += " HEAL DATA UNAVAILABLE"
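# Illustrative sketch of the plain text parsing above, run against a
# hypothetical 'gluster vol heal <vol> info' transcript (the sample lines
# are assumptions, not captured output).
def _heal_info_parse_example():
    sample = ['Brick node-1:/bricks/b1/',
              'Number of entries: 2',
              'Brick node-2:/bricks/b1/',
              'Number of entries: 0']
    backlog = {}
    brick_path = None
    for line in sample:
        if line.lower().startswith('brick'):
            (node, path_name) = line.replace(':', ' ').split()[1:]
            brick_path = node + ":" + path_name.rstrip('/')
        if line.lower().startswith('number'):
            backlog[brick_path] = int(line.split(':')[1])
    return backlog  # {'node-1:/bricks/b1': 2, 'node-2:/bricks/b1': 0}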