Ejemplo n.º 1
0
    def health_checks(self):
        """
        Examine the volumes, nodes and bricks held by this object and record
        anything that affects the reported state of the cluster.

        Findings are appended to self.messages, most significant first
        (volumes, then nodes, then bricks). Volume and node problems also set
        self.status to 'unhealthy', and every node whose state is not '1'
        increments self.nodes_down.
        """

        # -- Volumes: reported first so they lead the message list ----------
        for vol_key in self.volume:
            vol = self.volume[vol_key]

            if 'down' in vol.volume_state:
                self.messages.append("Volume '%s' is down" % vol_key)
                self.status = 'unhealthy'

            if 'partial' in vol.volume_state:
                partial_text = "Volume '%s' is in a PARTIAL state, some data is inaccessible data, due to missing bricks"
                self.messages.append(partial_text % vol_key)
                write_text = "WARNING -> Write requests may fail against volume '%s'"
                self.messages.append(write_text % vol.name)
                self.status = 'unhealthy'

        # -- Nodes ----------------------------------------------------------
        for node_id in self.node:
            peer = self.node[node_id]

            if peer.state != '1':
                # No per-node message is emitted here: the node name comes
                # back empty (https://bugzilla.redhat.com/show_bug.cgi?id=1254514),
                # so the down nodes are only counted and summarised below.
                self.nodes_down += 1
                self.status = 'unhealthy'

            if peer.self_heal_enabled != peer.self_heal_active:
                # The self-heal message was dropped as redundant information
                # (same bugzilla as above); the state change is still applied.
                self.status = 'unhealthy'

        # Summarise how many nodes are down
        down_total = self.nodes_down
        if down_total > 1:
            self.messages.append("%s nodes in the cluster are down"%down_total)
        elif down_total == 1:
            self.messages.append("One of the nodes in the cluster is down")

        # -- Bricks ---------------------------------------------------------
        for brick_key in self.brick:
            brick = self.brick[brick_key]

            # only bricks that are not up produce a message
            if brick.up:
                continue

            self.messages.append(
                "Brick %s in volume '%s' is down/unavailable" % (brick_key, brick.owning_volume))

            # FUTURE: best practice checks (minor error messages) go here

        # Fewer active bricks than Brick.brick_count() reports means the
        # capacity figures cannot be trusted
        active_bricks = self.bricks_active
        if active_bricks < Brick.brick_count():
            self.messages.append("INFO -> Not all bricks are online, so capacity provided is NOT accurate")
Ejemplo n.º 2
0
    def health_checks(self):
        """
        Examine the volumes, nodes and bricks held by this object and record
        anything that affects the reported state of the cluster.

        Findings are appended to self.messages, most significant first
        (volumes, then nodes, then bricks). Volume and node problems also set
        self.status to 'unhealthy', and every node whose state is not '1'
        increments self.nodes_down.
        """

        # -- Volumes: reported first so they lead the message list ----------
        for vol_key in self.volume:
            vol = self.volume[vol_key]

            if 'down' in vol.volume_state:
                self.messages.append("Volume '%s' is down" % vol_key)
                self.status = 'unhealthy'

            if 'partial' in vol.volume_state:
                partial_text = "Volume '%s' is in a PARTIAL state, some data is inaccessible data, due to missing bricks"
                self.messages.append(partial_text % vol_key)
                write_text = "WARNING -> Write requests may fail against volume '%s'"
                self.messages.append(write_text % vol.name)
                self.status = 'unhealthy'

        # -- Nodes ----------------------------------------------------------
        for node_id in self.node:
            peer = self.node[node_id]

            if peer.state != '1':
                # No per-node message is emitted here: the node name comes
                # back empty (https://bugzilla.redhat.com/show_bug.cgi?id=1254514),
                # so the down nodes are only counted and summarised below.
                self.nodes_down += 1
                self.status = 'unhealthy'

            if peer.self_heal_enabled != peer.self_heal_active:
                # The self-heal message was dropped as redundant information
                # (same bugzilla as above); the state change is still applied.
                self.status = 'unhealthy'

        # Summarise how many nodes are down
        down_total = self.nodes_down
        if down_total > 1:
            self.messages.append("%s nodes in the cluster are down"%down_total)
        elif down_total == 1:
            self.messages.append("One of the nodes in the cluster is down")

        # -- Bricks ---------------------------------------------------------
        for brick_key in self.brick:
            brick = self.brick[brick_key]

            # only bricks that are not up produce a message
            if brick.up:
                continue

            self.messages.append(
                "Brick %s in volume '%s' is down/unavailable" % (brick_key, brick.owning_volume))

            # FUTURE: best practice checks (minor error messages) go here

        # Fewer active bricks than Brick.brick_count() reports means the
        # capacity figures cannot be trusted
        active_bricks = self.bricks_active
        if active_bricks < Brick.brick_count():
            self.messages.append("INFO -> Not all bricks are online, so capacity provided is NOT accurate")
Ejemplo n.º 3
0
    def health_checks(self):
        """
        Examine the volumes, nodes and bricks held by this object and record
        anything that affects the reported state of the cluster.

        Findings are appended to self.messages, most significant first
        (volumes, then nodes, then bricks). Volume and node problems also set
        self.status to 'unhealthy'.
        """

        # -- Volumes: reported first so they lead the message list ----------
        for vol_key in self.volume:
            vol = self.volume[vol_key]

            if 'down' in vol.volume_state:
                self.messages.append("Volume '%s' is down" % vol_key)
                self.status = 'unhealthy'

            if 'partial' in vol.volume_state:
                partial_text = "Volume '%s' is in a PARTIAL state, some data is inaccessible data, due to missing bricks"
                self.messages.append(partial_text % vol_key)
                write_text = "WARNING -> Write requests may fail against volume '%s'"
                self.messages.append(write_text % vol.name)
                self.status = 'unhealthy'

        # -- Nodes ----------------------------------------------------------
        for node_id in self.node:
            peer = self.node[node_id]

            # any state other than '1' is reported as the node being down
            if peer.state != '1':
                self.messages.append("Cluster node '%s' is down" % (peer.node_name()))
                self.status = 'unhealthy'

            # self heal expected on the node but not matching what is active
            if peer.self_heal_enabled != peer.self_heal_active:
                self.messages.append("Self heal daemon is down on %s" % (peer.node_name()))
                self.status = 'unhealthy'

        # -- Bricks ---------------------------------------------------------
        for brick_key in self.brick:
            brick = self.brick[brick_key]

            # only bricks that are not up produce a message
            if brick.up:
                continue

            self.messages.append(
                "Brick %s in volume '%s' is down/unavailable" % (brick_key, brick.owning_volume))

            # FUTURE: best practice checks (minor error messages) go here

        # Fewer active bricks than Brick.brick_count() reports means the
        # capacity figures cannot be trusted
        active_bricks = self.bricks_active
        if active_bricks < Brick.brick_count():
            self.messages.append("INFO -> Not all bricks are online, so capacity provided is NOT accurate")
Ejemplo n.º 4
0
    def define_volumes(self):
        """
        Create the volume + brick objects.

        Runs "gluster vol info --xml" and, for every volume element in the
        output:

          * builds a Volume from the attributes named in Volume.volume_attr
            (an attribute missing from the XML defaults to '0')
          * records the volume's options, switching the SMB / NFS protocol
            flags to 'off' when the matching option disables them
          * creates a Brick for each brick element and links it to the
            cluster, the volume and the owning node
          * groups the bricks into subvolumes when the volume has a replica
            count above 1 or a disperse count above 0
          * for replicate/disperse volume types, flags self heal as enabled
            on the volume and on its nodes unless 'cluster.self-heal-daemon'
            is set to off/false

        Finally self.volume_count and self.brick_count are refreshed from
        Volume.volume_count() and Brick.brick_count().

        Exits the program with status 16 when a brick's host cannot be
        matched to an entry in self.node.
        """

        if self.output_mode == 'console' and not cfg.no_progress_msgs:
            # print a progress message ("\x1b[A" is the ANSI cursor-up sequence)
            sys.stdout.write("Building volume objects" + " " * 20 + "\n\r\x1b[A")

        cmd = GlusterCommand("gluster vol info --xml")
        cmd.run()

        xml_string = ''.join(cmd.stdout)
        xml_root = ETree.fromstring(xml_string)

        for vol_object in xml_root.findall('.//volume'):

            # Build a dict for the initial volume settings. An attribute that
            # is missing from the XML gets a default of '0' (e.g. on older
            # glusterfs the disperse related fields are missing)
            vol_dict = {}
            for attr in Volume.volume_attr:
                attr_element = vol_object.find('./' + attr)
                vol_dict[attr] = '0' if attr_element is None else attr_element.text

            # create a volume object, for this volume
            new_volume = Volume(vol_dict)
            self.volume[new_volume.name] = new_volume

            if cfg.debug:
                print("defineVolumes. Adding volume %s" % new_volume.name)

            # add information about any volume options
            for option in vol_object.findall('.//option'):
                # Iterate the element directly to visit its children;
                # Element.getchildren() is deprecated and was removed in
                # Python 3.9
                for n in option:
                    if n.tag == 'name':
                        key = n.text
                    elif n.tag == 'value':
                        # relies on the name child preceding the value child
                        value = n.text
                        new_volume.options[key] = value

                        # Protocols are enabled by default, so we look
                        # for the volume tuning options that turn them
                        # off
                        if key == 'user.cifs':
                            if value in ['disable', 'off', 'false']:
                                new_volume.protocol['SMB'] = 'off'

                        elif key == 'nfs.disable':
                            if value in ['on', 'true']:
                                new_volume.protocol['NFS'] = 'off'

            # get bricks listed against this volume, and create the Brick object(s)
            brick_nodes = vol_object.findall('.//brick')

            # Subvolume grouping only applies to replicated/dispersed volumes;
            # these values do not change per brick, so work them out once
            uses_subvolumes = (new_volume.replicaCount > 1) or (new_volume.disperseCount > 0)
            bricks_per_subvolume = max(new_volume.replicaCount, new_volume.disperseCount)

            # list holding the brick paths of the subvolume being assembled
            repl = []
            ctr = 1

            for brick in brick_nodes:

                brick_path = brick.text
                new_volume.brick_order.append(brick_path)

                # brick text is split into exactly two parts on ':'; only the
                # host part is needed here
                (hostname, _brick_dir) = brick_path.split(':')

                if cfg.debug:
                    print("defineVolumes. Adding brick %s to %s" % (brick_path,
                                                                    new_volume.name))

                node_uuid = self.get_node(hostname)

                # find this brick's owning node; a miss means the host could
                # not be tied back to a known cluster peer
                try:
                    brick_owner = self.node[node_uuid]
                except KeyError:
                    print("Unable to associate brick %s with a peer in the cluster, possibly due" % brick_path)
                    print("to name lookup failures. If the nodes are not registered (fwd & rev)")
                    print("to dns, add local entries for your cluster nodes in the the /etc/hosts file")
                    sys.exit(16)

                # add this brick's owning node to the volume's attributes
                new_volume.node[node_uuid] = brick_owner

                new_brick = Brick(brick_path, brick_owner, new_volume.name)

                # Add the brick to the cluster and volume
                self.brick[brick_path] = new_brick
                new_volume.brick[brick_path] = new_brick

                # add this brick to the owning node
                brick_owner.brick[brick_path] = new_brick

                if uses_subvolumes:
                    repl.append(brick_path)
                    ctr += 1
                    if ctr > bricks_per_subvolume:
                        ctr = 1

                        # add this replica set to the volume's info
                        new_volume.subvolumes.append(repl)
                        # start a fresh list for the next subvolume
                        repl = []

            # By default from gluster 3.3 onwards, self heal is enabled for
            # all replicated/disperse volumes. We look at the volume type, and if it
            # is replicated and hasn't had self-heal explicitly disabled the
            # self heal state is inferred against the nodes that contain the
            # bricks for the volume. With this state in place, the updateState
            # method can cross-check to see what is actually happening
            vol_type = new_volume.typeStr.lower()

            if ('replicate' in vol_type) or ('disperse' in vol_type):

                heal_enabled = True  # assume it's on

                if 'cluster.self-heal-daemon' in new_volume.options:
                    if new_volume.options['cluster.self-heal-daemon'].lower() in ['off', 'false']:
                        heal_enabled = False

                new_volume.self_heal_enabled = heal_enabled

                if heal_enabled:

                    node_set = set()  # use a set to maintain a unique group of nodes

                    for brick_path in new_volume.brick:
                        this_node = self.brick[brick_path].node
                        node_set.add(this_node)
                        this_node.self_heal_enabled = True

                    # NOTE(review): this is overwritten for every heal-enabled
                    # volume, so it ends up holding the node count of the last
                    # such volume only - confirm that is intended
                    self.sh_enabled = len(node_set)

        self.volume_count = Volume.volume_count()
        self.brick_count = Brick.brick_count()
Ejemplo n.º 5
0
    def define_volumes(self):
        """
        Create the volume + brick objects.

        Runs "gluster vol info --xml" (with the timeout from cfg.CMD_TIMEOUT)
        and, for every volume element in the output:

          * builds a Volume from the attributes named in Volume.volume_attr
            (an attribute missing from the XML defaults to '0')
          * records the volume's options, switching the SMB / NFS protocol
            flags to 'off' when the matching option disables them
          * creates a Brick for each brick element and links it to the
            cluster, the volume and the owning node
          * groups the bricks into subvolumes when the volume has a replica
            count above 1 or a disperse count above 0
          * for replicate/disperse volume types, flags self heal as enabled
            on the volume and on its nodes unless 'cluster.self-heal-daemon'
            is set to off/false

        Finally self.volume_count and self.brick_count are refreshed from
        Volume.volume_count() and Brick.brick_count().

        Exits the program with status 16 when a brick's host cannot be
        matched to an entry in self.node.
        """

        if self.output_mode == 'console' and not cfg.no_progress_msgs:
            # print a progress message ("\x1b[A" is the ANSI cursor-up sequence)
            sys.stdout.write("Building volume objects" + " " * 20 + "\n\r\x1b[A")

        cmd = GlusterCommand("gluster vol info --xml", timeout=cfg.CMD_TIMEOUT)
        cmd.run()

        xml_string = ''.join(cmd.stdout)
        xml_root = ETree.fromstring(xml_string)

        for vol_object in xml_root.findall('.//volume'):

            # Build a dict for the initial volume settings. An attribute that
            # is missing from the XML gets a default of '0' (e.g. on older
            # glusterfs the disperse related fields are missing)
            vol_dict = {}
            for attr in Volume.volume_attr:
                attr_element = vol_object.find('./' + attr)
                vol_dict[attr] = '0' if attr_element is None else attr_element.text

            # create a volume object, for this volume
            new_volume = Volume(vol_dict)
            self.volume[new_volume.name] = new_volume

            if cfg.debug:
                print("defineVolumes. Adding volume %s" % new_volume.name)

            # add information about any volume options
            for option in vol_object.findall('.//option'):
                # Iterate the element directly to visit its children;
                # Element.getchildren() is deprecated and was removed in
                # Python 3.9
                for n in option:
                    if n.tag == 'name':
                        key = n.text
                    elif n.tag == 'value':
                        # relies on the name child preceding the value child
                        value = n.text
                        new_volume.options[key] = value

                        # Protocols are enabled by default, so we look
                        # for the volume tuning options that turn them
                        # off
                        if key == 'user.cifs':
                            if value in ['disable', 'off', 'false']:
                                new_volume.protocol['SMB'] = 'off'

                        elif key == 'nfs.disable':
                            if value in ['on', 'true']:
                                new_volume.protocol['NFS'] = 'off'

            # get bricks listed against this volume, and create the Brick object(s)
            brick_nodes = vol_object.findall('.//brick')

            # Subvolume grouping only applies to replicated/dispersed volumes;
            # these values do not change per brick, so work them out once
            uses_subvolumes = (new_volume.replicaCount > 1) or (new_volume.disperseCount > 0)
            bricks_per_subvolume = max(new_volume.replicaCount, new_volume.disperseCount)

            # list holding the brick paths of the subvolume being assembled
            repl = []
            ctr = 1

            for brick in brick_nodes:

                brick_path = brick.text
                new_volume.brick_order.append(brick_path)

                # brick text is split into exactly two parts on ':'; only the
                # host part is needed here
                (hostname, _brick_dir) = brick_path.split(':')

                if cfg.debug:
                    print("defineVolumes. Adding brick %s to %s" % (brick_path,
                                                                    new_volume.name))

                node_uuid = self.get_node(hostname)

                # find this brick's owning node; a miss means the host could
                # not be tied back to a known cluster peer
                try:
                    brick_owner = self.node[node_uuid]
                except KeyError:
                    print("Unable to associate brick %s with a peer in the cluster, possibly due" % brick_path)
                    print("to name lookup failures. If the nodes are not registered (fwd & rev)")
                    print("to dns, add local entries for your cluster nodes in the the /etc/hosts file")
                    sys.exit(16)

                # add this brick's owning node to the volume's attributes
                new_volume.node[node_uuid] = brick_owner

                new_brick = Brick(brick_path, brick_owner, new_volume.name)

                # Add the brick to the cluster and volume
                self.brick[brick_path] = new_brick
                new_volume.brick[brick_path] = new_brick

                # add this brick to the owning node
                brick_owner.brick[brick_path] = new_brick

                if uses_subvolumes:
                    repl.append(brick_path)
                    ctr += 1
                    if ctr > bricks_per_subvolume:
                        ctr = 1

                        # add this replica set to the volume's info
                        new_volume.subvolumes.append(repl)
                        # start a fresh list for the next subvolume
                        repl = []

            # By default from gluster 3.3 onwards, self heal is enabled for
            # all replicated/disperse volumes. We look at the volume type, and if it
            # is replicated and hasn't had self-heal explicitly disabled the
            # self heal state is inferred against the nodes that contain the
            # bricks for the volume. With this state in place, the updateState
            # method can cross-check to see what is actually happening
            vol_type = new_volume.typeStr.lower()

            if ('replicate' in vol_type) or ('disperse' in vol_type):

                heal_enabled = True  # assume it's on

                if 'cluster.self-heal-daemon' in new_volume.options:
                    if new_volume.options['cluster.self-heal-daemon'].lower() in ['off', 'false']:
                        heal_enabled = False

                new_volume.self_heal_enabled = heal_enabled

                if heal_enabled:

                    node_set = set()  # use a set to maintain a unique group of nodes

                    for brick_path in new_volume.brick:
                        this_node = self.brick[brick_path].node
                        node_set.add(this_node)
                        this_node.self_heal_enabled = True

                    # NOTE(review): this is overwritten for every heal-enabled
                    # volume, so it ends up holding the node count of the last
                    # such volume only - confirm that is intended
                    self.sh_enabled = len(node_set)

        self.volume_count = Volume.volume_count()
        self.brick_count = Brick.brick_count()