Ejemplo n.º 1
0
    def _shutdown():
        """Run shutdown(8) and then terminate this process with exit status 0.

        ``halt`` and ``at_time`` are free variables resolved outside this
        function.
        """
        console_log.info("Initiating server shutdown per manager request")

        # "-H" is passed when halt is truthy, "-h" otherwise.
        if halt:
            mode_flag = "-H"
        else:
            mode_flag = "-h"

        # Going through shutdown(8) gives a "nice" shutdown with a wall from
        # root, etc.
        shutdown_cmd = ["shutdown", mode_flag, at_time]
        AgentShell.try_run(shutdown_cmd)

        console_log.info("Terminating")
        os._exit(0)
Ejemplo n.º 2
0
    def _reboot():
        """Run ``shutdown -r`` and then terminate this process with exit status 0.

        ``at_time`` is a free variable resolved outside this function.
        """
        console_log.info("Initiating server reboot per manager request")

        # shutdown is invoked directly because reboot(8) just calls it anyhow.
        reboot_cmd = ["shutdown", "-r", at_time]
        AgentShell.try_run(reboot_cmd)

        console_log.info("Terminating")
        os._exit(0)
Ejemplo n.º 3
0
def kernel_status():
    """
    Report the running, required and installed kernel packages.

    :return: dict with keys 'running' ('kernel-<uname -r output>'),
             'required' ('kernel-<version>.<machine>' or None) and
             'available' (the non-empty lines printed by ``rpm -q kernel``)
    """
    uname_release = AgentShell.try_run(["uname", "-r"]).strip()
    running_kernel = "kernel-%s" % uname_release

    # The requirements of lustre-modules are tried first and those of
    # lustre-client-modules second; if both queries fail there is nothing
    # to parse.
    try:
        requires_output = AgentShell.try_run(["rpm", "-qR", "lustre-modules"])
    except AgentShell.CommandExecutionError:
        try:
            requires_output = AgentShell.try_run(
                ["rpm", "-qR", "lustre-client-modules"])
        except AgentShell.CommandExecutionError:
            requires_output = None

    required_kernel = None
    requirement_lines = requires_output.split("\n") if requires_output else []
    for requirement in requirement_lines:
        if not requirement.startswith('kernel'):
            continue
        # There is no early exit, so the last matching line wins.
        version = requirement.split(" = ")[1]
        required_kernel = "kernel-%s.%s" % (version, platform.machine())

    installed_output = AgentShell.try_run(["rpm", "-q", "kernel"])
    available_kernels = [pkg for pkg in installed_output.split("\n") if pkg]

    return {
        'running': running_kernel,
        'required': required_kernel,
        'available': available_kernels
    }
Ejemplo n.º 4
0
    def set_address(self, ipv4_address, prefix):
        """
        Bring the interface up and assign it ``ipv4_address``/``prefix``.

        Nothing is changed when the interface already reports
        ``ipv4_address``. Otherwise the device is handed to
        ``node_admin.unmanage_network``, the link is set up, the address is
        added, and the interface is polled until it reports the new address.
        Once it does, ``node_admin.write_ifcfg`` is called with the
        interface's address and netmask.

        :param ipv4_address: address the interface should end up with
        :param prefix: prefix length appended to the address as "<addr>/<prefix>"
        :raises RuntimeError: the interface still does not report
                              ``ipv4_address`` after 30 polls
        """
        ifaddr = "%s/%s" % (ipv4_address, prefix)

        console_log.info("Set %s (%s) up" % (self.name, ifaddr))

        if self.ipv4_address == ipv4_address:
            console_log.info("Nothing to do as %s already has address %s" %
                             (self.name, ifaddr))
            return

        node_admin.unmanage_network(self.device, self.mac_address)

        AgentShell.try_run(
            ['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
        AgentShell.try_run(
            ['/sbin/ip', 'addr', 'add', ifaddr, 'dev', self.name])

        # The link address change is asynchronous, so we need to wait for the
        # address to stick or we have a race condition.
        timeout = 30
        while self.ipv4_address != ipv4_address and timeout != 0:
            self.refresh()
            time.sleep(1)
            timeout -= 1

        if self.ipv4_address != ipv4_address:
            # Report the address that was requested; self.ipv4_address is
            # whatever the interface currently holds, which is exactly not
            # the one that failed to apply.
            raise RuntimeError(
                'Unable to set the address %s for interface %s' %
                (ipv4_address, self.name))

        node_admin.write_ifcfg(self.device, self.mac_address,
                               self.ipv4_address, self.ipv4_netmask)
Ejemplo n.º 5
0
    def private_key_file(self):
        """Return the path of the PEM private key, generating the file if it does not exist yet."""
        if os.path.exists(self.PRIVATE_KEY_FILE):
            return self.PRIVATE_KEY_FILE

        console_log.info("Generating private key")
        genrsa_cmd = ['openssl', 'genrsa', '-out', self.PRIVATE_KEY_FILE, '2048', '-sha256']
        AgentShell.try_run(genrsa_cmd)

        return self.PRIVATE_KEY_FILE
Ejemplo n.º 6
0
def action_two(arg1):
    """Run subprocess_one, check its output, then run subprocess_two.

    :param arg1: must equal "arg2_test"
    :return: ACTION_TWO_RETVAL
    """
    assert arg1 == "arg2_test"

    first_cmd = ['subprocess_one', 'subprocess_one_arg']
    first_output = AgentShell.try_run(first_cmd)
    assert first_output == 'subprocess_one_stdout'

    second_cmd = ['subprocess_two', 'subprocess_two_arg']
    AgentShell.try_run(second_cmd)

    return ACTION_TWO_RETVAL
Ejemplo n.º 7
0
def mount_lustre_filesystem(mountspec, mountpoint):
    """Create the mountpoint directory if needed, then mount it.

    The directory is created with mode 0o755, ``create_fstab_entry`` is
    called with both arguments, and ``/bin/mount <mountpoint>`` is run.

    :param mountspec: passed through to ``create_fstab_entry``
    :param mountpoint: directory to create and mount
    """
    try:
        os.makedirs(mountpoint, 0o755)
    except OSError as mkdir_error:
        # An already-existing mountpoint is fine; anything else propagates.
        if mkdir_error.errno == errno.EEXIST:
            pass
        else:
            raise

    create_fstab_entry(mountspec, mountpoint)

    mount_cmd = ["/bin/mount", mountpoint]
    AgentShell.try_run(mount_cmd)
Ejemplo n.º 8
0
    def private_key_file(self):
        """Return the path of the PEM private key, creating the key with openssl when the file is absent."""
        key_missing = not os.path.exists(self.PRIVATE_KEY_FILE)

        if key_missing:
            console_log.info("Generating private key")
            genrsa_cmd = ["openssl", "genrsa", "-out", self.PRIVATE_KEY_FILE]
            genrsa_cmd += ["2048", "-sha256"]
            AgentShell.try_run(genrsa_cmd)

        return self.PRIVATE_KEY_FILE
Ejemplo n.º 9
0
def delete_node(nodename):
    """
    Remove ``nodename`` from the cluster and delete its CIB entries.

    The node id is looked up in the output of ``crm_node -l``. Each line is
    read as whitespace-separated fields, with the id first and the name
    second. ``crm_node --force -R <id>`` is only run when a line for
    ``nodename`` was actually found, so a missing node can never cause a
    different node to be removed. The ``nodes/node`` and
    ``status/node_state`` entries for ``nodename`` are deleted through
    ``cibxpath`` in either case.

    :param nodename: name of the node, as listed by ``crm_node -l``
    """
    _rc, stdout, _stderr = AgentShell.run_old(["crm_node", "-l"])

    node_id = None
    for line in stdout.split("\n"):
        fields = line.split()
        # Blank lines (such as the one after a trailing newline) and lines
        # too short to hold an id and a name carry no node entry.
        if len(fields) < 2:
            continue
        if fields[1] == nodename:
            node_id = fields[0]
            break

    if node_id is not None:
        AgentShell.try_run(["crm_node", "--force", "-R", node_id])

    cibxpath("delete", '//nodes/node[@uname="{}"]'.format(nodename))
    cibxpath("delete", '//status/node_state[@uname="{}"]'.format(nodename))
Ejemplo n.º 10
0
 def disable_standby(self):
     """Set the "standby" attribute of node ``self.name`` to "off" with lifetime "forever"."""
     command = ["crm_attribute", "-N", self.name]
     command += ["-n", "standby", "-v", "off"]
     command.append("--lifetime=forever")

     AgentShell.try_run(command)
Ejemplo n.º 11
0
 def set_attribute(self, key, value):
     """Set attribute ``key`` to ``str(value)`` for node ``self.name`` via crm_attribute.

     :param key: attribute name, passed with ``-n``
     :param value: attribute value; converted with ``str`` before being passed with ``-v``
     """
     command = ["crm_attribute", "-t", "nodes", "-U", self.name, "-n", key]
     command.extend(["-v", str(value)])

     AgentShell.try_run(command)
Ejemplo n.º 12
0
def _move_target(target_label, dest_node):
    """
    Shared implementation of failover/failback: ask Pacemaker to move a resource.

    :param target_label: label of the resource to move
    :param dest_node: node the resource should end up on
    :return: None on success, otherwise a string describing the failure
    """
    move_cmd = [
        'crm_resource', '--resource', target_label, '--move', '--node',
        dest_node
    ]

    # Debug aid: capture the resource locations before touching anything, so
    # there is useful log output if the move does not work.
    AgentShell.run(['crm_mon', '-1'])

    # Clean up whatever happened to this resource earlier, so the move starts
    # from a clean state.
    cleanup_cmd = ['crm_resource', '--resource', target_label, '--cleanup']
    AgentShell.try_run(cleanup_cmd)

    move_result = AgentShell.run(move_cmd)
    if move_result.rc != 0:
        return "Error (%s) running '%s': '%s' '%s'" % (
            move_result.rc, " ".join(move_cmd), move_result.stdout,
            move_result.stderr)

    # Poll once a second until the resource reports the destination node;
    # this ends immediately if it was already there.
    seconds_left = 100
    while seconds_left > 0 and get_resource_location(target_label) != dest_node:
        time.sleep(1)
        seconds_left -= 1

    # Remove the constraint that crm_resource --move created, whether or not
    # the move completed.
    AgentShell.try_run([
        'crm_resource', '--resource', target_label, '--un-move', '--node',
        dest_node
    ])

    if seconds_left == 0:
        return "Failed to move target %s to node %s" % (target_label,
                                                        dest_node)
    return None
Ejemplo n.º 13
0
def unmanage_network(device, mac_address):
    """Rewrite the network configuration file to set NM_CONTROLLED="no"

    TODO: This is destructive and overwrites the file, losing all settings.
    This needs to be fixed up.

    :param device: passed through to ``write_ifcfg``
    :param mac_address: passed through to ``write_ifcfg``
    """
    ifcfg_path = write_ifcfg(device, mac_address, None, None)

    if platform_info.distro_version >= 7.0:
        # Exit codes that are deliberately ignored: network manager may be
        # uninstalled (127) or stopped (8). 2 is ignored as well.
        tolerated_rcs = (127, 2, 8)
        try:
            AgentShell.try_run(['nmcli', 'con', 'load', ifcfg_path])
        except AgentShell.CommandExecutionError as load_error:
            if load_error.result.rc in tolerated_rcs:
                return
            raise
Ejemplo n.º 14
0
def delete_node(nodename):
    """
    Remove ``nodename`` from the cluster and delete its CIB entries.

    The node id is looked up in the output of ``crm_node -l``. Each line is
    read as whitespace-separated fields, with the id first and the name
    second. ``crm_node --force -R <id>`` is only run when a line for
    ``nodename`` was actually found, so a missing node can never cause a
    different node to be removed. The ``node`` and ``node_state`` elements
    for ``nodename`` are deleted through ``cibadmin`` in either case.

    :param nodename: name of the node, as listed by ``crm_node -l``
    """
    _rc, stdout, _stderr = AgentShell.run_old(['crm_node', '-l'])

    node_id = None
    for line in stdout.split('\n'):
        fields = line.split()
        # Blank lines (such as the one after a trailing newline) and lines
        # too short to hold an id and a name carry no node entry.
        if len(fields) < 2:
            continue
        if fields[1] == nodename:
            node_id = fields[0]
            break

    if node_id is not None:
        AgentShell.try_run(['crm_node', '--force', '-R', node_id])

    cibadmin(
        ["--delete", "-o", "nodes", "-X",
         "<node uname=\"%s\"/>" % nodename])
    cibadmin([
        "--delete", "-o", "nodes", "--crm_xml",
        "<node_state uname=\"%s\"/>" % nodename
    ])
Ejemplo n.º 15
0
 def generate_csr(self, common_name):
     """Return the output of ``openssl req -new`` for ``common_name``, stripped of surrounding whitespace.

     :param common_name: value placed in the CN field of the subject
     """
     subject = "/C=/ST=/L=/O=/CN=%s" % common_name
     command = ["openssl", "req", "-new", "-sha256", "-subj", subject]
     command += ["-key", self.private_key_file]

     return AgentShell.try_run(command).strip()
Ejemplo n.º 16
0
    def _process_zpool(self, pool, block_devices):
        """
        Record a zpool's details, from a live listing or from saved data.

        When the ZfsDevice reports the pool as available, the output of
        ``zpool list`` is passed to ``_add_zfs_pool``. Otherwise previously
        saved data is looked up (by id when the pool info has one, by name
        when it does not) and passed to ``_update_pool_or_datasets``. A
        failed lookup is logged and otherwise ignored.

        :param pool: dict of pool info; 'pool' is read as the pool name and
                     'id' is optional
        :param block_devices: handed through unchanged to the method that
                              records the pool
        :return: None
        """
        pool_name = pool['pool']

        with ZfsDevice(pool_name, True) as zfs_device:

            if not zfs_device.available:
                # zpool probably imported elsewhere, attempt to read from
                # the store; this should return previously seen zpool state
                # either with or without datasets
                pool_id = pool.get('id', None)

                try:
                    if pool_id is not None:
                        data = read_from_store(pool_id)
                    else:
                        data = find_name_in_store(pool_name)
                except KeyError as e:
                    daemon_log.error(
                        "ZfsPool unavailable and could not be retrieved from store: %s (pool info: %s)"
                        % (e, pool))
                else:
                    # populate self._pools/datasets/zvols info from the saved data
                    self._update_pool_or_datasets(
                        block_devices, data['pool'], data['datasets'], data['zvols'])
            else:
                list_cmd = ["zpool", "list", "-H", "-o", "name,size,guid", pool['pool']]
                listing = AgentShell.try_run(list_cmd)
                self._add_zfs_pool(listing, block_devices)
Ejemplo n.º 17
0
def _get_zpool_datasets(pool_name, drives):
    """Collect the datasets listed by ``zfs list`` whose name starts with "<pool_name>/".

    :param pool_name: name of the zpool whose datasets are wanted
    :param drives: stored unchanged under the "drives" key of every entry
    :return: dict mapping each dataset's guid to a dict describing it
    """
    listing = AgentShell.try_run(['zfs', 'list', '-H', '-o', 'name,avail,guid'])

    zpool_datasets = {}
    if listing.strip() == "no datasets available":
        return zpool_datasets

    dataset_prefix = "%s/" % pool_name

    for row in listing.split('\n'):
        if not row:
            continue

        name, size_str, uuid = row.split()
        size = util.human_to_bytes(size_str)

        if not name.startswith(dataset_prefix):
            continue

        # This will need discussion, but for now fabricate a major:minor.
        # Do we ever use them as numbers?
        fabricated_major_minor = "zfsset:%s" % uuid

        zpool_datasets[uuid] = {
            "name": name,
            "path": name,
            "block_device": fabricated_major_minor,
            "uuid": uuid,
            "size": size,
            "drives": drives
        }

        daemon_log.debug("zfs mount '%s'" % name)

    return zpool_datasets
Ejemplo n.º 18
0
def _check_HYD4050():
    """
    Check for the HYD-4050 conditions: no default kernel, or no initramfs for it.

    return: None if everything is OK, error message if not.
    """
    # Ask grubby which kernel boots by default
    try:
        grubby_output = AgentShell.try_run(["grubby", "--default-kernel"])
        default_kernel = grubby_output.strip()
    except AgentShell.CommandExecutionError:
        return ("Unable to determine your default kernel.  "
                "This node may not boot successfully until grub "
                "is fixed to have a default kernel to boot.")

    # The version is everything after the first "-" (the whole string if
    # there is no "-").
    version_start = default_kernel.find("-") + 1
    default_kernel_version = default_kernel[version_start:]
    initramfs = "/boot/initramfs-%s.img" % default_kernel_version

    # Make sure that there is an initramfs for the booting kernel
    if os.path.isfile(initramfs):
        return None

    return ("There is no initramfs (%s) for the default kernel (%s).  "
            "This node may not boot successfully until an initramfs "
            "is created." % (initramfs, default_kernel_version))
Ejemplo n.º 19
0
def action_one_no_context(arg1):
    """Run subprocess_one and check its output.

    :param arg1: must equal "arg1_test"
    :return: ACTION_ONE_NO_CONTEXT_RETVAL
    """
    assert arg1 == "arg1_test"

    command = ['subprocess_one', 'subprocess_one_arg']
    command_output = AgentShell.try_run(command)
    assert command_output == 'subprocess_one_stdout'

    return ACTION_ONE_NO_CONTEXT_RETVAL
Ejemplo n.º 20
0
 def configured(self):
     """Tell whether this node has a pacemaker configuration set by IML.

     :return: True if "fence_chroma" appears in the output of
              ``cibadmin --query -o resource``, else False
     """
     query_cmd = ["cibadmin", "--query", "-o", "resource"]
     resources = AgentShell.try_run(query_cmd)

     return "fence_chroma" in resources
    def __init__(self, block_devices):
        """Populate ``self.vgs`` and ``self.lvs``, then parse the ``dmsetup table`` output.

        :param block_devices: stored as ``self.block_devices``
        """
        self.block_devices = block_devices
        self.mpaths = {}
        self.vgs = {}
        self.lvs = {}

        for vg_name, vg_uuid, vg_size in self._get_vgs():
            self.vgs[vg_name] = dict(
                name=vg_name, uuid=vg_uuid, size=vg_size, pvs_major_minor=[])
            self.lvs[vg_name] = {}

            for lv_name, lv_uuid, lv_size, lv_path in self._get_lvs(vg_name):
                # Instantiated only to cache the device type; see blockdevice
                # and filesystem for info.
                mapper_path = '/dev/mapper/%s-%s' % (vg_name, lv_name)
                BlockDevice('lvm_volume', mapper_path)

                self.lvs[vg_name][lv_name] = dict(
                    name=lv_name, uuid=lv_uuid, size=lv_size)

        dm_table = AgentShell.try_run(['dmsetup', 'table'])
        self._parse_dm_table(dm_table)
Ejemplo n.º 22
0
def kernel_status():
    """
    Report the running, required and installed kernel packages.

    :return: dict with keys "running" ('kernel-<uname -r output>'),
             "required" (the result of ``latest_kernel`` or None) and
             "available" (the non-empty lines printed by ``rpm -q kernel``)
    """
    uname_release = AgentShell.try_run(["uname", "-r"]).strip()
    running_kernel = "kernel-%s" % uname_release

    installed_output = AgentShell.try_run(["rpm", "-q", "kernel"])
    available_kernels = [pkg for pkg in installed_output.split("\n") if pkg]

    # Pick the rpm query that lists the module files: the server packages
    # are probed first, the client package second.
    if AgentShell.run(["rpm", "-q", "--whatprovides", "kmod-lustre"]).rc == 0:
        module_query = [
            "rpm", "-ql", "--whatprovides", "lustre-osd", "kmod-lustre"
        ]
    elif AgentShell.run(["rpm", "-q", "kmod-lustre-client"]).rc == 0:
        # but on a worker, we can ask kmod-lustre-client what the required
        # kernel is
        module_query = ["rpm", "-ql", "--whatprovides", "kmod-lustre-client"]
    else:
        module_query = None

    required_kernel = None
    if module_query is not None:
        try:
            file_listing = AgentShell.try_run(module_query)
            # Module names are the base names of the listed .ko files,
            # without the extension.
            modlist = [
                os.path.splitext(os.path.basename(path))[0]
                for path in file_listing.split("\n")
                if path.endswith(".ko")
            ]
            required_kernel = latest_kernel(available_kernels, modlist)
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None

    return {
        "running": running_kernel,
        "required": required_kernel,
        "available": available_kernels,
    }
Ejemplo n.º 23
0
def _res_set_started(ha_label, running):
    """Set the "target-role" meta parameter of resource ``ha_label``.

    :param ha_label: resource name passed to crm_resource
    :param running: truthy selects "Started", falsy selects "Stopped"
    """
    # RAISES AgentShell.CommandExecutionError on error
    role = "Started" if running else "Stopped"

    command = ["crm_resource", "--resource", ha_label]
    command += ["--set-parameter", "target-role", "--meta"]
    command += ["--parameter-value", role]

    AgentShell.try_run(command)
Ejemplo n.º 24
0
def action_one_with_context(agent_daemon_context, arg1):
    """Run subprocess_one and check its output.

    :param agent_daemon_context: must be an AgentDaemonContext instance
    :param arg1: must equal "arg1_test"
    :return: ACTION_ONE_WITH_CONTEXT_RETVAL
    """
    assert isinstance(agent_daemon_context, AgentDaemonContext)
    assert arg1 == "arg1_test"

    command = ['subprocess_one', 'subprocess_one_arg']
    command_output = AgentShell.try_run(command)
    assert command_output == 'subprocess_one_stdout'

    return ACTION_ONE_WITH_CONTEXT_RETVAL
Ejemplo n.º 25
0
    def has_link(self):
        """Report whether the interface's state file reads "up".

        An interface that is down is brought up first and polled for up to
        10 seconds; it is set back down before returning.

        :return: True if the state read for ``self.name`` is "up", else False
        """
        was_up = self.is_up

        # HYD-2003: Some NICs require the interface to be in an UP state
        # before link detection will work.
        polls_remaining = 0

        if not self.is_up:
            AgentShell.try_run(
                ["/sbin/ip", "link", "set", "dev", self.name, "up"])
            polls_remaining = 10

        def _read_device_state(name):
            # The path comes from the operstate template, which is defined
            # outside this method.
            try:
                filepath = operstate.format(name)
                if not os.path.exists(filepath):
                    return "unknown"
                with open(filepath, "r") as f:
                    return f.read().strip()
            except IOError:
                print(
                    "Could not read state of ethernet device {}".format(name))
                return "unknown"

        def _link_is_up():
            return _read_device_state(self.name) == "up"

        try:
            # Poll for link status on newly-up interfaces
            while polls_remaining:
                if _link_is_up():
                    return True
                time.sleep(1)
                polls_remaining -= 1

            return _link_is_up()
        except IOError:
            # If reading the state fails, then for the purposes of this
            # test, the interface is not usable. HYD-2679
            return False
        finally:
            # Restore the interface to the state it was found in
            if not was_up:
                AgentShell.try_run(
                    ["/sbin/ip", "link", "set", "dev", self.name, "down"])
Ejemplo n.º 26
0
def unmanage_network(device, mac_address):
    """
    Rewrite the network configuration file to set NM_CONTROLLED="no"

    TODO: This is destructive and overwrites the file clearing
    previously configured settings, needs fixing.

    :param device: passed through to ``write_ifcfg``
    :param mac_address: passed through to ``write_ifcfg``
    """
    ifcfg_path = write_ifcfg(device, mac_address, None, None)

    # Nothing to load when write_ifcfg returned no path
    if not ifcfg_path:
        return

    load_cmd = ['nmcli', 'con', 'load', ifcfg_path]
    try:
        AgentShell.try_run(load_cmd)
    except OSError as os_error:
        # ENOENT is ignored; every other OSError propagates
        if os_error.errno == errno.ENOENT:
            return
        raise os_error
    except AgentShell.CommandExecutionError as run_error:
        # An exit code of NM_STOPPED_RC is ignored; any other propagates
        if run_error.result.rc == NM_STOPPED_RC:
            return
        raise run_error
Ejemplo n.º 27
0
def kernel_status():
    """
    Report the running, required and installed kernel packages.

    :return: dict with keys 'running' ('kernel-<uname -r output>'),
             'required' (a kernel package name or None) and 'available'
             (the non-empty lines printed by ``rpm -q kernel``)
    """
    uname_release = AgentShell.try_run(["uname", "-r"]).strip()
    running_kernel = "kernel-%s" % uname_release

    required_kernel = None

    if AgentShell.run(["rpm", "-q", "--whatprovides", "kmod-lustre"]).rc == 0:
        # on a server, a required kernel is a lustre patched kernel since we
        # are building storage servers that can support both ldiskfs and zfs
        try:
            kernel_packages = AgentShell.try_run(
                ["rpm", "-q", "kernel"]).split('\n')
            # First "_lustre" package in reverse string-sorted order
            reverse_sorted = sorted(kernel_packages, reverse=True)
            required_kernel = next(
                pkg for pkg in reverse_sorted if "_lustre" in pkg)
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None
    elif AgentShell.run(["rpm", "-q", "kmod-lustre-client"]).rc == 0:
        # but on a worker, we can ask kmod-lustre-client what the required
        # kernel is
        try:
            requirements = AgentShell.try_run(
                ["rpm", "-q", "--requires", "kmod-lustre-client"]).split('\n')
            kernel_requirement = next(
                req for req in requirements if "kernel >=" in req)
            required_kernel_prefix = kernel_requirement.split(" >= ")[1]

            matching_packages = AgentShell.try_run(
                ["rpm", "-q", "kernel-%s*" % required_kernel_prefix])
            required_kernel = matching_packages.split('\n')[0]
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None

    installed_output = AgentShell.try_run(["rpm", "-q", "kernel"])
    available_kernels = [pkg for pkg in installed_output.split("\n") if pkg]

    return {
        'running': running_kernel,
        'required': required_kernel,
        'available': available_kernels
    }
Ejemplo n.º 28
0
    def has_link(self):
        """Report whether the interface has link, using the ETHTOOL_GLINK ioctl.

        An interface that is down is brought up first and polled for up to
        10 seconds; it is set back down before returning.

        :return: True if the ioctl reports link, False if it does not or if
                 the ioctl raises IOError
        """
        import array
        import struct
        import fcntl

        old_link_state_up = self.is_up

        # HYD-2003: Some NICs require the interface to be in an UP state
        # before link detection will work.
        time_left = 0

        if not self.is_up:
            AgentShell.try_run(
                ['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
            time_left = 10

        def _has_link():
            SIOCETHTOOL = 0x8946
            ETHTOOL_GLINK = 0x0000000a
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                ecmd = array.array('B', struct.pack('2I', ETHTOOL_GLINK, 0))
                ifreq = struct.pack('16sP', self.name, ecmd.buffer_info()[0])
                fcntl.ioctl(sock.fileno(), SIOCETHTOOL, ifreq)
            finally:
                # Close the socket even when the ioctl raises, so a failing
                # interface does not leak a descriptor on every call.
                sock.close()
            # The second 32-bit word of the command buffer is the link flag
            # filled in by the ioctl.
            return bool(struct.unpack('4xI', ecmd.tostring())[0])

        try:
            while time_left:
                # Poll for link status on newly-up interfaces
                if _has_link():
                    return True
                else:
                    time.sleep(1)
                    time_left -= 1

            return _has_link()
        except IOError:
            # If the ioctl fails, then for the purposes of this test, the
            # interface is not usable. HYD-2679
            return False
        finally:
            # Restore the interface to the state it was found in
            if not old_link_state_up:
                AgentShell.try_run(
                    ['/sbin/ip', 'link', 'set', 'dev', self.name, 'down'])
Ejemplo n.º 29
0
def get_cluster_node_name():
    """Return the stripped output of ``crm_node -n``, or ``socket.getfqdn()`` if that fails.

    Any exception from the crm_node call is logged at info level and
    triggers the fallback.
    """
    try:
        node_name = AgentShell.try_run(["crm_node", "-n"]).strip()
    except Exception as e:
        message = "Could not get cluster node name {}. Falling back to socket.getfqdn()"
        console_log.info(message.format(e))

        node_name = socket.getfqdn()

    return node_name
Ejemplo n.º 30
0
def _find_resource_constraint(ha_label, location):
    """
    Find the node named by the "<ha_label>-<location>" constraint of a resource.

    The output of ``crm_resource -r <ha_label> -a`` is scanned line by line
    for an entry whose id is exactly "<ha_label>-<location>".

    :param ha_label: resource name; matched literally in the constraint id
    :param location: constraint suffix; matched literally in the constraint id
    :return: the node name from the first matching line, or None if no line
             matches
    """
    stdout = AgentShell.try_run(["crm_resource", "-r", ha_label, "-a"])

    # Raw string so the regex escapes reach the re module untouched, and
    # re.escape so metacharacters in the label or location cannot change
    # what the pattern matches. Compiled once, outside the loop.
    constraint_re = re.compile(
        r"\s+:\s+Node\s+([^\s]+)\s+\(score=[^\s]+ id=%s-%s\)" %
        (re.escape(ha_label), re.escape(location)))

    for line in stdout.rstrip().split("\n"):
        match = constraint_re.match(line)
        if match:
            return match.group(1)

    return None