Beispiel #1
0
    def _shutdown():
        # Kick off a graceful shutdown (wall notice from root, etc.), then
        # terminate this agent process immediately without cleanup.
        console_log.info("Initiating server shutdown per manager request")

        if halt:
            mode_flag = "-H"
        else:
            mode_flag = "-h"
        AgentShell.try_run(["shutdown", mode_flag, at_time])

        console_log.info("Terminating")
        os._exit(0)
Beispiel #2
0
def scanner_cmd(cmd):
    """Send a JSON command to the device-scanner daemon over its unix socket.

    :param cmd: JSON-serializable command object
    :return: parsed JSON response, or None on a malformed/absent response
    """
    # Because we are pulling from device-scanner,
    # It is very important that we wait for
    # the udev queue to settle before requesting new data
    AgentShell.run(["udevadm", "settle"])

    client = socket.socket(socket.AF_UNIX)
    try:
        client.settimeout(10)
        # connect() raises on failure; the previous connect_ex() silently
        # discarded the error code and then failed confusingly in sendall().
        client.connect("/var/run/device-scanner.sock")
        client.sendall(json.dumps(cmd) + "\n")

        out = ""
        begin = 0

        while True:
            data = client.recv(1024)
            if not data:
                # Peer closed the connection before sending a newline-terminated
                # message; previously this spun forever appending "".
                return None
            out += data

            # Messages are expected to be separated by a newline
            # But sometimes it is not placed in the end of the line
            # Thus, take out only the first one
            idx = out.find("\n", begin)

            if idx >= 0:
                try:
                    return json.loads(out[:idx])
                except ValueError:
                    return None
            begin = len(out)
    finally:
        # Always release the socket; it used to leak on every call.
        client.close()
Beispiel #3
0
def stop_target(ha_label):
    '''
    Stop the high availability target

    Return: Value using simple return protocol
    '''
    # HYD-7230: brute force, try up to 3 times to stop the target
    attempt = 0
    while True:
        attempt += 1

        # Ask Pacemaker to disable the resource; for zfs-backed targets the
        # whole group is disabled, which disables every member regardless of
        # current status.
        if _resource_exists(_zfs_name(ha_label)):
            cmd = ['pcs', 'resource', 'disable', _group_name(ha_label)]
        else:
            cmd = ['pcs', 'resource', 'disable', ha_label]

        error = AgentShell.run_canned_error_message(cmd)

        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt >= 4:
            return agent_error("Failed to stop target {}".format(ha_label))

        console_log.info("failed to stop target %s", ha_label)
Beispiel #4
0
    def private_key_file(self):
        """Return a path to a PEM file, generating the key on first use."""
        key_path = self.PRIVATE_KEY_FILE
        if os.path.exists(key_path):
            return key_path

        console_log.info("Generating private key")
        AgentShell.try_run(['openssl', 'genrsa', '-out', key_path, '2048', '-sha256'])
        return key_path
Beispiel #5
0
def kernel_status():
    """
    Report kernel versions relevant to Lustre on this node.

    :return: {'running': 'kernel-X.Y.Z' (from uname -r),
              'required': <'kernel-A.B.C.arch' or None>,
              'available': [installed 'kernel-...' package names]}
    """
    running_kernel = "kernel-%s" % AgentShell.try_run(["uname", "-r"]).strip()
    # Query the server module package first, falling back to the client one;
    # if neither package is installed there is no kernel requirement.
    try:
        required_kernel_stdout = AgentShell.try_run(
            ["rpm", "-qR", "lustre-modules"])
    except AgentShell.CommandExecutionError:
        try:
            required_kernel_stdout = AgentShell.try_run(
                ["rpm", "-qR", "lustre-client-modules"])
        except AgentShell.CommandExecutionError:
            required_kernel_stdout = None

    required_kernel = None
    if required_kernel_stdout:
        # A requirement line looks like "kernel = A.B.C"; the last matching
        # line wins (no break), with the machine arch appended.
        for line in required_kernel_stdout.split("\n"):
            if line.startswith('kernel'):
                required_kernel = "kernel-%s.%s" % (line.split(" = ")[1],
                                                    platform.machine())

    # Every installed kernel package, skipping blank output lines.
    available_kernels = []
    for installed_kernel in AgentShell.try_run(["rpm", "-q",
                                                "kernel"]).split("\n"):
        if installed_kernel:
            available_kernels.append(installed_kernel)

    return {
        'running': running_kernel,
        'required': required_kernel,
        'available': available_kernels
    }
Beispiel #6
0
def cibadmin(command_args, timeout=120):
    """Run cibadmin with the given arguments, retrying transient failures.

    :param command_args: argument list for cibadmin (no longer mutated)
    :param timeout: seconds to keep retrying retryable exit codes
    :return: the successful AgentShell result
    :raises PacemakerError: if a retryable exit code persists past timeout
    :raises AgentShell.CommandExecutionError: on a non-retryable failure
    """
    assert timeout > 0, 'timeout must be greater than zero'

    # I think these are "errno" values, but I'm not positive
    # but going forward, any additions to this should try to be informative
    # about the type of exit code and why it's OK to retry
    RETRY_CODES = {
        10: "something unknown",
        41: "something unknown",
        62: "Timer expired",
        107: "Transport endpoint is not connected"
    }

    # Build a fresh argument list instead of insert(0, ...) so the caller's
    # list is not mutated as a side effect of calling this function.
    full_args = ['cibadmin'] + list(command_args)

    # NB: This isn't a "true" timeout, in that it won't forcibly stop the
    # subprocess after a timeout. We'd need more invasive changes to
    # shell._run() for that.
    for _ in util.wait(timeout):
        result = AgentShell.run(full_args)

        if result.rc == 0:
            return result
        elif result.rc not in RETRY_CODES:
            break

    if result.rc in RETRY_CODES:
        raise PacemakerError(
            "%s timed out after %d seconds: rc: %s, stderr: %s" %
            (" ".join(full_args), timeout, result.rc, result.stderr))
    else:
        raise AgentShell.CommandExecutionError(result, full_args)
Beispiel #7
0
    def _reboot():
        # shutdown -r performs the reboot; reboot(8) just calls shutdown anyhow.
        console_log.info("Initiating server reboot per manager request")
        reboot_cmd = ["shutdown", "-r", at_time]
        AgentShell.try_run(reboot_cmd)

        console_log.info("Terminating")
        os._exit(0)
Beispiel #8
0
    def set_address(self, ipv4_address, prefix):
        """Bring this interface up with the given IPv4 address/prefix.

        No-op if the interface already has the requested address. Otherwise
        unmanages it from NetworkManager, assigns the address via `ip`, waits
        for it to take effect and persists it to the ifcfg file.

        :param ipv4_address: dotted-quad address to assign
        :param prefix: CIDR prefix length
        :raises RuntimeError: if the address does not appear within 30s
        """
        ifaddr = "%s/%s" % (ipv4_address, prefix)

        console_log.info("Set %s (%s) up" % (self.name, ifaddr))

        if self.ipv4_address != ipv4_address:
            # Take the device away from NetworkManager before configuring it
            node_admin.unmanage_network(self.device, self.mac_address)

            AgentShell.try_run(
                ['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
            AgentShell.try_run(
                ['/sbin/ip', 'addr', 'add', ifaddr, 'dev', self.name])

            # The link address change is asynchronous, so we need to wait for the
            # address to stick or we have a race condition.
            timeout = 30
            while self.ipv4_address != ipv4_address and timeout != 0:
                self.refresh()
                time.sleep(1)
                timeout -= 1

            if self.ipv4_address != ipv4_address:
                # NOTE(review): this reports self.ipv4_address (the current,
                # unchanged address) rather than the requested one — confirm
                # that is intended.
                raise RuntimeError(
                    'Unable to set the address %s for interface %s' %
                    (self.ipv4_address, self.name))

            # Persist the new configuration so it survives a restart
            node_admin.write_ifcfg(self.device, self.mac_address,
                                   self.ipv4_address, self.ipv4_netmask)
        else:
            console_log.info("Nothing to do as %s already has address %s" %
                             (self.name, ifaddr))
Beispiel #9
0
def action_two(arg1):
    """An action which invokes subprocess_one and subprocess_two"""

    assert arg1 == "arg2_test"
    output = AgentShell.try_run(['subprocess_one', 'subprocess_one_arg'])
    assert output == 'subprocess_one_stdout'
    AgentShell.try_run(['subprocess_two', 'subprocess_two_arg'])
    return ACTION_TWO_RETVAL
def mount_lustre_filesystem(mountspec, mountpoint):
    """Create the mountpoint if needed, record it in fstab and mount it."""
    try:
        os.makedirs(mountpoint, 0o755)
    except OSError as err:
        # An already-existing directory is fine; re-raise anything else
        if err.errno != errno.EEXIST:
            raise

    create_fstab_entry(mountspec, mountpoint)
    AgentShell.try_run(["/bin/mount", mountpoint])
Beispiel #11
0
def _configure_target_ha(ha_label, info, enabled=False):
    """Create the pacemaker resource(s) for a Lustre target.

    For zfs-backed targets a ZFS resource is created first and the Lustre
    resource is placed in the same group. On failure the partially-created
    ZFS resource is deleted again.

    :param ha_label: resource label for the target
    :param info: dict with at least 'bdev', 'device_type', 'mntpt' keys
    :param enabled: if True, also wait for the resource(s) to start
    :return: an AgentShell result-like object (rc, stdout, stderr)
    """
    if enabled:
        extra = []
    else:
        extra = ['--disabled']

    bdev = info['bdev']

    if info['device_type'] == 'zfs':
        # zfs targets get a dedicated group containing pool-import + Lustre
        extra += ['--group', _group_name(ha_label)]
        # The pool name is the part of the dataset path before the first '/'
        zpool = info['bdev'].split("/")[0]
        result = AgentShell.run([
            'pcs', 'resource', 'create',
            _zfs_name(ha_label), 'ocf:chroma:ZFS', 'pool={}'.format(zpool),
            'op', 'start', 'timeout=120', 'op', 'stop', 'timeout=90'
        ] + extra)
        if result.rc != 0:
            console_log.error("Resource (%s) create failed:%d: %s", zpool,
                              result.rc, result.stderr)
            return result

        if enabled and not _wait_target(_zfs_name(ha_label), True):
            # Fabricate a failed result in the same shape as AgentShell's
            return {
                "rc":
                -1,
                "stdout":
                "",
                "stderr":
                "ZFS Resource ({}) failed to start".format(_zfs_name(ha_label))
            }

    else:
        # This is a hack for ocf:lustre:Lustre up to Lustre 2.10.5/2.11 see LU-11461
        result = AgentShell.run(['realpath', info['bdev']])
        if result.rc == 0 and result.stdout.startswith('/dev/sd'):
            bdev = result.stdout.strip()

    # Create Lustre resource and add target=uuid as an attribute
    result = AgentShell.run([
        'pcs', 'resource', 'create', ha_label, 'ocf:lustre:Lustre',
        'target={}'.format(bdev), 'mountpoint={}'.format(
            info['mntpt']), 'op', 'start', 'timeout=600'
    ] + extra)

    # Treat "created but never started" the same as a create failure
    if result.rc != 0 or enabled and not _wait_target(ha_label, True):
        if result.rc == 0:
            result.rc = -1
            result.stderr = "Resource ({}) failed to start".format(ha_label)

        console_log.error("Failed to create resource %s:%d: %s", ha_label,
                          result.rc, result.stderr)

        # Roll back the ZFS resource so we don't leave half a group behind
        if info['device_type'] == 'zfs':
            AgentShell.run(['pcs', 'resource', 'delete', _zfs_name(ha_label)])

    return result
def start_lnet():
    '''
    Place lnet into the 'up' state.
    '''
    console_log.info("Starting LNet")

    # modprobe lust is a hack for HYD-1263 - Fix or work around LU-1279 - failure trying to mount
    # should be removed when LU-1279 is fixed
    error = AgentShell.run_canned_error_message(["lctl", "net", "up"])
    if not error:
        error = AgentShell.run_canned_error_message(["modprobe", "lustre"])

    return agent_ok_or_error(error)
Beispiel #13
0
def delete_node(nodename):
    """Remove a node from the pacemaker cluster by name.

    Looks up the node id via ``crm_node -l``, force-removes the node and
    deletes its <node> and <node_state> CIB entries.

    :param nodename: uname of the node to remove
    :raises RuntimeError: if nodename is not listed by crm_node
    """
    rc, stdout, stderr = AgentShell.run_old(["crm_node", "-l"])
    node_id = None
    for line in stdout.split("\n"):
        # Expected format: "<id> <uname> <status>"; skip blank/short lines,
        # which previously crashed the 3-way unpack.
        fields = line.split(" ")
        if len(fields) >= 2 and fields[1] == nodename:
            node_id = fields[0]
            break
    # Without this check an unknown nodename fell through with the id of the
    # *last* listed node and force-removed the wrong node.
    if node_id is None:
        raise RuntimeError("Node {} not found in crm_node list".format(nodename))
    AgentShell.try_run(["crm_node", "--force", "-R", node_id])
    cibxpath("delete", '//nodes/node[@uname="{}"]'.format(nodename))
    cibxpath("delete", '//status/node_state[@uname="{}"]'.format(nodename))
Beispiel #14
0
    def private_key_file(self):
        """Return a path to a PEM file, creating the key if it is absent."""
        if os.path.exists(self.PRIVATE_KEY_FILE):
            return self.PRIVATE_KEY_FILE

        console_log.info("Generating private key")
        AgentShell.try_run([
            "openssl", "genrsa", "-out", self.PRIVATE_KEY_FILE, "2048",
            "-sha256"
        ])
        return self.PRIVATE_KEY_FILE
Beispiel #15
0
def _move_target(target_label, dest_node):
    """
    common plumbing for failover/failback. Move the target with label to the destination node.

    :param target_label: The label of the node to move
    :param dest_node: The target to move it to.
    :return: None if successful or an error message if an error occurred.
    """

    # Issue the command to Pacemaker to move the target
    arg_list = [
        "crm_resource",
        "--resource",
        target_label,
        "--move",
        "--node",
        dest_node,
    ]

    # For ongoing debug purposes, lets get the resource locations at the beginning.
    # This provides useful log output in the case where things don't work.
    AgentShell.run(["crm_mon", "-1"])

    # Now before we start cleanup anything that has gone on before. HA is a fickle
    # old thing and this will make sure that everything is clean before we start.
    AgentShell.try_run(
        ["crm_resource", "--resource", target_label, "--cleanup"])
    # zfs-backed targets have a companion resource that needs cleaning too
    if _resource_exists(_zfs_name(target_label)):
        AgentShell.try_run([
            "crm_resource", "--resource",
            _zfs_name(target_label), "--cleanup"
        ])

    result = AgentShell.run(arg_list)

    if result.rc != 0:
        return "Error ({}) running '{}': '{}' '{}'".format(
            result.rc, " ".join(arg_list), result.stdout, result.stderr)

    # seconds to wait for the resource to arrive at dest_node
    timeout = 100

    # Now wait for it to complete its move, this will succeed quickly if it was already there
    while timeout > 0:
        if get_resource_location(target_label) == dest_node:
            break

        time.sleep(1)
        timeout -= 1

    # now delete the constraint that crm_resource --move created
    # (done even on timeout so a stale location constraint is not left behind)
    AgentShell.try_run([
        "crm_resource", "--resource", target_label, "--un-move", "--node",
        dest_node
    ])

    if timeout <= 0:
        return "Failed to move target {} to node {}".format(
            target_label, dest_node)

    return None
Beispiel #16
0
 def disable_standby(self):
     """Permanently clear the pacemaker 'standby' attribute on this node."""
     cmd = [
         "crm_attribute",
         "-N",
         self.name,
         "-n",
         "standby",
         "-v",
         "off",
         "--lifetime=forever",
     ]
     AgentShell.try_run(cmd)
Beispiel #17
0
def _unconfigure_target_ha(ha_label, info, force=False):
    """Delete the pacemaker resource(s) for ha_label; returns the pcs result."""
    extra = ["--force"] if force else []

    result = AgentShell.run(['pcs', 'resource', 'delete', ha_label] + extra)

    # zfs-backed targets have a companion ZFS resource to remove as well
    if info['backfstype'] == "zfs":
        AgentShell.run(['pcs', 'resource', 'delete',
                        _zfs_name(ha_label)] + extra)

    return result
Beispiel #18
0
def latest_kernel(kernel_list, modlist):
    """Return the newest kernel in kernel_list for which every module in
    modlist is available (per modinfo), or None if none qualifies."""
    best = None
    arch = AgentShell.try_run(["uname", "-m"]).strip()

    for candidate in kernel_list:
        # Only consider candidates newer than the best one found so far
        if kver_gt(candidate, best, arch):
            # Version string is everything after the first '-'
            version = candidate.split("-", 1)[1]
            if AgentShell.run(["modinfo", "-n", "-k", version] +
                              modlist).rc == 0:
                best = candidate

    return best
Beispiel #19
0
def stop_lnet():
    """
    Place lnet into the 'down' state, any modules that are dependent on lnet being in the 'up' state
    will be unloaded before lnet is stopped.
    """

    console_log.info("Stopping LNet")

    # Unload dependent lustre modules first, then unconfigure lnet; the
    # first command's error (if any) short-circuits the second.
    error = AgentShell.run_canned_error_message(["lustre_rmmod", "ptlrpc"])
    if not error:
        error = AgentShell.run_canned_error_message(
            ["lnetctl", "lnet", "unconfigure"])

    return agent_ok_or_error(error)
Beispiel #20
0
 def set_attribute(self, key, value):
     """Set a persistent pacemaker node attribute for this node."""
     cmd = [
         "crm_attribute",
         "-t",
         "nodes",
         "-U",
         self.name,
         "-n",
         key,
         "-v",
         str(value),
     ]
     AgentShell.try_run(cmd)
Beispiel #21
0
def configure_corosync2_stage_2(ring0_name, ring1_name, new_node_fqdn, mcast_port, pcs_password, create_cluster):
    """Process configuration including peers and negotiated multicast port, no IP address
    information required

    Note: "The pcs cluster setup command will automatically configure two_node: 1 in
    corosync.conf, so a two-node cluster will "just work". If you are using a different cluster
    shell, you will have to configure corosync.conf appropriately yourself." Therefore
    no-quorum-policy does not have to be set when setting up cluster with pcs.

    :param ring0_name: interface name for corosync ring 0
    :param ring1_name: interface name for corosync ring 1
    :param new_node_fqdn: FQDN of the node being authenticated / added
    :param mcast_port: negotiated multicast port used by both rings
    :param pcs_password: password for the PCS_USER account
    :param create_cluster: True to create a new cluster, False to add this node to an existing one
    :return: Value using simple return protocol
    """

    interfaces = [InterfaceInfo(CorosyncRingInterface(name=ring0_name, ringnumber=0,
                                                      mcastport=mcast_port), None, None),
                  InterfaceInfo(CorosyncRingInterface(name=ring1_name, ringnumber=1,
                                                      mcastport=mcast_port), None, None)]

    config_params = {
        'token': '17000',
        'fail_recv_const': '10',
        'transport': 'udp',
        'rrpmode': 'passive',
        'addr0': interfaces[0].corosync_iface.bindnetaddr,
        'addr1': interfaces[1].corosync_iface.bindnetaddr,
        'mcast0': interfaces[0].corosync_iface.mcastaddr,
        'mcast1': interfaces[1].corosync_iface.mcastaddr,
        'mcastport0': interfaces[0].corosync_iface.mcastport,
        'mcastport1': interfaces[1].corosync_iface.mcastport
    }

    # authenticate nodes in cluster
    authenticate_nodes_in_cluster_command = ['pcs', 'cluster', 'auth', new_node_fqdn,
                                             '-u', PCS_USER, '-p', pcs_password]

    # build command string for setup of cluster which will result in corosync.conf rather than
    # writing from template, note we don't start the cluster here as services are managed
    # independently
    if create_cluster:
        cluster_setup_command = ['pcs', 'cluster', 'setup', '--name', PCS_CLUSTER_NAME, '--force'] + [new_node_fqdn]
        for param in ['transport', 'rrpmode', 'addr0', 'mcast0', 'mcastport0', 'addr1', 'mcast1',
                      'mcastport1', 'token', 'fail_recv_const']:
            # pull this value from the dictionary using parameter keyword
            cluster_setup_command.extend(["--" + param, str(config_params[param])])
    else:
        cluster_setup_command = ['pcs', 'cluster', 'node', 'add', new_node_fqdn]

    # auth error (if any) short-circuits the setup/add command
    return agent_ok_or_error(AgentShell.run_canned_error_message(authenticate_nodes_in_cluster_command) or
                             AgentShell.run_canned_error_message(cluster_setup_command))
Beispiel #22
0
def start_target(ha_label):
    '''
    Start the high availability target

    Return: Value using simple return protocol
    '''
    # HYD-1989: brute force, try up to 3 times to start the target
    i = 0
    while True:
        i += 1

        # Ask pacemaker to start the resource by setting its target-role
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])

        if error:
            return agent_error(error)

        # now wait for it to start
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed)
        rc, stdout, stderr = AgentShell.run_old(['crm_mon', '-1'])

        # The resource counts as started only if its crm_mon line does not
        # carry a FAILED marker
        failed = True
        for line in stdout.split("\n"):
            if line.lstrip().startswith(ha_label):
                if line.find("FAILED") < 0:
                    failed = False

        if failed:
            # try to leave things in a sane state for a failed mount
            error = AgentShell.run_canned_error_message([
                'crm_resource', '-r', ha_label, '-p', 'target-role', '-m',
                '-v', 'Stopped'
            ])

            if error:
                return agent_error(error)

            # Retry up to 3 times before giving up (4th failure is fatal)
            if i < 4:
                console_log.info("failed to start target %s" % ha_label)
            else:
                return agent_error("Failed to start target %s" % ha_label)

        else:
            # Started OK — report which node it is running on
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" %
                                   ha_label)
            return agent_result(location)
Beispiel #23
0
def delete_node(nodename):
    """Remove a node from the cluster via crm_node and cibadmin.

    :param nodename: uname of the node to remove
    :raises RuntimeError: if nodename is not listed by crm_node
    """
    rc, stdout, stderr = AgentShell.run_old(['crm_node', '-l'])
    node_id = None
    for line in stdout.split('\n'):
        # Expected format: "<id> <uname> <status>"; skip blank/short lines,
        # which previously crashed the 3-way unpack.
        fields = line.split(" ")
        if len(fields) >= 2 and fields[1] == nodename:
            node_id = fields[0]
            break
    # Without this check an unknown nodename fell through with the id of the
    # *last* listed node and force-removed the wrong node.
    if node_id is None:
        raise RuntimeError("Node {} not found in crm_node list".format(nodename))
    AgentShell.try_run(['crm_node', '--force', '-R', node_id])
    cibadmin(
        ["--delete", "-o", "nodes", "-X",
         "<node uname=\"%s\"/>" % nodename])
    # NOTE(review): node_state entries normally live in the status section,
    # yet this deletes from "-o nodes" (and uses the --crm_xml long option) —
    # preserved as-is, but confirm against the pacemaker version in use.
    cibadmin([
        "--delete", "-o", "nodes", "--crm_xml",
        "<node_state uname=\"%s\"/>" % nodename
    ])
def unmanage_network(device, mac_address):
    """Rewrite the network configuration file to set NM_CONTROLLED="no"
    TODO: This is destructive and overwrites the file losing all settings.
    This needs to be fixed up.
    """
    ifcfg_path = write_ifcfg(device, mac_address, None, None)

    # EL7+ uses NetworkManager; ask it to reload the rewritten ifcfg file
    if platform_info.distro_version >= 7.0:
        try:
            AgentShell.try_run(['nmcli', 'con', 'load', ifcfg_path])
        except AgentShell.CommandExecutionError as cee:
            if cee.result.rc not in [
                    127, 2, 8
            ]:  # network manager may be uninstalled (127) stopped (8)
                # NOTE(review): rc 2 is tolerated but undocumented here —
                # presumably an nmcli usage/connection error; confirm.
                raise
Beispiel #25
0
    def _lnet_state(self):
        """Classify LNet state as lnet_unloaded / lnet_down / lnet_up using
        the module's sysfs presence and an lnetctl probe."""
        loaded = AgentShell.run(['udevadm', 'info', '--path',
                                 '/sys/module/lnet']).rc == 0
        if not loaded:
            return "lnet_unloaded"

        up = AgentShell.run(['lnetctl', 'net', 'show']).rc == 0
        return "lnet_up" if up else "lnet_down"
    def _lnet_state(self):
        """Report LNet state: 'lnet_unloaded', 'lnet_down' or 'lnet_up'."""
        # Module presence is inferred from its sysfs node existing
        if AgentShell.run(["udevadm", "info", "--path",
                           "/sys/module/lnet"]).rc != 0:
            return "lnet_unloaded"

        if AgentShell.run(["lnetctl", "net", "show"]).rc == 0:
            return "lnet_up"
        return "lnet_down"
Beispiel #27
0
def _get_zpool_datasets(pool_name, drives):
    """ Retrieve datasets belonging to a zpool """
    listing = AgentShell.try_run(['zfs', 'list', '-H', '-o', 'name,avail,guid'])

    datasets = {}

    if listing.strip() == "no datasets available":
        return datasets

    for entry in filter(None, listing.split('\n')):
        name, size_str, uuid = entry.split()
        size = util.human_to_bytes(size_str)

        # Only datasets that live under the requested pool
        if not name.startswith("%s/" % pool_name):
            continue

        # This will need discussion, but for now fabricate a major:minor. Do we ever use them as numbers?
        major_minor = "zfsset:%s" % uuid

        datasets[uuid] = {
            "name": name,
            "path": name,
            "block_device": major_minor,
            "uuid": uuid,
            "size": size,
            "drives": drives
        }

        daemon_log.debug("zfs mount '%s'" % name)

    return datasets
Beispiel #28
0
def _check_HYD4050():
    """
    HYD-4050 means that kernels are not installed with a default kernel or the initramfs isn't present.

    This function checks for these cases and returns an error message if a problem exists.

    return: None if everything is OK, error message if not.
    """

    # First, the booting kernel must be determinable at all
    try:
        default_kernel = AgentShell.try_run(["grubby",
                                             "--default-kernel"]).strip()
    except AgentShell.CommandExecutionError:
        return ("Unable to determine your default kernel.  "
                "This node may not boot successfully until grub "
                "is fixed to have a default kernel to boot.")

    # Everything after the first '-' is the kernel version string
    version = default_kernel[default_kernel.find("-") + 1:]
    initramfs = "/boot/initramfs-%s.img" % version

    if os.path.isfile(initramfs):
        return None

    return ("There is no initramfs (%s) for the default kernel (%s).  "
            "This node may not boot successfully until an initramfs "
            "is created." % (initramfs, version))
    def __init__(self, block_devices):
        """Scan LVM state (volume groups, logical volumes) and the
        device-mapper table for the given block devices.

        :param block_devices: block-device collection used by the scan
        """
        self.block_devices = block_devices
        # mpaths: filled in later (by _parse_dm_table, presumably — confirm)
        self.mpaths = {}
        # vgs: vg_name -> {name, uuid, size, pvs_major_minor}
        self.vgs = {}
        # lvs: vg_name -> lv_name -> {name, uuid, size}
        self.lvs = {}

        for vg_name, vg_uuid, vg_size in self._get_vgs():
            self.vgs[vg_name] = {
                'name': vg_name,
                'uuid': vg_uuid,
                'size': vg_size,
                'pvs_major_minor': []
            }
            self.lvs[vg_name] = {}
            for lv_name, lv_uuid, lv_size, lv_path in self._get_lvs(vg_name):
                # Do this to cache the device, type see blockdevice and filesystem for info.
                BlockDevice('lvm_volume',
                            '/dev/mapper/%s-%s' % (vg_name, lv_name))

                self.lvs[vg_name][lv_name] = {
                    'name': lv_name,
                    'uuid': lv_uuid,
                    'size': lv_size
                }

        # Parse the device-mapper table for multipath/LV topology
        stdout = AgentShell.try_run(['dmsetup', 'table'])
        self._parse_dm_table(stdout)
Beispiel #30
0
def action_one_no_context(arg1):
    """An action which invokes subprocess_one"""

    assert arg1 == "arg1_test"
    output = AgentShell.try_run(['subprocess_one', 'subprocess_one_arg'])
    assert output == 'subprocess_one_stdout'
    return ACTION_ONE_NO_CONTEXT_RETVAL