コード例 #1
0
    def _lnet_state(self):
        """Return the LNet state: "lnet_unloaded", "lnet_down" or "lnet_up"."""
        # The lnet kernel module is loaded iff udevadm can resolve its
        # sysfs path (exit code 0).
        loaded = AgentShell.run(
            ['udevadm', 'info', '--path', '/sys/module/lnet']).rc == 0

        if not loaded:
            return "lnet_unloaded"

        # Module present: a successful `lnetctl net show` means LNet is up.
        up = AgentShell.run(['lnetctl', 'net', 'show']).rc == 0
        return "lnet_up" if up else "lnet_down"
コード例 #2
0
    def _lnet_state(self):
        """Classify LNet as "lnet_unloaded", "lnet_down" or "lnet_up"."""
        # udevadm exits 0 only when the lnet module's sysfs node exists.
        module_present = AgentShell.run(
            ["udevadm", "info", "--path", "/sys/module/lnet"]).rc == 0

        if not module_present:
            return "lnet_unloaded"

        # `lnetctl net show` succeeds only when LNet is configured/up.
        if AgentShell.run(["lnetctl", "net", "show"]).rc == 0:
            return "lnet_up"

        return "lnet_down"
コード例 #3
0
def _resource_exists(ha_label):
    '''
    Check whether the named resource is present in the current
    Pacemaker configuration.

    :param ha_label: resource id to look up via crm_resource
    :return: True if exists
    '''
    return AgentShell.run(["crm_resource", "-W", "-r", ha_label]).rc == 0
コード例 #4
0
ファイル: pacemaker.py プロジェクト: whamcloud/iml-agent
def _cibadmin(command_args, timeout=120, raise_on_timeout=False):
    """
    Run ``cibadmin <command_args>``, retrying known-transient exit codes
    until *timeout* seconds have elapsed.

    :param command_args: argument list to pass to cibadmin (not modified)
    :param timeout: seconds to keep retrying transient failures
    :param raise_on_timeout: raise PacemakerError if still failing with a
                             retryable code when the timeout expires
    :return: the final AgentShell run result
    """
    assert timeout > 0, "timeout must be greater than zero"

    # I think these are "errno" values, but I'm not positive
    # but going forward, any additions to this should try to be informative
    # about the type of exit code and why it's OK to retry
    RETRY_CODES = {
        10: "something unknown",
        41: "something unknown",
        62: "Timer expired",
        107: "Transport endpoint is not connected",
    }

    # Build a fresh argv instead of insert(0, ...): mutating the caller's
    # list would prepend "cibadmin" again on any re-invocation.
    command_args = ["cibadmin"] + list(command_args)

    # NB: This isn't a "true" timeout, in that it won't forcibly stop the
    # subprocess after a timeout. We'd need more invasive changes to
    # shell._run() for that.
    for _ in util.wait(timeout):
        result = AgentShell.run(command_args)

        if result.rc == 0:
            return result
        elif result.rc not in RETRY_CODES:
            # Non-retryable failure: stop immediately and return it below.
            break

    if raise_on_timeout and result.rc in RETRY_CODES:
        raise PacemakerError(
            "%s timed out after %d seconds: rc: %s, stderr: %s" %
            (" ".join(command_args), timeout, result.rc, result.stderr))

    return result
コード例 #5
0
def unconfigure_corosync2(host_fqdn, mcast_port):
    """
    Unconfigure the corosync application.

    For corosync2 don't disable pcsd, just remove host node from cluster and disable corosync from
    auto starting (service should already be stopped in state transition)

    Note that pcs cluster commands handle editing and removal of the corosync.conf file

    :param host_fqdn: fully-qualified domain name of the node to remove
    :param mcast_port: UDP multicast port whose firewall rule is removed
    Return: Value using simple return protocol
    """
    error = corosync_service.disable()
    if error:
        return agent_error(error)

    # Detect if we are the only node in the cluster, we want to do this before next command removes conf file
    cluster_nodes = _nodes_in_cluster()

    result = AgentShell.run(["pcs", "--force", "cluster", "node", "remove", host_fqdn])

    if result.rc != 0:
        if "No such file or directory" in result.stderr:
            # we want to return successful if the configuration file does not exist
            console_log.warning(result.stderr)
        elif "Error: Unable to update any nodes" in result.stderr:
            # this error is expected when this is the last node in the cluster
            if len(cluster_nodes) != 1:
                return agent_error(result.stderr)
        else:
            return agent_error(result.stderr)

    # Finally remove the firewall openings that were made when the cluster
    # was configured (pcs TCP port and the corosync multicast port).
    return agent_ok_or_error(
        firewall_control.remove_rule(PCS_TCP_PORT, "tcp", "pcs", persist=True)
        or firewall_control.remove_rule(mcast_port, "udp", "corosync", persist=True)
    )
コード例 #6
0
def _required_kernel_for(available_kernels, provides):
    """Return the newest kernel in *available_kernels* able to load the
    modules shipped by the packages providing *provides*, or None."""
    try:
        # Basenames (".ko" stripped) of every kernel module file shipped by
        # the packages that provide the given capabilities.
        modlist = [
            os.path.splitext(os.path.basename(k))[0]
            for k in AgentShell.try_run(
                ["rpm", "-ql", "--whatprovides"] + list(provides)
            ).split("\n") if k.endswith(".ko")
        ]
        return latest_kernel(available_kernels, modlist)
    except (AgentShell.CommandExecutionError, StopIteration):
        # rpm query failed or no suitable kernel found
        return None


def kernel_status():
    """
    :return: {'running': {'kernel-X.Y.Z'}, 'required': <'kernel-A.B.C' or None>}
    """
    # Running kernel, prefixed to match rpm package naming.
    running_kernel = "kernel-%s" % AgentShell.try_run(["uname", "-r"]).strip()

    available_kernels = [
        k for k in AgentShell.try_run(["rpm", "-q", "kernel"]).split("\n") if k
    ]

    if AgentShell.run(["rpm", "-q", "--whatprovides", "kmod-lustre"]).rc == 0:
        # server: the required kernel must be able to load the osd modules
        required_kernel = _required_kernel_for(
            available_kernels, ["lustre-osd", "kmod-lustre"])
    elif AgentShell.run(["rpm", "-q", "kmod-lustre-client"]).rc == 0:
        # but on a worker, we can ask kmod-lustre-client what the required
        # kernel is
        required_kernel = _required_kernel_for(
            available_kernels, ["kmod-lustre-client"])
    else:
        # neither server nor client lustre modules installed
        required_kernel = None

    return {
        "running": running_kernel,
        "required": required_kernel,
        "available": available_kernels,
    }
コード例 #7
0
    def _full_scan(self):
        """Scan local block devices and return them grouped by category.

        Returns {} on worker nodes, whose devices are not reported.
        """
        # If we are a worker node then return nothing because our devices are not of interest. This is a short term
        # solution for HYD-3140. This plugin should really be loaded if it is not needed but for now this sorts out
        # and issue with PluginAgentResources being in the linux plugin.
        if config.get('settings', 'profile')['worker']:
            return {}

        # Before we do anything do a partprobe, this will ensure that everything gets an up to date view of the
        # device partitions. partprobe might throw errors so ignore return value
        AgentShell.run(["partprobe"])

        # Map of block devices major:minors to /dev/ path.
        block_devices = BlockDevices()

        # Devicemapper: LVM and Multipath
        dmsetup = DmsetupTable(block_devices)

        # Software RAID
        mds = MdRaid(block_devices).all()

        # _zpools
        zfs_devices = ZfsDevices()
        zfs_devices.full_scan(block_devices)

        # EMCPower Devices
        emcpowers = EMCPower(block_devices).all()

        # Local filesystems (not lustre) in /etc/fstab or /proc/mounts
        local_fs = LocalFilesystems(block_devices).all()

        # We have scan devices, so set the devices scanned flags.
        LinuxDevicePlugin.devices_scanned = True

        return {
            "vgs": dmsetup.vgs,
            "lvs": dmsetup.lvs,
            "zfspools": zfs_devices.zpools,
            "zfsdatasets": zfs_devices.datasets,
            "zfsvols": zfs_devices.zvols,
            "mpath": dmsetup.mpaths,
            "devs": block_devices.block_device_nodes,
            "local_fs": local_fs,
            'emcpower': emcpowers,
            'mds': mds
        }
コード例 #8
0
def convert_targets(force=False):
    '''
    Convert existing ocf:chroma:Target to ZFS + Lustre
    '''
    try:
        result = AgentShell.run(['cibadmin', '--query'])
    # `except OSError as err` replaces the Python-2-only comma form so the
    # code also parses under Python 3 (valid since Python 2.6).
    except OSError as err:
        # ENOENT just means cibadmin/pacemaker is not installed; anything
        # else is unexpected and re-raised.
        if err.errno != errno.ENOENT:
            raise
コード例 #9
0
def kernel_status():
    """
    :return: {'running': {'kernel-X.Y.Z'}, 'required': <'kernel-A.B.C' or None>}
    """
    # Running kernel, prefixed to match rpm package naming.
    running_kernel = "kernel-%s" % AgentShell.try_run(["uname", "-r"]).strip()

    if AgentShell.run(["rpm", "-q", "--whatprovides", "kmod-lustre"]).rc == 0:
        # on a server, a required kernel is a lustre patched kernel since we
        # are building storage servers that can support both ldiskfs and zfs
        try:
            # Highest-sorted installed kernel whose name contains "_lustre"
            required_kernel = \
                next(k for k in sorted(AgentShell.try_run(["rpm", "-q",
                                                           "kernel"]).split('\n'),
                                       reverse=True)
                     if "_lustre" in k)
        except (AgentShell.CommandExecutionError, StopIteration):
            # rpm failed or no patched kernel installed
            required_kernel = None
    elif AgentShell.run(["rpm", "-q", "kmod-lustre-client"]).rc == 0:
        # but on a worker, we can ask kmod-lustre-client what the required
        # kernel is
        try:
            # e.g. a "kernel >= X.Y.Z" requirement line -> "X.Y.Z"
            required_kernel_prefix = \
                next(k for k in AgentShell.try_run(["rpm", "-q", "--requires",
                                                    "kmod-lustre-client"]).split('\n')
                     if "kernel >=" in k).split(" >= ")[1]
            # First installed kernel package matching that version prefix
            required_kernel = AgentShell.try_run(
                ["rpm", "-q",
                 "kernel-%s*" % required_kernel_prefix]).split('\n')[0]
        except (AgentShell.CommandExecutionError, StopIteration):
            required_kernel = None
    else:
        # Neither server nor client lustre kmod packages installed.
        required_kernel = None

    # All installed kernel packages (rpm prints one per line).
    available_kernels = []
    for installed_kernel in AgentShell.try_run(["rpm", "-q",
                                                "kernel"]).split("\n"):
        if installed_kernel:
            available_kernels.append(installed_kernel)

    return {
        'running': running_kernel,
        'required': required_kernel,
        'available': available_kernels
    }
コード例 #10
0
def unmount_target(uuid):
    # This is called by the Target RA from corosync

    # only unmount targets that are controlled by chroma:Target
    try:
        result = AgentShell.run(
            ['cibadmin', '--query', '--xpath', '//primitive'])
    # `except OSError as err` replaces the Python-2-only comma form so the
    # code also parses under Python 3 (valid since Python 2.6).
    except OSError as err:
        # ENOENT: cibadmin/pacemaker not installed; anything else re-raises.
        if err.errno != errno.ENOENT:
            raise
コード例 #11
0
def get_resource_locations():
    """Parse `crm_mon -1` to identify where (if anywhere) resources
    (i.e. targets) are running
    returns [ resoure_id: location|None, ... ]
    """
    try:
        result = AgentShell.run(["crm_mon", "-1", "-r", "-X"])
    # `except OSError as err` replaces the Python-2-only comma form so the
    # code also parses under Python 3 (valid since Python 2.6).
    except OSError as err:
        # ENOENT is fine here.  Pacemaker might not be installed yet.
        if err.errno != errno.ENOENT:
            raise
コード例 #12
0
def selinux_status():
    """
    Get selinux status on node
    :return: {'status': 'Disabled'}
    """
    # getenforce prints Enforcing/Permissive/Disabled; if the command is
    # missing or fails we fall back to reporting Disabled.
    result = AgentShell.run(["getenforce"])
    status = result.stdout.strip() if result.rc == 0 else "Disabled"

    return {"status": status}
コード例 #13
0
def latest_kernel(kernel_list, modlist):
    """Return the newest kernel in *kernel_list* able to load every module
    in *modlist* (per modinfo), or None if none qualifies."""
    best = None
    arch = AgentShell.try_run(["uname", "-m"]).strip()

    for candidate in kernel_list:
        # Only consider candidates strictly newer than the current best.
        if not kver_gt(candidate, best, arch):
            continue
        # Strip the leading "kernel-" package prefix to get the version.
        version = candidate.split("-", 1)[1]
        if AgentShell.run(["modinfo", "-n", "-k", version] + modlist).rc == 0:
            best = candidate

    return best
コード例 #14
0
def scanner_cmd(cmd):
    """
    Send *cmd* (JSON-serializable) to the device-scanner daemon over its
    unix socket and return the decoded JSON reply.
    """
    # Because we are pulling from device-scanner,
    # It is very important that we wait for
    # the udev queue to settle before requesting new data
    AgentShell.run(["udevadm", "settle"])

    client = socket.socket(socket.AF_UNIX)
    client.settimeout(10)

    try:
        # connect() raises on failure; the previous connect_ex() silently
        # discarded the error and failed obscurely in sendall/recv.
        client.connect("/var/run/device-scanner.sock")
        client.sendall(json.dumps(cmd) + "\n")

        out = ""

        while True:
            data = client.recv(1024)
            if not data:
                # Peer closed before sending a complete reply; without this
                # check an empty read would spin in this loop forever.
                raise socket.error(
                    "device-scanner closed the connection mid-reply")
            out += data

            if out.endswith("\n"):
                try:
                    return json.loads(out)
                except ValueError:
                    # Incomplete JSON despite the newline; keep reading.
                    pass
    finally:
        # Always release the socket, including on timeout or bad JSON.
        client.close()
コード例 #15
0
def _configure_target_priority(primary, ha_label, node):
    """Add a pcs location constraint tying *ha_label* to *node*.

    Primary locations score 20, secondaries 10, so the resource prefers
    its primary node.  Returns the AgentShell result.
    """
    score = "20" if primary else "10"
    name = _constraint(ha_label, primary)

    result = AgentShell.run(
        ['pcs', 'constraint', 'location', 'add', name, ha_label, node, score])

    # pcs exits 76 when the constraint already exists; treat as success.
    if result.rc == 76:
        console_log.warn("A constraint with the name %s already exists", name)
        result.rc = 0

    return result
コード例 #16
0
def get_resource_locations():
    """Parse `crm_mon -1` to identify where (if anywhere) resources
    (i.e. targets) are running
    returns [ resoure_id: location|None, ... ]
    """
    try:
        result = AgentShell.run(["crm_mon", "-1", "-r", "-X"])
    except OSError as err:
        # ENOENT is fine here.  Pacemaker might not be installed yet.
        if err.errno != errno.ENOENT:
            # bare `raise` preserves the original traceback, unlike
            # `raise err` which restarts it from this frame
            raise
        return {}

    if result.rc != 0:
        console_log.info("crm_mon failed (%d): '%s' '%s'", result.rc,
                         result.stdout, result.stderr)
        return {}

    return _get_resource_locations(result.stdout)
コード例 #17
0
def _nodes_in_cluster():
    """
    Returns the nodes in the corosync cluster

    example output from command 'pcs status corosync':
    > Corosync Nodes:
    >  Online:
    >  Offline: bill.bailey.com bob.marley.com

    :return: a list of all nodes in cluster
    """
    result = AgentShell.run(["pcs", "status", "nodes", "corosync"])

    if result.rc != 0:
        # log all command errors but always continue to remove node from cluster
        console_log.warning(result.stderr)
        return []

    nodes = []
    for line in result.stdout.split("\n"):
        # nodes are on the right side of lines separated with ':'
        if line.find(":") <= 0:
            continue
        nodes.extend(line.split(":")[1].strip().split())

    return nodes
コード例 #18
0
def fetch_device_list():
    """Return devices reported by device-scanner, mapped through as_device
    and filtered by filter_device."""
    # Wait for udev to finish processing events so scanner data is current.
    AgentShell.run(["udevadm", "settle"])
    info = scanner_cmd("info")

    # NOTE(review): itervalues() is Python-2 only; presumably `info` is a
    # dict keyed by device identifier -- confirm against scanner_cmd output.
    return pipe(info.itervalues(), cmap(as_device), cfilter(filter_device),
                list)
コード例 #19
0
ファイル: yum_utils.py プロジェクト: whamcloud/iml-agent
def yum_util(action,
             packages=(),
             fromrepo=None,
             enablerepo=None,
             narrow_updates=False):
    """
    A wrapper to perform yum actions in encapsulated way.
    :param action:  clean, install, remove, update, requires etc
    :param packages: Packages to install or remove (any iterable; default
                     is an immutable tuple to avoid the mutable-default trap)
    :param fromrepo: The repo the action should be carried out from, others are disabled.
    :param enablerepo: The repo to enable for the action, others are not disabled or enabled
    :param narrow_updates: ?
    :return: stdout of the successful command; raises CommandExecutionError
             after exhausting retries.
    """

    if fromrepo and enablerepo:
        raise ValueError(
            "Cannot provide fromrepo and enablerepo simultaneously")

    repo_arg = []
    valid_rc_values = [0]  # Some errors values other than 0 are valid.
    tries = 2
    if fromrepo:
        # Restrict the transaction to exactly the named repos.
        repo_arg = ["--disablerepo=*"
                    ] + ["--enablerepo=%s" % r for r in fromrepo]
    elif enablerepo:
        repo_arg = ["--enablerepo=%s" % r for r in enablerepo]
    if narrow_updates and action == "query":
        repo_arg.extend(["--upgrades"])

    if action == "clean":
        cmd = ["yum", "clean", "all"
               ] + (repo_arg if repo_arg else ["--enablerepo=*"])
    elif action == "install":
        cmd = (["yum", "install", "-y", "--exclude", "kernel-debug"] +
               repo_arg + list(packages))
    elif action == "remove":
        cmd = ["yum", "remove", "-y"] + repo_arg + list(packages)
    elif action == "update":
        cmd = (["yum", "update", "-y", "--exclude", "kernel-debug"] +
               repo_arg + list(packages))
    elif action == "requires":
        cmd = ["repoquery", "--requires"] + repo_arg + list(packages)
    elif action == "query":
        cmd = ["repoquery"] + repo_arg + list(packages)
    elif action == "repoquery":
        cmd = (["repoquery", "--show-duplicates"] + repo_arg + [
            "--queryformat=%{EPOCH} %{NAME} "
            "%{VERSION} %{RELEASE} %{ARCH}"
        ])
    else:
        raise RuntimeError("Unknown yum util action %s" % action)

    # This is a poor solution for HYD-3855 but not one that carries any known cost.
    # We sometimes see intermittent failures in test, and possibly out of test, that occur
    # 1 in 50 (estimate) times. yum commands are idempotent and so trying the command three
    # times has no downside and changes the estimated chance of fail to 1 in 12500.
    for hyd_3885 in range(tries, -1, -1):
        result = AgentShell.run(cmd)

        if result.rc in valid_rc_values:
            return result.stdout
        else:
            # if we were trying to install, clean the metadata before
            # trying again
            if action == "install":
                AgentShell.run(["yum", "clean", "metadata"])
            daemon_log.info("HYD-3885 Retrying yum command '%s'" %
                            " ".join(cmd))
            if hyd_3885 == 0:
                daemon_log.info("HYD-3885 Retry yum command failed '%s'" %
                                " ".join(cmd))
                raise AgentShell.CommandExecutionError(
                    result, cmd)  # Out of retries so raise for the caller..
コード例 #20
0
    for con in dom.getElementsByTagName('rsc_location'):
        ha_label = con.getAttribute("rsc")
        if not locations.get(ha_label):
            locations[ha_label] = {}
        if con.getAttribute("id") == _constraint(ha_label, True):
            ind = 0
        elif con.getAttribute("id") == _constraint(ha_label, False):
            ind = 1
        else:
            console_log.info("Unknown constraint: %s", con.getAttribute("id"))
            continue
        locations[ha_label][ind] = con.getAttribute("node")

    active = get_resource_locations()

    AgentShell.run(['pcs', 'property', 'set', 'maintenance-mode=true'])

    wait_list = []
    for res in dom.getElementsByTagName('primitive'):
        if not (res.getAttribute("provider") == "chroma"
                and res.getAttribute("type") == "Target"):
            continue

        ha_label = res.getAttribute("id")

        # _get_target_config() will raise KeyError if uuid doesn't exist locally
        # next() will raise StopIteration if it doesn't find attribute target
        try:
            info = next(
                _get_target_config(ops.getAttribute("value"))
                for ops in res.getElementsByTagName('nvpair')
コード例 #21
0
def _unconfigure_target_priority(primary, ha_label):
    """Remove the location constraint previously created for *ha_label*
    (primary or secondary variant); returns the AgentShell result."""
    constraint_name = _constraint(ha_label, primary)
    return AgentShell.run(
        ['pcs', 'constraint', 'location', 'remove', constraint_name])
コード例 #22
0
def yum_util(action, packages=(), fromrepo=None, enablerepo=None, narrow_updates=False):
    '''
    A wrapper to perform yum actions in encapsulated way.
    :param action:  clean, install, remove, update, requires etc
    :param packages: Packages to install or remove (any iterable; default is
                     an immutable tuple to avoid the mutable-default trap)
    :param fromrepo: The repo the action should be carried out from, others are disabled.
    :param enablerepo: The repo to enable for the action, others are not disabled or enabled
    :param narrow_updates: ?
    :return: stdout of the successful command; raises CommandExecutionError
             after exhausting retries.
    '''

    if fromrepo and enablerepo:
        raise ValueError("Cannot provide fromrepo and enablerepo simultaneously")

    repo_arg = []
    valid_rc_values = [0]                               # Some errors values other than 0 are valid.
    tries = 2
    if fromrepo:
        # Restrict the transaction to exactly the named repos.
        repo_arg = ['--disablerepo=*'] + ['--enablerepo=%s' % r for r in fromrepo]
    elif enablerepo:
        repo_arg = ['--enablerepo=%s' % r for r in enablerepo]
    if narrow_updates and action == 'query':
        repo_arg.extend(['--upgrades'])

    if action == 'clean':
        cmd = ['dnf', 'clean', 'all'] + (repo_arg if repo_arg else ["--enablerepo=*"])
    elif action == 'install':
        cmd = ['dnf', 'install', '--allowerasing', '-y', '--exclude', 'kernel-debug'] + \
               repo_arg + list(packages)
    elif action == 'remove':
        cmd = ['dnf', 'remove', '-y'] + repo_arg + list(packages)
    elif action == 'update':
        cmd = ['dnf', 'update', '--allowerasing', '-y', '--exclude', 'kernel-debug'] + \
               repo_arg + list(packages)
    elif action == 'requires':
        cmd = ['dnf', 'repoquery', '--requires'] + repo_arg + list(packages)
    elif action == 'query':
        cmd = ['dnf', 'repoquery', '--available'] + repo_arg + list(packages)
    elif action == 'repoquery':
        cmd = ['dnf', 'repoquery', '--available'] + repo_arg + ['--queryformat=%{EPOCH} %{NAME} %{VERSION} %{RELEASE} %{ARCH}']
    elif action == 'check-update':
        cmd = ['dnf', 'repoquery', '--queryformat=%{name} %{version}-%{release}.'
               '%{arch} %{repoid}', '--upgrades'] + repo_arg + \
            list(packages)
    else:
        raise RuntimeError('Unknown yum util action %s' % action)

    # This is a poor solution for HYD-3855 but not one that carries any known cost.
    # We sometimes see intermittent failures in test, and possibly out of test, that occur
    # 1 in 50 (estimate) times. yum commands are idempotent and so trying the command three
    # times has no downside and changes the estimated chance of fail to 1 in 12500.
    for hyd_3885 in range(tries, -1, -1):
        result = AgentShell.run(cmd)

        if result.rc in valid_rc_values:
            return result.stdout
        else:
            # if we were trying to install, clean the metadata before
            # trying again
            if action == 'install':
                AgentShell.run(['dnf', 'clean', 'metadata'])
            daemon_log.info("HYD-3885 Retrying yum command '%s'" % " ".join(cmd))
            if hyd_3885 == 0:
                daemon_log.info("HYD-3885 Retry yum command failed '%s'" % " ".join(cmd))
                raise AgentShell.CommandExecutionError(result, cmd)   # Out of retries so raise for the caller..
コード例 #23
0
def convert_targets(force=False):
    """
    Convert existing ocf:chroma:Target to ZFS + Lustre

    :param force: run the conversion even when this node is not the
                  Pacemaker Designated Controller
    """
    try:
        result = AgentShell.run(["cibadmin", "--query"])
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise err
        # cibadmin not installed: report in the same error shape as below.
        return {
            "crm_mon_error": {
                "rc": err.errno,
                "stdout": err.message,
                "stderr": err.strerror,
            }
        }

    if result.rc != 0:
        # Pacemaker not running, or no resources configured yet
        return {
            "crm_mon_error": {
                "rc": result.rc,
                "stdout": result.stdout,
                "stderr": result.stderr,
            }
        }

    dom = ET.fromstring(result.stdout)

    this_node = _this_node()

    # node elements are numbered from 1
    # dc-uuid is the node id of the domain controller
    dcuuid = next(
        (node.get("uname") for node in dom.findall(".//node")
         if node.get("id") == dom.get("dc-uuid")),
        "",
    )

    # Only the DC performs the cluster-wide conversion (unless forced), so
    # the rewrite happens exactly once.
    if dcuuid != this_node and not force:
        console_log.info("This is not Pacemaker DC %s this is %s", dcuuid,
                         this_node)
        return

    # Build map of resource -> [ primary node, secondary node ]
    locations = {}
    for con in dom.findall(".//rsc_location"):
        ha_label = con.get("rsc")
        if not locations.get(ha_label):
            locations[ha_label] = {}
        if con.get("id") == _constraint(ha_label, True):
            ind = 0
        elif con.get("id") == _constraint(ha_label, False):
            ind = 1
        else:
            console_log.info("Unknown constraint: %s", con.get("id"))
            continue
        locations[ha_label][ind] = con.get("node")

    active = get_resource_locations()

    # Enter maintenance mode so resources are not stopped or moved while
    # their definitions are rewritten below.
    AgentShell.try_run([
        "crm_attribute",
        "--type",
        "crm_config",
        "--name",
        "maintenance-mode",
        "--update",
        "true",
    ])

    wait_list = []
    for res in dom.findall(".//primitive"):
        # Only ocf:chroma:Target primitives are converted.
        if not (res.get("provider") == "chroma"
                and res.get("type") == "Target"):
            continue

        ha_label = res.get("id")

        # _get_target_config() will raise KeyError if uuid doesn't exist locally
        # next() will raise StopIteration if it doesn't find attribute target
        try:
            info = next(
                _get_target_config(ops.get("value"))
                for ops in res.findall('.//nvpair[@name="target"]'))
        except Exception as err:
            console_log.error("No local info for resource: %s", ha_label)
            continue

        # Tear down the old constraints/resource and recreate them in the
        # new style, preserving whether the target was active beforehand.
        _unconfigure_target_priority(False, ha_label)
        _unconfigure_target_priority(True, ha_label)
        _unconfigure_target_ha(ha_label, True)
        _configure_target_ha(ha_label, info,
                             (active.get(ha_label) is not None))
        _configure_target_priority(True, ha_label, locations[ha_label][0])
        _configure_target_priority(False, ha_label, locations[ha_label][1])
        wait_list.append([ha_label, (active.get(ha_label) is not None)])

    # wait for last item
    for wait in wait_list:
        console_log.info("Waiting on %s", wait[0])
        _wait_target(*wait)
    # Conversion complete -- leave maintenance mode.
    AgentShell.try_run([
        "crm_attribute",
        "--type",
        "crm_config",
        "--name",
        "maintenance-mode",
        "--delete",
    ])
コード例 #24
0
def pacemaker_running():
    """Return True when the pacemaker service reports itself running."""
    status = AgentShell.run(['service', 'pacemaker', 'status'])
    return status.rc == 0
コード例 #25
0
ファイル: fence_agents.py プロジェクト: whamcloud/iml-agent
 def monitor(self):
     """Run this fence agent's 'monitor' action against self.plug.

     :return: the agent's exit code (0 indicates the device is reachable)
     """
     result = AgentShell.run(self.base_cmd +
                             ["-n", self.plug, "-o", "monitor"])
     return result.rc