Exemple #1
0
def lsof(pid=None, file=None):
    """Run ``lsof -F pan0`` and collect the open files it reports per process.

    :param pid: if truthy, restrict the query to this process id (``-p``)
    :param file: if truthy, restrict the query to this file
    :return: ``defaultdict`` mapping pid (string) to
             ``{path: {'mode': access_char}}``; empty when lsof exits
             non-zero without writing anything to stderr
    :raises RuntimeError: when lsof exits non-zero and wrote to stderr
    """
    command = ['lsof', '-F', 'pan0']
    if pid:
        command.extend(["-p", str(pid)])
    if file:
        command.append(file)

    open_files = defaultdict(dict)

    rc, stdout, stderr = AgentShell.run_old(command)
    if rc != 0:
        if stderr:
            raise RuntimeError(stderr)
        # lsof exits non-zero if there's nothing holding the file open
        return open_files

    # Fields are NUL terminated: a "p<pid>" record names the owning process
    # for the "a<mode>"/"n<name>" records that follow it.
    owner = None
    for record in stdout.split("\n"):
        pid_match = re.match(r'^p(\d+)\x00', record)
        if pid_match:
            owner = pid_match.group(1)
            continue

        file_match = re.match(r'^a(\w)\x00n(.*)\x00', record)
        if file_match:
            mode, path = file_match.groups()
            open_files[owner][path] = {'mode': mode}

    return open_files
def _do_configure_pacemaker(pc):
    """Create the fencing primitive and set the cluster-wide defaults.

    :param pc: object exposing ``nodes`` and ``create_update_properyset``
    :return: a truthy error value from the first step that reports one;
             ``None`` when another member already created "st-fencing";
             otherwise whatever the last ``crm_attribute`` call returned
    """
    # ignoring quorum should only be done on clusters of 2
    no_quorum_policy = "stop" if len(pc.nodes) > 2 else "ignore"

    fencing_error = _unconfigure_fencing()
    if fencing_error:
        return fencing_error

    stonith_primitive = (
        '<primitive class="stonith" id="st-fencing" type="fence_chroma"/>')

    # this could race with other cluster members to make sure
    # any errors are only due to it already existing
    try:
        cibadmin(["--create", "-o", "resources", "-X", stonith_primitive])
    except Exception as e:
        locate_rc, _out, _err = AgentShell.run_old(
            ["crm_resource", "--locate", "--resource", "st-fencing"])
        # no need to do the rest if another member is already doing it
        if locate_rc == 0:
            return None
        return e.message

    bootstrap_options = {
        "no-quorum-policy": no_quorum_policy,
        "symmetric-cluster": "true",
        "cluster-infrastructure": "openais",
        "stonith-enabled": "true",
    }
    pc.create_update_properyset("cib-bootstrap-options", bootstrap_options)

    def set_rsc_default(name, value):
        """Set one ``rsc_defaults`` attribute through ``crm_attribute``.

        :param name: attribute to set
        :param value: value to set
        :return: the result of ``AgentShell.run_canned_error_message``
                 (a truthy result is propagated to the caller as an error)
        """
        command = ["crm_attribute", "--type", "rsc_defaults"]
        command += ["--attr-name", name, "--attr-value", value]
        return AgentShell.run_canned_error_message(command)

    # ``or`` stops at the first truthy (error) result
    return (
        set_rsc_default("resource-stickiness", "1000")
        or set_rsc_default("failure-timeout", RSRC_FAIL_WINDOW)
        or set_rsc_default("migration-threshold", RSRC_FAIL_MIGRATION_COUNT)
    )
def delete_node(nodename):
    """Remove *nodename* from the cluster membership and from the CIB.

    ``crm_node -l`` output is scanned for a ``<id> <name> <status>`` line
    whose name equals *nodename*; that id is then force-removed with
    ``crm_node -R``.  The node's entries under ``//nodes`` and ``//status``
    are deleted from the CIB in every case.

    :param nodename: name of the node as listed by ``crm_node -l``
    """
    rc, stdout, stderr = AgentShell.run_old(["crm_node", "-l"])

    node_id = None
    for line in (stdout or "").splitlines():
        fields = line.split(" ")
        # Blank (e.g. trailing) or otherwise unexpected lines cannot be
        # unpacked into id/name/status, so skip them instead of raising.
        if len(fields) != 3:
            continue
        if fields[1] == nodename:
            node_id = fields[0]
            break

    # Only remove a node that was actually matched: falling through with
    # the id of the last listed node would evict the wrong cluster member.
    if node_id is not None:
        AgentShell.try_run(["crm_node", "--force", "-R", node_id])

    cibxpath("delete", '//nodes/node[@uname="{}"]'.format(nodename))
    cibxpath("delete", '//status/node_state[@uname="{}"]'.format(nodename))
Exemple #4
0
def get_resource_locations():
    # FIXME: this may break on non-english systems or new versions of pacemaker
    """Parse `crm_mon -1 -r` to find where (if anywhere) Target resources run.

    :return: ``{resource_name: node_or_None}`` for every line containing
             "(ocf::chroma:Target)" in the resource list; ``{}`` when the
             cluster reports no DC; or ``{"crm_mon_error": {...}}`` carrying
             rc/stdout/stderr when crm_mon exits non-zero
    """
    rc, crm_output, stderr = AgentShell.run_old(["crm_mon", "-1", "-r"])
    if rc != 0:
        # Pacemaker not running, or no resources configured yet
        failure = {"rc": rc, "stdout": crm_output, "stderr": stderr}
        return {"crm_mon_error": failure}

    locations = {}
    in_resources = False
    for line in crm_output.split("\n"):
        # if we don't have a DC for this cluster yet, we can't really believe
        # anything it says
        if line == "Current DC: NONE":
            return {}

        # skip down to the resources part (the heading line itself included)
        if not in_resources:
            in_resources = line.startswith("Full list of resources:")
            continue

        # only interested in Target resources
        if "(ocf::chroma:Target)" not in line:
            continue

        # The line can have 3 - 5 arguments so pad it out to at least 5 and
        # throw away any extra
        fields = line.lstrip().split()
        fields.extend([None, None])
        fields = fields[:5]

        # Later pacemakers insert a "(target-role:...)" entry before the
        # state, e.g.
        # MGS_424f74	(ocf::chroma:Target):	(target-role:Stopped) Stopped
        # MGS_424f74	(ocf::chroma:Target):	(target-role:Stopped) Started lotus-13vm6
        if "target-role" in fields[2]:
            fields.pop(2)

        # and even newer pacemakers add a "(disabled)" to the end of the line:
        # MGS_e1321a	(ocf::chroma:Target):	Stopped (disabled)
        node = fields[3]
        if node == "(disabled)":
            node = None

        locations[fields[0]] = node

    return locations
Exemple #5
0
def get_resource_locations():
    # FIXME: this may break on non-english systems or new versions of pacemaker
    """Parse `crm_mon -1` to identify where (if anywhere)
       resources (i.e. targets) are running."""

    # NOTE(review): this excerpt stops inside the except handler; what is
    # returned for ENOENT and how lines_text is parsed are not visible here.
    try:
        rc, lines_text, stderr = AgentShell.run_old(["crm_mon", "-1", "-r"])
    except OSError, e:
        # ENOENT is fine here.  Pacemaker might not be installed yet.
        # Any other OS-level failure is propagated unchanged.
        if e.errno != errno.ENOENT:
            raise
Exemple #6
0
def start_target(ha_label):
    '''
    Start the high availability target, retrying when the start fails.

    The target-role is set to Started, the target is waited for, and
    `crm_mon -1` is checked for a line for this label that is not FAILED.
    On failure the role is set back to Stopped and the start is retried;
    the fourth failed attempt is reported as an error.

    Return: Value using simple return protocol
    '''
    # HYD-1989: brute force, retry the start a bounded number of times
    for attempt in range(1, 5):
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])
        if error:
            return agent_error(error)

        # now wait for it to start
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed): success needs
        # at least one line for this label that does not mention FAILED
        rc, stdout, stderr = AgentShell.run_old(['crm_mon', '-1'])
        started_cleanly = any(
            line.lstrip().startswith(ha_label) and "FAILED" not in line
            for line in stdout.split("\n"))

        if started_cleanly:
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" %
                                   ha_label)
            return agent_result(location)

        # try to leave things in a sane state for a failed mount
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m',
            '-v', 'Stopped'
        ])
        if error:
            return agent_error(error)

        if attempt >= 4:
            return agent_error("Failed to start target %s" % ha_label)

        console_log.info("failed to start target %s" % ha_label)
Exemple #7
0
    def _read_crm_mon_as_xml(self):
        """Run crm_mon --one-shot --as-xml, return raw output or None

        For expected return values (0, 10), return the stdout from output.
        If the return value is unexpected, log a warning, and return None
        """

        # NOTE(review): this excerpt stops inside the except handler; the
        # return-code handling described in the docstring is not visible here.
        crm_command = ['crm_mon', '--one-shot', '--as-xml']
        try:
            rc, stdout, stderr = AgentShell.run_old(crm_command)
        except OSError, e:
            # ENOENT is fine here.  Pacemaker might not be installed yet.
            # Any other OS-level failure is propagated unchanged.
            if e.errno != errno.ENOENT:
                raise
def delete_node(nodename):
    """Remove *nodename* from the cluster membership and from the CIB.

    ``crm_node -l`` output is scanned for a ``<id> <name> <status>`` line
    whose name equals *nodename*; that id is then force-removed with
    ``crm_node -R``.  The two ``cibadmin --delete`` calls for the node's
    ``<node>`` and ``<node_state>`` entries are issued in every case.

    :param nodename: name of the node as listed by ``crm_node -l``
    """
    rc, stdout, stderr = AgentShell.run_old(['crm_node', '-l'])

    node_id = None
    for line in (stdout or "").splitlines():
        fields = line.split(" ")
        # Blank (e.g. trailing) or otherwise unexpected lines cannot be
        # unpacked into id/name/status, so skip them instead of raising.
        if len(fields) != 3:
            continue
        if fields[1] == nodename:
            node_id = fields[0]
            break

    # Only remove a node that was actually matched: falling through with
    # the id of the last listed node would evict the wrong cluster member.
    if node_id is not None:
        AgentShell.try_run(['crm_node', '--force', '-R', node_id])

    cibadmin(
        ["--delete", "-o", "nodes", "-X",
         "<node uname=\"%s\"/>" % nodename])
    # NOTE(review): node_state is deleted with "-o nodes" here - confirm
    # that this section is the intended one.
    cibadmin([
        "--delete", "-o", "nodes", "--crm_xml",
        "<node_state uname=\"%s\"/>" % nodename
    ])
    def _read_crm_mon_as_xml(self):
        """Return the raw output of ``crm_mon --one-shot --as-xml``.

        Exit codes 0 and 10 are accepted and yield the command's stdout.
        Any other exit code is logged as a warning and yields None.
        """
        crm_command = ['crm_mon', '--one-shot', '--as-xml']
        rc, stdout, stderr = AgentShell.run_old(crm_command)

        # 10 Corosync is not running on this node
        if rc in (0, 10):
            return stdout

        daemon_log.warning("rc=%s running '%s': '%s' '%s'" %
                           (rc, crm_command, stdout, stderr))
        return None
def _get_cluster_size():
    """Count the nodes that ``crm_node -l`` lists as "member" or "lost".

    :return: number of such nodes; 0 when the command printed nothing
    """
    # you'd think there'd be a way to query the value of a property
    # such as "expected-quorum-votes" but there does not seem to be, so
    # just count nodes instead
    _rc, listing, _stderr = AgentShell.run_old(["crm_node", "-l"])
    if not listing:
        return 0

    counted = 0
    for entry in listing.rstrip().split('\n'):
        # each entry is "<id> <name> <status>"
        _node_id, _name, state = entry.split(" ")
        if state in ("member", "lost"):
            counted += 1

    return counted
Exemple #11
0
def _query_ha_targets():
    """Describe every resource listed by ``crm_resource -l``.

    :return: ``{resource_id: {'ha_label': resource_id, 'uuid': ...}}``;
             empty when crm_resource exits with 234
    :raises RuntimeError: when crm_resource exits with any other non-zero code
    """
    targets = {}

    rc, stdout, stderr = AgentShell.run_old(['crm_resource', '-l'])
    if rc == 234:
        return targets
    if rc != 0:
        raise RuntimeError("Error %s running crm_resource -l: %s %s" %
                           (rc, stdout, stderr))

    for resource_id in stdout.split("\n"):
        if not resource_id:
            continue

        query_output = AgentShell.try_run(
            ['crm_resource', '-r', resource_id, '-q'])
        # the first two lines of the query output are dropped before the
        # remainder is handed to the XML helper
        raw_xml = "\n".join(query_output.split("\n")[2:])

        targets[resource_id] = {
            'ha_label': resource_id,
            'uuid': _get_nvpairid_from_xml(raw_xml),
        }

    return targets
Exemple #12
0
    def _read_crm_mon_as_xml(self):
        """Run crm_mon --one-shot --as-xml, return raw output or None

        For expected return values (0, 10), return the stdout from output.
        If the return value is unexpected, log a warning, and return None.
        None is also returned when the command cannot be found (ENOENT);
        any other OSError is re-raised.
        """

        crm_command = ["crm_mon", "--one-shot", "--as-xml"]
        try:
            rc, stdout, stderr = AgentShell.run_old(crm_command)
        except OSError as e:
            # ENOENT is fine here.  Pacemaker might not be installed yet.
            if e.errno != errno.ENOENT:
                # bare raise keeps the original traceback intact
                raise
            return None

        if rc not in (0, 10):  # 10 Corosync is not running on this node
            daemon_log.warning("rc=%s running '%s': '%s' '%s'" %
                               (rc, crm_command, stdout, stderr))
            return None

        return stdout
Exemple #13
0
    def properties(self):
        """Returns less volatile node data suitable for host validation.

        If the fetched property is expensive to compute, it should be cached / updated less frequently.

        The result reports whether ``which zfs`` succeeded, the distribution
        name and major.minor version, the Python major.minor version and
        patch level, and the kernel release.
        """
        which_rc, stdout, stderr = AgentShell.run_old(['which', 'zfs'])

        distro_name, distro_release = platform.linux_distribution()[:2]
        py_major, py_minor, py_patch = platform.python_version_tuple()

        # keep only "major.minor" of the distribution release
        distro_major_minor = '.'.join(distro_release.split('.')[:2])

        return {
            # `which` exits 0 (falsy) when the binary was found
            'zfs_installed': not which_rc,
            'distro': distro_name,
            'distro_version': float(distro_major_minor),
            'python_version_major_minor': float("%s.%s" % (py_major, py_minor)),
            'python_patchlevel': int(py_patch),
            'kernel_version': platform.release()
        }
Exemple #14
0
    def properties(self):
        """Returns less volatile node data suitable for host validation.

        If the fetched property is expensive to compute, it should be cached / updated less frequently.

        Keys: zfs_installed, distro, distro_version, python_version_major_minor,
        python_patchlevel and kernel_version.
        """
        # `which` exits 0 (falsy) when the binary was found
        zfs_missing = AgentShell.run_old(["which", "zfs"])[0]

        distribution = platform.linux_distribution()
        python_version = platform.python_version_tuple()

        node_data = {}
        node_data["zfs_installed"] = not zfs_missing
        node_data["distro"] = distribution[0]
        # only the leading "major.minor" of the release is kept
        node_data["distro_version"] = float(
            ".".join(distribution[1].split(".")[:2]))
        node_data["python_version_major_minor"] = float(
            "%s.%s" % (python_version[0], python_version[1]))
        node_data["python_patchlevel"] = int(python_version[2])
        node_data["kernel_version"] = platform.release()
        return node_data
Exemple #15
0
def configure_target_ha(primary, device, ha_label, uuid, mount_point):
    '''
    Configure the target high availability

    On the primary the ocf:chroma:Target resource is created (initially with
    target-role Stopped) unless a resource of that name already exists, and a
    location constraint with score 20 is added for this node.  On a secondary
    only a location constraint with score 10 is added.  The mount point is
    created in both cases.

    :param primary: truthy when this node is the primary for the target
    :param device: block device, compared with an existing resource's bdev
    :param ha_label: resource id to create / constrain
    :param uuid: value stored in the resource's "target" instance attribute
    :param mount_point: directory the target is mounted on
    Return: Value using simple return protocol
    '''

    if primary:
        # If the target already exists with the same params, skip.
        # If it already exists with different params, that is an error
        rc, stdout, stderr = AgentShell.run_old(
            ["crm_resource", "-r", ha_label, "-g", "target"])
        if rc == 0:
            info = _get_target_config(stdout.rstrip("\n"))
            if info['bdev'] == device and info['mntpt'] == mount_point:
                return agent_result_ok
            else:
                return agent_error(
                    "A resource with the name %s already exists" % ha_label)

        tmp_f, tmp_name = tempfile.mkstemp()
        try:
            try:
                os.write(
                    tmp_f,
                    "<primitive class=\"ocf\" provider=\"chroma\" type=\"Target\" id=\"%s\">\
  <meta_attributes id=\"%s-meta_attributes\">\
    <nvpair name=\"target-role\" id=\"%s-meta_attributes-target-role\" value=\"Stopped\"/>\
  </meta_attributes>\
  <operations id=\"%s-operations\">\
    <op id=\"%s-monitor-5\" interval=\"5\" name=\"monitor\" timeout=\"60\"/>\
    <op id=\"%s-start-0\" interval=\"0\" name=\"start\" timeout=\"300\"/>\
    <op id=\"%s-stop-0\" interval=\"0\" name=\"stop\" timeout=\"300\"/>\
  </operations>\
  <instance_attributes id=\"%s-instance_attributes\">\
    <nvpair id=\"%s-instance_attributes-target\" name=\"target\" value=\"%s\"/>\
  </instance_attributes>\
</primitive>" % (ha_label, ha_label, ha_label, ha_label, ha_label, ha_label,
                 ha_label, ha_label, ha_label, uuid))
            finally:
                # close the descriptor even when the write fails
                os.close(tmp_f)

            cibadmin(["-o", "resources", "-C", "-x", "%s" % tmp_name])
        finally:
            # the XML file is only needed for the cibadmin call above;
            # remove it so repeated configuration does not litter the
            # temporary directory
            os.unlink(tmp_name)

        score = 20
        preference = "primary"
    else:
        score = 10
        preference = "secondary"

    # Hostname. This is a short-term point fix that will allow us to make HP2 release more functional. Between el6 and el7
    # (truthfully we should probably be looking at Pacemaker or Corosync versions) Pacemaker started to use fully qualified
    # domain names rather than just the nodename.  lotus-33vm15.lotus.hpdd.lab.intel.com vs lotus-33vm15. To keep compatibility
    # easily we have to make the constraints follow the same fqdn vs node.
    if platform_info.distro_version >= 7.0:
        node = socket.getfqdn()
    else:
        node = os.uname()[1]

    result = cibadmin([
        "-o", "constraints", "-C", "-X",
        "<rsc_location id=\"%s-%s\" node=\"%s\" rsc=\"%s\" score=\"%s\"/>" %
        (ha_label, preference, node, ha_label, score)
    ])

    # rc 76 from the create call is reported as "constraint already exists"
    if result.rc == 76:
        return agent_error("A constraint with the name %s-%s already exists" %
                           (ha_label, preference))

    _mkdir_p_concurrent(mount_point)

    return agent_result_ok
 def scsi_id_command(cmd):
     """Run *cmd*; return its stripped stdout, or None on a non-zero exit."""
     rc, out, err = AgentShell.run_old(cmd)
     return None if rc else out.strip()
Exemple #17
0
def corosync_running():
    """Return True when ``service corosync status`` exits with code 0."""
    status_rc, _stdout, _stderr = AgentShell.run_old(
        ['service', 'corosync', 'status'])
    return status_rc == 0
Exemple #18
0
def yum_util(action,
             packages=None,
             fromrepo=None,
             enablerepo=None,
             narrow_updates=False):
    '''
    A wrapper to perform yum actions in encapsulated way.
    :param action:  clean, install, remove, update, requires, query,
                    repoquery or check-update
    :param packages: Packages to act on (ignored by clean and repoquery)
    :param fromrepo: Iterable of repo names the action should be carried out from, others are disabled.
    :param enablerepo: Iterable of repo names to enable for the action, others are not disabled or enabled
    :param narrow_updates: When true and action is 'query', adds
                           '--pkgnarrow=updates -a' to the repoquery call
    :return: stdout of the command. Throws ValueError when both fromrepo and
             enablerepo are given, RuntimeError for an unknown action and
             CommandExecutionError when the command still fails after three tries.
    '''

    if fromrepo and enablerepo:
        raise ValueError(
            "Cannot provide fromrepo and enablerepo simultaneously")

    # A fresh list per call instead of a shared mutable default argument.
    packages = list(packages) if packages else []

    repo_arg = []
    valid_rc_values = [0]  # Some errors values other than 0 are valid.
    if fromrepo:
        repo_arg = ['--disablerepo=*', '--enablerepo=%s' % ','.join(fromrepo)]
    elif enablerepo:
        repo_arg = ['--enablerepo=%s' % ','.join(enablerepo)]
    if narrow_updates and action == 'query':
        repo_arg.extend(['--pkgnarrow=updates', '-a'])

    if action == 'clean':
        cmd = ['yum', 'clean', 'all'
               ] + (repo_arg if repo_arg else ["--enablerepo=*"])
    elif action == 'install':
        cmd = ['yum', 'install', '-y'] + repo_arg + packages
    elif action == 'remove':
        cmd = ['yum', 'remove', '-y'] + repo_arg + packages
    elif action == 'update':
        cmd = ['yum', 'update', '-y'] + repo_arg + packages
    elif action == 'requires':
        cmd = ['repoquery', '--requires'] + repo_arg + packages
    elif action == 'query':
        cmd = ['repoquery'] + repo_arg + packages
    elif action == 'repoquery':
        cmd = ['repoquery'] + repo_arg + [
            '-a', '--qf=%{EPOCH} %{NAME} %{VERSION} %{RELEASE} %{ARCH}'
        ]
    elif action == 'check-update':
        cmd = ['yum', 'check-update', '-q'] + repo_arg + packages
        valid_rc_values = [
            0, 100
        ]  # check-update returns 100 if updates are available.
    else:
        raise RuntimeError('Unknown yum util action %s' % action)

    # This is a poor solution for HYD-3855 but not one that carries any known cost.
    # We sometimes see intermittent failures in test, and possibly out of test, that occur
    # 1 in 50 (estimate) times. yum commands are idempotent and so trying the command three
    # times has no downside and changes the estimated chance of fail to 1 in 125000.
    # NOTE(review): the log messages below say HYD-3885 - confirm which ticket id is right.
    for retries_left in range(2, -1, -1):
        rc, stdout, stderr = AgentShell.run_old(cmd)

        if rc in valid_rc_values:
            return stdout

        # Only announce a retry when one is actually going to happen.
        if retries_left:
            daemon_log.info("HYD-3885 Retrying yum command '%s'" %
                            " ".join(cmd))

    daemon_log.info("HYD-3885 Retry yum command failed '%s'" % " ".join(cmd))
    raise AgentShell.CommandExecutionError(
        AgentShell.RunResult(rc, stdout, stderr, False),
        cmd)  # Out of retries so raise for the caller..