Beispiel #1
0
def stop_target(ha_label):
    '''
    Stop the high availability target

    Asks Pacemaker to set the resource's target-role to Stopped and then
    waits for the target to go away. The whole sequence is retried when the
    wait reports that the target did not stop.

    :param ha_label: Pacemaker resource id of the target to stop

    Return: Value using simple return protocol
    '''
    # HYD-7230: brute force, the stop is attempted up to 4 times in total
    max_attempts = 4

    for attempt in range(1, max_attempts + 1):
        # Issue the command to Pacemaker to stop the target
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Stopped'
        ])

        # A failure to even issue the command is not retried
        if error:
            return agent_error(error)

        # A truthy result from the wait is treated as "the target has stopped"
        if _wait_target(ha_label, False):
            return agent_result_ok

        # Only log while another attempt is still to come
        if attempt < max_attempts:
            console_log.info("failed to stop target %s" % ha_label)

    return agent_error("failed to stop target %s" % ha_label)
def unconfigure_corosync2(host_fqdn, mcast_port):
    """
    Unconfigure the corosync application (corosync2 flavour).

    pcsd is left alone: the host node is removed from the cluster and corosync is disabled from
    auto starting (the service should already be stopped in state transition).

    The pcs cluster commands take care of editing and removing the corosync.conf file.

    :param host_fqdn: name of the node to remove from the cluster
    :param mcast_port: corosync multicast port whose udp firewall rule is removed
    Return: Value using simple return protocol
    """
    disable_error = corosync_service.disable()
    if disable_error:
        return agent_error(disable_error)

    # Must be sampled before the node removal below, which removes the conf file
    nodes_before_removal = _nodes_in_cluster()

    removal = AgentShell.run(['pcs', '--force', 'cluster', 'node', 'remove', host_fqdn])

    if removal.rc != 0:
        if 'No such file or directory' in removal.stderr:
            # a missing configuration file still counts as success
            console_log.warning(removal.stderr)
        else:
            # "Unable to update any nodes" is expected only when this was the last node in the cluster
            removed_last_node = ('Error: Unable to update any nodes' in removal.stderr and
                                 len(nodes_before_removal) == 1)
            if not removed_last_node:
                return agent_error(removal.stderr)

    # The second rule is only removed when the first removal reported no error
    firewall_error = firewall_control.remove_rule(PCS_TCP_PORT, 'tcp', 'pcs', persist=True)
    if not firewall_error:
        firewall_error = firewall_control.remove_rule(mcast_port, 'udp', 'corosync', persist=True)

    return agent_ok_or_error(firewall_error)
Beispiel #3
0
def unconfigure_target_ha(primary, ha_label, uuid):
    '''
    Unconfigure the target high availability

    Refuses to proceed while the resource still has a location. On the
    primary both the "<ha_label>-primary" location constraint and the
    primitive are deleted; otherwise only the "<ha_label>-secondary"
    location constraint is deleted.

    :param primary: truthy when acting for the target's primary node
    :param ha_label: Pacemaker resource id of the target
    :param uuid: not used by this function

    Return: Value using simple return protocol
    '''

    with PreservePacemakerCorosyncState():
        if get_resource_location(ha_label):
            return agent_error("cannot unconfigure-ha: %s is still running " %
                               ha_label)

        if not primary:
            # The outcome of the constraint removal is not inspected
            cibadmin(["-D", "-X",
                      '<rsc_location id="%s-secondary">' % ha_label])
            return agent_result_ok

        # The outcome of the constraint removal is not inspected; only the
        # removal of the primitive decides success
        cibadmin(["-D", "-X", '<rsc_location id="%s-primary">' % ha_label])
        removal = cibadmin(["-D", "-X", '<primitive id="%s">' % ha_label])

        # rc 0 and rc 234 are both accepted
        if removal.rc not in (0, 234):
            return agent_error("Error %s trying to cleanup resource %s" %
                               (removal.rc, ha_label))

        return agent_result_ok
def get_corosync_autoconfig():
    """
    Detect the corosync configuration automatically.

    ring0 is looked up first, a ring1 network is derived from it and ring1 is then detected
    on that network.

    :return: dictionary containing 'result' or 'error'.
    """
    ring0 = get_ring0()
    if not ring0:
        return agent_error('Failed to detect ring0 interface')

    ring1_ipaddr, ring1_prefix = generate_ring1_network(ring0)

    try:
        ring1 = detect_ring1(ring0, ring1_ipaddr, ring1_prefix)
    except RingDetectionError as e:
        return agent_error(e.message)

    # ring0 is reported as shared, ring1 as dedicated
    interfaces = {}
    for ring, dedicated in ((ring0, False), (ring1, True)):
        interfaces[ring.name] = {
            'dedicated': dedicated,
            'ipaddr': ring.ipv4_address,
            'prefix': ring.ipv4_prefixlen
        }

    return agent_result({
        'interfaces': interfaces,
        'mcast_port': ring1.mcastport
    })
Beispiel #5
0
def start_target(ha_label):
    '''
    Start the high availability target

    Asks Pacemaker to set the resource's target-role to Started, waits, and
    then checks the crm_mon output to see whether the resource came up
    without being reported FAILED. A failed start is rolled back to Stopped
    and retried.

    :param ha_label: Pacemaker resource id of the target to start

    Return: Value using simple return protocol
    '''
    # HYD-1989: brute force, the start is attempted up to 4 times in total
    max_attempts = 4

    for attempt in range(1, max_attempts + 1):
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])

        if error:
            return agent_error(error)

        # now wait for it to start (the outcome of the wait is not used)
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed)
        _rc, stdout, _stderr = AgentShell.run_old(['crm_mon', '-1'])

        # Success needs at least one line for this resource without FAILED;
        # no line for the resource at all counts as a failure
        started_cleanly = any(
            line.lstrip().startswith(ha_label) and "FAILED" not in line
            for line in stdout.split("\n"))

        if started_cleanly:
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" %
                                   ha_label)
            return agent_result(location)

        # try to leave things in a sane state for a failed mount
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m',
            '-v', 'Stopped'
        ])

        if error:
            return agent_error(error)

        if attempt < max_attempts:
            console_log.info("failed to start target %s" % ha_label)

    return agent_error("Failed to start target %s" % ha_label)
def configure_corosync(ring0_name, ring1_name, old_mcast_port, new_mcast_port):
    """
    Write corosync.conf for the two rings and move the firewall rule to the negotiated multicast port.

    No IP address information is required.

    :param ring0_name: interface name used for ring number 0
    :param ring1_name: interface name used for ring number 1
    :param old_mcast_port: None if we are configuring corosync for the first-time, present if changing mcast port
    :param new_mcast_port: desired corosync multicast port as configured by user
    :return: Value using simple return protocol
    """

    # The ring number is the position of the name in this tuple
    ring_names = (ring0_name, ring1_name)

    interfaces = [
        InterfaceInfo(CorosyncRingInterface(name=ring_name, ringnumber=ring_number, mcastport=new_mcast_port),
                      None,
                      None)
        for ring_number, ring_name in enumerate(ring_names)
    ]

    corosync_conf = render_config([info.corosync_iface for info in interfaces])
    write_config_to_file("/etc/corosync/corosync.conf", corosync_conf)

    # Only a port change has an old firewall rule to take away
    if old_mcast_port is not None:
        removal_error = firewall_control.remove_rule(old_mcast_port, "udp", "corosync", persist=True)
        if removal_error:
            return agent_error(removal_error)

    # The service is only enabled when adding the firewall rule reported no error
    setup_error = firewall_control.add_rule(new_mcast_port, "udp", "corosync", persist=True)
    if not setup_error:
        setup_error = corosync_service.enable()

    return agent_ok_or_error(setup_error)
Beispiel #7
0
def _failoverback_target(ha_label, destination):
    """Move a target to the node found for the given destination

    :param ha_label: Pacemaker resource id of the target
    :param destination: passed to the constraint lookup and quoted in the
        error message when no node is found

    Return: Value using simple return protocol
    """
    target_node = _find_resource_constraint(ha_label, destination)

    if target_node:
        move_error = _move_target(ha_label, target_node)
        if move_error:
            return agent_error(move_error)
        return agent_result_ok

    return agent_error("Unable to find the %s server for '%s'" %
                       (destination, ha_label))
Beispiel #8
0
def _configure_pacemaker():
    '''
    Configure pacemaker if this node is the dc.

    Polls every 10 seconds until the configuration is reported as configured
    or PACEMAKER_CONFIGURE_TIMEOUT seconds have passed. The configuration is
    only attempted on passes where this node is the dc.

    :return: agent_ok if no error else returns an agent_error
    '''
    pc = PacemakerConfig()

    deadline = time.time() + PACEMAKER_CONFIGURE_TIMEOUT
    error = None

    while (pc.configured is False) and (time.time() < deadline):
        if not pc.is_dc:
            daemon_log.info(
                'Not configuring (global) pacemaker configuration because I am not the DC'
            )
        else:
            daemon_log.info(
                'Configuring (global) pacemaker configuration because I am the DC'
            )

            error = _do_configure_pacemaker(pc)

            # A configuration error ends the polling immediately
            if error:
                return agent_error(error)

        time.sleep(10)

    # Leaving the loop still unconfigured means the deadline passed
    if pc.configured is False:
        error = 'Failed to configure (global) pacemaker configuration dc=%s' % pc.dc

    return agent_ok_or_error(error)
def initialise_block_device_drivers():
    """
    Initialise the driver of every BlockDevice subclass.

    Stops at the first driver that reports an error.

    :return: Value using simple return protocol
    """
    console_log.info("Initialising drivers for block device types")

    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.initialise_driver(config.profile_managed)
        if failure:
            return agent_error(failure)

    return agent_result_ok
Beispiel #10
0
def unconfigure_repo(filename):
    """
    Delete a repo file from REPO_PATH.

    A file that does not exist is not an error.

    :param filename: name of the repo file, relative to REPO_PATH
    :return: Value using simple return protocol
    """
    repo_file = os.path.join(REPO_PATH, filename)

    try:
        os.remove(repo_file)
    except OSError as remove_error:
        already_absent = remove_error.errno == errno.ENOENT
        if not already_absent:
            return agent_error(str(remove_error))

    return agent_result_ok
Beispiel #11
0
    def test_set_profile_fail(self):
        """A failing yum install makes update_profile return an agent_error."""
        # Queued three times because yum will try three times.
        yum_install = ('yum', 'install', '-y', '--enablerepo=iml-agent', 'chroma-agent-management')
        failing_attempts = [
            CommandCaptureCommand(yum_install, rc=1, stdout="Bad command stdout", stderr="Bad command stderr")
            for _ in range(3)
        ]
        self.add_commands(*failing_attempts)

        config.update('settings', 'profile', {'managed': False})

        # Go from managed = False to managed = True, but it will fail.
        outcome = agent_updates.update_profile({'managed': True})
        self.assertEqual(outcome, agent_error('Unable to set profile because yum returned Bad command stdout'))
        self.assertRanAllCommandsInOrder()
def unconfigure_corosync():
    """
    Unconfigure the corosync application.

    Stops and disables the corosync service, reads the multicast port out of
    corosync.conf, deletes that file and finally removes the udp firewall rule
    for the port.

    :return: Value using simple return protocol
    """
    # The results of stop/disable are not checked
    corosync_service.stop()
    corosync_service.disable()
    mcast_port = None

    # The port has to be read before the file is deleted below; the first
    # mcastport line wins
    with open("/etc/corosync/corosync.conf") as f:
        for line in f:
            match = re.match(r"\s*mcastport:\s*(\d+)", line)
            if match:
                mcast_port = match.group(1)
                break
    if mcast_port is None:
        return agent_error("Failed to find mcastport in corosync.conf")

    try:
        remove("/etc/corosync/corosync.conf")
    except OSError as e:
        # A file that is already gone is fine
        if e.errno != errno.ENOENT:
            return agent_error("Failed to remove corosync.conf")

    # Take away the firewall rule for the port found in the config
    error = firewall_control.remove_rule(mcast_port, "udp", "corosync", persist=True)

    if error:
        return agent_error(error)

    return agent_result_ok
    def run(self, cmd, agent_daemon_context, args):
        """
        Look up a command by name and call it with the supplied arguments.

        :param cmd: key into self.commands
        :param agent_daemon_context: passed as the first positional argument, but only to
                                     functions that declare a parameter of that name
        :param args: dict expanded into keyword arguments of the function
        :return: whatever the function returns, or an agent_error for an unknown command
        """
        # FIXME: provide a log object to action plugins that we capture
        # and send back to the caller
        try:
            fn = self.commands[cmd]
        except KeyError:
            return agent_error(
                "Requested command %s was unknown to the agent" % cmd)

        # Only pass in the agent_daemon_context if the agent_daemon_context is expected by the function.
        # This feature was added just prior to 3.1 and whilst it would be better to always pass the context the
        # scope of the change was prohibitive at that time.
        # Not a fixme because it is of little value to make the additional changes at this time.
        #
        # co_varnames lists the parameters first and then every local variable, so only the leading
        # co_argcount names are inspected; a function that merely has a local of that name must not be
        # handed the context.
        code = fn.__code__
        positional_parameters = code.co_varnames[:code.co_argcount]

        if 'agent_daemon_context' in positional_parameters:
            return fn(agent_daemon_context, **args)

        return fn(**args)
Beispiel #14
0
def configure_repo(filename, file_contents):
    """
    Write a repo file into REPO_PATH.

    The contents are first formatted with the authority, private key and
    certificate file paths of the agent's Crypto object (positional fields
    {0}, {1} and {2}). The file is written to "<name>.tmp" and then renamed
    over the final name.

    :param filename: name of the repo file, relative to REPO_PATH
    :param file_contents: format string for the repo file
    :return: Value using simple return protocol
    """
    crypto = Crypto(config.path)
    full_filename = os.path.join(REPO_PATH, filename)
    temp_full_filename = full_filename + '.tmp'

    file_contents = file_contents.format(crypto.AUTHORITY_FILE,
                                         crypto.PRIVATE_KEY_FILE,
                                         crypto.CERTIFICATE_FILE)

    try:
        # O_TRUNC: a longer .tmp left behind by an earlier failed run must not
        # leave stale trailing bytes in the new file
        fd = os.open(temp_full_filename,
                     os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644)

        # The with block closes the handle even when the write fails
        with os.fdopen(fd, 'w') as file_handle:
            file_handle.write(file_contents)

        os.rename(temp_full_filename, full_filename)
    except (IOError, OSError) as error:
        # On Python 2 a failing write raises IOError, which is not an OSError
        return agent_error(str(error))

    return agent_result_ok
Beispiel #15
0
def stop_monitored_copytool(id):
    """
    Stop the copytool monitor and copytool services for the given id.

    For each service that is running and has an init file, the service is
    stopped and its init file deleted. daemon_reload is called for each
    service either way.

    :param id: suffix of the service names
    :return: Value using simple return protocol
    """
    # NOTE(review): the comment that used to sit here said the monitor is
    # stopped after the copytool so that the unconfigure event can be relayed,
    # but the order below stops the monitor first -- confirm which is intended.
    for service_name in ('chroma-copytool-monitor', 'chroma-copytool'):
        unit = ServiceControl.create('%s-%s' % (service_name, id))
        init_file = _init_file_name(service_name, id)

        if os.path.exists(init_file) and unit.running:
            stop_error = unit.stop()

            if stop_error:
                return agent_error(stop_error)

            os.remove(init_file)

        # Finally cause the system agents to see our changes.
        unit.daemon_reload()

    return agent_result_ok
Beispiel #16
0
def update_profile(profile):
    '''
    Store the given profile, installing or removing the management package when 'managed' changes

    :param profile: mapping with a 'managed' entry; saved as the 'profile' setting
    :return: error or result OK
    '''
    # This is an incomplete solution but the incompleteness is at the bottom of the stack and we need this as a
    # fix up for 2.2 release.
    #
    # What really needs to happen here is that the profile contains the name of the packages to install and then
    # this code would diff the old list and the new list and remove and add appropriately. For now we are just
    # going to do that in a hard coded way using the managed property.
    #
    # To do this properly the profile needs to contain the packages and the endpoint needs to return them. We are
    # going to need it and when we do this function and profiles will need to be extended.
    #
    # This code might want to use the update_packages as well but it's not clear and we are in a pickle here.
    # This code is not bad and doesn't have bad knock on effects.
    old_profile = config.get('settings', 'profile')

    managed_changed = old_profile['managed'] != profile['managed']

    if managed_changed:
        action = 'install' if profile['managed'] else 'remove'

        try:
            yum_util(action,
                     enablerepo=["iml-agent"],
                     packages=['chroma-agent-management'])
        except AgentShell.CommandExecutionError as cee:
            # The new profile is not saved when yum fails
            return agent_error(
                "Unable to set profile because yum returned %s" %
                cee.result.stdout)

    config.update('settings', 'profile', profile)

    return agent_result_ok
Beispiel #17
0
def install_packages(repos, packages):
    """
    Explicitly evaluate and install or update any specific-version dependencies and satisfy even if
    that involves installing an older package than is already installed.
    Primary use case is installing lustre-modules, which depends on a specific kernel package.

    The caller's packages list is left untouched; the versioned requirements are added to a copy.
    Nothing is installed when packages is empty.

    :param repos: List of strings, yum repo names
    :param packages: List of strings, yum package names
    :return: package report of the format given by the lustre device plugin, or an agent_error when
             the HYD-4050 check reports a problem
    """
    if packages:
        yum_util('clean')

        # Work on a copy so the requirements found below do not leak into the caller's list
        to_install = list(packages)

        out = yum_util('requires', enablerepo=repos, packages=packages)
        for requirement in out.strip().split("\n"):
            # Lines of the form "<name> = <version>" become "<name>-<version>"
            match = re.match(r"([^\)/]*) = (.*)", requirement.strip())
            if match:
                require_package, require_version = match.groups()
                to_install.append("%s-%s" % (require_package, require_version))

        yum_util('install', enablerepo=repos, packages=to_install)

        # So now we have installed the packages requested, we will also make sure that any installed packages we
        # have that are already installed are updated to our presumably better versions.
        update_packages = yum_check_update(repos)

        if update_packages:
            daemon_log.debug(
                "The following packages need update after we installed IML packages %s"
                % update_packages)
            yum_util('update', packages=update_packages, enablerepo=repos)

        error = _check_HYD4050()

        if error:
            return agent_error(error)

    return agent_result(lustre.scan_packages())
Beispiel #18
0
def start_monitored_copytool(id):
    """
    Write the init files for the copytool monitor and the copytool and (re)start both services.

    :param id: copytool id; used to look up the copytool settings and as suffix of the service names
    :return: Value using simple return protocol
    """
    # The monitor goes first so that there is a reader on the FIFO when
    # the copytool begins emitting events; the copytool follows.
    settings = _copytool_vars(id)

    for service_name in ('chroma-copytool-monitor', 'chroma-copytool'):
        _write_service_init(service_name, settings['id'],
                            settings['ct_path'],
                            settings['ct_arguments'])

        unit = ServiceControl.create('%s-%s' % (service_name, id))

        unit.daemon_reload()

        # A service that is already running is restarted rather than started
        bring_up = unit.restart if unit.running else unit.start
        failure = bring_up()

        if failure:
            return agent_error(failure)

    return agent_result_ok
Beispiel #19
0
def configure_pacemaker():
    '''
    Configure pacemaker

    Enables, stops, starts and then configures pacemaker, giving up at the
    first step that does not report agent_result_ok.

    :return: agent_result_ok on success, otherwise the value returned by the
             failing step (or an agent_error when corosync cannot be restarted)
    '''
    # Corosync needs to be running for pacemaker -- if it's not, make
    # an attempt to get it going.
    if not corosync_service.running:
        restart_error = corosync_service.restart()

        if restart_error:
            return agent_error(restart_error)

    steps = (enable_pacemaker, stop_pacemaker, start_pacemaker,
             _configure_pacemaker)

    for step in steps:
        outcome = step()

        if outcome != agent_result_ok:
            return outcome

    time.sleep(1)
    return agent_result_ok
 def assertAgentError(self, value, message):
     """Assert that value equals the agent_error built from message."""
     expected = agent_error(message)
     self.assertEqual(value, expected)
Beispiel #21
0
def configure_target_ha(primary, device, ha_label, uuid, mount_point):
    '''
    Configure the target high availability

    On the primary the Pacemaker primitive for the target is created (unless
    one with the same device and mount point already exists). On every node a
    location constraint for this node is added and the mount point directory
    is created.

    :param primary: truthy when acting for the target's primary node
    :param device: block device, compared against an existing resource
    :param ha_label: Pacemaker resource id of the target
    :param uuid: value of the primitive's "target" instance attribute
    :param mount_point: directory to create; compared against an existing resource

    Return: Value using simple return protocol
    '''

    if primary:
        # If the target already exists with the same params, skip.
        # If it already exists with different params, that is an error
        rc, stdout, stderr = AgentShell.run_old(
            ["crm_resource", "-r", ha_label, "-g", "target"])
        if rc == 0:
            info = _get_target_config(stdout.rstrip("\n"))
            if info['bdev'] == device and info['mntpt'] == mount_point:
                return agent_result_ok
            else:
                return agent_error(
                    "A resource with the name %s already exists" % ha_label)

        primitive_xml = "<primitive class=\"ocf\" provider=\"chroma\" type=\"Target\" id=\"%s\">\
  <meta_attributes id=\"%s-meta_attributes\">\
    <nvpair name=\"target-role\" id=\"%s-meta_attributes-target-role\" value=\"Stopped\"/>\
  </meta_attributes>\
  <operations id=\"%s-operations\">\
    <op id=\"%s-monitor-5\" interval=\"5\" name=\"monitor\" timeout=\"60\"/>\
    <op id=\"%s-start-0\" interval=\"0\" name=\"start\" timeout=\"300\"/>\
    <op id=\"%s-stop-0\" interval=\"0\" name=\"stop\" timeout=\"300\"/>\
  </operations>\
  <instance_attributes id=\"%s-instance_attributes\">\
    <nvpair id=\"%s-instance_attributes-target\" name=\"target\" value=\"%s\"/>\
  </instance_attributes>\
</primitive>" % (ha_label, ha_label, ha_label, ha_label, ha_label, ha_label,
                 ha_label, ha_label, ha_label, uuid)

        # The primitive is handed to cibadmin through a temporary file, which
        # is removed again whether or not the write or cibadmin succeed
        tmp_f, tmp_name = tempfile.mkstemp()
        try:
            try:
                os.write(tmp_f, primitive_xml)
            finally:
                os.close(tmp_f)

            cibadmin(["-o", "resources", "-C", "-x", "%s" % tmp_name])
        finally:
            os.remove(tmp_name)

        score = 20
        preference = "primary"
    else:
        score = 10
        preference = "secondary"

    # Hostname. This is a short-term point fix that will allow us to make HP2 release more functional. Between el6 and el7
    # (truthfully we should probably be looking at Pacemaker or Corosync versions) Pacemaker started to use fully qualified
    # domain names rather than just the nodename.  lotus-33vm15.lotus.hpdd.lab.intel.com vs lotus-33vm15. To keep compatibility
    # easily we have to make the constraints follow the same fqdn vs node.
    if platform_info.distro_version >= 7.0:
        node = socket.getfqdn()
    else:
        node = os.uname()[1]

    result = cibadmin([
        "-o", "constraints", "-C", "-X",
        "<rsc_location id=\"%s-%s\" node=\"%s\" rsc=\"%s\" score=\"%s\"/>" %
        (ha_label, preference, node, ha_label, score)
    ])

    # rc 76 is reported as "constraint already exists"
    if result.rc == 76:
        return agent_error("A constraint with the name %s-%s already exists" %
                           (ha_label, preference))

    _mkdir_p_concurrent(mount_point)

    return agent_result_ok
    corosync_service.disable()
    mcast_port = None

    with open("/etc/corosync/corosync.conf") as f:
        for line in f.readlines():
            match = re.match("\s*mcastport:\s*(\d+)", line)
            if match:
                mcast_port = match.group(1)
                break
    if mcast_port is None:
        return agent_error("Failed to find mcastport in corosync.conf")

    try:
        remove("/etc/corosync/corosync.conf")
    except OSError, e:
        if e.errno != errno.ENOENT:
            return agent_error("Failed to remove corosync.conf")
    except:
        return agent_error("Failed to remove corosync.conf")

    error = firewall_control.remove_rule(mcast_port, "udp", "corosync", persist=True)

    if error:
        return agent_error(error)

    return agent_result_ok


# Functions this module lists as its actions
# (presumably picked up by the agent's action plugin loader -- confirm)
ACTIONS = [
    start_corosync,
    stop_corosync,
    configure_corosync,
    unconfigure_corosync,
]