Ejemplo n.º 1
0
def unconfigure_target_ha(primary, ha_label, uuid):
    """
    Remove the high availability configuration of a target.

    :param primary: Boolean, True if localhost is the primary
    :param ha_label: String that identifies the resource
    :param uuid: UUID that identifies the target config
    :return: Value using simple return protocol
    """
    with PreservePacemakerCorosyncState():
        # The stored target config is looked up first, before any check runs.
        target_info = _get_target_config(uuid)

        # Refuse to unconfigure a resource that still reports a location.
        if get_resource_location(ha_label):
            message = "cannot unconfigure-ha: {} is still running ".format(
                ha_label)
            return agent_error(message)

        _unconfigure_target_priority(primary, ha_label)

        # Only the primary goes on to remove the resource itself.
        if not primary:
            return agent_result_ok

        cleanup = _unconfigure_target_ha(ha_label, target_info)

        # Return codes 0 and 234 are both accepted; anything else is an error.
        if cleanup.rc not in (0, 234):
            return agent_error(
                "Error {} trying to cleanup resource {}".format(
                    cleanup.rc, ha_label))

        return agent_result_ok
Ejemplo n.º 2
0
def stop_target(ha_label):
    """
    Stop the high availability target

    :param ha_label: String that identifies the resource
    :return: Value using simple return protocol
    """
    # HYD-7230: brute force, keep re-issuing the stop. The attempt counter
    # allows 4 passes in total before giving up.
    for attempt in range(1, 5):
        # When the resource named by _zfs_name() exists the whole group is
        # disabled; group disable will disable all members of the group
        # regardless of current status.
        if _resource_exists(_zfs_name(ha_label)):
            resource = _group_name(ha_label)
        else:
            resource = ha_label

        # Issue the command to Pacemaker to stop the target
        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'disable', resource])
        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        # Only the non-final attempts are logged; the final one falls through
        # to the error return below.
        if attempt < 4:
            console_log.info("failed to stop target %s", ha_label)

    return agent_error("Failed to stop target {}".format(ha_label))
Ejemplo n.º 3
0
def get_corosync_autoconfig():
    """
    Detect the two corosync rings and describe them.

    :return: dictionary containing 'result' or 'error'.
    """
    ring0 = get_shared_ring()
    if not ring0:
        return agent_error("Failed to detect ring0 interface")

    # The ring1 network parameters are derived from ring0.
    ring1_ipaddr, ring1_prefix = generate_ring1_network(ring0)

    try:
        ring1 = detect_ring1(ring0, ring1_ipaddr, ring1_prefix)
    except RingDetectionError as e:
        return agent_error(e.message)

    # ring0 is reported as shared (not dedicated), ring1 as dedicated.
    interfaces = {}
    interfaces[ring0.name] = {
        "dedicated": False,
        "ipaddr": ring0.ipv4_address,
        "prefix": ring0.ipv4_prefixlen,
    }
    interfaces[ring1.name] = {
        "dedicated": True,
        "ipaddr": ring1.ipv4_address,
        "prefix": ring1.ipv4_prefixlen,
    }

    # The multicast port reported is the one carried by ring1.
    return agent_result({
        "interfaces": interfaces,
        "mcast_port": ring1.mcastport,
    })
Ejemplo n.º 4
0
def get_corosync_autoconfig():
    """
    Detect the corosync ring interfaces and report their settings.

    :return: dictionary containing 'result' or 'error'.
    """

    def describe(ring, dedicated):
        # One entry of the 'interfaces' mapping in the result payload.
        return {
            'dedicated': dedicated,
            'ipaddr': ring.ipv4_address,
            'prefix': ring.ipv4_prefixlen
        }

    ring0 = get_ring0()
    if not ring0:
        return agent_error('Failed to detect ring0 interface')

    # The candidate ring1 address and prefix are generated from ring0.
    ring1_ipaddr, ring1_prefix = generate_ring1_network(ring0)

    try:
        ring1 = detect_ring1(ring0, ring1_ipaddr, ring1_prefix)
    except RingDetectionError as e:
        return agent_error(e.message)

    interfaces = {}
    interfaces[ring0.name] = describe(ring0, False)
    interfaces[ring1.name] = describe(ring1, True)

    return agent_result({
        'interfaces': interfaces,
        'mcast_port': ring1.mcastport
    })
Ejemplo n.º 5
0
def unconfigure_target_ha(primary, ha_label, uuid):
    """
    Remove the high availability configuration of a target.

    :param primary: True if localhost is the primary for the target
    :param ha_label: String that identifies the resource
    :param uuid: accepted for interface compatibility; not read here
    :return: Value using simple return protocol
    """
    with PreservePacemakerCorosyncState():
        # Refuse to unconfigure a resource that still reports a location.
        if get_resource_location(ha_label):
            return agent_error("cannot unconfigure-ha: %s is still running " %
                               ha_label)

        if not primary:
            # The secondary only drops its own location constraint; the
            # outcome of that deletion is not inspected.
            cibadmin(
                ["-D", "-X",
                 "<rsc_location id=\"%s-secondary\">" % ha_label])
            return agent_result_ok

        # The outcome of removing the primary location constraint is not
        # inspected; only the primitive removal below is checked.
        cibadmin(
            ["-D", "-X",
             "<rsc_location id=\"%s-primary\">" % ha_label])
        removal = cibadmin(["-D", "-X", "<primitive id=\"%s\">" % ha_label])

        # Return codes 0 and 234 are both accepted.
        if removal.rc not in (0, 234):
            return agent_error("Error %s trying to cleanup resource %s" %
                               (removal.rc, ha_label))

        return agent_result_ok
Ejemplo n.º 6
0
def stop_target(ha_label):
    """
    Stop the high availability target

    :param ha_label: String that identifies the resource
    :return: Value using simple return protocol
    """
    stop_command = [
        'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
        'Stopped'
    ]

    # HYD-7230: brute force, keep re-issuing the stop. The attempt counter
    # allows 4 passes in total before giving up.
    for attempt in range(1, 5):
        # Issue the command to Pacemaker to stop the target
        error = AgentShell.run_canned_error_message(stop_command)
        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        # The final attempt is reported through the error return below
        # instead of the console log.
        if attempt < 4:
            console_log.info("failed to stop target %s" % ha_label)

    return agent_error("failed to stop target %s" % ha_label)
Ejemplo n.º 7
0
def unconfigure_corosync2(host_fqdn, mcast_port):
    """
    Unconfigure the corosync application.

    For corosync2 pcsd is left enabled: the host node is removed from the
    cluster and corosync is disabled from auto starting (service should
    already be stopped in state transition).

    Note that pcs cluster commands handle editing and removal of the
    corosync.conf file.

    :param host_fqdn: name of the node to remove from the cluster
    :param mcast_port: corosync port whose udp firewall rule is removed
    :return: Value using simple return protocol
    """
    disable_error = corosync_service.disable()
    if disable_error:
        return agent_error(disable_error)

    # Capture the cluster membership now: the node removal below may delete
    # the conf file, and we need to know whether we were the only node.
    cluster_nodes = _nodes_in_cluster()

    removal = AgentShell.run(
        ["pcs", "--force", "cluster", "node", "remove", host_fqdn])

    if removal.rc != 0:
        if "No such file or directory" in removal.stderr:
            # A missing configuration file still counts as success.
            console_log.warning(removal.stderr)
        elif "Error: Unable to update any nodes" not in removal.stderr:
            return agent_error(removal.stderr)
        elif len(cluster_nodes) != 1:
            # "Unable to update any nodes" is only expected when this was the
            # last node in the cluster.
            return agent_error(removal.stderr)

    # Remove the pcs rule first; the corosync rule is only attempted when the
    # first removal reported no error.
    firewall_error = firewall_control.remove_rule(
        PCS_TCP_PORT, "tcp", "pcs", persist=True)
    if not firewall_error:
        firewall_error = firewall_control.remove_rule(
            mcast_port, "udp", "corosync", persist=True)

    return agent_ok_or_error(firewall_error)
Ejemplo n.º 8
0
def start_target(ha_label):
    """
    Start the high availability target

    :param ha_label: String that identifies the resource
    :return: Value using simple return protocol
    """
    # HYD-1989: brute force, keep re-issuing the start. The attempt counter
    # allows 4 passes in total before giving up.
    for attempt in range(1, 5):
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])
        if error:
            return agent_error(error)

        # now wait for it to start (the outcome of the wait is not inspected)
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed): the start only
        # counts when crm_mon shows a line for this label without "FAILED".
        _rc, stdout, _stderr = AgentShell.run_old(['crm_mon', '-1'])
        started_cleanly = any(
            line.lstrip().startswith(ha_label) and "FAILED" not in line
            for line in stdout.split("\n"))

        if started_cleanly:
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" %
                                   ha_label)
            return agent_result(location)

        # try to leave things in a sane state for a failed mount
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m',
            '-v', 'Stopped'
        ])
        if error:
            return agent_error(error)

        if attempt < 4:
            console_log.info("failed to start target %s" % ha_label)

    return agent_error("Failed to start target %s" % ha_label)
Ejemplo n.º 9
0
def start_target(ha_label):
    """
    Start the high availability target

    :param ha_label: String that identifies the resource
    :return: Value using simple return protocol; on success the result is the
             location of the resource
    """
    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    location = get_resource_location(ha_label)
    primary = _find_resource_constraint(ha_label, True)

    if location:
        # Already started: nothing to do when it is where the primary
        # constraint points, otherwise move it there.
        if location == primary:
            return agent_result(location)

        console_log.info(
            "Resource %s already started, moving to primary node %s",
            ha_label,
            primary,
        )
        error = _move_target(ha_label, primary)
        if error:
            return agent_error(error)
        return agent_result(primary)

    try:
        _res_set_started(ha_label, True)

        zfs_resource = _zfs_name(ha_label)
        if _resource_exists(zfs_resource):
            _res_set_started(zfs_resource, True)
            # enable group also, in case group was disabled
            _res_set_started(_group_name(ha_label), True)

        # now wait for it to start
        if _wait_target(ha_label, True):
            location = get_resource_location(ha_label)
            if location:
                return agent_result(location)
            return agent_error(
                "Started {} but now can't locate it!".format(ha_label))

        # try to leave things in a sane state for a failed mount
        _res_set_started(ha_label, False)
        return agent_error("Failed to start target {}".format(ha_label))

    except AgentShell.CommandExecutionError as err:
        failure = err.result
        return agent_error(
            "Error (%s) running '%s': '%s' '%s'" %
            (failure.rc, err.command, failure.stdout, failure.stderr))
Ejemplo n.º 10
0
    def test_set_profile_fail(self):
        """A failing yum install makes update_profile return an agent error."""
        install_args = ('yum', 'install', '-y', '--exclude', 'kernel-debug',
                        'python2-iml-agent-management')

        # Three times because yum will try three times; every failed install
        # is followed by a metadata clean.
        expected_commands = []
        for _ in range(3):
            expected_commands.append(
                CommandCaptureCommand(
                    install_args,
                    rc=1,
                    stdout="Bad command stdout",
                    stderr="Bad command stderr"))
            expected_commands.append(
                CommandCaptureCommand(('yum', 'clean', 'metadata')))
        self.add_commands(*expected_commands)

        config.update('settings', 'profile', {'managed': False})

        # Go from managed = False to managed = True, but it will fail.
        outcome = agent_updates.update_profile({'managed': True})
        self.assertEqual(
            outcome,
            agent_error(
                'Unable to set profile because yum returned Bad command stdout'
            ))
        self.assertRanAllCommandsInOrder()
Ejemplo n.º 11
0
def start_monitored_copytool(id):
    """
    Write the init files for a copytool and its monitor and (re)start both.

    :param id: copytool identifier used to build the service names
    :return: Value using simple return protocol
    """
    # Start the monitor first so that we have a reader on the FIFO when
    # the copytool begins emitting events. Then start the copytool
    copytool_vars = _copytool_vars(id)
    ct_id = copytool_vars["id"]
    ct_path = copytool_vars["ct_path"]
    ct_arguments = copytool_vars["ct_arguments"]

    for service_name in ("chroma-copytool-monitor", "chroma-copytool"):
        _write_service_init(service_name, ct_id, ct_path, ct_arguments)

        service = ServiceControl.create("%s-%s" % (service_name, id))
        service.daemon_reload()

        # A service that is already running is restarted instead of started.
        bring_up = service.restart if service.running else service.start
        error = bring_up()
        if error:
            return agent_error(error)

    return agent_result_ok
Ejemplo n.º 12
0
def configure_pacemaker():
    """
    Configure pacemaker

    :return: agent_result_ok when every step succeeds; otherwise the error
             value produced by the step that failed
    """
    # Corosync needs to be running for pacemaker -- if it's not, make
    # an attempt to get it going.
    if not corosync_service.running:
        restart_error = corosync_service.restart()
        if restart_error:
            return agent_error(restart_error)

    # Each step already returns a simple-return-protocol value, so the first
    # one that is not agent_result_ok is handed back unchanged.
    steps = (
        enable_pacemaker,
        stop_pacemaker,
        start_pacemaker,
        _configure_pacemaker,
    )
    for step in steps:
        outcome = step()
        if outcome != agent_result_ok:
            return outcome

    time.sleep(1)
    return agent_result_ok
Ejemplo n.º 13
0
def _failoverback_target(ha_label, primary):
    """Move a target to its primary or secondary node

    :param ha_label: String that identifies the resource
    :param primary: True to look up the primary node, False for the secondary
    :return: Value using simple return protocol
    """
    node = _find_resource_constraint(ha_label, primary)
    if not node:
        role = 'primary' if primary else 'secondary'
        return agent_error(
            "Unable to find the {} server for '{}'".format(role, ha_label))

    error = _move_target(ha_label, node)
    return agent_error(error) if error else agent_result_ok
Ejemplo n.º 14
0
def _configure_pacemaker():
    """
    Configure pacemaker if this node is the dc.

    Polls until the configuration is reported as configured or
    PACEMAKER_CONFIGURE_TIMEOUT has elapsed.

    :return: agent_ok if no error else returns an agent_error
    """
    pacemaker_config = PacemakerConfig()
    deadline = time.time() + PACEMAKER_CONFIGURE_TIMEOUT
    failure = None

    while (pacemaker_config.configured is False) and (time.time() < deadline):
        if not pacemaker_config.is_dc:
            daemon_log.info(
                'Not configuring (global) pacemaker configuration because I am not the DC'
            )
        else:
            daemon_log.info(
                'Configuring (global) pacemaker configuration because I am the DC'
            )

            failure = _do_configure_pacemaker(pacemaker_config)
            if failure:
                return agent_error(failure)

        # Every pass pauses before the configured state is checked again.
        time.sleep(10)

    # Leaving the loop still unconfigured means the deadline was reached.
    if pacemaker_config.configured is False:
        failure = 'Failed to configure (global) pacemaker configuration dc=%s' % pacemaker_config.dc

    return agent_ok_or_error(failure)
Ejemplo n.º 15
0
def _failoverback_target(ha_label, destination):
    """Move a target to the node selected by the destination constraint

    :param ha_label: String that identifies the resource
    :param destination: passed to the constraint lookup and quoted in the
                        error message when no node is found
    :return: Value using simple return protocol
    """
    node = _find_resource_constraint(ha_label, destination)

    if node:
        error = _move_target(ha_label, node)
        return agent_error(error) if error else agent_result_ok

    return agent_error("Unable to find the %s server for '%s'" %
                       (destination, ha_label))
Ejemplo n.º 16
0
def install_packages(repos, packages):
    """
    Explicitly evaluate and install or update any specific-version dependencies and satisfy even if
    that involves installing an older package than is already installed.
    Primary use case is installing lustre-modules, which depends on a specific kernel package.

    :param repos: List of strings, yum repo names
    :param packages: List of strings, yum package names (not modified)
    :return: Value using simple return protocol
    """
    if packages != []:
        yum_util("clean")

        # Work on a copy so the version-pinned requirements appended below do
        # not leak into the caller's list.
        to_install = list(packages)

        out = yum_util("requires", enablerepo=repos, packages=packages)
        for requirement in out.strip().split("\n"):
            # Lines of the form "<name> = <version>" pin an exact version;
            # add them as "<name>-<version>" so yum installs that version.
            match = re.match(r"([^\)/]*) = (.*)", requirement.strip())
            if match:
                require_package, require_version = match.groups()
                to_install.append(
                    "%s-%s" % (require_package, require_version))

        yum_util("install", enablerepo=repos, packages=to_install)

        error = _check_HYD4050()

        if error:
            return agent_error(error)

    # The update check service is started even when there was nothing to
    # install.
    ServiceControl.create("iml-update-check").start(0)

    return agent_result_ok
Ejemplo n.º 17
0
    def test_set_profile_fail(self):
        """A failing yum install makes update_profile return an agent error."""

        def failed_install():
            # A yum install that exits with rc=1.
            return CommandCaptureCommand(
                (
                    "yum",
                    "install",
                    "-y",
                    "--exclude",
                    "kernel-debug",
                    "python2-iml-agent-management",
                ),
                rc=1,
                stdout="Bad command stdout",
                stderr="Bad command stderr",
            )

        def clean_metadata():
            return CommandCaptureCommand(("yum", "clean", "metadata"))

        # Three times because yum will try three times.
        self.add_commands(
            failed_install(),
            clean_metadata(),
            failed_install(),
            clean_metadata(),
            failed_install(),
            clean_metadata(),
        )

        config.update("settings", "profile", {"managed": False})

        # Go from managed = False to managed = True, but it will fail.
        expected = agent_error(
            "Unable to set profile because yum returned Bad command stdout"
        )
        self.assertEqual(agent_updates.update_profile({"managed": True}), expected)
        self.assertRanAllCommandsInOrder()
Ejemplo n.º 18
0
def initialise_block_device_drivers():
    """
    Ask every BlockDevice subclass to initialise its driver.

    Stops at the first subclass that reports an error.

    :return: Value using simple return protocol
    """
    console_log.info("Initialising drivers for block device types")

    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.initialise_driver(config.profile_managed)
        if not failure:
            continue
        return agent_error(failure)

    return agent_result_ok
Ejemplo n.º 19
0
def terminate_block_device_drivers():
    """
    Ask every BlockDevice subclass to terminate its driver.

    Stops at the first subclass that reports an error.

    :return: Value using simple return protocol
    """
    console_log.info("Terminating drivers for block device types")

    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.terminate_driver()
        if not failure:
            continue
        return agent_error(failure)

    return agent_result_ok
Ejemplo n.º 20
0
def unconfigure_repo(filename):
    """
    Delete a repo file from REPO_PATH.

    :param filename: name of the repo file, relative to REPO_PATH
    :return: Value using simple return protocol
    """
    repo_file = os.path.join(REPO_PATH, filename)

    try:
        os.remove(repo_file)
    except OSError as error:
        # A file that is already absent is not treated as a failure.
        already_gone = error.errno == errno.ENOENT
        if not already_gone:
            return agent_error(str(error))

    return agent_result_ok
Ejemplo n.º 21
0
def configure_target_ha(primary, device, ha_label, uuid, mount_point):
    """
    Configure the target high availability

    :param primary: True if localhost is the primary for the target
    :param device: block device expected for the target
    :param ha_label: String that identifies the resource
    :param uuid: UUID that identifies the target config
    :param mount_point: directory where the target is mounted
    :return: Value using simple return protocol
    """
    _mkdir_p_concurrent(mount_point)

    if primary:
        info = _get_target_config(uuid)
        already_exists = _resource_exists(ha_label)
        matches_config = (info["bdev"] == device
                          and info["mntpt"] == mount_point)

        # If the target already exists with the same params, skip.
        # If it already exists with different params, that is an error
        if already_exists:
            if matches_config:
                return agent_result_ok
            return agent_error(
                "A resource with the name {} already exists".format(ha_label))

        # A mismatch with the stored config is logged but does not stop the
        # resource from being created from that stored config.
        if not matches_config:
            console_log.error(
                "Mismatch for %s do not match configured (%s on %s) != (%s on %s)",
                ha_label,
                device,
                mount_point,
                info["bdev"],
                info["mntpt"],
            )

        created = _configure_target_ha(ha_label, info, False)
        if created.rc != 0:
            return agent_error("Failed to create {}: {}".format(
                ha_label, created.rc))

    # Both primary and secondary nodes register a location constraint.
    constraint = _configure_target_priority(primary, ha_label, _this_node())
    if constraint.rc != 0:
        return agent_error(
            "Failed to create location constraint on {}: {}".format(
                ha_label, constraint.rc))

    return agent_result_ok
Ejemplo n.º 22
0
def stop_target(ha_label):
    """
    Stop the high availability target

    :param ha_label: String that identifies the resource
    :return: Value using simple return protocol
    """
    try:
        # When the resource named by _zfs_name() exists the stop is applied
        # to the group, otherwise to the target resource itself.
        if _resource_exists(_zfs_name(ha_label)):
            resource = _group_name(ha_label)
        else:
            resource = ha_label

        # Issue the command to Pacemaker to stop the target
        _res_set_started(resource, False)

    except AgentShell.CommandExecutionError as err:
        failure = err.result
        return agent_error(
            "Error (%s) running '%s': '%s' '%s'" %
            (failure.rc, err.command, failure.stdout, failure.stderr))

    if _wait_target(ha_label, False):
        return agent_result_ok

    return agent_error("Failed to stop target {}".format(ha_label))
Ejemplo n.º 23
0
def terminate_block_device_drivers():
    """
    Give each block device type the chance to do any termination it needs when the agent is stopped; this function
    may also be called by the manager.

    :return: Value using simple return protocol
    """
    console_log.info("Terminating drivers for block device types")

    driver_classes = util.all_subclasses(BlockDevice)
    for driver_class in driver_classes:
        outcome = driver_class.terminate_driver()

        # The first driver that reports an error ends the walk.
        if outcome:
            return agent_error(outcome)

    return agent_result_ok
Ejemplo n.º 24
0
def initialise_block_device_drivers():
    """
    Give each block device type the chance to do any initialization it needs when the agent is run; this function
    may also be called by the manager.

    :return: Value using simple return protocol
    """
    console_log.info("Initialising drivers for block device types")

    driver_classes = util.all_subclasses(BlockDevice)
    for driver_class in driver_classes:
        outcome = driver_class.initialise_driver(config.profile_managed)

        # The first driver that reports an error ends the walk.
        if outcome:
            return agent_error(outcome)

    return agent_result_ok
Ejemplo n.º 25
0
def unconfigure_corosync():
    """
    Unconfigure the corosync application.

    Stops and disables the service, reads the multicast port from
    corosync.conf, deletes that file and removes the matching firewall rule.

    :return: Value using simple return protocol
    """
    # NOTE(review): the outcomes of stop() and disable() are not inspected;
    # presumably best-effort - confirm.
    corosync_service.stop()
    corosync_service.disable()
    mcast_port = None

    # The port must be read before the file is removed below; it is needed
    # to find the firewall rule. A missing file propagates as an exception.
    with open("/etc/corosync/corosync.conf") as f:
        for line in f:
            match = re.match(r"\s*mcastport:\s*(\d+)", line)
            if match:
                mcast_port = match.group(1)
                break
    if mcast_port is None:
        return agent_error("Failed to find mcastport in corosync.conf")

    try:
        remove("/etc/corosync/corosync.conf")
    except OSError as e:
        # A file that has already gone is not a failure.
        if e.errno != errno.ENOENT:
            return agent_error("Failed to remove corosync.conf")
    except Exception:
        # Any other ordinary failure is reported; SystemExit and
        # KeyboardInterrupt are deliberately left to propagate.
        return agent_error("Failed to remove corosync.conf")

    error = firewall_control.remove_rule(mcast_port,
                                         "udp",
                                         "corosync",
                                         persist=True)

    if error:
        return agent_error(error)

    return agent_result_ok
Ejemplo n.º 26
0
def configure_repo(filename, file_contents):
    """
    Write a repo file into REPO_PATH.

    The contents are written to a '.tmp' sibling first and then renamed over
    the final name.

    :param filename: name of the repo file, relative to REPO_PATH
    :param file_contents: template whose positional fields {0}, {1}, {2} are
                          filled with the crypto authority file, private key
                          file and certificate file
    :return: Value using simple return protocol
    """
    crypto = Crypto(config.path)
    full_filename = os.path.join(REPO_PATH, filename)
    temp_full_filename = full_filename + '.tmp'

    file_contents = file_contents.format(crypto.AUTHORITY_FILE, crypto.PRIVATE_KEY_FILE, crypto.CERTIFICATE_FILE)

    try:
        # O_TRUNC discards anything left in a stale temp file from an earlier
        # attempt, so a shorter write cannot leave trailing bytes behind.
        # 0o644 is the octal spelling accepted by both Python 2.6+ and 3.
        descriptor = os.open(temp_full_filename, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644)

        # The context manager closes the handle even when the write fails.
        with os.fdopen(descriptor, 'w') as file_handle:
            file_handle.write(file_contents)

        os.rename(temp_full_filename, full_filename)
    except (IOError, OSError) as error:
        # File writes raise IOError on Python 2, os.* calls raise OSError.
        return agent_error(str(error))

    return agent_result_ok
Ejemplo n.º 27
0
    def run(self, cmd, agent_daemon_context, args):
        """
        Look up an agent command by name and invoke it.

        :param cmd: key into self.commands
        :param agent_daemon_context: passed as the first positional argument,
                                     but only to commands that declare a
                                     parameter with that name
        :param args: dict of keyword arguments for the command
        :return: whatever the command returns, or an agent_error when the
                 command name is unknown
        """
        # FIXME: provide a log object to action plugins that we capture
        # and send back to the caller
        try:
            fn = self.commands[cmd]
        except KeyError:
            return agent_error(
                "Requested command %s was unknown to the agent" % cmd)

        # Only pass in the agent_daemon_context if the agent_daemon_context is expected by the function.
        # This feature was added just prior to 3.1 and whilst it would be better to always pass the context the
        # scope of the change was prohibitive at that time.
        # Not a fixme because it is of little value to make the additional changes at this time.
        #
        # co_varnames lists the parameters first and then the local variables,
        # so only the first co_argcount names are inspected; a command that
        # merely has a local with this name must not be handed the context.
        code = fn.__code__
        positional_params = code.co_varnames[:code.co_argcount]

        if 'agent_daemon_context' in positional_params:
            return fn(agent_daemon_context, **args)

        return fn(**args)
Ejemplo n.º 28
0
def configure_corosync(ring0_name, ring1_name, old_mcast_port, new_mcast_port):
    """
    Write corosync.conf for the two rings and open the multicast port; no IP
    address information is required.

    :param ring0_name: interface name used for ring number 0
    :param ring1_name: interface name used for ring number 1
    :param old_mcast_port: None if we are configuring corosync for the first-time, present if changing mcast port
    :param new_mcast_port: desired corosync multicast port as configured by user
    :return: Value using simple return protocol
    """
    # Both rings share the new multicast port; the ring number follows the
    # position of the name in the tuple.
    interfaces = []
    for ring_number, ring_name in enumerate((ring0_name, ring1_name)):
        ring_iface = CorosyncRingInterface(name=ring_name,
                                           ringnumber=ring_number,
                                           mcastport=new_mcast_port)
        interfaces.append(InterfaceInfo(ring_iface, None, None))

    rendered = render_config([info.corosync_iface for info in interfaces])
    write_config_to_file("/etc/corosync/corosync.conf", rendered)

    # When the port is being changed, the rule for the old port goes first.
    if old_mcast_port is not None:
        error = firewall_control.remove_rule(old_mcast_port,
                                             "udp",
                                             "corosync",
                                             persist=True)
        if error:
            return agent_error(error)

    # The service is only enabled when adding the rule reported no error.
    error = firewall_control.add_rule(
        new_mcast_port, "udp", "corosync", persist=True)
    if not error:
        error = corosync_service.enable()

    return agent_ok_or_error(error)
Ejemplo n.º 29
0
def stop_monitored_copytool(id):
    """
    Stop the copytool services for an id and remove their init files.

    :param id: copytool identifier used to build the service names
    :return: Value using simple return protocol
    """
    # Stated intent: stop the monitor after the copytool so that we can relay
    # the unconfigure event.
    # NOTE(review): the tuple below visits the monitor service first, which
    # looks like the opposite of that intent - confirm the desired order.
    for service_name in ('chroma-copytool-monitor', 'chroma-copytool'):
        service = ServiceControl.create('%s-%s' % (service_name, id))
        init_file = _init_file_name(service_name, id)

        # Only a service that has an init file and is running is stopped;
        # its init file is removed once the stop succeeded.
        if os.path.exists(init_file) and service.running:
            error = service.stop()
            if error:
                return agent_error(error)

            os.remove(init_file)

        # Finally cause the system agents to see our changes.
        service.daemon_reload()

    return agent_result_ok
Ejemplo n.º 30
0
    def _fake_invoke_agent(self, host, invoke, args=None):
        """
        Record an agent invocation and replay its pre-registered outcome.

        :param host: object whose fqdn identifies the target host
        :param invoke: name of the agent command being invoked
        :param args: dict of arguments for the command, or None for no arguments
        :return: Value using simple return protocol
        """
        if args is None:
            args = {}

        assert type(args) is dict, "args list must be dict :%s" % type(args)

        invocation = InvokeAgentInvoke(host.fqdn, invoke, args, None, None)
        self._invokes_history.append(invocation)

        # Each replay uses up one of the executions registered for this
        # invocation.
        outcome = self._get_executable_invoke(invocation)
        outcome.executions_remaining -= 1

        # An error takes precedence over a result; with neither set the call
        # simply reports success.
        if outcome.error:
            return agent_error(outcome.error)

        if outcome.result:
            return agent_result(outcome.result)

        return agent_result_ok