コード例 #1
0
ファイル: agent_setup.py プロジェクト: whamcloud/iml-agent
def reregister_server(url, address):
    """Point the agent at a new manager URL and re-register its address.

    If the chroma-agent service is running it is stopped first. The new URL
    is stored with ``conf.set_server_url``, the address and the client's FQDN
    are POSTed to ``<url>reregister/``, and the service is started again once
    the POST has succeeded.

    :param url: manager URL; ``"reregister/"`` is appended to it as-is.
    :param address: address to report to the manager.
    :return: whatever ``AgentClient.post`` returns.
    :raises HttpError: re-raised after being logged when the POST fails; the
        service is not started in that case.
    """
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    conf.set_server_url(url)
    crypto = Crypto(conf.ENV_PATH)
    endpoint = url + "reregister/"
    client = AgentClient(endpoint, ActionPluginManager(),
                         DevicePluginManager(), ServerProperties(), crypto)
    payload = dict(address=address, fqdn=client._fqdn)

    try:
        response = client.post(payload)
    except HttpError:
        console_log.error("Reregistration failed to %s with request %s" %
                          (client.url, payload))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return response
コード例 #2
0
def register_server(url, ca, secret, address=None):
    """Register this agent with the manager and start the agent service.

    Steps, in order: stop the chroma-agent service if it is running, delete
    any existing crypto material and install the supplied authority, register
    against ``<url>register/<secret>/``, install the returned certificate,
    store the manager URL in the config, then enable and start the service.

    :param url: manager URL; the registration path is appended to it as-is.
    :param ca: authority handed to ``Crypto.install_authority``.
    :param secret: registration secret embedded in the registration URL.
    :param address: optional address passed through to ``AgentClient.register``.
    :return: the registration result (its ``'certificate'`` entry is the
        certificate that gets installed).
    """
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    crypto = Crypto(config.path)
    # A previous configuration may not have been removed properly, so clear
    # it out before installing the new authority.
    crypto.delete()
    crypto.install_authority(ca)

    register_url = url + "register/%s/" % secret
    client = AgentClient(register_url, ActionPluginManager(),
                         DevicePluginManager(), ServerProperties(), crypto)

    outcome = client.register(address)
    crypto.install_certificate(outcome['certificate'])

    config.set('settings', 'server', dict(url=url))

    # Enable first, then start; each step is announced before it runs.
    for message, action in (
        ("Enabling chroma-agent service", agent_service.enable),
        ("Starting chroma-agent service", agent_service.start),
    ):
        console_log.info(message)
        action()

    return outcome
コード例 #3
0
def stop_target(ha_label):
    '''
    Stop the high availability target

    The disable command is re-issued after every unsuccessful wait; the
    function gives up after the fourth attempt (HYD-7230: brute force).

    Return: Value using simple return protocol
    '''
    max_attempts = 4

    for attempt in range(1, max_attempts + 1):
        # When a matching ZFS resource exists the whole group is disabled;
        # a group disable affects all members regardless of current status.
        if _resource_exists(_zfs_name(ha_label)):
            resource = _group_name(ha_label)
        else:
            resource = ha_label

        # Issue the command to Pacemaker to stop the target
        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'disable', resource])
        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt == max_attempts:
            return agent_error("Failed to stop target {}".format(ha_label))

        console_log.info("failed to stop target %s", ha_label)
コード例 #4
0
ファイル: manage_node.py プロジェクト: zaja1kun/iml-agent
    def _shutdown():
        """Run ``shutdown`` for the enclosing request, then exit this process.

        ``-H`` is passed when the enclosing ``halt`` is truthy, ``-h``
        otherwise; ``at_time`` is forwarded as the time argument.
        """
        console_log.info("Initiating server shutdown per manager request")

        mode_flag = "-H" if halt else "-h"
        # This will initiate a "nice" shutdown with a wall from root, etc.
        AgentShell.try_run(["shutdown", mode_flag, at_time])

        console_log.info("Terminating")
        # Leave immediately with status 0.
        os._exit(0)
コード例 #5
0
def reregister_server(url, address):
    """Store a new manager URL and re-register the agent's address with it.

    A running chroma-agent service is stopped first. The URL is written to
    the ``settings``/``server`` config entry, the address and the client's
    FQDN are POSTed to ``<url>reregister/``, and the service is started once
    the POST has succeeded.

    :param url: manager URL; ``'reregister/'`` is appended to it as-is.
    :param address: address to report to the manager.
    :return: whatever ``AgentClient.post`` returns.
    :raises HttpError: re-raised after being logged when the POST fails; the
        service is not started in that case.
    """
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    config.set('settings', 'server', dict(url=url))
    crypto = Crypto(config.path)
    endpoint = url + 'reregister/'
    client = AgentClient(endpoint, ActionPluginManager(),
                         DevicePluginManager(), ServerProperties(), crypto)
    payload = dict(address=address, fqdn=client._fqdn)

    try:
        response = client.post(payload)
    except HttpError:
        console_log.error("Reregistration failed to %s with request %s" %
                          (client.url, payload))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return response
コード例 #6
0
def stop_target(ha_label):
    '''
    Stop the high availability target

    The stop request is re-issued after every unsuccessful wait; the
    function gives up after the fourth attempt (HYD-7230: brute force).

    Return: Value using simple return protocol
    '''
    max_attempts = 4
    # Command asking Pacemaker to set the resource's target-role to Stopped
    stop_command = [
        'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
        'Stopped'
    ]

    for attempt in range(1, max_attempts + 1):
        error = AgentShell.run_canned_error_message(stop_command)
        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt == max_attempts:
            return agent_error("failed to stop target %s" % ha_label)

        console_log.info("failed to stop target %s" % ha_label)
コード例 #7
0
ファイル: manage_node.py プロジェクト: zaja1kun/iml-agent
    def _reboot():
        """Run ``shutdown -r`` at the enclosing ``at_time``, then exit this process."""
        console_log.info("Initiating server reboot per manager request")

        # reboot(8) just calls shutdown anyhow.
        reboot_command = ["shutdown", "-r", at_time]
        AgentShell.try_run(reboot_command)

        console_log.info("Terminating")
        # Leave immediately with status 0.
        os._exit(0)
コード例 #8
0
ファイル: corosync.py プロジェクト: zaja1kun/iml-agent
    def set_address(self, ipv4_address, prefix):
        """Bring the interface up and give it the address ``ipv4_address/prefix``.

        Nothing is changed when the interface already reports ``ipv4_address``.
        Otherwise the link is set up, the address is added with ``ip addr add``
        and the interface is polled (refresh, then a one second sleep, at most
        30 times) until it reports the new address. On success the resulting
        configuration is handed to ``node_admin.write_ifcfg``.

        :param ipv4_address: IPv4 address the interface should end up with.
        :param prefix: prefix length, combined with the address as
            ``address/prefix`` for the ``ip`` command.
        :raises RuntimeError: when the interface still does not report
            ``ipv4_address`` once polling is exhausted.
        """
        ifaddr = "%s/%s" % (ipv4_address, prefix)

        console_log.info("Set %s (%s) up" % (self.name, ifaddr))

        if self.ipv4_address == ipv4_address:
            console_log.info("Nothing to do as %s already has address %s" %
                             (self.name, ifaddr))
            return

        # NOTE(review): presumably stops the network manager from touching
        # this device while it is configured by hand - confirm in node_admin.
        node_admin.unmanage_network(self.device, self.mac_address)

        AgentShell.try_run(
            ['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
        AgentShell.try_run(
            ['/sbin/ip', 'addr', 'add', ifaddr, 'dev', self.name])

        # The link address change is asynchronous, so we need to wait for the
        # address to stick or we have a race condition.
        timeout = 30
        while self.ipv4_address != ipv4_address and timeout != 0:
            self.refresh()
            time.sleep(1)
            timeout -= 1

        if self.ipv4_address != ipv4_address:
            # Report the address that was requested: self.ipv4_address is
            # whatever the interface has now, not the one that failed to apply.
            raise RuntimeError(
                'Unable to set the address %s for interface %s' %
                (ipv4_address, self.name))

        node_admin.write_ifcfg(self.device, self.mac_address,
                               self.ipv4_address, self.ipv4_netmask)
コード例 #9
0
ファイル: manage_lnet.py プロジェクト: zaja1kun/iml-agent
def _remove_module(name, modules):
    try:
        m = modules[name]
    except KeyError:
        # It's not loaded, do nothing.
        return None

    console_log.info("Removing %d dependents of %s : %s" %
                     (len(m.dependents), name, m.dependents))
    while m.dependents:
        error = _remove_module(m.dependents.pop(), modules)

        if error:
            return error

    console_log.info("Removing %s" % name)

    error = AgentShell.run_canned_error_message(['rmmod', name])

    if error:
        return error

    modules.pop(name)
    for m in modules.values():
        if name in m.dependents:
            m.dependents.remove(name)

    return None
コード例 #10
0
    def private_key_file(self):
        """Return the path of the PEM private key, creating the key if absent.

        When nothing exists at ``PRIVATE_KEY_FILE`` the key is generated by
        running ``openssl genrsa`` with that path as the output file.
        """
        key_path = self.PRIVATE_KEY_FILE
        if os.path.exists(key_path):
            return key_path

        console_log.info("Generating private key")
        AgentShell.try_run(
            ['openssl', 'genrsa', '-out', key_path, '2048', '-sha256'])

        return self.PRIVATE_KEY_FILE
コード例 #11
0
ファイル: agent_setup.py プロジェクト: whamcloud/iml-agent
    def disable_and_kill():
        """Disable and then stop the ``iml-storage-server.target`` unit."""
        console_log.info("Terminating")

        target_name = "iml-storage-server.target"
        target = ServiceControl.create(target_name)
        # Disable before stopping, in that order.
        target.disable()
        target.stop()
コード例 #12
0
def generate_ring1_network(ring0):
    """Choose an address and prefix length for the ring1 network.

    The subnet comes from ``find_subnet``; the address is that subnet's IP
    combined with the host part of ring0's address (ring0 address masked
    with ring0's hostmask).

    :param ring0: interface object providing ``ipv4_network``,
        ``ipv4_prefixlen``, ``ipv4_hostmask`` and ``ipv4_address``.
    :return: ``(address, prefixlen)``, both as strings.
    """
    # find a good place for the ring1 network
    subnet = find_subnet(ring0.ipv4_network, ring0.ipv4_prefixlen)

    hostmask_bits = int(IPAddress(ring0.ipv4_hostmask))
    ring0_bits = int(IPAddress(ring0.ipv4_address))
    host_part = hostmask_bits & ring0_bits
    address = str(IPAddress(host_part | int(subnet.ip)))

    console_log.info("Chose %s/%d for ring1 address" % (address, subnet.prefixlen))
    return address, str(subnet.prefixlen)
コード例 #13
0
ファイル: manage_node.py プロジェクト: zaja1kun/iml-agent
def stonith(node):
    """Fence-reboot ``node`` through the Pacemaker configuration.

    :param node: name used to look the node up with ``PacemakerConfig.get_node``.
    """
    pacemaker = PacemakerConfig()

    # TODO: signal the manager that a STONITH has been done so that it
    #       doesn't treat it as an AWOL
    console_log.info("Rebooting %s per a STONITH request" % node)

    victim = pacemaker.get_node(node)
    victim.fence_reboot()
コード例 #14
0
ファイル: manage_lnet.py プロジェクト: whamcloud/iml-agent
def start_lnet():
    """
    Place lnet into the 'up' state by running ``lnetctl lnet configure --all``.

    Returns whatever ``AgentShell.run_canned_error_message`` returns for
    that command.
    """
    console_log.info("Starting LNet")

    configure_command = ["lnetctl", "lnet", "configure", "--all"]
    return AgentShell.run_canned_error_message(configure_command)
コード例 #15
0
def terminate_block_device_drivers():
    """Call ``terminate_driver`` on every ``BlockDevice`` subclass.

    Stops at the first subclass that reports an error.

    Return: ``agent_error`` wrapping that error, otherwise ``agent_result_ok``.
    """
    console_log.info("Terminating drivers for block device types")

    failure = None
    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.terminate_driver()
        if failure:
            break

    if failure:
        return agent_error(failure)

    return agent_result_ok
コード例 #16
0
def stop_lnet():
    '''
    Place lnet into the 'down' state.

    ``_rmmod_deps`` is run for "lnet" first (with ksocklnd and ko2iblnd
    passed as exceptions); only when that reports no error is
    ``lctl net down`` run. The first error found, if any, is returned
    through ``agent_ok_or_error``.
    '''
    console_log.info("Stopping LNet")

    failure = _rmmod_deps("lnet", excpt=["ksocklnd", "ko2iblnd"])
    if not failure:
        failure = AgentShell.run_canned_error_message(["lctl", "net", "down"])

    return agent_ok_or_error(failure)
コード例 #17
0
def initialise_block_device_drivers():
    """Call ``initialise_driver`` on every ``BlockDevice`` subclass.

    Each call receives ``config.profile_managed``. Stops at the first
    subclass that reports an error.

    Return: ``agent_error`` wrapping that error, otherwise ``agent_result_ok``.
    """
    console_log.info("Initialising drivers for block device types")

    failure = None
    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.initialise_driver(config.profile_managed)
        if failure:
            break

    if failure:
        return agent_error(failure)

    return agent_result_ok
コード例 #18
0
    def private_key_file(self):
        """Return the path of the PEM private key, creating the key if absent.

        When nothing exists at ``PRIVATE_KEY_FILE`` the key is generated by
        running ``openssl genrsa`` with that path as the output file.
        """
        key_path = self.PRIVATE_KEY_FILE
        if os.path.exists(key_path):
            return key_path

        console_log.info("Generating private key")
        genrsa_command = ["openssl", "genrsa", "-out", key_path, "2048", "-sha256"]
        AgentShell.try_run(genrsa_command)

        return self.PRIVATE_KEY_FILE
コード例 #19
0
def start_lnet():
    '''
    Place lnet into the 'up' state.

    Runs ``lctl net up`` and, only when that reports no error,
    ``modprobe lustre``. The first error found, if any, is returned through
    ``agent_ok_or_error``.
    '''
    console_log.info("Starting LNet")

    failure = AgentShell.run_canned_error_message(["lctl", "net", "up"])
    if not failure:
        # modprobe lustre is a hack for HYD-1263 - Fix or work around LU-1279 -
        # failure trying to mount; should be removed when LU-1279 is fixed
        failure = AgentShell.run_canned_error_message(["modprobe", "lustre"])

    return agent_ok_or_error(failure)
コード例 #20
0
ファイル: corosync.py プロジェクト: whamcloud/iml-agent
def get_cluster_node_name():
    """Return this node's cluster name, or its FQDN when that cannot be read.

    The name is the stripped output of ``crm_node -n``. Any exception from
    running or stripping it is logged and ``socket.getfqdn()`` is returned
    instead.
    """
    try:
        node_name = AgentShell.try_run(["crm_node", "-n"])
        return node_name.strip()
    except Exception as err:
        # Deliberate best-effort: whatever went wrong, fall back to the FQDN.
        message = "Could not get cluster node name {}. Falling back to socket.getfqdn()".format(
            err
        )
        console_log.info(message)

    return socket.getfqdn()
コード例 #21
0
ファイル: manage_lnet.py プロジェクト: whamcloud/iml-agent
def stop_lnet():
    """
    Place lnet into the 'down' state.

    Runs ``lustre_rmmod ptlrpc`` and, only when that reports no error,
    ``lnetctl lnet unconfigure``. The first error found, if any, is returned
    through ``agent_ok_or_error``.
    """
    console_log.info("Stopping LNet")

    failure = AgentShell.run_canned_error_message(["lustre_rmmod", "ptlrpc"])
    if not failure:
        failure = AgentShell.run_canned_error_message(
            ["lnetctl", "lnet", "unconfigure"])

    return agent_ok_or_error(failure)
コード例 #22
0
ファイル: corosync.py プロジェクト: AlexTalker/iml-agent
def find_unused_port(ring0, timeout=10, batch_count=10000):
    """Pick a port on which no traffic to ring0's multicast address was seen.

    Captures packets on ring0 for roughly ``timeout`` seconds, removes every
    destination port that shows up from the list of candidates and returns a
    random choice among the ports that are left.

    :param ring0: interface object providing ``mcastaddr`` and ``name``.
    :param timeout: seconds to keep capturing; also passed to
        ``networking.start_cap``.
    :param batch_count: count passed to each ``cap.dispatch`` call.
    :return: one of the remaining candidate ports, chosen at random.
    :raises RuntimeError: when reading from the capture raises any exception.
    """
    from random import choice

    dest_addr = ring0.mcastaddr
    port_min = 32767
    port_max = 65535
    # Candidates are every second value from port_min up to, but excluding,
    # port_max.
    # NOTE(review): ports.remove() below needs a list, so this relies on
    # range() returning a list (Python 2 behaviour).
    ports = range(port_min, port_max, 2)
    portrange_str = "%s-%s" % (port_min, port_max)

    # Temporary (persist=False) firewall rule for the multicast address; it is
    # removed again in the finally clause below.
    # NOTE(review): the rule is added for "tcp" while the capture filter below
    # is udp - confirm this is intended.
    firewall_control.add_rule(
        0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
    )

    try:
        networking.subscribe_multicast(ring0)
        console_log.info(
            "Sniffing for packets to %s on %s within port range %s"
            % (dest_addr, ring0.name, portrange_str)
        )
        cap = networking.start_cap(
            ring0,
            timeout,
            "host %s and udp and portrange %s" % (dest_addr, portrange_str),
        )

        # Capture callback: drop the packet's destination port from the
        # candidates.
        def recv_packets(header, data):
            tgt_port = networking.get_dport_from_packet(data)

            try:
                ports.remove(tgt_port)
            except ValueError:
                # already removed
                pass

        packet_count = 0
        start = time.time()
        # Keep dispatching batches until the timeout has elapsed.
        while time.time() - start < timeout:
            try:
                packet_count += cap.dispatch(batch_count, recv_packets)
            except Exception as e:
                raise RuntimeError("Error reading from the network: %s" % str(e))

        console_log.info(
            "Finished after %d seconds, sniffed: %d"
            % (time.time() - start, packet_count)
        )
    finally:
        firewall_control.remove_rule(
            0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
        )

    # Raises IndexError if every candidate was seen in use.
    return choice(ports)
コード例 #23
0
def start_target(ha_label):
    '''
    Start the high availability target

    Sets the resource's target-role to Started, waits, then inspects
    `crm_mon -1` output to decide whether the start failed. A failed start
    is rolled back to Stopped and retried; the function gives up on the
    fourth failed attempt.

    Return: Value using simple return protocol; on success the result
    carries the location reported by get_resource_location.
    '''
    # HYD-1989: brute force, retry the start (the loop below gives up on the
    # 4th failed attempt)
    i = 0
    while True:
        i += 1

        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])

        if error:
            return agent_error(error)

        # now wait for it to start (the outcome of the wait is not used;
        # success is judged from crm_mon below)
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed)
        rc, stdout, stderr = AgentShell.run_old(['crm_mon', '-1'])

        # Treated as failed unless some crm_mon line that begins with the
        # label (after leading whitespace) does not contain "FAILED". If no
        # line mentions the label at all, this stays True.
        failed = True
        for line in stdout.split("\n"):
            if line.lstrip().startswith(ha_label):
                if line.find("FAILED") < 0:
                    failed = False

        if failed:
            # try to leave things in a sane state for a failed mount
            error = AgentShell.run_canned_error_message([
                'crm_resource', '-r', ha_label, '-p', 'target-role', '-m',
                '-v', 'Stopped'
            ])

            if error:
                return agent_error(error)

            # Attempts 1-3 log and loop again; attempt 4 reports the failure.
            if i < 4:
                console_log.info("failed to start target %s" % ha_label)
            else:
                return agent_error("Failed to start target %s" % ha_label)

        else:
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" %
                                   ha_label)
            return agent_result(location)
コード例 #24
0
def terminate_block_device_drivers():
    """
    Give every block device type the chance to do its termination work.

    Intended for when the agent is stopped; may also be called by the
    manager. ``terminate_driver`` is called on each ``BlockDevice`` subclass
    and the first error reported ends the run.

    Return: ``agent_error`` wrapping that error, otherwise ``agent_result_ok``.
    """
    console_log.info("Terminating drivers for block device types")

    failure = None
    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.terminate_driver()
        if failure:
            break

    if failure:
        return agent_error(failure)

    return agent_result_ok
コード例 #25
0
def initialise_block_device_drivers():
    """
    Give every block device type the chance to do its initialisation work.

    Intended for when the agent is run; may also be called by the manager.
    ``initialise_driver`` is called on each ``BlockDevice`` subclass with
    ``config.profile_managed`` and the first error reported ends the run.

    Return: ``agent_error`` wrapping that error, otherwise ``agent_result_ok``.
    """
    console_log.info("Initialising drivers for block device types")

    failure = None
    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.initialise_driver(config.profile_managed)
        if failure:
            break

    if failure:
        return agent_error(failure)

    return agent_result_ok
コード例 #26
0
def start_target(ha_label):
    """
    Start the high availability target

    If the resource is already running it is only moved when its current
    location differs from the one returned by _find_resource_constraint.
    Otherwise the resource (plus its ZFS resource and group, when a ZFS
    resource exists) is set started and the function waits for it to come up.

    Return: Value using simple return protocol; on success the result
    carries the resource's location. A failed shell command inside the
    start sequence is reported as an agent error rather than raised.
    """

    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    # if resource already started but not on primary, move it
    location = get_resource_location(ha_label)
    # NOTE(review): presumably the node the resource should preferably run
    # on - confirm against _find_resource_constraint.
    primary = _find_resource_constraint(ha_label, True)
    if location:
        if location != primary:
            console_log.info(
                "Resource %s already started, moving to primary node %s",
                ha_label,
                primary,
            )
            error = _move_target(ha_label, primary)
            if error:
                return agent_error(error)
            location = primary
        # Already running: report where it is (or where it was just moved).
        return agent_result(location)

    try:
        _res_set_started(ha_label, True)
        if _resource_exists(_zfs_name(ha_label)):
            _res_set_started(_zfs_name(ha_label), True)
            # enable group also, in case group was disabled
            _res_set_started(_group_name(ha_label), True)

        # now wait for it to start
        if not _wait_target(ha_label, True):
            # try to leave things in a sane state for a failed mount
            _res_set_started(ha_label, False)

            return agent_error("Failed to start target {}".format(ha_label))

        location = get_resource_location(ha_label)
        if not location:
            return agent_error(
                "Started {} but now can't locate it!".format(ha_label))

        return agent_result(location)

    except AgentShell.CommandExecutionError as err:
        # Turn a failed command into an agent error carrying its return
        # code, command line, stdout and stderr.
        return agent_error(
            "Error (%s) running '%s': '%s' '%s'" %
            (err.result.rc, err.command, err.result.stdout, err.result.stderr))
コード例 #27
0
ファイル: corosync.py プロジェクト: whamcloud/iml-agent
def find_unused_port(ring0, timeout=10, batch_count=10000):
    """Pick a port on which no traffic to ring0's multicast address was seen.

    Sniffs UDP packets on ring0 for ``timeout`` seconds, excludes every
    destination port that was observed from the candidates and returns a
    random choice among the ports that are left.

    :param ring0: interface object providing ``mcastaddr`` and ``name``.
    :param timeout: seconds to sniff for.
    :param batch_count: not used by this implementation; kept so existing
        callers keep working.
    :return: one of the remaining candidate ports, chosen at random.
    :raises IndexError: from ``random.choice`` if every candidate was seen
        in use.
    """
    from random import choice

    dest_addr = ring0.mcastaddr
    port_min = 32767
    port_max = 65535
    # Candidates are every second value from port_min up to, but excluding,
    # port_max.
    ports = list(range(port_min, port_max, 2))
    portrange_str = "%s-%s" % (port_min, port_max)

    # Temporary (persist=False) firewall rule for the multicast address; it is
    # removed again in the finally clause below.
    # NOTE(review): the rule is added for "tcp" while only UDP packets are
    # sniffed - confirm this is intended.
    firewall_control.add_rule(
        0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
    )

    try:
        console_log.info(
            "Sniffing packets on {}({}) within range: {}".format(
                ring0.name, dest_addr, portrange_str
            )
        )

        # Keep only UDP packets addressed to dest_addr whose destination port
        # lies inside the candidate range.
        packets = sniff(
            iface=ring0.name,
            lfilter=lambda x: x.haslayer(UDP)
            and isinstance(x[UDP].dport, (int, long))
            and x[UDP].dport >= port_min
            and x[UDP].dport <= port_max
            and x[IP].dst == dest_addr,
            timeout=timeout,
        )

        console_log.info(
            "Finished after %d seconds, sniffed: %d" % (timeout, len(packets))
        )

        # sniff() hands back the captured packets, not port numbers (assumes
        # scapy's sniff - confirm against the file's imports), so the
        # destination port has to be read from each packet's UDP layer.
        # Removing the packet objects themselves never matched any candidate.
        used_ports = set(packet[UDP].dport for packet in packets)
        ports = [port for port in ports if port not in used_ports]

    finally:
        firewall_control.remove_rule(
            0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
        )

    return choice(ports)
コード例 #28
0
def get_resource_locations():
    """Parse `crm_mon -1` to identify where (if anywhere) resources
    (i.e. targets) are running.

    Runs ``crm_mon -1 -r -X`` and passes its stdout to
    ``_get_resource_locations``.

    Returns: the value of ``_get_resource_locations`` (presumably a mapping
    of resource_id -> location or None; confirm against that helper), or an
    empty dict when crm_mon is not installed or exits with a non-zero code.

    Raises: OSError for any failure to run crm_mon other than ENOENT.
    """
    try:
        result = AgentShell.run(["crm_mon", "-1", "-r", "-X"])
    except OSError as err:
        # ENOENT is fine here.  Pacemaker might not be installed yet.
        if err.errno != errno.ENOENT:
            # Bare raise keeps the original traceback intact.
            raise
        return {}

    if result.rc != 0:
        console_log.info("crm_mon failed (%d): '%s' '%s'", result.rc,
                         result.stdout, result.stderr)
        return {}

    return _get_resource_locations(result.stdout)
コード例 #29
0
ファイル: corosync.py プロジェクト: zaja1kun/iml-agent
def get_ring0():
    """Return the corosync ring0 interface.

    ring0 is always the interface used for agent->manager comms: the manager
    host from ``IML_MANAGER_URL`` is resolved and ``ip route get`` is asked
    which device reaches it.

    :return: a ``CorosyncRingInterface`` for that device.
    :raises RuntimeError: when no ``dev`` entry is found in the route output,
        or when the interface's prefix length is below 9.
    """
    from urlparse import urlparse

    server_url = urljoin(os.environ["IML_MANAGER_URL"], "agent")
    manager_host = urlparse(server_url).hostname
    manager_address = socket.gethostbyname(manager_host)

    route_output = AgentShell.try_run(
        ['/sbin/ip', 'route', 'get', manager_address])

    # The device name is the token following "dev" in the route output.
    match = re.search(r'dev\s+([^\s]+)', route_output)
    if not match:
        raise RuntimeError("Unable to find ring0 dev in %s" % route_output)
    manager_dev = match.group(1)

    console_log.info("Chose %s for corosync ring0" % manager_dev)
    ring0 = CorosyncRingInterface(manager_dev)

    # Prefix lengths below 9 are rejected as too large a subnet.
    if ring0.ipv4_prefixlen < 9:
        raise RuntimeError("%s subnet is too large (/%s)" %
                           (ring0.name, ring0.ipv4_prefixlen))

    return ring0
コード例 #30
0
def clear_targets(force=False):
    """Stop and unconfigure every HA target reported by ``_query_ha_targets``.

    Without ``force`` nothing is changed: a warning describing the
    consequences is logged and the process exits immediately with status 1.

    :param force: must be true for the targets to actually be cleared.
    """
    if not force:
        from os import _exit
        import textwrap
        warning = """
        clear-targets will forcibly unmount and unconfigure all Lustre targets
        on EVERY node in this HA domain.  This is an irreversible and
        potentially very destructive operation.  Data loss may occur.  Please
        do not use it unless you fully understand the consequences!  If you
        are sure that this command does what you intend to do, then you must
        supply the --force flag to avoid seeing this message.
        """
        # Logger.warn is a deprecated alias; warning() is what the rest of
        # the agent code uses.
        console_log.warning(textwrap.fill(textwrap.dedent(warning)))
        _exit(1)

    # The results of the stop and unconfigure calls are not inspected; every
    # target is attempted.
    for resource, attrs in _query_ha_targets().items():
        console_log.info("Stopping %s" % resource)
        stop_target(attrs['ha_label'])
        console_log.info("Unconfiguring %s" % resource)
        unconfigure_target_ha(True, attrs['ha_label'], attrs['uuid'])