def reregister_server(url, address):
    """
    Update manager url and register agent address with manager

    If the chroma-agent service is running it is stopped first.  The new url
    is stored through ``conf.set_server_url`` and the agent's address and fqdn
    are POSTed to ``url + "reregister/"``.  The service is started again only
    after a successful POST.

    :param url: manager url; "reregister/" is appended to it as-is
    :param address: address of this agent, sent to the manager
    :return: the value returned by ``AgentClient.post``
    :raises HttpError: re-raised (after logging) when the POST fails
    """
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    conf.set_server_url(url)

    credentials = Crypto(conf.ENV_PATH)
    client = AgentClient(
        url + "reregister/",
        ActionPluginManager(),
        DevicePluginManager(),
        ServerProperties(),
        credentials,
    )
    payload = {"address": address, "fqdn": client._fqdn}

    try:
        response = client.post(payload)
    except HttpError:
        console_log.error("Reregistration failed to %s with request %s" %
                          (client.url, payload))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()
    return response
def register_server(url, ca, secret, address=None):
    """
    Register this agent with the manager at ``url``.

    Stops the chroma-agent service if it is running, replaces any existing
    crypto material with the supplied authority, registers through
    ``url + "register/<secret>/"``, installs the returned certificate, stores
    the server url in the config and finally enables and starts the service.

    :param url: manager url; "register/<secret>/" is appended to it as-is
    :param ca: authority passed to ``Crypto.install_authority``
    :param secret: registration secret embedded in the registration url
    :param address: optional address handed to ``AgentClient.register``
    :return: the registration result (its 'certificate' entry is installed)
    """
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    crypto = Crypto(config.path)
    # Call delete in case we are over-writing a previous configuration that
    # wasn't removed properly
    crypto.delete()
    crypto.install_authority(ca)

    registration_url = url + "register/%s/" % secret
    client = AgentClient(registration_url,
                         ActionPluginManager(),
                         DevicePluginManager(),
                         ServerProperties(),
                         crypto)

    registration_result = client.register(address)
    crypto.install_certificate(registration_result['certificate'])

    config.set('settings', 'server', {'url': url})

    console_log.info("Enabling chroma-agent service")
    agent_service.enable()

    console_log.info("Starting chroma-agent service")
    agent_service.start()

    return registration_result
def stop_target(ha_label):
    """
    Stop the high availability target

    Return: Value using simple return protocol
    """
    # HYD-7230: brute force, retry the stop; the loop below makes at most
    # four attempts before giving up.
    max_attempts = 4

    for attempt in range(1, max_attempts + 1):
        # Issue the command to Pacemaker to stop the target
        if _resource_exists(_zfs_name(ha_label)):
            # Group disable will disable all members of group regardless of
            # current status
            resource = _group_name(ha_label)
        else:
            resource = ha_label

        error = AgentShell.run_canned_error_message(
            ['pcs', 'resource', 'disable', resource])
        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt == max_attempts:
            return agent_error("Failed to stop target {}".format(ha_label))

        console_log.info("failed to stop target %s", ha_label)
def _shutdown():
    """
    Log the request, run ``shutdown`` and terminate this process.

    ``halt`` and ``at_time`` are not defined in this block; they are read
    from the enclosing scope.  ``halt`` selects "-H" over "-h" and
    ``at_time`` is passed to ``shutdown`` as the time argument.
    """
    console_log.info("Initiating server shutdown per manager request")

    if halt:
        mode_flag = "-H"
    else:
        mode_flag = "-h"

    # This will initiate a "nice" shutdown with a wall from root, etc.
    AgentShell.try_run(["shutdown", mode_flag, at_time])

    console_log.info("Terminating")
    os._exit(0)
def reregister_server(url, address):
    """
    Update manager url and register agent address with manager

    If the chroma-agent service is running it is stopped first.  The new url
    is written to the 'settings'/'server' config entry and the agent's
    address and fqdn are POSTed to ``url + 'reregister/'``.  The service is
    started again only after a successful POST.

    :param url: manager url; 'reregister/' is appended to it as-is
    :param address: address of this agent, sent to the manager
    :return: the value returned by ``AgentClient.post``
    :raises HttpError: re-raised (after logging) when the POST fails
    """
    if _service_is_running() is True:
        console_log.warning(
            "chroma-agent service was running before registration, stopping.")
        agent_service.stop()

    config.set('settings', 'server', {'url': url})

    credentials = Crypto(config.path)
    client = AgentClient(url + 'reregister/',
                         ActionPluginManager(),
                         DevicePluginManager(),
                         ServerProperties(),
                         credentials)
    payload = {'address': address, 'fqdn': client._fqdn}

    try:
        response = client.post(payload)
    except HttpError:
        console_log.error("Reregistration failed to %s with request %s" %
                          (client.url, payload))
        raise

    console_log.info("Starting chroma-agent service")
    agent_service.start()
    return response
def stop_target(ha_label):
    """
    Stop the high availability target

    Return: Value using simple return protocol
    """
    # HYD-7230: brute force, retry the stop; the loop below makes at most
    # four attempts before giving up.
    max_attempts = 4
    stop_command = [
        'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
        'Stopped'
    ]

    for attempt in range(1, max_attempts + 1):
        # Issue the command to Pacemaker to stop the target
        error = AgentShell.run_canned_error_message(list(stop_command))
        if error:
            return agent_error(error)

        if _wait_target(ha_label, False):
            return agent_result_ok

        if attempt == max_attempts:
            return agent_error("failed to stop target %s" % ha_label)

        console_log.info("failed to stop target %s" % ha_label)
def _reboot():
    """
    Log the request, run ``shutdown -r`` and terminate this process.

    ``at_time`` is not defined in this block; it is read from the enclosing
    scope and passed to ``shutdown`` as the time argument.
    """
    console_log.info("Initiating server reboot per manager request")

    # reboot(8) just calls shutdown anyhow.
    reboot_command = ["shutdown", "-r", at_time]
    AgentShell.try_run(reboot_command)

    console_log.info("Terminating")
    os._exit(0)
def set_address(self, ipv4_address, prefix):
    """
    Bring the interface up with the given IPv4 address and persist it.

    Nothing is changed when ``self.ipv4_address`` already equals the
    requested address.  Otherwise the device is unmanaged via ``node_admin``,
    the link is brought up, the address is added with ``/sbin/ip`` and the
    interface is refreshed once a second (up to 30 times) until the address
    is reported back.  On success the ifcfg is written through ``node_admin``.

    :param ipv4_address: address to assign to the interface
    :param prefix: prefix length, used to build "<address>/<prefix>"
    :raises RuntimeError: if the interface does not report the requested
        address after the wait loop
    """
    ifaddr = "%s/%s" % (ipv4_address, prefix)

    console_log.info("Set %s (%s) up" % (self.name, ifaddr))

    if self.ipv4_address == ipv4_address:
        console_log.info("Nothing to do as %s already has address %s" %
                         (self.name, ifaddr))
        return

    node_admin.unmanage_network(self.device, self.mac_address)

    AgentShell.try_run(['/sbin/ip', 'link', 'set', 'dev', self.name, 'up'])
    AgentShell.try_run(['/sbin/ip', 'addr', 'add', ifaddr, 'dev', self.name])

    # The link address change is asynchronous, so we need to wait for the
    # address to stick or we have a race condition.
    timeout = 30
    while self.ipv4_address != ipv4_address and timeout != 0:
        self.refresh()
        time.sleep(1)
        timeout -= 1

    if self.ipv4_address != ipv4_address:
        # Report the address we tried to set, not the one the interface
        # still carries.
        raise RuntimeError('Unable to set the address %s for interface %s' %
                           (ipv4_address, self.name))

    node_admin.write_ifcfg(self.device, self.mac_address, self.ipv4_address,
                           self.ipv4_netmask)
def _remove_module(name, modules):
    """
    Remove module ``name`` with ``rmmod`` after recursively removing its
    dependents.

    :param name: name of the module to remove
    :param modules: mapping of module name to an object with a ``dependents``
        collection; it is updated in place as modules are removed
    :return: None on success (or if ``name`` is not in ``modules``),
        otherwise the error returned by the failing ``rmmod`` call
    """
    try:
        target = modules[name]
    except KeyError:
        # It's not loaded, do nothing.
        return None

    console_log.info("Removing %d dependents of %s : %s" %
                     (len(target.dependents), name, target.dependents))

    # Dependents have to go first; stop at the first one that fails.
    while target.dependents:
        failure = _remove_module(target.dependents.pop(), modules)
        if failure:
            return failure

    console_log.info("Removing %s" % name)

    failure = AgentShell.run_canned_error_message(['rmmod', name])
    if failure:
        return failure

    modules.pop(name)

    # Nothing that is still loaded can depend on the removed module.
    for remaining in modules.values():
        if name in remaining.dependents:
            remaining.dependents.remove(name)

    return None
def private_key_file(self):
    """Return a path to a PEM file

    The key is generated with ``openssl genrsa`` the first time, i.e. when
    ``self.PRIVATE_KEY_FILE`` does not exist yet.
    """
    key_path = self.PRIVATE_KEY_FILE

    if os.path.exists(key_path):
        return self.PRIVATE_KEY_FILE

    console_log.info("Generating private key")
    # NOTE(review): '-sha256' follows the key size here; confirm the
    # deployed openssl accepts an option in that position.
    AgentShell.try_run(
        ['openssl', 'genrsa', '-out', self.PRIVATE_KEY_FILE, '2048',
         '-sha256'])

    return self.PRIVATE_KEY_FILE
def disable_and_kill():
    """
    Log termination, then disable and stop the "iml-storage-server.target"
    unit through ``ServiceControl``.
    """
    console_log.info("Terminating")

    target = ServiceControl.create("iml-storage-server.target")
    target.disable()
    target.stop()
def generate_ring1_network(ring0):
    """
    Pick an address for the ring1 network based on ring0.

    The host part of ring0's address (address AND hostmask) is combined with
    the network address of the subnet returned by ``find_subnet``.

    :param ring0: interface object exposing ipv4_network, ipv4_prefixlen,
        ipv4_hostmask and ipv4_address
    :return: tuple of (address, prefix length), both as strings
    """
    # find a good place for the ring1 network
    subnet = find_subnet(ring0.ipv4_network, ring0.ipv4_prefixlen)

    host_mask = int(IPAddress(ring0.ipv4_hostmask))
    ring0_address = int(IPAddress(ring0.ipv4_address))
    host_bits = host_mask & ring0_address

    address = str(IPAddress(host_bits | int(subnet.ip)))

    console_log.info("Chose %s/%d for ring1 address" %
                     (address, subnet.prefixlen))

    return address, str(subnet.prefixlen)
def stonith(node):
    """
    Reboot ``node`` by calling ``fence_reboot`` on its Pacemaker node object.

    :param node: name handed to ``PacemakerConfig.get_node``
    """
    pacemaker = PacemakerConfig()

    # TODO: signal that manager that a STONITH has been done so that it
    #       doesn't treat it as an AWOL
    console_log.info("Rebooting %s per a STONITH request" % node)

    target_node = pacemaker.get_node(node)
    target_node.fence_reboot()
def start_lnet():
    """
    Place lnet into the 'up' state.

    Runs ``lnetctl lnet configure --all`` and returns whatever
    ``AgentShell.run_canned_error_message`` returns for it.
    """
    console_log.info("Starting LNet")

    configure_command = ["lnetctl", "lnet", "configure", "--all"]
    return AgentShell.run_canned_error_message(configure_command)
def terminate_block_device_drivers():
    """
    Call ``terminate_driver`` on every subclass of BlockDevice.

    Stops at the first subclass that reports an error and returns it wrapped
    with ``agent_error``; returns ``agent_result_ok`` otherwise.
    """
    console_log.info("Terminating drivers for block device types")

    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.terminate_driver()
        if failure:
            return agent_error(failure)

    return agent_result_ok
def stop_lnet():
    """
    Place lnet into the 'down' state, any modules that are dependent on lnet
    being in the 'up' state will be unloaded before lnet is stopped.

    ``lctl net down`` is only run when removing the dependent modules
    reported no error.
    """
    console_log.info("Stopping LNet")

    error = _rmmod_deps("lnet", excpt=["ksocklnd", "ko2iblnd"])
    if not error:
        error = AgentShell.run_canned_error_message(["lctl", "net", "down"])

    return agent_ok_or_error(error)
def initialise_block_device_drivers():
    """
    Call ``initialise_driver(config.profile_managed)`` on every subclass of
    BlockDevice.

    Stops at the first subclass that reports an error and returns it wrapped
    with ``agent_error``; returns ``agent_result_ok`` otherwise.
    """
    console_log.info("Initialising drivers for block device types")

    for device_class in util.all_subclasses(BlockDevice):
        failure = device_class.initialise_driver(config.profile_managed)
        if failure:
            return agent_error(failure)

    return agent_result_ok
def private_key_file(self):
    """Return a path to a PEM file

    The key is generated with ``openssl genrsa`` the first time, i.e. when
    ``self.PRIVATE_KEY_FILE`` does not exist yet.
    """
    key_missing = not os.path.exists(self.PRIVATE_KEY_FILE)

    if key_missing:
        console_log.info("Generating private key")
        # NOTE(review): "-sha256" follows the key size here; confirm the
        # deployed openssl accepts an option in that position.
        genrsa_command = [
            "openssl",
            "genrsa",
            "-out",
            self.PRIVATE_KEY_FILE,
            "2048",
            "-sha256",
        ]
        AgentShell.try_run(genrsa_command)

    return self.PRIVATE_KEY_FILE
def start_lnet():
    """
    Place lnet into the 'up' state.

    ``modprobe lustre`` is only run when ``lctl net up`` reported no error.
    """
    console_log.info("Starting LNet")

    error = AgentShell.run_canned_error_message(["lctl", "net", "up"])
    if not error:
        # modprobe lustre is a hack for HYD-1263 - Fix or work around
        # LU-1279 - failure trying to mount
        # should be removed when LU-1279 is fixed
        error = AgentShell.run_canned_error_message(["modprobe", "lustre"])

    return agent_ok_or_error(error)
def get_cluster_node_name():
    """
    Return the output of ``crm_node -n`` with surrounding whitespace removed.

    If that fails for any reason the failure is logged and
    ``socket.getfqdn()`` is returned instead.
    """
    try:
        node_name = AgentShell.try_run(["crm_node", "-n"]).strip()
    except Exception as e:
        # Best effort: any failure falls back to the fqdn.
        console_log.info(
            "Could not get cluster node name {}. Falling back to socket.getfqdn()".format(e)
        )
        return socket.getfqdn()

    return node_name
def stop_lnet():
    """
    Place lnet into the 'down' state, any modules that are dependent on lnet
    being in the 'up' state will be unloaded before lnet is stopped.

    ``lnetctl lnet unconfigure`` is only run when ``lustre_rmmod ptlrpc``
    reported no error.
    """
    console_log.info("Stopping LNet")

    error = AgentShell.run_canned_error_message(["lustre_rmmod", "ptlrpc"])
    if not error:
        error = AgentShell.run_canned_error_message(
            ["lnetctl", "lnet", "unconfigure"])

    return agent_ok_or_error(error)
def find_unused_port(ring0, timeout=10, batch_count=10000):
    """
    Pick a port that was not seen in multicast traffic on ring0.

    A temporary firewall rule is added, UDP packets addressed to ring0's
    multicast address within the candidate port range are captured for
    ``timeout`` seconds, and a random port is chosen from the candidates
    (every second port starting at 32767) that did not appear as a
    destination port.  The firewall rule is always removed again.

    :param ring0: interface object exposing ``mcastaddr`` and ``name``
    :param timeout: capture duration in seconds
    :param batch_count: packet count passed to each ``cap.dispatch`` call
    :return: the chosen port number
    :raises RuntimeError: if reading from the capture fails
    :raises IndexError: (from ``random.choice``) if no candidate is left
    """
    from random import choice

    dest_addr = ring0.mcastaddr
    port_min = 32767
    port_max = 65535
    portrange_str = "%s-%s" % (port_min, port_max)

    # Destination ports observed during the capture.  A set makes recording
    # a port O(1) and does not depend on range() returning a mutable list.
    seen_ports = set()

    firewall_control.add_rule(
        0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
    )

    try:
        networking.subscribe_multicast(ring0)
        console_log.info(
            "Sniffing for packets to %s on %s within port range %s"
            % (dest_addr, ring0.name, portrange_str)
        )
        cap = networking.start_cap(
            ring0,
            timeout,
            "host %s and udp and portrange %s" % (dest_addr, portrange_str),
        )

        def recv_packets(header, data):
            seen_ports.add(networking.get_dport_from_packet(data))

        packet_count = 0
        start = time.time()
        while time.time() - start < timeout:
            try:
                packet_count += cap.dispatch(batch_count, recv_packets)
            except Exception as e:
                raise RuntimeError("Error reading from the network: %s" % str(e))

        console_log.info(
            "Finished after %d seconds, sniffed: %d"
            % (time.time() - start, packet_count)
        )
    finally:
        firewall_control.remove_rule(
            0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
        )

    candidates = [
        port for port in range(port_min, port_max, 2) if port not in seen_ports
    ]
    return choice(candidates)
def start_target(ha_label):
    """
    Start the high availability target

    Return: Value using simple return protocol
    """
    # HYD-1989: brute force, retry the start; the loop below makes at most
    # four attempts before giving up.
    max_attempts = 4

    for attempt in range(1, max_attempts + 1):
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Started'
        ])
        if error:
            return agent_error(error)

        # now wait for it to start
        _wait_target(ha_label, True)

        # and make sure it didn't start but (the RA) fail(ed): the start
        # counts as good only if crm_mon shows a line for this label that
        # does not contain FAILED
        _rc, mon_output, _stderr = AgentShell.run_old(['crm_mon', '-1'])
        started_cleanly = any(
            line.lstrip().startswith(ha_label) and line.find("FAILED") < 0
            for line in mon_output.split("\n"))

        if started_cleanly:
            location = get_resource_location(ha_label)
            if not location:
                return agent_error("Started %s but now can't locate it!" %
                                   ha_label)
            return agent_result(location)

        # try to leave things in a sane state for a failed mount
        error = AgentShell.run_canned_error_message([
            'crm_resource', '-r', ha_label, '-p', 'target-role', '-m', '-v',
            'Stopped'
        ])
        if error:
            return agent_error(error)

        if attempt == max_attempts:
            return agent_error("Failed to start target %s" % ha_label)

        console_log.info("failed to start target %s" % ha_label)
def terminate_block_device_drivers():
    """
    When the agent is stopped we want to allow block devices to do any
    termination that they might need, this function may also be called by
    the manager.

    Returns ``agent_error`` for the first BlockDevice subclass whose
    ``terminate_driver`` reports an error, ``agent_result_ok`` otherwise.
    """
    console_log.info("Terminating drivers for block device types")

    for block_device_type in util.all_subclasses(BlockDevice):
        problem = block_device_type.terminate_driver()

        if problem:
            return agent_error(problem)

    return agent_result_ok
def initialise_block_device_drivers():
    """
    When the agent is run we want to allow block devices to do any
    initialization that they might need, this function may also be called by
    the manager.

    Returns ``agent_error`` for the first BlockDevice subclass whose
    ``initialise_driver`` reports an error, ``agent_result_ok`` otherwise.
    """
    console_log.info("Initialising drivers for block device types")

    for block_device_type in util.all_subclasses(BlockDevice):
        problem = block_device_type.initialise_driver(config.profile_managed)

        if problem:
            return agent_error(problem)

    return agent_result_ok
def start_target(ha_label):
    """
    Start the high availability target

    A target that is already running is moved to its primary node when it is
    running elsewhere.  Otherwise the resource (plus its zfs resource and
    group, when a zfs resource exists) is set started and the function waits
    for it to come up.

    Return: Value using simple return protocol
    """
    if not _resource_exists(ha_label):
        return agent_error("Target {} does not exist".format(ha_label))

    location = get_resource_location(ha_label)
    primary = _find_resource_constraint(ha_label, True)

    # if resource already started but not on primary, move it
    if location:
        if location == primary:
            return agent_result(location)

        console_log.info(
            "Resource %s already started, moving to primary node %s",
            ha_label,
            primary,
        )
        error = _move_target(ha_label, primary)
        if error:
            return agent_error(error)
        return agent_result(primary)

    try:
        _res_set_started(ha_label, True)

        if _resource_exists(_zfs_name(ha_label)):
            _res_set_started(_zfs_name(ha_label), True)
            # enable group also, in case group was disabled
            _res_set_started(_group_name(ha_label), True)

        # now wait for it to start
        started = _wait_target(ha_label, True)
        if not started:
            # try to leave things in a sane state for a failed mount
            _res_set_started(ha_label, False)
            return agent_error("Failed to start target {}".format(ha_label))

        location = get_resource_location(ha_label)
        if location:
            return agent_result(location)

        return agent_error(
            "Started {} but now can't locate it!".format(ha_label))
    except AgentShell.CommandExecutionError as err:
        return agent_error(
            "Error (%s) running '%s': '%s' '%s'" %
            (err.result.rc, err.command, err.result.stdout,
             err.result.stderr))
def find_unused_port(ring0, timeout=10, batch_count=10000):
    """
    Pick a port that was not seen in multicast traffic on ring0.

    A temporary firewall rule is added, UDP packets addressed to ring0's
    multicast address within the candidate port range are sniffed for
    ``timeout`` seconds, and a random port is chosen from the candidates
    (every second port starting at 32767) that did not appear as a
    destination port.  The firewall rule is always removed again.

    :param ring0: interface object exposing ``mcastaddr`` and ``name``
    :param timeout: sniffing duration in seconds
    :param batch_count: unused by this implementation; kept so existing
        callers keep working
    :return: the chosen port number
    :raises IndexError: (from ``random.choice``) if no candidate is left
    """
    from random import choice

    dest_addr = ring0.mcastaddr
    port_min = 32767
    port_max = 65535
    portrange_str = "%s-%s" % (port_min, port_max)

    firewall_control.add_rule(
        0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
    )

    try:
        console_log.info(
            "Sniffing packets on {}({}) within range: {}".format(
                ring0.name, dest_addr, portrange_str
            )
        )
        packets = sniff(
            iface=ring0.name,
            lfilter=lambda x: x.haslayer(UDP)
            and isinstance(x[UDP].dport, (int, long))
            and x[UDP].dport >= port_min
            and x[UDP].dport <= port_max
            and x[IP].dst == dest_addr,
            timeout=timeout,
        )

        console_log.info(
            "Finished after %d seconds, sniffed: %d" % (timeout, len(packets))
        )

        # sniff() hands back the captured packets, not port numbers, so the
        # destination port has to be read from each packet's UDP layer (the
        # lfilter above guarantees that layer is present).
        seen_ports = set(pkt[UDP].dport for pkt in packets)
    finally:
        firewall_control.remove_rule(
            0, "tcp", "find unused port", persist=False, address=ring0.mcastaddr
        )

    candidates = [
        port for port in range(port_min, port_max, 2) if port not in seen_ports
    ]
    return choice(candidates)
def get_resource_locations():
    """Parse `crm_mon -1` to identify where (if anywhere)
    resources (i.e. targets) are running

    Returns the result of ``_get_resource_locations`` applied to the crm_mon
    output (presumably a mapping of resource_id to location or None), or an
    empty dict when crm_mon is not installed or exits non-zero.
    """
    try:
        result = AgentShell.run(["crm_mon", "-1", "-r", "-X"])
    except OSError as err:
        # ENOENT is fine here.  Pacemaker might not be installed yet.
        if err.errno == errno.ENOENT:
            return {}
        raise err

    if result.rc == 0:
        return _get_resource_locations(result.stdout)

    console_log.info("crm_mon failed (%d): '%s' '%s'", result.rc,
                     result.stdout, result.stderr)
    return {}
def get_ring0():
    """
    Build the CorosyncRingInterface for ring0.

    The manager host is taken from the IML_MANAGER_URL environment variable,
    resolved to an address, and the device that ``ip route get`` reports for
    that address is used.

    :return: CorosyncRingInterface for the chosen device
    :raises RuntimeError: if no device can be found in the route output, or
        if the interface's prefix length is below 9
    """
    # ring0 will always be on the interface used for agent->manager comms
    from urlparse import urlparse

    server_url = urljoin(os.environ["IML_MANAGER_URL"], "agent")
    manager_host = urlparse(server_url).hostname
    manager_address = socket.gethostbyname(manager_host)

    route_output = AgentShell.try_run(
        ['/sbin/ip', 'route', 'get', manager_address])

    found = re.search(r'dev\s+([^\s]+)', route_output)
    if not found:
        raise RuntimeError("Unable to find ring0 dev in %s" % route_output)
    manager_dev = found.group(1)

    console_log.info("Chose %s for corosync ring0" % manager_dev)
    ring0 = CorosyncRingInterface(manager_dev)

    if ring0.ipv4_prefixlen < 9:
        raise RuntimeError("%s subnet is too large (/%s)" %
                           (ring0.name, ring0.ipv4_prefixlen))

    return ring0
def clear_targets(force=False):
    """
    Stop and unconfigure every target returned by ``_query_ha_targets``.

    Without ``force`` nothing is touched: a warning is logged and the process
    exits with status 1.

    :param force: must be True for any target to be stopped or unconfigured
    """
    if not force:
        from os import _exit
        import textwrap

        warning = """
        clear-targets will forcibly unmount and unconfigure all Lustre targets
        on EVERY node in this HA domain. This is an irreversible and
        potentially very destructive operation. Data loss may occur. Please
        do not use it unless you fully understand the consequences! If you
        are sure that this command does what you intend to do, then you must
        supply the --force flag to avoid seeing this message.
        """

        # Logger.warn is a deprecated alias; use warning() like the rest of
        # the agent code does.
        console_log.warning(textwrap.fill(textwrap.dedent(warning)))
        _exit(1)

    for resource, attrs in _query_ha_targets().items():
        console_log.info("Stopping %s" % resource)
        # The result of stop_target is not inspected; unconfigure is
        # attempted regardless.
        stop_target(attrs['ha_label'])

        console_log.info("Unconfiguring %s" % resource)
        unconfigure_target_ha(True, attrs['ha_label'], attrs['uuid'])