def perform_outlet_action(self, outlet, action):
    """
    Drive the PDU's telnet menu interface to perform an action on an outlet.

    :param outlet: Outlet identifier as understood by the PDU's menu.
    :param action: One of 'on', 'off' or 'reboot'.
    :raises KeyError: if ``action`` is not a recognised action name.
    """
    log.debug(
        "%s: Running %s -> %s on %s:%s" %
        (self.__class__.__name__, outlet, action, self.address, self.port))
    actions = {"on": "1", "off": "2", "reboot": "3"}

    # Quick and dirty: replay the menu-navigation exchange verbatim,
    # draining the prompt before each response we send.
    responses = [
        # Telnet negotiation bytes (IAC/253/5) followed by the username
        # ("foo" matches the simulator's login prompt) — see login().
        "%s%s%sfoo\n\n" % (chr(255), chr(253), chr(5)),
        "bar\n\n",                    # password
        "1\n\n",
        "2\n\n",
        "1\n\n",
        "%s\n\n" % outlet,            # select the outlet
        "1\n\n",
        "%s\n\n" % actions[action],   # select the action
        "YES\n\n",                    # confirm
        "\n\n",
    ]

    sock = socket.create_connection((self.address, self.port))
    try:
        for response in responses:
            sock.recv(4096)
            sock.send(response)
        sock.recv(4096)
        sock.send("".join(CTRL_C))
    finally:
        # Always release the connection, even if the exchange fails midway
        # (the original leaked the socket on any recv/send error).
        sock.close()
def reboot_server(self, fqdn):
    """Reboot a simulated server: shut down running servers with the
    reboot flag, or start them up if they are currently down."""
    log.debug("reboot %s" % fqdn)
    target = self.servers[fqdn]
    if target.running:
        target.shutdown(reboot=True)
    else:
        target.startup()
def start(self, ha_label):
    """Place the named resource on its primary node, persist the cluster
    state, and return the resource record."""
    with self._lock:
        res = self.state['resources'][ha_label]
        primary = res['primary_node']
        res['started_on'] = primary
        log.debug("Starting resource %s on %s" % (ha_label, primary))
        self.save()
        return res
def mount(self, nids, filesystem_name):
    """
    Record a client mount of ``filesystem_name`` served by ``nids``.

    Idempotent: re-mounting an already-recorded (nids, filesystem) pair
    is a no-op and does not re-save state.
    """
    log.debug("FakeClient.mount %s %s" % (nids, filesystem_name))
    # Look up NIDs to an MGT
    # Check the MGT and targets are really started
    # Add an entry to self.state['mounts']
    mount_key = (nids, filesystem_name)
    # Idiom fix: `x not in y` rather than `not (x) in y`.
    if mount_key not in self.state['mounts']:
        self.state['mounts'].append(mount_key)
        self.save()
def del_client_mount(self, mountspec):
    """Forget a recorded client mount; unknown mountspecs are ignored."""
    with self._lock:
        mounts = self.state['client_mounts']
        # Only persist when something actually changed (membership test
        # under the lock is equivalent to the try/del/except-KeyError form).
        if mountspec in mounts:
            del mounts[mountspec]
            self.save()
        log.debug("Unmounted %s" % mountspec)
def add_client_mount(self, mountspec, mountpoint):
    """Record a client mount under <mountpoint>/<fsname>, bringing LNet
    up first.  The fsname is taken from the mountspec ('nid:/fsname')."""
    fsname = mountspec.split(':/')[1]
    full_mountpoint = "%s/%s" % (mountpoint, fsname)
    self.start_lnet()
    with self._lock:
        self.state['client_mounts'][mountspec] = full_mountpoint
        self.save()
        log.debug("Mounted %s on %s" % (mountspec, full_mountpoint))
def server_poweron_hook(self, outlet):
    """
    When an outlet has been turned on, attempts to start the associated
    server.  If the server has already been started, this is a no-op.
    """
    server_fqdn = self.state['outlet_servers'][outlet]
    log.debug("starting %s in poweron_hook" % server_fqdn)
    self.start_server_fn(server_fqdn)
def start_server(self, fqdn, simulate_bootup=False):
    """
    Boot a simulated server.

    :param simulate_bootup: Whether to simulate a bootup, delays and all
    :raises RuntimeError: if the server is already running and this is
        not a simulated bootup.
    """
    log.debug("start %s" % fqdn)
    target = self.servers[fqdn]
    already_up = target.running and not simulate_bootup
    if already_up:
        raise RuntimeError("Can't start %s, it is already running" % fqdn)
    target.startup(simulate_bootup)
def inject_log_message(self, message):
    """Append a synthetic syslog-style entry to this server's queued
    log messages, timestamped with the current UTC time."""
    log.debug("Injecting log message %s/%s" % (self.fqdn, message))
    entry = {
        'source': 'cluster_sim',
        'severity': 1,
        'facility': 1,
        'message': message,
        'datetime': IMLDateTime.utcnow().isoformat(),
    }
    self._log_messages.append(entry)
def server_poweroff_hook(self, outlet):
    """
    When an outlet has been turned off, checks to see if the associated
    server has lost all powered outlets.  If so, the server is stopped.
    """
    server_fqdn = self.state['outlet_servers'][outlet]
    if self.server_has_power(server_fqdn):
        return
    log.debug("stopping %s in poweroff_hook" % server_fqdn)
    self.stop_server_fn(server_fqdn)
def login(self):
    """
    Emulate the PDU's telnet login dialogue: prompt for and read a
    username and a password, then hand the session over to the
    management-console loop.

    NOTE(review): the password-prompt section of this method arrived
    garbled (redacted) in the source; it has been reconstructed to
    mirror the username-prompt handling exactly — confirm against the
    original PDU dialogue.
    """
    self.fd.write("\n\rUser Name : ")
    self.fd.flush()
    # We don't really need to do this, but it's nice to get rid of the
    # telnet control characters for logging.
    username = self._strip_controls(self.fd.readline().rstrip())
    self.fd.readline()
    log.debug("Received username: %s" % username)

    self.fd.write("\rPassword : ")
    self.fd.flush()
    password = self._strip_controls(self.fd.readline().rstrip())
    self.fd.readline()
    log.debug("Received password: %s" % password)

    self.control_console()
def stop_server(self, fqdn, shutdown=False, simulate_shutdown=False):
    """
    Stop a simulated server, or just its agent.

    :param shutdown: Whether to treat this like a server shutdown (leave
                     the HA cluster) rather than just an agent shutdown.
    :param simulate_shutdown: Whether to simulate a shutdown, delays and all
    """
    log.debug("stop %s" % fqdn)
    target = self.servers[fqdn]
    # Nothing to do for a server that is already down.
    if not target.running:
        log.debug("not running")
        return
    if shutdown:
        target.shutdown(simulate_shutdown)
    else:
        target.shutdown_agent()
def start_all(self):
    """Start every simulated server, spacing the starts evenly across
    one poll period so their updates don't all land at once."""
    self.pre_server_start()
    # Spread out starts to avoid everyone doing sending their update
    # at the same moment
    server_count = len(self.servers)
    if not server_count:
        log.info("start_all: No servers yet")
    else:
        delay = Session.POLL_PERIOD / float(server_count)
        log.debug("Start all (%.2f dispersion)" % delay)
        for index, fqdn in enumerate(self.servers.keys()):
            self.start_server(fqdn)
            # No need to sleep after the final start.
            if index != server_count - 1:
                time.sleep(delay)
    self.post_server_start()
def join(self, nodename, **kwargs):
    """Mark a node online (creating its record if unknown), merge any
    extra attributes, and start idle resources that name this node as
    their primary or secondary location."""
    with self._lock:
        nodes = self.state['nodes']
        if nodename in nodes:
            nodes[nodename]['online'] = True
        else:
            nodes[nodename] = {'online': True, 'nodename': nodename}
        nodes[nodename].update(**kwargs)

        for ha_label, resource in self.state['resources'].items():
            # Only adopt resources that are not running anywhere.
            if resource['started_on'] is not None:
                continue
            if resource['primary_node'] == nodename:
                log.debug("Starting %s on primary %s" % (ha_label, nodename))
                resource['started_on'] = nodename
            elif resource['secondary_node'] == nodename:
                log.debug("Starting %s on secondary %s" % (ha_label, nodename))
                resource['started_on'] = nodename
        self.save()
def register(self, fqdn, secret):
    """
    Register the simulated server `fqdn` with the manager using `secret`,
    install the certificate returned by registration, and start the agent.

    Returns the registration result dict on success, or None if the
    registration request failed (connection/HTTP error) or any other
    exception occurred (logged with traceback).
    """
    try:
        log.debug("register %s" % fqdn)
        server = self.servers[fqdn]
        if server.agent_is_running:
            # e.g. if the server was added then force-removed then re-added
            server.shutdown_agent()
        # Workers are exempt from the power check; a powered-off server
        # cannot register (caught by the outer except and logged).
        if not server.is_worker and not self.power.server_has_power(fqdn):
            raise RuntimeError(
                "Not registering %s, none of its PSUs are powered" % fqdn)

        client = AgentClient(url=self.url + "register/%s/" % secret,
                             action_plugins=FakeActionPlugins(
                                 self, server),
                             device_plugins=FakeDevicePlugins(server),
                             server_properties=server,
                             crypto=server.crypto)

        try:
            registration_result = client.register()
        except ConnectionError as e:
            # Manager unreachable: report and give up on this server.
            log.error("Registration connection failed for %s: %s" % (fqdn, e))
            return
        except HttpError as e:
            # Manager rejected the request: report and give up.
            log.error("Registration request failed for %s: %s" % (fqdn, e))
            return
        server.crypto.install_certificate(
            registration_result['certificate'])

        # Immediately start the agent after registration, to pick up the
        # setup actions that will be waiting for us on the manager.
        self.start_server(fqdn)
        return registration_result
    except Exception:
        # Broad catch by design: register() runs inside worker threads
        # (see register_many), so log the traceback rather than letting
        # the thread die silently.
        log.error(traceback.format_exc())
def register_many(self, fqdns, secret):
    """Register several servers concurrently (one thread per FQDN) and
    return their registration results in input order."""
    simulator = self

    class _RegistrationWorker(threading.Thread):
        def __init__(self, fqdn, secret):
            super(_RegistrationWorker, self).__init__()
            self.fqdn = fqdn
            self.secret = secret

        def run(self):
            # register() swallows its own exceptions, so this is safe.
            self.result = simulator.register(self.fqdn, self.secret)

    log.debug("register_many: spawning threads")
    workers = []
    for fqdn in fqdns:
        worker = _RegistrationWorker(fqdn, secret)
        worker.start()
        workers.append(worker)

    for position, worker in enumerate(workers, start=1):
        worker.join()
        log.debug("register_many: joined %s/%s" % (position, len(workers)))

    return [worker.result for worker in workers]
def leave(self, nodename):
    """Mark a node offline, migrating each resource it was running to
    its peer node when one exists, otherwise stopping the resource."""
    with self._lock:
        log.debug("leave: %s" % nodename)
        self.state['nodes'][nodename]['online'] = False

        for ha_label, resource in self.state['resources'].items():
            if resource['started_on'] != nodename:
                continue
            # Candidate destinations: the resource's nodes minus the leaver.
            peers = {resource['primary_node'],
                     resource['secondary_node']} - {nodename}
            if peers:
                destination = peers.pop()
                log.debug("migrating %s to %s" % (ha_label, destination))
                resource['started_on'] = destination
            else:
                log.debug("stopping %s" % (ha_label))
                resource['started_on'] = None
        self.save()
def start_sim_server(self, pdu_name):
    """Create, record and start the simulator server for the named PDU.
    A server must not already exist for this PDU."""
    log.debug("starting server for %s" % pdu_name)
    assert pdu_name not in self.sim_servers
    sim_server = PDUSimulatorServer(self.pdu_sims[pdu_name])
    self.sim_servers[pdu_name] = sim_server
    sim_server.start()
def detect_scan(self, target_devices):
    """
    Simulate the agent's target-detection scan over `target_devices`.

    For each device path, resolve it to a known target, look up its HA
    resource, and report whether the resource is currently located on
    this node.  If the MGS is among the local targets, also report the
    filesystem->target mapping registered on that MGS.

    Returns a dict with 'local_targets', 'mgs_targets' and (empty)
    'mgs_conf_params' keys, mirroring the real agent's detect output.
    """
    local_targets = []
    mgs_target = None
    # Debug dumps of the simulated device and cluster state.
    for serial, target in self._devices.state['targets'].items():
        log.info("targets: %s: %s %s" % (serial, target['label'], target['uuid']))
    for ha_label, resource in self._cluster.state['resources'].items():
        log.info("cluster: %s %s %s" % (ha_label, resource['uuid'], resource['device_path']))
    for target_device in target_devices:
        path = target_device['path']
        try:
            target = self._devices.get_target_by_path(self.fqdn, path)
        except KeyError:
            # Key error means this is not a target
            continue
        try:
            ha_resource = self._cluster.get_by_uuid(target['uuid'])
        except KeyError:
            # Target exists but has no HA resource: report and skip it.
            log.warning("No HA resource for target %s/%s" % (target['label'], target['uuid']))
            continue
        location = self._cluster.resource_locations()[
            ha_resource['ha_label']]
        # A target counts as mounted if HA places it on this node.
        mounted = location == self.nodename
        local_targets.append({
            "name": target['label'],
            "uuid": target['uuid'],
            "params": {},
            "device_paths": [path],
            "mounted": mounted
        })
        if target['label'] == 'MGS':
            mgs_target = target

    # fsname -> list of target descriptors registered on the local MGS.
    mgs_targets = defaultdict(lambda: [])
    if mgs_target is not None:
        for target_label in self._devices.mgt_get_target_labels(
                mgs_target['mgsnode']):
            target = self._devices.get_target_by_label(target_label)
            mgs_targets[target['fsname']].append({
                'uuid': target['uuid'],
                'name': target['label'],
                'nid': target['primary_nid']
            })

    result = {
        "local_targets": local_targets,
        "mgs_targets": mgs_targets,
        "mgs_conf_params": {}
    }
    log.debug("detect_scan: %s" % json.dumps(result, indent=2))
    return result
def create_pdu_entries(self, simulator, args):
    """
    Create PDU records on the manager via its REST API and associate
    each PDU outlet with a simulated server.

    Requires args.username/args.password for authentication (exits with
    status -1 otherwise).  Creates one fake power_control_type sized to
    the current server count, one power_control_device per simulated
    PDU, then patches outlet N of every PDU to point at the Nth server
    (sorted by FQDN, copytool workers excluded).
    """
    if not (args.username and args.password):
        sys.stderr.write(
            "Username and password required to create PDU entries\n")
        sys.exit(-1)

    session = self._get_authenticated_session(args.url, args.username,
                                              args.password)

    log.info(
        "Creating PDU entries and associating PDU outlets with servers...")
    outlet_count = len(simulator.servers)
    if outlet_count < 1:
        log.error("Skipping PDU creation (no servers)")
        return

    # Create a custom type to ensure that it has enough outlets.
    # NB: If more servers are added later this won't work correctly,
    # but it should handle most use cases for simulated clusters.
    response = session.post("%s/api/power_control_type/" % args.url,
                            data=json.dumps({
                                'agent': "fence_apc",
                                'make': "Fake",
                                'model': "PDU",
                                'default_username': "******",
                                'default_password': "******",
                                'max_outlets': outlet_count
                            }))
    assert 200 <= response.status_code < 300, response.text
    fence_apc = json.loads(response.text)
    log.debug("Created power_control_type: %s" % fence_apc['name'])

    # One power_control_device per simulated PDU.
    pdu_entries = []
    for pdu_sim in simulator.power.pdu_sims.values():
        response = session.post("%s/api/power_control_device/" % args.url,
                                data=json.dumps({
                                    'device_type': fence_apc['resource_uri'],
                                    'name': pdu_sim.name,
                                    'address': pdu_sim.address,
                                    'port': pdu_sim.port
                                }))
        assert 200 <= response.status_code < 300, response.text
        pdu_entries.append(json.loads(response.text))
        log.debug("Created power_control_device: %s" %
                  pdu_entries[-1]['name'])

    # Fetch all hosts (limit=0 => unpaginated), dropping copytool workers.
    response = session.get("%s/api/host/" % args.url,
                           data=json.dumps({'limit': 0}))
    assert 200 <= response.status_code < 300, response.text
    servers = [
        s for s in json.loads(response.text)['objects']
        if 'posix_copytool_worker' not in s['server_profile']
    ]

    # Associate outlet i+1 of every PDU with the i-th server (by FQDN).
    for i, server in enumerate(
            sorted(servers, key=lambda server: server['fqdn'])):
        for pdu in pdu_entries:
            outlet = [
                o for o in pdu['outlets'] if o['identifier'] == str(i + 1)
            ][0]
            response = session.patch(
                "%s/%s" % (args.url, outlet['resource_uri']),
                data=json.dumps({'host': server['resource_uri']}))
            assert 200 <= response.status_code < 300, response.text
            log.debug("Created association %s <=> %s:%s" %
                      (server['fqdn'], pdu['name'], outlet['identifier']))
def _open_fifo(self):
    """Open the wrapper's event FIFO for writing and keep it on self."""
    path = self.wrapper.event_fifo
    # buffering=1 => line-buffered, so each event line is flushed promptly.
    self.fifo = open(path, "w", 1)
    log.debug("Opened %s for write" % path)
def unmount(self, nids, filesystem_name):
    """Remove a recorded client mount; state is persisted only when the
    mount actually existed."""
    log.debug("FakeClient.unmount %s %s" % (nids, filesystem_name))
    mount_key = (nids, filesystem_name)
    try:
        self.state['mounts'].remove(mount_key)
    except ValueError:
        # Not mounted: nothing to do, don't touch saved state.
        pass
    else:
        self.save()
def run(self, cmd, agent_daemon_context, kwargs):
    """
    Dispatch a simulated agent action named `cmd` with arguments `kwargs`.

    `agent_daemon_context` is accepted for interface compatibility but
    unused here.  Known commands are handled inline (many simply return
    agent_result_ok or canned data); unknown commands fall through to a
    same-named method on the simulated server.  Commands that must run
    after the response is sent (deregister/shutdown/reboot) raise
    CallbackAfterResponse.

    :raises RuntimeError: if `cmd` matches no handler and no server method.
    """
    # This is a little hackish: we don't actually separate the thread_state for
    # each simulated agent (they mostly don't even shell out when simulated) but
    # do this to avoid the subprocess log building up indefinitely.
    AgentShell.thread_state = ResultStore()
    log.debug("FakeActionPlugins: %s %s" % (cmd, kwargs))
    with self._lock:
        if cmd == 'device_plugin':
            device_plugins = FakeDevicePlugins(self._server)
            if kwargs['plugin']:
                # A single named plugin: start one session for it.
                return {
                    kwargs['plugin']: device_plugins.get(
                        kwargs['plugin'])(None).start_session()
                }
            else:
                # No plugin named: start a session for every plugin.
                data = {}
                for plugin, klass in device_plugins.get_plugins().items():
                    data[plugin] = klass(None).start_session()
                return data
        elif cmd in [
                'configure_ntp', 'unconfigure_ntp', 'unconfigure_corosync',
                'unconfigure_corosync2', 'initialise_block_device_drivers'
        ]:
            # Configuration no-ops in the simulator: report success.
            return agent_result_ok
        elif cmd == 'deregister_server':
            sim = self._simulator
            server = self._server

            class StopServer(threading.Thread):
                def run(self):
                    sim.stop_server(server.fqdn)

            def kill():
                server.crypto.delete()
                # Got to go and run stop_server in another thread, because it will try
                # to join all the agent threads (including the one that is running this
                # callback)
                StopServer().start()

            raise CallbackAfterResponse(None, kill)
        elif cmd == 'shutdown_server':
            server = self._server

            def _shutdown():
                server.shutdown(simulate_shutdown=True)

            # Deferred until after the response has been sent.
            raise CallbackAfterResponse(None, _shutdown)
        elif cmd == 'reboot_server':
            server = self._server

            def _reboot():
                server.shutdown(simulate_shutdown=True, reboot=True)

            # Deferred until after the response has been sent.
            raise CallbackAfterResponse(None, _reboot)
        elif cmd == 'failover_target':
            self._server._cluster.failover(kwargs['ha_label'])
            return agent_result_ok
        elif cmd == 'failback_target':
            self._server._cluster.failback(kwargs['ha_label'])
            return agent_result_ok
        elif cmd == 'set_conf_param':
            self._server.set_conf_param(kwargs['key'],
                                        kwargs.get('value', None))
        elif cmd in [
                'configure_pacemaker', 'unconfigure_pacemaker',
                'enable_pacemaker', 'configure_target_store',
                'unconfigure_target_store', 'configure_repo'
        ]:
            # More configuration no-ops (implicitly return None).
            return
        elif cmd == 'kernel_status':
            # Canned kernel data: running == required, nothing to update.
            return {
                'running': 'fake_kernel-0.1',
                'required': 'fake_kernel-0.1',
                'available': ['fake_kernel-0.1']
            }
        elif cmd in ['configure_fencing', 'unconfigure_fencing']:
            # This shouldn't happen if the fence reconfiguration logic
            # is working. Good to simulate a failure here in case of
            # regressions, though.
            if self._server.is_worker:
                raise PacemakerConfigurationError()
            return
        elif cmd == "host_corosync_config":
            return {}
        elif cmd == 'mount_lustre_filesystems':
            for mountspec, mountpoint in kwargs['filesystems']:
                self._server.add_client_mount(mountspec, mountpoint)
        elif cmd == 'unmount_lustre_filesystems':
            for mountspec, _ in kwargs['filesystems']:
                self._server.del_client_mount(mountspec)
        elif cmd == 'configure_copytool':
            self._simulator.configure_hsm_copytool(self._server, **kwargs)
        elif cmd == 'unconfigure_copytool':
            self._simulator.unconfigure_hsm_copytool(kwargs['id'])
        elif cmd == 'start_monitored_copytool':
            self._simulator.start_monitored_copytool(
                self._server, kwargs['id'])
        elif cmd == 'stop_monitored_copytool':
            self._simulator.stop_monitored_copytool(kwargs['id'])
        else:
            # Fall back to a same-named method on the simulated server.
            try:
                fn = getattr(self._server, cmd)
            except AttributeError:
                raise RuntimeError("Unknown command %s" % cmd)
            else:
                return fn(**kwargs)
def handle(self):
    """Request-handler entry point: hand the accepted connection over to
    the PDU simulator attached to the server."""
    log.debug("Handling PDU request from %s:%s" % self.client_address)
    simulator = self.server.pdu_simulator
    simulator.handle_client(self.request, self.client_address)