def run(self):
    """Serve the simulator over XML-RPC until the server is shut down."""
    endpoint = ('localhost', SIMULATOR_PORT)
    rpc_server = SimpleXMLRPCServer(endpoint, allow_none=True)
    rpc_server.register_instance(self.simulator)
    self.server = rpc_server
    log.info("Listening on %s" % SIMULATOR_PORT)
    rpc_server.serve_forever()
def run(self, args):
    """Boot every simulated server from the on-disk config; return the simulator."""
    sim = ClusterSimulator(args.config, args.url)
    server_total = len(sim.servers)
    log.info("Running %s servers in %s/" % (server_total, args.config))
    sim.start_all()
    return sim
def register(self, args):
    """Register all simulated servers with the manager.

    A registration secret may be supplied directly via --secret, or API
    credentials may be given so tokens are acquired on the fly.
    """
    sim = ClusterSimulator(args.config, args.url)
    server_count = len(sim.servers)

    if args.secret:
        # One explicit secret covers both server and worker registration.
        server_secret = worker_secret = args.secret
    elif args.username and args.password:
        server_secret = self._acquire_token(
            args.url, args.username, args.password, server_count,
            preferred_profile='base_managed_rh7')
        worker_secret = self._acquire_token(
            args.url, args.username, args.password, server_count,
            preferred_profile='posix_copytool_worker')
    else:
        sys.stderr.write(
            "Must pass either --secret or --username and --password\n")
        sys.exit(-1)

    log.info("Registering %s servers in %s/" % (server_count, args.config))
    register_count = sim.register_all(server_secret, worker_secret)

    if args.create_pdu_entries and register_count > 0:
        self.create_pdu_entries(sim, args)

    return sim
def post_server_start(self):
    """Kick off the HSM services once at least one server has started."""
    if not self.servers:
        log.info("No servers started; skipping post_server_start()")
        return
    self.start_hsm_coordinators()
    self.start_hsm_copytools()
def handle_client(self, sock, address):
    """Acknowledge an incoming connection with a stub reply, then drop it."""
    host, port = address[0], address[1]
    log.info("%s: received connection from %s:%s" %
             (self.__class__.__name__, host, port))
    stream = sock.makefile()
    stream.write("NOT IMPLEMENTED\r\n")
    stream.flush()
    sock.close()
def format(self, fqdn, path, mkfs_options, target_data):
    """Format a Lustre target.

    Records ``target_data`` against the backing device and returns a fake
    e2fsprogs-style dump describing the resulting filesystem.

    :param fqdn: server on which the target device lives
    :param path: device path on that server
    :param mkfs_options: mkfs option string (may be None/empty); ``-I`` and
        ``-i`` overrides are honoured for inode size and bytes-per-inode
    :param target_data: dict describing the target; must contain 'uuid'
    :return: dict of filesystem attributes (uuid, fs type, inode geometry)
    """
    # (Fix: this docstring previously sat mid-function as a stray no-op
    # string statement; it now occupies the proper docstring position.)

    # Defaults correspond to what an OST would look like.
    e2fs_dump = {
        'uuid': target_data['uuid'],
        'filesystem_type': "ext4",
        'inode_size': 256,
        'bytes_per_inode': 16384
    }

    # Overrides for an MDT: pick up -I (inode size) and -i (bytes per
    # inode) from the mkfs options, if present.
    if mkfs_options:
        for pattern in [
                r'-I\s*(?P<inode_size>\d+)',
                r'-i\s*(?P<bytes_per_inode>\d+)'
        ]:
            match = re.search(pattern, mkfs_options)
            if match:
                for k, v in match.groupdict().items():
                    e2fs_dump[k] = int(v)

    with self._lock:
        serial = self.get_by_path(fqdn, path)['serial_80']
        device = self.get_device(serial)
        # NB: integer division under Python 2 — whole inodes only.
        e2fs_dump[
            'inode_count'] = device['size'] / e2fs_dump['bytes_per_inode']

        log.info("format: %s" % serial)
        self.state['targets'][serial] = target_data
        self.save()

    return e2fs_dump
def stop(self):
    """Stop power simulation and every group of simulated threads."""
    log.info("Stopping")
    self.power.stop()
    for group_name in ('coordinators', 'copytools', 'copytool_monitors',
                       'servers'):
        for worker in getattr(self, group_name).values():
            worker.stop()
def __init__(self, pdu):
    """Wrap a PDU simulator in a TCP server bound to its address/port."""
    log.info("Creating PDU server for %s on %s:%s" %
             (pdu.name, pdu.address, pdu.port))
    bind_address = (pdu.address, pdu.port)
    self.server = PDUSimulatorTcpServer(bind_address,
                                        PDUSimulatorTcpHandler,
                                        pdu_simulator=pdu)
    super(PDUSimulatorServer, self).__init__()
def join(self):
    """Block until power control and every simulated thread have exited."""
    log.info("Joining...")
    self.power.join()
    for group_name in ('coordinators', 'copytools', 'copytool_monitors',
                       'servers'):
        for worker in getattr(self, group_name).values():
            worker.join()
    log.info("Joined")
def handle_client(self, sock, address):
    """Drive a client session; log and bail out cleanly if it disconnects."""
    cls_name = self.__class__.__name__
    host, port = address[0], address[1]
    log.info("%s: received connection from %s:%s" % (cls_name, host, port))
    self.client_address = address
    self.socket = sock
    self.fd = sock.makefile()
    try:
        self.login()
    except socket.error:
        # Peer went away mid-session; nothing to do beyond logging it.
        log.info("%s: client %s:%s disconnected" % (cls_name, host, port))
def stop(self):
    """Stop the copytool if it is running: deregister, then halt the thread."""
    if not self.running:
        return
    log.info("Stopping HSM Copytool: %s" % self.id)
    try:
        self.coordinator.deregister_agent(self)
    except KeyError:
        # Already deregistered — nothing to do.
        pass
    try:
        self._thread.stop()
    except AttributeError:
        # The worker thread was never created.
        pass
def update_packages(self, packages, node_type='server'):
    """Record new package versions and bounce agents so they get reported."""
    log.info("Updating packages: %s" % packages)
    node_packages = self.state['packages'][node_type]
    for name, version in packages.items():
        node_packages[name] = version
    self.save()

    # The agent only reports new versions at the start of sessions.
    # IRL this is valid because when we install updates on the manager
    # we restart the manager servers, causing new sessions. In simulation,
    # we don't control the manager, so instead restart the AgentClient
    # instances.
    for fqdn in self.servers.keys():
        self.stop_server(fqdn)
        self.start_server(fqdn)
def run(self):
    """Copytool main loop: register, then service coordinator requests."""
    self._open_fifo()
    self.register()
    self.started = True
    log.info("Copytool %s started" % self.wrapper.copytool.id)
    while not self._stopping.is_set():
        pending = self.coordinator.get_agent_requests(self.uuid)
        for request in pending:
            self.start_request(request)
        self.progress_active_requests()
        self._stopping.wait(COPYTOOL_LOOP_INTERVAL)
def remove_server(self, fqdn):
    """Permanently remove a simulated server and everything referencing it."""
    log.info("remove_server %s" % fqdn)
    self.stop_server(fqdn, shutdown=True)
    fake_server = self.servers[fqdn]
    assert not fake_server.agent_is_running
    # Tear down external references before dropping the server itself.
    self.devices.remove_presentations(fqdn)
    self.power.remove_server(fqdn)
    fake_server.crypto.delete()
    fake_server.delete()
    del self.servers[fqdn]
def blocking_shutdown():
    """Run the full shutdown sequence, padded to MIN_SHUTDOWN_DURATION."""
    log.info("%s beginning shutdown" % self.fqdn)
    self._enter_shutdown()

    began_at = IMLDateTime.utcnow()
    self.shutdown_agent()
    self._cluster.leave(self.nodename)
    elapsed = (IMLDateTime.utcnow() - began_at).seconds

    # When simulating, pad the shutdown out to a realistic minimum.
    while elapsed < MIN_SHUTDOWN_DURATION:
        if not simulate_shutdown:
            break
        remaining_delay = MIN_SHUTDOWN_DURATION - elapsed
        log.info("%s, sleeping for %d seconds on shutdown" %
                 (self.fqdn, remaining_delay))
        elapsed += remaining_delay
        time.sleep(remaining_delay)

    self._exit_shutdown()
    log.info("%s shutdown complete" % self.fqdn)

    if reboot:
        log.info("%s rebooting" % self.fqdn)
        self.startup(simulate_shutdown)
def start(self, coordinator):
    """Register with the coordinator and start the copytool thread,
    blocking until the thread reports it is genuinely running."""
    log.info("Starting HSM Copytool: %s" % self.id)
    coordinator.register_agent(self)
    self._new_thread()
    self._thread.start()

    # Give the thread time to start up in order to avoid weird races in
    # CI: no transitions from the 'started' state are allowed until the
    # copytool is truly started. Startup typically takes 5-10 seconds;
    # 30 polls of 1s pads that out a bit.
    for _ in range(30):
        if self._thread.started:
            return
        time.sleep(1)
    raise RuntimeError("Timed out waiting for copytool thread to start")
def start_all(self):
    """Start every simulated server, staggering starts across a poll period."""
    self.pre_server_start()
    server_total = len(self.servers)
    if not server_total:
        log.info("start_all: No servers yet")
    else:
        # Spread the starts out so the servers don't all send their
        # first update at the same moment.
        delay = Session.POLL_PERIOD / float(server_total)
        log.debug("Start all (%.2f dispersion)" % delay)
        for i, fqdn in enumerate(self.servers.keys()):
            self.start_server(fqdn)
            if i != server_total - 1:
                time.sleep(delay)
    self.post_server_start()
def setup(self, args):
    """Build the on-disk simulator configuration from the CLI arguments."""
    log.info("Setting up simulator configuration for %s servers in %s/" %
             (args.server_count, args.config))
    worker_count = int(args.worker_count)
    server_count = int(args.server_count)
    # Default to two volumes per server when not given explicitly.
    volume_count = int(args.volume_count) if args.volume_count else server_count * 2

    reset_agent_config()
    set_agent_config('copytool_fifo_directory', '/tmp')

    simulator = ClusterSimulator(args.config, args.url)
    simulator.setup(server_count, worker_count, volume_count,
                    int(args.nid_count), int(args.cluster_size),
                    int(args.psu_count), int(args.su_size))
def blocking_startup():
    """Simulate server boot, waiting out any in-flight shutdown first."""
    log.info("%s beginning startup" % self.fqdn)
    elapsed = 0

    # Mixing shutdown/startup state here is a little weird, but this only
    # happens when a caller explicitly asked for full server simulation:
    # simulating a reboot is easier if we wait here for the shutdown to
    # finish before moving on to startup.
    while simulate_bootup and self.shutting_down:
        log.info("%s, waiting for shutdown (%d)" % (self.fqdn, elapsed))
        elapsed += 1
        time.sleep(1)

    # The losing thread(s) of a multi-PDU power cycle may arrive here
    # while another thread is already booting the server.
    if self.starting_up:
        log.info(
            "%s exiting startup because another thread is already doing it?"
            % self.fqdn)
        return

    self._enter_startup()
    self.boot_time = IMLDateTime.utcnow()

    # When simulating, pad the boot out to a realistic minimum duration.
    while elapsed < MIN_STARTUP_DURATION:
        if not simulate_bootup:
            break
        remaining_delay = MIN_STARTUP_DURATION - elapsed
        log.info("%s, sleeping for %d seconds on boot" %
                 (self.fqdn, remaining_delay))
        elapsed += remaining_delay
        time.sleep(remaining_delay)

    self.start_agent()
    self._cluster.join(self.nodename)
    self._exit_startup()
    log.info("%s startup complete" % self.fqdn)
def _create_server(self, i, nid_count):
    """Create and register FakeServer number ``i`` with ``nid_count`` NIDs."""
    interface_names = ['tcp', 'o2ib']
    nodename = "test%.3d" % i
    fqdn = "%s.localdomain" % nodename
    # NB: Python 2 integer division — the index maps onto a 10.x.y.z host.
    x, y = (i / 256, i % 256)

    # Alternate interface types across the requested networks.
    nid_tuples = [('10.%d.%d.%d' % (network, x, y),
                   interface_names[network % len(interface_names)],
                   network)
                  for network in range(0, nid_count)]

    log.info("_create_server: %s" % fqdn)
    server = FakeServer(self, self._get_cluster_for_server(i), i, fqdn,
                        nodename, nid_tuples)
    self.servers[fqdn] = server
    self.power.add_server(fqdn)
    return server
def _create_worker(self, i, nid_count):
    """Create and register a FakeServer configured as an HSM worker node."""
    nodename = "worker%.3d" % i
    fqdn = "%s.localdomain" % nodename
    # NB: Python 2 integer division — the index maps onto a 10.x.y.z host.
    x, y = (i / 256, i % 256)
    nid_tuples = [('10.%d.%d.%d' % (network, x, y), 'tcp', network)
                  for network in range(0, nid_count)]

    log.info("_create_worker: %s" % fqdn)
    # Use -1 as the special cluster for workers
    worker = FakeServer(self, self._get_cluster_for_server(-1), i, fqdn,
                        nodename, nid_tuples, worker=True)
    self.servers[fqdn] = worker
    return worker
def join(self):
    """Wait for every PDU simulator server to exit."""
    log.info("Power control: joining...")
    for pdu_server in self.sim_servers.values():
        pdu_server.join()
def set_log_rate(self, fqdn, rate):
    """Adjust the simulated log emission rate for the server at ``fqdn``."""
    log.info("Set log rate for %s to %s" % (fqdn, rate))
    target_server = self.servers[fqdn]
    target_server.log_rate = rate
def logout(self):
    """Say goodbye to the client and tear the connection down."""
    farewell = "Connection Closed - Bye\r\n"
    self.fd.write(farewell)
    self.fd.flush()
    # client_address is a (host, port) tuple, matching the two %s slots.
    log.info("Client %s:%s logged out" % self.client_address)
    self.fd.close()
    self.socket.close()
def handler(*args, **kwargs):
    """Signal handler: request an orderly simulator shutdown."""
    log.info("Stopping...")
    cli.stop()
def create_pdu_entries(self, simulator, args):
    """Create PDU records on the manager and wire their outlets to servers.

    Requires API credentials (--username/--password). Registers a fake
    fence_apc power-control type sized to the simulated server count, one
    power_control_device per simulated PDU, then associates outlet i+1 of
    every PDU with the i'th non-worker server (sorted by FQDN).
    """
    if not (args.username and args.password):
        sys.stderr.write(
            "Username and password required to create PDU entries\n")
        sys.exit(-1)

    session = self._get_authenticated_session(args.url, args.username,
                                              args.password)

    log.info(
        "Creating PDU entries and associating PDU outlets with servers...")

    outlet_count = len(simulator.servers)
    if outlet_count < 1:
        log.error("Skipping PDU creation (no servers)")
        return

    # Create a custom type to ensure that it has enough outlets.
    # NB: If more servers are added later this won't work correctly,
    # but it should handle most use cases for simulated clusters.
    response = session.post("%s/api/power_control_type/" % args.url,
                            data=json.dumps({
                                'agent': "fence_apc",
                                'make': "Fake",
                                'model': "PDU",
                                'default_username': "******",
                                'default_password': "******",
                                'max_outlets': outlet_count
                            }))
    assert 200 <= response.status_code < 300, response.text
    fence_apc = json.loads(response.text)
    log.debug("Created power_control_type: %s" % fence_apc['name'])

    # One manager-side device per simulated PDU.
    pdu_entries = []
    for pdu_sim in simulator.power.pdu_sims.values():
        response = session.post("%s/api/power_control_device/" % args.url,
                                data=json.dumps({
                                    'device_type': fence_apc['resource_uri'],
                                    'name': pdu_sim.name,
                                    'address': pdu_sim.address,
                                    'port': pdu_sim.port
                                }))
        assert 200 <= response.status_code < 300, response.text
        pdu_entries.append(json.loads(response.text))
        log.debug("Created power_control_device: %s" %
                  pdu_entries[-1]['name'])

    # Fetch all hosts; workers don't get PDU outlets, only real servers.
    response = session.get("%s/api/host/" % args.url,
                           data=json.dumps({'limit': 0}))
    assert 200 <= response.status_code < 300, response.text
    servers = [
        s for s in json.loads(response.text)['objects']
        if 'posix_copytool_worker' not in s['server_profile']
    ]

    for i, server in enumerate(
            sorted(servers, key=lambda server: server['fqdn'])):
        for pdu in pdu_entries:
            # Outlet identifiers are 1-based strings on the manager side.
            outlet = [
                o for o in pdu['outlets'] if o['identifier'] == str(i + 1)
            ][0]
            response = session.patch(
                "%s/%s" % (args.url, outlet['resource_uri']),
                data=json.dumps({'host': server['resource_uri']}))
            assert 200 <= response.status_code < 300, response.text
            log.debug("Created association %s <=> %s:%s" %
                      (server['fqdn'], pdu['name'], outlet['identifier']))
def main(self):
    """CLI entry point: configure logging, parse arguments, dispatch to a
    subcommand (setup/register/run), and — when the subcommand yields a
    simulator — serve it over RPC until a stop is requested."""
    # Mirror both loggers to stderr; keep a verbose agent log on disk too.
    log.addHandler(logging.StreamHandler())
    daemon_log.addHandler(logging.StreamHandler())
    daemon_log.setLevel(logging.DEBUG)
    handler = logging.FileHandler("chroma-agent.log")
    handler.setFormatter(
        logging.Formatter('[%(asctime)s] %(message)s', '%d/%b/%Y:%H:%M:%S'))
    daemon_log.addHandler(handler)

    # Usually on our Intel laptops https_proxy is set, and needs to be unset for tests,
    # but let's not completely rule out the possibility that someone might want to run
    # the tests on a remote system using a proxy.
    if 'https_proxy' in os.environ:
        sys.stderr.write(
            "Warning: Using proxy %s from https_proxy" %
            os.environ['https_proxy'] +
            " environment variable, you probably don't want that\n")

    parser = argparse.ArgumentParser(description="Cluster simulator")
    parser.add_argument('--config',
                        required=False,
                        help="Simulator configuration/state directory",
                        default="cluster_sim")
    parser.add_argument('--url',
                        required=False,
                        help="Manager URL",
                        default="https://localhost:8000/")
    subparsers = parser.add_subparsers()

    # "setup" subcommand: build the simulated cluster's on-disk state.
    setup_parser = subparsers.add_parser("setup")
    setup_parser.add_argument('--su_size',
                              required=False,
                              help="Servers per SU",
                              default='0')
    setup_parser.add_argument('--cluster_size',
                              required=False,
                              help="Number of simulated storage servers",
                              default='4')
    setup_parser.add_argument('--server_count',
                              required=False,
                              help="Number of simulated storage servers",
                              default='8')
    setup_parser.add_argument('--worker_count',
                              required=False,
                              help="Number of simulated HSM workers",
                              default='1')
    setup_parser.add_argument(
        '--nid_count',
        required=False,
        help=
        "Number of LNet NIDs per storage server, defaults to 1 per server",
        default='1')
    setup_parser.add_argument(
        '--volume_count',
        required=False,
        help=
        "Number of simulated storage devices, defaults to twice the number of servers"
    )
    setup_parser.add_argument(
        '--psu_count',
        required=False,
        help=
        "Number of simulated server Power Supply Units, defaults to one per server",
        default='1')
    setup_parser.set_defaults(func=self.setup)

    # "register" subcommand: register simulated servers with the manager.
    register_parser = subparsers.add_parser(
        "register",
        help=
        "Provide a secret for registration, or provide API credentials for the simulator to acquire a token itself"
    )
    register_parser.add_argument('--secret',
                                 required=False,
                                 help="Registration token secret")
    register_parser.add_argument('--username',
                                 required=False,
                                 help="API username")
    register_parser.add_argument('--password',
                                 required=False,
                                 help="API password")
    register_parser.add_argument('--create_pdu_entries',
                                 action='store_true',
                                 help="Create PDU entries on the manager")
    register_parser.set_defaults(func=self.register)

    # "run" subcommand: start the previously-configured simulator.
    run_parser = subparsers.add_parser("run")
    run_parser.set_defaults(func=self.run)

    args = parser.parse_args()
    simulator = args.func(args)
    if simulator:
        self.simulator = simulator
        rpc_thread = RpcThread(self.simulator)
        rpc_thread.start()
        # Wake up periodically to handle signals, instead of going straight into join
        while not self._stopping.is_set():
            self._stopping.wait(timeout=1)
        log.info("Running indefinitely.")
        self.simulator.join()
        rpc_thread.stop()
        rpc_thread.join()
def start(self):
    """Bring up a simulator server for each configured PDU."""
    log.info("Power control: starting...")
    for name in self.pdu_sims.keys():
        self.start_sim_server(name)
def toggle_outlet(self, outlet, state):
    """Set an outlet's power state under the lock and persist the change."""
    with self._lock:
        self.state['outlets'][outlet] = state
        log.info("POWER: Toggled %s:%s to %s" %
                 (self.name, outlet, self.outlet_state(outlet)))
        self.save()
def stop(self):
    """Ask every PDU simulator server to shut down."""
    log.info("Power control: stopping...")
    for pdu_server in self.sim_servers.values():
        pdu_server.stop()