def __get_stats(self):
    """Pull the statistics from the remote client and publish them.

    Sends the ``CMD_PULL_STATS`` command over the connection, reads the
    JSON reply and stores it in ``self.stats`` while holding
    ``self.stats_cond``; every thread waiting on that condition is then
    notified.
    """
    client_id = self.config['client_id']
    LOGGER.info('Getting stats from client %d', client_id)

    # 4-byte big-endian unsigned command word
    self.conn.sendall(struct.pack('>I', constants.CMD_PULL_STATS))
    received = recvJSON(self.conn)

    # publish under the condition so waiters observe the new value
    with self.stats_cond:
        self.stats = received
        self.stats_cond.notify_all()
def shutdown(self):
    """Synchronously shut down this client and release its connection.

    Not asynchronous by design: the call blocks until the worker thread
    has finished its current task, then sends ``CMD_SHUTDOWN`` to the
    client and tears down the socket.

    The socket is always closed, even if sending the command or the
    socket-level shutdown fails; in that case the original exception is
    propagated to the caller after the socket has been closed.
    """
    client_id = self.config['client_id']
    LOGGER.warning('Shutting down client %d...', client_id)

    # wait for worker thread to finish the current task
    self.shutdown_flag.set()
    self.worker.join()

    try:
        self.conn.sendall(struct.pack('>I', constants.CMD_SHUTDOWN))
        self.conn.shutdown(socket.SHUT_RDWR)
    finally:
        # never leak the file descriptor, whatever happened above
        self.conn.close()

    LOGGER.warning('Client %d shut down.', client_id)
def __send_step(self, index, checksum, data):
    """Make sure the client holds step ``index``, pushing it if needed.

    A metadata header (index, size in bytes, checksum) is sent first
    after the ``CMD_PUSH_STEP`` command. If the client confirms, it
    already has the step file and nothing else is transferred. If the
    confirmation fails, the raw ``data`` is sent and a second
    confirmation is awaited.

    :param index: number of the step.
    :param checksum: checksum of ``data``, forwarded in the header.
    :param data: payload of the step file; sent with ``sendall``.
    """
    # first build and send header
    header = {
        constants.STEP_METADATA_INDEX: index,
        constants.STEP_METADATA_SIZE: len(data),
        constants.STEP_METADATA_CHKSUM: checksum,
    }
    client_id = self.config['client_id']

    LOGGER.info('Client %d checking step %d', client_id, index)
    self.conn.sendall(struct.pack('>I', constants.CMD_PUSH_STEP))
    sendJSON(self.conn, header)

    try:
        self.__wait_for_confirmation()
    except Exception:
        # client-side verification of the step failed,
        # we need to push a new copy
        LOGGER.info('Client %d does not have step %d!', client_id, index)
    else:
        # client already had the step file
        LOGGER.info('Client %d already has step %d!', client_id, index)
        return

    LOGGER.info('Sending step %d to client %d. Total size: %d bytes',
                index, client_id, len(data))
    self.conn.sendall(data)
    self.__wait_for_confirmation()
def __run_experiment(self, init_offset):
    """Start the experiment on the client and block until it finishes.

    Sleeps for ``init_offset`` seconds, sends ``CMD_START_EXP``, waits
    for the confirmation and then reads a 4-byte big-endian status word
    that marks the end of the experiment.

    :param init_offset: delay in seconds before the start command.
    :raises RuntimeError: if the status word received is not
        ``MSG_EXPERIMENT_FINISH``.
    """
    time.sleep(init_offset)  # TODO: maybe make more accurate?

    client_id = self.config['client_id']
    LOGGER.info('Client %d starting experiment!', client_id)
    self.conn.sendall(struct.pack('>I', constants.CMD_START_EXP))
    self.__wait_for_confirmation()

    # wait for experiment finish
    (status,) = struct.unpack('>I', recvall(self.conn, 4))
    if status == constants.MSG_EXPERIMENT_FINISH:
        LOGGER.info('Client %d done!', client_id)
        return

    raise RuntimeError(
        f'Client {self.address}: error on experiment finish!')
def start_cpu_load(self):
    """Launch the artificial CPU load container and wait for ramp-up.

    The container command line is built from one ``-c <core>`` flag per
    entry in ``self.cores`` followed by ``-l <target_load> -d -1``. The
    container is started detached with auto-removal, stored in
    ``self.container``, and the call then sleeps for two seconds.
    """
    LOGGER.info('Initiating artificial CPU load...')
    LOGGER.info('Target cores: %s', self.cores)
    LOGGER.info('Artificial load: %04.1f percent', self.target_load * 100)

    core_flags = ' '.join(f'-c {core}' for core in self.cores)
    load_flags = f' -l {self.target_load} -d -1'

    self.container = self.docker.containers.run(
        constants.CPU_LOAD_IMG,
        command=core_flags + load_flags,
        detach=True,
        auto_remove=True,
    )

    LOGGER.info('Wait for CPU load to ramp up (2s)...')
    time.sleep(2)
    LOGGER.info('CPU load ready.')
def execute(experiment_directory, experiment_config, host, port,
            output_dir=None):
    """Load an experiment configuration and run the experiment.

    :param experiment_directory: directory containing the config file.
    :param experiment_config: file name of the config inside
        ``experiment_directory``.
    :param host: passed through to ``Experiment``.
    :param port: passed through to ``Experiment``.
    :param output_dir: where results go; falls back to
        ``experiment_directory`` when empty or ``None``.
    :raises click.UsageError: if the config file does not exist.
    """
    config_path = os.path.join(experiment_directory, experiment_config)
    if not os.path.exists(config_path):
        raise click.UsageError(
            f'No experiment config file named {experiment_config} '
            f'found in {experiment_directory}')

    output_dir = output_dir or experiment_directory

    LOGGER.info(
        f'Starting Control server version '
        f'{constants.CONTROL_SERVER_VERSION}')

    experiment = Experiment(config_path, host, port, output_dir)
    experiment.execute()
def _init_tcpdump(self, run_path: str):
    """Start a tcpdump subprocess filtering on all configured ports.

    The capture filter is ``port A or port B or ...`` over the video,
    result and control port of every entry in
    ``self.config.port_configs``. It is placed between
    ``constants.TCPDUMP_CMD_PREFIX`` and ``constants.TCPDUMP_CMD_SUFFIX``
    and the process is started with ``run_path`` as working directory.

    :param run_path: working directory for the tcpdump process.
    :raises RuntimeError: if the process has already terminated right
        after being spawned, regardless of its exit status.
    """
    LOGGER.info('Initializing TCP dump...')
    LOGGER.info('TCPdump directory: %s', run_path)

    port_filter = ' or '.join(
        f'port {port}'
        for port_cfg in self.config.port_configs
        for port in (port_cfg.video, port_cfg.result, port_cfg.control)
    )

    tcpdump = shlex.split(
        ' '.join(constants.TCPDUMP_CMD_PREFIX
                 + [port_filter]
                 + constants.TCPDUMP_CMD_SUFFIX))

    self.tcpdump_proc = subprocess.Popen(tcpdump, cwd=run_path)

    # poll() returns None while the process is alive and the exit status
    # otherwise; a plain truthiness test would miss an exit status of 0.
    if self.tcpdump_proc.poll() is not None:
        raise RuntimeError('Could not start TCPDUMP!')
def spawn_backends(self):
    """Start one backend Docker container per port configuration.

    Each container is run detached with auto-removal, pinned to
    ``self.cpu_set`` and with the default video/result/control ports
    mapped to the ports of the corresponding entry of
    ``self.port_configs``. Handles are appended to ``self.containers``.
    After launching all of them the call sleeps for five seconds.
    """
    LOGGER.info('Spawning Docker containers...')

    for position, port_cfg in enumerate(self.port_configs, start=1):
        LOGGER.info(
            f'Launching container {position} of {self.clients}...')

        port_map = {
            constants.DEFAULT_VIDEO_PORT: port_cfg.video,
            constants.DEFAULT_RESULT_PORT: port_cfg.result,
            constants.DEFAULT_CONTROL_PORT: port_cfg.control,
        }
        backend = self.docker.containers.run(
            self.docker_img,
            detach=True,
            auto_remove=True,
            ports=port_map,
            cpuset_cpus=self.cpu_set,
        )
        self.containers.append(backend)

    LOGGER.info('Wait 5s for container warm up...')
    time.sleep(5.0)
    LOGGER.info('Initialization done')
def _pollNTPServer(self):
    """Measure the mean clock offset against the first NTP server.

    Queries ``self.config.ntp_servers[0]`` (NTP version 4) until
    ``constants.DEFAULT_NTP_POLL_COUNT`` requests have succeeded and
    stores the average offset, multiplied by 1000 to convert it to
    milliseconds, in ``self.offset``.

    Requests failing with ``ntplib.NTPException`` are retried
    immediately and without an upper bound.
    """
    LOGGER.info('Getting NTP offset')

    polls_needed = constants.DEFAULT_NTP_POLL_COUNT
    successes = 0
    offset_sum = 0
    while successes < polls_needed:
        try:
            response = self.ntp_client.request(
                self.config.ntp_servers[0], version=4)
        except ntplib.NTPException:
            continue

        offset_sum += response.offset
        successes += 1

    # convert to milliseconds
    self.offset = (offset_sum * 1000.0) / successes

    LOGGER.info('Got NTP offset from %s', self.config.ntp_servers[0])
    LOGGER.info('Offset: %f ms', self.offset)
def shutdown(self):
    """Kill the container stored in ``self.container``."""
    LOGGER.info('Shutting down artificial CPU load...')
    load_container = self.container
    load_container.kill()
    LOGGER.info('Artificial CPU load shut down.')
def __send_config(self):
    """Push ``self.config`` to the client and wait for confirmation.

    Sends the ``CMD_PUSH_CONFIG`` command word followed by the
    configuration serialized through ``sendJSON``.
    """
    client_id = self.config['client_id']
    LOGGER.info('Sending config to client %d...', client_id)

    # 4-byte big-endian unsigned command word
    command = struct.pack('>I', constants.CMD_PUSH_CONFIG)
    self.conn.sendall(command)

    sendJSON(self.conn, self.config)
    self.__wait_for_confirmation()
def __init__(self, toml_config: RecursiveNestedDict):
    """Parse and validate the experiment configuration.

    Reads the ``experiment.*`` keys (name, clients, runs, docker image,
    trace, NTP, performance and port settings) from ``toml_config`` via
    its ``find`` method and checks that the trace directory and every
    step file exist on disk.

    :param toml_config: parsed TOML configuration.
    :raises ConfigException: if the trace directory is not a directory,
        or if a ``KeyError`` is raised while reading the configuration.
    :raises Exception: if one of the step trace files does not exist.
    """
    try:
        self.name = toml_config.find('experiment.name')
        self.clients = toml_config.find('experiment.clients')
        self.runs = toml_config.find('experiment.runs')
        self.docker_img = toml_config.find('experiment.docker_img')

        num_steps = toml_config.find('experiment.trace.steps')
        trace_dir = toml_config.find('experiment.trace.dir')

        # verify trace dir actually exists
        if not os.path.isdir(trace_dir):
            error_str = 'Invalid trace directory.'
            LOGGER.error(error_str)
            raise ConfigException(error_str)

        # verify that step files exist (steps are numbered from 1)
        self.trace_steps = []
        for i in range(1, num_steps + 1):
            filename = constants.STEP_FILE_FMT.format(i)
            path = os.path.join(trace_dir, filename)
            if not os.path.isfile(path):
                error_str = ('{} does not seem to be a valid step trace '
                             'file'.format(path))
                LOGGER.error(error_str)
                # NOTE(review): kept as a plain Exception so existing
                # callers are unaffected; ConfigException would be more
                # consistent with the trace directory check above.
                raise Exception(error_str)
            self.trace_steps.append(path)

        self.trace_fps = toml_config.find('experiment.trace.fps')
        self.rewind_seconds = toml_config.find(
            'experiment.trace.rewind_seconds')
        self.max_replays = toml_config.find('experiment.trace.max_replays')

        self.ntp_servers = toml_config.find('experiment.ntp.servers')

        # performance settings
        self.cpu_cores = toml_config.find(
            'experiment.performance.cpu_cores')
        if not self.cpu_cores:
            # no explicit selection: fall back to every core
            self.cpu_cores = list(range(psutil.cpu_count()))

        self.gen_load = toml_config.find(
            'experiment.performance.artificial_load')
        # load is in percent
        self.target_load = toml_config.find(
            'experiment.performance.artificial_load_percent') * 0.01

        self.port_configs = [
            PortConfig(video=port_cfg['video'],
                       result=port_cfg['results'],
                       control=port_cfg['control'])
            for port_cfg in toml_config.find('experiment.ports')
        ]

    except KeyError as e:
        LOGGER.error('Error when parsing TOML config.')
        LOGGER.error('Missing required configuration key: %s', *e.args)
        raise ConfigException('Missing required configuration key: '
                              '{}'.format(*e.args)) from e

    self.__raw_cfg = toml_config
def __init__(self, config, host, port, output_dir):
    """Set up an experiment from a TOML configuration file.

    :param config: path of the TOML file to load.
    :param host: stored in ``self.host``.
    :param port: stored in ``self.port``.
    :param output_dir: stored in ``self.output_dir``.
    """
    with open(config, 'r') as config_file:
        LOGGER.info('Loading config')
        parsed = toml.load(config_file, _dict=RecursiveNestedDict)
        self.config = ExperimentConfig(parsed)

    LOGGER.warning('Loaded config from %s', config)
    LOGGER.warning('Output directory: %s', output_dir)

    self.host = host
    self.port = port
    self.output_dir = output_dir
    self.clients = []
    self.tcpdump_proc = None

    self.backend_mgr = BackendManager(self.config)
    # only generate artificial load when the config asks for it
    self.load_mgr = (CPULoadManager(self.config)
                     if self.config.gen_load
                     else NullCPULoadManager())

    self.ntp_client = ntplib.NTPClient()
    self.offset = 0

    LOGGER.info('Experiment ID: %s', self.config.name)
    LOGGER.info('Clients: %d', self.config.clients)
    LOGGER.info('Runs: %d', self.config.runs)
def shutdown(self, e=None):
    """Best-effort teardown of clients, tcpdump and backend containers.

    Each stage is attempted independently; failures are logged and never
    propagated. Clients are shut down one by one, so a failure on one
    client does not prevent the remaining clients from being shut down.

    :param e: optional error that triggered the shutdown; logged at
        critical level when truthy.
    """
    LOGGER.warning('Shut down!')
    if e:
        LOGGER.critical(e)

    for client in self.clients:
        try:
            client.shutdown()
        except Exception as err:
            LOGGER.error('Something went wrong while shutting down clients')
            LOGGER.error(err)

    try:
        if self.tcpdump_proc:
            self.tcpdump_proc.send_signal(signal.SIGINT)
    except Exception as err:
        LOGGER.error('Something went wrong while shutting down TCPDUMP')
        LOGGER.error(err)

    try:
        if self.backend_mgr:
            self.backend_mgr.shutdown()
    except Exception as err:
        LOGGER.error(
            'Something went wrong while shutting down Docker containers')
        LOGGER.error(err)
def execute(self):
    """Run the whole experiment: backends, clients, step files and runs.

    Spawns the backends, listens on ``(self.host, self.port)`` until
    ``self.config.clients`` clients have connected, sends each client its
    config, pushes the step files two clients at a time and finally
    executes ``self.config.runs`` runs.

    ``ShutdownException`` ends the experiment silently; any other
    exception is captured and handed to ``self.shutdown``, which is
    always called at the end.
    """
    server_socket = None
    error = None

    try:
        # first start docker instances
        self.backend_mgr.spawn_backends()

        with socket(AF_INET, SOCK_STREAM) as server_socket:
            # SO_REUSEADDR only has an effect if set before bind()
            server_socket.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
            server_socket.bind((self.host, self.port))
            server_socket.listen(self.config.clients)
            LOGGER.info('Listening on %s:%d', self.host, self.port)

            # accept N clients for the experiment
            LOGGER.info('Waiting for %d clients to connect...',
                        self.config.clients)

            signal.signal(signal.SIGINT, signal_handler)
            signal.signal(signal.SIGTERM, signal_handler)

            # wait for all clients to connect
            for i in range(self.config.clients):
                conn, addr = server_socket.accept()
                set_keepalive_linux(conn, max_fails=100)  # 5 minutes

                client = AsyncClient(conn, addr,
                                     self._gen_config_for_client(i))
                self.clients.append(client)

                LOGGER.info(
                    '{} out of {} clients connected: {}:{}'.format(
                        i + 1, self.config.clients, *addr))

                # send config
                client.send_config()

            # wait for clients to finish sending configs
            for c in self.clients:
                c.wait_for_tasks()

            # send traces, two clients at the time
            LOGGER.info('Pushing step files to clients...')

            # grouper is a generator, so wrap it in a list
            chunks = list(grouper(self.clients, 2, fillvalue=NullClient()))

            # read every step eagerly, closing each file right away
            step_data = []
            for step_path in self.config.trace_steps:
                with open(step_path, 'rb') as step_file:
                    step_data.append(step_file.read())

            step_chksums = [hashlib.md5(d).hexdigest() for d in step_data]
            # (index, checksum, data) triples; step indices start at 1
            steps = zip(itertools.count(start=1), step_chksums, step_data)

            for chunk, step in itertools.product(chunks, steps):
                for c in chunk:
                    c.send_step(*step)
                for c in chunk:
                    c.wait_for_tasks()

            # execute runs!
            for r in range(self.config.runs):
                self.__execute_run(r)

    except ShutdownException:
        pass
    except Exception as e:
        error = e
    finally:
        try:
            # the with-block already closes the socket; closing it a
            # second time is harmless
            if server_socket:
                # server_socket.shutdown(SHUT_RDWR)
                server_socket.close()
        except Exception as e:
            LOGGER.critical('Error closing server socket.')
            LOGGER.critical(e)

        self.shutdown(e=error)
def __execute_run(self, run_index):
    """Execute a single run and write its statistics to disk.

    Creates ``<output_dir>/run_<run_index + 1>/`` if needed, syncs NTP
    locally and on the clients, starts the CPU load and the resource
    monitor, triggers the experiment on all clients with random start
    offsets and finally stores system, client and server statistics in
    the run directory.

    The CPU load generator is shut down even if the experiment fails
    after it has been started.

    :param run_index: zero-based index of the run.
    :raises RuntimeError: if the run path exists but is not a directory.
    """
    LOGGER.info('Executing run %d out of %d',
                run_index + 1, self.config.runs)

    run_path = self.output_dir + f'/run_{run_index + 1}/'
    if not os.path.isdir(run_path):
        if os.path.exists(run_path):
            raise RuntimeError(f'Output path {run_path} already '
                               'exists and is not a directory!')
        os.mkdir(run_path)

    # sync NTP
    self._pollNTPServer()

    LOGGER.info('Trigger client NTP sync')
    for client in self.clients:
        client.ntp_sync()

    # self._init_tcpdump(run_path)
    # LOGGER.info('TCPdump warmup...')
    # time.sleep(5)

    # make sure all clients are done syncing NTP before starting the
    # experiment
    for client in self.clients:
        client.wait_for_tasks()

    # All clients are ready, now let's run the experiment!
    shuffle(self.clients)  # shuffle clients before each run

    # Randomly offset client experiment start to avoid weird
    # synchronous effects on the processing times...
    start_times = np.random.uniform(0, constants.DEFAULT_START_WINDOW,
                                    len(self.clients))

    # start artificial CPU load
    self.load_mgr.start_cpu_load()
    try:
        LOGGER.info('Execute experiment!')

        LOGGER.info('Starting resource monitor...')
        monitor = ResourceMonitor(self.offset)
        monitor.start()

        # timestamps are in milliseconds
        start_timestamp = time.time() * 1000.0

        # finally, actually trigger the experiment on the clients...
        # asynchronously, of course
        for client, init_offset in zip(self.clients, start_times):
            client.execute_experiment(init_offset)

        # wait for the experiment to finish
        for client in self.clients:
            client.wait_for_tasks()

        end_timestamp = time.time() * 1000.0

        LOGGER.info('Shut down monitor.')
        system_stats = monitor.shutdown()
    finally:
        # shut down CPU load generator, also when the run failed
        self.load_mgr.shutdown()

    time.sleep(1)

    # LOGGER.info('Terminate TCPDUMP')
    # self.tcpdump_proc.send_signal(signal.SIGINT)
    # self.tcpdump_proc.wait()

    LOGGER.info('Get stats from clients!')
    for client in self.clients:
        client.fetch_stats()  # asynchronously triggers fetching stats

    # store this runs' system stats
    # newline='' lets the csv module control line endings itself
    with open(run_path + constants.SYSTEM_STATS, 'w', newline='') as f:
        fieldnames = ['cpu_load', 'mem_avail', 'timestamp']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(system_stats)

    # store the client stats
    stats = [c.get_stats() for c in self.clients]
    for stat_coll in stats:
        client_index = stat_coll['client_id']
        with open(run_path + constants.CLIENT_STATS.format(client_index),
                  'w') as f:
            json.dump(stat_coll, f)

    # save server stats:
    with open(run_path + constants.SERVER_STATS, 'w') as f:
        json.dump(
            {
                'server_offset': self.offset,
                'run_start': start_timestamp + self.offset,
                'run_end': end_timestamp + self.offset
            }, f)
def shutdown(self):
    """Kill every container tracked in ``self.containers``."""
    LOGGER.warning('Shutting down containers...')
    for backend in self.containers:
        backend.kill()
    LOGGER.warning('Containers shut down!')