    def __get_stats(self):
        LOGGER.info('Getting stats from client %d', self.config['client_id'])
        buf = struct.pack('>I', constants.CMD_PULL_STATS)
        self.conn.sendall(buf)
        incoming_stats = recvJSON(self.conn)

        with self.stats_cond:
            self.stats = incoming_stats
            self.stats_cond.notify_all()
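
    # A minimal sketch of the matching consumer: get_stats() (used later in
    # __execute_run) presumably blocks on stats_cond until __get_stats() has
    # stored the incoming stats; it assumes self.stats starts out as None.
    def get_stats(self):
        with self.stats_cond:
            while self.stats is None:
                self.stats_cond.wait()
            return self.stats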
    def shutdown(self):
        # not asynchronous by design
        LOGGER.warning('Shutting down client %d...', self.config['client_id'])

        # wait for worker thread to finish the current task
        self.shutdown_flag.set()
        self.worker.join()

        buf = struct.pack('>I', constants.CMD_SHUTDOWN)
        self.conn.sendall(buf)
        self.conn.shutdown(socket.SHUT_RDWR)
        self.conn.close()

        LOGGER.warning('Client %d shut down.', self.config['client_id'])
    def __send_step(self, index, checksum, data):
        # first build and send header
        hdr = {
            constants.STEP_METADATA_INDEX: index,
            constants.STEP_METADATA_SIZE: len(data),
            constants.STEP_METADATA_CHKSUM: checksum
        }

        LOGGER.info('Client %d checking step %d', self.config['client_id'],
                    index)

        buf = struct.pack('>I', constants.CMD_PUSH_STEP)
        self.conn.sendall(buf)
        sendJSON(self.conn, hdr)
        try:
            self.__wait_for_confirmation()
            LOGGER.info('Client %d already has step %d!',
                        self.config['client_id'], index)
            # client already had the step file
            return
        except Exception:
            LOGGER.info('Client %d does not have step %d!',
                        self.config['client_id'], index)

        # client-side verification of the step failed,
        # we need to push a new copy

        LOGGER.info('Sending step %d to client %d. Total size: %d bytes',
                    index, self.config['client_id'], len(data))

        self.conn.sendall(data)
        self.__wait_for_confirmation()
    def __run_experiment(self, init_offset):
        time.sleep(init_offset)  # TODO: maybe make more accurate?
        LOGGER.info('Client %d starting experiment!', self.config['client_id'])

        buf = struct.pack('>I', constants.CMD_START_EXP)
        self.conn.sendall(buf)
        self.__wait_for_confirmation()

        # wait for experiment finish
        confirmation_b = recvall(self.conn, 4)
        (confirmation, ) = struct.unpack('>I', confirmation_b)
        if confirmation != constants.MSG_EXPERIMENT_FINISH:
            raise RuntimeError(
                f'Client {self.address}: error on experiment finish!')

        LOGGER.info('Client %d done!', self.config['client_id'])
    def start_cpu_load(self):
        LOGGER.info('Initiating artificial CPU load...')
        LOGGER.info('Target cores: %s', self.cores)
        LOGGER.info('Artificial load: %04.1f percent', self.target_load * 100)

        core_cmds = map(lambda c: f'-c {c}', self.cores)
        cmd = ' '.join(core_cmds) + f' -l {self.target_load} -d -1'
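        # e.g. self.cores = [0, 1] and self.target_load = 0.5 yield the command
        # '-c 0 -c 1 -l 0.5 -d -1'; the exact flag semantics are defined by the
        # entrypoint of constants.CPU_LOAD_IMG.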

        self.container = self.docker.containers.run(
            constants.CPU_LOAD_IMG,
            command=cmd,
            detach=True,
            auto_remove=True,
        )

        LOGGER.info('Wait for CPU load to ramp up (2s)...')
        time.sleep(2)
        LOGGER.info('CPU load ready.')
def execute(experiment_directory,
            experiment_config,
            host,
            port,
            output_dir=None):
    config_path = os.path.join(experiment_directory, experiment_config)
    if not os.path.exists(config_path):
        raise click.UsageError(
            'No experiment config file named {} found in {}'.format(
                experiment_config, experiment_directory))

    if not output_dir:
        output_dir = experiment_directory

    LOGGER.info('Starting Control server version {}'.format(
        constants.CONTROL_SERVER_VERSION))
    e = Experiment(config_path, host, port, output_dir)
    e.execute()
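

# A possible command-line wrapper around execute(); the click decorators,
# option names and defaults below are illustrative assumptions, not the
# original CLI definition (only click.UsageError appears above).
@click.command()
@click.argument('experiment_directory',
                type=click.Path(exists=True, file_okay=False))
@click.option('--experiment_config', default='experiment.toml',
              show_default=True)
@click.option('--host', default='0.0.0.0', show_default=True)
@click.option('--port', default=1337, type=int, show_default=True)
@click.option('--output_dir', default=None, type=click.Path(file_okay=False))
def cli(experiment_directory, experiment_config, host, port, output_dir):
    execute(experiment_directory, experiment_config, host, port, output_dir)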
    def _init_tcpdump(self, run_path: str):
        LOGGER.info('Initializing TCP dump...')
        LOGGER.info('TCPdump directory: {}'.format(run_path))
        port_cmds = list()
        for port_cfg in self.config.port_configs:
            cmds = [
                'port {}'.format(port_cfg.video),
                'port {}'.format(port_cfg.result),
                'port {}'.format(port_cfg.control),
            ]

            port_cmds.append(' or '.join(cmds))

        port_cmds = [' or '.join(port_cmds)]
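        # e.g. two port configs (5000, 5001, 5002) and (6000, 6001, 6002)
        # collapse into the single capture filter
        # 'port 5000 or port 5001 or port 5002 or port 6000 or port 6001 or port 6002'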
        tcpdump = shlex.split(
            ' '.join(constants.TCPDUMP_CMD_PREFIX + port_cmds +
                     constants.TCPDUMP_CMD_SUFFIX))

        self.tcpdump_proc = subprocess.Popen(tcpdump, cwd=run_path)
        if self.tcpdump_proc.poll():
            raise RuntimeError('Could not start TCPDUMP!')
    def spawn_backends(self):
        LOGGER.info('Spawning Docker containers...')
        for i, port_cfg in enumerate(self.port_configs):
            LOGGER.info(
                f'Launching container {i + 1} of {self.clients}...')

            self.containers.append(
                self.docker.containers.run(
                    self.docker_img,
                    detach=True,
                    auto_remove=True,
                    ports={
                        constants.DEFAULT_VIDEO_PORT  : port_cfg.video,
                        constants.DEFAULT_RESULT_PORT : port_cfg.result,
                        constants.DEFAULT_CONTROL_PORT: port_cfg.control
                    },
                    cpuset_cpus=self.cpu_set
                )
            )

            LOGGER.info('Wait 5s for container warm up...')
            time.sleep(5.0)
            LOGGER.info('Initialization done')
    def _pollNTPServer(self):
        LOGGER.info('Getting NTP offset')
        sync_cnt = 0
        cum_offset = 0
        while sync_cnt < constants.DEFAULT_NTP_POLL_COUNT:
            try:
                res = self.ntp_client.request(self.config.ntp_servers[0],
                                              version=4)

                cum_offset += res.offset
                sync_cnt += 1
            except ntplib.NTPException:
                continue

        # average the polled offsets and convert to milliseconds
        self.offset = (cum_offset * 1000.0) / sync_cnt

        LOGGER.info('Got NTP offset from %s', self.config.ntp_servers[0])
        LOGGER.info('Offset: %f ms', self.offset)
    def shutdown(self):
        LOGGER.info('Shutting down artificial CPU load...')
        self.container.kill()
        LOGGER.info('Artificial CPU load shut down.')
    def __send_config(self):
        LOGGER.info('Sending config to client %d...', self.config['client_id'])
        buf = struct.pack('>I', constants.CMD_PUSH_CONFIG)
        self.conn.sendall(buf)
        sendJSON(self.conn, self.config)
        self.__wait_for_confirmation()
    def __init__(self, toml_config: RecursiveNestedDict):
        try:
            self.name = toml_config.find('experiment.name')
            self.clients = toml_config.find('experiment.clients')
            self.runs = toml_config.find('experiment.runs')
            self.docker_img = toml_config.find('experiment.docker_img')

            num_steps = toml_config.find('experiment.trace.steps')
            trace_dir = toml_config.find('experiment.trace.dir')

            # verify trace dir actually exists
            if not os.path.isdir(trace_dir):
                error_str = 'Invalid trace directory.'
                LOGGER.error(error_str)
                raise ConfigException(error_str)

            # verify that step files exist
            self.trace_steps = []
            for i in range(1, num_steps + 1):
                filename = constants.STEP_FILE_FMT.format(i)
                path = os.path.join(trace_dir, filename)
                if not os.path.isfile(path):
                    error_str = ('{} does not seem to be a valid step '
                                 'trace file'.format(path))
                    LOGGER.error(error_str)
                    raise ConfigException(error_str)
                else:
                    self.trace_steps.append(path)

            self.trace_fps = toml_config.find('experiment.trace.fps')
            self.rewind_seconds = toml_config.find(
                'experiment.trace.rewind_seconds')
            self.max_replays = toml_config.find('experiment.trace.max_replays')

            self.ntp_servers = toml_config.find('experiment.ntp.servers')

            # performance settings
            self.cpu_cores = toml_config.find(
                'experiment.performance.cpu_cores')

            if len(self.cpu_cores) == 0:
                self.cpu_cores = list(range(psutil.cpu_count()))

            self.gen_load = toml_config.find(
                'experiment.performance.artificial_load')
            self.target_load = toml_config.find(
                'experiment.performance.artificial_load_percent') * 0.01
            # load is in percent

            self.port_configs = []
            for port_cfg in toml_config.find('experiment.ports'):
                self.port_configs.append(
                    PortConfig(video=port_cfg['video'],
                               result=port_cfg['results'],
                               control=port_cfg['control']))

        except KeyError as e:
            LOGGER.error('Error when parsing TOML config.')
            LOGGER.error('Missing required configuration key: %s', *e.args)
            raise ConfigException('Missing required configuration key: '
                                  '{}'.format(*e.args)) from e

        self.__raw_cfg = toml_config
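
# A minimal sketch of a configuration file this constructor accepts, assuming
# find() resolves dotted keys against nested TOML tables; every value below is
# an illustrative placeholder.
EXAMPLE_CONFIG_TOML = '''
[experiment]
name = "example"
clients = 2
runs = 1
docker_img = "backend:latest"

[experiment.trace]
steps = 10
dir = "/path/to/traces"
fps = 30
rewind_seconds = 5
max_replays = 3

[experiment.ntp]
servers = ["pool.ntp.org"]

[experiment.performance]
cpu_cores = []                   # empty -> fall back to all cores (psutil.cpu_count())
artificial_load = false
artificial_load_percent = 50.0   # stored as a fraction (0.5) by the constructor

[[experiment.ports]]
video = 5000
results = 5001
control = 5002

[[experiment.ports]]
video = 6000
results = 6001
control = 6002
'''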
    def __init__(self, config, host, port, output_dir):

        with open(config, 'r') as f:
            LOGGER.info('Loading config')
            self.config = ExperimentConfig(
                toml.load(f, _dict=RecursiveNestedDict))

        LOGGER.warning('Loaded config from %s', config)
        LOGGER.warning('Output directory: %s', output_dir)

        self.clients = list()
        self.host = host
        self.port = port
        self.tcpdump_proc = None
        self.output_dir = output_dir

        self.backend_mgr = BackendManager(self.config)

        if self.config.gen_load:
            self.load_mgr = CPULoadManager(self.config)
        else:
            self.load_mgr = NullCPULoadManager()

        self.ntp_client = ntplib.NTPClient()
        self.offset = 0

        LOGGER.info('Experiment ID: %s', self.config.name)
        LOGGER.info('Clients: %d', self.config.clients)
        LOGGER.info('Runs: %d', self.config.runs)
    def shutdown(self, e=None):
        LOGGER.warning('Shut down!')

        if e:
            LOGGER.critical(e)

        try:
            for client in self.clients:
                client.shutdown()
        except Exception as e:
            LOGGER.error('Something went wrong while shutting down clients')
            LOGGER.error(e)

        try:
            if self.tcpdump_proc:
                self.tcpdump_proc.send_signal(signal.SIGINT)
        except Exception as e:
            LOGGER.error('Something went wrong while shutting down TCPDUMP')
            LOGGER.error(e)

        try:
            if self.backend_mgr:
                self.backend_mgr.shutdown()
        except Exception as e:
            LOGGER.error(
                'Something went wrong while shutting down Docker containers')
            LOGGER.error(e)
    def execute(self):
        server_socket = None
        error = None

        try:
            # first start docker instances
            self.backend_mgr.spawn_backends()

            with socket(AF_INET, SOCK_STREAM) as server_socket:
                # SO_REUSEADDR must be set before bind() to have any effect
                server_socket.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
                server_socket.bind((self.host, self.port))
                server_socket.listen(self.config.clients)
                LOGGER.info('Listening on %s:%d', self.host, self.port)

                # accept N clients for the experiment
                LOGGER.info('Waiting for %d clients to connect...',
                            self.config.clients)

                signal.signal(signal.SIGINT, signal_handler)
                signal.signal(signal.SIGTERM, signal_handler)

                # wait for all clients to connect
                for i in range(self.config.clients):
                    conn, addr = server_socket.accept()
                    set_keepalive_linux(conn, max_fails=100)  # 5 minutes

                    client = AsyncClient(conn, addr,
                                         self._gen_config_for_client(i))

                    self.clients.append(client)

                    LOGGER.info('{} out of {} clients connected: {}:{}'.format(
                        i + 1, self.config.clients, *addr))
                    # send config
                    client.send_config()

                # wait for clients to finish sending configs
                for c in self.clients:
                    c.wait_for_tasks()

                # send traces, two clients at the time
                LOGGER.info('Pushing step files to clients...')

                # grouper is a generator, so wrap it in a list
                chunks = list(grouper(self.clients, 2, fillvalue=NullClient()))

                # read the step files eagerly, making sure each handle is closed
                step_data = []
                for step_path in self.config.trace_steps:
                    with open(step_path, 'rb') as step_file:
                        step_data.append(step_file.read())
                step_chksums = [hashlib.md5(d).hexdigest() for d in step_data]

                steps = zip(itertools.count(start=1), step_chksums, step_data)

                for chunk, step in itertools.product(chunks, steps):
                    for c in chunk:
                        c.send_step(*step)
                    for c in chunk:
                        c.wait_for_tasks()

                # execute runs!
                for r in range(self.config.runs):
                    self.__execute_run(r)

        except ShutdownException:
            pass

        except Exception as e:
            error = e

        finally:
            try:
                if server_socket:
                    # server_socket.shutdown(SHUT_RDWR)
                    server_socket.close()
            except Exception as e:
                LOGGER.critical('Error closing server socket.')
                LOGGER.critical(e)

            self.shutdown(e=error)
    def __execute_run(self, run_index):
        LOGGER.info('Executing run %d out of %d', run_index + 1,
                    self.config.runs)

        run_path = self.output_dir + f'/run_{run_index + 1}/'
        if os.path.exists(run_path) and not os.path.isdir(run_path):
            raise RuntimeError(f'Output path {run_path} already '
                               'exists and is not a directory!')
        elif os.path.isdir(run_path):
            pass
        else:
            os.mkdir(run_path)

        # sync NTP
        self._pollNTPServer()

        LOGGER.info('Trigger client NTP sync')
        for client in self.clients:
            client.ntp_sync()

        # self._init_tcpdump(run_path)
        # LOGGER.info('TCPdump warmup...')
        # time.sleep(5)

        # make sure all clients are done syncing NTP before starting the
        # experiment
        for client in self.clients:
            client.wait_for_tasks()

        # All clients are ready, now let's run the experiment!
        shuffle(self.clients)  # shuffle clients before each run

        # Randomly offset client experiment start to avoid weird
        # synchronous effects on the processing times...
        start_times = np.random.uniform(0, constants.DEFAULT_START_WINDOW,
                                        len(self.clients))

        # start artificial CPU load
        self.load_mgr.start_cpu_load()

        LOGGER.info('Execute experiment!')
        LOGGER.info('Starting resource monitor...')
        monitor = ResourceMonitor(self.offset)
        monitor.start()

        start_timestamp = time.time() * 1000.0

        # finally, actually trigger the experiment on the clients...
        # asynchronously, of course
        for client, init_offset in zip(self.clients, start_times):
            client.execute_experiment(init_offset)

        # wait for the experiment to finish
        for client in self.clients:
            client.wait_for_tasks()

        end_timestamp = time.time() * 1000.0

        LOGGER.info('Shut down monitor.')
        system_stats = monitor.shutdown()

        # shut down CPU load generator
        self.load_mgr.shutdown()

        time.sleep(1)
        # LOGGER.info('Terminate TCPDUMP')
        # self.tcpdump_proc.send_signal(signal.SIGINT)
        # self.tcpdump_proc.wait()

        LOGGER.info('Get stats from clients!')
        for client in self.clients:
            client.fetch_stats()  # asynchronously triggers fetching stats

        # store this run's system stats
        with open(run_path + constants.SYSTEM_STATS, 'w') as f:
            fieldnames = ['cpu_load', 'mem_avail', 'timestamp']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(system_stats)

        # store the client stats
        stats = [c.get_stats() for c in self.clients]
        for stat_coll in stats:
            client_index = stat_coll['client_id']
            with open(run_path + constants.CLIENT_STATS.format(client_index),
                      'w') as f:
                json.dump(stat_coll, f)

        # save server stats:
        with open(run_path + constants.SERVER_STATS, 'w') as f:
            json.dump(
                {
                    'server_offset': self.offset,
                    'run_start': start_timestamp + self.offset,
                    'run_end': end_timestamp + self.offset
                }, f)
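
        # Resulting per-run output layout (actual file names are taken from the
        # constants module):
        #   <run_path>/<SYSTEM_STATS>             CSV: cpu_load, mem_avail, timestamp
        #   <run_path>/<CLIENT_STATS.format(id)>  JSON stats pulled from each client
        #   <run_path>/<SERVER_STATS>             JSON: server_offset, run_start, run_end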
    def shutdown(self):
        LOGGER.warning('Shutting down containers...')
        for cont in self.containers:
            cont.kill()
        LOGGER.warning('Containers shut down!')