def handle_event_register_prog(event):
    """
    handle 'register_prog' event
    data must include 'name', 'checksum' and 'data'
    :event: event to handle
    :returns: event result data if event successfully handled
    :raises: ValueError: if program data does not match checksum
    """
    data = event.data['data']
    name = event.data['name']
    checksum = event.data['checksum']
    with client.client_access() as c:
        user_progs_dir = c.user_progs_dir
        hardware = c.hardware
    prog_dir = os.path.join(user_progs_dir, checksum)
    data_file = os.path.join(prog_dir, 'data.json')
    os.mkdir(prog_dir)
    with open(data_file, 'w') as fp:
        fp.write(data)
    program = user_prog.UserProg(
        name, checksum, data_file, hardware, build_dir=prog_dir)
    program.verify_checksum()
    with client.client_access() as c:
        cuda_bin = c.args.bin
        include_path = c.args.include
    program.build(cuda_bin=cuda_bin, include_path=include_path)
    with client.client_access() as c:
        c.user_programs[checksum] = program
    LOG.info('Registered program: %s', program)
    return {}
def run_server(args, tmpdir):
    """
    entrypoint for server
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: 0 on success
    """
    # create server state
    server.create_state(args, tmpdir)
    # init remote event system
    remote_event.create_remote_events()
    # start api server
    call = functools.partial(
        server.APP.run, debug=False, host=args.host, port=args.port)
    thread = threading.Thread(target=call)
    thread.daemon = True
    thread.start()
    # start server worker
    worker = server_worker.ServerWorker()
    LOG.info('starting server worker')
    try:
        worker.run()
    except queue.Empty:
        server_util.shutdown_all_clients()
    return 0
def check_gpus(args, tmpdir):
    """
    check for CUDA capable GPUs
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: dict with GPU info
    """
    if args.no_gpu:
        LOG.warning("Not scanning available gpus, running programs will fail")
        return {'num_gpus': 0, 'gpu_info': []}
    LOG.info('Checking CUDA build system')
    program = setup_cuda_detect(args, tmpdir)
    res = {
        'num_gpus': program.get_num_gpus(),
        'gpu_info': [],
    }
    for gpu_index in range(res['num_gpus']):
        props = GPUProps()
        program.get_gpu_data(gpu_index, ctypes.byref(props))
        gpu_info = {
            'gpu_index': props.gpu_index,
            'comp_level_major': props.comp_level_major,
            'comp_level_minor': props.comp_level_minor,
            'sm_count': props.sm_count,
            'max_sm_threads': props.max_sm_threads,
            'max_sm_blocks': props.max_sm_blocks,
            'max_block_size': props.max_block_size,
            'max_total_threads': props.max_total_threads,
            'max_total_blocks': props.max_total_blocks,
            'name': props.name.decode(),
        }
        gpu_info['reasonable_block_size'] = get_reasonable_block_size(gpu_info)
        res['gpu_info'].append(gpu_info)
    return res
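# Minimal sketch of the GPUProps ctypes structure that check_gpus() fills
# via get_gpu_data(); this is not the project's actual definition. The field
# types and the fixed-size name buffer are assumptions inferred from how the
# fields are read above (plain integers plus a bytes name that is decoded).
import ctypes


class GPUProps(ctypes.Structure):
    _fields_ = [
        ('gpu_index', ctypes.c_int),
        ('comp_level_major', ctypes.c_int),
        ('comp_level_minor', ctypes.c_int),
        ('sm_count', ctypes.c_int),
        ('max_sm_threads', ctypes.c_int),
        ('max_sm_blocks', ctypes.c_int),
        ('max_block_size', ctypes.c_int),
        ('max_total_threads', ctypes.c_int),
        ('max_total_blocks', ctypes.c_int),
        ('name', ctypes.c_char * 256),
    ]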
def __enter__(self):
    """
    Create tempdir
    :returns: tempdir path
    """
    self.tempdir = tempfile.mkdtemp(prefix=self.prefix)
    LOG.debug('Using tempdir: %s', self.tempdir)
    return self.tempdir
def __exit__(self, etype, value, trace):
    """
    Destroy tempdir, unless an error occurred or preserve is set
    :etype: exception type
    :value: exception value
    :trace: exception traceback
    """
    if etype or self.preserve:
        LOG.info('Preserving tempdir: %s', self.tempdir)
    else:
        shutil.rmtree(self.tempdir)
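# Usage sketch for the TempDir context manager defined by __enter__/__exit__
# above; it mirrors the call in main(). The preserve keyword is taken from
# main(), while the prefix keyword is an assumption based on self.prefix.
with util.TempDir(prefix='lizard-', preserve=False) as tmpdir:
    scratch = os.path.join(tmpdir, 'scratch.txt')
    with open(scratch, 'w') as fp:
        fp.write('temporary working data')
# on clean exit the directory is removed; it is kept when an exception
# escapes the block or preserve is set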
def unregister_client(self, client_uuid):
    """
    unregister a client with the server
    :client_uuid: client uuid
    :returns: client object
    """
    self.all_clients_hardware.pop(client_uuid, None)
    client = self.clients.pop(client_uuid, None)
    if client:
        LOG.info('Deleted client: %s', client)
    return client
def unpack(self, item_keys):
    """
    unpack program files and set up build dir structure
    :item_keys: items to unpack
    """
    LOG.debug('Extracting user program code')
    for key in item_keys:
        filename = PROGRAM_SOURCE_FILE_NAMES[key]
        code = self.data['code'][key]
        path = os.path.join(self.build_dir, filename)
        with open(path, 'w') as fp:
            fp.write(code)
def run_iteration(self, global_state_enc):
    """
    update global state, run iteration, and encode aggregation result
    :global_state_enc: encoded global state
    :returns: encoded aggregation result
    """
    LOG.info("Running iteration")
    # FIXME move into py_mod
    import array
    self.global_state = array.array('d', global_state_enc[1])
    partial_results = self.py_mod.run_iteration(
        self.global_params, self.data_count, self.global_state,
        self.pinned_memory, self.dataset)
    return partial_results
def copy_build_files(self, build_files):
    """
    copy build files from data/build_files into build dir
    :build_files: names of files to copy
    """
    LOG.debug('Copying additional build files')
    for build_file in build_files:
        resource_path = os.path.join(
            PROGRAM_DATA_DIRNAME, 'build_files', build_file)
        data = pkgutil.get_data('lizard', resource_path)
        path = os.path.join(self.build_dir, build_file)
        with open(path, 'wb') as fp:
            fp.write(data)
def _normalize_cuda_args(args):
    """normalize cuda arguments"""
    if args.bin:
        args.bin = os.path.abspath(args.bin)
        if not os.path.isdir(args.bin):
            LOG.error('invalid bin path specified')
            return None
    if args.include:
        args.include = os.path.abspath(args.include)
        if not os.path.isdir(args.include):
            LOG.error('invalid include path specified')
            return None
    return args
def scan_hardware(args, tmpdir):
    """
    scan system hardware
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: dict with hardware info
    """
    hardware = {
        'CPU': check_cpus(),
        'GPU': check_gpus(args, tmpdir),
    }
    LOG.debug('hardware scan found: %s', hardware)
    return hardware
def get_reasonable_block_size(props, size_mult=32):
    """
    get reasonable cuda block size
    :props: gpu properties dict
    :size_mult: block size multiple
    :returns: reasonable block size
    """
    max_reasonable_size = props['max_block_size']
    min_reasonable_size = props['max_sm_threads'] / props['max_sm_blocks']
    avg_reasonable_size = (max_reasonable_size + min_reasonable_size) / 2
    reasonable_block_size = int(avg_reasonable_size / size_mult) * size_mult
    LOG.debug('Using CUDA block size: %s', reasonable_block_size)
    return reasonable_block_size
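# Worked example for get_reasonable_block_size() using hypothetical values
# roughly in line with a modern GPU (numbers are illustrative only):
#   min_reasonable_size = 2048 / 32 = 64
#   avg_reasonable_size = (1024 + 64) / 2 = 544
#   reasonable_block_size = int(544 / 32) * 32 = 544
example_props = {
    'max_block_size': 1024,
    'max_sm_threads': 2048,
    'max_sm_blocks': 32,
}
assert get_reasonable_block_size(example_props) == 544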
def run(self):
    """
    main loop, wait for event, then process
    if shutdown scheduled, continue until queue empty
    :returns: does not return, uses exception to end control
    :raises queue.Empty: when shutdown requested and queue empty
    """
    while True:
        event = SERVER_QUEUE.get(block=not self.shutdown_scheduled)
        LOG.debug('received event: %s', event)
        self.handle_event(event)
        LOG.debug('handled event: %s', event)
        SERVER_QUEUE.task_done()
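# Sketch of the intended shutdown path for the loop above: once
# shutdown_scheduled is True the queue get() no longer blocks, so
# queue.Empty escapes run() as soon as the queue drains and the caller
# (see run_server()) treats that as a clean shutdown. How the flag gets
# flipped in the real code is not shown here; setting it directly below
# is only for illustration.
worker = server_worker.ServerWorker()
worker.shutdown_scheduled = True
try:
    worker.run()
except queue.Empty:
    pass  # queue drained after shutdown was scheduled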
def load_data(self, dataset_enc):
    """
    load dataset
    :dataset_enc: encoded data
    """
    LOG.info('in load_data')
    LOG.info('first line of data:')
    # LOG.info(dataset_enc[0])
    # LOG.info(dataset_enc[1][0])
    self.dataset = self.py_mod.to_array(dataset_enc[1])
    self.data_count = dataset_enc[0]
    LOG.info('converted to array')
    self.pinned_memory = self.prog.pin_gpu_memory(self.dataset)
    LOG.info('data count %i', self.data_count)
    LOG.info('pinned data')
def partition_data(self, data):
    """
    load dataset and partition among clients
    :data: data
    """
    client_uuids = list(self.hardware.keys())
    client_count = len(client_uuids)
    LOG.debug("data size %i", sys.getsizeof(data))
    # FIXME this is a really rough estimate as the final calculation
    # is done after casting to double
    data_generator = self.py_mod.split_data(data)
    LOG.info(self.global_params)
    split_size = self.global_params[0] // client_count + 1
    LOG.debug("split size %i", split_size)
    post_datasets = {}
    for client_uuid in client_uuids:
        LOG.info("Splitting data")
        # FIXME use hardware scan to discover GPU mem size
        # currently rounded slightly down to avoid overflowing in loop
        # 8G gpu ram size
        # gpu_mem_remaining = 8589934592
        gpu_mem_remaining = 8500000000
        split_remaining = split_size
        data_count = 0
        LOG.info("global_params %s", self.global_params)
        dataset = []
        # subtract params size
        gpu_mem_remaining = (
            gpu_mem_remaining - sys.getsizeof(self.global_params))
        try:
            while split_remaining > 0 and gpu_mem_remaining > 0:
                next_split = next(data_generator)
                split_remaining = split_remaining - 1
                gpu_mem_remaining = (
                    gpu_mem_remaining - sys.getsizeof(next_split))
                dataset.append(next_split)
                data_count = data_count + 1
        except StopIteration:
            pass
        dataset_enc = [data_count, dataset]
        self.client_datasets[client_uuid] = dataset_enc
    self._initialize_global_state()
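# Worked example of the split sizing above (illustrative numbers only):
# with global_params[0] == 1000 total data items and 3 registered clients,
# each client is offered at most split_size items, subject to the per-client
# GPU memory budget tracked in the inner loop.
client_count = 3
split_size = 1000 // client_count + 1
assert split_size == 334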
def handle_event_register_prog(event):
    """
    handle 'register_prog' event
    data must include 'name', 'checksum' and 'data'
    :event: event to handle
    :returns: event result data if event successfully handled
    :raises: Exception: if error occurs handling event
    """
    data = event.data['data']
    name = event.data['name']
    checksum = event.data['checksum']
    wakeup_ev = threading.Event()

    def multi_callback_func(event_props):
        wakeup_ev.set()

    def callback_func(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: failed to register program'.format(client))
        client.registered_progs.append(checksum)

    with server.state_access() as s:
        user_progs_dir = s.user_progs_dir
        all_hardware = s.all_clients_hardware
    prog_dir = os.path.join(user_progs_dir, checksum)
    data_file = os.path.join(prog_dir, 'data.json')
    os.mkdir(prog_dir)
    with open(data_file, 'w') as fp:
        fp.write(data)
    program = user_prog.UserProg(
        name, checksum, data_file, all_hardware, build_dir=prog_dir)
    program.build_for_server()
    post_data = event.data.copy()
    post_data['send_remote_event'] = True
    with server.state_access() as s:
        s.post_all('/programs', post_data, callback_func=callback_func,
                   multi_callback_func=multi_callback_func)
    # NOTE: timeout for registering program on all nodes set to 10 min
    wakeup_ev.wait(timeout=600)
    LOG.info('Registered user program: %s', program)
    with server.state_access() as s:
        s.registered_progs[checksum] = program
    return program.properties
def handle(self):
    """
    Handle event using handler defined in event handler map and set result
    """
    if self.event_handler_map is None:
        raise NotImplementedError("Cannot handle BaseEvent")
    handler = self.event_handler_map.get(
        self.event_type, handler_not_implemented)
    start_time = time.time()
    try:
        self.status = EventStatus.RUNNING
        self.result = handler(self)
        self.status = EventStatus.SUCCESS
    except Exception as e:
        msg = repr(e)
        LOG.warning("Failed to complete event: %s error: %s", self, msg)
        self.status = EventStatus.FAILURE
        self.result = {'error': msg}
    end_time = time.time()
    self.completion_time = end_time - start_time
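# Minimal sketch of how a concrete event class might populate the
# event_handler_map consulted by handle() above. The class name and exact
# mapping are illustrative; the handler functions and event type strings
# come from the handlers shown elsewhere in this section.
class ServerEvent(BaseEvent):
    event_handler_map = {
        'register_prog': handle_event_register_prog,
        'run_program': handle_event_run_program,
    }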
def main():
    """
    main entry point
    :returns: 0 on success
    """
    subcmd_handlers = {
        'client': run_client,
        'server': run_server,
        'cluster': run_cluster,
    }
    # get the argument parser
    parser = cli.configure_parser()
    # parse arguments
    args = parser.parse_args()
    # normalize arguments
    args = cli.normalize_args(args)
    if args is None:
        return -1
    # set log level
    LOG.setLevel(logging.DEBUG if args.verbose else logging.INFO)
    LOG.debug('logging system init')
    LOG.debug('running with args: %s', args)
    # create tmpdir and run handler
    with util.TempDir(preserve=args.keep_tmpdir) as tmpdir:
        return subcmd_handlers[args.subcmd](args, tmpdir)
def handle_event_init_runtime(event):
    """
    handle 'init_runtime' event
    data must include:
        - 'runtime_id'
        - 'dataset_enc'
        - 'checksum'
        - 'global_params_enc'
    :event: event to handle
    :returns: event result
    """
    runtime_id = event.data['runtime_id']
    dataset_enc = event.data['dataset_enc']
    prog_checksum = event.data['checksum']
    global_params_enc = event.data['global_params_enc']
    with client.client_access() as c:
        program = c.user_programs[prog_checksum]
    runtime = program.get_new_program_runtime(runtime_id)
    runtime.prepare_datastructures(global_params_enc)
    runtime.load_data(dataset_enc)
    LOG.info('Loaded client program instance')
    return {}
def _normalize_server_args(args):
    """normalize server arguments"""
    # ensure port number is valid
    if args.port > 65535:
        LOG.error('port number invalid: %s', args.port)
        return None
    elif args.port < 1024:
        LOG.warning('port number requires root priv: %s', args.port)
    # NOTE: does not support ipv6 bind addrs and may allow some invalid addrs
    if len(args.host.split('.')) != 4:
        LOG.error('invalid bind addr: %s', args.host)
        return None
    return args
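# Quick usage sketch for _normalize_server_args(); argparse.Namespace stands
# in for the parsed cmdline args and the field names match the checks above.
import argparse

assert _normalize_server_args(
    argparse.Namespace(host='0.0.0.0', port=8000)) is not None
assert _normalize_server_args(
    argparse.Namespace(host='localhost', port=8000)) is None  # not a dotted quad
assert _normalize_server_args(
    argparse.Namespace(host='0.0.0.0', port=99999)) is None   # port out of range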
def shutdown_all_clients(max_wait=5, wait_interval=0.2):
    """
    shut down all clients because the server is terminating
    :max_wait: max time to wait for clients to shutdown before returning
    :wait_interval: poll interval to check if all clients have terminated
    """
    LOG.info('Instructing all clients to shutdown')
    with server.state_access() as s:
        s.get_all('/shutdown')
    for _ in range(int(max_wait / wait_interval)):
        time.sleep(wait_interval)
        with server.state_access() as s:
            client_count = len(s.clients)
        if client_count == 0:
            LOG.info('All clients terminated')
            break
    else:
        LOG.warning('Not all clients terminated, shutting down anyway')
    self.user_progs_dir = os.path.join(self.tmpdir, 'user_progs_server')
    os.mkdir(self.user_progs_dir)

def register_client(self, hardware, client_ip, client_port):
    """
    register a client with the server
    :hardware: hardware info dict
    :client_ip: addr of client
    :client_port: port number for client
    :returns: client uuid
    """
    client_uuid = util.hex_uuid()
    url = 'http://{}:{}'.format(client_ip, client_port)
    self.clients[client_uuid] = ClientState(client_uuid, hardware, url)
    with remote_event.remote_events_access() as r:
        r.register_client(client_uuid)
    self.all_clients_hardware[client_uuid] = hardware
    LOG.info('Registered client: %s', self.clients[client_uuid])
    return client_uuid

def unregister_client(self, client_uuid):
    """
    unregister a client with the server
    :client_uuid: client uuid
    :returns: client object
    client.create_client(args, tmpdir, hardware)
    # automatically find available port
    client_port = util.get_free_port()
    # start client api server
    call = functools.partial(
        client.APP.run, debug=False, host='0.0.0.0', port=client_port)
    thread = threading.Thread(target=call)
    thread.daemon = True
    thread.start()
    # register with server
    with client.client_access() as c:
        c.register(client_port)
    # start client worker
    worker = client_worker.ClientWorker()
    LOG.info('starting client worker')
    try:
        worker.run()
    except queue.Empty:
        with client.client_access() as c:
            c.shutdown()
    return 0


def run_server(args, tmpdir):
    """
    entrypoint for server
    :args: parsed cmdline args
    :tmpdir: temporary directory
    :returns: 0 on success
    """
def handle_event_run_program(event):
    """
    handle 'run_program' event
    :event: event to handle
    :returns: program result
    :raises: Exception: if error occurs or invalid request
    """
    runtime_id = util.hex_uuid()
    dataset_enc = event.data['dataset_enc']
    prog_checksum = event.data['checksum']
    global_params_enc = event.data['global_params_enc']
    init_path = os.path.join('/runtimes', prog_checksum, runtime_id)
    iterate_path = os.path.join(init_path, 'iterate')
    cleanup_path = os.path.join(init_path, 'cleanup')
    wakeup_ev = threading.Event()

    def multi_callback_wakeup(event_props):
        wakeup_ev.set()

    def runtime_init_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error on prog runtime init'.format(client))

    with server.state_access() as s:
        program = s.registered_progs[prog_checksum]
    if not program.ready:
        raise ValueError('cannot run program, not ready')
    runtime = program.get_new_server_runtime(runtime_id)
    runtime.prepare_datastructures(global_params_enc)
    runtime.partition_data(dataset_enc)
    runtime_init_remote_event_ids = []
    for client_uuid, dataset_enc in runtime.dataset_partitions_encoded.items():
        data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'dataset_enc': dataset_enc,
            'global_params_enc': global_params_enc,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            c = s.clients[client_uuid]
            res = c.post(init_path, data, callback_func=runtime_init_callback)
        runtime_init_remote_event_ids.append(res['event_id'])
    with remote_event.remote_events_access() as r:
        r.register_multi_callback(
            runtime_init_remote_event_ids, multi_callback_wakeup)
    wakeup_ev.wait(timeout=300)
    wakeup_ev.clear()
    LOG.info('Runtime initialized for user program: %s', program)
    aggregation_lock = threading.Lock()

    def run_iteration_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError('{}: error running prog iteration'.format(client))
        with aggregation_lock:
            runtime.aggregate(event_props['result']['aggregation_result_enc'])

    runtime.reset_aggregation_result()
    iteration_count = 0
    while True:
        post_data = {
            'runtime_id': runtime_id,
            'checksum': prog_checksum,
            'global_state_enc': runtime.global_state_encoded,
            'send_remote_event': True,
        }
        with server.state_access() as s:
            s.post_all(
                iterate_path, post_data,
                callback_func=run_iteration_callback,
                multi_callback_func=multi_callback_wakeup)
        wakeup_ev.wait(timeout=600)
        wakeup_ev.clear()
        runtime.update_global_state()
        runtime.reset_aggregation_result()
        LOG.debug('Completed iteration for user program: %s', program)
        iteration_count = iteration_count + 1
        if runtime.done:
            break
    LOG.info('Cleaning up...')

    def runtime_cleanup_callback(client, event_props):
        if event_props['status'] != events.EventStatus.SUCCESS.value:
            raise ValueError(
                '{}: error on prog runtime clean up'.format(client))

    post_data = {
        'runtime_id': runtime_id,
        'checksum': prog_checksum,
        'send_remote_event': True,
    }
    with server.state_access() as s:
        s.post_all(
            cleanup_path, post_data,
            callback_func=runtime_cleanup_callback,
            multi_callback_func=multi_callback_wakeup)
    wakeup_ev.wait(timeout=60)
    wakeup_ev.clear()
    LOG.info('Finished running user program: %s %i', program, iteration_count)
    return {
        'end_aggregate': runtime.top_level_aggregate_encoded,
        'end_global_state': runtime.global_state_encoded,
    }
def client_item(client_id):
    """
    GET,DELETE /clients/<client_id>: query clients
    :client_id: client uuid
    :returns: flask response
    """
    if request.method == 'GET':
        with server.state_access() as state:
            client = state.clients.get(client_id)
        return (respond_json(client.properties)
                if client else respond_error(404))
    elif request.method == 'DELETE':
        with server.state_access() as state:
            client = state.unregister_client(client_id)
        return Response("ok") if client is not None else respond_error(404)


@APP.route('/programs', methods=['GET', 'POST'])
def programs():
    """
    GET,POST /programs: register or list programs
    :returns: flask response
    """
    if request.method == 'POST':
        event_data = request.get_json()
        if not all(n in event_data for n in ('name', 'data', 'checksum')):
def build(
        self, cuda_bin=None, include_path=None, unpack=True,
        set_compute_level=True):
    """
    set up user program resources and build shared obj
    :cuda_bin: path to cuda tools bin
    :include_path: path to cuda include dir
    :unpack: if true, unpack program json
    :set_compute_level: if true, specify appropriate compute level
    """
    if not self.build_dir or not os.path.isdir(self.build_dir):
        raise ValueError("Build dir not set up")
    if unpack:
        files = ['cuda', 'python', 'header']
        if self.use_c_extention:
            files.append('cpp')
        self.unpack(files)
    build_files = ['Makefile']
    if self.use_c_extention:
        build_files.append('setup.py')
    self.copy_build_files(build_files)
    make_cmd = ['make', '-C', self.build_dir]
    if cuda_bin is not None:
        nvcc_path = os.path.join(cuda_bin, 'nvcc')
        make_cmd.append('NVCC={}'.format(nvcc_path))
    if include_path is not None:
        make_cmd.append('CUDA_L64=-L{}'.format(include_path))
    if set_compute_level:
        flag_value = '-arch={}'.format(self.compute_level)
        make_cmd.append('COMPUTE_LEVEL_FLAG={}'.format(flag_value))
        LOG.debug('Using compute level: %s', flag_value)
    else:
        LOG.warning('Using default compute level, not optimized')
    LOG.debug('Building CUDA shared object')
    util.subp(make_cmd)
    if self.use_c_extention:
        LOG.debug('Building Python wrapper module')
        # XXX
        # FIXME create hardcoded tmp dir used by dynamic linker
        shared_dll = 'user_program_cuda.so'
        tmp_dir = '/tmp/lizard-slayer/'
        pathlib.Path(tmp_dir).mkdir(exist_ok=True)
        for the_file in os.listdir(tmp_dir):
            file_path = os.path.join(tmp_dir, the_file)
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        setup_cmd = ['python3', 'setup.py', 'build_ext', '-b', tmp_dir]
        util.subp(setup_cmd, cwd=self.build_dir)
        # copy over the shared library to be found by the linker
        shutil.copyfile(os.path.join(self.build_dir, shared_dll),
                        os.path.join(tmp_dir, shared_dll))
        # FIXME remove path
        sys.path.append(tmp_dir)
        sys.path.append(self.build_dir)
        self.ready = True
    else:
        LOG.debug('No python c extension for user program')
        self.ready = True