def func_get_request(active_model_name, qout):
    # Listen for connections
    server = TcpServer('localhost', 12345)

    while True:
        # Get connection
        conn, _ = server.accept()
        agent = TcpAgent(conn)

        model_name_length_b = agent.recv(4)
        model_name_length = struct.unpack('I', model_name_length_b)[0]
        if model_name_length == 0:
            break
        model_name_b = agent.recv(model_name_length)
        model_name = model_name_b.decode()
        if active_model_name not in model_name:
            raise Exception('Invalid model name')
        timestamp('tcp', 'get_name')

        data_length_b = agent.recv(4)
        data_length = struct.unpack('I', data_length_b)[0]
        if data_length > 0:
            data_b = agent.recv(data_length)
        else:
            data_b = None
        timestamp('tcp', 'get_data')

        qout.put((agent, data_b))
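# The snippets in this listing rely on TcpServer/TcpAgent/TcpClient wrappers whose
# implementations are not shown. Below is a minimal sketch of what they might look
# like, assuming plain blocking sockets and a recv(n) that must return exactly n
# bytes (hypothetical reconstruction, not the original classes):

import socket


def _recv_exact(sock, length):
    # A bare socket.recv may return fewer bytes than requested, so loop.
    chunks = []
    remaining = length
    while remaining > 0:
        chunk = sock.recv(remaining)
        if not chunk:
            raise ConnectionError('connection closed while receiving')
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)


class TcpServer:
    def __init__(self, host, port):
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.sock.bind((host, port))
        self.sock.listen()

    def accept(self):
        return self.sock.accept()


class TcpAgent:
    def __init__(self, conn):
        self.conn = conn

    def send(self, data):
        self.conn.sendall(data)

    def recv(self, length):
        return _recv_exact(self.conn, length)


class TcpClient(TcpAgent):
    def __init__(self, host, port):
        conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        conn.connect((host, port))
        super().__init__(conn)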
def worker_compute(agent, model_name, data_b):
    # Load model
    model_module = importlib.import_module('task.' + model_name)
    model, func, _ = model_module.import_task()
    data_loader = model_module.import_data_loader()

    # Model to GPU
    model = model.to('cuda')

    # Compute
    if 'training' in model_name:
        agent.send(b'FNSH')
        del agent
        timestamp('server', 'reply')
        output = func(model, data_loader)
        timestamp('server', 'complete')
    else:
        output = func(model, data_b)
        timestamp('server', 'complete')
        agent.send(b'FNSH')
        del agent
        timestamp('server', 'reply')
def main():
    model_name = sys.argv[1]
    batch_size = int(sys.argv[2])
    task_name_inf = '%s_inference' % model_name
    task_name_train = '%s_training' % model_name

    # Load image
    data = get_data(model_name, batch_size)

    latency_list = []
    for _ in range(20):
        # Send training request
        client_train = TcpClient('localhost', 12345)
        send_request(client_train, task_name_train, None)
        time.sleep(4)

        # Connect
        client_inf = TcpClient('localhost', 12345)
        timestamp('client', 'after_inference_connect')
        time_1 = time.time()

        # Send inference request
        send_request(client_inf, task_name_inf, data)

        # Recv inference reply
        recv_response(client_inf)
        time_2 = time.time()
        latency = (time_2 - time_1) * 1000
        latency_list.append(latency)

        time.sleep(1)
        recv_response(client_train)
        close_connection(client_inf)
        close_connection(client_train)
        time.sleep(1)
        timestamp('**********', '**********')

    print()
    print()
    print()
    stable_latency_list = latency_list[10:]
    print(stable_latency_list)
    print('Latency: %f ms (stdev: %f)' % (
        statistics.mean(stable_latency_list),
        statistics.stdev(stable_latency_list)))
def main():
    timestamp('frontend', 'start')

    # Load model list
    model_list_file_name = sys.argv[1]
    model_list = []
    with open(model_list_file_name) as f:
        for line in f.readlines():
            model_list.append(line.strip())

    # Warm up CUDA and allocate shared cache
    torch.randn(1024, device='cuda')
    torch.cuda.allocate_shared_cache()

    # Create workers
    num_workers = 2
    worker_list = []
    for _ in range(num_workers):
        p_parent, p_child = mp.Pipe()
        param_trans_parent, param_trans_child = mp.Pipe()
        term_parent, term_child = mp.Pipe()
        worker = WorkerProc(model_list, p_child, param_trans_child, term_child)
        worker.start()
        torch.cuda.send_shared_cache()
        worker_list.append((p_parent, worker, param_trans_parent, term_parent))
        timestamp('frontend', 'create_worker')

    # Create request queue and scheduler thread
    requests_queue = Queue()
    t_sch = FrontendScheduleThd(model_list, requests_queue, worker_list)
    t_sch.start()
    timestamp('frontend', 'start_schedule')

    # Accept connections
    server = TcpServer('localhost', 12345)
    timestamp('tcp', 'listen')
    while True:
        conn, _ = server.accept()
        agent = TcpAgent(conn)
        timestamp('tcp', 'connected')
        t_tcp = FrontendTcpThd(requests_queue, agent)
        t_tcp.start()

    # Wait for end
    t_sch.join()
def worker_compute(model_name, pipe):
    # Load model
    model, func = get_model(model_name)

    # Model to GPU
    model = model.eval().cuda()

    while True:
        agent, data_b = pipe.recv()

        # Compute
        output = func(model, data_b)
        timestamp('server', 'complete')

        agent.send(b'FNSH')
        timestamp('server', 'reply')
        del agent
def run(self):
    while True:
        timestamp('tcp', 'listening')
        model_name_length_b = self.agent.recv(4)
        model_name_length = struct.unpack('I', model_name_length_b)[0]
        if model_name_length == 0:
            break
        model_name_b = self.agent.recv(model_name_length)
        model_name = model_name_b.decode()
        self.qout.put((self.agent, model_name))
        timestamp('tcp', 'get_name')

        data_length_b = self.agent.recv(4)
        data_length = struct.unpack('I', data_length_b)[0]
        if data_length > 0:
            data_b = self.agent.recv(data_length)
        else:
            data_b = None
        timestamp('tcp', 'get_data')
        self.qout.put(data_b)
        timestamp('tcp', 'enqueue_request')
def format_ai21_response(response, model):
    prompt = response['prompt']['text']
    response_dict = {
        'completions': [
            format_ai21_completion(completion, prompt_offset=len(prompt))
            for completion in response['completions']
        ],
        'prompt': {
            'text': prompt,
            'tokens': [
                format_ai21_token_data(token, prompt_offset=0)
                for token in response['prompt']['tokens']
            ]
        },
        'id': response['id'],
        'model': model,
        'timestamp': timestamp()
    }
    return response_dict
def format_openAI_response(response, prompt, echo=True):
    if echo:
        prompt_dict, prompt_end_index = format_openAI_prompt(
            response['choices'][0], prompt)
    else:
        prompt_dict = {'text': prompt, 'tokens': None}
        prompt_end_index = 0
        # prompt = ''
    response_dict = {
        'completions': [
            format_openAI_completion(completion, prompt, prompt_end_index)
            for completion in response['choices']
        ],
        'prompt': prompt_dict,
        'id': response['id'],
        'model': response['model'],
        'timestamp': timestamp()
    }
    return response_dict
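# Both formatters above normalize provider-specific responses into the same shape.
# The fields inside each completion/token entry depend on the format_* helpers,
# which are not shown in this listing; a hypothetical result (illustrative values
# only) might look like:
#
# {
#     'completions': [...],          # one entry per returned completion
#     'prompt': {
#         'text': 'Once upon a time',
#         'tokens': [...]            # per-token data, or None when echo=False
#     },
#     'id': 'cmpl-abc123',
#     'model': 'text-davinci-003',
#     'timestamp': 1699999999        # whatever timestamp() returns here
# }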
def send_request(client, task_name, data):
    timestamp('client', 'before_request_%s' % task_name)

    # Serialize data
    task_name_b = task_name.encode()
    task_name_length = len(task_name_b)
    task_name_length_b = struct.pack('I', task_name_length)

    if data is not None:
        data_b = data.numpy().tobytes()
        length = len(data_b)
    else:
        data_b = None
        length = 0
    length_b = struct.pack('I', length)
    timestamp('client', 'after_inference_serialization')

    # Send data
    client.send(task_name_length_b)
    client.send(task_name_b)
    client.send(length_b)
    if data_b is not None:
        client.send(data_b)
    timestamp('client', 'after_request_%s' % task_name)
def run(self):
    timestamp('worker', 'start')

    # Warm up CUDA and get shared cache
    torch.randn(1024, device='cuda')
    time.sleep(1)
    torch.cuda.recv_shared_cache()  # pylint: disable=no-member
    timestamp('worker', 'share_gpu_memory')

    # Create required variables
    model_map = {}
    TERMINATE_SIGNAL = [0]  # 0: idle, 1: running, 2: terminate
    complete_queue = Queue()

    # Import models
    for model_name in self.model_list:
        model_summary = ModelSummary(model_name, TERMINATE_SIGNAL,
                                     self.param_trans_pipe)
        model_map[hash(model_name)] = model_summary
    timestamp('worker', 'import models')

    # Start terminate thread
    term_t = WorkerTermThd(self.term_pipe, complete_queue, TERMINATE_SIGNAL)
    term_t.start()
    timestamp('worker', 'start_term_thd')

    while True:
        # Event loop: receive a request, then compute
        agent, model_name = self.pipe.recv()
        model_summary = model_map[hash(model_name)]
        TERMINATE_SIGNAL[0] = 1
        timestamp('worker_proc', 'get_model')

        data_b = self.pipe.recv()
        timestamp('worker_proc', 'get_data')

        # Start inference/training; the frontend scheduler puts mod_list[0]
        # directly into self.complete_queue_trans
        try:
            if 'training' in model_name:
                self.pipe.send('FNSH')
                agent.send(b'FNSH')
            with torch.cuda.stream(model_summary.cuda_stream_for_computation):
                output = model_summary.execute(data_b)
                print('Get output', output)
                del output
            if 'inference' in model_name:
                self.pipe.send('FNSH')
                agent.send(b'FNSH')
        except Exception:
            complete_queue.put('FNSH')

        # Clean up
        TERMINATE_SIGNAL[0] = 0
        timestamp('worker_comp_thd', 'complete')
        model_summary.reset_initialized(model_summary.model)
def run(self):
    timestamp('schedule', 'start')

    # Load models
    models = {}
    for model_name in self.model_list:
        models[hash(model_name)] = self._load_model(model_name)
    timestamp('schedule', 'load_model')

    # Create CUDA stream
    cuda_stream_for_parameter = torch.cuda.Stream()
    timestamp('schedule', 'create_stream')

    while True:
        # Get request
        agent, model_name = self.qin.get()
        timestamp('schedule', 'get_request')

        # Get current worker
        _, _, _, term_pipe = self.worker_list[self.cur_w_idx]
        timestamp('schedule', 'get_current_worker')

        # Send terminate signal to current worker
        term_pipe.send('terminate')

        # Get next worker to work on request
        self.cur_w_idx += 1
        self.cur_w_idx %= len(self.worker_list)
        new_pipe, _, param_trans_pipe_parent, _ = self.worker_list[self.cur_w_idx]

        # Send request to new worker
        new_pipe.send((agent, model_name))
        timestamp('schedule', 'notify_new_worker')

        # Wait for current worker to terminate
        resp = term_pipe.recv()
        timestamp('schedule', 'terminate_current_worker')

        # Transfer data to GPU
        data_b = self.qin.get()
        new_pipe.send(data_b)
        timestamp('schedule', 'send_data')

        # Allocate cache to streams
        with torch.cuda.stream(cuda_stream_for_parameter):
            torch.cuda.insert_shared_cache_for_parameter()  # pylint: disable=no-member
        timestamp('schedule', 'insert_cache')

        # Transfer parameters to GPU
        batched_parameter_list = models[hash(model_name)]
        self._transfer_parameter(new_pipe,
                                 batched_parameter_list,
                                 cuda_stream_for_parameter,
                                 param_trans_pipe_parent)
        timestamp('schedule', 'transfer_parameters')

        # Clear status
        with torch.cuda.stream(cuda_stream_for_parameter):
            torch.cuda.clear_shared_cache()  # pylint: disable=no-member
        timestamp('schedule', 'clear_status')

        # Recv response
        res = new_pipe.recv()
        timestamp('schedule', 'get_response')
def main():
    model_name = sys.argv[1]
    batch_size = int(sys.argv[2])

    # Load image
    data = get_data(model_name, batch_size)

    latency_list = []
    for _ in range(100):
        timestamp('client', 'before_request')

        # Connect
        client = TcpClient('localhost', 12345)
        timestamp('client', 'after_connect')
        time_1 = time.time()

        # Serialize data
        task_name = model_name + '_inference'
        task_name_b = task_name.encode()
        task_name_length = len(task_name_b)
        task_name_length_b = struct.pack('I', task_name_length)

        data_b = data.numpy().tobytes()
        length = len(data_b)
        length_b = struct.pack('I', length)
        timestamp('client', 'after_serialization')

        # Send data
        client.send(task_name_length_b)
        client.send(task_name_b)
        client.send(length_b)
        client.send(data_b)
        timestamp('client', 'after_send')

        # Get reply
        reply_b = client.recv(4)
        reply = reply_b.decode()
        if reply == 'FAIL':
            timestamp('client', 'FAIL')
            break
        timestamp('client', 'after_reply')
        time_2 = time.time()

        # Close the connection (a zero-length task name signals close)
        model_name_length = 0
        model_name_length_b = struct.pack('I', model_name_length)
        client.send(model_name_length_b)
        timestamp('client', 'close_training_connection')
        timestamp('**********', '**********')

        latency = (time_2 - time_1) * 1000
        latency_list.append(latency)
        # time.sleep(1)

    print()
    print()
    print()
    stable_latency_list = latency_list[10:]
    print('Latency: %f ms (stdev: %f)' % (
        statistics.mean(stable_latency_list),
        statistics.stdev(stable_latency_list)))
def close_connection(client):
    # A zero-length task name tells the server to close the connection
    model_name_length = 0
    model_name_length_b = struct.pack('I', model_name_length)
    client.send(model_name_length_b)
    timestamp('client', 'close_connection')
def recv_response(client):
    reply_b = client.recv(4)
    reply = reply_b.decode()
    timestamp('client', 'after_reply')
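# A hypothetical end-to-end use of the client helpers above, assuming the TcpClient
# sketch earlier in this listing and a server on localhost:12345; the task name
# 'resnet152_inference' is only an example:

if __name__ == '__main__':
    client = TcpClient('localhost', 12345)
    send_request(client, 'resnet152_inference', None)  # data=None sends a zero-length payload
    recv_response(client)     # blocks until the 4-byte 'FNSH'/'FAIL' reply arrives
    close_connection(client)  # zero-length task name tells the server to close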