Example #1
def func_get_request(active_model_name, qout):
    # Listen for connections
    server = TcpServer('localhost', 12345)

    while True:
        # Get connection
        conn, _ = server.accept()
        agent = TcpAgent(conn)

        model_name_length_b = agent.recv(4)
        model_name_length = struct.unpack('I', model_name_length_b)[0]
        if model_name_length == 0:
            break
        model_name_b = agent.recv(model_name_length)
        model_name = model_name_b.decode()
        if active_model_name not in model_name:
            raise Exception('Invalid model name')
        timestamp('tcp', 'get_name')

        data_length_b = agent.recv(4)
        data_length = struct.unpack('I', data_length_b)[0]
        if data_length > 0:
            data_b = agent.recv(data_length)
        else:
            data_b = None
        timestamp('tcp', 'get_data')
        qout.put((agent, data_b))
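
These snippets depend on TcpServer and TcpAgent, which are defined elsewhere in the project. Judging only from how they are used here (blocking accept, exact-length recv, raw byte send), a minimal sketch on top of Python's socket module might look like the following; the real classes may well differ.

import socket

class TcpServer:
    # Sketch of the listening-socket wrapper assumed by these examples
    def __init__(self, host, port):
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.sock.bind((host, port))
        self.sock.listen()

    def accept(self):
        # Returns (conn, addr), matching the `conn, _ = server.accept()` usage above
        return self.sock.accept()

class TcpAgent:
    # Sketch of the per-connection wrapper; recv(n) blocks until exactly n bytes arrive
    def __init__(self, conn):
        self.conn = conn

    def send(self, data):
        self.conn.sendall(data)

    def recv(self, n):
        chunks = []
        while n > 0:
            chunk = self.conn.recv(n)
            if not chunk:
                raise ConnectionError('Connection closed while receiving')
            chunks.append(chunk)
            n -= len(chunk)
        return b''.join(chunks)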
Example #2
def worker_compute(agent, model_name, data_b):
    # Load model
    model_module = importlib.import_module('task.' + model_name)
    model, func, _ = model_module.import_task()
    data_loader = model_module.import_data_loader()

    # Model to GPU
    model = model.to('cuda')

    # Compute
    if 'training' in model_name:
        agent.send(b'FNSH')
        del agent
        timestamp('server', 'reply')

        output = func(model, data_loader)
        timestamp('server', 'complete')

    else:
        output = func(model, data_b)
        timestamp('server', 'complete')

        agent.send(b'FNSH')
        del agent
        timestamp('server', 'reply')
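
timestamp(stage, event) is another project-local helper used throughout these examples; from its call sites it appears to record a stage/event label together with the current time. A minimal stand-in under that assumption could be the following (note that the zero-argument timestamp() in Examples #7 and #8 is a different helper that returns a value rather than logging one):

import time

def timestamp(stage, event):
    # Hypothetical stand-in: print the stage, the event, and the wall-clock time
    print('TIMESTAMP, %s, %s, %f' % (stage, event, time.time()))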
Example #3
def main():
    model_name = sys.argv[1]
    batch_size = int(sys.argv[2])

    task_name_inf = '%s_inference' % model_name
    task_name_train = '%s_training' % model_name

    # Load image
    data = get_data(model_name, batch_size)

    latency_list = []
    for _ in range(20):
        # Send training request
        client_train = TcpClient('localhost', 12345)
        send_request(client_train, task_name_train, None)
        time.sleep(4)

        # Connect
        client_inf = TcpClient('localhost', 12345)
        timestamp('client', 'after_inference_connect')
        time_1 = time.time()

        # Send inference request
        send_request(client_inf, task_name_inf, data)

        # Recv inference reply
        recv_response(client_inf)
        time_2 = time.time()
        latency = (time_2 - time_1) * 1000
        latency_list.append(latency)

        time.sleep(1)
        recv_response(client_train)
        close_connection(client_inf)
        close_connection(client_train)
        time.sleep(1)
        timestamp('**********', '**********')

    print()
    print()
    print()
    stable_latency_list = latency_list[10:]
    print(stable_latency_list)
    print('Latency: %f ms (stdev: %f)' %
          (statistics.mean(stable_latency_list),
           statistics.stdev(stable_latency_list)))
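
TcpClient is likewise defined elsewhere; from its use in the client examples (connect on construction, then send and fixed-length recv), a plausible sketch is shown below. This is an assumption drawn from usage, not the project's actual implementation.

import socket

class TcpClient:
    # Sketch of the client-side wrapper assumed by the client examples
    def __init__(self, host, port):
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.sock.connect((host, port))

    def send(self, data):
        self.sock.sendall(data)

    def recv(self, n):
        # Block until exactly n bytes have been read
        chunks = []
        while n > 0:
            chunk = self.sock.recv(n)
            if not chunk:
                raise ConnectionError('Connection closed while receiving')
            chunks.append(chunk)
            n -= len(chunk)
        return b''.join(chunks)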
Example #4
def main():
    timestamp('frontend', 'start')

    # Load model list
    model_list_file_name = sys.argv[1]
    model_list = []
    with open(model_list_file_name) as f:
        for line in f.readlines():
            model_list.append(line.strip())

    # Warm up CUDA and allocate shared cache
    torch.randn(1024, device='cuda')
    torch.cuda.allocate_shared_cache()

    # Create workers
    num_workers = 2
    worker_list = []
    for _ in range(num_workers):
        p_parent, p_child = mp.Pipe()
        param_trans_parent, param_trans_child = mp.Pipe()
        term_parent, term_child = mp.Pipe()
        worker = WorkerProc(model_list, p_child, param_trans_child, term_child)
        worker.start()
        torch.cuda.send_shared_cache()
        worker_list.append((p_parent, worker, param_trans_parent, term_parent))
        timestamp('frontend', 'create_worker')

    # Create request queue and scheduler thread
    requests_queue = Queue()
    t_sch = FrontendScheduleThd(model_list, requests_queue, worker_list)
    t_sch.start()
    timestamp('frontend', 'start_schedule')

    # Accept connections
    server = TcpServer('localhost', 12345)
    timestamp('tcp', 'listen')
    while True:
        conn, _ = server.accept()
        agent = TcpAgent(conn)
        timestamp('tcp', 'connected')
        t_tcp = FrontendTcpThd(requests_queue, agent)
        t_tcp.start()

    # Wait for end
    t_sch.join()
Example #5
def worker_compute(model_name, pipe):
    # Load model
    model, func = get_model(model_name)

    # Model to GPU
    model = model.eval().cuda()

    while True:
        agent, data_b = pipe.recv()

        # Compute
        output = func(model, data_b)
        timestamp('server', 'complete')

        agent.send(b'FNSH')
        timestamp('server', 'reply')

        del agent
Example #6
    def run(self):
        while True:
            timestamp('tcp', 'listening')
            
            model_name_length_b = self.agent.recv(4)
            model_name_length = struct.unpack('I', model_name_length_b)[0]
            if model_name_length == 0:
                break
            model_name_b = self.agent.recv(model_name_length)
            model_name = model_name_b.decode()
            self.qout.put((self.agent, model_name))
            timestamp('tcp', 'get_name')

            data_length_b = self.agent.recv(4)
            data_length = struct.unpack('I', data_length_b)[0]
            if data_length > 0:
                data_b = self.agent.recv(data_length)
            else:
                data_b = None
            timestamp('tcp', 'get_data')
            self.qout.put(data_b)
            timestamp('tcp', 'enqueue_request')
Example #7
def format_ai21_response(response, model):
    prompt = response['prompt']['text']
    response_dict = {
        'completions': [
            format_ai21_completion(completion, prompt_offset=len(prompt))
            for completion in response['completions']
        ],
        'prompt': {
            'text': prompt,
            'tokens': [
                format_ai21_token_data(token, prompt_offset=0)
                for token in response['prompt']['tokens']
            ]
        },
        'id': response['id'],
        'model': model,
        'timestamp': timestamp()
    }
    return response_dict
Example #8
def format_openAI_response(response, prompt, echo=True):
    if echo:
        prompt_dict, prompt_end_index = format_openAI_prompt(
            response['choices'][0], prompt)
    else:
        prompt_dict = {'text': prompt, 'tokens': None}
        prompt_end_index = 0
        #prompt = ''

    response_dict = {
        'completions': [
            format_openAI_completion(completion, prompt, prompt_end_index)
            for completion in response['choices']
        ],
        'prompt': prompt_dict,
        'id': response['id'],
        'model': response['model'],
        'timestamp': timestamp()
    }
    return response_dict
Example #9
def send_request(client, task_name, data):
    timestamp('client', 'before_request_%s' % task_name)

    # Serialize data
    task_name_b = task_name.encode()
    task_name_length = len(task_name_b)
    task_name_length_b = struct.pack('I', task_name_length)

    if data is not None:
        data_b = data.numpy().tobytes()
        length = len(data_b)
    else:
        data_b = None
        length = 0
    length_b = struct.pack('I', length)
    timestamp('client', 'after_inference_serialization')

    # Send Data
    client.send(task_name_length_b)
    client.send(task_name_b)
    client.send(length_b)
    if data_b is not None:
        client.send(data_b)
    timestamp('client', 'after_request_%s' % task_name)
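
For reference, send_request and the receive loops in Examples #1 and #6 agree on a simple length-prefixed wire format: a 4-byte native-order unsigned task-name length, the task-name bytes, a 4-byte unsigned payload length, then the payload (a name length of 0 tells the server the client is finished). A minimal server-side parser for one such request, assuming an agent whose recv(n) returns exactly n bytes, could look like this; recv_request is a hypothetical helper, not part of the original code.

import struct

def recv_request(agent):
    # Hypothetical helper: parse one length-prefixed request from the agent
    name_length = struct.unpack('I', agent.recv(4))[0]
    if name_length == 0:
        return None  # Zero-length name: the client is closing the connection
    task_name = agent.recv(name_length).decode()
    data_length = struct.unpack('I', agent.recv(4))[0]
    data_b = agent.recv(data_length) if data_length > 0 else None
    return task_name, data_b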
Example #10
    def run(self):
        timestamp('worker', 'start')

        # Warm up CUDA and get shared cache
        torch.randn(1024, device='cuda')
        time.sleep(1)
        torch.cuda.recv_shared_cache()  # pylint: disable=no-member
        timestamp('worker', 'share_gpu_memory')

        # Create required variables
        model_map = {}
        TERMINATE_SIGNAL = [0]  # 0 Idle, 1 Running, 2 Terminate
        complete_queue = Queue()

        # Import models
        for model_name in self.model_list:
            model_summary = ModelSummary(model_name, TERMINATE_SIGNAL,
                                         self.param_trans_pipe)
            model_map[hash(model_name)] = model_summary
        timestamp('worker', 'import models')

        # ------- start terminate thread -----------
        term_t = WorkerTermThd(self.term_pipe, complete_queue,
                               TERMINATE_SIGNAL)
        term_t.start()
        timestamp('worker', 'start_term_thd')
        # ------- terminate thread started ---------

        while True:
            # Event loop: receive a request message, then run the computation.
            # Completion is reported back through complete_queue, which the
            # terminate thread consumes.
            agent, model_name = self.pipe.recv()
            model_summary = model_map[hash(model_name)]
            TERMINATE_SIGNAL[0] = 1
            timestamp('worker_proc', 'get_model')

            data_b = self.pipe.recv()
            timestamp('worker_proc', 'get_data')

            # Start doing inference; the frontend scheduler puts
            # mod_list[0] directly into self.complete_queue_trans.
            try:
                if 'training' in model_name:
                    self.pipe.send('FNSH')
                    agent.send(b'FNSH')

                with torch.cuda.stream(
                        model_summary.cuda_stream_for_computation):
                    output = model_summary.execute(data_b)
                    print('Get output', output)
                    del output

                if 'inference' in model_name:
                    self.pipe.send('FNSH')
                    agent.send(b'FNSH')
            except Exception:
                complete_queue.put('FNSH')

            # Start cleanup
            TERMINATE_SIGNAL[0] = 0
            timestamp('worker_comp_thd', 'complete')

            model_summary.reset_initialized(model_summary.model)
Example #11
    def run(self):
        timestamp('schedule', 'start')

        # Load models
        models = {}
        for model_name in self.model_list:
            models[hash(model_name)] = self._load_model(model_name)
        timestamp('schedule', 'load_model')

        # Create CUDA stream
        cuda_stream_for_parameter = torch.cuda.Stream()
        timestamp('schedule', 'create_stream')

        while True:
            # Get request
            agent, model_name = self.qin.get()
            timestamp('schedule', 'get_request')

            # Get current worker
            _, _, _, term_pipe = self.worker_list[self.cur_w_idx]
            timestamp('schedule', 'get_current_worker')
            # Send terminate signal to current worker
            term_pipe.send('terminate')

            # Get next worker to work on request
            self.cur_w_idx += 1
            self.cur_w_idx %= len(self.worker_list)
            new_pipe, _, param_trans_pipe_parent, _ = self.worker_list[
                self.cur_w_idx]

            # Send request to new worker
            new_pipe.send((agent, model_name))
            timestamp('schedule', 'notify_new_worker')

            # Wait for current worker to terminate
            resp = term_pipe.recv()
            timestamp('schedule', 'terminate_current_worker')

            # Transfer data to GPU
            data_b = self.qin.get()
            new_pipe.send(data_b)
            timestamp('schedule', 'send_data')

            # Allocate cache to streams
            with torch.cuda.stream(cuda_stream_for_parameter):
                torch.cuda.insert_shared_cache_for_parameter()  # pylint: disable=no-member
            timestamp('schedule', 'insert_cache')
            # Transfer parameters to GPU
            batched_parameter_list = models[hash(model_name)]
            self._transfer_parameter(new_pipe, batched_parameter_list,
                                     cuda_stream_for_parameter,
                                     param_trans_pipe_parent)
            timestamp('schedule', 'transfer_parameters')

            # Clear status
            with torch.cuda.stream(cuda_stream_for_parameter):
                torch.cuda.clear_shared_cache()  # pylint: disable=no-member
            timestamp('schedule', 'clear_status')

            # Recv response
            res = new_pipe.recv()
            timestamp('schedule', 'get_response')
Example #12
def main():
    model_name = sys.argv[1]
    batch_size = int(sys.argv[2])

    # Load image
    data = get_data(model_name, batch_size)

    latency_list = []
    for _ in range(100):
        timestamp('client', 'before_request')

        # Connect
        client = TcpClient('localhost', 12345)
        timestamp('client', 'after_connect')
        time_1 = time.time()

        # Serialize data
        task_name = model_name + '_inference'
        task_name_b = task_name.encode()
        task_name_length = len(task_name_b)
        task_name_length_b = struct.pack('I', task_name_length)
        data_b = data.numpy().tobytes()
        length = len(data_b)
        length_b = struct.pack('I', length)
        timestamp('client', 'after_serialization')

        # Send Data
        client.send(task_name_length_b)
        client.send(task_name_b)
        client.send(length_b)
        client.send(data_b)
        timestamp('client', 'after_send')

        # Get reply
        reply_b = client.recv(4)
        reply = reply_b.decode()
        if reply == 'FAIL':
            timestamp('client', 'FAIL')
            break
        timestamp('client', 'after_reply')
        time_2 = time.time()

        model_name_length = 0
        model_name_length_b = struct.pack('I', model_name_length)
        client.send(model_name_length_b)
        timestamp('client', 'close_training_connection')

        timestamp('**********', '**********')
        latency = (time_2 - time_1) * 1000
        latency_list.append(latency)

        # time.sleep(1)

    print()
    print()
    print()
    stable_latency_list = latency_list[10:]
    print('Latency: %f ms (stdev: %f)' %
          (statistics.mean(stable_latency_list),
           statistics.stdev(stable_latency_list)))
Example #13
def close_connection(client):
    model_name_length = 0
    model_name_length_b = struct.pack('I', model_name_length)
    client.send(model_name_length_b)
    timestamp('client', 'close_connection')
Example #14
def recv_response(client):
    reply_b = client.recv(4)
    reply = reply_b.decode()
    timestamp('client', 'after_reply')