def run(self):
    TIMEOUT = 1000

    buffer = experience_buffer()
    self.poller.modify(self.sock, ALL_FLAGS)
    curr_flags = ALL_FLAGS
    rall = 0

    while self.running:
        # Register for write events only while the congestion window has
        # room; otherwise poll only for reads and errors.
        if self.window_is_open():
            if curr_flags != ALL_FLAGS:
                self.poller.modify(self.sock, ALL_FLAGS)
                curr_flags = ALL_FLAGS
        else:
            if curr_flags != READ_ERR_FLAGS:
                self.poller.modify(self.sock, READ_ERR_FLAGS)
                curr_flags = READ_ERR_FLAGS

        # poll() returns a possibly-empty list of (fd, event) 2-tuples
        # for the registered descriptors that have events or errors to
        # report.
        events = self.poller.poll(TIMEOUT)

        if not events:  # timed out
            self.send()

        for fd, flag in events:
            # fileno() returns the socket's file descriptor (a small integer).
            assert self.sock.fileno() == fd

            if flag & ERR_FLAGS:
                sys.exit('Error occurred to the channel')

            if flag & READ_FLAGS:
                # Augment the pre-receive state with a one-hot encoding
                # of the last action, then record the transition.
                s0 = self.state
                norm_state = normalize(s0)
                one_hot_action = one_hot(self.action, self.action_cnt)
                s0 = norm_state + one_hot_action

                s1, action, reward, done = self.recv()

                norm_state = normalize(s1)
                one_hot_action = one_hot(self.action, self.action_cnt)
                s1 = norm_state + one_hot_action

                buffer.add([[s0, action, reward, s1, done]])
                rall += reward

            if flag & WRITE_FLAGS:
                if self.window_is_open():
                    self.send()

    return buffer, rall
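# experience_buffer is not defined in this file. Below is a minimal sketch
# of the interface run() relies on -- add() taking a list of
# [s0, action, reward, s1, done] rows and sample() drawing a minibatch --
# assuming a fixed-capacity FIFO buffer (the capacity is an assumption):
import random


class experience_buffer(object):
    def __init__(self, buffer_size=50000):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        # Evict the oldest rows if the new ones would overflow capacity.
        overflow = len(self.buffer) + len(experience) - self.buffer_size
        if overflow > 0:
            self.buffer = self.buffer[overflow:]
        self.buffer.extend(experience)

    def sample(self, size):
        return random.sample(self.buffer, size)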
def sample_action(self, state):
    norm_state = normalize(state)
    one_hot_action = one_hot(self.prev_action, self.action_cnt)
    aug_state = norm_state + one_hot_action

    # Debug: log the augmented state fed to the network.
    self.logger.warning('RUN_SENDER: aug_state is: ' + str(aug_state))

    # Get the probability of each action from the local network.
    pi = self.model
    feed_dict = {
        pi.input: [[aug_state]],
        pi.state_in: self.lstm_state,
    }
    ops_to_run = [pi.action_probs, pi.state_out]
    action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

    # Choose the greedy action and remember it for the next step.
    action = np.argmax(action_probs[0][0])
    self.prev_action = action

    return action
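# normalize() and one_hot() are assumed to come from a shared helpers
# module. Minimal sketches consistent with how they are used here:
# one_hot() encodes the previous action so it can be concatenated onto the
# state, and normalize() rescales the raw state features (the scaling
# constants below are assumptions, not the project's actual values):
def one_hot(action, action_cnt):
    ret = [0.0] * action_cnt
    ret[action] = 1.0
    return ret


def normalize(state):
    # Hypothetical per-feature scales for
    # [delay_ewma, delivery_rate_ewma, send_rate_ewma, cwnd].
    scales = [100.0, 100.0, 100.0, 25.0]
    return [x / s for x, s in zip(state, scales)]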
def recv(self):
    serialized_ack, addr = self.sock.recvfrom(1600)

    if addr != self.peer_addr:
        # Ignore datagrams from unknown peers, but still return a
        # well-formed tuple so callers can unpack it (the original
        # returned None here, which would crash the caller).
        return self.state, self.action, 0, False

    ack = datagram_pb2.Ack()
    ack.ParseFromString(serialized_ack)

    action = self.action
    self.update_state(ack)

    if self.step_start_ms is None:
        self.step_start_ms = curr_ts_ms()

    done = False
    reward = 0

    # At each step's end, feed the state to the network.
    if curr_ts_ms() - self.step_start_ms > self.step_len_ms:
        self.state = [self.delay_ewma,
                      self.delivery_rate_ewma,
                      self.send_rate_ewma,
                      self.cwnd]

        # Time how long it takes to get an action from the NN.
        if self.debug:
            start_sample = time.time()

        norm_state = normalize(self.state)
        one_hot_action = one_hot(self.action, self.action_cnt)
        state = norm_state + one_hot_action
        self.action = self.sample_action(state)

        if self.debug:
            self.sampling_file.write(
                '%.2f ms\n' % ((time.time() - start_sample) * 1000))

        self.take_action(self.action)

        # (Optionally the EWMA filters could be reset here at each step's
        # end; the original left this disabled.)
        self.step_start_ms = curr_ts_ms()

        if self.train:
            self.step_cnt += 1
            reward = self.compute_performance()

            if self.step_cnt >= Sender.max_steps:
                self.step_cnt = 0
                self.running = False
                done = True

    return self.state, action, reward, done
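# curr_ts_ms() is assumed to return a steadily increasing timestamp in
# milliseconds; a minimal sketch:
import time


def curr_ts_ms():
    return int(time.time() * 1000)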
def sample_action(self, state):
    norm_state = normalize(state)
    one_hot_action = one_hot(self.prev_action, self.action_cnt)
    aug_state = norm_state + one_hot_action

    btime = time.time()

    # Get the probability of each action from the local network.
    pi = self.model
    feed_dict = {
        pi.input: [[aug_state]],
        pi.state_in: self.lstm_state,
    }
    ops_to_run = [pi.action_probs, pi.state_out]
    action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

    # Choose the greedy action and remember it for the next step.
    action = np.argmax(action_probs[0][0])
    self.prev_action = action

    info = 'make decision {} to {} with {}s\n'.format(
        action, state[3], time.time() - btime)
    self.log.write(info)

    # Alternatives: sample stochastically, optionally with a temperature.
    # action = np.argmax(np.random.multinomial(1, action_probs[0] - 1e-5))
    # temperature = 1.0
    # temp_probs = softmax(action_probs[0] / temperature)
    # action = np.argmax(np.random.multinomial(1, temp_probs - 1e-5))

    return action, aug_state
def sample_action(self, state):
    norm_state = normalize(state)
    one_hot_action = one_hot(self.prev_action, self.action_cnt)
    aug_state = norm_state + one_hot_action

    # Get the probability of each action from the main Q-network.
    pi = self.mainQN
    feed_dict = {pi.input: [[aug_state]]}
    ops_to_run = pi.action_probs
    action_probs = self.sess.run(ops_to_run, feed_dict)

    # Choose the greedy action and remember it for the next step.
    action = np.argmax(action_probs[0][0])
    self.prev_action = action

    return action
def sample_action(self, state):
    """Given a state buffer in the past step, returns an action to perform.

    Appends to the state/action buffers the state and the "correct"
    action to take according to the expert.
    """
    cwnd = state[self.state_dim - 1]
    expert_action = self.expert.sample_action(cwnd)

    # For decision-making, normalize.
    norm_state = normalize(state)
    one_hot_action = one_hot(self.prev_action, self.action_cnt)
    aug_state = norm_state + one_hot_action

    # Fill in state_buf, action_buf.
    self.state_buf.append(aug_state)
    self.action_buf.append(expert_action)

    # Always use the expert on the first episode to get our bearings.
    if self.curr_ep == 0:
        self.prev_action = expert_action
        return expert_action

    # Get the probability of each action from the local network.
    pi = self.local_network
    feed_dict = {
        pi.input: [[aug_state]],
        pi.state_in: self.lstm_state,
    }
    ops_to_run = [pi.action_probs, pi.state_out]
    action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

    # Choose an action to take and update the current LSTM state.
    # action = np.argmax(np.random.multinomial(1, action_probs[0][0] - 1e-5))
    action = np.argmax(action_probs[0][0])
    self.prev_action = action

    return action
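# self.expert is assumed to expose sample_action(cwnd) -> action index. A
# hypothetical rule-based expert that nudges cwnd toward a known-good value
# might look like the sketch below (best_cwnd and the action deltas are
# assumptions, not the project's actual expert policy):
class NaiveExpert(object):
    def __init__(self, best_cwnd=25, action_deltas=(-10, -3, 0, 3, 10)):
        self.best_cwnd = best_cwnd
        self.action_deltas = action_deltas

    def sample_action(self, cwnd):
        # Pick the action whose cwnd adjustment lands closest to the target.
        errors = [abs(cwnd + delta - self.best_cwnd)
                  for delta in self.action_deltas]
        return errors.index(min(errors))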
def sample_action(self, state):
    # Epsilon-greedy exploration: e is the exploration rate, assumed to be
    # defined at module level.
    if np.random.rand(1) < e:
        action = np.random.randint(0, self.env.action_cnt)
    else:
        norm_state = normalize(state)
        one_hot_action = one_hot(self.prev_action, self.action_cnt)
        aug_state = norm_state + one_hot_action

        # Get the probability of each action from the main Q-network.
        pi = self.mainQN
        feed_dict = {
            pi.state: [[aug_state]],
        }
        ops_to_run = [pi.action_probs]
        action_probs = self.sess.run(ops_to_run, feed_dict)

        # Choose the greedy action.
        action = np.argmax(action_probs[0][0])

    # Remember the action for the next step's one-hot encoding, whether it
    # was random or greedy.
    self.prev_action = action
    return action
def sample_action(self, state):
    """Given a state buffer in the past step, returns an action to perform.

    Stores the transition into replay memory, samples a training batch,
    and queries the local network for the next action.
    """
    cwnd = state[3]
    # expert_action = self.expert.sample_action(cwnd)

    # For decision-making, normalize.
    norm_state = normalize(state)
    one_hot_action = one_hot(self.prev_action, self.action_cnt)
    aug_state = norm_state + one_hot_action

    # Reward is the change in utility since the previous step. Store the
    # transition as [aug_state | prev_action, r | prev_state], replacing
    # the oldest row once the memory is full.
    r = self.utility(aug_state) - self.prev_utility
    transition = np.hstack((aug_state, [self.prev_action, r],
                            self.prev_state))
    index = self.memory_counter % self.memory_size
    self.memory[index, :] = transition
    self.memory_counter += 1

    # Refresh the previous state and utility.
    self.prev_utility = self.utility(aug_state)
    self.prev_state = aug_state

    # Sample a batch of transitions from whatever part of the memory has
    # been filled so far.
    if self.memory_counter > self.memory_size:
        sample_index = np.random.choice(self.memory_size,
                                        size=self.batch_size)
    else:
        sample_index = np.random.choice(self.memory_counter,
                                        size=self.batch_size)
    batch_memory = self.memory[sample_index, :]
    # TODO: train the current network on batch_memory.

    # Expert supervision from the DAgger variant, left disabled here:
    # self.state_buf.append(aug_state)
    # self.action_buf.append(expert_action)
    # if self.curr_ep == 0:
    #     self.prev_action = expert_action
    #     return expert_action

    # Get the probability of each action from the local network.
    pi = self.local_network
    feed_dict = {
        pi.input: [[aug_state]],
        pi.state_in: self.lstm_state,
    }
    ops_to_run = [pi.action_probs, pi.state_out]
    action_probs, self.lstm_state = self.sess.run(ops_to_run, feed_dict)

    # Choose an action to take and update the current LSTM state.
    # action = np.argmax(np.random.multinomial(1, action_probs[0][0] - 1e-5))
    action = np.argmax(action_probs[0][0])
    self.prev_action = action

    return action
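# Given the row layout above, a sampled batch can be split back into
# training arrays as sketched below (state_dim is the length of aug_state;
# this is the input handling for the missing training step, not the step
# itself; assumes numpy is imported as np, as elsewhere in this file):
def split_batch(batch_memory, state_dim):
    s1 = batch_memory[:, :state_dim]                  # augmented new state
    actions = batch_memory[:, state_dim].astype(int)  # prev_action taken
    rewards = batch_memory[:, state_dim + 1]          # utility difference
    s0 = batch_memory[:, state_dim + 2:]              # augmented prev state
    return s0, actions, rewards, s1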
def run(self):
    TIMEOUT = 1000

    self.poller.modify(self.sock, ALL_FLAGS)
    curr_flags = ALL_FLAGS

    # buffer, rList and r_ave were referenced but never initialized in the
    # original; they are assumed to start fresh for each run.
    buffer = experience_buffer()
    rList = []
    r_ave = []
    tput_list = []
    delay_list = []
    steps = 1

    while self.running:
        if self.window_is_open():
            if curr_flags != ALL_FLAGS:
                self.poller.modify(self.sock, ALL_FLAGS)
                curr_flags = ALL_FLAGS
        else:
            if curr_flags != READ_ERR_FLAGS:
                self.poller.modify(self.sock, READ_ERR_FLAGS)
                curr_flags = READ_ERR_FLAGS

        # poll() returns a possibly-empty list of (fd, event) 2-tuples
        # for the registered descriptors that have events or errors to
        # report.
        events = self.poller.poll(TIMEOUT)

        if not events:  # timed out
            self.send()

        for fd, flag in events:
            # fileno() returns the socket's file descriptor (a small integer).
            assert self.sock.fileno() == fd

            if flag & ERR_FLAGS:
                sys.exit('Error occurred to the channel')

            if flag & READ_FLAGS:
                s0 = self.state
                norm_state = normalize(s0)
                one_hot_action = one_hot(self.action, self.action_cnt)
                s0 = norm_state + one_hot_action

                step_end, s1, action, reward, done, tput, perc_delay = \
                    self.recv()

                if step_end:
                    norm_state = normalize(s1)
                    one_hot_action = one_hot(self.action, self.action_cnt)
                    s1 = norm_state + one_hot_action

                    buffer.add([[s0, action, reward, s1, done]])

                    # Train the Q-network every 4 steps after a 500-step
                    # warm-up.
                    if steps > 500 and steps % 4 == 0:
                        self.update_Qnet(buffer)

                    rList.append(reward)
                    tput_list.append(tput)
                    delay_list.append(perc_delay)

                    # Report running averages every 1000 steps.
                    if steps % 1000 == 0:
                        r_ave.append(sum(rList) / 1000.0)
                        print("average reward on last 1000 steps", r_ave[-1])
                        print("average tput on last 1000 steps",
                              sum(tput_list[-1000:]) / 1000.0)
                        print("average delay on last 1000 steps",
                              sum(delay_list[-1000:]) / 1000.0)
                        rList = []

                    steps += 1

            if flag & WRITE_FLAGS:
                if self.window_is_open():
                    self.send()
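# update_Qnet() is not defined in this file. Below is a minimal sketch of
# the standard DQN update it presumably performs, assuming buffer.sample()
# returns rows of [s0, action, reward, s1, done] and that self.mainQN /
# self.targetQN expose input, q_values, actions, targets and train_op
# tensors (all of these attribute names are assumptions):
def update_Qnet(self, buffer, batch_size=32, gamma=0.99):
    batch = buffer.sample(batch_size)
    s0 = [row[0] for row in batch]
    actions = [row[1] for row in batch]
    rewards = np.array([row[2] for row in batch])
    s1 = [row[3] for row in batch]
    not_done = np.array([0.0 if row[4] else 1.0 for row in batch])

    # Bellman targets: r + gamma * max_a' Q_target(s1, a'), with the
    # bootstrap term masked out on terminal transitions.
    q_next = self.sess.run(self.targetQN.q_values,
                           feed_dict={self.targetQN.input: s1})
    targets = rewards + gamma * np.max(q_next, axis=1) * not_done

    # One gradient step on the main network toward the targets.
    self.sess.run(self.mainQN.train_op,
                  feed_dict={self.mainQN.input: s0,
                             self.mainQN.actions: actions,
                             self.mainQN.targets: targets})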