def __init__(self, env, is_batch_norm=False, is_grad_inverter=True):
    super().__init__(env)
    assert isinstance(env.action_space, Box), "action space must be continuous"

    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.observation_space_size,
                                       self.action_space_size)
        self.actor_net = ActorNet_bn(self.observation_space_size,
                                     self.action_space_size)
    else:
        self.critic_net = CriticNet(self.observation_space_size,
                                    self.action_space_size)
        self.actor_net = ActorNet(self.observation_space_size,
                                  self.action_space_size)

    self.is_grad_inverter = is_grad_inverter
    self.replay_memory = deque()
    self.time_step = 0

    action_max = np.array(self.high).tolist()
    action_min = np.array(self.low).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
def __init__(self, env):
    self.env = env
    self.num_states = env.observation_space.shape[0]
    self.num_actions = env.action_space.shape[0]

    # Initialize actor and critic networks:
    action_bound = env.action_space.high
    self.critic_net = CriticNet(self.num_states, self.num_actions)
    self.actor_net = ActorNet(self.num_states, self.num_actions, action_bound)

    # Initialize replay buffer:
    self.replay_memory = deque()

    # Initialize time step:
    self.time_step = 0

    # Invert gradients (soft thresholding).
    # Specify the upper and lower bounds of the action space.
    # For higher-dimensional actions the structure is:
    # [[max_of_action_dim_0, max_of_action_dim_1, ..., max_of_action_dim_n],
    #  [min_of_action_dim_0, min_of_action_dim_1, ..., min_of_action_dim_n]]
    action_bounds = [[3], [-3]]
    self.grad_inv = grad_inverter(action_bounds)
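# A minimal NumPy sketch of the gradient-inverting ("soft thresholding") rule
# that grad_inverter applies, in the spirit of Hausknecht & Stone's
# inverting-gradients scheme: a gradient that pushes an action toward a bound
# is scaled down by the remaining headroom, so actions stay inside [min, max].
# The grad_inverter used above is a TensorFlow op; this standalone class is an
# illustration under that assumption, not the project's implementation.
import numpy as np

class GradInverterSketch:
    def __init__(self, action_bounds):
        # action_bounds = [upper_bounds, lower_bounds], as documented above.
        self.pmax = np.asarray(action_bounds[0], dtype=float)
        self.pmin = np.asarray(action_bounds[1], dtype=float)
        self.prange = self.pmax - self.pmin

    def invert(self, grads, actions):
        grads = np.asarray(grads, dtype=float)
        actions = np.asarray(actions, dtype=float)
        up = (self.pmax - actions) / self.prange    # headroom to the upper bound
        down = (actions - self.pmin) / self.prange  # headroom to the lower bound
        return np.where(grads >= 0, grads * up, grads * down)

# With the [[3], [-3]] bounds above, a positive gradient on an action already
# near +3 is almost entirely suppressed:
inv = GradInverterSketch([[3.0], [-3.0]])
print(inv.invert([[2.0]], [[2.9]]))  # ~[[0.033]]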
def __init__(self, env, is_batch_norm):
    self.env = env
    self.num_states = 59
    self.num_actions = 3

    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
        self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
    else:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)

    # Initialize replay buffer:
    self.replay_memory = deque()

    # Initialize time step:
    self.time_step = 0
    self.counter = 0

    action_max = [1.0, 1.0, 1.0]
    action_min = [-1.0, -1.0, -1.0]
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
def restore(self, path):
    print("restoring the agent")
    file = os.path.join(path, "agent_data.pkl")  # TODO: pick a better file name

    # Copy every pickled attribute onto this agent.
    with open(file, "rb") as f:
        dump = pickle.load(f)
    for key in vars(dump).keys():
        setattr(self, key, getattr(dump, key))

    # Rebuild the gradient inverter from the restored action bounds.
    action_max = np.array(self.high).tolist()
    action_min = np.array(self.low).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)

    # Now replace the networks.
    # NOTE: the is_batch_norm condition is ignored for now.
    saved_critic_net = CriticNet(self.observation_space_size, self.action_space_size)
    saved_actor_net = ActorNet(self.observation_space_size, self.action_space_size)

    # Load in the saved graphs.
    saved_critic_net.restore(os.path.join(path, "critic_net.ckpt"))
    saved_actor_net.restore(os.path.join(path, "actor_net.ckpt"))
    self.critic_net = saved_critic_net
    self.actor_net = saved_actor_net
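# A hypothetical save() counterpart to restore() above, sketched under two
# assumptions: (1) CriticNet/ActorNet expose a save(path) mirroring the
# restore(path) calls used above, and (2) the TF graphs and grad_inverter must
# be detached before pickling the agent. File names match what restore()
# expects, and the same os/pickle imports as restore() are assumed.
def save(self, path):
    print("saving the agent")
    # Detach the unpicklable parts, pickle the agent, then reattach them.
    critic_net, actor_net, grad_inv = self.critic_net, self.actor_net, self.grad_inv
    self.critic_net = self.actor_net = self.grad_inv = None
    try:
        with open(os.path.join(path, "agent_data.pkl"), "wb") as f:
            pickle.dump(self, f)
    finally:
        self.critic_net, self.actor_net, self.grad_inv = critic_net, actor_net, grad_inv
    self.critic_net.save(os.path.join(path, "critic_net.ckpt"))  # assumed API
    self.actor_net.save(os.path.join(path, "actor_net.ckpt"))    # assumed API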
def __init__(self, env, is_batch_norm=False):
    self.env = env
    self.num_states = env.observation_space.shape[0]
    self.num_actions = env.action_space.shape[0]

    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
        self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
    else:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)

    # Initialize replay buffer:
    self.replay_memory = deque()

    # Initialize time step:
    self.time_step = 0
    self.counter = 0

    action_max = np.array(env.action_space.high).tolist()
    action_min = np.array(env.action_space.low).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
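# A hedged usage sketch for the constructor above: any Gym environment with a
# continuous (Box) action space works, since the bounds are read from
# env.action_space. "DDPG" is a stand-in name for whatever class this
# __init__ belongs to.
import gym

env = gym.make("Pendulum-v0")          # 3-dim observation, 1-dim action in [-2, 2]
agent = DDPG(env, is_batch_norm=True)  # selects the CriticNet_bn / ActorNet_bn pair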
def __init__(self, num_states, num_actions, is_batch_norm):
    self.num_states = num_states
    self.num_actions = num_actions

    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
        self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
    else:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)

    # Initialize replay buffer:
    self.replay_memory = deque()

    # Initialize time step:
    self.time_step = 0
    self.counter = 0

    # Fixed action bounds: every dimension lies in [0, 5].
    action_max = (5 * np.ones(num_actions)).tolist()
    action_min = np.zeros(num_actions).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
def __init__(self, env, is_batch_norm):
    self.env = env
    self.num_states = env.observation_space.shape[0]
    self.num_actions = env.action_space.shape[0]

    if is_batch_norm:
        self.critic_net = CriticNet_bn(self.num_states, self.num_actions)
        self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
    else:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)

    # Initialize replay buffer:
    self.replay_memory = deque()

    # Initialize time step:
    self.time_step = 0
    self.counter = 0

    action_max = np.array(env.action_space.high).tolist()
    action_min = np.array(env.action_space.low).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
def __init__(self, env, is_batch_norm):
    self.env = env
    self.num_states = env.observation_space.shape[0] - 1
    self.num_actions = env.action_space.shape[0]
    self.num_hidden_states = 5

    if is_batch_norm:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)
    else:
        self.critic_net = CriticNet(self.num_states, self.num_actions)
        self.actor_net = ActorNet(self.num_states, self.num_actions)

    # The action space is continuous, so determine the action's
    # magnitude and range up front.
    action_max = np.array(env.action_space.high).tolist()
    action_min = np.array(env.action_space.low).tolist()
    action_bounds = [action_max, action_min]

    # Initialize the gradient inverter used when computing gradients of Q.
    self.grad_inv = grad_inverter(action_bounds)
    self.replay_memory = deque()
def __init__(self, hisar_size, ar_size, action_size, TAU=0.001,
             is_batch_norm=0, write_sum=0, net_size_scale=1,
             max_load=1, beta0=beta):
    self.hisar_size = hisar_size
    self.load_size = action_size + 1
    self.ar_size = ar_size
    self.state_size = action_size * 2
    self.action_size = action_size
    self.ar_action_size = ar_size + action_size
    #print("net_size_scale: " + str(net_size_scale))

    if is_batch_norm:
        if len(CN_N_HIDDENS) == 2:
            self.critic_net = CriticNet_bn(self.state_size, self.action_size,
                                           TAU, write_sum, net_size_scale)
        else:
            self.critic_net = CriticNet_bn_3(self.state_size, self.action_size,
                                             TAU, write_sum, net_size_scale)
        self.actor_net = ActorNet_bn(self.state_size, self.action_size,
                                     TAU, write_sum, net_size_scale)
        # Arrival-rate prediction network.
        self.ar_pred_net = ARPredNet_bn(self.hisar_size, self.ar_size,
                                        write_sum, net_size_scale)
        # Load-mapping network.
        self.load_map_net = LoadMapNet_bn(self.ar_size, self.action_size,
                                          self.load_size, write_sum, net_size_scale)
    else:
        self.critic_net = CriticNet(self.state_size, self.action_size,
                                    TAU, write_sum, net_size_scale)
        self.actor_net = ActorNet(self.state_size, self.action_size,
                                  TAU, write_sum, net_size_scale)
        # Arrival-rate prediction network.
        self.ar_pred_net = ARPredNet(self.hisar_size, self.ar_size,
                                     write_sum, net_size_scale)
        # Load-mapping network.
        self.load_map_net = LoadMapNet(self.ar_size, self.action_size,
                                       self.load_size, write_sum, net_size_scale)

    self.env = ENV(action_size, max_load=max_load, beta0=beta0)
    #self.k_nearest_neighbors = int(max_actions * k_ratio)

    # Initialize network buffers:
    self.replay_memory_ac = deque()
    self.replay_memory_arp = deque()
    self.replay_memory_lm = deque()

    # Initialize time step:
    self.time_step = 0
    self.counter = 0

    action_max = np.ones(self.action_size).tolist()
    action_min = np.zeros(self.action_size).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
def __init__(self, num_states, num_actions, action_space_high,
             action_space_low, is_batch_norm):
    self.num_states = num_states
    self.num_actions = num_actions
    self.action_space_high = action_space_high
    self.action_space_low = action_space_low

    # Batch normalisation disabled.
    self.critic_net = CriticNet(self.num_states, self.num_actions)
    self.actor_net = ActorNet(self.num_states, self.num_actions)

    # Initialize replay memory.
    self.replay_memory = deque()

    # Initialize time step.
    self.time_step = 0
    self.counter = 0

    action_max = np.array(action_space_high).tolist()
    action_min = np.array(action_space_low).tolist()
    action_bounds = [action_max, action_min]
    self.grad_inv = grad_inverter(action_bounds)
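# Hedged construction example for this variant, which takes the space sizes
# and bounds explicitly instead of an env object. "Agent" is a stand-in class
# name; the Gym environment is only used to supply the arguments.
import gym

env = gym.make("Pendulum-v0")
agent = Agent(num_states=env.observation_space.shape[0],
              num_actions=env.action_space.shape[0],
              action_space_high=env.action_space.high,
              action_space_low=env.action_space.low,
              is_batch_norm=False)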
def run_DQN(self, seed_n, Exp, Double, Prioritized):
    ############## copy parameters ##############
    sess = self.sess
    dis = self.dis
    REPLAY_MEMORY = self.REPLAY_MEMORY
    replay_memory = self.replay_memory
    batch_size = self.batch_size
    size_action_batch = self.size_action_batch
    Game = self.Game
    save_epi = self.save_epi
    save_network = self.save_network
    max_episodes = self.max_episodes
    max_steps = self.max_steps
    env = self.env
    random_action = self.random_action
    input_size = self.input_size
    output_size = self.output_size
    alpha = self.alpha
    beta_init = self.beta_init
    beta_max_step = self.beta_max_step
    eps = self.eps
    eps_div = self.eps_div
    s_scale = self.s_scale
    training_step = self.training_step
    copy_step = self.copy_step
    action_copy_step = self.action_copy_step
    action_train = self.action_train
    weighted_train = self.weighted_train
    repu_num = self.repu_num
    DDPG = self.DDPG
    ending_cond_epis = self.ending_cond_epis
    ending_cond_reward = self.ending_cond_reward

    env.seed(seed_n)
    np.random.seed(seed_n)
    tf.set_random_seed(seed_n)
    random.seed(seed_n)
    #############################################

    Q_Network = self.Q_Network
    A_batch = Q_Network.get_action_batch()

    if DDPG:
        Action_Network = self.Action_Network
        # grad_inv used when training the DDPG actor network.
        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()
        action_bounds = [action_max, action_min]
        grad_inv = grad_inverter(sess, action_bounds)

    case_n = seed_n + 1
    end_episode = 0
    step_count_total = 0
    global_step = 0
    loss = 0
    e = 1.
    replay_buffer = deque()
    Q_list = []
    TD_buffer = deque()
    steps_list = []
    step_avg_list = []
    global_step_list = []
    average_distance = []
    rate_of_adjacent = []

    print("")
    print("CASE {}".format(case_n))
    print("  STATE DIM : {}, ACTION DIM : {}".format(input_size, self.action_dim))
    print("  Exp : {}".format(Exp))
    if DDPG:
        print("  Strategy : Double : {}, Prioritized : {}, DDPG : {}"
              .format(Double, Prioritized, DDPG))
    elif random_action:
        if action_train:
            print("  Strategy : Double : {}, Prioritized : {}, ACTION : RANDOM, ACTION TRAIN 'ON'"
                  .format(Double, Prioritized))
        else:
            print("  Strategy : Double : {}, Prioritized : {}, ACTION : RANDOM"
                  .format(Double, Prioritized))
    else:
        if action_train:
            print("  Strategy : Double : {}, Prioritized : {}, ACTION : DISCRETIZATION, ACTION TRAIN 'ON'"
                  .format(Double, Prioritized))
        else:
            print("  Strategy : Double : {}, Prioritized : {}, ACTION : DISCRETIZATION"
                  .format(Double, Prioritized))
    print("")

    for episode in range(1, max_episodes + 1):
        done = False
        step_count = 0
        current_step = 0
        cost = 0
        state = env.reset()

        while not done:
            # Anneal epsilon; once it falls below 0.001 it stops shrinking.
            if e > 0.001:
                #e = 1. / ((float(episode - 1) / eps_div) + 1)
                e = 1. / ((float(global_step) / eps_div) + 1)
            t4 = time.time()

            if DDPG:
                # With DDPG, the actor network chooses the action.
                action = Action_Network.evaluate_actor(
                    np.reshape(state, [1, input_size]))[0]
            else:
                # Otherwise, fetch the per-action Q-values for this state via
                # get_q_batch and choose an action by the exploration scheme.
                action0 = Exploration.choice_action(
                    Exp, e, s_scale,
                    np.reshape(Q_Network.get_q_batch(
                        np.reshape(state, [1, -1])), [1, -1])[0])
                action = A_batch[action0]

            next_state, reward, done, _ = env.step(action)
            step_count += reward
            global_step += 1
            current_step += 1

            # With prioritized replay, store the transition in the tree
            # (replay_memory); otherwise append it to replay_buffer, from
            # which minibatches are sampled uniformly.
            if Prioritized:
                replay_memory.save_experience(state, action, reward,
                                              next_state, done)
            else:
                replay_buffer.append((state, next_state, action, reward, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
            state = next_state

            if global_step <= beta_max_step:
                replay_memory.anneal_per_importance_sampling(
                    global_step, beta_max_step)

            # Train every training_step steps.
            if global_step > batch_size and global_step % training_step == 0:
                for re in range(repu_num):  # repeat repu_num times (usually 1)
                    if Prioritized:
                        # Sample a batch from replay_memory.
                        idx, priorities, w_batch, experience = \
                            replay_memory.retrieve_experience(batch_size)
                        minibatch = self.format_experience(experience)
                        if DDPG:
                            # Train both the Q-network and the actor network.
                            errors, cost = Train.train_prioritized_DDPG(
                                Q_Network, Action_Network, minibatch, w_batch,
                                output_size, grad_inv)
                            replay_memory.update_experience_weight(idx, errors)
                        else:
                            # Train the Q-network only.
                            errors, cost, state_t_batch = Train.train_prioritized(
                                Q_Network, minibatch, w_batch, Exp, s_scale,
                                input_size, output_size, size_action_batch)
                            replay_memory.update_experience_weight(idx, errors)

                            # Train the action set every action_copy_step steps;
                            # with action_train false this is the RAS algorithm.
                            if action_train and global_step % action_copy_step == 0:
                                action_weight = []
                                if weighted_train:
                                    # WARAS algorithm: compute the weights.
                                    for k in range(batch_size):
                                        state_t = np.reshape(state_t_batch[k], [1, -1])
                                        q_batch = Q_Network.get_q_batch(state_t)
                                        q_batch = np.reshape(q_batch, [1, -1])[0]
                                        q_batch = q_batch * 10.
                                        max_q = np.max(q_batch)
                                        q_batch = np.exp(q_batch - max_q)
                                        action_weight.append(q_batch)
                                else:
                                    # ARAS algorithm: set every weight to 1.
                                    action_weight = np.ones(
                                        [batch_size, size_action_batch])

                                # Train the Q-network's action set with the weights.
                                Q_Network.train_weighted_actor(
                                    state_t_batch, action_weight)

                                # Update the target action set.
                                Q_Network.update_action_target_critic()
                                A_batch = Q_Network.get_action_batch()
                                t_A_batch = Q_Network.get_target_action_batch()
                                """
                                # Find pairs of nearby actions and resample them.
                                A_batch, t_A_batch = self.realign_action_batch(A_batch, t_A_batch)
                                Q_Network.realign_action_batch(A_batch, t_A_batch)
                                A_batch = Q_Network.get_action_batch()
                                t_A_batch = Q_Network.get_target_action_batch()
                                """
                    else:
                        # Without prioritized replay, train on a uniformly
                        # sampled minibatch.
                        minibatch = random.sample(replay_buffer, batch_size)
                        if DDPG:
                            cost = Train.train_DDPG(
                                Q_Network, Action_Network, minibatch,
                                output_size, grad_inv)
                        else:
                            cost, state_t_batch = Train.train(
                                Q_Network, minibatch, Exp, s_scale,
                                input_size, output_size, size_action_batch)

            # Update the target networks every copy_step steps.
            if global_step % copy_step == 0:
                if DDPG:
                    # Update target critic and actor networks.
                    Q_Network.update_target_critic()
                    Q_Network.update_action_target_critic()
                    Action_Network.update_target_actor()
                else:
                    Q_Network.update_target_critic()
                    Q_Network.update_action_target_critic()

        steps_list.append(step_count)
        global_step_list.append(global_step)

        # Print the running average of the results.
        if episode < ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_avg_list.append(step_count_total / episode)
        if episode == ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_avg_list.append(step_count_total / ending_cond_epis)
        if episode > ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_count_total -= steps_list[episode - 1 - ending_cond_epis]
            step_avg_list.append(step_count_total / ending_cond_epis)

        print("{} {}".format(episode, round(step_avg_list[episode - 1], 3)))
        if DDPG:
            print("  ( Result : {}, Loss : {}, Steps : {}, Global Steps : {} )"
                  #.format(round(step_count, 3), round(cost, 5), current_step, global_step))
                  .format(round(step_count, 3), 0, current_step, global_step))
        elif Exp == 'epsilon' or Exp == 'sparsemax':
            print("  ( Result : {}, Loss : {}, Epsilon : {}, Steps : {}, Global Steps : {} )"
                  #.format(round(step_count, 3), round(cost, 5), round(e, 4), current_step, global_step))
                  .format(round(step_count, 3), 0, round(e, 5), current_step, global_step))
        else:
            print("  ( Result : {}, Loss : {}, Steps : {}, Global Steps : {} )"
                  #.format(round(step_count, 3), round(cost, 5), current_step, global_step))
                  .format(round(step_count, 3), 0, current_step, global_step))

        distance, per_of_sim, per_of_sim2 = self.get_action_variance(A_batch)
        print("  ( Action Batch :::: Distance : {}, Percent : {}%({}%) )"
              .format(distance, per_of_sim, per_of_sim2))
        average_distance.append(distance)
        rate_of_adjacent.append(per_of_sim)

        # Save the networks.
        if episode % save_epi == 0:
            file_case = str(case_n)
            if save_network:
                Q_Network.save_network(
                    game_name=self.file_name + '_seed' + file_case,
                    episode=episode, save_epi=save_epi)
            with open('/home/minjae/Desktop/JOLP/' + self.file_name
                      + '_seed' + file_case, 'wb') as fout:
                pickle.dump(step_avg_list, fout)
            with open('/home/minjae/Desktop/JOLP/' + self.file_name
                      + '_global_' + '_seed' + file_case, 'wb') as fout2:
                pickle.dump(global_step_list, fout2)

            x_values = list(range(1, episode + 1))
            y_values = step_avg_list[:]
            plt.plot(x_values, y_values, c='green')
            plt.title(self.file_name)
            plt.grid(True)
            plt.show()

            with open('/home/minjae/Desktop/JOLP/' + 'Average_of_Distance_('
                      + self.file_name + '_seed' + file_case + ')', 'wb') as fout:
                pickle.dump(average_distance, fout)
            with open('/home/minjae/Desktop/JOLP/' + 'Rate_of_Adjacent_('
                      + self.file_name + '_global_' + '_seed' + file_case
                      + ')', 'wb') as fout2:
                pickle.dump(rate_of_adjacent, fout2)

            p_values = list(range(1, episode + 1))
            q_values = average_distance[:]
            r_values = rate_of_adjacent[:]
            plt.plot(p_values, q_values, c='r')
            plt.title('Average of Distance between Actions')
            plt.grid(True)
            plt.show()
            plt.plot(p_values, r_values, c='b')
            plt.title('Rate of Adjacent Actions')
            plt.grid(True)
            plt.show()

        end_episode += 1
        # Stop training once the running average reaches the target reward.
        if step_avg_list[episode - 1] > ending_cond_reward:
            break
        # Stop training after max_steps global steps.
        if global_step > max_steps:
            break

    print("--------------------------------------------------")
    print("--------------------------------------------------")

    # If training stopped early at the target, run the remaining episodes
    # greedily with the learned Q-network.
    for episode in range(end_episode + 1, max_episodes + 1):
        if global_step > max_steps:
            break
        state = env.reset()
        reward_sum = 0
        done = False
        while not done:
            # Choose the action with the maximum Q-value.
            action = np.argmax(Q_Network.evaluate_critic(
                np.reshape(state, [1, input_size])))
            if conti_action_flag:
                action = [action_map[action]]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            global_step += 1
            if done:
                steps_list.append(reward_sum)
                global_step_list.append(global_step)
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 1 - ending_cond_epis]
                step_avg_list.append(step_count_total / ending_cond_epis)
                print("{} {}".format(episode, round(step_avg_list[episode - 1], 3)))
                print("  ( Result : {} )".format(reward_sum))

        if episode % save_epi == 0:
            file_case = str(case_n)
            if save_network:
                Q_Network.save_network(
                    game_name=self.file_name + '_seed' + file_case,
                    episode=episode, save_epi=save_epi)
            with open('/home/minjae/Desktop/JOLP/' + self.file_name
                      + '_seed' + file_case, 'wb') as fout:
                pickle.dump(step_avg_list, fout)
            with open('/home/minjae/Desktop/JOLP/' + self.file_name
                      + '_global_' + '_seed' + file_case, 'wb') as fout2:
                pickle.dump(global_step_list, fout2)
            x_values = list(range(1, episode + 1))
            y_values = step_avg_list[:]
            plt.plot(x_values, y_values, c='green')
            plt.title(self.file_name)
            plt.grid(True)
            plt.show()

    # Save parameters.
    file_case = str(case_n)
    with open('/home/minjae/Desktop/JOLP/' + self.file_name
              + '_seed' + file_case, 'wb') as fout:
        pickle.dump(step_avg_list, fout)
    with open('/home/minjae/Desktop/JOLP/' + self.file_name
              + '_global_' + '_seed' + file_case, 'wb') as fout2:
        pickle.dump(global_step_list, fout2)

    # Plot the results.
    x_values = list(range(1, len(step_avg_list) + 1))
    y_values = step_avg_list[:]
    plt.plot(x_values, y_values, c='green')
    plt.title(self.file_name)
    plt.grid(True)
    plt.show()

    with open('/home/minjae/Desktop/JOLP/' + 'Average_of_Distance_('
              + self.file_name + '_seed' + file_case + ')', 'wb') as fout:
        pickle.dump(average_distance, fout)
    with open('/home/minjae/Desktop/JOLP/' + 'Rate_of_Adjacent_('
              + self.file_name + '_global_' + '_seed' + file_case
              + ')', 'wb') as fout2:
        pickle.dump(rate_of_adjacent, fout2)

    p_values = list(range(1, episode + 1))
    q_values = average_distance[:]
    r_values = rate_of_adjacent[:]
    plt.plot(p_values, q_values, c='r')
    plt.title('Average of Distance between Actions')
    plt.grid(True)
    plt.show()
    plt.plot(p_values, r_values, c='b')
    plt.title('Rate of Adjacent Actions')
    plt.grid(True)
    plt.show()
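# Standalone sketch of the WARAS weight computation inside run_DQN above: the
# Q-values for one state become unnormalized softmax weights at temperature
# 0.1 (the *10 scaling), with the max subtracted for numerical stability, so
# higher-valued actions dominate the weighted action-set update.
import numpy as np

def waras_weights(q_values):
    q = np.asarray(q_values, dtype=float) * 10.
    return np.exp(q - np.max(q))

print(waras_weights([0.1, 0.2, 0.05]))  # approx. [0.368, 1.0, 0.223]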