def update_policy(self, striker_memory, goalie_memory):
    # do not train until exploration is enough
    if self.episode_done <= self.episodes_before_train:
        return None, None
    ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor
    FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
    c_loss = []
    a_loss = []
    for agent_index in range(self.n_striker):
        s_transitions = striker_memory.sample(self.batchSize)
        g_transitions = goalie_memory.sample(self.batchSize)
        s_batch = Experience(*zip(*s_transitions))
        g_batch = Experience(*zip(*g_transitions))
        s_non_final_mask = ByteTensor(
            list(map(lambda s: s is not None, s_batch.next_states)))
        g_non_final_mask = ByteTensor(
            list(map(lambda s: s is not None, g_batch.next_states)))
        # state_batch: batch_size x n_agents x dim_obs
        s_state_batch = torch.stack(s_batch.states).type(FloatTensor)
        s_action_batch = torch.stack(s_batch.actions).type(FloatTensor)
        s_reward_batch = torch.stack(s_batch.rewards).type(FloatTensor)
        g_state_batch = torch.stack(g_batch.states).type(FloatTensor)
        g_action_batch = torch.stack(g_batch.actions).type(FloatTensor)
        g_reward_batch = torch.stack(g_batch.rewards).type(FloatTensor)
        # : (batch_size_non_final) x n_agents x dim_obs
        s_non_final_next_states = torch.stack(
            [s for s in s_batch.next_states if s is not None]).type(FloatTensor)
        g_non_final_next_states = torch.stack(
            [s for s in g_batch.next_states if s is not None]).type(FloatTensor)
        # for current agent
        s_whole_state = s_state_batch.view(self.batchSize, -1)
        print(s_whole_state.shape)
        s_whole_action = s_action_batch.view(self.batchSize, -1)
        print(s_whole_action.shape)
        g_whole_state = g_state_batch.view(self.batchSize, -1)
        print(g_whole_state.shape)
        g_whole_action = g_action_batch.view(self.batchSize, -1)
        print(g_whole_action.shape)
        # NOTE: we need a discussion to define the meaning of act_dim
        self.s_critic_optimizer[agent_index].zero_grad()
        self.g_critic_optimizer[agent_index].zero_grad()
def __call__(self, env):
    result = []
    for episode in range(self.num_episodes):
        # reset at the start of episode
        current_obs = env.reset()
        next_obs = None
        episode_steps = 0
        episode_reward = 0.
        assert current_obs is not None
        # start episode
        done = False
        while not done:
            if next_obs is not None:
                current_obs = next_obs
            # basic step: pick an action, observe reward and next observation
            action = self.policy(current_obs)
            next_obs, reward, done, info = env.step(action)
            if next_obs is None:
                current_expr = Experience(current_obs, action, reward,
                                          next_obs, True)
            else:
                current_expr = Experience(current_obs, action, reward,
                                          next_obs, False)
            # put data into the queue
            self.queue.put(current_expr)
            # terminate the episode once the step limit is reached
            if self.max_step is not None and episode_steps >= self.max_step:
                done = True
            if self.visualize:
                env.render(mode='human')
            # update
            episode_reward += reward
            episode_steps += 1
        result.append(episode_reward)
    result = np.array(result).reshape(-1, 1)
    if self.save:
        self.save_results(
            result,
            os.path.join('{}'.format(self.save_path), "validate_reward"))
    return np.mean(result)
def update_rule(self):
    if self.episode_done <= self.episodes_before_train:
        return None
    FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
    transitions = self.memory.sample(self.batch_size)
    batch = Experience(*zip(*transitions))
    state_batch = th.stack(batch.states).type(FloatTensor)
    action_batch = th.stack(batch.actions).type(FloatTensor)
    whole_state = state_batch.view(self.batch_size, -1)
    whole_action = action_batch.view(self.batch_size, -1)
    true_act, rules = self.select_rule_action(state_batch)
    if self.steps_done % 2 == 0:
        id = 0
    else:
        id = 1
    Q = []
    for ag in range(self.n_agents):
        Q.append(self.critics[ag](whole_state, Variable(th.Tensor(true_act))))
    Qsum = sum(Q)
    if self.steps_done % 600 == 0:
        print("true_act..", true_act[15])
        print("rule..", rules[id][15])
        print("Qsum..", Qsum[15])
    loss_r = -rules[id] * Qsum
    loss_r = loss_r.mean()
    loss_r.backward()
    self.constrain_optimizer.step()
    return loss_r
def play(self):
    # self.env.render()
    s = self.s
    if self.step < self.learning_starts:
        a = self.env.action_space.sample()
    else:
        a = self.epsilon_greedy()
    old_lives = self.env.lives()
    SP, r, terminal, step_info = self.env.step(a)
    new_lives = self.env.lives()
    self.episode_scores += r
    sp = self.four_frames_to_4_84_84(SP)
    if new_lives < old_lives:
        print('agent died, current lives = ', new_lives)
        r = min(-1.0, r)
    if terminal and new_lives > 0:
        task_done = True
        done = 1
        r = max(1.0, r)
        print('task is solved successfully, end of episode')
    else:
        task_done = False
        done = 0
        r = min(-0.1, r)  # just a tiny punishment
    self.episode_rewards += r
    if terminal and new_lives == 0:
        terminal = True
        print('agent terminated, end of episode')
        r = min(-10.0, r)
    r = np.clip(r, -1.0, 1.0)
    experience = Experience(s, a, r, sp, done)
    self.experience_memory.push(experience)
    self.s = copy.deepcopy(sp)
    if terminal or task_done:
        self.episode_steps_list.append(self.step)
        self.episode_scores_list.append(self.episode_scores)
        self.episode_rewards_list.append(self.episode_rewards)
        self.game_episode += 1
        self.episode_rewards = 0.0
        self.episode_scores = 0.0
        self.episode_end_time = time.time()
        episode_time = self.episode_end_time - self.episode_start_time
        self.episode_time_list.append(episode_time)
        print('episode time: {0:.2f}'.format(episode_time))
        print('-' * 60)
        print('game episode: ', self.game_episode)
        print('time step: ', self.step)
        self.episode_start_time = time.time()
        S = self.env.reset()  # reset S
        self.s = self.four_frames_to_4_84_84(S)  # get s
def start_training(self): start_episode = self.training_info['episode'] frames_passed = self.training_info['frames'] train_frames = 1000000 t = 0 for episode in range(start_episode, episodes + 1): # Set initial state state = self.read_image(t) episode_start_time = time.time() while t < train_frames: t += 1 random_probability = self.random_action_policy.get_probability( frames_passed) if random.random() < random_probability: action = self.random_action_policy.sample_action( self.action_type) else: # noinspection PyTypeChecker action = self.action_type.from_code( np.argmax(self._predict(state))) for _ in range(self.training_info['batches_per_frame']): self._train_minibatch() new_state, reward = self.read_image(t) experience = Experience(state, action, reward, new_state) self.memory.append_experience(experience) state = new_state frames_passed += 1 # Print status time_since_failure = time.time() - episode_start_time print('Episode {}, Total frames {}, ε={:.4f}, Reward {:.4f}, ' '{:.0f}s since failure'.format(episode, frames_passed, random_probability, reward, time_since_failure), end='\r') # Save model after a fixed amount of frames if frames_passed % 1000 == 0: self.training_info['episode'] = episode self.training_info['frames'] = frames_passed self.training_info[ 'mean_training_time'] = self.mean_training_time.get() self.training_info.save() self.model.save(self.MODEL_PATH)
def predict(self):
    signal.signal(signal.SIGINT, self.stop)
    while True:
        state = self.environment.read_sensors(self.image_size,
                                              self.image_size)[0]
        while not state.is_terminal:
            action = self.action_type.from_code(
                np.argmax(self._predict(state)))
            self.environment.write_action(action)
            # Wait as long as we usually need to wait due to training
            time.sleep(self.training_info['batches_per_frame'] *
                       self.training_info['mean_training_time'])
            new_state, reward = self.environment.read_sensors(
                self.image_size, self.image_size)
            experience = Experience(state, action, reward, new_state)
            self.memory.append_experience(experience)
            state = new_state
        if self.should_exit:
            sys.exit(0)
def update(self, step: dm_env.TimeStep, action: int,
           next_step: dm_env.TimeStep) -> None:
    """Adds experience to the replay memory, performs an optimization step
    and updates the q_target neural network.

    Args:
        step (dm_env.TimeStep): Current observation from the environment.
        action (int): The action that was performed by the agent.
        next_step (dm_env.TimeStep): Next observation from the environment.

    Returns:
        None
    """
    observation = np.array(step.observation).flatten()
    next_observation = np.array(next_step.observation).flatten()
    done = next_step.last()
    exp = Experience(observation, action, next_step.reward,
                     next_step.discount, next_observation, 0, done)
    self.memory.add(exp)
    if self.memory.number_samples() < self.start_optimization:
        return
    if self.number_steps % self.update_qnet_every == 0:
        s0, a0, n_step_reward, discount, s1, _, dones, indices, weights = \
            self.memory.sample_batch(self.batch_size)
        if not self.distributional:
            self.optimization_step(s0, a0, n_step_reward, discount, s1,
                                   indices, weights)
        else:
            self.distributional_optimization_step(s0, a0, n_step_reward,
                                                  discount, s1, dones,
                                                  indices, weights)
    if self.number_steps % self.update_target_every == 0:
        self.q_target.load_state_dict(self.qnet.state_dict())
    return
def update_policy(self, i_episode): # do not train until exploration is enough if self.episode_done <= self.episodes_before_train: return None, None ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor c_loss = [] a_loss = [] for agent in range(self.n_agents): transitions = self.memory.sample(self.batch_size) batch = Experience(*zip(*transitions)) non_final_mask = ByteTensor( list(map(lambda s: s is not None, batch.next_states))) # state_batch: batch_size x n_agents x dim_obs state_batch = Variable(th.stack(batch.states).type(FloatTensor)) #whole_list = [] # next_whole_list = [] # next_state_count = 0 #print(len(batch.states) == len(batch.next_states)) # for i in range(len(batch.states)): # n_list = [] # for j in range(4): # for k in range(len(batch.states[i][j])): # n_list.append(batch.states[i][j][k].data.numpy()) # #if batch.next_states[i] != None: # #print('batch.next_states[i][j][k]',type(batch.next_states[i][j][k]),i,j,k) # # next_state_count += 1 # n_array = np.asarray(n_list) # # print('n_array',type(n_array)) # n_tensor = th.from_numpy(n_array).float() # n_variable = Variable(n_tensor).type(FloatTensor) # whole_list.append(n_variable.data.numpy()) # whole_array = np.asarray(whole_list) # whole_tensor = th.from_numpy(whole_array).float() # for i in range(len(batch.states)): # next_list = [] # if batch.next_states[i] != None: # for j in range(4): # for k in range(len(batch.next_states[i][j])): # #print('batch.next_states[i][j][k]',batch.next_states[i][j][k],i,j,k) # next_list.append(batch.next_states[i][j][k].data.numpy()) # next_array = np.asarray(next_list) # next_tensor = th.from_numpy(next_array).float() # next_variable = Variable(next_tensor).type(FloatTensor) # next_whole_list.append(th.t(next_variable).data.numpy()) # next_whole_array = np.asarray(next_whole_list) # next_whole_tensor = th.from_numpy(next_whole_array).float() #state_batch = Variable(th.stack(whole_tensor).type(FloatTensor)) # print('state_batch',state_batch) #[torch.FloatTensor of size 100x62x1] # state_batch = Variable(th.stack(batch.states).type(FloatTensor)) action_batch = Variable(th.stack(batch.actions).type(FloatTensor)) reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor)) # : (batch_size_non_final) x n_agents x dim_obs # print('next_whole_tensor',next_whole_tensor) #non_final_next_states = Variable(th.stack(next_whole_tensor).type(FloatTensor)) non_final_next_states = Variable( th.stack([s for s in batch.next_states if s is not None]).type(FloatTensor)) # print('non_final_next_states',non_final_next_states) [torch.FloatTensor of size 99x1x62] # for current agent whole_state = state_batch.view(self.batch_size, -1) # print('whole_state',whole_state) [torch.FloatTensor of size 100x62] whole_action = action_batch.view(self.batch_size, -1) # non_final_next_states = non_final_next_states.view(next_state_count,-1) # print('non_final_next_states',non_final_next_states) self.critic_optimizer[agent].zero_grad() current_Q = self.critics[agent](whole_state, whole_action) # non_final_next_actions = [] # for a in range(self.n_agents): # batch_obs = [] # # for j in range(self.n_agents): # for i in range(len(batch.next_states)): # if batch.next_states[i] is not None: # batch_obs.append(batch.next_states[i][a].data.numpy()) # # print('batch_obs',type(batch.next_states[i][a])) 'torch.autograd.variable.Variable' # batch_obs = np.asarray(batch_obs) # batch_obs = th.from_numpy(batch_obs).float() # batch_obs = 
Variable(batch_obs).type(FloatTensor) # # print('batch_obs',batch_obs) [torch.FloatTensor of size 99x16] # non_final_next_actions.append(self.actors_target[a](batch_obs)) # print('non_final_next_actions',non_final_next_actions) non_final_next_actions = [ #[torch.FloatTensor of size 989x2] self.actors_target[i]( non_final_next_states[:, #[torch.FloatTensor of size 989x213] i, :]) for i in range(self.n_agents) ] non_final_next_actions = th.stack(non_final_next_actions) # non_final_next_actions = Variable(non_final_next_actions) non_final_next_actions = (non_final_next_actions.transpose( 0, 1).contiguous()) target_Q = Variable(th.zeros(self.batch_size).type(FloatTensor)) # print('non_final_mask',non_final_mask) target_Q[non_final_mask] = self.critics_target[agent]( non_final_next_states.view(-1, self.n_agents * self.n_states), non_final_next_actions.view(-1, self.n_agents * self.n_actions)) # scale_reward: to scale reward in Q functions target_Q = (target_Q * self.GAMMA) + (reward_batch[:, agent] * scale_reward) loss_Q = nn.MSELoss()(current_Q, target_Q.detach()) loss_Q.backward() self.critic_optimizer[agent].step() self.actor_optimizer[agent].zero_grad() state_i = state_batch[:, agent, :] # state_i = [] # for i in range(len(state_batch)): # state_i.append(batch.states[i][agent].data.numpy()) # print('batch_obs',type(batch.next_states[i][a])) 'torch.autograd.variable.Variable' #state_i = np.asarray(state_i) #state_i = th.from_numpy(state_i).float() #state_i = Variable(state_i).type(FloatTensor) # print('state_i',state_i) [torch.FloatTensor of size 100x1] action_i = self.actors[agent](state_i) ac = action_batch.clone() ac[:, agent, :] = action_i whole_action = ac.view(self.batch_size, -1) actor_loss = -self.critics[agent](whole_state, whole_action) actor_loss = actor_loss.mean() actor_loss.backward() self.actor_optimizer[agent].step() c_loss.append(loss_Q) a_loss.append(actor_loss) if self.steps_done % 100 == 0 and self.steps_done > 0: for i in range(self.n_agents): soft_update(self.critics_target[i], self.critics[i], self.tau) soft_update(self.actors_target[i], self.actors[i], self.tau) if i_episode % 100 == 0: for i in range(self.n_agents): th.save(self.critics[i], 'critic[' + str(i) + '].pkl_episode' + str(i_episode)) th.save(self.actors[i], 'actors[' + str(i) + '].pkl_episode' + str(i_episode)) return c_loss, a_loss
def update_policy(self, memory): # momory format is memory.push(prev_state, states, [prev_action_striker, prev_action_goalie], prev_reward) # do not train until exploration is enough # if self.episode_done <= self.episode_before_training: # return None, None c_loss = [] a_loss = [] if len(memory) < 1024 * 10: return None, None for agent_index in range(self.n_striker): # # # # # # # # # # # # # # # # # # # # # # # # batch sample is batch * N play ground * agents * state/next_state/action/reward/ # # # # # # # # # # # # # # # # # # # # # # # # transitions = memory.sample(self.batchSize_d2) batch = Experience(*zip(*transitions)) batch_state = np.asarray(batch.states) batch_action = np.asarray(batch.actions) batch_reward = np.asarray(batch.rewards) batch_reward = torch.from_numpy(batch_reward).to( self.device).float() batch_next_state = np.asarray(batch.next_state) state_batch = torch.from_numpy(batch_state) action_batch = torch.from_numpy(batch_action) next_state_batch = torch.from_numpy(batch_next_state) # # # # # # # # # # total numbers of data = batchsize * play ground # # # # # # # # # # total_numbers_of_data = batch_state.shape[0] * batch_state.shape[1] whole_state = state_batch.view(total_numbers_of_data, -1).to(self.device).float() whole_action = action_batch.view(total_numbers_of_data * 4, -1).long() # # # # # translate action into one hot # # # # # one_hot = (whole_action == torch.arange(7).reshape(1, 7)).float() one_hot = one_hot.view(total_numbers_of_data, -1).to(self.device) self.critic_optimizer[0].zero_grad() self.critic_optimizer[1].zero_grad() s_current_Q = self.critic[0](whole_state, one_hot) g_current_Q = self.critic[1](whole_state, one_hot) s_whole_next_state = next_state_batch[:, :, 0:2, :].to( self.device).float() s_whole_next_state = s_whole_next_state.view( total_numbers_of_data * 2, -1) g_whole_next_state = next_state_batch[:, :, 2:4, :].to( self.device).float() g_whole_next_state = g_whole_next_state.view( total_numbers_of_data * 2, -1) # # # # Next_actions # # # # s_next_actions = [self.s_actors_target[0](s_whole_next_state)] g_next_actions = [self.g_actors_target[0](g_whole_next_state)] s_next_actions = torch.stack(s_next_actions) g_next_actions = torch.stack(g_next_actions) s_next_actions = (s_next_actions.transpose(0, 1).contiguous()) g_next_actions = (g_next_actions.transpose(0, 1).contiguous()) s_next_state = s_whole_next_state.view(-1, 2, 112).to(self.device) g_next_state = g_whole_next_state.view(-1, 2, 112).to(self.device) whole_next_stat = torch.cat([s_next_state, g_next_state], dim=-2) s_next_actions = s_next_actions.view(-1, 14).to(self.device) g_next_actions = g_next_actions.view(-1, 14).to(self.device) whole_next_action = torch.cat([s_next_actions, g_next_actions], dim=-1) whole_next_stat = whole_next_stat.view(-1, 112 * 4) whole_next_action = whole_next_action.view(-1, 7 * 4) s_target_Q = self.critic_target[0](whole_next_stat, whole_next_action) g_target_Q = self.critic_target[1](whole_next_stat, whole_next_action) # scale_reward: to scale reward in Q functions batch_reward = batch_reward.view(64, -1) s_1_target_Q = (s_target_Q * self.GAMMA) + ( batch_reward[:, 0].unsqueeze(1) * self.scale_reward) s_2_target_Q = (s_target_Q * self.GAMMA) + ( batch_reward[:, 1].unsqueeze(1) * self.scale_reward) g_1_target_Q = (g_target_Q * self.GAMMA) + ( batch_reward[:, 2].unsqueeze(1) * self.scale_reward) g_2_target_Q = (g_target_Q * self.GAMMA) + ( batch_reward[:, 3].unsqueeze(1) * self.scale_reward) # 64 *1 # # # # Update first striker # # # # s_1_loss_Q = 
nn.MSELoss()(s_current_Q, s_1_target_Q.detach()) s_1_loss_Q.backward(retain_graph=True) self.critic_optimizer[0].step() # # # # Update 2nd striker # # # # # print(s_2_target_Q) self.critic_optimizer[0].zero_grad() s_2_loss_Q = nn.MSELoss()(s_current_Q, s_2_target_Q.detach()) s_2_loss_Q.backward() self.critic_optimizer[0].step() # # # # Update first goalie # # # # self.critic_optimizer[1].zero_grad() g_1_loss_Q = nn.MSELoss()(g_current_Q, g_1_target_Q.detach()) g_1_loss_Q.backward(retain_graph=True) self.critic_optimizer[1].step() # # # # Update 2nd goalie # # # # self.critic_optimizer[1].zero_grad() g_2_loss_Q = nn.MSELoss()(g_current_Q, g_2_target_Q.detach()) g_2_loss_Q.backward() self.critic_optimizer[1].step() self.s_actor_optimizer[agent_index].zero_grad() self.g_actor_optimizer[agent_index].zero_grad() # state_i = state_batch[:, agent_index, :] # action_i = self.actors[agent_index](state_i) s_state = batch_state[:, :, 0:2, :] s_state = torch.from_numpy(s_state).to(self.device).float() s_state = s_state.view(total_numbers_of_data * 2, -1) g_state = batch_state[:, :, 2:4, :] g_state = torch.from_numpy(g_state).to(self.device).float() g_state = g_state.view(total_numbers_of_data * 2, -1) s_action = self.s_actor[agent_index](s_state) g_action = self.g_actor[agent_index](g_state) # # # # striker # # # s_ac = one_hot.clone() # 8*8 * 7 g_ac = one_hot.clone() s_action = s_action.view(-1, 1, 7).to(self.device) g_action = g_action.view(-1, 1, 7).to(self.device) # print(s_action.shape) s_ac = s_ac.view(-1, 2, 7) # print(s_ac.shape) s_ac[:, 0] = s_action.squeeze() g_ac = g_ac.view(-1, 2, 7) g_ac[:, 1] = g_action.squeeze() sup_action = s_ac.view(total_numbers_of_data, -1) gup_action = g_ac.view(total_numbers_of_data, -1) sactor_loss = -self.critic[0](whole_state, sup_action) sactor_loss = sactor_loss.mean() sactor_loss.backward() gactor_loss = -self.critic[1](whole_state, gup_action) gactor_loss = gactor_loss.mean() gactor_loss.backward() self.s_actor_optimizer[agent_index].step() self.g_actor_optimizer[agent_index].step() # # # # goalie # # # c_loss.append(s_1_loss_Q + s_2_loss_Q + g_1_loss_Q + g_2_loss_Q) a_loss.append(sactor_loss + gactor_loss) if self.steps_done % 100 == 0 and self.steps_done > 0: soft_update(self.critic_target[0], self.critic[0], self.tau) soft_update(self.s_actors_target[0], self.s_actor[0], self.tau) soft_update(self.critic_target[1], self.critic[1], self.tau) soft_update(self.g_actors_target[0], self.g_actor[0], self.tau) return c_loss, a_loss
class DQNAgent: def __init__(self, max_action: int, training=False): self.max_action = max_action self.sess = None def reset_state(self, observation: np.ndarray): self.stacked_frames, self.state = stack_frames( deque([], maxlen=self.stack_size), observation, self.frame_size, self.stack_size) def do_setup(self, args: Dict, observation: np.ndarray, session: tf.Session): self.frame_size = args.frame_size self.stack_size = args.stack_size self.mem = Experience(capacity=args.mem_capacity) self.dqn = DQN((*args.frame_size, args.stack_size), self.max_action, args.learning_rate, "ex") self.stacked_frames, self.state = stack_frames( deque([], maxlen=self.stack_size), observation, args.frame_size, args.stack_size) self.sess = session self.sess.run(tf.global_variables_initializer()) def remember(self, observation: np.ndarray, action: int, reward: float): """ Add current observation to the stack of frames and create a memory entry corresponding to this tuple. Also update the internal state of the agent. """ self.stacked_frames, next_state = stack_frames(self.stacked_frames, observation, self.frame_size, self.stack_size) self.mem.add((self.state, action, reward, next_state)) self.state = next_state def get_random_action(self): return np.random.randint(self.max_action) # TODO: move explore_prob_* and decay rate inside the agent # This should be used when the agent is still in training. def predict_action(self, explore_prob_begin: float, explore_prob_min: float, decay_rate: float, decay_step: int): explore_prob_curr = explore_prob_min + \ (explore_prob_begin - explore_prob_min) * \ np.exp(-decay_rate * decay_step) if np.random.rand() < explore_prob_curr: action = self.get_random_action() else: Qs = self.sess.run(self.dqn.output, feed_dict={ self.dqn.inputs_: self.state.reshape(1, *self.state.shape) }) action = int(np.argmax(Qs)) print('action: %d' % action) return action, explore_prob_curr def act(self, observation: np.ndarray): """ :param observation: numpy array of shape (width, height, 3) *defined in config file :return: int between 0 and max_action This method should be called when the agent is already trained. """ if self.sess is None: # Used some hardcoded parameters here, sorryyy. session = tf.Session() self.dqn = DQN((*(84, 84), 4), self.max_action, 0.002, "ex") saver = tf.train.Saver() saver.restore(session, './models/second_model.ckpt') self.sess = session self.cnt = 0 self.stacked_frames, self.state = stack_frames( deque([], maxlen=4), observation, (84, 84), 4) # Used to visualize the game when testing the model. cv2.imwrite('game_' + str(self.cnt) + '.png', observation) self.cnt += 1 self.stacked_frames, self.state = stack_frames(self.stacked_frames, observation, (84, 84), 4) Qs = self.sess.run(self.dqn.output, feed_dict={ self.dqn.inputs_: self.state.reshape(1, *self.state.shape) }) return int(np.argmax(Qs))
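
# `stack_frames` is called by DQNAgent above but not defined in these
# snippets. Below is a minimal sketch of the usual frame-stacking
# preprocessing it appears to perform; the grayscale/resize/normalize steps
# are assumptions, not the original implementation.
from collections import deque

import cv2
import numpy as np


def stack_frames(stacked_frames, observation, frame_size, stack_size):
    # Grayscale, resize to `frame_size`, and scale pixels to [0, 1].
    frame = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
    frame = cv2.resize(frame, frame_size) / 255.0
    if len(stacked_frames) == 0:
        # New episode: fill the deque with copies of the first frame.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    # State shape: (*frame_size, stack_size), matching the DQN input above.
    state = np.stack(stacked_frames, axis=2)
    return stacked_frames, state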
def learn(self): """ Update policy and value parameters using given batch of experience tuples""" if self.eps_done <= self.eps_b_train: return None, None if self.eps_done == (self.eps_b_train + 1): print("========== Training now =========") ByteTensor = th.cuda.ByteTensor if self.cuda_on else th.ByteTensor FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor c_loss = [] a_loss = [] for agent in range(self.n_agents): transitions = self.memory.sample(self.batch_size) batch = Experience(*zip(*transitions)) non_final_mask = ByteTensor(list(map(lambda s: s is not None, batch.next_states))) # state_batch: batch_size x n_agents x dim_obs state_batch = th.stack(batch.states).type(FloatTensor) reward_batch = th.stack(batch.rewards).type(FloatTensor) action_batch = th.stack(batch.actions).type(FloatTensor) #pdb.set_trace() # : (batch_size_non_final) x n_agents x dim_obs non_final_next_states = th.stack( [s for s in batch.next_states if s is not None]).type(FloatTensor) # for current agent whole_state = state_batch.view(self.batch_size, -1) whole_action = action_batch.view(self.batch_size, -1) self.critic_optimizer[agent].zero_grad() current_Q = self.critics[agent](whole_state, whole_action) non_final_next_actions = [ self.actors_target[i](non_final_next_states[:, i, :]) for i in range( self.n_agents)] non_final_next_actions = th.stack(non_final_next_actions) non_final_next_actions = ( non_final_next_actions.transpose(0, 1).contiguous()) target_Q = th.zeros( self.batch_size).type(FloatTensor) target_Q[non_final_mask] = self.critics_target[agent]( non_final_next_states.view(-1, self.n_agents * self.dim_obs), non_final_next_actions.view(-1, self.n_agents * self.dim_act) ).squeeze() # scale_reward: to scale reward in Q functions target_Q = (target_Q.unsqueeze(1) * GAMMA) + ( reward_batch[:, agent].unsqueeze(1) * SCALE_REWARD) loss_Q = nn.MSELoss()(current_Q, target_Q.detach()) loss_Q.backward() self.critic_optimizer[agent].step() self.actor_optimizer[agent].zero_grad() state_i = state_batch[:, agent, :] action_i = self.actors[agent](state_i) ac = action_batch.clone() ac[:, agent, :] = action_i whole_action = ac.view(self.batch_size, -1) actor_loss = -self.critics[agent](whole_state, whole_action) actor_loss = actor_loss.mean() actor_loss.backward() self.actor_optimizer[agent].step() c_loss.append(loss_Q) a_loss.append(actor_loss) #if self.steps_done % NUM_STEPS_TO_UPDATE == 0 and self.steps_done > 0: #for i in range(self.n_agents): soft_update(self.critics_target[agent], self.critics[agent], TAU) soft_update(self.actors_target[agent], self.actors[agent], TAU) return c_loss, a_loss
def train(self): print('-' * 60) print('-' * 60) print( 'PHASE I: Initial Intrinsic Motivation Learning and Subgoal Discovery' ) print('Purpose 1) Training Controller to reach random locations') print('Purpose 2) Discovering subgoals') print('-' * 60) print('-' * 60) # reset print('-' * 60) print('game episode: ', self.game_episode) print('time step: ', self.step) S = self.env.reset() s = four_frames_to_4_84_84(S) man_mask = self.image_processor.get_man_mask(S) man_loc = get_man_xy_np_coordinate(man_mask) g_id, subgoal_mask = self.image_processor.sample_from_random_subgoal_set( ) # random g print('new subgoal assigned, g_id = ', g_id) subgoal_frame = self.image_processor.create_mask_frame(subgoal_mask) g = single_channel_frame_to_1_84_84(subgoal_frame) for t in range(self.max_iter + 1): self.step = t if t < self.learning_starts: a = self.env.action_space.sample() else: a = self.epsilon_greedy(s, g) old_lives = self.env.lives() SP, r, terminal, step_info = self.env.step(a) new_lives = self.env.lives() self.episode_scores += r sp = four_frames_to_4_84_84(SP) man_mask = self.image_processor.get_man_mask(SP) man_loc = get_man_xy_np_coordinate(man_mask) intrinsic_done_task = is_man_inside_subgoal_mask( man_mask, subgoal_mask) # outlier for the subgoal discovery if r > 0: print('############# found an outlier ###############') self.subgoal_discovery.push_outlier(man_loc) else: r = -0.1 # small negative reward if intrinsic_done_task: intrinsic_done = 1 # binary mask print('succesful intrinsic motivation learning to g_id = ', g_id) r_tilde = +1.0 self.intrinsic_motivation_learning_episode += 1 else: intrinsic_done = 0 r_tilde = -0.1 # small negetive reward to motivate agent to solve task if new_lives < old_lives: print('agent died, current lives = ', new_lives) r = -1.0 r_tilde = -1.0 # dying reward if r > 100: # it means solving room #1 which in our paper is equivalent to task comelition task_done = True done = 1 # binary mask for done print('The room #1 task is completed, needs to reset!') else: task_done = False done = 0 if terminal: print('agent terminated, end of episode') r = -10.0 self.episode_rewards += r # including negative rewards for death r = np.clip(r, -1.0, 1.0) experience = Experience(s, g, g_id, a, r, r_tilde, sp, intrinsic_done, done, man_loc) self.experience_memory.push(experience) s = copy.deepcopy(sp) self.anneal_epsilon() if intrinsic_done_task: # reset subgoal when intrinsic motivation task is accomplished g_id, subgoal_mask = self.image_processor.sample_from_random_subgoal_set( ) # random g print('new subgoal assigned, g_id = ', g_id) subgoal_frame = self.image_processor.create_mask_frame( subgoal_mask) g = single_channel_frame_to_1_84_84(subgoal_frame) if (new_lives < old_lives ) and not terminal and self.repeat_noop_action > 0: for _ in range(self.repeat_noop_action ): # do 20 nothing actions to ignore post-death S, _, _, _ = self.env.step(0) s = four_frames_to_4_84_84(S) if terminal or task_done: self.episode_scores_list.append(self.episode_scores) self.episode_rewards_list.append(self.episode_rewards) self.game_episode += 1 self.episode_rewards = 0.0 self.episode_scores = 0.0 print('-' * 60) print('game episode: ', self.game_episode) print('time step: ', self.step) S = self.env.reset() # reset S s = four_frames_to_4_84_84(S) # get s man_mask = self.image_processor.get_man_mask(S) # man's mask man_loc = get_man_xy_np_coordinate( man_mask) # man's np location g_id, subgoal_mask = self.image_processor.sample_from_random_subgoal_set( ) # id and mask of random subgoal 
print('new subgoal assigned, g_id = ', g_id) subgoal_frame = self.image_processor.create_mask_frame( subgoal_mask) #subgoal frame g = single_channel_frame_to_1_84_84(subgoal_frame) if (t > self.learning_starts) and (t % self.learning_freq == 0): self.controller.update_w() if (t > 0) and (t % self.subgoal_discovery_freq == 0): # find centroids X = self.experience_memory.get_man_positions() self.subgoal_discovery.feed_data(X) self.subgoal_discovery.find_kmeans_clusters() results_file_path = './results/subgoal_discovery_step_' + str( t) + '.pkl' self.subgoal_discovery.save_results( results_file_path=results_file_path) if (t > self.learning_starts) and ( t % self.test_freq == 0): # test controller's performance self.test() if (t > 0) and (t % self.save_model_freq == 0): # save controller model model_save_path = './models/controller_step_' + str( t) + '.model' self.controller.save_model(model_save_path) print('saving model, steps = ', t) if (t > 0) and (t % self.save_results_freq == 0): results_file_path = './results/performance_results_' + str( t) + '.pkl' with open(results_file_path, 'wb') as f: pickle.dump([ self.episode_scores_list, self.episode_rewards_list, self.testing_scores ], f) if (t > self.learning_starts) and ( t % self.controller_target_update_freq == 0): self.controller.update_target_params()
def update_policy(self, i_episode, initial_train): # do not train until exploration is enough if self.episode_done <= self.episodes_before_train: return None, None ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor c_loss = [] a_loss = [] for agent in range(self.n_agents): transitions = self.memory.sample(self.batch_size) batch = Experience(*zip(*transitions)) non_final_mask = ByteTensor( list(map(lambda s: s is not None, batch.next_states))) # state_batch: batch_size x n_agents x dim_obs state_batch = Variable(th.stack(batch.states).type(FloatTensor)) action_batch = Variable(th.stack(batch.actions).type(FloatTensor)) reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor)) s = [s for s in batch.next_states if s is not None] non_final_next_states = Variable(th.stack(s).type(FloatTensor)) #tmp_whole_state = state_batch[:, (1, 4), :] initial_train = initial_train if agent == 4: tmp_state = Variable( th.zeros(self.batch_size, 5, 22).type(FloatTensor)) tmp_action = Variable( th.zeros(self.batch_size, 5, 2).type(FloatTensor)) tmp_no_final_s = Variable( th.zeros(len(non_final_next_states), 5, 22).type(FloatTensor)) non_final_next_actions = Variable( th.zeros(len(non_final_next_states), 5, 2).type(FloatTensor)) else: tmp_state = Variable( th.zeros(self.batch_size, 4, 22).type(FloatTensor)) tmp_action = Variable( th.zeros(self.batch_size, 4, 2).type(FloatTensor)) tmp_no_final_s = Variable( th.zeros(len(non_final_next_states), 4, 22).type(FloatTensor)) non_final_next_actions = Variable( th.zeros(len(non_final_next_states), 4, 2).type(FloatTensor)) non_final_next_actions_tmp = [ # [torch.FloatTensor of size 989x2] self.actors_target[i]( non_final_next_states[:, # [torch.FloatTensor of size 989x213] i, :]) for i in range(self.n_agents) ] non_final_next_actions_tmp = Variable( th.stack(non_final_next_actions_tmp).type(FloatTensor)) non_final_next_actions_tmp = (non_final_next_actions_tmp.transpose( 0, 1).contiguous()) startTime = datetime.datetime.now() # the main difference between the double values of initial_train is: when initial_train is True, the input of the critic network # is all agents' observation, while it is False, the input is only the nearest four agents observation of the now agent if initial_train is False: #non_final_next_actions = [] for j in range(self.batch_size): if j < len(non_final_next_states): tmp_no_final_s[j, 0:4, :] = non_final_next_states[j, ([ i for i in range(self.n_agents) if i != batch.max_id[j][agent] ]), :] #non_final_next_actions[j,:,:] = Variable(th.stack([self.actors_target[i](non_final_next_states[j, i, :]) for i in range(self.n_agents) if i!=batch.max_id[j][agent]]).type(FloatTensor)) non_final_next_actions[ j, 0:4, :] = non_final_next_actions_tmp[j, ([ i for i in range(self.n_agents) if i != batch.max_id[j][agent] ]), :] tmp_state[j, 0:4, :] = state_batch[j, ([ i for i in range(self.n_agents) if i != batch.max_id[j][agent] ]), :] tmp_action[j, 0:4, :] = action_batch[j, ([ i for i in range(self.n_agents) if i != batch.max_id[j][agent] ]), :] if agent == 4: tmp_state[:, 4, :] = tmp_state[:, 3, :] tmp_action[:, 4, :] = tmp_action[:, 3, :] tmp_no_final_s[:, 4, :] = tmp_no_final_s[:, 3, :] non_final_next_actions[:, 4, :] = non_final_next_actions[:, 3, :] if initial_train is True: tmp_state = state_batch tmp_action = action_batch tmp_no_final_s = non_final_next_states #whole_state = state_batch.view(self.batch_size, -1) #print('-----------------------------') 
whole_state = tmp_state.view(self.batch_size, -1) # print('whole_state',whole_state) [torch.FloatTensor of size 100x62] whole_action = tmp_action.view(self.batch_size, -1) # non_final_next_states = non_final_next_states.view(next_state_count,-1) # print('non_final_next_states',non_final_next_states) self.critic_optimizer[agent].zero_grad() current_Q = self.critics[agent](whole_state, whole_action) if initial_train is True: non_final_next_actions = [ # [torch.FloatTensor of size 989x2] self.actors_target[i]( tmp_no_final_s[:, # [torch.FloatTensor of size 989x213] i, :]) for i in range(self.n_agents) ] non_final_next_actions = Variable( th.stack(non_final_next_actions).type(FloatTensor)) target_Q = Variable(th.zeros(self.batch_size, 1).type(FloatTensor)) # print('non_final_mask',non_final_mask) if initial_train is True: target_Q[non_final_mask] = self.critics_target[agent]( tmp_no_final_s.view((-1, self.n_agents * self.n_states)), non_final_next_actions.view( (-1, self.n_agents * self.n_actions))) else: if agent != 4: target_Q[non_final_mask] = self.critics_target[agent]( tmp_no_final_s.view( (-1, (self.n_agents - 1) * self.n_states)), non_final_next_actions.view( (-1, (self.n_agents - 1) * self.n_actions))) else: target_Q[non_final_mask] = self.critics_target[agent]( tmp_no_final_s.view( (-1, self.n_agents * self.n_states)), non_final_next_actions.view( (-1, self.n_agents * self.n_actions))) # scale_reward: to scale reward in Q functions target_Q = (target_Q * self.GAMMA) + ( reward_batch[:, agent].reshape(self.batch_size, 1) * scale_reward) loss_Q = nn.MSELoss()(current_Q, target_Q.detach()) loss_Q.backward() self.critic_optimizer[agent].step() self.actor_optimizer[agent].zero_grad() state_i = state_batch[:, agent, :] action_i = self.actors[agent](state_i) * 4 ac = tmp_action.clone() if initial_train is False and agent != 4: for j in range(self.batch_size): if agent < batch.max_id[j][agent]: tmp_agent = agent else: tmp_agent = agent - 1 ac[j, tmp_agent, :] = action_i[j] if agent == 4: ac[:, 4, :] = action_i ac[:, 3, :] = action_i if initial_train is True: ac[:, agent, :] = action_i whole_action = ac.view(self.batch_size, -1) actor_loss = -self.critics[agent](whole_state, whole_action) actor_loss = actor_loss.mean() actor_loss.backward() self.actor_optimizer[agent].step() c_loss.append(loss_Q) a_loss.append(actor_loss) if self.steps_done % 100 == 0 and self.steps_done > 0: for i in range(self.n_agents): soft_update(self.critics_target[i], self.critics[i], self.tau) soft_update(self.actors_target[i], self.actors[i], self.tau) return c_loss, a_loss
def update_policy(self): # do not train until exploration is enough if self.episode_done <= self.episodes_before_train: return None, None ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor c_loss = [] a_loss = [] critics_grad = [] actors_grad = [] for agent in range(self.n_agents): transitions = self.memory.sample(self.batch_size) batch = Experience(*zip(*transitions)) non_final_mask = ByteTensor(list(map(lambda s: s is not None, batch.next_states))) # state_batch: batch_size x n_agents x dim_obs state_batch = Variable(th.stack(batch.states).type(FloatTensor)) action_batch = Variable(th.stack(batch.actions).type(FloatTensor)) reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor)) # : (batch_size_non_final) x n_agents x dim_obs non_final_next_states = Variable(th.stack( [s for s in batch.next_states if s is not None]).type(FloatTensor)) # for current agent whole_state = state_batch.view(self.batch_size, -1) whole_action = action_batch.view(self.batch_size, -1) # critic network self.critic_optimizer[agent].zero_grad() current_Q = self.models.critics[agent](whole_state, whole_action) # forward? non_final_next_actions = [ self.actors_target[i](non_final_next_states[:, i, :]) for i in range(self.n_agents)] non_final_next_actions = th.stack(non_final_next_actions) # non_final_next_actions = Variable(non_final_next_actions) non_final_next_actions = ( non_final_next_actions.transpose(0, 1).contiguous()) target_Q = Variable(th.zeros(self.batch_size).type(FloatTensor)) target_Q[non_final_mask] = self.critics_target[agent]( non_final_next_states.view(-1, self.n_agents * self.n_states), non_final_next_actions.view(-1, self.n_agents * self.n_actions)) # scale_reward: to scale reward in Q functions target_Q = (target_Q * self.GAMMA) + (reward_batch[:, agent] * scale_reward) loss_Q = nn.MSELoss()(current_Q, target_Q.detach()) loss_Q.backward() self.critic_optimizer[agent].step() # actor network self.actor_optimizer[agent].zero_grad() state_i = state_batch[:, agent, :] action_i = self.models.actors[agent](state_i) # forward ac = action_batch.clone() ac[:, agent, :] = action_i whole_action = ac.view(self.batch_size, -1) actor_loss = -self.models.critics[agent](whole_state, whole_action) # forward actor_loss = actor_loss.mean() actor_loss.backward() self.actor_optimizer[agent].step() c_loss.append(loss_Q) a_loss.append(actor_loss) # for test ''' s = 0 for x in self.models.critics[agent].parameters(): s += 1 print('s: ', s) print(type(x)) print('x.grad.shape: ', x.grad.size()) print('x.data.shape: ', x.data.size()) ''' critics_agent_grad = [] actors_agent_grad = [] for x in self.models.critics[agent].parameters(): critics_agent_grad.append(x.grad.data.norm(2)) # critics_agent_grad.append(th.mean(x.grad).data[0]) for x in self.models.actors[agent].parameters(): actors_agent_grad.append(x.grad.data.norm(2)) # actors_agent_grad.append(th.mean(x.grad).data[0]) critics_grad.append(critics_agent_grad) actors_grad.append(actors_agent_grad) if self.steps_done % 100 == 0 and self.steps_done > 0: for i in range(self.n_agents): soft_update(self.critics_target[i], self.models.critics[i], self.tau) soft_update(self.actors_target[i], self.models.actors[i], self.tau) ''' # gradient clipping if self.clip is not None: nn.utils.clip_grad_norm(self.model.parameters(), self.clip) ''' # return c_loss, a_loss #, critics_grad, actors_grad return critics_grad, actors_grad
def play(self):
    s = self.s
    if self.step < self.learning_starts:
        a = self.env.action_space.sample()
    else:
        a = self.epsilon_greedy()
    old_lives = self.env.lives()
    SP, r, terminal, step_info = self.env.step(a)
    new_lives = self.env.lives()
    self.episode_scores += r
    sp = self.preprocess_concat_frames(SP)
    if new_lives < old_lives:
        print('agent died, current lives = ', new_lives)
        r = min(-1.0, r)
    if terminal and new_lives > 0:
        task_done = True
        done = 1
        r = max(1.0, r)
        print('task is solved successfully, end of episode')
    else:
        task_done = False
        done = 0
    if terminal and new_lives == 0:
        print('agent terminated, end of episode')
        r = min(-1.0, r)
    if 'Pong' in self.env.task:
        if terminal and (self.episode_scores > 17):
            task_done = True
            done = 1
            r = max(1.0, r)
            print('task is solved successfully, end of episode')
        else:
            task_done = False
            done = 0
    # if r < 0.0 or isclose(r, 0.0):
    #     r = min(-0.01, r)
    self.episode_rewards += r
    experience = Experience(s, a, r, sp, done)
    self.experience_memory.push(experience)
    self.s = copy.deepcopy(sp)
    if terminal or task_done:
        self.episode_steps_list.append(self.step)
        self.episode_scores_list.append(self.episode_scores)
        self.episode_rewards_list.append(self.episode_rewards)
        self.episode_end_time = time.time()
        episode_time = self.episode_end_time - self.episode_start_time
        if self.there_was_a_test is True:
            episode_time = episode_time - self.test_duration
            self.there_was_a_test = False
        self.episode_time_list.append(episode_time)
        print('episode score: ', self.episode_scores)
        print('episode time: {0:.2f}'.format(episode_time))
        self.game_episode += 1
        print('-' * 60)
        print('game episode: ', self.game_episode)
        print('time step: ', self.step)
        self.episode_rewards = 0.0
        self.episode_scores = 0.0
        self.episode_start_time = time.time()
        S = self.env.reset()  # reset S
        self.s = self.preprocess_concat_frames(S)
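
# Both `play` methods above defer action selection to `self.epsilon_greedy()`,
# which is not included in these snippets. A generic sketch of the rule it
# presumably implements (the standalone signature here is illustrative only):
import random

import numpy as np


def epsilon_greedy(q_values, epsilon, action_space):
    """With probability epsilon act randomly, otherwise act greedily on Q."""
    if random.random() < epsilon:
        return action_space.sample()
    return int(np.argmax(q_values))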
model.cuda()
optimizer_meta_actor = Adam(model.parameters(), lr=0.001)
optimizer_config_network = Adam(config_network.parameters(), lr=0.001)
for t in range(100000):
    ByteTensor = pt.cuda.ByteTensor if use_cuda else pt.ByteTensor
    FloatTensor = pt.cuda.FloatTensor if use_cuda else pt.FloatTensor
    random_position = np.random.randint(
        low=length_lstm,
        high=min(len(memory), n_episode * n_agents * max_steps))
    memory_info = memory.get_item(random_position, length_lstm)
    batch = Experience(*zip(*memory_info))
    state_batch = Variable(pt.stack(batch.states).type(FloatTensor))
    action_batch = Variable(pt.stack(batch.actions).type(FloatTensor))
    for i in range(n_agents):
        optimizer_meta_actor.zero_grad()
        whole_state = state_batch[0:length_lstm - 1, i, :].view(
            length_lstm - 1, 22)
        whole_action = action_batch[0:length_lstm - 1, i, :].view(
            length_lstm - 1, 2) / 4
        final_state = state_batch[length_lstm - 1, i, :]
        final_action = action_batch[length_lstm - 1, i, :]
        # pre_data_samples = pt.cat((whole_state, whole_action), 1).unsqueeze(0)
        pre_data_samples = whole_state.unsqueeze(0)
def remember(self, states, actions, rewards, next_states, dones):
    '''Populates the replay memory with new batch of data; observations of all agents'''
    self.memory.add(
        Experience(states, actions, rewards, next_states, dones))
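
# Many snippets build batches with `Experience(*zip(*transitions))` and store
# transitions via `memory.add(...)` / `memory.push(...)` / `memory.sample(...)`,
# but neither structure is shown here. A minimal sketch, assuming a namedtuple
# whose fields match the `remember` call above (the exact field set varies
# slightly between snippets, e.g. the goal-conditioned agents store extras):
import random
from collections import namedtuple

Experience = namedtuple(
    'Experience', ('states', 'actions', 'rewards', 'next_states', 'dones'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def add(self, experience):
        # Store one Experience, overwriting the oldest entry once full.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = experience
        self.position = (self.position + 1) % self.capacity

    def push(self, *args):
        self.add(Experience(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)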
def forward(self, obs, acts):
    result = F.relu(self.FC1(obs))
    combined = pt.cat([result, acts], 1)
    result = F.relu(self.FC2(combined))
    return self.FC4(F.relu(self.FC3(result)))


model = meta_critic(5, 22, 2)
model.cuda()
optimizer = Adam(model.parameters(), lr=0.0001)
for t in range(100000):
    transitions = memory.sample(batch_size)
    batch = Experience(*zip(*transitions))
    optimizer.zero_grad()
    state_batch = Variable(pt.stack(batch.states).type(FloatTensor))
    action_batch = Variable(pt.stack(batch.actions).type(FloatTensor))
    Q_batch = Variable(pt.stack(batch.rewards).type(FloatTensor))
    whole_state = state_batch.view(batch_size, -1)
    whole_action = action_batch.view(batch_size, -1)
    whole_Q = Q_batch.view(batch_size, -1)
    prediction = model(whole_state, whole_action)
    # target_Q = Variable(pt.zeros(batch_size, 1).type(FloatTensor))
            (subgoal_mask.x, subgoal_mask.y))
        intrinsic_done = 1
        tilde_r = +1
        subgoal_index, subgoal_mask = \
            sample_from_random_subgoal_set(random_subgoals_set)
        subgoal_frame = create_mask_frame(base_img, subgoal_mask)
    else:
        intrinsic_done = 0
        tilde_r = -1
    if terminal and env.unwrapped.ale.lives() > 0:
        done = 1
    else:
        done = 0
    experience = Experience(s, g, g_id, a, r, tilde_r, sp, intrinsic_done,
                            done, man_loc)
    experience_memory.push(experience)
    epsilon = max(0.1, 1 - (1 - 0.1) * t / 1000000)
    s = deepcopy(sp)
    steps += 1
    if terminal or (steps > MAX_STEPS):
        S = reset()
        # s is reserved for 4*84*84 input image
        s = four_frames_to_4_84_84(S)
        man_mask = get_man_mask(S)
        man_loc = get_man_xy_np_coordinate(man_mask)
        subgoal_index, subgoal_mask = sample_from_random_subgoal_set(
            random_subgoals_set)  # random g
        subgoal_frame = create_mask_frame(base_img, subgoal_mask)
        g = single_channel_frame_to_1_84_84(subgoal_frame)
def update_policy(self):
    # do not train until exploration is enough
    if self.episode_done <= self.episodes_before_train:
        return None, None
    ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
    FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
    c_loss = []  # critic losses
    a_loss = []  # actor losses
    # For each agent, sample a batch of transitions from the replay memory
    for agent in range(self.n_agents):
        transitions = self.memory.sample(self.batch_size)
        # The * operator unpacks the list of sampled transitions;
        # zip(*transitions) yields [(state1, state2), (action1, action2),
        # (next_state1, next_state2), (reward1, reward2)], e.g.
        # batch = Experience(states=(1, 5), actions=(2, 6), next_states=(3, 7), rewards=(4, 8))
        batch = Experience(*zip(*transitions))
        # Mask of non-terminal next states:
        # list(map(...)) returns e.g. [True, True], which ByteTensor turns
        # into tensor([1, 1], dtype=torch.uint8)
        non_final_mask = ByteTensor(
            list(map(lambda s: s is not None, batch.next_states)))
        # state_batch: batch_size x n_agents x dim_obs
        state_batch = Variable(th.stack(batch.states).type(FloatTensor))
        action_batch = Variable(th.stack(batch.actions).type(FloatTensor))
        reward_batch = Variable(th.stack(batch.rewards).type(FloatTensor))
        # : (batch_size_non_final) x n_agents x dim_obs
        non_final_next_states = Variable(
            th.stack([s for s in batch.next_states
                      if s is not None]).type(FloatTensor))
        # for current agent
        # Reshape with view: whole_state has shape [batch_size, n_agents * dim_obs]
        whole_state = state_batch.view(self.batch_size, -1)
        whole_action = action_batch.view(self.batch_size, -1)
        # Zero the critic optimizer's gradients (clear dLoss/dWeight from the previous step)
        self.critic_optimizer[agent].zero_grad()
        # Current Q value, evaluated with the current critic
        current_Q = self.critics[agent](whole_state, whole_action)
        non_final_next_actions = [
            self.actors_target[i](non_final_next_states[:, i, :])
            for i in range(self.n_agents)
        ]
        non_final_next_actions = th.stack(non_final_next_actions)
        # transpose: swap dimensions 0 and 1; contiguous() keeps the tensor
        # contiguous in memory so that the later view() calls work
        non_final_next_actions = (non_final_next_actions.transpose(
            0, 1).contiguous())
        # TODO: taken as-is from the reference implementation; the target-Q
        # computation below follows the standard MADDPG update
        # Initialize target_Q
        target_Q = th.zeros(self.batch_size).type(FloatTensor)
        target_Q[non_final_mask] = self.critics_target[agent](
            non_final_next_states.view(-1, self.n_agents * self.n_states),
            non_final_next_actions.view(
                -1, self.n_agents * self.n_actions)).squeeze()
        # scale_reward: to scale reward in Q functions
        target_Q = (target_Q.unsqueeze(1) * self.GAMMA) + (
            reward_batch[:, agent].unsqueeze(1) * scale_reward)
        loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
        loss_Q.backward()
        self.critic_optimizer[agent].step()
        self.actor_optimizer[agent].zero_grad()
        state_i = state_batch[:, agent, :]
        action_i = self.actors[agent](state_i)
        ac = action_batch.clone()
        ac[:, agent, :] = action_i
        whole_action = ac.view(self.batch_size, -1)
        actor_loss = -self.critics[agent](whole_state, whole_action)
        actor_loss = actor_loss.mean()
        actor_loss.backward()
        self.actor_optimizer[agent].step()
        c_loss.append(loss_Q)
        a_loss.append(actor_loss)
    if self.steps_done % 100 == 0 and self.steps_done > 0:
        for i in range(self.n_agents):
            soft_update(self.critics_target[i], self.critics[i], self.tau)
            soft_update(self.actors_target[i], self.actors[i], self.tau)
    return c_loss, a_loss
def update_policy(self):
    # do not train until exploration is enough
    if self.episode_done <= self.episodes_before_train:
        return None, None
    ByteTensor = th.cuda.ByteTensor if self.use_cuda else th.ByteTensor
    FloatTensor = th.cuda.FloatTensor if self.use_cuda else th.FloatTensor
    c_loss = []
    a_loss = []
    for agent in range(self.n_agents):
        transitions = self.memory.sample(self.batch_size)
        batch = Experience(*zip(*transitions))
        non_final_mask = ByteTensor(
            list(map(lambda s: s is not None, batch.next_states)))
        # state_batch: batch_size x n_agents x dim_obs
        state_batch = th.stack(batch.states).type(FloatTensor)
        action_batch = th.stack(batch.actions).type(FloatTensor)
        reward_batch = th.stack(batch.rewards).type(FloatTensor)
        # : (batch_size_non_final) x n_agents x dim_obs
        non_final_next_states = th.stack([
            s for s in batch.next_states if s is not None
        ]).type(FloatTensor)
        # for current agent
        whole_state = state_batch.view(self.batch_size, -1)
        whole_action = action_batch.view(self.batch_size, -1)
        self.critic_optimizer[agent].zero_grad()
        current_Q = self.critics[agent](whole_state, whole_action)
        non_final_next_actions = [
            self.actors_target[i](non_final_next_states[:, i, :])
            for i in range(self.n_agents)
        ]
        non_final_next_actions = th.stack(non_final_next_actions)
        non_final_next_actions = (non_final_next_actions.transpose(
            0, 1).contiguous())
        target_Q = th.zeros(self.batch_size).type(FloatTensor)
        target_Q[non_final_mask] = self.critics_target[agent](
            non_final_next_states.view(-1, self.n_agents * self.n_states),
            non_final_next_actions.view(
                -1, self.n_agents * self.n_actions)).squeeze()
        # scale_reward: to scale reward in Q functions
        target_Q = (target_Q.unsqueeze(1) * self.GAMMA) + (
            reward_batch[:, agent].unsqueeze(1) * scale_reward)
        loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
        loss_Q.backward()
        self.critic_optimizer[agent].step()
        self.actor_optimizer[agent].zero_grad()
        state_i = state_batch[:, agent, :]
        action_i = self.actors[agent](state_i)
        ac = action_batch.clone()
        ac[:, agent, :] = action_i
        whole_action = ac.view(self.batch_size, -1)
        actor_loss = -self.critics[agent](whole_state, whole_action)
        actor_loss = actor_loss.mean()
        actor_loss.backward()
        self.actor_optimizer[agent].step()
        c_loss.append(loss_Q)
        a_loss.append(actor_loss)
    if self.steps_done % 100 == 0 and self.steps_done > 0:
        for i in range(self.n_agents):
            soft_update(self.critics_target[i], self.critics[i], self.tau)
            soft_update(self.actors_target[i], self.actors[i], self.tau)
    return c_loss, a_loss
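
# The MADDPG-style update methods above call `soft_update(target, source, tau)`,
# which is not defined in these snippets. A common Polyak-averaging sketch
# (an assumption about the implementation, matching how it is called):
def soft_update(target, source, tau):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)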
class DDPG(object): def __init__(self, env, mem_size=7 * int(1e3), lr_critic=1e-3, lr_actor=1e-4, epsilon=1., max_epi=1500, epsilon_decay=1. / (1e5), gamma=.99, target_update_frequency=200, batch_size=64, random_process=True, max_step=None): self.CUDA = torch.cuda.is_available() self.orig_env = env #for recording if max_step is not None: self.orig_env._max_episode_steps = max_step self.env = self.orig_env self.N_S = self.env.observation_space.shape[0] self.N_A = self.env.action_space.shape[0] self.MAX_EPI = max_epi self.LOW = self.env.action_space.low self.HIGH = self.env.action_space.high self.actor = Actor(self.N_S, self.N_A) self.critic = Critic(self.N_S, self.N_A) self.target_actor = Actor(self.N_S, self.N_A) self.target_critic = Critic(self.N_S, self.N_A) self.target_actor.eval() self.target_critic.eval() self.target_actor.load_state_dict(self.actor.state_dict()) self.target_critic.load_state_dict(self.critic.state_dict()) if self.CUDA: self.actor.cuda() self.critic.cuda() self.target_actor.cuda() self.target_critic.cuda() self.exp = Experience(mem_size) self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic) self.optim_actor = optim.Adam(self.actor.parameters(), lr=-lr_actor) self.random_process = OrnsteinUhlenbeckProcess(\ size=self.N_A, theta=.15, mu=0, sigma=.2) self.EPSILON = epsilon self.EPSILON_DECAY = epsilon_decay self.GAMMA = gamma self.TARGET_UPDATE_FREQUENCY = target_update_frequency self.BATCH_SIZE = batch_size title = {common.S_EPI: [], common.S_TOTAL_R: []} self.data = pd.DataFrame(title) self.RAND_PROC = random_process def train(self, dir=None, interval=1000): if dir is not None: self.env = wrappers.Monitor(self.orig_env, '{}/train_record'.format(dir), force=True) os.mkdir(os.path.join(dir, 'models')) update_counter = 0 epsilon = self.EPSILON for epi in trange(self.MAX_EPI, desc='train epi', leave=True): self.random_process.reset_states() o = self.env.reset() counter = 0 acc_r = 0 while True: counter += 1 #if dir is not None: # self.env.render() a = self.choose_action(o) if self.RAND_PROC: a += max(epsilon, 0) * self.random_process.sample() a = np.clip(a, -1., 1.) 
epsilon -= self.EPSILON_DECAY o_, r, done, info = self.env.step(self.map_to_action(a)) self.exp.push(o, a, r, o_, done) if epi > 0: self.update_actor_critic() update_counter += 1 if update_counter % self.TARGET_UPDATE_FREQUENCY == 0: self.update_target() acc_r += r o = o_ if done: break if dir is not None: if (epi + 1) % interval == 0: self.save(os.path.join(dir, 'models'), str(epi + 1), save_data=False) s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R]) self.data = self.data.append(s, ignore_index=True) def choose_action(self, state): self.actor.eval() s = Variable(torch.Tensor(state)).unsqueeze(0) if self.CUDA: s = s.cuda() a = self.actor(s).data.cpu().numpy()[0].astype('float64') self.actor.train() return a def map_to_action(self, a): return (self.LOW + self.HIGH) / 2 + a * (self.HIGH - self.LOW) / 2 def update_target(self): self.target_actor.load_state_dict(self.actor.state_dict()) self.target_critic.load_state_dict(self.critic.state_dict()) def update_actor_critic(self): # sample minibatch minibatch = common.Transition(*zip(*self.exp.sample(self.BATCH_SIZE))) bat_o = Variable(torch.Tensor(minibatch.state)) bat_a = Variable(torch.Tensor(minibatch.action)) bat_r = Variable(torch.Tensor(minibatch.reward)).unsqueeze(1) bat_o_ = Variable(torch.Tensor(minibatch.next_state)) bat_not_done_mask = list( map(lambda done: 0 if done else 1, minibatch.done)) bat_not_done_mask = Variable( torch.ByteTensor(bat_not_done_mask)).unsqueeze(1) if self.CUDA: bat_o = bat_o.cuda() bat_a = bat_a.cuda() bat_r = bat_r.cuda() bat_o_ = bat_o_.cuda() bat_not_done_mask = bat_not_done_mask.cuda() # update critic bat_a_o_ = self.target_actor(bat_o_) Gt = bat_r Gt[bat_not_done_mask] += self.GAMMA * self.target_critic( bat_o_, bat_a_o_)[bat_not_done_mask] Gt.detach_() eval_o = self.critic(bat_o, bat_a) criterion = nn.MSELoss() if self.CUDA: criterion.cuda() loss = criterion(eval_o, Gt) self.optim_critic.zero_grad() loss.backward() self.optim_critic.step() # update actor self.critic.eval() bat_a_o = self.actor(bat_o) obj = torch.mean(self.critic(bat_o, bat_a_o)) self.optim_actor.zero_grad() obj.backward() self.optim_actor.step() self.critic.train() def test(self, dir=None, n=1): if dir is not None: self.env = wrappers.Monitor(self.orig_env, '{}/test_record'.format(dir), force=True, video_callable=lambda episode_id: True) title = {common.S_EPI: [], common.S_TOTAL_R: []} df = pd.DataFrame(title) for epi in trange(n, desc='test epi', leave=True): o = self.env.reset() acc_r = 0 while True: #if dir is not None: # self.env.render() a = self.choose_action(o) o_, r, done, info = self.env.step(self.map_to_action(a)) acc_r += r o = o_ if done: break s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R]) df = df.append(s, ignore_index=True) if dir is not None: df.to_csv('{}/test_data.csv'.format(dir)) else: print df def save(self, dir, suffix='', save_data=True): torch.save(self.actor.state_dict(), '{}/actor{}.pt'.format(dir, suffix)) torch.save(self.critic.state_dict(), '{}/critic{}.pt'.format(dir, suffix)) if save_data: self.data.to_csv('{}/train_data{}.csv'.format(dir, suffix)) def load_actor(self, dir): self.actor.load_state_dict(torch.load(dir)) def load_critic(self, dir): self.critic.load_state_dict(torch.load(dir)) def get_data(self): return self.data
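
# The DDPG class above draws exploration noise from
# OrnsteinUhlenbeckProcess(size, theta, mu, sigma), which is not shown.
# A minimal sketch of such a process (an assumption, using a unit time step;
# not necessarily the original implementation):
import numpy as np


class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.reset_states()

    def reset_states(self):
        # Restart the process at its mean.
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.x) + \
            self.sigma * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x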