def train_statistics_network(self):
    if self.replay_buffer.get_buffer_size() < self.batch_size:
        return None
    transitions = self.replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    all_actions = self.get_all_actions(self.action_space)
    all_actions = Variable(torch.cat(all_actions))

    # Marginal over next states: for every state, average the predicted next
    # state over all possible actions.
    new_state_marginals = []
    for state in states:
        state = state.expand(self.action_space, -1)
        predicted_new_states = self.fwd(state, all_actions)
        predicted_new_states = torch.mean(predicted_new_states, dim=0, keepdim=True)
        new_state_marginals.append(predicted_new_states)
    new_state_marginals = Variable(torch.cat(new_state_marginals), requires_grad=False)

    # Donsker-Varadhan lower bound on the mutual information I(s'; a)
    mutual_information = torch.mean(self.stats(new_states, actions)) - \
        torch.log(torch.mean(torch.exp(self.stats(new_state_marginals, actions))))

    # Maximize the mutual information
    loss = -mutual_information

    self.stats_optim.zero_grad()
    loss.backward()
    self.stats_optim.step()

    # Augment the extrinsic reward with the intrinsic reward and store the
    # transition in the dqn replay buffer.
    rewards = rewards + mutual_information.detach()
    self.store_transition(buffer=self.dqn_replay_buffer, state=states, action=actions,
                          new_state=new_states, reward=rewards, done=dones, success=None)

    return loss
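# The statistics-network updates above and below rely on a `get_all_actions` helper
# that is not shown in this section. A minimal sketch of one plausible implementation
# is given here, assuming a discrete action space encoded as one-hot vectors; the
# actual helper in this codebase may differ.
import torch

def get_all_actions_sketch(action_space):
    # Build a list with one one-hot tensor of shape (1, action_space) per action,
    # so that torch.cat(...) yields an (action_space, action_space) batch covering
    # every possible action exactly once.
    all_actions = []
    for a in range(action_space):
        one_hot = torch.zeros(1, action_space)
        one_hot[0, a] = 1.0
        all_actions.append(one_hot)
    return all_actions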
def train_forward_dynamics(self):
    if self.replay_buffer.get_buffer_size() < self.batch_size:
        return None
    transitions = self.replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        new_states = new_states.cuda()

    # Predict the next state and regress it onto the observed next state
    predicted_new_states = self.fwd(states, actions)
    mse_error = F.mse_loss(predicted_new_states, new_states)

    self.fwd_optim.zero_grad()
    mse_error.backward()
    self.fwd_optim.step()

    return mse_error
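# `Buffer.Transition` is a namedtuple defined in the replay-buffer module; the
# `zip(*transitions)` idiom used throughout transposes a list of transitions into
# per-field tuples of tensors that can be concatenated with torch.cat. A minimal
# sketch of the assumed definition (field order is an assumption; the real one may differ):
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))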
def train_policy(self, clip_gradients=True):
    # Sample a mini-batch from the replay buffer (uniformly or via prioritized experience replay).
    # If the buffer holds fewer transitions than the batch size, skip the update.
    if self.dqn_replay_buffer.get_buffer_size() < self.batch_size:
        return None
    transitions = self.dqn_replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    # Compute Q(s_t, a): the model computes Q(s_t) for every action and we
    # select the columns of the actions that were actually taken.
    q_values = self.policy_network(states)
    next_q_values = self.policy_network(new_states)
    next_q_state_values = self.target_policy_network(new_states).detach()

    q_value = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
    # Double DQN target: the online network selects the action,
    # the target network evaluates it.
    next_q_value = next_q_state_values.gather(
        1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
    expected_q_value = rewards + self.gamma * next_q_value * (1 - dones)
    expected_q_value = expected_q_value.detach()

    td_loss = F.smooth_l1_loss(q_value, expected_q_value)

    self.policy_optim.zero_grad()
    td_loss.backward()
    if clip_gradients:
        for param in self.policy_network.parameters():
            param.grad.data.clamp_(-1, 1)
    self.policy_optim.step()

    return td_loss
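# A minimal, self-contained illustration (not part of the original code) of the
# Double DQN target indexing used in train_policy above: the online network picks
# the greedy action for each next state, and the target network evaluates it.
def double_dqn_target_example():
    import torch
    q_next_online = torch.tensor([[1.0, 3.0], [2.0, 0.5]])   # online net on s_{t+1}
    q_next_target = torch.tensor([[0.9, 2.5], [1.8, 0.4]])   # target net on s_{t+1}
    best_actions = torch.max(q_next_online, 1)[1]             # greedy actions: [1, 0]
    next_q = q_next_target.gather(1, best_actions.unsqueeze(1)).squeeze(1)
    return next_q                                             # tensor([2.5000, 1.8000])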
def calc_td_error(self):
    """
    Calculates the TD error against the Bellman target and takes a gradient
    step on the mean squared error for the sampled batch.
    :return: the TD loss for the sampled batch
    """
    # Get the separate values from the named tuple
    transitions = self.buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    state = batch.state
    new_state = batch.next_state
    action = batch.action
    reward = batch.reward
    done = batch.done

    # The current states require gradients; the bootstrapped targets do not.
    state = Variable(torch.cat(state))
    new_state = Variable(torch.cat(new_state), requires_grad=False)
    action = Variable(torch.cat(action))
    reward = Variable(torch.cat(reward))
    done = Variable(torch.cat(done))

    if self.use_cuda:
        state = state.cuda()
        action = action.cuda()
        reward = reward.cuda()
        new_state = new_state.cuda()
        done = done.cuda()

    q_values = self.current_model(state)
    next_q_values = self.current_model(new_state)
    next_q_state_values = self.target_model(new_state)

    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    # Double DQN target: select the action with the online network,
    # evaluate it with the target network.
    next_q_value = next_q_state_values.gather(
        1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
    expected_q_value = reward + self.gamma * next_q_value * (1 - done)

    loss = (q_value - Variable(expected_q_value.data)).pow(2).mean()

    self.optim.zero_grad()
    loss.backward()
    self.optim.step()

    return loss
def fit_batch_dqn(self):
    # Sample a mini-batch from the replay buffer (uniformly or via prioritized experience replay).
    # If the buffer holds fewer transitions than the batch size, skip the update.
    if self.dqn_replay_buffer.get_buffer_size() < self.batch_size:
        return None
    transitions = self.dqn_replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    # Encode the states and the new states
    states = self.encoder(states)
    new_states = self.encoder(new_states)

    # Compute Q(s_t, a): the model computes Q(s_t) for every action and we
    # select the columns of the actions taken.
    state_action_values = self.policy_network(states).gather(
        1, actions.unsqueeze(1)).squeeze(1)

    # Compute V(s_{t+1}) for all next states with the target network.
    next_state_values = self.target_policy_network(new_states).max(1)[0].detach()
    next_state_values = next_state_values * (1 - dones)
    y = rewards + self.gamma * next_state_values

    td_loss = F.smooth_l1_loss(state_action_values, y)

    self.policy_optim.zero_grad()
    td_loss.backward()
    for param in self.policy_network.parameters():
        param.grad.data.clamp_(-1, 1)
    self.policy_optim.step()

    return td_loss
def fit_batch(self):
    transitions = self.buffer.sample_batch(self.bs)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    with torch.no_grad():
        new_states = Variable(torch.cat(new_states))
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    value_loss, values = self.calc_soft_value_function_error(states)
    q_loss, q_values = self.calc_soft_q_function_error(
        states, actions, new_states, rewards, dones)
    policy_loss = self.calc_policy_loss(states, q_values, values)

    # Update the networks
    self.value_optim.zero_grad()
    value_loss.backward()
    self.value_optim.step()

    self.critic_optim.zero_grad()
    q_loss.backward()
    self.critic_optim.step()

    self.actor_optim.zero_grad()
    policy_loss.backward()
    self.actor_optim.step()

    # Update the target networks
    self.update_target_networks()

    return value_loss, q_loss, policy_loss
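# `update_target_networks` is called above but not defined in this section. A minimal
# sketch of the usual Polyak (soft) update, assuming a mixing coefficient `tau`
# (the actual implementation in this codebase may differ):
def soft_update_sketch(target_network, source_network, tau=0.005):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    for target_param, param in zip(target_network.parameters(), source_network.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)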
def train_forward_dynamics(self, clamp_gradients=False, use_difference_representation=True):
    if self.replay_buffer.get_buffer_size() < self.batch_size:
        return None
    transitions = self.replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states))
    actions = Variable(torch.cat(actions))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        new_states = new_states.cuda()

    if use_difference_representation:
        # Under this representation, the model predicts the difference
        # between the current state and the next state.
        diff_new_states = self.fwd(states, actions)
        predicted_new_states = states + diff_new_states
    else:
        predicted_new_states = self.fwd(states, actions)

    # Regress the prediction onto the observed next state (smooth L1 / Huber loss).
    mse_error = F.smooth_l1_loss(predicted_new_states, new_states)

    self.fwd_optim.zero_grad()
    mse_error.backward()
    # Clamp the gradients
    if clamp_gradients:
        for param in self.fwd.parameters():
            param.grad.data.clamp_(-1, 1)
    self.fwd_optim.step()

    return mse_error
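# The forward-dynamics model `self.fwd` used above is defined elsewhere in the
# repository. A minimal sketch of one plausible architecture, assuming flat state
# vectors and one-hot action vectors (the real model may differ):
import torch
import torch.nn as nn

class ForwardDynamicsSketch(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        # Concatenate state and action and predict the next state
        # (or its difference from the current state).
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, state_dim),
        )

    def forward(self, state, action):
        return self.net(torch.cat([state, action], dim=1))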
def fit_batch(self):
    # Sample a mini-batch from the buffer (uniformly or via prioritized experience replay).
    # If the buffer holds fewer transitions than the batch size, skip the update.
    if self.buffer.get_buffer_size() < self.batch_size:
        return None, None
    transitions = self.buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    # Step 2: Compute the target values using the target actor and target critic networks.
    # The target action is taken from the target actor because computing a max over a
    # continuous action space is intractable.
    with torch.no_grad():
        new_action = self.target_actor(new_states)
        next_Q_values = self.target_critic(new_states, new_action)
        next_Q_values = torch.squeeze(next_Q_values, dim=1)
        next_Q_values = next_Q_values * (1 - dones)
        y = rewards + self.gamma * next_Q_values

    # Zero the optimizer gradients
    self.actor_optim.zero_grad()
    self.critic_optim.zero_grad()

    # Critic update: regress Q(s, a) onto the bootstrapped target y
    outputs = self.critic(states, actions)
    loss = self.criterion(outputs, y)
    loss.backward()
    # Clamp the gradients to avoid exploding gradients
    for param in self.critic.parameters():
        param.grad.data.clamp_(-1, 1)
    self.critic_optim.step()

    # Actor update: ascend the critic's estimate of Q(s, actor(s))
    policy_loss = -1 * self.critic(states, self.actor(states))
    policy_loss = policy_loss.mean()
    policy_loss.backward()
    # Clamp the gradients to avoid exploding gradients
    for param in self.actor.parameters():
        param.grad.data.clamp_(-1, 1)
    self.actor_optim.step()

    return loss, policy_loss
def train(self):
    epoch_episode_rewards = []

    # Initialize the training with an initial state
    state = self.env.reset()

    # Initialize the losses
    episode_reward = 0
    fwd_loss = 0
    stats_loss = 0
    policy_loss = 0

    # Mean rewards
    mean_rewards = []

    # Check whether to use cuda or not
    state = to_tensor(state, use_cuda=self.use_cuda)
    with torch.no_grad():
        state = self.encoder(state)
    state = state.detach()

    for frame_idx in range(1, self.num_frames + 1):
        # Epsilon-greedy exploration with an epsilon that decays with the frame index
        epsilon_by_frame = epsilon_greedy_exploration()
        epsilon = epsilon_by_frame(frame_idx)
        action = self.policy_network.act(state, epsilon)

        # Execute the action
        next_state, reward, done, success = self.env.step(action.item())
        episode_reward += reward
        reward = np.sign(reward)

        next_state = to_tensor(next_state, use_cuda=self.use_cuda)
        with torch.no_grad():
            next_state = self.encoder(next_state)
        next_state = next_state.detach()

        reward = torch.tensor([reward], dtype=torch.float)
        done_bool = done * 1
        done_bool = torch.tensor([done_bool], dtype=torch.float)

        # Store in the replay buffer
        self.store_transition(state=state, new_state=next_state, action=action,
                              done=done_bool, reward=reward)
        state = next_state

        if done:
            epoch_episode_rewards.append(episode_reward)
            # Reset the episode and re-encode the initial state
            episode_reward = 0
            state = self.env.reset()
            state = to_tensor(state, use_cuda=self.use_cuda)
            with torch.no_grad():
                state = self.encoder(state)
            state = state.detach()

        # Train the forward dynamics model
        if len(self.replay_buffer) > self.fwd_limit:
            # Sample a minibatch from the replay buffer
            transitions = self.replay_buffer.sample_batch(self.batch_size)
            batch = Buffer.Transition(*zip(*transitions))
            batch = self.get_train_variables(batch)
            mse_loss = self.train_forward_dynamics(batch=batch)
            fwd_loss += mse_loss.item()
            if frame_idx % self.print_every == 0:
                print('Forward Dynamics Loss :', fwd_loss / (frame_idx - self.fwd_limit))

        # Train the statistics network and the policy
        if len(self.replay_buffer) > self.policy_limit:
            transitions = self.replay_buffer.sample_batch(self.batch_size)
            batch = Buffer.Transition(*zip(*transitions))
            batch = self.get_train_variables(batch)
            loss, aug_rewards = self.train_statistics_network(batch=batch)
            p_loss = self.train_policy(batch=batch, rewards=aug_rewards)
            stats_loss += loss.item()
            policy_loss += p_loss.item()
            if frame_idx % self.print_every == 0:
                print('Statistics Loss: ', stats_loss / (frame_idx - self.policy_limit))
                print('Policy Loss: ', policy_loss / (frame_idx - self.policy_limit))

        # Print the statistics
        if self.verbose:
            if frame_idx % self.print_every == 0:
                print('Mean Reward ', str(np.mean(epoch_episode_rewards)))
                print('Sum of Rewards ', str(np.sum(epoch_episode_rewards)))
                mean_rewards.append(np.mean(epoch_episode_rewards))

        if self.plot_stats:
            if frame_idx % self.plot_every == 0:
                # Plot the statistics calculated
                self.plot(frame_idx=frame_idx, rewards=epoch_episode_rewards,
                          mean_rewards=mean_rewards, output_folder=self.output_folder,
                          placeholder_name='/DQN_montezuma_intrinsic')

        # Update the target network
        if frame_idx % self.update_every == 0:
            self.update_networks()

        # Save the models and the rewards file
        if frame_idx % self.save_epoch == 0:
            self.save_m()
            self.save_rewards(ep_rewards=epoch_episode_rewards, mean_rewards=mean_rewards)

    self.save_m()
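# `epsilon_greedy_exploration` above returns a schedule mapping the frame index to an
# exploration rate. A minimal sketch of a common exponential-decay schedule, with
# assumed start/final/decay values (the actual schedule in this codebase may differ):
import math

def epsilon_greedy_exploration_sketch(epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=30000):
    def epsilon_by_frame(frame_idx):
        # Decays from epsilon_start towards epsilon_final as frame_idx grows.
        return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)
    return epsilon_by_frame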
def train_statistics_network(self, use_jenson_shannon_divergence=True,
                             use_target_forward_dynamics=False,
                             use_target_stats_network=False,
                             clamp_gradients=False):
    if self.replay_buffer.get_buffer_size() < self.batch_size:
        return None, None, None, None
    transitions = self.replay_buffer.sample_batch(self.batch_size)
    batch = Buffer.Transition(*zip(*transitions))

    # Get the separate values from the named tuple
    states = batch.state
    new_states = batch.next_state
    actions = batch.action
    rewards = batch.reward
    dones = batch.done

    states = Variable(torch.cat(states))
    new_states = Variable(torch.cat(new_states), requires_grad=False)
    actions = Variable(torch.cat(actions))
    rewards = Variable(torch.cat(rewards))
    dones = Variable(torch.cat(dones))

    if self.use_cuda:
        states = states.cuda()
        actions = actions.cuda()
        rewards = rewards.cuda()
        new_states = new_states.cuda()
        dones = dones.cuda()

    all_actions = self.get_all_actions(self.action_space)
    all_actions = Variable(torch.cat(all_actions))

    # Marginal over next states: for every state, average the predicted next
    # state (in the difference representation) over all possible actions.
    new_state_marginals = []
    for state in states:
        state = state.expand(self.action_space, -1)
        if use_target_forward_dynamics:
            n_s = self.target_fwd(state, all_actions)
        else:
            n_s = self.fwd(state, all_actions)
        n_s = n_s.detach()
        n_s = n_s + state
        n_s = torch.mean(n_s, dim=0)
        n_s = torch.unsqueeze(n_s, dim=0)
        new_state_marginals.append(n_s)
    new_state_marginals = Variable(torch.cat(new_state_marginals), requires_grad=False)

    # Statistics-network scores for samples from the joint (s', a) and from the
    # product of the marginals (s'_marginal, a), using the online and target networks.
    p_sa = self.stats(new_states, actions)
    p_s_a = self.stats(new_state_marginals, actions)
    p_s_ta = self.target_stats(new_states, actions)
    p_s_t_a = self.target_stats(new_state_marginals, actions)

    if use_jenson_shannon_divergence:
        # Jensen-Shannon estimator: improves stability and the gradients are unbiased
        if use_target_stats_network:
            mutual_information = -F.softplus(-p_s_ta) - F.softplus(p_s_t_a)
        else:
            mutual_information = -F.softplus(-p_sa) - F.softplus(p_s_a)
        lower_bound = torch.mean(-F.softplus(-p_sa)) - torch.mean(F.softplus(p_s_a))
    else:
        # Donsker-Varadhan (KL) estimator
        if use_target_stats_network:
            mutual_information = p_s_ta - p_s_t_a
        else:
            mutual_information = p_sa - p_s_a
        lower_bound = torch.mean(p_sa) - torch.log(torch.mean(torch.exp(p_s_a)))

    # Maximize the mutual information lower bound
    loss = -lower_bound

    self.stats_optim.zero_grad()
    loss.backward()
    # Clamp the gradients
    if clamp_gradients:
        for param in self.stats.parameters():
            param.grad.data.clamp_(-1, 1)
    self.stats_optim.step()

    # Augment the extrinsic reward with the intrinsic (mutual information) reward
    # and store the transition in the dqn replay buffer.
    mutual_information = torch.squeeze(mutual_information, dim=-1)
    mutual_information = mutual_information.detach()
    rewards_combined = rewards + self.intrinsic_param * mutual_information

    # Store the updated reward transition in the replay buffer
    self.store_transition(state=states, action=actions, new_state=new_states,
                          reward=rewards_combined, done=dones,
                          buffer=self.dqn_replay_buffer)

    return loss, rewards, mutual_information, lower_bound
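# The statistics network `self.stats` (and its target copy) scores (next_state, action)
# pairs; it is defined elsewhere in the repository. A minimal sketch of one plausible
# MINE-style critic, assuming flat next-state vectors and one-hot action vectors
# (the real network may differ):
import torch
import torch.nn as nn

class StatisticsNetworkSketch(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),  # scalar score T(s', a)
        )

    def forward(self, next_state, action):
        return self.net(torch.cat([next_state, action], dim=1))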