def predict_batch(self, states):
    batch_size = states.shape[0]
    # sample K random action sequences per state: (horizon, b, K, ...)
    actions = self.action_sampler.sample(
        (self.horizon, batch_size, self.num_random_action_selection))
    # broadcast each state across the K sampled action sequences
    states = np.expand_dims(states, axis=1)
    states = np.tile(states, (1, self.num_random_action_selection, 1))
    states = convert_to_tensor(states)
    actions = convert_to_tensor(actions)
    cost = torch.zeros(size=(batch_size, self.num_random_action_selection)).type(FloatTensor)
    for i in range(self.horizon):
        # reshape states and actions: (b, K, ob_dim) -> (b * K, ob_dim)
        states = states.view(-1, *states.shape[2:])
        current_action = actions[i].view(-1, *actions[i].shape[2:])
        # predict next states and rewards
        next_states, rewards = self.model.predict_next_states_rewards(states, current_action)
        # reshape back to (b, K, ...)
        rewards = rewards.view(batch_size, self.num_random_action_selection)
        next_states = next_states.view(batch_size, self.num_random_action_selection,
                                       *next_states.shape[1:])
        cost += -rewards * self.gamma_inverse
        states = next_states
    # for each state in the batch, take the first action of the lowest-cost sequence
    best_action = torch.gather(actions[0], dim=1,
                               index=torch.argmin(cost, dim=1, keepdim=True))
    best_action = best_action.cpu().numpy()
    return best_action
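# A minimal usage sketch for the random-shooting controller above, assuming a
# gym-style environment `env` and a `controller` object exposing predict_batch;
# `run_mpc_episode` and both argument names are illustrative, not part of this repo.
import numpy as np

def run_mpc_episode(env, controller, max_steps=1000):
    """Roll out one episode, replanning with predict_batch at every step."""
    state = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        # predict_batch expects a batch of states, so add and strip a batch dimension
        action = controller.predict_batch(np.expand_dims(state, axis=0))[0]
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward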
def predict_next_state(self, state, action):
    # add a batch dimension, run the batched predictor, then strip the batch dimension
    states = np.expand_dims(state, axis=0)
    actions = np.expand_dims(action, axis=0)
    states = convert_to_tensor(states)
    actions = convert_to_tensor(actions)
    next_state = self.predict_next_states(states, actions).cpu().numpy()[0]
    return next_state
def predict(self, state):
    state = np.expand_dims(state, axis=0)
    with torch.no_grad():
        state = convert_to_tensor(state)
        state = (state - self.state_mean) / self.state_std
        action = self.model.forward(state)
    return action.cpu().numpy()[0]
def update(self, replay_buffer, num_updates, action_limit, policy_freq=2,
           batch_size=128, target_noise=0.2, clip_noise=0.5, tau=5e-3, gamma=0.99):
    for i in range(num_updates):
        transition = replay_buffer.sample(batch_size)
        s_batch, a_batch, s2_batch, r_batch, t_batch = convert_to_tensor(
            transition, location='gpu')
        r_batch = r_batch.type(FloatTensor)
        t_batch = t_batch.type(FloatTensor)

        # compute the target q value with clipped target-policy smoothing (no gradient)
        with torch.no_grad():
            target_action_noise = torch.clamp(
                torch.randn_like(a_batch) * target_noise,
                min=-clip_noise, max=clip_noise)
            target_action = torch.clamp(
                self.target_actor_module.forward(s2_batch) + target_action_noise,
                min=-action_limit, max=action_limit)
            target_q = self.target_critic_module.forward(
                state=s2_batch, action=target_action, minimum=True)
            q_target = r_batch + gamma * target_q * (1 - t_batch)

        # critic loss: regress both critics toward the shared target
        q_values, q_values2 = self.critic_module.forward(s_batch, a_batch, minimum=False)
        q_values_loss = F.mse_loss(q_values, q_target) + F.mse_loss(q_values2, q_target)
        self.critic_optimizer.zero_grad()
        q_values_loss.backward()
        self.critic_optimizer.step()

        # delayed policy and target-network updates
        if i % policy_freq == 0:
            action = self.actor_module.forward(s_batch)
            q_values = self.critic_module.forward(s_batch, action, minimum=False)[0]
            loss = -torch.mean(q_values)
            self.actor_optimizer.zero_grad()
            loss.backward()
            self.actor_optimizer.step()

            soft_update(self.target_critic_module, self.critic_module, tau)
            soft_update(self.target_actor_module, self.actor_module, tau)
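# The soft_update helper called above is not shown in this snippet. A plausible
# implementation, assuming the standard Polyak-averaging form
# target <- (1 - tau) * target + tau * source:
import torch

def soft_update(target_module, source_module, tau):
    """Polyak-average source parameters into the target network in place."""
    with torch.no_grad():
        for target_param, source_param in zip(target_module.parameters(),
                                              source_module.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * source_param.data)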
def set_statistics(self, dataset: ReplayBuffer):
    state_mean, state_std = dataset.state_mean_std
    self.state_mean = convert_to_tensor(state_mean).unsqueeze(dim=0)
    self.state_std = convert_to_tensor(state_std).unsqueeze(dim=0)
    if self.dynamics_model.discrete:
        self.action_mean = None
        self.action_std = None
    else:
        action_mean, action_std = dataset.action_mean_std
        self.action_mean = convert_to_tensor(action_mean).unsqueeze(dim=0)
        self.action_std = convert_to_tensor(action_std).unsqueeze(dim=0)
    delta_state_mean, delta_state_std = dataset.delta_state_mean_std
    self.delta_state_mean = convert_to_tensor(delta_state_mean).unsqueeze(dim=0)
    self.delta_state_std = convert_to_tensor(delta_state_std).unsqueeze(dim=0)
    if self.cost_fn_batch is None:
        reward_mean, reward_std = dataset.reward_mean_std
        self.reward_mean = reward_mean
        self.reward_std = reward_std
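# Sketch of how the statistics stored above are typically consumed by a dynamics
# model: inputs are normalized, the network predicts a normalized state delta, and
# the prediction is de-normalized and added back to the current state. The function
# below is illustrative only; the model's actual predict_next_states may differ.
import torch

def example_predict_next_states(model, states, actions, stats, eps=1e-6):
    """Predict next states under a delta-state normalization scheme (assumed)."""
    normalized_states = (states - stats.state_mean) / (stats.state_std + eps)
    if stats.action_mean is not None:
        actions = (actions - stats.action_mean) / (stats.action_std + eps)
    normalized_delta = model.forward(normalized_states, actions)
    delta = normalized_delta * stats.delta_state_std + stats.delta_state_mean
    return states + delta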
def update(self, obs, actions, next_obs, done, reward):
    """ Sample a mini-batch from replay buffer and update the network

    Args:
        obs: (batch_size, ob_dim)
        actions: (batch_size, action_dim)
        next_obs: (batch_size, ob_dim)
        done: (batch_size,)
        reward: (batch_size,)

    Returns: None
    """
    obs = convert_to_tensor(obs)
    actions = convert_to_tensor(actions)
    next_obs = convert_to_tensor(next_obs)
    done = convert_to_tensor(done).type(FloatTensor)
    reward = convert_to_tensor(reward)

    # q loss: soft Bellman backup using the minimum of the two target critics
    q_values, q_values2 = self.q_network.forward(obs, actions, False)
    with torch.no_grad():
        next_action_distribution = self.policy_net.forward_action(next_obs)
        next_action = next_action_distribution.sample()
        next_action_log_prob = next_action_distribution.log_prob(next_action)
        target_q_values = self.target_q_network.forward(
            next_obs, next_action, True) - self.alpha * next_action_log_prob
        q_target = reward + self.gamma * (1.0 - done) * target_q_values
    q_values_loss = F.mse_loss(q_values, q_target) + F.mse_loss(q_values2, q_target)

    # policy loss
    if self.discrete:
        # for a discrete action space, the KL divergence can be computed
        # analytically without sampling; the target is the Boltzmann
        # distribution over Q-values with temperature alpha
        action_distribution = self.policy_net.forward_action(obs)
        q_values_min = self.q_network.forward(obs, None, True)  # (batch_size, ac_dim)
        probs = F.softmax(q_values_min / self.alpha, dim=-1)
        target_distribution = torch.distributions.Categorical(probs=probs)
        policy_loss = torch.distributions.kl_divergence(
            action_distribution, target_distribution).mean()
        log_prob = -action_distribution.entropy()
    else:
        action_distribution = self.policy_net.forward_action(obs)
        pi = action_distribution.rsample()
        log_prob = action_distribution.log_prob(pi)  # should be shape (batch_size,)
        q_values_pi_min = self.q_network.forward(obs, pi, True)
        policy_loss = torch.mean(log_prob * self.alpha - q_values_pi_min)

    # alpha loss: automatic entropy-temperature tuning
    if self.log_alpha_tensor is not None:
        alpha_loss = -(self.log_alpha_tensor *
                       (log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha_tensor.exp().item()

    self.q_optimizer.zero_grad()
    q_values_loss.backward()
    self.q_optimizer.step()

    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()
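# A minimal sketch of how the entropy-temperature variables referenced above
# (alpha, log_alpha_tensor, target_entropy, alpha_optimizer) are commonly set up
# when automatic entropy tuning is enabled. The attribute names match the update
# method, but the initialization itself is an assumption, not this repo's code.
import torch

def setup_entropy_tuning(agent, action_dim, lr=3e-4):
    # a common heuristic: target entropy = -|A| for continuous action spaces
    agent.target_entropy = -float(action_dim)
    agent.log_alpha_tensor = torch.zeros(1, requires_grad=True)
    agent.alpha = agent.log_alpha_tensor.exp().item()
    agent.alpha_optimizer = torch.optim.Adam([agent.log_alpha_tensor], lr=lr)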
def predict_batch(self, states):
    states = convert_to_tensor(states.astype(np.float32))
    action_distribution = self.policy_net.forward_action(states)
    return action_distribution.sample().cpu().numpy()
def predict_batch(self, state):
    state = convert_to_tensor(state.astype(np.float32))
    # run the actor without building a graph so the output can be moved to numpy
    with torch.no_grad():
        return self.actor_module.forward(state).cpu().numpy()
def set_state_stats(self, state_mean, state_std):
    self.state_mean = convert_to_tensor(state_mean).unsqueeze(dim=0)
    self.state_std = convert_to_tensor(state_std).unsqueeze(dim=0)