import random

import torch
import torch.nn.functional as F

# to_one_hot_encoding / from_one_hot_encoding are project-level helpers
# (defined elsewhere in the repository) that convert between discrete
# indices and one-hot vectors.


def learn(self, replay_buffer, env):
    self.it += 1

    states, actions, next_states, rewards, dones = replay_buffer.sample(self.batch_size)
    states = states.squeeze()
    actions = actions.squeeze()
    next_states = next_states.squeeze()
    rewards = rewards.squeeze()
    dones = dones.squeeze()

    if env.has_discrete_state_space():
        states = to_one_hot_encoding(states, self.state_dim)
        next_states = to_one_hot_encoding(next_states, self.state_dim)

    q_values = self.model(states)
    next_q_values = self.model(next_states)
    next_q_state_values = self.model_target(next_states)

    # Double DQN target: select the next action with the online model,
    # evaluate it with the target model
    q_value = q_values.gather(1, actions.long().unsqueeze(1)).squeeze(1)
    next_q_value = next_q_state_values.gather(1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
    expected_q_value = rewards + self.gamma * next_q_value * (1 - dones)

    loss = F.mse_loss(q_value, expected_q_value.detach())

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # soft target network update (Polyak averaging)
    for target_param, param in zip(self.model_target.parameters(), self.model.parameters()):
        target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    return loss
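# Illustrative sketch (not part of the original code): the Double DQN target and the
# soft target update used in learn() above, shown on toy tensors. The names
# (online, target, gamma, tau) are placeholders, not identifiers from this repository.
import torch
import torch.nn as nn


def double_dqn_target(online, target, next_states, rewards, dones, gamma=0.99):
    # action selection with the online network, evaluation with the target network
    next_actions = online(next_states).argmax(dim=1, keepdim=True)
    next_q = target(next_states).gather(1, next_actions).squeeze(1)
    return rewards + gamma * next_q * (1 - dones)


def soft_update(target, online, tau=0.005):
    # target <- tau * online + (1 - tau) * target
    for t_param, o_param in zip(target.parameters(), online.parameters()):
        t_param.data.copy_(tau * o_param.data + (1 - tau) * t_param.data)


# usage on random data
online_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
target_net.load_state_dict(online_net.state_dict())
targets = double_dqn_target(online_net, target_net,
                            torch.randn(8, 4), torch.randn(8), torch.zeros(8))
print(targets.shape)  # torch.Size([8])
soft_update(target_net, online_net)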
def reset(self):
    self.state = torch.tensor(self.reset_env.reset())

    if len(self.state) > self.state_dim:
        self.state = from_one_hot_encoding(self.state)
    elif len(self.state) < self.state_dim:
        self.state = to_one_hot_encoding(self.state, self.state_dim)

    return self.state
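# Illustrative sketch (not part of the original code): one plausible implementation of
# the to_one_hot_encoding / from_one_hot_encoding helpers used throughout these
# methods, assuming they map discrete indices to one-hot vectors and back. The
# "_sketch" names mark these as placeholders for the repository's actual helpers.
import torch
import torch.nn.functional as F


def to_one_hot_encoding_sketch(index, dim):
    # accepts a scalar, a 0-/1-dim index tensor, or a batch of indices
    index = torch.as_tensor(index, dtype=torch.long)
    return F.one_hot(index, num_classes=dim).float()


def from_one_hot_encoding_sketch(one_hot):
    # inverse operation: recover the index along the last dimension
    return one_hot.argmax(dim=-1)


# e.g. to_one_hot_encoding_sketch(2, 5)                              -> tensor([0., 0., 1., 0., 0.])
#      from_one_hot_encoding_sketch(torch.tensor([0., 0., 1., 0., 0.])) -> tensor(2)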
def select_train_action(self, state, env):
    # epsilon-greedy exploration during training
    if random.random() < self.eps:
        return env.get_random_action()
    else:
        if env.has_discrete_state_space():
            state = to_one_hot_encoding(state, self.state_dim)
        qvals = self.model(state.to(self.device))
        return torch.argmax(qvals).unsqueeze(0).detach()
def step(self, action, state=None):
    if self.is_virtual_env():
        # virtual (learned) environment: batched tensor interface
        reward_sum = None

        if self.has_discrete_action_space():
            action = to_one_hot_encoding(action, self.get_action_dim())

        if state is None:
            for i in range(self.same_action_num):
                state, reward, done = self.env.step(action=action.to(self.env.device))
                if reward_sum is None:
                    reward_sum = reward
                else:
                    reward_sum += reward
                # TODO: proper handling of the done flag for a batch of
                # states/actions if same_action_num > 1
        # required for the histogram experiment
        else:
            for i in range(self.same_action_num):
                state, reward, done = self.env.step(action=action.to(self.env.device),
                                                    state=state.to(self.env.device))
                if reward_sum is None:
                    reward_sum = reward
                else:
                    reward_sum += reward
                # TODO: proper handling of the done flag for a batch of
                # states/actions if same_action_num > 1

        # todo: to device?
        reward = reward_sum.to("cpu")
        next_state = state.to("cpu")
        done = done.to("cpu")

        if self.has_discrete_state_space():
            next_state = from_one_hot_encoding(next_state)

        return next_state, reward, done
    else:
        # real environment: numpy interface, action repeated same_action_num times
        action = action.cpu().detach().numpy()

        if self.has_discrete_action_space():
            action = action.astype(int)[0]

        reward_sum = 0
        for i in range(self.same_action_num):
            state, reward, done, _ = self.env.step(action)
            reward_sum += reward
            if done:
                break

        next_state_torch = torch.tensor(state, device="cpu", dtype=torch.float32)
        reward_torch = torch.tensor(reward_sum, device="cpu", dtype=torch.float32)
        done_torch = torch.tensor(done, device="cpu", dtype=torch.float32)

        if next_state_torch.dim() == 0:
            next_state_torch = next_state_torch.unsqueeze(0)

        return next_state_torch, reward_torch, done_torch
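# Illustrative sketch (not part of the original code): the action-repeat pattern from the
# real-environment branch of step() above, as a standalone helper. It assumes the old
# gym step API returning (obs, reward, done, info); env, action and repeat are placeholders.
def repeat_action(env, action, repeat):
    reward_sum = 0.0
    state, done = None, False
    for _ in range(repeat):
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            break  # stop repeating once the episode terminates
    return state, reward_sum, done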
def select_test_action(self, state, env):
    if env.has_discrete_state_space():
        state = to_one_hot_encoding(state, self.state_dim)
    qvals = self.model(state.to(self.device))
    return torch.argmax(qvals).unsqueeze(0).detach()
def _calc_reward(self, state, next_state, reward, info):
    if 'TimeLimit.truncated' in info:
        # remove additional information added by the TimeLimit wrapper
        info.pop('TimeLimit.truncated')

    reward_torch = torch.tensor(reward, device=self.device, dtype=torch.float32)

    if isinstance(state, int) or len(state) < self.state_dim:
        state_torch = to_one_hot_encoding(state, self.state_dim)
        next_state_torch = to_one_hot_encoding(next_state, self.state_dim)
    else:
        state_torch = torch.tensor(state, device=self.device, dtype=torch.float32)
        next_state_torch = torch.tensor(next_state, device=self.device, dtype=torch.float32)

    if self.reward_env_type == 0:
        # unshaped environment reward
        reward_res = reward_torch
    elif self.reward_env_type == 1:
        # potential-based shaping term only: gamma * phi(s') - phi(s)
        reward_res = self.gamma * self.reward_net(next_state_torch) - self.reward_net(state_torch)
    elif self.reward_env_type == 2:
        # environment reward plus potential-based shaping
        reward_res = reward_torch + self.gamma * self.reward_net(next_state_torch) \
            - self.reward_net(state_torch)
    elif self.reward_env_type == 3 or self.reward_env_type == 4:
        # as types 1/2, but the reward network also receives the info dict values
        if not info:
            raise ValueError('No info dict provided by environment')
        info_torch = torch.tensor(list(info.values()), device=self.device, dtype=torch.float32)
        input_state = torch.cat((state_torch.to(self.device), info_torch.to(self.device)),
                                dim=state_torch.dim() - 1)
        input_state_next = torch.cat((next_state_torch.to(self.device), info_torch.to(self.device)),
                                     dim=state_torch.dim() - 1)
        if self.reward_env_type == 3:
            reward_res = self.gamma * self.reward_net(input_state_next) - self.reward_net(input_state)
        else:
            reward_res = reward_torch + self.gamma * self.reward_net(input_state_next) \
                - self.reward_net(input_state)
    elif self.reward_env_type == 5:
        # additive reward on the next state only (not potential-based)
        reward_res = self.reward_net(next_state_torch)
    elif self.reward_env_type == 6:
        reward_res = reward_torch + self.reward_net(next_state_torch)
    elif self.reward_env_type == 7 or self.reward_env_type == 8:
        # as types 5/6, but with the info dict values appended to the state
        if not info:
            raise ValueError('No info dict provided by environment')
        info_torch = torch.tensor(list(info.values()), device=self.device, dtype=torch.float32)
        input_state = torch.cat((state_torch.to(self.device), info_torch.to(self.device)),
                                dim=state_torch.dim() - 1)
        input_state_next = torch.cat((next_state_torch.to(self.device), info_torch.to(self.device)),
                                     dim=state_torch.dim() - 1)
        if self.reward_env_type == 7:
            reward_res = self.reward_net(input_state_next)
        else:
            reward_res = reward_torch + self.reward_net(input_state_next)
    elif self.reward_env_type == 101 or self.reward_env_type == 102:
        # reward network operates on the info dict values only
        if not info:
            raise ValueError('No info dict provided by environment')
        info_torch = torch.tensor(list(info.values()), device=self.device, dtype=torch.float32)
        if self.reward_env_type == 101:
            reward_res = self.reward_net(info_torch)
        else:
            reward_res = reward_torch + self.reward_net(info_torch)
    else:
        raise ValueError(f'Unknown reward_env_type: {self.reward_env_type}')

    return reward_res.item()
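# Illustrative sketch (not part of the original code): the potential-based shaping used
# for reward_env_type 1-4 above, r' = r + gamma * phi(s') - phi(s), with a small MLP
# standing in for self.reward_net. All names and dimensions here are placeholders.
import torch
import torch.nn as nn

potential = nn.Sequential(nn.Linear(4, 16), nn.Tanh(), nn.Linear(16, 1))


def shaped_reward(reward, state, next_state, gamma=0.99):
    # adding gamma * phi(s') - phi(s) leaves the optimal policy unchanged
    # (potential-based shaping, Ng et al. 1999); types 5-8 drop the -phi(s)
    # term and therefore do not have this invariance property
    return reward + gamma * potential(next_state) - potential(state)


s = torch.randn(4)
s_next = torch.randn(4)
print(shaped_reward(1.0, s, s_next).item())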