def step(self, reward, observation):
    # Three-phase schedule: first DQN-only episodes, then MCTS-only episodes,
    # afterwards alternate between the two agents on even/odd episodes.
    if self.episode_counter < episodes_only_dqn:
        action = BaseDynaAgent.step(self, reward, observation)
    elif self.episode_counter < episodes_only_dqn + episodes_only_mcts:
        action = MCTSAgent.step(self, reward, observation)
    else:
        if self.episode_counter % 2 == 0:
            action = BaseDynaAgent.step(self, reward, observation)
        else:
            action = MCTSAgent.step(self, reward, observation)
    return action
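The thresholds episodes_only_dqn and episodes_only_mcts are referenced above but not defined in this snippet. A minimal sketch of how the schedule could be configured and inspected, assuming they are module-level constants; the concrete values and the helper name phase are placeholders for illustration only:

# Hypothetical module-level schedule constants (not part of the original snippet).
episodes_only_dqn = 50    # episodes [0, 50): pure DQN (BaseDynaAgent) steps
episodes_only_mcts = 50   # episodes [50, 100): pure MCTS steps

def phase(episode_counter):
    """Return which agent drives the step in a given episode (illustrative helper)."""
    if episode_counter < episodes_only_dqn:
        return 'dqn'
    if episode_counter < episodes_only_dqn + episodes_only_mcts:
        return 'mcts'
    # afterwards, alternate: even episodes -> DQN, odd episodes -> MCTS
    return 'dqn' if episode_counter % 2 == 0 else 'mcts'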
def step(self, reward, observation):
    self.time_step += 1
    if self.episode_counter % 2 == 0:
        self.state = self.getStateRepresentation(observation)
        self.action = self.policy(self.state)
        reward = torch.tensor([reward], device=self.device)

        # track the TD error of the previous transition
        with torch.no_grad():
            real_prev_action = self.action_list[self.prev_action.item()]
            prev_state_value = self.getStateActionValue(
                self.prev_state, real_prev_action).item()
            # max over actions of the Q-network at the current state
            state_value = self._vf['q']['network'](
                self.state).max(1)[0].item()
            td_error = reward.item() + self.gamma * state_value - prev_state_value
        self.update_average_td_error(td_error)

        # store the new transition in the buffer
        self.updateTransitionBuffer(
            utils.transition(self.prev_state, self.prev_action, reward,
                             self.state, self.action, False,
                             self.time_step, 0))

        # update the target network
        if self._target_vf['counter'] >= self._target_vf['update_rate']:
            self.setTargetValueFunction(self._vf['q'], 'q')
            # self.setTargetValueFunction(self._vf['s'], 's')

        # update the value functions from the buffer
        if self._vf['q']['training']:
            if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['q']['batch_size'])
                self.updateValueFunction(transition_batch, 'q')
        if self._vf['s']['training']:
            if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['s']['batch_size'])
                self.updateValueFunction(transition_batch, 's')

        # train/plan with the model
        self.trainModel()
        self.plan()

        self.updateStateRepresentation()

        self.prev_state = self.getStateRepresentation(observation)
        self.prev_action = self.action  # another option: call self.policy again here
        action = self.action_list[self.prev_action.item()]
    else:
        action = MCTSAgent.step(self, reward, observation)
    return action
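The update_average_td_error helper called above is not shown in this snippet. A minimal sketch of one way such a running average could be maintained; the use of an exponential moving average, the attribute name average_td_error, and the decay rate are assumptions, not necessarily what the original code does:

# Hypothetical method sketch for the agent class; the EMA form and decay
# value are assumptions made for illustration.
def update_average_td_error(self, td_error, decay=0.99):
    if not hasattr(self, 'average_td_error'):
        # initialize the tracker on the first observed TD error
        self.average_td_error = abs(td_error)
    else:
        # exponential moving average of the absolute TD error
        self.average_td_error = (decay * self.average_td_error
                                 + (1 - decay) * abs(td_error))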
def step(self, reward, observation):
    if self.episode_counter % 2 == 0:
        self.time_step += 1
        self.state = self.getStateRepresentation(observation)
        self.action = self.policy(self.state)

        # update the target network
        if self._target_vf['counter'] >= self._target_vf['update_rate']:
            self.setTargetValueFunction(self._vf['q'], 'q')
            # self.setTargetValueFunction(self._vf['s'], 's')

        # update the value functions from the buffer
        if self._vf['q']['training']:
            if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['q']['batch_size'])
                self.updateValueFunction(transition_batch, 'q')
        if self._vf['s']['training']:
            if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['s']['batch_size'])
                self.updateValueFunction(transition_batch, 's')

        # train/plan with the model
        self.trainModel()
        self.plan()

        self.updateStateRepresentation()

        self.prev_state = self.getStateRepresentation(observation)
        self.prev_action = self.action  # another option: call self.policy again here
        action = self.action_list[self.prev_action.item()]
    else:
        action = MCTSAgent.step(self, reward, observation)
    return action
def step(self, reward, observation):
    action = MCTSAgent.step(self, reward, observation)
    return action
def step(self, reward, observation):
    if self.episode_counter % 2 == 0:
        self.time_step += 1
        self.state = self.getStateRepresentation(observation)
        self.action = self.policy(self.state)
        reward = torch.tensor([reward], device=self.device)

        # store the new transition in the buffer
        self.updateTransitionBuffer(
            utils.transition(self.prev_state, self.prev_action, reward,
                             self.state, self.action, False,
                             self.time_step, 0))

        # update the target network
        if self._target_vf['counter'] >= self._target_vf['update_rate']:
            self.setTargetValueFunction(self._vf['q'], 'q')
            # self.setTargetValueFunction(self._vf['s'], 's')

        # update the value functions from the buffer
        if self._vf['q']['training']:
            if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['q']['batch_size'])
                self.updateValueFunction(transition_batch, 'q')
        if self._vf['s']['training']:
            if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['s']['batch_size'])
                self.updateValueFunction(transition_batch, 's')

        # train/plan with the model
        self.trainModel()
        self.plan()

        self.updateStateRepresentation()

        self.prev_state = self.getStateRepresentation(observation)
        self.prev_action = self.action  # another option: call self.policy again here
        action = self.action_list[self.prev_action.item()]
    else:
        action = MCTSAgent.step(self, reward, observation)

        # store the MCTS transition in the buffer as well, so the value
        # function can also learn from MCTS experience
        prev_action_index = self.getActionIndex(self.mcts_prev_action)
        prev_action_torch = torch.tensor([prev_action_index],
                                         device=self.device,
                                         dtype=int).view(1, 1)
        reward = torch.tensor([reward], device=self.device).float()
        state_torch = self.getStateRepresentation(observation)
        self.updateTransitionBuffer(
            utils.transition(self.mcts_prev_state, prev_action_torch, reward,
                             state_torch, None, False, self.time_step, 0))
        self.mcts_prev_state = state_torch
        self.mcts_prev_action = action

        # update the target network
        if self._target_vf['counter'] >= self._target_vf['update_rate']:
            self.setTargetValueFunction(self._vf['q'], 'q')
            # self.setTargetValueFunction(self._vf['s'], 's')

        # update the value functions from the buffer
        if self._vf['q']['training']:
            if len(self.transition_buffer) >= self._vf['q']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['q']['batch_size'])
                self.updateValueFunction(transition_batch, 'q')
        if self._vf['s']['training']:
            if len(self.transition_buffer) >= self._vf['s']['batch_size']:
                transition_batch = self.getTransitionFromBuffer(
                    n=self._vf['s']['batch_size'])
                self.updateValueFunction(transition_batch, 's')
    return action
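The MCTS branch above relies on getActionIndex to map the raw MCTS action back to its index in self.action_list before storing it in the shared buffer. That helper is not defined in this snippet; a minimal sketch, assuming self.action_list holds the primitive actions and that actions can be compared with numpy equality:

import numpy as np

# Hypothetical method sketch for the agent class; the linear search and the
# np.array_equal comparison are assumptions made for illustration.
def getActionIndex(self, action):
    for i, a in enumerate(self.action_list):
        if np.array_equal(a, action):
            return i
    raise ValueError("action not found in action_list")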