def __call__(self, state, greedy=False):
  action_scale = self.env.max_action

  # initial exploration and intrinsic curiosity
  res = None
  if self.training:
    if self.config.get('initial_explore') and len(self.replay_buffer) < self.config.initial_explore:
      # fill the replay buffer with uniformly random actions before using the policy
      res = np.array([self.env.action_space.sample() for _ in range(self.env.num_envs)])
    elif hasattr(self, 'ag_curiosity'):
      state = self.ag_curiosity.relabel_state(state)

  state = flatten_state(state, self.config.modalities + self.config.goal_modalities)  # flatten goal environments
  if hasattr(self, 'state_normalizer'):
    state = self.state_normalizer(state, update=self.training)

  if res is not None:
    return res

  state = self.torch(state)

  if self.use_actor_target:
    action, _ = self.actor_target(state)
  else:
    action, _ = self.actor(state)
  action = self.numpy(action)

  if self.training and not greedy and self.config.get('eexplore'):
    eexplore = self.config.eexplore
    if hasattr(self, 'ag_curiosity'):
      eexplore = self.ag_curiosity.go_explore * self.config.go_eexplore + eexplore
    # with probability eexplore, replace an environment's action with a uniform random action
    mask = (np.random.random((action.shape[0], 1)) < eexplore).astype(np.float32)
    randoms = np.random.random(action.shape) * (2 * action_scale) - action_scale
    action = mask * randoms + (1 - mask) * action

  return np.clip(action, -action_scale, action_scale)
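As a minimal standalone sketch of the exploration mixing above (the `eexplore` value, array shapes, and zero policy actions are illustrative assumptions, not values from the source): with probability `eexplore`, an environment's policy action is replaced wholesale by a uniform random action in [-action_scale, action_scale].

import numpy as np

eexplore, action_scale = 0.1, 1.0                  # assumed values for illustration
action = np.zeros((4, 2))                          # stand-in policy output: 4 envs, 2-dim actions
mask = (np.random.random((action.shape[0], 1)) < eexplore).astype(np.float32)  # per-env coin flip
randoms = np.random.random(action.shape) * (2 * action_scale) - action_scale   # uniform in [-scale, scale]
action = mask * randoms + (1 - mask) * action      # masked envs act fully at random this step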
def __call__(self, state, greedy=False):
  res = None

  # Initial Exploration
  if self.training:
    if self.config.get('initial_explore') and len(self.replay_buffer) < self.config.initial_explore:
      res = np.array([self.env.action_space.sample() for _ in range(self.env.num_envs)])
    elif hasattr(self, 'ag_curiosity'):
      state = self.ag_curiosity.relabel_state(state)

  state = flatten_state(state)  # flatten goal environments
  if hasattr(self, 'state_normalizer'):
    state = self.state_normalizer(state, update=self.training)

  if res is not None:
    return res

  state = self.torch(state)

  if self.use_qvalue_target:
    q_values = self.numpy(self.qvalue_target(state))
  else:
    q_values = self.numpy(self.qvalue(state))

  # epsilon-greedy: the random-action probability follows a schedule over environment steps
  if self.training and not greedy and np.random.random() < self.config.random_action_prob(steps=self.config.env_steps):
    action = np.random.randint(self.env.action_space.n, size=[self.env.num_envs])
  else:
    action = np.argmax(q_values, -1)  # greedy action; argmax already returns integer indices

  return action
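The call `self.config.random_action_prob(steps=...)` above suggests the random-action probability is a callable schedule over environment steps. A minimal sketch of one possible schedule, assuming (not taken from the source) a simple linear decay:

def linear_random_action_prob(initial=1.0, final=0.05, decay_steps=100_000):
  # returns a schedule: probability decays linearly from `initial` to `final` over `decay_steps`
  def schedule(steps):
    frac = min(steps / decay_steps, 1.0)
    return initial + frac * (final - initial)
  return schedule

# e.g. schedule = linear_random_action_prob(); schedule(steps=50_000) -> 0.525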
def _process_experience(self, exp):
  if getattr(self, 'logger'):
    self.logger.add_tabular('Replay buffer size', len(self.buffer))

  done = np.expand_dims(exp.done, 1)      # format for replay buffer
  reward = np.expand_dims(exp.reward, 1)  # format for replay buffer
  action = exp.action

  if self.goal_shape:
    state = flatten_state(exp.state, self.modalities)
    next_state = flatten_state(exp.next_state, self.modalities)
    if hasattr(self, 'achieved_goal'):
      previous_achieved = self.achieved_goal(exp.state)
      achieved = self.achieved_goal(exp.next_state)
    else:
      previous_achieved = exp.state['achieved_goal']
      achieved = exp.next_state['achieved_goal']
    desired = flatten_state(exp.state, self.goal_modalities)

    if hasattr(self, 'ag_curiosity') and self.ag_curiosity.current_goals is not None:
      behavioral = self.ag_curiosity.current_goals
      # recompute online reward against the curiosity-chosen (behavioral) goals
      reward = self.env.compute_reward(achieved, behavioral, {
          's': state,
          'a': action,
          'ns': next_state
      }).reshape(-1, 1)
    else:
      behavioral = desired

    for i in range(self.n_envs):
      self._subbuffers[i].append([
          state[i], action[i], reward[i], next_state[i], done[i],
          previous_achieved[i], achieved[i], behavioral[i], desired[i]
      ])
  else:
    state = exp.state
    next_state = exp.next_state
    for i in range(self.n_envs):
      self._subbuffers[i].append(
          [state[i], action[i], reward[i], next_state[i], done[i]])

  # when an environment's trajectory ends, stack its per-step lists into arrays and
  # move the completed trajectory into the main buffer
  for i in range(self.n_envs):
    if exp.trajectory_over[i]:
      trajectory = [np.stack(a) for a in zip(*self._subbuffers[i])]
      self.buffer.add_trajectory(*trajectory)
      self._subbuffers[i] = []
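A minimal sketch of the trajectory-stacking step above, with made-up shapes: `zip(*subbuffer)` regroups the per-timestep lists by field, so `np.stack` produces one array per field (states, actions, rewards, ...) with a leading time dimension.

import numpy as np

subbuffer = [  # one entry per timestep: [state, action, reward, next_state, done]
    [np.zeros(3), np.zeros(2), np.zeros(1), np.zeros(3), np.zeros(1)],
    [np.ones(3),  np.ones(2),  np.ones(1),  np.ones(3),  np.ones(1)],
]
trajectory = [np.stack(field) for field in zip(*subbuffer)]
# trajectory[0].shape == (2, 3): stacked states; trajectory[1].shape == (2, 2): stacked actions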
def __call__(self, state, greedy=False):
  action_scale = self.env.max_action

  # initial exploration and intrinsic curiosity
  res = None
  if self.training:
    if self.config.get('initial_explore') and len(self.replay_buffer) < self.config.initial_explore:
      res = np.array([self.env.action_space.sample() for _ in range(self.env.num_envs)])
    elif hasattr(self, 'ag_curiosity'):
      state = self.ag_curiosity.relabel_state(state)

  state = flatten_state(state, self.config.modalities + self.config.goal_modalities)  # flatten goal environments, batch x state_dim
  if hasattr(self, 'state_normalizer'):
    state = self.state_normalizer(state, update=self.training)

  if res is not None:
    return res

  state = self.torch(state)

  # score each proposed action with the critic and keep the highest-valued proposal per state
  action_proposals = self.actor(state)  # batch x num_proposals x action_dim
  states = torch.repeat_interleave(state, action_proposals.shape[1], 0)
  q_values = self.critic(states, action_proposals.reshape(-1, action_proposals.shape[-1]))
  q_values = q_values.reshape(state.shape[0], action_proposals.shape[1])  # batch x num_proposals
  best_actions = torch.argmax(q_values, dim=-1, keepdim=True)  # batch x 1
  action = action_proposals.gather(
      1, torch.tile(best_actions[:, :, None], (1, 1, action_proposals.shape[2]))).squeeze(1)
  action = self.numpy(action)

  if self.training and not greedy:
    action = self.action_noise(action)
    if self.config.get('eexplore'):
      eexplore = self.config.eexplore
      if hasattr(self, 'ag_curiosity'):
        eexplore = self.ag_curiosity.go_explore * self.config.go_eexplore + eexplore
      mask = (np.random.random((action.shape[0], 1)) < eexplore).astype(np.float32)
      randoms = np.random.random(action.shape) * (2 * action_scale) - action_scale
      action = mask * randoms + (1 - mask) * action

  return np.clip(action, -action_scale, action_scale)
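A minimal sketch of the proposal-selection step above, with assumed shapes: `argmax` over the per-proposal Q-values gives one index per batch element, and `gather` extracts the corresponding action proposal (here using `expand` rather than `torch.tile`, which is equivalent for this purpose).

import torch

batch, num_proposals, action_dim = 4, 3, 2          # assumed shapes for illustration
action_proposals = torch.randn(batch, num_proposals, action_dim)
q_values = torch.randn(batch, num_proposals)        # critic score for each proposal
best = torch.argmax(q_values, dim=-1, keepdim=True)                  # batch x 1
idx = best[:, :, None].expand(-1, -1, action_dim)                    # batch x 1 x action_dim
best_action = action_proposals.gather(1, idx).squeeze(1)             # batch x action_dim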