Code Example #1
File: continuous_off_policy.py Project: spitis/mrl
  def _setup(self):
    """Sets up actor/critic optimizers and creates target network modules"""

    self.targets_and_models = []

    # Actor setup
    actor_params = []
    self.actors = []
    for module in list(self.module_dict.values()):
      name = module.module_name
      if name.startswith('actor') and isinstance(module, PytorchModel):
        self.actors.append(module)
        actor_params += list(module.model.parameters())
        target = module.copy(name + '_target')
        target.model.load_state_dict(module.model.state_dict())
        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in target.model.parameters():
          p.requires_grad = False
        self.agent.set_module(name + '_target', target)
        self.targets_and_models.append((target.model, module.model))

    if actor_params:
      self.actor_opt = torch.optim.Adam(
          actor_params,
          lr=self.config.actor_lr,
          weight_decay=self.config.actor_weight_decay)
    else:
      self.actor_opt = AttrDict({'state_dict': lambda: []})
    
    self.actor_params = actor_params

    # Critic setup
    critic_params = []
    self.critics = []
    for module in list(self.module_dict.values()):
      name = module.module_name
      if name.startswith('critic') and isinstance(module, PytorchModel):
        self.critics.append(module)
        critic_params += list(module.model.parameters())
        target = module.copy(name + '_target')
        target.model.load_state_dict(module.model.state_dict())
        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in target.model.parameters():
          p.requires_grad = False
        self.agent.set_module(name + '_target', target)
        self.targets_and_models.append((target.model, module.model))

    self.critic_opt = torch.optim.Adam(
        critic_params,
        lr=self.config.critic_lr,
        weight_decay=self.config.critic_weight_decay)
    
    self.critic_params = critic_params

    self.action_scale = self.env.max_action
Code Example #2
File: env.py Project: nicolascastanet/mrl
    def step(self, action):

        obs, reward, done, info = self.env.step(action)

        if self.mode == 'Bob':
            # First visit done for Bob
            if np.allclose(reward, 0.):
                done = True
                info['is_success'] = True
                if info.get('TimeLimit.truncated'):
                    del info['TimeLimit.truncated']

            return obs, reward, done, info

        elif self.mode == 'Alice':
            info = AttrDict(info)
            self.total_rewards += reward
            if done:
                done = False
                info.done_observation = obs
                #info.terminal_state = True
                if info.get('TimeLimit.truncated'):
                    done = True
                    info.terminal_state = False
                info.episodic_return = self.total_rewards
                self.total_rewards = 0
            else:
                info.terminal_state = False
                info.episodic_return = None

            return obs, reward, done, info
Code Example #3
  def _overshoot_goals(self, experience, overshooting_idxs, overshooting_proposals):
    #score the proposals
    num_proposals = overshooting_proposals.shape[1]
    num_idxs = len(overshooting_idxs)
    states = np.tile(experience.reset_state['observation'][overshooting_idxs, None, :], (1, num_proposals, 1))
    states = np.concatenate((states, overshooting_proposals), -1).reshape(num_proposals * num_idxs, -1)

    bad_q_idxs, q_values = [], None
    if self.use_qcutoff:
      q_values = self.compute_q(states)
      q_values = q_values.reshape(num_idxs, num_proposals)
      bad_q_idxs = q_values < self.cutoff
    goal_values = self.score_goals(overshooting_proposals, AttrDict(q_values=q_values, states=states))

    if self.config.dg_score_multiplier > 1. and self.dg_kde.ready:
      dg_scores = self.dg_kde.evaluate_log_density(overshooting_proposals.reshape(num_proposals * num_idxs, -1))
      dg_scores = dg_scores.reshape(num_idxs, num_proposals)
      goal_values[dg_scores > -np.inf] *= self.config.dg_score_multiplier

    goal_values[bad_q_idxs] = q_values[bad_q_idxs] * -1e-8

    chosen_idx = np.argmin(goal_values, axis=1)
    chosen_idx = np.eye(num_proposals)[chosen_idx]  # one-hot selection matrix, shape: num_idxs x num_proposals
    chosen_ags = np.sum(overshooting_proposals * chosen_idx[:, :, None], axis=1)  # num_idxs x goal_feats

    for idx, goal in zip(overshooting_idxs, chosen_ags):
      self.current_goals[idx] = goal
      self.replaced_goal[idx] = 1.
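
The selection at the end of _overshoot_goals uses a one-hot matrix (np.eye(num_proposals)[chosen_idx]) and a broadcasted sum to pick one proposal per overshooting index. A minimal NumPy illustration, independent of mrl, of why this is equivalent to fancy indexing:

import numpy as np

# Toy data (not from mrl): 2 indices, 3 proposals each, 4 goal features.
proposals = np.arange(2 * 3 * 4, dtype=np.float32).reshape(2, 3, 4)  # num_idxs x num_proposals x goal_feats
scores = np.array([[0.5, 0.1, 0.9],
                   [0.3, 0.7, 0.2]])                                 # num_idxs x num_proposals

chosen = np.argmin(scores, axis=1)           # index of the lowest-score proposal per row
one_hot = np.eye(scores.shape[1])[chosen]    # num_idxs x num_proposals one-hot matrix
selected = np.sum(proposals * one_hot[:, :, None], axis=1)  # num_idxs x goal_feats

# Equivalent to plain fancy indexing:
assert np.allclose(selected, proposals[np.arange(len(chosen)), chosen])
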
Code Example #4
File: eval.py Project: spitis/mrl
    def __call__(self, num_episodes: int, *unused_args, any_success=False):
        """
    Runs num_episodes episodes in the environment and returns results.
    Results tracking is done here instead of in process_experience, since these
    experiences aren't "real" experiences; i.e., the agent cannot learn from them.
    """
        self.eval_mode()
        env = self.eval_env
        num_envs = env.num_envs

        episode_rewards, episode_steps = [], []
        discounted_episode_rewards = []
        is_successes = []
        record_success = False

        while len(episode_rewards) < num_episodes:
            state = env.reset()

            dones = np.zeros((num_envs, ))
            steps = np.zeros((num_envs, ))
            is_success = np.zeros((num_envs, ))
            ep_rewards = [[] for _ in range(num_envs)]

            while not np.all(dones):
                action = self.policy(state)
                state, reward, dones_, infos = env.step(action)

                for i, (rew, done,
                        info) in enumerate(zip(reward, dones_, infos)):
                    if dones[i]:
                        continue
                    ep_rewards[i].append(rew)
                    steps[i] += 1
                    if done:
                        dones[i] = 1.
                    if 'is_success' in info:
                        record_success = True
                        is_success[i] = max(
                            info['is_success'], is_success[i]
                        ) if any_success else info['is_success']

            for ep_reward, step, is_succ in zip(ep_rewards, steps, is_success):
                if record_success:
                    is_successes.append(is_succ)
                episode_rewards.append(sum(ep_reward))
                discounted_episode_rewards.append(
                    discounted_sum(ep_reward, self.config.gamma))
                episode_steps.append(step)

        if hasattr(self, 'logger'):
            if len(is_successes):
                self.logger.add_scalar('Test/Success', np.mean(is_successes))
            self.logger.add_scalar('Test/Episode_rewards',
                                   np.mean(episode_rewards))
            self.logger.add_scalar('Test/Discounted_episode_rewards',
                                   np.mean(discounted_episode_rewards))
            self.logger.add_scalar('Test/Episode_steps',
                                   np.mean(episode_steps))

        return AttrDict({'rewards': episode_rewards, 'steps': episode_steps})
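
The helper discounted_sum used above is not shown on this page. A minimal sketch of what such a helper presumably computes (the actual mrl utility may differ):

def discounted_sum(rewards, gamma):
  """Hypothetical sketch: sum_t gamma**t * r_t over one episode's reward list."""
  total, discount = 0.0, 1.0
  for r in rewards:
    total += discount * r
    discount *= gamma
  return total

# e.g. discounted_sum([1.0, 1.0, 1.0], 0.99) == 1.0 + 0.99 + 0.9801
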
Code Example #5
def debug_vectorized_experience(state, action, next_state, reward, done, info):
  """Gym returns an ambiguous "done" signal. VecEnv doesn't 
  let you fix it until now. See ReturnAndObsWrapper in env.py for where
  these info attributes are coming from."""
  experience = AttrDict(
    state = state,
    action = action,
    reward = reward,
    info = info
  )
  next_copy = deepcopy(next_state) # deepcopy handles dict states

  for idx in np.argwhere(done):
    i = idx[0]
    if isinstance(next_copy, np.ndarray):
      next_copy[i] = info[i].done_observation
    else:
      assert isinstance(next_copy, dict)
      for key in next_copy:
        next_copy[key][i] = info[i].done_observation[key]
  
  experience.next_state = next_copy
  experience.trajectory_over = done
  experience.done = np.array([info[i].terminal_state for i in range(len(done))], dtype=np.float32)
  experience.reset_state = next_state
  experience.dont_record = np.zeros(len(reward)) # per-env flag: 1. = don't record this trajectory in the replay buffer
  
  return next_state, experience
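
A toy illustration (not mrl code) of the terminal-observation patch performed above: where done is set, the auto-reset observation in next_state is replaced by the true terminal observation that the wrapper stored in info[i].done_observation (plain dicts are used here in place of AttrDict):

import numpy as np
from copy import deepcopy

next_state = np.array([[0.1, 0.2], [9.9, 9.9], [0.5, 0.6]])   # env 1 has already auto-reset
done = np.array([0., 1., 0.])
info = [dict(), dict(done_observation=np.array([0.3, 0.4])), dict()]

next_copy = deepcopy(next_state)
for idx in np.argwhere(done):
  i = idx[0]
  next_copy[i] = info[i]['done_observation']

print(next_copy[1])  # [0.3 0.4] -- the true terminal observation, not the reset one
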
Code Example #6
    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        info = AttrDict(info)
        self.total_rewards += reward
        if done:
            info.done_observation = obs
            info.terminal_state = True
            if info.get('TimeLimit.truncated'):
                info.terminal_state = False
            info.episodic_return = self.total_rewards
            self.total_rewards = 0
        else:
            info.terminal_state = False
            info.episodic_return = None
        return obs, reward, done, info
Code Example #7
File: agent_base.py Project: nicolascastanet/mrl
def config_to_agent(config_dict: dict):
    module_list = []
    config = AttrDict()
    for k, v in config_dict.items():
        if is_module_or_or_module_list(v):
            module_list += flatten_modules(v)
        else:
            config[k] = v

    return Agent(module_list, config)
Code Example #8
  def _setup(self):
    self.ag_buffer = self.replay_buffer.buffer.BUFF.buffer_ag

    env = self.env
    assert type(env.observation_space) == gym.spaces.Dict
    self.goal_space = env.observation_space.spaces["desired_goal"]

    # Note: for now we apply entropy estimation on the achieved goal (ag) space
    # Define the buffers to store for prioritization
    items = [("entropy", (1,)), ("priority", (1,))]
    self.buffer = AttrDict()
    for name, shape in items:
      self.buffer['buffer_' + name] = RingBuffer(self.ag_buffer.maxlen, shape=shape)

    self._subbuffers = [[] for _ in range(self.env.num_envs)]
    self.n_envs = self.env.num_envs

    # Define the placeholder for mixture model to estimate trajectory
    self.clf = 0
Code Example #9
File: agent_base.py Project: nicolascastanet/mrl
    def __init__(
        self,
        module_list: Iterable,  # list of mrl.Modules (possibly nested)
        config: AttrDict):  # hyperparameters and module settings

        self.config = config
        parent_folder = config.parent_folder
        assert parent_folder, "Setting the agent's parent folder is required!"
        self.agent_name = config.get(
            'agent_name') or 'agent_' + short_timestamp()
        self.agent_folder = os.path.join(parent_folder, self.agent_name)
        load_agent = False
        if os.path.exists(self.agent_folder):
            print('Detected existing agent! Loading agent from checkpoint...')
            load_agent = True
        else:
            os.makedirs(self.agent_folder, exist_ok=True)

        self._process_experience_registry = [
        ]  # set of modules which define _process_experience
        self._optimize_registry = []  # set of modules which define _optimize
        self.config.env_steps = 0
        self.config.opt_steps = 0

        module_list = flatten_modules(module_list)
        self.module_dict = AttrDict()
        for module in module_list:
            assert module.module_name
            setattr(self, module.module_name, module)
            self.module_dict[module.module_name] = module
        for module in module_list:
            self._register_module(module)

        self.training = True

        if load_agent:
            self.load()
            print('Successfully loaded saved agent!')
        else:
            self.save()
Code Example #10
def config_to_agent(config_dict: dict):
    '''
    The important method that actually creates the Agent (agent factory)

    :param config_dict: the dictionary of configuration parameters
    :return: the agent!!!!!
    '''

    module_list = []
    config = AttrDict()
    for k, v in config_dict.items():
        if is_module_or_or_module_list(v):
            module_list += flatten_modules(v)
        else:
            config[k] = v

    return Agent(module_list, config)
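
AttrDict appears throughout these examples (configs, experiences, buffers). A minimal stand-in, assuming the usual dict-with-attribute-access behavior; the real mrl utility may differ in details:

class AttrDict(dict):
  """Sketch of an attribute-access dict: keys are also readable/writable as attributes."""
  __getattr__ = dict.__getitem__
  __setattr__ = dict.__setitem__

config = AttrDict()
config['gamma'] = 0.99   # dict-style write, as in config[k] = v above...
print(config.gamma)      # ...attribute-style read: 0.99

Because it subclasses dict, calls like config.get('agent_name') and iteration over items keep working, which is consistent with how config objects are used elsewhere in these examples.
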
Code Example #11
File: agent_base.py Project: nicolascastanet/mrl
class Agent():
    """
  The base agent class. Important: Agents should almost always be generated from a config_dict
  using mrl.util.config_to_agent(config_dict). See configs folder for default configs / examples.
  
  Agent is a flat collection of mrl.Module, which may include:
    - environments (train/eval)
    - replay buffer(s)
    - new task function
    - action function  (exploratory + greedy) 
    - loss function
    - intrinsic curiosity module 
    - value / policy networks and other models (e.g. goal generation)
    - planner (e.g., MCTS)
    - logger
    - anything else you want (image tagger, human interface, etc.)

  Agent has some lifecycle methods (process_experience, optimize, save, load) that call the 
  corresponding lifecycle hooks on modules that declare them.

  Modules have a reference to the Agent so that they can access each other via the Agent. Actually,
  modules use __getattr__ to access the agent directly (via self.*), so they are effectively agent
  methods that are defined in separate files / have their own initialize/save/load functions.

  Modules are registered and saved/restored individually. This lets you swap out / tweak individual
  agent methods without subclassing the agent. Individual saves let you swap out saved modules via
  the filesystem (good for, e.g., BatchRL) and avoid pickling problems from non-picklable modules.
  """
    def __init__(
        self,
        module_list: Iterable,  # list of mrl.Modules (possibly nested)
        config: AttrDict):  # hyperparameters and module settings

        self.config = config
        parent_folder = config.parent_folder
        assert parent_folder, "Setting the agent's parent folder is required!"
        self.agent_name = config.get(
            'agent_name') or 'agent_' + short_timestamp()
        self.agent_folder = os.path.join(parent_folder, self.agent_name)
        load_agent = False
        if os.path.exists(self.agent_folder):
            print('Detected existing agent! Loading agent from checkpoint...')
            load_agent = True
        else:
            os.makedirs(self.agent_folder, exist_ok=True)

        self._process_experience_registry = [
        ]  # set of modules which define _process_experience
        self._optimize_registry = []  # set of modules which define _optimize
        self.config.env_steps = 0
        self.config.opt_steps = 0

        module_list = flatten_modules(module_list)
        self.module_dict = AttrDict()
        for module in module_list:
            assert module.module_name
            setattr(self, module.module_name, module)
            self.module_dict[module.module_name] = module
        for module in module_list:
            self._register_module(module)

        self.training = True

        if load_agent:
            self.load()
            print('Successfully loaded saved agent!')
        else:
            self.save()

    def train_mode(self):
        """Set agent to train mode; exploration / use dropout / etc. As in Pytorch."""
        self.training = True

    def eval_mode(self):
        """Set agent to eval mode; act deterministically / don't use dropout / etc."""
        self.training = False

    def process_experience(self, experience: AttrDict):
        """Calls the _process_experience function of each relevant module
    (typically, these will include a replay buffer and one or more logging modules)"""
        self.config.env_steps += self.env.num_envs if hasattr(self,
                                                              'env') else 1
        for module in self._process_experience_registry:
            module._process_experience(experience)

    def optimize(self):
        """Calls the _optimize function of each relevant module
    (typically, this will be the main algorithm; but may include others)"""
        self.config.opt_steps += 1
        for module in self._optimize_registry:
            module._optimize()

    def _register_module(self, module):
        """
    Provides module with a reference to agent so that modules can interact; e.g., 
    allows agent's policy to reference the value function.

    Then, calls each module's _setup and verify methods to _setup the module and
    verify that agent has all required modules.
    """
        self.module_dict[module.module_name] = module

        module.agent = self
        module.verify_agent_compatibility()
        module._setup()
        module.new_task()
        if hasattr(module, '_process_experience'):
            self._process_experience_registry.append(module)
        if hasattr(module, '_optimize'):
            self._optimize_registry.append(module)

    def set_module(self, module_name, module):
        """
    Sets a module (can be used to switch environments / policies)
    """
        setattr(self, module_name, module)
        self._register_module(module)

    def save(self, subfolder: Optional[str] = None):
        """
    The state of all stateful modules is saved to the agent's folder.
    The agent itself is NOT saved, and should be (1) rebuilt, and (2) restored using self.load().
    Subfolder can be used to save various checkpoints of same agent.
    """
        save_folder = self.agent_folder
        subfolder = subfolder or 'checkpoint'
        save_folder = os.path.join(save_folder, subfolder)

        if not os.path.exists(save_folder):
            os.makedirs(save_folder)

        for module in self.module_dict.values():
            module.save(save_folder)

        with open(os.path.join(save_folder, 'config.pickle'), 'wb') as f:
            pickle.dump(self.config, f)

    def load(self, subfolder: Optional[str] = None):
        """
    Restores state of stateful modules from the agent's folder[/subfolder].
    """
        save_folder = self.agent_folder
        subfolder = subfolder or 'checkpoint'
        save_folder = os.path.join(save_folder, subfolder)

        assert os.path.exists(save_folder), "load path does not exist!"

        with open(os.path.join(save_folder, 'config.pickle'), 'rb') as f:
            self.config = pickle.load(f)

        for module in self.module_dict.values():
            print("Loading module {}".format(module.module_name))
            module.load(save_folder)

    def save_checkpoint(self, checkpoint_dir):
        """
    Saves agent together with its buffer, regardless of the save_replay_buf setting.
    Keeps 2 saves in the checkpoint folder in case the job is killed and the last
    checkpoint is corrupted.

    NOTE: You should call agent.save to save to the main folder BEFORE calling this.
    """
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        with open(os.path.join(checkpoint_dir, 'INITIALIZED'), 'w') as f:
            f.write('INITIALIZED')

        subfolder1 = os.path.join(checkpoint_dir, '1')
        subfolder2 = os.path.join(checkpoint_dir, '2')

        os.makedirs(os.path.join(subfolder1, 'checkpoint'), exist_ok=True)
        os.makedirs(os.path.join(subfolder2, 'checkpoint'), exist_ok=True)

        done1 = os.path.join(subfolder1, 'DONE')
        done2 = os.path.join(subfolder2, 'DONE')

        if not os.path.exists(done1):
            savedir = subfolder1
            done_file = done1
        elif not os.path.exists(done2):
            savedir = subfolder2
            done_file = done2
        else:
            modtime1 = os.path.getmtime(done1)
            modtime2 = os.path.getmtime(done2)
            if modtime1 < modtime2:
                savedir = subfolder1
                done_file = done1
            else:
                savedir = subfolder2
                done_file = done2

            os.remove(done_file)

        savedir_checkpoint = os.path.join(savedir, 'checkpoint')
        # First save all modules, including replay buffer
        old_save_replay_buf = self.config.save_replay_buf
        self.config.save_replay_buf = True
        for module in self.module_dict.values():
            module.save(savedir_checkpoint)
        self.config.save_replay_buf = old_save_replay_buf

        # Now save the config also
        with open(os.path.join(savedir_checkpoint, 'config.pickle'),
                  'wb') as f:
            pickle.dump(self.config, f)

        # Now copy over the config and results files from the agent_folder
        files_and_folders = glob.glob(os.path.join(self.agent_folder, '*'))
        for file_or_folder in files_and_folders:
            if os.path.isfile(file_or_folder):
                shutil.copy(file_or_folder, savedir)

        # Finally, write the DONE file.
        with open(done_file, 'w') as f:
            f.write('DONE')

    def load_from_checkpoint(self, checkpoint_dir):
        """
    This loads an agent from a checkpoint_dir to which it was saved using the `save_checkpoint` method.
    """
        subfolder1 = os.path.join(checkpoint_dir, '1')
        subfolder2 = os.path.join(checkpoint_dir, '2')
        done1 = os.path.join(subfolder1, 'DONE')
        done2 = os.path.join(subfolder2, 'DONE')

        if not os.path.exists(done1):
            assert os.path.exists(done2)
            savedir = subfolder2
        elif not os.path.exists(done2):
            savedir = subfolder1
        else:
            modtime1 = os.path.getmtime(done1)
            modtime2 = os.path.getmtime(done2)
            if modtime1 > modtime2:
                savedir = subfolder1
            else:
                savedir = subfolder2

        savedir_checkpoint = os.path.join(savedir, 'checkpoint')

        # First load the agent
        with open(os.path.join(savedir_checkpoint, 'config.pickle'),
                  'rb') as f:
            self.config = pickle.load(f)

        for module in self.module_dict.values():
            print("Loading module {}".format(module.module_name))
            module.load(savedir_checkpoint)

        # Then copy over the config and results file to the agent_folder
        files_and_folders = glob.glob(os.path.join(savedir, '*'))
        for file_or_folder in files_and_folders:
            if os.path.isfile(file_or_folder):
                shutil.copy(file_or_folder, self.agent_folder)

    def torch(self, x):
        if isinstance(x, torch.Tensor): return x
        return torch.FloatTensor(x).to(self.config.device)

    def numpy(self, x):
        return x.cpu().detach().numpy()
Code Example #12
File: continuous_off_policy.py Project: spitis/mrl
class OffPolicyActorCritic(mrl.Module):
  """This is the standard DDPG"""

  def __init__(self):
    super().__init__(
        'algorithm',
        required_agent_modules=['actor', 'critic', 'replay_buffer', 'env'],
        locals=locals())

  def _setup(self):
    """Sets up actor/critic optimizers and creates target network modules"""

    self.targets_and_models = []

    # Actor setup
    actor_params = []
    self.actors = []
    for module in list(self.module_dict.values()):
      name = module.module_name
      if name.startswith('actor') and isinstance(module, PytorchModel):
        self.actors.append(module)
        actor_params += list(module.model.parameters())
        target = module.copy(name + '_target')
        target.model.load_state_dict(module.model.state_dict())
        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in target.model.parameters():
          p.requires_grad = False
        self.agent.set_module(name + '_target', target)
        self.targets_and_models.append((target.model, module.model))

    if actor_params:
      self.actor_opt = torch.optim.Adam(
          actor_params,
          lr=self.config.actor_lr,
          weight_decay=self.config.actor_weight_decay)
    else:
      self.actor_opt = AttrDict({'state_dict': lambda: []})
    
    self.actor_params = actor_params

    # Critic setup
    critic_params = []
    self.critics = []
    for module in list(self.module_dict.values()):
      name = module.module_name
      if name.startswith('critic') and isinstance(module, PytorchModel):
        self.critics.append(module)
        critic_params += list(module.model.parameters())
        target = module.copy(name + '_target')
        target.model.load_state_dict(module.model.state_dict())
        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in target.model.parameters():
          p.requires_grad = False
        self.agent.set_module(name + '_target', target)
        self.targets_and_models.append((target.model, module.model))

    self.critic_opt = torch.optim.Adam(
        critic_params,
        lr=self.config.critic_lr,
        weight_decay=self.config.critic_weight_decay)
    
    self.critic_params = critic_params

    self.action_scale = self.env.max_action

  def save(self, save_folder : str):
    path = os.path.join(save_folder, self.module_name + '.pt')
    torch.save({
      'actor_opt_state_dict': self.actor_opt.state_dict(),
      'critic_opt_state_dict': self.critic_opt.state_dict()
    }, path)

  def load(self, save_folder : str):
    path = os.path.join(save_folder, self.module_name + '.pt')
    checkpoint = torch.load(path)
    #self.actor_opt.load_state_dict(checkpoint['actor_opt_state_dict'])
    self.critic_opt.load_state_dict(checkpoint['critic_opt_state_dict'])

  def _optimize(self):
    if len(self.replay_buffer) > self.config.warm_up:
      states, actions, rewards, next_states, gammas = self.replay_buffer.sample(
          self.config.batch_size)

      self.optimize_from_batch(states, actions, rewards, next_states, gammas)
      
      if self.config.opt_steps % self.config.target_network_update_freq == 0:
        for target_model, model in self.targets_and_models:
          soft_update(target_model, model, self.config.target_network_update_frac)

  def optimize_from_batch(self, states, actions, rewards, next_states, gammas):
    raise NotImplementedError('Subclass this!')
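
The soft_update helper called in _optimize above is not shown on this page. A hedged sketch of a standard polyak-averaging update matching the call signature soft_update(target_model, model, frac); the actual mrl helper may differ:

import torch

def soft_update(target, source, tau):
  """Sketch of polyak averaging: target_params <- tau * source_params + (1 - tau) * target_params."""
  with torch.no_grad():
    for target_p, source_p in zip(target.parameters(), source.parameters()):
      target_p.data.mul_(1.0 - tau)
      target_p.data.add_(tau * source_p.data)

With a small tau (target_network_update_frac), the frozen target networks trail the online networks slowly, which is the usual DDPG-style stabilization the comments above refer to.
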
Code Example #13
  def __init__(self, limit, item_shape, n_cpu=1):
    """
    The replay buffer object. Stores everything in float32.

    :param limit: (int) the max number of transitions to store
    :param item_shape: a list of tuples of (str) item name and (tuple) the shape for item
      Ex: [("observations", env.observation_space.shape),\
          ("actions",env.action_space.shape),\
          ("rewards", (1,)),\
          ("dones", (1,))]
    """
    self.limit = limit

    global BUFF
    BUFF = AttrDict()
    self.BUFF = BUFF # a global object that has shared RawArray-based RingBuffers.

    BUFF.items = []

    # item buffers
    for name, shape in item_shape:
      BUFF.items.append('buffer_' + name)
      BUFF['raw_' + name] = RawArray('f', int(np.prod((limit, ) + shape)))
      BUFF['np_' + name] =\
        np.frombuffer(BUFF['raw_' + name], dtype=np.float32).reshape((limit, ) + shape)
      BUFF['buffer_' + name] = RingBuffer(limit, shape=shape, data=BUFF['np_' + name])

    # special buffers
    BUFF.raw_tidx = RawArray('d', limit)
    BUFF.np_tidx = np.frombuffer(BUFF.raw_tidx, dtype=np.int64)
    BUFF.buffer_tidx = RingBuffer(limit, shape=(), dtype=np.int64, data=BUFF.np_tidx)

    BUFF.raw_tleft = RawArray('d', limit)
    BUFF.np_tleft = np.frombuffer(BUFF.raw_tleft, dtype=np.int64)
    BUFF.buffer_tleft = RingBuffer(limit, shape=(), dtype=np.int64, data=BUFF.np_tleft)

    if 'buffer_bg' in BUFF: # is this a successful trajectory?
      BUFF.raw_success = RawArray('f', limit)
      BUFF.np_success = np.frombuffer(BUFF.raw_success, dtype=np.float32)
      BUFF.buffer_success = RingBuffer(limit, shape=(), dtype=np.float32, data=BUFF.np_success)

    self.trajectories = OrderedDict() # a centralized dict of trajectory_id --> trajectory_idxs
    self.total_trajectory_len = 0
    self.current_trajectory = 0

    self.pool = None
    self.n_cpu = n_cpu
    if n_cpu > 1:
      self.pool = mp.Pool(n_cpu, initializer=worker_init, initargs=(BUFF,))
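
A toy illustration (not mrl code) of the RawArray + np.frombuffer pattern the constructor relies on: the NumPy view and the raw shared array alias the same memory, which is what lets worker processes initialized with BUFF see the ring buffers without copying:

import numpy as np
from multiprocessing.sharedctypes import RawArray

limit, shape = 5, (3,)
raw = RawArray('f', int(np.prod((limit,) + shape)))                      # shared float32 storage
np_view = np.frombuffer(raw, dtype=np.float32).reshape((limit,) + shape)  # zero-copy NumPy view

np_view[0] = [1.0, 2.0, 3.0]   # write through the NumPy view
print(raw[0:3])                # [1.0, 2.0, 3.0] -- same underlying buffer
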
Code Example #14
class EntropyPrioritizedOnlineHERBuffer(mrl.Module):

  def __init__(
      self,
      module_name='prioritized_replay',
      rank_method='dense',
      temperature=1.0
  ):
    """
    Buffer that stores entropy of trajectories for prioritized replay
    """

    super().__init__(module_name, required_agent_modules=['env','replay_buffer'], locals=locals())

    self.goal_space = None
    self.buffer = None
    self.rank_method = rank_method
    self.temperature = temperature
    self.traj_len = None

  def _setup(self):
    self.ag_buffer = self.replay_buffer.buffer.BUFF.buffer_ag

    env = self.env
    assert type(env.observation_space) == gym.spaces.Dict
    self.goal_space = env.observation_space.spaces["desired_goal"]

    # Note: for now we apply entropy estimation on the achieved goal (ag) space
    # Define the buffers to store for prioritization
    items = [("entropy", (1,)), ("priority", (1,))]
    self.buffer = AttrDict()
    for name, shape in items:
      self.buffer['buffer_' + name] = RingBuffer(self.ag_buffer.maxlen, shape=shape)

    self._subbuffers = [[] for _ in range(self.env.num_envs)]
    self.n_envs = self.env.num_envs

    # Define the placeholder for mixture model to estimate trajectory
    self.clf = 0

  def fit_density_model(self):
    ag = self.ag_buffer.data[0:self.size].copy()
    X_train = ag.reshape(-1, self.traj_len * ag.shape[-1]) # [num_episodes, episode_len * goal_dim]

    self.clf = mixture.BayesianGaussianMixture(weight_concentration_prior_type="dirichlet_distribution", n_components=3)
    self.clf.fit(X_train)
    pred = -self.clf.score_samples(X_train)

    self.pred_min = pred.min()
    pred = pred - self.pred_min
    pred = np.clip(pred, 0, None)
    self.pred_sum = pred.sum()
    pred = pred / self.pred_sum
    self.pred_avg = (1 / pred.shape[0])
    pred = np.repeat(pred, self.traj_len, axis=0)

    self.buffer.buffer_entropy.data[:self.size] = pred.reshape(-1,1).copy()

  def _process_experience(self, exp):
    # Compute the entropy 
    # TODO: Include previous achieved goal too? or use that instead of ag?
    achieved = exp.next_state['achieved_goal']
    for i in range(self.n_envs):
      self._subbuffers[i].append([achieved[i]])
    
    for i in range(self.n_envs):
      if exp.trajectory_over[i]:
        # TODO: Compute the entropy of the trajectory
        traj_len = len(self._subbuffers[i])
        if self.traj_len is None:
          self.traj_len = traj_len
        else:
          # Current implementation assumes the same length for all trajectories
          assert(traj_len == self.traj_len)

        if not isinstance(self.clf, int):
          ag = [np.stack(a) for a in zip(*self._subbuffers[i])][0] # [episode_len, goal_dim]
          X = ag.reshape(-1, ag.shape[0]*ag.shape[1])
          pred = -self.clf.score_samples(X)

          pred = pred - self.pred_min
          pred = np.clip(pred, 0, None)
          pred = pred / self.pred_sum # Shape (1,)

          entropy = np.ones((traj_len,1)) * pred
        else:
          # Not enough data to train mixture density yet, set entropy to be zero
          entropy = np.zeros((traj_len, 1))
        
        priority = np.zeros((traj_len,1))
        trajectory = [entropy, priority]
        
        # TODO: Update the trajectory with entropy
        self.add_trajectory(*trajectory)

        self._subbuffers[i] = []

        # TODO: Update the rank here before adding it to the trajectory?
        self.update_priority()

  def add_trajectory(self, *items):
    """
    Append a trajectory of transitions to the buffer.

    :param items: a list of batched transition values to append to the replay buffer,
        in the item order that we initialized the ReplayBuffer with.
    """
    for buffer, batched_values in zip(self.buffer.values(), items):
      buffer.append_batch(batched_values)

  def update_priority(self):
    """
    After adding a trajectory to the replay buffer, update the ranking of transitions
    """
    # Note: 'dense' assigns the next highest element with the rank immediately 
    # after those assigned to the tied elements.
    entropy_transition_total = self.buffer.buffer_entropy.data[:self.size]
    entropy_rank = rankdata(entropy_transition_total, method=self.rank_method)
    entropy_rank = (entropy_rank - 1).reshape(-1, 1)
    self.buffer.buffer_priority.data[:self.size] = entropy_rank

  def __call__(self, batch_size):
    """
    Samples batch_size number of indices from main replay_buffer.

    Args:
      batch_size (int): size of the batch to sample
    
    Returns:
      batch_idxs: a 1-D numpy array of length batch_size containing indices
                  sampled in prioritized manner
    """
    if self.rank_method == 'none':
      entropy_trajectory = self.buffer.buffer_entropy.data[:self.size]
    else:
      entropy_trajectory = self.buffer.buffer_priority.data[:self.size]
    
    # Factorize out sampling into sampling trajectory according to priority/entropy
    # then sample time uniformly independently
    entropy_trajectory = entropy_trajectory.reshape(-1, self.traj_len)[:,0]
    p_trajectory = np.power(entropy_trajectory, 1/(self.temperature+1e-2))
    p_trajectory = p_trajectory / p_trajectory.sum()
    
    num_trajectories = p_trajectory.shape[0]
    batch_tidx = np.random.choice(num_trajectories, size=batch_size, p=p_trajectory)
    batch_idxs = self.traj_len * batch_tidx + np.random.choice(self.traj_len, size=batch_size)

    return batch_idxs

  @property
  def size(self):
    return len(self.ag_buffer)

  def save(self, save_folder):
    if self.config.save_replay_buf:
      state = self.buffer._get_state()
      with open(os.path.join(save_folder, "{}.pickle".format(self.module_name)), 'wb') as f:
        pickle.dump(state, f)

  def load(self, save_folder):
    load_path = os.path.join(save_folder, "{}.pickle".format(self.module_name))
    if os.path.exists(load_path):
      with open(load_path, 'rb') as f:
        state = pickle.load(f)
      self.buffer._set_state(state)
    else:
      self.logger.log_color('###############################################################', '', color='red')
      self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='cyan')
      self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='red')
      self.logger.log_color('WARNING', 'Replay buffer is not being loaded / was not saved.', color='yellow')
      self.logger.log_color('###############################################################', '', color='red')
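
A toy NumPy/SciPy illustration (not mrl code) of the rank-then-temperature transform that __call__ applies above: trajectory entropies are converted to dense ranks, raised to 1/temperature, and normalized into sampling probabilities; a lower temperature concentrates sampling on the highest-entropy trajectories:

import numpy as np
from scipy.stats import rankdata

entropies = np.array([0.1, 0.5, 0.5, 2.0])
ranks = rankdata(entropies, method='dense') - 1   # dense ranks starting at 0: [0, 1, 1, 2]

for temperature in (1.0, 0.1):
  p = np.power(ranks, 1 / (temperature + 1e-2))   # same transform as in __call__
  p = p / p.sum()
  print(temperature, np.round(p, 3))              # sharper distribution at low temperature
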
Code Example #15
    def _process_experience(self, experience):
        """Curiosity module updates the desired goal depending on experience.trajectory_over"""
        ag_buffer = self.replay_buffer.buffer.BUFF.buffer_ag

        if self.current_goals is None:
            self.current_goals = experience.reset_state['desired_goal']

        computed_reward = self.env.compute_reward(
            experience.next_state['achieved_goal'], self.current_goals, None)
        close = np.isclose(computed_reward, 0.)

        # First, manage the episode resets & any special behavior that occurs on goal achievement, like go explore / resets / overshooting
        reset_idxs, overshooting_idxs, overshooting_proposals = self._manage_resets_and_success_behaviors(
            experience, close)

        if reset_idxs:
            self.train.reset_next(reset_idxs)

        if overshooting_idxs and len(ag_buffer):
            self._overshoot_goals(experience, overshooting_idxs,
                                  overshooting_proposals)

        # Now consider replacing the current goals with something else:
        if np.any(experience.trajectory_over) and len(ag_buffer):
            # sample some achieved goals
            sample_idxs = np.random.randint(len(ag_buffer),
                                            size=self.num_sampled_ags *
                                            self.n_envs)
            sampled_ags = ag_buffer.get_batch(sample_idxs)
            sampled_ags = sampled_ags.reshape(self.n_envs,
                                              self.num_sampled_ags, -1)

            # compute the q-values of both the sampled achieved goals and the current goals
            states = np.tile(experience.reset_state['observation'][:, None, :],
                             (1, self.num_sampled_ags, 1))
            states = np.concatenate(
                (states, sampled_ags),
                -1).reshape(self.num_sampled_ags * self.n_envs, -1)
            states_curr = np.concatenate(
                (experience.reset_state['observation'], self.current_goals),
                -1)
            states_cat = np.concatenate((states, states_curr), 0)

            bad_q_idxs, q_values = [], None
            if self.use_qcutoff:
                q_values = self.compute_q(states_cat)
                q_values, curr_q = np.split(
                    q_values, [self.num_sampled_ags * self.n_envs])
                q_values = q_values.reshape(self.n_envs, self.num_sampled_ags)

                # Set cutoff dynamically by using intrinsic_success_percent
                if len(self.successes_deque) == 10:
                    self.min_cutoff = max(
                        self.min_min_cutoff,
                        min(np.min(q_values), self.min_cutoff))
                    intrinsic_success_percent = np.mean(self.successes_deque)
                    if intrinsic_success_percent >= self.config.cutoff_success_threshold[
                            1]:
                        self.cutoff = max(self.min_cutoff, self.cutoff - 1.)
                        self.successes_deque.clear()
                    elif intrinsic_success_percent <= self.config.cutoff_success_threshold[
                            0]:
                        self.cutoff = max(
                            min(self.config.initial_cutoff, self.cutoff + 1.),
                            self.min_min_cutoff)
                        self.successes_deque.clear()

                # zero out the "bad" values. This practically eliminates them as candidates if any goals are viable.
                bad_q_idxs = q_values < self.cutoff
                q_values[bad_q_idxs] *= -1
                min_q_values = np.min(q_values, axis=1,
                                      keepdims=True)  # num_envs x1
                q_values[bad_q_idxs] *= -1

            # score the goals -- lower is better
            goal_values = self.score_goals(
                sampled_ags, AttrDict(q_values=q_values, states=states))

            if self.config.dg_score_multiplier > 1. and self.dg_kde.ready:
                dg_scores = self.dg_kde.evaluate_log_density(
                    sampled_ags.reshape(self.n_envs * self.num_sampled_ags,
                                        -1))
                dg_scores = dg_scores.reshape(self.n_envs,
                                              self.num_sampled_ags)
                goal_values[
                    dg_scores > -np.inf] *= self.config.dg_score_multiplier

            if q_values is not None:
                goal_values[bad_q_idxs] = q_values[bad_q_idxs] * -1e-8

            if self.randomize:  # sample proportional to the absolute score
                abs_goal_values = np.abs(goal_values)
                normalized_values = abs_goal_values / np.sum(
                    abs_goal_values, axis=1, keepdims=True)
                chosen_idx = (normalized_values.cumsum(1) > np.random.rand(
                    normalized_values.shape[0])[:, None]).argmax(1)
            else:  # take minimum
                chosen_idx = np.argmin(goal_values, axis=1)

            chosen_idx = np.eye(self.num_sampled_ags)[
                chosen_idx]  # shape(sampled_ags) = n_envs x num_sampled_ags
            if q_values is not None:
                chosen_q_val = (chosen_idx * q_values).sum(axis=1,
                                                           keepdims=True)
            chosen_ags = np.sum(sampled_ags * chosen_idx[:, :, None],
                                axis=1)  # n_envs x goal_feats

            # replace goal always when first_visit_succ (relying on the dg_score_multiplier to focus on dgs); otherwise
            # we are going to transition into the dgs using the ag_kde_tophat
            if hasattr(self, 'curiosity_alpha'):
                if self.use_qcutoff:
                    replace_goal = np.logical_or(
                        (np.random.random(
                            (self.n_envs, 1)) > self.curiosity_alpha.alpha),
                        curr_q < self.cutoff).astype(np.float32)
                else:
                    replace_goal = (np.random.random(
                        (self.n_envs, 1)) > self.curiosity_alpha.alpha).astype(
                            np.float32)

            else:
                replace_goal = np.ones((self.n_envs, 1), dtype=np.float32)

            # sometimes keep the desired goal anyways
            replace_goal *= (np.random.uniform(size=[self.n_envs, 1]) >
                             self.keep_dg_percent).astype(np.float32)

            new_goals = replace_goal * chosen_ags + (
                1 - replace_goal) * self.current_goals

            if hasattr(self, 'logger') and len(self.successes) > 50:
                if q_values is not None:
                    self.logger.add_histogram(
                        'Explore/Goal_q', replace_goal * chosen_q_val +
                        (1 - replace_goal) * curr_q)
                self.logger.add_scalar('Explore/Intrinsic_success_percent',
                                       np.mean(self.successes))
                self.logger.add_scalar('Explore/Cutoff', self.cutoff)
                self.successes = []

            replace_goal = replace_goal.reshape(-1)

            for i in range(self.n_envs):
                if experience.trajectory_over[i]:
                    self.successes.append(float(self.is_success[i, 0] >= 1.)
                                          )  # compromise due to exploration
                    self.successes_deque.append(
                        float(self.is_success[i, 0] >= 1.)
                    )  # compromise due to exploration
                    self.current_goals[i] = new_goals[i]
                    if replace_goal[i]:
                        self.replaced_goal[i] = 1.
                    self.go_explore[i] = 0.
                    self.is_success[i] = 0.