Example #1
    def step(self, action_n):
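        # Dispatch the full action batch to the worker pool; each worker steps
        # its allocated environments and returns their results.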
        results = singleton_pool.run_each(
            worker_run_step,
            [(action_n, self.scope) for _ in self._alloc_env_ids],
        )
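        # Drop None entries from workers that had nothing to step.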
        results = [x for x in results if x is not None]
        ids, obs, rewards, dones, env_infos = zip(*results)
        ids = np.concatenate(ids)
        obs = self.observation_space.unflatten_n(np.concatenate(obs))
        rewards = np.concatenate(rewards)
        dones = np.concatenate(dones)
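        # Merge the per-worker env_info dicts into one batch, then split the
        # batch back into a separate dict per environment.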
        env_infos = tensor_utils.split_tensor_dict_list(
            tensor_utils.concat_tensor_dict_list(env_infos))
        if env_infos is None:
            env_infos = [dict() for _ in range(self.num_envs)]

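        # Re-order the results by environment id so they line up with action_n.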
        items = zip(ids, obs, rewards, dones, env_infos)
        items = sorted(items, key=lambda x: x[0])

        ids, obs, rewards, dones, env_infos = zip(*items)

        obs = list(obs)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones)

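        # Advance the per-environment step counters and force-terminate any
        # environment that has reached max_path_length.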
        self.ts += 1
        dones[self.ts >= self.max_path_length] = True

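        # Reset the finished environments and splice their fresh initial
        # observations into the returned batch.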
        reset_obs = self._run_reset(dones)
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = reset_obs[i]
                self.ts[i] = 0
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
            list(env_infos))
Example #2
 def eval_expert_probs(self, expert_paths, policy, insert=False):
     """
     Evaluate the probability of the expert trajectories under the current policy.
     """
     if isinstance(policy, np.ndarray):
         return self._compute_path_probs(expert_paths, insert=insert)
     elif hasattr(policy, 'recurrent') and policy.recurrent:
         policy.reset([True] * len(expert_paths))
         expert_obs = self.extract_paths(expert_paths,
                                         keys=('observations', ),
                                         stack=True)[0]
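         # expert_obs is stacked as (num_paths, T, obs_dim); feed the recurrent
         # policy one timestep at a time across the whole batch of paths.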
         agent_infos = []
         for t in range(expert_obs.shape[1]):
             a, infos = policy.get_actions(expert_obs[:, t])
             agent_infos.append(infos)
         agent_infos_stack = tensor_utils.stack_tensor_dict_list(
             agent_infos)
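         # Stacking over t gives (T, num_paths, dim) arrays; transpose to
         # (num_paths, T, dim) so each path can be sliced out individually.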
         for key in agent_infos_stack:
             agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                   axes=[1, 0, 2])
         agent_infos_transpose = tensor_utils.split_tensor_dict_list(
             agent_infos_stack)
         for i, path in enumerate(expert_paths):
             path['agent_infos'] = agent_infos_transpose[i]
     else:
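         # For non-recurrent policies, all of a path's observations can be
         # passed to the policy in one batched call.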
         for path in expert_paths:
             actions, agent_infos = policy.get_actions(path['observations'])
             path['agent_infos'] = agent_infos
     return self._compute_path_probs(expert_paths, insert=insert)
Example #3
    def step(self, action_n):
        results = singleton_pool.run_each(
            worker_run_step,
            [(action_n, self.scope) for _ in self._alloc_env_ids],
        )
        results = [x for x in results if x is not None]
        ids, obs, rewards, dones, env_infos = list(zip(*results))
        ids = np.concatenate(ids)
        obs = self.observation_space.unflatten_n(np.concatenate(obs))
        rewards = np.concatenate(rewards)
        dones = np.concatenate(dones)
        env_infos = tensor_utils.split_tensor_dict_list(tensor_utils.concat_tensor_dict_list(env_infos))
        if env_infos is None:
            env_infos = [dict() for _ in range(self.num_envs)]

        items = list(zip(ids, obs, rewards, dones, env_infos))
        items = sorted(items, key=lambda x: x[0])

        ids, obs, rewards, dones, env_infos = list(zip(*items))

        obs = list(obs)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones)

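        # Enforce the episode horizon: environments past max_path_length are
        # marked done and reset below.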
        self.ts += 1
        dones[self.ts >= self.max_path_length] = True

        reset_obs = self._run_reset(dones)
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = reset_obs[i]
                self.ts[i] = 0
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(list(env_infos))
Example #4
    def eval_expert_probs(self,
                          expert_paths,
                          policy,
                          insert=False,
                          context=None):
        """
        Evaluate the probability of the expert trajectories under the current policy.
        """
        for traj in expert_paths:
            if 'agent_infos' in traj:
                del traj['agent_infos']
            if 'a_logprobs' in traj:
                del traj['a_logprobs']

        if isinstance(policy, np.ndarray):
            return ImitationLearning._compute_path_probs(expert_paths,
                                                         insert=insert)
        elif hasattr(policy, 'recurrent') and policy.recurrent:
            policy.reset([True] * len(expert_paths))
            expert_obs = ImitationLearning.extract_paths(
                expert_paths, keys=('observations', ), stack=True)[0]
            if context is not None:
                # Tile the per-path context along the time axis so it matches the
                # stacked (num_paths, T, obs_dim) observations before concatenation.
                expert_obs = np.concatenate(
                    (expert_obs,
                     np.tile(np.expand_dims(context, axis=1),
                             [1, expert_obs.shape[1], 1])),
                    axis=-1)
            agent_infos = []
            for t in range(expert_obs.shape[1]):
                a, infos = policy.get_actions(expert_obs[:, t])
                agent_infos.append(infos)
            agent_infos_stack = tensor_utils.stack_tensor_dict_list(
                agent_infos)
            for key in agent_infos_stack:
                agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                      axes=[1, 0, 2])
            agent_infos_transpose = tensor_utils.split_tensor_dict_list(
                agent_infos_stack)
            for i, path in enumerate(expert_paths):
                path['agent_infos'] = agent_infos_transpose[i]
        else:
            for i, path in enumerate(expert_paths):
                expert_obs = path['observations']
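                # Append this path's context vector to every timestep's observation.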
                if context is not None:
                    expert_obs = np.concatenate(
                        (expert_obs,
                         np.tile(np.expand_dims(context[i], axis=0),
                                 [expert_obs.shape[0], 1])),
                        axis=-1)
                actions, agent_infos = policy.get_actions(expert_obs)
                path['agent_infos'] = agent_infos
        return ImitationLearning._compute_path_probs(expert_paths,
                                                     insert=insert)
Example #5
 def eval_expert_probs(self, expert_paths, policy, insert=False):
     """
     Evaluate the probability of the expert trajectories under the current policy.
     """
     if policy.recurrent:
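         # Recurrent policies are reset and then driven step by step through the
         # batch of expert paths.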
         policy.reset([True] * len(expert_paths))
         expert_obs = self.extract_paths(expert_paths, keys=('observations',))[0]
         agent_infos = []
         for t in range(expert_obs.shape[1]):
             a, infos = policy.get_actions(expert_obs[:, t])
             agent_infos.append(infos)
         agent_infos_stack = tensor_utils.stack_tensor_dict_list(agent_infos)
         for key in agent_infos_stack:
             agent_infos_stack[key] = np.transpose(agent_infos_stack[key], axes=[1, 0, 2])
         agent_infos_transpose = tensor_utils.split_tensor_dict_list(agent_infos_stack)
         for i, path in enumerate(expert_paths):
             path['agent_infos'] = agent_infos_transpose[i]
     else:
         for path in expert_paths:
             actions, agent_infos = policy.get_actions(path['observations'])
             path['agent_infos'] = agent_infos
     return self._compute_path_probs(expert_paths, insert=insert)
Example #6
 def eval_expert_probs(self,
                       expert_paths,
                       policy,
                       insert=False,
                       context=None):
     """
     Evaluate the probability of the expert trajectories under the current policy.
     """
     if policy.recurrent:
         policy.reset([True] * len(expert_paths))
         expert_obs = ImitationLearning.extract_paths(
             expert_paths, keys=('observations', ))[0]
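         # context is expected to already be time-aligned with expert_obs, since
         # it is concatenated directly along the last axis.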
         if context is not None:
             expert_obs = np.concatenate((expert_obs, context), axis=-1)
         agent_infos = []
         for t in range(expert_obs.shape[1]):
             a, infos = policy.get_actions(expert_obs[:, t])
             agent_infos.append(infos)
         agent_infos_stack = tensor_utils.stack_tensor_dict_list(
             agent_infos)
         for key in agent_infos_stack:
             agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                   axes=[1, 0, 2])
         agent_infos_transpose = tensor_utils.split_tensor_dict_list(
             agent_infos_stack)
         for i, path in enumerate(expert_paths):
             path['agent_infos'] = agent_infos_transpose[i]
     else:
         for path in expert_paths:
             expert_obs = path['observations']
             if context is not None:
                 expert_obs = np.concatenate((expert_obs, context), axis=-1)
             actions, agent_infos = policy.get_actions(expert_obs)
             path['agent_infos'] = agent_infos
     return ImitationLearning._compute_path_probs(expert_paths,
                                                  insert=insert)