def step(self, action_n):
    """Step every worker-managed environment with the given batch of actions.

    Fans ``action_n`` out to the worker pool, collects the per-worker
    results, reorders them by environment id, enforces the episode time
    limit, and resets any environment whose episode finished.

    Args:
        action_n: batch of actions, one per environment, in flattened
            environment-id order.

    Returns:
        Tuple ``(obs, rewards, dones, env_infos)`` — ``obs`` is a list of
        (possibly freshly reset) observations, ``rewards`` and ``dones``
        are 1-D numpy arrays, ``env_infos`` is a stacked tensor dict.
    """
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    # Workers that own no environments return None; drop those entries.
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = zip(*results)
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        # BUG FIX: original used Python-2-only `xrange`, which raises
        # NameError on Python 3 (the sibling variant of this method
        # already uses `range`).
        env_infos = [dict() for _ in range(self.num_envs)]
    # Workers return environments in arbitrary order; sort by id so that
    # position i always corresponds to environment i.
    items = zip(ids, obs, rewards, dones, env_infos)
    items = sorted(items, key=lambda x: x[0])
    ids, obs, rewards, dones, env_infos = zip(*items)
    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    self.ts += 1
    # Force-terminate episodes that hit the time limit.
    dones[self.ts >= self.max_path_length] = True
    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))
def eval_expert_probs(self, expert_paths, policy, insert=False):
    """Annotate each expert path with `agent_infos` from `policy` and
    return the paths' probabilities under the current model.
    """
    # An ndarray "policy" means probabilities are already computed;
    # nothing to annotate.
    if isinstance(policy, np.ndarray):
        return self._compute_path_probs(expert_paths, insert=insert)
    if hasattr(policy, 'recurrent') and policy.recurrent:
        # Clear hidden state for a fresh rollout over every path.
        policy.reset([True] * len(expert_paths))
        observations = self.extract_paths(
            expert_paths, keys=('observations', ), stack=True)[0]
        # Query the policy one timestep at a time across the whole batch.
        per_step_infos = [
            policy.get_actions(observations[:, t])[1]
            for t in range(observations.shape[1])
        ]
        stacked = tensor_utils.stack_tensor_dict_list(per_step_infos)
        # Swap time/batch axes: (T, N, dim) -> (N, T, dim).
        for key in stacked:
            stacked[key] = np.transpose(stacked[key], axes=[1, 0, 2])
        per_path_infos = tensor_utils.split_tensor_dict_list(stacked)
        for path, infos in zip(expert_paths, per_path_infos):
            path['agent_infos'] = infos
    else:
        for path in expert_paths:
            _, infos = policy.get_actions(path['observations'])
            path['agent_infos'] = infos
    return self._compute_path_probs(expert_paths, insert=insert)
def step(self, action_n):
    """Advance all vectorized environments by one step.

    Dispatches ``action_n`` to every worker, merges the per-worker
    results into id-sorted batches, applies the max-path-length limit,
    and swaps in reset observations for finished episodes.
    """
    worker_results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    # Discard entries from workers that manage no environments.
    worker_results = [r for r in worker_results if r is not None]
    ids, obs, rewards, dones, env_infos = zip(*worker_results)
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]
    # Reorder everything by environment id so index i is environment i
    # (stable selection sort order, same as sorting tuples on ids).
    ordering = sorted(range(len(ids)), key=lambda j: ids[j])
    obs = [obs[j] for j in ordering]
    rewards = np.asarray([rewards[j] for j in ordering])
    dones = np.asarray([dones[j] for j in ordering])
    env_infos = [env_infos[j] for j in ordering]
    self.ts += 1
    # Episodes at the time limit are forcibly marked done.
    dones[self.ts >= self.max_path_length] = True
    reset_obs = self._run_reset(dones)
    for idx, finished in enumerate(dones):
        if finished:
            obs[idx] = reset_obs[idx]
            self.ts[idx] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))
def eval_expert_probs(self, expert_paths, policy, insert=False, context=None):
    """Evaluate expert policy probability under the current policy.

    Strips any stale ``agent_infos`` / ``a_logprobs`` entries, queries
    ``policy`` on the (optionally context-augmented) expert observations,
    and returns the resulting path probabilities.

    Args:
        expert_paths: list of path dicts with an 'observations' entry.
        policy: an ndarray of precomputed probabilities, or a policy
            object exposing ``get_actions`` (and optionally ``recurrent``).
        insert: forwarded to ``_compute_path_probs``.
        context: optional per-path context; ``context[i]`` is appended to
            every observation of path i (assumes shape (N, C) — TODO
            confirm against callers).

    Returns:
        The result of ``ImitationLearning._compute_path_probs``.
    """
    # Drop stale annotations so the recomputation below is authoritative.
    for traj in expert_paths:
        if 'agent_infos' in traj:
            del traj['agent_infos']
        if 'a_logprobs' in traj:
            del traj['a_logprobs']
    if isinstance(policy, np.ndarray):
        return ImitationLearning._compute_path_probs(expert_paths, insert=insert)
    elif hasattr(policy, 'recurrent') and policy.recurrent:
        policy.reset([True] * len(expert_paths))
        expert_obs = ImitationLearning.extract_paths(
            expert_paths, keys=('observations', ), stack=True)[0]
        if context is not None:
            # BUG FIX: the original referenced `context[i]` before `i`
            # was bound (NameError whenever context was supplied) and
            # tiled over the batch axis (shape[0]) instead of time.
            # Broadcast each path's context over its time axis to match
            # the non-recurrent branch: (N, C) -> (N, 1, C) -> (N, T, C).
            expert_obs = np.concatenate(
                (expert_obs,
                 np.tile(np.expand_dims(context, axis=1),
                         [1, expert_obs.shape[1], 1])),
                axis=-1)
        agent_infos = []
        for t in range(expert_obs.shape[1]):
            a, infos = policy.get_actions(expert_obs[:, t])
            agent_infos.append(infos)
        agent_infos_stack = tensor_utils.stack_tensor_dict_list(agent_infos)
        # (T, N, dim) -> (N, T, dim) so infos can be split per path.
        for key in agent_infos_stack:
            agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                  axes=[1, 0, 2])
        agent_infos_transpose = tensor_utils.split_tensor_dict_list(
            agent_infos_stack)
        for i, path in enumerate(expert_paths):
            path['agent_infos'] = agent_infos_transpose[i]
    else:
        for i, path in enumerate(expert_paths):
            expert_obs = path['observations']
            if context is not None:
                # Append this path's context to every timestep:
                # (C,) -> (1, C) -> (T, C).
                expert_obs = np.concatenate(
                    (expert_obs,
                     np.tile(np.expand_dims(context[i], axis=0),
                             [expert_obs.shape[0], 1])),
                    axis=-1)
            actions, agent_infos = policy.get_actions(expert_obs)
            path['agent_infos'] = agent_infos
    return ImitationLearning._compute_path_probs(expert_paths, insert=insert)
def eval_expert_probs(self, expert_paths, policy, insert=False):
    """Attach `agent_infos` produced by `policy` to each expert path and
    return the paths' probabilities under the current model.
    """
    if policy.recurrent:
        # Reset hidden state for a fresh rollout over every path.
        policy.reset([True] * len(expert_paths))
        observations = self.extract_paths(expert_paths,
                                          keys=('observations',))[0]
        # One batched policy query per timestep; keep only the infos.
        step_infos = [
            policy.get_actions(observations[:, t])[1]
            for t in range(observations.shape[1])
        ]
        stacked = tensor_utils.stack_tensor_dict_list(step_infos)
        # Swap time/batch axes: (T, N, dim) -> (N, T, dim).
        for key in stacked:
            stacked[key] = np.transpose(stacked[key], axes=[1, 0, 2])
        split_infos = tensor_utils.split_tensor_dict_list(stacked)
        for idx, path in enumerate(expert_paths):
            path['agent_infos'] = split_infos[idx]
    else:
        for path in expert_paths:
            _, infos = policy.get_actions(path['observations'])
            path['agent_infos'] = infos
    return self._compute_path_probs(expert_paths, insert=insert)
def eval_expert_probs(self, expert_paths, policy, insert=False, context=None):
    """Evaluate expert-path probabilities under the current policy,
    optionally augmenting observations with `context`.

    NOTE(review): `context` is concatenated whole onto the observations
    (not indexed per path as in the sibling variant) — this assumes its
    leading dimensions already match the observation array; confirm
    against callers.
    """
    if policy.recurrent:
        policy.reset([True] * len(expert_paths))
        observations = ImitationLearning.extract_paths(
            expert_paths, keys=('observations', ))[0]
        if context is not None:
            observations = np.concatenate((observations, context), axis=-1)
        step_infos = []
        for t in range(observations.shape[1]):
            _, infos = policy.get_actions(observations[:, t])
            step_infos.append(infos)
        stacked = tensor_utils.stack_tensor_dict_list(step_infos)
        # (T, N, dim) -> (N, T, dim) before splitting per path.
        for key in stacked:
            stacked[key] = np.transpose(stacked[key], axes=[1, 0, 2])
        for path, infos in zip(
                expert_paths, tensor_utils.split_tensor_dict_list(stacked)):
            path['agent_infos'] = infos
    else:
        for path in expert_paths:
            obs = path['observations']
            if context is not None:
                obs = np.concatenate((obs, context), axis=-1)
            _, infos = policy.get_actions(obs)
            path['agent_infos'] = infos
    return ImitationLearning._compute_path_probs(expert_paths, insert=insert)