def flatten(self):
    rtn = dict()
    rtn['observations'] = stack_tensor_list(self['observations'])
    rtn['actions'] = stack_tensor_list(self['actions'])
    rtn['rewards'] = stack_tensor_list(self['rewards'])
    # For each key/value in agent_infos and env_infos, stack the values and
    # hoist them to the top level of the returned dict.
    for k, v in stack_tensor_dict_list(self['agent_infos']).items():
        rtn[k] = v
    for k, v in stack_tensor_dict_list(self['env_infos']).items():
        rtn[k] = v
    return rtn
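# A minimal sketch of the stacking helpers used above, assuming rllab-style
# semantics: stack_tensor_list stacks a list of arrays along a new leading
# axis, and stack_tensor_dict_list does the same recursively for each key of a
# list of (possibly nested) dicts. The underscore-suffixed names are
# illustrative, not the library's own.
import numpy as np

def _stack_tensor_list_sketch(tensor_list):
    # [arr(...)] * N -> arr(N, ...)
    return np.asarray(tensor_list)

def _stack_tensor_dict_list_sketch(tensor_dict_list):
    # [{'k': arr}, ...] * N -> {'k': arr(N, ...)}, recursing into nested dicts.
    ret = dict()
    for k, example in tensor_dict_list[0].items():
        values = [d[k] for d in tensor_dict_list]
        if isinstance(example, dict):
            ret[k] = _stack_tensor_dict_list_sketch(values)
        else:
            ret[k] = _stack_tensor_list_sketch(values)
    return ret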
def step(self, action_n):
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = zip(*results)
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]
    # Workers return results in arbitrary order; sort everything back by env id.
    items = zip(ids, obs, rewards, dones, env_infos)
    items = sorted(items, key=lambda x: x[0])
    ids, obs, rewards, dones, env_infos = zip(*items)
    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    self.ts += 1
    dones[self.ts >= self.max_path_length] = True
    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))
def eval_expert_probs(self, expert_paths, policy, insert=False):
    """Evaluate expert action probabilities under the current policy."""
    if isinstance(policy, np.ndarray):
        return self._compute_path_probs(expert_paths, insert=insert)
    elif hasattr(policy, 'recurrent') and policy.recurrent:
        policy.reset([True] * len(expert_paths))
        expert_obs = self.extract_paths(
            expert_paths, keys=('observations',), stack=True)[0]
        agent_infos = []
        # Roll the recurrent policy forward one timestep at a time over the
        # stacked (n_paths, T, obs_dim) observation tensor.
        for t in range(expert_obs.shape[1]):
            a, infos = policy.get_actions(expert_obs[:, t])
            agent_infos.append(infos)
        agent_infos_stack = tensor_utils.stack_tensor_dict_list(agent_infos)
        # Swap the time and batch axes: (T, n_paths, d) -> (n_paths, T, d).
        for key in agent_infos_stack:
            agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                  axes=[1, 0, 2])
        agent_infos_transpose = tensor_utils.split_tensor_dict_list(
            agent_infos_stack)
        for i, path in enumerate(expert_paths):
            path['agent_infos'] = agent_infos_transpose[i]
    else:
        for path in expert_paths:
            actions, agent_infos = policy.get_actions(path['observations'])
            path['agent_infos'] = agent_infos
    return self._compute_path_probs(expert_paths, insert=insert)
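# A small numpy illustration of the stack -> transpose -> split round-trip in
# the recurrent branch above (shapes only; the 'mean' key is hypothetical).
import numpy as np

T, n_paths, d = 5, 3, 2
# One info dict per timestep, each holding a (n_paths, d) array.
per_step_infos = [{'mean': np.zeros((n_paths, d))} for _ in range(T)]
stacked = np.stack([info['mean'] for info in per_step_infos])
assert stacked.shape == (T, n_paths, d)
per_path = np.transpose(stacked, axes=[1, 0, 2])
assert per_path.shape == (n_paths, T, d)  # one (T, d) slice per expert path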
def step(self, action_n):
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = list(zip(*results))
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]
    items = list(zip(ids, obs, rewards, dones, env_infos))
    items = sorted(items, key=lambda x: x[0])
    ids, obs, rewards, dones, env_infos = list(zip(*items))
    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    self.ts += 1
    dones[self.ts >= self.max_path_length] = True
    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))
def step(self, action_n):
    # Use the learned dynamics model to take (predicted) steps.
    prev_obs = self.current_obs
    next_obs = self.model.predict(prev_obs, action_n)
    rewards = self.unwrapped_env.reward(prev_obs, action_n, next_obs)
    if self.has_done_fn:
        dones = self.unwrapped_env.done(next_obs)
    else:
        dones = np.asarray([False for _ in range(self.n_parallel)])
    env_infos = [{} for _ in range(action_n.shape[0])]
    self.ts += 1
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            next_obs[i] = self.env.reset()
            self.ts[i] = 0
    self.current_obs = next_obs
    # Transform the (n_parallel, obs_dim) array into a list of 1-D observations.
    next_obs = [
        np.squeeze(o) for o in np.vsplit(next_obs, next_obs.shape[0])
    ]
    return (next_obs, list(rewards), list(dones),
            tensor_utils.stack_tensor_dict_list(env_infos))
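# Note on the vsplit/squeeze idiom above: for a 2-D (n, d) array it is
# equivalent to iterating over the rows, as this small check shows.
import numpy as np

batch = np.arange(6.0).reshape(3, 2)  # (n_parallel, obs_dim)
as_rows = [np.squeeze(o) for o in np.vsplit(batch, batch.shape[0])]
assert all(np.array_equal(a, b) for a, b in zip(as_rows, list(batch)))
assert as_rows[0].shape == (2,)       # each entry is a flat observation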
def eval_expert_probs(self, expert_paths, policy, insert=False, context=None):
    """Evaluate expert action probabilities under the current policy."""
    # Drop any stale annotations before recomputing them.
    for traj in expert_paths:
        if 'agent_infos' in traj:
            del traj['agent_infos']
        if 'a_logprobs' in traj:
            del traj['a_logprobs']
    if isinstance(policy, np.ndarray):
        return ImitationLearning._compute_path_probs(expert_paths,
                                                     insert=insert)
    elif hasattr(policy, 'recurrent') and policy.recurrent:
        policy.reset([True] * len(expert_paths))
        expert_obs = ImitationLearning.extract_paths(
            expert_paths, keys=('observations',), stack=True)[0]
        if context is not None:
            # Tile the per-path context across the time axis so it can be
            # concatenated onto the (n_paths, T, obs_dim) observation tensor.
            expert_obs = np.concatenate(
                (expert_obs,
                 np.tile(np.expand_dims(context, axis=1),
                         [1, expert_obs.shape[1], 1])),
                axis=-1)
        agent_infos = []
        for t in range(expert_obs.shape[1]):
            a, infos = policy.get_actions(expert_obs[:, t])
            agent_infos.append(infos)
        agent_infos_stack = tensor_utils.stack_tensor_dict_list(agent_infos)
        # Swap the time and batch axes: (T, n_paths, d) -> (n_paths, T, d).
        for key in agent_infos_stack:
            agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                  axes=[1, 0, 2])
        agent_infos_transpose = tensor_utils.split_tensor_dict_list(
            agent_infos_stack)
        for i, path in enumerate(expert_paths):
            path['agent_infos'] = agent_infos_transpose[i]
    else:
        for i, path in enumerate(expert_paths):
            expert_obs = path['observations']
            if context is not None:
                expert_obs = np.concatenate(
                    (expert_obs,
                     np.tile(np.expand_dims(context[i], axis=0),
                             [expert_obs.shape[0], 1])),
                    axis=-1)
            actions, agent_infos = policy.get_actions(expert_obs)
            path['agent_infos'] = agent_infos
    return ImitationLearning._compute_path_probs(expert_paths, insert=insert)
def step(self, action_n):
    all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
    obs, rewards, dones, env_infos = map(list, zip(*all_results))
    dones = np.asarray(dones)
    rewards = np.asarray(rewards)
    self.ts += 1
    dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = self.envs[i].reset()
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
def step(self, action_n):
    all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
    obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results))))
    dones = np.asarray(dones)
    rewards = np.asarray(rewards)
    self.ts += 1
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = self.envs[i].reset()
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
def step(self, action_n, reset_args=None):
    if reset_args is None:
        reset_args = [None] * len(self.envs)
    all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
    obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results))))
    dones = np.asarray(dones)
    rewards = np.asarray(rewards)
    self.ts += 1
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = self.envs[i].reset(reset_args=reset_args[i])
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
def worker_run_step(G, action_n, scope):
    assert hasattr(G, 'parallel_vec_envs')
    assert scope in G.parallel_vec_envs
    env_template = G.parallel_vec_env_template[scope]
    ids = []
    step_results = []
    for (idx, env) in G.parallel_vec_envs[scope]:
        action = action_n[idx]
        ids.append(idx)
        step_results.append(tuple(env.step(action)))
    if len(step_results) == 0:
        return None
    obs, rewards, dones, env_infos = map(list, zip(*step_results))
    obs = env_template.observation_space.flatten_n(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    env_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return ids, obs, rewards, dones, env_infos
def worker_run_step(G, action_n, scope):
    assert hasattr(G, 'parallel_vec_envs')
    assert scope in G.parallel_vec_envs
    env_template = G.parallel_vec_env_template[scope]
    ids = []
    step_results = []
    for (idx, env) in G.parallel_vec_envs[scope]:
        action = action_n[idx]
        ids.append(idx)
        step_results.append(tuple(env.step(action)))
    if len(step_results) == 0:
        return None
    obs, rewards, dones, env_infos = list(map(list, list(zip(*step_results))))
    obs = env_template.observation_space.flatten_n(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    env_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return ids, obs, rewards, dones, env_infos
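# A minimal sketch of the Space.flatten_n contract assumed above (rllab-style):
# a list of N observations becomes a single (N, flat_dim) array, which the
# caller can then np.concatenate across workers. The underscore-suffixed name
# is illustrative, not the library's own.
import numpy as np

def _flatten_n_sketch(observations):
    return np.asarray([np.asarray(o).reshape(-1) for o in observations])

assert _flatten_n_sketch([np.zeros((2, 2)), np.ones((2, 2))]).shape == (2, 4)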
def step(self, action_n, traj_starting_obs=None, traj_starting_ts=None):
    """
    :param action_n: batches of actions for all models/tasks stacked on top of
        each other, shape (n_models * batch_per_model, ndim_act)
    :return: predicted observations, shape (n_models * batch_per_model, ndim_obs)
    """
    assert action_n.shape[0] == self.n_parallel
    # Use the learned dynamics model to take (predicted) steps.
    prev_obs = self.current_obs
    next_obs = self.model.predict_model_batches(prev_obs, action_n)
    if self.clip_obs:
        next_obs = np.clip(next_obs, -1000, 1000)
    rewards = self.unwrapped_env.reward(prev_obs, action_n, next_obs)
    if self.has_done_fn:
        dones = self.unwrapped_env.done(next_obs)
    else:
        dones = np.asarray([False for _ in range(self.n_parallel)])
    env_infos = [{} for _ in range(action_n.shape[0])]
    self.ts += 1
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            # With probability 0.1 (or when no starting states are provided),
            # restart from a fresh environment reset; otherwise resample a
            # state from the provided trajectory starting observations.
            if traj_starting_obs is None or np.random.random() < 0.1:
                next_obs[i] = self.env.reset()
                self.ts[i] = 0
            else:
                min_idx = max(-10000, -traj_starting_obs.shape[0])
                idx = np.random.randint(min_idx, 0)
                next_obs[i] = traj_starting_obs[idx, :]
                self.ts[i] = traj_starting_ts[idx]
    self.current_obs = next_obs
    # Transform the (n_parallel, obs_dim) array into a list of 1-D observations.
    next_obs = [
        np.squeeze(o) for o in np.vsplit(next_obs, next_obs.shape[0])
    ]
    return (next_obs, list(rewards), list(dones),
            tensor_utils.stack_tensor_dict_list(env_infos))
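# Illustration of the negative-index resampling used above: indices drawn from
# [max(-10000, -N), 0) address the most recent (up to 10000) rows of
# traj_starting_obs. The array here is synthetic.
import numpy as np

traj_starting_obs = np.random.randn(500, 4)         # N = 500 stored states
min_idx = max(-10000, -traj_starting_obs.shape[0])  # -> -500
idx = np.random.randint(min_idx, 0)                 # uniform over all 500 rows
restart_state = traj_starting_obs[idx, :]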
def step(self, action_n, itr):
    all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
    obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results))))
    # if env_action_space == 5:
    #     ### function to modify the goal position
    #     rewards, dones = self.change_goal_state(itr, obs, rewards, dones)
    dones = np.asarray(dones)
    rewards = np.asarray(rewards)
    self.ts += 1
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = self.envs[i].reset()
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
def eval_expert_probs(self, expert_paths, policy, insert=False):
    """Evaluate expert action probabilities under the current policy."""
    if policy.recurrent:
        policy.reset([True] * len(expert_paths))
        expert_obs = self.extract_paths(expert_paths,
                                        keys=('observations',))[0]
        agent_infos = []
        for t in range(expert_obs.shape[1]):
            a, infos = policy.get_actions(expert_obs[:, t])
            agent_infos.append(infos)
        agent_infos_stack = tensor_utils.stack_tensor_dict_list(agent_infos)
        # Swap the time and batch axes: (T, n_paths, d) -> (n_paths, T, d).
        for key in agent_infos_stack:
            agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                  axes=[1, 0, 2])
        agent_infos_transpose = tensor_utils.split_tensor_dict_list(
            agent_infos_stack)
        for i, path in enumerate(expert_paths):
            path['agent_infos'] = agent_infos_transpose[i]
    else:
        for path in expert_paths:
            actions, agent_infos = policy.get_actions(path['observations'])
            path['agent_infos'] = agent_infos
    return self._compute_path_probs(expert_paths, insert=insert)
def eval_expert_probs(self, expert_paths, policy, insert=False, context=None):
    """Evaluate expert action probabilities under the current policy."""
    if policy.recurrent:
        policy.reset([True] * len(expert_paths))
        expert_obs = ImitationLearning.extract_paths(
            expert_paths, keys=('observations',))[0]
        if context is not None:
            expert_obs = np.concatenate((expert_obs, context), axis=-1)
        agent_infos = []
        for t in range(expert_obs.shape[1]):
            a, infos = policy.get_actions(expert_obs[:, t])
            agent_infos.append(infos)
        agent_infos_stack = tensor_utils.stack_tensor_dict_list(agent_infos)
        # Swap the time and batch axes: (T, n_paths, d) -> (n_paths, T, d).
        for key in agent_infos_stack:
            agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                  axes=[1, 0, 2])
        agent_infos_transpose = tensor_utils.split_tensor_dict_list(
            agent_infos_stack)
        for i, path in enumerate(expert_paths):
            path['agent_infos'] = agent_infos_transpose[i]
    else:
        for path in expert_paths:
            expert_obs = path['observations']
            if context is not None:
                expert_obs = np.concatenate((expert_obs, context), axis=-1)
            actions, agent_infos = policy.get_actions(expert_obs)
            path['agent_infos'] = agent_infos
    return ImitationLearning._compute_path_probs(expert_paths, insert=insert)
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    if len(paths) > 0 and "vf" in paths[0]["agent_infos"]:
        all_path_baselines = [
            p["agent_infos"]["vf"].flatten() for p in paths
        ]
    else:
        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
        deltas = (path["rewards"] +
                  self.algo.discount * path_baselines[1:] -
                  path_baselines[:-1])
        # GAE: advantages are the discounted cumulative sum of the residuals
        # with factor gamma * lambda.
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))
    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        # Make all paths the same length (pad extra advantages with 0).
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)
        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)
        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)
        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)
        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])
        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])
        # Mask marking the real (unpadded) timesteps of each path.
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )
    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular_misc_stat('TrajLen',
                                    [len(p["rewards"]) for p in paths],
                                    placement='front')
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular_misc_stat('Return', undiscounted_returns,
                                    placement='front')
    return samples_data
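# A minimal sketch of the discounting helper assumed above: rllab-style
# special.discount_cumsum computes y_t = sum_{k>=0} gamma^k * x_{t+k} by
# running an IIR filter over the reversed sequence. The underscore-suffixed
# name is illustrative, not the library's own.
import numpy as np
import scipy.signal

def _discount_cumsum_sketch(x, discount):
    # Equivalent to the recurrence y[t] = x[t] + discount * y[t + 1], y[T] = 0.
    return scipy.signal.lfilter([1], [1, -discount], x[::-1], axis=0)[::-1]

rewards = np.array([1.0, 1.0, 1.0])
assert np.allclose(_discount_cumsum_sketch(rewards, 0.5), [1.75, 1.5, 1.0])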