def get_path(self, params):
    """Sample a single path in the environment according to the policy
    parameters that are passed.

    Returns:
        A path, represented as a dictionary of observations, actions,
        rewards, env_infos, and agent_infos.
    """
    policy = self.policy
    env = self.env
    env_spec = self.env.spec

    obs = env.reset()
    policy.reset()
    policy.set_param_values(params, trainable=True)

    path = dict(observations=[],
                actions=[],
                rewards=[],
                env_infos=[],
                agent_infos=[])

    for t in range(self.max_path_length):
        action, agent_info = policy.get_action(obs)
        if self.deterministic:
            action = agent_info['mean']
        next_obs, reward, done, env_info = env.step(action)
        if agent_info is None:
            agent_info = dict()
        if env_info is None:
            env_info = dict()
        path['observations'].append(obs)
        path['actions'].append(action)
        path['rewards'].append(reward)
        path['env_infos'].append(env_info)
        path['agent_infos'].append(agent_info)
        if done:
            break
        obs = next_obs

    path['observations'] = env_spec.observation_space.flatten_n(
        path['observations'])
    path['actions'] = env_spec.action_space.flatten_n(path['actions'])
    path['rewards'] = tensor_utils.stack_tensor_list(path['rewards'])
    path['env_infos'] = tensor_utils.stack_tensor_dict_list(
        path['env_infos'])
    path['agent_infos'] = tensor_utils.stack_tensor_dict_list(
        path['agent_infos'])
    path['policy_params'] = policy.get_param_values(trainable=True)
    return path

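# Hypothetical usage sketch (the `sampler` and `policy` handles below are
# assumptions, not part of this module): after get_path returns, every
# per-step list has been stacked into an array with a shared leading time
# dimension T.
#   path = sampler.get_path(policy.get_param_values(trainable=True))
#   path['observations'].shape -> (T, flat_obs_dim)
#   path['actions'].shape      -> (T, flat_action_dim)
#   path['rewards'].shape      -> (T,)
#   path['agent_infos'], path['env_infos'] -> dicts of arrays, leading dim T
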
def step(self, action_n):
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = list(zip(*results))
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]

    # Workers may return results out of order; sort everything by env id.
    items = list(zip(ids, obs, rewards, dones, env_infos))
    items = sorted(items, key=lambda x: x[0])
    ids, obs, rewards, dones, env_infos = list(zip(*items))

    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)

    self.ts += 1
    dones[self.ts >= self.max_path_length] = True

    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))

def step(self, action_n):
    self.ts += 1
    ast_action_n = action_n
    os = [
        np.reshape(env.get_observation(), env.observation_space.shape)
        for env in self.envs
    ]
    action_n, action_info_n = self.sut.get_actions(os)
    if "mean" in action_info_n:
        action_n = action_info_n["mean"]
    elif "prob" in action_info_n:
        action_n = np.argmax(action_info_n["prob"], axis=1)
    if self.sut.recurrent:
        self.sut.prev_actions = self.sut.action_space.flatten_n(action_n)
    # action = self.env.action_space.sample()
    results = [
        env.ast_step(action, ast_action)
        for (action, ast_action,
             env) in zip(action_n, ast_action_n, self.envs)
    ]
    if self.open_loop:
        obs = [self._init_state for env in self.envs]
    else:
        obs = [
            np.reshape(ob, env.ast_observation_space.shape)
            for (ob, env) in zip(list(zip(*results))[0], self.envs)
        ]
    obs = np.asarray(obs)
    dones = list(zip(*results))[2]
    dones = np.asarray(dones)
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    infos = [env.ast_get_reward_info() for env in self.envs]
    for (i, info) in enumerate(infos):
        info['is_terminal'] = dones[i]
    rewards = [
        self.reward_function.give_reward(action=action, info=info)
        for (action, info) in zip(ast_action_n, infos)
    ]
    env_infos = infos
    rewards = np.asarray(rewards)
    for (i, done) in enumerate(dones):
        if done:
            if self._fixed_init_state:
                obs[i] = self.envs[i].ast_reset(self._init_state)[0]
            else:
                obs[i] = self.envs[i].ast_reset(
                    self.observation_space.sample())[0]
            self.ts[i] = 0
    self.sut.reset(dones)
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        env_infos)

def step(self, action_n):
    all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
    obs, rewards, dones, env_infos = list(
        map(list, list(zip(*all_results))))
    dones = np.asarray(dones)
    rewards = np.asarray(rewards)
    self.ts += 1
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = self.envs[i].reset()
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        env_infos)

def worker_run_step(G, action_n, scope):
    assert hasattr(G, 'parallel_vec_envs')
    assert scope in G.parallel_vec_envs
    env_template = G.parallel_vec_env_template[scope]
    ids = []
    step_results = []
    for (idx, env) in G.parallel_vec_envs[scope]:
        action = action_n[idx]
        ids.append(idx)
        step_results.append(tuple(env.step(action)))
    if len(step_results) == 0:
        return None
    obs, rewards, dones, env_infos = list(map(list, list(zip(*step_results))))
    obs = env_template.observation_space.flatten_n(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    env_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return ids, obs, rewards, dones, env_infos

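# The vectorized-env helpers above lean heavily on
# tensor_utils.stack_tensor_dict_list to merge per-env info dicts. A minimal
# NumPy sketch of that behavior (a simplified stand-in, not the library code):
import numpy as np


def stack_tensor_dict_list_sketch(dict_list):
    """Stack a list of {key: array-or-dict} into {key: stacked array}."""
    out = {}
    for k in dict_list[0].keys():
        vals = [d[k] for d in dict_list]
        if isinstance(vals[0], dict):
            out[k] = stack_tensor_dict_list_sketch(vals)  # nested info dicts
        else:
            out[k] = np.asarray(vals)
    return out


# e.g. two envs' info dicts -> {'success': array([0., 1.])}
_example = stack_tensor_dict_list_sketch([{'success': 0.}, {'success': 1.}])
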
def process_samples(self, itr, paths):  # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    """
    baselines = []
    returns = []
    total_steps = 0
    max_path_length = self.max_path_length

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)

    if self.flatten_input:
        paths = [
            dict(
                observations=(self.env_spec.observation_space.flatten_n(
                    path['observations'])),
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=path['dones']) for path in paths
        ]
    else:
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=path['dones']) for path in paths
        ]

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    lengths = np.asarray([v.sum() for v in valids])

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    return samples_data

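# The advantage/return computation shared by the process_samples and
# paths_to_tensors variants below reduces to a reverse discounted cumulative
# sum over TD residuals (GAE). A minimal NumPy sketch with made-up numbers;
# discount_cumsum_sketch is a simplified stand-in for the library's
# discount_cumsum, not the library code itself:
import numpy as np


def discount_cumsum_sketch(x, discount):
    """y[t] = x[t] + discount * y[t + 1], computed right to left."""
    y = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y


rewards = np.array([1.0, 1.0, 1.0])
baselines = np.array([0.5, 0.4, 0.3])       # V(s_t) predictions
discount, gae_lambda = 0.99, 0.95
path_baselines = np.append(baselines, 0)     # bootstrap value 0 past the end
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum_sketch(deltas, discount * gae_lambda)
returns = discount_cumsum_sketch(rewards, discount)
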
def paths_to_tensors(paths, max_episode_length, baseline_predictions, discount,
                     gae_lambda):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_episode_length (int): Maximum length of a single episode.
        baseline_predictions (numpy.ndarray): Predicted values of the GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.
        gae_lambda (float): Lambda used for generalized advantage
            estimation.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])

    """
    baselines = []
    returns = []
    total_steps = 0

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(baseline_predictions[idx], 0)
        deltas = (path['rewards'] + discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = discount_cumsum(deltas, discount * gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = discount_cumsum(path['rewards'], discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_episode_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_episode_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_episode_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_episode_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_episode_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_episode_length)

    lengths = np.asarray([v.sum() for v in valids])

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
    )

    return samples_data

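# "Make all paths the same length" above works by zero-padding each (T, ...)
# array to the maximum episode length and recording a `valids` mask of which
# timesteps are real. A minimal NumPy sketch of that padding (simplified
# stand-in for tensor_utils.pad_tensor / pad_tensor_n, with toy rewards):
import numpy as np


def pad_tensor_sketch(x, max_len):
    """Zero-pad a (T, ...) array along the time axis to max_len rows."""
    x = np.asarray(x)
    pad = np.zeros((max_len - len(x),) + x.shape[1:], dtype=x.dtype)
    return np.concatenate([x, pad], axis=0)


rewards = [np.array([1., 1.]), np.array([1., 1., 1., 1.])]
padded = np.stack([pad_tensor_sketch(r, 4) for r in rewards])    # shape (2, 4)
valids = np.stack([pad_tensor_sketch(np.ones_like(r), 4) for r in rewards])
lengths = valids.sum(axis=1)                                     # [2., 4.]
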
def paths_to_tensors(self, paths):  # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * tasks: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * trajectories: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * latent_infos: (dict)
            * env_infos: (dict)
            * trajectory_infos: (dict)
            * paths: (list[dict])

    """
    max_path_length = self.max_path_length

    def _extract_latent_infos(infos):
        """Extract and pack latent infos from dict.

        Args:
            infos (dict): A dict that contains latent infos with keys
                prefixed by 'latent_'.

        Returns:
            dict: A dict of latent infos.

        """
        latent_infos = dict()
        for k, v in infos.items():
            if k.startswith('latent_'):
                latent_infos[k[7:]] = v
        return latent_infos

    for path in paths:
        path['actions'] = (self._env_spec.action_space.flatten_n(
            path['actions']))
        path['tasks'] = self.policy.task_space.flatten_n(
            path['env_infos']['task_onehot'])
        path['latents'] = path['agent_infos']['latent']
        path['latent_infos'] = _extract_latent_infos(path['agent_infos'])

        # - Calculate a forward-looking sliding window.
        # - If step_space has shape (n, d), then trajs will have shape
        #   (n, window, d).
        # - The length of the sliding window is determined by the
        #   trajectory inference spec. We smear the last few elements to
        #   preserve the time dimension.
        # - Only the observation is used for a single step.
        #   Alternatively, stacked [observation, action] can be used in
        #   harder tasks.
        obs = pad_tensor(path['observations'], max_path_length)
        obs_flat = self._env_spec.observation_space.flatten_n(obs)
        steps = obs_flat
        window = self._inference.spec.input_space.shape[0]
        traj = np_tensor_utils.sliding_window(steps, window, smear=True)
        traj_flat = self._inference.spec.input_space.flatten_n(traj)
        path['trajectories'] = traj_flat

        _, traj_info = self._inference.get_latents(traj_flat)
        path['trajectory_infos'] = traj_info

    all_path_baselines = [self._baseline.predict(path) for path in paths]

    tasks = [path['tasks'] for path in paths]
    tasks = pad_tensor_n(tasks, max_path_length)

    trajectories = np.stack([path['trajectories'] for path in paths])

    latents = [path['latents'] for path in paths]
    latents = pad_tensor_n(latents, max_path_length)

    latent_infos = [path['latent_infos'] for path in paths]
    latent_infos = stack_tensor_dict_list(
        [pad_tensor_dict(p, max_path_length) for p in latent_infos])

    trajectory_infos = [path['trajectory_infos'] for path in paths]
    trajectory_infos = stack_tensor_dict_list(
        [pad_tensor_dict(p, max_path_length) for p in trajectory_infos])

    samples_data = paths_to_tensors(paths, max_path_length,
                                    all_path_baselines, self._discount,
                                    self._gae_lambda)
    samples_data['tasks'] = tasks
    samples_data['latents'] = latents
    samples_data['latent_infos'] = latent_infos
    samples_data['trajectories'] = trajectories
    samples_data['trajectory_infos'] = trajectory_infos
    return samples_data

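# A minimal NumPy sketch of the forward-looking "smeared" sliding window
# described in the comments above: for input of shape (n, d) it produces
# (n, window, d), repeating the last row so windows near the end of the
# trajectory stay full. This is a simplified illustration, not the library's
# sliding_window implementation, whose exact signature may differ:
import numpy as np


def sliding_window_sketch(t, window):
    """Forward-looking sliding window with the tail smeared (last row repeated)."""
    n, d = t.shape
    out = np.empty((n, window, d), dtype=t.dtype)
    for i in range(n):
        # Indices past the end of the trajectory are clamped to the last step.
        idx = np.minimum(np.arange(i, i + window), n - 1)
        out[i] = t[idx]
    return out


_steps = np.arange(10.0).reshape(5, 2)          # (n=5, d=2) toy observations
_trajs = sliding_window_sketch(_steps, window=3)  # shape (5, 3, 2)
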
def process_samples(self, itr, paths):
    baselines = []
    returns = []

    max_path_length = self.algo.max_path_length

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["deltas"] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        returns.append(path["returns"])

    # make all paths the same length
    obs = [path["observations"] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path["actions"] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path["rewards"] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path["returns"] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    advantages = [path["advantages"] for path in paths]
    advantages = tensor_utils.pad_tensor_n(advantages, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path["agent_infos"] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path["env_infos"] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path["returns"]) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    average_discounted_return = (np.mean(
        [path["returns"][0] for path in paths]))

    undiscounted_returns = [sum(path["rewards"]) for path in paths]

    ent = np.sum(
        self.algo.policy.distribution.entropy(agent_infos) *
        valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        advantages=advantages,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data

def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    """
    baselines = []
    returns = []

    max_path_length = self.max_path_length

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = special.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    average_discounted_return = (np.mean(
        [path['returns'][0] for path in paths]))

    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self.episode_reward_mean.extend(undiscounted_returns)

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data

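# A minimal sketch of the valids-weighted averaging used for the 'Entropy'
# statistic above: padded timesteps contribute zeros and dividing by
# valids.sum() keeps the statistic an average over real (non-padded) steps
# only. The numbers are made up for illustration:
import numpy as np

per_step_entropy = np.array([[0.9, 0.8, 0.7],
                             [0.6, 0.5, 0.0]])   # (n_paths, max_path_length)
valids = np.array([[1., 1., 1.],
                   [1., 1., 0.]])                # last step of path 2 is padding
avg_entropy = np.sum(per_step_entropy * valids) / np.sum(valids)  # = 0.7
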
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Parameters
    ----------
    itr : int
        The iteration number.
    paths : list[dict]
        The collected paths from the sampler.

    Returns
    -------
    samples_data : dict
        Processed sample data with the same trajectory length (padded with 0).
    """
    baselines = []
    returns = []

    max_path_length = self.max_path_length

    if self.flatten_input:
        paths = [
            dict(
                observations=(self.env_spec.observation_space.flatten_n(
                    path['observations'])),
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]
    else:
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    # average_discounted_return = (np.mean(
    #     [path['returns'][0] for path in paths]))

    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self.episode_reward_mean.extend(undiscounted_returns)

    # ent = np.sum(self.policy.distribution.entropy(agent_infos) *
    #              valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    return samples_data

def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    observations = []
    tasks = []
    tasks_gt = []
    latents = []
    latent_infos = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []

    # Resets
    o = env.reset()
    agent.reset()

    # Sample embedding network
    # NOTE: it is important to do this _once per rollout_, not once per
    # timestep, since we need correlated noise.
    t = env.active_task_one_hot
    task_gt = env.active_task_one_hot_gt
    z, latent_info = agent.get_latent(t)

    if animated:
        env.render()

    path_length = 0
    while path_length < max_path_length:
        # a, agent_info = agent.get_action(np.concatenate((t, o)))
        a, agent_info = agent.get_action_from_latent(z, o)
        # latent_info = agent_info["latent_info"]
        next_o, r, d, env_info = env.step(a)
        observations.append(agent.observation_space.flatten(o))
        tasks.append(t)
        tasks_gt.append(task_gt)
        # z = latent_info["mean"]
        latents.append(agent.latent_space.flatten(z))
        latent_infos.append(latent_info)
        rewards.append(r)
        actions.append(agent.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        tasks=tensor_utils.stack_tensor_list(tasks),
        tasks_gt=tensor_utils.stack_tensor_list(tasks_gt),
        latents=tensor_utils.stack_tensor_list(latents),
        latent_infos=tensor_utils.stack_tensor_dict_list(latent_infos),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )

def process_samples(self, itr, paths):
    baselines = []
    returns = []

    max_path_length = self.algo.max_path_length
    action_space = self.algo.env.action_space
    observation_space = self.algo.env.observation_space

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["deltas"] = deltas

    # calculate trajectory tensors (TODO: probably can do this in TF)
    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        returns.append(path["returns"])

        # Calculate trajectory samples
        #
        # Pad and flatten action and observation traces
        act = tensor_utils.pad_tensor(path['actions'], max_path_length)
        obs = tensor_utils.pad_tensor(path['observations'],
                                      max_path_length)
        act_flat = action_space.flatten_n(act)
        obs_flat = observation_space.flatten_n(obs)

        # Create a time series of stacked [act, obs] vectors
        # XXX now the inference network only looks at obs vectors
        # act_obs = np.concatenate([act_flat, obs_flat], axis=1)  # TODO reactivate for harder envs?
        act_obs = obs_flat
        # act_obs = act_flat

        # Calculate a forward-looking sliding window of the stacked vectors
        #
        # If act_obs has shape (n, d), then trajs will have shape
        # (n, window, d)
        #
        # The length of the sliding window is determined by the trajectory
        # inference spec. We smear the last few elements to preserve the
        # time dimension.
        window = self.algo.inference.input_space.shape[0]
        trajs = sliding_window(act_obs, window, 1, smear=True)
        trajs_flat = self.algo.inference.input_space.flatten_n(trajs)
        path['trajectories'] = trajs_flat

        # trajectory infos
        _, traj_infos = self.algo.inference.get_latents(trajs)
        path['trajectory_infos'] = traj_infos

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    # DEBUG CPU vars ######################
    cpu_adv = tensor_utils.concat_tensor_list(
        [path["advantages"] for path in paths])
    cpu_deltas = tensor_utils.concat_tensor_list(
        [path["deltas"] for path in paths])
    cpu_act = tensor_utils.concat_tensor_list(
        [path["actions"] for path in paths])
    cpu_obs = tensor_utils.concat_tensor_list(
        [path["observations"] for path in paths])
    cpu_agent_infos = tensor_utils.concat_tensor_dict_list(
        [path["agent_infos"] for path in paths])

    if self.algo.center_adv:
        cpu_adv = utils.center_advantages(cpu_adv)

    if self.algo.positive_adv:
        cpu_adv = utils.shift_advantages_to_positive(cpu_adv)
    #####################################

    # make all paths the same length
    obs = [path["observations"] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path["actions"] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    tasks = [path["tasks"] for path in paths]
    tasks = tensor_utils.pad_tensor_n(tasks, max_path_length)

    tasks_gt = [path['tasks_gt'] for path in paths]
    tasks_gt = tensor_utils.pad_tensor_n(tasks_gt, max_path_length)

    latents = [path['latents'] for path in paths]
    latents = tensor_utils.pad_tensor_n(latents, max_path_length)

    rewards = [path["rewards"] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path["returns"] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    trajectories = tensor_utils.stack_tensor_list(
        [path["trajectories"] for path in paths])

    agent_infos = [path["agent_infos"] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    latent_infos = [path["latent_infos"] for path in paths]
    latent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in latent_infos
    ])

    trajectory_infos = [path["trajectory_infos"] for path in paths]
    trajectory_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in trajectory_infos
    ])

    env_infos = [path["env_infos"] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path["returns"]) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    average_discounted_return = \
        np.mean([path["returns"][0] for path in paths])

    undiscounted_returns = [sum(path["rewards"]) for path in paths]

    ent = np.sum(
        self.algo.policy.distribution.entropy(agent_infos) *
        valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        tasks=tasks,
        latents=latents,
        trajectories=trajectories,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        latent_infos=latent_infos,
        trajectory_infos=trajectory_infos,
        env_infos=env_infos,
        paths=paths,
        cpu_adv=cpu_adv,  # DEBUG
        cpu_deltas=cpu_deltas,  # DEBUG
        cpu_obs=cpu_obs,  # DEBUG
        cpu_act=cpu_act,  # DEBUG
        cpu_agent_infos=cpu_agent_infos,  # DEBUG
    )

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data

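# The `ev` statistic computed above measures how much of the variance in the
# empirical returns the baseline explains. A minimal NumPy sketch of that
# formula (a simplified stand-in for special.explained_variance_1d, not the
# library code):
import numpy as np


def explained_variance_1d_sketch(ypred, y):
    """Return 1 - Var(y - ypred) / Var(y), or 0 when Var(y) is zero."""
    vary = np.var(y)
    if np.isclose(vary, 0):
        return 0.0
    return 1.0 - np.var(y - ypred) / vary
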
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    (Same as in batch_polopt, but without entropy and tabular recording.)

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    """
    baselines = []
    returns = []

    max_path_length = self.max_path_length

    if self.flatten_input:
        paths = [
            dict(
                observations=(self.env_spec.observation_space.flatten_n(
                    path['observations'])),
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]
    else:
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    # average_discounted_return = (np.mean(
    #     [path['returns'][0] for path in paths]))

    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self.episode_reward_mean.extend(undiscounted_returns)

    # ent = np.sum(self.policy.distribution.entropy(agent_infos) *
    #              valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    return samples_data