def testConcatenate(self):
    d1 = {"s": np.array([0, 1]), "a": np.array([2, 3])}
    d2 = {"s": np.array([4, 5]), "a": np.array([6, 7])}
    d = concatenate([d1, d2])
    assert_allclose(d["s"], np.array([0, 1, 4, 5]))
    assert_allclose(d["a"], np.array([2, 3, 6, 7]))
    D = concatenate([d])
    assert_allclose(D["s"], np.array([0, 1, 4, 5]))
    assert_allclose(D["a"], np.array([2, 3, 6, 7]))
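# The test above assumes a `concatenate` helper that merges a list of dicts of
# arrays by concatenating the values key-wise. The sketch below is a minimal
# version consistent with those assertions; it is an assumption, not
# necessarily the implementation used in the codebase.
import numpy as np

def concatenate(list_of_dicts):
    # Concatenate the arrays stored under each key across all dicts.
    keys = list_of_dicts[0].keys()
    return {key: np.concatenate([d[key] for d in list_of_dicts])
            for key in keys}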
def compute_steps(self, config, obs_filter, rew_filter):
    """Compute multiple rollouts and concatenate the results.

    Args:
        config: Configuration parameters.
        obs_filter: Function that is applied to each of the observations.
        rew_filter: Function that is applied to each of the rewards.

    Returns:
        trajectory: Concatenated trajectories.
        total_rewards: Total rewards of the trajectories.
        trajectory_lengths: Lengths of the trajectories.
        obs_filter: Updated observation filter.
        rew_filter: Updated reward filter.
    """
    num_steps_so_far = 0
    trajectories = []
    self.update_filters(obs_filter, rew_filter)
    while num_steps_so_far < config["min_steps_per_task"]:
        rollout = self.sampler.get_data()
        trajectory = process_rollout(
            rollout, self.reward_filter, config["gamma"],
            config["lambda"], use_gae=config["use_gae"])
        num_steps_so_far += trajectory["rewards"].shape[0]
        trajectories.append(trajectory)
    metrics = self.sampler.get_metrics()
    total_rewards, trajectory_lengths = zip(*[
        (c.episode_reward, c.episode_length) for c in metrics])
    updated_obs_filter = self.sampler.get_obs_filter(flush=True)
    return (
        concatenate(trajectories),
        total_rewards,
        trajectory_lengths,
        updated_obs_filter,
        self.reward_filter)
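# compute_steps and collect_samples assume a filter interface: a filter is
# callable (applied to each observation or reward) and supports copy() and
# update(), as used below. The class here is a minimal no-op sketch of that
# interface under those assumptions; the actual NoFilter and the stateful
# filters in the codebase may differ.
class NoFilter(object):
    def __call__(self, x):
        # Pass the value through unchanged.
        return x

    def copy(self):
        # Return an independent copy of this (stateless) filter.
        return NoFilter()

    def update(self, other):
        # Nothing to merge for a stateless filter.
        pass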
def collect_samples(agents, config, observation_filter, reward_filter):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    # This variable maps the object IDs of trajectories that are currently
    # being computed to the agents that they are computed on; we start some
    # initial tasks here.
    agent_dict = {
        agent.compute_steps.remote(
            config, observation_filter, reward_filter): agent
        for agent in agents}
    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [next_trajectory], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(next_trajectory)
        # Start a new task on this agent and record it in the dictionary.
        agent_dict[agent.compute_steps.remote(
            config, observation_filter, reward_filter)] = agent
        trajectory, rewards, lengths, obs_f, rew_f = ray.get(next_trajectory)
        total_rewards.extend(rewards)
        trajectory_lengths.extend(lengths)
        num_timesteps_so_far += sum(lengths)
        trajectories.append(trajectory)
        observation_filter.update(obs_f)
        reward_filter.update(rew_f)
    return (concatenate(trajectories), np.mean(total_rewards),
            np.mean(trajectory_lengths))
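# A hedged usage sketch for collect_samples above. `Agent` is a hypothetical
# Ray actor class exposing the compute_steps method shown earlier, and the
# config values are illustrative only; the keys are the ones actually read by
# collect_samples and compute_steps ("timesteps_per_batch",
# "min_steps_per_task", "gamma", "lambda", "use_gae").
import ray

ray.init()
config = {
    "timesteps_per_batch": 4000,
    "min_steps_per_task": 200,
    "gamma": 0.995,
    "lambda": 0.97,
    "use_gae": True,
}
# Agent is a hypothetical @ray.remote actor class, not defined in this file.
agents = [Agent.remote() for _ in range(4)]
trajectory, mean_reward, mean_length = collect_samples(
    agents, config, NoFilter(), NoFilter())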
def collect_samples(agents, config, observation_filter=NoFilter(),
                    reward_filter=NoFilter()):
    num_timesteps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    # This variable maps the object IDs of trajectories that are currently
    # being computed to the agents that they are computed on; we start some
    # initial tasks here.
    agent_dict = {
        agent.compute_steps.remote(
            config["gamma"], config["lambda"], config["horizon"],
            config["min_steps_per_task"]): agent
        for agent in agents}
    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [next_trajectory], _ = ray.wait(list(agent_dict.keys()))
        agent = agent_dict.pop(next_trajectory)
        # Start a new task on this agent and record it in the dictionary.
        agent_dict[agent.compute_steps.remote(
            config["gamma"], config["lambda"], config["horizon"],
            config["min_steps_per_task"])] = agent
        trajectory, rewards, lengths = ray.get(next_trajectory)
        total_rewards.extend(rewards)
        trajectory_lengths.extend(lengths)
        num_timesteps_so_far += len(trajectory["dones"])
        trajectories.append(trajectory)
    return (concatenate(trajectories), np.mean(total_rewards),
            np.mean(trajectory_lengths))
def compute_steps(self, gamma, lam, horizon, min_steps_per_task,
                  observation_filter, reward_filter):
    """Compute multiple rollouts and concatenate the results.

    Args:
        gamma: MDP discount factor.
        lam: GAE(lambda) parameter.
        horizon: Number of steps after which a rollout gets cut.
        min_steps_per_task: Lower bound on the number of states to be
            collected.
        observation_filter: Function that is applied to each of the
            observations.
        reward_filter: Function that is applied to each of the rewards.

    Returns:
        trajectory: Concatenated trajectories.
        total_rewards: Total rewards of the trajectories.
        trajectory_lengths: Lengths of the trajectories.
        observation_filter: Updated observation filter.
        reward_filter: Updated reward filter.
    """
    # Update our local copies of the filters.
    self.observation_filter = observation_filter.copy()
    self.reward_filter = reward_filter.copy()
    num_steps_so_far = 0
    trajectories = []
    total_rewards = []
    trajectory_lengths = []
    while True:
        trajectory = self.compute_trajectory(gamma, lam, horizon)
        total_rewards.append(
            trajectory["raw_rewards"].sum(axis=0).mean())
        trajectory_lengths.append(
            np.logical_not(trajectory["dones"]).sum(axis=0).mean())
        trajectory = flatten(trajectory)
        not_done = np.logical_not(trajectory["dones"])
        # Filter out states that are done. We do this because trajectories
        # are batched and only cut once all of the trajectories in the
        # batch have terminated, so we can potentially discard some of the
        # states here.
        trajectory = {key: val[not_done]
                      for key, val in trajectory.items()}
        num_steps_so_far += trajectory["raw_rewards"].shape[0]
        trajectories.append(trajectory)
        if num_steps_so_far >= min_steps_per_task:
            break
    return (
        concatenate(trajectories),
        total_rewards,
        trajectory_lengths,
        self.observation_filter,
        self.reward_filter)
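# compute_steps assumes a `flatten` helper that collapses the time and batch
# dimensions of every array in the trajectory, so that "dones" becomes a 1-D
# mask that can be used to drop finished states as above. The sketch below is
# a minimal version under that assumption; the helper in the codebase may
# reshape or order the entries differently.
def flatten(trajectory):
    # Merge the leading (time, batch) dimensions of each array into one.
    return {key: val.reshape(-1, *val.shape[2:])
            for key, val in trajectory.items()}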