def _postprocess_dqn(policy_graph, sample_batch):
    obs, actions, rewards, new_obs, dones = [
        list(x) for x in sample_batch.columns(
            ["obs", "actions", "rewards", "new_obs", "dones"])]

    # N-step Q adjustments
    if policy_graph.config["n_step"] > 1:
        adjust_nstep(
            policy_graph.config["n_step"], policy_graph.config["gamma"],
            obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": obs,
        "actions": actions,
        "rewards": rewards,
        "new_obs": new_obs,
        "dones": dones,
        "weights": np.ones_like(rewards)})

    # Prioritize on the worker side
    if batch.count > 0 and policy_graph.config["worker_side_prioritization"]:
        td_errors = policy_graph.compute_td_error(
            batch["obs"], batch["actions"], batch["rewards"],
            batch["new_obs"], batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch

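# adjust_nstep() is called above and in the sample() methods below but is not
# defined in this section. The following is a minimal sketch consistent with
# its call sites, not necessarily the real implementation: it folds n-step
# rewards in place and truncates the n_step - 1 lookahead transitions, which
# is what makes the `batch.count == sample_batch_size` asserts below hold.
def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
    for i in range(len(rewards) - n_step + 1):
        if dones[i]:
            continue  # this transition already ends an episode
        for j in range(1, n_step):
            # Fold in the discounted reward and advance the target state.
            rewards[i] += gamma ** j * rewards[i + j]
            new_obs[i] = new_obs[i + j]
            if dones[i + j]:
                dones[i] = dones[i + j]
                break
    # Drop the trailing transitions that only served as lookahead.
    new_len = len(obs) - n_step + 1
    for arr in [obs, actions, rewards, new_obs, dones]:
        del arr[new_len:]
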
def _optimize(self):
    with self.replay_timer:
        if isinstance(self.replay_buffer, PrioritizedReplayBuffer):
            (obses_t, actions, rewards, obses_tp1, dones, weights,
             batch_indexes) = self.replay_buffer.sample(
                 self.train_batch_size, beta=self.prioritized_replay_beta)
        else:
            (obses_t, actions, rewards, obses_tp1,
             dones) = self.replay_buffer.sample(self.train_batch_size)
            weights = np.ones_like(rewards)
            batch_indexes = -np.ones_like(rewards)
        samples = SampleBatch({
            "obs": obses_t,
            "actions": actions,
            "rewards": rewards,
            "new_obs": obses_tp1,
            "dones": dones,
            "weights": weights,
            "batch_indexes": batch_indexes
        })

    with self.grad_timer:
        td_error = self.local_evaluator.compute_apply(samples)
        new_priorities = (np.abs(td_error) + self.prioritized_replay_eps)
        if isinstance(self.replay_buffer, PrioritizedReplayBuffer):
            self.replay_buffer.update_priorities(
                samples["batch_indexes"], new_priorities)
        self.grad_timer.push_units_processed(samples.count)

    self.num_steps_trained += samples.count

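# For reference, the `weights` and `beta` used by _optimize() come from
# proportional prioritized replay (Schaul et al.). The helper below is an
# illustrative sketch only -- the function name, the alpha default, and the
# O(N) arithmetic are assumptions; the real PrioritizedReplayBuffer computes
# the same quantities with a segment tree.
import numpy as np

def prioritized_probs_and_weights(priorities, beta, alpha=0.6):
    priorities = np.asarray(priorities, dtype=np.float64)
    # Sampling probability: P(i) = p_i^alpha / sum_k p_k^alpha
    probs = priorities ** alpha / np.sum(priorities ** alpha)
    # Importance-sampling weight: w_i = (N * P(i))^(-beta), normalized by
    # the max so that weights stay in (0, 1].
    weights = (len(priorities) * probs) ** (-beta)
    return probs, weights / weights.max()
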
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()
        assert isinstance(samples, SampleBatch)

    with self.load_timer:
        tuples_per_device = self.par_opt.load_data(
            self.local_evaluator.sess,
            samples.columns([key for key, _ in self.loss_inputs]))

    with self.grad_timer:
        for i in range(self.config.get("num_sgd_iter", 10)):
            batch_index = 0
            num_batches = (
                int(tuples_per_device) // int(self.per_device_batch_size))
            permutation = np.random.permutation(num_batches)
            while batch_index < num_batches:
                # TODO(ekl) support ppo's debugging features, e.g.
                # printing the current loss and tracing
                self.par_opt.optimize(
                    self.sess,
                    permutation[batch_index] * self.per_device_batch_size)
                batch_index += 1

def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            batch = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            batch = self.local_evaluator.sample()

        # Handle everything as if multiagent
        if isinstance(batch, SampleBatch):
            batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)

        for policy_id, s in batch.policy_batches.items():
            for row in s.rows():
                if "weights" not in row:
                    row["weights"] = np.ones_like(row["rewards"])
                self.replay_buffers[policy_id].add(
                    pack_if_needed(row["obs"]),
                    row["actions"],
                    row["rewards"],
                    pack_if_needed(row["new_obs"]),
                    row["dones"],
                    row["weights"])

    if self.num_steps_sampled >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count

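# pack_if_needed() above and pack() in the sample() methods below compress
# observations before they are stored in the replay buffer. The actual codec
# is not shown in this section; the sketch below uses zlib + pickle purely
# for illustration and is not the real helpers' implementation.
import pickle
import zlib

def pack(obj):
    # Serialize and compress an observation (typically an ndarray).
    return zlib.compress(pickle.dumps(obj))

def pack_if_needed(obj):
    # Leave already-packed bytes alone so the call is idempotent.
    return obj if isinstance(obj, bytes) else pack(obj)
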
def collect_samples(agents, config, local_evaluator):
    num_timesteps_so_far = 0
    trajectories = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.
    agent_dict = {}
    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent

    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        # Start task with next trajectory and record it in the dictionary.
        fut_sample2 = agent.sample.remote()
        agent_dict[fut_sample2] = agent

        next_sample = ray.get(fut_sample)
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)
    return SampleBatch.concat_samples(trajectories)

def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(self.config["sample_batch_size"] +
                   self.config["n_step"] - 1):
        action = self.agent.act(self.state)
        next_state, reward, done, _ = self.env.step(action)
        next_state = to_rainbow(next_state)
        obs.append(self.state.data.cpu().numpy())
        actions.append(action)
        rewards.append(reward)
        new_obs.append(next_state.data.cpu().numpy())
        dones.append(1.0 if done else 0.0)
        self.state = next_state
        self.episode_rewards[-1] += reward
        self.episode_lengths[-1] += 1
        if done:
            self.state = to_rainbow(self.env.reset())
            self.agent.reset_noise()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1

    # N-step Q adjustments
    if self.config["n_step"] > 1:
        # Adjust for steps lost from truncation
        self.local_timestep -= (self.config["n_step"] - 1)
        adjust_nstep(self.config["n_step"], self.config["gamma"],
                     obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": obs,
        "actions": actions,
        "rewards": rewards,
        "new_obs": new_obs,
        "dones": dones,
        "weights": np.ones_like(rewards)
    })
    assert batch.count == self.config["sample_batch_size"]

    td_errors = self.agent.compute_td_error(batch)
    batch.data["obs"] = [pack(o) for o in batch["obs"]]
    batch.data["new_obs"] = [pack(o) for o in batch["new_obs"]]
    new_priorities = (np.abs(td_errors) +
                      self.config["prioritized_replay_eps"])
    batch.data["weights"] = new_priorities
    return batch

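# compute_td_error() above is a method on the agent/policy graph and is not
# defined here. Numerically, the quantity it returns for a DQN-style target
# is sketched below; the function name and the precomputed-Q-value arguments
# (q_sa, q_target_sp1) are hypothetical stand-ins for the real network calls.
import numpy as np

def td_error_sketch(q_sa, q_target_sp1, rewards, dones, gamma, n_step=1):
    # delta = r + gamma^n * (1 - done) * max_a Q_target(s', a) - Q(s, a)
    return (rewards + gamma ** n_step * (1.0 - dones) *
            q_target_sp1.max(axis=1) - q_sa)
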
def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(
            self.config["sample_batch_size"] + self.config["n_step"] - 1):
        update_eps = self.exploration.value(self.local_timestep)
        action = self.act(
            np.array(self.obs)[None], update_eps=update_eps)[0]
        obs_tp1, reward, done, _ = self.env.step(action)
        obs.append(self.obs)
        actions.append(action)
        rewards.append(np.sign(reward))
        new_obs.append(obs_tp1)
        dones.append(1.0 if done else 0.0)
        self.obs = obs_tp1
        self.episode_rewards[-1] += reward
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1

    # N-step Q adjustments
    if self.config["n_step"] > 1:
        # Adjust for steps lost from truncation
        self.local_timestep -= (self.config["n_step"] - 1)
        adjust_nstep(
            self.config["n_step"], self.config["gamma"],
            obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": obs,
        "actions": actions,
        "rewards": rewards,
        "new_obs": new_obs,
        "dones": dones,
        "weights": np.ones_like(rewards)})
    assert batch.count == self.config["sample_batch_size"]

    # td_errors = self.agent.compute_td_error(batch)
    batch.data["obs"] = [pack(o) for o in batch["obs"]]
    batch.data["new_obs"] = [pack(o) for o in batch["new_obs"]]
    # new_priorities = (
    #     np.abs(td_errors) + self.config["prioritized_replay_eps"])
    # batch.data["weights"] = new_priorities
    return batch

def replay(self):
    with self.replay_timer:
        if len(self.replay_buffer) < self.replay_starts:
            return None

        (obses_t, actions, rewards, obses_tp1, dones, weights,
         batch_indexes) = self.replay_buffer.sample(
             self.train_batch_size, beta=self.prioritized_replay_beta)

        batch = SampleBatch({
            "obs": obses_t,
            "actions": actions,
            "rewards": rewards,
            "new_obs": obses_tp1,
            "dones": dones,
            "weights": weights,
            "batch_indexes": batch_indexes})
        return batch

def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()

    with self.grad_timer:
        grad = self.local_evaluator.compute_gradients(samples)
        self.local_evaluator.apply_gradients(grad)

def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()

    with self.grad_timer:
        grad, _ = self.local_evaluator.compute_gradients(samples)
        self.local_evaluator.apply_gradients(grad)
        self.grad_timer.push_units_processed(samples.count)

    self.num_steps_sampled += samples.count
    self.num_steps_trained += samples.count

def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()

    with self.grad_timer:
        grad = self.local_evaluator.compute_gradients(samples)
        self.local_evaluator.apply_gradients(grad)
        self.grad_timer.push_units_processed(samples.count)

    self.num_steps_sampled += samples.count
    self.num_steps_trained += samples.count

def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            batch = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            batch = self.local_evaluator.sample()

        for row in batch.rows():
            self.replay_buffer.add(
                row["obs"], row["actions"], row["rewards"], row["new_obs"],
                row["dones"], row["weights"])

    if len(self.replay_buffer) >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count

def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            batch = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            batch = self.local_evaluator.sample()

        for row in batch.rows():
            self.replay_buffer.add(row["obs"], row["actions"],
                                   row["rewards"], row["new_obs"],
                                   row["dones"], row["weights"])

    if len(self.replay_buffer) >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count

def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()
        assert isinstance(samples, SampleBatch)

    with self.load_timer:
        tuples_per_device = self.par_opt.load_data(
            self.local_evaluator.sess,
            samples.columns([key for key, _ in self.loss_inputs]))

    with self.grad_timer:
        for i in range(self.num_sgd_iter):
            batch_index = 0
            num_batches = (
                int(tuples_per_device) // int(self.per_device_batch_size))
            permutation = np.random.permutation(num_batches)
            while batch_index < num_batches:
                # TODO(ekl) support ppo's debugging features, e.g.
                # printing the current loss and tracing
                self.par_opt.optimize(
                    self.sess,
                    permutation[batch_index] * self.per_device_batch_size)
                batch_index += 1

    self.num_steps_sampled += samples.count
    self.num_steps_trained += samples.count

def _replay(self):
    samples = {}
    with self.replay_timer:
        for policy_id, replay_buffer in self.replay_buffers.items():
            if isinstance(replay_buffer, PrioritizedReplayBuffer):
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_indexes) = replay_buffer.sample(
                     self.train_batch_size,
                     beta=self.prioritized_replay_beta)
            else:
                (obses_t, actions, rewards, obses_tp1,
                 dones) = replay_buffer.sample(self.train_batch_size)
                weights = np.ones_like(rewards)
                batch_indexes = -np.ones_like(rewards)
            samples[policy_id] = SampleBatch({
                "obs": obses_t,
                "actions": actions,
                "rewards": rewards,
                "new_obs": obses_tp1,
                "dones": dones,
                "weights": weights,
                "batch_indexes": batch_indexes
            })
    return MultiAgentBatch(samples, self.train_batch_size)

def sample(self):
    """Sample rollouts from the environment.

    Called from step() in PolicyOptimizer.
    """
    observations, rewards, actions, logprobs, dones, values = \
        [], [], [], [], [], []
    done = False
    for step in range(self.config['steps_per_rollout']):
        value, action, logprob, mean = self.net.forward(
            to_variable(self.obs[np.newaxis], self.config['cuda']))
        action = (action.cpu().data.numpy()[0]
                  if self.config['cuda'] else action.data.numpy()[0])
        next_obs, reward, done, _ = self.env.step(action)
        if self.config['cuda']:
            # torch adds a batch dimension, so select the first (only) batch
            value, logprob, mean = (value.data.cpu().numpy()[0],
                                    logprob.data.cpu().numpy()[0],
                                    mean.data.cpu().numpy()[0])
        else:
            value, logprob, mean = (value.data.numpy()[0],
                                    logprob.data.numpy()[0],
                                    mean.data.numpy()[0])
        observations.append(self.obs)
        actions.append(action)
        rewards.append(reward)
        logprobs.append(logprob)
        values.append(value)
        dones.append(done)
        self.obs = next_obs
        if done:
            # reset the environment
            self.obs = self.env.reset()

    if done:
        last_value = 0.0
    else:
        # Bootstrap: we only need the value of the last state.
        value, action, logprob, mean = self.net.forward(
            to_variable(self.obs[np.newaxis], self.config['cuda']))
        if self.config['cuda']:
            # unpack the single scalar value from the batch dimension
            value, = value.data.cpu().numpy()[0]
        else:
            value, = value.data.numpy()[0]
        last_value = value

    # same as ppo_single/model/ppo.py
    observations = np.asarray(observations)
    rewards = np.asarray(rewards)
    logprobs = np.asarray(logprobs)
    dones = np.asarray(dones)
    values = np.asarray(values)
    actions = np.asarray(actions)
    returns = calculate_returns(rewards, dones, last_value,
                                self.config['gamma'])
    return SampleBatch({
        'observations': observations,
        'rewards': rewards,
        'logprobs': logprobs,
        'dones': dones,
        'values': values,
        'actions': actions,
        'returns': returns[:-1]
    })

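# calculate_returns() is not defined in this section. A minimal sketch
# consistent with the call site above (the caller drops the trailing
# bootstrap entry via returns[:-1]) would be a standard discounted backward
# recursion; this is an assumption, not necessarily the original helper:
import numpy as np

def calculate_returns(rewards, dones, last_value, gamma):
    # returns[t] = r_t + gamma * returns[t+1], cut off at episode ends;
    # returns[-1] holds the bootstrap value for the final state.
    returns = np.zeros(len(rewards) + 1)
    returns[-1] = last_value
    for t in reversed(range(len(rewards))):
        returns[t] = rewards[t] + gamma * returns[t + 1] * (1.0 - dones[t])
    return returns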