def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()
        assert isinstance(samples, SampleBatch)

    with self.load_timer:
        tuples_per_device = self.par_opt.load_data(
            self.local_evaluator.sess,
            samples.columns([key for key, _ in self.loss_inputs]))

    with self.grad_timer:
        for i in range(self.config.get("num_sgd_iter", 10)):
            batch_index = 0
            num_batches = (
                int(tuples_per_device) // int(self.per_device_batch_size))
            permutation = np.random.permutation(num_batches)
            while batch_index < num_batches:
                # TODO(ekl) support ppo's debugging features, e.g.
                # printing the current loss and tracing
                self.par_opt.optimize(
                    self.sess,
                    permutation[batch_index] * self.per_device_batch_size)
                batch_index += 1
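# --- Standalone sketch: permuted minibatch schedule ---
# A minimal, self-contained illustration of the SGD scheduling used in the
# grad_timer block above: each epoch visits every per-device minibatch once,
# in a fresh random order, by handing an integer row offset to the optimizer.
# The names tuples_per_device, per_device_batch_size and run_minibatch are
# illustrative stand-ins, not RLlib's API.
import numpy as np

tuples_per_device = 1024      # rows already loaded onto each device
per_device_batch_size = 128   # rows consumed per optimize() call
num_sgd_iter = 3              # SGD epochs over the loaded data


def run_minibatch(offset):
    # stand-in for self.par_opt.optimize(self.sess, offset)
    print("optimizing rows [%d, %d)" % (offset, offset + per_device_batch_size))


for _ in range(num_sgd_iter):
    num_batches = tuples_per_device // per_device_batch_size
    for batch_index in np.random.permutation(num_batches):
        run_minibatch(int(batch_index) * per_device_batch_size)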
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            batch = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            batch = self.local_evaluator.sample()

        # Handle everything as if multiagent
        if isinstance(batch, SampleBatch):
            batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)

        for policy_id, s in batch.policy_batches.items():
            for row in s.rows():
                if "weights" not in row:
                    row["weights"] = np.ones_like(row["rewards"])
                self.replay_buffers[policy_id].add(
                    pack_if_needed(row["obs"]),
                    row["actions"],
                    row["rewards"],
                    pack_if_needed(row["new_obs"]),
                    row["dones"],
                    row["weights"])

    if self.num_steps_sampled >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count
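# --- Standalone sketch: "handle everything as if multiagent" ---
# A self-contained illustration of the wrapping step above: a single-agent
# batch is keyed under one default policy ID so the replay path always sees
# {policy_id: batch}, and rows without an importance weight default to 1.
# The plain-dict "batch" is a hypothetical stand-in for SampleBatch.
import numpy as np

DEFAULT_POLICY_ID = "default_policy"

batch = {
    "obs": np.zeros(4),
    "actions": np.zeros(4, dtype=int),
    "rewards": np.array([1.0, 0.0, 0.5, 1.0]),
    "new_obs": np.zeros(4),
    "dones": np.zeros(4, dtype=bool),
}

# Wrap the single-agent batch so downstream code can iterate uniformly.
policy_batches = {DEFAULT_POLICY_ID: batch}

for policy_id, b in policy_batches.items():
    if "weights" not in b:
        b["weights"] = np.ones_like(b["rewards"])  # uniform priorities by default
    print(policy_id, b["weights"])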
def collect_samples(agents, config, local_evaluator):
    num_timesteps_so_far = 0
    trajectories = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.
    agent_dict = {}
    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent
    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        # Start task with next trajectory and record it in the dictionary.
        fut_sample2 = agent.sample.remote()
        agent_dict[fut_sample2] = agent
        next_sample = ray.get(fut_sample)
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)
    return SampleBatch.concat_samples(trajectories)
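# --- Standalone sketch: pipelined sampling with ray.wait() ---
# A minimal runnable example of the pattern used by collect_samples above:
# keep one sample task in flight per worker, and whenever one finishes,
# collect its result and immediately launch a replacement on that worker.
# ToyBatch, the worker count and the batch sizes are illustrative stand-ins.
import collections
import ray

ray.init(ignore_reinit_error=True)

ToyBatch = collections.namedtuple("ToyBatch", ["count"])


@ray.remote
def sample(worker_id):
    # stand-in for a remote evaluator's sample() call
    return ToyBatch(count=25)


num_workers, timesteps_per_batch = 4, 200
pending = {sample.remote(i): i for i in range(num_workers)}
trajectories, collected = [], 0

while collected < timesteps_per_batch:
    [done_id], _ = ray.wait(list(pending))
    worker_id = pending.pop(done_id)
    pending[sample.remote(worker_id)] = worker_id  # keep the worker busy
    batch = ray.get(done_id)
    collected += batch.count
    trajectories.append(batch)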
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()

    with self.grad_timer:
        grad = self.local_evaluator.compute_gradients(samples)
        self.local_evaluator.apply_gradients(grad)
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()

    with self.grad_timer:
        grad, _ = self.local_evaluator.compute_gradients(samples)
        self.local_evaluator.apply_gradients(grad)
        self.grad_timer.push_units_processed(samples.count)

    self.num_steps_sampled += samples.count
    self.num_steps_trained += samples.count
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()

    with self.grad_timer:
        grad = self.local_evaluator.compute_gradients(samples)
        self.local_evaluator.apply_gradients(grad)
        self.grad_timer.push_units_processed(samples.count)

    self.num_steps_sampled += samples.count
    self.num_steps_trained += samples.count
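# --- Standalone sketch: broadcasting weights through the object store ---
# A self-contained example of the update_weights step shared by the step()
# variants above: ray.put() stores one copy of the weights, and every remote
# evaluator is handed a reference to it instead of its own serialized copy.
# ToyEvaluator is a hypothetical stand-in for an RLlib evaluator actor.
import ray

ray.init(ignore_reinit_error=True)


@ray.remote
class ToyEvaluator:
    def __init__(self):
        self.weights = None

    def set_weights(self, weights):
        self.weights = weights

    def sample(self):
        return [0.0] * 10  # stand-in for a SampleBatch


remote_evaluators = [ToyEvaluator.remote() for _ in range(3)]

weights = ray.put({"fc1": [0.1, 0.2], "fc2": [0.3]})  # one shared copy
for e in remote_evaluators:
    e.set_weights.remote(weights)

# gather one round of samples, mirroring the sample_timer block above
samples = ray.get([e.sample.remote() for e in remote_evaluators])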
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            batch = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            batch = self.local_evaluator.sample()

        for row in batch.rows():
            self.replay_buffer.add(
                row["obs"], row["actions"], row["rewards"], row["new_obs"],
                row["dones"], row["weights"])

    if len(self.replay_buffer) >= self.replay_starts:
        self._optimize()

    self.num_steps_sampled += batch.count
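# --- Standalone sketch: replay buffer with a warm-up threshold ---
# A minimal replay loop matching the replay-based step() above: transitions
# are appended to a bounded buffer, and optimization only starts once at
# least replay_starts transitions have been stored. SimpleReplayBuffer and
# the toy transitions are illustrative, not RLlib's replay buffer classes.
import collections
import random


class SimpleReplayBuffer:
    def __init__(self, capacity=50000):
        self.storage = collections.deque(maxlen=capacity)

    def add(self, obs, action, reward, new_obs, done, weight):
        self.storage.append((obs, action, reward, new_obs, done, weight))

    def sample(self, n):
        return random.sample(list(self.storage), n)

    def __len__(self):
        return len(self.storage)


replay_buffer = SimpleReplayBuffer()
replay_starts, train_batch_size = 1000, 32


def optimize_step():
    # stand-in for self._optimize(): sample a minibatch and train on it
    minibatch = replay_buffer.sample(train_batch_size)
    return len(minibatch)


for t in range(2000):
    replay_buffer.add(obs=t, action=0, reward=0.0, new_obs=t + 1,
                      done=False, weight=1.0)
    if len(replay_buffer) >= replay_starts:
        optimize_step()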
def step(self):
    with self.update_weights_timer:
        if self.remote_evaluators:
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

    with self.sample_timer:
        if self.remote_evaluators:
            samples = SampleBatch.concat_samples(
                ray.get(
                    [e.sample.remote() for e in self.remote_evaluators]))
        else:
            samples = self.local_evaluator.sample()
        assert isinstance(samples, SampleBatch)

    with self.load_timer:
        tuples_per_device = self.par_opt.load_data(
            self.local_evaluator.sess,
            samples.columns([key for key, _ in self.loss_inputs]))

    with self.grad_timer:
        for i in range(self.num_sgd_iter):
            batch_index = 0
            num_batches = (
                int(tuples_per_device) // int(self.per_device_batch_size))
            permutation = np.random.permutation(num_batches)
            while batch_index < num_batches:
                # TODO(ekl) support ppo's debugging features, e.g.
                # printing the current loss and tracing
                self.par_opt.optimize(
                    self.sess,
                    permutation[batch_index] * self.per_device_batch_size)
                batch_index += 1

    self.num_steps_sampled += samples.count
    self.num_steps_trained += samples.count
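# --- Standalone sketch: driving an optimizer from a training loop ---
# A hedged illustration of how these step() methods are typically driven:
# the caller invokes step() repeatedly until a sampling budget is reached.
# StubOptimizer and the step sizes are hypothetical stand-ins, not RLlib code.
class StubOptimizer:
    def __init__(self):
        self.num_steps_sampled = 0
        self.num_steps_trained = 0

    def step(self):
        # a real step() would broadcast weights, collect samples, and train
        self.num_steps_sampled += 200
        self.num_steps_trained += 200


optimizer = StubOptimizer()
total_timesteps = 1000
while optimizer.num_steps_sampled < total_timesteps:
    optimizer.step()
print(optimizer.num_steps_sampled, optimizer.num_steps_trained)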