def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch|MultiAgentBatch from evaluating the current policies. """ batches = [self.sampler.get_data()] steps_so_far = batches[0].count # In truncate_episodes mode, never pull more than 1 batch per env. # This avoids over-running the target batch size. if self.batch_mode == "truncate_episodes": max_batches = self.num_envs else: max_batches = float("inf") while steps_so_far < self.batch_steps and len(batches) < max_batches: batch = self.sampler.get_data() steps_so_far += batch.count batches.append(batch) batches.extend(self.sampler.get_extra_batches()) batch = batches[0].concat_samples(batches) if self.compress_observations: if isinstance(batch, MultiAgentBatch): for data in batch.policy_batches.values(): data["obs"] = [pack(o) for o in data["obs"]] data["new_obs"] = [pack(o) for o in data["new_obs"]] else: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch
def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch|MultiAgentBatch from evaluating the current policies. """ batches = [self.sampler.get_data()] steps_so_far = batches[0].count while steps_so_far < self.batch_steps: batch = self.sampler.get_data() steps_so_far += batch.count batches.append(batch) batch = batches[0].concat_samples(batches) if self.compress_observations: if isinstance(batch, MultiAgentBatch): for data in batch.policy_batches.values(): data["obs"] = [pack(o) for o in data["obs"]] data["new_obs"] = [pack(o) for o in data["new_obs"]] else: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch
def compress(self, bulk=False, columns=frozenset(["obs", "new_obs"])):
    for key in columns:
        if key in self.data:
            if bulk:
                self.data[key] = pack(self.data[key])
            else:
                self.data[key] = np.array(
                    [pack(o) for o in self.data[key]])

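Every snippet in this listing leans on a `pack` helper to shrink observation arrays before they are stored or shipped between processes, but the helper itself never appears here. The block below is a minimal stand-in, not the original implementation: it assumes `pack` turns a numpy array into an ASCII-safe compressed blob, here via the standard-library `zlib` and `base64`, and `unpack` reverses it.

# Minimal stand-in for the pack/unpack helpers used throughout this listing.
# Assumption: the originals compress a numpy array into an ASCII-safe blob;
# this sketch uses zlib + base64 and records dtype/shape so the array can be
# restored exactly. It is illustrative, not the library's implementation.
import base64
import zlib

import numpy as np


def pack(arr):
    """Compress a numpy array into a base64-encoded byte string."""
    arr = np.asarray(arr)
    header = "{};{}".format(arr.dtype.str, ",".join(map(str, arr.shape)))
    payload = header.encode("ascii") + b"\0" + zlib.compress(arr.tobytes())
    return base64.b64encode(payload)


def unpack(blob):
    """Invert pack(): restore the original numpy array."""
    header, compressed = base64.b64decode(blob).split(b"\0", 1)
    dtype_str, shape_str = header.decode("ascii").split(";")
    shape = tuple(int(s) for s in shape_str.split(",")) if shape_str else ()
    flat = np.frombuffer(zlib.decompress(compressed), dtype=np.dtype(dtype_str))
    return flat.reshape(shape)


if __name__ == "__main__":
    obs = np.random.rand(4, 84, 84).astype(np.float32)
    assert np.array_equal(unpack(pack(obs)), obs)  # lossless round trip
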
def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch|MultiAgentBatch from evaluating the current policies. """ if log_once("sample_start"): logger.info("Generating sample batch of size {}".format( self.sample_batch_size)) batches = [self.input_reader.next()] steps_so_far = batches[0].count # In truncate_episodes mode, never pull more than 1 batch per env. # This avoids over-running the target batch size. if self.batch_mode == "truncate_episodes": max_batches = self.num_envs else: max_batches = float("inf") while steps_so_far < self.sample_batch_size and len( batches) < max_batches: batch = self.input_reader.next() steps_so_far += batch.count batches.append(batch) batch = batches[0].concat_samples(batches) if self.callbacks.get("on_sample_end"): self.callbacks["on_sample_end"]({ "evaluator": self, "samples": batch }) # Always do writes prior to compression for consistency and to allow # for better compression inside the writer. self.output_writer.write(batch) # Do off-policy estimation if needed if self.reward_estimators: for sub_batch in batch.split_by_episode(): for estimator in self.reward_estimators: estimator.process(sub_batch) if log_once("sample_end"): logger.info("Completed sample batch:\n\n{}\n".format( summarize(batch))) if self.compress_observations: if isinstance(batch, MultiAgentBatch): for data in batch.policy_batches.values(): data["obs"] = [pack(o) for o in data["obs"]] data["new_obs"] = [pack(o) for o in data["new_obs"]] else: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch
def _compress_in_place(path, value):
    if path[0] not in columns:
        return
    curr = self
    for i, p in enumerate(path):
        if i == len(path) - 1:
            if bulk:
                curr[p] = pack(value)
            else:
                curr[p] = np.array([pack(o) for o in value])
        curr = curr[p]

def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch|MultiAgentBatch from evaluating the current policies. """ batches = [self.input_reader.next()] steps_so_far = batches[0].count # In truncate_episodes mode, never pull more than 1 batch per env. # This avoids over-running the target batch size. if self.batch_mode == "truncate_episodes": max_batches = self.num_envs else: max_batches = float("inf") while steps_so_far < self.sample_batch_size and len( batches) < max_batches: batch = self.input_reader.next() steps_so_far += batch.count batches.append(batch) batch = batches[0].concat_samples(batches) if self.callbacks.get("on_sample_end"): self.callbacks["on_sample_end"]({ "evaluator": self, "samples": batch }) # Always do writes prior to compression for consistency and to allow # for better compression inside the writer. self.output_writer.write(batch) # Do off-policy estimation if needed if self.reward_estimators: for sub_batch in batch.split_by_episode(): for estimator in self.reward_estimators: estimator.process(sub_batch) if self.compress_observations: if isinstance(batch, MultiAgentBatch): for data in batch.policy_batches.values(): data["obs"] = [pack(o) for o in data["obs"]] data["new_obs"] = [pack(o) for o in data["new_obs"]] else: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch
def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch from evaluating the current policies. """ batch = self.policy_map["default"].postprocess_trajectory( self.sampler.get_data()) if self.compress_observations: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch
def compress(self,
             bulk: bool = False,
             columns: Set[str] = frozenset(["obs", "new_obs"])) -> None:
    """Compresses the data buffers (by column) in place.

    Args:
        bulk (bool): Whether to compress across the batch dimension (0)
            as well. If False will compress n separate list items, where n
            is the batch size.
        columns (Set[str]): The columns to compress. Default: Only
            compress the obs and new_obs columns.
    """
    for key in columns:
        if key in self.keys():
            if bulk:
                self[key] = pack(self[key])
            else:
                self[key] = np.array([pack(o) for o in self[key]])

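For the `compress` variant above, the practical difference between `bulk=True` and the default is whether one blob is produced for the whole column or one blob per row. The toy below reproduces just that behaviour on a plain dict; the names and the zlib-based `pack` stand-in are invented for illustration and are not the batch class used above.

# Toy illustration of column compression in "bulk" vs per-row mode.
# pack() here is a zlib-based stand-in; the real helper may differ, but the
# shape of the result is the same: bulk=True -> one blob per column,
# bulk=False -> one blob per row (stored as an object array).
import zlib

import numpy as np


def pack(arr):
    return zlib.compress(np.asarray(arr).tobytes())


def compress_columns(batch, bulk=False, columns=("obs", "new_obs")):
    for key in columns:
        if key in batch:
            if bulk:
                batch[key] = pack(batch[key])  # single blob for the column
            else:
                batch[key] = np.array(
                    [pack(row) for row in batch[key]],
                    dtype=object)  # one blob per row
    return batch


if __name__ == "__main__":
    batch = {"obs": np.zeros((32, 84, 84), dtype=np.uint8),
             "new_obs": np.zeros((32, 84, 84), dtype=np.uint8),
             "rewards": np.zeros(32)}
    per_row = compress_columns({k: v.copy() for k, v in batch.items()})
    bulk = compress_columns({k: v.copy() for k, v in batch.items()}, bulk=True)
    print(type(per_row["obs"]), per_row["obs"].shape)  # ndarray of 32 blobs
    print(type(bulk["obs"]), len(bulk["obs"]))         # one bytes object
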
def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch|MultiAgentBatch from evaluating the current policies. """ batches = [self.input_reader.next()] steps_so_far = batches[0].count # In truncate_episodes mode, never pull more than 1 batch per env. # This avoids over-running the target batch size. if self.batch_mode == "truncate_episodes": max_batches = self.num_envs else: max_batches = float("inf") while steps_so_far < self.sample_batch_size and len( batches) < max_batches: batch = self.input_reader.next() steps_so_far += batch.count batches.append(batch) batch = batches[0].concat_samples(batches) if self.callbacks.get("on_sample_end"): self.callbacks["on_sample_end"]({ "evaluator": self, "samples": batch }) # Always do writes prior to compression for consistency and to allow # for better compression inside the writer. self.output_writer.write(batch) if self.compress_observations: if isinstance(batch, MultiAgentBatch): for data in batch.policy_batches.values(): data["obs"] = [pack(o) for o in data["obs"]] data["new_obs"] = [pack(o) for o in data["new_obs"]] else: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch
def sample(self): """Evaluate the current policies and return a batch of experiences. Return: SampleBatch from evaluating the current policies. """ batches = [self.sampler.get_data()] steps_so_far = batches[0].count while steps_so_far < self.batch_steps: batch = self.sampler.get_data() steps_so_far += batch.count batches.append(batch) batch = SampleBatch.concat_samples(batches) if self.compress_observations: batch["obs"] = [pack(o) for o in batch["obs"]] batch["new_obs"] = [pack(o) for o in batch["new_obs"]] return batch
def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(
            self.config["sample_batch_size"] + self.config["n_step"] - 1):
        ob, act, rew, ob1, done = self._step(self.global_timestep)
        obs.append(ob)
        actions.append(act)
        rewards.append(rew)
        new_obs.append(ob1)
        dones.append(done)

    # N-step Q adjustments
    if self.config["n_step"] > 1:
        # Adjust for steps lost from truncation
        self.local_timestep -= (self.config["n_step"] - 1)
        adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                     actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": [pack(np.array(o)) for o in obs],
        "actions": actions,
        "rewards": rewards,
        "new_obs": [pack(np.array(o)) for o in new_obs],
        "dones": dones,
        "weights": np.ones_like(rewards)
    })
    assert (batch.count == self.config["sample_batch_size"])

    # Prioritize on the worker side
    if self.config["worker_side_prioritization"]:
        td_errors = self.ddpg_graph.compute_td_error(
            self.sess, obs, batch["actions"], batch["rewards"], new_obs,
            batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + self.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch

def sample(self):
    obs, actions, rewards, new_obs, dones = [], [], [], [], []
    for _ in range(
            self.config["sample_batch_size"] + self.config["n_step"] - 1):
        ob, act, rew, ob1, done = self._step(self.global_timestep)
        obs.append(ob)
        actions.append(act)
        rewards.append(rew)
        new_obs.append(ob1)
        dones.append(done)

    # N-step Q adjustments
    if self.config["n_step"] > 1:
        # Adjust for steps lost from truncation
        self.local_timestep -= (self.config["n_step"] - 1)
        adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                     actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": [pack(np.array(o)) for o in obs],
        "actions": actions,
        "rewards": rewards,
        "new_obs": [pack(np.array(o)) for o in new_obs],
        "dones": dones,
        "weights": np.ones_like(rewards)
    })
    assert (batch.count == self.config["sample_batch_size"])

    # Prioritize on the worker side
    if self.config["worker_side_prioritization"]:
        td_errors = self.dqn_graph.compute_td_error(
            self.sess, obs, batch["actions"], batch["rewards"], new_obs,
            batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + self.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch

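The two snippets above fold rewards into n-step returns with `adjust_nstep` before packing observations: they collect `sample_batch_size + n_step - 1` transitions and assert an exact batch size afterwards. The helper itself is not shown in this listing; the sketch below is an illustrative version consistent with that usage (accumulate discounted rewards over up to n steps, stop at a terminal, point `new_obs`/`dones` at the last step reached, and drop the trailing steps that lack a full lookahead). It is an assumption, not necessarily the library's routine.

# Illustrative in-place n-step folding, matching how the callers above use
# it, but not necessarily the library's implementation.
def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
    traj_len = len(rewards)
    for i in range(traj_len):
        for j in range(1, n_step):
            if i + j >= traj_len or dones[i]:
                break  # ran out of steps, or hit a terminal
            rewards[i] += (gamma ** j) * rewards[i + j]
            new_obs[i] = new_obs[i + j]
            dones[i] = dones[i + j]
    if n_step > 1:
        # Drop the trailing n_step - 1 entries, which lack a full lookahead;
        # this is what lets the callers assert an exact batch size.
        for lst in (obs, actions, rewards, new_obs, dones):
            del lst[-(n_step - 1):]


if __name__ == "__main__":
    obs = ["s0", "s1", "s2", "s3", "s4"]
    actions = [0, 1, 0, 1, 0]
    rewards = [1.0, 1.0, 1.0, 1.0, 1.0]
    new_obs = ["s1", "s2", "s3", "s4", "s5"]
    dones = [False] * 5
    adjust_nstep(3, 0.9, obs, actions, rewards, new_obs, dones)
    print(rewards)   # ~[2.71, 2.71, 2.71] (1 + 0.9 + 0.81 each)
    print(new_obs)   # ['s3', 's4', 's5']
    print(len(obs))  # 3: the 5 collected steps shrink by n_step - 1
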
def _to_jsonable(v, compress):
    if compress:
        return str(pack(v))
    elif isinstance(v, np.ndarray):
        return v.tolist()
    return v

def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]
    np.random.seed()
    rand_buff1 = np.random.choice(opt.num_buffers, 1)[0]

    random_steps = 0

    while True:
        # ------ env set up ------
        env = TradingEnv()
        # env = Wrapper(env, opt.action_repeat, opt.reward_scale)
        # ------ env set up end ------

        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        ep_score, ep_target_bias = 0, 0

        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o,))
        else:
            o_queue.append((o,))

        t_queue = 1

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        # for a_l_ratio control
        np.random.seed()
        rand_buff = np.random.choice(opt.num_buffers, 1)[0]
        last_learner_steps, last_actor_steps, _size = ray.get(
            replay_buffer[rand_buff].get_counts.remote())

        while True:
            # don't need to random sample action if load weights from local.
            if random_steps > opt.start_steps or opt.weights_file or opt.recover:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                random_steps += 1

            # Step the env
            o2, r, d, info = env.step(a)

            ep_ret += r
            ep_score += info['score']
            ep_target_bias += info['target_bias']
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            a_r_d_queue.append((a, r, d,))

            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2,))
            else:
                o_queue.append((o2,))

            # scheme 1:
            # TODO and t_queue % 2 == 0: %1 lead to q smaller
            # TODO
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(
                    o_queue, a_r_d_queue, worker_index)

            t_queue += 1

            # End of episode. Training (ep_len times).
            # if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
            if d or ep_len > opt.max_ep_len:
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())
                # print('rollout ep_len:', ep_len * opt.action_repeat, 'ep_score:', ep_score,
                #       'ep_target_bias:', ep_target_bias)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                t_queue = 1

                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o,))
                else:
                    o_queue.append((o,))

                # for a_l_ratio control
                learner_steps, actor_steps, _size = ray.get(
                    replay_buffer[rand_buff].get_counts.remote())
                while (actor_steps - last_actor_steps) / (
                        learner_steps - last_learner_steps + 1) > opt.a_l_ratio \
                        and last_learner_steps > 0:
                    time.sleep(1)
                    learner_steps, actor_steps, _size = ray.get(
                        replay_buffer[rand_buff].get_counts.remote())

def _to_jsonable(v, compress: bool) -> Any:
    if compress and compression_supported():
        return str(pack(v))
    elif isinstance(v, np.ndarray):
        return v.tolist()
    return v

def worker_rollout(ps, replay_buffer, opt, worker_index):
    agent = Actor(opt, job="worker")
    keys = agent.get_weights()[0]

    filling_steps = 0
    while True:
        # ------ env set up ------
        env = Wrapper(gym.make(opt.env_name), opt.obs_noise, opt.act_noise,
                      opt.reward_scale, 3)
        # ------ env set up end ------

        ################################## deques
        o_queue = deque([], maxlen=opt.Ln + 1)
        a_r_d_queue = deque([], maxlen=opt.Ln)
        ################################## deques

        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        ################################## deques reset
        t_queue = 1
        if opt.model == "cnn":
            compressed_o = pack(o)
            o_queue.append((compressed_o, ))
        else:
            o_queue.append((o, ))
        ################################## deques reset

        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)

        while True:
            # don't need to random sample action if load weights from local.
            if filling_steps > opt.start_steps or opt.weights_file:
                a = agent.get_action(o, deterministic=False)
            else:
                a = env.action_space.sample()
                filling_steps += 1

            # Step the env
            o2, r, d, _ = env.step(a)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len*opt.action_repeat >= opt.max_ep_len else d

            o = o2

            #################################### deques store
            a_r_d_queue.append((a, r, d, ))

            if opt.model == "cnn":
                compressed_o2 = pack(o2)
                o_queue.append((compressed_o2, ))
            else:
                o_queue.append((o2, ))

            # scheme 1:
            # TODO and t_queue % 2 == 0: %1 lead to q smaller
            # TODO
            if t_queue >= opt.Ln and t_queue % opt.save_freq == 0:
                replay_buffer[np.random.choice(opt.num_buffers, 1)[0]].store.remote(
                    o_queue, a_r_d_queue, worker_index)

            t_queue += 1
            #################################### deques store

            # End of episode. Training (ep_len times).
            if d or (ep_len * opt.action_repeat >= opt.max_ep_len):
                # TODO
                sample_times, steps, _ = ray.get(
                    replay_buffer[0].get_counts.remote())
                print('rollout_ep_len:', ep_len * opt.action_repeat,
                      'rollout_ep_ret:', ep_ret)

                if steps > opt.start_steps:
                    # update parameters every episode
                    weights = ray.get(ps.pull.remote(keys))
                    agent.set_weights(keys, weights)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

                ################################## deques reset
                t_queue = 1
                if opt.model == "cnn":
                    compressed_o = pack(o)
                    o_queue.append((compressed_o, ))
                else:
                    o_queue.append((o, ))

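Both `worker_rollout` variants ship the same pair of deques to a remote replay buffer: `Ln + 1` observations (pack-compressed when `opt.model == "cnn"`) and the `Ln` (action, reward, done) triples between them. How the buffer reassembles these is not part of this listing; the hypothetical consumer-side helper below only illustrates the alignment, with decompression of packed entries left out.

# Hypothetical consumer-side view of the two deques the workers ship to the
# replay buffer. o_queue holds Ln + 1 observations and a_r_d_queue holds the
# Ln (action, reward, done) triples between them, so zipping them back
# together yields ordinary (o, a, r, d, o2) transitions. Any pack()-compressed
# entries would additionally be run through an unpack() step, omitted here.
from collections import deque

import numpy as np


def queues_to_transitions(o_queue, a_r_d_queue):
    """Align the two deques into (o, a, r, d, o2) transition tuples."""
    obs = [entry[0] for entry in o_queue]  # each entry is a 1-tuple
    return [(obs[i], a, r, d, obs[i + 1])
            for i, (a, r, d) in enumerate(a_r_d_queue)]


if __name__ == "__main__":
    Ln = 3
    o_queue = deque([(np.full(2, i),) for i in range(Ln + 1)], maxlen=Ln + 1)
    a_r_d_queue = deque([(i, 1.0, False) for i in range(Ln)], maxlen=Ln)
    for o, a, r, d, o2 in queues_to_transitions(o_queue, a_r_d_queue):
        print(o[0], a, r, d, "->", o2[0])
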