def prepare_experiment(env, args):
    # Manager to share PER between a learner and explorers
    SyncManager.register('PrioritizedReplayBuffer', PrioritizedReplayBuffer)
    manager = SyncManager()
    manager.start()

    kwargs = get_default_rb_dict(args.replay_buffer_size, env)
    kwargs["check_for_update"] = True
    global_rb = manager.PrioritizedReplayBuffer(**kwargs)

    # Queues to share network parameters between a learner and explorers
    n_queue = 1 if args.n_env > 1 else args.n_explorer
    n_queue += 1  # for evaluation
    queues = [manager.Queue() for _ in range(n_queue)]

    # Event object to share training status. Once the event is set, all explorers stop sampling transitions
    is_training_done = Event()

    # Lock shared across processes
    lock = manager.Lock()

    # Shared memory object to count the number of applied gradients
    trained_steps = Value('i', 0)

    return global_rb, queues, is_training_done, lock, trained_steps
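# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of the `args` object prepare_experiment expects. The field
# names (replay_buffer_size, n_env, n_explorer) are taken from the attribute
# accesses above; the environment id and the numeric values are assumptions.
# Requires the same imports as prepare_experiment (SyncManager, cpprb, etc.).
def _prepare_experiment_sketch():
    from argparse import Namespace
    import gym

    env = gym.make("Pendulum-v1")
    args = Namespace(replay_buffer_size=int(1e6), n_env=1, n_explorer=4)
    global_rb, queues, is_training_done, lock, trained_steps = \
        prepare_experiment(env, args)
    # global_rb is a manager proxy: its method calls are forwarded to the
    # SyncManager process, which is what makes it safe to share with explorers.
    return global_rb, queues, is_training_done, lock, trained_steps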
def __call__(self):
    total_steps = 0
    n_episode = 0

    # TODO: clean up this method
    # Prepare buffers
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        n_episode, total_rewards = self._collect_sample(n_episode, total_steps)
        total_steps += self._policy.horizon
        tf.summary.experimental.set_step(total_steps)
        if len(total_rewards) > 0:
            avg_training_return = sum(total_rewards) / len(total_rewards)
            tf.summary.scalar(name="Common/training_return",
                              data=avg_training_return)

        # Train actor and critic
        for _ in range(self._policy.n_epoch):
            samples = self.replay_buffer.sample(self._policy.horizon)
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(samples["adv"])
            else:
                adv = samples["adv"]
            for idx in range(int(self._policy.horizon / self._policy.batch_size)):
                target = slice(idx * self._policy.batch_size,
                               (idx + 1) * self._policy.batch_size)
                self._policy.train(states=samples["obs"][target],
                                   actions=samples["act"][target],
                                   advantages=adv[target],
                                   logp_olds=samples["logp"][target],
                                   returns=samples["ret"][target])

        # Evaluate the policy
        if total_steps % self._test_interval == 0:
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes".format(
                    total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            self.writer.flush()

        # Save the model periodically
        if total_steps % self._model_save_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
def get_replay_buffer(policy, env, use_prioritized_rb=False,
                      use_nstep_rb=False, n_step=1, size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["Nstep"] = {"size": n_step,
                           "gamma": policy.discount,
                           "rew": "rew",
                           "next": "next_obs"}
        return PrioritizedReplayBuffer(**kwargs)

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["Nstep"] = {"size": n_step,
                           "gamma": policy.discount,
                           "rew": "rew",
                           "next": "next_obs"}
        return ReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
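# --- Usage sketch (illustrative only) ---------------------------------------
# Shows which buffer the on-policy branch above produces. `DummyOnPolicy` is a
# stand-in defined here purely for illustration; real callers pass a tf2rl
# policy object. Off-policy agents instead get a plain, prioritized, or N-step
# buffer depending on the flags.
def _get_replay_buffer_sketch():
    import gym

    class DummyOnPolicy:  # not an OffPolicyAgent subclass -> on-policy branch
        memory_capacity = int(1e6)
        horizon = 2048
        discount = 0.99

    env = gym.make("CartPole-v1")
    rb = get_replay_buffer(DummyOnPolicy(), env)
    # One horizon of data with logp/ret/adv slots and no next_obs/rew,
    # matching what the on-policy trainers below expect.
    assert rb.get_buffer_size() == DummyOnPolicy.horizon
    return rb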
def __call__(self):
    # Prepare buffers
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    episode_steps = 0
    episode_return = 0
    episode_start_time = time.time()
    total_steps = np.array(0, dtype=np.int32)
    n_episode = 0
    obs = self._env.reset()

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        for _ in range(self._policy.horizon):
            act, logp, val = self._policy.get_action_and_val(obs)
            next_obs, reward, done, _ = self._env.step(act)
            if self._show_progress:
                self._env.render()

            episode_steps += 1
            total_steps += 1
            episode_return += reward

            done_flag = done
            if hasattr(self._env, "_max_episode_steps") and \
                    episode_steps == self._env._max_episode_steps:
                done_flag = False
            self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done_flag,
                                  logp=logp, val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                tf.summary.experimental.set_step(total_steps)
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} "
                    "Return: {3: 5.4f} FPS: {4:5.2f}".format(
                        n_episode, int(total_steps), episode_steps,
                        episode_return, fps))
                tf.summary.scalar(name="Common/training_return",
                                  data=episode_return)
                tf.summary.scalar(name="Common/fps", data=fps)
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)

        # Train actor and critic
        if self._policy.normalize_adv:
            samples = self.replay_buffer._encode_sample(np.arange(self._policy.horizon))
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])
        with tf.summary.record_if(total_steps % self._save_summary_interval == 0):
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                else:
                    adv = samples["adv"]
                for idx in range(int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

        # Evaluate the policy
        if total_steps % self._test_interval == 0:
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes".format(
                    total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            self.writer.flush()

        # Save the model periodically
        if total_steps % self._save_model_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
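# --- Standalone sketch of the mini-batch update pattern ---------------------
# Pure-numpy illustration of the shuffle-and-slice scheme used in the training
# loop above: advantages are normalized with statistics computed over the full
# horizon, the horizon is reshuffled once per epoch, and contiguous slices of
# the shuffled data form the mini-batches. Shapes and values are illustrative.
def _minibatch_sketch(horizon=8, batch_size=4, n_epoch=2):
    import numpy as np

    adv_raw = np.random.randn(horizon).astype(np.float32)
    mean_adv, std_adv = adv_raw.mean(), adv_raw.std()
    for _ in range(n_epoch):
        perm = np.random.permutation(horizon)          # fresh shuffle per epoch
        adv = (adv_raw[perm] - mean_adv) / (std_adv + 1e-8)
        for idx in range(horizon // batch_size):
            target = slice(idx * batch_size, (idx + 1) * batch_size)
            minibatch_adv = adv[target]
            # ...the real loop feeds obs/act/logp/ret slices to policy.train here
            assert minibatch_adv.shape == (batch_size,)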
def explorer(global_rb, queue, trained_steps, is_training_done,
             lock, env_fn, policy_fn, set_weights_fn, noise_level,
             n_env=64, n_thread=4, buffer_size=1024,
             episode_max_steps=1000, gpu=0):
    """
    Collect transitions and store them in the prioritized replay buffer.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer shared between multiple explorers and a single learner.
        Because this object is shared across processes, operations on it must be guarded
        with the `lock` object.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the `learner` and `evaluator` to receive the latest network
        weights. It is process safe, so no locking is needed when using it.
    :param trained_steps (multiprocessing.Value):
        Number of gradient steps applied by the learner so far.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object that shares the training status.
    :param lock (multiprocessing.Lock):
        multiprocessing.Lock used to synchronize access with other processes.
    :param env_fn (function):
        Method object to generate an environment.
    :param policy_fn (function):
        Method object to generate an explorer.
    :param set_weights_fn (function):
        Method object to set network weights taken from the queue.
    :param noise_level (float):
        Noise level for exploration. For epsilon-greedy policies such as DQN variants
        this is epsilon; for DDPG variants it is the variance of a Normal distribution.
    :param n_env (int):
        Number of environments to distribute. If larger than 1, `MultiThreadEnv` is used.
    :param n_thread (int):
        Number of threads used in `MultiThreadEnv`.
    :param buffer_size (int):
        Size of the local buffer. Once it is filled with transitions, they are added to `global_rb`.
    :param episode_max_steps (int):
        Maximum number of steps in an episode.
    :param gpu (int):
        GPU id. If set to -1, this process uses only the CPU.
    """
    import_tf()
    logger = logging.getLogger("tf2rl")

    if n_env > 1:
        envs = MultiThreadEnv(env_fn=env_fn, batch_size=n_env,
                              thread_pool=n_thread,
                              max_episode_steps=episode_max_steps)
        env = envs._sample_env
    else:
        env = env_fn()

    policy = policy_fn(env=env, name="Explorer",
                       memory_capacity=global_rb.get_buffer_size(),
                       noise_level=noise_level, gpu=gpu)

    kwargs = get_default_rb_dict(buffer_size, env)
    if n_env > 1:
        kwargs["env_dict"]["priorities"] = {}
    local_rb = ReplayBuffer(**kwargs)
    local_idx = np.arange(buffer_size).astype(np.int64)  # np.int was removed from NumPy

    if n_env == 1:
        s = env.reset()
        episode_steps = 0
        total_reward = 0.
        total_rewards = []

    start = time.time()
    n_sample, n_sample_old = 0, 0
    while not is_training_done.is_set():
        if n_env == 1:
            n_sample += 1
            episode_steps += 1
            a = policy.get_action(s)
            s_, r, done, _ = env.step(a)
            done_flag = done
            if episode_steps == env._max_episode_steps:
                done_flag = False
            total_reward += r
            local_rb.add(obs=s, act=a, rew=r, next_obs=s_, done=done_flag)

            s = s_
            if done or episode_steps == episode_max_steps:
                s = env.reset()
                total_rewards.append(total_reward)
                total_reward = 0
                episode_steps = 0
        else:
            n_sample += n_env
            obses = envs.py_observation()
            actions = policy.get_action(obses, tensor=True)
            next_obses, rewards, dones, _ = envs.step(actions)
            td_errors = policy.compute_td_error(states=obses, actions=actions,
                                                next_states=next_obses,
                                                rewards=rewards, dones=dones)
            local_rb.add(obs=obses, act=actions, next_obs=next_obses,
                         rew=rewards, done=dones,
                         priorities=np.abs(td_errors) + 1e-6)

        # Periodically copy the latest weights from the learner into this explorer
        if not queue.empty():
            set_weights_fn(policy, queue.get())

        # Add collected experiences to the global replay buffer
        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb._encode_sample(local_idx)
            if n_env > 1:
                priorities = np.squeeze(samples["priorities"])
            else:
                td_errors = policy.compute_td_error(
                    states=samples["obs"], actions=samples["act"],
                    next_states=samples["next_obs"],
                    rewards=samples["rew"], dones=samples["done"])
                priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(obs=samples["obs"], act=samples["act"],
                          rew=samples["rew"], next_obs=samples["next_obs"],
                          done=samples["done"], priorities=priorities)
            lock.release()
            local_rb.clear()

            msg = "Grad: {0: 6d}\t".format(trained_steps.value)
            msg += "Samples: {0: 7d}\t".format(n_sample)
            msg += "TDErr: {0:.5f}\t".format(np.average(priorities))
            if n_env == 1:
                ave_rew = (0 if len(total_rewards) == 0
                           else sum(total_rewards) / len(total_rewards))
                msg += "AveEpiRew: {0:.3f}\t".format(ave_rew)
                total_rewards = []
            msg += "FPS: {0:.2f}".format(
                (n_sample - n_sample_old) / (time.time() - start))
            logger.info(msg)

            start = time.time()
            n_sample_old = n_sample
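# --- Wiring sketch (illustrative only) ---------------------------------------
# How explorer processes might be launched against the shared objects returned
# by prepare_experiment. `make_env`, `make_policy`, and `set_weights` are
# hypothetical callables standing in for the env_fn / policy_fn /
# set_weights_fn arguments documented above; the learner and evaluator
# processes (which consume the remaining queue and set is_training_done) are
# omitted here.
def _launch_explorers_sketch(global_rb, queues, trained_steps, is_training_done,
                             lock, make_env, make_policy, set_weights,
                             n_explorer=4):
    from multiprocessing import Process

    procs = []
    for i in range(n_explorer):
        p = Process(target=explorer,
                    args=(global_rb, queues[i], trained_steps,
                          is_training_done, lock,
                          make_env, make_policy, set_weights,
                          0.1),               # noise_level: illustrative value
                    kwargs={"n_env": 1, "gpu": -1})
        p.start()
        procs.append(p)
    return procs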
def __call__(self):
    # Prepare buffers
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    episode_steps = 0
    episode_return = 0
    episode_cost = 0
    episode_start_time = time.time()
    total_steps = np.array(0, dtype=np.int32)
    n_episode = 0
    obs = self._env.reset()

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        for _ in range(self._policy.horizon):
            if self._normalize_obs:
                obs = self._obs_normalizer(obs, update=False)
            act, logp, val = self._policy.get_action_and_val(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act
            next_obs, reward, done, info = self._env.step(env_act)
            try:
                cost = info['cost']
            except (TypeError, KeyError):
                cost = 0
            if self._show_progress:
                self._env.render()

            episode_steps += 1
            total_steps += 1
            episode_return += reward
            episode_cost += cost

            done_flag = done
            if (hasattr(self._env, "_max_episode_steps") and
                    episode_steps == self._env._max_episode_steps):
                done_flag = False
            self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done_flag,
                                  logp=logp, val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                tf.summary.experimental.set_step(total_steps)
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} "
                    "Return: {3: 6.4f} Cost: {4: 5.4f} FPS: {5:5.2f}".format(
                        n_episode, int(total_steps), episode_steps,
                        episode_return, episode_cost, fps))
                tf.summary.scalar(name="Common/training_return",
                                  data=episode_return)
                tf.summary.scalar(name="Common/fps", data=fps)
                self.total_cost += episode_cost
                cost_rate = self.total_cost / total_steps
                wandb.log({'Training_Return': episode_return,
                           'Training_Cost': episode_cost,
                           'Cost_Rate': cost_rate,
                           'FPS': fps}, step=n_episode)
                episode_steps = 0
                episode_return = 0
                episode_cost = 0
                episode_start_time = time.time()

            if total_steps % self._test_interval == 0:
                avg_test_return, avg_test_cost = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 6.4f} "
                    "Average Cost {2: 5.4f} over {3: 2} episodes".format(
                        total_steps, avg_test_return, avg_test_cost,
                        self._test_episodes))
                wandb.log({'Evaluation_Return': avg_test_return,
                           'Evaluation_Cost': avg_test_cost}, step=n_episode)
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._save_model_interval == 0:
                self.checkpoint_manager.save()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)

        # Train actor and critic
        if self._policy.normalize_adv:
            samples = self.replay_buffer.get_all_transitions()
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])
            # Update normalizer
            if self._normalize_obs:
                self._obs_normalizer.experience(samples["obs"])
        with tf.summary.record_if(total_steps % self._save_summary_interval == 0):
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._normalize_obs:
                    samples["obs"] = self._obs_normalizer(samples["obs"],
                                                          update=False)
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                else:
                    adv = samples["adv"]
                for idx in range(int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

    tf.summary.flush()
def __call__(self):
    total_steps = 0
    episode_steps = 0
    episode_return = 0
    episode_start_time = time.time()
    n_episode = 0
    test_step_threshold = self._test_interval

    # TODO: clean up this method
    # Prepare buffers
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    obs = self._env.reset()
    while total_steps < self._max_steps:
        # Collect samples
        for _ in range(self._policy.horizon):
            action, log_pi, val = self._policy.get_action_and_val(obs)
            next_obs, reward, done, _ = self._env.step(action)
            if self._show_progress:
                self._env.render()

            episode_steps += 1
            episode_return += reward
            total_steps += 1

            done_flag = done
            if hasattr(self._env, "_max_episode_steps") and \
                    episode_steps == self._env._max_episode_steps:
                done_flag = False
            self.local_buffer.add(obs=obs, act=action, next_obs=next_obs,
                                  rew=reward, done=done_flag,
                                  logp=log_pi, val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} "
                    "Return: {3: 5.4f} FPS: {4:5.2f}".format(
                        n_episode, int(total_steps), episode_steps,
                        episode_return, fps))
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)
        samples = self.replay_buffer.sample(self._policy.horizon)

        # Normalize advantages
        if self._policy.normalize_adv:
            adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(samples["adv"])
        else:
            adv = samples["adv"]

        # Train actor
        for _ in range(1):
            self._policy.train_actor(samples["obs"], samples["act"],
                                     adv, samples["logp"])
        # Train critic
        for _ in range(5):
            self._policy.train_critic(samples["obs"], samples["ret"])

        if total_steps > test_step_threshold:
            test_step_threshold += self._test_interval
            avg_test_return = self.evaluate_policy(total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes".format(
                    total_steps, avg_test_return, self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            tf.summary.scalar(name="Common/fps", data=fps)
            self.writer.flush()

        if total_steps % self._model_save_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()