def collect_transitions_sim_env(self):
    """Generate transitions using the learned dynamics model."""
    self.replay_buffer.clear()
    n_episodes = 0
    ave_episode_return = 0
    while self.replay_buffer.get_stored_size() < self._policy.horizon:
        obs = self._env.reset()
        episode_return = 0.
        for _ in range(self._episode_max_steps):
            act, logp, val = self._policy.get_action_and_val(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act
            if self._debug:
                next_obs, rew, _, _ = self._env.step(env_act)
            else:
                next_obs = self.predict_next_state(obs, env_act)
                rew = self._reward_fn(obs, act)[0]
            self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=rew, done=False, logp=logp, val=val)
            obs = next_obs
            episode_return += rew
        self.finish_horizon(last_val=val)
        ave_episode_return += episode_return
        n_episodes += 1
    return ave_episode_return / n_episodes
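# `predict_next_state` and `_reward_fn` are assumed to be supplied by the
# model-based trainer; a minimal sketch of the interface relied on above,
# assuming an ensemble of dynamics models that each expose a `predict` method
# (the attribute names and `predict` call are assumptions, not the trainer's
# actual implementation):
def predict_next_state(self, obs, act, idx=None):
    """Predict s' from (s, a) with one ensemble member (random if idx is None)."""
    if idx is None:
        idx = np.random.randint(self._n_dynamics_model)
    model_input = np.concatenate([obs, act], axis=-1)
    return self._dynamics_models[idx].predict(model_input)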
def _evaluate_model(self):
    ret_real_env, ret_sim_env = 0., 0.
    n_episodes = 10
    for _ in range(n_episodes):
        real_obs = self._env.reset()
        sim_obs = real_obs.copy()
        for _ in range(self._episode_max_steps):
            act, _ = self._policy.get_action(real_obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act
            next_real_obs, rew, _, _ = self._env.step(env_act)
            ret_real_env += rew
            real_obs = next_real_obs
            next_sim_obs = self.predict_next_state(sim_obs, env_act)
            # Reward for the simulated rollout is computed on the model's own
            # state, mirroring collect_transitions_sim_env
            ret_sim_env += self._reward_fn(sim_obs, act)[0]
            sim_obs = next_sim_obs
    ret_real_env /= n_episodes
    ret_sim_env /= n_episodes
    return ret_real_env, ret_sim_env
def evaluate_policy(self, total_steps):
    """Evaluate policy

    Args:
        total_steps (int): Current total steps of training
    """
    avg_test_return = 0.
    avg_test_steps = 0
    if self._save_test_path:
        replay_buffer = get_replay_buffer(self._policy, self._test_env,
                                          size=self._episode_max_steps)
    for i in range(self._test_episodes):
        episode_return = 0.
        frames = []
        obs = self._test_env.reset()
        avg_test_steps += 1
        for _ in range(self._episode_max_steps):
            if self._normalize_obs:
                obs = self._obs_normalizer(obs, update=False)
            act, _ = self._policy.get_action(obs, test=True)
            act = (act if is_discrete(self._env.action_space) else
                   np.clip(act, self._env.action_space.low,
                           self._env.action_space.high))
            next_obs, reward, done, _ = self._test_env.step(act)
            avg_test_steps += 1
            if self._save_test_path:
                replay_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done)
            if self._save_test_movie:
                frames.append(self._test_env.render(mode='rgb_array'))
            elif self._show_test_progress:
                self._test_env.render()
            episode_return += reward
            obs = next_obs
            if done:
                break
        prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
            total_steps, i, episode_return)
        if self._save_test_path:
            save_path(replay_buffer.sample(self._episode_max_steps),
                      os.path.join(self._output_dir, prefix + ".pkl"))
            replay_buffer.clear()
        if self._save_test_movie:
            frames_to_gif(frames, prefix, self._output_dir)
        avg_test_return += episode_return
    if self._show_test_images:
        images = tf.cast(
            tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
            tf.uint8)
        tf.summary.image('train/input_img', images)
    return (avg_test_return / self._test_episodes,
            avg_test_steps / self._test_episodes)
def __call__(self):
    total_steps = 0
    n_episode = 0

    # TODO: clean codes
    # Prepare buffer
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        n_episode, total_rewards = self._collect_sample(n_episode,
                                                        total_steps)
        total_steps += self._policy.horizon
        tf.summary.experimental.set_step(total_steps)
        if len(total_rewards) > 0:
            avg_training_return = sum(total_rewards) / len(total_rewards)
            tf.summary.scalar(name="Common/training_return",
                              data=avg_training_return)

        # Train actor critic
        for _ in range(self._policy.n_epoch):
            samples = self.replay_buffer.sample(self._policy.horizon)
            if self._policy.normalize_adv:
                adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                    samples["adv"])
            else:
                adv = samples["adv"]
            for idx in range(
                    int(self._policy.horizon / self._policy.batch_size)):
                target = slice(idx * self._policy.batch_size,
                               (idx + 1) * self._policy.batch_size)
                self._policy.train(states=samples["obs"][target],
                                   actions=samples["act"][target],
                                   advantages=adv[target],
                                   logp_olds=samples["logp"][target],
                                   returns=samples["ret"][target])

        if total_steps % self._test_interval == 0:
            avg_test_return, avg_test_steps = self.evaluate_policy(
                total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} "
                "over {2: 2} episodes".format(total_steps, avg_test_return,
                                              self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            self.writer.flush()

        if total_steps % self._model_save_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
def get_replay_buffer(policy, env, use_prioritized_rb=False,
                      use_nstep_rb=False, n_step=1, size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        # TODO: Remove done. Currently cannot remove because of cpprb implementation
        # kwargs["env_dict"].pop("done")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["Nstep"] = {"size": n_step,
                           "gamma": policy.discount,
                           "rew": "rew",
                           "next": "next_obs"}
        return PrioritizedReplayBuffer(**kwargs)

    if len(obs_shape) == 3:
        kwargs["env_dict"]["obs"]["dtype"] = np.ubyte
        kwargs["env_dict"]["next_obs"]["dtype"] = np.ubyte

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["Nstep"] = {"size": n_step,
                           "gamma": policy.discount,
                           "rew": "rew",
                           "next": "next_obs"}

    return ReplayBuffer(**kwargs)
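# Usage sketch for get_replay_buffer (assumes `policy` and `env` are built as
# in the PPO examples below; this helper is illustrative, not part of the
# library): on-policy agents receive a horizon-sized buffer with logp/ret/adv
# fields, off-policy agents a memory_capacity-sized buffer keeping next_obs/rew.
def _example_buffers(policy, env):
    train_buf = get_replay_buffer(policy, env)  # capacity chosen by policy type
    test_buf = get_replay_buffer(policy, env,
                                 size=200)      # explicit capacity override
    return train_buf, test_buf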
def _visualize_current_performance(self):
    obs = self._env.reset()
    for _ in range(self._episode_max_steps):
        act, _ = self._policy.get_action(obs)
        if not is_discrete(self._env.action_space):
            env_act = np.clip(act, self._env.action_space.low,
                              self._env.action_space.high)
        else:
            env_act = act
        next_obs = self.predict_next_state(obs, env_act)
        # Overwrite the env state so the renderer shows the model's prediction
        # (Pendulum-style obs: [cos(theta), sin(theta), theta_dot])
        self._env.state = np.array(
            [np.arctan2(next_obs[1], next_obs[0]), next_obs[2]],
            dtype=np.float32)
        # print(obs, act, next_obs, self._env.state)
        self._env.render()
        obs = next_obs
def collect_transitions_real_env(self):
    total_steps = 0
    episode_steps = 0
    obs = self._env.reset()
    while total_steps < self._n_collect_steps:
        episode_steps += 1
        total_steps += 1
        act, _ = self._policy.get_action(obs)
        if not is_discrete(self._env.action_space):
            env_act = np.clip(act, self._env.action_space.low,
                              self._env.action_space.high)
        else:
            env_act = act
        next_obs, _, done, _ = self._env.step(env_act)
        self.dynamics_buffer.add(obs=obs, act=env_act, next_obs=next_obs)
        obs = next_obs
        if done or episode_steps == self._episode_max_steps:
            episode_steps = 0
            obs = self._env.reset()
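# `dynamics_buffer` is assumed to be a cpprb ReplayBuffer holding the
# (obs, act, next_obs) triples the dynamics model trains on; one plausible
# construction, following the get_default_rb_dict pattern used elsewhere in
# this section (the dropped fields and default size are assumptions):
def _make_dynamics_buffer(self, size=100000):
    kwargs = get_default_rb_dict(size=size, env=self._env)
    kwargs["env_dict"].pop("rew")   # rewards are recomputed via self._reward_fn
    kwargs["env_dict"].pop("done")  # transitions are consumed independently
    return ReplayBuffer(**kwargs)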
def _evaluate_current_return(self, init_states):
    n_episodes = self._n_dynamics_model * self._n_eval_episodes_per_model
    assert init_states.shape[0] == n_episodes
    obses = init_states.copy()
    next_obses = np.zeros_like(obses)
    returns = np.zeros(shape=(n_episodes,), dtype=np.float32)
    for _ in range(self._episode_max_steps):
        acts, _ = self._policy.get_action(obses)
        for i in range(n_episodes):
            model_idx = i // self._n_eval_episodes_per_model
            if not is_discrete(self._env.action_space):
                env_act = np.clip(acts[i], self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = acts[i]
            next_obses[i] = self.predict_next_state(obses[i], env_act,
                                                    idx=model_idx)
        returns += self._reward_fn(obses, acts)
        # Copy to avoid aliasing: otherwise `obses` and `next_obses` reference
        # the same array on later iterations, and the reward would be computed
        # on already-overwritten states.
        obses = next_obses.copy()
    return returns
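# Call-site sketch: draw start states from the real-env buffer and average the
# model-based returns per ensemble member (the `sample` call and the reshape
# are assumptions about how this helper is used, not the trainer's actual code):
def _example_model_returns(self):
    n_eps = self._n_dynamics_model * self._n_eval_episodes_per_model
    init_states = self.dynamics_buffer.sample(n_eps)["obs"]
    returns = self._evaluate_current_return(init_states)
    # one mean return per dynamics model
    return returns.reshape(self._n_dynamics_model, -1).mean(axis=1)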
def test_is_discrete(self):
    discrete_space = gym.make('CartPole-v0').action_space
    continuous_space = gym.make('Pendulum-v0').action_space
    self.assertTrue(is_discrete(discrete_space))
    self.assertFalse(is_discrete(continuous_space))
# parser.set_defaults(horizon=1024)
# parser.set_defaults(batch_size=512)
parser.set_defaults(gpu=-1)
parser.set_defaults(max_steps=100000000)
parser.set_defaults(n_warmup=0)
# parser.set_defaults(enable_gae=True)
args = parser.parse_args()

env = ArmEnvironment(static_goal=True, slow_step=slow_step)
test_env = ArmEnvironment(static_goal=True, slow_step=slow_step)
policy = PPO(
    state_shape=env.observation_space.shape,
    action_dim=get_act_dim(env.action_space),
    is_discrete=is_discrete(env.action_space),
    max_action=None if is_discrete(env.action_space)
    else env.action_space.high[0],
    batch_size=args.batch_size,
    actor_units=(64, 64),
    critic_units=(64, 64),
    n_epoch=10,
    lr_actor=3e-4,
    lr_critic=3e-4,
    hidden_activation_actor="tanh",
    hidden_activation_critic="tanh",
    discount=0.99,
    lam=0.95,
    entropy_coef=0.001,
    horizon=args.horizon,
    normalize_adv=args.normalize_adv,
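# The PPO(...) call above is cut off in this snippet; following the pattern in
# run() below, it would typically close with enable_gae/gpu and then be handed
# to the on-policy trainer (sketch, not the script's verbatim tail):
#
#     trainer = OnPolicyTrainer(policy, env, args, test_env=test_env)
#     trainer()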
def __call__(self):
    # Prepare buffer
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    episode_steps = 0
    episode_return = 0
    episode_start_time = time.time()
    total_steps = np.array(0, dtype=np.int32)
    n_episode = 0
    obs = self._env.reset()

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        for _ in range(self._policy.horizon):
            act, logp, val = self._policy.get_action_and_val(obs)
            next_obs, reward, done, _ = self._env.step(act)
            if self._show_progress:
                self._env.render()
            episode_steps += 1
            total_steps += 1
            episode_return += reward

            done_flag = done
            if (hasattr(self._env, "_max_episode_steps") and
                    episode_steps == self._env._max_episode_steps):
                done_flag = False
            self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done_flag, logp=logp,
                                  val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                tf.summary.experimental.set_step(total_steps)
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} "
                    "Return: {3: 5.4f} FPS: {4:5.2f}".format(
                        n_episode, int(total_steps), episode_steps,
                        episode_return, fps))
                tf.summary.scalar(name="Common/training_return",
                                  data=episode_return)
                tf.summary.scalar(name="Common/fps", data=fps)
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)

        # Train actor critic
        if self._policy.normalize_adv:
            samples = self.replay_buffer._encode_sample(
                np.arange(self._policy.horizon))
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])
        with tf.summary.record_if(total_steps %
                                  self._save_summary_interval == 0):
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                else:
                    adv = samples["adv"]
                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

        if total_steps % self._test_interval == 0:
            avg_test_return, avg_test_steps = self.evaluate_policy(
                total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} "
                "over {2: 2} episodes".format(total_steps, avg_test_return,
                                              self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            self.writer.flush()

        if total_steps % self._save_model_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()
def __call__(self):
    # Prepare buffer
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._policy.horizon,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    episode_steps = 0
    episode_return = 0
    episode_cost = 0
    episode_start_time = time.time()
    total_steps = np.array(0, dtype=np.int32)
    n_episode = 0
    obs = self._env.reset()

    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        for _ in range(self._policy.horizon):
            if self._normalize_obs:
                obs = self._obs_normalizer(obs, update=False)
            act, logp, val = self._policy.get_action_and_val(obs)
            if not is_discrete(self._env.action_space):
                env_act = np.clip(act, self._env.action_space.low,
                                  self._env.action_space.high)
            else:
                env_act = act
            next_obs, reward, done, info = self._env.step(env_act)
            # print('[DEBUG] COST:', info['cost'])
            try:
                cost = info['cost']
            except (TypeError, KeyError):
                cost = 0
            if self._show_progress:
                self._env.render()
            episode_steps += 1
            total_steps += 1
            episode_return += reward
            episode_cost += cost

            done_flag = done
            if (hasattr(self._env, "_max_episode_steps") and
                    episode_steps == self._env._max_episode_steps):
                done_flag = False
            self.local_buffer.add(obs=obs, act=act, next_obs=next_obs,
                                  rew=reward, done=done_flag, logp=logp,
                                  val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                tf.summary.experimental.set_step(total_steps)
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} "
                    "Return: {3: 6.4f} Cost: {4: 5.4f} FPS: {5:5.2f}".format(
                        n_episode, int(total_steps), episode_steps,
                        episode_return, episode_cost, fps))
                tf.summary.scalar(name="Common/training_return",
                                  data=episode_return)
                tf.summary.scalar(name="Common/fps", data=fps)
                self.total_cost += episode_cost
                cost_rate = self.total_cost / total_steps
                wandb.log({'Training_Return': episode_return,
                           'Training_Cost': episode_cost,
                           'Cost_Rate': cost_rate,
                           'FPS': fps}, step=n_episode)
                episode_steps = 0
                episode_return = 0
                episode_cost = 0
                episode_start_time = time.time()

            if total_steps % self._test_interval == 0:
                avg_test_return, avg_test_cost = self.evaluate_policy(
                    total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 6.4f} "
                    "Average Cost {2: 5.4f} over {3: 2} episodes".format(
                        total_steps, avg_test_return, avg_test_cost,
                        self._test_episodes))
                wandb.log({'Evaluation_Return': avg_test_return,
                           'Evaluation_Cost': avg_test_cost},
                          step=n_episode)
                # wandb.log({'Evaluation_Step': total_steps})
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._save_model_interval == 0:
                self.checkpoint_manager.save()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)

        # Train actor critic
        if self._policy.normalize_adv:
            samples = self.replay_buffer.get_all_transitions()
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])
            # Update normalizer
            if self._normalize_obs:
                self._obs_normalizer.experience(samples["obs"])
        with tf.summary.record_if(total_steps %
                                  self._save_summary_interval == 0):
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._normalize_obs:
                    samples["obs"] = self._obs_normalizer(samples["obs"],
                                                          update=False)
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                else:
                    adv = samples["adv"]
                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

    tf.summary.flush()
def run(parser):
    args = parser.parse_args()

    if args.gpu < 0:
        tf.config.experimental.set_visible_devices([], 'GPU')
    else:
        physical_devices = tf.config.list_physical_devices('GPU')
        tf.config.set_visible_devices(physical_devices[args.gpu], 'GPU')
        tf.config.experimental.set_virtual_device_configuration(
            physical_devices[args.gpu],
            [tf.config.experimental.VirtualDeviceConfiguration(
                memory_limit=1024 * 3)])

    if args.env == 200:
        envname = 'ScratchItchPR2X'
    elif args.env == 201:
        envname = 'DressingPR2X'
    elif args.env == 202:
        envname = 'BedBathingPR2X'
    else:
        raise ValueError(f'Unknown env id: {args.env}')

    logdir = 'MFBox_Assistive'
    if args.SAC:
        wandb.init(config=vars(args), project="Assistive Gym",
                   name=f'SAC on {envname}')
    elif args.PPO:
        wandb.init(config=vars(args), project="Assistive Gym",
                   name=f'PPO on {envname}')
    elif args.TD3:
        wandb.init(config=vars(args), project="Assistive Gym",
                   name=f'TD3 on {envname}')
    elif args.DEBUG:
        logdir = 'DEBUG_Assistive'
        wandb.init(config=vars(args), project="Assistive Gym",
                   name=f'DEBUG on {envname}')
    else:
        raise ValueError('PLEASE INDICATE THE ALGORITHM !!')

    if not os.path.exists(logdir):
        os.makedirs(logdir)
    parser.set_defaults(logdir=logdir)
    args = parser.parse_args()

    env = gym.make(f'{envname}-v0')
    # test_env = Monitor(env, logdir, force=True)
    test_env = gym.make(f'{envname}-v0')

    if args.SAC:
        policy = SAC(state_shape=env.observation_space.shape,
                     action_dim=env.action_space.high.size,
                     gpu=args.gpu,
                     memory_capacity=args.memory_capacity,
                     max_action=env.action_space.high[0],
                     batch_size=args.batch_size,
                     n_warmup=args.n_warmup,
                     alpha=args.alpha,
                     auto_alpha=args.auto_alpha)
        trainer = Trainer(policy, env, args, test_env=test_env)
    elif args.PPO:
        policy = PPO(state_shape=env.observation_space.shape,
                     action_dim=get_act_dim(env.action_space),
                     is_discrete=is_discrete(env.action_space),
                     max_action=None if is_discrete(env.action_space)
                     else env.action_space.high[0],
                     batch_size=args.batch_size,
                     actor_units=(64, 64),
                     critic_units=(64, 64),
                     n_epoch=10,
                     lr_actor=3e-4,
                     lr_critic=3e-4,
                     hidden_activation_actor="tanh",
                     hidden_activation_critic="tanh",
                     discount=0.99,
                     lam=0.95,
                     entropy_coef=0.,
                     horizon=args.horizon,
                     normalize_adv=args.normalize_adv,
                     enable_gae=args.enable_gae,
                     gpu=args.gpu)
        trainer = OnPolicyTrainer(policy, env, args, test_env=test_env)
    elif args.TD3:
        policy = TD3(state_shape=env.observation_space.shape,
                     action_dim=env.action_space.high.size,
                     gpu=args.gpu,
                     memory_capacity=args.memory_capacity,
                     max_action=env.action_space.high[0],
                     batch_size=args.batch_size,
                     n_warmup=args.n_warmup)
        trainer = Trainer(policy, env, args, test_env=test_env)
    elif args.DEBUG:
        policy = SAC(state_shape=env.observation_space.shape,
                     action_dim=env.action_space.high.size,
                     gpu=args.gpu,
                     memory_capacity=args.memory_capacity,
                     max_action=env.action_space.high[0],
                     batch_size=args.batch_size,
                     n_warmup=100,
                     alpha=args.alpha,
                     auto_alpha=args.auto_alpha)
        parser.set_defaults(test_interval=200)
        args = parser.parse_args()
        trainer = Trainer(policy, env, args, test_env=None)

    trainer()
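# Entry-point sketch: tf2rl trainers expose a `get_argument` classmethod that
# builds the shared CLI flags (--gpu, --horizon, etc.); the extra env/algorithm
# flags added here are assumptions about this script's interface, not its
# verbatim code:
if __name__ == '__main__':
    parser = OnPolicyTrainer.get_argument(Trainer.get_argument())
    parser.add_argument('--env', type=int, default=200)
    for alg in ('SAC', 'PPO', 'TD3', 'DEBUG'):
        parser.add_argument(f'--{alg}', action='store_true')
    run(parser)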
def __call__(self):
    total_steps = 0
    episode_steps = 0
    episode_return = 0
    episode_start_time = time.time()
    n_episode = 0
    test_step_threshold = self._test_interval

    # TODO: clean codes
    self.replay_buffer = get_replay_buffer(self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                           env=self._env)
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)

    obs = self._env.reset()
    while total_steps < self._max_steps:
        for _ in range(self._policy.horizon):
            action, log_pi, val = self._policy.get_action_and_val(obs)
            next_obs, reward, done, _ = self._env.step(action)
            if self._show_progress:
                self._env.render()
            episode_steps += 1
            episode_return += reward
            total_steps += 1

            done_flag = done
            if (hasattr(self._env, "_max_episode_steps") and
                    episode_steps == self._env._max_episode_steps):
                done_flag = False
            self.local_buffer.add(obs=obs, act=action, next_obs=next_obs,
                                  rew=reward, done=done_flag, logp=log_pi,
                                  val=val)
            obs = next_obs

            if done or episode_steps == self._episode_max_steps:
                self.finish_horizon()
                obs = self._env.reset()
                n_episode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} "
                    "Return: {3: 5.4f} FPS: {4:5.2f}".format(
                        n_episode, int(total_steps), episode_steps,
                        episode_return, fps))
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()

        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)
        samples = self.replay_buffer.sample(self._policy.horizon)

        # Normalize advantages
        if self._policy.normalize_adv:
            adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                samples["adv"])
        else:
            adv = samples["adv"]

        # Train actor
        for _ in range(1):
            self._policy.train_actor(samples["obs"], samples["act"], adv,
                                     samples["logp"])
        # Train critic
        for _ in range(5):
            self._policy.train_critic(samples["obs"], samples["ret"])

        if total_steps > test_step_threshold:
            test_step_threshold += self._test_interval
            avg_test_return, avg_test_steps = self.evaluate_policy(
                total_steps)
            self.logger.info(
                "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} "
                "over {2: 2} episodes".format(total_steps, avg_test_return,
                                              self._test_episodes))
            tf.summary.scalar(name="Common/average_test_return",
                              data=avg_test_return)
            tf.summary.scalar(name="Common/fps", data=fps)
            self.writer.flush()

        if total_steps % self._model_save_interval == 0:
            self.checkpoint_manager.save()

    tf.summary.flush()