def test_vec_env(tmp_path, make_env):
    """Check VecNormalize clipping and its save/load round trip.

    A single-env ``DummyVecEnv`` built from ``make_env`` is wrapped in a
    ``VecNormalize`` with observation and reward normalization enabled.
    Sampled actions are stepped until the first ``done`` flag is set, and
    every returned observation and reward must stay within the clip bounds.
    The wrapper is then saved under ``tmp_path``, reloaded on top of the
    same base env, and compared with ``check_vec_norm_equal``.
    """
    obs_limit = 0.5
    reward_limit = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(
        orig_venv,
        norm_obs=True,
        norm_reward=True,
        clip_obs=obs_limit,
        clip_reward=reward_limit,
    )

    norm_venv.reset()
    episode_over = False
    while not episode_over:
        obs, rew, done, _ = norm_venv.step([norm_venv.action_space.sample()])
        if isinstance(obs, dict):
            # Dict observations: every entry has to respect the bound.
            for value in obs.values():
                assert np.max(np.abs(value)) <= obs_limit
        else:
            assert np.max(np.abs(obs)) <= obs_limit
        assert np.max(np.abs(rew)) <= reward_limit
        episode_over = done[0]

    stats_file = tmp_path / "vec_normalize"
    norm_venv.save(stats_file)
    restored = VecNormalize.load(stats_file, venv=orig_venv)
    check_vec_norm_equal(norm_venv, restored)
def test_replay_buffer_normalization(replay_buffer_cls):
    """Check that samples drawn from a replay buffer are normalized.

    The environment class is picked to match ``replay_buffer_cls``, wrapped
    in a vectorized env and in ``VecNormalize``. Transitions are stored using
    the values returned by ``get_original_obs`` / ``get_original_reward``;
    sampling with the env passed in must then yield observations and rewards
    whose mean over the batch is within 1 of zero.
    """
    env_cls_by_buffer = {ReplayBuffer: DummyEnv, DictReplayBuffer: DummyDictEnv}
    env = VecNormalize(make_vec_env(env_cls_by_buffer[replay_buffer_cls]))

    buffer = replay_buffer_cls(100, env.observation_space, env.action_space)

    # Interact with the environment and store the transitions.
    env.reset()
    prev_obs = env.get_original_obs()
    for _ in range(100):
        action = env.action_space.sample()
        _, _, done, info = env.step(action)
        cur_obs = env.get_original_obs()
        buffer.add(prev_obs, cur_obs, action, env.get_original_reward(), done, info)
        prev_obs = cur_obs

    sample = buffer.sample(50, env)

    # Observation normalization, for both sides of the transition.
    for observations in (sample.observations, sample.next_observations):
        if isinstance(sample, DictReplayBufferSamples):
            for value in observations.values():
                assert th.allclose(value.mean(0), th.zeros(1), atol=1)
        elif isinstance(sample, ReplayBufferSamples):
            assert th.allclose(observations.mean(0), th.zeros(1), atol=1)

    # Reward normalization.
    assert np.allclose(sample.rewards.mean(0), np.zeros(1), atol=1)
def main():
    """Train a PPO policy on the quadruped environment, or replay a saved one.

    The mode is read from the module-level ``TEST_OR_TRAIN`` constant and must
    be ``"train"`` or ``"test"``.

    * ``"train"``: builds ``NUM_CPUS`` sub-process environments wrapped in
      ``VecNormalize``, trains PPO, then saves the policy **and** the
      ``VecNormalize`` running statistics next to each other under
      ``data/policies``. The statistics file is the policy path plus
      ``_vecnormalize.pkl``.
    * ``"test"``: loads a hard-coded policy file and runs it forever with
      deterministic actions, rendering every step. When a statistics file
      exists next to the policy, the environment is wrapped in a frozen
      ``VecNormalize`` so observations are normalized as during training;
      otherwise the raw environment is used, as before.

    Raises:
        AssertionError: if ``TEST_OR_TRAIN`` is neither ``"train"`` nor
            ``"test"``.
    """
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"], (
        f"TEST_OR_TRAIN must be 'train' or 'test', got {test_or_train!r}")

    env_params = {
        'time_step': TIME_STEP,
        'robot_class': QuadrupedRobot,
        'on_rack': False,
        'enable_self_collision': True,
        'motor_control_mode': MotorControlMode.HYBRID_COMPUTED_POS_TROT,
        'train_or_test': test_or_train,
    }

    policy_save_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data/policies')
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    # Suffix appended to a policy path to locate its VecNormalize statistics.
    vecnorm_suffix = '_vecnormalize.pkl'

    if test_or_train == "train":
        policy_kwargs = {"net_arch": [{"pi": [512, 256], "vf": [512, 256]}]}

        env = make_vec_env(env_change_input,
                           n_envs=NUM_CPUS,
                           seed=0,
                           env_kwargs=env_params,
                           vec_env_cls=SubprocVecEnv)
        env = VecNormalize(env)

        os.makedirs(policy_save_dir, exist_ok=True)

        model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
        # The policy was trained on normalized observations; without the
        # running statistics it cannot be evaluated consistently later.
        env.save(policy_save_path + vecnorm_suffix)
        env.close()
    else:
        model_load_path = os.path.join(policy_save_dir,
                                       'ppo_3_17-03-2021_15-39-42')
        stats_load_path = model_load_path + vecnorm_suffix

        vectorized = os.path.exists(stats_load_path)
        if vectorized:
            env = make_vec_env(env_change_input, n_envs=1,
                               env_kwargs=env_params)
            env = VecNormalize.load(stats_load_path, env)
            # Freeze the statistics and report raw rewards during evaluation.
            env.training = False
            env.norm_reward = False
        else:
            # No statistics saved for this policy: fall back to the raw env.
            env = env_change_input(**env_params)

        model = PPO.load(model_load_path)

        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            # Vectorized envs reset themselves when an episode ends, so only
            # the raw environment needs an explicit reset.
            if not vectorized and done:
                obs = env.reset()
def test_vec_env(tmpdir):
    """Check VecNormalize clipping and its save/load round trip.

    Wraps a single-env ``DummyVecEnv`` in ``VecNormalize``, steps sampled
    actions until the first ``done`` flag is set while asserting that
    observations and rewards stay within the clip bounds, then saves the
    wrapper under ``tmpdir``, reloads it on the same base env and compares
    both with ``check_vec_norm_equal``.
    """
    obs_limit = 0.5
    reward_limit = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(
        orig_venv,
        norm_obs=True,
        norm_reward=True,
        clip_obs=obs_limit,
        clip_reward=reward_limit,
    )

    norm_venv.reset()
    episode_over = False
    while not episode_over:
        obs, rew, done, _ = norm_venv.step([norm_venv.action_space.sample()])
        assert np.max(np.abs(obs)) <= obs_limit
        assert np.max(np.abs(rew)) <= reward_limit
        episode_over = done[0]

    stats_file = str(tmpdir.join("vec_normalize"))
    norm_venv.save(stats_file)
    restored = VecNormalize.load(stats_file, venv=orig_venv)
    check_vec_norm_equal(norm_venv, restored)
# TRY NOT TO MODIFY: prepare the execution of the game. for step in range(0, args.num_steps): global_step += 1 * args.num_envs obs[step] = next_obs dones[step] = next_done # ALGO LOGIC: put action logic here with torch.no_grad(): action, logproba, _, vs = agent.get_action_and_value(next_obs) values[step] = vs.flatten() actions[step] = action logprobs[step] = logproba # TRY NOT TO MODIFY: execute the game and log data. next_obs, rs, ds, infos = envs.step(action) rewards[step], next_done = rs.view(-1), torch.Tensor(ds).to(device) for info in infos: if 'episode' in info.keys(): print( f"global_step={global_step}, episode_reward={info['episode']['r']}" ) writer.add_scalar("charts/episodic_return", info['episode']['r'], global_step) break # bootstrap reward if not done. reached the batch limit with torch.no_grad(): last_value = agent.get_value(next_obs.to(device)).reshape(1, -1) if args.gae:
class MultiModuleExp:
    """
    A whole experiment.
    It should contain: (1) environments, (2) policies, (3) training, (4) testing.
    The results should be able to compare with other experiments.

    The Multi-RNN experiment.
    """

    def __init__(
        self,
        args,
        env_id="HopperBulletEnv-v0",
        features_extractor_class=MultiExtractor,
        features_extractor_kwargs=None,
    ) -> None:
        """
        Init with parameters to control the training process

        :param args: run configuration; this class reads ``cuda``, ``seed``,
            ``render``, ``num_envs``, ``vec_normalize``, ``rollout_n_steps``,
            ``rnn_move_window_step``, ``rnn_sequence_length``, ``sde``,
            ``n_epochs``, ``eval_freq`` and ``total_timesteps`` from it.
        :param env_id: id of the environment to build.
        :param features_extractor_class: features extractor class handed to
            the policy.
        :param features_extractor_kwargs: extra keyword arguments for the
            features extractor; ``None`` means no extra arguments. The dict is
            copied, so the caller's object is never modified.
        """
        print("Starting MultiModuleExp")
        self.args = args
        self.env_id = env_id
        self.use_cuda = torch.cuda.is_available() and args.cuda
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Make Environments
        print("Making train environments...")
        venv = DummyVecEnv([
            make_env(env_id=env_id, rank=i, seed=args.seed, render=args.render)
            for i in range(args.num_envs)
        ])
        self.eval_env = DummyVecEnv(
            [make_env(env_id=env_id, rank=99, seed=args.seed, render=False)])
        if args.vec_normalize:
            venv = VecNormalize(venv)
            self.eval_env = VecNormalize(self.eval_env, norm_reward=False)

        # Work on a private copy: a shared default dict (or the caller's dict)
        # must not pick up this instance's ``num_envs``.
        features_extractor_kwargs = dict(features_extractor_kwargs or {})
        features_extractor_kwargs["num_envs"] = args.num_envs

        policy_kwargs = {
            "features_extractor_class": features_extractor_class,
            "features_extractor_kwargs": features_extractor_kwargs,
            # Note: net_arch must be specified, because sb3 won't set the default network architecture if we change the features_extractor.
            # pi: Actor (policy-function); vf: Critic (value-function)
            "net_arch": [dict(pi=[64, 64], vf=[64, 64])],
        }

        self.model = CustomizedPPO(
            CustomizedPolicy,
            venv,
            n_steps=args.rollout_n_steps,
            tensorboard_log="tb",
            policy_kwargs=policy_kwargs,
            device=self.device,
            verbose=1,
            rnn_move_window_step=args.rnn_move_window_step,
            rnn_sequence_length=args.rnn_sequence_length,
            use_sde=args.sde,
            n_epochs=args.n_epochs)

    def train(self) -> None:
        """
        Start training
        """
        print(f"train using {self.model.device.type}")

        callback = [
            DebugCallback("Customized"),
            AdjustCameraCallback(),
            WandbCallback(self.args),
            CustomizedEvalCallback(
                self.eval_env,
                best_model_save_path=None,
                log_path=None,
                eval_freq=self.args.eval_freq,
                n_eval_episodes=3,
                verbose=0,
            )
        ]

        self.model.learn(self.args.total_timesteps, callback=callback)

    def test(self, model_filename, vnorm_filename) -> None:
        """
        Replay a saved model for 1000 steps on the evaluation environment

        :param model_filename: path of the saved model to load.
        :param vnorm_filename: path of the saved ``VecNormalize`` statistics.
        """
        # NOTE(review): assumes CustomizedPPO keeps the stable-baselines3
        # classmethod ``load`` that returns a new model; the result is kept
        # only when one is returned, so an in-place ``load`` still works.
        loaded = self.model.load(model_filename)
        if loaded is not None:
            self.model = loaded

        # ``eval_env`` is already a VecNormalize when ``args.vec_normalize``
        # is set; unwrap it so the loaded statistics are not stacked on top
        # of a second normalization layer.
        base_env = self.eval_env
        if isinstance(base_env, VecNormalize):
            base_env = base_env.venv
        self.eval_env = VecNormalize.load(vnorm_filename, base_env)
        # Evaluation must not update the running statistics.
        self.eval_env.training = False
        self.eval_env.norm_reward = False

        # Rendering is requested before the first reset, as in the original
        # flow (presumably required by the simulator's GUI; verify).
        self.eval_env.render()
        obs = self.eval_env.reset()
        with self.model.policy.features_extractor.start_testing():
            for _ in range(1000):
                # ``predict`` returns (action, state); only the action is
                # stepped, and the new observation feeds the next prediction.
                action, _ = self.model.predict(obs, deterministic=True)
                obs, _, _, _ = self.eval_env.step(action)

        self.eval_env.close()
print(f"The training spent {time.time() - t1} s.") model.save(policy_save_path) env.save(env_stats_path) else: # env = env_change_input(time_step=env_params['time_step'], # robot_class=env_params['robot_class'], # on_rack=env_params['on_rack'], # enable_self_collision=env_params['enable_self_collision'], # motor_control_mode=env_params['motor_control_mode'], # train_or_test=env_params['train_or_test']) # env = env_change_input(**env_params) env = SubprocVecEnv([lambda: env_change_input(**env_params)]) env_stats_load_path = os.path.join( policy_save_dir, 'ppo_env_8_S_PV_4096_12w_21-03-2021_20-46-02.pkl') env = VecNormalize.load(env_stats_load_path, env) env.training = False env.norm_reward = False model_load_path = os.path.join( policy_save_dir, 'ppo_model_8_S_PV_4096_12w_21-03-2021_20-46-02.zip') model = PPO.load(model_load_path, env=env) obs = env.reset() while True: action, _state = model.predict(obs, deterministic=True) obs, reward, done, info = env.step(action) # env.render() if done: obs = env.reset()