def test_replay_buffer_normalization(replay_buffer_cls):
    env = {ReplayBuffer: DummyEnv, DictReplayBuffer: DummyDictEnv}[replay_buffer_cls]
    env = make_vec_env(env)
    env = VecNormalize(env)

    buffer = replay_buffer_cls(100, env.observation_space, env.action_space)

    # Interact and store transitions
    env.reset()
    obs = env.get_original_obs()
    for _ in range(100):
        action = env.action_space.sample()
        _, _, done, info = env.step(action)
        next_obs = env.get_original_obs()
        reward = env.get_original_reward()
        buffer.add(obs, next_obs, action, reward, done, info)
        obs = next_obs

    sample = buffer.sample(50, env)
    # Test observation normalization
    for observations in [sample.observations, sample.next_observations]:
        if isinstance(sample, DictReplayBufferSamples):
            for key in observations.keys():
                assert th.allclose(observations[key].mean(0), th.zeros(1), atol=1)
        elif isinstance(sample, ReplayBufferSamples):
            assert th.allclose(observations.mean(0), th.zeros(1), atol=1)
    # Test reward normalization
    assert np.allclose(sample.rewards.mean(0), np.zeros(1), atol=1)
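# Hedged sketch, not part of the original test suite: the test above assumes that
# `DummyEnv` and `DummyDictEnv` fixtures exist. A minimal version of such environments
# (the names and dynamics here are illustrative assumptions) could look like this,
# using the old-style gym API that the rest of these snippets rely on.
import gym
import numpy as np
from gym import spaces


class DummyEnv(gym.Env):
    """Minimal continuous env with non-zero-mean observations and rewards."""

    def __init__(self):
        self.observation_space = spaces.Box(low=-10.0, high=10.0, shape=(3,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

    def reset(self):
        return self.observation_space.sample()

    def step(self, action):
        # Constant offset on the reward so normalization has something to remove
        return self.observation_space.sample(), 5.0 + np.random.randn(), False, {}


class DummyDictEnv(DummyEnv):
    """Same dynamics, but with a Dict observation space."""

    def __init__(self):
        super().__init__()
        self.observation_space = spaces.Dict(
            {"obs": spaces.Box(low=-10.0, high=10.0, shape=(3,), dtype=np.float32)}
        )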
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()

        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000, tb_log_name=tb_log_name, reset_num_timesteps=False)  # , callback=callback

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
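# Hedged sketch: the script above relies on a `make_env(env_name, rank, log_dir=...)`
# helper that is not shown. A typical SubprocVecEnv-compatible factory, assuming old-style
# gym and stable_baselines3's Monitor wrapper (the seeding scheme is an assumption), might be:
import os

import gym
from stable_baselines3.common.monitor import Monitor


def make_env(env_id, rank, seed=0, log_dir=None):
    """Return a thunk that creates a monitored env with a per-worker seed."""

    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        if log_dir is not None:
            os.makedirs(log_dir, exist_ok=True)
            # One monitor file per worker, so episode rewards can be aggregated later
            env = Monitor(env, os.path.join(log_dir, str(rank)))
        return env

    return _init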
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]

    env_params = {
        'time_step': TIME_STEP,
        'robot_class': QuadrupedRobot,
        'on_rack': False,
        'enable_self_collision': True,
        'motor_control_mode': MotorControlMode.HYBRID_COMPUTED_POS_TROT,
        'train_or_test': test_or_train
    }

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data/policies')
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    policy_kwargs = {"net_arch": [{"pi": [512, 256], "vf": [512, 256]}]}

    if TEST_OR_TRAIN == "train":
        env = make_vec_env(env_change_input,
                           n_envs=NUM_CPUS,
                           seed=0,
                           env_kwargs=env_params,
                           vec_env_cls=SubprocVecEnv)
        env = VecNormalize(env)
        if not os.path.exists(policy_save_dir):
            os.makedirs(policy_save_dir)
        model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        env = env_change_input(**env_params)
        model_load_path = os.path.join(policy_save_dir, 'ppo_3_17-03-2021_15-39-42')
        model = PPO.load(model_load_path)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
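# Editorial note, hedged: in the test branch above the policy was trained behind
# VecNormalize, but prediction runs on raw observations, so the network sees inputs
# on a different scale than during training. A later snippet in this collection fixes
# this by saving the statistics at train time and reloading them at test time, roughly:
#
#   env = SubprocVecEnv([lambda: env_change_input(**env_params)])
#   env = VecNormalize.load(env_stats_load_path, env)
#   env.training = False      # do not update the running statistics at test time
#   env.norm_reward = False   # report unscaled rewards
#   model = PPO.load(model_load_path, env=env)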
def test_vec_env(tmp_path, make_env):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True, clip_obs=clip_obs, clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        if isinstance(obs, dict):
            for key in obs.keys():
                assert np.max(np.abs(obs[key])) <= clip_obs
        else:
            assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = tmp_path / "vec_normalize"
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
def test_eval_friendly_error():
    # tests that the eval callback does not crash when given a vectorized training env
    # wrapped in VecNormalize, and that it fails with a clear error for a non-vectorized eval env
    train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
    eval_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    eval_env = VecNormalize(eval_env, training=False, norm_reward=False)
    _ = train_env.reset()
    original_obs = train_env.get_original_obs()
    model = A2C("MlpPolicy", train_env, n_steps=50, seed=0)

    eval_callback = EvalCallback(
        eval_env,
        eval_freq=100,
        warn=False,
    )
    model.learn(100, callback=eval_callback)

    # Check synchronization
    assert np.allclose(train_env.normalize_obs(original_obs), eval_env.normalize_obs(original_obs))

    wrong_eval_env = gym.make("CartPole-v1")
    eval_callback = EvalCallback(
        wrong_eval_env,
        eval_freq=100,
        warn=False,
    )
    with pytest.warns(Warning):
        with pytest.raises(AssertionError):
            model.learn(100, callback=eval_callback)
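# Side note, a hedged sketch: stable_baselines3 also exposes sync_envs_normalization,
# which copies the VecNormalize running statistics from a training env to an eval env;
# this is roughly what EvalCallback does for the test above, but it can also be called manually.
import gym
import numpy as np
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, sync_envs_normalization

train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]), training=False, norm_reward=False)

train_env.reset()
for _ in range(100):
    train_env.step([train_env.action_space.sample()])

# Copy the running mean/var from train_env to eval_env, then check both normalize identically
orig_obs = train_env.get_original_obs()
sync_envs_normalization(train_env, eval_env)
assert np.allclose(train_env.normalize_obs(orig_obs), eval_env.normalize_obs(orig_obs))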
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True, clip_obs=clip_obs, clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
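# Hedged sketch: `check_vec_norm_equal` is a test helper used by the two tests above but
# not shown here. One plausible implementation, assuming non-dict observation spaces and
# comparing only the serialized statistics and configuration of two VecNormalize wrappers:
import numpy as np


def check_vec_norm_equal(norm_venv, deserialized):
    assert norm_venv.observation_space == deserialized.observation_space
    assert norm_venv.action_space == deserialized.action_space
    # Running statistics for observations and returns
    assert np.allclose(norm_venv.obs_rms.mean, deserialized.obs_rms.mean)
    assert np.allclose(norm_venv.obs_rms.var, deserialized.obs_rms.var)
    assert np.allclose(norm_venv.ret_rms.mean, deserialized.ret_rms.mean)
    assert np.allclose(norm_venv.ret_rms.var, deserialized.ret_rms.var)
    # Configuration flags
    assert norm_venv.clip_obs == deserialized.clip_obs
    assert norm_venv.clip_reward == deserialized.clip_reward
    assert norm_venv.norm_obs == deserialized.norm_obs
    assert norm_venv.norm_reward == deserialized.norm_reward
    assert norm_venv.gamma == deserialized.gamma
    assert norm_venv.epsilon == deserialized.epsilon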
# ALGO Logic: Storage for epoch data
obs = torch.zeros((args.num_steps, args.num_envs) + envs.observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.action_space.shape).to(device)
logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)

# TRY NOT TO MODIFY: start the game
global_step = 0
start_time = time.time()
# Note how `next_obs` and `next_done` are used; their usage is equivalent to
# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60
next_obs = envs.reset()
next_done = torch.zeros(args.num_envs).to(device)
num_updates = args.total_timesteps // args.batch_size

for update in range(1, num_updates + 1):
    # Annealing the rate if instructed to do so.
    if args.anneal_lr:
        frac = 1.0 - (update - 1.0) / num_updates
        lrnow = lr(frac)
        optimizer.param_groups[0]['lr'] = lrnow

    # TRY NOT TO MODIFY: prepare the execution of the game.
    for step in range(0, args.num_steps):
        global_step += 1 * args.num_envs
        obs[step] = next_obs
        dones[step] = next_done
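# Assumption: the fragment above calls lr(frac) for learning-rate annealing without
# showing where `lr` comes from. In this style of script it is typically a linear schedule
# defined next to the optimizer; a hedged sketch (the `agent` module and `args.learning_rate`
# field are assumptions from the surrounding script, not shown here):
import torch.optim as optim

optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)
# Linear decay from args.learning_rate down to 0 over the course of training
lr = lambda frac: frac * args.learning_rate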
class MultiModuleExp:
    """
    A whole experiment.

    It should contain: (1) environments, (2) policies, (3) training, (4) testing.
    The results should be comparable with other experiments.

    The Multi-RNN experiment.
    """

    def __init__(
        self,
        args,
        env_id="HopperBulletEnv-v0",
        features_extractor_class=MultiExtractor,
        features_extractor_kwargs={},
    ) -> None:
        """Init with parameters to control the training process."""
        print("Starting MultiModuleExp")
        self.args = args
        self.env_id = env_id
        self.use_cuda = torch.cuda.is_available() and args.cuda
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Make environments
        print("Making train environments...")
        venv = DummyVecEnv([
            make_env(env_id=env_id, rank=i, seed=args.seed, render=args.render)
            for i in range(args.num_envs)
        ])
        self.eval_env = DummyVecEnv(
            [make_env(env_id=env_id, rank=99, seed=args.seed, render=False)])
        if args.vec_normalize:
            venv = VecNormalize(venv)
            self.eval_env = VecNormalize(self.eval_env, norm_reward=False)

        features_extractor_kwargs["num_envs"] = args.num_envs
        policy_kwargs = {
            "features_extractor_class": features_extractor_class,
            "features_extractor_kwargs": features_extractor_kwargs,
            # Note: net_arch must be specified, because sb3 won't set the default
            # network architecture if we change the features_extractor.
            # pi: Actor (policy-function); vf: Critic (value-function)
            "net_arch": [dict(pi=[64, 64], vf=[64, 64])],
        }

        self.model = CustomizedPPO(
            CustomizedPolicy,
            venv,
            n_steps=args.rollout_n_steps,
            tensorboard_log="tb",
            policy_kwargs=policy_kwargs,
            device=self.device,
            verbose=1,
            rnn_move_window_step=args.rnn_move_window_step,
            rnn_sequence_length=args.rnn_sequence_length,
            use_sde=args.sde,
            n_epochs=args.n_epochs)

    def train(self) -> None:
        """Start training."""
        print(f"train using {self.model.device.type}")
        callback = [
            DebugCallback("Customized"),
            AdjustCameraCallback(),
            WandbCallback(self.args),
            CustomizedEvalCallback(
                self.eval_env,
                best_model_save_path=None,
                log_path=None,
                eval_freq=self.args.eval_freq,
                n_eval_episodes=3,
                verbose=0,
            )
        ]
        self.model.learn(self.args.total_timesteps, callback=callback)

    def test(self, model_filename, vnorm_filename):
        # load() is a classmethod that returns a new model, so keep the result
        self.model = self.model.load(model_filename)
        self.eval_env = VecNormalize.load(vnorm_filename, self.eval_env)
        self.eval_env.render()
        obs = self.eval_env.reset()
        with self.model.policy.features_extractor.start_testing():
            for i in range(1000):
                # predict() returns (action, state); only the action is stepped
                action, _ = self.model.predict(obs, deterministic=True)
                obs, _, _, _ = self.eval_env.step(action)
        self.eval_env.close()
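# Hedged addition, not part of the original class: test() above expects a saved model file
# and a saved VecNormalize pickle, but the class never writes them. A method along these
# lines could be added after train(); the file names are illustrative assumptions.
from stable_baselines3.common.vec_env import VecNormalize


def save(self, model_filename="multi_module_ppo.zip", vnorm_filename="multi_module_vecnormalize.pkl"):
    """Persist the policy weights and, if present, the VecNormalize statistics."""
    self.model.save(model_filename)
    venv = self.model.get_env()
    # Only meaningful when args.vec_normalize wrapped the training env
    if isinstance(venv, VecNormalize):
        venv.save(vnorm_filename)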
print(f"The training spent {time.time() - t1} s.") model.save(policy_save_path) env.save(env_stats_path) else: # env = env_change_input(time_step=env_params['time_step'], # robot_class=env_params['robot_class'], # on_rack=env_params['on_rack'], # enable_self_collision=env_params['enable_self_collision'], # motor_control_mode=env_params['motor_control_mode'], # train_or_test=env_params['train_or_test']) # env = env_change_input(**env_params) env = SubprocVecEnv([lambda: env_change_input(**env_params)]) env_stats_load_path = os.path.join( policy_save_dir, 'ppo_env_8_S_PV_4096_12w_21-03-2021_20-46-02.pkl') env = VecNormalize.load(env_stats_load_path, env) env.training = False env.norm_reward = False model_load_path = os.path.join( policy_save_dir, 'ppo_model_8_S_PV_4096_12w_21-03-2021_20-46-02.zip') model = PPO.load(model_load_path, env=env) obs = env.reset() while True: action, _state = model.predict(obs, deterministic=True) obs, reward, done, info = env.step(action) # env.render() if done: obs = env.reset()