Code Example #1
import numpy as np

from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize


# `make_env` is a pytest fixture and `check_vec_norm_equal` a helper defined elsewhere in the test module.
def test_vec_env(tmp_path, make_env):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv,
                             norm_obs=True,
                             norm_reward=True,
                             clip_obs=clip_obs,
                             clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        if isinstance(obs, dict):
            for key in obs.keys():
                assert np.max(np.abs(obs[key])) <= clip_obs
        else:
            assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = tmp_path / "vec_normalize"
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
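The helper `check_vec_norm_equal` is not shown above. Below is a rough sketch of what such a comparison could look like, assuming a non-dict observation space and the standard `VecNormalize` attributes (`obs_rms`, `ret_rms`, `clip_obs`, `clip_reward`, `gamma`, `epsilon`); the actual helper in the test suite may differ.

def check_vec_norm_equal(norm_venv, deserialized):
    # Wrapper configuration should survive the save/load round trip.
    assert norm_venv.observation_space == deserialized.observation_space
    assert norm_venv.action_space == deserialized.action_space
    assert norm_venv.clip_obs == deserialized.clip_obs
    assert norm_venv.clip_reward == deserialized.clip_reward
    assert norm_venv.gamma == deserialized.gamma
    assert norm_venv.epsilon == deserialized.epsilon

    # Running statistics for observations and returns should match as well.
    assert np.allclose(norm_venv.obs_rms.mean, deserialized.obs_rms.mean)
    assert np.allclose(norm_venv.obs_rms.var, deserialized.obs_rms.var)
    assert np.allclose(norm_venv.ret_rms.mean, deserialized.ret_rms.mean)
    assert np.allclose(norm_venv.ret_rms.var, deserialized.ret_rms.var)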
Code Example #2
# A second variant of the same test, using the legacy `tmpdir` fixture; it assumes
# the same imports, with `make_env` and `check_vec_norm_equal` defined at module level.
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv,
                             norm_obs=True,
                             norm_reward=True,
                             clip_obs=clip_obs,
                             clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
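Both tests build the underlying environment through a `make_env` callable. A minimal, hypothetical definition is sketched below; the environment id is a placeholder and the real test may construct something else.

import gym

ENV_ID = "Pendulum-v1"  # placeholder environment id


def make_env():
    return gym.make(ENV_ID)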
Code Example #3
        model = PPO('MlpPolicy',  # policy name inferred from the commented-out call below
                    env,
                    policy_kwargs=policy_kwargs,
                    n_steps=4000,
                    verbose=1,
                    batch_size=10000,
                    n_epochs=4)

        if not os.path.exists(policy_save_dir):
            os.makedirs(policy_save_dir)
        # model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1, n_steps=2048)
        # model.set_parameters(os.path.join(policy_save_dir, 'ppo_model_6_S_PV_4096_3000w_21-03-2021_02-50-06.zip'))
        t1 = time.time()
        model.learn(total_timesteps=12000000)
        print(f"The training spent {time.time() - t1} s.")
        model.save(policy_save_path)
        env.save(env_stats_path)
    else:
        # env = env_change_input(time_step=env_params['time_step'],
        #                        robot_class=env_params['robot_class'],
        #                        on_rack=env_params['on_rack'],
        #                        enable_self_collision=env_params['enable_self_collision'],
        #                        motor_control_mode=env_params['motor_control_mode'],
        #                        train_or_test=env_params['train_or_test'])
        # env = env_change_input(**env_params)

        env = SubprocVecEnv([lambda: env_change_input(**env_params)])
        env_stats_load_path = os.path.join(
            policy_save_dir, 'ppo_env_8_S_PV_4096_12w_21-03-2021_20-46-02.pkl')
        env = VecNormalize.load(env_stats_load_path, env)
        env.training = False
        env.norm_reward = False
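With the saved statistics loaded and updates frozen (`training = False`, `norm_reward = False`), the env is presumably used to evaluate a previously trained policy. Below is a minimal evaluation loop under that assumption; the checkpoint path reuses `policy_save_path` from the training branch and is only illustrative.

        from stable_baselines3 import PPO

        model = PPO.load(policy_save_path, env=env)  # illustrative checkpoint path

        obs = env.reset()
        for _ in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = env.step(action)
            # SB3 vectorized envs reset themselves automatically when an episode ends.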