def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Create a separate, unwrapped environment used for rendering during evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment in a parallel-processing-friendly wrapper, unless debug mode is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs,
                            norm_obs=True,
                            clip_obs=np.inf,
                            norm_reward=False,
                            clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)

        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
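When args.stats_path is None the run above builds fresh normalization statistics, so a later run has nothing to pass back in via --stats_path unless they are saved. A minimal, hedged sketch of persisting them before envs.close(); the checkpoint path below is illustrative and not part of the original script, and it assumes the standard stable-baselines3 VecNormalize.save API.

import os

# Illustrative path; the original script does not define where the stats should go.
stats_path = os.path.join("checkpoints", "vec_normalize.pkl")
os.makedirs(os.path.dirname(stats_path), exist_ok=True)

# VecNormalize.save writes the running obs mean/var to disk so a future run
# can restore them with VecNormalize.load(stats_path, envs).
envs.save(stats_path)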
Example #2
def _reward_fn_normalize_inputs(
    obs: np.ndarray,
    acts: np.ndarray,
    next_obs: np.ndarray,
    dones: np.ndarray,
    *,
    reward_fn: RewardFn,
    vec_normalize: vec_env.VecNormalize,
    norm_reward: bool = True,
) -> np.ndarray:
    """Combine with `functools.partial` to create an input-normalizing RewardFn.

    Args:
        reward_fn: The reward function that normalized inputs are evaluated on.
        vec_normalize: Instance of VecNormalize used to normalize inputs and
            rewards.
        norm_reward: If True, then also normalize reward before returning.

    Returns:
        The possibly normalized reward.
    """
    norm_obs = vec_normalize.normalize_obs(obs)
    norm_next_obs = vec_normalize.normalize_obs(next_obs)
    rew = reward_fn(norm_obs, acts, norm_next_obs, dones)
    if norm_reward:
        rew = vec_normalize.normalize_reward(rew)
    return rew
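The docstring points at functools.partial for binding the keyword-only arguments; a minimal sketch of that pattern, assuming reward_fn, vec_normalize, and the obs/acts/next_obs/dones arrays already exist in scope:

import functools

# Bind the keyword-only arguments so the result has the plain RewardFn
# signature (obs, acts, next_obs, dones) -> rewards.
normalized_reward_fn = functools.partial(
    _reward_fn_normalize_inputs,
    reward_fn=reward_fn,          # an existing RewardFn (assumed in scope)
    vec_normalize=vec_normalize,  # the VecNormalize wrapper in use (assumed in scope)
    norm_reward=True,
)

rew = normalized_reward_fn(obs, acts, next_obs, dones)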
Example #3
def train(env, log_dir):
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    env = VecNormalize(env,
                       training=True,
                       norm_obs=True,
                       norm_reward=True,
                       gamma=0.9997,
                       clip_obs=10.,
                       clip_reward=10.,
                       epsilon=0.1)

    drive = PPO("MlpPolicy",
                env,
                ent_coef=0.01,
                vf_coef=1,
                batch_size=32,
                learning_rate=linear_schedule(0.001),
                clip_range=linear_schedule(0.1),
                n_steps=1000,
                n_epochs=20,
                tensorboard_log=log_dir + "/drive_tensorboard_log",
                verbose=1)

    drive.learn(total_timesteps=total_timesteps, callback=callback)

    # Continue training for additional runs without resetting the timestep
    # counter; close the environment only once training is finished.
    for i in range(total_train_runs):
        drive.learn(total_timesteps=total_timesteps,
                    callback=callback,
                    reset_num_timesteps=False)

    drive.save("conduziadrive")
    env.close()
Example #4
def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            action = env.action_space.sample()
            action = np.zeros_like(action)
        else:
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
Example #5
def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
Example #6
def make_envs(env_id, log_dir, gamma, max_train_ep_length, max_eval_ep_length,
              seed):
    """Make training and evaluation environments (vectorized envs)."""

    # Training env
    train_env = gym.make(env_id)
    train_env.seed(seed)  # Set random seed
    train_env = TimeLimitWrapper(
        train_env, max_train_ep_length)  # Limit length of training episodes
    train_env = Monitor(train_env, log_dir)  # Monitor training
    train_env = NormalizeActionWrapper(train_env)  # Normalize action space
    train_env = DummyVecEnv([lambda: train_env])  # Vectorize environment
    train_env = VecNormalize(train_env,
                             gamma=gamma)  # Normalise observations and rewards

    # Eval env
    eval_env = gym.make(env_id)
    eval_env.seed(seed)  # Set random seed
    eval_env = TimeLimitWrapper(
        eval_env,
        max_eval_ep_length)  # Set a maximum number of timesteps during eval
    eval_env = Monitor(
        eval_env
    )  # Used to ensure original action space is not modified by `NormalizeActionWrapper`
    eval_env = NormalizeActionWrapper(eval_env)  # Normalize action space
    eval_env = DummyVecEnv([lambda: eval_env])  # Vectorize environment
    eval_env = VecNormalize(eval_env,
                            gamma=gamma,
                            training=False,
                            norm_reward=False)  # Normalise observations
    # (obs/reward normalization gets synchronised with `train_env` in `EvalCallback`)

    return train_env, eval_env
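The closing comment relies on the evaluation callback to synchronise statistics between the two environments. A hedged sketch of wiring them up with stable-baselines3's EvalCallback, which syncs the VecNormalize statistics before each evaluation; the environment id and hyperparameters below are placeholders, not values from the original script.

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback

log_dir = "./logs"  # placeholder
train_env, eval_env = make_envs("Pendulum-v1", log_dir, gamma=0.99,
                                max_train_ep_length=200,
                                max_eval_ep_length=200, seed=0)

# EvalCallback copies the running obs/reward statistics from train_env to the
# frozen eval_env before every evaluation run.
eval_callback = EvalCallback(eval_env, eval_freq=5_000,
                             best_model_save_path=log_dir, verbose=1)

model = PPO("MlpPolicy", train_env, gamma=0.99, verbose=1)
model.learn(total_timesteps=50_000, callback=eval_callback)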
Example #7
def test_offpolicy_normalization(model_class):
    make_env_ = make_dict_env if model_class == HER else make_env
    env = DummyVecEnv([make_env_])
    env = VecNormalize(env,
                       norm_obs=True,
                       norm_reward=True,
                       clip_obs=10.0,
                       clip_reward=10.0)

    eval_env = DummyVecEnv([make_env_])
    eval_env = VecNormalize(eval_env,
                            training=False,
                            norm_obs=True,
                            norm_reward=False,
                            clip_obs=10.0,
                            clip_reward=10.0)

    kwargs = dict(model_class=SAC,
                  max_episode_length=200,
                  online_sampling=True) if model_class == HER else {}
    model = model_class("MlpPolicy",
                        env,
                        verbose=1,
                        learning_starts=100,
                        policy_kwargs=dict(net_arch=[64]),
                        **kwargs)
    model.learn(total_timesteps=500, eval_env=eval_env, eval_freq=250)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)
Example #8
def test_offpolicy_normalization(model_class, online_sampling):

    if online_sampling and model_class != HerReplayBuffer:
        pytest.skip()

    make_env_ = make_dict_env if model_class == HerReplayBuffer else make_env
    env = DummyVecEnv([make_env_])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    eval_env = DummyVecEnv([make_env_])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False, clip_obs=10.0, clip_reward=10.0)

    if model_class == HerReplayBuffer:
        model = SAC(
            "MultiInputPolicy",
            env,
            verbose=1,
            learning_starts=100,
            policy_kwargs=dict(net_arch=[64]),
            replay_buffer_kwargs=dict(
                max_episode_length=100,
                online_sampling=online_sampling,
                n_sampled_goal=2,
            ),
            replay_buffer_class=HerReplayBuffer,
            seed=2,
        )
    else:
        model = model_class("MlpPolicy", env, verbose=1, learning_starts=100, policy_kwargs=dict(net_arch=[64]))

    model.learn(total_timesteps=150, eval_env=eval_env, eval_freq=75)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)
Example #9
def pybullet_example():
    # PyBullet: Normalizing input features

    import pybullet_envs

    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    # Automatically normalize the input features and reward.
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    model = PPO("MlpPolicy", env)
    model.learn(total_timesteps=2000)

    # Don't forget to save the VecNormalize statistics when saving the agent.
    log_dir = "/tmp/"
    model.save(log_dir + "ppo_halfcheetah")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)

    # To demonstrate loading.
    del model, env

    # Load the saved statistics.
    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    env = VecNormalize.load(stats_path, env)
    # Do not update them at test time.
    env.training = False
    # reward normalization is not needed at test time.
    env.norm_reward = False

    # Load the agent.
    model = PPO.load(log_dir + "ppo_halfcheetah", env=env)
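To close the loop on the example above, a short hedged sketch of evaluating the reloaded agent with stable-baselines3's evaluate_policy helper; since env.training is False and env.norm_reward is False, the reported returns are unnormalized.

from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate on the statistics-frozen environment loaded above.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")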
Example #10
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=False)
    model.learn(total_timesteps=40000000)
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    #Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)

        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
Example #11
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env,
                       norm_obs=True,
                       norm_reward=True,
                       clip_obs=100.0,
                       clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env,
                            training=False,
                            norm_obs=True,
                            norm_reward=True,
                            clip_obs=100.0,
                            clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward,
                       env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)
    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards),
                    eval_env.normalize_reward(dummy_rewards))
Example #12
def test_offpolicy_normalization(model_class):
    env = DummyVecEnv([make_env])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False, clip_obs=10., clip_reward=10.)

    model = model_class('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)
Example #13
    def __init__(self,
                 venv,
                 training=True,
                 norm_obs=True,
                 norm_reward=True,
                 clip_obs=10.,
                 clip_reward=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        env = DummyVecEnv([venv])
        VecNormalize.__init__(self, env, training, norm_obs, norm_reward,
                              clip_obs, clip_reward, gamma, epsilon)
Example #14
def test_replay_buffer_normalization(replay_buffer_cls):
    env = {ReplayBuffer: DummyEnv, DictReplayBuffer: DummyDictEnv}[replay_buffer_cls]
    env = make_vec_env(env)
    env = VecNormalize(env)

    buffer = replay_buffer_cls(100, env.observation_space, env.action_space)

    # Interact and store transitions
    env.reset()
    obs = env.get_original_obs()
    for _ in range(100):
        action = env.action_space.sample()
        _, _, done, info = env.step(action)
        next_obs = env.get_original_obs()
        reward = env.get_original_reward()
        buffer.add(obs, next_obs, action, reward, done, info)
        obs = next_obs

    sample = buffer.sample(50, env)
    # Test observation normalization
    for observations in [sample.observations, sample.next_observations]:
        if isinstance(sample, DictReplayBufferSamples):
            for key in observations.keys():
                assert th.allclose(observations[key].mean(0), th.zeros(1), atol=1)
        elif isinstance(sample, ReplayBufferSamples):
            assert th.allclose(observations.mean(0), th.zeros(1), atol=1)
    # Test reward normalization
    assert np.allclose(sample.rewards.mean(0), np.zeros(1), atol=1)
Example #15
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False

    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:  # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
Example #16
def record_video(env_id,
                 model,
                 video_length=500,
                 prefix='',
                 video_folder='videos'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
        """
    eval_env = DummyVecEnv(
        [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
    # eval_env = gym.make(env_id)
    eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl',
                                 eval_env)

    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for i in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
Example #17
def test_vec_normalize(model_class):
    """
    Additional tests for PPO/A2C/SAC/DDPG/TD3/DQN to check observation space support
    for GoalEnv and VecNormalize using MultiInputPolicy.
    """
    env = DummyVecEnv([
        lambda: BitFlippingEnv(n_bits=4, continuous=not (model_class == DQN))
    ])
    env = VecNormalize(env)

    kwargs = {}
    n_steps = 256

    if model_class in {A2C, PPO}:
        kwargs = dict(
            n_steps=128,
            policy_kwargs=dict(net_arch=[32], ),
        )
    else:
        # Avoid memory error when using replay buffer
        # Reduce the size of the features and make learning faster
        kwargs = dict(
            buffer_size=250,
            policy_kwargs=dict(net_arch=[32], ),
            train_freq=8,
            gradient_steps=1,
        )
        if model_class == DQN:
            kwargs["learning_starts"] = 0

    model = model_class("MultiInputPolicy", env, gamma=0.5, seed=1, **kwargs)

    model.learn(total_timesteps=n_steps)

    evaluate_policy(model, env, n_eval_episodes=5, warn=False)
Example #18
def create_zoo_env(env_id, stats_dir, hyperparams, should_render=False):
    env_wrapper = get_wrapper_class(hyperparams)

    vec_env_cls = DummyVecEnv
    if "Bullet" in env_id and should_render:
        vec_env_cls = SubprocVecEnv

    env = make_vec_env(env_id,
                       wrapper_class=env_wrapper,
                       vec_env_cls=vec_env_cls)

    if stats_dir is not None:
        if hyperparams["normalize"]:
            norm_fpath = pjoin(stats_dir, "vecnormalize.pkl")

            if os.path.exists(norm_fpath):
                env = VecNormalize.load(norm_fpath, env)
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {norm_fpath} not found")

    max_episode_steps = gym.make(env_id).spec.max_episode_steps
    Spec = namedtuple("Spec", ["max_episode_steps"])
    env.spec = Spec(max_episode_steps=max_episode_steps)

    return env
Example #19
def init_adv(adv_env_id, disable_adv=False, env_kwargs=None):
    bridge = Bridge()
    default_env_kwargs = {
        'renders' if 'CartPole' in adv_env_id else 'render': render
    }
    if env_kwargs is None:
        env_kwargs = {}
    env_kwargs.update(default_env_kwargs)
    env = make_vec_env(adv_env_id, env_kwargs=env_kwargs, seed=seed)
    env = VecNormalize(env)
    prot_agent = PPO('MlpPolicy',
                     env,
                     verbose=verbose,
                     seed=seed,
                     n_steps=ts,
                     bridge=bridge,
                     is_protagonist=True)
    if disable_adv:
        bridge.link_agents(prot_agent, None)
    else:
        adv_agent = PPO('MlpPolicy',
                        env,
                        verbose=verbose,
                        seed=seed,
                        n_steps=ts,
                        bridge=bridge,
                        is_protagonist=False)
        bridge.link_agents(prot_agent, adv_agent)
    return prot_agent, env
Example #20
def test_deprecation():
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    with pytest.warns(None) as record:
        assert np.allclose(venv.ret, venv.returns)
    # Deprecation warning when using .ret
    assert len(record) == 1
Example #21
    def __init__(
        self,
        args,
        env_id="HopperBulletEnv-v0",
        features_extractor_class=MultiExtractor,
        features_extractor_kwargs={},
    ) -> None:
        print("Starting MultiModuleExp")
        """ Init with parameters to control the training process """
        self.args = args
        self.env_id = env_id
        self.use_cuda = torch.cuda.is_available() and args.cuda
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Make Environments
        print("Making train environments...")
        venv = DummyVecEnv([
            make_env(env_id=env_id, rank=i, seed=args.seed, render=args.render)
            for i in range(args.num_envs)
        ])
        self.eval_env = DummyVecEnv(
            [make_env(env_id=env_id, rank=99, seed=args.seed, render=False)])
        if args.vec_normalize:
            venv = VecNormalize(venv)
            self.eval_env = VecNormalize(self.eval_env, norm_reward=False)

        features_extractor_kwargs["num_envs"] = args.num_envs
        policy_kwargs = {
            "features_extractor_class": features_extractor_class,
            "features_extractor_kwargs": features_extractor_kwargs,
            # Note: net_arch must be specified, because sb3 won't set the default network architecture if we change the features_extractor.
            # pi: Actor (policy-function); vf: Critic (value-function)
            "net_arch": [dict(pi=[64, 64], vf=[64, 64])],
        }

        self.model = CustomizedPPO(
            CustomizedPolicy,
            venv,
            n_steps=args.rollout_n_steps,
            tensorboard_log="tb",
            policy_kwargs=policy_kwargs,
            device=self.device,
            verbose=1,
            rnn_move_window_step=args.rnn_move_window_step,
            rnn_sequence_length=args.rnn_sequence_length,
            use_sde=args.sde,
            n_epochs=args.n_epochs)
Example #22
    def make_vec_env(self, dataset, env_args):
        env_args["df"] = dataset
        env = make_vec_env('crypt-v001', env_kwargs=env_args)
        env = VecCheckNan(env, raise_exception=True)
        env = VecNormalize(
            env, norm_obs=True, norm_reward=False, clip_obs=10.0, gamma=0.95
        )
        return env
Example #23
def test_vec_monitor_warn():
    env = DummyVecEnv([lambda: Monitor(gym.make("CartPole-v1"))])
    # We should warn the user when the env is already wrapped with a Monitor wrapper
    with pytest.warns(UserWarning):
        VecMonitor(env)

    with pytest.warns(UserWarning):
        VecMonitor(VecNormalize(env))
Example #24
def test_vec_env(tmp_path, make_env):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv,
                             norm_obs=True,
                             norm_reward=True,
                             clip_obs=clip_obs,
                             clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        if isinstance(obs, dict):
            for key in obs.keys():
                assert np.max(np.abs(obs[key])) <= clip_obs
        else:
            assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = tmp_path / "vec_normalize"
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
Example #25
    def make_dummy_env(self, dataset, env_args):
        env = gym.make("crypt-v001", df=dataset, **env_args)
        check_env(env)
        env = DummyVecEnv([lambda: env])
        env = VecCheckNan(env, raise_exception=True)
        env = VecNormalize(
            env, norm_obs=True, norm_reward=False, clip_obs=10.0, gamma=0.95
        )
        return env
Example #26
    def create_env(n_envs, eval_env=False, no_log=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether is it an environment used for evaluation or not
        :param no_log: (bool) Do not log training when doing hyperparameter optim
            (issue with writing the same file)
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams
        global env_kwargs

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env or no_log else save_path

        if n_envs == 1:
            env = SubprocVecEnv(
                [make_env(env_id, 0, args.seed, wrapper_class=env_wrapper, log_dir=log_dir, env_kwargs=env_kwargs)]
            )
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most env, SubprocVecEnv does not help and is quite memory hungry
            env = SubprocVecEnv(
                [
                    make_env(env_id, i, args.seed, log_dir=log_dir, env_kwargs=env_kwargs, wrapper_class=env_wrapper)
                    for i in range(n_envs)
                ]
            )
        if normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = normalize_kwargs.copy()
            # Do not normalize reward for env used for evaluation
            if eval_env:
                if len(local_normalize_kwargs) > 0:
                    local_normalize_kwargs["norm_reward"] = False
                else:
                    local_normalize_kwargs = {"norm_reward": False}

            if args.verbose > 0:
                if len(local_normalize_kwargs) > 0:
                    print(f"Normalization activated: {local_normalize_kwargs}")
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **local_normalize_kwargs)

        # Optional Frame-stacking
        if hyperparams.get("frame_stack", False):
            n_stack = hyperparams["frame_stack"]
            env = VecFrameStack(env, n_stack)
            print(f"Stacking {n_stack} frames")

        if is_image_space(env.observation_space):
            if args.verbose > 0:
                print("Wrapping into a VecTransposeImage")
            env = VecTransposeImage(env)
        return env
Example #27
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Create a separate, unwrapped environment used for rendering during evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment in a parallel-processing-friendly wrapper, unless debug mode is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])
    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example #28
def run_environment(
        algorithm: RLAlgorithm = typer.Option(...),
        agent_type: SingleOrMultiAgent = SingleOrMultiAgent.single_agent,
        agent_parameters_path: Optional[Path] = None,
        random_agent: bool = False,
        seed: Optional[int] = None,
        environment_port: Optional[int] = None,
        normalize: bool = False,
        n_envs: Optional[int] = None):
    """Run the reacher environment and visualize the actions of the agents.

    Args:
        algorithm: the RL algorithm used by the trained agent
        agent_type: choice between single and multi agent environments
        agent_parameters_path: an optional path to load the agent parameters from
        random_agent: if true, agent(s) use a random policy
        seed: seed for the environment; if not set, it will be picked randomly
        environment_port: the port used from python to communicate with the C# environment backend. By using different
            values, one can run multiple environments in parallel.
        normalize: if true, load the saved VecNormalize statistics (vecnormalize.pkl) from the agent parameters folder
        n_envs: number of parallel agents/environments
    """
    env = create_environment(agent_type=agent_type,
                             normalize=False,
                             n_envs=n_envs,
                             env_seed=seed,
                             environment_port=environment_port,
                             training_mode=False,
                             no_graphics=False)

    if normalize:
        env = VecNormalize.load(
            str(agent_parameters_path.parent / 'vecnormalize.pkl'), env)

    action_size = env.action_space.shape[0]

    if random_agent:
        agent = RandomAgent(number_of_agents=n_envs, action_size=action_size)
    else:
        agent = TrainedAgent(algorithm=algorithm,
                             parameters_path=str(agent_parameters_path))

    score = 0
    state = env.reset()
    while True:
        actions = agent.act(state)
        state, reward, done, _ = env.step(actions)
        score += reward
        time.sleep(0.005)
        if np.any(done):
            break

    if agent_type == SingleOrMultiAgent.single_agent:
        print(f'Total score this episode: {score}')
    else:
        print(f'Average total score this episode: {np.array(score).mean()}')

    env.close()
Example #29
def atari_env(num_envs=1):
    def env_fn():
        env = gym.make("SpaceInvadersNoFrameskip-v4")
        env = AtariWrapper(env)
        return env

    env = DummyVecEnv([env_fn] * num_envs)
    env = VecFrameStack(env, 4)
    env = VecTransposeImage(env)
    env = VecNormalize(env)
    return env
Example #30
def _make_warmstart_dict_env():
    """Warm-start VecNormalize by stepping through BitFlippingEnv"""
    venv = DummyVecEnv([make_dict_env])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
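A short hedged usage sketch of the warm-started wrapper: after the 100 random steps the running statistics are populated, so normalize_obs maps raw dict observations through them; freezing training first keeps the statistics fixed.

venv = _make_warmstart_dict_env()
venv.training = False  # freeze the statistics gathered during warm-up

raw_obs = venv.get_original_obs()       # dict of unnormalized observations
norm_obs = venv.normalize_obs(raw_obs)  # same keys, normalized with the running stats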