def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment in a parallel-processing-friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf,
                            norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack([envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs, device=args.device, target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()

def _reward_fn_normalize_inputs(
    obs: np.ndarray,
    acts: np.ndarray,
    next_obs: np.ndarray,
    dones: np.ndarray,
    *,
    reward_fn: RewardFn,
    vec_normalize: vec_env.VecNormalize,
    norm_reward: bool = True,
) -> np.ndarray:
    """Combine with `functools.partial` to create an input-normalizing RewardFn.

    Args:
        reward_fn: The reward function that normalized inputs are evaluated on.
        vec_normalize: Instance of VecNormalize used to normalize inputs and rewards.
        norm_reward: If True, then also normalize reward before returning.

    Returns:
        The possibly normalized reward.
    """
    norm_obs = vec_normalize.normalize_obs(obs)
    norm_next_obs = vec_normalize.normalize_obs(next_obs)
    rew = reward_fn(norm_obs, acts, norm_next_obs, dones)
    if norm_reward:
        rew = vec_normalize.normalize_reward(rew)
    return rew

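# A minimal usage sketch for the helper above, as its docstring suggests,
# assuming a hypothetical RewardFn `my_reward_fn` and an existing VecNormalize
# instance `my_vec_normalize` (neither is defined in this snippet):
import functools

normalized_reward_fn = functools.partial(
    _reward_fn_normalize_inputs,
    reward_fn=my_reward_fn,
    vec_normalize=my_vec_normalize,
    norm_reward=True,
)
# The resulting partial is itself a RewardFn and can be called as
# normalized_reward_fn(obs, acts, next_obs, dones).
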
def train(env, log_dir):
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    env = VecNormalize(env, training=True, norm_obs=True, norm_reward=True,
                       gamma=0.9997, clip_obs=10., clip_reward=10., epsilon=0.1)
    drive = PPO("MlpPolicy", env, ent_coef=0.01, vf_coef=1, batch_size=32,
                learning_rate=linear_schedule(0.001), clip_range=linear_schedule(0.1),
                n_steps=1000, n_epochs=20,
                tensorboard_log=log_dir + "/drive_tensorboard_log", verbose=1)
    drive.learn(total_timesteps=total_timesteps, callback=callback)
    # Continue training without resetting the timestep counter.
    for i in range(total_train_runs):
        drive.learn(total_timesteps=total_timesteps, callback=callback,
                    reset_num_timesteps=False)
    drive.save("conduziadrive")
    env.close()

def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            action = env.action_space.sample()
            action = np.zeros_like(action)
        else:
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0

def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
                            clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))

def make_envs(env_id, log_dir, gamma, max_train_ep_length, max_eval_ep_length, seed):
    """Make training and evaluation environments (vectorized envs)."""
    # Training env
    train_env = gym.make(env_id)
    train_env.seed(seed)  # Set random seed
    train_env = TimeLimitWrapper(train_env, max_train_ep_length)  # Limit length of training episodes
    train_env = Monitor(train_env, log_dir)  # Monitor training
    train_env = NormalizeActionWrapper(train_env)  # Normalize action space
    train_env = DummyVecEnv([lambda: train_env])  # Vectorize environment
    train_env = VecNormalize(train_env, gamma=gamma)  # Normalize observations and rewards

    # Eval env
    eval_env = gym.make(env_id)
    eval_env.seed(seed)  # Set random seed
    eval_env = TimeLimitWrapper(eval_env, max_eval_ep_length)  # Set a maximum number of timesteps during eval
    eval_env = Monitor(eval_env)  # Used to ensure original action space is not modified by `NormalizeActionWrapper`
    eval_env = NormalizeActionWrapper(eval_env)  # Normalize action space
    eval_env = DummyVecEnv([lambda: eval_env])  # Vectorize environment
    eval_env = VecNormalize(eval_env, gamma=gamma, training=False, norm_reward=False)  # Normalize observations
    # (obs/reward normalization gets synchronised with `train_env` in `EvalCallback`)

    return train_env, eval_env

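# The comment above notes that `EvalCallback` keeps the two VecNormalize
# wrappers in sync. A minimal sketch of doing the same synchronisation by hand
# with stable-baselines3's `sync_envs_normalization`; the env id and the
# argument values here are illustrative assumptions:
from stable_baselines3.common.vec_env import sync_envs_normalization

train_env, eval_env = make_envs("Pendulum-v1", log_dir="/tmp/log", gamma=0.99,
                                max_train_ep_length=200, max_eval_ep_length=200,
                                seed=0)
# Copy the running obs/reward statistics from train_env into eval_env
# before evaluating, mirroring what EvalCallback does internally.
sync_envs_normalization(train_env, eval_env)
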
def test_offpolicy_normalization(model_class):
    make_env_ = make_dict_env if model_class == HER else make_env
    env = DummyVecEnv([make_env_])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    eval_env = DummyVecEnv([make_env_])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False,
                            clip_obs=10.0, clip_reward=10.0)

    kwargs = dict(model_class=SAC, max_episode_length=200, online_sampling=True) if model_class == HER else {}
    model = model_class("MlpPolicy", env, verbose=1, learning_starts=100,
                        policy_kwargs=dict(net_arch=[64]), **kwargs)
    model.learn(total_timesteps=500, eval_env=eval_env, eval_freq=250)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)

def test_offpolicy_normalization(model_class, online_sampling):
    if online_sampling and model_class != HerReplayBuffer:
        pytest.skip()

    make_env_ = make_dict_env if model_class == HerReplayBuffer else make_env
    env = DummyVecEnv([make_env_])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    eval_env = DummyVecEnv([make_env_])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False,
                            clip_obs=10.0, clip_reward=10.0)

    if model_class == HerReplayBuffer:
        model = SAC(
            "MultiInputPolicy",
            env,
            verbose=1,
            learning_starts=100,
            policy_kwargs=dict(net_arch=[64]),
            replay_buffer_kwargs=dict(
                max_episode_length=100,
                online_sampling=online_sampling,
                n_sampled_goal=2,
            ),
            replay_buffer_class=HerReplayBuffer,
            seed=2,
        )
    else:
        model = model_class("MlpPolicy", env, verbose=1, learning_starts=100,
                            policy_kwargs=dict(net_arch=[64]))
    model.learn(total_timesteps=150, eval_env=eval_env, eval_freq=75)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)

def pybullet_example():
    # PyBullet: Normalizing input features
    import pybullet_envs

    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    # Automatically normalize the input features and reward.
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    model = PPO("MlpPolicy", env)
    model.learn(total_timesteps=2000)

    # Don't forget to save the VecNormalize statistics when saving the agent.
    log_dir = "/tmp/"
    model.save(log_dir + "ppo_halfcheetah")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)

    # To demonstrate loading.
    del model, env

    # Load the saved statistics.
    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    env = VecNormalize.load(stats_path, env)
    # Do not update them at test time.
    env.training = False
    # Reward normalization is not needed at test time.
    env.norm_reward = False

    # Load the agent.
    model = PPO.load(log_dir + "ppo_halfcheetah", env=env)

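# A short follow-up sketch: evaluating the re-loaded agent against the frozen
# normalization statistics, using stable-baselines3's `evaluate_policy`
# (assumes the `model` and `env` from the end of `pybullet_example` above):
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
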
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu), wandb_use=False)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0

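# A hedged sketch of how the obs_mean/obs_variance files exported above could
# be reused to normalize raw observations outside of stable-baselines3 (e.g.
# in an on-robot controller). The formula mirrors VecNormalize.normalize_obs,
# using clip_obs=2.0 as configured above and the default epsilon=1e-8:
import numpy as np

mean = np.loadtxt("./result/obs_mean.txt")
var = np.loadtxt("./result/obs_variance.txt")

def normalize_obs_manually(raw_obs):
    # Standardize with the saved running statistics, then clip like VecNormalize.
    return np.clip((raw_obs - mean) / np.sqrt(var + 1e-8), -2.0, 2.0)
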
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
                            clip_obs=100.0, clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)
    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))

def test_offpolicy_normalization(model_class):
    env = DummyVecEnv([make_env])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=False,
                            clip_obs=10., clip_reward=10.)

    model = model_class('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=1000, eval_env=eval_env, eval_freq=500)
    # Check getter
    assert isinstance(model.get_vec_normalize_env(), VecNormalize)

def __init__(self, venv, training=True, norm_obs=True, norm_reward=True,
             clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8):
    env = DummyVecEnv([venv])
    VecNormalize.__init__(self, env, training, norm_obs, norm_reward,
                          clip_obs, clip_reward, gamma, epsilon)

def test_replay_buffer_normalization(replay_buffer_cls):
    env = {ReplayBuffer: DummyEnv, DictReplayBuffer: DummyDictEnv}[replay_buffer_cls]
    env = make_vec_env(env)
    env = VecNormalize(env)

    buffer = replay_buffer_cls(100, env.observation_space, env.action_space)

    # Interact and store transitions
    env.reset()
    obs = env.get_original_obs()
    for _ in range(100):
        action = env.action_space.sample()
        _, _, done, info = env.step(action)
        next_obs = env.get_original_obs()
        reward = env.get_original_reward()
        buffer.add(obs, next_obs, action, reward, done, info)
        obs = next_obs

    sample = buffer.sample(50, env)
    # Test observation normalization
    for observations in [sample.observations, sample.next_observations]:
        if isinstance(sample, DictReplayBufferSamples):
            for key in observations.keys():
                assert th.allclose(observations[key].mean(0), th.zeros(1), atol=1)
        elif isinstance(sample, ReplayBufferSamples):
            assert th.allclose(observations.mean(0), th.zeros(1), atol=1)
    # Test reward normalization
    assert np.allclose(sample.rewards.mean(0), np.zeros(1), atol=1)

def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f"  Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f"  Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False

    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:
            # The last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, "
          f"total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
    # eval_env = gym.make(env_id)
    eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for i in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()

def test_vec_normalize(model_class):
    """
    Additional tests for PPO/A2C/SAC/DDPG/TD3/DQN to check observation space support
    for GoalEnv and VecNormalize using MultiInputPolicy.
    """
    env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=4, continuous=not (model_class == DQN))])
    env = VecNormalize(env)

    n_steps = 256
    if model_class in {A2C, PPO}:
        kwargs = dict(
            n_steps=128,
            policy_kwargs=dict(net_arch=[32]),
        )
    else:
        # Avoid memory error when using replay buffer
        # Reduce the size of the features and make learning faster
        kwargs = dict(
            buffer_size=250,
            policy_kwargs=dict(net_arch=[32]),
            train_freq=8,
            gradient_steps=1,
        )
        if model_class == DQN:
            kwargs["learning_starts"] = 0

    model = model_class("MultiInputPolicy", env, gamma=0.5, seed=1, **kwargs)
    model.learn(total_timesteps=n_steps)
    evaluate_policy(model, env, n_eval_episodes=5, warn=False)

def create_zoo_env(env_id, stats_dir, hyperparams, should_render=False):
    env_wrapper = get_wrapper_class(hyperparams)

    vec_env_cls = DummyVecEnv
    if "Bullet" in env_id and should_render:
        vec_env_cls = SubprocVecEnv

    env = make_vec_env(env_id, wrapper_class=env_wrapper, vec_env_cls=vec_env_cls)

    if stats_dir is not None:
        if hyperparams["normalize"]:
            norm_fpath = pjoin(stats_dir, "vecnormalize.pkl")

            if os.path.exists(norm_fpath):
                env = VecNormalize.load(norm_fpath, env)
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {norm_fpath} not found")

    max_episode_steps = gym.make(env_id).spec.max_episode_steps
    Spec = namedtuple("Spec", ["max_episode_steps"])
    env.spec = Spec(max_episode_steps=max_episode_steps)

    return env

def init_adv(adv_env_id, disable_adv=False, env_kwargs=None):
    bridge = Bridge()
    default_env_kwargs = {'renders' if 'CartPole' in adv_env_id else 'render': render}
    if env_kwargs is None:
        env_kwargs = {}
    env_kwargs.update(default_env_kwargs)
    env = make_vec_env(adv_env_id, env_kwargs=env_kwargs, seed=seed)
    env = VecNormalize(env)
    prot_agent = PPO('MlpPolicy', env, verbose=verbose, seed=seed, n_steps=ts,
                     bridge=bridge, is_protagonist=True)
    if disable_adv:
        bridge.link_agents(prot_agent, None)
    else:
        adv_agent = PPO('MlpPolicy', env, verbose=verbose, seed=seed, n_steps=ts,
                        bridge=bridge, is_protagonist=False)
        bridge.link_agents(prot_agent, adv_agent)
    return prot_agent, env

def test_deprecation():
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    with pytest.warns(None) as record:
        assert np.allclose(venv.ret, venv.returns)
    # Deprecation warning when using .ret
    assert len(record) == 1

def __init__(
    self,
    args,
    env_id="HopperBulletEnv-v0",
    features_extractor_class=MultiExtractor,
    features_extractor_kwargs={},
) -> None:
    """Init with parameters to control the training process."""
    print("Starting MultiModuleExp")
    self.args = args
    self.env_id = env_id
    self.use_cuda = torch.cuda.is_available() and args.cuda
    self.device = torch.device("cuda" if self.use_cuda else "cpu")

    # Make Environments
    print("Making train environments...")
    venv = DummyVecEnv([
        make_env(env_id=env_id, rank=i, seed=args.seed, render=args.render)
        for i in range(args.num_envs)
    ])
    self.eval_env = DummyVecEnv(
        [make_env(env_id=env_id, rank=99, seed=args.seed, render=False)])

    if args.vec_normalize:
        venv = VecNormalize(venv)
        self.eval_env = VecNormalize(self.eval_env, norm_reward=False)

    features_extractor_kwargs["num_envs"] = args.num_envs
    policy_kwargs = {
        "features_extractor_class": features_extractor_class,
        "features_extractor_kwargs": features_extractor_kwargs,
        # Note: net_arch must be specified, because sb3 won't set the default network
        # architecture if we change the features_extractor.
        # pi: Actor (policy-function); vf: Critic (value-function)
        "net_arch": [dict(pi=[64, 64], vf=[64, 64])],
    }

    self.model = CustomizedPPO(
        CustomizedPolicy,
        venv,
        n_steps=args.rollout_n_steps,
        tensorboard_log="tb",
        policy_kwargs=policy_kwargs,
        device=self.device,
        verbose=1,
        rnn_move_window_step=args.rnn_move_window_step,
        rnn_sequence_length=args.rnn_sequence_length,
        use_sde=args.sde,
        n_epochs=args.n_epochs)

def make_vec_env(self, dataset, env_args):
    env_args["df"] = dataset
    env = make_vec_env('crypt-v001', env_kwargs=env_args)
    env = VecCheckNan(env, raise_exception=True)
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.0, gamma=0.95)
    return env

def test_vec_monitor_warn():
    env = DummyVecEnv([lambda: Monitor(gym.make("CartPole-v1"))])
    # We should warn the user when the env is already wrapped with a Monitor wrapper
    with pytest.warns(UserWarning):
        VecMonitor(env)

    with pytest.warns(UserWarning):
        VecMonitor(VecNormalize(env))

def test_vec_env(tmp_path, make_env):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True,
                             clip_obs=clip_obs, clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        if isinstance(obs, dict):
            for key in obs.keys():
                assert np.max(np.abs(obs[key])) <= clip_obs
        else:
            assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = tmp_path / "vec_normalize"
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)

def make_dummy_env(self, dataset, env_args):
    env = gym.make("crypt-v001", df=dataset, **env_args)
    check_env(env)
    env = DummyVecEnv([lambda: env])
    env = VecCheckNan(env, raise_exception=True)
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.0, gamma=0.95)
    return env

def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        env = SubprocVecEnv(
            [make_env(env_id, 0, args.seed, wrapper_class=env_wrapper, log_dir=log_dir, env_kwargs=env_kwargs)]
        )
    else:
        # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        # On most env, SubprocVecEnv does not help and is quite memory hungry
        env = SubprocVecEnv(
            [
                make_env(env_id, i, args.seed, log_dir=log_dir, env_kwargs=env_kwargs, wrapper_class=env_wrapper)
                for i in range(n_envs)
            ]
        )
    if normalize:
        # Copy to avoid changing default values by reference
        local_normalize_kwargs = normalize_kwargs.copy()
        # Do not normalize reward for env used for evaluation
        if eval_env:
            if len(local_normalize_kwargs) > 0:
                local_normalize_kwargs["norm_reward"] = False
            else:
                local_normalize_kwargs = {"norm_reward": False}

        if args.verbose > 0:
            if len(local_normalize_kwargs) > 0:
                print(f"Normalization activated: {local_normalize_kwargs}")
            else:
                print("Normalizing input and reward")
        env = VecNormalize(env, **local_normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get("frame_stack", False):
        n_stack = hyperparams["frame_stack"]
        env = VecFrameStack(env, n_stack)
        print(f"Stacking {n_stack} frames")

    if is_image_space(env.observation_space):
        if args.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)
    return env

def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment in a parallel-processing-friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()

def run_environment(
        algorithm: RLAlgorithm = typer.Option(...),
        agent_type: SingleOrMultiAgent = SingleOrMultiAgent.single_agent,
        agent_parameters_path: Optional[Path] = None,
        random_agent: bool = False,
        seed: Optional[int] = None,
        environment_port: Optional[int] = None,
        normalize: bool = False,
        n_envs: Optional[int] = None):
    """Run the reacher environment and visualize the actions of the agents.

    Args:
        algorithm: the RL algorithm the trained agent was created with
        agent_type: choice between single and multi agent environments
        agent_parameters_path: an optional path to load the agent parameters from
        random_agent: if true, agent(s) use a random policy
        seed: seed for the environment; if not set, it will be picked randomly
        environment_port: the port used from python to communicate with the C# environment
            backend. By using different values, one can run multiple environments in parallel.
        normalize: if true, load the saved VecNormalize statistics from next to the agent parameters
        n_envs: number of environments (and hence agents)
    """
    env = create_environment(
        agent_type=agent_type,
        normalize=False,
        n_envs=n_envs,
        env_seed=seed,
        environment_port=environment_port,
        training_mode=False,
        no_graphics=False)
    if normalize:
        env = VecNormalize.load(
            str(agent_parameters_path.parent / 'vecnormalize.pkl'), env)

    action_size = env.action_space.shape[0]

    if random_agent:
        agent = RandomAgent(number_of_agents=n_envs, action_size=action_size)
    else:
        agent = TrainedAgent(algorithm=algorithm,
                             parameters_path=str(agent_parameters_path))

    score = 0
    state = env.reset()
    while True:
        actions = agent.act(state)
        state, reward, done, _ = env.step(actions)
        score += reward
        time.sleep(0.005)
        if np.any(done):
            break

    if agent_type == SingleOrMultiAgent.single_agent:
        print(f'Total score this episode: {score}')
    else:
        print(f'Average total score this episode: {np.array(score).mean()}')

    env.close()

def atari_env(num_envs=1):
    def env_fn():
        env = gym.make("SpaceInvadersNoFrameskip-v4")
        env = AtariWrapper(env)
        return env

    env = DummyVecEnv([env_fn] * num_envs)
    env = VecFrameStack(env, 4)
    env = VecTransposeImage(env)
    env = VecNormalize(env)
    return env

def _make_warmstart_dict_env():
    """Warm-start VecNormalize by stepping through BitFlippingEnv"""
    venv = DummyVecEnv([make_dict_env])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv