def main(do_render: bool, seed: int, as_gdads: bool, name: str, do_train: bool):
    drop_abs_position = True
    conf: Conf = CONFS[name]

    dict_env = get_env(name=name, drop_abs_position=drop_abs_position, is_training=True)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env)
    else:
        flat_env = flatten_env(dict_env, drop_abs_position)
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    dict_env = get_env(name=name, drop_abs_position=drop_abs_position, is_training=False)
    if as_gdads:
        use_slider = False
        if use_slider:
            eval_env = SliderWrapper(env=dict_env)
        else:
            eval_env = GDADSEvalWrapper(dict_env, sw=BestSkillProvider(flat_env))
    else:
        eval_env = flatten_env(dict_env=dict_env, drop_abs_position=drop_abs_position)

    filename = f"modelsCommandSkills/{name}/asGDADS{as_gdads}/resamplingFalse_goalSpaceTrue-seed-{seed}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename + ".zip", env=flat_env)
        print(f"loaded model {filename}")
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=filename, buffer_size=conf.buffer_size,
                  batch_size=conf.batch_size, gamma=gamma(conf.ep_len),
                  learning_starts=100 * conf.ep_len,
                  policy_kwargs=dict(log_std_init=-3, net_arch=[conf.layer_size] * 2),
                  seed=seed, device="cuda", train_freq=4)
    if do_train:
        train(model=sac, conf=conf, save_fname=filename, eval_env=eval_env)
    if do_render:
        show(model=sac, env=eval_env, conf=conf)
    do_eval = not do_train and not do_render
    if do_eval:
        results = ant_grid_evaluation(model=sac, env=eval_env, episode_len=conf.ep_len)
        dump_ant_grid_evaluation(results)
def wrap(env):
    # Normalise observations by dividing by the space's upper bound.
    wrapper_observation = NormalizeObservationSpace(
        env, lambda o: o / env.unwrapped.observation_space.high)
    # Scale rewards (a factor of 1.0, i.e. the identity, here).
    wrapper_reward = TransformReward(wrapper_observation, lambda r: 1.e0 * r)
    return wrapper_reward
def atari_wrapper(env):
    # This is substantially the same setup as in (Mnih et al., 2016; 2015);
    # the only difference is that in the pre-processing stage
    # we retain all colour channels.
    env = AtariPreprocessing(env, grayscale_obs=False, scale_obs=True)
    env = ReturnWrapper(env)
    env = TransformReward(env, lambda r: np.sign(r))
    return env
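A minimal usage sketch for atari_wrapper above (assumptions: 'PongNoFrameskip-v4' as the env id, and that ReturnWrapper is importable from the surrounding codebase; AtariPreprocessing requires a NoFrameskip variant):

import gym

# Hypothetical usage; the env id is an assumption.
env = atari_wrapper(gym.make('PongNoFrameskip-v4'))
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
assert reward in (-1.0, 0.0, 1.0)  # np.sign reduced the reward to its sign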
def snake_wrapper(env, time_limit=200, default_reward=0, stack_length=3):
    env = TransformReward(env, reward_wrapper_func(default_reward))
    env = TimeLimit(env, time_limit)
    env = SnakeStack(env, stack_length)
    return env
def make_env(env_name):
    if env_name == 'fourrooms':
        return Fourrooms(), False

    env = gym.make(env_name)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = AtariPreprocessing(env, grayscale_obs=True, scale_obs=True,
                                 terminal_on_life_loss=True)
        env = TransformReward(env, lambda r: np.clip(r, -1, 1))
        env = FrameStack(env, 4)
    return env, is_atari
def make_env(env_id, seed, reward_noise_scale=None):
    try:
        env = gym.make(env_id)
    except gym.error.UnregisteredEnv:
        register(id=env_id, entry_point=ALL_V1_ENVIRONMENTS[env_id],
                 max_episode_steps=150)
        print("Registered env", env_id)
        env = gym.make(env_id)
    assert_env(env)
    env.seed(seed)
    setattr(env, 'is_metaworld', env_id in ALL_V1_ENVIRONMENTS.keys())
    if reward_noise_scale:
        from gym.wrappers import TransformReward
        # gym wrappers do not forward underscore-prefixed attributes,
        # so back up _max_episode_steps and restore it after wrapping.
        max_step_bk = env._max_episode_steps
        env = TransformReward(env, lambda r: r + reward_noise_scale * randn())
        setattr(env, "_max_episode_steps", max_step_bk)
    return env
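A hedged usage sketch for this make_env (assumptions: 'reach-v1' as a Meta-World V1 task id, the 0.1 noise scale, and randn being numpy.random.randn as imported in the surrounding module):

# Hypothetical call; the task id and noise scale are assumptions.
env = make_env('reach-v1', seed=0, reward_noise_scale=0.1)
env.reset()
_, noisy_reward, _, _ = env.step(env.action_space.sample())
# the restored attribute is still reachable despite the extra wrapper
print(env._max_episode_steps)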
def main():
    as_gdads = True
    name = "pointmass"
    drop_abs_position = True

    dads_env_fn = envs_fns[name]
    conf: Conf = CONFS[name]

    dict_env = as_dict_env(dads_env_fn())
    dict_env = TimeLimit(dict_env, max_episode_steps=conf.ep_len)
    if drop_abs_position:
        dict_env = DropGoalEnvsAbsoluteLocation(dict_env)

    if as_gdads:
        flat_env = SkillWrapper(env=dict_env, skill_reset_steps=conf.ep_len // 2)
    else:
        flat_obs_content = ["observation", "desired_goal", "achieved_goal"]
        if drop_abs_position:
            flat_obs_content.remove("achieved_goal")  # Because always 0 vector
        flat_env = FlattenObservation(FilterObservation(dict_env, filter_keys=flat_obs_content))
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    filename = f"modelsCommandSkills/{name}-gdads{as_gdads}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename, env=flat_env)
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=f"{filename}-tb", buffer_size=10000)
        train(model=sac, conf=conf, save_fname=filename)
        if as_gdads:
            flat_env.save(filename)

    if as_gdads:
        flat_env.set_sac(sac)
        eval_dict_env(dict_env=dict_env, model=flat_env, ep_len=conf.ep_len)
    show(model=sac, env=flat_env, conf=conf)
def __init__(self, game, stack=False, sticky_action=False, clip_reward=False,
             terminal_on_life_loss=False, **kwargs):
    # v0 envs use sticky actions (repeat_action_probability=0.25); v4 do not
    env_id = '{}NoFrameskip-v{}'.format(game, 0 if sticky_action else 4)
    # use official atari wrapper
    env = AtariPreprocessing(gym.make(env_id),
                             terminal_on_life_loss=terminal_on_life_loss)
    if stack:
        env = FrameStack(env, num_stack=4)
    if clip_reward:
        env = TransformReward(env, lambda r: np.clip(r, -1.0, 1.0))
    self._env = env
    self.observation_space = env.observation_space
    self.action_space = env.action_space
def test_transform_reward(env_id):
    # use case #1: scale
    scales = [0.1, 200]
    for scale in scales:
        env = gym.make(env_id)
        wrapped_env = TransformReward(gym.make(env_id), lambda r: scale * r)
        action = env.action_space.sample()
        env.reset(seed=0)
        wrapped_env.reset(seed=0)
        _, reward, _, _ = env.step(action)
        _, wrapped_reward, _, _ = wrapped_env.step(action)
        assert wrapped_reward == scale * reward
    del env, wrapped_env

    # use case #2: clip
    min_r = -0.0005
    max_r = 0.0002
    env = gym.make(env_id)
    wrapped_env = TransformReward(gym.make(env_id), lambda r: np.clip(r, min_r, max_r))
    action = env.action_space.sample()
    env.reset(seed=0)
    wrapped_env.reset(seed=0)
    _, reward, _, _ = env.step(action)
    _, wrapped_reward, _, _ = wrapped_env.step(action)
    assert abs(wrapped_reward) < abs(reward)
    assert wrapped_reward == min_r or wrapped_reward == max_r
    del env, wrapped_env

    # use case #3: sign
    env = gym.make(env_id)
    wrapped_env = TransformReward(gym.make(env_id), lambda r: np.sign(r))
    env.reset(seed=0)
    wrapped_env.reset(seed=0)
    for _ in range(1000):
        action = env.action_space.sample()
        _, wrapped_reward, done, _ = wrapped_env.step(action)
        assert wrapped_reward in [-1.0, 0.0, 1.0]
        if done:
            break
    del env, wrapped_env
def test_sac():
    args, log_path, writer = get_args()
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = ShmPipeVecEnv([
        lambda: TransformReward(BipedalWrapper(gym.make(args.task)),
                                lambda reward: 5 * reward)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = ShmPipeVecEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed + 1)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = DQCritic(args.layer_num, args.state_shape, args.action_shape,
                      args.device).to(args.device)
    critic_target = DQCritic(args.layer_num, args.state_shape, args.action_shape,
                             args.device).to(args.device)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor, actor_optim, critic, critic_optim, critic_target,
                       env.action_space, args.device, args.tau, args.gamma,
                       args.alpha, reward_normalization=args.rew_norm,
                       ignore_done=False)
    if args.mode == 'test':
        policy.load_state_dict(
            torch.load("{}/{}/{}/policy.pth".format(args.logdir, args.task,
                                                    args.comment),
                       map_location=args.device))
        env = gym.make(args.task)
        collector = Collector(policy, env
                              # Monitor(env, 'video', force=True)
                              )
        result = collector.collect(n_episode=10, render=args.render)
        print(f'Final reward: {result["ep/reward"]}, length: {result["ep/len"]}')
        collector.close()
        exit()
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    train_collector.collect(10000, sampling=True)
    test_collector = Collector(policy, test_envs)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold + 5

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_episode,
                               args.batch_size, stop_fn=stop_fn,
                               save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    pprint.pprint(result)
def __init__(self, env: gym.Env, is_eval: bool = False):
    env = AtariPreprocessing(env, terminal_on_life_loss=not is_eval)
    if not is_eval:
        env = TransformReward(env, lambda r: np.clip(r, -1.0, 1.0))
    super().__init__(ChannelFirst(env))
def basic_wrapper(env):
    """Use this as a wrapper only for cartpole etc."""
    env = ReturnWrapper(env)
    env = TransformReward(env, lambda r: np.clip(r, -1, 1))
    return env
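A minimal sketch of applying basic_wrapper, assuming ReturnWrapper comes from the surrounding codebase; 'CartPole-v1' follows the docstring's suggestion:

import gym

env = basic_wrapper(gym.make('CartPole-v1'))
env.reset()
_, reward, _, _ = env.step(env.action_space.sample())
assert -1 <= reward <= 1  # TransformReward clipped the reward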
import gym
import numpy as np

from gym.wrappers import AtariPreprocessing, TransformReward

from d3rlpy.algos import DoubleDQN
from d3rlpy.models.optimizers import AdamFactory
from d3rlpy.online.buffers import ReplayBuffer
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy
from d3rlpy.envs import ChannelFirst

# get wrapped atari environment
env = ChannelFirst(
    TransformReward(
        AtariPreprocessing(gym.make('BreakoutNoFrameskip-v4'),
                           terminal_on_life_loss=True),
        lambda r: np.clip(r, -1.0, 1.0)))

eval_env = ChannelFirst(AtariPreprocessing(gym.make('BreakoutNoFrameskip-v4')))

# setup algorithm
dqn = DoubleDQN(batch_size=32,
                learning_rate=2.5e-4,
                optim_factory=AdamFactory(eps=1e-2 / 32),
                target_update_interval=10000,
                q_func_factory='mean',
                scaler='pixel',
                n_frames=4,
                use_gpu=True)

# replay buffer for experience replay
buffer = ReplayBuffer(maxlen=1000000, env=env)
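A possible continuation of this snippet (an assumption, not part of the original): wire up the imported LinearDecayEpsilonGreedy explorer and train online with fit_online. The schedule values and the exact fit_online signature are assumptions and may vary across d3rlpy versions.

# epsilon-greedy exploration, linearly decayed over the first 1M steps
explorer = LinearDecayEpsilonGreedy(start_epsilon=1.0,
                                    end_epsilon=0.1,
                                    duration=1000000)

# online training; reward clipping only affects the training env,
# so eval_env reports unclipped scores
dqn.fit_online(env,
               buffer,
               explorer,
               eval_env=eval_env,
               n_steps=1000000)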
def test_sac_with_il(args=get_args()):
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = SubprocVectorEnv([
        lambda: TransformReward(BipedalWrapper(gym.make(args.task)),
                                lambda reward: 5 * reward)
        for _ in range(args.training_num)
    ])
    # test_envs = gym.make(args.task)
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, args.device).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic1 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(args.layer_num, args.state_shape, args.action_shape,
                     args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)
    policy = SACPolicy(actor, actor_optim, critic1, critic1_optim,
                       critic2, critic2_optim, env.action_space,
                       args.tau, args.gamma, args.alpha,
                       reward_normalization=args.rew_norm, ignore_done=True)
    # collector
    train_collector = Collector(policy, train_envs,
                                ReplayBuffer(args.buffer_size))
    train_collector.collect(10000, sampling=True)
    test_collector = Collector(policy, test_envs)
    # train_collector.collect(n_step=args.buffer_size)
    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, stop_fn=stop_fn,
                               save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    # test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["ep/reward"]}, length: {result["ep/len"]}')
        collector.close()

    # here we define an imitation collector with a trivial policy
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -300  # lower the goal
    net = Actor(1, args.state_shape, args.action_shape,
                args.max_action, args.device).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
    il_policy = ImitationPolicy(net, optim, mode='continuous')
    il_test_collector = Collector(il_policy, test_envs)
    train_collector.reset()
    result = offpolicy_trainer(il_policy, train_collector, il_test_collector,
                               args.epoch, args.step_per_epoch,
                               args.collect_per_step, args.test_num,
                               args.batch_size, stop_fn=stop_fn,
                               save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    il_test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(il_policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["ep/reward"]}, length: {result["ep/len"]}')
        collector.close()