def test_ppo(args=get_args()): env = create_atari_environment(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space().shape or env.action_space().n # train_envs = gym.make(args.task) train_envs = SubprocVectorEnv([ lambda: create_atari_environment(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv([ lambda: create_atari_environment(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) optim = torch.optim.Adam(list( actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy( actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=None) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size), preprocess_fn=preprocess_fn) test_collector = Collector(policy, test_envs, preprocess_fn=preprocess_fn) # log writer = SummaryWriter(args.logdir + '/' + 'ppo') def stop_fn(x): if env.env.spec.reward_threshold: return x >= env.spec.reward_threshold else: return False # trainer result = onpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = create_atari_environment(args.task) collector = Collector(policy, env, preprocess_fn=preprocess_fn) result = collector.collect(n_step=2000, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_a2c_with_il(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # you can also use tianshou.env.SubprocVectorEnv # train_envs = gym.make(args.task) train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = A2CPolicy(actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda, vf_coef=args.vf_coef, ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm, reward_normalization=args.rew_norm) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'a2c') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') policy.eval() # here we define an imitation collector with a trivial policy if args.task == 'CartPole-v0': env.spec.reward_threshold = 190 # lower the goal net = Net(1, args.state_shape, device=args.device) net = Actor(net, args.action_shape).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.il_lr) il_policy = ImitationPolicy(net, optim, mode='discrete') il_test_collector = Collector( il_policy, DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)])) train_collector.reset() result = offpolicy_trainer(il_policy, train_collector, il_test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) il_policy.eval() collector = Collector(il_policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def test_discrete_bcq(args=get_args()): # envs env = make_atari_env(args) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # should be N_FRAMES x H x W print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) # make environments test_envs = SubprocVectorEnv( [lambda: make_atari_env_watch(args) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) test_envs.seed(args.seed) # model feature_net = DQN(*args.state_shape, args.action_shape, device=args.device, features_only=True).to(args.device) policy_net = Actor(feature_net, args.action_shape, device=args.device, hidden_sizes=args.hidden_sizes, softmax_output=False).to(args.device) imitation_net = Actor(feature_net, args.action_shape, device=args.device, hidden_sizes=args.hidden_sizes, softmax_output=False).to(args.device) optim = torch.optim.Adam(list(policy_net.parameters()) + list(imitation_net.parameters()), lr=args.lr) # define policy policy = DiscreteBCQPolicy(policy_net, imitation_net, optim, args.gamma, args.n_step, args.target_update_freq, args.eps_test, args.unlikely_action_threshold, args.imitation_logits_penalty) # load a previous policy if args.resume_path: policy.load_state_dict( torch.load(args.resume_path, map_location=args.device)) print("Loaded agent from: ", args.resume_path) # buffer assert os.path.exists(args.load_buffer_name), \ "Please run atari_dqn.py first to get expert's data buffer." if args.load_buffer_name.endswith('.pkl'): buffer = pickle.load(open(args.load_buffer_name, "rb")) elif args.load_buffer_name.endswith('.hdf5'): buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name) else: print(f"Unknown buffer format: {args.load_buffer_name}") exit(0) # collector test_collector = Collector(policy, test_envs, exploration_noise=True) # log log_path = os.path.join( args.logdir, args.task, 'bcq', f'seed_{args.seed}_{datetime.datetime.now().strftime("%m%d-%H%M%S")}') writer = SummaryWriter(log_path) writer.add_text("args", str(args)) logger = BasicLogger(writer, update_interval=args.log_interval) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return False # watch agent's performance def watch(): print("Setup test envs ...") policy.eval() policy.set_eps(args.eps_test) test_envs.seed(args.seed) print("Testing agent ...") test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) pprint.pprint(result) rew = result["rews"].mean() print(f'Mean reward (over {result["n/ep"]} episodes): {rew}') if args.watch: watch() exit(0) result = offline_trainer(policy, buffer, test_collector, args.epoch, args.update_per_epoch, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, logger=logger) pprint.pprint(result) watch()
def test_discrete_sac(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n train_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) test_envs = SubprocVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = Actor(net, args.action_shape, softmax_output=False, device=args.device).to(args.device) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) net_c1 = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) critic1 = Critic(net_c1, last_size=args.action_shape, device=args.device).to(args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) net_c2 = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) critic2 = Critic(net_c2, last_size=args.action_shape, device=args.device).to(args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) # better not to use auto alpha in CartPole if args.auto_alpha: target_entropy = 0.98 * np.log(np.prod(args.action_shape)) log_alpha = torch.zeros(1, requires_grad=True, device=args.device) alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr) args.alpha = (target_entropy, log_alpha, alpha_optim) policy = DiscreteSACPolicy(actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, args.tau, args.gamma, args.alpha, reward_normalization=args.rew_norm, ignore_done=args.ignore_done) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # train_collector.collect(n_step=args.buffer_size) # log log_path = os.path.join(args.logdir, args.task, 'discrete_sac') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= env.spec.reward_threshold # trainer result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer, test_in_train=False) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}')
def test_ppo(args=get_args()): env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n if args.reward_threshold is None: default_reward_threshold = {"CartPole-v0": 195} args.reward_threshold = default_reward_threshold.get( args.task, env.spec.reward_threshold) # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = Actor(net, args.action_shape, device=args.device).to(args.device) critic = Critic(net, device=args.device).to(args.device) actor_critic = ActorCritic(actor, critic) # orthogonal initialization for m in actor_critic.modules(): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) torch.nn.init.zeros_(m.bias) optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy(actor, critic, optim, dist, discount_factor=args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, gae_lambda=args.gae_lambda, reward_normalization=args.rew_norm, dual_clip=args.dual_clip, value_clip=args.value_clip, action_space=env.action_space, deterministic_eval=True, advantage_normalization=args.norm_adv, recompute_advantage=args.recompute_adv) feature_dim = args.hidden_sizes[-1] feature_net = MLP(np.prod(args.state_shape), output_dim=feature_dim, hidden_sizes=args.hidden_sizes[:-1], device=args.device) action_dim = np.prod(args.action_shape) icm_net = IntrinsicCuriosityModule(feature_net, feature_dim, action_dim, hidden_sizes=args.hidden_sizes[-1:], device=args.device).to(args.device) icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.lr) policy = ICMPolicy(policy, icm_net, icm_optim, args.lr_scale, args.reward_scale, args.forward_loss_weight) # collector train_collector = Collector( policy, train_envs, VectorReplayBuffer(args.buffer_size, len(train_envs))) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'ppo_icm') writer = SummaryWriter(log_path) logger = TensorboardLogger(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= args.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, step_per_collect=args.step_per_collect, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def test_ppo(args=get_args()): env, train_envs, test_envs = make_atari_env( args.task, args.seed, args.training_num, args.test_num, scale=args.scale_obs, frame_stack=args.frames_stack, ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # should be N_FRAMES x H x W print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) # define model net = DQN(*args.state_shape, args.action_shape, device=args.device, features_only=True, output_dim=args.hidden_size) actor = Actor(net, args.action_shape, device=args.device, softmax_output=False) critic = Critic(net, device=args.device) optim = torch.optim.Adam(ActorCritic(actor, critic).parameters(), lr=args.lr) lr_scheduler = None if args.lr_decay: # decay learning rate to 0 linearly max_update_num = np.ceil( args.step_per_epoch / args.step_per_collect) * args.epoch lr_scheduler = LambdaLR( optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num) # define policy def dist(p): return torch.distributions.Categorical(logits=p) policy = PPOPolicy( actor, critic, optim, dist, discount_factor=args.gamma, gae_lambda=args.gae_lambda, max_grad_norm=args.max_grad_norm, vf_coef=args.vf_coef, ent_coef=args.ent_coef, reward_normalization=args.rew_norm, action_scaling=False, lr_scheduler=lr_scheduler, action_space=env.action_space, eps_clip=args.eps_clip, value_clip=args.value_clip, dual_clip=args.dual_clip, advantage_normalization=args.norm_adv, recompute_advantage=args.recompute_adv, ).to(args.device) if args.icm_lr_scale > 0: feature_net = DQN(*args.state_shape, args.action_shape, args.device, features_only=True) action_dim = np.prod(args.action_shape) feature_dim = feature_net.output_dim icm_net = IntrinsicCuriosityModule( feature_net.net, feature_dim, action_dim, hidden_sizes=args.hidden_sizes, device=args.device, ) icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.lr) policy = ICMPolicy(policy, icm_net, icm_optim, args.icm_lr_scale, args.icm_reward_scale, args.icm_forward_loss_weight).to(args.device) # load a previous policy if args.resume_path: policy.load_state_dict( torch.load(args.resume_path, map_location=args.device)) print("Loaded agent from: ", args.resume_path) # replay buffer: `save_last_obs` and `stack_num` can be removed together # when you have enough RAM buffer = VectorReplayBuffer( args.buffer_size, buffer_num=len(train_envs), ignore_obs_next=True, save_only_last_obs=True, stack_num=args.frames_stack, ) # collector train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs, exploration_noise=True) # log now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") args.algo_name = "ppo_icm" if args.icm_lr_scale > 0 else "ppo" log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) log_path = os.path.join(args.logdir, log_name) # logger if args.logger == "wandb": logger = WandbLogger( save_interval=1, name=log_name.replace(os.path.sep, "__"), run_id=args.resume_id, config=args, project=args.wandb_project, ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) if args.logger == "tensorboard": logger = TensorboardLogger(writer) else: # wandb logger.load(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) def stop_fn(mean_rewards): if env.spec.reward_threshold: return mean_rewards >= env.spec.reward_threshold elif "Pong" in args.task: return mean_rewards >= 20 else: return False def save_checkpoint_fn(epoch, env_step, gradient_step): # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html ckpt_path = os.path.join(log_path, "checkpoint.pth") torch.save({"model": policy.state_dict()}, ckpt_path) return ckpt_path # watch agent's performance def watch(): print("Setup test envs ...") policy.eval() test_envs.seed(args.seed) if args.save_buffer_name: print(f"Generate buffer with size {args.buffer_size}") buffer = VectorReplayBuffer( args.buffer_size, buffer_num=len(test_envs), ignore_obs_next=True, save_only_last_obs=True, stack_num=args.frames_stack, ) collector = Collector(policy, test_envs, buffer, exploration_noise=True) result = collector.collect(n_step=args.buffer_size) print(f"Save buffer into {args.save_buffer_name}") # Unfortunately, pickle will cause oom with 1M buffer size buffer.save_hdf5(args.save_buffer_name) else: print("Testing agent ...") test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) rew = result["rews"].mean() print(f"Mean reward (over {result['n/ep']} episodes): {rew}") if args.watch: watch() exit(0) # test train_collector and start filling replay buffer train_collector.collect(n_step=args.batch_size * args.training_num) # trainer result = onpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, step_per_collect=args.step_per_collect, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger, test_in_train=False, resume_from_log=args.resume_id is not None, save_checkpoint_fn=save_checkpoint_fn, ) pprint.pprint(result) watch()
state_shape = train_envs.observation_space[ 0].shape or train_envs.observation_space[0].n action_shape = train_envs.action_space[0].shape or train_envs.action_space[0].n # Actor, Critic custom_kwargs = {'output_dim': 1024, 'kernel_size': 8, 'dropout': 0.25} hidden_sizes = [1024, 1024, 1024] net = Net( state_shape, hidden_sizes=hidden_sizes, device=device, custom_model=CNNModel, custom_model_kwargs=custom_kwargs, ) actor = Actor(net, action_shape, device=device).to(device) critic = Critic(net, device=device).to(device) optim = torch.optim.Adam # dummy optim # Setup policy and collectors dist = torch.distributions.Categorical policy = ts.policy.PPOPolicy( actor, critic, optim, dist, max_grad_norm=0.5, use_mixed=True, ) policy.optim = torch.optim.Adam(policy.parameters(), lr=lr)
def test_discrete_crr(args=get_args()): # envs env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n if args.reward_threshold is None: default_reward_threshold = {"CartPole-v0": 180} args.reward_threshold = default_reward_threshold.get( args.task, env.spec.reward_threshold ) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)] ) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.state_shape, args.hidden_sizes[0], device=args.device) actor = Actor( net, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device, softmax_output=False ) critic = Critic( net, hidden_sizes=args.hidden_sizes, last_size=np.prod(args.action_shape), device=args.device ) actor_critic = ActorCritic(actor, critic) optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr) policy = DiscreteCRRPolicy( actor, critic, optim, args.gamma, target_update_freq=args.target_update_freq, ).to(args.device) # buffer if os.path.exists(args.load_buffer_name) and os.path.isfile(args.load_buffer_name): if args.load_buffer_name.endswith(".hdf5"): buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name) else: buffer = pickle.load(open(args.load_buffer_name, "rb")) else: buffer = gather_data() # collector test_collector = Collector(policy, test_envs, exploration_noise=True) log_path = os.path.join(args.logdir, args.task, 'discrete_crr') writer = SummaryWriter(log_path) logger = TensorboardLogger(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= args.reward_threshold result = offline_trainer( policy, buffer, test_collector, args.epoch, args.update_per_epoch, args.test_num, args.batch_size, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger ) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def test_a2c_with_il(args=get_args()): # if you want to use python vector env, please refer to other test scripts train_envs = env = envpool.make_gym(args.task, num_envs=args.training_num, seed=args.seed) test_envs = envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n if args.reward_threshold is None: default_reward_threshold = {"CartPole-v0": 195} args.reward_threshold = default_reward_threshold.get( args.task, env.spec.reward_threshold) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) # model net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) actor = Actor(net, args.action_shape, device=args.device).to(args.device) critic = Critic(net, device=args.device).to(args.device) optim = torch.optim.Adam(ActorCritic(actor, critic).parameters(), lr=args.lr) dist = torch.distributions.Categorical policy = A2CPolicy(actor, critic, optim, dist, discount_factor=args.gamma, gae_lambda=args.gae_lambda, vf_coef=args.vf_coef, ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm, reward_normalization=args.rew_norm, action_space=env.action_space) # collector train_collector = Collector( policy, train_envs, VectorReplayBuffer(args.buffer_size, len(train_envs))) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'a2c') writer = SummaryWriter(log_path) logger = TensorboardLogger(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= args.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, episode_per_collect=args.episode_per_collect, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}") policy.eval() # here we define an imitation collector with a trivial policy # if args.task == 'CartPole-v0': # env.spec.reward_threshold = 190 # lower the goal net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device) net = Actor(net, args.action_shape, device=args.device).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.il_lr) il_policy = ImitationPolicy(net, optim, action_space=env.action_space) il_test_collector = Collector( il_policy, envpool.make_gym(args.task, num_envs=args.test_num, seed=args.seed), ) train_collector.reset() result = offpolicy_trainer(il_policy, train_collector, il_test_collector, args.epoch, args.il_step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) il_policy.eval() collector = Collector(il_policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def test_ppo(args=get_args()): args.cfg_path = f"maps/{args.task}.cfg" args.wad_path = f"maps/{args.task}.wad" args.res = (args.skip_num, 84, 84) env = Env(args.cfg_path, args.frames_stack, args.res) args.state_shape = args.res args.action_shape = env.action_space.shape or env.action_space.n # should be N_FRAMES x H x W print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) # make environments train_envs = ShmemVectorEnv([ lambda: Env(args.cfg_path, args.frames_stack, args.res) for _ in range(args.training_num) ]) test_envs = ShmemVectorEnv([ lambda: Env(args.cfg_path, args.frames_stack, args.res, args.save_lmp) for _ in range(min(os.cpu_count() - 1, args.test_num)) ]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # define model net = DQN(*args.state_shape, args.action_shape, device=args.device, features_only=True, output_dim=args.hidden_size) actor = Actor(net, args.action_shape, device=args.device, softmax_output=False) critic = Critic(net, device=args.device) optim = torch.optim.Adam(ActorCritic(actor, critic).parameters(), lr=args.lr) lr_scheduler = None if args.lr_decay: # decay learning rate to 0 linearly max_update_num = np.ceil( args.step_per_epoch / args.step_per_collect) * args.epoch lr_scheduler = LambdaLR( optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num) # define policy def dist(p): return torch.distributions.Categorical(logits=p) policy = PPOPolicy(actor, critic, optim, dist, discount_factor=args.gamma, gae_lambda=args.gae_lambda, max_grad_norm=args.max_grad_norm, vf_coef=args.vf_coef, ent_coef=args.ent_coef, reward_normalization=args.rew_norm, action_scaling=False, lr_scheduler=lr_scheduler, action_space=env.action_space, eps_clip=args.eps_clip, value_clip=args.value_clip, dual_clip=args.dual_clip, advantage_normalization=args.norm_adv, recompute_advantage=args.recompute_adv).to(args.device) if args.icm_lr_scale > 0: feature_net = DQN(*args.state_shape, args.action_shape, device=args.device, features_only=True, output_dim=args.hidden_size) action_dim = np.prod(args.action_shape) feature_dim = feature_net.output_dim icm_net = IntrinsicCuriosityModule(feature_net.net, feature_dim, action_dim, device=args.device) icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.lr) policy = ICMPolicy(policy, icm_net, icm_optim, args.icm_lr_scale, args.icm_reward_scale, args.icm_forward_loss_weight).to(args.device) # load a previous policy if args.resume_path: policy.load_state_dict( torch.load(args.resume_path, map_location=args.device)) print("Loaded agent from: ", args.resume_path) # replay buffer: `save_last_obs` and `stack_num` can be removed together # when you have enough RAM buffer = VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs), ignore_obs_next=True, save_only_last_obs=True, stack_num=args.frames_stack) # collector train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs, exploration_noise=True) # log log_name = 'ppo_icm' if args.icm_lr_scale > 0 else 'ppo' log_path = os.path.join(args.logdir, args.task, log_name) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) logger = TensorboardLogger(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): if env.spec.reward_threshold: return mean_rewards >= env.spec.reward_threshold elif 'Pong' in args.task: return mean_rewards >= 20 else: return False # watch agent's performance def watch(): print("Setup test envs ...") policy.eval() test_envs.seed(args.seed) if args.save_buffer_name: print(f"Generate buffer with size {args.buffer_size}") buffer = VectorReplayBuffer(args.buffer_size, buffer_num=len(test_envs), ignore_obs_next=True, save_only_last_obs=True, stack_num=args.frames_stack) collector = Collector(policy, test_envs, buffer, exploration_noise=True) result = collector.collect(n_step=args.buffer_size) print(f"Save buffer into {args.save_buffer_name}") # Unfortunately, pickle will cause oom with 1M buffer size buffer.save_hdf5(args.save_buffer_name) else: print("Testing agent ...") test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) rew = result["rews"].mean() lens = result["lens"].mean() * args.skip_num print(f'Mean reward (over {result["n/ep"]} episodes): {rew}') print(f'Mean length (over {result["n/ep"]} episodes): {lens}') if args.watch: watch() exit(0) # test train_collector and start filling replay buffer train_collector.collect(n_step=args.batch_size * args.training_num) # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.repeat_per_collect, args.test_num, args.batch_size, step_per_collect=args.step_per_collect, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger, test_in_train=False) pprint.pprint(result) watch()
def test_discrete_crr(args=get_args()): # envs env, _, test_envs = make_atari_env( args.task, args.seed, 1, args.test_num, scale=args.scale_obs, frame_stack=args.frames_stack, ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # should be N_FRAMES x H x W print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) # model feature_net = DQN( *args.state_shape, args.action_shape, device=args.device, features_only=True ).to(args.device) actor = Actor( feature_net, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device, softmax_output=False, ).to(args.device) critic = Critic( feature_net, hidden_sizes=args.hidden_sizes, last_size=np.prod(args.action_shape), device=args.device, ).to(args.device) actor_critic = ActorCritic(actor, critic) optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr) # define policy policy = DiscreteCRRPolicy( actor, critic, optim, args.gamma, policy_improvement_mode=args.policy_improvement_mode, ratio_upper_bound=args.ratio_upper_bound, beta=args.beta, min_q_weight=args.min_q_weight, target_update_freq=args.target_update_freq, ).to(args.device) # load a previous policy if args.resume_path: policy.load_state_dict(torch.load(args.resume_path, map_location=args.device)) print("Loaded agent from: ", args.resume_path) # buffer assert os.path.exists(args.load_buffer_name), \ "Please run atari_qrdqn.py first to get expert's data buffer." if args.load_buffer_name.endswith(".pkl"): buffer = pickle.load(open(args.load_buffer_name, "rb")) elif args.load_buffer_name.endswith(".hdf5"): buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name) else: print(f"Unknown buffer format: {args.load_buffer_name}") exit(0) # collector test_collector = Collector(policy, test_envs, exploration_noise=True) # log now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") args.algo_name = "crr" log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) log_path = os.path.join(args.logdir, log_name) # logger if args.logger == "wandb": logger = WandbLogger( save_interval=1, name=log_name.replace(os.path.sep, "__"), run_id=args.resume_id, config=args, project=args.wandb_project, ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) if args.logger == "tensorboard": logger = TensorboardLogger(writer) else: # wandb logger.load(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) def stop_fn(mean_rewards): return False # watch agent's performance def watch(): print("Setup test envs ...") policy.eval() test_envs.seed(args.seed) print("Testing agent ...") test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) pprint.pprint(result) rew = result["rews"].mean() print(f'Mean reward (over {result["n/ep"]} episodes): {rew}') if args.watch: watch() exit(0) result = offline_trainer( policy, buffer, test_collector, args.epoch, args.update_per_epoch, args.test_num, args.batch_size, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger, ) pprint.pprint(result) watch()
def test_discrete_bcq(args=get_args()): # envs env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n if args.reward_threshold is None: default_reward_threshold = {"CartPole-v0": 190} args.reward_threshold = default_reward_threshold.get( args.task, env.spec.reward_threshold) test_envs = DummyVectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.state_shape, args.hidden_sizes[0], device=args.device) policy_net = Actor(net, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device).to(args.device) imitation_net = Actor(net, args.action_shape, hidden_sizes=args.hidden_sizes, device=args.device).to(args.device) actor_critic = ActorCritic(policy_net, imitation_net) optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr) policy = DiscreteBCQPolicy( policy_net, imitation_net, optim, args.gamma, args.n_step, args.target_update_freq, args.eps_test, args.unlikely_action_threshold, args.imitation_logits_penalty, ) # buffer if os.path.exists(args.load_buffer_name) and os.path.isfile( args.load_buffer_name): if args.load_buffer_name.endswith(".hdf5"): buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name) else: buffer = pickle.load(open(args.load_buffer_name, "rb")) else: buffer = gather_data() # collector test_collector = Collector(policy, test_envs, exploration_noise=True) log_path = os.path.join(args.logdir, args.task, 'discrete_bcq') writer = SummaryWriter(log_path) logger = TensorboardLogger(writer, save_interval=args.save_interval) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(mean_rewards): return mean_rewards >= args.reward_threshold def save_checkpoint_fn(epoch, env_step, gradient_step): # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html torch.save( { 'model': policy.state_dict(), 'optim': optim.state_dict(), }, os.path.join(log_path, 'checkpoint.pth')) if args.resume: # load from existing checkpoint print(f"Loading agent under {log_path}") ckpt_path = os.path.join(log_path, 'checkpoint.pth') if os.path.exists(ckpt_path): checkpoint = torch.load(ckpt_path, map_location=args.device) policy.load_state_dict(checkpoint['model']) optim.load_state_dict(checkpoint['optim']) print("Successfully restore policy and optim.") else: print("Fail to restore policy and optim.") result = offline_trainer(policy, buffer, test_collector, args.epoch, args.update_per_epoch, args.test_num, args.batch_size, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger, resume_from_log=args.resume, save_checkpoint_fn=save_checkpoint_fn) assert stop_fn(result['best_reward']) if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) policy.eval() policy.set_eps(args.eps_test) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
def test_discrete_sac(args=get_args()): env, train_envs, test_envs = make_atari_env( args.task, args.seed, args.training_num, args.test_num, scale=args.scale_obs, frame_stack=args.frames_stack, ) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # should be N_FRAMES x H x W print("Observations shape:", args.state_shape) print("Actions shape:", args.action_shape) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) # define model net = DQN(*args.state_shape, args.action_shape, device=args.device, features_only=True, output_dim=args.hidden_size) actor = Actor(net, args.action_shape, device=args.device, softmax_output=False) actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr) critic1 = Critic(net, last_size=args.action_shape, device=args.device) critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr) critic2 = Critic(net, last_size=args.action_shape, device=args.device) critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr) # define policy if args.auto_alpha: target_entropy = 0.98 * np.log(np.prod(args.action_shape)) log_alpha = torch.zeros(1, requires_grad=True, device=args.device) alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr) args.alpha = (target_entropy, log_alpha, alpha_optim) policy = DiscreteSACPolicy( actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim, args.tau, args.gamma, args.alpha, estimation_step=args.n_step, reward_normalization=args.rew_norm, ).to(args.device) if args.icm_lr_scale > 0: feature_net = DQN(*args.state_shape, args.action_shape, args.device, features_only=True) action_dim = np.prod(args.action_shape) feature_dim = feature_net.output_dim icm_net = IntrinsicCuriosityModule( feature_net.net, feature_dim, action_dim, hidden_sizes=[args.hidden_size], device=args.device, ) icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.actor_lr) policy = ICMPolicy(policy, icm_net, icm_optim, args.icm_lr_scale, args.icm_reward_scale, args.icm_forward_loss_weight).to(args.device) # load a previous policy if args.resume_path: policy.load_state_dict( torch.load(args.resume_path, map_location=args.device)) print("Loaded agent from: ", args.resume_path) # replay buffer: `save_last_obs` and `stack_num` can be removed together # when you have enough RAM buffer = VectorReplayBuffer( args.buffer_size, buffer_num=len(train_envs), ignore_obs_next=True, save_only_last_obs=True, stack_num=args.frames_stack, ) # collector train_collector = Collector(policy, train_envs, buffer, exploration_noise=True) test_collector = Collector(policy, test_envs, exploration_noise=True) # log now = datetime.datetime.now().strftime("%y%m%d-%H%M%S") args.algo_name = "discrete_sac_icm" if args.icm_lr_scale > 0 else "discrete_sac" log_name = os.path.join(args.task, args.algo_name, str(args.seed), now) log_path = os.path.join(args.logdir, log_name) # logger if args.logger == "wandb": logger = WandbLogger( save_interval=1, name=log_name.replace(os.path.sep, "__"), run_id=args.resume_id, config=args, project=args.wandb_project, ) writer = SummaryWriter(log_path) writer.add_text("args", str(args)) if args.logger == "tensorboard": logger = TensorboardLogger(writer) else: # wandb logger.load(writer) def save_best_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth")) def stop_fn(mean_rewards): if env.spec.reward_threshold: return mean_rewards >= env.spec.reward_threshold elif "Pong" in args.task: return mean_rewards >= 20 else: return False def save_checkpoint_fn(epoch, env_step, gradient_step): # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html ckpt_path = os.path.join(log_path, "checkpoint.pth") torch.save({"model": policy.state_dict()}, ckpt_path) return ckpt_path # watch agent's performance def watch(): print("Setup test envs ...") policy.eval() test_envs.seed(args.seed) if args.save_buffer_name: print(f"Generate buffer with size {args.buffer_size}") buffer = VectorReplayBuffer( args.buffer_size, buffer_num=len(test_envs), ignore_obs_next=True, save_only_last_obs=True, stack_num=args.frames_stack, ) collector = Collector(policy, test_envs, buffer, exploration_noise=True) result = collector.collect(n_step=args.buffer_size) print(f"Save buffer into {args.save_buffer_name}") # Unfortunately, pickle will cause oom with 1M buffer size buffer.save_hdf5(args.save_buffer_name) else: print("Testing agent ...") test_collector.reset() result = test_collector.collect(n_episode=args.test_num, render=args.render) rew = result["rews"].mean() print(f"Mean reward (over {result['n/ep']} episodes): {rew}") if args.watch: watch() exit(0) # test train_collector and start filling replay buffer train_collector.collect(n_step=args.batch_size * args.training_num) # trainer result = offpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.step_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_best_fn=save_best_fn, logger=logger, update_per_step=args.update_per_step, test_in_train=False, resume_from_log=args.resume_id is not None, save_checkpoint_fn=save_checkpoint_fn, ) pprint.pprint(result) watch()
env = wrapper_dict[wrapper_config['wrapper']](gym.make('jackal_navigation-v0', **env_config), wrapper_config['wrapper_args']) train_envs = DummyVectorEnv([lambda: env for _ in range(1)]) # config random seed np.random.seed(config['seed']) torch.manual_seed(config['seed']) train_envs.seed(config['seed']) state_shape = env.observation_space.shape or env.observation_space.n action_shape = env.action_space.shape or env.action_space.n net = Net(training_config['layer_num'], state_shape, device=config['device']).to(config['device']) actor = Actor(net, action_shape).to(config['device']) critic = Critic(net).to(config['device']) # orthogonal initialization for m in list(actor.modules()) + list(critic.modules()): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) torch.nn.init.zeros_(m.bias) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=training_config['learning_rate']) dist = torch.distributions.Categorical policy = PPOPolicy(actor, critic, optim,
def main(cfg): # Single env just used to determine state and actions spaces env = hydra.utils.instantiate(cfg.env) train_envs = ts.env.DummyVectorEnv([ lambda: hydra.utils.instantiate(cfg.env) for _ in range(cfg.vec_env.n_train_envs) ]) test_envs = ts.env.DummyVectorEnv([ lambda: hydra.utils.instantiate(cfg.env) for _ in range(cfg.vec_env.n_test_envs) ]) # Seed everything for reproducibility # Train and test encs can either take int or list same length as n_envs np.random.seed(cfg.seed) torch.manual_seed(cfg.seed) train_envs.seed(cfg.vec_env.train_env_seed) test_envs.seed(cfg.vec_env.test_env_seed) # Set up the actor and critic networks state_shape = env.observation_space.shape or env.observation_space.n action_shape = env.action_space.shape or env.action_space.n net_a = hydra.utils.instantiate(cfg.actor_network, state_shape=state_shape, activation=nn.Tanh) actor = Actor(net_a, action_shape=action_shape, device=cfg.device).to(cfg.device) net_c = hydra.utils.instantiate(cfg.critic_network, state_shape=state_shape, activation=nn.Tanh) critic = Critic(net_c, device=cfg.device).to(cfg.device) optim = hydra.utils.instantiate(cfg.optimiser, params=ActorCritic(actor, critic).parameters()) policy = hydra.utils.instantiate( cfg.policy, actor=actor, critic=critic, optim=optim, dist_fn=dist_fn, action_space=env.action_space, ) logger = hydra.utils.instantiate(cfg.logger) conf_dict = OmegaConf.to_container(cfg, resolve=True) logger.log_hyperparameters(conf_dict) # Log Hydra config files as artifacts for easy replication logger.experiment.log_artifacts(logger.run_id, cfg.hydra_logdir) # Set up checkpointing if needed if cfg.checkpoints.save_checkpoints: cp_path = Path(cfg.checkpoints.path) cp_path.mkdir(parents=True, exist_ok=True) best_models_path = cp_path / "best_models" best_models_path.mkdir(parents=True, exist_ok=True) # Runs when we have a new best mean evaluation reward def save_fn(policy, cp_path=cp_path, mlflow_logger=logger): now_time = datetime.strftime(datetime.now(), "%H-%M-%S") model_checkpoint_path = cp_path / f"best_models/policy_{now_time}.pt" torch.save(policy.state_dict(), model_checkpoint_path) mlflow_logger.experiment.log_artifact(mlflow_logger.run_id, model_checkpoint_path, "best_models") # Runs at frequency based on save_interval argument to logger def save_checkpoint_fn(epoch, env_step, gradient_step, cp_path=cp_path): checkpoint_dir = cp_path / f"epoch_{epoch:04}" checkpoint_dir.mkdir(parents=True, exist_ok=True) checkpoint_filepath = checkpoint_dir / "policy.pt" torch.save({"model": policy.state_dict()}, checkpoint_filepath) return checkpoint_filepath else: save_fn = None save_checkpoint_fn = None train_collector = hydra.utils.instantiate(cfg.train_collector, policy=policy, env=train_envs) test_collector = hydra.utils.instantiate( cfg.test_collector, policy=policy, env=test_envs, preprocess_fn=logger.info_logger.preprocess_fn, ) result = hydra.utils.call( cfg.trainer, policy=policy, train_collector=train_collector, test_collector=test_collector, logger=logger, save_fn=save_fn, save_checkpoint_fn=save_checkpoint_fn, ) print(f'Finished training in {result["duration"]}') logger.close()
def test_ppo(args=get_args()): torch.set_num_threads(1) # for poor CPU env = gym.make(args.task) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # train_envs = gym.make(args.task) # you can also use tianshou.env.SubprocVectorEnv train_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = VectorEnv( [lambda: gym.make(args.task) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = Net(args.layer_num, args.state_shape, device=args.device) actor = Actor(net, args.action_shape).to(args.device) critic = Critic(net).to(args.device) # orthogonal initialization for m in list(actor.modules()) + list(critic.modules()): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) torch.nn.init.zeros_(m.bias) optim = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=args.lr) dist = torch.distributions.Categorical policy = PPOPolicy(actor, critic, optim, dist, args.gamma, max_grad_norm=args.max_grad_norm, eps_clip=args.eps_clip, vf_coef=args.vf_coef, ent_coef=args.ent_coef, action_range=None, gae_lambda=args.gae_lambda, reward_normalization=args.rew_norm, dual_clip=args.dual_clip, value_clip=args.value_clip) # collector train_collector = Collector(policy, train_envs, ReplayBuffer(args.buffer_size)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'ppo') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): return x >= env.spec.reward_threshold # trainer result = onpolicy_trainer(policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn, writer=writer) assert stop_fn(result['best_reward']) train_collector.close() test_collector.close() if __name__ == '__main__': pprint.pprint(result) # Let's watch its performance! env = gym.make(args.task) collector = Collector(policy, env) result = collector.collect(n_episode=1, render=args.render) print(f'Final reward: {result["rew"]}, length: {result["len"]}') collector.close()
def test_a2c(args=get_args()): env = make_atari_env(args) args.state_shape = env.observation_space.shape or env.observation_space.n args.action_shape = env.env.action_space.shape or env.env.action_space.n # should be N_FRAMES x H x W print("Observations shape: ", args.state_shape) print("Actions shape: ", args.action_shape) # make environments train_envs = SubprocVectorEnv( [lambda: make_atari_env(args) for _ in range(args.training_num)]) # test_envs = gym.make(args.task) test_envs = SubprocVectorEnv( [lambda: make_atari_env_watch(args) for _ in range(args.test_num)]) # seed np.random.seed(args.seed) torch.manual_seed(args.seed) train_envs.seed(args.seed) test_envs.seed(args.seed) # model net = DQN(*args.state_shape, args.hidden_layer_size, args.device).to(args.device) actor = Actor(net, args.action_shape, hidden_layer_size=args.hidden_layer_size, softmax_output=False).to(args.device) critic = Critic(net, hidden_layer_size=args.hidden_layer_size).to(args.device) optim = torch.optim.Adam(list( actor.parameters()) + list(critic.parameters()), lr=args.lr) def dist(x): return torch.distributions.Categorical(logits=x) # define policy policy = A2CPolicy( actor, critic, optim, dist, args.gamma, vf_coef=args.vf_coef, ent_coef=args.ent_coef, max_grad_norm=args.max_grad_norm) # load a previous policy if args.resume_path: policy.load_state_dict(torch.load(args.resume_path)) print("Loaded agent from: ", args.resume_path) # collector train_collector = Collector( policy, train_envs, ReplayBuffer(args.buffer_size, ignore_obs_next=True)) test_collector = Collector(policy, test_envs) # log log_path = os.path.join(args.logdir, args.task, 'a2c') writer = SummaryWriter(log_path) def save_fn(policy): torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth')) def stop_fn(x): if env.env.spec.reward_threshold: return x >= env.spec.reward_threshold elif 'Pong' in args.task: return x >= 20 # watch agent's performance def watch(): print("Testing agent ...") policy.eval() policy.set_eps(args.eps_test) envs = SubprocVectorEnv([lambda: make_atari_env_watch(args) for _ in range(args.test_num)]) envs.seed(args.seed) collector = Collector(policy, envs) result = collector.collect(n_episode=args.test_num, render=args.render) pprint.pprint(result) if args.watch: watch() exit(0) # test train_collector and start filling replay buffer train_collector.collect(n_step=args.batch_size * 4) # trainer result = onpolicy_trainer( policy, train_collector, test_collector, args.epoch, args.step_per_epoch, args.collect_per_step, args.repeat_per_collect, args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer, save_fn=save_fn, test_in_train=False) pprint.pprint(result) watch()