def test_gail(args=get_args()):
    if os.path.exists(args.load_buffer_name) and os.path.isfile(args.load_buffer_name):
        if args.load_buffer_name.endswith(".hdf5"):
            buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)
        else:
            buffer = pickle.load(open(args.load_buffer_name, "rb"))
    else:
        buffer = gather_data()
    env = gym.make(args.task)
    if args.reward_threshold is None:
        default_reward_threshold = {"Pendulum-v0": -1100, "Pendulum-v1": -1100}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]
    # you can also use tianshou.env.SubprocVectorEnv
    # train_envs = gym.make(args.task)
    train_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
    actor = ActorProb(
        net, args.action_shape, max_action=args.max_action, device=args.device
    ).to(args.device)
    critic = Critic(
        Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device),
        device=args.device,
    ).to(args.device)
    actor_critic = ActorCritic(actor, critic)
    # orthogonal initialization
    for m in actor_critic.modules():
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.orthogonal_(m.weight)
            torch.nn.init.zeros_(m.bias)
    optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)
    # discriminator
    disc_net = Critic(
        Net(
            args.state_shape,
            action_shape=args.action_shape,
            hidden_sizes=args.hidden_sizes,
            activation=torch.nn.Tanh,
            device=args.device,
            concat=True,
        ),
        device=args.device,
    ).to(args.device)
    for m in disc_net.modules():
        if isinstance(m, torch.nn.Linear):
            # orthogonal initialization
            torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
            torch.nn.init.zeros_(m.bias)
    disc_optim = torch.optim.Adam(disc_net.parameters(), lr=args.disc_lr)

    # replace DiagGaussian with Independent(Normal) which is equivalent
    # pass *logits to be consistent with policy.forward
    def dist(*logits):
        return Independent(Normal(*logits), 1)

    policy = GAILPolicy(
        actor,
        critic,
        optim,
        dist,
        buffer,
        disc_net,
        disc_optim,
        disc_update_num=args.disc_update_num,
        discount_factor=args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        advantage_normalization=args.norm_adv,
        recompute_advantage=args.recompute_adv,
        dual_clip=args.dual_clip,
        value_clip=args.value_clip,
        gae_lambda=args.gae_lambda,
        action_space=env.action_space,
    )
    # collector
    train_collector = Collector(
        policy, train_envs, VectorReplayBuffer(args.buffer_size, len(train_envs)))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, "gail")
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer, save_interval=args.save_interval)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

    def stop_fn(mean_rewards):
        return mean_rewards >= args.reward_threshold

    def save_checkpoint_fn(epoch, env_step, gradient_step):
        # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html
        ckpt_path = os.path.join(log_path, "checkpoint.pth")
        # Example: saving by epoch num
        # ckpt_path = os.path.join(log_path, f"checkpoint_{epoch}.pth")
        torch.save(
            {
                "model": policy.state_dict(),
                "optim": optim.state_dict(),
            }, ckpt_path)
        return ckpt_path

    if args.resume:
        # load from existing checkpoint
        print(f"Loading agent under {log_path}")
        ckpt_path = os.path.join(log_path, "checkpoint.pth")
        if os.path.exists(ckpt_path):
            checkpoint = torch.load(ckpt_path, map_location=args.device)
            policy.load_state_dict(checkpoint["model"])
            optim.load_state_dict(checkpoint["optim"])
            print("Successfully restored policy and optim.")
        else:
            print("Failed to restore policy and optim.")

    # trainer
    result = onpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.repeat_per_collect,
        args.test_num,
        args.batch_size,
        episode_per_collect=args.episode_per_collect,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger,
        resume_from_log=args.resume,
        save_checkpoint_fn=save_checkpoint_fn,
    )
    assert stop_fn(result["best_reward"])

    if __name__ == "__main__":
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
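
# --- Hedged sketch (not part of the original tests) ---------------------------
# Each test above reads expert transitions from args.load_buffer_name and falls
# back to a gather_data() helper defined elsewhere in the test suite. The
# function below is one illustrative way such a buffer could be produced and
# saved; `expert_policy`, the env count, the buffer size, and the file name are
# placeholder assumptions, not values taken from this repository.
def save_expert_buffer_sketch(expert_policy, task="Pendulum-v1", size=100000):
    import gym
    from tianshou.data import Collector, VectorReplayBuffer
    from tianshou.env import DummyVectorEnv

    envs = DummyVectorEnv([lambda: gym.make(task) for _ in range(4)])
    buf = VectorReplayBuffer(size, len(envs))
    collector = Collector(expert_policy, envs, buf, exploration_noise=True)
    collector.collect(n_step=size)  # fill the buffer with expert rollouts
    buf.save_hdf5("expert_buffer.hdf5")  # loadable via the .hdf5 branch above
    return buf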
def test_bcq(args=get_args()):
    if os.path.exists(args.load_buffer_name) and os.path.isfile(args.load_buffer_name):
        if args.load_buffer_name.endswith(".hdf5"):
            buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)
        else:
            buffer = pickle.load(open(args.load_buffer_name, "rb"))
    else:
        buffer = gather_data()
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]  # float
    if args.reward_threshold is None:  # too low?
        default_reward_threshold = {"Pendulum-v0": -1100, "Pendulum-v1": -1100}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    args.state_dim = args.state_shape[0]
    args.action_dim = args.action_shape[0]
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)
    # model
    # perturbation network
    net_a = MLP(
        input_dim=args.state_dim + args.action_dim,
        output_dim=args.action_dim,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
    )
    actor = Perturbation(
        net_a, max_action=args.max_action, device=args.device, phi=args.phi
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    net_c1 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    net_c2 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    # vae
    # output_dim = 0, so the last Module in the encoder is ReLU
    vae_encoder = MLP(
        input_dim=args.state_dim + args.action_dim,
        hidden_sizes=args.vae_hidden_sizes,
        device=args.device,
    )
    if not args.latent_dim:
        args.latent_dim = args.action_dim * 2
    vae_decoder = MLP(
        input_dim=args.state_dim + args.latent_dim,
        output_dim=args.action_dim,
        hidden_sizes=args.vae_hidden_sizes,
        device=args.device,
    )
    vae = VAE(
        vae_encoder,
        vae_decoder,
        hidden_dim=args.vae_hidden_sizes[-1],
        latent_dim=args.latent_dim,
        max_action=args.max_action,
        device=args.device,
    ).to(args.device)
    vae_optim = torch.optim.Adam(vae.parameters())

    policy = BCQPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        vae,
        vae_optim,
        device=args.device,
        gamma=args.gamma,
        tau=args.tau,
        lmbda=args.lmbda,
    )
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    # buffer has been gathered
    # train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # log
    t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
    log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_bcq'
    log_path = os.path.join(args.logdir, args.task, 'bcq', log_file)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= args.reward_threshold

    def watch():
        policy.load_state_dict(
            torch.load(
                os.path.join(log_path, 'policy.pth'), map_location=torch.device('cpu')
            )
        )
        policy.eval()
        collector = Collector(policy, env)
        collector.collect(n_episode=1, render=1 / 35)

    # trainer
    result = offline_trainer(
        policy,
        buffer,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.test_num,
        args.batch_size,
        save_best_fn=save_best_fn,
        stop_fn=stop_fn,
        logger=logger,
    )
    assert stop_fn(result['best_reward'])

    # Let's watch its performance!
    if __name__ == '__main__':
        pprint.pprint(result)
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
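
# --- Hedged sketch (not part of the original tests) ---------------------------
# offline_trainer() above consumes the pre-collected buffer directly (there is
# no train collector). At its core it repeatedly samples minibatches from the
# buffer, calls policy.update(), and evaluates between epochs. A minimal manual
# loop with illustrative epoch/step counts (not the get_args() defaults) could
# look like this:
def manual_offline_loop_sketch(policy, buffer, test_collector,
                               epochs=5, steps_per_epoch=500, batch_size=64):
    for epoch in range(epochs):
        policy.train()
        for _ in range(steps_per_epoch):
            policy.update(batch_size, buffer)  # one gradient step on a sampled batch
        policy.eval()
        result = test_collector.collect(n_episode=10)
        print(f"epoch {epoch}: mean test reward {result['rews'].mean():.1f}")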
def test_td3_bc(args=get_args()):
    if os.path.exists(args.load_buffer_name) and os.path.isfile(args.load_buffer_name):
        if args.load_buffer_name.endswith(".hdf5"):
            buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)
        else:
            buffer = pickle.load(open(args.load_buffer_name, "rb"))
    else:
        buffer = gather_data()
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]  # float
    if args.reward_threshold is None:  # too low?
        default_reward_threshold = {"Pendulum-v0": -1200, "Pendulum-v1": -1200}
        args.reward_threshold = default_reward_threshold.get(
            args.task, env.spec.reward_threshold
        )
    args.state_dim = args.state_shape[0]
    args.action_dim = args.action_shape[0]
    # test_envs = gym.make(args.task)
    test_envs = DummyVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)]
    )
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)
    # model
    # actor network
    net_a = Net(
        args.state_shape,
        hidden_sizes=args.hidden_sizes,
        device=args.device,
    )
    actor = Actor(
        net_a,
        action_shape=args.action_shape,
        max_action=args.max_action,
        device=args.device,
    ).to(args.device)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    # critic network
    net_c1 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    net_c2 = Net(
        args.state_shape,
        args.action_shape,
        hidden_sizes=args.hidden_sizes,
        concat=True,
        device=args.device,
    )
    critic1 = Critic(net_c1, device=args.device).to(args.device)
    critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)
    critic2 = Critic(net_c2, device=args.device).to(args.device)
    critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

    policy = TD3BCPolicy(
        actor,
        actor_optim,
        critic1,
        critic1_optim,
        critic2,
        critic2_optim,
        tau=args.tau,
        gamma=args.gamma,
        exploration_noise=GaussianNoise(sigma=args.exploration_noise),
        policy_noise=args.policy_noise,
        update_actor_freq=args.update_actor_freq,
        noise_clip=args.noise_clip,
        alpha=args.alpha,
        estimation_step=args.n_step,
        action_space=env.action_space,
    )
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)

    # collector
    # buffer has been gathered
    # train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
    test_collector = Collector(policy, test_envs)
    # log
    t0 = datetime.datetime.now().strftime("%m%d_%H%M%S")
    log_file = f'seed_{args.seed}_{t0}-{args.task.replace("-", "_")}_td3_bc'
    log_path = os.path.join(args.logdir, args.task, 'td3_bc', log_file)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return mean_rewards >= args.reward_threshold

    def watch():
        policy.load_state_dict(
            torch.load(
                os.path.join(log_path, 'policy.pth'), map_location=torch.device('cpu')
            )
        )
        policy.eval()
        collector = Collector(policy, env)
        collector.collect(n_episode=1, render=1 / 35)

    # trainer
    trainer = OfflineTrainer(
        policy,
        buffer,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.test_num,
        args.batch_size,
        save_best_fn=save_best_fn,
        stop_fn=stop_fn,
        logger=logger,
    )

    for epoch, epoch_stat, info in trainer:
        print(f"Epoch: {epoch}")
        print(epoch_stat)
        print(info)

    assert stop_fn(info["best_reward"])

    # Let's watch its performance!
    if __name__ == "__main__":
        pprint.pprint(info)
        env = gym.make(args.task)
        policy.eval()
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        rews, lens = result["rews"], result["lens"]
        print(f"Final reward: {rews.mean()}, length: {lens.mean()}")
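
# --- Hedged sketch (not part of the original tests) ---------------------------
# test_td3_bc uses the iterator form of OfflineTrainer so it can print
# per-epoch statistics, while test_bcq uses the one-shot offline_trainer()
# wrapper. Assuming the trainer API version used above also exposes run(), the
# iterator form collapses to a single call when no per-epoch hooks are needed;
# the helper name and its argument list here are illustrative only.
def run_offline_trainer_once_sketch(policy, buffer, test_collector, args,
                                    save_best_fn, stop_fn, logger):
    from tianshou.trainer import OfflineTrainer

    trainer = OfflineTrainer(
        policy,
        buffer,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.test_num,
        args.batch_size,
        save_best_fn=save_best_fn,
        stop_fn=stop_fn,
        logger=logger,
    )
    # returns the same result dict produced by iterating, e.g. result["best_reward"]
    return trainer.run()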