np.random.seed(args.seed)

# create environment
env = gym.make(args.env)
env.seed(args.seed)
env.action_space.seed(args.seed)
train_tools.EVAL_SEED = args.seed

obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
act_bound = env.action_space.high[0]

# create nets
policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound,
                                              hidden_size=[256, 256], hidden_activation=nn.ReLU)
q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim,
                   hidden_size=[256, 256], hidden_activation=nn.ReLU)
q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim,
                   hidden_size=[256, 256], hidden_activation=nn.ReLU)

# create buffer
if args.show:
    replay_buffer = None
else:
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim,
                                 capacity=args.capacity, batch_size=args.batch_size)

agent = SAC_Agent(env, replay_buffer=replay_buffer,
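# Illustration only: a minimal sketch of what a squashed, reparameterized Gaussian
# policy such as MLPSquashedReparamGaussianPolicy typically does when sampling an
# action. The function below is a hypothetical standalone helper, not the library's
# actual API: draw a Gaussian sample with the reparameterization trick, squash it
# with tanh, rescale by act_bound, and apply the tanh log-prob correction.
import torch

def squashed_gaussian_sample(mu, log_std, act_bound):
    """Reparameterized sample with tanh squashing and log-prob correction (sketch)."""
    std = log_std.exp()
    dist = torch.distributions.Normal(mu, std)
    u = dist.rsample()                       # reparameterized pre-squash sample
    a = torch.tanh(u)                        # squash to (-1, 1)
    # log pi(a|s) = log N(u) - sum log(1 - tanh(u)^2)
    log_prob = dist.log_prob(u).sum(dim=-1)
    log_prob -= torch.log(1.0 - a.pow(2) + 1e-6).sum(dim=-1)
    return act_bound * a, log_prob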
# env = gym.make('LunarLanderContinuous-v2')
# env = gym.make('BipedalWalker-v3')
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
act_bound = env.action_space.high[0]

# create nets
actor_net = DDPGMLPActor(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound,
                         hidden_size=[400, 300], hidden_activation=nn.ReLU)
critic_net = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim,
                       hidden_size=[400, 300], hidden_activation=nn.ReLU)

# create optimizer
actor_optimizer = torch.optim.Adam(actor_net.parameters(), lr=1e-4)
critic_optimizer = torch.optim.Adam(critic_net.parameters(), lr=1e-3)

# create buffer
replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim,
                             capacity=50000, batch_size=64)

# create agent
agent = DDPG_Agent(
    env=env,
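# Illustration only: DDPG keeps target copies of the actor and critic and updates
# them by Polyak averaging. This helper is a hedged sketch (not part of DDPG_Agent's
# interface) of the update target_param <- tau * param + (1 - tau) * target_param.
import torch

def soft_update(net: torch.nn.Module, target_net: torch.nn.Module, tau: float = 0.005):
    """Polyak-average the online network's parameters into the target network."""
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.mul_(1.0 - tau).add_(tau * param.data)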
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# create environment
env = gym.make(args.env)
env.seed(args.seed)
env.action_space.seed(args.seed)
train_tools.EVAL_SEED = args.seed

obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
act_bound = env.action_space.high[0]

critic_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim,
                        hidden_size=[400, 300], hidden_activation=nn.ReLU)
critic_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim,
                        hidden_size=[400, 300], hidden_activation=nn.ReLU)

perturbation_net = BCQ_Perturbation(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound,
                                    hidden_size=[400, 300], hidden_activation=nn.ReLU,
                                    phi=0.05)

cvae_net = CVAE(obs_dim=obs_dim, act_dim=act_dim,
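# Illustration only: a hedged sketch of how BCQ typically selects an action at test
# time. `decode_actions` and `perturb` stand in for the CVAE decoder and the
# BCQ_Perturbation network (hypothetical callables, not the library's API): sample
# N candidate actions that resemble the dataset, perturb each within phi * act_bound,
# and take the candidate with the highest Q-value.
import torch

def bcq_select_action(obs, decode_actions, perturb, q_net, n_candidates=100):
    """Pick the highest-Q candidate among generated-and-perturbed actions (sketch)."""
    with torch.no_grad():
        obs_rep = obs.unsqueeze(0).repeat(n_candidates, 1)   # (N, obs_dim)
        candidates = decode_actions(obs_rep)                  # CVAE decoder samples
        candidates = perturb(obs_rep, candidates)             # bounded correction, |xi| <= phi * act_bound
        q_values = q_net(obs_rep, candidates).squeeze(-1)     # (N,)
        best = q_values.argmax()
        return candidates[best]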
def __init__(
        self,
        env,
        data_buffer: OfflineBuffer,
        policy_net: MLPSquashedReparamGaussianPolicy,  # actor
        q_net1: MLPQsaNet,  # critic
        q_net2: MLPQsaNet,
        cvae_net: CVAE,
        policy_lr=1e-4,
        qf_lr=3e-4,
        cvae_lr=3e-4,
        gamma=0.99,
        tau=0.05,

        # BEAR
        lmbda=0.75,  # used for clipped double Q-learning
        mmd_sigma=20.0,  # the sigma used in the MMD kernel
        kernel_type='gaussian',  # the type of MMD kernel ('gaussian' or 'laplacian')
        lagrange_thresh=0.05,  # the threshold used when automatically tuning alpha_prime for the MMD constraint
        n_action_samples=100,  # the number of action samples used to pick the best action when selecting an action
        n_target_samples=10,  # the number of action samples used to compute the BCQ-like target value
        n_mmd_action_samples=4,  # the number of action samples used to compute the MMD
        warmup_step=40000,  # do support matching as a warm start before training the policy (actor)

        max_train_step=1000000,
        log_interval=1000,
        eval_freq=5000,
        train_id="bear_hopper-medium-v2_test",
        resume=False,  # if True, resume training from the last checkpoint
        device='cpu',
):
    self.env = env
    self.data_buffer = data_buffer
    self.device = torch.device(device)

    # the networks and optimizers
    self.policy_net = policy_net.to(self.device)
    self.q_net1 = q_net1.to(self.device)
    self.q_net2 = q_net2.to(self.device)
    self.target_q_net1 = copy.deepcopy(self.q_net1).to(self.device)
    self.target_q_net2 = copy.deepcopy(self.q_net2).to(self.device)
    self.cvae_net = cvae_net.to(self.device)

    self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr)
    self.q_optimizer1 = torch.optim.Adam(self.q_net1.parameters(), lr=qf_lr)
    self.q_optimizer2 = torch.optim.Adam(self.q_net2.parameters(), lr=qf_lr)
    self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(), lr=cvae_lr)

    self.gamma = gamma
    self.tau = tau
    self.max_train_step = max_train_step
    self.eval_freq = eval_freq
    self.train_step = 0

    self.resume = resume  # whether to load a checkpoint and resume training from last time

    # BEAR
    self.lmbda = lmbda
    self.mmd_sigma = mmd_sigma
    self.kernel_type = kernel_type
    self.lagrange_thresh = lagrange_thresh
    self.n_action_samples = n_action_samples
    self.n_target_samples = n_target_samples
    self.n_mmd_action_samples = n_mmd_action_samples
    self.warmup_step = warmup_step

    # temperature (Lagrange multiplier) of the MMD loss, tuned automatically
    self.log_alpha_prime = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha_prime_optimizer = torch.optim.Adam([self.log_alpha_prime], lr=1e-3)

    # log dir and interval
    self.log_interval = log_interval
    self.result_dir = os.path.join(log_tools.ROOT_DIR, "run/results", train_id)
    log_tools.make_dir(self.result_dir)
    self.checkpoint_path = os.path.join(self.result_dir, "checkpoint.pth")
    self.tensorboard_writer = log_tools.TensorboardLogger(self.result_dir)
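# Illustration only: a minimal sketch of the MMD penalty that the kernel_type /
# mmd_sigma / n_mmd_action_samples arguments above parameterize. This is a generic
# MMD between two batches of sampled actions, not the repository's exact code.
import torch

def mmd_loss(x, y, sigma=20.0, kernel='gaussian'):
    """MMD between action samples x, y of shape (batch, n_samples, act_dim) (sketch)."""
    def pairwise(a, b):
        # (batch, n_a, n_b, act_dim) pairwise differences via broadcasting
        return a.unsqueeze(2) - b.unsqueeze(1)

    if kernel == 'gaussian':
        k = lambda d: torch.exp(-d.pow(2).sum(-1) / (2.0 * sigma))
    else:  # 'laplacian'
        k = lambda d: torch.exp(-d.abs().sum(-1) / (2.0 * sigma))

    k_xx = k(pairwise(x, x)).mean(dim=(1, 2))
    k_xy = k(pairwise(x, y)).mean(dim=(1, 2))
    k_yy = k(pairwise(y, y)).mean(dim=(1, 2))
    return (k_xx - 2.0 * k_xy + k_yy).clamp(min=1e-6).sqrt()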
def __init__(
        self,
        env,
        data_buffer: OfflineBuffer,
        critic_net1: MLPQsaNet,
        critic_net2: MLPQsaNet,
        actor_net: PLAS_Actor,
        cvae_net: CVAE,  # generative model
        critic_lr=1e-3,
        actor_lr=1e-4,
        cvae_lr=1e-4,
        gamma=0.99,
        tau=0.005,
        lmbda=0.75,  # used for clipped double Q-learning
        max_cvae_iterations=500000,  # the number of iterations for training the CVAE model
        max_train_step=2000000,
        log_interval=1000,
        eval_freq=5000,
        train_id="plas_test",
        resume=False,  # if True, resume training from the last checkpoint
        device='cpu',
):
    self.env = env
    self.data_buffer = data_buffer
    self.device = torch.device(device)

    self.critic_net1 = critic_net1.to(self.device)
    self.critic_net2 = critic_net2.to(self.device)
    self.target_critic_net1 = copy.deepcopy(self.critic_net1).to(self.device)
    self.target_critic_net2 = copy.deepcopy(self.critic_net2).to(self.device)
    self.actor_net = actor_net.to(self.device)
    self.target_actor_net = copy.deepcopy(self.actor_net).to(self.device)
    self.cvae_net = cvae_net.to(self.device)

    self.critic_optimizer1 = torch.optim.Adam(self.critic_net1.parameters(), lr=critic_lr)
    self.critic_optimizer2 = torch.optim.Adam(self.critic_net2.parameters(), lr=critic_lr)
    self.actor_optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=actor_lr)
    self.cvae_optimizer = torch.optim.Adam(self.cvae_net.parameters(), lr=cvae_lr)

    self.gamma = gamma
    self.tau = tau
    self.lmbda = lmbda
    self.max_cvae_iterations = max_cvae_iterations
    self.max_train_step = max_train_step
    self.eval_freq = eval_freq
    self.cvae_iterations = 0
    self.train_step = 0

    self.resume = resume  # whether to load a checkpoint and resume training from last time

    # log dir and interval
    self.log_interval = log_interval
    self.result_dir = os.path.join(log_tools.ROOT_DIR, "run/results", train_id)
    log_tools.make_dir(self.result_dir)
    self.checkpoint_path = os.path.join(self.result_dir, "checkpoint.pth")
    self.tensorboard_writer = log_tools.TensorboardLogger(self.result_dir)
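# Illustration only: the lmbda argument above parameterizes the "clipped double Q"
# target used by BCQ/PLAS-style methods. A hedged sketch of that target computation
# with placeholder tensor inputs, not the agent's actual train() code:
import torch

def clipped_double_q_target(reward, done, q1_next, q2_next, gamma=0.99, lmbda=0.75):
    """Soft-clipped target: lmbda * min(Q1', Q2') + (1 - lmbda) * max(Q1', Q2')."""
    q_next = lmbda * torch.min(q1_next, q2_next) + (1.0 - lmbda) * torch.max(q1_next, q2_next)
    return reward + gamma * (1.0 - done) * q_next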
if __name__ == '__main__':
    # create environment
    env = gym.make("Pendulum-v0")
    # env = gym.make('LunarLanderContinuous-v2')
    # env = gym.make('BipedalWalker-v3')
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_bound = env.action_space.high[0]

    # create nets
    policy_net = MLPSquashedReparamGaussianPolicy(obs_dim=obs_dim, act_dim=act_dim, act_bound=act_bound,
                                                  hidden_size=[256, 256], hidden_activation=nn.ReLU)
    q_net1 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim,
                       hidden_size=[256, 256], hidden_activation=nn.ReLU)
    q_net2 = MLPQsaNet(obs_dim=obs_dim, act_dim=act_dim,
                       hidden_size=[256, 256], hidden_activation=nn.ReLU)

    policy_optimizer = torch.optim.Adam(policy_net.parameters(), lr=4e-3)
    q_optimizer1 = torch.optim.Adam(q_net1.parameters(), lr=4e-3)
    q_optimizer2 = torch.optim.Adam(q_net2.parameters(), lr=4e-3)

    # create buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim,
                                 capacity=50000, batch_size=128)

    agent = SAC_Agent(env,
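# Illustration only: a hedged sketch of the entropy-regularized critic target that a
# SAC agent built from the nets above would compute. The tensor arguments and the
# alpha temperature are placeholders, not SAC_Agent's actual interface:
# y = r + gamma * (1 - done) * (min(Q1'(s', a'), Q2'(s', a')) - alpha * log pi(a'|s')).
import torch

def sac_critic_target(reward, done, q1_next, q2_next, next_log_prob, gamma=0.99, alpha=0.2):
    """Compute the soft Bellman backup target shared by both Q networks (sketch)."""
    soft_q_next = torch.min(q1_next, q2_next) - alpha * next_log_prob
    return reward + gamma * (1.0 - done) * soft_q_next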