def _test_load_td3(self, gpu):
    obs_size = 11
    action_size = 3

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pnn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        q_func_optimizer = torch.optim.Adam(q_func.parameters())
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    policy = nn.Sequential(
        nn.Linear(obs_size, 400),
        nn.ReLU(),
        nn.Linear(400, 300),
        nn.ReLU(),
        nn.Linear(300, action_size),
        nn.Tanh(),
        pfrl.policies.DeterministicHead(),
    )
    policy_optimizer = torch.optim.Adam(policy.parameters())

    rbuf = replay_buffers.ReplayBuffer(100)
    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=[-1.0, -1.0, -1.0], high=[1.0, 1.0, 1.0]
    )

    agent = agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=1000,
        gpu=gpu,
        minibatch_size=100,
        burnin_action_func=None,
    )

    downloaded_model, exists = download_model(
        "TD3", "Hopper-v2", model_type=self.pretrained_type
    )
    agent.load(downloaded_model)
    if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
        assert exists
def _test_load_ddpg(self, gpu):
    obs_size = 11
    action_size = 3

    from pfrl.nn import ConcatObsAndAction

    q_func = nn.Sequential(
        ConcatObsAndAction(),
        nn.Linear(obs_size + action_size, 400),
        nn.ReLU(),
        nn.Linear(400, 300),
        nn.ReLU(),
        nn.Linear(300, 1),
    )

    from pfrl.nn import BoundByTanh
    from pfrl.policies import DeterministicHead

    policy = nn.Sequential(
        nn.Linear(obs_size, 400),
        nn.ReLU(),
        nn.Linear(400, 300),
        nn.ReLU(),
        nn.Linear(300, action_size),
        BoundByTanh(low=[-1.0, -1.0, -1.0], high=[1.0, 1.0, 1.0]),
        DeterministicHead(),
    )

    opt_a = torch.optim.Adam(policy.parameters())
    opt_c = torch.optim.Adam(q_func.parameters())

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=[-1.0, -1.0, -1.0], high=[1.0, 1.0, 1.0]
    )

    agent = agents.DDPG(
        policy,
        q_func,
        opt_a,
        opt_c,
        replay_buffers.ReplayBuffer(100),
        gamma=0.99,
        explorer=explorer,
        replay_start_size=1000,
        target_update_method="soft",
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=gpu,
        minibatch_size=100,
        burnin_action_func=None,
    )

    downloaded_model, exists = download_model(
        "DDPG", "Hopper-v2", model_type=self.pretrained_type
    )
    agent.load(downloaded_model)
    if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
        assert exists
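# A minimal sketch, assumed rather than taken from the original test module, of
# how the two loaders above are typically driven as test cases. The wrapper
# method names and the gpu arguments here are hypothetical; in PFRL, gpu=None
# runs the agent on CPU.
def test_load_td3_cpu(self):
    self._test_load_td3(gpu=None)

def test_load_ddpg_cpu(self):
    self._test_load_ddpg(gpu=None)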
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument(
        "--env",
        type=str,
        default="Hopper-v2",
        help="OpenAI Gym MuJoCo env to perform algorithm on.",
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument(
        "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU."
    )
    parser.add_argument(
        "--load", type=str, default="", help="Directory to load agent from."
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=10 ** 6,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=10000,
        help="Minimum replay buffer size before performing gradient updates.",
    )
    parser.add_argument("--batch-size", type=int, default=100, help="Minibatch size")
    parser.add_argument(
        "--render", action="store_true", help="Render env states in a GUI window."
    )
    parser.add_argument(
        "--demo", action="store_true", help="Just run evaluation, not training."
    )
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument(
        "--pretrained-type", type=str, default="best", choices=["best", "final"]
    )
    parser.add_argument(
        "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor."
    )
    parser.add_argument(
        "--log-level", type=int, default=logging.INFO, help="Level of the root logger."
    )
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    policy = nn.Sequential(
        nn.Linear(obs_size, 400),
        nn.ReLU(),
        nn.Linear(400, 300),
        nn.ReLU(),
        nn.Linear(300, action_size),
        nn.Tanh(),
        pfrl.policies.DeterministicHead(),
    )
    policy_optimizer = torch.optim.Adam(policy.parameters())

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 400),
            nn.ReLU(),
            nn.Linear(400, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        q_func_optimizer = torch.optim.Adam(q_func.parameters())
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(10 ** 6)

    explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high
    )

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(
            np.float32
        )

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.TD3(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        soft_update_tau=5e-3,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )

    if len(args.load) > 0 or args.load_pretrained:
        # either load or load_pretrained must be false
        assert not len(args.load) > 0 or not args.load_pretrained
        if len(args.load) > 0:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model(
                    "TD3", args.env, model_type=args.pretrained_type
                )[0]
            )

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
        import json
        import os

        with open(os.path.join(args.outdir, "demo_scores.json"), "w") as f:
            json.dump(eval_stats, f)
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit,
        )
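# Standard script entry point; assumed here, matching the convention of the
# other PFRL reproduction scripts.
if __name__ == "__main__":
    main()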
def __init__(self, state_dim, goal_dim, action_dim, scale, replay_buffer,
             actor_lr, critic_lr, expl_noise, policy_noise, noise_clip, gamma,
             policy_freq, tau, is_low_level, buffer_freq, minibatch_size, gpu,
             add_entropy, burnin_action_func=None, replay_start_size=2500):
    self.scale = scale

    # parameters
    self.expl_noise = expl_noise
    self.policy_noise = policy_noise
    self.noise_clip = noise_clip
    self.gamma = gamma
    self.policy_freq = policy_freq
    self.tau = tau
    self.is_low_level = is_low_level
    self.minibatch_size = minibatch_size
    self.add_entropy = add_entropy

    # create td3 agent
    self.device = torch.device(f"cuda:{gpu}")

    if self.add_entropy:

        def squashed_diagonal_gaussian_head(x):
            mean, log_scale = torch.chunk(x, 2, dim=-1)
            log_scale = torch.clamp(log_scale, -20.0, 2.0)
            var = torch.exp(log_scale * 2)
            base_distribution = distributions.Independent(
                distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
            )
            return base_distribution

        policy = nn.Sequential(
            nn.Linear(state_dim + goal_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim * 2),
            nn.Tanh(),
            # Multiply only the mean half of the output by the action scale;
            # the log-scale half is multiplied by ones, i.e. left unchanged.
            ConstantsMult(
                torch.cat(
                    (torch.tensor(self.scale), torch.ones(self.scale.size))
                )
                .float()
                .to(self.device)
            ),
            # pfrl.policies.DeterministicHead(),
            Lambda(squashed_diagonal_gaussian_head),
        )
    else:
        policy = nn.Sequential(
            nn.Linear(state_dim + goal_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim),
            nn.Tanh(),
            ConstantsMult(torch.tensor(self.scale).float().to(self.device)),
            pfrl.policies.DeterministicHead(),
        )

    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(state_dim + goal_dim + action_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=critic_lr)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    # TODO - have proper low and high values from action space.
    # from the hiro paper, the scale is 1.0
    explorer = explorers.AdditiveGaussian(
        scale=self.expl_noise * 1.0, low=-self.scale, high=self.scale
    )

    def default_target_policy_smoothing_func(batch_action):
        """Add noises to actions for target policy smoothing."""
        noise = torch.clamp(
            self.policy_noise * torch.randn_like(batch_action),
            -self.noise_clip,
            self.noise_clip,
        )
        smoothed_action = batch_action + noise
        smoothed_action = torch.min(
            smoothed_action, torch.tensor(self.scale).to(self.device).float()
        )
        smoothed_action = torch.max(
            smoothed_action, torch.tensor(-self.scale).to(self.device).float()
        )
        return smoothed_action

    if self.is_low_level:
        # standard goal conditioned td3
        self.agent = GoalConditionedTD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffer,
            gamma=gamma,
            soft_update_tau=tau,
            explorer=explorer,
            update_interval=1,
            policy_update_delay=policy_freq,
            replay_start_size=replay_start_size,
            buffer_freq=buffer_freq,
            minibatch_size=minibatch_size,
            gpu=gpu,
            add_entropy=self.add_entropy,
            burnin_action_func=burnin_action_func,
            target_policy_smoothing_func=default_target_policy_smoothing_func,
        )
    else:
        self.agent = HIROHighLevelGoalConditionedTD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffer,
            gamma=gamma,
            soft_update_tau=tau,
            explorer=explorer,
            update_interval=1,
            policy_update_delay=policy_freq,
            replay_start_size=replay_start_size / buffer_freq,
            buffer_freq=buffer_freq,
            minibatch_size=minibatch_size,
            gpu=gpu,
            add_entropy=self.add_entropy,
            burnin_action_func=burnin_action_func,
            target_policy_smoothing_func=default_target_policy_smoothing_func,
        )

    self.device = self.agent.device
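# Hedged, self-contained sketch (not part of the class above): it replays the
# target-policy-smoothing step from TD3 (Fujimoto et al., 2018) in isolation,
# so the noise-clip and range-clamp order is easy to verify. All names and
# values below are illustrative, not taken from the original code.
def _target_policy_smoothing_demo():
    import torch

    scale = torch.tensor([1.0, 1.0, 1.0])  # per-dimension action bound
    policy_noise, noise_clip = 0.2, 0.5
    batch_action = torch.zeros(4, 3)  # a fake batch of target actions
    # Gaussian noise, clipped to [-noise_clip, noise_clip]
    noise = torch.clamp(
        policy_noise * torch.randn_like(batch_action), -noise_clip, noise_clip
    )
    # Perturbed action clamped back into the valid range [-scale, scale]
    smoothed = torch.max(torch.min(batch_action + noise, scale), -scale)
    return smoothed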
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument(
        "--env",
        type=str,
        default="DClawTurnFixed-v0",
        help="OpenAI Gym MuJoCo env to perform algorithm on.",
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument(
        "--gpu", type=int, default=-1, help="GPU to use, set to -1 if no GPU."
    )
    parser.add_argument(
        "--load", type=str, default="", help="Directory to load agent from."
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=10 ** 6,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=10000,
        help="Minimum replay buffer size before performing gradient updates.",
    )
    parser.add_argument("--batch-size", type=int, default=64, help="Minibatch size")
    parser.add_argument(
        "--render", action="store_true", help="Render env states in a GUI window."
    )
    parser.add_argument(
        "--demo", action="store_true", help="Just run evaluation, not training."
    )
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument(
        "--pretrained-type", type=str, default="best", choices=["best", "final"]
    )
    parser.add_argument(
        "--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor."
    )
    parser.add_argument(
        "--log-level", type=int, default=logging.INFO, help="Level of the root logger."
    )
    parser.add_argument("--gamma", type=float, default=0.9)
    parser.add_argument("--ddpg-training-steps", type=int, default=int(1e3))
    parser.add_argument("--adversary-training-steps", type=int, default=int(1e3))
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = "./results"
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    def make_env(test):
        env = gym.make("DClawTurnFixed-v0")
        # Unwrap TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    q_func = nn.Sequential(
        ConcatObsAndAction(),
        nn.Linear(obs_size + action_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 1),
    )
    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size),
        BoundByTanh(low=action_space.low, high=action_space.high),
        DeterministicHead(),
    )

    ddpg_opt_a = torch.optim.Adam(policy.parameters())
    ddpg_opt_c = torch.optim.Adam(q_func.parameters())
    ddpg_rbuf = replay_buffers.ReplayBuffer(10 ** 6)
    ddpg_explorer = explorers.AdditiveGaussian(
        scale=0.1, low=action_space.low, high=action_space.high
    )

    def ddpg_burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(
            np.float32
        )

    # Hyperparameters follow http://arxiv.org/abs/1802.09477 (TD3)
    ddpg_agent = DDPG(
        policy,
        q_func,
        ddpg_opt_a,
        ddpg_opt_c,
        ddpg_rbuf,
        gamma=args.gamma,
        explorer=ddpg_explorer,
        replay_start_size=args.replay_start_size,
        target_update_method="soft",
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=ddpg_burnin_action_func,
    )

    def adversary_random_func():
        # Uniformly pick one of the 9 action dimensions to disable
        return np.random.randint(0, 9)

    # adversary_q = Critic(obs_size, 1, hidden_size=adversary_hidden_size)
    # adversary_action_space = gym.spaces.discrete.Discrete(9)
    # adversary_q = q_functions.FCQuadraticStateQFunction(
    #     obs_size, 1, n_hidden_channels=256, n_hidden_layers=2,
    #     action_space=adversary_action_space,
    # )
    adversary_q = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        # One Q-value per action dimension the adversary can disable
        nn.Linear(256, 9),
        DiscreteActionValueHead(),
    )
    adversary_optimizer = torch.optim.Adam(adversary_q.parameters(), lr=1e-3)
    adversary_rbuf_capacity = int(1e6)
    adversary_rbuf = replay_buffers.ReplayBuffer(adversary_rbuf_capacity)
    adversary_explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, 0.1, 10 ** 4, adversary_random_func
    )

    adversary_agent = DQN(
        adversary_q,
        adversary_optimizer,
        adversary_rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=adversary_explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=1,
        minibatch_size=args.batch_size,
        target_update_method="soft",
        soft_update_tau=5e-3,
    )

    logger = logging.getLogger(__name__)
    eval_env = make_env(test=True)
    evaluator = Evaluator(
        agent=ddpg_agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        eval_interval=args.eval_interval,
        outdir=args.outdir,
        max_episode_len=timestep_limit,
        env=eval_env,
        step_offset=0,
        save_best_so_far_agent=True,
        use_tensorboard=True,
        logger=logger,
    )

    episode_reward = 0
    ddpg_episode_idx = 0
    adversary_episode_idx = 0

    # o_0, r_0
    current_state = env.reset()

    t = 0
    ddpg_t = 0
    adversary_t = 0
    episode_len = 0
    try:
        while t < args.max_steps:
            # Alternate phases: train the DDPG agent, then the adversary.
            for i in range(args.ddpg_training_steps):
                t += 1
                ddpg_t += 1
                ddpg_action = ddpg_agent.act(current_state)
                adversary_action = adversary_agent.act(current_state)
                # The adversary zeroes out one dimension of the DDPG action
                ddpg_action[adversary_action] = 0
                next_state, reward, done, info = env.step(ddpg_action)
                episode_reward += reward
                episode_len += 1
                reset = episode_len == timestep_limit or info.get(
                    "needs_reset", False
                )
                ddpg_agent.observe(next_state, reward, done, reset)
                current_state = next_state
                if done or reset or t == args.max_steps:
                    logger.info(
                        "ddpg phase: outdir:%s step:%s episode:%s R:%s",
                        args.outdir,
                        ddpg_t,
                        ddpg_episode_idx,
                        episode_reward,
                    )
                    logger.info("statistics:%s", ddpg_agent.get_statistics())
                    if evaluator is not None:
                        evaluator.evaluate_if_necessary(
                            t=t, episodes=ddpg_episode_idx + 1
                        )
                    if t == args.max_steps:
                        break
                    episode_reward = 0
                    ddpg_episode_idx += 1
                    episode_len = 0
                    current_state = env.reset()

            episode_reward = 0
            episode_len = 0
            current_state = env.reset()
            print("start adversary training")
            for i in range(args.adversary_training_steps):
                t += 1
                adversary_t += 1
                ddpg_action = ddpg_agent.act(current_state)
                adversary_action = adversary_agent.act(current_state)
                ddpg_action[adversary_action] = 0
                next_state, reward, done, info = env.step(ddpg_action)
                # The adversary maximizes the negated task reward
                reward = -reward
                episode_len += 1
                reset = episode_len == timestep_limit or info.get(
                    "needs_reset", False
                )
                adversary_agent.observe(next_state, reward, done, reset)
                current_state = next_state
                if done or reset or t == args.max_steps:
                    if t == args.max_steps:
                        break
                    episode_reward = 0
                    adversary_episode_idx += 1
                    episode_len = 0
                    current_state = env.reset()
    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_except")
        save_agent(
            adversary_agent, t, args.outdir, logger, suffix="_adversary_except"
        )
        raise

    # Save the final model
    save_agent(ddpg_agent, t, args.outdir, logger, suffix="_ddpg_finish")
    save_agent(adversary_agent, t, args.outdir, logger, suffix="_adversary_finish")

    # if args.demo:
    #     eval_env.render()
    #     eval_stats = experiments.eval_performance(
    #         env=eval_env,
    #         agent=ddpg_agent,
    #         n_steps=None,
    #         n_episodes=args.eval_n_runs,
    #         max_episode_len=timestep_limit,
    #     )
    #     print(
    #         "n_runs: {} mean: {} median: {} stdev {}".format(
    #             args.eval_n_runs,
    #             eval_stats["mean"],
    #             eval_stats["median"],
    #             eval_stats["stdev"],
    #         )
    #     )
    # else:
    #     experiments.train_agent_with_evaluation(
    #         agent=ddpg_agent,
    #         env=env,
    #         steps=args.max_steps,
    #         eval_env=eval_env,
    #         eval_n_steps=None,
    #         eval_n_episodes=args.eval_n_runs,
    #         eval_interval=args.eval_interval,
    #         outdir=args.outdir,
    #         train_max_episode_len=timestep_limit,
    #     )

    print("finish")
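# Script entry point; assumed here, matching the other training scripts.
if __name__ == "__main__":
    main()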
def __init__(
    self,
    state_dim,
    goal_dim,
    action_dim,
    scale,
    replay_buffer,
    actor_lr,
    critic_lr,
    expl_noise,
    policy_noise,
    noise_clip,
    gamma,
    policy_freq,
    tau,
    is_low_level,
    buffer_freq,
    minibatch_size,
    gpu,
    add_entropy,
    burnin_action_func=None,
    replay_start_size=2500,
    temperature=1.0,
    optimize_temp=False,
):
    self.scale = scale
    if gpu is not None and gpu >= 0:
        assert torch.cuda.is_available()
        self.device = torch.device("cuda:{}".format(gpu))
    else:
        self.device = torch.device("cpu")
    self.scale_tensor = torch.tensor(self.scale).float().to(self.device)

    # parameters
    self.expl_noise = expl_noise
    self.policy_noise = policy_noise
    self.noise_clip = noise_clip
    self.gamma = gamma
    self.policy_freq = policy_freq
    self.tau = tau
    self.is_low_level = is_low_level
    self.minibatch_size = minibatch_size
    self.add_entropy = add_entropy

    # create agent
    if self.add_entropy:

        def squashed_diagonal_gaussian_head(x):
            """Taken from the SAC code."""
            assert x.shape[-1] == action_dim * 2
            mean, log_scale = torch.chunk(x, 2, dim=-1)
            log_scale = torch.clamp(log_scale, -20.0, 2.0)
            var = torch.exp(log_scale * 2)
            base_distribution = distributions.Independent(
                distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
            )
            # cache_size=1 is required for numerical stability
            return distributions.transformed_distribution.TransformedDistribution(
                base_distribution,
                [distributions.transforms.TanhTransform(cache_size=1)],
            )

        # SAC policy definition:
        policy = nn.Sequential(
            nn.Linear(state_dim + goal_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim * 2),
            Lambda(squashed_diagonal_gaussian_head),
        )
        torch.nn.init.xavier_uniform_(policy[0].weight)
        torch.nn.init.xavier_uniform_(policy[2].weight)
        torch.nn.init.xavier_uniform_(policy[4].weight)
        explorer = explorers.AdditiveGaussian(scale=0.0)
    else:
        policy = nn.Sequential(
            nn.Linear(state_dim + goal_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim),
            nn.Tanh(),
            pfrl.policies.DeterministicHead(),
        )
        # TODO - have proper low and high values from action space.
        # from the hiro paper, the scale is 1.0
        explorer = explorers.AdditiveGaussian(
            scale=self.expl_noise, low=-self.scale, high=self.scale
        )

    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(state_dim + goal_dim + action_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=critic_lr)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    def default_target_policy_smoothing_func(batch_action):
        """Add noises to actions for target policy smoothing."""
        noise = torch.clamp(
            self.policy_noise * torch.randn_like(batch_action),
            -self.noise_clip,
            self.noise_clip,
        )
        smoothed_action = batch_action + noise
        smoothed_action = torch.min(
            smoothed_action, torch.tensor(self.scale).to(self.device).float()
        )
        smoothed_action = torch.max(
            smoothed_action, torch.tensor(-self.scale).to(self.device).float()
        )
        return smoothed_action

    input_scale = self.scale_tensor
    if self.is_low_level:
        # standard goal conditioned td3
        self.agent = GoalConditionedTD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffer,
            gamma=gamma,
            soft_update_tau=tau,
            explorer=explorer,
            update_interval=1,
            policy_update_delay=policy_freq,
            replay_start_size=replay_start_size,
            buffer_freq=buffer_freq,
            minibatch_size=minibatch_size,
            gpu=gpu,
            add_entropy=self.add_entropy,
            scale=input_scale,
            burnin_action_func=burnin_action_func,
            target_policy_smoothing_func=default_target_policy_smoothing_func,
            entropy_temperature=temperature,
            optimize_temp=optimize_temp,
        )
    else:
        self.agent = HIROHighLevelGoalConditionedTD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffer,
            gamma=gamma,
            soft_update_tau=tau,
            explorer=explorer,
            update_interval=1,
            policy_update_delay=policy_freq,
            replay_start_size=replay_start_size / buffer_freq - 5,
            buffer_freq=buffer_freq,
            minibatch_size=minibatch_size,
            gpu=gpu,
            add_entropy=self.add_entropy,
            scale=input_scale,
            burnin_action_func=burnin_action_func,
            target_policy_smoothing_func=default_target_policy_smoothing_func,
            entropy_temperature=temperature,
            optimize_temp=optimize_temp,
        )

    self.device = self.agent.device
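# Hedged, illustrative sketch (not part of the class above) of what the
# squashed Gaussian head produces: a (batch, 2 * action_dim) network output is
# chunked into mean/log-scale halves, and TanhTransform squashes samples into
# (-1, 1) while keeping log-probabilities tractable for the entropy term.
# All names below are hypothetical.
def _squashed_head_demo():
    import torch
    from torch import distributions

    action_dim = 3
    x = torch.zeros(4, action_dim * 2)  # stand-in for the network output
    mean, log_scale = torch.chunk(x, 2, dim=-1)
    log_scale = torch.clamp(log_scale, -20.0, 2.0)
    base = distributions.Independent(
        distributions.Normal(loc=mean, scale=torch.exp(log_scale)), 1
    )
    dist = distributions.transformed_distribution.TransformedDistribution(
        base, [distributions.transforms.TanhTransform(cache_size=1)]
    )
    sample = dist.rsample()  # reparameterized sample, squashed into (-1, 1)
    return sample, dist.log_prob(sample)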