def make_policy(state_shape, action_shape):
    # net = nn.Sequential(
    #     nn.Conv2d(3, num, 4, stride=2),
    #     nn.ReLU(inplace=True),
    #     nn.Conv2d(num, 64, 4, stride=2),
    #     nn.ReLU(inplace=True),
    #     nn.Conv2d(64, 32, 4, stride=2),
    #     nn.ReLU(inplace=True),
    #     nn.Conv2d(32, 8, 4, stride=2),
    #     nn.ReLU(inplace=True),
    #     Flatten(),  # torch.Size([1, 192])
    #     nn.Linear(192, 64),  # torch.Size([1, 64])
    #     nn.ReLU(inplace=True),
    #     nn.Linear(64, 2 * 2),
    #     Lambda(squashed_diagonal_gaussian_head),
    # )
    num = 256
    net = nn.Sequential(
        # nn.Linear(state_shape[0], num),
        nn.Linear(state_shape, num),
        nn.ReLU(inplace=True),
        # nn.Linear(num, num),
        # nn.ReLU(inplace=True),
        # nn.Linear(num, num),
        # nn.ReLU(inplace=True),
        nn.Linear(num, 128),
        nn.ReLU(inplace=True),
        nn.Linear(128, 64),
        nn.ReLU(inplace=True),
        nn.Linear(64, 2 * action_shape[0]),
        Lambda(squashed_diagonal_gaussian_head),
    )
    net.apply(init_weights)
    return net
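net.apply(init_weights) relies on an init_weights helper that is not shown in this snippet. A minimal sketch of what such a helper might look like, assuming Xavier-uniform initialization of linear and convolutional layers as used elsewhere in these examples (the actual helper may differ):

def init_weights(m):
    # Hypothetical initializer: Xavier-uniform weights, zero biases.
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)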
def __init__(self, state_dim, goal_dim, action_dim, max_action):
    super(StochasticActor, self).__init__()
    self.action_dim = action_dim
    self.l1 = nn.Linear(state_dim + goal_dim, 300)
    self.l2 = nn.Linear(300, 300)
    self.l3 = nn.Linear(300, action_dim * 2)
    self.lambda_fnc = Lambda(self.squashed_diagonal_gaussian_head)
    self.max_action = max_action
def test_lambda():
    model = nn.Sequential(
        nn.ReLU(),
        Lambda(lambda x: x + 1),
        nn.ReLU(),
    )
    x = torch.rand(3, 2)
    # Since x is all positive, the ReLU layers have no effect
    y = model(x)
    torch_assert_allclose(y, x + 1)
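The Lambda wrapper used throughout these snippets comes from pfrl.nn.lmbda and simply lets an arbitrary callable sit inside an nn.Sequential. A minimal sketch of such a wrapper, assuming the PFRL version behaves the same way (details may differ):

class Lambda(nn.Module):
    # Wrap a callable so it can be used as a layer inside nn.Sequential.
    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd

    def forward(self, x):
        # Apply the wrapped callable to the input.
        return self.lambd(x)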
def make_policy():
    num = 64
    net = nn.Sequential(
        nn.Conv2d(3, num, 4, stride=2),
        nn.ReLU(inplace=True),
        nn.Conv2d(num, 64, 4, stride=2),
        nn.ReLU(inplace=True),
        nn.Conv2d(64, 32, 4, stride=2),
        nn.ReLU(inplace=True),
        nn.Conv2d(32, 8, 4, stride=2),
        nn.ReLU(inplace=True),
        Flatten(),  # torch.Size([1, 192])
        nn.Linear(192, 64),  # torch.Size([1, 64])
        nn.ReLU(inplace=True),
        nn.Linear(64, 2 * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    net.apply(init_weights)
    return net
def make_agent(self, env, gpu):
    obs_size = env.observation_space.low.size
    action_size = env.action_space.low.size
    hidden_size = 20

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.split(x, int(list(x.size())[-1] / 2), dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)],
        )

    policy = nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, action_size * 2),
        nn.Tanh(),
        Lambda(squashed_diagonal_gaussian_head),
    )
    policy[2].weight.detach().mul_(1e-1)
    policy_optimizer = torch.optim.Adam(policy.parameters())

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )
        q_func[3].weight.detach().mul_(1e-1)
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=1e-2)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = pfrl.replay_buffers.ReplayBuffer(10**6)

    def burnin_action_func():
        return np.random.uniform(env.action_space.low,
                                 env.action_space.high).astype(np.float32)

    agent = pfrl.agents.SoftActorCritic(
        policy=policy,
        q_func1=q_func1,
        q_func2=q_func2,
        policy_optimizer=policy_optimizer,
        q_func1_optimizer=q_func1_optimizer,
        q_func2_optimizer=q_func2_optimizer,
        replay_buffer=rbuf,
        gamma=0.5,
        minibatch_size=100,
        replay_start_size=100,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        max_grad_norm=1.0,
    )
    return agent
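The squashed_diagonal_gaussian_head above returns a TransformedDistribution whose samples are tanh-squashed into (-1, 1). A small, hypothetical usage sketch (obs_size and policy refer to the objects built inside make_agent; the observation batch is fabricated purely for illustration):

# Hypothetical usage sketch: sample squashed actions and their log-probabilities.
obs_batch = torch.randn(4, obs_size)        # fake observations, illustration only
action_dist = policy(obs_batch)             # TransformedDistribution over actions in (-1, 1)
actions = action_dist.rsample()             # reparameterized samples, shape (4, action_size)
log_probs = action_dist.log_prob(actions)   # log-densities under the squashed Gaussian, shape (4,)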
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--outdir", type=str, default="results",
                        help="Directory path to save output files. If it does not exist, it will be created.")
    parser.add_argument("--env", type=str, default="RoboschoolAtlasForwardWalk-v1",
                        help="OpenAI Gym env to perform algorithm on.")
    parser.add_argument("--num-envs", type=int, default=4, help="Number of envs run in parallel.")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--load", type=str, default="", help="Directory to load agent from.")
    parser.add_argument("--steps", type=int, default=10**7,
                        help="Total number of timesteps to train the agent.")
    parser.add_argument("--eval-n-runs", type=int, default=20,
                        help="Number of episodes run for each evaluation.")
    parser.add_argument("--eval-interval", type=int, default=100000,
                        help="Interval in timesteps between evaluations.")
    parser.add_argument("--replay-start-size", type=int, default=10000,
                        help="Minimum replay buffer size before performing gradient updates.")
    parser.add_argument("--update-interval", type=int, default=1,
                        help="Interval in timesteps between model updates.")
    parser.add_argument("--batch-size", type=int, default=256, help="Minibatch size")
    parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.")
    parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.")
    parser.add_argument("--monitor", action="store_true", help="Wrap env with Monitor to write videos.")
    parser.add_argument("--log-interval", type=int, default=1000,
                        help="Interval in timesteps between outputting log messages during training")
    parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.")
    parser.add_argument("--n-hidden-channels", type=int, default=1024,
                        help="Number of hidden channels of NN models.")
    parser.add_argument("--discount", type=float, default=0.98, help="Discount factor.")
    parser.add_argument("--n-step-return", type=int, default=3, help="N-step return.")
    parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.")
    parser.add_argument("--adam-eps", type=float, default=1e-1, help="Adam eps.")
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, args, process_seeds[idx], test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(args, process_seeds[0], test=False)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)
    del sample_env

    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_space.low.size, args.n_hidden_channels),
        nn.ReLU(),
        nn.Linear(args.n_hidden_channels, args.n_hidden_channels),
        nn.ReLU(),
        nn.Linear(args.n_hidden_channels, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight)
    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr, eps=args.adam_eps)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_space.low.size + action_size, args.n_hidden_channels),
            nn.ReLU(),
            nn.Linear(args.n_hidden_channels, args.n_hidden_channels),
            nn.ReLU(),
            nn.Linear(args.n_hidden_channels, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=args.lr, eps=args.adam_eps)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=args.discount,
        update_interval=args.update_interval,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=args.lr,
    )

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        eval_env = make_env(args, seed=0, test=True)
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=timestep_limit,
        )
def _test_load_sac(self, gpu):
    obs_size = 11
    action_size = 3

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        from torch import distributions
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)],
        )

    from pfrl.nn.lmbda import Lambda

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=3e-4)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    agent = agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        replay_buffers.ReplayBuffer(100),
        gamma=0.99,
        replay_start_size=1000,
        gpu=gpu,
        minibatch_size=256,
        burnin_action_func=None,
        entropy_target=-3,
        temperature_optimizer_lr=3e-4,
    )

    downloaded_model, exists = download_model(
        "SAC", "Hopper-v2", model_type=self.pretrained_type)
    agent.load(downloaded_model)
    if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
        assert exists
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--outdir", type=str, default="results",
                        help="Directory path to save output files. If it does not exist, it will be created.")
    parser.add_argument("--env", type=str, default="Hopper-v2",
                        help="OpenAI Gym MuJoCo env to perform algorithm on.")
    parser.add_argument("--num-envs", type=int, default=1, help="Number of envs run in parallel.")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--load", type=str, default="", help="Directory to load agent from.")
    parser.add_argument("--steps", type=int, default=10**6,
                        help="Total number of timesteps to train the agent.")
    parser.add_argument("--eval-n-runs", type=int, default=10,
                        help="Number of episodes run for each evaluation.")
    parser.add_argument("--eval-interval", type=int, default=5000,
                        help="Interval in timesteps between evaluations.")
    parser.add_argument("--replay-start-size", type=int, default=10000,
                        help="Minimum replay buffer size before performing gradient updates.")
    parser.add_argument("--batch-size", type=int, default=256, help="Minibatch size")
    parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.")
    parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.")
    parser.add_argument("--load-pretrained", action="store_true", default=False)
    parser.add_argument("--pretrained-type", type=str, default="best", choices=["best", "final"])
    parser.add_argument("--monitor", action="store_true", help="Wrap env with gym.wrappers.Monitor.")
    parser.add_argument("--log-interval", type=int, default=1000,
                        help="Interval in timesteps between outputting log messages during training")
    parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.")
    parser.add_argument("--policy-output-scale", type=float, default=1.0,
                        help="Weight initialization scale of policy output.")
    parser.add_argument("--optimizer", type=str, default="AdaBelief")
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Unwrap the TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        # Normalize action space to [-1, 1]^n
        env = pfrl.wrappers.NormalizeActionSpace(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    def make_optimizer(parameters):
        if args.optimizer == "OfficialAdaBelief":
            import adabelief_pytorch
            optim_class = adabelief_pytorch.AdaBelief
            optim = optim_class(parameters, betas=(0.9, 0.999), eps=1e-12)
        else:
            optim_class = getattr(
                torch_optimizer,
                args.optimizer,
                getattr(torch.optim, args.optimizer, None),
            )
            optim = optim_class(parameters)
        assert optim_class is not None
        print(str(optim_class), "with default hyperparameters")
        return optim

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight, gain=args.policy_output_scale)
    policy_optimizer = make_optimizer(policy.parameters())

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = make_optimizer(q_func.parameters())
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(10**6)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=3e-4,
    )

    if len(args.load) > 0 or args.load_pretrained:
        if args.load_pretrained:
            raise Exception("Pretrained models are currently unsupported.")
        # Either load or load_pretrained must be false
        assert not len(args.load) > 0 or not args.load_pretrained
        if len(args.load) > 0:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model("SAC", args.env, model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=timestep_limit,
        )
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--weight_dir', type=str, default='', help='path to trained weights')
    parser.add_argument('-s', '--step_to_load', type=int, default=0, help='step checkpoint to load')
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()

    weight_dir = args.weight_dir
    step_to_load = args.step_to_load

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    save_path = os.path.join(
        weight_dir, 'testing_' + str(step_to_load),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg_sac'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1
    cfg['environment']['num_threads'] = 1
    cfg['environment']['control_dt'] = cfg['testing']['control_dt']
    cfg['environment']['render'] = cfg['testing']['render']

    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    agent.load(weight_dir + '/' + str(step_to_load) + '_checkpoint')

    if cfg['testing']['render']:
        env.wrapper.showWindow()
    if cfg['testing']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(cfg['testing']['seconds'] / cfg['testing']['control_dt'])

    torch.manual_seed(cfg['environment']['seed'])

    act = np.ndarray(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['testing']['render'])
    ob = env.reset()

    try:
        for i in range(test_steps):
            if i % 100 == 0:
                env.reset()
            with agent.eval_mode():
                agent.act_deterministically = True
                act = agent.batch_act(ob)
            ob, rew, done, info = env.step(act, visualize=cfg['testing']['render'])
    except KeyboardInterrupt:
        pass
    finally:
        if cfg['testing']['record_video']:
            env.stop_recording_video()
def __init__(self, state_dim, goal_dim, action_dim, scale, replay_buffer,
             actor_lr, critic_lr, expl_noise, policy_noise, noise_clip, gamma,
             policy_freq, tau, is_low_level, buffer_freq, minibatch_size, gpu,
             add_entropy, burnin_action_func=None, replay_start_size=2500):
    self.scale = scale

    # parameters
    self.expl_noise = expl_noise
    self.policy_noise = policy_noise
    self.noise_clip = noise_clip
    self.gamma = gamma
    self.policy_freq = policy_freq
    self.tau = tau
    self.is_low_level = is_low_level
    self.minibatch_size = minibatch_size
    self.add_entropy = add_entropy

    # create td3 agent
    self.device = torch.device(f'cuda:{gpu}')

    if self.add_entropy:

        def squashed_diagonal_gaussian_head(x):
            mean, log_scale = torch.chunk(x, 2, dim=-1)
            log_scale = torch.clamp(log_scale, -20.0, 2.0)
            var = torch.exp(log_scale * 2)
            base_distribution = distributions.Independent(
                distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
            return base_distribution

        policy = nn.Sequential(
            nn.Linear(state_dim + goal_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim * 2),
            nn.Tanh(),
            ConstantsMult(
                torch.cat(
                    (torch.tensor(self.scale),
                     torch.ones(self.scale.size))).float().to(self.device)),
            # pfrl.policies.DeterministicHead(),
            Lambda(squashed_diagonal_gaussian_head),
        )
    else:
        policy = nn.Sequential(
            nn.Linear(state_dim + goal_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim),
            nn.Tanh(),
            ConstantsMult(torch.tensor(self.scale).float().to(self.device)),
            pfrl.policies.DeterministicHead(),
        )

    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(state_dim + goal_dim + action_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=critic_lr)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    # TODO - have proper low and high values from action space.
    # From the HIRO paper, the scale is 1.0
    explorer = explorers.AdditiveGaussian(scale=self.expl_noise * 1.0,
                                          low=-self.scale,
                                          high=self.scale)

    def default_target_policy_smoothing_func(batch_action):
        """Add noise to actions for target policy smoothing."""
        noise = torch.clamp(self.policy_noise * torch.randn_like(batch_action),
                            -self.noise_clip, self.noise_clip)
        smoothed_action = batch_action + noise
        smoothed_action = torch.min(
            smoothed_action,
            torch.tensor(self.scale).to(self.device).float())
        smoothed_action = torch.max(
            smoothed_action,
            torch.tensor(-self.scale).to(self.device).float())
        return smoothed_action

    if self.is_low_level:
        # standard goal-conditioned TD3
        self.agent = GoalConditionedTD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffer,
            gamma=gamma,
            soft_update_tau=tau,
            explorer=explorer,
            update_interval=1,
            policy_update_delay=policy_freq,
            replay_start_size=replay_start_size,
            buffer_freq=buffer_freq,
            minibatch_size=minibatch_size,
            gpu=gpu,
            add_entropy=self.add_entropy,
            burnin_action_func=burnin_action_func,
            target_policy_smoothing_func=default_target_policy_smoothing_func)
    else:
        self.agent = HIROHighLevelGoalConditionedTD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffer,
            gamma=gamma,
            soft_update_tau=tau,
            explorer=explorer,
            update_interval=1,
            policy_update_delay=policy_freq,
            replay_start_size=replay_start_size / buffer_freq,
            buffer_freq=buffer_freq,
            minibatch_size=minibatch_size,
            gpu=gpu,
            add_entropy=self.add_entropy,
            burnin_action_func=burnin_action_func,
            target_policy_smoothing_func=default_target_policy_smoothing_func)

    self.device = self.agent.device
def __init__(self, state_dim, goal_dim, action_dim, scale, replay_buffer,
             actor_lr, critic_lr, expl_noise, policy_noise, noise_clip, gamma,
             policy_freq, tau, is_low_level, buffer_freq, minibatch_size, gpu,
             add_entropy, burnin_action_func=None, replay_start_size=2500,
             temperature=1.0, optimize_temp=False):
    self.scale = scale

    if gpu is not None and gpu >= 0:
        assert torch.cuda.is_available()
        self.device = torch.device("cuda:{}".format(gpu))
    else:
        self.device = torch.device("cpu")
    self.scale_tensor = torch.tensor(self.scale).float().to(self.device)

    # parameters
    self.expl_noise = expl_noise
    self.policy_noise = policy_noise
    self.noise_clip = noise_clip
    self.gamma = gamma
    self.policy_freq = policy_freq
    self.tau = tau
    self.is_low_level = is_low_level
    self.minibatch_size = minibatch_size
    self.add_entropy = add_entropy

    # create agent
    if self.add_entropy:

        def squashed_diagonal_gaussian_head(x):
            """Taken from the SAC code."""
            assert x.shape[-1] == action_dim * 2
            mean, log_scale = torch.chunk(x, 2, dim=-1)
            log_scale = torch.clamp(log_scale, -20.0, 2.0)
            var = torch.exp(log_scale * 2)
            base_distribution = distributions.Independent(
                distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
            )
            # cache_size=1 is required for numerical stability
            return distributions.transformed_distribution.TransformedDistribution(
                base_distribution,
                [distributions.transforms.TanhTransform(cache_size=1)]
            )

        # SAC policy definition:
        policy = nn.Sequential(
            nn.Linear(state_dim + goal_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim * 2),
            Lambda(squashed_diagonal_gaussian_head),
        )
        torch.nn.init.xavier_uniform_(policy[0].weight)
        torch.nn.init.xavier_uniform_(policy[2].weight)
        torch.nn.init.xavier_uniform_(policy[4].weight)

        explorer = explorers.AdditiveGaussian(scale=0.0)
    else:
        policy = nn.Sequential(
            nn.Linear(state_dim + goal_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, action_dim),
            nn.Tanh(),
            pfrl.policies.DeterministicHead(),
        )
        # TODO - have proper low and high values from action space.
        # From the HIRO paper, the scale is 1.0
        explorer = explorers.AdditiveGaussian(
            scale=self.expl_noise, low=-self.scale, high=self.scale
        )

    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(state_dim + goal_dim + action_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=critic_lr)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    def default_target_policy_smoothing_func(batch_action):
        """Add noise to actions for target policy smoothing."""
        noise = torch.clamp(self.policy_noise * torch.randn_like(batch_action),
                            -self.noise_clip, self.noise_clip)
        smoothed_action = batch_action + noise
        smoothed_action = torch.min(smoothed_action,
                                    torch.tensor(self.scale).to(self.device).float())
        smoothed_action = torch.max(smoothed_action,
                                    torch.tensor(-self.scale).to(self.device).float())
        return smoothed_action

    input_scale = self.scale_tensor

    if self.is_low_level:
        # standard goal-conditioned TD3
        self.agent = GoalConditionedTD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffer,
            gamma=gamma,
            soft_update_tau=tau,
            explorer=explorer,
            update_interval=1,
            policy_update_delay=policy_freq,
            replay_start_size=replay_start_size,
            buffer_freq=buffer_freq,
            minibatch_size=minibatch_size,
            gpu=gpu,
            add_entropy=self.add_entropy,
            scale=input_scale,
            burnin_action_func=burnin_action_func,
            target_policy_smoothing_func=default_target_policy_smoothing_func,
            entropy_temperature=temperature,
            optimize_temp=optimize_temp,
        )
    else:
        self.agent = HIROHighLevelGoalConditionedTD3(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffer,
            gamma=gamma,
            soft_update_tau=tau,
            explorer=explorer,
            update_interval=1,
            policy_update_delay=policy_freq,
            replay_start_size=replay_start_size / buffer_freq - 5,
            buffer_freq=buffer_freq,
            minibatch_size=minibatch_size,
            gpu=gpu,
            add_entropy=self.add_entropy,
            scale=input_scale,
            burnin_action_func=burnin_action_func,
            target_policy_smoothing_func=default_target_policy_smoothing_func,
            entropy_temperature=temperature,
            optimize_temp=optimize_temp,
        )

    self.device = self.agent.device
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--outdir", type=str, default="results",
                        help="Directory path to save output files. If it does not exist, it will be created.")
    parser.add_argument("--num-envs", type=int, default=1, help="Number of envs run in parallel.")
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--load", type=str, default="", help="Directory to load agent from.")
    parser.add_argument("--steps", type=int, default=10**7,
                        help="Total number of timesteps to train the agent.")
    parser.add_argument("--eval-n-runs", type=int, default=10,
                        help="Number of episodes run for each evaluation.")
    parser.add_argument("--eval-interval", type=int, default=100000,
                        help="Interval in timesteps between evaluations.")
    parser.add_argument("--replay-start-size", type=int, default=2500,
                        help="Minimum replay buffer size before performing gradient updates.")
    parser.add_argument("--update-interval", type=int, default=1,
                        help="Interval in timesteps between model updates.")
    parser.add_argument("--batch-size", type=int, default=100, help="Minibatch size")
    parser.add_argument("--render", action="store_true", help="Render env states in a GUI window.")
    parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.")
    parser.add_argument("--monitor", action="store_true", help="Wrap env with Monitor to write videos.")
    parser.add_argument("--log-interval", type=int, default=1000,
                        help="Interval in timesteps between outputting log messages during training")
    parser.add_argument("--log-level", type=int, default=logging.INFO, help="Level of the root logger.")
    parser.add_argument("--n-hidden-channels", type=int, default=256,
                        help="Number of hidden channels of NN models.")
    parser.add_argument("--env", type=str, default="AntMaze",
                        help="Type of Ant Env to use. Options are AntMaze, AntFall, and AntPush.")
    parser.add_argument("--discount", type=float, default=0.99, help="Discount factor.")
    parser.add_argument("--n-step-return", type=int, default=3, help="N-step return.")
    parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.")
    parser.add_argument("--adam-eps", type=float, default=1e-1, help="Adam eps.")
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = experiments.prepare_output_dir(args, args.outdir, argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_ant_env(idx, test):
        # Use different seeds for train vs test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        # env_seed = np.random.randint(0, 2**32 - 1) if not test else process_seed
        print('seed', env_seed)
        utils.set_random_seed(env_seed)
        # Create the Ant environment with a goal
        env = AntEnvWithGoal(create_maze_env(args.env), args.env, env_subgoal_dim=15)
        env.seed(int(env_seed))
        if args.render:
            env = pfrl.wrappers.GymLikeEnvRender(env, mode='human')
        return env

    eval_env = make_ant_env(0, test=True)

    env_state_dim = eval_env.state_dim
    env_action_dim = eval_env.action_dim
    if args.env == 'AntMaze' or args.env == 'AntPush':
        env_goal_dim = 2
    else:
        env_goal_dim = 3

    action_size = env_action_dim
    action_space = eval_env.action_space
    scale_low = action_space.high * np.ones(env_action_dim)

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(env_state_dim + env_goal_dim, args.n_hidden_channels),
        nn.ReLU(),
        nn.Linear(args.n_hidden_channels, args.n_hidden_channels),
        nn.ReLU(),
        nn.Linear(args.n_hidden_channels, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight)
    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=0.0001)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(env_state_dim + env_goal_dim + env_action_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=0.001)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(200000)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    if args.gpu is not None and args.gpu >= 0:
        assert torch.cuda.is_available()
        device = torch.device("cuda:{}".format(args.gpu))
    else:
        device = torch.device("cpu")

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    scale_tensor = torch.tensor(scale_low).float().to(device)
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=args.discount,
        update_interval=args.update_interval,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=args.lr,
        scale=scale_tensor)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=None,  # this script does not derive a timestep limit from the env
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_goal_conditioned_agent_with_evaluation(
            agent=agent,
            env=make_ant_env(0, test=False),
            steps=args.steps,
            eval_n_steps=None,
            outdir=args.outdir,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=5000,
            use_tensorboard=True,
        )
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    # config file arg
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name', type=str, default='cfg_sac.yaml', help='configuration file')
    parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.")
    parser.add_argument("--demo-record", action="store_true", help="Save video of demo.")
    parser.add_argument("--load", type=str, default="", help="Directory to load agent from.")
    parser.add_argument("--log-interval", type=int, default=1000,
                        help="Interval in timesteps between outputting log messages during training")
    parser.add_argument("--eval-interval", type=int, default=5000,
                        help="Interval in timesteps between evaluations.")
    parser.add_argument("--checkpoint-interval", type=int, default=5000,
                        help="Interval in timesteps between saving checkpoints")
    parser.add_argument("--eval-n-runs", type=int, default=10,
                        help="Number of episodes run for each evaluation.")
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()

    cfg_name = args.cfg_name

    # folder config & logdir
    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/../" + cfg_name
    log_dir = os.path.join(task_path, 'runs/pfrl_sac')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path, __file__]
    if not args.demo:
        cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # environment
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    steps_per_episode = math.floor(cfg['environment']['max_time'] /
                                   cfg['environment']['control_dt'])
    total_steps_per_iteration = steps_per_episode * cfg['environment']['num_envs']
    total_training_steps = cfg['algorithm']['total_algorithm_updates'] * total_steps_per_iteration

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight, gain=1.0)
    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    # logger settings
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
    logger = logging.getLogger(__name__)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        if cfg['environment']['render']:
            env.show_window()
        if args.demo_record:
            env.start_recording_video(
                args.load + "/../demo_" + os.path.basename(args.load) + ".mp4")
        eval_stats = eval_performance_pfrl(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
        )
        if cfg['environment']['render']:
            if args.demo_record:
                env.stop_recording_video()
            env.hide_window()
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        train_agent_batch_with_evaluation_pfrl(
            agent=agent,
            env=env,
            outdir=cfg_saver.data_dir,
            steps=total_training_steps,
            eval_n_steps=steps_per_episode,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
            use_tensorboard=True,
            checkpoint_freq=args.checkpoint_interval,
            logger=logger)