Example #1
def make_policy(state_shape, action_shape):
    # net = nn.Sequential(
    #     nn.Conv2d(3, num, 4, stride=2),
    #     nn.ReLU(inplace=True),
    #     nn.Conv2d(num, 64, 4, stride=2),
    #     nn.ReLU(inplace=True),
    #     nn.Conv2d(64, 32, 4, stride=2),
    #     nn.ReLU(inplace=True),
    #     nn.Conv2d(32, 8, 4, stride=2),
    #     nn.ReLU(inplace=True),
    #     Flatten(),  # torch.Size([1, 192])
    #     nn.Linear(192, 64),  # torch.Size([1, 64])
    #     nn.ReLU(inplace=True),
    #     nn.Linear(64, 2 * 2),
    #     Lambda(squashed_diagonal_gaussian_head),
    # )
    num = 256
    net = nn.Sequential(
        # nn.Linear(state_shape[0], num),
        nn.Linear(state_shape, num),
        nn.ReLU(inplace=True),
        # nn.Linear(num, num),
        # nn.ReLU(inplace=True),
        # nn.Linear(num, num),
        # nn.ReLU(inplace=True),
        nn.Linear(num, 128),
        nn.ReLU(inplace=True),
        nn.Linear(128, 64),
        nn.ReLU(inplace=True),
        nn.Linear(64, 2 * action_shape[0]),
        Lambda(squashed_diagonal_gaussian_head),
    )
    net.apply(init_weights)
    return net
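Example #1 relies on Lambda(squashed_diagonal_gaussian_head) to place the distribution head at the end of a plain nn.Sequential. For orientation, a minimal sketch of what such a Lambda wrapper amounts to is shown below; this is illustrative only, and the actual pfrl.nn.lmbda.Lambda may differ in details.

from torch import nn

class Lambda(nn.Module):
    """Wrap an arbitrary callable so it can be placed inside nn.Sequential."""

    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd

    def forward(self, x):
        # Delegate the forward pass to the wrapped callable.
        return self.lambd(x)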
Example #2
    def __init__(self, state_dim, goal_dim, action_dim, max_action):
        super(StochasticActor, self).__init__()
        self.action_dim = action_dim
        self.l1 = nn.Linear(state_dim + goal_dim, 300)
        self.l2 = nn.Linear(300, 300)
        self.l3 = nn.Linear(300, action_dim * 2)
        self.lambda_fnc = Lambda(self.squashed_diagonal_gaussian_head)

        self.max_action = max_action
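Example #2 passes self.squashed_diagonal_gaussian_head to Lambda, but the method itself lies outside the snippet. A sketch of what it presumably looks like, adapted from the module-level definitions repeated in the later examples (turning it into a method that uses self.action_dim is an assumption):

    def squashed_diagonal_gaussian_head(self, x):
        assert x.shape[-1] == self.action_dim * 2
        # Split the network output into mean and log-scale halves.
        mean, log_scale = torch.chunk(x, 2, dim=-1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = torch.distributions.Independent(
            torch.distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return torch.distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [torch.distributions.transforms.TanhTransform(cache_size=1)])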
Example #3
def test_lambda():
    model = nn.Sequential(
        nn.ReLU(),
        Lambda(lambda x: x + 1),
        nn.ReLU(),
    )
    x = torch.rand(3, 2)
    # Since x is all positive, the ReLUs have no effect
    y = model(x)
    torch_assert_allclose(y, x + 1)
Example #4
def make_policy():
    num = 64
    net = nn.Sequential(
        nn.Conv2d(3, num, 4, stride=2),
        nn.ReLU(inplace=True),
        nn.Conv2d(num, 64, 4, stride=2),
        nn.ReLU(inplace=True),
        nn.Conv2d(64, 32, 4, stride=2),
        nn.ReLU(inplace=True),
        nn.Conv2d(32, 8, 4, stride=2),
        nn.ReLU(inplace=True),
        Flatten(),  # torch.Size([1, 192])
        nn.Linear(192, 64),  # torch.Size([1, 64])
        nn.ReLU(inplace=True),
        nn.Linear(64, 2 * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    net.apply(init_weights)
    return net
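The hard-coded 192 before the first Linear layer depends on the image resolution fed to the convolutional stack. A quick way to recover that number for a chosen resolution is a dummy forward pass through the convolutional part; the 3x96x128 input below is an assumption, picked because it reproduces the 192 noted in the comment:

import torch
from torch import nn

conv = nn.Sequential(
    nn.Conv2d(3, 64, 4, stride=2), nn.ReLU(),
    nn.Conv2d(64, 64, 4, stride=2), nn.ReLU(),
    nn.Conv2d(64, 32, 4, stride=2), nn.ReLU(),
    nn.Conv2d(32, 8, 4, stride=2), nn.ReLU(),
)
with torch.no_grad():
    n_flat = conv(torch.zeros(1, 3, 96, 128)).flatten(1).shape[1]
print(n_flat)  # 192; use this value for the first nn.Linear layer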
Example #5
    def make_agent(self, env, gpu):
        obs_size = env.observation_space.low.size
        action_size = env.action_space.low.size
        hidden_size = 20

        def squashed_diagonal_gaussian_head(x):
            assert x.shape[-1] == action_size * 2
            mean, log_scale = torch.chunk(x, 2, dim=1)
            log_scale = torch.clamp(log_scale, -20.0, 2.0)
            var = torch.exp(log_scale * 2)
            base_distribution = distributions.Independent(
                distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
            # cache_size=1 is required for numerical stability
            return distributions.transformed_distribution.TransformedDistribution(
                base_distribution,
                [distributions.transforms.TanhTransform(cache_size=1)],
            )

        policy = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(
                hidden_size,
                action_size * 2,
            ),
            nn.Tanh(),
            Lambda(squashed_diagonal_gaussian_head),
        )
        policy[2].weight.detach().mul_(1e-1)
        policy_optimizer = torch.optim.Adam(policy.parameters())

        def make_q_func_with_optimizer():
            q_func = nn.Sequential(
                pfrl.nn.ConcatObsAndAction(),
                nn.Linear(obs_size + action_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, 1),
            )
            q_func[3].weight.detach().mul_(1e-1)
            q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=1e-2)
            return q_func, q_func_optimizer

        q_func1, q_func1_optimizer = make_q_func_with_optimizer()
        q_func2, q_func2_optimizer = make_q_func_with_optimizer()

        rbuf = pfrl.replay_buffers.ReplayBuffer(10**6)

        def burnin_action_func():
            return np.random.uniform(env.action_space.low,
                                     env.action_space.high).astype(np.float32)

        agent = pfrl.agents.SoftActorCritic(
            policy=policy,
            q_func1=q_func1,
            q_func2=q_func2,
            policy_optimizer=policy_optimizer,
            q_func1_optimizer=q_func1_optimizer,
            q_func2_optimizer=q_func2_optimizer,
            replay_buffer=rbuf,
            gamma=0.5,
            minibatch_size=100,
            replay_start_size=100,
            burnin_action_func=burnin_action_func,
            entropy_target=-action_size,
            max_grad_norm=1.0,
        )

        return agent
Example #6
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument(
        "--env",
        type=str,
        default="RoboschoolAtlasForwardWalk-v1",
        help="OpenAI Gym env to perform algorithm on.",
    )
    parser.add_argument("--num-envs",
                        type=int,
                        default=4,
                        help="Number of envs run in parallel.")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--load",
                        type=str,
                        default="",
                        help="Directory to load agent from.")
    parser.add_argument(
        "--steps",
        type=int,
        default=10**7,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=20,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=100000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=10000,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=1,
        help="Interval in timesteps between model updates.",
    )
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="Minibatch size")
    parser.add_argument("--render",
                        action="store_true",
                        help="Render env states in a GUI window.")
    parser.add_argument("--demo",
                        action="store_true",
                        help="Just run evaluation, not training.")
    parser.add_argument("--monitor",
                        action="store_true",
                        help="Wrap env with Monitor to write videos.")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1000,
        help=
        "Interval in timesteps between outputting log messages during training",
    )
    parser.add_argument("--log-level",
                        type=int,
                        default=logging.INFO,
                        help="Level of the root logger.")
    parser.add_argument(
        "--n-hidden-channels",
        type=int,
        default=1024,
        help="Number of hidden channels of NN models.",
    )
    parser.add_argument("--discount",
                        type=float,
                        default=0.98,
                        help="Discount factor.")
    parser.add_argument("--n-step-return",
                        type=int,
                        default=3,
                        help="N-step return.")
    parser.add_argument("--lr",
                        type=float,
                        default=3e-4,
                        help="Learning rate.")
    parser.add_argument("--adam-eps",
                        type=float,
                        default=1e-1,
                        help="Adam eps.")
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, args, process_seeds[idx], test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(args, process_seeds[0], test=False)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)
    del sample_env

    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_space.low.size, args.n_hidden_channels),
        nn.ReLU(),
        nn.Linear(args.n_hidden_channels, args.n_hidden_channels),
        nn.ReLU(),
        nn.Linear(args.n_hidden_channels, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight)
    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=args.lr,
                                        eps=args.adam_eps)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_space.low.size + action_size,
                      args.n_hidden_channels),
            nn.ReLU(),
            nn.Linear(args.n_hidden_channels, args.n_hidden_channels),
            nn.ReLU(),
            nn.Linear(args.n_hidden_channels, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(q_func.parameters(),
                                            lr=args.lr,
                                            eps=args.adam_eps)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=args.discount,
        update_interval=args.update_interval,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=args.lr,
    )

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        eval_env = make_env(args, seed=0, test=True)
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=timestep_limit,
        )
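The squashed_diagonal_gaussian_head used above (and repeated almost verbatim in the remaining examples) can be exercised in isolation. A rough sketch with a dummy batch; the shapes and variable names here are illustrative and not taken from the original scripts:

import torch
from torch import distributions

action_size = 2
x = torch.zeros(3, action_size * 2)  # dummy policy output for a batch of 3
mean, log_scale = torch.chunk(x, 2, dim=1)
log_scale = torch.clamp(log_scale, -20.0, 2.0)
var = torch.exp(log_scale * 2)
base = distributions.Independent(
    distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
dist = distributions.transformed_distribution.TransformedDistribution(
    base, [distributions.transforms.TanhTransform(cache_size=1)])
action = dist.sample()            # shape (3, action_size), squashed into (-1, 1)
log_prob = dist.log_prob(action)  # shape (3,)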
Example #7
    def _test_load_sac(self, gpu):
        obs_size = 11
        action_size = 3

        def squashed_diagonal_gaussian_head(x):
            assert x.shape[-1] == action_size * 2
            mean, log_scale = torch.chunk(x, 2, dim=1)
            log_scale = torch.clamp(log_scale, -20.0, 2.0)
            var = torch.exp(log_scale * 2)
            from torch import distributions

            base_distribution = distributions.Independent(
                distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
            # cache_size=1 is required for numerical stability
            return distributions.transformed_distribution.TransformedDistribution(
                base_distribution,
                [distributions.transforms.TanhTransform(cache_size=1)],
            )

        from pfrl.nn.lmbda import Lambda

        policy = nn.Sequential(
            nn.Linear(obs_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_size * 2),
            Lambda(squashed_diagonal_gaussian_head),
        )
        policy_optimizer = torch.optim.Adam(policy.parameters(), lr=3e-4)

        def make_q_func_with_optimizer():
            q_func = nn.Sequential(
                pfrl.nn.ConcatObsAndAction(),
                nn.Linear(obs_size + action_size, 256),
                nn.ReLU(),
                nn.Linear(256, 256),
                nn.ReLU(),
                nn.Linear(256, 1),
            )
            torch.nn.init.xavier_uniform_(q_func[1].weight)
            torch.nn.init.xavier_uniform_(q_func[3].weight)
            torch.nn.init.xavier_uniform_(q_func[5].weight)
            q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=3e-4)
            return q_func, q_func_optimizer

        q_func1, q_func1_optimizer = make_q_func_with_optimizer()
        q_func2, q_func2_optimizer = make_q_func_with_optimizer()

        agent = agents.SoftActorCritic(
            policy,
            q_func1,
            q_func2,
            policy_optimizer,
            q_func1_optimizer,
            q_func2_optimizer,
            replay_buffers.ReplayBuffer(100),
            gamma=0.99,
            replay_start_size=1000,
            gpu=gpu,
            minibatch_size=256,
            burnin_action_func=None,
            entropy_target=-3,
            temperature_optimizer_lr=3e-4,
        )

        downloaded_model, exists = download_model(
            "SAC", "Hopper-v2", model_type=self.pretrained_type)
        agent.load(downloaded_model)
        if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
            assert exists
Example #8
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument(
        "--env",
        type=str,
        default="Hopper-v2",
        help="OpenAI Gym MuJoCo env to perform algorithm on.",
    )
    parser.add_argument("--num-envs",
                        type=int,
                        default=1,
                        help="Number of envs run in parallel.")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--load",
                        type=str,
                        default="",
                        help="Directory to load agent from.")
    parser.add_argument(
        "--steps",
        type=int,
        default=10**6,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=10000,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="Minibatch size")
    parser.add_argument("--render",
                        action="store_true",
                        help="Render env states in a GUI window.")
    parser.add_argument("--demo",
                        action="store_true",
                        help="Just run evaluation, not training.")
    parser.add_argument("--load-pretrained",
                        action="store_true",
                        default=False)
    parser.add_argument("--pretrained-type",
                        type=str,
                        default="best",
                        choices=["best", "final"])
    parser.add_argument("--monitor",
                        action="store_true",
                        help="Wrap env with gym.wrappers.Monitor.")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1000,
        help=
        "Interval in timesteps between outputting log messages during training",
    )
    parser.add_argument("--log-level",
                        type=int,
                        default=logging.INFO,
                        help="Level of the root logger.")
    parser.add_argument(
        "--policy-output-scale",
        type=float,
        default=1.0,
        help="Weight initialization scale of policy output.",
    )
    parser.add_argument(
        "--optimizer",
        type=str,
        default="AdaBelief",
    )
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Unwrap the TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        # Normalize action space to [-1, 1]^n
        env = pfrl.wrappers.NormalizeActionSpace(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])

    sample_env = make_env(process_idx=0, test=False)
    timestep_limit = sample_env.spec.max_episode_steps
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    def make_optimizer(parameters):

        if args.optimizer == "OfficialAdaBelief":
            import adabelief_pytorch

            optim_class = adabelief_pytorch.AdaBelief
            optim = optim_class(parameters, betas=(0.9, 0.999), eps=1e-12)
        else:
            optim_class = getattr(
                torch_optimizer,
                args.optimizer,
                getattr(torch.optim, args.optimizer, None),
            )
            optim = optim_class(parameters)
        assert optim_class is not None
        print(str(optim_class), "with default hyperparameters")
        return optim

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight,
                                  gain=args.policy_output_scale)
    policy_optimizer = make_optimizer(policy.parameters())

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = make_optimizer(q_func.parameters())
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(10**6)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=3e-4,
    )

    if len(args.load) > 0 or args.load_pretrained:
        if args.load_pretrained:
            raise Exception("Pretrained models are currently unsupported.")
        # --load and --load-pretrained must not both be given
        assert not len(args.load) > 0 or not args.load_pretrained
        if len(args.load) > 0:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model("SAC",
                                     args.env,
                                     model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=timestep_limit,
        )
Example #9
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    parser = argparse.ArgumentParser()
    parser.add_argument('-w',
                        '--weight_dir',
                        type=str,
                        default='',
                        help='path to the directory with trained weights')
    parser.add_argument('-s',
                        '--step_to_load',
                        type=int,
                        default=0,
                        help='step checkpoint to load')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    step_to_load = args.step_to_load

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    save_path = os.path.join(
        weight_dir, 'testing_' + str(step_to_load),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg_sac'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1
    cfg['environment']['num_threads'] = 1
    cfg['environment']['control_dt'] = cfg['testing']['control_dt']
    cfg['environment']['render'] = cfg['testing']['render']

    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )

    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    agent.load(weight_dir + '/' + str(step_to_load) + '_checkpoint')

    if cfg['testing']['render']:
        env.wrapper.showWindow()

    if cfg['testing']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(cfg['testing']['seconds'] / cfg['testing']['control_dt'])

    torch.manual_seed(cfg['environment']['seed'])

    act = np.ndarray(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['testing']['render'])

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                env.reset()
            with agent.eval_mode():
                agent.act_deterministically = True
                act = agent.batch_act(ob)

            ob, rew, done, info = env.step(act,
                                           visualize=cfg['testing']['render'])

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['testing']['record_video']:
            env.stop_recording_video()
Example #10
    def __init__(self,
                 state_dim,
                 goal_dim,
                 action_dim,
                 scale,
                 replay_buffer,
                 actor_lr,
                 critic_lr,
                 expl_noise,
                 policy_noise,
                 noise_clip,
                 gamma,
                 policy_freq,
                 tau,
                 is_low_level,
                 buffer_freq,
                 minibatch_size,
                 gpu,
                 add_entropy,
                 burnin_action_func=None,
                 replay_start_size=2500):
        self.scale = scale
        # parameters
        self.expl_noise = expl_noise
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.policy_freq = policy_freq
        self.tau = tau
        self.is_low_level = is_low_level
        self.minibatch_size = minibatch_size
        self.add_entropy = add_entropy
        # create td3 agent
        self.device = torch.device(f'cuda:{gpu}')
        if self.add_entropy:

            def squashed_diagonal_gaussian_head(x):
                mean, log_scale = torch.chunk(x, 2, dim=-1)
                log_scale = torch.clamp(log_scale, -20.0, 2.0)
                var = torch.exp(log_scale * 2)
                base_distribution = distributions.Independent(
                    distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
                return base_distribution

            policy = nn.Sequential(
                nn.Linear(state_dim + goal_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, action_dim * 2),
                nn.Tanh(),
                ConstantsMult(
                    torch.cat(
                        (torch.tensor(self.scale), torch.ones(
                            self.scale.size))).float().to(self.device)),
                # pfrl.policies.DeterministicHead(),
                Lambda(squashed_diagonal_gaussian_head),
            )
        else:
            policy = nn.Sequential(
                nn.Linear(state_dim + goal_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, action_dim),
                nn.Tanh(),
                ConstantsMult(
                    torch.tensor(self.scale).float().to(self.device)),
                pfrl.policies.DeterministicHead(),
            )

        policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)

        def make_q_func_with_optimizer():
            q_func = nn.Sequential(
                pfrl.nn.ConcatObsAndAction(),
                nn.Linear(state_dim + goal_dim + action_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, 1),
            )
            q_func_optimizer = torch.optim.Adam(q_func.parameters(),
                                                lr=critic_lr)
            return q_func, q_func_optimizer

        q_func1, q_func1_optimizer = make_q_func_with_optimizer()
        q_func2, q_func2_optimizer = make_q_func_with_optimizer()

        # TODO - have proper low and high values from action space.
        # from the hiro paper, the scale is 1.0
        explorer = explorers.AdditiveGaussian(scale=self.expl_noise * 1.0,
                                              low=-self.scale,
                                              high=self.scale)

        def default_target_policy_smoothing_func(batch_action):
            """Add noises to actions for target policy smoothing."""
            noise = torch.clamp(
                self.policy_noise * torch.randn_like(batch_action),
                -self.noise_clip, self.noise_clip)
            smoothed_action = batch_action + noise
            smoothed_action = torch.min(
                smoothed_action,
                torch.tensor(self.scale).to(self.device).float())
            smoothed_action = torch.max(
                smoothed_action,
                torch.tensor(-self.scale).to(self.device).float())
            return smoothed_action

        if self.is_low_level:
            # standard goal conditioned td3
            self.agent = GoalConditionedTD3(
                policy,
                q_func1,
                q_func2,
                policy_optimizer,
                q_func1_optimizer,
                q_func2_optimizer,
                replay_buffer,
                gamma=gamma,
                soft_update_tau=tau,
                explorer=explorer,
                update_interval=1,
                policy_update_delay=policy_freq,
                replay_start_size=replay_start_size,
                buffer_freq=buffer_freq,
                minibatch_size=minibatch_size,
                gpu=gpu,
                add_entropy=self.add_entropy,
                burnin_action_func=burnin_action_func,
                target_policy_smoothing_func=
                default_target_policy_smoothing_func)
        else:
            self.agent = HIROHighLevelGoalConditionedTD3(
                policy,
                q_func1,
                q_func2,
                policy_optimizer,
                q_func1_optimizer,
                q_func2_optimizer,
                replay_buffer,
                gamma=gamma,
                soft_update_tau=tau,
                explorer=explorer,
                update_interval=1,
                policy_update_delay=policy_freq,
                replay_start_size=replay_start_size / buffer_freq,
                buffer_freq=buffer_freq,
                minibatch_size=minibatch_size,
                gpu=gpu,
                add_entropy=self.add_entropy,
                burnin_action_func=burnin_action_func,
                target_policy_smoothing_func=
                default_target_policy_smoothing_func)

        self.device = self.agent.device
Example #11
    def __init__(
            self,
            state_dim,
            goal_dim,
            action_dim,
            scale,
            replay_buffer,
            actor_lr,
            critic_lr,
            expl_noise,
            policy_noise,
            noise_clip,
            gamma,
            policy_freq,
            tau,
            is_low_level,
            buffer_freq,
            minibatch_size,
            gpu,
            add_entropy,
            burnin_action_func=None,
            replay_start_size=2500,
            temperature=1.0,
            optimize_temp=False):
        self.scale = scale

        if gpu is not None and gpu >= 0:
            assert torch.cuda.is_available()
            self.device = torch.device("cuda:{}".format(gpu))
        else:
            self.device = torch.device("cpu")

        self.scale_tensor = torch.tensor(self.scale).float().to(self.device)
        # parameters
        self.expl_noise = expl_noise
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.policy_freq = policy_freq
        self.tau = tau
        self.is_low_level = is_low_level
        self.minibatch_size = minibatch_size
        self.add_entropy = add_entropy

        # create agent
        if self.add_entropy:
            def squashed_diagonal_gaussian_head(x):
                """
                taken from the SAC code.
                """
                assert x.shape[-1] == action_dim * 2

                mean, log_scale = torch.chunk(x, 2, dim=-1)
                log_scale = torch.clamp(log_scale, -20.0, 2.0)
                var = torch.exp(log_scale * 2)
                base_distribution = distributions.Independent(
                    distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
                )
                # cache_size=1 is required for numerical stability
                return distributions.transformed_distribution.TransformedDistribution(
                    base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]
                )

            # SAC policy definition:
            policy = nn.Sequential(
                nn.Linear(state_dim + goal_dim, 256),
                nn.ReLU(),
                nn.Linear(256, 256),
                nn.ReLU(),
                nn.Linear(256, action_dim * 2),
                Lambda(squashed_diagonal_gaussian_head),
                )

            torch.nn.init.xavier_uniform_(policy[0].weight)
            torch.nn.init.xavier_uniform_(policy[2].weight)
            torch.nn.init.xavier_uniform_(policy[4].weight)
            explorer = explorers.AdditiveGaussian(
                scale=0.0,
            )

        else:
            policy = nn.Sequential(
                nn.Linear(state_dim + goal_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, action_dim),
                nn.Tanh(),
                pfrl.policies.DeterministicHead(),
                )
            # TODO - have proper low and high values from action space.
            # from the hiro paper, the scale is 1.0
            explorer = explorers.AdditiveGaussian(
                scale=self.expl_noise,
                low=-self.scale,
                high=self.scale
            )


        policy_optimizer = torch.optim.Adam(policy.parameters(), lr=actor_lr)

        def make_q_func_with_optimizer():
            q_func = nn.Sequential(
                pfrl.nn.ConcatObsAndAction(),
                nn.Linear(state_dim + goal_dim + action_dim, 300),
                nn.ReLU(),
                nn.Linear(300, 300),
                nn.ReLU(),
                nn.Linear(300, 1),
            )
            q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=critic_lr)
            return q_func, q_func_optimizer

        q_func1, q_func1_optimizer = make_q_func_with_optimizer()
        q_func2, q_func2_optimizer = make_q_func_with_optimizer()


        def default_target_policy_smoothing_func(batch_action):
            """Add noises to actions for target policy smoothing."""
            noise = torch.clamp(self.policy_noise * torch.randn_like(batch_action), -self.noise_clip, self.noise_clip)
            smoothed_action = batch_action + noise
            smoothed_action = torch.min(smoothed_action, torch.tensor(self.scale).to(self.device).float())
            smoothed_action = torch.max(smoothed_action, torch.tensor(-self.scale).to(self.device).float())
            return smoothed_action

        input_scale = self.scale_tensor

        if self.is_low_level:
            # standard goal conditioned td3
            self.agent = GoalConditionedTD3(
                policy,
                q_func1,
                q_func2,
                policy_optimizer,
                q_func1_optimizer,
                q_func2_optimizer,
                replay_buffer,
                gamma=gamma,
                soft_update_tau=tau,
                explorer=explorer,
                update_interval=1,
                policy_update_delay=policy_freq,
                replay_start_size=replay_start_size,
                buffer_freq=buffer_freq,
                minibatch_size=minibatch_size,
                gpu=gpu,
                add_entropy=self.add_entropy,
                scale=input_scale,
                burnin_action_func=burnin_action_func,
                target_policy_smoothing_func=default_target_policy_smoothing_func,
                entropy_temperature=temperature,
                optimize_temp=optimize_temp
                )
        else:
            self.agent = HIROHighLevelGoalConditionedTD3(
                policy,
                q_func1,
                q_func2,
                policy_optimizer,
                q_func1_optimizer,
                q_func2_optimizer,
                replay_buffer,
                gamma=gamma,
                soft_update_tau=tau,
                explorer=explorer,
                update_interval=1,
                policy_update_delay=policy_freq,
                replay_start_size=replay_start_size/buffer_freq - 5,
                buffer_freq=buffer_freq,
                minibatch_size=minibatch_size,
                gpu=gpu,
                add_entropy=self.add_entropy,
                scale=input_scale,
                burnin_action_func=burnin_action_func,
                target_policy_smoothing_func=default_target_policy_smoothing_func,
                entropy_temperature=temperature,
                optimize_temp=optimize_temp
                )

        self.device = self.agent.device
Example #12
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--num-envs",
                        type=int,
                        default=1,
                        help="Number of envs run in parallel.")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--load",
                        type=str,
                        default="",
                        help="Directory to load agent from.")
    parser.add_argument(
        "--steps",
        type=int,
        default=10**7,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=100000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=2500,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=1,
        help="Interval in timesteps between model updates.",
    )
    parser.add_argument("--batch-size",
                        type=int,
                        default=100,
                        help="Minibatch size")
    parser.add_argument("--render",
                        action="store_true",
                        help="Render env states in a GUI window.")
    parser.add_argument("--demo",
                        action="store_true",
                        help="Just run evaluation, not training.")
    parser.add_argument("--monitor",
                        action="store_true",
                        help="Wrap env with Monitor to write videos.")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1000,
        help=
        "Interval in timesteps between outputting log messages during training",
    )
    parser.add_argument("--log-level",
                        type=int,
                        default=logging.INFO,
                        help="Level of the root logger.")
    parser.add_argument(
        "--n-hidden-channels",
        type=int,
        default=256,
        help="Number of hidden channels of NN models.",
    )
    parser.add_argument(
        "--env",
        default="AntMaze",
        help=
        "Type of Ant Env to use. Options are AntMaze, AntFall, and AntPush.",
        type=str)
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="Discount factor.")
    parser.add_argument("--n-step-return",
                        type=int,
                        default=3,
                        help="N-step return.")
    parser.add_argument("--lr",
                        type=float,
                        default=3e-4,
                        help="Learning rate.")
    parser.add_argument("--adam-eps",
                        type=float,
                        default=1e-1,
                        help="Adam eps.")
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def make_ant_env(idx, test):

        # use different seeds for train vs test envs
        process_seed = int(process_seeds[idx])

        env_seed = 2**32 - 1 - process_seed if test else process_seed
        # env_seed = np.random.randint(0, 2**32 - 1) if not test else process_seed
        print('seed', env_seed)
        utils.set_random_seed(env_seed)
        # create the ant environment with a goal
        env = AntEnvWithGoal(create_maze_env(args.env),
                             args.env,
                             env_subgoal_dim=15)
        env.seed(int(env_seed))

        if args.render:
            env = pfrl.wrappers.GymLikeEnvRender(env, mode='human')

        return env

    eval_env = make_ant_env(0, test=True)

    env_state_dim = eval_env.state_dim
    env_action_dim = eval_env.action_dim
    if args.env == 'AntMaze' or args.env == 'AntPush':
        env_goal_dim = 2
    else:
        env_goal_dim = 3

    action_size = env_action_dim

    action_space = eval_env.action_space

    scale_low = action_space.high * np.ones(env_action_dim)

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(env_state_dim + env_goal_dim, args.n_hidden_channels),
        nn.ReLU(),
        nn.Linear(args.n_hidden_channels, args.n_hidden_channels),
        nn.ReLU(),
        nn.Linear(args.n_hidden_channels, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight)
    policy_optimizer = torch.optim.Adam(policy.parameters(), lr=0.0001)

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(env_state_dim + env_goal_dim + env_action_dim, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(q_func.parameters(), lr=0.001)
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(200000)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    if args.gpu is not None and args.gpu >= 0:
        assert torch.cuda.is_available()
        device = torch.device("cuda:{}".format(args.gpu))
    else:
        device = torch.device("cpu")

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    scale_tensor = torch.tensor(scale_low).float().to(device)

    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=args.discount,
        update_interval=args.update_interval,
        replay_start_size=args.replay_start_size,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=args.lr,
        scale=scale_tensor)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        eval_env = make_ant_env(0, test=True)
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=None,  # this snippet defines no timestep limit
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_goal_conditioned_agent_with_evaluation(
            agent=agent,
            env=make_ant_env(0, test=False),
            steps=args.steps,
            eval_n_steps=None,
            outdir=args.outdir,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=5000,
            use_tensorboard=True,
        )
Example #13
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    # config file arg
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name',
                        type=str,
                        default='cfg_sac.yaml',
                        help='configuration file')
    parser.add_argument("--demo",
                        action="store_true",
                        help="Just run evaluation, not training.")
    parser.add_argument("--demo-record",
                        action="store_true",
                        help="Save video of demo.")
    parser.add_argument("--load",
                        type=str,
                        default="",
                        help="Directory to load agent from.")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1000,
        help=
        "Interval in timesteps between outputting log messages during training",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between saving checkpoint",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    # folder config & logdir
    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/../" + cfg_name
    log_dir = os.path.join(task_path, 'runs/pfrl_sac')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path, __file__]
    if not args.demo:
        cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # environment
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    steps_per_episode = math.floor(cfg['environment']['max_time'] /
                                   cfg['environment']['control_dt'])
    total_steps_per_iteration = steps_per_episode * cfg['environment'][
        'num_envs']

    total_training_steps = cfg['algorithm'][
        'total_algorithm_updates'] * total_steps_per_iteration

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight, gain=1.0)
    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    # logger settings
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
    logger = logging.getLogger(__name__)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        if cfg['environment']['render']:
            env.show_window()
            if args.demo_record:
                env.start_recording_video(args.load + "/../demo_" +
                                          os.path.basename(args.load) + ".mp4")
        eval_stats = eval_performance_pfrl(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
        )
        if cfg['environment']['render']:
            if args.demo_record:
                env.stop_recording_video()
            env.hide_window()
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        train_agent_batch_with_evaluation_pfrl(
            agent=agent,
            env=env,
            outdir=cfg_saver.data_dir,
            steps=total_training_steps,
            eval_n_steps=steps_per_episode,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
            use_tensorboard=True,
            checkpoint_freq=args.checkpoint_interval,
            logger=logger)