Example #1
def test_bias_correction_softmax(n_actions):
    base_policy = nn.Sequential(
        nn.Linear(1, n_actions),
        SoftmaxCategoricalHead(),
    )
    another_policy = nn.Sequential(
        nn.Linear(1, n_actions),
        SoftmaxCategoricalHead(),
    )
    q_values = torch.rand(1, n_actions)
    action_value = pfrl.action_value.DiscreteActionValue(q_values)
    _test_bias_correction(base_policy, another_policy, action_value)
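For reference, here is a minimal usage sketch of such a policy. It assumes that SoftmaxCategoricalHead turns its input logits into a torch.distributions.Categorical (hedged: this is how pfrl.policies appears to behave, not something the test above states explicitly):

import torch
from torch import nn
from pfrl.policies import SoftmaxCategoricalHead

n_actions = 3
policy = nn.Sequential(
    nn.Linear(1, n_actions),
    SoftmaxCategoricalHead(),
)
obs = torch.rand(4, 1)              # batch of 4 one-dimensional observations
dist = policy(obs)                  # assumed: a Categorical distribution over n_actions
actions = dist.sample()             # shape (4,)
log_probs = dist.log_prob(actions)  # shape (4,)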
Example #2
    def make_model(self, env):
        hidden_size = 50

        obs_size = env.observation_space.low.size
        v = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        )

        if self.discrete:
            n_actions = env.action_space.n
            pi = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.Tanh(),
                nn.Linear(hidden_size, n_actions),
                SoftmaxCategoricalHead(),
            )
        else:
            action_size = env.action_space.low.size
            pi = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.Tanh(),
                nn.Linear(hidden_size, action_size),
                GaussianHeadWithStateIndependentCovariance(
                    action_size=action_size,
                    var_type="diagonal",
                    var_func=lambda x: torch.exp(2 * x),
                    var_param_init=0,
                ),
            )

        return Branched(pi, v)
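A short usage sketch of the returned model, under the assumption that pfrl.nn.Branched applies each child module to the same input and returns a tuple of their outputs (here an action distribution and a state value); this is a sketch, not part of the example above:

import torch
from torch import nn
import pfrl
from pfrl.policies import SoftmaxCategoricalHead

obs_size, hidden_size, n_actions = 4, 50, 2
pi = nn.Sequential(
    nn.Linear(obs_size, hidden_size),
    nn.Tanh(),
    nn.Linear(hidden_size, n_actions),
    SoftmaxCategoricalHead(),
)
v = nn.Sequential(
    nn.Linear(obs_size, hidden_size),
    nn.Tanh(),
    nn.Linear(hidden_size, 1),
)
model = pfrl.nn.Branched(pi, v)
obs = torch.rand(8, obs_size)
# Assumed behaviour: a tuple (Categorical over n_actions, tensor of shape (8, 1)).
action_dist, value = model(obs)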
Example #3
def test_compute_loss_with_kl_constraint_softmax():
    n_actions = 3
    policy = nn.Sequential(
        nn.Linear(1, n_actions),
        SoftmaxCategoricalHead(),
    )
    _test_compute_loss_with_kl_constraint(policy)
Example #4
    def __init__(self, n_input, n_action):
        super().__init__()
        self.n_input = n_input
        self.n_action = n_action
        # network definition
        self.body_p = self.body()
        self.body_v = self.body()
        out_size = self._get_conv_out()
        # last layers
        self.policy = nn.Linear(out_size, self.n_action, bias=False)
        self.value = nn.Linear(out_size, 1, bias=False)
        # softmax
        self.softmax = SoftmaxCategoricalHead()
        # initialize last layers
        self.policy.weight.data = normalized_columns_initializer(
            self.policy.weight.data, 0.01)
        self.value.weight.data = normalized_columns_initializer(
            self.value.weight.data, 1.0)
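This snippet relies on a normalized_columns_initializer helper that is not shown. As a hedged sketch, the definition below is a common A3C-style initializer and only an assumption about what the example intends: it rescales a random weight matrix so that each output unit starts with the same small norm.

import torch

def normalized_columns_initializer(weights, std=1.0):
    # NOTE: assumed definition, not taken from the example above.
    # Draw a random tensor shaped like `weights` and rescale each row
    # (one row per output unit) to have L2 norm equal to `std`.
    out = torch.randn(weights.size())
    out *= std / torch.sqrt(out.pow(2).sum(1, keepdim=True))
    return out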
Example #5
    def test_load_a3c(self):
        from pfrl.policies import SoftmaxCategoricalHead

        obs_size = 4
        n_actions = 4
        a3c_model = nn.Sequential(
            nn.Conv2d(obs_size, 16, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(16, 32, 4, stride=2),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(2592, 256),
            nn.ReLU(),
            pfrl.nn.Branched(
                nn.Sequential(
                    nn.Linear(256, n_actions),
                    SoftmaxCategoricalHead(),
                ),
                nn.Linear(256, 1),
            ),
        )
        from pfrl.optimizers import SharedRMSpropEpsInsideSqrt

        opt = SharedRMSpropEpsInsideSqrt(a3c_model.parameters(),
                                         lr=7e-4,
                                         eps=1e-1,
                                         alpha=0.99)
        agent = agents.A3C(a3c_model,
                           opt,
                           t_max=5,
                           gamma=0.99,
                           beta=1e-2,
                           phi=lambda x: x)
        downloaded_model, exists = download_model(
            "A3C", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type)
        agent.load(downloaded_model)
        if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
            assert exists
Example #6
    def make_model(self, env):
        hidden_size = 20
        obs_size = env.observation_space.low.size

        def weight_scale(layer, scale):
            with torch.no_grad():
                layer.weight.mul_(scale)
            return layer

        if self.recurrent:
            v = RecurrentSequential(
                nn.LSTM(num_layers=1,
                        input_size=obs_size,
                        hidden_size=hidden_size),
                weight_scale(nn.Linear(hidden_size, 1), 1e-1),
            )
            if self.discrete:
                n_actions = env.action_space.n
                pi = RecurrentSequential(
                    nn.LSTM(num_layers=1,
                            input_size=obs_size,
                            hidden_size=hidden_size),
                    weight_scale(nn.Linear(hidden_size, n_actions), 1e-1),
                    SoftmaxCategoricalHead(),
                )
            else:
                action_size = env.action_space.low.size
                pi = RecurrentSequential(
                    nn.LSTM(num_layers=1,
                            input_size=obs_size,
                            hidden_size=hidden_size),
                    weight_scale(nn.Linear(hidden_size, action_size), 1e-1),
                    GaussianHeadWithStateIndependentCovariance(
                        action_size=action_size,
                        var_type="diagonal",
                        var_func=lambda x: torch.exp(2 * x),
                        var_param_init=0,
                    ),
                )
            return RecurrentBranched(pi, v)
        else:
            v = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.Tanh(),
                weight_scale(nn.Linear(hidden_size, 1), 1e-1),
            )
            if self.discrete:
                n_actions = env.action_space.n
                pi = nn.Sequential(
                    nn.Linear(obs_size, hidden_size),
                    nn.Tanh(),
                    weight_scale(nn.Linear(hidden_size, n_actions), 1e-1),
                    SoftmaxCategoricalHead(),
                )
            else:
                action_size = env.action_space.low.size
                pi = nn.Sequential(
                    nn.Linear(obs_size, hidden_size),
                    nn.Tanh(),
                    weight_scale(nn.Linear(hidden_size, action_size), 1e-1),
                    GaussianHeadWithStateIndependentCovariance(
                        action_size=action_size,
                        var_type="diagonal",
                        var_func=lambda x: torch.exp(2 * x),
                        var_param_init=0,
                    ),
                )
            return pfrl.nn.Branched(pi, v)
Example #7
    def _test_abc(
        self, use_lstm, discrete=True, steps=1000000, require_success=True, gpu=-1
    ):
        def make_env(process_idx, test):
            size = 2
            return ABC(
                size=size,
                discrete=discrete,
                episodic=True,
                partially_observable=self.use_lstm,
                deterministic=test,
            )

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        hidden_size = 20
        obs_size = obs_space.low.size
        if discrete:
            output_size = action_space.n
            head = SoftmaxCategoricalHead()
        else:
            output_size = action_space.low.size
            head = GaussianHeadWithStateIndependentCovariance(
                output_size, var_type="diagonal"
            )
        if use_lstm:
            model = pfrl.nn.RecurrentSequential(
                nn.LSTM(
                    num_layers=1,
                    input_size=obs_size,
                    hidden_size=hidden_size,
                ),
                nn.Linear(hidden_size, hidden_size),
                nn.LeakyReLU(),
                nn.Linear(hidden_size, output_size),
                head,
            )
        else:
            model = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                nn.Linear(hidden_size, output_size),
                head,
            )
        opt = torch.optim.Adam(model.parameters())
        beta = 1e-2
        agent = pfrl.agents.REINFORCE(
            model,
            opt,
            gpu=gpu,
            beta=beta,
            batchsize=self.batchsize,
            backward_separately=self.backward_separately,
            act_deterministically=True,
            recurrent=use_lstm,
        )

        pfrl.experiments.train_agent_with_evaluation(
            agent=agent,
            env=make_env(0, False),
            eval_env=make_env(0, True),
            outdir=self.outdir,
            steps=steps,
            train_max_episode_len=2,
            eval_interval=500,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=1,
        )

        # Test
        env = make_env(0, True)
        n_test_runs = 5
        eval_returns, _ = run_evaluation_episodes(
            env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
        )
        if require_success:
            successful_return = 1
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            assert n_succeeded == n_test_runs
Example #8
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("--processes", type=int, default=16)
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--t-max", type=int, default=5)
    parser.add_argument("--beta", type=float, default=1e-2)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--steps", type=int, default=8 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained",
                        action="store_true",
                        default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=("Monitor env. Videos and additional information"
              " are saved as output files."),
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    sample_env = make_env(0, False)
    obs_size = sample_env.observation_space.low.shape[0]
    n_actions = sample_env.action_space.n

    model = nn.Sequential(
        nn.Conv2d(obs_size, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
        pfrl.nn.Branched(
            nn.Sequential(
                nn.Linear(256, n_actions),
                SoftmaxCategoricalHead(),
            ),
            nn.Linear(256, 1),
        ),
    )

    # SharedRMSprop is the same as torch.optim.RMSprop except that it initializes
    # its state in __init__, allowing it to be moved to shared memory.
    opt = SharedRMSpropEpsInsideSqrt(model.parameters(),
                                     lr=7e-4,
                                     eps=1e-1,
                                     alpha=0.99)
    assert opt.state_dict()["state"], (
        "To share optimizer state across processes, the state must be"
        " initialized before training.")

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(
        model,
        opt,
        t_max=args.t_max,
        gamma=0.99,
        beta=args.beta,
        phi=phi,
        max_grad_norm=40.0,
    )

    if args.load_pretrained:
        raise Exception("Pretrained models are currently unsupported.")

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print("n_steps: {} mean: {} median: {} stdev: {}".format(
            args.eval_n_steps,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for pg in agent.optimizer.param_groups:
                assert "lr" in pg
                pg["lr"] = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=True,
        )
Example #9
def test_ppo_dataset_recurrent_and_non_recurrent_equivalence(
        use_obs_normalizer, gamma, lambd, max_recurrent_sequence_len):
    """Test equivalence between recurrent and non-recurrent datasets.

    When the same feed-forward model is used, the values of
    log_prob, v_pred, next_v_pred obtained by both recurrent and
    non-recurrent dataset creation functions should be the same.
    """
    episodes = make_random_episodes()
    if use_obs_normalizer:
        obs_normalizer = pfrl.nn.EmpiricalNormalization(2, clip_threshold=5)
        obs_normalizer.experience(torch.rand(10, 2))
    else:
        obs_normalizer = None

    def phi(obs):
        return (obs * 0.5).astype(np.float32)

    device = torch.device("cpu")

    obs_size = 2
    n_actions = 3

    non_recurrent_model = pfrl.nn.Branched(
        nn.Sequential(
            nn.Linear(obs_size, n_actions),
            SoftmaxCategoricalHead(),
        ),
        nn.Linear(obs_size, 1),
    )
    recurrent_model = RecurrentSequential(non_recurrent_model)

    dataset = pfrl.agents.ppo._make_dataset(
        episodes=copy.deepcopy(episodes),
        model=non_recurrent_model,
        phi=phi,
        batch_states=batch_states,
        obs_normalizer=obs_normalizer,
        gamma=gamma,
        lambd=lambd,
        device=device,
    )

    dataset_recurrent = pfrl.agents.ppo._make_dataset_recurrent(
        episodes=copy.deepcopy(episodes),
        model=recurrent_model,
        phi=phi,
        batch_states=batch_states,
        obs_normalizer=obs_normalizer,
        gamma=gamma,
        lambd=lambd,
        max_recurrent_sequence_len=max_recurrent_sequence_len,
        device=device,
    )

    assert "log_prob" not in episodes[0][0]
    assert "log_prob" in dataset[0]
    assert "log_prob" in dataset_recurrent[0][0]
    # They are not just shallow copies
    assert dataset[0]["log_prob"] is not dataset_recurrent[0][0]["log_prob"]

    states = [tr["state"] for tr in dataset]
    recurrent_states = [
        tr["state"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(states, recurrent_states)

    actions = [tr["action"] for tr in dataset]
    recurrent_actions = [
        tr["action"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(actions, recurrent_actions)

    rewards = [tr["reward"] for tr in dataset]
    recurrent_rewards = [
        tr["reward"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(rewards, recurrent_rewards)

    nonterminals = [tr["nonterminal"] for tr in dataset]
    recurrent_nonterminals = [
        tr["nonterminal"]
        for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(nonterminals, recurrent_nonterminals)

    log_probs = [tr["log_prob"] for tr in dataset]
    recurrent_log_probs = [
        tr["log_prob"]
        for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(log_probs, recurrent_log_probs)

    vs_pred = [tr["v_pred"] for tr in dataset]
    recurrent_vs_pred = [
        tr["v_pred"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(vs_pred, recurrent_vs_pred)

    next_vs_pred = [tr["next_v_pred"] for tr in dataset]
    recurrent_next_vs_pred = [
        tr["next_v_pred"]
        for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(next_vs_pred, recurrent_next_vs_pred)

    advs = [tr["adv"] for tr in dataset]
    recurrent_advs = [
        tr["adv"] for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(advs, recurrent_advs)

    vs_teacher = [tr["v_teacher"] for tr in dataset]
    recurrent_vs_teacher = [
        tr["v_teacher"]
        for tr in itertools.chain.from_iterable(dataset_recurrent)
    ]
    torch_assert_allclose(vs_teacher, recurrent_vs_teacher)
Example #10
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("processes", type=int)
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--t-max", type=int, default=5)
    parser.add_argument("--replay-start-size", type=int, default=10000)
    parser.add_argument("--n-times-replay", type=int, default=4)
    parser.add_argument("--beta", type=float, default=1e-2)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--steps", type=int, default=10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--eval-interval", type=int, default=10**5)
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--use-lstm", action="store_true")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=("Monitor env. Videos and additional information"
              " are saved as output files."),
    )
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    input_to_hidden = nn.Sequential(
        nn.Conv2d(4, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
    )

    head = acer.ACERDiscreteActionHead(
        pi=nn.Sequential(
            nn.Linear(256, n_actions),
            SoftmaxCategoricalHead(),
        ),
        q=nn.Sequential(
            nn.Linear(256, n_actions),
            DiscreteActionValueHead(),
        ),
    )

    if args.use_lstm:
        model = pfrl.nn.RecurrentSequential(
            input_to_hidden,
            nn.LSTM(num_layers=1, input_size=256, hidden_size=256),
            head,
        )
    else:
        model = nn.Sequential(input_to_hidden, head)

    model.apply(pfrl.initializers.init_chainer_default)

    opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(model.parameters(),
                                                     lr=args.lr,
                                                     eps=4e-3,
                                                     alpha=0.99)

    replay_buffer = EpisodicReplayBuffer(10**6 // args.processes)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = acer.ACER(
        model,
        opt,
        t_max=args.t_max,
        gamma=0.99,
        replay_buffer=replay_buffer,
        n_times_replay=args.n_times_replay,
        replay_start_size=args.replay_start_size,
        beta=args.beta,
        phi=phi,
        max_grad_norm=40,
        recurrent=args.use_lstm,
    )

    if args.load:
        agent.load(args.load)

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for pg in agent.optimizer.param_groups:
                assert "lr" in pg
                pg["lr"] = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=False,
        )
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env",
                        type=str,
                        default="BreakoutNoFrameskip-v4",
                        help="Gym Env ID.")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU device ID. Set to -1 to use CPUs only.")
    parser.add_argument(
        "--num-envs",
        type=int,
        default=8,
        help="Number of env instances run in parallel.",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--steps",
                        type=int,
                        default=10**7,
                        help="Total time steps for training.")
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--lr",
                        type=float,
                        default=2.5e-4,
                        help="Learning rate.")
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=100000,
        help="Interval (in timesteps) between evaluation phases.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes ran in an evaluation phase.",
    )
    parser.add_argument(
        "--demo",
        action="store_true",
        default=False,
        help="Run demo episodes, not training.",
    )
    parser.add_argument(
        "--load",
        type=str,
        default="",
        help=("Directory path to load a saved agent data from"
              " if it is a non-empty string."),
    )
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=("Monitor env. Videos and additional information"
              " are saved as output files."),
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=128 * 8,
        help="Interval (in timesteps) between PPO iterations.",
    )
    parser.add_argument(
        "--batchsize",
        type=int,
        default=32 * 8,
        help="Size of minibatch (in timesteps).",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=4,
        help="Number of epochs used for each PPO iteration.",
    )
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10000,
        help="Interval (in timesteps) of printing logs.",
    )
    parser.add_argument(
        "--recurrent",
        action="store_true",
        default=False,
        help="Use a recurrent model. See the code for the model definition.",
    )
    parser.add_argument(
        "--flicker",
        action="store_true",
        default=False,
        help=("Use so-called flickering Atari, where each"
              " screen is blacked out with probability 0.5."),
    )
    parser.add_argument(
        "--no-frame-stack",
        action="store_true",
        default=False,
        help=("Disable frame stacking so that the agent can"
              " only see the current screen."),
    )
    parser.add_argument(
        "--checkpoint-frequency",
        type=int,
        default=None,
        help="Frequency at which agents are stored.",
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(idx, test):
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=False,
        )
        env.seed(env_seed)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        vec_env = pfrl.envs.MultiprocessVectorEnv([
            # Bind idx eagerly; a bare lambda would capture the loop variable
            # late and construct every env with the same index and seed.
            functools.partial(make_env, idx, test)
            for idx, env in enumerate(range(args.num_envs))
        ])
        if not args.no_frame_stack:
            vec_env = pfrl.wrappers.VectorFrameStack(vec_env, 4)
        return vec_env

    sample_env = make_batch_env(test=False)
    print("Observation space", sample_env.observation_space)
    print("Action space", sample_env.action_space)
    n_actions = sample_env.action_space.n
    obs_n_channels = sample_env.observation_space.low.shape[0]
    del sample_env

    def lecun_init(layer, gain=1):
        if isinstance(layer, (nn.Conv2d, nn.Linear)):
            pfrl.initializers.init_lecun_normal(layer.weight, gain)
            nn.init.zeros_(layer.bias)
        else:
            pfrl.initializers.init_lecun_normal(layer.weight_ih_l0, gain)
            pfrl.initializers.init_lecun_normal(layer.weight_hh_l0, gain)
            nn.init.zeros_(layer.bias_ih_l0)
            nn.init.zeros_(layer.bias_hh_l0)
        return layer

    if args.recurrent:
        model = pfrl.nn.RecurrentSequential(
            lecun_init(nn.Conv2d(obs_n_channels, 32, 8, stride=4)),
            nn.ReLU(),
            lecun_init(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            lecun_init(nn.Conv2d(64, 64, 3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            lecun_init(nn.Linear(3136, 512)),
            nn.ReLU(),
            lecun_init(nn.GRU(num_layers=1, input_size=512, hidden_size=512)),
            pfrl.nn.Branched(
                nn.Sequential(
                    lecun_init(nn.Linear(512, n_actions), 1e-2),
                    SoftmaxCategoricalHead(),
                ),
                lecun_init(nn.Linear(512, 1)),
            ),
        )
    else:
        model = nn.Sequential(
            lecun_init(nn.Conv2d(obs_n_channels, 32, 8, stride=4)),
            nn.ReLU(),
            lecun_init(nn.Conv2d(32, 64, 4, stride=2)),
            nn.ReLU(),
            lecun_init(nn.Conv2d(64, 64, 3, stride=1)),
            nn.ReLU(),
            nn.Flatten(),
            lecun_init(nn.Linear(3136, 512)),
            nn.ReLU(),
            pfrl.nn.Branched(
                nn.Sequential(
                    lecun_init(nn.Linear(512, n_actions), 1e-2),
                    SoftmaxCategoricalHead(),
                ),
                lecun_init(nn.Linear(512, 1)),
            ),
        )

    opt = torch.optim.Adam(model.parameters(), lr=args.lr, eps=1e-5)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = PPO(
        model,
        opt,
        gpu=args.gpu,
        phi=phi,
        update_interval=args.update_interval,
        minibatch_size=args.batchsize,
        epochs=args.epochs,
        clip_eps=0.1,
        clip_eps_vf=None,
        standardize_advantages=True,
        entropy_coef=1e-2,
        recurrent=args.recurrent,
        max_grad_norm=0.5,
    )
    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
        )
        print("n_runs: {} mean: {} median: {} stdev: {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        step_hooks = []

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            for param_group in agent.optimizer.param_groups:
                param_group["lr"] = value

        step_hooks.append(
            experiments.LinearInterpolationHook(args.steps, args.lr, 0,
                                                lr_setter))

        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(False),
            eval_env=make_batch_env(True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            checkpoint_freq=args.checkpoint_frequency,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            save_best_so_far_agent=False,
            step_hooks=step_hooks,
        )
Example #12
    def _test_abc(
        self,
        t_max,
        use_lstm,
        discrete=True,
        episodic=True,
        steps=100000,
        require_success=True,
    ):

        nproc = 8

        def make_env(process_idx, test):
            size = 2
            return ABC(
                size=size,
                discrete=discrete,
                episodic=episodic or test,
                partially_observable=self.use_lstm,
                deterministic=test,
            )

        sample_env = make_env(0, False)
        action_space = sample_env.action_space
        obs_space = sample_env.observation_space

        replay_buffer = EpisodicReplayBuffer(10**4)
        obs_size = obs_space.low.size
        hidden_size = 20
        if discrete:
            n_actions = action_space.n
            head = acer.ACERDiscreteActionHead(
                pi=nn.Sequential(
                    nn.Linear(hidden_size, n_actions),
                    SoftmaxCategoricalHead(),
                ),
                q=nn.Sequential(
                    nn.Linear(hidden_size, n_actions),
                    DiscreteActionValueHead(),
                ),
            )
        else:
            action_size = action_space.low.size
            head = acer.ACERContinuousActionHead(
                pi=nn.Sequential(
                    nn.Linear(hidden_size, action_size * 2),
                    GaussianHeadWithDiagonalCovariance(),
                ),
                v=nn.Sequential(nn.Linear(hidden_size, 1)),
                adv=nn.Sequential(
                    ConcatObsAndAction(),
                    nn.Linear(hidden_size + action_size, 1),
                ),
            )
        if use_lstm:
            model = pfrl.nn.RecurrentSequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                nn.LSTM(num_layers=1,
                        input_size=hidden_size,
                        hidden_size=hidden_size),
                head,
            )
        else:
            model = nn.Sequential(
                nn.Linear(obs_size, hidden_size),
                nn.LeakyReLU(),
                head,
            )
        eps = 1e-8
        opt = pfrl.optimizers.SharedRMSpropEpsInsideSqrt(model.parameters(),
                                                         lr=1e-3,
                                                         eps=eps,
                                                         alpha=0.99)
        gamma = 0.5
        beta = 1e-5
        if self.n_times_replay == 0 and self.disable_online_update:
            # At least one of them must be enabled
            pytest.skip()
        agent = acer.ACER(
            model,
            opt,
            replay_buffer=replay_buffer,
            t_max=t_max,
            gamma=gamma,
            beta=beta,
            n_times_replay=self.n_times_replay,
            act_deterministically=True,
            disable_online_update=self.disable_online_update,
            replay_start_size=100,
            use_trust_region=self.use_trust_region,
            recurrent=use_lstm,
        )

        max_episode_len = None if episodic else 2

        with warnings.catch_warnings(record=True) as warns:
            train_agent_async(
                outdir=self.outdir,
                processes=nproc,
                make_env=make_env,
                agent=agent,
                steps=steps,
                max_episode_len=max_episode_len,
                eval_interval=500,
                eval_n_steps=None,
                eval_n_episodes=5,
                successful_score=1,
            )
            assert len(warns) == 0, warns[0]

        # The agent returned by train_agent_async is not guaranteed to be
        # successful because parameters could be modified by other processes
        # after success. Thus here the successful model is loaded explicitly.
        if require_success:
            agent.load(os.path.join(self.outdir, "successful"))

        # Test
        env = make_env(0, True)
        n_test_runs = 5
        eval_returns, _ = run_evaluation_episodes(
            env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        successful_return = 1
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            assert n_succeeded == n_test_runs
Example #13
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="CartPole-v0")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--beta", type=float, default=1e-4)
    parser.add_argument("--batchsize", type=int, default=10)
    parser.add_argument("--steps", type=int, default=10**5)
    parser.add_argument("--eval-interval", type=int, default=10**4)
    parser.add_argument("--eval-n-runs", type=int, default=100)
    parser.add_argument("--reward-scale-factor", type=float, default=1e-2)
    parser.add_argument("--render", action="store_true", default=False)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument("--log-level", type=int, default=logging.INFO)
    parser.add_argument("--monitor", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and not test:
            env = pfrl.wrappers.Render(env)
        return env

    train_env = make_env(test=False)
    timestep_limit = train_env.spec.max_episode_steps
    obs_space = train_env.observation_space
    action_space = train_env.action_space

    obs_size = obs_space.low.size
    hidden_size = 200
    # Switch policy types according to the action space type
    if isinstance(action_space, gym.spaces.Box):
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, action_space.low.size),
            GaussianHeadWithFixedCovariance(0.3),
        )
    else:
        model = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, action_space.n),
            SoftmaxCategoricalHead(),
        )

    opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    agent = pfrl.agents.REINFORCE(
        model,
        opt,
        gpu=args.gpu,
        beta=args.beta,
        batchsize=args.batchsize,
        max_grad_norm=1.0,
    )
    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=train_env,
            eval_env=eval_env,
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=timestep_limit,
        )
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--outdir", type=str, default="results")
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--steps", type=int, default=8 * 10**7)
    parser.add_argument("--update-steps", type=int, default=5)
    parser.add_argument("--lr", type=float, default=7e-4)
    parser.add_argument("--gamma",
                        type=float,
                        default=0.99,
                        help="discount factor")
    parser.add_argument("--rmsprop-epsilon", type=float, default=1e-5)
    parser.add_argument(
        "--use-gae",
        action="store_true",
        default=False,
        help="use generalized advantage estimation",
    )
    parser.add_argument("--tau",
                        type=float,
                        default=0.95,
                        help="gae parameter")
    parser.add_argument("--alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha")
    parser.add_argument("--eval-interval", type=int, default=10**6)
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default="")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=40,
                        help="value loss coefficient")
    parser.add_argument(
        "--gpu",
        "-g",
        type=int,
        default=-1,
        help="GPU ID (negative value indicates CPU)",
    )
    parser.add_argument("--num-envs", type=int, default=1)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=("Monitor env. Videos and additional information"
              " are saved as output files."),
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.set_defaults(use_lstm=False)
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2**31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx, env in enumerate(range(args.num_envs))
        ])

    sample_env = make_env(0, test=False)
    obs_channel_size = sample_env.observation_space.low.shape[0]
    n_actions = sample_env.action_space.n

    model = nn.Sequential(
        nn.Conv2d(obs_channel_size, 16, 8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, 4, stride=2),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(2592, 256),
        nn.ReLU(),
        pfrl.nn.Branched(
            nn.Sequential(
                nn.Linear(256, n_actions),
                SoftmaxCategoricalHead(),
            ),
            nn.Linear(256, 1),
        ),
    )
    optimizer = pfrl.optimizers.RMSpropEpsInsideSqrt(
        model.parameters(),
        lr=args.lr,
        eps=args.rmsprop_epsilon,
        alpha=args.alpha,
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a2c.A2C(
        model,
        optimizer,
        gamma=args.gamma,
        gpu=args.gpu,
        num_processes=args.num_envs,
        update_steps=args.update_steps,
        phi=phi,
        use_gae=args.use_gae,
        tau=args.tau,
        max_grad_norm=args.max_grad_norm,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
        )
        print("n_runs: {} mean: {} median: {} stdev: {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )