Example #1
def parse_arch(arch, n_actions):
    if arch == "nature":
        return nn.Sequential(
            pnn.LargeAtariCNN(),
            init_chainer_default(nn.Linear(512, n_actions)),
            DiscreteActionValueHead(),
        )
    elif arch == "doubledqn":
        # raise NotImplementedError("Single shared bias not implemented yet")
        return nn.Sequential(
            pnn.LargeAtariCNN(),
            init_chainer_default(nn.Linear(512, n_actions, bias=False)),
            SingleSharedBias(),
            DiscreteActionValueHead(),
        )
    elif arch == "nips":
        return nn.Sequential(
            pnn.SmallAtariCNN(),
            init_chainer_default(nn.Linear(256, n_actions)),
            DiscreteActionValueHead(),
        )
    elif arch == "dueling":
        return DuelingDQN(n_actions)
    else:
        raise RuntimeError("Unsupported architecture: {}".format(arch))
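For orientation, a caller passes the architecture name and the size of the action space; a hypothetical usage (args.arch and env are placeholders, not part of the excerpt) would be:

n_actions = env.action_space.n
q_func = parse_arch(args.arch, n_actions)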
Example #2
    def __init__(self, n_actions):
        # n_actions: size of the discrete action space, used by the final
        # linear layer. The network is a Nature-DQN-style conv stack over
        # single frames, followed by an LSTM and a discrete-action value head.
        super().__init__()
        self.l1 = nn.Conv2d(1, 32, 8, stride=4)
        self.l2 = nn.ReLU()
        self.l3 = nn.Conv2d(32, 64, 4, stride=2)
        self.l4 = nn.ReLU()
        self.l5 = nn.Conv2d(64, 64, 3, stride=1)
        self.l6 = nn.Flatten()
        self.l7 = nn.ReLU()
        self.r8 = nn.LSTM(input_size=3136, hidden_size=512)
        self.l9 = init_chainer_default(torch.nn.Linear(512, n_actions))
        self.l10 = DiscreteActionValueHead()
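The excerpt stops at the constructor. pfrl's recurrent agents have their own expectations for recurrent modules, so the following is only a rough, generic PyTorch sketch (an assumption, not pfrl's API) of how the layers and the LSTM state could be wired together:

    def forward(self, x, recurrent_state=None):
        # x: a batch of single-channel 84x84 frames, already converted to float.
        h = self.l7(self.l6(self.l5(self.l4(self.l3(self.l2(self.l1(x)))))))
        # nn.LSTM expects (seq_len, batch, features); treat each call as one step.
        h, recurrent_state = self.r8(h.unsqueeze(0), recurrent_state)
        return self.l10(self.l9(h.squeeze(0))), recurrent_state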
Example #3
    def _test_load_dqn(self, gpu):
        from pfrl.q_functions import DiscreteActionValueHead

        n_actions = 4
        q_func = nn.Sequential(
            pnn.LargeAtariCNN(),
            init_chainer_default(nn.Linear(512, n_actions)),
            DiscreteActionValueHead(),
        )

        # Use the same hyperparameters as the Nature paper

        opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
            q_func.parameters(),
            lr=2.5e-4,
            alpha=0.95,
            momentum=0.0,
            eps=1e-2,
            centered=True,
        )

        rbuf = replay_buffers.ReplayBuffer(100)

        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10**6,
            random_action_func=lambda: np.random.randint(4),
        )

        agent = agents.DQN(
            q_func,
            opt,
            rbuf,
            gpu=gpu,
            gamma=0.99,
            explorer=explorer,
            replay_start_size=50,
            target_update_interval=10**4,
            clip_delta=True,
            update_interval=4,
            batch_accumulator="sum",
            phi=lambda x: x,
        )

        downloaded_model, exists = download_model(
            "DQN", "BreakoutNoFrameskip-v4", model_type=self.pretrained_type)
        agent.load(downloaded_model)
        if os.environ.get("PFRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED"):
            assert exists
Example #4
    def __init__(
        self,
        n_input_channels,
        n_dim_action,
        n_hidden_channels,
        n_hidden_layers,
        action_space,
        scale_mu=True,
    ):
        self.n_input_channels = n_input_channels
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.n_dim_action = n_dim_action
        assert action_space is not None
        self.scale_mu = scale_mu
        self.action_space = action_space
        super().__init__()
        hidden_layers = nn.ModuleList()
        assert n_hidden_layers >= 1
        hidden_layers.append(
            init_chainer_default(nn.Linear(n_input_channels,
                                           n_hidden_channels)))
        for _ in range(n_hidden_layers - 1):
            hidden_layers.append(
                init_chainer_default(
                    nn.Linear(n_hidden_channels, n_hidden_channels)))
        self.hidden_layers = hidden_layers

        # Output heads: the state value v, the action mu, and the diagonal
        # and strictly-lower-triangular entries of the matrix that
        # parameterizes the quadratic advantage term.
        self.v = init_chainer_default(nn.Linear(n_hidden_channels, 1))
        self.mu = init_chainer_default(
            nn.Linear(n_hidden_channels, n_dim_action))
        self.mat_diag = init_chainer_default(
            nn.Linear(n_hidden_channels, n_dim_action))
        non_diag_size = n_dim_action * (n_dim_action - 1) // 2
        if non_diag_size > 0:
            self.mat_non_diag = init_chainer_default(
                nn.Linear(n_hidden_channels, non_diag_size))
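The excerpt ends before the forward pass. The heads above look like a NAF-style quadratic parameterization, Q(s, a) = V(s) - 0.5 * (a - mu)^T L L^T (a - mu) with L lower-triangular. A minimal standalone sketch of assembling that term from the head outputs (an illustration only, not pfrl's implementation; the shapes and the ordering of the off-diagonal entries are assumptions):

import torch

def quadratic_advantage(a, mu, diag, non_diag):
    # a, mu, diag: (batch, n); non_diag: (batch, n * (n - 1) // 2)
    batch, n = mu.shape
    tril = torch.tril_indices(n, n, offset=-1)
    L = mu.new_zeros(batch, n, n)
    L[:, tril[0], tril[1]] = non_diag          # strictly lower triangle
    L = L + torch.diag_embed(torch.exp(diag))  # positive diagonal
    P = L @ L.transpose(1, 2)                  # positive-definite matrix
    d = (a - mu).unsqueeze(-1)
    return -0.5 * (d.transpose(1, 2) @ P @ d).reshape(batch)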
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        type=str,
        default="BreakoutNoFrameskip-v4",
        help="OpenAI Atari domain to perform algorithm on.",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained",
                        action="store_true",
                        default=False)
    parser.add_argument("--pretrained-type",
                        type=str,
                        default="best",
                        choices=["best", "final"])
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=("Monitor env. Videos and additional information"
              " are saved as output files."),
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=5 * 10**7,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=5 * 10**4,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument("--n-best-episodes", type=int, default=30)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=None),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, 0.05)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = nn.Sequential(
        pnn.LargeAtariCNN(),
        init_chainer_default(nn.Linear(512, n_actions)),
        DiscreteActionValueHead(),
    )

    # Use the same hyperparameters as the Nature paper

    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
        q_func.parameters(),
        lr=2.5e-4,
        alpha=0.95,
        momentum=0.0,
        eps=1e-2,
        centered=True,
    )

    rbuf = replay_buffers.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.1,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions),
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.DQN
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=10**4,
        clip_delta=True,
        update_interval=4,
        batch_accumulator="sum",
        phi=phi,
    )

    if args.load or args.load_pretrained:
        # At most one of --load and --load-pretrained may be specified
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model("DQN",
                                     args.env,
                                     model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print("n_episodes: {} mean: {} median: {} stdev {}".format(
            eval_stats["episodes"],
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 30 evaluation episodes, each capped at 5 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=4500,
            logger=None,
        )
        with open(os.path.join(args.outdir, "bestscores.json"), "w") as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
Example #6
env = atari_wrappers.wrap_deepmind(
    atari_wrappers.make_atari(ns.env, max_frames=10000),
    episode_life=True,
    clip_rewards=True,
)
test_env = atari_wrappers.wrap_deepmind(
    atari_wrappers.make_atari(ns.env, max_frames=10000),
    episode_life=False,
    clip_rewards=False,
)

n_actions = test_env.action_space.n
q_func = torch.nn.Sequential(
    pnn.LargeAtariCNN(),
    init_chainer_default(torch.nn.Linear(512, n_actions)),
    DiscreteActionValueHead(),
)

replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=10**5)

explorer = explorers.LinearDecayEpsilonGreedy(
    start_epsilon=1.0,
    end_epsilon=0.01,
    decay_steps=ns.steps,
    random_action_func=lambda: numpy.random.randint(n_actions),
)

# Note: you can use an env wrapper to do this conversion, but converting
# later, in the feature extractor, avoids storing 64-bit or 32-bit floats
# in the replay buffer; it is more memory efficient to store raw bytes and
# do the conversion just before the neural net.
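The conversion itself is not included in this excerpt; a minimal sketch, mirroring the phi feature extractor shown in Example #5, would be:

def phi(x):
    # Keep uint8 frames in the replay buffer; convert to float32 in [0, 1]
    # only when feeding the network.
    return numpy.asarray(x, dtype=numpy.float32) / 255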