Example 1
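A unit test that fills an EpisodicReplayBuffer with nine episodes of known transitions, then checks transition-level sampling via sample() and episode-level sampling via sample_episodes(), including the max_len cap and the consistency of consecutive transitions within each sampled episode.
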
    def test_append_and_sample(self):
        capacity = self.capacity
        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)

        for n in [10, 15, 5] * 3:
            transs = [
                dict(
                    state=i,
                    action=100 + i,
                    reward=200 + i,
                    next_state=i + 1,
                    next_action=101 + i,
                    is_state_terminal=(i == n - 1),
                ) for i in range(n)
            ]
            for trans in transs:
                rbuf.append(**trans)

        assert len(rbuf) == 90
        assert rbuf.n_episodes == 9

        for k in [10, 30, 90]:
            s = rbuf.sample(k)
            assert len(s) == k

        for k in [1, 3, 9]:
            s = rbuf.sample_episodes(k)
            assert len(s) == k

            s = rbuf.sample_episodes(k, max_len=10)
            for ep in s:
                assert len(ep) <= 10
                for t0, t1 in zip(ep, ep[1:]):
                    assert t0["next_state"] == t1["state"]
                    assert t0["next_action"] == t1["action"]
Example 2
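A unit test of multi-environment use: transitions appended under different env_id values are counted by len(rbuf) only once their episode ends, either through a terminal transition or an explicit stop_current_episode(env_id=...) call.
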
    def test(self):
        if self.replay_buffer_type == "EpisodicReplayBuffer":
            rbuf = replay_buffers.EpisodicReplayBuffer(capacity=None)
        elif self.replay_buffer_type == "PrioritizedEpisodicReplayBuffer":
            rbuf = replay_buffers.PrioritizedEpisodicReplayBuffer(
                capacity=None)
        else:
            assert False

        # 2 transitions for env_id=0
        for _ in range(2):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=False,
            )
            rbuf.append(env_id=0, **trans1)
        # 4 transitions for env_id=1 with a terminal state
        for i in range(4):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=(i == 3),
            )
            rbuf.append(env_id=1, **trans1)
        # 9 transitions for env_id=2
        for _ in range(9):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=False,
            )
            rbuf.append(env_id=2, **trans1)

        # It should have 4 transitions from env_id=1
        assert len(rbuf) == 4

        # env_id=0 episode ends
        rbuf.stop_current_episode(env_id=0)

        # Now it should have 4 + 2 = 6 transitions
        assert len(rbuf) == 6

        # env_id=2 episode ends
        rbuf.stop_current_episode(env_id=2)

        # Finally it should have 4 + 2 + 9 = 15 transitions
        assert len(rbuf) == 15
Example 3
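A unit test of persistence: two episodes are appended, the buffer is saved to a file with save(), loaded into a freshly constructed buffer with load(), and the sampled transitions and episodes are checked to match the originals exactly.
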
    def test_save_and_load(self):
        capacity = self.capacity

        tempdir = tempfile.mkdtemp()

        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)

        transs = [
            dict(
                state=n,
                action=n + 10,
                reward=n + 20,
                next_state=n + 1,
                next_action=n + 11,
                is_state_terminal=False,
            ) for n in range(5)
        ]

        # Add two episodes
        rbuf.append(**transs[0])
        rbuf.append(**transs[1])
        rbuf.stop_current_episode()

        rbuf.append(**transs[2])
        rbuf.append(**transs[3])
        rbuf.append(**transs[4])
        rbuf.stop_current_episode()

        assert len(rbuf) == 5
        assert rbuf.n_episodes == 2

        # Save
        filename = os.path.join(tempdir, "rbuf.pkl")
        rbuf.save(filename)

        # Initialize rbuf
        rbuf = replay_buffers.EpisodicReplayBuffer(capacity)

        # Of course it has no transition yet
        assert len(rbuf) == 0

        # Load the previously saved buffer
        rbuf.load(filename)

        # Sampled transitions are exactly what I added!
        s5 = rbuf.sample(5)
        assert len(s5) == 5
        for t in s5:
            assert len(t) == 1
            n = t[0]["state"]
            assert n in range(5)
            assert t[0] == transs[n]

        # And sampled episodes are exactly what I added!
        s2e = rbuf.sample_episodes(2)
        assert len(s2e) == 2
        if s2e[0][0]["state"] == 0:
            assert s2e[0] == [transs[0], transs[1]]
            assert s2e[1] == [transs[2], transs[3], transs[4]]
        else:
            assert s2e[0] == [transs[2], transs[3], transs[4]]
            assert s2e[1] == [transs[0], transs[1]]

        # Sizes are correct!
        assert len(rbuf) == 5
        assert rbuf.n_episodes == 2
Example 4
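A fragment of an A2C evaluation script that loads a trained model and runs the agent with training disabled. Note that the EpisodicReplayBuffer constructed here is never passed to the A2C agent in this fragment.
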
    action_space = env.action_space

    obs_size = obs_space.low.size

    n_actions = action_space.n

    model = A2CNet()
    model.load_state_dict(torch.load('saves/best_a2c_model.sv'))

    optimizer = pfrl.optimizers.RMSpropEpsInsideSqrt(
        model.parameters(),
        lr=1e-5,
        eps=1e-5,
        alpha=0.99,
    )
    rbuf = replay_buffers.EpisodicReplayBuffer(10**6)

    agent = a2c.A2C(
        model,
        optimizer,
        gamma=0.99,
        gpu=0,
        num_processes=3,
        update_steps=5,
        phi=lambda x: np.asarray(x, dtype=np.float32) / 255,
    )
    agent.training = False

    for idx, data in enumerate(test_dataset):
        tqdm.write(" Image " + str(idx))
        env.set_data(data[0], data[1])
Example 5
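A one-line factory method that returns an EpisodicReplayBuffer holding up to 10**5 transitions.
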
    def make_replay_buffer(self, env):
        return replay_buffers.EpisodicReplayBuffer(10**5)
Example 6
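A complete command-line training script for Atari. With --recurrent, an LSTM Q-network is paired with an EpisodicReplayBuffer so that sequences of up to --episodic-update-len transitions can be replayed for recurrent updates; without it, a feed-forward Q-network uses an ordinary ReplayBuffer.
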
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        type=str,
        default="BreakoutNoFrameskip-v4",
        help="OpenAI Atari domain to perform algorithm on.",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument(
        "--final-exploration-frames",
        type=int,
        default=10**6,
        help="Timesteps after which we stop " + "annealing exploration rate",
    )
    parser.add_argument(
        "--final-epsilon",
        type=float,
        default=0.01,
        help="Final value of epsilon during training.",
    )
    parser.add_argument(
        "--eval-epsilon",
        type=float,
        default=0.001,
        help="Exploration epsilon used during eval episodes.",
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=5 * 10**7,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=5 * 10**4,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument(
        "--target-update-interval",
        type=int,
        default=3 * 10**4,
        help="Frequency (in timesteps) at which " +
        "the target network is updated.",
    )
    parser.add_argument("--demo-n-episodes", type=int, default=30)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=250000,
        help="Frequency (in timesteps) of evaluation phase.",
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=4,
        help="Frequency (in timesteps) of network updates.",
    )
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=("Monitor env. Videos and additional information"
              " are saved as output files."),
    )
    parser.add_argument("--lr",
                        type=float,
                        default=2.5e-4,
                        help="Learning rate.")
    parser.add_argument(
        "--recurrent",
        action="store_true",
        default=False,
        help="Use a recurrent model. See the code for the model definition.",
    )
    parser.add_argument(
        "--flicker",
        action="store_true",
        default=False,
        help=("Use so-called flickering Atari, where each"
              " screen is blacked out with probability 0.5."),
    )
    parser.add_argument(
        "--no-frame-stack",
        action="store_true",
        default=False,
        help=("Disable frame stacking so that the agent"
              " can only see the current screen."),
    )
    parser.add_argument(
        "--episodic-update-len",
        type=int,
        default=10,
        help="Maximum length of sequences for updating recurrent models",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=32,
        help=("Number of transitions (in a non-recurrent case)"
              " or sequences (in a recurrent case) used for an"
              " update."),
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            flicker=args.flicker,
            frame_stack=not args.no_frame_stack,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)
    print("Observation space", env.observation_space)
    print("Action space", env.action_space)

    n_frames = env.observation_space.shape[0]
    n_actions = env.action_space.n
    if args.recurrent:
        # Q-network with LSTM
        q_func = pfrl.nn.RecurrentSequential(
            nn.Conv2d(n_frames, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.Flatten(),
            nn.ReLU(),
            nn.LSTM(input_size=3136, hidden_size=512),
            nn.Linear(512, n_actions),
            DiscreteActionValueHead(),
        )
        # Replay buffer that stores whole episodes
        rbuf = replay_buffers.EpisodicReplayBuffer(10**6)
    else:
        # Q-network without LSTM
        q_func = nn.Sequential(
            nn.Conv2d(n_frames, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
            DiscreteActionValueHead(),
        )
        # Replay buffer that stores transitions separately
        rbuf = replay_buffers.ReplayBuffer(10**6)

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0,
        args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions),
    )

    opt = torch.optim.Adam(q_func.parameters(), lr=args.lr, eps=1e-4)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = pfrl.agents.DoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator="mean",
        phi=phi,
        minibatch_size=args.batch_size,
        episodic_update_len=args.episodic_update_len,
        recurrent=args.recurrent,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.demo_n_episodes,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.demo_n_episodes,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
        )
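
Taken together, the examples above exercise a small API surface: append() adds a transition, a terminal transition or an explicit stop_current_episode() call closes the current episode, and sample() / sample_episodes() draw individual transitions or whole episodes. Below is a minimal usage sketch based only on that API; the capacity and transition values are illustrative.

from pfrl import replay_buffers

# Episodic buffer; the capacity is counted in transitions.
rbuf = replay_buffers.EpisodicReplayBuffer(capacity=10**4)

# Append one three-step episode of illustrative transitions.
for t in range(3):
    rbuf.append(
        state=t,
        action=0,
        reward=1.0,
        next_state=t + 1,
        next_action=0,
        is_state_terminal=(t == 2),  # the terminal transition closes the episode
    )
# For episodes that end without a terminal state, call
# rbuf.stop_current_episode() explicitly (see Example 2).

assert len(rbuf) == 3
assert rbuf.n_episodes == 1

episodes = rbuf.sample_episodes(1)  # list of episodes (lists of transition dicts)
samples = rbuf.sample(2)            # each item wraps one transition dict (see Example 3)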